nv_ingest_api-26.1.0rc4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic.

Files changed (177)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
nv_ingest_api/internal/primitives/__init__.py (file without changes)
nv_ingest_api/internal/primitives/control_message_task.py
@@ -0,0 +1,16 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from uuid import UUID
+
+ from pydantic import BaseModel, Field, ConfigDict
+ from typing import Any, Dict, Union
+
+
+ class ControlMessageTask(BaseModel):
+     model_config = ConfigDict(extra="forbid")
+
+     type: str
+     id: Union[str, UUID]
+     properties: Dict[str, Any] = Field(default_factory=dict)
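
For orientation, a minimal usage sketch (not part of the diff) of the ControlMessageTask model above. The task type "split" and the properties shown are illustrative values; the example assumes pydantic v2, as implied by the ConfigDict import.

# Illustrative only: construct a task and show that unknown fields are rejected
# because the model is configured with extra="forbid".
from uuid import uuid4

from pydantic import ValidationError

from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask

task = ControlMessageTask(type="split", id=uuid4(), properties={"chunk_size": 512})
print(task.type, task.id, task.properties)

try:
    ControlMessageTask(type="split", id="t-1", unknown_field=True)
except ValidationError as exc:
    print("rejected:", exc.error_count(), "error(s)")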
nv_ingest_api/internal/primitives/ingest_control_message.py
@@ -0,0 +1,307 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import copy
+ import re
+ from datetime import datetime
+ from collections import defaultdict
+ from typing import Any, Dict, Generator, List, Optional, Union
+
+ import logging
+ import pandas as pd
+
+ from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask
+
+ logger = logging.getLogger(__name__)
+
+
+ def remove_task_by_type(ctrl_msg, task: str):
+     """
+     Remove a task from the control message by matching its type.
+
+     This function iterates over the tasks in the control message, and if it finds a task
+     whose type matches the provided task string, it removes that task (using its unique id)
+     and returns the task's properties.
+
+     Parameters
+     ----------
+     ctrl_msg : IngestControlMessage
+         The control message from which to remove the task.
+     task : str
+         The task type to remove.
+
+     Returns
+     -------
+     dict
+         The properties of the removed task.
+
+     Raises
+     ------
+     ValueError
+         If no task with the given type is found.
+     """
+     task_obj = None
+     for t in ctrl_msg.get_tasks():
+         if t.type == task:
+             task_obj = t
+             break
+
+     if task_obj is None:
+         err_msg = f"process_control_message: Task '{task}' not found in control message."
+         logger.error(err_msg)
+         raise ValueError(err_msg)
+
+     removed_task = ctrl_msg.remove_task(task_obj.id)
+     return removed_task.properties
+
+
+ def remove_all_tasks_by_type(ctrl_msg, task: str):
+     """
+     Remove all tasks from the control message by matching their type.
+
+     This function iterates over the tasks in the control message, finds all tasks
+     whose type matches the provided task string, removes them, and returns their
+     properties as a list.
+
+     Parameters
+     ----------
+     ctrl_msg : IngestControlMessage
+         The control message from which to remove the tasks.
+     task : str
+         The task type to remove.
+
+     Returns
+     -------
+     list[dict]
+         A list of dictionaries of properties for all removed tasks.
+
+     Raises
+     ------
+     ValueError
+         If no tasks with the given type are found.
+     """
+     matching_tasks = []
+
+     # Find all tasks with matching type
+     for t in ctrl_msg.get_tasks():
+         if t.type == task:
+             matching_tasks.append(t)
+
+     if not matching_tasks:
+         err_msg = f"process_control_message: No tasks of type '{task}' found in control message."
+         logger.error(err_msg)
+         raise ValueError(err_msg)
+
+     # Remove all matching tasks and collect their properties
+     removed_task_properties = []
+     for task_obj in matching_tasks:
+         removed_task = ctrl_msg.remove_task(task_obj.id)
+         removed_task_properties.append(removed_task.properties)
+
+     return removed_task_properties
+
+
+ class IngestControlMessage:
+     """
+     A control message class for ingesting tasks and managing associated metadata,
+     timestamps, configuration, and payload.
+     """
+
+     def __init__(self):
+         """
+         Initialize a new IngestControlMessage instance.
+         """
+         self._tasks: Dict[str, List[ControlMessageTask]] = defaultdict(list)
+         self._metadata: Dict[str, Any] = {}
+         self._timestamps: Dict[str, datetime] = {}
+         self._payload: Optional[pd.DataFrame] = None
+         self._config: Dict[str, Any] = {}
+
+     def add_task(self, task: ControlMessageTask):
+         """
+         Add a task to the control message. Multiple tasks with the same ID are supported.
+         """
+         self._tasks[task.id].append(task)
+
+     def get_tasks(self) -> Generator[ControlMessageTask, None, None]:
+         """
+         Return all tasks as a generator.
+         """
+         for task_list in self._tasks.values():
+             yield from task_list
+
+     def has_task(self, task_id: str) -> bool:
+         """
+         Check if any tasks with the given ID exist.
+         """
+         return task_id in self._tasks and len(self._tasks[task_id]) > 0
+
+     def remove_task(self, task_id: str) -> ControlMessageTask:
+         """
+         Remove the first task with the given ID. Raises RuntimeError if no such task exists.
+         """
+         if task_id in self._tasks and self._tasks[task_id]:
+             task = self._tasks[task_id].pop(0)
+             # Clean up empty lists
+             if not self._tasks[task_id]:
+                 del self._tasks[task_id]
+             return task
+         else:
+             raise RuntimeError(f"Attempted to remove non-existent task with id: {task_id}")
+
+     def config(self, config: Dict[str, Any] = None) -> Dict[str, Any]:
+         """
+         Get or update the control message configuration.
+
+         If 'config' is provided, it must be a dictionary. The configuration is updated with the
+         provided values. If no argument is provided, returns a copy of the current configuration.
+
+         Raises
+         ------
+         ValueError
+             If the provided configuration is not a dictionary.
+         """
+         if config is None:
+             return self._config.copy()
+
+         if not isinstance(config, dict):
+             raise ValueError("Configuration must be provided as a dictionary.")
+
+         self._config.update(config)
+         return self._config.copy()
+
+     def copy(self) -> "IngestControlMessage":
+         """
+         Create a deep copy of this control message.
+         """
+         return copy.deepcopy(self)
+
+     def get_metadata(self, key: Union[str, re.Pattern] = None, default_value: Any = None) -> Any:
+         """
+         Retrieve metadata. If 'key' is None, returns a copy of all metadata.
+
+         Parameters
+         ----------
+         key : str or re.Pattern, optional
+             If a string is provided, returns the value for that exact key.
+             If a regex pattern is provided, returns a dictionary of all metadata key-value pairs
+             where the key matches the regex. If no matches are found, returns default_value.
+         default_value : Any, optional
+             The value to return if the key is not found or no regex matches.
+
+         Returns
+         -------
+         Any
+             The metadata value for an exact string key, or a dict of matching metadata if a regex is provided.
+         """
+         if key is None:
+             return self._metadata.copy()
+
+         # If key is a regex pattern (i.e. has a search method), perform pattern matching.
+         if hasattr(key, "search"):
+             matches = {k: v for k, v in self._metadata.items() if key.search(k)}
+             return matches if matches else default_value
+
+         # Otherwise, perform an exact lookup.
+         return self._metadata.get(key, default_value)
+
+     def has_metadata(self, key: Union[str, re.Pattern]) -> bool:
+         """
+         Check if a metadata key exists.
+
+         Parameters
+         ----------
+         key : str or re.Pattern
+             If a string is provided, checks for the exact key.
+             If a regex pattern is provided, returns True if any metadata key matches the regex.
+
+         Returns
+         -------
+         bool
+             True if the key (or any matching key, in case of a regex) exists, False otherwise.
+         """
+         if hasattr(key, "search"):
+             return any(key.search(k) for k in self._metadata)
+         return key in self._metadata
+
+     def list_metadata(self) -> list:
+         """
+         List all metadata keys.
+         """
+         return list(self._metadata.keys())
+
+     def set_metadata(self, key: str, value: Any) -> None:
+         """
+         Set a metadata key-value pair.
+         """
+         self._metadata[key] = value
+
+     def filter_timestamp(self, regex_filter: str) -> Dict[str, datetime]:
+         """
+         Retrieve timestamps whose keys match the regex filter.
+         """
+         pattern = re.compile(regex_filter)
+         timestamps_snapshot = self._timestamps.copy()
+         return {key: ts for key, ts in timestamps_snapshot.items() if pattern.search(key)}
+
+     def get_timestamp(self, key: str, fail_if_nonexist: bool = False) -> Optional[datetime]:
+         """
+         Retrieve a timestamp for a given key.
+
+         Raises
+         ------
+         KeyError
+             If the key is not found and 'fail_if_nonexist' is True.
+         """
+         if key in self._timestamps:
+             return self._timestamps[key]
+         if fail_if_nonexist:
+             raise KeyError(f"Timestamp for key '{key}' does not exist.")
+         return None
+
+     def get_timestamps(self) -> Dict[str, datetime]:
+         """
+         Retrieve all timestamps.
+         """
+         return self._timestamps.copy()
+
+     def set_timestamp(self, key: str, timestamp: Any) -> None:
+         """
+         Set a timestamp for a given key. Accepts either a datetime object or an ISO format string.
+
+         Raises
+         ------
+         ValueError
+             If the provided timestamp is neither a datetime object nor a valid ISO format string.
+         """
+         if isinstance(timestamp, datetime):
+             self._timestamps[key] = timestamp
+
+         elif isinstance(timestamp, str):
+             try:
+                 dt = datetime.fromisoformat(timestamp)
+                 self._timestamps[key] = dt
+             except ValueError as e:
+                 raise ValueError(f"Invalid timestamp format: {timestamp}") from e
+
+         else:
+             raise ValueError("timestamp must be a datetime object or ISO format string")
+
+     def payload(self, payload: pd.DataFrame = None) -> pd.DataFrame:
+         """
+         Get or set the payload DataFrame.
+
+         Raises
+         ------
+         ValueError
+             If the provided payload is not a pandas DataFrame.
+         """
+         if payload is None:
+             return self._payload
+
+         if not isinstance(payload, pd.DataFrame):
+             raise ValueError("Payload must be a pandas DataFrame")
+
+         self._payload = payload
+         return self._payload
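
A short sketch (not from the package) of how the IngestControlMessage API above composes with remove_task_by_type. The payload columns, metadata keys, and the "dedup" task type are illustrative values chosen for the example.

# Illustrative only: build a control message, attach a task, then pop it back
# off by type.
import pandas as pd

from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask
from nv_ingest_api.internal.primitives.ingest_control_message import (
    IngestControlMessage,
    remove_task_by_type,
)

msg = IngestControlMessage()
msg.payload(pd.DataFrame({"document_type": ["pdf"], "content": ["<base64>"]}))  # illustrative columns
msg.set_metadata("job_id", "job-123")                # illustrative key/value
msg.set_timestamp("created", "2025-01-01T00:00:00")  # ISO strings are parsed to datetime

msg.add_task(ControlMessageTask(type="dedup", id="task-1", properties={"hash_algorithm": "md5"}))

props = remove_task_by_type(msg, "dedup")  # removes the task and returns its properties
print(props, msg.has_task("task-1"))       # -> {'hash_algorithm': 'md5'} False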
nv_ingest_api/internal/primitives/nim/__init__.py
@@ -0,0 +1,9 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from .nim_client import NimClient
+ from .nim_client import get_nim_client_manager
+ from .nim_model_interface import ModelInterface
+
+ __all__ = ["NimClient", "ModelInterface", "get_nim_client_manager"]
nv_ingest_api/internal/primitives/nim/default_values.py
@@ -0,0 +1,14 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Copyright (c) 2024, NVIDIA CORPORATION.
+
+
+ YOLOX_MAX_BATCH_SIZE = 8
+ YOLOX_MAX_WIDTH = 1536
+ YOLOX_MAX_HEIGHT = 1536
+ YOLOX_CONF_THRESHOLD = 0.01
+ YOLOX_IOU_THRESHOLD = 0.5
+ YOLOX_MIN_SCORE = 0.1
+ YOLOX_FINAL_SCORE = 0.48
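
These defaults are plain constants. As a hedged illustration of how such bounds are typically consumed (the helper below is hypothetical, not a package function), a caller might compute a resize factor that keeps an image inside the default YOLOX input size:

from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_HEIGHT, YOLOX_MAX_WIDTH


def yolox_resize_scale(width: int, height: int) -> float:
    """Hypothetical helper: scale factor that fits an image inside the default YOLOX bounds (never upscales)."""
    return min(1.0, YOLOX_MAX_WIDTH / width, YOLOX_MAX_HEIGHT / height)


print(yolox_resize_scale(3072, 1024))  # -> 0.5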
nv_ingest_api/internal/primitives/nim/model_interface/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest_api/internal/primitives/nim/model_interface/cached.py
@@ -0,0 +1,274 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+
+ import base64
+ import io
+ import logging
+ import PIL.Image as Image
+ from typing import Any, Dict, Optional, List
+
+ import numpy as np
+
+ from nv_ingest_api.internal.primitives.nim import ModelInterface
+ from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
+
+ logger = logging.getLogger(__name__)
+
+
+ class CachedModelInterface(ModelInterface):
+     """
+     An interface for handling inference with a Cached model, supporting both gRPC and HTTP
+     protocols, including batched input.
+     """
+
+     def name(self) -> str:
+         """
+         Get the name of the model interface.
+
+         Returns
+         -------
+         str
+             The name of the model interface ("Cached").
+         """
+         return "Cached"
+
+     def prepare_data_for_inference(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Decode base64-encoded images into NumPy arrays, storing them in `data["image_arrays"]`.
+
+         Parameters
+         ----------
+         data : dict of str -> Any
+             The input data containing either:
+             - "base64_image": a single base64-encoded image, or
+             - "base64_images": a list of base64-encoded images.
+
+         Returns
+         -------
+         dict of str -> Any
+             The updated data dictionary with decoded image arrays stored in
+             "image_arrays", where each array has shape (H, W, C).
+
+         Raises
+         ------
+         KeyError
+             If neither 'base64_image' nor 'base64_images' is provided.
+         ValueError
+             If 'base64_images' is provided but is not a list.
+         """
+         if "base64_images" in data:
+             base64_list = data["base64_images"]
+             if not isinstance(base64_list, list):
+                 raise ValueError("The 'base64_images' key must contain a list of base64-encoded strings.")
+             data["image_arrays"] = [base64_to_numpy(img) for img in base64_list]
+
+         elif "base64_image" in data:
+             # Fallback to single image case; wrap it in a list to keep the interface consistent
+             data["image_arrays"] = [base64_to_numpy(data["base64_image"])]
+
+         else:
+             raise KeyError("Input data must include 'base64_image' or 'base64_images' with base64-encoded images.")
+
+         return data
+
+     def format_input(self, data: Dict[str, Any], protocol: str, max_batch_size: int, **kwargs) -> Any:
+         """
+         Format input data for the specified protocol ("grpc" or "http"), handling batched images.
+         Additionally, returns batched data that coalesces the original image arrays and their dimensions
+         in the same order as provided.
+
+         Parameters
+         ----------
+         data : dict of str -> Any
+             The input data dictionary, expected to contain "image_arrays" (a list of np.ndarray).
+         protocol : str
+             The protocol to use, "grpc" or "http".
+         max_batch_size : int
+             The maximum number of images per batch.
+
+         Returns
+         -------
+         tuple
+             A tuple (formatted_batches, formatted_batch_data) where:
+             - For gRPC: formatted_batches is a list of NumPy arrays, each of shape (B, H, W, C)
+               with B <= max_batch_size.
+             - For HTTP: formatted_batches is a list of JSON-serializable dict payloads.
+             - In both cases, formatted_batch_data is a list of dicts with the keys:
+               "image_arrays": the list of original np.ndarray images for that batch, and
+               "image_dims": a list of (height, width) tuples for each image in the batch.
+
+         Raises
+         ------
+         KeyError
+             If "image_arrays" is missing in the data dictionary.
+         ValueError
+             If the protocol is invalid, or if no valid images are found.
+         """
+         if "image_arrays" not in data:
+             raise KeyError("Expected 'image_arrays' in data. Make sure prepare_data_for_inference was called.")
+
+         image_arrays = data["image_arrays"]
+         # Compute dimensions for each image.
+         image_dims = [(img.shape[0], img.shape[1]) for img in image_arrays]
+
+         # Helper: chunk a list into sublists of length up to chunk_size.
+         def chunk_list(lst: list, chunk_size: int) -> List[list]:
+             return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]
+
+         if protocol == "grpc":
+             logger.debug("Formatting input for gRPC Cached model (batched).")
+             batched_images = []
+             for arr in image_arrays:
+                 # Expand from (H, W, C) to (1, H, W, C) if needed
+                 if arr.ndim == 3:
+                     arr = np.expand_dims(arr, axis=0)
+                 batched_images.append(arr.astype(np.float32))
+
+             if not batched_images:
+                 raise ValueError("No valid images found for gRPC formatting.")
+
+             # Chunk the processed images, original arrays, and dimensions.
+             batched_image_chunks = chunk_list(batched_images, max_batch_size)
+             orig_chunks = chunk_list(image_arrays, max_batch_size)
+             dims_chunks = chunk_list(image_dims, max_batch_size)
+
+             batched_inputs = []
+             formatted_batch_data = []
+             for proc_chunk, orig_chunk, dims_chunk in zip(batched_image_chunks, orig_chunks, dims_chunks):
+                 # Concatenate along the batch dimension => shape (B, H, W, C)
+                 batched_input = np.concatenate(proc_chunk, axis=0)
+                 batched_inputs.append(batched_input)
+                 formatted_batch_data.append({"image_arrays": orig_chunk, "image_dims": dims_chunk})
+             return batched_inputs, formatted_batch_data
+
+         elif protocol == "http":
+             logger.debug("Formatting input for HTTP Cached model (batched).")
+             content_list: List[Dict[str, Any]] = []
+             for arr in image_arrays:
+                 # Convert to uint8 if needed, then to PIL Image and base64-encode it.
+                 if arr.dtype != np.uint8:
+                     arr = (arr * 255).astype(np.uint8)
+                 image_pil = Image.fromarray(arr)
+                 buffered = io.BytesIO()
+                 image_pil.save(buffered, format="PNG")
+                 base64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                 image_item = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_img}"}}
+                 content_list.append(image_item)
+
+             # Chunk the content list, original arrays, and dimensions.
+             content_chunks = chunk_list(content_list, max_batch_size)
+             orig_chunks = chunk_list(image_arrays, max_batch_size)
+             dims_chunks = chunk_list(image_dims, max_batch_size)
+
+             payload_batches = []
+             formatted_batch_data = []
+             for chunk, orig_chunk, dims_chunk in zip(content_chunks, orig_chunks, dims_chunks):
+                 message = {"content": chunk}
+                 payload = {"messages": [message]}
+                 payload_batches.append(payload)
+                 formatted_batch_data.append({"image_arrays": orig_chunk, "image_dims": dims_chunk})
+             return payload_batches, formatted_batch_data
+
+         else:
+             raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")
+
+     def parse_output(self, response: Any, protocol: str, data: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Any:
+         """
+         Parse the output from the Cached model's inference response.
+
+         Parameters
+         ----------
+         response : Any
+             The raw response from the model inference.
+         protocol : str
+             The protocol used ("grpc" or "http").
+         data : dict of str -> Any, optional
+             Additional input data (unused here, but available for consistency).
+         **kwargs : Any
+             Additional keyword arguments for future compatibility.
+
+         Returns
+         -------
+         Any
+             The parsed output data (e.g., list of strings), depending on the protocol.
+
+         Raises
+         ------
+         ValueError
+             If the protocol is invalid.
+         RuntimeError
+             If the HTTP response is not as expected (missing 'data' key).
+         """
+         if protocol == "grpc":
+             logger.debug("Parsing output from gRPC Cached model (batched).")
+             parsed: List[str] = []
+             # Assume `response` is iterable, each element a list/array of byte strings
+             for single_output in response:
+                 joined_str = " ".join(o.decode("utf-8") for o in single_output)
+                 parsed.append(joined_str)
+             return parsed
+
+         elif protocol == "http":
+             logger.debug("Parsing output from HTTP Cached model (batched).")
+             if not isinstance(response, dict):
+                 raise RuntimeError("Expected JSON/dict response for HTTP, got something else.")
+             if "data" not in response or not response["data"]:
+                 raise RuntimeError("Unexpected response format: 'data' key missing or empty.")
+
+             contents: List[str] = []
+             for item in response["data"]:
+                 # Each "item" might have a "content" key
+                 content = item.get("content", "")
+                 contents.append(content)
+
+             return contents
+
+         else:
+             raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")
+
+     def process_inference_results(self, output: Any, protocol: str, **kwargs: Any) -> Any:
+         """
+         Process inference results for the Cached model.
+
+         Parameters
+         ----------
+         output : Any
+             The raw output from the model.
+         protocol : str
+             The inference protocol used ("grpc" or "http").
+         **kwargs : Any
+             Additional parameters for post-processing (not used here).
+
+         Returns
+         -------
+         Any
+             The processed inference results, which here is simply returned as-is.
+         """
+         # For Cached model, we simply return what we parsed (e.g., a list of strings or a single string)
+         return output
+
+     def _extract_content_from_nim_response(self, json_response: Dict[str, Any]) -> Any:
+         """
+         Extract content from the JSON response of a NIM (HTTP) API request.
+
+         Parameters
+         ----------
+         json_response : dict of str -> Any
+             The JSON response from the NIM API.
+
+         Returns
+         -------
+         Any
+             The extracted content from the response.
+
+         Raises
+         ------
+         RuntimeError
+             If the response format is unexpected (missing 'data' or empty).
+         """
+         if "data" not in json_response or not json_response["data"]:
+             raise RuntimeError("Unexpected response format: 'data' key is missing or empty.")
+
+         return json_response["data"][0]["content"]
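
A local sketch of the HTTP path through CachedModelInterface above, with no NIM endpoint involved. The tiny generated PNG is purely illustrative, and the example assumes base64_to_numpy can decode a base64-encoded PNG, as the docstrings above imply.

# Illustrative only: decode base64 images, then build HTTP payload batches.
import base64
import io

import numpy as np
import PIL.Image as Image

from nv_ingest_api.internal.primitives.nim.model_interface.cached import CachedModelInterface

# Build a small PNG and base64-encode it, mimicking upstream extraction output.
buf = io.BytesIO()
Image.fromarray(np.zeros((2, 2, 3), dtype=np.uint8)).save(buf, format="PNG")
b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

iface = CachedModelInterface()
data = iface.prepare_data_for_inference({"base64_images": [b64, b64]})
payloads, batch_data = iface.format_input(data, protocol="http", max_batch_size=2)

print(len(payloads), payloads[0]["messages"][0]["content"][0]["type"])  # -> 1 image_url
print(batch_data[0]["image_dims"])  # -> [(2, 2), (2, 2)]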
nv_ingest_api/internal/primitives/nim/model_interface/decorators.py
@@ -0,0 +1,56 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ from functools import wraps
+ from multiprocessing import Lock
+ from multiprocessing import Manager
+
+ logger = logging.getLogger(__name__)
+
+ # Create a shared manager and lock for thread-safe access
+ manager = Manager()
+ global_cache = manager.dict()
+ lock = Lock()
+
+
+ def multiprocessing_cache(max_calls):
+     """
+     A decorator that creates a global cache shared between multiple processes.
+     The cache is invalidated after `max_calls` number of accesses.
+
+     Args:
+         max_calls (int): The number of calls after which the cache is cleared.
+
+     Returns:
+         function: The decorated function with global cache and invalidation logic.
+     """
+
+     def decorator(func):
+         call_count = manager.Value("i", 0)  # Shared integer for call counting
+
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             key = (func.__name__, args, frozenset(kwargs.items()))
+
+             with lock:
+                 call_count.value += 1
+
+                 if call_count.value > max_calls:
+                     global_cache.clear()
+                     call_count.value = 0
+
+                 if key in global_cache:
+                     return global_cache[key]
+
+             result = func(*args, **kwargs)
+
+             with lock:
+                 global_cache[key] = result
+
+             return result
+
+         return wrapper
+
+     return decorator
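
Finally, a hedged usage sketch of multiprocessing_cache. The decorated function, its endpoint argument, and the max_calls value are illustrative stand-ins rather than package code.

from nv_ingest_api.internal.primitives.nim.model_interface.decorators import multiprocessing_cache


@multiprocessing_cache(max_calls=100)
def resolve_model_version(endpoint: str) -> str:
    # Stand-in for an expensive call (e.g., querying a remote service).
    return f"{endpoint}:v1"


if __name__ == "__main__":
    print(resolve_model_version("http://localhost:8000"))  # computed once
    print(resolve_model_version("http://localhost:8000"))  # served from the shared cache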