nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,239 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import json
6
+ import logging
7
+ from typing import Any
8
+ from typing import Dict
9
+ from typing import List
10
+ from typing import Optional
11
+
12
+ from nv_ingest_api.internal.primitives.nim import ModelInterface
13
+ from nv_ingest_api.util.image_processing.transforms import numpy_to_base64
14
+
15
# Class-label strings, grouped into text, table, and image categories.
# Written as set literals rather than set([...]) so no throwaway list is built.
ACCEPTED_TEXT_CLASSES = {
    "Text",
    "Title",
    "Section-header",
    "List-item",
    "TOC",
    "Bibliography",
    "Formula",
    "Page-header",
    "Page-footer",
    "Caption",
    "Footnote",
    "Floating-text",
}
ACCEPTED_TABLE_CLASSES = {
    "Table",
}
ACCEPTED_IMAGE_CLASSES = {
    "Picture",
}
# Union of every label accepted by any of the three categories above.
ACCEPTED_CLASSES = ACCEPTED_TEXT_CLASSES | ACCEPTED_TABLE_CLASSES | ACCEPTED_IMAGE_CLASSES

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
44
+
45
+
46
class NemotronParseModelInterface(ModelInterface):
    """
    An interface for handling inference with a Nemotron Parse model.

    Only the "http" protocol is handled; requesting "grpc" raises ``ValueError``
    in both ``format_input`` and ``parse_output``.
    """

    def __init__(self, model_name: str = "nvidia/nemotron-parse"):
        """
        Initialize the instance with a specified model name.

        Parameters
        ----------
        model_name : str, optional
            The name of the model to be used, by default "nvidia/nemotron-parse".
            It is sent as the "model" field of every request payload.
        """
        self.model_name = model_name

    def name(self) -> str:
        """
        Get the name of the model interface.

        Returns
        -------
        str
            The name of the model interface.
        """
        return "nemotron_parse"

    def prepare_data_for_inference(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Return the input data unchanged.

        No preprocessing is performed for this model; images are encoded later,
        in ``format_input``.

        Parameters
        ----------
        data : dict
            The input data.

        Returns
        -------
        dict
            The same ``data`` object that was passed in.
        """

        return data

    def format_input(self, data: Dict[str, Any], protocol: str, max_batch_size: int, **kwargs) -> Any:
        """
        Format input data for the specified protocol.

        Parameters
        ----------
        data : dict
            The input data to format. If it has an "images" key, every entry is
            encoded; otherwise the single entry under "image" is encoded.
        protocol : str
            The protocol to use. Only "http" is supported.
        max_batch_size : int
            Maximum number of images placed in one request payload. Must be >= 1.
        **kwargs : dict
            Additional parameters; not used by this implementation.

        Returns
        -------
        tuple
            ``(formatted_batches, formatted_batch_data)``: a list with one request
            payload per batch, and a list of the same length holding an empty dict
            for each batch.

        Raises
        ------
        ValueError
            If ``protocol`` is "grpc" (unsupported) or any value other than "http",
            or if ``max_batch_size`` is less than 1.
        KeyError
            If ``data`` contains neither "images" nor "image".
        """

        if protocol == "grpc":
            raise ValueError("gRPC protocol is not supported for Nemotron Parse.")
        if protocol != "http":
            raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")

        # A negative size would make the chunking range empty and silently drop
        # every image; zero would fail inside range() with an unrelated message.
        if max_batch_size < 1:
            raise ValueError("max_batch_size must be a positive integer.")

        logger.debug("Formatting input for HTTP Nemotron Parse model")

        ## TODO: Ask @Edward Kim if we want to switch to JPEG/PNG here
        if "images" in data:
            base64_list = [numpy_to_base64(img) for img in data["images"]]
        else:
            base64_list = [numpy_to_base64(data["image"])]

        # Split the encoded images into consecutive chunks of at most max_batch_size.
        b64_chunks = [base64_list[i : i + max_batch_size] for i in range(0, len(base64_list), max_batch_size)]

        formatted_batches = [self._prepare_nemotron_parse_payload(b64_chunk) for b64_chunk in b64_chunks]
        # One fresh (independent) empty dict per batch, kept aligned with the payloads.
        formatted_batch_data = [{} for _ in b64_chunks]

        return formatted_batches, formatted_batch_data

    def parse_output(self, response: Any, protocol: str, data: Optional[Dict[str, Any]] = None, **kwargs) -> Any:
        """
        Parse the output from the model's inference response.

        Parameters
        ----------
        response : Any
            The response from the model inference.
        protocol : str
            The protocol used. Only "http" is supported.
        data : dict, optional
            Additional input data passed to the function; not used here.

        Returns
        -------
        Any
            The parsed output data.

        Raises
        ------
        ValueError
            If ``protocol`` is "grpc" (unsupported) or any value other than "http".
        RuntimeError
            If the response does not have the expected structure.
        """

        if protocol == "grpc":
            raise ValueError("gRPC protocol is not supported for Nemotron Parse.")
        if protocol != "http":
            raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")

        logger.debug("Parsing output from HTTP Nemotron Parse model")
        return self._extract_content_from_nemotron_parse_response(response)

    def process_inference_results(self, output: Any, **kwargs) -> Any:
        """
        Process inference results for the Nemotron Parse model.

        No post-processing is applied; the output is passed through as is.

        Parameters
        ----------
        output : Any
            The raw output from the model.

        Returns
        -------
        Any
            The same ``output`` object that was passed in.
        """

        return output

    def _prepare_nemotron_parse_payload(self, base64_list: List[str]) -> Dict[str, Any]:
        """
        Build the HTTP request payload for one batch of base64-encoded images.

        Parameters
        ----------
        base64_list : list of str
            Base64-encoded images for this batch.

        Returns
        -------
        dict
            A payload with "model" set to ``self.model_name`` and "messages"
            holding one "user" message per image. Each image is embedded as a
            ``data:image/png;base64,...`` URL.
        """
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{b64_img}",
                        },
                    }
                ],
            }
            for b64_img in base64_list
        ]

        return {
            "model": self.model_name,
            "messages": messages,
        }

    def _extract_content_from_nemotron_parse_response(self, json_response: Dict[str, Any]) -> Any:
        """
        Extract content from the JSON response of a Nemotron Parse HTTP API request.

        Parameters
        ----------
        json_response : dict
            The JSON response from the Nemotron Parse API.

        Returns
        -------
        Any
            The JSON-decoded "arguments" of the first tool call in the first choice.

        Raises
        ------
        RuntimeError
            If the response does not contain the expected "choices" key or if it is
            empty, or if the first choice has no tool call carrying function arguments.
        json.JSONDecodeError
            If the tool call arguments are not valid JSON.
        """

        if "choices" not in json_response or not json_response["choices"]:
            raise RuntimeError("Unexpected response format: 'choices' key is missing or empty.")

        # Report a malformed first choice as the documented RuntimeError instead of
        # leaking a bare KeyError/IndexError/TypeError; the cause stays chained.
        try:
            tool_call = json_response["choices"][0]["message"]["tool_calls"][0]
            arguments = tool_call["function"]["arguments"]
        except (KeyError, IndexError, TypeError) as err:
            raise RuntimeError(
                "Unexpected response format: first choice has no tool call with function arguments."
            ) from err

        return json.loads(arguments)