nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,243 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ # Copyright (c) 2024, NVIDIA CORPORATION.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ import asyncio
21
+ import io
22
+ import logging
23
+ import time
24
+ from typing import Any, Optional
25
+ from typing import Dict
26
+ from typing import List
27
+
28
+ import aiohttp
29
+
30
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
31
+
32
# Defaults used when extractor_config does not supply an explicit value.
DEFAULT_RESULT_TYPE = "text"  # result flavour requested from the LlamaParse result endpoint
DEFAULT_FILE_NAME = "_.pdf"  # placeholder filename attached to the multipart upload
DEFAULT_CHECK_INTERVAL_SECONDS = 1  # pause between polls of the result endpoint
DEFAULT_MAX_TIMEOUT_SECONDS = 2_000  # stop polling once this many seconds have elapsed

logger = logging.getLogger(__name__)
38
+
39
+
40
def llama_parse_extractor(
    pdf_stream: io.BytesIO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extractor_config: dict,
    execution_trace_log: Optional[List[Any]] = None,
) -> List[Dict[ContentTypeEnum, Dict[str, Any]]]:
    """
    Helper function to use LlamaParse API to extract text from a bytestream PDF.

    Only text extraction is performed. The image, table and infographic flags
    are accepted for interface parity but produce no output.

    Parameters
    ----------
    pdf_stream : io.BytesIO
        A bytestream PDF.
    extract_text : bool
        Specifies whether to extract text.
    extract_images : bool
        Specifies whether to extract images (not implemented for this backend).
    extract_infographics : bool
        Specifies whether to extract infographics (not implemented for this backend).
    extract_tables : bool
        Specifies whether to extract tables (not implemented for this backend).
    extractor_config : dict
        A dictionary containing additional extraction parameters including:
            - llama_api_key: API key for LlamaParse (required).
            - result_type: Type of result to extract (default provided).
            - file_name: Name of the file (default provided).
            - check_interval: Interval for checking status (default provided).
            - max_timeout: Maximum timeout in seconds (default provided).
            - row_data: Row data for additional metadata (required).
            - metadata_column: Column name to extract metadata (default "metadata").
    execution_trace_log : optional
        Trace information for debugging purposes. Currently unused.

    Returns
    -------
    List[Dict[ContentTypeEnum, Dict[str, Any]]]:
        A list of extracted data. Each item is a dictionary where the key is a
        ContentTypeEnum and the value is a dictionary containing content and metadata.
        The list is empty when ``extract_text`` is false.

    Raises
    ------
    ValueError
        If extractor_config is not a dict or required parameters are missing.
    """

    _ = execution_trace_log  # Unused variable

    logger.debug("Extracting PDF with LlamaParse backend.")

    # Validate extractor_config.
    if not isinstance(extractor_config, dict):
        raise ValueError("extractor_config must be a dictionary.")

    api_key = extractor_config.get("llama_api_key")
    if not api_key:
        raise ValueError("'llama_api_key' (LLAMA_CLOUD_API_KEY) is required in extractor_config.")

    result_type = extractor_config.get("result_type", DEFAULT_RESULT_TYPE)
    file_name = extractor_config.get("file_name", DEFAULT_FILE_NAME)
    check_interval = extractor_config.get("check_interval", DEFAULT_CHECK_INTERVAL_SECONDS)
    max_timeout = extractor_config.get("max_timeout", DEFAULT_MAX_TIMEOUT_SECONDS)

    row_data = extractor_config.get("row_data")
    if row_data is None:
        raise ValueError("Missing 'row_data' in extractor_config.")
    metadata_column = extractor_config.get("metadata_column", "metadata")
    # Objects exposing ``.index`` are addressed by label membership; anything
    # else is treated as a mapping.
    # presumably the ``.index`` case is a pandas Series row -- confirm against caller.
    if hasattr(row_data, "index"):
        metadata = row_data[metadata_column] if metadata_column in row_data.index else {}
    else:
        metadata = row_data.get(metadata_column, {})

    extracted_data = []

    if extract_text:
        # TODO: As of Feb 2024, LlamaParse returns multi-page documents as one
        # long text. See if we can break it into pages or if LlamaParse adds
        # support for extracting each page.
        # asyncio.run creates its own event loop, so this function has to be
        # called from synchronous code.
        text = asyncio.run(
            async_llama_parse(
                pdf_stream,
                api_key,
                file_name=file_name,
                result_type=result_type,
                check_interval_seconds=check_interval,
                max_timeout_seconds=max_timeout,
            )
        )

        # NOTE(review): this is a lookup by enum member *name*; confirm that
        # ContentTypeEnum defines a member named exactly like the configured
        # result_type (e.g. "text"), otherwise this raises KeyError.
        content_type = ContentTypeEnum[result_type]

        # Copy so the caller's row metadata is not mutated; a missing/None
        # metadata value is treated as empty instead of crashing on ``.copy()``.
        text_metadata = metadata.copy() if metadata else {}
        text_metadata.update(
            {
                "content": text,
                "metadata": {
                    "document_type": content_type,
                },
            }
        )

        extracted_data.append({content_type: text_metadata})

    # TODO: LlamaParse extracts tables, but we have to extract the tables
    # ourselves from text/markdown.
    if extract_tables:
        logger.debug("Table extraction requested, but not implemented for LlamaParse.")

    # LlamaParse does not support image extraction as of Feb 2024.
    if extract_images:
        logger.debug("Image extraction requested, but not supported by LlamaParse.")

    # Infographics extraction is currently not supported by LlamaParse.
    if extract_infographics:
        logger.debug("Infographics extraction requested, but not supported by LlamaParse.")

    return extracted_data
163
+
164
+
165
async def async_llama_parse(
    pdf_stream: io.BytesIO,
    api_key: str,
    file_name: str = DEFAULT_FILE_NAME,
    result_type: str = DEFAULT_RESULT_TYPE,
    check_interval_seconds: int = DEFAULT_CHECK_INTERVAL_SECONDS,
    max_timeout_seconds: int = DEFAULT_MAX_TIMEOUT_SECONDS,
) -> str:
    """Uses the LlamaParse API to extract text from bytestream PDF.

    The PDF is uploaded, then the job's result endpoint is polled until it
    stops answering 404 or the timeout elapses.

    Parameters
    ----------
    pdf_stream : io.BytesIO
        A bytestream PDF.
    api_key: str
        API key from https://cloud.llamaindex.ai.
    file_name: str
        Name of the PDF file.
    result_type: str
        The result type for the parser. One of `text` or `markdown`.
    check_interval_seconds: int
        The interval in seconds to check if the parsing is done.
    max_timeout_seconds: int
        The maximum timeout in seconds to wait for the parsing to finish.

    Returns
    -------
    str
        A string of extracted text. An empty string is returned when any
        error occurs (upload failure, timeout, error response); the error is
        logged rather than raised.
    """
    base_url = "https://api.cloud.llamaindex.ai/api/parsing"
    headers = {"Authorization": f"Bearer {api_key}"}
    mime_type = "application/pdf"

    try:
        data = aiohttp.FormData()
        data.add_field(
            "file",
            pdf_stream,
            filename=file_name,
            content_type=mime_type,
        )

        upload_url = f"{base_url}/upload"

        async with aiohttp.ClientSession() as session:
            async with session.post(
                upload_url,
                data=data,
                headers=headers,
            ) as response:
                response_json = await response.json()
                job_id = response_json["id"]
                logger.debug("Started parsing the file under job_id %s", job_id)

            result_url = f"{base_url}/job/{job_id}/result/{result_type}"

            # Monotonic clock: the elapsed-time check must not be affected by
            # wall-clock adjustments.
            start = time.monotonic()
            while True:
                await asyncio.sleep(check_interval_seconds)

                # The context manager releases every polled response, including
                # the repeated 404s, back to the connection pool.
                async with session.get(result_url, headers=headers) as result:
                    # 404 is treated as "result not available yet": keep polling
                    # until the timeout is exceeded.
                    if result.status == 404:
                        if time.monotonic() - start > max_timeout_seconds:
                            raise Exception("Timeout while parsing PDF.")
                        continue

                    result_json = await result.json()
                    if result.status == 400:
                        detail = result_json.get("detail", "Unknown error")
                        raise Exception(f"Failed to parse the PDF file: {detail}")

                    return result_json[result_type]

    except Exception as e:
        # Best-effort contract: callers receive "" on failure. The placeholder
        # is required so the exception text actually appears in the log record.
        logger.error("Error while parsing the PDF file: %s", e)
        return ""