nv-ingest-api 2025.4.20.dev20250420__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,426 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ # Copyright (c) 2024, NVIDIA CORPORATION.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ import io
21
+ import logging
22
+ import uuid
23
+ import warnings
24
+ from typing import Dict, Any, Optional, List
25
+
26
+ import pandas as pd
27
+ import pypdfium2 as pdfium
28
+ from unstructured_client import UnstructuredClient
29
+ from unstructured_client.models import operations
30
+ from unstructured_client.models import shared
31
+ from unstructured_client.utils import BackoffStrategy
32
+ from unstructured_client.utils import RetryConfig
33
+
34
+ from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
35
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
36
+ from nv_ingest_api.internal.enums.common import ContentDescriptionEnum
37
+ from nv_ingest_api.internal.enums.common import TableFormatEnum
38
+ from nv_ingest_api.internal.enums.common import TextTypeEnum
39
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
40
+ from nv_ingest_api.util.metadata.aggregators import extract_pdf_metadata, construct_text_metadata
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
def unstructured_io_extractor(
    pdf_stream: io.BytesIO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_charts: bool,
    extract_tables: bool,
    extractor_config: Dict[str, Any],
    execution_trace_log: Optional[List[Any]] = None,
) -> pd.DataFrame:
    """
    Extract content from a bytestream PDF using the unstructured.io REST API.

    The PDF is sent to the unstructured.io partition endpoint and the response
    elements are converted into nv-ingest primitive rows (text, image, and
    table entries). Although flags for image, table, and infographics
    extraction are provided, the underlying API may not support all of these
    features.

    Parameters
    ----------
    pdf_stream : io.BytesIO
        A bytestream representing the PDF to be processed.
    extract_text : bool
        Specifies whether to extract text.
    extract_images : bool
        Specifies whether to extract images.
    extract_infographics : bool
        Specifies whether to extract infographics. Not supported by this
        backend; a debug message is logged when requested.
    extract_charts : bool
        Accepted for interface compatibility; chart extraction is not
        supported by this backend and the flag is ignored.
    extract_tables : bool
        Specifies whether to extract tables.
    extractor_config : dict
        A dictionary containing additional extraction parameters:
        - unstructured_api_key : API key for unstructured.io.
        - unstructured_url : URL for the unstructured.io API endpoint.
        - unstructured_strategy : Strategy for extraction (default: "auto").
        - unstructured_concurrency_level : Concurrency level for PDF splitting.
        - row_data : Row data containing source information.
        - text_depth : Depth of text extraction (e.g., "page").
        - identify_nearby_objects : Flag for identifying nearby objects.
        - metadata_column : Column name for metadata extraction.
    execution_trace_log : list, optional
        Unused; accepted for interface compatibility.

    Returns
    -------
    list
        A list of extracted primitives, each of the form
        ``[document_type, metadata_dict, uuid_str]``.
        NOTE(review): the annotated return type ``pd.DataFrame`` does not
        match the actual return value; callers receive a list.

    Raises
    ------
    ValueError
        If an invalid text_depth value is provided.
    SDKError
        If there is an error during the extraction process.
    """

    _ = execution_trace_log
    _ = extract_charts

    # Use the module-level logger; re-creating it locally just shadowed it.
    logger.debug("Extracting PDF with unstructured-io backend.")

    # unstructured.io credentials and endpoint.
    api_key = extractor_config.get("unstructured_api_key", None)
    unstructured_url = extractor_config.get("unstructured_url", "https://api.unstructured.io/general/v0/general")

    # Partition strategy; image/table extraction requires "hi_res".
    strategy = extractor_config.get("unstructured_strategy", "auto")
    if (strategy != "hi_res") and (extract_images or extract_tables):
        warnings.warn("'hi_res' strategy required when extracting images or tables")

    # Concurrency level used when the PDF is split for parallel partitioning.
    concurrency_level = extractor_config.get("unstructured_concurrency_level", 10)

    # Row data carries the identity of the source document being processed.
    row_data = extractor_config.get("row_data", None)
    source_id = row_data.get("source_id", None) if row_data is not None else None
    file_name = row_data.get("id", "_.pdf") if row_data is not None else "_.pdf"

    # Validate text_depth against the supported enumeration.
    text_depth_str = extractor_config.get("text_depth", "page")
    try:
        text_depth = TextTypeEnum[text_depth_str.upper()]
    except KeyError:
        valid_options = [e.name.lower() for e in TextTypeEnum]
        raise ValueError(f"Invalid text_depth value: {text_depth_str}. Expected one of: {valid_options}")

    # Whether to record text blocks near extracted images.
    identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)

    # Resolve the base unified metadata; row_data may be a pandas Series
    # (has .index) or a plain dict.
    metadata_col = extractor_config.get("metadata_column", "metadata")
    if row_data is not None and hasattr(row_data, "index") and metadata_col in row_data.index:
        base_unified_metadata = row_data[metadata_col]
    elif row_data is not None:
        base_unified_metadata = row_data.get(metadata_col, {})
    else:
        base_unified_metadata = {}

    if extract_infographics:
        logger.debug("Infographics extraction requested but not supported by unstructured-io extractor.")

    # Assemble source metadata from the incoming unified metadata.
    base_source_metadata = base_unified_metadata.get("source_metadata", {})
    source_location = base_source_metadata.get("source_location", "")
    collection_id = base_source_metadata.get("collection_id", "")
    partition_id = base_source_metadata.get("partition_id", -1)
    access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)

    source_metadata = {
        "source_name": file_name,
        "source_id": source_id,
        "source_location": source_location,
        "collection_id": collection_id,
        "summary": "",
        "partition_id": partition_id,
        "access_level": access_level,
    }

    # Pull document-level metadata (type, timestamps) out of the PDF itself.
    doc = pdfium.PdfDocument(pdf_stream)
    try:
        pdf_metadata = extract_pdf_metadata(doc, source_id)
    finally:
        # Release pdfium resources; the document was previously never closed.
        doc.close()

    document_metadata = {
        "source_type": pdf_metadata.source_type,
        "date_created": pdf_metadata.date_created,
        "last_modified": pdf_metadata.last_modified,
    }

    source_metadata.update(document_metadata)

    client = UnstructuredClient(
        retry_config=RetryConfig("backoff", BackoffStrategy(1, 50, 1.1, 100), False),
        server_url=unstructured_url,
        api_key_auth=api_key,
    )

    req = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            files=shared.Files(
                content=pdf_stream.getvalue(),
                file_name=file_name,
            ),
            strategy=strategy,
            languages=["eng"],
            coordinates=True,
            extract_image_block_types=["Image"] if extract_images else None,
            split_pdf_page=True,
            split_pdf_concurrency_level=concurrency_level,
        ),
    )

    res = client.general.partition(request=req)

    extracted_data = []
    accumulated_text = []
    curr_page = 1
    page_nearby_blocks = {
        "text": {"content": [], "bbox": []},
        "images": {"content": [], "bbox": []},
        "structured": {"content": [], "bbox": []},
    }

    # Extract content from each element of the partition response.
    for block_idx, item in enumerate(res.elements):
        # Text elements (everything that is not an image or a table).
        if extract_text and item["type"] not in ("Image", "Table"):
            if item["metadata"]["page_number"] != curr_page:
                # Page boundary crossed: flush accumulated page-level text.
                if text_depth == TextTypeEnum.PAGE:
                    text_extraction = construct_text_metadata(
                        accumulated_text,
                        pdf_metadata.page_count,
                        curr_page - 1,
                        -1,
                        text_depth,
                        source_metadata,
                        base_unified_metadata,
                    )

                    if len(text_extraction) > 0:
                        extracted_data.append(text_extraction)

                    accumulated_text = []

                # Nearby-object tracking is per page; reset on page change.
                page_nearby_blocks = {
                    "text": {"content": [], "bbox": []},
                    "images": {"content": [], "bbox": []},
                    "structured": {"content": [], "bbox": []},
                }
                curr_page = item["metadata"]["page_number"]

            accumulated_text.append(item["text"])

            if text_depth == TextTypeEnum.BLOCK:
                points = item["metadata"]["coordinates"]["points"]

                text_extraction = construct_text_metadata(
                    accumulated_text,
                    pdf_metadata.page_count,
                    item["metadata"]["page_number"] - 1,
                    block_idx,
                    text_depth,
                    source_metadata,
                    base_unified_metadata,
                    bbox=(points[0][0], points[0][1], points[2][0], points[2][1]),
                )

                if len(text_extraction) > 0:
                    extracted_data.append(text_extraction)

                accumulated_text = []

            if (extract_images and identify_nearby_objects) and (len(item["text"]) > 0):
                points = item["metadata"]["coordinates"]["points"]
                # BUG FIX: item["text"] is already a string; the previous
                # `" ".join(item["text"])` interleaved a space between every
                # character of the text.
                page_nearby_blocks["text"]["content"].append(item["text"])
                page_nearby_blocks["text"]["bbox"].append((points[0][0], points[0][1], points[2][0], points[2][1]))

        # Image elements.
        if extract_images and item["type"] == "Image":
            base64_img = item["metadata"]["image_base64"]
            points = item["metadata"]["coordinates"]["points"]

            image_extraction = _construct_image_metadata(
                base64_img,
                item["text"],
                pdf_metadata.page_count,
                item["metadata"]["page_number"] - 1,
                block_idx,
                source_metadata,
                base_unified_metadata,
                page_nearby_blocks,
                bbox=(points[0][0], points[0][1], points[2][0], points[2][1]),
            )

            extracted_data.append(image_extraction)

        # Table elements (HTML representation from the API).
        if extract_tables and item["type"] == "Table":
            table = item["metadata"]["text_as_html"]
            points = item["metadata"]["coordinates"]["points"]

            table_extraction = _construct_table_metadata(
                table,
                pdf_metadata.page_count,
                item["metadata"]["page_number"] - 1,
                block_idx,
                source_metadata,
                base_unified_metadata,
                bbox=(points[0][0], points[0][1], points[2][0], points[2][1]),
            )

            extracted_data.append(table_extraction)

    # Flush any remaining accumulated text for the final page / document.
    if extract_text and text_depth == TextTypeEnum.PAGE:
        text_extraction = construct_text_metadata(
            accumulated_text,
            pdf_metadata.page_count,
            curr_page - 1,
            -1,
            text_depth,
            source_metadata,
            base_unified_metadata,
        )

        if len(text_extraction) > 0:
            extracted_data.append(text_extraction)

    elif extract_text and text_depth == TextTypeEnum.DOCUMENT:
        text_extraction = construct_text_metadata(
            accumulated_text,
            pdf_metadata.page_count,
            -1,
            -1,
            text_depth,
            source_metadata,
            base_unified_metadata,
        )

        if len(text_extraction) > 0:
            extracted_data.append(text_extraction)

    return extracted_data
334
+
335
+
336
+ def _construct_image_metadata(
337
+ image,
338
+ image_text,
339
+ page_count,
340
+ page_idx,
341
+ block_idx,
342
+ source_metadata,
343
+ base_unified_metadata,
344
+ page_nearby_blocks,
345
+ bbox,
346
+ ):
347
+ content_metadata = {
348
+ "type": ContentTypeEnum.IMAGE,
349
+ "description": ContentDescriptionEnum.PDF_IMAGE,
350
+ "page_number": page_idx,
351
+ "hierarchy": {
352
+ "page_count": page_count,
353
+ "page": page_idx,
354
+ "block": block_idx,
355
+ "line": -1,
356
+ "span": -1,
357
+ "nearby_objects": page_nearby_blocks,
358
+ },
359
+ }
360
+
361
+ image_metadata = {
362
+ "image_type": DocumentTypeEnum.JPEG,
363
+ "structured_image_type": ContentTypeEnum.UNKNOWN,
364
+ "caption": "",
365
+ "text": image_text,
366
+ "image_location": bbox,
367
+ }
368
+
369
+ unified_metadata = base_unified_metadata.copy()
370
+
371
+ unified_metadata.update(
372
+ {
373
+ "content": image,
374
+ "source_metadata": source_metadata,
375
+ "content_metadata": content_metadata,
376
+ "image_metadata": image_metadata,
377
+ }
378
+ )
379
+
380
+ validated_unified_metadata = validate_metadata(unified_metadata)
381
+
382
+ return [ContentTypeEnum.IMAGE.value, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
383
+
384
+
385
+ def _construct_table_metadata(
386
+ table,
387
+ page_count,
388
+ page_idx,
389
+ block_idx,
390
+ source_metadata,
391
+ base_unified_metadata,
392
+ bbox,
393
+ ):
394
+ content_metadata = {
395
+ "type": ContentTypeEnum.STRUCTURED,
396
+ "description": ContentDescriptionEnum.PDF_TABLE,
397
+ "page_number": page_idx,
398
+ "hierarchy": {
399
+ "page_count": page_count,
400
+ "page": page_idx,
401
+ "block": block_idx,
402
+ "line": -1,
403
+ "span": -1,
404
+ },
405
+ }
406
+
407
+ table_metadata = {
408
+ "caption": "",
409
+ "table_format": TableFormatEnum.HTML,
410
+ "table_location": bbox,
411
+ }
412
+
413
+ unified_metadata = base_unified_metadata.copy()
414
+
415
+ unified_metadata.update(
416
+ {
417
+ "content": table,
418
+ "source_metadata": source_metadata,
419
+ "content_metadata": content_metadata,
420
+ "table_metadata": table_metadata,
421
+ }
422
+ )
423
+
424
+ validated_unified_metadata = validate_metadata(unified_metadata)
425
+
426
+ return [ContentTypeEnum.STRUCTURED.value, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
@@ -0,0 +1,74 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Copyright (c) 2024, NVIDIA CORPORATION.
5
+
6
+ import pandas as pd
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+ import logging
9
+
10
+ from nv_ingest_api.internal.extract.pdf.engines.pdf_helpers import _orchestrate_row_extraction
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def extract_primitives_from_pdf_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    extractor_config: Any,
    execution_trace_log: Optional[List[Any]] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Process a DataFrame of PDF documents by orchestrating extraction for each row.

    Applies the row-level orchestration function to every row in the ledger,
    flattens the per-row results, and returns a new DataFrame with the
    extracted primitives along with any trace information.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        A pandas DataFrame containing PDF documents. Must include a 'content'
        column with base64-encoded PDF data.
    task_config : dict
        Per-task configuration passed through to the row-level orchestrator.
    extractor_config : Any
        Extraction configuration passed through to the row-level orchestrator.
    execution_trace_log : list, optional
        A list for accumulating trace information during extraction.
        Defaults to None.

    Returns
    -------
    tuple of (pd.DataFrame, dict)
        A tuple where the first element is a DataFrame with the extracted data
        (columns: document_type, metadata, uuid) and the second element is a
        dictionary containing trace information.

    Raises
    ------
    Exception
        If an error occurs during the extraction process on any row; the
        original exception type is re-raised with added context.
    """
    try:
        # Apply the orchestration function to each ledger row.
        extraction_series = df_extraction_ledger.apply(
            lambda row: _orchestrate_row_extraction(row, task_config, extractor_config, execution_trace_log),
            axis=1,
        )
        # Each row may yield a list of primitives; flatten and drop empties.
        extraction_series = extraction_series.explode().dropna()

        # Convert the flattened results into the canonical output frame.
        if not extraction_series.empty:
            extracted_df = pd.DataFrame(extraction_series.to_list(), columns=["document_type", "metadata", "uuid"])
        else:
            extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

        return extracted_df, {"execution_trace_log": execution_trace_log}
    except Exception as e:
        err_msg = f"extract_primitives_from_pdf: Error processing PDF bytes: {e}"
        logger.error(err_msg, exc_info=True)
        # NOTE(review): re-raising via type(e)(err_msg) assumes the exception
        # type accepts a single message argument; exception classes with
        # different constructors will surface a TypeError here instead.
        raise type(e)(err_msg) from e
@@ -0,0 +1,5 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2024, NVIDIA CORPORATION.