nv-ingest-api 2025.4.18.dev20250418__py3-none-any.whl → 2025.4.20.dev20250420__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. See the registry's advisory page for more details.

Files changed (153)
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +9 -0
  6. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/WHEEL +1 -1
  7. nv_ingest_api/interface/__init__.py +0 -215
  8. nv_ingest_api/interface/extract.py +0 -972
  9. nv_ingest_api/interface/mutate.py +0 -154
  10. nv_ingest_api/interface/store.py +0 -218
  11. nv_ingest_api/interface/transform.py +0 -382
  12. nv_ingest_api/interface/utility.py +0 -200
  13. nv_ingest_api/internal/enums/__init__.py +0 -3
  14. nv_ingest_api/internal/enums/common.py +0 -494
  15. nv_ingest_api/internal/extract/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  17. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  18. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  19. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  20. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  23. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  24. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  25. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  26. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  27. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  28. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  29. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  30. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  31. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  32. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  33. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  34. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  35. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  36. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  37. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  38. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  39. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  40. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  41. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  42. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  43. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  44. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  45. nv_ingest_api/internal/mutate/__init__.py +0 -3
  46. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  47. nv_ingest_api/internal/mutate/filter.py +0 -133
  48. nv_ingest_api/internal/primitives/__init__.py +0 -0
  49. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  50. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  51. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  52. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  53. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  54. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  55. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
  56. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  57. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
  58. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  59. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  60. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  61. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  62. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  63. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  64. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  65. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  66. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  67. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  68. nv_ingest_api/internal/schemas/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  70. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  71. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  72. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  74. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  75. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  76. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  77. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  78. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  79. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  80. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  81. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  82. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  83. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  84. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  85. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  86. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  87. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  88. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  89. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  90. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  91. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  92. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  93. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  94. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  95. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  96. nv_ingest_api/internal/store/__init__.py +0 -3
  97. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  98. nv_ingest_api/internal/store/image_upload.py +0 -232
  99. nv_ingest_api/internal/transform/__init__.py +0 -3
  100. nv_ingest_api/internal/transform/caption_image.py +0 -205
  101. nv_ingest_api/internal/transform/embed_text.py +0 -496
  102. nv_ingest_api/internal/transform/split_text.py +0 -157
  103. nv_ingest_api/util/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/__init__.py +0 -0
  105. nv_ingest_api/util/control_message/validators.py +0 -47
  106. nv_ingest_api/util/converters/__init__.py +0 -0
  107. nv_ingest_api/util/converters/bytetools.py +0 -78
  108. nv_ingest_api/util/converters/containers.py +0 -65
  109. nv_ingest_api/util/converters/datetools.py +0 -90
  110. nv_ingest_api/util/converters/dftools.py +0 -127
  111. nv_ingest_api/util/converters/formats.py +0 -64
  112. nv_ingest_api/util/converters/type_mappings.py +0 -27
  113. nv_ingest_api/util/detectors/__init__.py +0 -5
  114. nv_ingest_api/util/detectors/language.py +0 -38
  115. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  116. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  117. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  118. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  119. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  120. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  121. nv_ingest_api/util/image_processing/__init__.py +0 -5
  122. nv_ingest_api/util/image_processing/clustering.py +0 -260
  123. nv_ingest_api/util/image_processing/processing.py +0 -179
  124. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  125. nv_ingest_api/util/image_processing/transforms.py +0 -407
  126. nv_ingest_api/util/logging/__init__.py +0 -0
  127. nv_ingest_api/util/logging/configuration.py +0 -31
  128. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  129. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  130. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  131. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  132. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
  133. nv_ingest_api/util/metadata/__init__.py +0 -5
  134. nv_ingest_api/util/metadata/aggregators.py +0 -469
  135. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  136. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  137. nv_ingest_api/util/nim/__init__.py +0 -56
  138. nv_ingest_api/util/pdf/__init__.py +0 -3
  139. nv_ingest_api/util/pdf/pdfium.py +0 -427
  140. nv_ingest_api/util/schema/__init__.py +0 -0
  141. nv_ingest_api/util/schema/schema_validator.py +0 -10
  142. nv_ingest_api/util/service_clients/__init__.py +0 -3
  143. nv_ingest_api/util/service_clients/client_base.py +0 -86
  144. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  145. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  146. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
  147. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  148. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
  149. nv_ingest_api/util/string_processing/__init__.py +0 -51
  150. nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +0 -152
  151. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/top_level.txt +0 -0
@@ -1,426 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- # Copyright (c) 2024, NVIDIA CORPORATION.
7
- #
8
- # Licensed under the Apache License, Version 2.0 (the "License");
9
- # you may not use this file except in compliance with the License.
10
- # You may obtain a copy of the License at
11
- #
12
- # http://www.apache.org/licenses/LICENSE-2.0
13
- #
14
- # Unless required by applicable law or agreed to in writing, software
15
- # distributed under the License is distributed on an "AS IS" BASIS,
16
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
- # See the License for the specific language governing permissions and
18
- # limitations under the License.
19
-
20
- import io
21
- import logging
22
- import uuid
23
- import warnings
24
- from typing import Dict, Any, Optional, List
25
-
26
- import pandas as pd
27
- import pypdfium2 as pdfium
28
- from unstructured_client import UnstructuredClient
29
- from unstructured_client.models import operations
30
- from unstructured_client.models import shared
31
- from unstructured_client.utils import BackoffStrategy
32
- from unstructured_client.utils import RetryConfig
33
-
34
- from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
35
- from nv_ingest_api.internal.enums.common import ContentTypeEnum
36
- from nv_ingest_api.internal.enums.common import ContentDescriptionEnum
37
- from nv_ingest_api.internal.enums.common import TableFormatEnum
38
- from nv_ingest_api.internal.enums.common import TextTypeEnum
39
- from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
40
- from nv_ingest_api.util.metadata.aggregators import extract_pdf_metadata, construct_text_metadata
41
-
42
- logger = logging.getLogger(__name__)
43
-
44
-
45
- def unstructured_io_extractor(
46
- pdf_stream: io.BytesIO,
47
- extract_text: bool,
48
- extract_images: bool,
49
- extract_infographics: bool,
50
- extract_charts: bool,
51
- extract_tables: bool,
52
- extractor_config: Dict[str, Any],
53
- execution_trace_log: Optional[List[Any]] = None,
54
- ) -> pd.DataFrame:
55
- """
56
- Helper function to use unstructured-io REST API to extract text from a bytestream PDF.
57
-
58
- This function sends the provided PDF stream to the unstructured-io API and
59
- returns the extracted text. Additional parameters for the extraction are
60
- provided via the extractor_config dictionary. Note that although flags for
61
- image, table, and infographics extraction are provided, the underlying API
62
- may not support all of these features.
63
-
64
- Parameters
65
- ----------
66
- pdf_stream : io.BytesIO
67
- A bytestream representing the PDF to be processed.
68
- extract_text : bool
69
- Specifies whether to extract text.
70
- extract_images : bool
71
- Specifies whether to extract images.
72
- extract_infographics : bool
73
- Specifies whether to extract infographics.
74
- extract_tables : bool
75
- Specifies whether to extract tables.
76
- extractor_config : dict
77
- A dictionary containing additional extraction parameters:
78
- - unstructured_api_key : API key for unstructured.io.
79
- - unstructured_url : URL for the unstructured.io API endpoint.
80
- - unstructured_strategy : Strategy for extraction (default: "auto").
81
- - unstructured_concurrency_level : Concurrency level for PDF splitting.
82
- - row_data : Row data containing source information.
83
- - text_depth : Depth of text extraction (e.g., "page").
84
- - identify_nearby_objects : Flag for identifying nearby objects.
85
- - metadata_column : Column name for metadata extraction.
86
-
87
- Returns
88
- -------
89
- str
90
- A string containing the extracted text.
91
-
92
- Raises
93
- ------
94
- ValueError
95
- If an invalid text_depth value is provided.
96
- SDKError
97
- If there is an error during the extraction process.
98
- """
99
-
100
- _ = execution_trace_log
101
- _ = extract_charts
102
-
103
- logger = logging.getLogger(__name__)
104
- logger.debug("Extracting PDF with unstructured-io backend.")
105
-
106
- # Get unstructured.io API key
107
- api_key = extractor_config.get("unstructured_api_key", None)
108
-
109
- # Get unstructured.io URL
110
- unstructured_url = extractor_config.get("unstructured_url", "https://api.unstructured.io/general/v0/general")
111
-
112
- # Get unstructured.io strategy
113
- strategy = extractor_config.get("unstructured_strategy", "auto")
114
- if (strategy != "hi_res") and (extract_images or extract_tables):
115
- warnings.warn("'hi_res' strategy required when extracting images or tables")
116
-
117
- # Get unstructured.io split PDF concurrency level
118
- concurrency_level = extractor_config.get("unstructured_concurrency_level", 10)
119
-
120
- # Get row_data from configuration
121
- row_data = extractor_config.get("row_data", None)
122
-
123
- # Get source_id and file name from row_data
124
- source_id = row_data.get("source_id", None) if row_data is not None else None
125
- file_name = row_data.get("id", "_.pdf") if row_data is not None else "_.pdf"
126
-
127
- # Get and validate text_depth
128
- text_depth_str = extractor_config.get("text_depth", "page")
129
- try:
130
- text_depth = TextTypeEnum[text_depth_str.upper()]
131
- except KeyError:
132
- valid_options = [e.name.lower() for e in TextTypeEnum]
133
- raise ValueError(f"Invalid text_depth value: {text_depth_str}. Expected one of: {valid_options}")
134
-
135
- # Optional setting: identify_nearby_objects
136
- identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)
137
-
138
- # Get base metadata
139
- metadata_col = extractor_config.get("metadata_column", "metadata")
140
- if row_data is not None and hasattr(row_data, "index") and metadata_col in row_data.index:
141
- base_unified_metadata = row_data[metadata_col]
142
- elif row_data is not None:
143
- base_unified_metadata = row_data.get(metadata_col, {})
144
- else:
145
- base_unified_metadata = {}
146
-
147
- # Handle infographics flag
148
- if extract_infographics:
149
- logger.debug("Infographics extraction requested but not supported by unstructured-io extractor.")
150
-
151
- # get base source_metadata
152
- base_source_metadata = base_unified_metadata.get("source_metadata", {})
153
- # get source_location
154
- source_location = base_source_metadata.get("source_location", "")
155
- # get collection_id (assuming coming in from source_metadata...)
156
- collection_id = base_source_metadata.get("collection_id", "")
157
- # get partition_id (assuming coming in from source_metadata...)
158
- partition_id = base_source_metadata.get("partition_id", -1)
159
- # get access_level (assuming coming in from source_metadata...)
160
- access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)
161
-
162
- source_metadata = {
163
- "source_name": file_name,
164
- "source_id": source_id,
165
- "source_location": source_location,
166
- "collection_id": collection_id,
167
- "summary": "",
168
- "partition_id": partition_id,
169
- "access_level": access_level,
170
- }
171
-
172
- doc = pdfium.PdfDocument(pdf_stream)
173
- pdf_metadata = extract_pdf_metadata(doc, source_id)
174
-
175
- document_metadata = {
176
- "source_type": pdf_metadata.source_type,
177
- "date_created": pdf_metadata.date_created,
178
- "last_modified": pdf_metadata.last_modified,
179
- }
180
-
181
- source_metadata.update(document_metadata)
182
-
183
- client = UnstructuredClient(
184
- retry_config=RetryConfig("backoff", BackoffStrategy(1, 50, 1.1, 100), False),
185
- server_url=unstructured_url,
186
- api_key_auth=api_key,
187
- )
188
-
189
- req = operations.PartitionRequest(
190
- partition_parameters=shared.PartitionParameters(
191
- files=shared.Files(
192
- content=pdf_stream.getvalue(),
193
- file_name=file_name,
194
- ),
195
- strategy=strategy,
196
- languages=["eng"],
197
- coordinates=True,
198
- extract_image_block_types=["Image"] if extract_images else None,
199
- split_pdf_page=True,
200
- split_pdf_concurrency_level=concurrency_level,
201
- ),
202
- )
203
-
204
- res = client.general.partition(request=req)
205
-
206
- extracted_data = []
207
- accumulated_text = []
208
- curr_page = 1
209
- page_nearby_blocks = {
210
- "text": {"content": [], "bbox": []},
211
- "images": {"content": [], "bbox": []},
212
- "structured": {"content": [], "bbox": []},
213
- }
214
-
215
- # Extract content from each element of partition response
216
- for block_idx, item in enumerate(res.elements):
217
- # Extract text
218
- if extract_text and item["type"] not in ("Image", "Table"):
219
- if item["metadata"]["page_number"] != curr_page:
220
- if text_depth == TextTypeEnum.PAGE:
221
- text_extraction = construct_text_metadata(
222
- accumulated_text,
223
- pdf_metadata.page_count,
224
- curr_page - 1,
225
- -1,
226
- text_depth,
227
- source_metadata,
228
- base_unified_metadata,
229
- )
230
-
231
- if len(text_extraction) > 0:
232
- extracted_data.append(text_extraction)
233
-
234
- accumulated_text = []
235
-
236
- page_nearby_blocks = {
237
- "text": {"content": [], "bbox": []},
238
- "images": {"content": [], "bbox": []},
239
- "structured": {"content": [], "bbox": []},
240
- }
241
- curr_page = item["metadata"]["page_number"]
242
-
243
- accumulated_text.append(item["text"])
244
-
245
- if text_depth == TextTypeEnum.BLOCK:
246
- points = item["metadata"]["coordinates"]["points"]
247
-
248
- text_extraction = construct_text_metadata(
249
- accumulated_text,
250
- pdf_metadata.page_count,
251
- item["metadata"]["page_number"] - 1,
252
- block_idx,
253
- text_depth,
254
- source_metadata,
255
- base_unified_metadata,
256
- bbox=(points[0][0], points[0][1], points[2][0], points[2][1]),
257
- )
258
-
259
- if len(text_extraction) > 0:
260
- extracted_data.append(text_extraction)
261
-
262
- accumulated_text = []
263
-
264
- if (extract_images and identify_nearby_objects) and (len(item["text"]) > 0):
265
- points = item["metadata"]["coordinates"]["points"]
266
- page_nearby_blocks["text"]["content"].append(" ".join(item["text"]))
267
- page_nearby_blocks["text"]["bbox"].append((points[0][0], points[0][1], points[2][0], points[2][1]))
268
-
269
- # Extract images
270
- if extract_images and item["type"] == "Image":
271
- base64_img = item["metadata"]["image_base64"]
272
- points = item["metadata"]["coordinates"]["points"]
273
-
274
- image_extraction = _construct_image_metadata(
275
- base64_img,
276
- item["text"],
277
- pdf_metadata.page_count,
278
- item["metadata"]["page_number"] - 1,
279
- block_idx,
280
- source_metadata,
281
- base_unified_metadata,
282
- page_nearby_blocks,
283
- bbox=(points[0][0], points[0][1], points[2][0], points[2][1]),
284
- )
285
-
286
- extracted_data.append(image_extraction)
287
-
288
- # Extract tables
289
- if extract_tables and item["type"] == "Table":
290
- table = item["metadata"]["text_as_html"]
291
- points = item["metadata"]["coordinates"]["points"]
292
-
293
- table_extraction = _construct_table_metadata(
294
- table,
295
- pdf_metadata.page_count,
296
- item["metadata"]["page_number"] - 1,
297
- block_idx,
298
- source_metadata,
299
- base_unified_metadata,
300
- bbox=(points[0][0], points[0][1], points[2][0], points[2][1]),
301
- )
302
-
303
- extracted_data.append(table_extraction)
304
-
305
- if extract_text and text_depth == TextTypeEnum.PAGE:
306
- text_extraction = construct_text_metadata(
307
- accumulated_text,
308
- pdf_metadata.page_count,
309
- curr_page - 1,
310
- -1,
311
- text_depth,
312
- source_metadata,
313
- base_unified_metadata,
314
- )
315
-
316
- if len(text_extraction) > 0:
317
- extracted_data.append(text_extraction)
318
-
319
- elif extract_text and text_depth == TextTypeEnum.DOCUMENT:
320
- text_extraction = construct_text_metadata(
321
- accumulated_text,
322
- pdf_metadata.page_count,
323
- -1,
324
- -1,
325
- text_depth,
326
- source_metadata,
327
- base_unified_metadata,
328
- )
329
-
330
- if len(text_extraction) > 0:
331
- extracted_data.append(text_extraction)
332
-
333
- return extracted_data
334
-
335
-
336
- def _construct_image_metadata(
337
- image,
338
- image_text,
339
- page_count,
340
- page_idx,
341
- block_idx,
342
- source_metadata,
343
- base_unified_metadata,
344
- page_nearby_blocks,
345
- bbox,
346
- ):
347
- content_metadata = {
348
- "type": ContentTypeEnum.IMAGE,
349
- "description": ContentDescriptionEnum.PDF_IMAGE,
350
- "page_number": page_idx,
351
- "hierarchy": {
352
- "page_count": page_count,
353
- "page": page_idx,
354
- "block": block_idx,
355
- "line": -1,
356
- "span": -1,
357
- "nearby_objects": page_nearby_blocks,
358
- },
359
- }
360
-
361
- image_metadata = {
362
- "image_type": DocumentTypeEnum.JPEG,
363
- "structured_image_type": ContentTypeEnum.UNKNOWN,
364
- "caption": "",
365
- "text": image_text,
366
- "image_location": bbox,
367
- }
368
-
369
- unified_metadata = base_unified_metadata.copy()
370
-
371
- unified_metadata.update(
372
- {
373
- "content": image,
374
- "source_metadata": source_metadata,
375
- "content_metadata": content_metadata,
376
- "image_metadata": image_metadata,
377
- }
378
- )
379
-
380
- validated_unified_metadata = validate_metadata(unified_metadata)
381
-
382
- return [ContentTypeEnum.IMAGE.value, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
383
-
384
-
385
- def _construct_table_metadata(
386
- table,
387
- page_count,
388
- page_idx,
389
- block_idx,
390
- source_metadata,
391
- base_unified_metadata,
392
- bbox,
393
- ):
394
- content_metadata = {
395
- "type": ContentTypeEnum.STRUCTURED,
396
- "description": ContentDescriptionEnum.PDF_TABLE,
397
- "page_number": page_idx,
398
- "hierarchy": {
399
- "page_count": page_count,
400
- "page": page_idx,
401
- "block": block_idx,
402
- "line": -1,
403
- "span": -1,
404
- },
405
- }
406
-
407
- table_metadata = {
408
- "caption": "",
409
- "table_format": TableFormatEnum.HTML,
410
- "table_location": bbox,
411
- }
412
-
413
- unified_metadata = base_unified_metadata.copy()
414
-
415
- unified_metadata.update(
416
- {
417
- "content": table,
418
- "source_metadata": source_metadata,
419
- "content_metadata": content_metadata,
420
- "table_metadata": table_metadata,
421
- }
422
- )
423
-
424
- validated_unified_metadata = validate_metadata(unified_metadata)
425
-
426
- return [ContentTypeEnum.STRUCTURED.value, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
@@ -1,74 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
- # Copyright (c) 2024, NVIDIA CORPORATION.
5
-
6
- import pandas as pd
7
- from typing import Any, Dict, List, Optional, Tuple
8
- import logging
9
-
10
- from nv_ingest_api.internal.extract.pdf.engines.pdf_helpers import _orchestrate_row_extraction
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
- def extract_primitives_from_pdf_internal(
16
- df_extraction_ledger: pd.DataFrame,
17
- task_config: Dict[str, Any],
18
- extractor_config: Any,
19
- execution_trace_log: Optional[List[Any]] = None,
20
- ) -> Tuple[pd.DataFrame, Dict]:
21
- """
22
- Process a DataFrame of PDF documents by orchestrating extraction for each row.
23
-
24
- This function applies the row-level orchestration function to every row in the
25
- DataFrame, aggregates the results, and returns a new DataFrame with the extracted
26
- data along with any trace information.
27
-
28
- Parameters
29
- ----------
30
- df_extraction_ledger : pd.DataFrame
31
- A pandas DataFrame containing PDF documents. Must include a 'content' column
32
- with base64-encoded PDF data.
33
- task_config: dict
34
- A dictionary of configuration parameters. Expected to include 'task_properties'
35
- and 'validated_config' keys.
36
- extractor_config: Any
37
- A dictionary of configuration parameters for the extraction process.
38
- execution_trace_log : list, optional
39
- A list for accumulating trace information during extraction. Defaults to None.
40
-
41
- Returns
42
- -------
43
- tuple of (pd.DataFrame, dict)
44
- A tuple where the first element is a DataFrame with the extracted data (with
45
- columns: document_type, metadata, uuid) and the second element is a dictionary
46
- containing trace information.
47
-
48
- Raises
49
- ------
50
- Exception
51
- If an error occurs during the extraction process on any row.
52
- """
53
- try:
54
- task_config = task_config
55
- extractor_config = extractor_config
56
-
57
- # Apply the orchestration function to each row.
58
- extraction_series = df_extraction_ledger.apply(
59
- lambda row: _orchestrate_row_extraction(row, task_config, extractor_config, execution_trace_log), axis=1
60
- )
61
- # Explode the results if the extraction returns lists.
62
- extraction_series = extraction_series.explode().dropna()
63
-
64
- # Convert the extracted results into a DataFrame.
65
- if not extraction_series.empty:
66
- extracted_df = pd.DataFrame(extraction_series.to_list(), columns=["document_type", "metadata", "uuid"])
67
- else:
68
- extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
69
-
70
- return extracted_df, {"execution_trace_log": execution_trace_log}
71
- except Exception as e:
72
- err_msg = f"extract_primitives_from_pdf: Error processing PDF bytes: {e}"
73
- logger.error(err_msg, exc_info=True)
74
- raise type(e)(err_msg) from e
@@ -1,5 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- # Copyright (c) 2024, NVIDIA CORPORATION.
File without changes