nv_ingest_api-26.1.0rc4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api might be problematic.

Files changed (177)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py
@@ -0,0 +1,598 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ # Copyright (c) 2024, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import io
+ import logging
+ import math
+ import uuid
+ import concurrent.futures
+ from typing import Any
+ from typing import Dict
+ from typing import Tuple
+ from typing import Optional
+ from typing import List
+
+ import numpy as np
+ import pypdfium2 as pdfium
+
+ from nv_ingest_api.internal.extract.pdf.engines.pdfium import _extract_page_elements
+ from nv_ingest_api.internal.primitives.nim.model_interface import nemotron_parse as nemotron_parse_utils
+ from nv_ingest_api.internal.enums.common import AccessLevelEnum
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
+ from nv_ingest_api.internal.enums.common import ContentDescriptionEnum
+ from nv_ingest_api.internal.enums.common import TableFormatEnum
+ from nv_ingest_api.internal.enums.common import TextTypeEnum
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
+     YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
+     YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
+     YOLOX_PAGE_IMAGE_FORMAT,
+ )
+ from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import NemotronParseConfigSchema
+ from nv_ingest_api.util.metadata.aggregators import (
+     extract_pdf_metadata,
+     LatexTable,
+     Base64Image,
+     construct_image_metadata_from_pdf_image,
+     construct_text_metadata,
+ )
+ from nv_ingest_api.util.pdf.pdfium import pdfium_pages_to_numpy
+ from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
+ from nv_ingest_api.util.exception_handlers.pdf import pdfium_exception_handler
+ from nv_ingest_api.util.image_processing.transforms import numpy_to_base64, crop_image
+ from nv_ingest_api.util.nim import create_inference_client
+
+
+ logger = logging.getLogger(__name__)
+
+ NEMOTRON_PARSE_RENDER_DPI = 300
+ NEMOTRON_PARSE_MAX_WIDTH = 1024
+ NEMOTRON_PARSE_MAX_HEIGHT = 1280
+ NEMOTRON_PARSE_MAX_BATCH_SIZE = 8
+
+
+ # Define a helper function to use nemotron_parse to extract text from a base64 encoded bytestream PDF
+ def nemotron_parse_extractor(
+     pdf_stream: io.BytesIO,
+     extract_text: bool,
+     extract_images: bool,
+     extract_infographics: bool,
+     extract_tables: bool,
+     extract_charts: bool,
+     extractor_config: dict,
+     execution_trace_log: Optional[List[Any]] = None,
+ ) -> list:
+     """
+     Helper function to use nemotron_parse to extract text from a bytestream PDF.
+
+     Parameters
+     ----------
+     pdf_stream : io.BytesIO
+         A bytestream PDF.
+     extract_text : bool
+         Specifies whether to extract text.
+     extract_images : bool
+         Specifies whether to extract images.
+     extract_infographics : bool
+         Specifies whether to extract infographics.
+     extract_tables : bool
+         Specifies whether to extract tables.
+     extract_charts : bool
+         Specifies whether to extract charts.
+     extractor_config : dict
+         A dictionary containing additional extraction parameters. Expected keys include:
+         - row_data : dict
+         - text_depth : str, optional (default is "page")
+         - extract_tables_method : str, optional (default is "yolox")
+         - identify_nearby_objects : bool, optional (default is True)
+         - table_output_format : str, optional (default is "pseudo_markdown")
+         - pdfium_config : dict, optional (configuration for PDFium)
+         - nemotron_parse_config : dict, optional (configuration for Nemotron Parse)
+         - metadata_column : str, optional (default is "metadata")
+     execution_trace_log : Optional[List], optional
+         Trace information for debugging purposes (default is None).
+
+     Returns
+     -------
+     list
+         A list of extracted content elements with their associated metadata.
+
+     Raises
+     ------
+     ValueError
+         If required keys are missing in extractor_config or invalid values are provided.
+     KeyError
+         If required keys are missing in row_data.
+     """
+     logger = logging.getLogger(__name__)
+     logger.debug("Extracting PDF with nemotron_parse backend.")
+
+     # Retrieve row_data from extractor_config.
+     row_data = extractor_config.get("row_data")
+     if row_data is None:
+         raise ValueError("Missing 'row_data' in extractor_config.")
+
+     # Get source_id from row_data.
+     try:
+         source_id = row_data["source_id"]
+     except KeyError:
+         raise KeyError("row_data must contain 'source_id'.")
+
+     # Get and validate text_depth.
+     text_depth_str = extractor_config.get("text_depth", "page")
+     try:
+         text_depth = TextTypeEnum[text_depth_str.upper()]
+     except KeyError:
+         valid_options = [e.name.lower() for e in TextTypeEnum]
+         raise ValueError(f"Invalid text_depth value: {text_depth_str}. Expected one of: {valid_options}")
+
+     # Get extraction method for tables.
+     extract_tables_method = extractor_config.get("extract_tables_method", "yolox")
+
+     # Flag for identifying nearby objects.
+     identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)
+
+     # Get and validate table_output_format.
+     table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
+     try:
+         table_output_format = TableFormatEnum[table_output_format_str.upper()]
+     except KeyError:
+         valid_options = [e.name.lower() for e in TableFormatEnum]
+         raise ValueError(
+             f"Invalid table_output_format value: {table_output_format_str}. Expected one of: {valid_options}"
+         )
+
+     # Process nemotron_parse configuration.
+     nemotron_parse_config_raw = extractor_config.get("nemotron_parse_config", {})
+     if isinstance(nemotron_parse_config_raw, dict):
+         nemotron_parse_config = NemotronParseConfigSchema(**nemotron_parse_config_raw)
+     elif isinstance(nemotron_parse_config_raw, NemotronParseConfigSchema):
+         nemotron_parse_config = nemotron_parse_config_raw
+     else:
+         raise ValueError("`nemotron_parse_config` must be a dictionary or a NemotronParseConfigSchema instance.")
+
+     # Get base metadata.
+     metadata_col = extractor_config.get("metadata_column", "metadata")
+     if hasattr(row_data, "index") and metadata_col in row_data.index:
+         base_unified_metadata = row_data[metadata_col]
+     else:
+         base_unified_metadata = row_data.get(metadata_col, {})
+
+     # get base source_metadata
+     base_source_metadata = base_unified_metadata.get("source_metadata", {})
+     # get source_location
+     source_location = base_source_metadata.get("source_location", "")
+     # get collection_id (assuming coming in from source_metadata...)
+     collection_id = base_source_metadata.get("collection_id", "")
+     # get partition_id (assuming coming in from source_metadata...)
+     partition_id = base_source_metadata.get("partition_id", -1)
+     # get access_level (assuming coming in from source_metadata...)
+     access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)
+
+     extracted_data = []
+     doc = pdfium.PdfDocument(pdf_stream)
+     pdf_metadata = extract_pdf_metadata(doc, source_id)
+     page_count = pdf_metadata.page_count
+
+     source_metadata = {
+         "source_name": pdf_metadata.filename,
+         "source_id": source_id,
+         "source_location": source_location,
+         "source_type": pdf_metadata.source_type,
+         "collection_id": collection_id,
+         "date_created": pdf_metadata.date_created,
+         "last_modified": pdf_metadata.last_modified,
+         "summary": "",
+         "partition_id": partition_id,
+         "access_level": access_level,
+     }
+
+     accumulated_text = []
+     accumulated_tables = []
+     accumulated_images = []
+
+     pages_for_ocr = []  # We'll accumulate (page_idx, np_image) here
+     pages_for_tables = []  # We'll accumulate (page_idx, np_image) here
+     futures = []  # We'll keep track of all the Future objects for table/charts
+
+     nemotron_parse_client = None
+     if extract_text:
+         nemotron_parse_client = _create_clients(nemotron_parse_config)
+
+     max_workers = nemotron_parse_config.workers_per_progress_engine
+     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+
+         for page_idx in range(page_count):
+             page = doc.get_page(page_idx)
+
+             page_image, padding_offset = _convert_pdfium_page_to_numpy_for_parser(page)
+             pages_for_ocr.append((page_idx, page_image))
+             page_image_for_tables, padding_offset_for_tables = _convert_pdfium_page_to_numpy_for_yolox(page)
+             pages_for_tables.append((page_idx, page_image_for_tables, padding_offset_for_tables))
+
+             page.close()
+
+             # Whenever pages_as_images hits NEMOTRON_PARSE_MAX_BATCH_SIZE, submit a job
+             if (extract_text) and (len(pages_for_ocr) >= NEMOTRON_PARSE_MAX_BATCH_SIZE):
+                 future_parser = executor.submit(
+                     lambda *args, **kwargs: ("parser", _extract_text_and_bounding_boxes(*args, **kwargs)),
+                     pages_for_ocr[:],  # pass a copy
+                     nemotron_parse_client,
+                     execution_trace_log=execution_trace_log,
+                 )
+                 futures.append(future_parser)
+                 pages_for_ocr.clear()
+
+             # Whenever pages_as_images hits YOLOX_MAX_BATCH_SIZE, submit a job
+             if (
+                 (extract_tables_method == "yolox")
+                 and (extract_tables or extract_charts or extract_infographics)
+                 and (len(pages_for_tables) >= YOLOX_MAX_BATCH_SIZE)
+             ):
+                 future_yolox = executor.submit(
+                     lambda *args, **kwargs: ("yolox", _extract_page_elements(*args, **kwargs)),
+                     pages_for_tables[:],  # pass a copy
+                     page_count,
+                     source_metadata,
+                     base_unified_metadata,
+                     extract_tables,
+                     extract_charts,
+                     extract_infographics,
+                     {},  # page_to_text_flag_map
+                     table_output_format,
+                     nemotron_parse_config.yolox_endpoints,
+                     nemotron_parse_config.yolox_infer_protocol,
+                     nemotron_parse_config.auth_token,
+                     execution_trace_log=execution_trace_log,
+                 )
+                 futures.append(future_yolox)
+                 pages_for_tables.clear()
+
+         # After page loop, if we still have leftover pages_as_images, submit one last job
+         if extract_text and pages_for_ocr:
+             future_parser = executor.submit(
+                 lambda *args, **kwargs: ("parser", _extract_text_and_bounding_boxes(*args, **kwargs)),
+                 pages_for_ocr[:],  # pass a copy
+                 nemotron_parse_client,
+                 execution_trace_log=execution_trace_log,
+             )
+             futures.append(future_parser)
+             pages_for_ocr.clear()
+
+         if (
+             (extract_tables_method == "yolox")
+             and (extract_tables or extract_charts or extract_infographics)
+             and pages_for_tables
+         ):
+             future_yolox = executor.submit(
+                 lambda *args, **kwargs: ("yolox", _extract_page_elements(*args, **kwargs)),
+                 pages_for_tables[:],
+                 page_count,
+                 source_metadata,
+                 base_unified_metadata,
+                 extract_tables,
+                 extract_charts,
+                 extract_infographics,
+                 {},  # page_to_text_flag_map
+                 table_output_format,
+                 nemotron_parse_config.yolox_endpoints,
+                 nemotron_parse_config.yolox_infer_protocol,
+                 nemotron_parse_config.auth_token,
+                 execution_trace_log=execution_trace_log,
+             )
+             futures.append(future_yolox)
+             pages_for_tables.clear()
+
+         parser_results = []
+         # Now wait for all futures to complete
+         for fut in concurrent.futures.as_completed(futures):
+             model_name, extracted_items = fut.result()  # blocks until finished
+             if (model_name == "yolox") and (extract_tables or extract_charts or extract_infographics):
+                 extracted_data.extend(extracted_items)
+             elif model_name == "parser":
+                 parser_results.extend(extracted_items)
+
+     for page_idx, parser_output in parser_results:
+         page = None
+         page_image = None
+         page_text = []
+
+         page_nearby_blocks = {
+             "text": {"content": [], "bbox": [], "type": []},
+             "images": {"content": [], "bbox": [], "type": []},
+             "structured": {"content": [], "bbox": [], "type": []},
+         }
+
+         for bbox_dict in parser_output:
+             cls = bbox_dict["type"]
+             bbox = bbox_dict["bbox"]
+             txt = bbox_dict["text"]
+
+             transformed_bbox = [
+                 math.floor(bbox["xmin"] * NEMOTRON_PARSE_MAX_WIDTH),
+                 math.floor(bbox["ymin"] * NEMOTRON_PARSE_MAX_HEIGHT),
+                 math.ceil(bbox["xmax"] * NEMOTRON_PARSE_MAX_WIDTH),
+                 math.ceil(bbox["ymax"] * NEMOTRON_PARSE_MAX_HEIGHT),
+             ]
+
+             if cls not in nemotron_parse_utils.ACCEPTED_CLASSES:
+                 continue
+
+             if identify_nearby_objects:
+                 _insert_page_nearby_blocks(page_nearby_blocks, cls, txt, transformed_bbox)
+
+             if extract_text:
+                 page_text.append(txt)
+
+             if (extract_tables_method == "nemotron_parse") and (extract_tables) and (cls == "Table"):
+                 table = LatexTable(
+                     latex=txt,
+                     bbox=transformed_bbox,
+                     max_width=NEMOTRON_PARSE_MAX_WIDTH,
+                     max_height=NEMOTRON_PARSE_MAX_HEIGHT,
+                 )
+                 accumulated_tables.append(table)
+
+             if extract_images and (cls == "Picture"):
+                 if page is None:
+                     page = doc.get_page(page_idx)
+                 if page_image is None:
+                     page_image, _ = _convert_pdfium_page_to_numpy_for_parser(page)
+
+                 img_numpy = crop_image(page_image, transformed_bbox)
+
+                 if img_numpy is not None:
+                     base64_img = numpy_to_base64(img_numpy, format=YOLOX_PAGE_IMAGE_FORMAT)
+                     image = Base64Image(
+                         image=base64_img,
+                         bbox=transformed_bbox,
+                         width=img_numpy.shape[1],
+                         height=img_numpy.shape[0],
+                         max_width=NEMOTRON_PARSE_MAX_WIDTH,
+                         max_height=NEMOTRON_PARSE_MAX_HEIGHT,
+                     )
+                     accumulated_images.append(image)
+
+         # If Nemotron Parse fails to extract anything, fall back to using pdfium.
+         if not "".join(page_text).strip():
+             if page is None:
+                 page = doc.get_page(page_idx)
+             page_text = [page.get_textpage().get_text_bounded()]
+
+         accumulated_text.extend(page_text)
+
+         # Construct tables
+         if extract_tables:
+             for table in accumulated_tables:
+                 extracted_data.append(
+                     _construct_table_metadata(
+                         table,
+                         page_idx,
+                         page_count,
+                         source_metadata,
+                         base_unified_metadata,
+                     )
+                 )
+             accumulated_tables = []
+
+         # Construct images
+         if extract_images:
+             for image in accumulated_images:
+                 extracted_data.append(
+                     construct_image_metadata_from_pdf_image(
+                         image,
+                         page_idx,
+                         page_count,
+                         source_metadata,
+                         base_unified_metadata,
+                     )
+                 )
+             accumulated_images = []
+
+         # Construct text - page
+         if (extract_text) and (text_depth == TextTypeEnum.PAGE):
+             extracted_data.append(
+                 construct_text_metadata(
+                     accumulated_text,
+                     pdf_metadata.keywords,
+                     page_idx,
+                     -1,
+                     -1,
+                     -1,
+                     page_count,
+                     text_depth,
+                     source_metadata,
+                     base_unified_metadata,
+                     delimiter="\n\n",
+                     bbox_max_dimensions=(NEMOTRON_PARSE_MAX_WIDTH, NEMOTRON_PARSE_MAX_HEIGHT),
+                     nearby_objects=page_nearby_blocks,
+                 )
+             )
+             accumulated_text = []
+
+     # Construct text - document
+     if (extract_text) and (text_depth == TextTypeEnum.DOCUMENT):
+         text_extraction = construct_text_metadata(
+             accumulated_text,
+             pdf_metadata.keywords,
+             -1,
+             -1,
+             -1,
+             -1,
+             page_count,
+             text_depth,
+             source_metadata,
+             base_unified_metadata,
+             delimiter="\n\n",
+         )
+
+         if len(text_extraction) > 0:
+             extracted_data.append(text_extraction)
+
+     if nemotron_parse_client:
+         nemotron_parse_client.close()
+     doc.close()
+
+     return extracted_data
+
+
+ def _extract_text_and_bounding_boxes(
+     pages: list,
+     nemotron_parse_client,
+     execution_trace_log=None,
+ ) -> list:
+
+     # Collect all page indices and images in order.
+     image_page_indices = [page[0] for page in pages]
+     original_images = [page[1] for page in pages]
+
+     # Prepare the data payload with all images.
+     data = {"images": original_images}
+
+     # Perform inference using the NimClient.
+     inference_results = nemotron_parse_client.infer(
+         data=data,
+         model_name="nemotron_parse",
+         stage_name="pdf_extraction",
+         max_batch_size=NEMOTRON_PARSE_MAX_BATCH_SIZE,
+         execution_trace_log=execution_trace_log,
+     )
+
+     return list(zip(image_page_indices, inference_results))
+
+
+ def _create_clients(nemotron_parse_config):
+     model_interface = nemotron_parse_utils.NemotronParseModelInterface(
+         model_name=nemotron_parse_config.nemotron_parse_model_name,
+     )
+     nemotron_parse_client = create_inference_client(
+         nemotron_parse_config.nemotron_parse_endpoints,
+         model_interface,
+         nemotron_parse_config.auth_token,
+         nemotron_parse_config.nemotron_parse_infer_protocol,
+         nemotron_parse_config.timeout,
+     )
+
+     return nemotron_parse_client
+
+
+ def _send_inference_request(
+     nemotron_parse_client,
+     image_array: np.ndarray,
+ ) -> Dict[str, Any]:
+
+     try:
+         # NIM only supports processing one page at a time (batch size = 1).
+         data = {"image": image_array}
+         response = nemotron_parse_client.infer(
+             data=data,
+             model_name="nemotron_parse",
+         )
+     except Exception as e:
+         logger.exception(f"Unhandled error during Nemotron Parse inference: {e}")
+         raise e
+
+     return response
+
+
+ def _convert_pdfium_page_to_numpy_for_parser(
+     page: pdfium.PdfPage,
+     render_dpi: int = NEMOTRON_PARSE_RENDER_DPI,
+     scale_tuple: Tuple[int, int] = (NEMOTRON_PARSE_MAX_WIDTH, NEMOTRON_PARSE_MAX_HEIGHT),
+     padding_tuple: Tuple[int, int] = (NEMOTRON_PARSE_MAX_WIDTH, NEMOTRON_PARSE_MAX_HEIGHT),
+ ) -> np.ndarray:
+     page_images, padding_offsets = pdfium_pages_to_numpy(
+         [page], render_dpi=render_dpi, scale_tuple=scale_tuple, padding_tuple=padding_tuple
+     )
+
+     return page_images[0], padding_offsets[0]
+
+
+ def _convert_pdfium_page_to_numpy_for_yolox(
+     page: pdfium.PdfPage,
+     scale_tuple: Tuple[int, int] = (YOLOX_PAGE_IMAGE_PREPROC_WIDTH, YOLOX_PAGE_IMAGE_PREPROC_HEIGHT),
+     padding_tuple: Tuple[int, int] = (YOLOX_PAGE_IMAGE_PREPROC_WIDTH, YOLOX_PAGE_IMAGE_PREPROC_HEIGHT),
+ ) -> np.ndarray:
+     page_images, padding_offsets = pdfium_pages_to_numpy([page], scale_tuple=scale_tuple, padding_tuple=padding_tuple)
+
+     return page_images[0], padding_offsets[0]
+
+
+ def _insert_page_nearby_blocks(
+     page_nearby_blocks: Dict[str, Any],
+     cls: str,
+     txt: str,
+     bbox: list,
+ ):
+     if cls in nemotron_parse_utils.ACCEPTED_TEXT_CLASSES:
+         nearby_blocks_key = "text"
+     elif cls in nemotron_parse_utils.ACCEPTED_TABLE_CLASSES:
+         nearby_blocks_key = "structured"
+     elif cls in nemotron_parse_utils.ACCEPTED_IMAGE_CLASSES:
+         nearby_blocks_key = "images"
+
+     page_nearby_blocks[nearby_blocks_key]["content"].append(txt)
+     page_nearby_blocks[nearby_blocks_key]["bbox"].append(bbox)
+     page_nearby_blocks[nearby_blocks_key]["type"].append(cls)
+
+
+ @pdfium_exception_handler(descriptor="nemotron_parse")
+ def _construct_table_metadata(
+     table: LatexTable,
+     page_idx: int,
+     page_count: int,
+     source_metadata: Dict,
+     base_unified_metadata: Dict,
+ ):
+     content = table.latex
+     table_format = TableFormatEnum.LATEX
+     subtype = ContentTypeEnum.TABLE
+     description = ContentDescriptionEnum.PDF_TABLE
+
+     content_metadata = {
+         "type": ContentTypeEnum.STRUCTURED,
+         "description": description,
+         "page_number": page_idx,
+         "hierarchy": {
+             "page_count": page_count,
+             "page": page_idx,
+             "line": -1,
+             "span": -1,
+         },
+         "subtype": subtype,
+     }
+     table_metadata = {
+         "caption": "",
+         "table_content": content,
+         "table_format": table_format,
+         "table_location": table.bbox,
+         "table_location_max_dimensions": (table.max_width, table.max_height),
+     }
+     ext_unified_metadata = base_unified_metadata.copy()
+
+     ext_unified_metadata.update(
+         {
+             "content": "",
+             "source_metadata": source_metadata,
+             "content_metadata": content_metadata,
+             "table_metadata": table_metadata,
+         }
+     )
+
+     validated_unified_metadata = validate_metadata(ext_unified_metadata)
+
+     return [ContentTypeEnum.STRUCTURED, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
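
For orientation, below is a minimal sketch of how the new nemotron_parse_extractor entry point might be called, based on the extractor_config keys documented in its docstring. It is an illustration only: the endpoint URLs and the NemotronParseConfigSchema field names are assumptions inferred from the attribute accesses in this file, not values documented by the package.

# Minimal, hypothetical usage sketch for the nemotron_parse extraction engine in this wheel.
# The nemotron_parse_config field names and endpoint values are inferred from the attribute
# accesses in nemotron_parse.py (nemotron_parse_endpoints, yolox_endpoints, ...); adjust them
# to the actual NemotronParseConfigSchema.
import io

from nv_ingest_api.internal.extract.pdf.engines.nemotron_parse import nemotron_parse_extractor

with open("example.pdf", "rb") as f:  # any local PDF
    pdf_stream = io.BytesIO(f.read())

extractor_config = {
    # Per-document row; "source_id" is required and "metadata" seeds the unified metadata dict.
    "row_data": {"source_id": "example.pdf", "metadata": {}},
    "text_depth": "page",  # or "document"
    "extract_tables_method": "yolox",  # "nemotron_parse" keeps tables as LaTeX instead
    "table_output_format": "pseudo_markdown",
    "nemotron_parse_config": {
        # Assumed field names; the (grpc, http) endpoint pairs are placeholders.
        "nemotron_parse_endpoints": (None, "http://localhost:8015/v1/chat/completions"),
        "nemotron_parse_infer_protocol": "http",
        "yolox_endpoints": (None, "http://localhost:8000/v1/infer"),
        "yolox_infer_protocol": "http",
        "auth_token": "",
    },
}

elements = nemotron_parse_extractor(
    pdf_stream=pdf_stream,
    extract_text=True,
    extract_images=True,
    extract_infographics=False,
    extract_tables=True,
    extract_charts=True,
    extractor_config=extractor_config,
)

# Each element pairs a content type with validated metadata and a UUID,
# e.g. [ContentTypeEnum.STRUCTURED, metadata_dict, uuid_str] for tables.
print(f"Extracted {len(elements)} elements")

As the code above shows, the extractor batches page images (NEMOTRON_PARSE_MAX_BATCH_SIZE for text, YOLOX_MAX_BATCH_SIZE for page-element detection), fans the batches out to a thread pool, and falls back to pdfium text extraction for any page where Nemotron Parse returns nothing.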