nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of nv-ingest-api has been flagged as potentially problematic.

Files changed (153)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/extract/pdf/engines/pdfium.py
@@ -0,0 +1,603 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Copyright (c) 2024, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import concurrent.futures
+ import logging
+ from typing import List, Tuple, Optional, Any
+
+ import numpy as np
+ import pypdfium2 as libpdfium
+
+ from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
+     YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
+     YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
+     get_yolox_model_name,
+     YoloxPageElementsModelInterface,
+ )
+ from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
+ from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum
+ from nv_ingest_api.util.metadata.aggregators import (
+     construct_image_metadata_from_pdf_image,
+     extract_pdf_metadata,
+     construct_text_metadata,
+     construct_page_element_metadata,
+     CroppedImageWithContent,
+ )
+ from nv_ingest_api.util.nim import create_inference_client
+ from nv_ingest_api.util.pdf.pdfium import (
+     extract_nested_simple_images_from_pdfium_page,
+     extract_image_like_objects_from_pdfium_page,
+     pdfium_pages_to_numpy,
+ )
+ from nv_ingest_api.util.image_processing.transforms import numpy_to_base64, crop_image
+
+ logger = logging.getLogger(__name__)
+
+
+ def _extract_page_elements_using_image_ensemble(
+     pages: List[Tuple[int, np.ndarray, Tuple[int, int]]],
+     yolox_client,
+     yolox_model_name: str = "yolox",
+     execution_trace_log: Optional[List] = None,
+ ) -> List[Tuple[int, object]]:
+     """
+     Given a list of (page_index, image) tuples and a YOLOX client, this function performs
+     inference to extract page element annotations from all pages.
+
+     Parameters
+     ----------
+     pages : List[Tuple[int, np.ndarray, Tuple[int, int]]]
+         List of tuples containing the page index, image data as a numpy array,
+         and optional padding offset information.
+     yolox_client : object
+         A pre-configured client instance for the YOLOX inference service.
+     yolox_model_name : str, default="yolox"
+         The name of the YOLOX model to use for inference.
+     execution_trace_log : Optional[List], default=None
+         List for accumulating execution trace information.
+
+     Returns
+     -------
+     List[Tuple[int, object]]
+         For each page, returns (page_index, joined_content) where joined_content
+         is the result of combining annotations from the inference.
+     """
+     page_elements = []
+
+     try:
+         # Collect all page indices and images in order.
+         # Optionally, collect padding offsets if present.
+         image_page_indices = []
+         original_images = []
+         padding_offsets = []
+         for page in pages:
+             image_page_indices.append(page[0])
+             original_images.append(page[1])
+             if len(page) > 2:
+                 padding_offset = page[2]
+             else:
+                 padding_offset = 0
+             padding_offsets.append(padding_offset)
+
+         # Prepare the data payload with all images.
+         data = {"images": original_images}
+
+         # Perform inference using the NimClient.
+         inference_results = yolox_client.infer(
+             data,
+             model_name=yolox_model_name,
+             max_batch_size=YOLOX_MAX_BATCH_SIZE,
+             trace_info=execution_trace_log,
+             stage_name="pdf_content_extractor",
+         )
+
+         # Process results: iterate over each image's inference output.
+         for annotation_dict, page_index, original_image, padding_offset in zip(
+             inference_results, image_page_indices, original_images, padding_offsets
+         ):
+             _extract_page_element_images(
+                 annotation_dict,
+                 original_image,
+                 page_index,
+                 page_elements,
+                 padding_offset,
+             )
+
+     except TimeoutError:
+         logger.error("Timeout error during page element extraction.")
+         raise
+     except Exception as e:
+         logger.exception(f"Unhandled error during page element extraction: {str(e)}")
+         raise
+
+     logger.debug(f"Extracted {len(page_elements)} page elements.")
+     return page_elements
+
+
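To make the data flow above concrete, here is a minimal sketch of driving the ensemble function with a stubbed client. `FakeYoloxClient` is hypothetical; the real client is the NimClient produced by `create_inference_client` below.

    import numpy as np

    # Hypothetical stand-in for the real NimClient: returns one empty
    # annotation dict per input image.
    class FakeYoloxClient:
        def infer(self, data, **kwargs):
            return [{"table": [], "chart": [], "infographic": []} for _ in data["images"]]

    # Three-element tuples, as produced by the rasterization step in
    # pdfium_extractor: (page_index, image, padding_offset).
    pages = [
        (0, np.zeros((1536, 1536, 3), dtype=np.uint8), (0, 0)),
        (1, np.zeros((1536, 1536, 3), dtype=np.uint8), (128, 0)),
    ]

    elements = _extract_page_elements_using_image_ensemble(pages, FakeYoloxClient())
    print(elements)  # [] -- the stub produced no detections
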
+ # Handle individual page element extraction and model inference
+ def _extract_page_element_images(
+     annotation_dict,
+     original_image,
+     page_idx,
+     page_elements,
+     padding_offset=(0, 0),
+ ):
+     """
+     Handle the extraction of page elements from the inference results and run additional model inference.
+
+     Parameters
+     ----------
+     annotation_dict : dict
+         A dictionary containing detected objects and their bounding boxes.
+     original_image : np.ndarray
+         The original image from which objects were detected.
+     page_idx : int
+         The index of the current page being processed.
+     page_elements : List[Tuple[int, CroppedImageWithContent]]
+         A list to which extracted page elements will be appended.
+     padding_offset : Tuple[int, int], default=(0, 0)
+         The (width, height) padding applied to the image before inference.
+
+     Notes
+     -----
+     This function iterates over detected objects, crops the original image to the bounding boxes,
+     and runs additional inference on the cropped images to extract detailed information about page
+     elements.
+
+     Examples
+     --------
+     >>> annotation_dict = {"table": [], "chart": []}
+     >>> original_image = np.random.rand(1536, 1536, 3)
+     >>> page_elements = []
+     >>> _extract_page_element_images(annotation_dict, original_image, 0, page_elements)
+     """
+     # numpy arrays are indexed (height, width, channels).
+     orig_height, orig_width, *_ = original_image.shape
+     pad_width, pad_height = padding_offset
+
+     for label in ["table", "chart", "infographic"]:
+         if not annotation_dict:
+             continue
+
+         if label not in annotation_dict:
+             continue
+
+         objects = annotation_dict[label]
+
+         for idx, bboxes in enumerate(objects):
+             *bbox, _ = bboxes
+             w1, h1, w2, h2 = bbox
+
+             cropped = crop_image(original_image, (int(w1), int(h1), int(w2), int(h2)))
+             if cropped is None:
+                 continue
+
+             base64_img = numpy_to_base64(cropped)
+
+             # Shift the bounding box from padded-canvas coordinates back
+             # into original page-render coordinates.
+             bbox_in_orig_coord = (
+                 int(w1) - pad_width,
+                 int(h1) - pad_height,
+                 int(w2) - pad_width,
+                 int(h2) - pad_height,
+             )
+             max_width = orig_width - 2 * pad_width
+             max_height = orig_height - 2 * pad_height
+
+             page_element_data = CroppedImageWithContent(
+                 content="",
+                 image=base64_img,
+                 bbox=bbox_in_orig_coord,
+                 max_width=max_width,
+                 max_height=max_height,
+                 type_string=label,
+             )
+             page_elements.append((page_idx, page_element_data))
+
+
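The coordinate shift above is plain arithmetic: detections come back in the padded-canvas frame, and subtracting the per-axis padding recovers page-render coordinates. A worked example with assumed numbers:

    # Page rendered at 1280x1536, padded symmetrically to the 1536x1536
    # YOLOX canvas: pad_width = (1536 - 1280) // 2 = 128, pad_height = 0.
    pad_width, pad_height = 128, 0

    # A detection in padded-canvas coordinates (w1, h1, w2, h2).
    w1, h1, w2, h2 = 200, 300, 900, 700

    bbox_in_orig_coord = (w1 - pad_width, h1 - pad_height, w2 - pad_width, h2 - pad_height)
    print(bbox_in_orig_coord)  # (72, 300, 772, 700)

    # The usable (unpadded) extent, used downstream to normalize the bbox.
    orig_height, orig_width = 1536, 1536
    print(orig_width - 2 * pad_width, orig_height - 2 * pad_height)  # 1280 1536
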
+ def _extract_page_text(page) -> str:
+     """
+     Always extract text from the given page and return it as a raw string.
+     The caller decides whether to use per-page or doc-level logic.
+     """
+     textpage = page.get_textpage()
+     return textpage.get_text_bounded()
+
+
+ def _extract_page_images(
+     extract_images_method: str,
+     page,
+     page_idx: int,
+     page_width: float,
+     page_height: float,
+     page_count: int,
+     source_metadata: dict,
+     base_unified_metadata: dict,
+     **extract_images_params,
+ ) -> list:
+     """
+     Always extract images from the given page and return a list of image metadata items.
+     The caller decides whether to call this based on a flag.
+     """
+     if extract_images_method == "simple":
+         extracted_image_data = extract_nested_simple_images_from_pdfium_page(page)
+     else:  # extract_images_method == "group"
+         extracted_image_data = extract_image_like_objects_from_pdfium_page(page, merge=True, **extract_images_params)
+
+     extracted_images = []
+     for image_data in extracted_image_data:
+         try:
+             image_meta = construct_image_metadata_from_pdf_image(
+                 image_data,
+                 page_idx,
+                 page_count,
+                 source_metadata,
+                 base_unified_metadata,
+             )
+             extracted_images.append(image_meta)
+         except Exception as e:
+             logger.error(f"Unhandled error extracting image on page {page_idx}: {e}")
+             # continue extracting other images
+
+     return extracted_images
+
+
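A hedged sketch of calling this helper directly against the first page of a local file. The file path is illustrative, and the empty metadata dicts assume `construct_image_metadata_from_pdf_image` tolerates minimal input:

    import pypdfium2 as libpdfium

    doc = libpdfium.PdfDocument("document.pdf")  # illustrative path
    page = doc.get_page(0)
    page_width, page_height = page.get_size()

    images = _extract_page_images(
        "simple",            # use the nested simple-image helper
        page,
        0,                   # page_idx
        page_width,
        page_height,
        len(doc),            # page_count
        {},                  # source_metadata (minimal, for illustration)
        {},                  # base_unified_metadata
    )
    page.close()
    doc.close()
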
+ def _extract_page_elements(
+     pages: list,
+     page_count: int,
+     source_metadata: dict,
+     base_unified_metadata: dict,
+     extract_tables: bool,
+     extract_charts: bool,
+     extract_infographics: bool,
+     paddle_output_format: str,
+     yolox_endpoints: Tuple[Optional[str], Optional[str]],
+     yolox_infer_protocol: str = "http",
+     auth_token: Optional[str] = None,
+     execution_trace_log=None,
+ ) -> list:
+     """
+     Extract page elements from the given pages using YOLOX-based inference.
+
+     This function creates a YOLOX client using the provided parameters, extracts elements
+     from pages, and builds metadata for each extracted element based on the specified
+     extraction flags.
+
+     Parameters
+     ----------
+     pages : list
+         List of page images to process.
+     page_count : int
+         Total number of pages in the document.
+     source_metadata : dict
+         Metadata about the source document.
+     base_unified_metadata : dict
+         Base metadata to include in all extracted elements.
+     extract_tables : bool
+         Flag indicating whether to extract tables.
+     extract_charts : bool
+         Flag indicating whether to extract charts.
+     extract_infographics : bool
+         Flag indicating whether to extract infographics.
+     paddle_output_format : str
+         Format to use for table content.
+     yolox_endpoints : Tuple[Optional[str], Optional[str]]
+         A tuple containing the gRPC and HTTP endpoints for the YOLOX service.
+     yolox_infer_protocol : str, default="http"
+         Protocol to use for inference (either "http" or "grpc").
+     auth_token : Optional[str], default=None
+         Authentication token for the inference service.
+     execution_trace_log : optional
+         List for accumulating execution trace information.
+
+     Returns
+     -------
+     list
+         List of extracted page elements with their metadata.
+     """
+     extracted_page_elements = []
+     yolox_client = None
+
+     try:
+         # Default model name
+         yolox_model_name = "yolox"
+
+         # Get the HTTP endpoint to determine the model name if needed
+         yolox_http_endpoint = yolox_endpoints[1]
+         if yolox_http_endpoint:
+             try:
+                 yolox_model_name = get_yolox_model_name(yolox_http_endpoint)
+             except Exception as e:
+                 logger.warning(f"Failed to get YOLOX model name from endpoint: {e}. Using default.")
+
+         # Create the model interface
+         model_interface = YoloxPageElementsModelInterface(yolox_model_name=yolox_model_name)
+
+         # Create the inference client
+         yolox_client = create_inference_client(
+             yolox_endpoints,
+             model_interface,
+             auth_token,
+             yolox_infer_protocol,
+         )
+
+         # Extract page elements using the client
+         page_element_results = _extract_page_elements_using_image_ensemble(
+             pages, yolox_client, yolox_model_name, execution_trace_log=execution_trace_log
+         )
+
+         # Process each extracted element based on extraction flags
+         for page_idx, page_element in page_element_results:
+             # Skip elements that shouldn't be extracted based on flags
+             if (not extract_tables) and (page_element.type_string == "table"):
+                 continue
+             if (not extract_charts) and (page_element.type_string == "chart"):
+                 continue
+             if (not extract_infographics) and (page_element.type_string == "infographic"):
+                 continue
+
+             # Set content format for tables
+             if page_element.type_string == "table":
+                 page_element.content_format = paddle_output_format
+
+             # Construct metadata for the page element
+             page_element_meta = construct_page_element_metadata(
+                 page_element,
+                 page_idx,
+                 page_count,
+                 source_metadata,
+                 base_unified_metadata,
+             )
+             extracted_page_elements.append(page_element_meta)
+
+     except Exception as e:
+         logger.exception(f"Error in page element extraction: {str(e)}")
+         raise
+     finally:
+         # Ensure client is closed properly
+         if yolox_client:
+             try:
+                 yolox_client.close()
+             except Exception as e:
+                 logger.warning(f"Error closing YOLOX client: {str(e)}")
+
+     return extracted_page_elements
+
+
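For illustration, a direct invocation of the element pipeline, assuming a YOLOX page-elements service is reachable over HTTP. The endpoint URL is illustrative, not a documented default:

    import numpy as np

    # One rasterized page, already padded to the YOLOX preproc canvas.
    page_image = np.zeros((1536, 1536, 3), dtype=np.uint8)
    pages = [(0, page_image, (0, 0))]  # (page_idx, np_image, padding_offset)

    elements = _extract_page_elements(
        pages,
        page_count=1,
        source_metadata={},
        base_unified_metadata={},
        extract_tables=True,
        extract_charts=True,
        extract_infographics=False,
        paddle_output_format=TableFormatEnum.PSEUDO_MARKDOWN,
        yolox_endpoints=(None, "http://localhost:8000/v1/infer"),  # (gRPC, HTTP) -- illustrative
        yolox_infer_protocol="http",
    )
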
+ def pdfium_extractor(
+     pdf_stream,
+     extract_text: bool,
+     extract_images: bool,
+     extract_infographics: bool,
+     extract_tables: bool,
+     extract_charts: bool,
+     extractor_config: dict,
+     execution_trace_log: Optional[List[Any]] = None,
+ ) -> List[Any]:
+     """
+     Extract text, images, tables, charts, and infographics from a PDF using the
+     pdfium backend, returning a list of metadata items for the extracted elements.
+     """
+     # --- Extract and validate extractor_config ---
+     if extractor_config is None or not isinstance(extractor_config, dict):
+         raise ValueError("`extractor_config` must be provided as a dictionary.")
+
+     # Validate and extract row_data
+     row_data = extractor_config.get("row_data")
+     if row_data is None:
+         raise ValueError("`extractor_config` must include a valid 'row_data' dictionary.")
+     if "source_id" not in row_data:
+         raise ValueError("The 'row_data' dictionary must contain the 'source_id' key.")
+
+     # Validate and extract text_depth
+     text_depth_str = extractor_config.get("text_depth", "page")
+     try:
+         text_depth = TextTypeEnum[text_depth_str.upper()]
+     except KeyError:
+         raise ValueError(
+             f"Invalid text_depth: {text_depth_str}. Valid options: {list(TextTypeEnum.__members__.keys())}"
+         )
+
+     # Validate and extract paddle_output_format
+     paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+     try:
+         paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+     except KeyError:
+         raise ValueError(
+             f"Invalid paddle_output_format: {paddle_output_format_str}. "
+             f"Valid options: {list(TableFormatEnum.__members__.keys())}"
+         )
+
+     extract_images_method = extractor_config.get("extract_images_method", "group")
+     extract_images_params = extractor_config.get("extract_images_params", {})
+
+     # Extract metadata_column
+     metadata_column = extractor_config.get("metadata_column", "metadata")
+
+     # Process pdfium_config
+     pdfium_config_raw = extractor_config.get("pdfium_config", {})
+     if isinstance(pdfium_config_raw, dict):
+         pdfium_config = PDFiumConfigSchema(**pdfium_config_raw)
+     elif isinstance(pdfium_config_raw, PDFiumConfigSchema):
+         pdfium_config = pdfium_config_raw
+     else:
+         raise ValueError("`pdfium_config` must be a dictionary or a PDFiumConfigSchema instance.")
+     # --- End extractor_config extraction ---
+
+     logger.debug("Extracting PDF with pdfium backend.")
+     source_id = row_data["source_id"]
+
+     # Retrieve unified metadata robustly (supporting pandas Series or dict)
+     if hasattr(row_data, "index"):
+         base_unified_metadata = row_data[metadata_column] if metadata_column in row_data.index else {}
+     else:
+         base_unified_metadata = row_data.get(metadata_column, {})
+
+     base_source_metadata = base_unified_metadata.get("source_metadata", {})
+     source_location = base_source_metadata.get("source_location", "")
+     collection_id = base_source_metadata.get("collection_id", "")
+     partition_id = base_source_metadata.get("partition_id", -1)
+     access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)
+
+     doc = libpdfium.PdfDocument(pdf_stream)
+     pdf_metadata = extract_pdf_metadata(doc, source_id)
+     page_count = pdf_metadata.page_count
+
+     source_metadata = {
+         "source_name": pdf_metadata.filename,
+         "source_id": source_id,
+         "source_location": source_location,
+         "source_type": pdf_metadata.source_type,
+         "collection_id": collection_id,
+         "date_created": pdf_metadata.date_created,
+         "last_modified": pdf_metadata.last_modified,
+         "summary": "",
+         "partition_id": partition_id,
+         "access_level": access_level,
+     }
+
+     logger.debug(f"PDF has {page_count} pages.")
+     logger.debug(
+         f"extract_text={extract_text}, extract_images={extract_images}, "
+         f"extract_tables={extract_tables}, extract_charts={extract_charts}, "
+         f"extract_infographics={extract_infographics}"
+     )
+
+     # Decide if text extraction should be done at the PAGE or DOCUMENT level
+     if text_depth != TextTypeEnum.PAGE:
+         text_depth = TextTypeEnum.DOCUMENT
+
+     extracted_data = []
+     accumulated_text = []
+
+     # Prepare for table/chart extraction
+     pages_for_tables = []  # Accumulate tuples of (page_idx, np_image, padding_offset)
+     futures = []  # To track asynchronous table/chart extraction tasks
+
+     with concurrent.futures.ThreadPoolExecutor(max_workers=pdfium_config.workers_per_progress_engine) as executor:
+         # PAGE LOOP
+         for page_idx in range(page_count):
+             page = doc.get_page(page_idx)
+             page_width, page_height = page.get_size()
+
+             # Text extraction
+             if extract_text:
+                 page_text = _extract_page_text(page)
+                 if text_depth == TextTypeEnum.PAGE:
+                     text_meta = construct_text_metadata(
+                         [page_text],
+                         pdf_metadata.keywords,
+                         page_idx,
+                         -1,
+                         -1,
+                         -1,
+                         page_count,
+                         text_depth,
+                         source_metadata,
+                         base_unified_metadata,
+                     )
+                     extracted_data.append(text_meta)
+                 else:
+                     accumulated_text.append(page_text)
+
+             # Image extraction
+             if extract_images:
+                 image_data = _extract_page_images(
+                     extract_images_method,
+                     page,
+                     page_idx,
+                     page_width,
+                     page_height,
+                     page_count,
+                     source_metadata,
+                     base_unified_metadata,
+                     **extract_images_params,
+                 )
+                 extracted_data.extend(image_data)
+
+             # If we want tables, charts, or infographics, rasterize the page and store it
+             if extract_tables or extract_charts or extract_infographics:
+                 image, padding_offsets = pdfium_pages_to_numpy(
+                     [page],
+                     scale_tuple=(YOLOX_PAGE_IMAGE_PREPROC_WIDTH, YOLOX_PAGE_IMAGE_PREPROC_HEIGHT),
+                     padding_tuple=(YOLOX_PAGE_IMAGE_PREPROC_WIDTH, YOLOX_PAGE_IMAGE_PREPROC_HEIGHT),
+                     trace_info=execution_trace_log,
+                 )
+                 pages_for_tables.append((page_idx, image[0], padding_offsets[0]))
+
+                 # Whenever pages_for_tables hits YOLOX_MAX_BATCH_SIZE, submit a job
+                 if len(pages_for_tables) >= YOLOX_MAX_BATCH_SIZE:
+                     future = executor.submit(
+                         _extract_page_elements,
+                         pages_for_tables[:],  # pass a copy
+                         page_count,
+                         source_metadata,
+                         base_unified_metadata,
+                         extract_tables,
+                         extract_charts,
+                         extract_infographics,
+                         paddle_output_format,
+                         pdfium_config.yolox_endpoints,
+                         pdfium_config.yolox_infer_protocol,
+                         pdfium_config.auth_token,
+                         execution_trace_log=execution_trace_log,
+                     )
+                     futures.append(future)
+                     pages_for_tables.clear()
+
+             page.close()
+
+         # After the page loop, if we still have leftover pages_for_tables, submit one last job
+         if (extract_tables or extract_charts or extract_infographics) and pages_for_tables:
+             future = executor.submit(
+                 _extract_page_elements,
+                 pages_for_tables[:],
+                 page_count,
+                 source_metadata,
+                 base_unified_metadata,
+                 extract_tables,
+                 extract_charts,
+                 extract_infographics,
+                 paddle_output_format,
+                 pdfium_config.yolox_endpoints,
+                 pdfium_config.yolox_infer_protocol,
+                 pdfium_config.auth_token,
+                 execution_trace_log=execution_trace_log,
+             )
+             futures.append(future)
+             pages_for_tables.clear()
+
+         # Wait for all asynchronous jobs to complete.
+         for fut in concurrent.futures.as_completed(futures):
+             table_chart_items = fut.result()  # Blocks until the job is finished
+             extracted_data.extend(table_chart_items)
+
+     # For document-level text extraction, combine the accumulated text.
+     if extract_text and text_depth == TextTypeEnum.DOCUMENT and accumulated_text:
+         doc_text_meta = construct_text_metadata(
+             accumulated_text,
+             pdf_metadata.keywords,
+             -1,
+             -1,
+             -1,
+             -1,
+             page_count,
+             text_depth,
+             source_metadata,
+             base_unified_metadata,
+         )
+         extracted_data.append(doc_text_meta)
+
+     doc.close()
+
+     logger.debug(f"Extracted {len(extracted_data)} items from PDF.")
+     return extracted_data
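A hedged end-to-end usage sketch. Each `extractor_config` key shown is one the validation block above actually reads; the endpoint values are illustrative, and the `pdfium_config` fields match the attributes accessed on `PDFiumConfigSchema` in the code:

    extractor_config = {
        "row_data": {"source_id": "document.pdf", "metadata": {}},
        "text_depth": "page",                       # or "document"
        "paddle_output_format": "pseudo_markdown",
        "extract_images_method": "group",
        "pdfium_config": {
            # (gRPC, HTTP) endpoint pair; values are illustrative.
            "yolox_endpoints": (None, "http://localhost:8000/v1/infer"),
            "yolox_infer_protocol": "http",
        },
    }

    with open("document.pdf", "rb") as f:
        items = pdfium_extractor(
            f.read(),
            extract_text=True,
            extract_images=True,
            extract_infographics=False,
            extract_tables=True,
            extract_charts=True,
            extractor_config=extractor_config,
        )
    print(f"{len(items)} extracted items")
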
nv_ingest_api/internal/extract/pdf/engines/tika.py
@@ -0,0 +1,96 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import io
+ from typing import Dict, Any, Optional, List
+
+ import requests
+
+ TIKA_URL = "http://tika:9998/tika"
+
+
+ def tika_extractor(
+     pdf_stream: io.BytesIO,
+     extract_text: bool,
+     extract_images: bool,
+     extract_infographics: bool,
+     extract_charts: bool,
+     extract_tables: bool,
+     extractor_config: Dict[str, Any],
+     execution_trace_log: Optional[List[Any]] = None,
+ ) -> str:
+     """
+     Extract text from a PDF using the Apache Tika server.
+
+     This function sends a PDF stream to the Apache Tika server and returns the
+     extracted text. The flags for text, image, and table extraction are provided
+     for consistency with the extractor interface; however, this implementation
+     currently only supports text extraction.
+
+     Parameters
+     ----------
+     pdf_stream : io.BytesIO
+         A bytestream representing the PDF to be processed.
+     extract_text : bool
+         Flag indicating whether text extraction is desired.
+     extract_images : bool
+         Flag indicating whether image extraction is desired.
+     extract_infographics : bool
+         Flag indicating whether infographic extraction is desired.
+     extract_charts : bool
+         Flag indicating whether chart extraction is desired.
+     extract_tables : bool
+         Flag indicating whether table extraction is desired.
+     extractor_config : dict
+         A dictionary of additional configuration options for the extractor. This
+         parameter is currently not used by this extractor.
+     execution_trace_log : Optional[List[Any]], default=None
+         List for accumulating execution trace information. Unused here.
+
+     Returns
+     -------
+     str
+         The extracted text from the PDF as returned by the Apache Tika server.
+
+     Raises
+     ------
+     requests.RequestException
+         If the request to the Tika server fails.
+
+     Examples
+     --------
+     >>> from io import BytesIO
+     >>> with open("document.pdf", "rb") as f:
+     ...     pdf_stream = BytesIO(f.read())
+     >>> text = tika_extractor(pdf_stream, True, False, False, False, False, {})
+     """
+
+     _ = execution_trace_log
+
+     # The flags and config are accepted for interface compatibility
+     # but are not used by this extractor.
+     _, _, _, _, _, _ = (
+         extract_text,
+         extract_images,
+         extract_infographics,
+         extract_charts,
+         extract_tables,
+         extractor_config,
+     )
+
+     headers = {"Accept": "text/plain"}
+     timeout = 120  # Timeout in seconds
+     response = requests.put(TIKA_URL, headers=headers, data=pdf_stream, timeout=timeout)
+     response.raise_for_status()  # Raise an error for bad responses
+     return response.text
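`TIKA_URL` defaults to the hostname `tika`, which assumes a Docker-network deployment. A minimal local-testing sketch, assuming a Tika server is already listening on localhost:9998:

    import io

    import nv_ingest_api.internal.extract.pdf.engines.tika as tika_engine

    # Redirect the module-level default away from the compose-network host.
    tika_engine.TIKA_URL = "http://localhost:9998/tika"

    with open("document.pdf", "rb") as f:
        pdf_stream = io.BytesIO(f.read())

    text = tika_engine.tika_extractor(pdf_stream, True, False, False, False, False, {})
    print(text[:200])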