nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,652 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2024, NVIDIA CORPORATION.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ import concurrent.futures
20
+ import logging
21
+ from typing import Any, Dict, List, Optional, Tuple
22
+
23
+ import numpy as np
24
+ import pandas as pd
25
+ import pypdfium2 as libpdfium
26
+
27
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
28
+ from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
29
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
30
+ YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
31
+ YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
32
+ YoloxPageElementsModelInterface,
33
+ YOLOX_PAGE_IMAGE_FORMAT,
34
+ )
35
+ from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
36
+ from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum
37
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
38
+ YOLOX_PAGE_DEFAULT_VERSION,
39
+ YOLOX_PAGE_CLASS_LABELS,
40
+ get_yolox_page_version,
41
+ )
42
+ from nv_ingest_api.util.metadata.aggregators import (
43
+ construct_image_metadata_from_base64,
44
+ construct_image_metadata_from_pdf_image,
45
+ extract_pdf_metadata,
46
+ construct_text_metadata,
47
+ construct_page_element_metadata,
48
+ CroppedImageWithContent,
49
+ )
50
+ from nv_ingest_api.util.nim import create_inference_client
51
+ from nv_ingest_api.util.pdf.pdfium import (
52
+ extract_nested_simple_images_from_pdfium_page,
53
+ extract_image_like_objects_from_pdfium_page,
54
+ is_scanned_page,
55
+ pdfium_pages_to_numpy,
56
+ )
57
+ from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
58
+ from nv_ingest_api.util.image_processing.transforms import numpy_to_base64, crop_image
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+
63
+ def _extract_page_elements_using_image_ensemble(
64
+ pages: List[Tuple[int, np.ndarray, Tuple[int, int]]],
65
+ yolox_client,
66
+ execution_trace_log: Optional[List] = None,
67
+ ) -> List[Tuple[int, object]]:
68
+ """
69
+ Given a list of (page_index, image) tuples and a YOLOX client, this function performs
70
+ inference to extract page element annotations from all pages.
71
+
72
+ Parameters
73
+ ----------
74
+ pages : List[Tuple[int, np.ndarray, Tuple[int, int]]]
75
+ List of tuples containing page index, image data as numpy array,
76
+ and optional padding offset information.
77
+ yolox_client : object
78
+ A pre-configured client instance for the YOLOX inference service.
79
+ execution_trace_log : Optional[List], default=None
80
+ List for accumulating execution trace information.
81
+
82
+ Returns
83
+ -------
84
+ List[Tuple[int, object]]
85
+ For each page, returns (page_index, joined_content) where joined_content
86
+ is the result of combining annotations from the inference.
87
+ """
88
+ page_elements = []
89
+
90
+ try:
91
+ # Collect all page indices and images in order.
92
+ # Optionally, collect padding offsets if present.
93
+ image_page_indices = []
94
+ original_images = []
95
+ padding_offsets = []
96
+ for page in pages:
97
+ image_page_indices.append(page[0])
98
+ original_images.append(page[1])
99
+ if len(pages[0]) > 2:
100
+ padding_offset = page[2]
101
+ else:
102
+ padding_offset = 0
103
+ padding_offsets.append(padding_offset)
104
+
105
+ # Prepare the data payload with all images.
106
+ data = {"images": original_images}
107
+
108
+ # Perform inference using the NimClient.
109
+ inference_results = yolox_client.infer(
110
+ data,
111
+ model_name="pipeline" if yolox_client.model_interface.version.endswith("-v3") else "yolox_ensemble",
112
+ max_batch_size=YOLOX_MAX_BATCH_SIZE,
113
+ input_names=["INPUT_IMAGES", "THRESHOLDS"],
114
+ dtypes=["BYTES", "FP32"],
115
+ output_names=["OUTPUT"],
116
+ trace_info=execution_trace_log,
117
+ stage_name="pdf_extraction",
118
+ )
119
+
120
+ # Process results: iterate over each image's inference output.
121
+ for annotation_dict, page_index, original_image, padding_offset in zip(
122
+ inference_results, image_page_indices, original_images, padding_offsets
123
+ ):
124
+ _extract_page_element_images(
125
+ annotation_dict,
126
+ original_image,
127
+ page_index,
128
+ page_elements,
129
+ padding_offset,
130
+ )
131
+
132
+ except TimeoutError:
133
+ logger.error("Timeout error during page element extraction.")
134
+ raise
135
+ except Exception as e:
136
+ logger.exception(f"Unhandled error during page element extraction: {str(e)}")
137
+ raise
138
+
139
+ logger.debug(f"Extracted {len(page_elements)} page elements.")
140
+ return page_elements
141
+
142
+
143
+ # Handle individual page element extraction and model inference
144
+ def _extract_page_element_images(
145
+ annotation_dict,
146
+ original_image,
147
+ page_idx,
148
+ page_elements,
149
+ padding_offset=(0, 0),
150
+ ):
151
+ """
152
+ Handle the extraction of page elements from the inference results and run additional model inference.
153
+
154
+ Parameters
155
+ ----------
156
+ annotation_dict : dict/
157
+ A dictionary containing detected objects and their bounding boxes.
158
+ original_image : np.ndarray
159
+ The original image from which objects were detected.
160
+ page_idx : int
161
+ The index of the current page being processed.
162
+ page_elements : List[Tuple[int, ImageTable]]
163
+ A list to which extracted page elements will be appended.
164
+
165
+ Notes
166
+ -----
167
+ This function iterates over detected objects, crops the original image to the bounding boxes,
168
+ and runs additional inference on the cropped images to extract detailed information about page
169
+ elements.
170
+
171
+ Examples
172
+ --------
173
+ >>> annotation_dict = {"table": [], "chart": []}
174
+ >>> original_image = np.random.rand(1536, 1536, 3)
175
+ >>> page_elements = []
176
+ >>> _extract_page_element_images(annotation_dict, original_image, 0, page_elements)
177
+ """
178
+ orig_width, orig_height, *_ = original_image.shape
179
+ pad_width, pad_height = padding_offset
180
+
181
+ if annotation_dict and (set(YOLOX_PAGE_CLASS_LABELS) <= annotation_dict.keys()):
182
+ labels = YOLOX_PAGE_CLASS_LABELS
183
+ else:
184
+ labels = ["table", "chart", "infographics"]
185
+
186
+ for label in labels:
187
+ if not annotation_dict:
188
+ continue
189
+
190
+ if label not in annotation_dict:
191
+ continue
192
+
193
+ objects = annotation_dict[label]
194
+
195
+ for idx, bboxes in enumerate(objects):
196
+ *bbox, _ = bboxes
197
+ w1, h1, w2, h2 = bbox
198
+
199
+ cropped = crop_image(original_image, (int(w1), int(h1), int(w2), int(h2)))
200
+ if cropped is None:
201
+ continue
202
+
203
+ base64_img = numpy_to_base64(cropped, format=YOLOX_PAGE_IMAGE_FORMAT)
204
+
205
+ bbox_in_orig_coord = (
206
+ int(w1) - pad_width,
207
+ int(h1) - pad_height,
208
+ int(w2) - pad_width,
209
+ int(h2) - pad_height,
210
+ )
211
+ max_width = orig_width - 2 * pad_width
212
+ max_height = orig_height - 2 * pad_height
213
+
214
+ page_element_data = CroppedImageWithContent(
215
+ content="",
216
+ image=base64_img,
217
+ bbox=bbox_in_orig_coord,
218
+ max_width=max_width,
219
+ max_height=max_height,
220
+ type_string=label,
221
+ )
222
+ page_elements.append((page_idx, page_element_data))
223
+
224
+
225
+ def _extract_page_text(page) -> str:
226
+ """
227
+ Always extract text from the given page and return it as a raw string.
228
+ The caller decides whether to use per-page or doc-level logic.
229
+ """
230
+ textpage = page.get_textpage()
231
+ return textpage.get_text_bounded()
232
+
233
+
234
def _extract_page_images(
    extract_images_method: str,
    page,
    page_idx: int,
    page_width: float,
    page_height: float,
    page_count: int,
    source_metadata: dict,
    base_unified_metadata: dict,
    **extract_images_params,
) -> list:
    """
    Extract the images found on ``page`` and return their metadata entries.

    ``extract_images_method == "simple"`` uses the nested simple-image
    extractor; every other value uses the merged image-like-object extractor,
    which also receives ``extract_images_params``. ``page_width`` and
    ``page_height`` are accepted but not used by this function.

    An image whose metadata cannot be constructed is logged and skipped; the
    remaining images are still returned. The caller decides whether image
    extraction should happen at all.
    """
    if extract_images_method != "simple":
        # Anything other than "simple" is handled as the "group" method.
        raw_images = extract_image_like_objects_from_pdfium_page(page, merge=True, **extract_images_params)
    else:
        raw_images = extract_nested_simple_images_from_pdfium_page(page)

    collected = []
    for raw_image in raw_images:
        try:
            collected.append(
                construct_image_metadata_from_pdf_image(
                    raw_image,
                    page_idx,
                    page_count,
                    source_metadata,
                    base_unified_metadata,
                )
            )
        except Exception as e:
            # Best effort: one bad image must not abort the rest of the page.
            logger.error(f"Unhandled error extracting image on page {page_idx}: {e}")

    return collected
270
+
271
+
272
def _extract_page_elements(
    pages: list,
    page_count: int,
    source_metadata: dict,
    base_unified_metadata: dict,
    extract_tables: bool,
    extract_charts: bool,
    extract_infographics: bool,
    page_to_text_flag_map: Dict[int, bool],
    table_output_format: str,
    yolox_endpoints: Tuple[Optional[str], Optional[str]],
    yolox_infer_protocol: str = "http",
    auth_token: Optional[str] = None,
    execution_trace_log=None,
) -> list:
    """
    Detect page elements with YOLOX and build metadata for the ones requested.

    A YOLOX client is created from the given endpoints, inference is run over
    ``pages``, and every detected element that passes the extraction flags is
    converted into a metadata entry.

    Parameters
    ----------
    pages : list
        Page tuples forwarded unchanged to the inference helper.
    page_count : int
        Total number of pages in the document.
    source_metadata : dict
        Metadata about the source document.
    base_unified_metadata : dict
        Base metadata to include in all extracted elements.
    extract_tables : bool
        When False, elements of type "table" are dropped.
    extract_charts : bool
        When False, elements of type "chart" are dropped.
    extract_infographics : bool
        When False, elements of type "infographic" are dropped.
    page_to_text_flag_map : Dict[int, bool]
        Per-page switch for "title", "paragraph" and "header_footer" elements;
        pages missing from the map are treated as False.
    table_output_format : str
        Assigned to ``content_format`` of every table element.
    yolox_endpoints : Tuple[Optional[str], Optional[str]]
        A tuple containing the gRPC and HTTP endpoints for the YOLOX service.
        The second entry, when set, is queried for the model version.
    yolox_infer_protocol : str, default="http"
        Protocol to use for inference (either "http" or "grpc").
    auth_token : Optional[str], default=None
        Authentication token for the inference service.
    execution_trace_log : optional
        List for accumulating execution trace information.

    Returns
    -------
    list
        Metadata entries for the retained page elements. The page number
        handed to the metadata builder is 1-based (``page_idx + 1``).

    Raises
    ------
    Exception
        Any failure is logged with its traceback and re-raised. A failed
        version lookup alone is only warned about and falls back to the
        default version.
    """
    # Element types gated per page by the text flag map.
    text_element_types = {"title", "paragraph", "header_footer"}

    # Element types switched off for the whole call by the boolean flags.
    disabled_types = set()
    if not extract_tables:
        disabled_types.add("table")
    if not extract_charts:
        disabled_types.add("chart")
    if not extract_infographics:
        disabled_types.add("infographic")

    collected_meta = []
    yolox_client = None

    try:
        # Start from the default version; refine it from the HTTP endpoint if one is set.
        yolox_version = YOLOX_PAGE_DEFAULT_VERSION
        http_endpoint = yolox_endpoints[1]
        if http_endpoint:
            try:
                yolox_version = get_yolox_page_version(http_endpoint)
            except Exception as e:
                logger.warning(f"Failed to get YOLOX model name from endpoint: {e}. Using default.")

        yolox_client = create_inference_client(
            yolox_endpoints,
            YoloxPageElementsModelInterface(version=yolox_version),
            auth_token,
            yolox_infer_protocol,
        )

        detections = _extract_page_elements_using_image_ensemble(
            pages, yolox_client, execution_trace_log=execution_trace_log
        )

        for page_idx, page_element in detections:
            text_wanted_on_page = page_to_text_flag_map.get(page_idx, False)
            element_type = page_element.type_string

            # Drop anything the caller did not ask for.
            if element_type in disabled_types:
                continue
            if element_type in text_element_types and not text_wanted_on_page:
                continue

            if element_type == "table":
                page_element.content_format = table_output_format

            collected_meta.append(
                construct_page_element_metadata(
                    page_element,
                    page_idx + 1,
                    page_count,
                    source_metadata,
                    base_unified_metadata,
                )
            )

    except Exception as e:
        logger.exception(f"Error in page element extraction: {str(e)}")
        raise

    return collected_meta
392
+
393
+
394
def pdfium_extractor(
    pdf_stream,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extract_page_as_image: bool,
    extractor_config: dict,
    execution_trace_log: Optional[List[Any]] = None,
) -> List[Any]:
    """
    Extract text, images, page images and YOLOX page elements from a PDF.

    Parameters
    ----------
    pdf_stream
        Input handed directly to ``pypdfium2.PdfDocument``.
    extract_text : bool
        Extract page text. Pages are read with pdfium unless the configured
        method sends them to the OCR path (see ``extract_method`` below).
    extract_images : bool
        Extract embedded images from every page.
    extract_infographics, extract_tables, extract_charts : bool
        Enable the corresponding YOLOX page-element types.
    extract_page_as_image : bool
        Also emit every page rendered as a single image.
    extractor_config : dict
        Keys read here: ``row_data`` (required, must contain ``source_id``),
        ``text_depth`` (default "page"), ``table_output_format`` (default
        "pseudo_markdown"), ``extract_method`` (default "pdfium"),
        ``extract_images_method`` (default "group"), ``extract_images_params``
        (default ``{}``), ``metadata_column`` (default "metadata") and
        ``pdfium_config`` (dict or ``PDFiumConfigSchema``).
    execution_trace_log : Optional[List[Any]], default=None
        Passed through to the rasterizer and the inference helpers.

    Returns
    -------
    List[Any]
        The extracted metadata entries. Page-element entries are appended in
        the order their batches finish.

    Raises
    ------
    ValueError
        If ``extractor_config`` is not a dict, ``row_data`` is missing or has
        no ``source_id``, ``text_depth`` / ``table_output_format`` is not a
        valid enum member name, or ``pdfium_config`` has an unsupported type.
    """
    # --- Extract and validate extractor_config ---
    if extractor_config is None or not isinstance(extractor_config, dict):
        raise ValueError("`extractor_config` must be provided as a dictionary.")

    # Validate and extract row_data
    row_data = extractor_config.get("row_data")
    if row_data is None:
        raise ValueError("`extractor_config` must include a valid 'row_data' dictionary.")
    if "source_id" not in row_data:
        raise ValueError("The 'row_data' dictionary must contain the 'source_id' key.")

    # Validate and extract text_depth
    text_depth_str = extractor_config.get("text_depth", "page")
    try:
        text_depth = TextTypeEnum[text_depth_str.upper()]
    except KeyError:
        raise ValueError(
            f"Invalid text_depth: {text_depth_str}. Valid options: {list(TextTypeEnum.__members__.keys())}"
        )

    # Validate and extract table_output_format
    table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
    try:
        table_output_format = TableFormatEnum[table_output_format_str.upper()]
    except KeyError:
        raise ValueError(
            f"Invalid table_output_format: {table_output_format_str}. "
            f"Valid options: {list(TableFormatEnum.__members__.keys())}"
        )

    text_extraction_method = extractor_config.get("extract_method", "pdfium")
    extract_images_method = extractor_config.get("extract_images_method", "group")
    extract_images_params = extractor_config.get("extract_images_params", {})

    # Extract metadata_column
    metadata_column = extractor_config.get("metadata_column", "metadata")

    # Process pdfium_config
    pdfium_config_raw = extractor_config.get("pdfium_config", {})
    if isinstance(pdfium_config_raw, dict):
        pdfium_config = PDFiumConfigSchema(**pdfium_config_raw)
    elif isinstance(pdfium_config_raw, PDFiumConfigSchema):
        pdfium_config = pdfium_config_raw
    else:
        raise ValueError("`pdfium_config` must be a dictionary or a PDFiumConfigSchema instance.")
    # --- End extractor_config extraction ---

    logger.debug("Extracting PDF with pdfium backend.")
    source_id = row_data["source_id"]

    # Retrieve unified metadata robustly (supporting pandas Series or dict)
    if hasattr(row_data, "index"):
        base_unified_metadata = row_data[metadata_column] if metadata_column in row_data.index else {}
    else:
        base_unified_metadata = row_data.get(metadata_column, {})

    base_source_metadata = base_unified_metadata.get("source_metadata", {})
    source_location = base_source_metadata.get("source_location", "")
    collection_id = base_source_metadata.get("collection_id", "")
    partition_id = base_source_metadata.get("partition_id", -1)
    access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)

    doc = libpdfium.PdfDocument(pdf_stream)
    try:
        pdf_metadata = extract_pdf_metadata(doc, source_id)
        page_count = pdf_metadata.page_count

        source_metadata = {
            "source_name": pdf_metadata.filename,
            "source_id": source_id,
            "source_location": source_location,
            "source_type": pdf_metadata.source_type,
            "collection_id": collection_id,
            "date_created": pdf_metadata.date_created,
            "last_modified": pdf_metadata.last_modified,
            "summary": "",
            "partition_id": partition_id,
            "access_level": access_level,
        }

        logger.debug(f"PDF has {page_count} pages.")
        logger.debug(
            f"extract_text={extract_text}, extract_images={extract_images}, "
            f"extract_tables={extract_tables}, extract_charts={extract_charts}, "
            f"extract_infographics={extract_infographics}"
        )

        # Only PAGE and DOCUMENT granularity are supported; anything else
        # collapses to DOCUMENT.
        if text_depth != TextTypeEnum.PAGE:
            text_depth = TextTypeEnum.DOCUMENT

        extracted_data = []
        accumulated_text = []

        # Prepare for table/chart/infographic/text OCR extraction
        pages_for_extractions = []  # Queue of (page_idx, np_image, padding_offset)
        page_to_text_flag_map = {}  # Maps page_idx -> bool (True if OCR text extraction is needed)
        futures = []  # To track asynchronous page-element extraction tasks

        # Does not depend on the page, so compute it once.
        extraction_needed_for_structured = extract_tables or extract_charts or extract_infographics

        with concurrent.futures.ThreadPoolExecutor(max_workers=pdfium_config.workers_per_progress_engine) as executor:

            def _submit_pending_pages() -> None:
                """Hand the queued page images to a worker and reset the queue."""
                futures.append(
                    executor.submit(
                        _extract_page_elements,
                        pages_for_extractions[:],  # copy: the queue is cleared right below
                        page_count,
                        source_metadata,
                        base_unified_metadata,
                        extract_tables=extract_tables,
                        extract_charts=extract_charts,
                        extract_infographics=extract_infographics,
                        page_to_text_flag_map=page_to_text_flag_map,
                        table_output_format=table_output_format,
                        yolox_endpoints=pdfium_config.yolox_endpoints,
                        yolox_infer_protocol=pdfium_config.yolox_infer_protocol,
                        auth_token=pdfium_config.auth_token,
                        execution_trace_log=execution_trace_log,
                    )
                )
                pages_for_extractions.clear()

            # PAGE LOOP
            for page_idx in range(page_count):
                page = doc.get_page(page_idx)
                try:
                    page_width, page_height = page.get_size()
                    page_reading_index = page_idx + 1

                    is_scanned = is_scanned_page(page)
                    extraction_needed_for_text = extract_text and (
                        (text_extraction_method == "pdfium_hybrid" and is_scanned)
                        or text_extraction_method == "ocr"
                    )
                    page_to_text_flag_map[page_idx] = extraction_needed_for_text

                    # Text extraction
                    if extract_text and not extraction_needed_for_text:
                        page_text = _extract_page_text(page)
                        if text_depth == TextTypeEnum.PAGE:
                            text_meta = construct_text_metadata(
                                [page_text],
                                pdf_metadata.keywords,
                                page_reading_index,
                                -1,
                                -1,
                                -1,
                                page_count,
                                text_depth,
                                source_metadata,
                                base_unified_metadata,
                            )
                            extracted_data.append(text_meta)
                        else:
                            accumulated_text.append(page_text)

                    # Image extraction
                    if extract_images:
                        image_data = _extract_page_images(
                            extract_images_method,
                            page,
                            page_reading_index,
                            page_width,
                            page_height,
                            page_count,
                            source_metadata,
                            base_unified_metadata,
                            **extract_images_params,
                        )
                        extracted_data.extend(image_data)

                    # Full page image extraction
                    if extract_page_as_image:
                        if text_extraction_method == "ocr":
                            page_text = ""
                        else:
                            page_text = _extract_page_text(page)
                        image, _ = pdfium_pages_to_numpy(
                            [page], scale_tuple=(16384, 16384), trace_info=execution_trace_log
                        )
                        base64_image = numpy_to_base64(image[0])
                        # Keep the encoded page image within the 2**24 - 1 character limit.
                        if len(base64_image) > 2**24 - 1:
                            base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
                        image_meta = construct_image_metadata_from_base64(
                            base64_image,
                            page_reading_index,
                            page_count,
                            source_metadata,
                            base_unified_metadata,
                            subtype=ContentTypeEnum.PAGE_IMAGE,
                            text=page_text,
                        )
                        extracted_data.append(image_meta)

                    # If we want OCR extraction, rasterize the page and store it
                    if extraction_needed_for_text or extraction_needed_for_structured:
                        image, padding_offsets = pdfium_pages_to_numpy(
                            [page],
                            scale_tuple=(YOLOX_PAGE_IMAGE_PREPROC_WIDTH, YOLOX_PAGE_IMAGE_PREPROC_HEIGHT),
                            padding_tuple=(YOLOX_PAGE_IMAGE_PREPROC_WIDTH, YOLOX_PAGE_IMAGE_PREPROC_HEIGHT),
                            trace_info=execution_trace_log,
                        )
                        pages_for_extractions.append((page_idx, image[0], padding_offsets[0]))

                        # Whenever pages_for_extractions hits YOLOX_MAX_BATCH_SIZE, submit a job
                        if len(pages_for_extractions) >= YOLOX_MAX_BATCH_SIZE:
                            _submit_pending_pages()
                finally:
                    # Release the page handle even if extraction of this page failed.
                    page.close()

            # Submit whatever is still queued. This must not depend on the last
            # page's flags: earlier pages may be queued even when the final
            # page needed no OCR, and with zero pages those flags never exist.
            if pages_for_extractions:
                _submit_pending_pages()

            # Wait for all asynchronous jobs to complete.
            for fut in concurrent.futures.as_completed(futures):
                table_chart_items = fut.result()  # Re-raises any worker exception
                extracted_data.extend(table_chart_items)

        # For document-level text extraction, combine the accumulated text.
        if extract_text and text_depth == TextTypeEnum.DOCUMENT and accumulated_text:
            doc_text_meta = construct_text_metadata(
                accumulated_text,
                pdf_metadata.keywords,
                -1,
                -1,
                -1,
                -1,
                page_count,
                text_depth,
                source_metadata,
                base_unified_metadata,
            )
            extracted_data.append(doc_text_meta)
    finally:
        # Always release the document, including on failure.
        doc.close()

    logger.debug(f"Extracted {len(extracted_data)} items from PDF.")
    return extracted_data