nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,433 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2024, NVIDIA CORPORATION.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ import os
20
+ import io
21
+ import logging
22
+ from datetime import datetime
23
+ from typing import Dict, IO, Any
24
+ from typing import List
25
+ from typing import Optional
26
+ from typing import Tuple
27
+
28
+ import numpy as np
29
+ from PIL import Image
30
+
31
+ # from wand.image import Image as WandImage
32
+
33
+ from nv_ingest_api.internal.enums.common import AccessLevelEnum
34
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
35
+ YOLOX_PAGE_CLASS_LABELS,
36
+ YOLOX_PAGE_DEFAULT_VERSION,
37
+ YoloxPageElementsModelInterface,
38
+ get_yolox_page_version,
39
+ )
40
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
41
+ from nv_ingest_api.util.image_processing.transforms import crop_image, numpy_to_base64
42
+ from nv_ingest_api.util.metadata.aggregators import (
43
+ CroppedImageWithContent,
44
+ construct_page_element_metadata,
45
+ construct_image_metadata_from_base64,
46
+ )
47
+ from nv_ingest_api.util.nim import create_inference_client
48
+
49
logger = logging.getLogger(__name__)

# Passed as ``max_batch_size`` to the YOLOX inference call.
YOLOX_MAX_BATCH_SIZE = 8

# Formats decoded directly as raster images.
RAW_FILE_FORMATS = ["jpeg", "jpg", "png", "tiff", "bmp"]
# Formats routed through a conversion step before processing.
PREPROC_FILE_FORMATS = ["svg"]

# Every document type the image extractor accepts.
SUPPORTED_FILE_TYPES = [*RAW_FILE_FORMATS, *PREPROC_FILE_FORMATS]
57
+
58
+
59
def load_and_preprocess_image(image_stream: io.BytesIO) -> np.ndarray:
    """
    Decode an image from a bytestream and return it as an RGB numpy array.

    Parameters
    ----------
    image_stream : io.BytesIO
        A bytestream holding the encoded image file.

    Returns
    -------
    np.ndarray
        The decoded image, converted to RGB mode, as a numpy array.
    """
    # Decode with PIL and force RGB mode regardless of the source image mode.
    rgb_image = Image.open(image_stream).convert("RGB")

    return np.asarray(rgb_image)
80
+
81
+
82
def convert_svg_to_bitmap(image_stream: io.BytesIO) -> np.ndarray:
    """
    Placeholder for converting an SVG bytestream to a bitmap.

    The conversion is currently disabled: the stream is not read and the
    function returns ``None`` instead of an array.

    Parameters
    ----------
    image_stream : io.BytesIO
        A bytestream of the SVG file (currently unused).

    Returns
    -------
    np.ndarray
        Intended to be the rasterized image as a numpy array; at present the
        function always returns ``None``.
    """
    # Disabled reference implementation (requires ``wand`` / ImageMagick):
    #
    # with WandImage(blob=image_stream.read(), format="svg") as img:
    #     img.format = "png"
    #     png_data = img.make_blob()
    #
    # processed_image = Image.open(io.BytesIO(png_data)).convert("RGB")
    # image_array = np.asarray(processed_image, dtype=np.float32)
    # return image_array
    return None
110
+
111
+
112
def extract_page_element_images(
    annotation_dict: Dict[str, List[List[float]]],
    original_image: np.ndarray,
    page_idx: int,
    page_elements: List[Tuple[int, "CroppedImageWithContent"]],
) -> None:
    """
    Crop every detected page element out of an image and append it to ``page_elements``.

    Parameters
    ----------
    annotation_dict : dict of {str : list of list of float}
        Detected objects keyed by class label. Each value is a list of detections, and each
        detection is a list of four box coordinates followed by one trailing value that is
        discarded (presumably the confidence score).
    original_image : np.ndarray
        The image the detections refer to. The first two entries of its ``shape`` are recorded
        on every emitted element as ``max_width`` and ``max_height``.
    page_idx : int
        The index of the current page being processed.
    page_elements : list of tuple of (int, CroppedImageWithContent)
        Output list, mutated in place. One ``(page_idx, CroppedImageWithContent)`` tuple is
        appended per detection.

    Returns
    -------
    None

    Notes
    -----
    When ``annotation_dict`` contains every label in ``YOLOX_PAGE_CLASS_LABELS``, all of those
    labels are processed. Otherwise only table, chart and infographic detections are processed.
    The fallback accepts the ``"infographic"`` label used elsewhere in this module as well as
    the legacy ``"infographics"`` spelling.

    An element is appended even when ``crop_image`` returns ``None``; its ``image`` is then ``None``.

    Examples
    --------
    >>> annotation_dict = {"table": [[10, 10, 500, 500, 0.8]], "chart": [[600, 600, 900, 900, 0.9]]}
    >>> original_image = np.random.rand(1536, 1536, 3)
    >>> page_elements = []
    >>> extract_page_element_images(annotation_dict, original_image, 0, page_elements)
    >>> len(page_elements)
    2
    """
    # NOTE(review): for an (H, W, C) array this binds ``width`` to H and ``height`` to W, and the
    # box handling below uses a matching transposed naming. Kept as-is because downstream
    # consumers may rely on it -- confirm the intended convention.
    width, height, *_ = original_image.shape

    # Nothing detected for this image.
    if not annotation_dict:
        return

    if set(YOLOX_PAGE_CLASS_LABELS) <= annotation_dict.keys():
        labels = YOLOX_PAGE_CLASS_LABELS
    else:
        # "infographic" matches the label the rest of this module filters on;
        # "infographics" is retained so dictionaries using the old key keep working.
        labels = ["table", "chart", "infographic", "infographics"]

    for label in labels:
        if label not in annotation_dict:
            continue

        for detection in annotation_dict[label]:
            # Drop the trailing value; the remaining four entries are the box.
            *bbox, _ = detection
            h1, w1, h2, w2 = bbox

            cropped_img = crop_image(original_image, (int(h1), int(w1), int(h2), int(w2)))
            base64_img = numpy_to_base64(cropped_img) if cropped_img is not None else None

            element = CroppedImageWithContent(
                content="",
                image=base64_img,
                bbox=(int(w1), int(h1), int(w2), int(h2)),
                max_width=width,
                max_height=height,
                type_string=label,
            )
            page_elements.append((page_idx, element))
183
+
184
+
185
def extract_page_elements_from_images(
    images: List[np.ndarray],
    config: ImageConfigSchema,
    trace_info: Optional[List] = None,
) -> List[Tuple[int, object]]:
    """
    Run YOLOX page-element detection over a list of images and collect the cropped elements.

    Parameters
    ----------
    images : List[np.ndarray]
        Images in NumPy array format.
    config : ImageConfigSchema
        Configuration providing ``yolox_endpoints``, ``auth_token`` and ``yolox_infer_protocol``.
    trace_info : Optional[List], optional
        Tracing data forwarded to the inference call.

    Returns
    -------
    List[Tuple[int, object]]
        ``(image_index, CroppedImageWithContent)`` tuples, one per detected element.

    Raises
    ------
    TimeoutError
        Logged and re-raised when raised during client setup, inference or cropping.
    Exception
        Any other error from those steps is logged with its traceback and re-raised.
    """
    # The HTTP endpoint is assumed to be the second entry of the endpoint pair.
    http_endpoint = config.yolox_endpoints[1]

    # Ask the endpoint which model version it serves; fall back to the default on any failure.
    yolox_version = YOLOX_PAGE_DEFAULT_VERSION
    if http_endpoint:
        try:
            yolox_version = get_yolox_page_version(http_endpoint)
        except Exception as e:
            logger.warning(f"Failed to get YOLOX model name from endpoint: {e}. Using default.")

    page_elements = []

    try:
        client = create_inference_client(
            config.yolox_endpoints,
            YoloxPageElementsModelInterface(version=yolox_version),
            config.auth_token,
            config.yolox_infer_protocol,
        )

        # v3 models are addressed under a different model name than earlier versions.
        model_name = "pipeline" if yolox_version.endswith("-v3") else "yolox_ensemble"

        # One call for all images; the client is handed the maximum batch size.
        inference_results = client.infer(
            {"images": images},
            model_name=model_name,
            max_batch_size=YOLOX_MAX_BATCH_SIZE,
            input_names=["INPUT_IMAGES", "THRESHOLDS"],
            dtypes=["BYTES", "FP32"],
            output_names=["OUTPUT"],
            trace_info=trace_info,
            stage_name="pdf_extraction",
        )

        # Pair each result with its source image; the position doubles as the page index.
        for image_idx, (annotations, source_image) in enumerate(zip(inference_results, images)):
            extract_page_element_images(annotations, source_image, image_idx, page_elements)

    except TimeoutError:
        logger.error("Timeout error during table/chart extraction.")
        raise

    except Exception as e:
        logger.exception(f"Unhandled error during table/chart extraction: {str(e)}")
        raise

    logger.debug(f"Extracted {len(page_elements)} tables and charts from image.")
    return page_elements
266
+
267
+
268
def unstructured_image_extractor(
    *,
    image_stream: IO[bytes],
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extraction_config: Dict[str, Any],
    extraction_trace_log: Optional[Dict[str, Any]] = None,
) -> List[Any]:
    """
    Extract primitives from an unstructured image bytestream.

    The image is decoded, passed through page-element detection when any structured extraction
    is requested, and the detected elements are filtered by the extraction flags and converted
    to metadata entries. If nothing was extracted and ``extract_images`` is set, the whole image
    is emitted as a single image entry.

    Parameters
    ----------
    image_stream : IO[bytes]
        A bytestream (e.g. io.BytesIO) containing the image file data.
    extract_text : bool
        Flag specifying whether to keep text elements ("title", "paragraph", "header_footer").
    extract_images : bool
        Flag specifying whether to emit the whole image when no structured element was extracted.
    extract_infographics : bool
        Currently ignored: the effective flag is read from
        ``extraction_config["extract_infographics"]`` (default ``False``).
    extract_tables : bool
        Flag specifying whether to extract tables.
    extract_charts : bool
        Flag specifying whether to extract charts.
    extraction_config : Dict[str, Any]
        Additional extraction parameters. Keys read here are "document_type", "row_data",
        "metadata_column", "image_extraction_config", "extract_infographics" and
        "extract_method".
    extraction_trace_log : Optional[Dict[str, Any]], optional
        Trace information forwarded to the detection step, by default None.

    Returns
    -------
    List[Any]
        A list of extracted data items (e.g., metadata entries) from the image.

    Raises
    ------
    ValueError
        If the document type is unsupported.
    Exception
        Any error raised during page-element extraction is logged and re-raised.
    """
    # Determine the type of the document from the extraction config.
    document_type: str = extraction_config.get("document_type", "unknown")
    logger.debug(f"Extracting {document_type.upper()} image with image extractor.")

    # Ensure the document type is supported.
    if document_type not in SUPPORTED_FILE_TYPES:
        raise ValueError(f"Unsupported document type: {document_type}")

    # Retrieve additional row-specific data and source identifier.
    row_data: Dict[str, Any] = extraction_config.get("row_data", {})
    source_id: str = row_data.get("source_id", "unknown_source")

    # Build source metadata based on row data.
    base_unified_metadata: Dict[str, Any] = row_data.get(extraction_config.get("metadata_column", "metadata"), {})
    current_iso_datetime: str = datetime.now().isoformat()
    source_metadata: Dict[str, Any] = {
        # Append the document type as an extension only when the id has none.
        "source_name": source_id if os.path.splitext(source_id)[1] else f"{source_id}.{document_type}",
        "source_id": source_id,
        "source_location": row_data.get("source_location", ""),
        "source_type": document_type,
        "collection_id": row_data.get("collection_id", ""),
        "date_created": row_data.get("date_created", current_iso_datetime),
        "last_modified": row_data.get("last_modified", current_iso_datetime),
        "summary": f"Raw {document_type} image extracted from source {source_id}",
        "partition_id": row_data.get("partition_id", -1),
        "access_level": row_data.get("access_level", AccessLevelEnum.UNKNOWN),
    }

    # The infographics flag comes from the config, not from the keyword argument.
    extract_infographics = extraction_config.get("extract_infographics", False)
    text_extraction_method = extraction_config.get("extract_method", "ocr")

    # Log which primitives are requested for extraction.
    logger.debug(f"Extract text: {extract_text} (not supported yet for raw images)")
    logger.debug(f"Extract images: {extract_images} (not supported yet for raw images)")
    logger.debug(f"Extract tables: {extract_tables}")
    logger.debug(f"Extract charts: {extract_charts}")
    logger.debug(f"Extract infographics: {extract_infographics}")

    # Preprocess the image based on the document type.
    if document_type in RAW_FILE_FORMATS:
        logger.debug(f"Loading and preprocessing {document_type} image.")
        image_array = load_and_preprocess_image(image_stream)
    elif document_type in PREPROC_FILE_FORMATS:
        logger.debug(f"Converting {document_type} to bitmap.")
        image_array = convert_svg_to_bitmap(image_stream)
    else:
        raise ValueError(f"Unsupported document type: {document_type}")

    extracted_data: List[Any] = []

    if extract_text and text_extraction_method != "ocr":
        logger.warning(
            f"Text extraction method '{text_extraction_method}' is not supported for raw images. "
            "Defaulting to 'ocr'."
        )

    # Run detection if any text, table, chart or infographic extraction is requested.
    if extract_text or extract_tables or extract_charts or extract_infographics:
        try:
            page_elements = extract_page_elements_from_images(
                [image_array],
                config=extraction_config.get("image_extraction_config"),
                trace_info=extraction_trace_log,
            )
            for _, element in page_elements:
                # Skip elements that shouldn't be extracted based on flags.
                element_type = element.type_string
                if (not extract_tables) and (element_type == "table"):
                    continue
                if (not extract_charts) and (element_type == "chart"):
                    continue
                if (not extract_infographics) and (element_type == "infographic"):
                    continue
                if (not extract_text) and (element_type in {"title", "paragraph", "header_footer"}):
                    continue

                extracted_data.append(
                    construct_page_element_metadata(
                        element,
                        page_idx=0,  # Treat single image as one page.
                        page_count=1,
                        source_metadata=source_metadata,
                        base_unified_metadata=base_unified_metadata,
                    )
                )
        except Exception as e:
            logger.error(f"Error extracting tables/charts from image: {e}")
            raise

    # Image extraction stub: if no structured elements were extracted and image extraction is requested.
    if extract_images and not extracted_data:
        extracted_data.append(
            construct_image_metadata_from_base64(
                numpy_to_base64(image_array),
                page_idx=0,  # Treat single image as one page.
                page_count=1,
                source_metadata=source_metadata,
                base_unified_metadata=base_unified_metadata,
            )
        )

    logger.debug(f"Extracted {len(extracted_data)} items from the image.")
    return extracted_data