nv-ingest-api 2025.4.16.dev20250416__py3-none-any.whl → 2025.4.18.dev20250418__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.18.dev20250418.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.16.dev20250416.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.18.dev20250418.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.18.dev20250418.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.18.dev20250418.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,403 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2024, NVIDIA CORPORATION.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ import os
20
+ import io
21
+ import logging
22
+ from datetime import datetime
23
+ from typing import Dict, IO, Any
24
+ from typing import List
25
+ from typing import Optional
26
+ from typing import Tuple
27
+
28
+ import numpy as np
29
+ from PIL import Image
30
+
31
+ # from wand.image import Image as WandImage
32
+
33
+ from nv_ingest_api.internal.enums.common import AccessLevelEnum
34
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
35
+ YoloxPageElementsModelInterface,
36
+ get_yolox_model_name,
37
+ )
38
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
39
+ from nv_ingest_api.util.image_processing.transforms import crop_image, numpy_to_base64
40
+ from nv_ingest_api.util.metadata.aggregators import (
41
+ CroppedImageWithContent,
42
+ construct_page_element_metadata,
43
+ construct_image_metadata_from_base64,
44
+ )
45
+ from nv_ingest_api.util.nim import create_inference_client
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
# Upper bound on the batch size requested from the YOLOX inference client.
YOLOX_MAX_BATCH_SIZE = 8

# Raster formats that are decoded directly into a pixel array.
RAW_FILE_FORMATS = ["jpeg", "jpg", "png", "tiff", "bmp"]
# Formats that need a conversion step before they can be processed.
PREPROC_FILE_FORMATS = ["svg"]

# Every document type the image extractor accepts. Built from the two lists
# above so a format added to either one is automatically accepted as well.
SUPPORTED_FILE_TYPES = RAW_FILE_FORMATS + PREPROC_FILE_FORMATS
55
+
56
+
57
def load_and_preprocess_image(image_stream: io.BytesIO) -> np.ndarray:
    """
    Decode an image bytestream into an RGB pixel array.

    Parameters
    ----------
    image_stream : io.BytesIO
        A bytestream holding the encoded image file.

    Returns
    -------
    np.ndarray
        The decoded image, converted to RGB mode, as a numpy array.
    """
    # Normalise to RGB so every caller receives the same channel layout,
    # whatever mode the file was stored in.
    rgb_image = Image.open(image_stream).convert("RGB")
    return np.asarray(rgb_image)
78
+
79
+
80
def convert_svg_to_bitmap(image_stream: io.BytesIO) -> np.ndarray:
    """
    Placeholder for converting an SVG bytestream to a bitmap.

    The conversion is currently not implemented: the stream is not read and
    the function returns ``None``, even though the signature advertises an
    ``np.ndarray``. Callers must be prepared for that.

    Parameters
    ----------
    image_stream : io.BytesIO
        A bytestream of the SVG file (unused for now).

    Returns
    -------
    None
        No bitmap is produced at present.
    """
    # Intended approach (disabled, like the `wand` import at the top of the
    # module): render the SVG to PNG with Wand/ImageMagick, reload the PNG
    # through PIL as RGB and return it as a float32 numpy array.
    return None
108
+
109
+
110
def extract_page_element_images(
    annotation_dict: Dict[str, List[List[float]]],
    original_image: np.ndarray,
    page_idx: int,
    page_elements: List[Tuple[int, "CroppedImageWithContent"]],
) -> None:
    """
    Crop every detected table and chart out of an image and collect the crops.

    Parameters
    ----------
    annotation_dict : dict of {str : list of list of float}
        Detection results keyed by label. Only the "table" and "chart" keys are
        read. Each entry is a list of detections, and each detection is four
        coordinates followed by one trailing value that is discarded.
    original_image : np.ndarray
        The image the detections refer to; must have at least two dimensions.
    page_idx : int
        Index stored alongside every element appended to `page_elements`.
    page_elements : list of tuple of (int, CroppedImageWithContent)
        Output list, extended in place with `(page_idx, CroppedImageWithContent)`.

    Returns
    -------
    None

    Notes
    -----
    No model inference happens here. Each crop is base64-encoded with
    `numpy_to_base64`; when `crop_image` returns ``None`` the element is still
    appended, with ``image=None``. The element's `content` is an empty string.
    """
    # NOTE(review): numpy image shapes are usually (rows, cols, ...), so the
    # first axis ends up in `max_width` below — confirm what consumers expect.
    first_axis, second_axis, *_ = original_image.shape

    if not annotation_dict:
        return

    for label in ("table", "chart"):
        if label not in annotation_dict:
            continue

        for detection in annotation_dict[label]:
            *coords, _trailing = detection
            h1, w1, h2, w2 = coords

            # NOTE(review): the crop receives the coordinates in detection
            # order, while the stored bbox swaps each pair — verify against
            # crop_image's expected ordering.
            crop = crop_image(original_image, (int(h1), int(w1), int(h2), int(w2)))
            if crop is not None:
                encoded_crop = numpy_to_base64(crop)
            else:
                encoded_crop = None

            element = CroppedImageWithContent(
                content="",
                image=encoded_crop,
                bbox=(int(w1), int(h1), int(w2), int(h2)),
                max_width=first_axis,
                max_height=second_axis,
                type_string=label,
            )
            page_elements.append((page_idx, element))
175
+
176
+
177
def extract_page_elements_from_images(
    images: List[np.ndarray],
    config: ImageConfigSchema,
    trace_info: Optional[List] = None,
) -> List[Tuple[int, object]]:
    """
    Run YOLOX detection over a list of images and collect table/chart crops.

    Parameters
    ----------
    images : List[np.ndarray]
        Images in NumPy array format.
    config : ImageConfigSchema
        Supplies `yolox_endpoints`, `auth_token` and `yolox_infer_protocol`.
        The entry at index 1 of `yolox_endpoints` is used to look up the model
        name (the code assumes that is the HTTP endpoint).
    trace_info : Optional[List], optional
        Tracing data forwarded unchanged to the inference call.
        NOTE(review): the caller in this module passes a dict — confirm the
        intended type.

    Returns
    -------
    List[Tuple[int, object]]
        `(image_index, CroppedImageWithContent)` tuples, where `image_index`
        is the position of the source image in `images`.

    Raises
    ------
    TimeoutError
        Re-raised after logging when inference times out.
    Exception
        Any other failure is logged with its traceback and re-raised.
    """
    yolox_client = None
    page_elements = []

    # The model name is resolved before the guarded section, so a failure here
    # propagates without the extraction-specific log messages below.
    yolox_model_name = get_yolox_model_name(config.yolox_endpoints[1])

    try:
        yolox_client = create_inference_client(
            config.yolox_endpoints,
            YoloxPageElementsModelInterface(yolox_model_name=yolox_model_name),
            config.auth_token,
            config.yolox_infer_protocol,
        )

        # One call for the whole list; batching is left to the client.
        # NOTE(review): the stage is reported as "pdf_content_extractor" even
        # though the inputs are images — confirm this is intentional.
        inference_results = yolox_client.infer(
            {"images": images},
            model_name="yolox",
            max_batch_size=YOLOX_MAX_BATCH_SIZE,
            trace_info=trace_info,
            stage_name="pdf_content_extractor",
        )

        # Results are paired with their images positionally.
        for image_index, pair in enumerate(zip(inference_results, images)):
            detections, source_image = pair
            extract_page_element_images(detections, source_image, image_index, page_elements)

    except TimeoutError:
        logger.error("Timeout error during table/chart extraction.")
        raise

    except Exception as e:
        logger.exception(f"Unhandled error during table/chart extraction: {str(e)}")
        raise

    finally:
        # Release the client whether or not inference succeeded.
        if yolox_client:
            yolox_client.close()

    logger.debug(f"Extracted {len(page_elements)} tables and charts from image.")
    return page_elements
252
+
253
+
254
def unstructured_image_extractor(
    *,
    image_stream: IO[bytes],
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extraction_config: Dict[str, Any],
    extraction_trace_log: Optional[Dict[str, Any]] = None,
) -> List[Any]:
    """
    Extract primitives from a single image supplied as a bytestream.

    The image is decoded according to ``extraction_config["document_type"]``.
    When table, chart or infographic extraction is requested it is passed to
    ``extract_page_elements_from_images``, and every element that call returns
    is converted to metadata (results are not filtered by which flag triggered
    the detection). If nothing was extracted and ``extract_images`` is set, the
    whole image is emitted as one image entry instead.

    Parameters
    ----------
    image_stream : IO[bytes]
        A bytestream (e.g. io.BytesIO) containing the image file data.
    extract_text : bool
        Text extraction is not supported for raw images; a warning is logged
        when this is set.
    extract_images : bool
        When set and no page elements were extracted, the full image is
        returned as image metadata.
    extract_infographics : bool
        NOTE(review): the value passed here is not used. It is replaced by
        ``extraction_config.get("extract_infographics", False)`` before it is
        consulted — confirm whether the argument should act as the fallback.
    extract_tables : bool
        Flag specifying whether to extract tables.
    extract_charts : bool
        Flag specifying whether to extract charts.
    extraction_config : Dict[str, Any]
        Keys read: "document_type", "row_data", "metadata_column",
        "extract_infographics" and "image_extraction_config".
    extraction_trace_log : Optional[Dict[str, Any]], optional
        Trace information forwarded to page element extraction, by default None.

    Returns
    -------
    List[Any]
        The extracted metadata items.

    Raises
    ------
    ValueError
        If the document type is not in ``SUPPORTED_FILE_TYPES``.
    Exception
        Any error raised during page element extraction is logged and re-raised.
    """
    document_type = extraction_config.get("document_type", "unknown")
    logger.debug(f"Extracting {document_type.upper()} image with image extractor.")

    if document_type not in SUPPORTED_FILE_TYPES:
        raise ValueError(f"Unsupported document type: {document_type}")

    # Row-level data describing where the image came from.
    row_data = extraction_config.get("row_data", {})
    source_id = row_data.get("source_id", "unknown_source")
    metadata_column = extraction_config.get("metadata_column", "metadata")
    base_unified_metadata = row_data.get(metadata_column, {})

    # A single timestamp serves as the default for both date fields.
    current_iso_datetime = datetime.now().isoformat()

    # Append the document type as an extension only when the id has none.
    if os.path.splitext(source_id)[1]:
        source_name = source_id
    else:
        source_name = f"{source_id}.{document_type}"

    source_metadata = {
        "source_name": source_name,
        "source_id": source_id,
        "source_location": row_data.get("source_location", ""),
        "source_type": document_type,
        "collection_id": row_data.get("collection_id", ""),
        "date_created": row_data.get("date_created", current_iso_datetime),
        "last_modified": row_data.get("last_modified", current_iso_datetime),
        "summary": f"Raw {document_type} image extracted from source {source_id}",
        "partition_id": row_data.get("partition_id", -1),
        "access_level": row_data.get("access_level", AccessLevelEnum.UNKNOWN),
    }

    # The config value replaces the keyword argument from here on.
    extract_infographics = extraction_config.get("extract_infographics", False)

    logger.debug(f"Extract text: {extract_text} (not supported yet for raw images)")
    logger.debug(f"Extract images: {extract_images} (not supported yet for raw images)")
    logger.debug(f"Extract tables: {extract_tables}")
    logger.debug(f"Extract charts: {extract_charts}")
    logger.debug(f"Extract infographics: {extract_infographics}")

    # Decode the stream into a pixel array.
    if document_type in RAW_FILE_FORMATS:
        logger.debug(f"Loading and preprocessing {document_type} image.")
        image_array = load_and_preprocess_image(image_stream)
    elif document_type in PREPROC_FILE_FORMATS:
        logger.debug(f"Converting {document_type} to bitmap.")
        # NOTE(review): whatever convert_svg_to_bitmap returns is used as-is.
        image_array = convert_svg_to_bitmap(image_stream)
    else:
        raise ValueError(f"Unsupported document type: {document_type}")

    extracted_data = []

    if extract_text:
        logger.warning("Text extraction is not supported for raw images.")

    wants_page_elements = extract_tables or extract_charts or extract_infographics
    if wants_page_elements:
        try:
            page_elements = extract_page_elements_from_images(
                [image_array],
                config=extraction_config.get("image_extraction_config"),
                trace_info=extraction_trace_log,
            )
            for item in page_elements:
                # item is (image_index, element); only the element is needed
                # because a lone image is treated as page 0 of 1.
                element_metadata = construct_page_element_metadata(
                    item[1],
                    page_idx=0,
                    page_count=1,
                    source_metadata=source_metadata,
                    base_unified_metadata=base_unified_metadata,
                )
                extracted_data.append(element_metadata)
        except Exception as e:
            logger.error(f"Error extracting tables/charts from image: {e}")
            raise

    # Fall back to the whole image only when nothing structured was found.
    if extract_images and not extracted_data:
        whole_image_metadata = construct_image_metadata_from_base64(
            numpy_to_base64(image_array),
            page_idx=0,
            page_count=1,
            source_metadata=source_metadata,
            base_unified_metadata=base_unified_metadata,
        )
        extracted_data.append(whole_image_metadata)

    logger.debug(f"Extracted {len(extracted_data)} items from the image.")
    return extracted_data
@@ -0,0 +1,253 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Any
7
+ from typing import Dict
8
+ from typing import List
9
+ from typing import Optional
10
+ from typing import Tuple
11
+
12
+ import pandas as pd
13
+
14
+ from nv_ingest_api.internal.primitives.nim import NimClient
15
+ from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
16
+ from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import (
17
+ InfographicExtractorSchema,
18
+ )
19
+ from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
20
+ from nv_ingest_api.util.nim import create_inference_client
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
# Smallest image dimensions, in array elements along each axis, that are
# forwarded to PaddleOCR; anything smaller is skipped by the size filter.
PADDLE_MIN_WIDTH, PADDLE_MIN_HEIGHT = 32, 32
26
+
27
+
28
+ def _filter_infographic_images(
29
+ base64_images: List[str],
30
+ ) -> Tuple[List[str], List[int], List[Tuple[str, Optional[Any], Optional[Any]]]]:
31
+ """
32
+ Filters base64-encoded images based on minimum size requirements.
33
+
34
+ Parameters
35
+ ----------
36
+ base64_images : List[str]
37
+ List of base64-encoded image strings.
38
+
39
+ Returns
40
+ -------
41
+ Tuple[List[str], List[int], List[Tuple[str, Optional[Any], Optional[Any]]]]
42
+ - valid_images: List of images that meet the size requirements.
43
+ - valid_indices: Original indices of valid images.
44
+ - results: Initialized results list, with invalid images marked as (img, None, None).
45
+ """
46
+ results: List[Tuple[str, Optional[Any], Optional[Any]]] = [("", None, None)] * len(base64_images)
47
+ valid_images: List[str] = []
48
+ valid_indices: List[int] = []
49
+
50
+ for i, img in enumerate(base64_images):
51
+ array = base64_to_numpy(img)
52
+ height, width = array.shape[0], array.shape[1]
53
+ if width >= PADDLE_MIN_WIDTH and height >= PADDLE_MIN_HEIGHT:
54
+ valid_images.append(img)
55
+ valid_indices.append(i)
56
+ else:
57
+ # Mark image as skipped if it does not meet size requirements.
58
+ results[i] = (img, None, None)
59
+ return valid_images, valid_indices, results
60
+
61
+
62
def _update_infographic_metadata(
    base64_images: List[str],
    paddle_client: NimClient,
    worker_pool_size: int = 8,  # Not currently used
    trace_info: Optional[Dict] = None,
) -> List[Tuple[str, Optional[Any], Optional[Any]]]:
    """
    Filters base64-encoded images and runs PaddleOCR on those large enough.

    Images below the minimum size are reported as (image, None, None). When no
    image passes the filter (including an empty input), no inference call is
    made.

    Parameters
    ----------
    base64_images : List[str]
        List of base64-encoded images.
    paddle_client : NimClient
        Client instance for PaddleOCR inference.
    worker_pool_size : int, optional
        Worker pool size (currently not used), by default 8.
    trace_info : Optional[Dict], optional
        Optional trace information forwarded to the inference call.

    Returns
    -------
    List[Tuple[str, Optional[Any], Optional[Any]]]
        One tuple per input image, in input order:
        (base64_image, paddle_res[0], paddle_res[1]).
        NOTE(review): the caller treats the last element as the text
        predictions; confirm the order of the pair returned by the PaddleOCR
        model interface.

    Raises
    ------
    ValueError
        If the number of inference results differs from the number of images
        submitted.
    Exception
        Any error raised by the inference call is logged and re-raised.
    """
    logger.debug(f"Running infographic extraction using protocol {paddle_client.protocol}")

    valid_images, valid_indices, results = _filter_infographic_images(base64_images)

    # Nothing to send: skip the inference round-trip and return the
    # skipped-image placeholders as they are.
    if not valid_images:
        return results

    # worker_pool_size is not used in current implementation.
    _ = worker_pool_size

    try:
        paddle_results = paddle_client.infer(
            data={"base64_images": valid_images},
            model_name="paddle",
            stage_name="infographic_data_extraction",
            max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
            trace_info=trace_info,
        )
    except Exception as e:
        logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
        raise

    if len(paddle_results) != len(valid_images):
        raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")

    # Put each result back at the position its image had in the input list.
    for original_index, paddle_res in zip(valid_indices, paddle_results):
        results[original_index] = (base64_images[original_index], paddle_res[0], paddle_res[1])

    return results
120
+
121
+
122
def _create_clients(
    paddle_endpoints: Tuple[str, str],
    paddle_protocol: str,
    auth_token: str,
) -> NimClient:
    """
    Build the inference client used for PaddleOCR calls.

    Parameters
    ----------
    paddle_endpoints : Tuple[str, str]
        Endpoint pair passed through to `create_inference_client`.
    paddle_protocol : str
        Inference protocol to use; also written to the debug log.
    auth_token : str
        Token passed through to `create_inference_client`.

    Returns
    -------
    NimClient
        The client returned by `create_inference_client`.
    """
    model_interface = PaddleOCRModelInterface()

    logger.debug(f"Inference protocols: paddle={paddle_protocol}")

    return create_inference_client(
        endpoints=paddle_endpoints,
        model_interface=model_interface,
        auth_token=auth_token,
        infer_protocol=paddle_protocol,
    )
139
+
140
+
141
+ def _meets_infographic_criteria(row: pd.Series) -> bool:
142
+ """
143
+ Determines if a DataFrame row meets the criteria for infographic extraction.
144
+
145
+ A row qualifies if:
146
+ - It contains a 'metadata' dictionary.
147
+ - The 'content_metadata' in metadata has type "structured" and subtype "infographic".
148
+ - The 'table_metadata' is not None.
149
+ - The 'content' is not None or an empty string.
150
+
151
+ Parameters
152
+ ----------
153
+ row : pd.Series
154
+ A row from the DataFrame.
155
+
156
+ Returns
157
+ -------
158
+ bool
159
+ True if the row meets all criteria; False otherwise.
160
+ """
161
+ metadata = row.get("metadata", {})
162
+ if not metadata:
163
+ return False
164
+
165
+ content_md = metadata.get("content_metadata", {})
166
+ if (
167
+ content_md.get("type") == "structured"
168
+ and content_md.get("subtype") == "infographic"
169
+ and metadata.get("table_metadata") is not None
170
+ and metadata.get("content") not in [None, ""]
171
+ ):
172
+ return True
173
+
174
+ return False
175
+
176
+
177
def extract_infographic_data_from_image_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    extraction_config: "InfographicExtractorSchema",
    execution_trace_log: Optional[Dict] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Extracts infographic data from a DataFrame in bulk.

    Rows accepted by `_meets_infographic_criteria` have the base64 image in
    ``metadata["content"]`` sent through PaddleOCR. The recognised text is
    joined with spaces and written to
    ``metadata["table_metadata"]["table_content"]`` (None when no text was
    returned). The metadata dictionaries are updated in place.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        DataFrame containing the content from which infographic data is to be extracted.
    task_config : Dict[str, Any]
        Dictionary containing task properties and configurations (unused).
    extraction_config : InfographicExtractorSchema
        The validated configuration object; its `endpoint_config` supplies the
        PaddleOCR endpoints, protocol, auth token and worker count.
    execution_trace_log : Optional[Dict], optional
        Optional trace information for debugging or logging. Defaults to None,
        in which case an empty dictionary is created.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        The DataFrame and ``{"trace_info": execution_trace_log}``. The same
        shape is returned on every path, including for an empty DataFrame.

    Raises
    ------
    Exception
        Any error raised while selecting rows, running inference or writing
        results is logged with its traceback and re-raised.
    """
    _ = task_config  # Unused

    if execution_trace_log is None:
        execution_trace_log = {}
        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

    # Nothing to do; return the same structure as the other exits so callers
    # can always read the "trace_info" key.
    if df_extraction_ledger.empty:
        return df_extraction_ledger, {"trace_info": execution_trace_log}

    endpoint_config = extraction_config.endpoint_config
    paddle_client = _create_clients(
        endpoint_config.paddle_endpoints,
        endpoint_config.paddle_infer_protocol,
        endpoint_config.auth_token,
    )

    try:
        # Identify rows that meet the infographic criteria.
        mask = df_extraction_ledger.apply(_meets_infographic_criteria, axis=1)
        valid_indices = df_extraction_ledger[mask].index.tolist()

        # If no rows meet the criteria, return early.
        if not valid_indices:
            return df_extraction_ledger, {"trace_info": execution_trace_log}

        # Extract base64 images from valid rows.
        base64_images = [df_extraction_ledger.at[idx, "metadata"]["content"] for idx in valid_indices]

        # Call bulk update to extract infographic data.
        bulk_results = _update_infographic_metadata(
            base64_images=base64_images,
            paddle_client=paddle_client,
            worker_pool_size=endpoint_config.workers_per_progress_engine,
            trace_info=execution_trace_log,
        )

        # Write the extracted results back into the DataFrame. bulk_results is
        # ordered like base64_images, i.e. like valid_indices.
        for result_idx, df_idx in enumerate(valid_indices):
            # Each result is a 3-tuple whose first element is the base64 image;
            # the last element is used as the text predictions.
            _, _, text_predictions = bulk_results[result_idx]
            table_content = " ".join(text_predictions) if text_predictions else None
            df_extraction_ledger.at[df_idx, "metadata"]["table_metadata"]["table_content"] = table_content

        return df_extraction_ledger, {"trace_info": execution_trace_log}

    except Exception:
        err_msg = "Error occurred while extracting infographic data."
        logger.exception(err_msg)
        raise

    finally:
        # Always release the client, including on the early return above.
        paddle_client.close()