nv-ingest-api 2025.4.20.dev20250420__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,353 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from typing import Any, Union
8
+ from typing import Dict
9
+ from typing import List
10
+ from typing import Optional
11
+ from typing import Tuple
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
+ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import get_version
17
+ from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
18
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskChartExtraction
19
+ from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_graphic_elements_and_paddle_output
20
+ from nv_ingest_api.util.image_processing.table_and_chart import process_yolox_graphic_elements
21
+ from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
22
+ from nv_ingest_api.internal.primitives.nim import NimClient
23
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxGraphicElementsModelInterface
24
+ from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
25
+ from nv_ingest_api.util.nim import create_inference_client
26
+
27
# Minimum array width/height an image must have to be sent for chart extraction;
# _filter_valid_chart_images marks anything smaller as skipped.
PADDLE_MIN_WIDTH = 32
PADDLE_MIN_HEIGHT = 32

# Module-level logger, registered under the "morpheus." namespace.
logger = logging.getLogger("morpheus." + __name__)
31
+
32
+
33
+ def _filter_valid_chart_images(
34
+ base64_images: List[str],
35
+ ) -> Tuple[List[str], List[np.ndarray], List[int], List[Tuple[str, Optional[Dict]]]]:
36
+ """
37
+ Filter base64-encoded images based on minimum dimensions for chart extraction.
38
+
39
+ Returns:
40
+ - valid_images: Base64 strings meeting size requirements.
41
+ - valid_arrays: Corresponding numpy arrays.
42
+ - valid_indices: Original indices of valid images.
43
+ - results: Initial results list where invalid images are set to (img, None).
44
+ """
45
+ results: List[Tuple[str, Optional[Dict]]] = [("", None)] * len(base64_images)
46
+ valid_images: List[str] = []
47
+ valid_arrays: List[np.ndarray] = []
48
+ valid_indices: List[int] = []
49
+
50
+ for i, img in enumerate(base64_images):
51
+ array = base64_to_numpy(img)
52
+ height, width = array.shape[0], array.shape[1]
53
+ if width >= PADDLE_MIN_WIDTH and height >= PADDLE_MIN_HEIGHT:
54
+ valid_images.append(img)
55
+ valid_arrays.append(array)
56
+ valid_indices.append(i)
57
+ else:
58
+ # Image is too small; mark as skipped.
59
+ results[i] = (img, None)
60
+ return valid_images, valid_arrays, valid_indices, results
61
+
62
+
63
+ def _run_chart_inference(
64
+ yolox_client: Any,
65
+ paddle_client: Any,
66
+ valid_arrays: List[np.ndarray],
67
+ valid_images: List[str],
68
+ trace_info: Dict,
69
+ ) -> Tuple[List[Any], List[Any]]:
70
+ """
71
+ Run concurrent inference for chart extraction using YOLOX and Paddle.
72
+
73
+ Returns a tuple of (yolox_results, paddle_results).
74
+ """
75
+ data_yolox = {"images": valid_arrays}
76
+ data_paddle = {"base64_images": valid_images}
77
+
78
+ with ThreadPoolExecutor(max_workers=2) as executor:
79
+ future_yolox = executor.submit(
80
+ yolox_client.infer,
81
+ data=data_yolox,
82
+ model_name="yolox",
83
+ stage_name="chart_data_extraction",
84
+ max_batch_size=8,
85
+ trace_info=trace_info,
86
+ )
87
+ future_paddle = executor.submit(
88
+ paddle_client.infer,
89
+ data=data_paddle,
90
+ model_name="paddle",
91
+ stage_name="chart_data_extraction",
92
+ max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
93
+ trace_info=trace_info,
94
+ )
95
+
96
+ try:
97
+ yolox_results = future_yolox.result()
98
+ except Exception as e:
99
+ logger.error(f"Error calling yolox_client.infer: {e}", exc_info=True)
100
+ raise
101
+
102
+ try:
103
+ paddle_results = future_paddle.result()
104
+ except Exception as e:
105
+ logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
106
+ raise
107
+
108
+ return yolox_results, paddle_results
109
+
110
+
111
+ def _validate_chart_inference_results(
112
+ yolox_results: Any, paddle_results: Any, valid_arrays: List[Any], valid_images: List[str]
113
+ ) -> Tuple[List[Any], List[Any]]:
114
+ """
115
+ Ensure inference results are lists and have expected lengths.
116
+
117
+ Raises:
118
+ ValueError if results do not match expected types or lengths.
119
+ """
120
+ if not (isinstance(yolox_results, list) and isinstance(paddle_results, list)):
121
+ raise ValueError("Expected list results from both yolox_client and paddle_client infer calls.")
122
+
123
+ if len(yolox_results) != len(valid_arrays):
124
+ raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
125
+ if len(paddle_results) != len(valid_images):
126
+ raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")
127
+ return yolox_results, paddle_results
128
+
129
+
130
+ def _merge_chart_results(
131
+ base64_images: List[str],
132
+ valid_indices: List[int],
133
+ yolox_results: List[Any],
134
+ paddle_results: List[Any],
135
+ initial_results: List[Tuple[str, Optional[Dict]]],
136
+ ) -> List[Tuple[str, Optional[Dict]]]:
137
+ """
138
+ Merge inference results into the initial results list using the original indices.
139
+
140
+ For each valid image, processes the results from both inference calls and updates the
141
+ corresponding entry in the results list.
142
+ """
143
+ for idx, (yolox_res, paddle_res) in enumerate(zip(yolox_results, paddle_results)):
144
+ # Unpack paddle result into bounding boxes and text predictions.
145
+ bounding_boxes, text_predictions = paddle_res
146
+ yolox_elements = join_yolox_graphic_elements_and_paddle_output(yolox_res, bounding_boxes, text_predictions)
147
+ chart_content = process_yolox_graphic_elements(yolox_elements)
148
+ original_index = valid_indices[idx]
149
+ initial_results[original_index] = (base64_images[original_index], chart_content)
150
+ return initial_results
151
+
152
+
153
def _update_chart_metadata(
    base64_images: List[str],
    yolox_client: Any,
    paddle_client: Any,
    trace_info: Dict,
    worker_pool_size: int = 8,  # Not currently used.
) -> List[Tuple[str, Optional[Dict]]]:
    """
    Given a list of base64-encoded chart images, concurrently call both YOLOX and Paddle
    inference services to extract chart data.

    For each base64-encoded image, returns:
      (original_image_str, joined_chart_content_dict)

    Images that do not meet minimum size requirements are marked as skipped, i.e. returned as
    (original_image_str, None). When no image passes the size filter (including an empty input
    list), the inference services are not called at all.

    Raises:
      ValueError if either inference call returns something other than a list with one entry
      per valid image. Exceptions raised by the inference calls themselves are propagated.
    """
    logger.debug("Running chart extraction using updated concurrency handling.")

    # Initialize results with placeholders and filter valid images.
    valid_images, valid_arrays, valid_indices, results = _filter_valid_chart_images(base64_images)

    # Nothing to infer on: every entry in `results` is already final, so skip the two remote
    # calls instead of sending empty payloads to the inference services.
    if not valid_images:
        return results

    # Run concurrent inference only for valid images.
    yolox_results, paddle_results = _run_chart_inference(
        yolox_client=yolox_client,
        paddle_client=paddle_client,
        valid_arrays=valid_arrays,
        valid_images=valid_images,
        trace_info=trace_info,
    )

    # Validate that the returned inference results are lists of the expected length.
    yolox_results, paddle_results = _validate_chart_inference_results(
        yolox_results, paddle_results, valid_arrays, valid_images
    )

    # Merge the inference results into the results list.
    return _merge_chart_results(base64_images, valid_indices, yolox_results, paddle_results, results)
190
+
191
+
192
def _create_clients(
    yolox_endpoints: Tuple[str, str],
    yolox_protocol: str,
    paddle_endpoints: Tuple[str, str],
    paddle_protocol: str,
    auth_token: str,
) -> Tuple[NimClient, NimClient]:
    """
    Build the YOLOX and Paddle inference clients used for chart extraction.

    The YOLOX version is queried from the endpoint at index 1 of ``yolox_endpoints`` (index 0
    is assumed to be the gRPC endpoint). If the lookup raises or returns a falsy value, a
    warning is logged and ``None`` is passed as the version instead.

    Returns a tuple of (yolox_client, paddle_client).
    """
    http_endpoint = yolox_endpoints[1]

    try:
        # Normalise any falsy answer to None, which the messages below call "the latest version".
        yolox_version = get_version(http_endpoint) or None
        if yolox_version is None:
            logger.warning(
                "Failed to obtain yolox-page-elements version from the endpoint. Falling back to the latest version."
            )
    except Exception:
        logger.warning(
            "Failed to get yolox-page-elements version after 30 seconds. Falling back to the latest version."
        )
        yolox_version = None

    yolox_interface = YoloxGraphicElementsModelInterface(yolox_version=yolox_version)
    paddle_interface = PaddleOCRModelInterface()

    logger.debug(f"Inference protocols: yolox={yolox_protocol}, paddle={paddle_protocol}")

    # YOLOX client first, then Paddle; both share the same auth token.
    client_specs = (
        (yolox_endpoints, yolox_interface, yolox_protocol),
        (paddle_endpoints, paddle_interface, paddle_protocol),
    )
    yolox_client, paddle_client = [
        create_inference_client(
            endpoints=endpoints,
            model_interface=interface,
            auth_token=auth_token,
            infer_protocol=protocol,
        )
        for endpoints, interface, protocol in client_specs
    ]

    return yolox_client, paddle_client
236
+
237
+
238
def extract_chart_data_from_image_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[IngestTaskChartExtraction, Dict[str, Any]],
    extraction_config: ChartExtractorSchema,
    execution_trace_log: Optional[Dict] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Extracts chart data from a DataFrame in a bulk fashion rather than row-by-row.

    Rows are selected when their "metadata" has content_metadata.type == "structured",
    content_metadata.subtype == "chart", a non-None "table_metadata" and a non-empty "content"
    (the base64 image). The extracted chart content is written into
    metadata["table_metadata"]["table_content"] of each selected row, in place.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        DataFrame containing the content from which chart data is to be extracted.
    task_config : Union[IngestTaskChartExtraction, Dict[str, Any]]
        Task properties and configurations. Currently unused.
    extraction_config : ChartExtractorSchema
        The validated configuration object for chart extraction. Its ``endpoint_config``
        supplies the YOLOX/Paddle endpoints, protocols and auth token.
    execution_trace_log : Optional[Dict], optional
        Optional trace information for debugging or logging. Defaults to None, in which case
        an empty dictionary is used.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        A tuple containing the updated DataFrame and a dictionary holding the trace
        information under the key "trace_info". The same shape is returned on every path,
        including when the input DataFrame is empty.

    Raises
    ------
    Exception
        If any error occurs during the chart data extraction process.
    """
    _ = task_config  # Unused variable

    if execution_trace_log is None:
        execution_trace_log = {}
        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

    if df_extraction_ledger.empty:
        # Use the same {"trace_info": ...} envelope as every other return path.
        return df_extraction_ledger, {"trace_info": execution_trace_log}

    endpoint_config = extraction_config.endpoint_config
    yolox_client, paddle_client = _create_clients(
        endpoint_config.yolox_endpoints,
        endpoint_config.yolox_infer_protocol,
        endpoint_config.paddle_endpoints,
        endpoint_config.paddle_infer_protocol,
        endpoint_config.auth_token,
    )

    try:
        # 1) Identify rows that meet criteria in a single pass
        #    - metadata exists
        #    - content_metadata.type == "structured"
        #    - content_metadata.subtype == "chart"
        #    - table_metadata not None
        #    - content (the base64 image) not None or ""
        def meets_criteria(row):
            m = row.get("metadata", {})
            if not m:
                return False

            # `or {}` also covers a "content_metadata" key that is present but set to None.
            content_md = m.get("content_metadata") or {}
            return (
                content_md.get("type") == "structured"
                and content_md.get("subtype") == "chart"
                and m.get("table_metadata") is not None
                and m.get("content") not in [None, ""]
            )

        mask = df_extraction_ledger.apply(meets_criteria, axis=1)
        valid_indices = df_extraction_ledger[mask].index.tolist()

        # If no rows meet the criteria, just return.
        if not valid_indices:
            return df_extraction_ledger, {"trace_info": execution_trace_log}

        # 2) Extract base64 images, in the same order as valid_indices.
        base64_images = [
            df_extraction_ledger.at[idx, "metadata"]["content"]  # guaranteed by meets_criteria
            for idx in valid_indices
        ]

        # 3) Call our bulk _update_chart_metadata to get all results.
        bulk_results = _update_chart_metadata(
            base64_images=base64_images,
            yolox_client=yolox_client,
            paddle_client=paddle_client,
            worker_pool_size=endpoint_config.workers_per_progress_engine,
            trace_info=execution_trace_log,
        )

        # 4) Write the results back to each row's table_metadata.
        #    bulk_results is ordered like base64_images, hence like valid_indices.
        for row_id, idx in enumerate(valid_indices):
            _, chart_content = bulk_results[row_id]
            df_extraction_ledger.at[idx, "metadata"]["table_metadata"]["table_content"] = chart_content

        return df_extraction_ledger, {"trace_info": execution_trace_log}

    except Exception:
        logger.error("Error occurred while extracting chart data.", exc_info=True)

        raise

    finally:
        # Close each client in its own try block so that a failure while closing one
        # cannot prevent the other from being closed.
        for client in (paddle_client, yolox_client):
            if client is None:
                continue
            try:
                client.close()
            except Exception as close_err:
                logger.error(f"Error closing clients: {close_err}", exc_info=True)
@@ -0,0 +1,204 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import base64
7
+ import functools
8
+ import io
9
+ import logging
10
+ from typing import Any, Union, Tuple
11
+ from typing import Dict
12
+ from typing import List
13
+ from typing import Optional
14
+
15
+ import pandas as pd
16
+ from pydantic import BaseModel
17
+
18
+ from nv_ingest_api.internal.extract.image.image_helpers.common import unstructured_image_extractor
19
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExtractorSchema
20
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
21
+
22
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
23
+
24
+
25
@unified_exception_handler
def _decode_and_extract_from_image(
    base64_row: pd.Series,
    task_config: Dict[str, Any],
    validated_extraction_config: ImageExtractorSchema,
    execution_trace_log: Optional[List[Any]] = None,
) -> Any:
    """
    Decode base64-encoded image content from a DataFrame row and extract data from it.

    This function reads the "content" (base64 string) from the row, builds the extraction
    parameters from a per-row copy of ``task_config["params"]`` (adding the remaining row data
    under "row_data" and the row's "document_type"), decodes the base64 content into a BytesIO
    stream and passes everything to ``unstructured_image_extractor``.

    ``task_config`` itself is never modified. The same dictionary is shared across all rows of
    a DataFrame, so removing the extraction flags from it would leave every later row with all
    flags defaulting to False.

    Parameters
    ----------
    base64_row : pd.Series
        A pandas Series representing a row containing base64-encoded content under the key "content",
        a "document_type" and optionally a "source_id".
    task_config : Dict[str, Any]
        A dictionary containing task properties. Its optional "params" entry (dict) may hold the
        boolean flags "extract_text", "extract_images", "extract_tables", "extract_charts" and
        "extract_infographics" (each defaults to False) plus any additional parameters to pass
        to the extraction function.
    validated_extraction_config : ImageExtractorSchema
        A configuration object that contains an attribute `image_extraction_config` to be used when
        extracting image content.
    execution_trace_log : Optional[List[Any]], optional
        Optional trace information to pass to the extraction function (default is None).

    Returns
    -------
    Any
        The extracted data from the decoded image content, as returned by
        ``unstructured_image_extractor``.

    Raises
    ------
    KeyError
        If the "document_type" or "content" key is missing from `base64_row`.
    Exception
        Any other exception raised during decoding or extraction is re-raised as the same
        exception type with a message that names the source.
    """

    # Retrieve document type and initialize source_id.
    document_type: Any = base64_row["document_type"]
    source_id: Optional[Any] = None

    try:
        base64_content: str = base64_row["content"]
    except KeyError as e:
        err_msg = f"decode_and_extract: Missing 'content' key in row: {base64_row}"
        logger.error(err_msg, exc_info=True)
        raise KeyError(err_msg) from e

    try:
        # Retrieve source_id if available.
        source_id = base64_row.get("source_id", None)

        # Build the extraction parameters on a shallow copy so that neither the injected
        # row data nor the popped flags leak into the task_config shared by other rows.
        extract_params: Dict[str, Any] = dict(task_config.get("params") or {})
        extract_params["row_data"] = base64_row.drop(labels=["content"], errors="ignore")
        extract_params["document_type"] = document_type

        # Decode the base64 image content.
        image_bytes: bytes = base64.b64decode(base64_content)
        image_stream: io.BytesIO = io.BytesIO(image_bytes)

        # The flags are passed to the extractor as explicit arguments, so take them out of
        # the parameter dictionary. pop() with a default never raises KeyError.
        extract_text: bool = extract_params.pop("extract_text", False)
        extract_images: bool = extract_params.pop("extract_images", False)
        extract_tables: bool = extract_params.pop("extract_tables", False)
        extract_charts: bool = extract_params.pop("extract_charts", False)
        extract_infographics: bool = extract_params.pop("extract_infographics", False)

        logger.debug(
            f"decode_and_extract: Extracting image content using image_extraction_config: "
            f"{validated_extraction_config.image_extraction_config}"
        )
        if validated_extraction_config.image_extraction_config is not None:
            extract_params["image_extraction_config"] = validated_extraction_config.image_extraction_config

        if execution_trace_log is not None:
            extract_params["trace_info"] = execution_trace_log

        extracted_data: Any = unstructured_image_extractor(
            image_stream=image_stream,
            extract_text=extract_text,
            extract_images=extract_images,
            extract_infographics=extract_infographics,
            extract_tables=extract_tables,
            extract_charts=extract_charts,
            extraction_config=extract_params,
            extraction_trace_log=execution_trace_log,
        )

        return extracted_data

    except Exception as e:
        err_msg = f"decode_and_extract: Unhandled exception for source '{source_id}'. Original error: {e}"
        logger.error(err_msg, exc_info=True)
        raise type(e)(err_msg) from e
135
+
136
+
137
@unified_exception_handler
def extract_primitives_from_image_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Process a DataFrame containing base64-encoded image files and extract primitives from each image.

    This function applies `_decode_and_extract_from_image` to every row of the input DataFrame.
    It then explodes any list results into separate rows, drops missing values, and compiles the
    extracted data into a new DataFrame with columns "document_type", "metadata", and "uuid".

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        Input DataFrame containing image files in base64 encoding. Expected to include columns
        'source_id', 'document_type' and 'content'.
    task_config : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model with instructions and parameters for the image processing task.
        A Pydantic model is converted to a dictionary with ``model_dump()``.
    extraction_config : Any
        A configuration object validated for processing images (e.g., containing `image_extraction_config`).
    execution_trace_log : Optional[Dict[str, Any]], default=None
        An optional dictionary for tracing and logging additional information during processing.

    Returns
    -------
    Tuple[pd.DataFrame, Dict[str, Any]]
        A DataFrame with the extracted image primitives (columns "document_type", "metadata",
        and "uuid"; empty when nothing was extracted or the input is empty), and a dictionary
        containing the trace information under the key "trace_info".

    Raises
    ------
    Exception
        If an error occurs during the image processing stage, the exception is logged and re-raised
        as the same exception type with a stage-specific message.
    """
    logger.debug("process_image: Processing image content")
    if execution_trace_log is None:
        execution_trace_log = {}

    if isinstance(task_config, BaseModel):
        task_config = task_config.model_dump()

    if df_extraction_ledger.empty:
        # With no rows, DataFrame.apply(axis=1) does not produce a Series of per-row results,
        # so the explode()/dropna() pipeline below cannot run. Return the empty result directly.
        empty_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
        return empty_df, {"trace_info": execution_trace_log}

    try:
        # Create a partial function to decode and extract image data for each row.
        _decode_and_extract = functools.partial(
            _decode_and_extract_from_image,
            task_config=task_config,
            validated_extraction_config=extraction_config,
            execution_trace_log=execution_trace_log,
        )
        logger.debug("process_image: Processing with method: %s", task_config.get("method", None))
        sr_extraction = df_extraction_ledger.apply(_decode_and_extract, axis=1)
        # One output row per extracted primitive; rows that produced nothing are dropped.
        sr_extraction = sr_extraction.explode().dropna()

        if not sr_extraction.empty:
            extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
        else:
            extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

        return extracted_df, {"trace_info": execution_trace_log}

    except Exception as e:
        err_msg = f"process_image: Unhandled exception in image extractor stage. Original error: {e}"
        logger.exception(err_msg)
        raise type(e)(err_msg) from e
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0