nv-ingest-api 2025.4.15.dev20250415__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest-api might be problematic.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +72 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.15.dev20250415.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
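The three `{primitives → internal/primitives}` entries in the list above are straight module moves, so downstream imports need a one-line path change. A hypothetical sketch of that adjustment; the module paths come from the rename entries, and any symbols imported from them are assumed to keep their names:

```python
# Hypothetical downstream adjustment for the primitives relocation in this
# release. Module paths come from the rename entries in the file list above.

# 2025.4.15.dev20250415:
# from nv_ingest_api.primitives import ingest_control_message

# 2025.4.17.dev20250417: the same modules now live under `internal`.
from nv_ingest_api.internal.primitives import ingest_control_message
from nv_ingest_api.internal.primitives import control_message_task
```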
nv_ingest_api/internal/extract/image/chart_extractor.py

```diff
@@ -0,0 +1,353 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Union
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import pandas as pd
+
+from nv_ingest_api.internal.primitives.nim.model_interface.helpers import get_version
+from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskChartExtraction
+from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_graphic_elements_and_paddle_output
+from nv_ingest_api.util.image_processing.table_and_chart import process_yolox_graphic_elements
+from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
+from nv_ingest_api.internal.primitives.nim import NimClient
+from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxGraphicElementsModelInterface
+from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
+from nv_ingest_api.util.nim import create_inference_client
+
+PADDLE_MIN_WIDTH = 32
+PADDLE_MIN_HEIGHT = 32
+
+logger = logging.getLogger(f"morpheus.{__name__}")
+
+
+def _filter_valid_chart_images(
+    base64_images: List[str],
+) -> Tuple[List[str], List[np.ndarray], List[int], List[Tuple[str, Optional[Dict]]]]:
+    """
+    Filter base64-encoded images based on minimum dimensions for chart extraction.
+
+    Returns:
+        - valid_images: Base64 strings meeting size requirements.
+        - valid_arrays: Corresponding numpy arrays.
+        - valid_indices: Original indices of valid images.
+        - results: Initial results list where invalid images are set to (img, None).
+    """
+    results: List[Tuple[str, Optional[Dict]]] = [("", None)] * len(base64_images)
+    valid_images: List[str] = []
+    valid_arrays: List[np.ndarray] = []
+    valid_indices: List[int] = []
+
+    for i, img in enumerate(base64_images):
+        array = base64_to_numpy(img)
+        height, width = array.shape[0], array.shape[1]
+        if width >= PADDLE_MIN_WIDTH and height >= PADDLE_MIN_HEIGHT:
+            valid_images.append(img)
+            valid_arrays.append(array)
+            valid_indices.append(i)
+        else:
+            # Image is too small; mark as skipped.
+            results[i] = (img, None)
+    return valid_images, valid_arrays, valid_indices, results
+
+
+def _run_chart_inference(
+    yolox_client: Any,
+    paddle_client: Any,
+    valid_arrays: List[np.ndarray],
+    valid_images: List[str],
+    trace_info: Dict,
+) -> Tuple[List[Any], List[Any]]:
+    """
+    Run concurrent inference for chart extraction using YOLOX and Paddle.
+
+    Returns a tuple of (yolox_results, paddle_results).
+    """
+    data_yolox = {"images": valid_arrays}
+    data_paddle = {"base64_images": valid_images}
+
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        future_yolox = executor.submit(
+            yolox_client.infer,
+            data=data_yolox,
+            model_name="yolox",
+            stage_name="chart_data_extraction",
+            max_batch_size=8,
+            trace_info=trace_info,
+        )
+        future_paddle = executor.submit(
+            paddle_client.infer,
+            data=data_paddle,
+            model_name="paddle",
+            stage_name="chart_data_extraction",
+            max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
+            trace_info=trace_info,
+        )
+
+        try:
+            yolox_results = future_yolox.result()
+        except Exception as e:
+            logger.error(f"Error calling yolox_client.infer: {e}", exc_info=True)
+            raise
+
+        try:
+            paddle_results = future_paddle.result()
+        except Exception as e:
+            logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
+            raise
+
+    return yolox_results, paddle_results
+
+
+def _validate_chart_inference_results(
+    yolox_results: Any, paddle_results: Any, valid_arrays: List[Any], valid_images: List[str]
+) -> Tuple[List[Any], List[Any]]:
+    """
+    Ensure inference results are lists and have expected lengths.
+
+    Raises:
+        ValueError if results do not match expected types or lengths.
+    """
+    if not (isinstance(yolox_results, list) and isinstance(paddle_results, list)):
+        raise ValueError("Expected list results from both yolox_client and paddle_client infer calls.")
+
+    if len(yolox_results) != len(valid_arrays):
+        raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
+    if len(paddle_results) != len(valid_images):
+        raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")
+    return yolox_results, paddle_results
+
+
+def _merge_chart_results(
+    base64_images: List[str],
+    valid_indices: List[int],
+    yolox_results: List[Any],
+    paddle_results: List[Any],
+    initial_results: List[Tuple[str, Optional[Dict]]],
+) -> List[Tuple[str, Optional[Dict]]]:
+    """
+    Merge inference results into the initial results list using the original indices.
+
+    For each valid image, processes the results from both inference calls and updates the
+    corresponding entry in the results list.
+    """
+    for idx, (yolox_res, paddle_res) in enumerate(zip(yolox_results, paddle_results)):
+        # Unpack paddle result into bounding boxes and text predictions.
+        bounding_boxes, text_predictions = paddle_res
+        yolox_elements = join_yolox_graphic_elements_and_paddle_output(yolox_res, bounding_boxes, text_predictions)
+        chart_content = process_yolox_graphic_elements(yolox_elements)
+        original_index = valid_indices[idx]
+        initial_results[original_index] = (base64_images[original_index], chart_content)
+    return initial_results
+
+
+def _update_chart_metadata(
+    base64_images: List[str],
+    yolox_client: Any,
+    paddle_client: Any,
+    trace_info: Dict,
+    worker_pool_size: int = 8,  # Not currently used.
+) -> List[Tuple[str, Optional[Dict]]]:
+    """
+    Given a list of base64-encoded chart images, concurrently call both YOLOX and Paddle
+    inference services to extract chart data.
+
+    For each base64-encoded image, returns:
+        (original_image_str, joined_chart_content_dict)
+
+    Images that do not meet minimum size requirements are marked as skipped.
+    """
+    logger.debug("Running chart extraction using updated concurrency handling.")
+
+    # Initialize results with placeholders and filter valid images.
+    valid_images, valid_arrays, valid_indices, results = _filter_valid_chart_images(base64_images)
+
+    # Run concurrent inference only for valid images.
+    yolox_results, paddle_results = _run_chart_inference(
+        yolox_client=yolox_client,
+        paddle_client=paddle_client,
+        valid_arrays=valid_arrays,
+        valid_images=valid_images,
+        trace_info=trace_info,
+    )
+
+    # Validate that the returned inference results are lists of the expected length.
+    yolox_results, paddle_results = _validate_chart_inference_results(
+        yolox_results, paddle_results, valid_arrays, valid_images
+    )
+
+    # Merge the inference results into the results list.
+    return _merge_chart_results(base64_images, valid_indices, yolox_results, paddle_results, results)
+
+
+def _create_clients(
+    yolox_endpoints: Tuple[str, str],
+    yolox_protocol: str,
+    paddle_endpoints: Tuple[str, str],
+    paddle_protocol: str,
+    auth_token: str,
+) -> Tuple[NimClient, NimClient]:
+    # Obtain yolox_version
+    # Assuming that the grpc endpoint is at index 0
+    yolox_http_endpoint = yolox_endpoints[1]
+
+    try:
+        yolox_version = get_version(yolox_http_endpoint)
+        if not yolox_version:
+            logger.warning(
+                "Failed to obtain yolox-page-elements version from the endpoint. Falling back to the latest version."
+            )
+            yolox_version = None  # Default to the latest version
+    except Exception:
+        logger.warning(
+            "Failed to get yolox-page-elements version after 30 seconds. Falling back to the latest version."
+        )
+        yolox_version = None  # Default to the latest version
+
+    yolox_model_interface = YoloxGraphicElementsModelInterface(yolox_version=yolox_version)
+    paddle_model_interface = PaddleOCRModelInterface()
+
+    logger.debug(f"Inference protocols: yolox={yolox_protocol}, paddle={paddle_protocol}")
+
+    yolox_client = create_inference_client(
+        endpoints=yolox_endpoints,
+        model_interface=yolox_model_interface,
+        auth_token=auth_token,
+        infer_protocol=yolox_protocol,
+    )
+
+    paddle_client = create_inference_client(
+        endpoints=paddle_endpoints,
+        model_interface=paddle_model_interface,
+        auth_token=auth_token,
+        infer_protocol=paddle_protocol,
+    )
+
+    return yolox_client, paddle_client
+
+
+def extract_chart_data_from_image_internal(
+    df_extraction_ledger: pd.DataFrame,
+    task_config: Union[IngestTaskChartExtraction, Dict[str, Any]],
+    extraction_config: ChartExtractorSchema,
+    execution_trace_log: Optional[Dict] = None,
+) -> Tuple[pd.DataFrame, Dict]:
+    """
+    Extracts chart data from a DataFrame in a bulk fashion rather than row-by-row.
+
+    Parameters
+    ----------
+    df_extraction_ledger : pd.DataFrame
+        DataFrame containing the content from which chart data is to be extracted.
+    task_config : Dict[str, Any]
+        Dictionary containing task properties and configurations.
+    extraction_config : Any
+        The validated configuration object for chart extraction.
+    execution_trace_log : Optional[Dict], optional
+        Optional trace information for debugging or logging. Defaults to None.
+
+    Returns
+    -------
+    Tuple[pd.DataFrame, Dict]
+        A tuple containing the updated DataFrame and the trace information.
+
+    Raises
+    ------
+    Exception
+        If any error occurs during the chart data extraction process.
+    """
+    _ = task_config  # Unused variable
+
+    if execution_trace_log is None:
+        execution_trace_log = {}
+        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
+
+    if df_extraction_ledger.empty:
+        return df_extraction_ledger, execution_trace_log
+
+    endpoint_config = extraction_config.endpoint_config
+    yolox_client, paddle_client = _create_clients(
+        endpoint_config.yolox_endpoints,
+        endpoint_config.yolox_infer_protocol,
+        endpoint_config.paddle_endpoints,
+        endpoint_config.paddle_infer_protocol,
+        endpoint_config.auth_token,
+    )
+
+    try:
+        # 1) Identify rows that meet criteria in a single pass
+        #    - metadata exists
+        #    - content_metadata.type == "structured"
+        #    - content_metadata.subtype == "chart"
+        #    - table_metadata not None
+        #    - base64_image not None or ""
+        def meets_criteria(row):
+            m = row.get("metadata", {})
+            if not m:
+                return False
+
+            content_md = m.get("content_metadata", {})
+            if (
+                content_md.get("type") == "structured"
+                and content_md.get("subtype") == "chart"
+                and m.get("table_metadata") is not None
+                and m.get("content") not in [None, ""]
+            ):
+                return True
+
+            return False
+
+        mask = df_extraction_ledger.apply(meets_criteria, axis=1)
+        valid_indices = df_extraction_ledger[mask].index.tolist()
+
+        # If no rows meet the criteria, just return.
+        if not valid_indices:
+            return df_extraction_ledger, {"trace_info": execution_trace_log}
+
+        # 2) Extract base64 images + keep track of row -> image mapping.
+        base64_images = []
+        for idx in valid_indices:
+            meta = df_extraction_ledger.at[idx, "metadata"]
+            base64_images.append(meta["content"])  # guaranteed by meets_criteria
+
+        # 3) Call our bulk _update_metadata to get all results.
+        bulk_results = _update_chart_metadata(
+            base64_images=base64_images,
+            yolox_client=yolox_client,
+            paddle_client=paddle_client,
+            worker_pool_size=endpoint_config.workers_per_progress_engine,
+            trace_info=execution_trace_log,
+        )
+
+        # 4) Write the results back to each row’s table_metadata
+        #    The order of base64_images in bulk_results should match their original
+        #    indices if we process them in the same order.
+        for row_id, idx in enumerate(valid_indices):
+            _, chart_content = bulk_results[row_id]
+            df_extraction_ledger.at[idx, "metadata"]["table_metadata"]["table_content"] = chart_content
+
+        return df_extraction_ledger, {"trace_info": execution_trace_log}
+
+    except Exception:
+        logger.error("Error occurred while extracting chart data.", exc_info=True)

+        raise
+
+    finally:
+        try:
+            if paddle_client is not None:
+                paddle_client.close()
+            if yolox_client is not None:
+                yolox_client.close()
+
+        except Exception as close_err:
+            logger.error(f"Error closing clients: {close_err}", exc_info=True)
```
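For orientation, here is a minimal, hypothetical driver for the bulk chart extractor above. The ledger shape mirrors what `meets_criteria` checks; the endpoint URLs, the `ChartExtractorSchema` constructor call, and the nested `endpoint_config` values are assumptions inferred from the attribute accesses in the hunk, not a documented API.

```python
# Minimal sketch, assuming ChartExtractorSchema is a pydantic model whose
# endpoint_config fields match the attribute accesses in the hunk above.
import pandas as pd

from nv_ingest_api.internal.extract.image.chart_extractor import (
    extract_chart_data_from_image_internal,
)
from nv_ingest_api.internal.schemas.extract.extract_chart_schema import (
    ChartExtractorSchema,
)

ledger = pd.DataFrame(
    [
        {
            "metadata": {
                "content": "<base64-encoded chart image>",
                "content_metadata": {"type": "structured", "subtype": "chart"},
                "table_metadata": {},  # table_content is written back into this dict
            }
        }
    ]
)

extraction_config = ChartExtractorSchema(
    endpoint_config={
        # (grpc_endpoint, http_endpoint) pairs; _create_clients() reads index 1
        # of yolox_endpoints to query the model version over HTTP.
        "yolox_endpoints": ("localhost:8001", "http://localhost:8000/v1"),
        "yolox_infer_protocol": "grpc",
        "paddle_endpoints": ("localhost:8011", "http://localhost:8010/v1"),
        "paddle_infer_protocol": "grpc",
        "auth_token": "",
        "workers_per_progress_engine": 8,
    }
)

df_out, trace = extract_chart_data_from_image_internal(
    df_extraction_ledger=ledger,
    task_config={},  # accepted but unused by the function body above
    extraction_config=extraction_config,
)
```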
nv_ingest_api/internal/extract/image/image_extractor.py

```diff
@@ -0,0 +1,204 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import base64
+import functools
+import io
+import logging
+from typing import Any, Union, Tuple
+from typing import Dict
+from typing import List
+from typing import Optional
+
+import pandas as pd
+from pydantic import BaseModel
+
+from nv_ingest_api.internal.extract.image.image_helpers.common import unstructured_image_extractor
+from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExtractorSchema
+from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+
+logger = logging.getLogger(__name__)
+
+
+@unified_exception_handler
+def _decode_and_extract_from_image(
+    base64_row: pd.Series,
+    task_config: Dict[str, Any],
+    validated_extraction_config: ImageExtractorSchema,
+    execution_trace_log: Optional[List[Any]] = None,
+) -> Any:
+    """
+    Decode base64-encoded image content from a DataFrame row and extract data using a specified extraction method.
+
+    This function extracts the "content" (base64 string) from the row, prepares additional task parameters by
+    inserting the remaining row data under "row_data", and decodes the base64 content into a BytesIO stream.
+    It then determines which extraction method to use (defaulting to "image" if the specified method is not found)
+    and calls the corresponding function from the image_helpers module.
+
+    Parameters
+    ----------
+    base64_row : pd.Series
+        A pandas Series representing a row containing base64-encoded content under the key "content"
+        and optionally a "source_id" and "document_type".
+    task_config : Dict[str, Any]
+        A dictionary containing task properties. It should include:
+            - "method" (str): The extraction method to use (e.g., "image").
+            - "params" (dict): Additional parameters to pass to the extraction function.
+    validated_extraction_config : Any
+        A configuration object that contains an attribute `image_extraction_config` to be used when
+        extracting image content.
+    default : str, optional
+        The default extraction method to use if the specified method is not available (default is "image").
+    execution_trace_log : Optional[List[Any]], optional
+        An optional list of trace information to pass to the extraction function (default is None).
+
+    Returns
+    -------
+    Any
+        The extracted data from the decoded image content. The exact return type depends on the extraction method used.
+
+    Raises
+    ------
+    KeyError
+        If the "content" key is missing from `base64_row`.
+    Exception
+        For any other unhandled exceptions during extraction.
+    """
+
+    # Retrieve document type and initialize source_id.
+    document_type: Any = base64_row["document_type"]
+    source_id: Optional[Any] = None
+
+    try:
+        base64_content: str = base64_row["content"]
+    except KeyError as e:
+        err_msg = f"decode_and_extract: Missing 'content' key in row: {base64_row}"
+        logger.error(err_msg, exc_info=True)
+        raise KeyError(err_msg) from e
+
+    try:
+        # Prepare additional row data (exclude "content") and inject into task parameters.
+        row_data = base64_row.drop(labels=["content"], errors="ignore")
+        task_config.setdefault("params", {})["row_data"] = row_data
+
+        # Retrieve source_id if available.
+        source_id = base64_row.get("source_id", None)
+
+        # Decode the base64 image content.
+        image_bytes: bytes = base64.b64decode(base64_content)
+        image_stream: io.BytesIO = io.BytesIO(image_bytes)
+
+        # Determine the extraction method and parameters.
+        # extract_method: str = task_config.get("method", "image")
+        extract_params: Dict[str, Any] = task_config.get("params", {})
+        extract_params["document_type"] = document_type
+
+        try:
+            extract_text: bool = extract_params.pop("extract_text", False)
+            extract_images: bool = extract_params.pop("extract_images", False)
+            extract_tables: bool = extract_params.pop("extract_tables", False)
+            extract_charts: bool = extract_params.pop("extract_charts", False)
+            extract_infographics: bool = extract_params.pop("extract_infographics", False)
+        except KeyError as e:
+            raise ValueError(f"Missing required extraction flag: {e}")
+
+        logger.debug(
+            f"decode_and_extract: Extracting image content using image_extraction_config: "
+            f"{validated_extraction_config.image_extraction_config}"
+        )
+        if validated_extraction_config.image_extraction_config is not None:
+            extract_params["image_extraction_config"] = validated_extraction_config.image_extraction_config
+
+        if execution_trace_log is not None:
+            extract_params["trace_info"] = execution_trace_log
+
+        # func = getattr(image_helpers, extract_method, default)
+        extracted_data: Any = unstructured_image_extractor(
+            image_stream=image_stream,
+            extract_text=extract_text,
+            extract_images=extract_images,
+            extract_infographics=extract_infographics,
+            extract_tables=extract_tables,
+            extract_charts=extract_charts,
+            extraction_config=extract_params,
+            extraction_trace_log=execution_trace_log,
+        )
+
+        return extracted_data
+
+    except Exception as e:
+        err_msg = f"decode_and_extract: Unhandled exception for source '{source_id}'. Original error: {e}"
+        logger.error(err_msg, exc_info=True)
+        raise type(e)(err_msg) from e
+
+
+@unified_exception_handler
+def extract_primitives_from_image_internal(
+    df_extraction_ledger: pd.DataFrame,
+    task_config: Union[Dict[str, Any], BaseModel],
+    extraction_config: Any,
+    execution_trace_log: Optional[Dict[str, Any]] = None,
+) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+    """
+    Process a DataFrame containing base64-encoded image files and extract primitives from each image.
+
+    This function applies the `decode_and_extract_from_image` routine to every row of the input DataFrame.
+    It then explodes any list results into separate rows, drops missing values, and compiles the extracted data
+    into a new DataFrame with columns "document_type", "metadata", and "uuid". In addition, trace information is
+    collected if provided.
+
+    Parameters
+    ----------
+    df_extraction_ledger : pd.DataFrame
+        Input DataFrame containing image files in base64 encoding. Expected to include columns 'source_id'
+        and 'content'.
+    task_config : Union[Dict[str, Any], BaseModel]
+        A dictionary or Pydantic model with instructions and parameters for the image processing task.
+    extraction_config : Any
+        A configuration object validated for processing images (e.g., containing `image_extraction_config`).
+    execution_trace_log : Optional[Dict[str, Any]], default=None
+        An optional dictionary for tracing and logging additional information during processing.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with the extracted image primitives. Expected columns include "document_type", "metadata",
+        and "uuid". Also returns a dictionary containing trace information under the key "trace_info".
+
+    Raises
+    ------
+    Exception
+        If an error occurs during the image processing stage, the exception is logged and re-raised.
+    """
+    logger.debug("process_image: Processing image content")
+    if execution_trace_log is None:
+        execution_trace_log = {}
+
+    if isinstance(task_config, BaseModel):
+        task_config = task_config.model_dump()
+
+    try:
+        # Create a partial function to decode and extract image data for each row.
+        _decode_and_extract = functools.partial(
+            _decode_and_extract_from_image,
+            task_config=task_config,
+            validated_extraction_config=extraction_config,
+            execution_trace_log=execution_trace_log,
+        )
+        logger.debug("process_image: Processing with method: %s", task_config.get("method", None))
+        sr_extraction = df_extraction_ledger.apply(_decode_and_extract, axis=1)
+        sr_extraction = sr_extraction.explode().dropna()
+
+        if not sr_extraction.empty:
+            extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
+        else:
+            extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
+
+        return extracted_df, {"trace_info": execution_trace_log}
+
+    except Exception as e:
+        err_msg = f"process_image: Unhandled exception in image extractor stage. Original error: {e}"
+        logger.exception(err_msg)
+        raise type(e)(err_msg) from e
```
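And a matching hypothetical driver for the row-wise image extractor above. Only the column names, flag names, and return shape come from the hunk; the "png" document-type value and the default-constructed `ImageExtractorSchema` are assumptions.

```python
# Minimal sketch, assuming ImageExtractorSchema can be default-constructed and
# that "png" is an accepted document_type value.
import base64

import pandas as pd

from nv_ingest_api.internal.extract.image.image_extractor import (
    extract_primitives_from_image_internal,
)
from nv_ingest_api.internal.schemas.extract.extract_image_schema import (
    ImageExtractorSchema,
)

with open("page.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

ledger = pd.DataFrame(
    [{"source_id": "doc-0", "document_type": "png", "content": encoded}]
)

# The extraction flags are popped out of task_config["params"] per row.
task_config = {
    "method": "image",
    "params": {"extract_text": True, "extract_tables": True, "extract_charts": True},
}

extracted_df, trace = extract_primitives_from_image_internal(
    df_extraction_ledger=ledger,
    task_config=task_config,
    extraction_config=ImageExtractorSchema(),
)
print(extracted_df.columns.tolist())  # ["document_type", "metadata", "uuid"]
```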