nv-ingest-api 2025.4.15.dev20250415__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of nv-ingest-api has been flagged as a potentially problematic release.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +72 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.15.dev20250415.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
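The file list shows the wheel's new layout: a public `nv_ingest_api/interface/` layer, an `nv_ingest_api/internal/` implementation tree (the former top-level `primitives` package now lives under `internal/primitives`), and shared helpers under `nv_ingest_api/util/`. As a minimal sketch, assuming the wheel is installed, imports against that layout look like the following; module paths are taken from the file list above and the imported names from the table_extractor.py diff shown below:

```python
# Module paths come from the file list above; the imported names appear in the
# table_extractor.py diff shown below. Assumes the 2025.4.17 dev wheel is installed.
from nv_ingest_api.internal.enums.common import TableFormatEnum
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
from nv_ingest_api.internal.extract.image.table_extractor import extract_table_data_from_image_internal
```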
nv_ingest_api/internal/extract/image/table_extractor.py (new file)

@@ -0,0 +1,344 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Union
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import pandas as pd
+
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskTableExtraction
+from nv_ingest_api.internal.enums.common import TableFormatEnum
+from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
+from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
+from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_paddle_output
+from nv_ingest_api.util.image_processing.table_and_chart import convert_paddle_response_to_psuedo_markdown
+from nv_ingest_api.internal.primitives.nim import NimClient
+from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxTableStructureModelInterface
+from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
+from nv_ingest_api.util.nim import create_inference_client
+
+logger = logging.getLogger(__name__)
+
+PADDLE_MIN_WIDTH = 32
+PADDLE_MIN_HEIGHT = 32
+
+
+def _filter_valid_images(base64_images: List[str]) -> Tuple[List[str], List[np.ndarray], List[int]]:
+    """
+    Filter base64-encoded images by their dimensions.
+
+    Returns three lists:
+      - valid_images: The base64 strings that meet minimum size requirements.
+      - valid_arrays: The corresponding numpy arrays.
+      - valid_indices: The original indices in the input list.
+    """
+    valid_images: List[str] = []
+    valid_arrays: List[np.ndarray] = []
+    valid_indices: List[int] = []
+
+    for i, img in enumerate(base64_images):
+        array = base64_to_numpy(img)
+        height, width = array.shape[0], array.shape[1]
+        if width >= PADDLE_MIN_WIDTH and height >= PADDLE_MIN_HEIGHT:
+            valid_images.append(img)
+            valid_arrays.append(array)
+            valid_indices.append(i)
+        else:
+            # Image is too small; skip it.
+            continue
+
+    return valid_images, valid_arrays, valid_indices
+
+
+def _run_inference(
+    enable_yolox: bool,
+    yolox_client: Any,
+    paddle_client: Any,
+    valid_arrays: List[np.ndarray],
+    valid_images: List[str],
+    trace_info: Optional[Dict] = None,
+) -> Tuple[List[Any], List[Any]]:
+    """
+    Run inference concurrently for YOLOX (if enabled) and Paddle.
+
+    Returns a tuple of (yolox_results, paddle_results).
+    """
+    data_paddle = {"base64_images": valid_images}
+    if enable_yolox:
+        data_yolox = {"images": valid_arrays}
+
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        future_yolox = None
+        if enable_yolox:
+            future_yolox = executor.submit(
+                yolox_client.infer,
+                data=data_yolox,
+                model_name="yolox",
+                stage_name="table_data_extraction",
+                max_batch_size=8,
+                trace_info=trace_info,
+            )
+        future_paddle = executor.submit(
+            paddle_client.infer,
+            data=data_paddle,
+            model_name="paddle",
+            stage_name="table_data_extraction",
+            max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
+            trace_info=trace_info,
+        )
+
+        if enable_yolox:
+            try:
+                yolox_results = future_yolox.result()
+            except Exception as e:
+                logger.error(f"Error calling yolox_client.infer: {e}", exc_info=True)
+                raise
+        else:
+            yolox_results = [None] * len(valid_images)
+
+        try:
+            paddle_results = future_paddle.result()
+        except Exception as e:
+            logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
+            raise
+
+    return yolox_results, paddle_results
+
+
+def _validate_inference_results(
+    yolox_results: Any,
+    paddle_results: Any,
+    valid_arrays: List[Any],
+    valid_images: List[str],
+) -> Tuple[List[Any], List[Any]]:
+    """
+    Validate that both inference results are lists and have the expected lengths.
+
+    If not, default values are assigned. Raises a ValueError if the lengths do not match.
+    """
+    if not isinstance(yolox_results, list) or not isinstance(paddle_results, list):
+        logger.warning(
+            "Unexpected result types from inference clients: yolox_results=%s, paddle_results=%s. "
+            "Proceeding with available results.",
+            type(yolox_results).__name__,
+            type(paddle_results).__name__,
+        )
+        if not isinstance(yolox_results, list):
+            yolox_results = [None] * len(valid_arrays)
+        if not isinstance(paddle_results, list):
+            paddle_results = [(None, None)] * len(valid_images)
+
+    if len(yolox_results) != len(valid_arrays):
+        raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
+    if len(paddle_results) != len(valid_images):
+        raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")
+
+    return yolox_results, paddle_results
+
+
+def _update_table_metadata(
+    base64_images: List[str],
+    yolox_client: Any,
+    paddle_client: Any,
+    worker_pool_size: int = 8,  # Not currently used
+    enable_yolox: bool = False,
+    trace_info: Optional[Dict] = None,
+) -> List[Tuple[str, Any, Any, Any]]:
+    """
+    Given a list of base64-encoded images, this function filters out images that do not meet
+    the minimum size requirements and then calls the PaddleOCR model via paddle_client.infer
+    to extract table data.
+
+    For each base64-encoded image, the result is a tuple:
+        (base64_image, yolox_result, paddle_text_predictions, paddle_bounding_boxes)
+
+    Images that do not meet the minimum size are skipped (resulting in placeholders).
+    The paddle_client is expected to handle any necessary batching and concurrency.
+    """
+    logger.debug(f"Running table extraction using protocol {paddle_client.protocol}")
+
+    # Initialize the results list with default placeholders.
+    results: List[Tuple[str, Any, Any, Any]] = [("", None, None, None)] * len(base64_images)
+
+    # Filter valid images based on size requirements.
+    valid_images, valid_arrays, valid_indices = _filter_valid_images(base64_images)
+
+    if not valid_images:
+        return results
+
+    # Run inference concurrently.
+    yolox_results, paddle_results = _run_inference(
+        enable_yolox=enable_yolox,
+        yolox_client=yolox_client,
+        paddle_client=paddle_client,
+        valid_arrays=valid_arrays,
+        valid_images=valid_images,
+        trace_info=trace_info,
+    )
+
+    # Validate that the inference results have the expected structure.
+    yolox_results, paddle_results = _validate_inference_results(
+        yolox_results, paddle_results, valid_arrays, valid_images
+    )
+
+    # Combine results with the original order.
+    for idx, (yolox_res, paddle_res) in enumerate(zip(yolox_results, paddle_results)):
+        original_index = valid_indices[idx]
+        results[original_index] = (base64_images[original_index], yolox_res, paddle_res[0], paddle_res[1])
+
+    return results
+
+
+def _create_clients(
+    yolox_endpoints: Tuple[str, str],
+    yolox_protocol: str,
+    paddle_endpoints: Tuple[str, str],
+    paddle_protocol: str,
+    auth_token: str,
+) -> Tuple[NimClient, NimClient]:
+    yolox_model_interface = YoloxTableStructureModelInterface()
+    paddle_model_interface = PaddleOCRModelInterface()
+
+    logger.debug(f"Inference protocols: yolox={yolox_protocol}, paddle={paddle_protocol}")
+
+    yolox_client = create_inference_client(
+        endpoints=yolox_endpoints,
+        model_interface=yolox_model_interface,
+        auth_token=auth_token,
+        infer_protocol=yolox_protocol,
+    )
+
+    paddle_client = create_inference_client(
+        endpoints=paddle_endpoints,
+        model_interface=paddle_model_interface,
+        auth_token=auth_token,
+        infer_protocol=paddle_protocol,
+    )
+
+    return yolox_client, paddle_client
+
+
+def extract_table_data_from_image_internal(
+    df_extraction_ledger: pd.DataFrame,
+    task_config: Union[IngestTaskTableExtraction, Dict[str, Any]],
+    extraction_config: TableExtractorSchema,
+    execution_trace_log: Optional[Dict] = None,
+) -> Tuple[pd.DataFrame, Dict]:
+    """
+    Extracts table data from a DataFrame in a bulk fashion rather than row-by-row,
+    following the chart extraction pattern.
+
+    Parameters
+    ----------
+    df_extraction_ledger : pd.DataFrame
+        DataFrame containing the content from which table data is to be extracted.
+    task_config : Dict[str, Any]
+        Dictionary containing task properties and configurations.
+    extraction_config : Any
+        The validated configuration object for table extraction.
+    execution_trace_log : Optional[Dict], optional
+        Optional trace information for debugging or logging. Defaults to None.
+
+    Returns
+    -------
+    Tuple[pd.DataFrame, Dict]
+        A tuple containing the updated DataFrame and the trace information.
+    """
+
+    _ = task_config  # unused
+
+    if execution_trace_log is None:
+        execution_trace_log = {}
+        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
+
+    if df_extraction_ledger.empty:
+        return df_extraction_ledger, execution_trace_log
+
+    endpoint_config = extraction_config.endpoint_config
+    yolox_client, paddle_client = _create_clients(
+        endpoint_config.yolox_endpoints,
+        endpoint_config.yolox_infer_protocol,
+        endpoint_config.paddle_endpoints,
+        endpoint_config.paddle_infer_protocol,
+        endpoint_config.auth_token,
+    )
+
+    try:
+        # 1) Identify rows that meet criteria (structured, subtype=table, table_metadata != None, content not empty)
+        def meets_criteria(row):
+            m = row.get("metadata", {})
+            if not m:
+                return False
+            content_md = m.get("content_metadata", {})
+            if (
+                content_md.get("type") == "structured"
+                and content_md.get("subtype") == "table"
+                and m.get("table_metadata") is not None
+                and m.get("content") not in [None, ""]
+            ):
+                return True
+            return False
+
+        mask = df_extraction_ledger.apply(meets_criteria, axis=1)
+        valid_indices = df_extraction_ledger[mask].index.tolist()
+
+        # If no rows meet the criteria, just return
+        if not valid_indices:
+            return df_extraction_ledger, {"trace_info": execution_trace_log}
+
+        # 2) Extract base64 images in the same order
+        base64_images = []
+        for idx in valid_indices:
+            meta = df_extraction_ledger.at[idx, "metadata"]
+            base64_images.append(meta["content"])
+
+        # 3) Call our bulk _update_metadata to get all results
+        table_content_format = (
+            df_extraction_ledger.at[valid_indices[0], "metadata"]["table_metadata"].get("table_content_format")
+            or TableFormatEnum.PSEUDO_MARKDOWN
+        )
+        enable_yolox = True if table_content_format in (TableFormatEnum.MARKDOWN,) else False
+
+        bulk_results = _update_table_metadata(
+            base64_images=base64_images,
+            yolox_client=yolox_client,
+            paddle_client=paddle_client,
+            worker_pool_size=endpoint_config.workers_per_progress_engine,
+            enable_yolox=enable_yolox,
+            trace_info=execution_trace_log,
+        )
+
+        # 4) Write the results (bounding_boxes, text_predictions) back
+        for row_id, idx in enumerate(valid_indices):
+            # unpack (base64_image, (yolox_predictions, paddle_bounding boxes, paddle_text_predictions))
+            _, cell_predictions, bounding_boxes, text_predictions = bulk_results[row_id]
+
+            if table_content_format == TableFormatEnum.SIMPLE:
+                table_content = " ".join(text_predictions)
+            elif table_content_format == TableFormatEnum.PSEUDO_MARKDOWN:
+                table_content = convert_paddle_response_to_psuedo_markdown(bounding_boxes, text_predictions)
+            elif table_content_format == TableFormatEnum.MARKDOWN:
+                table_content = join_yolox_table_structure_and_paddle_output(
+                    cell_predictions, bounding_boxes, text_predictions
+                )
+            else:
+                raise ValueError(f"Unexpected table format: {table_content_format}")
+
+            df_extraction_ledger.at[idx, "metadata"]["table_metadata"]["table_content"] = table_content
+            df_extraction_ledger.at[idx, "metadata"]["table_metadata"]["table_content_format"] = table_content_format
+
+        return df_extraction_ledger, {"trace_info": execution_trace_log}
+
+    except Exception:
+        logger.exception("Error occurred while extracting table data.", exc_info=True)
+        raise
+    finally:
+        yolox_client.close()
+        paddle_client.close()
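The entry point above operates on an "extraction ledger" DataFrame: only rows whose `metadata` marks them as structured table content with a non-empty base64 payload are sent to the YOLOX/PaddleOCR clients. Below is a minimal, pandas-only sketch of that row shape and of the selection rule (mirroring `meets_criteria` in the diff); the base64 string is a placeholder and no inference endpoints are contacted:

```python
# Standalone illustration of the ledger rows that
# extract_table_data_from_image_internal selects for processing.
# The content value is a placeholder, not a real base64-encoded image.
import pandas as pd

ledger = pd.DataFrame(
    [
        {
            "metadata": {
                "content": "<base64-encoded table image>",  # placeholder payload
                "content_metadata": {"type": "structured", "subtype": "table"},
                # An empty table_metadata dict is allowed: the extractor falls back to
                # TableFormatEnum.PSEUDO_MARKDOWN when table_content_format is unset.
                "table_metadata": {},
            }
        }
    ]
)


def meets_criteria(row):
    # Same selection rule as in the diff: structured/table rows with table_metadata
    # present and a non-empty content payload.
    m = row.get("metadata", {}) or {}
    content_md = m.get("content_metadata", {})
    return (
        content_md.get("type") == "structured"
        and content_md.get("subtype") == "table"
        and m.get("table_metadata") is not None
        and m.get("content") not in [None, ""]
    )


print(ledger.apply(meets_criteria, axis=1).tolist())  # -> [True]
```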
nv_ingest_api/internal/extract/pdf/engines/__init__.py (new file)

@@ -0,0 +1,19 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from .adobe import adobe_extractor
+from .llama import llama_parse_extractor
+from .nemoretriever import nemoretriever_parse_extractor
+from .pdfium import pdfium_extractor
+from .tika import tika_extractor
+from .unstructured_io import unstructured_io_extractor
+
+__all__ = [
+    "adobe_extractor",
+    "llama_parse_extractor",
+    "nemoretriever_parse_extractor",
+    "pdfium_extractor",
+    "tika_extractor",
+    "unstructured_io_extractor",
+]
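This `__init__.py` simply re-exports the individual PDF engine entry points so they can be imported from one place. A short sketch, assuming the wheel is installed (the import path is taken from the `__all__` list above; the extractors' call signatures are not shown in this diff):

```python
# Re-exported engine entry points; their signatures live in the individual
# engine modules and are not part of this diff.
from nv_ingest_api.internal.extract.pdf.engines import pdfium_extractor, tika_extractor
```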