nv-ingest-api 25.7.7.dev20250707__py3-none-any.whl → 25.8.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/interface/extract.py +18 -18
- nv_ingest_api/internal/enums/common.py +6 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
- nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
- nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
- nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +9 -8
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +32 -20
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +40 -29
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +1 -0
- nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +44 -236
- nv_ingest_api/internal/primitives/nim/nim_client.py +61 -18
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +5 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +1 -1
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
- nv_ingest_api/internal/transform/embed_text.py +105 -12
- nv_ingest_api/internal/transform/split_text.py +13 -8
- nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
- nv_ingest_api/util/image_processing/transforms.py +351 -87
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
- nv_ingest_api/util/metadata/aggregators.py +4 -1
- nv_ingest_api/util/pdf/pdfium.py +6 -14
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/METADATA +2 -1
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/RECORD +33 -33
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/WHEEL +0 -0
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -15,10 +15,11 @@ import pandas as pd
|
|
|
15
15
|
|
|
16
16
|
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskTableExtraction
|
|
17
17
|
from nv_ingest_api.internal.enums.common import TableFormatEnum
|
|
18
|
-
from nv_ingest_api.internal.primitives.nim.model_interface.
|
|
18
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.ocr import OCRModelInterface
|
|
19
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
|
|
19
20
|
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
|
|
20
|
-
from nv_ingest_api.util.image_processing.table_and_chart import
|
|
21
|
-
from nv_ingest_api.util.image_processing.table_and_chart import
|
|
21
|
+
from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_ocr_output
|
|
22
|
+
from nv_ingest_api.util.image_processing.table_and_chart import convert_ocr_response_to_psuedo_markdown
|
|
22
23
|
from nv_ingest_api.internal.primitives.nim import NimClient
|
|
23
24
|
from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxTableStructureModelInterface
|
|
24
25
|
from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
|
|
@@ -60,7 +61,8 @@ def _filter_valid_images(base64_images: List[str]) -> Tuple[List[str], List[np.n
|
|
|
60
61
|
def _run_inference(
|
|
61
62
|
enable_yolox: bool,
|
|
62
63
|
yolox_client: Any,
|
|
63
|
-
|
|
64
|
+
ocr_client: Any,
|
|
65
|
+
ocr_model_name: str,
|
|
64
66
|
valid_arrays: List[np.ndarray],
|
|
65
67
|
valid_images: List[str],
|
|
66
68
|
trace_info: Optional[Dict] = None,
|
|
@@ -68,32 +70,45 @@ def _run_inference(
|
|
|
68
70
|
"""
|
|
69
71
|
Run inference concurrently for YOLOX (if enabled) and Paddle.
|
|
70
72
|
|
|
71
|
-
Returns a tuple of (yolox_results,
|
|
73
|
+
Returns a tuple of (yolox_results, ocr_results).
|
|
72
74
|
"""
|
|
73
|
-
|
|
75
|
+
data_ocr = {"base64_images": valid_images}
|
|
74
76
|
if enable_yolox:
|
|
75
77
|
data_yolox = {"images": valid_arrays}
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
if enable_yolox:
|
|
80
|
-
future_yolox = executor.submit(
|
|
81
|
-
yolox_client.infer,
|
|
82
|
-
data=data_yolox,
|
|
83
|
-
model_name="yolox",
|
|
84
|
-
stage_name="table_extraction",
|
|
85
|
-
max_batch_size=8,
|
|
86
|
-
trace_info=trace_info,
|
|
87
|
-
)
|
|
88
|
-
future_paddle = executor.submit(
|
|
89
|
-
paddle_client.infer,
|
|
90
|
-
data=data_paddle,
|
|
91
|
-
model_name="paddle",
|
|
78
|
+
future_yolox_kwargs = dict(
|
|
79
|
+
data=data_yolox,
|
|
80
|
+
model_name="yolox_ensemble",
|
|
92
81
|
stage_name="table_extraction",
|
|
93
|
-
max_batch_size=
|
|
82
|
+
max_batch_size=8,
|
|
83
|
+
input_names=["INPUT_IMAGES", "THRESHOLDS"],
|
|
84
|
+
dtypes=["BYTES", "FP32"],
|
|
85
|
+
output_names=["OUTPUT"],
|
|
94
86
|
trace_info=trace_info,
|
|
95
87
|
)
|
|
96
88
|
|
|
89
|
+
future_ocr_kwargs = dict(
|
|
90
|
+
data=data_ocr,
|
|
91
|
+
stage_name="table_extraction",
|
|
92
|
+
max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
|
|
93
|
+
trace_info=trace_info,
|
|
94
|
+
)
|
|
95
|
+
if ocr_model_name == "paddle":
|
|
96
|
+
future_ocr_kwargs.update(
|
|
97
|
+
model_name="paddle",
|
|
98
|
+
)
|
|
99
|
+
else:
|
|
100
|
+
future_ocr_kwargs.update(
|
|
101
|
+
model_name="scene_text",
|
|
102
|
+
input_names=["input", "merge_levels"],
|
|
103
|
+
dtypes=["FP32", "BYTES"],
|
|
104
|
+
merge_level="word",
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
with ThreadPoolExecutor(max_workers=2) as executor:
|
|
108
|
+
future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)
|
|
109
|
+
future_yolox = None
|
|
110
|
+
if enable_yolox:
|
|
111
|
+
future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)
|
|
97
112
|
if enable_yolox:
|
|
98
113
|
try:
|
|
99
114
|
yolox_results = future_yolox.result()
|
|
@@ -104,17 +119,17 @@ def _run_inference(
|
|
|
104
119
|
yolox_results = [None] * len(valid_images)
|
|
105
120
|
|
|
106
121
|
try:
|
|
107
|
-
|
|
122
|
+
ocr_results = future_ocr.result()
|
|
108
123
|
except Exception as e:
|
|
109
|
-
logger.error(f"Error calling
|
|
124
|
+
logger.error(f"Error calling ocr_client.infer: {e}", exc_info=True)
|
|
110
125
|
raise
|
|
111
126
|
|
|
112
|
-
return yolox_results,
|
|
127
|
+
return yolox_results, ocr_results
|
|
113
128
|
|
|
114
129
|
|
|
115
130
|
def _validate_inference_results(
|
|
116
131
|
yolox_results: Any,
|
|
117
|
-
|
|
132
|
+
ocr_results: Any,
|
|
118
133
|
valid_arrays: List[Any],
|
|
119
134
|
valid_images: List[str],
|
|
120
135
|
) -> Tuple[List[Any], List[Any]]:
|
|
@@ -123,46 +138,47 @@ def _validate_inference_results(
|
|
|
123
138
|
|
|
124
139
|
If not, default values are assigned. Raises a ValueError if the lengths do not match.
|
|
125
140
|
"""
|
|
126
|
-
if not isinstance(yolox_results, list) or not isinstance(
|
|
141
|
+
if not isinstance(yolox_results, list) or not isinstance(ocr_results, list):
|
|
127
142
|
logger.warning(
|
|
128
|
-
"Unexpected result types from inference clients: yolox_results=%s,
|
|
143
|
+
"Unexpected result types from inference clients: yolox_results=%s, ocr_results=%s. "
|
|
129
144
|
"Proceeding with available results.",
|
|
130
145
|
type(yolox_results).__name__,
|
|
131
|
-
type(
|
|
146
|
+
type(ocr_results).__name__,
|
|
132
147
|
)
|
|
133
148
|
if not isinstance(yolox_results, list):
|
|
134
149
|
yolox_results = [None] * len(valid_arrays)
|
|
135
|
-
if not isinstance(
|
|
136
|
-
|
|
150
|
+
if not isinstance(ocr_results, list):
|
|
151
|
+
ocr_results = [(None, None)] * len(valid_images)
|
|
137
152
|
|
|
138
153
|
if len(yolox_results) != len(valid_arrays):
|
|
139
154
|
raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
|
|
140
|
-
if len(
|
|
141
|
-
raise ValueError(f"Expected {len(valid_images)}
|
|
155
|
+
if len(ocr_results) != len(valid_images):
|
|
156
|
+
raise ValueError(f"Expected {len(valid_images)} ocr results, got {len(ocr_results)}")
|
|
142
157
|
|
|
143
|
-
return yolox_results,
|
|
158
|
+
return yolox_results, ocr_results
|
|
144
159
|
|
|
145
160
|
|
|
146
161
|
def _update_table_metadata(
|
|
147
162
|
base64_images: List[str],
|
|
148
163
|
yolox_client: Any,
|
|
149
|
-
|
|
164
|
+
ocr_client: Any,
|
|
165
|
+
ocr_model_name: str,
|
|
150
166
|
worker_pool_size: int = 8, # Not currently used
|
|
151
167
|
enable_yolox: bool = False,
|
|
152
168
|
trace_info: Optional[Dict] = None,
|
|
153
169
|
) -> List[Tuple[str, Any, Any, Any]]:
|
|
154
170
|
"""
|
|
155
171
|
Given a list of base64-encoded images, this function filters out images that do not meet
|
|
156
|
-
the minimum size requirements and then calls the
|
|
172
|
+
the minimum size requirements and then calls the OCR model via ocr_client.infer
|
|
157
173
|
to extract table data.
|
|
158
174
|
|
|
159
175
|
For each base64-encoded image, the result is a tuple:
|
|
160
|
-
(base64_image, yolox_result,
|
|
176
|
+
(base64_image, yolox_result, ocr_text_predictions, ocr_bounding_boxes)
|
|
161
177
|
|
|
162
178
|
Images that do not meet the minimum size are skipped (resulting in placeholders).
|
|
163
|
-
The
|
|
179
|
+
The ocr_client is expected to handle any necessary batching and concurrency.
|
|
164
180
|
"""
|
|
165
|
-
logger.debug(f"Running table extraction using protocol {
|
|
181
|
+
logger.debug(f"Running table extraction using protocol {ocr_client.protocol}")
|
|
166
182
|
|
|
167
183
|
# Initialize the results list with default placeholders.
|
|
168
184
|
results: List[Tuple[str, Any, Any, Any]] = [("", None, None, None)] * len(base64_images)
|
|
@@ -174,24 +190,23 @@ def _update_table_metadata(
|
|
|
174
190
|
return results
|
|
175
191
|
|
|
176
192
|
# Run inference concurrently.
|
|
177
|
-
yolox_results,
|
|
193
|
+
yolox_results, ocr_results = _run_inference(
|
|
178
194
|
enable_yolox=enable_yolox,
|
|
179
195
|
yolox_client=yolox_client,
|
|
180
|
-
|
|
196
|
+
ocr_client=ocr_client,
|
|
197
|
+
ocr_model_name=ocr_model_name,
|
|
181
198
|
valid_arrays=valid_arrays,
|
|
182
199
|
valid_images=valid_images,
|
|
183
200
|
trace_info=trace_info,
|
|
184
201
|
)
|
|
185
202
|
|
|
186
203
|
# Validate that the inference results have the expected structure.
|
|
187
|
-
yolox_results,
|
|
188
|
-
yolox_results, paddle_results, valid_arrays, valid_images
|
|
189
|
-
)
|
|
204
|
+
yolox_results, ocr_results = _validate_inference_results(yolox_results, ocr_results, valid_arrays, valid_images)
|
|
190
205
|
|
|
191
206
|
# Combine results with the original order.
|
|
192
|
-
for idx, (yolox_res,
|
|
207
|
+
for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
|
|
193
208
|
original_index = valid_indices[idx]
|
|
194
|
-
results[original_index] = (base64_images[original_index], yolox_res,
|
|
209
|
+
results[original_index] = (base64_images[original_index], yolox_res, ocr_res[0], ocr_res[1])
|
|
195
210
|
|
|
196
211
|
return results
|
|
197
212
|
|
|
@@ -199,14 +214,14 @@ def _update_table_metadata(
|
|
|
199
214
|
def _create_clients(
|
|
200
215
|
yolox_endpoints: Tuple[str, str],
|
|
201
216
|
yolox_protocol: str,
|
|
202
|
-
|
|
203
|
-
|
|
217
|
+
ocr_endpoints: Tuple[str, str],
|
|
218
|
+
ocr_protocol: str,
|
|
204
219
|
auth_token: str,
|
|
205
220
|
) -> Tuple[NimClient, NimClient]:
|
|
206
221
|
yolox_model_interface = YoloxTableStructureModelInterface()
|
|
207
|
-
|
|
222
|
+
ocr_model_interface = OCRModelInterface()
|
|
208
223
|
|
|
209
|
-
logger.debug(f"Inference protocols: yolox={yolox_protocol},
|
|
224
|
+
logger.debug(f"Inference protocols: yolox={yolox_protocol}, ocr={ocr_protocol}")
|
|
210
225
|
|
|
211
226
|
yolox_client = create_inference_client(
|
|
212
227
|
endpoints=yolox_endpoints,
|
|
@@ -215,14 +230,14 @@ def _create_clients(
|
|
|
215
230
|
infer_protocol=yolox_protocol,
|
|
216
231
|
)
|
|
217
232
|
|
|
218
|
-
|
|
219
|
-
endpoints=
|
|
220
|
-
model_interface=
|
|
233
|
+
ocr_client = create_inference_client(
|
|
234
|
+
endpoints=ocr_endpoints,
|
|
235
|
+
model_interface=ocr_model_interface,
|
|
221
236
|
auth_token=auth_token,
|
|
222
|
-
infer_protocol=
|
|
237
|
+
infer_protocol=ocr_protocol,
|
|
223
238
|
)
|
|
224
239
|
|
|
225
|
-
return yolox_client,
|
|
240
|
+
return yolox_client, ocr_client
|
|
226
241
|
|
|
227
242
|
|
|
228
243
|
def extract_table_data_from_image_internal(
|
|
@@ -262,14 +277,18 @@ def extract_table_data_from_image_internal(
|
|
|
262
277
|
return df_extraction_ledger, execution_trace_log
|
|
263
278
|
|
|
264
279
|
endpoint_config = extraction_config.endpoint_config
|
|
265
|
-
yolox_client,
|
|
280
|
+
yolox_client, ocr_client = _create_clients(
|
|
266
281
|
endpoint_config.yolox_endpoints,
|
|
267
282
|
endpoint_config.yolox_infer_protocol,
|
|
268
|
-
endpoint_config.
|
|
269
|
-
endpoint_config.
|
|
283
|
+
endpoint_config.ocr_endpoints,
|
|
284
|
+
endpoint_config.ocr_infer_protocol,
|
|
270
285
|
endpoint_config.auth_token,
|
|
271
286
|
)
|
|
272
287
|
|
|
288
|
+
# Get the grpc endpoint to determine the model if needed
|
|
289
|
+
ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
|
|
290
|
+
ocr_model_name = get_ocr_model_name(ocr_grpc_endpoint)
|
|
291
|
+
|
|
273
292
|
try:
|
|
274
293
|
# 1) Identify rows that meet criteria (structured, subtype=table, table_metadata != None, content not empty)
|
|
275
294
|
def meets_criteria(row):
|
|
@@ -309,7 +328,8 @@ def extract_table_data_from_image_internal(
|
|
|
309
328
|
bulk_results = _update_table_metadata(
|
|
310
329
|
base64_images=base64_images,
|
|
311
330
|
yolox_client=yolox_client,
|
|
312
|
-
|
|
331
|
+
ocr_client=ocr_client,
|
|
332
|
+
ocr_model_name=ocr_model_name,
|
|
313
333
|
worker_pool_size=endpoint_config.workers_per_progress_engine,
|
|
314
334
|
enable_yolox=enable_yolox,
|
|
315
335
|
trace_info=execution_trace_log,
|
|
@@ -317,15 +337,15 @@ def extract_table_data_from_image_internal(
|
|
|
317
337
|
|
|
318
338
|
# 4) Write the results (bounding_boxes, text_predictions) back
|
|
319
339
|
for row_id, idx in enumerate(valid_indices):
|
|
320
|
-
# unpack (base64_image, (yolox_predictions,
|
|
340
|
+
# unpack (base64_image, (yolox_predictions, ocr_bounding boxes, ocr_text_predictions))
|
|
321
341
|
_, cell_predictions, bounding_boxes, text_predictions = bulk_results[row_id]
|
|
322
342
|
|
|
323
343
|
if table_content_format == TableFormatEnum.SIMPLE:
|
|
324
344
|
table_content = " ".join(text_predictions)
|
|
325
345
|
elif table_content_format == TableFormatEnum.PSEUDO_MARKDOWN:
|
|
326
|
-
table_content =
|
|
346
|
+
table_content = convert_ocr_response_to_psuedo_markdown(bounding_boxes, text_predictions)
|
|
327
347
|
elif table_content_format == TableFormatEnum.MARKDOWN:
|
|
328
|
-
table_content =
|
|
348
|
+
table_content = join_yolox_table_structure_and_ocr_output(
|
|
329
349
|
cell_predictions, bounding_boxes, text_predictions
|
|
330
350
|
)
|
|
331
351
|
else:
|
|
@@ -341,4 +361,4 @@ def extract_table_data_from_image_internal(
|
|
|
341
361
|
raise
|
|
342
362
|
finally:
|
|
343
363
|
yolox_client.close()
|
|
344
|
-
|
|
364
|
+
ocr_client.close()
|
|
@@ -40,6 +40,7 @@ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadat
|
|
|
40
40
|
from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
|
|
41
41
|
YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
|
|
42
42
|
YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
|
|
43
|
+
YOLOX_PAGE_IMAGE_FORMAT,
|
|
43
44
|
)
|
|
44
45
|
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import NemoRetrieverParseConfigSchema
|
|
45
46
|
from nv_ingest_api.util.metadata.aggregators import (
|
|
@@ -100,7 +101,7 @@ def nemoretriever_parse_extractor(
|
|
|
100
101
|
- text_depth : str, optional (default is "page")
|
|
101
102
|
- extract_tables_method : str, optional (default is "yolox")
|
|
102
103
|
- identify_nearby_objects : bool, optional (default is True)
|
|
103
|
-
-
|
|
104
|
+
- table_output_format : str, optional (default is "pseudo_markdown")
|
|
104
105
|
- pdfium_config : dict, optional (configuration for PDFium)
|
|
105
106
|
- nemoretriever_parse_config : dict, optional (configuration for NemoRetrieverParse)
|
|
106
107
|
- metadata_column : str, optional (default is "metadata")
|
|
@@ -145,14 +146,14 @@ def nemoretriever_parse_extractor(
|
|
|
145
146
|
# Flag for identifying nearby objects.
|
|
146
147
|
identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)
|
|
147
148
|
|
|
148
|
-
# Get and validate
|
|
149
|
-
|
|
149
|
+
# Get and validate table_output_format.
|
|
150
|
+
table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
|
|
150
151
|
try:
|
|
151
|
-
|
|
152
|
+
table_output_format = TableFormatEnum[table_output_format_str.upper()]
|
|
152
153
|
except KeyError:
|
|
153
154
|
valid_options = [e.name.lower() for e in TableFormatEnum]
|
|
154
155
|
raise ValueError(
|
|
155
|
-
f"Invalid
|
|
156
|
+
f"Invalid table_output_format value: {table_output_format_str}. Expected one of: {valid_options}"
|
|
156
157
|
)
|
|
157
158
|
|
|
158
159
|
# Process nemoretriever_parse configuration.
|
|
@@ -253,7 +254,7 @@ def nemoretriever_parse_extractor(
|
|
|
253
254
|
extract_tables,
|
|
254
255
|
extract_charts,
|
|
255
256
|
extract_infographics,
|
|
256
|
-
|
|
257
|
+
table_output_format,
|
|
257
258
|
nemoretriever_parse_config.yolox_endpoints,
|
|
258
259
|
nemoretriever_parse_config.yolox_infer_protocol,
|
|
259
260
|
nemoretriever_parse_config.auth_token,
|
|
@@ -287,7 +288,7 @@ def nemoretriever_parse_extractor(
|
|
|
287
288
|
extract_tables,
|
|
288
289
|
extract_charts,
|
|
289
290
|
extract_infographics,
|
|
290
|
-
|
|
291
|
+
table_output_format,
|
|
291
292
|
nemoretriever_parse_config.yolox_endpoints,
|
|
292
293
|
nemoretriever_parse_config.yolox_infer_protocol,
|
|
293
294
|
nemoretriever_parse_config.auth_token,
|
|
@@ -355,7 +356,7 @@ def nemoretriever_parse_extractor(
|
|
|
355
356
|
img_numpy = crop_image(page_image, transformed_bbox)
|
|
356
357
|
|
|
357
358
|
if img_numpy is not None:
|
|
358
|
-
base64_img = numpy_to_base64(img_numpy)
|
|
359
|
+
base64_img = numpy_to_base64(img_numpy, format=YOLOX_PAGE_IMAGE_FORMAT)
|
|
359
360
|
image = Base64Image(
|
|
360
361
|
image=base64_img,
|
|
361
362
|
bbox=transformed_bbox,
|
|
@@ -4,20 +4,21 @@
|
|
|
4
4
|
# Copyright (c) 2024, NVIDIA CORPORATION.
|
|
5
5
|
|
|
6
6
|
import base64
|
|
7
|
+
import inspect
|
|
7
8
|
import io
|
|
8
|
-
|
|
9
|
-
import pandas as pd
|
|
10
|
-
from typing import Any, Dict, List, Optional
|
|
11
9
|
import logging
|
|
10
|
+
from typing import Any
|
|
11
|
+
from typing import Dict
|
|
12
|
+
from typing import List
|
|
13
|
+
from typing import Optional
|
|
12
14
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
)
|
|
15
|
+
import pandas as pd
|
|
16
|
+
from nv_ingest_api.internal.extract.pdf.engines import adobe_extractor
|
|
17
|
+
from nv_ingest_api.internal.extract.pdf.engines import llama_parse_extractor
|
|
18
|
+
from nv_ingest_api.internal.extract.pdf.engines import nemoretriever_parse_extractor
|
|
19
|
+
from nv_ingest_api.internal.extract.pdf.engines import pdfium_extractor
|
|
20
|
+
from nv_ingest_api.internal.extract.pdf.engines import tika_extractor
|
|
21
|
+
from nv_ingest_api.internal.extract.pdf.engines import unstructured_io_extractor
|
|
21
22
|
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
22
23
|
|
|
23
24
|
# Import extraction functions for different engines.
|
|
@@ -43,6 +44,7 @@ def _work_extract_pdf(
|
|
|
43
44
|
extract_infographics: bool,
|
|
44
45
|
extract_tables: bool,
|
|
45
46
|
extract_charts: bool,
|
|
47
|
+
extract_page_as_image: bool,
|
|
46
48
|
extractor_config: dict,
|
|
47
49
|
execution_trace_log=None,
|
|
48
50
|
) -> Any:
|
|
@@ -52,17 +54,25 @@ def _work_extract_pdf(
|
|
|
52
54
|
|
|
53
55
|
extract_method = extractor_config["extract_method"]
|
|
54
56
|
extractor_fn = EXTRACTOR_LOOKUP.get(extract_method, pdfium_extractor)
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
57
|
+
|
|
58
|
+
extractor_fn_args = dict(
|
|
59
|
+
pdf_stream=pdf_stream,
|
|
60
|
+
extract_text=extract_text,
|
|
61
|
+
extract_images=extract_images,
|
|
62
|
+
extract_infographics=extract_infographics,
|
|
63
|
+
extract_tables=extract_tables,
|
|
64
|
+
extract_charts=extract_charts,
|
|
65
|
+
extractor_config=extractor_config,
|
|
66
|
+
execution_trace_log=execution_trace_log,
|
|
64
67
|
)
|
|
65
68
|
|
|
69
|
+
if "extract_page_as_image" in inspect.signature(extractor_fn).parameters:
|
|
70
|
+
extractor_fn_args["extract_page_as_image"] = extract_page_as_image
|
|
71
|
+
elif extract_page_as_image:
|
|
72
|
+
logger.warning(f"`extract_page_as_image` is set to True, but {extract_method} does not support it.")
|
|
73
|
+
|
|
74
|
+
return extractor_fn(**extractor_fn_args)
|
|
75
|
+
|
|
66
76
|
|
|
67
77
|
@unified_exception_handler
|
|
68
78
|
def _orchestrate_row_extraction(
|
|
@@ -97,6 +107,7 @@ def _orchestrate_row_extraction(
|
|
|
97
107
|
extract_tables = params.pop("extract_tables", False)
|
|
98
108
|
extract_charts = params.pop("extract_charts", False)
|
|
99
109
|
extract_infographics = params.pop("extract_infographics", False)
|
|
110
|
+
extract_page_as_image = params.pop("extract_page_as_image", False)
|
|
100
111
|
extract_method = params.get("extract_method", "pdfium")
|
|
101
112
|
except KeyError as e:
|
|
102
113
|
raise ValueError(f"Missing required extraction flag: {e}")
|
|
@@ -137,6 +148,7 @@ def _orchestrate_row_extraction(
|
|
|
137
148
|
extract_text=extract_text,
|
|
138
149
|
extract_images=extract_images,
|
|
139
150
|
extract_infographics=extract_infographics,
|
|
151
|
+
extract_page_as_image=extract_page_as_image,
|
|
140
152
|
extract_tables=extract_tables,
|
|
141
153
|
extract_charts=extract_charts,
|
|
142
154
|
extractor_config=extractor_config,
|
|
@@ -24,16 +24,18 @@ import numpy as np
|
|
|
24
24
|
import pandas as pd
|
|
25
25
|
import pypdfium2 as libpdfium
|
|
26
26
|
|
|
27
|
+
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
27
28
|
from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
|
|
28
29
|
from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
|
|
29
30
|
YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
|
|
30
31
|
YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
|
|
31
|
-
get_yolox_model_name,
|
|
32
32
|
YoloxPageElementsModelInterface,
|
|
33
|
+
YOLOX_PAGE_IMAGE_FORMAT,
|
|
33
34
|
)
|
|
34
35
|
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
|
|
35
36
|
from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum
|
|
36
37
|
from nv_ingest_api.util.metadata.aggregators import (
|
|
38
|
+
construct_image_metadata_from_base64,
|
|
37
39
|
construct_image_metadata_from_pdf_image,
|
|
38
40
|
extract_pdf_metadata,
|
|
39
41
|
construct_text_metadata,
|
|
@@ -46,6 +48,7 @@ from nv_ingest_api.util.pdf.pdfium import (
|
|
|
46
48
|
extract_image_like_objects_from_pdfium_page,
|
|
47
49
|
)
|
|
48
50
|
from nv_ingest_api.util.pdf.pdfium import pdfium_pages_to_numpy
|
|
51
|
+
from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
|
|
49
52
|
from nv_ingest_api.util.image_processing.transforms import numpy_to_base64, crop_image
|
|
50
53
|
|
|
51
54
|
logger = logging.getLogger(__name__)
|
|
@@ -54,7 +57,6 @@ logger = logging.getLogger(__name__)
|
|
|
54
57
|
def _extract_page_elements_using_image_ensemble(
|
|
55
58
|
pages: List[Tuple[int, np.ndarray, Tuple[int, int]]],
|
|
56
59
|
yolox_client,
|
|
57
|
-
yolox_model_name: str = "yolox",
|
|
58
60
|
execution_trace_log: Optional[List] = None,
|
|
59
61
|
) -> List[Tuple[int, object]]:
|
|
60
62
|
"""
|
|
@@ -68,8 +70,6 @@ def _extract_page_elements_using_image_ensemble(
|
|
|
68
70
|
and optional padding offset information.
|
|
69
71
|
yolox_client : object
|
|
70
72
|
A pre-configured client instance for the YOLOX inference service.
|
|
71
|
-
yolox_model_name : str, default="yolox"
|
|
72
|
-
The name of the YOLOX model to use for inference.
|
|
73
73
|
execution_trace_log : Optional[List], default=None
|
|
74
74
|
List for accumulating execution trace information.
|
|
75
75
|
|
|
@@ -102,8 +102,11 @@ def _extract_page_elements_using_image_ensemble(
|
|
|
102
102
|
# Perform inference using the NimClient.
|
|
103
103
|
inference_results = yolox_client.infer(
|
|
104
104
|
data,
|
|
105
|
-
model_name="
|
|
105
|
+
model_name="yolox_ensemble",
|
|
106
106
|
max_batch_size=YOLOX_MAX_BATCH_SIZE,
|
|
107
|
+
input_names=["INPUT_IMAGES", "THRESHOLDS"],
|
|
108
|
+
dtypes=["BYTES", "FP32"],
|
|
109
|
+
output_names=["OUTPUT"],
|
|
107
110
|
trace_info=execution_trace_log,
|
|
108
111
|
stage_name="pdf_extraction",
|
|
109
112
|
)
|
|
@@ -186,7 +189,7 @@ def _extract_page_element_images(
|
|
|
186
189
|
if cropped is None:
|
|
187
190
|
continue
|
|
188
191
|
|
|
189
|
-
base64_img = numpy_to_base64(cropped)
|
|
192
|
+
base64_img = numpy_to_base64(cropped, format=YOLOX_PAGE_IMAGE_FORMAT)
|
|
190
193
|
|
|
191
194
|
bbox_in_orig_coord = (
|
|
192
195
|
int(w1) - pad_width,
|
|
@@ -263,7 +266,7 @@ def _extract_page_elements(
|
|
|
263
266
|
extract_tables: bool,
|
|
264
267
|
extract_charts: bool,
|
|
265
268
|
extract_infographics: bool,
|
|
266
|
-
|
|
269
|
+
table_output_format: str,
|
|
267
270
|
yolox_endpoints: Tuple[Optional[str], Optional[str]],
|
|
268
271
|
yolox_infer_protocol: str = "http",
|
|
269
272
|
auth_token: Optional[str] = None,
|
|
@@ -292,7 +295,7 @@ def _extract_page_elements(
|
|
|
292
295
|
Flag indicating whether to extract charts.
|
|
293
296
|
extract_infographics : bool
|
|
294
297
|
Flag indicating whether to extract infographics.
|
|
295
|
-
|
|
298
|
+
table_output_format : str
|
|
296
299
|
Format to use for table content.
|
|
297
300
|
yolox_endpoints : Tuple[Optional[str], Optional[str]]
|
|
298
301
|
A tuple containing the gRPC and HTTP endpoints for the YOLOX service.
|
|
@@ -313,19 +316,7 @@ def _extract_page_elements(
|
|
|
313
316
|
|
|
314
317
|
try:
|
|
315
318
|
# Default model name
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
# Get the HTTP endpoint to determine the model name if needed
|
|
319
|
-
yolox_http_endpoint = yolox_endpoints[1]
|
|
320
|
-
if yolox_http_endpoint:
|
|
321
|
-
try:
|
|
322
|
-
yolox_model_name = get_yolox_model_name(yolox_http_endpoint)
|
|
323
|
-
except Exception as e:
|
|
324
|
-
logger.warning(f"Failed to get YOLOX model name from endpoint: {e}. Using default.")
|
|
325
|
-
|
|
326
|
-
# Create the model interface
|
|
327
|
-
model_interface = YoloxPageElementsModelInterface(yolox_model_name=yolox_model_name)
|
|
328
|
-
|
|
319
|
+
model_interface = YoloxPageElementsModelInterface()
|
|
329
320
|
# Create the inference client
|
|
330
321
|
yolox_client = create_inference_client(
|
|
331
322
|
yolox_endpoints,
|
|
@@ -336,7 +327,7 @@ def _extract_page_elements(
|
|
|
336
327
|
|
|
337
328
|
# Extract page elements using the client
|
|
338
329
|
page_element_results = _extract_page_elements_using_image_ensemble(
|
|
339
|
-
pages, yolox_client,
|
|
330
|
+
pages, yolox_client, execution_trace_log=execution_trace_log
|
|
340
331
|
)
|
|
341
332
|
|
|
342
333
|
# Process each extracted element based on extraction flags
|
|
@@ -351,7 +342,7 @@ def _extract_page_elements(
|
|
|
351
342
|
|
|
352
343
|
# Set content format for tables
|
|
353
344
|
if page_element.type_string == "table":
|
|
354
|
-
page_element.content_format =
|
|
345
|
+
page_element.content_format = table_output_format
|
|
355
346
|
|
|
356
347
|
# Construct metadata for the page element
|
|
357
348
|
page_element_meta = construct_page_element_metadata(
|
|
@@ -384,6 +375,7 @@ def pdfium_extractor(
|
|
|
384
375
|
extract_infographics: bool,
|
|
385
376
|
extract_tables: bool,
|
|
386
377
|
extract_charts: bool,
|
|
378
|
+
extract_page_as_image: bool,
|
|
387
379
|
extractor_config: dict,
|
|
388
380
|
execution_trace_log: Optional[List[Any]] = None,
|
|
389
381
|
) -> pd.DataFrame:
|
|
@@ -407,13 +399,13 @@ def pdfium_extractor(
|
|
|
407
399
|
f"Invalid text_depth: {text_depth_str}. Valid options: {list(TextTypeEnum.__members__.keys())}"
|
|
408
400
|
)
|
|
409
401
|
|
|
410
|
-
# Validate and extract
|
|
411
|
-
|
|
402
|
+
# Validate and extract table_output_format
|
|
403
|
+
table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
|
|
412
404
|
try:
|
|
413
|
-
|
|
405
|
+
table_output_format = TableFormatEnum[table_output_format_str.upper()]
|
|
414
406
|
except KeyError:
|
|
415
407
|
raise ValueError(
|
|
416
|
-
f"Invalid
|
|
408
|
+
f"Invalid table_output_format: {table_output_format_str}. "
|
|
417
409
|
f"Valid options: {list(TableFormatEnum.__members__.keys())}"
|
|
418
410
|
)
|
|
419
411
|
|
|
@@ -524,6 +516,24 @@ def pdfium_extractor(
|
|
|
524
516
|
)
|
|
525
517
|
extracted_data.extend(image_data)
|
|
526
518
|
|
|
519
|
+
# Full page image extraction
|
|
520
|
+
if extract_page_as_image:
|
|
521
|
+
page_text = _extract_page_text(page)
|
|
522
|
+
image, _ = pdfium_pages_to_numpy([page], scale_tuple=(16384, 16384), trace_info=execution_trace_log)
|
|
523
|
+
base64_image = numpy_to_base64(image[0])
|
|
524
|
+
if len(base64_image) > 2**24 - 1:
|
|
525
|
+
base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
|
|
526
|
+
image_meta = construct_image_metadata_from_base64(
|
|
527
|
+
base64_image,
|
|
528
|
+
page_idx,
|
|
529
|
+
page_count,
|
|
530
|
+
source_metadata,
|
|
531
|
+
base_unified_metadata,
|
|
532
|
+
subtype=ContentTypeEnum.PAGE_IMAGE,
|
|
533
|
+
text=page_text,
|
|
534
|
+
)
|
|
535
|
+
extracted_data.append(image_meta)
|
|
536
|
+
|
|
527
537
|
# If we want tables or charts, rasterize the page and store it
|
|
528
538
|
if extract_tables or extract_charts or extract_infographics:
|
|
529
539
|
image, padding_offsets = pdfium_pages_to_numpy(
|
|
@@ -545,7 +555,7 @@ def pdfium_extractor(
|
|
|
545
555
|
extract_tables,
|
|
546
556
|
extract_charts,
|
|
547
557
|
extract_infographics,
|
|
548
|
-
|
|
558
|
+
table_output_format,
|
|
549
559
|
pdfium_config.yolox_endpoints,
|
|
550
560
|
pdfium_config.yolox_infer_protocol,
|
|
551
561
|
pdfium_config.auth_token,
|
|
@@ -567,13 +577,14 @@ def pdfium_extractor(
|
|
|
567
577
|
extract_tables,
|
|
568
578
|
extract_charts,
|
|
569
579
|
extract_infographics,
|
|
570
|
-
|
|
580
|
+
table_output_format,
|
|
571
581
|
pdfium_config.yolox_endpoints,
|
|
572
582
|
pdfium_config.yolox_infer_protocol,
|
|
573
583
|
pdfium_config.auth_token,
|
|
574
584
|
execution_trace_log=execution_trace_log,
|
|
575
585
|
)
|
|
576
586
|
futures.append(future)
|
|
587
|
+
|
|
577
588
|
pages_for_tables.clear()
|
|
578
589
|
|
|
579
590
|
# Wait for all asynchronous jobs to complete.
|