nv-ingest-api 2025.7.16.dev20250716__py3-none-any.whl → 2025.7.18.dev20250718__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_api/interface/extract.py +18 -18
- nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
- nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
- nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
- nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +10 -7
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +16 -29
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
- nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +37 -224
- nv_ingest_api/internal/primitives/nim/nim_client.py +55 -14
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
- nv_ingest_api/internal/transform/split_text.py +13 -8
- nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
- nv_ingest_api/util/image_processing/transforms.py +16 -5
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/RECORD +24 -24
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/extract/image/table_extractor.py

@@ -15,10 +15,11 @@ import pandas as pd
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskTableExtraction
 from nv_ingest_api.internal.enums.common import TableFormatEnum
-from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
+from nv_ingest_api.internal.primitives.nim.model_interface.ocr import OCRModelInterface
+from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
 from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
-from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_paddle_output
-from nv_ingest_api.util.image_processing.table_and_chart import convert_paddle_response_to_psuedo_markdown
+from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_ocr_output
+from nv_ingest_api.util.image_processing.table_and_chart import convert_ocr_response_to_psuedo_markdown
 from nv_ingest_api.internal.primitives.nim import NimClient
 from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxTableStructureModelInterface
 from nv_ingest_api.util.image_processing.transforms import base64_to_numpy

@@ -60,7 +61,8 @@ def _filter_valid_images(base64_images: List[str]) -> Tuple[List[str], List[np.ndarray], List[int]]:
 def _run_inference(
     enable_yolox: bool,
     yolox_client: Any,
-    paddle_client: Any,
+    ocr_client: Any,
+    ocr_model_name: str,
     valid_arrays: List[np.ndarray],
     valid_images: List[str],
     trace_info: Optional[Dict] = None,

@@ -68,32 +70,45 @@ def _run_inference(
     """
     Run inference concurrently for YOLOX (if enabled) and Paddle.

-    Returns a tuple of (yolox_results, paddle_results).
+    Returns a tuple of (yolox_results, ocr_results).
     """
-    data_paddle = {"base64_images": valid_images}
+    data_ocr = {"base64_images": valid_images}
     if enable_yolox:
         data_yolox = {"images": valid_arrays}
-
-    with ThreadPoolExecutor(max_workers=2) as executor:
-        future_yolox = None
-        if enable_yolox:
-            future_yolox = executor.submit(
-                yolox_client.infer,
-                data=data_yolox,
-                model_name="yolox",
-                stage_name="table_extraction",
-                max_batch_size=8,
-                trace_info=trace_info,
-            )
-        future_paddle = executor.submit(
-            paddle_client.infer,
-            data=data_paddle,
-            model_name="paddle",
-            stage_name="table_extraction",
-            max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
-            trace_info=trace_info,
-        )
+        future_yolox_kwargs = dict(
+            data=data_yolox,
+            model_name="yolox_ensemble",
+            stage_name="table_extraction",
+            max_batch_size=8,
+            input_names=["INPUT_IMAGES", "THRESHOLDS"],
+            dtypes=["BYTES", "FP32"],
+            output_names=["OUTPUT"],
+            trace_info=trace_info,
+        )

+    future_ocr_kwargs = dict(
+        data=data_ocr,
+        stage_name="table_extraction",
+        max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
+        trace_info=trace_info,
+    )
+    if ocr_model_name == "paddle":
+        future_ocr_kwargs.update(
+            model_name="paddle",
+        )
+    else:
+        future_ocr_kwargs.update(
+            model_name="scene_text",
+            input_names=["input", "merge_levels"],
+            dtypes=["FP32", "BYTES"],
+            merge_level="word",
+        )
+
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)
+        future_yolox = None
+        if enable_yolox:
+            future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)
     if enable_yolox:
         try:
             yolox_results = future_yolox.result()
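The restructured body separates request construction from submission: per-model kwargs are built first, then both `infer` calls are submitted to a two-worker pool so the YOLOX and OCR services run in parallel. A minimal sketch of that submission pattern (the `infer` callables and kwargs here are stand-ins for illustration, not the real NimClient API):

from concurrent.futures import ThreadPoolExecutor

def submit_concurrently(ocr_infer, yolox_infer, ocr_kwargs, yolox_kwargs, enable_yolox=True):
    # Submit both requests before blocking on either result so the two
    # services overlap instead of running back to back.
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_ocr = executor.submit(ocr_infer, **ocr_kwargs)
        future_yolox = executor.submit(yolox_infer, **yolox_kwargs) if enable_yolox else None

    # Exiting the context manager joins both workers; .result() then
    # re-raises any exception raised inside a worker thread.
    yolox_results = future_yolox.result() if future_yolox is not None else None
    return yolox_results, future_ocr.result()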
@@ -104,17 +119,17 @@ def _run_inference(
             yolox_results = [None] * len(valid_images)

     try:
-        paddle_results = future_paddle.result()
+        ocr_results = future_ocr.result()
     except Exception as e:
-        logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
+        logger.error(f"Error calling ocr_client.infer: {e}", exc_info=True)
         raise

-    return yolox_results, paddle_results
+    return yolox_results, ocr_results


 def _validate_inference_results(
     yolox_results: Any,
-    paddle_results: Any,
+    ocr_results: Any,
     valid_arrays: List[Any],
     valid_images: List[str],
 ) -> Tuple[List[Any], List[Any]]:

@@ -123,46 +138,47 @@ def _validate_inference_results(
     If not, default values are assigned. Raises a ValueError if the lengths do not match.
     """
-    if not isinstance(yolox_results, list) or not isinstance(paddle_results, list):
+    if not isinstance(yolox_results, list) or not isinstance(ocr_results, list):
         logger.warning(
-            "Unexpected result types from inference clients: yolox_results=%s, paddle_results=%s. "
+            "Unexpected result types from inference clients: yolox_results=%s, ocr_results=%s. "
             "Proceeding with available results.",
             type(yolox_results).__name__,
-            type(paddle_results).__name__,
+            type(ocr_results).__name__,
         )
         if not isinstance(yolox_results, list):
             yolox_results = [None] * len(valid_arrays)
-        if not isinstance(paddle_results, list):
-            paddle_results = [(None, None)] * len(valid_images)
+        if not isinstance(ocr_results, list):
+            ocr_results = [(None, None)] * len(valid_images)

     if len(yolox_results) != len(valid_arrays):
         raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
-    if len(paddle_results) != len(valid_images):
-        raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")
+    if len(ocr_results) != len(valid_images):
+        raise ValueError(f"Expected {len(valid_images)} ocr results, got {len(ocr_results)}")

-    return yolox_results, paddle_results
+    return yolox_results, ocr_results


 def _update_table_metadata(
     base64_images: List[str],
     yolox_client: Any,
-    paddle_client: Any,
+    ocr_client: Any,
+    ocr_model_name: str,
     worker_pool_size: int = 8,  # Not currently used
     enable_yolox: bool = False,
     trace_info: Optional[Dict] = None,
 ) -> List[Tuple[str, Any, Any, Any]]:
     """
     Given a list of base64-encoded images, this function filters out images that do not meet
-    the minimum size requirements and then calls the PaddleOCR model via paddle_client.infer
+    the minimum size requirements and then calls the OCR model via ocr_client.infer
     to extract table data.

     For each base64-encoded image, the result is a tuple:
-        (base64_image, yolox_result, paddle_text_predictions, paddle_bounding_boxes)
+        (base64_image, yolox_result, ocr_text_predictions, ocr_bounding_boxes)

     Images that do not meet the minimum size are skipped (resulting in placeholders).
-    The paddle_client is expected to handle any necessary batching and concurrency.
+    The ocr_client is expected to handle any necessary batching and concurrency.
     """
-    logger.debug(f"Running table extraction using protocol {paddle_client.protocol}")
+    logger.debug(f"Running table extraction using protocol {ocr_client.protocol}")

     # Initialize the results list with default placeholders.
     results: List[Tuple[str, Any, Any, Any]] = [("", None, None, None)] * len(base64_images)

@@ -174,24 +190,23 @@ def _update_table_metadata(
         return results

     # Run inference concurrently.
-    yolox_results, paddle_results = _run_inference(
+    yolox_results, ocr_results = _run_inference(
         enable_yolox=enable_yolox,
         yolox_client=yolox_client,
-        paddle_client=paddle_client,
+        ocr_client=ocr_client,
+        ocr_model_name=ocr_model_name,
         valid_arrays=valid_arrays,
         valid_images=valid_images,
         trace_info=trace_info,
     )

     # Validate that the inference results have the expected structure.
-    yolox_results, paddle_results = _validate_inference_results(
-        yolox_results, paddle_results, valid_arrays, valid_images
-    )
+    yolox_results, ocr_results = _validate_inference_results(yolox_results, ocr_results, valid_arrays, valid_images)

     # Combine results with the original order.
-    for idx, (yolox_res, paddle_res) in enumerate(zip(yolox_results, paddle_results)):
+    for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
         original_index = valid_indices[idx]
-        results[original_index] = (base64_images[original_index], yolox_res, paddle_res[0], paddle_res[1])
+        results[original_index] = (base64_images[original_index], yolox_res, ocr_res[0], ocr_res[1])

     return results
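`_update_table_metadata` pre-fills the output list with placeholders and scatters the paired results back by the saved indices, so images filtered out for being too small keep stable positions. The same pattern in isolation (names are illustrative):

def scatter_back(base64_images, valid_indices, yolox_results, ocr_results):
    # One placeholder per input image; filtered-out images keep this default.
    results = [("", None, None, None)] * len(base64_images)
    for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
        original_index = valid_indices[idx]
        # ocr_res is a (text_predictions, bounding_boxes) pair per the new code.
        results[original_index] = (base64_images[original_index], yolox_res, ocr_res[0], ocr_res[1])
    return results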
@@ -199,14 +214,14 @@ def _update_table_metadata(
 def _create_clients(
     yolox_endpoints: Tuple[str, str],
     yolox_protocol: str,
-    paddle_endpoints: Tuple[str, str],
-    paddle_protocol: str,
+    ocr_endpoints: Tuple[str, str],
+    ocr_protocol: str,
     auth_token: str,
 ) -> Tuple[NimClient, NimClient]:
     yolox_model_interface = YoloxTableStructureModelInterface()
-    paddle_model_interface = PaddleOCRModelInterface()
+    ocr_model_interface = OCRModelInterface()

-    logger.debug(f"Inference protocols: yolox={yolox_protocol}, paddle={paddle_protocol}")
+    logger.debug(f"Inference protocols: yolox={yolox_protocol}, ocr={ocr_protocol}")

     yolox_client = create_inference_client(
         endpoints=yolox_endpoints,

@@ -215,14 +230,14 @@ def _create_clients(
         infer_protocol=yolox_protocol,
     )

-    paddle_client = create_inference_client(
-        endpoints=paddle_endpoints,
-        model_interface=paddle_model_interface,
+    ocr_client = create_inference_client(
+        endpoints=ocr_endpoints,
+        model_interface=ocr_model_interface,
         auth_token=auth_token,
-        infer_protocol=paddle_protocol,
+        infer_protocol=ocr_protocol,
     )

-    return yolox_client, paddle_client
+    return yolox_client, ocr_client


 def extract_table_data_from_image_internal(

@@ -262,14 +277,18 @@ def extract_table_data_from_image_internal(
         return df_extraction_ledger, execution_trace_log

     endpoint_config = extraction_config.endpoint_config
-    yolox_client, paddle_client = _create_clients(
+    yolox_client, ocr_client = _create_clients(
         endpoint_config.yolox_endpoints,
         endpoint_config.yolox_infer_protocol,
-        endpoint_config.paddle_endpoints,
-        endpoint_config.paddle_infer_protocol,
+        endpoint_config.ocr_endpoints,
+        endpoint_config.ocr_infer_protocol,
         endpoint_config.auth_token,
     )

+    # Get the grpc endpoint to determine the model if needed
+    ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
+    ocr_model_name = get_ocr_model_name(ocr_grpc_endpoint)
+
     try:
         # 1) Identify rows that meet criteria (structured, subtype=table, table_metadata != None, content not empty)
         def meets_criteria(row):
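The model name returned by `get_ocr_model_name` later selects the request kwargs inside `_run_inference`. A condensed view of that dispatch, mirroring the hunks above (`choose_ocr_request_kwargs` is a hypothetical helper, not part of the package):

def choose_ocr_request_kwargs(ocr_model_name: str) -> dict:
    # The legacy PaddleOCR NIM keeps its simple request shape.
    if ocr_model_name == "paddle":
        return {"model_name": "paddle"}
    # The "scene_text" model needs explicit Triton-style tensor names,
    # dtypes, and a word-level merge policy.
    return {
        "model_name": "scene_text",
        "input_names": ["input", "merge_levels"],
        "dtypes": ["FP32", "BYTES"],
        "merge_level": "word",
    }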
@@ -309,7 +328,8 @@ def extract_table_data_from_image_internal(
         bulk_results = _update_table_metadata(
             base64_images=base64_images,
             yolox_client=yolox_client,
-            paddle_client=paddle_client,
+            ocr_client=ocr_client,
+            ocr_model_name=ocr_model_name,
             worker_pool_size=endpoint_config.workers_per_progress_engine,
             enable_yolox=enable_yolox,
             trace_info=execution_trace_log,

@@ -317,15 +337,15 @@ def extract_table_data_from_image_internal(
         # 4) Write the results (bounding_boxes, text_predictions) back
         for row_id, idx in enumerate(valid_indices):
-            # unpack (base64_image, (yolox_predictions, paddle_bounding_boxes, paddle_text_predictions))
+            # unpack (base64_image, (yolox_predictions, ocr_bounding_boxes, ocr_text_predictions))
             _, cell_predictions, bounding_boxes, text_predictions = bulk_results[row_id]

             if table_content_format == TableFormatEnum.SIMPLE:
                 table_content = " ".join(text_predictions)
             elif table_content_format == TableFormatEnum.PSEUDO_MARKDOWN:
-                table_content = convert_paddle_response_to_psuedo_markdown(bounding_boxes, text_predictions)
+                table_content = convert_ocr_response_to_psuedo_markdown(bounding_boxes, text_predictions)
             elif table_content_format == TableFormatEnum.MARKDOWN:
-                table_content = join_yolox_table_structure_and_paddle_output(
+                table_content = join_yolox_table_structure_and_ocr_output(
                     cell_predictions, bounding_boxes, text_predictions
                 )
             else:

@@ -341,4 +361,4 @@ def extract_table_data_from_image_internal(
         raise
     finally:
         yolox_client.close()
-        paddle_client.close()
+        ocr_client.close()
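For reference, the three output formats handled above reduce to a small dispatch. This sketch reuses the helpers imported at the top of table_extractor.py, with signatures assumed from their call sites in this diff:

from nv_ingest_api.internal.enums.common import TableFormatEnum
from nv_ingest_api.util.image_processing.table_and_chart import (
    convert_ocr_response_to_psuedo_markdown,
    join_yolox_table_structure_and_ocr_output,
)

def render_table_content(fmt, cell_predictions, bounding_boxes, text_predictions):
    if fmt == TableFormatEnum.SIMPLE:
        # Bag of words, no layout.
        return " ".join(text_predictions)
    if fmt == TableFormatEnum.PSEUDO_MARKDOWN:
        # Rows inferred from OCR box positions alone.
        return convert_ocr_response_to_psuedo_markdown(bounding_boxes, text_predictions)
    if fmt == TableFormatEnum.MARKDOWN:
        # Cell structure from YOLOX, text from OCR.
        return join_yolox_table_structure_and_ocr_output(cell_predictions, bounding_boxes, text_predictions)
    raise ValueError(f"Unexpected table format: {fmt}")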
nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py

@@ -101,7 +101,7 @@ def nemoretriever_parse_extractor(
         - text_depth : str, optional (default is "page")
         - extract_tables_method : str, optional (default is "yolox")
         - identify_nearby_objects : bool, optional (default is True)
-        - paddle_output_format : str, optional (default is "pseudo_markdown")
+        - table_output_format : str, optional (default is "pseudo_markdown")
         - pdfium_config : dict, optional (configuration for PDFium)
         - nemoretriever_parse_config : dict, optional (configuration for NemoRetrieverParse)
         - metadata_column : str, optional (default is "metadata")

@@ -146,14 +146,14 @@ def nemoretriever_parse_extractor(
     # Flag for identifying nearby objects.
     identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)

-    # Get and validate paddle_output_format.
-    paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+    # Get and validate table_output_format.
+    table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
     try:
-        paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+        table_output_format = TableFormatEnum[table_output_format_str.upper()]
     except KeyError:
         valid_options = [e.name.lower() for e in TableFormatEnum]
         raise ValueError(
-            f"Invalid paddle_output_format value: {paddle_output_format_str}. Expected one of: {valid_options}"
+            f"Invalid table_output_format value: {table_output_format_str}. Expected one of: {valid_options}"
         )

     # Process nemoretriever_parse configuration.
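The validation above relies on `Enum.__getitem__`, which looks members up by name and raises `KeyError` for anything else. The same logic extracted into a standalone helper (hypothetical function name):

from nv_ingest_api.internal.enums.common import TableFormatEnum

def parse_table_output_format(value: str) -> TableFormatEnum:
    try:
        # Name-based lookup: "pseudo_markdown" -> TableFormatEnum.PSEUDO_MARKDOWN.
        return TableFormatEnum[value.upper()]
    except KeyError:
        valid_options = [e.name.lower() for e in TableFormatEnum]
        raise ValueError(f"Invalid table_output_format value: {value}. Expected one of: {valid_options}")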
@@ -254,10 +254,13 @@ def nemoretriever_parse_extractor(
                 extract_tables,
                 extract_charts,
                 extract_infographics,
-                paddle_output_format,
+                table_output_format,
                 nemoretriever_parse_config.yolox_endpoints,
                 nemoretriever_parse_config.yolox_infer_protocol,
                 nemoretriever_parse_config.auth_token,
+                input_names=["INPUT_IMAGES", "THRESHOLDS"],
+                dtypes=["BYTES", "FP32"],
+                output_names=["OUTPUT"],
                 execution_trace_log=execution_trace_log,
             )
             futures.append(future_yolox)

@@ -288,7 +291,7 @@ def nemoretriever_parse_extractor(
             extract_tables,
             extract_charts,
             extract_infographics,
-            paddle_output_format,
+            table_output_format,
             nemoretriever_parse_config.yolox_endpoints,
             nemoretriever_parse_config.yolox_infer_protocol,
             nemoretriever_parse_config.auth_token,

nv_ingest_api/internal/extract/pdf/engines/pdfium.py

@@ -29,9 +29,8 @@ from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
 from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
     YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
     YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
-    YOLOX_PAGE_IMAGE_FORMAT,
-    get_yolox_model_name,
     YoloxPageElementsModelInterface,
+    YOLOX_PAGE_IMAGE_FORMAT,
 )
 from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
 from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum

@@ -58,7 +57,6 @@ logger = logging.getLogger(__name__)
 def _extract_page_elements_using_image_ensemble(
     pages: List[Tuple[int, np.ndarray, Tuple[int, int]]],
     yolox_client,
-    yolox_model_name: str = "yolox",
     execution_trace_log: Optional[List] = None,
 ) -> List[Tuple[int, object]]:
     """

@@ -72,8 +70,6 @@ def _extract_page_elements_using_image_ensemble(
         and optional padding offset information.
     yolox_client : object
         A pre-configured client instance for the YOLOX inference service.
-    yolox_model_name : str, default="yolox"
-        The name of the YOLOX model to use for inference.
     execution_trace_log : Optional[List], default=None
         List for accumulating execution trace information.

@@ -106,8 +102,11 @@ def _extract_page_elements_using_image_ensemble(
     # Perform inference using the NimClient.
     inference_results = yolox_client.infer(
         data,
-        model_name="yolox",
+        model_name="yolox_ensemble",
         max_batch_size=YOLOX_MAX_BATCH_SIZE,
+        input_names=["INPUT_IMAGES", "THRESHOLDS"],
+        dtypes=["BYTES", "FP32"],
+        output_names=["OUTPUT"],
         trace_info=execution_trace_log,
         stage_name="pdf_extraction",
     )
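The new `input_names`/`dtypes`/`output_names` arguments indicate the gRPC path now targets a Triton-style ensemble with two inputs. A rough sketch of what such a raw call looks like with tritonclient (the shapes and the serialization of `data` are assumptions; NimClient handles this internally):

import numpy as np
import tritonclient.grpc as grpcclient

def call_yolox_ensemble(url: str, encoded_images: list, thresholds: np.ndarray):
    client = grpcclient.InferenceServerClient(url=url)
    # BYTES tensors are carried as numpy object arrays, one element per image.
    images = np.array(encoded_images, dtype=np.object_).reshape(-1, 1)
    inputs = [
        grpcclient.InferInput("INPUT_IMAGES", list(images.shape), "BYTES"),
        grpcclient.InferInput("THRESHOLDS", list(thresholds.shape), "FP32"),
    ]
    inputs[0].set_data_from_numpy(images)
    inputs[1].set_data_from_numpy(thresholds.astype(np.float32))
    response = client.infer(
        model_name="yolox_ensemble",
        inputs=inputs,
        outputs=[grpcclient.InferRequestedOutput("OUTPUT")],
    )
    return response.as_numpy("OUTPUT")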
@@ -267,7 +266,7 @@ def _extract_page_elements(
     extract_tables: bool,
     extract_charts: bool,
     extract_infographics: bool,
-    paddle_output_format: str,
+    table_output_format: str,
     yolox_endpoints: Tuple[Optional[str], Optional[str]],
     yolox_infer_protocol: str = "http",
     auth_token: Optional[str] = None,

@@ -296,7 +295,7 @@ def _extract_page_elements(
         Flag indicating whether to extract charts.
     extract_infographics : bool
         Flag indicating whether to extract infographics.
-    paddle_output_format : str
+    table_output_format : str
         Format to use for table content.
     yolox_endpoints : Tuple[Optional[str], Optional[str]]
         A tuple containing the gRPC and HTTP endpoints for the YOLOX service.

@@ -317,19 +316,7 @@ def _extract_page_elements(
     try:
         # Default model name
-        yolox_model_name = "yolox"
-
-        # Get the HTTP endpoint to determine the model name if needed
-        yolox_http_endpoint = yolox_endpoints[1]
-        if yolox_http_endpoint:
-            try:
-                yolox_model_name = get_yolox_model_name(yolox_http_endpoint)
-            except Exception as e:
-                logger.warning(f"Failed to get YOLOX model name from endpoint: {e}. Using default.")
-
-        # Create the model interface
-        model_interface = YoloxPageElementsModelInterface(yolox_model_name=yolox_model_name)
-
+        model_interface = YoloxPageElementsModelInterface()
         # Create the inference client
         yolox_client = create_inference_client(
             yolox_endpoints,

@@ -340,7 +327,7 @@ def _extract_page_elements(

         # Extract page elements using the client
         page_element_results = _extract_page_elements_using_image_ensemble(
-            pages, yolox_client, yolox_model_name, execution_trace_log=execution_trace_log
+            pages, yolox_client, execution_trace_log=execution_trace_log
         )

         # Process each extracted element based on extraction flags

@@ -355,7 +342,7 @@ def _extract_page_elements(

         # Set content format for tables
         if page_element.type_string == "table":
-            page_element.content_format = paddle_output_format
+            page_element.content_format = table_output_format

         # Construct metadata for the page element
         page_element_meta = construct_page_element_metadata(

@@ -412,13 +399,13 @@ def pdfium_extractor(
             f"Invalid text_depth: {text_depth_str}. Valid options: {list(TextTypeEnum.__members__.keys())}"
         )

-    # Validate and extract paddle_output_format
-    paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+    # Validate and extract table_output_format
+    table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
     try:
-        paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+        table_output_format = TableFormatEnum[table_output_format_str.upper()]
     except KeyError:
         raise ValueError(
-            f"Invalid paddle_output_format: {paddle_output_format_str}. "
+            f"Invalid table_output_format: {table_output_format_str}. "
             f"Valid options: {list(TableFormatEnum.__members__.keys())}"
         )

@@ -568,7 +555,7 @@ def pdfium_extractor(
             extract_tables,
             extract_charts,
             extract_infographics,
-            paddle_output_format,
+            table_output_format,
             pdfium_config.yolox_endpoints,
             pdfium_config.yolox_infer_protocol,
             pdfium_config.auth_token,

@@ -590,7 +577,7 @@ def pdfium_extractor(
             extract_tables,
             extract_charts,
             extract_infographics,
-            paddle_output_format,
+            table_output_format,
             pdfium_config.yolox_endpoints,
             pdfium_config.yolox_infer_protocol,
             pdfium_config.auth_token,

nv_ingest_api/internal/primitives/nim/model_interface/helpers.py

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import logging
+from typing import Optional

 import backoff
 import cv2

@@ -13,6 +14,7 @@ from nv_ingest_api.internal.primitives.nim.model_interface.decorators import multiprocessing_cache
 from nv_ingest_api.util.image_processing.transforms import pad_image, normalize_image
 from nv_ingest_api.util.string_processing import generate_url, remove_url_endpoints

+cv2.setNumThreads(1)
 logger = logging.getLogger(__name__)
@@ -81,6 +83,63 @@ def preprocess_image_for_paddle(array: np.ndarray, image_max_dimension: int = 960):
     return transposed, metadata


+def preprocess_image_for_ocr(
+    array: np.ndarray,
+    target_height: Optional[int] = None,
+    target_width: Optional[int] = None,
+    pad_how: str = "bottom_right",
+) -> np.ndarray:
+    """
+    Preprocesses an input image to be suitable for use with NemoRetriever-OCR.
+
+    This function is intended for preprocessing images to be passed as input to NemoRetriever-OCR using GRPC.
+    It is not necessary when using the HTTP endpoint.
+
+    Parameters:
+    ----------
+    array : np.ndarray
+        The input image array of shape (height, width, channels). It should have pixel values in the range [0, 255].
+
+    Returns:
+    -------
+    np.ndarray
+        A preprocessed image with the shape (channels, height, width).
+    """
+    height, width = array.shape[:2]
+
+    if target_height is None:
+        target_height = height
+
+    if target_width is None:
+        target_width = width
+
+    padded, (pad_width, pad_height) = pad_image(
+        array,
+        target_height=target_height,
+        target_width=target_width,
+        background_color=255,
+        dtype=np.float32,
+        how=pad_how,
+    )
+
+    padded = padded / 255.0
+
+    # NemoRetriever-OCR NIM (GRPC) requires input to be (channel, height, width).
+    transposed = padded.transpose((2, 0, 1))
+
+    # Metadata can be used for inverting transformations on the resulting bounding boxes.
+    metadata = {
+        "original_height": height,
+        "original_width": width,
+        "new_height": target_height,
+        "new_width": target_width,
+        "pad_height": pad_height,
+        "pad_width": pad_width,
+    }
+
+    return transposed, metadata
+
+
 def is_ready(http_endpoint: str, ready_endpoint: str) -> bool:
     """
     Check if the server at the given endpoint is ready.