nv-ingest-api 2025.7.16.dev20250716__py3-none-any.whl → 2025.7.17.dev20250717__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic.
- nv_ingest_api/interface/extract.py +18 -18
- nv_ingest_api/internal/extract/image/chart_extractor.py +75 -55
- nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
- nv_ingest_api/internal/extract/image/table_extractor.py +81 -63
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +7 -7
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +9 -9
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +58 -0
- nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
- nv_ingest_api/internal/primitives/nim/nim_client.py +46 -11
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
- nv_ingest_api/internal/transform/split_text.py +13 -8
- nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
- nv_ingest_api/util/image_processing/transforms.py +16 -5
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/RECORD +22 -22
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/extract/image/table_extractor.py

```diff
@@ -15,10 +15,11 @@ import pandas as pd
 
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskTableExtraction
 from nv_ingest_api.internal.enums.common import TableFormatEnum
-from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
+from nv_ingest_api.internal.primitives.nim.model_interface.ocr import OCRModelInterface
+from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
 from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
-from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_paddle_output
-from nv_ingest_api.util.image_processing.table_and_chart import convert_paddle_response_to_psuedo_markdown
+from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_ocr_output
+from nv_ingest_api.util.image_processing.table_and_chart import convert_ocr_response_to_psuedo_markdown
 from nv_ingest_api.internal.primitives.nim import NimClient
 from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxTableStructureModelInterface
 from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
```
```diff
@@ -60,7 +61,8 @@ def _filter_valid_images(base64_images: List[str]) -> Tuple[List[str], List[np.n
 def _run_inference(
     enable_yolox: bool,
     yolox_client: Any,
-    paddle_client: Any,
+    ocr_client: Any,
+    ocr_model_name: str,
     valid_arrays: List[np.ndarray],
     valid_images: List[str],
     trace_info: Optional[Dict] = None,
@@ -68,31 +70,42 @@ def _run_inference(
     """
     Run inference concurrently for YOLOX (if enabled) and Paddle.
 
-    Returns a tuple of (yolox_results, paddle_results).
+    Returns a tuple of (yolox_results, ocr_results).
     """
-    data_paddle = {"base64_images": valid_images}
+    data_ocr = {"base64_images": valid_images}
     if enable_yolox:
         data_yolox = {"images": valid_arrays}
+        future_yolox_kwargs = dict(
+            data=data_yolox,
+            model_name="yolox",
+            stage_name="table_extraction",
+            max_batch_size=8,
+            trace_info=trace_info,
+        )
+
+    future_ocr_kwargs = dict(
+        data=data_ocr,
+        stage_name="table_extraction",
+        max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
+        trace_info=trace_info,
+    )
+    if ocr_model_name == "paddle":
+        future_ocr_kwargs.update(
+            model_name="paddle",
+        )
+    else:
+        future_ocr_kwargs.update(
+            model_name="scene_text",
+            input_names=["input", "merge_levels"],
+            dtypes=["FP32", "BYTES"],
+            merge_level="word",
+        )
 
     with ThreadPoolExecutor(max_workers=2) as executor:
         future_yolox = None
         if enable_yolox:
-            future_yolox = executor.submit(
-                yolox_client.infer,
-                data=data_yolox,
-                model_name="yolox",
-                stage_name="table_extraction",
-                max_batch_size=8,
-                trace_info=trace_info,
-            )
-        future_paddle = executor.submit(
-            paddle_client.infer,
-            data=data_paddle,
-            model_name="paddle",
-            stage_name="table_extraction",
-            max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
-            trace_info=trace_info,
-        )
+            future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)
+        future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)
 
     if enable_yolox:
         try:
```
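The rewrite above moves both submit calls onto plain kwargs dicts so a single code path serves either OCR backend. A standalone sketch of that dispatch, mirroring the model names, batch sizes, and Triton-style tensor arguments shown in the hunk (the helper name here is illustrative, not part of the package):

```python
def build_ocr_infer_kwargs(protocol: str, ocr_model_name: str) -> dict:
    """Illustrative sketch of the kwargs dispatch shown in the hunk above."""
    kwargs = {
        "stage_name": "table_extraction",
        # gRPC requests are capped at one image; HTTP allows a small batch.
        "max_batch_size": 1 if protocol == "grpc" else 2,
    }
    if ocr_model_name == "paddle":
        kwargs["model_name"] = "paddle"
    else:
        # The scene_text path carries Triton-style tensor names and dtypes.
        kwargs.update(
            model_name="scene_text",
            input_names=["input", "merge_levels"],
            dtypes=["FP32", "BYTES"],
            merge_level="word",
        )
    return kwargs

assert build_ocr_infer_kwargs("grpc", "paddle")["max_batch_size"] == 1
assert build_ocr_infer_kwargs("http", "scene_text")["model_name"] == "scene_text"
```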
```diff
@@ -104,17 +117,17 @@ def _run_inference(
         yolox_results = [None] * len(valid_images)
 
     try:
-        paddle_results = future_paddle.result()
+        ocr_results = future_ocr.result()
     except Exception as e:
-        logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
+        logger.error(f"Error calling ocr_client.infer: {e}", exc_info=True)
         raise
 
-    return yolox_results, paddle_results
+    return yolox_results, ocr_results
 
 
 def _validate_inference_results(
     yolox_results: Any,
-    paddle_results: Any,
+    ocr_results: Any,
     valid_arrays: List[Any],
     valid_images: List[str],
 ) -> Tuple[List[Any], List[Any]]:
@@ -123,46 +136,47 @@ def _validate_inference_results(
 
     If not, default values are assigned. Raises a ValueError if the lengths do not match.
     """
-    if not isinstance(yolox_results, list) or not isinstance(paddle_results, list):
+    if not isinstance(yolox_results, list) or not isinstance(ocr_results, list):
         logger.warning(
-            "Unexpected result types from inference clients: yolox_results=%s, paddle_results=%s. "
+            "Unexpected result types from inference clients: yolox_results=%s, ocr_results=%s. "
             "Proceeding with available results.",
             type(yolox_results).__name__,
-            type(paddle_results).__name__,
+            type(ocr_results).__name__,
         )
         if not isinstance(yolox_results, list):
             yolox_results = [None] * len(valid_arrays)
-        if not isinstance(paddle_results, list):
-            paddle_results = [(None, None)] * len(valid_images)
+        if not isinstance(ocr_results, list):
+            ocr_results = [(None, None)] * len(valid_images)
 
     if len(yolox_results) != len(valid_arrays):
         raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
-    if len(paddle_results) != len(valid_images):
-        raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")
+    if len(ocr_results) != len(valid_images):
+        raise ValueError(f"Expected {len(valid_images)} ocr results, got {len(ocr_results)}")
 
-    return yolox_results, paddle_results
+    return yolox_results, ocr_results
 
 
 def _update_table_metadata(
     base64_images: List[str],
     yolox_client: Any,
-    paddle_client: Any,
+    ocr_client: Any,
+    ocr_model_name: str,
     worker_pool_size: int = 8,  # Not currently used
     enable_yolox: bool = False,
     trace_info: Optional[Dict] = None,
 ) -> List[Tuple[str, Any, Any, Any]]:
     """
     Given a list of base64-encoded images, this function filters out images that do not meet
-    the minimum size requirements and then calls the paddle model via paddle_client.infer
+    the minimum size requirements and then calls the OCR model via ocr_client.infer
     to extract table data.
 
     For each base64-encoded image, the result is a tuple:
-        (base64_image, yolox_result, paddle_text_predictions, paddle_bounding_boxes)
+        (base64_image, yolox_result, ocr_text_predictions, ocr_bounding_boxes)
 
     Images that do not meet the minimum size are skipped (resulting in placeholders).
-    The paddle_client is expected to handle any necessary batching and concurrency.
+    The ocr_client is expected to handle any necessary batching and concurrency.
     """
-    logger.debug(f"Running table extraction using protocol {paddle_client.protocol}")
+    logger.debug(f"Running table extraction using protocol {ocr_client.protocol}")
 
     # Initialize the results list with default placeholders.
     results: List[Tuple[str, Any, Any, Any]] = [("", None, None, None)] * len(base64_images)
@@ -174,24 +188,23 @@ def _update_table_metadata(
         return results
 
     # Run inference concurrently.
-    yolox_results, paddle_results = _run_inference(
+    yolox_results, ocr_results = _run_inference(
         enable_yolox=enable_yolox,
         yolox_client=yolox_client,
-        paddle_client=paddle_client,
+        ocr_client=ocr_client,
+        ocr_model_name=ocr_model_name,
         valid_arrays=valid_arrays,
         valid_images=valid_images,
         trace_info=trace_info,
     )
 
     # Validate that the inference results have the expected structure.
-    yolox_results, paddle_results = _validate_inference_results(
-        yolox_results, paddle_results, valid_arrays, valid_images
-    )
+    yolox_results, ocr_results = _validate_inference_results(yolox_results, ocr_results, valid_arrays, valid_images)
 
     # Combine results with the original order.
-    for idx, (yolox_res, paddle_res) in enumerate(zip(yolox_results, paddle_results)):
+    for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
         original_index = valid_indices[idx]
-        results[original_index] = (base64_images[original_index], yolox_res, paddle_res[0], paddle_res[1])
+        results[original_index] = (base64_images[original_index], yolox_res, ocr_res[0], ocr_res[1])
 
     return results
 
```
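The fallback defaults in `_validate_inference_results` are easiest to see with toy values. A minimal, self-contained illustration of the placeholder shape, one `(None, None)` pair per image:

```python
# Toy check of the fallback above: a non-list OCR result is replaced by one
# (None, None) placeholder per valid image, so downstream unpacking never fails.
valid_images = ["<b64 image 1>", "<b64 image 2>"]
ocr_results = None  # e.g. an unexpected client response
if not isinstance(ocr_results, list):
    ocr_results = [(None, None)] * len(valid_images)
assert ocr_results == [(None, None), (None, None)]
```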
```diff
@@ -199,14 +212,14 @@ def _update_table_metadata(
 def _create_clients(
     yolox_endpoints: Tuple[str, str],
     yolox_protocol: str,
-    paddle_endpoints: Tuple[str, str],
-    paddle_protocol: str,
+    ocr_endpoints: Tuple[str, str],
+    ocr_protocol: str,
     auth_token: str,
 ) -> Tuple[NimClient, NimClient]:
     yolox_model_interface = YoloxTableStructureModelInterface()
-    paddle_model_interface = PaddleOCRModelInterface()
+    ocr_model_interface = OCRModelInterface()
 
-    logger.debug(f"Inference protocols: yolox={yolox_protocol}, paddle={paddle_protocol}")
+    logger.debug(f"Inference protocols: yolox={yolox_protocol}, ocr={ocr_protocol}")
 
     yolox_client = create_inference_client(
         endpoints=yolox_endpoints,
@@ -215,14 +228,14 @@ def _create_clients(
         infer_protocol=yolox_protocol,
     )
 
-    paddle_client = create_inference_client(
-        endpoints=paddle_endpoints,
-        model_interface=paddle_model_interface,
+    ocr_client = create_inference_client(
+        endpoints=ocr_endpoints,
+        model_interface=ocr_model_interface,
         auth_token=auth_token,
-        infer_protocol=paddle_protocol,
+        infer_protocol=ocr_protocol,
     )
 
-    return yolox_client, paddle_client
+    return yolox_client, ocr_client
 
 
 def extract_table_data_from_image_internal(
```
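A hypothetical call of the new signature from inside the module; the endpoint values are placeholders, and the `(grpc, http)` ordering of each tuple follows the `ocr_endpoints[0]` usage in the next hunk:

```python
# Placeholder endpoints, not the package's defaults.
yolox_client, ocr_client = _create_clients(
    yolox_endpoints=("localhost:8001", "http://localhost:8000/v1/infer"),
    yolox_protocol="grpc",
    ocr_endpoints=("localhost:8011", "http://localhost:8010/v1/infer"),
    ocr_protocol="grpc",
    auth_token="",
)
```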
```diff
@@ -262,14 +275,18 @@ def extract_table_data_from_image_internal(
         return df_extraction_ledger, execution_trace_log
 
     endpoint_config = extraction_config.endpoint_config
-    yolox_client, paddle_client = _create_clients(
+    yolox_client, ocr_client = _create_clients(
         endpoint_config.yolox_endpoints,
         endpoint_config.yolox_infer_protocol,
-        endpoint_config.paddle_endpoints,
-        endpoint_config.paddle_infer_protocol,
+        endpoint_config.ocr_endpoints,
+        endpoint_config.ocr_infer_protocol,
         endpoint_config.auth_token,
     )
 
+    # Get the grpc endpoint to determine the model if needed
+    ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
+    ocr_model_name = get_ocr_model_name(ocr_grpc_endpoint)
+
     try:
         # 1) Identify rows that meet criteria (structured, subtype=table, table_metadata != None, content not empty)
         def meets_criteria(row):
```
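`get_ocr_model_name` itself lives in the renamed `ocr.py` and is not shown in this diff. Going only by the call site above and the `"paddle"`/`"scene_text"` dispatch in `_run_inference`, a hypothetical stand-in could look like the following; the name, the empty-endpoint fallback, and the environment-variable override are all assumptions, not the shipped logic:

```python
import os

def get_ocr_model_name_sketch(ocr_grpc_endpoint: str) -> str:
    """Hypothetical stand-in for get_ocr_model_name; not the shipped helper."""
    if not ocr_grpc_endpoint:
        # Nothing to probe, so assume the legacy PaddleOCR deployment.
        return "paddle"
    # Assumed behavior: let the deployment pin the served model explicitly.
    return os.environ.get("OCR_MODEL_NAME", "paddle")
```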
```diff
@@ -309,7 +326,8 @@ def extract_table_data_from_image_internal(
         bulk_results = _update_table_metadata(
             base64_images=base64_images,
             yolox_client=yolox_client,
-            paddle_client=paddle_client,
+            ocr_client=ocr_client,
+            ocr_model_name=ocr_model_name,
             worker_pool_size=endpoint_config.workers_per_progress_engine,
             enable_yolox=enable_yolox,
             trace_info=execution_trace_log,
@@ -317,15 +335,15 @@ def extract_table_data_from_image_internal(
 
         # 4) Write the results (bounding_boxes, text_predictions) back
         for row_id, idx in enumerate(valid_indices):
-            # unpack (base64_image, (yolox_predictions, paddle_bounding boxes, paddle_text_predictions))
+            # unpack (base64_image, (yolox_predictions, ocr_bounding boxes, ocr_text_predictions))
             _, cell_predictions, bounding_boxes, text_predictions = bulk_results[row_id]
 
             if table_content_format == TableFormatEnum.SIMPLE:
                 table_content = " ".join(text_predictions)
             elif table_content_format == TableFormatEnum.PSEUDO_MARKDOWN:
-                table_content = convert_paddle_response_to_psuedo_markdown(bounding_boxes, text_predictions)
+                table_content = convert_ocr_response_to_psuedo_markdown(bounding_boxes, text_predictions)
             elif table_content_format == TableFormatEnum.MARKDOWN:
-                table_content = join_yolox_table_structure_and_paddle_output(
+                table_content = join_yolox_table_structure_and_ocr_output(
                     cell_predictions, bounding_boxes, text_predictions
                 )
             else:
@@ -341,4 +359,4 @@ def extract_table_data_from_image_internal(
         raise
     finally:
         yolox_client.close()
-        paddle_client.close()
+        ocr_client.close()
```
nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py

```diff
@@ -101,7 +101,7 @@ def nemoretriever_parse_extractor(
     - text_depth : str, optional (default is "page")
     - extract_tables_method : str, optional (default is "yolox")
     - identify_nearby_objects : bool, optional (default is True)
-    - paddle_output_format : str, optional (default is "pseudo_markdown")
+    - table_output_format : str, optional (default is "pseudo_markdown")
     - pdfium_config : dict, optional (configuration for PDFium)
     - nemoretriever_parse_config : dict, optional (configuration for NemoRetrieverParse)
     - metadata_column : str, optional (default is "metadata")
@@ -146,14 +146,14 @@ def nemoretriever_parse_extractor(
     # Flag for identifying nearby objects.
     identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)
 
-    # Get and validate paddle_output_format.
-    paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+    # Get and validate table_output_format.
+    table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
     try:
-        paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+        table_output_format = TableFormatEnum[table_output_format_str.upper()]
     except KeyError:
         valid_options = [e.name.lower() for e in TableFormatEnum]
         raise ValueError(
-            f"Invalid paddle_output_format value: {paddle_output_format_str}. Expected one of: {valid_options}"
+            f"Invalid table_output_format value: {table_output_format_str}. Expected one of: {valid_options}"
         )
 
     # Process nemoretriever_parse configuration.
```
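The configuration key moves from `paddle_output_format` to `table_output_format` while the lookup-and-validate pattern stays the same. A self-contained illustration against a stand-in enum; its members are limited to the formats visible in this diff, and the real `TableFormatEnum` may define more:

```python
from enum import Enum

class TableFormatEnum(str, Enum):
    # Stand-in for nv_ingest_api.internal.enums.common.TableFormatEnum,
    # restricted to the members that appear in this diff.
    SIMPLE = "simple"
    PSEUDO_MARKDOWN = "pseudo_markdown"
    MARKDOWN = "markdown"

extractor_config = {"table_output_format": "markdown"}
table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
table_output_format = TableFormatEnum[table_output_format_str.upper()]
assert table_output_format is TableFormatEnum.MARKDOWN
```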
```diff
@@ -254,7 +254,7 @@ def nemoretriever_parse_extractor(
         extract_tables,
         extract_charts,
         extract_infographics,
-        paddle_output_format,
+        table_output_format,
         nemoretriever_parse_config.yolox_endpoints,
         nemoretriever_parse_config.yolox_infer_protocol,
         nemoretriever_parse_config.auth_token,
@@ -288,7 +288,7 @@ def nemoretriever_parse_extractor(
         extract_tables,
         extract_charts,
         extract_infographics,
-        paddle_output_format,
+        table_output_format,
         nemoretriever_parse_config.yolox_endpoints,
         nemoretriever_parse_config.yolox_infer_protocol,
         nemoretriever_parse_config.auth_token,
```
nv_ingest_api/internal/extract/pdf/engines/pdfium.py

```diff
@@ -267,7 +267,7 @@ def _extract_page_elements(
     extract_tables: bool,
     extract_charts: bool,
     extract_infographics: bool,
-    paddle_output_format: str,
+    table_output_format: str,
     yolox_endpoints: Tuple[Optional[str], Optional[str]],
     yolox_infer_protocol: str = "http",
     auth_token: Optional[str] = None,
@@ -296,7 +296,7 @@ def _extract_page_elements(
         Flag indicating whether to extract charts.
     extract_infographics : bool
         Flag indicating whether to extract infographics.
-    paddle_output_format : str
+    table_output_format : str
         Format to use for table content.
     yolox_endpoints : Tuple[Optional[str], Optional[str]]
         A tuple containing the gRPC and HTTP endpoints for the YOLOX service.
@@ -355,7 +355,7 @@ def _extract_page_elements(
 
         # Set content format for tables
         if page_element.type_string == "table":
-            page_element.content_format = paddle_output_format
+            page_element.content_format = table_output_format
 
         # Construct metadata for the page element
         page_element_meta = construct_page_element_metadata(
```
```diff
@@ -412,13 +412,13 @@ def pdfium_extractor(
         f"Invalid text_depth: {text_depth_str}. Valid options: {list(TextTypeEnum.__members__.keys())}"
     )
 
-    # Validate and extract paddle_output_format
-    paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+    # Validate and extract table_output_format
+    table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
     try:
-        paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+        table_output_format = TableFormatEnum[table_output_format_str.upper()]
     except KeyError:
         raise ValueError(
-            f"Invalid paddle_output_format: {paddle_output_format_str}. "
+            f"Invalid table_output_format: {table_output_format_str}. "
             f"Valid options: {list(TableFormatEnum.__members__.keys())}"
         )
 
@@ -568,7 +568,7 @@ def pdfium_extractor(
         extract_tables,
         extract_charts,
         extract_infographics,
-        paddle_output_format,
+        table_output_format,
         pdfium_config.yolox_endpoints,
         pdfium_config.yolox_infer_protocol,
         pdfium_config.auth_token,
@@ -590,7 +590,7 @@ def pdfium_extractor(
         extract_tables,
         extract_charts,
         extract_infographics,
-        paddle_output_format,
+        table_output_format,
         pdfium_config.yolox_endpoints,
         pdfium_config.yolox_infer_protocol,
         pdfium_config.auth_token,
```
nv_ingest_api/internal/primitives/nim/model_interface/helpers.py

```diff
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
+from typing import Optional
 
 import backoff
 import cv2
```
```diff
@@ -81,6 +82,63 @@ def preprocess_image_for_paddle(array: np.ndarray, image_max_dimension: int = 96
     return transposed, metadata
 
 
+def preprocess_image_for_ocr(
+    array: np.ndarray,
+    target_height: Optional[int] = None,
+    target_width: Optional[int] = None,
+    pad_how: str = "bottom_right",
+) -> np.ndarray:
+    """
+    Preprocesses an input image to be suitable for use with NemoRetriever-OCR.
+
+    This function is intended for preprocessing images to be passed as input to NemoRetriever-OCR using GRPC.
+    It is not necessary when using the HTTP endpoint.
+
+    Parameters:
+    ----------
+    array : np.ndarray
+        The input image array of shape (height, width, channels). It should have pixel values in the range [0, 255].
+
+    Returns:
+    -------
+    np.ndarray
+        A preprocessed image with the shape (channels, height, width).
+    """
+    height, width = array.shape[:2]
+
+    if target_height is None:
+        target_height = height
+
+    if target_width is None:
+        target_width = width
+
+    padded, (pad_width, pad_height) = pad_image(
+        array,
+        target_height=target_height,
+        target_width=target_width,
+        background_color=255,
+        dtype=np.float32,
+        how=pad_how,
+    )
+
+    padded = padded / 255.0
+
+    # NemoRetriever-OCR NIM (GRPC) requires input to be (channel, height, width).
+    transposed = padded.transpose((2, 0, 1))
+
+    # Metadata can used for inverting transformations on the resulting bounding boxes.
+    metadata = {
+        "original_height": height,
+        "original_width": width,
+        "new_height": target_height,
+        "new_width": target_width,
+        "pad_height": pad_height,
+        "pad_width": pad_width,
+    }
+
+    return transposed, metadata
+
+
 def is_ready(http_endpoint: str, ready_endpoint: str) -> bool:
     """
     Check if the server at the given endpoint is ready.
```
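A hypothetical use of the new helper on a synthetic image; per its docstring, this preprocessing is only needed on the gRPC path:

```python
import numpy as np

from nv_ingest_api.internal.primitives.nim.model_interface.helpers import preprocess_image_for_ocr

# Synthetic 100x80 RGB image with pixel values in [0, 255], as the docstring expects.
image = np.random.randint(0, 256, size=(100, 80, 3), dtype=np.uint8)

# Pad to a fixed canvas, scale to [0, 1], and transpose to CHW for gRPC input.
tensor, metadata = preprocess_image_for_ocr(image, target_height=128, target_width=128)

assert tensor.shape == (3, 128, 128)       # (channels, height, width)
assert metadata["original_height"] == 100  # kept so bounding boxes can be mapped back
```

Note that although the annotation reads `-> np.ndarray`, the function returns the `(transposed, metadata)` pair, matching `preprocess_image_for_paddle` above it.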
|