nv-ingest-api 2025.7.16.dev20250716__py3-none-any.whl → 2025.7.18.dev20250718__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/interface/extract.py +18 -18
- nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
- nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
- nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
- nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +10 -7
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +16 -29
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
- nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +37 -224
- nv_ingest_api/internal/primitives/nim/nim_client.py +55 -14
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
- nv_ingest_api/internal/transform/split_text.py +13 -8
- nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
- nv_ingest_api/util/image_processing/transforms.py +16 -5
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/RECORD +24 -24
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/top_level.txt +0 -0
|
@@ -4,22 +4,37 @@
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
|
-
|
|
7
|
+
import os
|
|
8
|
+
from typing import Any
|
|
8
9
|
from typing import Dict
|
|
10
|
+
from typing import List
|
|
9
11
|
from typing import Optional
|
|
12
|
+
from typing import Tuple
|
|
10
13
|
|
|
14
|
+
import backoff
|
|
11
15
|
import numpy as np
|
|
16
|
+
import tritonclient.grpc as grpcclient
|
|
12
17
|
|
|
13
18
|
from nv_ingest_api.internal.primitives.nim import ModelInterface
|
|
14
|
-
from nv_ingest_api.internal.primitives.nim.model_interface.
|
|
19
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.decorators import (
|
|
20
|
+
multiprocessing_cache,
|
|
21
|
+
)
|
|
22
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.helpers import (
|
|
23
|
+
preprocess_image_for_ocr,
|
|
24
|
+
)
|
|
25
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.helpers import (
|
|
26
|
+
preprocess_image_for_paddle,
|
|
27
|
+
)
|
|
15
28
|
from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
|
|
16
29
|
|
|
30
|
+
DEFAULT_OCR_MODEL_NAME = "paddle"
|
|
31
|
+
|
|
17
32
|
logger = logging.getLogger(__name__)
|
|
18
33
|
|
|
19
34
|
|
|
20
|
-
class
|
|
35
|
+
class OCRModelInterface(ModelInterface):
|
|
21
36
|
"""
|
|
22
|
-
An interface for handling inference with a
|
|
37
|
+
An interface for handling inference with a OCR model, supporting both gRPC and HTTP protocols.
|
|
23
38
|
"""
|
|
24
39
|
|
|
25
40
|
def name(self) -> str:
|
|
@@ -31,7 +46,7 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
31
46
|
str
|
|
32
47
|
The name of the model interface.
|
|
33
48
|
"""
|
|
34
|
-
return "
|
|
49
|
+
return "OCR"
|
|
35
50
|
|
|
36
51
|
def prepare_data_for_inference(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
|
37
52
|
"""
|
|
@@ -126,11 +141,26 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
126
141
|
images = data["image_arrays"]
|
|
127
142
|
dims = data["image_dims"]
|
|
128
143
|
|
|
144
|
+
model_name = kwargs.get("model_name", "paddle")
|
|
145
|
+
merge_level = kwargs.get("merge_level", "paragraph")
|
|
146
|
+
|
|
129
147
|
if protocol == "grpc":
|
|
130
|
-
logger.debug("Formatting input for gRPC
|
|
148
|
+
logger.debug("Formatting input for gRPC OCR model (batched).")
|
|
131
149
|
processed: List[np.ndarray] = []
|
|
150
|
+
|
|
151
|
+
max_length = max(max(img.shape[:2]) for img in images)
|
|
152
|
+
|
|
132
153
|
for img in images:
|
|
133
|
-
|
|
154
|
+
if model_name == "paddle":
|
|
155
|
+
arr, _dims = preprocess_image_for_paddle(img)
|
|
156
|
+
else:
|
|
157
|
+
arr, _dims = preprocess_image_for_ocr(
|
|
158
|
+
img,
|
|
159
|
+
target_height=max_length,
|
|
160
|
+
target_width=max_length,
|
|
161
|
+
pad_how="bottom_right",
|
|
162
|
+
)
|
|
163
|
+
|
|
134
164
|
dims.append(_dims)
|
|
135
165
|
arr = arr.astype(np.float32)
|
|
136
166
|
arr = np.expand_dims(arr, axis=0) # => shape (1, H, W, C)
|
|
@@ -144,12 +174,18 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
144
174
|
chunk_list(dims, max_batch_size),
|
|
145
175
|
):
|
|
146
176
|
batched_input = np.concatenate(proc_chunk, axis=0)
|
|
147
|
-
|
|
177
|
+
|
|
178
|
+
if model_name == "paddle":
|
|
179
|
+
batches.append(batched_input)
|
|
180
|
+
else:
|
|
181
|
+
merge_levels = np.array([[merge_level] * len(batched_input)], dtype="object")
|
|
182
|
+
batches.append([batched_input, merge_levels])
|
|
183
|
+
|
|
148
184
|
batch_data_list.append({"image_arrays": orig_chunk, "image_dims": dims_chunk})
|
|
149
185
|
return batches, batch_data_list
|
|
150
186
|
|
|
151
187
|
elif protocol == "http":
|
|
152
|
-
logger.debug("Formatting input for HTTP
|
|
188
|
+
logger.debug("Formatting input for HTTP OCR model (batched).")
|
|
153
189
|
if "base64_images" in data:
|
|
154
190
|
base64_list = data["base64_images"]
|
|
155
191
|
else:
|
|
@@ -170,7 +206,13 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
170
206
|
chunk_list(images, max_batch_size),
|
|
171
207
|
chunk_list(dims, max_batch_size),
|
|
172
208
|
):
|
|
173
|
-
|
|
209
|
+
if model_name == "paddle":
|
|
210
|
+
payload = {"input": input_chunk}
|
|
211
|
+
else:
|
|
212
|
+
payload = {
|
|
213
|
+
"input": input_chunk,
|
|
214
|
+
"merge_levels": [merge_level] * len(input_chunk),
|
|
215
|
+
}
|
|
174
216
|
batches.append(payload)
|
|
175
217
|
batch_data_list.append({"image_arrays": orig_chunk, "image_dims": dims_chunk})
|
|
176
218
|
|
|
@@ -179,7 +221,14 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
179
221
|
else:
|
|
180
222
|
raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")
|
|
181
223
|
|
|
182
|
-
def parse_output(
|
|
224
|
+
def parse_output(
|
|
225
|
+
self,
|
|
226
|
+
response: Any,
|
|
227
|
+
protocol: str,
|
|
228
|
+
data: Optional[Dict[str, Any]] = None,
|
|
229
|
+
model_name: str = "paddle",
|
|
230
|
+
**kwargs: Any,
|
|
231
|
+
) -> Any:
|
|
183
232
|
"""
|
|
184
233
|
Parse the model's inference response for the given protocol. The parsing
|
|
185
234
|
may handle batched outputs for multiple images.
|
|
@@ -187,7 +236,7 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
187
236
|
Parameters
|
|
188
237
|
----------
|
|
189
238
|
response : Any
|
|
190
|
-
The raw response from the
|
|
239
|
+
The raw response from the OCR model.
|
|
191
240
|
protocol : str
|
|
192
241
|
The protocol used for inference, "grpc" or "http".
|
|
193
242
|
data : dict of str -> Any, optional
|
|
@@ -209,24 +258,24 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
209
258
|
dims: Optional[List[Tuple[int, int]]] = data.get("image_dims") if data else None
|
|
210
259
|
|
|
211
260
|
if protocol == "grpc":
|
|
212
|
-
logger.debug("Parsing output from gRPC
|
|
213
|
-
return self.
|
|
261
|
+
logger.debug("Parsing output from gRPC OCR model (batched).")
|
|
262
|
+
return self._extract_content_from_ocr_grpc_response(response, dims, model_name=model_name)
|
|
214
263
|
|
|
215
264
|
elif protocol == "http":
|
|
216
|
-
logger.debug("Parsing output from HTTP
|
|
217
|
-
return self.
|
|
265
|
+
logger.debug("Parsing output from HTTP OCR model (batched).")
|
|
266
|
+
return self._extract_content_from_ocr_http_response(response, dims)
|
|
218
267
|
|
|
219
268
|
else:
|
|
220
269
|
raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")
|
|
221
270
|
|
|
222
271
|
def process_inference_results(self, output: Any, **kwargs: Any) -> Any:
|
|
223
272
|
"""
|
|
224
|
-
Process inference results for the
|
|
273
|
+
Process inference results for the OCR model.
|
|
225
274
|
|
|
226
275
|
Parameters
|
|
227
276
|
----------
|
|
228
277
|
output : Any
|
|
229
|
-
The raw output parsed from the
|
|
278
|
+
The raw output parsed from the OCR model.
|
|
230
279
|
**kwargs : Any
|
|
231
280
|
Additional keyword arguments for customization.
|
|
232
281
|
|
|
@@ -238,7 +287,7 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
238
287
|
"""
|
|
239
288
|
return output
|
|
240
289
|
|
|
241
|
-
def
|
|
290
|
+
def _prepare_ocr_payload(self, base64_img: str) -> Dict[str, Any]:
|
|
242
291
|
"""
|
|
243
292
|
DEPRECATED by batch logic in format_input. Kept here if you need single-image direct calls.
|
|
244
293
|
|
|
@@ -250,7 +299,7 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
250
299
|
Returns
|
|
251
300
|
-------
|
|
252
301
|
dict of str -> Any
|
|
253
|
-
The payload in either legacy or new format for
|
|
302
|
+
The payload in either legacy or new format for OCR's HTTP endpoint.
|
|
254
303
|
"""
|
|
255
304
|
image_url = f"data:image/png;base64,{base64_img}"
|
|
256
305
|
|
|
@@ -259,18 +308,18 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
259
308
|
|
|
260
309
|
return payload
|
|
261
310
|
|
|
262
|
-
def
|
|
311
|
+
def _extract_content_from_ocr_http_response(
|
|
263
312
|
self,
|
|
264
313
|
json_response: Dict[str, Any],
|
|
265
314
|
dimensions: List[Dict[str, Any]],
|
|
266
315
|
) -> List[Tuple[str, str]]:
|
|
267
316
|
"""
|
|
268
|
-
Extract content from the JSON response of a
|
|
317
|
+
Extract content from the JSON response of a OCR HTTP API request.
|
|
269
318
|
|
|
270
319
|
Parameters
|
|
271
320
|
----------
|
|
272
321
|
json_response : dict of str -> Any
|
|
273
|
-
The JSON response returned by the
|
|
322
|
+
The JSON response returned by the OCR endpoint.
|
|
274
323
|
table_content_format : str or None
|
|
275
324
|
The specified format for table content (e.g., 'simple' or 'pseudo_markdown').
|
|
276
325
|
dimensions : list of dict, optional
|
|
@@ -296,25 +345,29 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
296
345
|
text_detections = item.get("text_detections", [])
|
|
297
346
|
text_predictions = []
|
|
298
347
|
bounding_boxes = []
|
|
348
|
+
conf_scores = []
|
|
299
349
|
for td in text_detections:
|
|
300
350
|
text_predictions.append(td["text_prediction"]["text"])
|
|
301
351
|
bounding_boxes.append([[pt["x"], pt["y"]] for pt in td["bounding_box"]["points"]])
|
|
352
|
+
conf_scores.append(td["text_prediction"]["confidence"])
|
|
302
353
|
|
|
303
|
-
bounding_boxes, text_predictions = self.
|
|
354
|
+
bounding_boxes, text_predictions, conf_scores = self._postprocess_ocr_response(
|
|
304
355
|
bounding_boxes,
|
|
305
356
|
text_predictions,
|
|
357
|
+
conf_scores,
|
|
306
358
|
dimensions,
|
|
307
359
|
img_index=item_idx,
|
|
308
360
|
)
|
|
309
361
|
|
|
310
|
-
results.append([bounding_boxes, text_predictions])
|
|
362
|
+
results.append([bounding_boxes, text_predictions, conf_scores])
|
|
311
363
|
|
|
312
364
|
return results
|
|
313
365
|
|
|
314
|
-
def
|
|
366
|
+
def _extract_content_from_ocr_grpc_response(
|
|
315
367
|
self,
|
|
316
368
|
response: np.ndarray,
|
|
317
369
|
dimensions: List[Dict[str, Any]],
|
|
370
|
+
model_name: str = "paddle",
|
|
318
371
|
) -> List[Tuple[str, str]]:
|
|
319
372
|
"""
|
|
320
373
|
Parse a gRPC response for one or more images. The response can have two possible shapes:
|
|
@@ -367,33 +420,41 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
367
420
|
texts_bytestr: bytes = response[1, i]
|
|
368
421
|
text_predictions = json.loads(texts_bytestr.decode("utf8"))
|
|
369
422
|
|
|
370
|
-
# 3)
|
|
371
|
-
|
|
372
|
-
|
|
423
|
+
# 3) Parse confidence scores
|
|
424
|
+
confs_bytestr: bytes = response[2, i]
|
|
425
|
+
conf_scores = json.loads(confs_bytestr.decode("utf8"))
|
|
373
426
|
|
|
374
427
|
# Some gRPC responses nest single-item lists; flatten them if needed
|
|
375
428
|
if isinstance(bounding_boxes, list) and len(bounding_boxes) == 1:
|
|
376
429
|
bounding_boxes = bounding_boxes[0]
|
|
377
430
|
if isinstance(text_predictions, list) and len(text_predictions) == 1:
|
|
378
431
|
text_predictions = text_predictions[0]
|
|
432
|
+
if isinstance(conf_scores, list) and len(conf_scores) == 1:
|
|
433
|
+
conf_scores = conf_scores[0]
|
|
379
434
|
|
|
380
|
-
|
|
435
|
+
# 4) Postprocess
|
|
436
|
+
bounding_boxes, text_predictions, conf_scores = self._postprocess_ocr_response(
|
|
381
437
|
bounding_boxes,
|
|
382
438
|
text_predictions,
|
|
439
|
+
conf_scores,
|
|
383
440
|
dimensions,
|
|
384
441
|
img_index=i,
|
|
442
|
+
scale_coordinates=True if model_name == "paddle" else False,
|
|
385
443
|
)
|
|
386
444
|
|
|
387
|
-
results.append([bounding_boxes, text_predictions])
|
|
445
|
+
results.append([bounding_boxes, text_predictions, conf_scores])
|
|
388
446
|
|
|
389
447
|
return results
|
|
390
448
|
|
|
391
449
|
@staticmethod
|
|
392
|
-
def
|
|
450
|
+
def _postprocess_ocr_response(
|
|
393
451
|
bounding_boxes: List[Any],
|
|
394
452
|
text_predictions: List[str],
|
|
453
|
+
conf_scores: List[float],
|
|
395
454
|
dims: Optional[List[Dict[str, Any]]] = None,
|
|
396
455
|
img_index: int = 0,
|
|
456
|
+
scale_coordinates: bool = True,
|
|
457
|
+
shift_coordinates: bool = True,
|
|
397
458
|
) -> Tuple[List[Any], List[str]]:
|
|
398
459
|
"""
|
|
399
460
|
Convert bounding boxes with normalized coordinates to pixel cooridnates by using
|
|
@@ -434,17 +495,18 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
434
495
|
logger.warning("Image index out of range for stored dimensions. Using first image dims by default.")
|
|
435
496
|
img_index = 0
|
|
436
497
|
|
|
437
|
-
max_width = dims[img_index]["new_width"]
|
|
438
|
-
max_height = dims[img_index]["new_height"]
|
|
439
|
-
pad_width = dims[img_index].get("pad_width", 0)
|
|
440
|
-
pad_height = dims[img_index].get("pad_height", 0)
|
|
441
|
-
scale_factor = dims[img_index].get("scale_factor", 1.0)
|
|
498
|
+
max_width = dims[img_index]["new_width"] if scale_coordinates else 1.0
|
|
499
|
+
max_height = dims[img_index]["new_height"] if scale_coordinates else 1.0
|
|
500
|
+
pad_width = dims[img_index].get("pad_width", 0) if shift_coordinates else 0.0
|
|
501
|
+
pad_height = dims[img_index].get("pad_height", 0) if shift_coordinates else 0.0
|
|
502
|
+
scale_factor = dims[img_index].get("scale_factor", 1.0) if scale_coordinates else 1.0
|
|
442
503
|
|
|
443
504
|
bboxes: List[List[float]] = []
|
|
444
505
|
texts: List[str] = []
|
|
506
|
+
confs: List[float] = []
|
|
445
507
|
|
|
446
508
|
# Convert normalized coords back to actual pixel coords
|
|
447
|
-
for box, txt in zip(bounding_boxes, text_predictions):
|
|
509
|
+
for box, txt, conf in zip(bounding_boxes, text_predictions, conf_scores):
|
|
448
510
|
if box == "nan":
|
|
449
511
|
continue
|
|
450
512
|
points: List[List[float]] = []
|
|
@@ -458,5 +520,36 @@ class PaddleOCRModelInterface(ModelInterface):
|
|
|
458
520
|
points.append([x_original, y_original])
|
|
459
521
|
bboxes.append(points)
|
|
460
522
|
texts.append(txt)
|
|
523
|
+
confs.append(conf)
|
|
524
|
+
|
|
525
|
+
return bboxes, texts, confs
|
|
461
526
|
|
|
462
|
-
|
|
527
|
+
|
|
528
|
+
@multiprocessing_cache(max_calls=100)  # Cache results first to avoid redundant retries from backoff
@backoff.on_predicate(backoff.expo, max_time=30)
def get_ocr_model_name(ocr_grpc_endpoint=None, default_model_name=DEFAULT_OCR_MODEL_NAME):
    """
    Determine the OCR model name.

    Resolution order:

    1. The ``OCR_MODEL_NAME`` environment variable, when set to a non-blank value.
    2. ``default_model_name``, when no gRPC endpoint is provided.
    3. The name of the first model listed in the repository index of the Triton
       server at ``ocr_grpc_endpoint``.
    4. ``default_model_name``, when the endpoint query fails or lists no models.

    Parameters
    ----------
    ocr_grpc_endpoint : str, optional
        Address of the OCR gRPC (Triton) endpoint to query. When falsy, no query is made.
    default_model_name : str, optional
        Name returned when the model name cannot be discovered.

    Returns
    -------
    str
        The resolved OCR model name.
    """
    # 1. Check for an explicit override from the environment variable first.
    #    A blank value is treated as "not set": returning "" would hand callers an
    #    unusable model name, and a falsey return value is also what
    #    backoff.on_predicate retries on by default.
    ocr_model_name = os.getenv("OCR_MODEL_NAME")
    if ocr_model_name is not None and ocr_model_name.strip():
        return ocr_model_name

    # 2. If no gRPC endpoint is provided, fall back to the default immediately.
    if not ocr_grpc_endpoint:
        logger.debug(f"No OCR gRPC endpoint provided. Falling back to default model name '{default_model_name}'.")
        return default_model_name

    # 3. Attempt to query the gRPC endpoint to discover the model name.
    # NOTE(review): the fallback returned below is normally truthy, so the
    # on_predicate decorator is not expected to retry this query -- confirm whether
    # retrying the endpoint for up to 30 seconds was the intent.
    client = None
    try:
        client = grpcclient.InferenceServerClient(ocr_grpc_endpoint)
        model_index = client.get_model_repository_index(as_json=True)
        model_names = [x["name"] for x in model_index.get("models", [])]
        # Raises IndexError when the repository lists no models; handled below.
        ocr_model_name = model_names[0]
    except Exception as exc:
        # Best-effort discovery: any failure degrades to the default model name,
        # but the cause is reported so misconfiguration is diagnosable.
        logger.warning(
            f"Failed to get ocr model name from '{ocr_grpc_endpoint}' ({exc}). "
            f"Falling back to '{default_model_name}'."
        )
        ocr_model_name = default_model_name
    finally:
        # Release the gRPC channel instead of leaving it to garbage collection.
        if client is not None:
            try:
                client.close()
            except Exception as close_exc:
                logger.debug(f"Ignoring error while closing OCR gRPC client: {close_exc}")

    return ocr_model_name
|