nv-ingest-api 2025.7.16.dev20250716__py3-none-any.whl → 2025.7.18.dev20250718__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (24)
  1. nv_ingest_api/interface/extract.py +18 -18
  2. nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
  3. nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
  4. nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
  5. nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
  6. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +10 -7
  7. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +16 -29
  8. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
  9. nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
  10. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +37 -224
  11. nv_ingest_api/internal/primitives/nim/nim_client.py +55 -14
  12. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
  13. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
  14. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
  15. nv_ingest_api/internal/transform/split_text.py +13 -8
  16. nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
  17. nv_ingest_api/util/image_processing/transforms.py +16 -5
  18. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
  19. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
  20. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/METADATA +1 -1
  21. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/RECORD +24 -24
  22. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/WHEEL +0 -0
  23. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/licenses/LICENSE +0 -0
  24. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/top_level.txt +0 -0
@@ -4,22 +4,37 @@
4
4
 
5
5
  import json
6
6
  import logging
7
- from typing import Any, List, Tuple
7
+ import os
8
+ from typing import Any
8
9
  from typing import Dict
10
+ from typing import List
9
11
  from typing import Optional
12
+ from typing import Tuple
10
13
 
14
+ import backoff
11
15
  import numpy as np
16
+ import tritonclient.grpc as grpcclient
12
17
 
13
18
  from nv_ingest_api.internal.primitives.nim import ModelInterface
14
- from nv_ingest_api.internal.primitives.nim.model_interface.helpers import preprocess_image_for_paddle
19
+ from nv_ingest_api.internal.primitives.nim.model_interface.decorators import (
20
+ multiprocessing_cache,
21
+ )
22
+ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import (
23
+ preprocess_image_for_ocr,
24
+ )
25
+ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import (
26
+ preprocess_image_for_paddle,
27
+ )
15
28
  from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
16
29
 
30
+ DEFAULT_OCR_MODEL_NAME = "paddle"
31
+
17
32
  logger = logging.getLogger(__name__)
18
33
 
19
34
 
20
- class PaddleOCRModelInterface(ModelInterface):
35
+ class OCRModelInterface(ModelInterface):
21
36
  """
22
- An interface for handling inference with a PaddleOCR model, supporting both gRPC and HTTP protocols.
37
+ An interface for handling inference with a OCR model, supporting both gRPC and HTTP protocols.
23
38
  """
24
39
 
25
40
  def name(self) -> str:
@@ -31,7 +46,7 @@ class PaddleOCRModelInterface(ModelInterface):
31
46
  str
32
47
  The name of the model interface.
33
48
  """
34
- return "PaddleOCR"
49
+ return "OCR"
35
50
 
36
51
  def prepare_data_for_inference(self, data: Dict[str, Any]) -> Dict[str, Any]:
37
52
  """
@@ -126,11 +141,26 @@ class PaddleOCRModelInterface(ModelInterface):
126
141
  images = data["image_arrays"]
127
142
  dims = data["image_dims"]
128
143
 
144
+ model_name = kwargs.get("model_name", "paddle")
145
+ merge_level = kwargs.get("merge_level", "paragraph")
146
+
129
147
  if protocol == "grpc":
130
- logger.debug("Formatting input for gRPC PaddleOCR model (batched).")
148
+ logger.debug("Formatting input for gRPC OCR model (batched).")
131
149
  processed: List[np.ndarray] = []
150
+
151
+ max_length = max(max(img.shape[:2]) for img in images)
152
+
132
153
  for img in images:
133
- arr, _dims = preprocess_image_for_paddle(img)
154
+ if model_name == "paddle":
155
+ arr, _dims = preprocess_image_for_paddle(img)
156
+ else:
157
+ arr, _dims = preprocess_image_for_ocr(
158
+ img,
159
+ target_height=max_length,
160
+ target_width=max_length,
161
+ pad_how="bottom_right",
162
+ )
163
+
134
164
  dims.append(_dims)
135
165
  arr = arr.astype(np.float32)
136
166
  arr = np.expand_dims(arr, axis=0) # => shape (1, H, W, C)
@@ -144,12 +174,18 @@ class PaddleOCRModelInterface(ModelInterface):
144
174
  chunk_list(dims, max_batch_size),
145
175
  ):
146
176
  batched_input = np.concatenate(proc_chunk, axis=0)
147
- batches.append(batched_input)
177
+
178
+ if model_name == "paddle":
179
+ batches.append(batched_input)
180
+ else:
181
+ merge_levels = np.array([[merge_level] * len(batched_input)], dtype="object")
182
+ batches.append([batched_input, merge_levels])
183
+
148
184
  batch_data_list.append({"image_arrays": orig_chunk, "image_dims": dims_chunk})
149
185
  return batches, batch_data_list
150
186
 
151
187
  elif protocol == "http":
152
- logger.debug("Formatting input for HTTP PaddleOCR model (batched).")
188
+ logger.debug("Formatting input for HTTP OCR model (batched).")
153
189
  if "base64_images" in data:
154
190
  base64_list = data["base64_images"]
155
191
  else:
@@ -170,7 +206,13 @@ class PaddleOCRModelInterface(ModelInterface):
170
206
  chunk_list(images, max_batch_size),
171
207
  chunk_list(dims, max_batch_size),
172
208
  ):
173
- payload = {"input": input_chunk}
209
+ if model_name == "paddle":
210
+ payload = {"input": input_chunk}
211
+ else:
212
+ payload = {
213
+ "input": input_chunk,
214
+ "merge_levels": [merge_level] * len(input_chunk),
215
+ }
174
216
  batches.append(payload)
175
217
  batch_data_list.append({"image_arrays": orig_chunk, "image_dims": dims_chunk})
176
218
 
@@ -179,7 +221,14 @@ class PaddleOCRModelInterface(ModelInterface):
179
221
  else:
180
222
  raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")
181
223
 
182
- def parse_output(self, response: Any, protocol: str, data: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Any:
224
+ def parse_output(
225
+ self,
226
+ response: Any,
227
+ protocol: str,
228
+ data: Optional[Dict[str, Any]] = None,
229
+ model_name: str = "paddle",
230
+ **kwargs: Any,
231
+ ) -> Any:
183
232
  """
184
233
  Parse the model's inference response for the given protocol. The parsing
185
234
  may handle batched outputs for multiple images.
@@ -187,7 +236,7 @@ class PaddleOCRModelInterface(ModelInterface):
187
236
  Parameters
188
237
  ----------
189
238
  response : Any
190
- The raw response from the PaddleOCR model.
239
+ The raw response from the OCR model.
191
240
  protocol : str
192
241
  The protocol used for inference, "grpc" or "http".
193
242
  data : dict of str -> Any, optional
@@ -209,24 +258,24 @@ class PaddleOCRModelInterface(ModelInterface):
209
258
  dims: Optional[List[Tuple[int, int]]] = data.get("image_dims") if data else None
210
259
 
211
260
  if protocol == "grpc":
212
- logger.debug("Parsing output from gRPC PaddleOCR model (batched).")
213
- return self._extract_content_from_paddle_grpc_response(response, dims)
261
+ logger.debug("Parsing output from gRPC OCR model (batched).")
262
+ return self._extract_content_from_ocr_grpc_response(response, dims, model_name=model_name)
214
263
 
215
264
  elif protocol == "http":
216
- logger.debug("Parsing output from HTTP PaddleOCR model (batched).")
217
- return self._extract_content_from_paddle_http_response(response, dims)
265
+ logger.debug("Parsing output from HTTP OCR model (batched).")
266
+ return self._extract_content_from_ocr_http_response(response, dims)
218
267
 
219
268
  else:
220
269
  raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")
221
270
 
222
271
  def process_inference_results(self, output: Any, **kwargs: Any) -> Any:
223
272
  """
224
- Process inference results for the PaddleOCR model.
273
+ Process inference results for the OCR model.
225
274
 
226
275
  Parameters
227
276
  ----------
228
277
  output : Any
229
- The raw output parsed from the PaddleOCR model.
278
+ The raw output parsed from the OCR model.
230
279
  **kwargs : Any
231
280
  Additional keyword arguments for customization.
232
281
 
@@ -238,7 +287,7 @@ class PaddleOCRModelInterface(ModelInterface):
238
287
  """
239
288
  return output
240
289
 
241
- def _prepare_paddle_payload(self, base64_img: str) -> Dict[str, Any]:
290
+ def _prepare_ocr_payload(self, base64_img: str) -> Dict[str, Any]:
242
291
  """
243
292
  DEPRECATED by batch logic in format_input. Kept here if you need single-image direct calls.
244
293
 
@@ -250,7 +299,7 @@ class PaddleOCRModelInterface(ModelInterface):
250
299
  Returns
251
300
  -------
252
301
  dict of str -> Any
253
- The payload in either legacy or new format for PaddleOCR's HTTP endpoint.
302
+ The payload in either legacy or new format for OCR's HTTP endpoint.
254
303
  """
255
304
  image_url = f"data:image/png;base64,{base64_img}"
256
305
 
@@ -259,18 +308,18 @@ class PaddleOCRModelInterface(ModelInterface):
259
308
 
260
309
  return payload
261
310
 
262
- def _extract_content_from_paddle_http_response(
311
+ def _extract_content_from_ocr_http_response(
263
312
  self,
264
313
  json_response: Dict[str, Any],
265
314
  dimensions: List[Dict[str, Any]],
266
315
  ) -> List[Tuple[str, str]]:
267
316
  """
268
- Extract content from the JSON response of a PaddleOCR HTTP API request.
317
+ Extract content from the JSON response of a OCR HTTP API request.
269
318
 
270
319
  Parameters
271
320
  ----------
272
321
  json_response : dict of str -> Any
273
- The JSON response returned by the PaddleOCR endpoint.
322
+ The JSON response returned by the OCR endpoint.
274
323
  table_content_format : str or None
275
324
  The specified format for table content (e.g., 'simple' or 'pseudo_markdown').
276
325
  dimensions : list of dict, optional
@@ -296,25 +345,29 @@ class PaddleOCRModelInterface(ModelInterface):
296
345
  text_detections = item.get("text_detections", [])
297
346
  text_predictions = []
298
347
  bounding_boxes = []
348
+ conf_scores = []
299
349
  for td in text_detections:
300
350
  text_predictions.append(td["text_prediction"]["text"])
301
351
  bounding_boxes.append([[pt["x"], pt["y"]] for pt in td["bounding_box"]["points"]])
352
+ conf_scores.append(td["text_prediction"]["confidence"])
302
353
 
303
- bounding_boxes, text_predictions = self._postprocess_paddle_response(
354
+ bounding_boxes, text_predictions, conf_scores = self._postprocess_ocr_response(
304
355
  bounding_boxes,
305
356
  text_predictions,
357
+ conf_scores,
306
358
  dimensions,
307
359
  img_index=item_idx,
308
360
  )
309
361
 
310
- results.append([bounding_boxes, text_predictions])
362
+ results.append([bounding_boxes, text_predictions, conf_scores])
311
363
 
312
364
  return results
313
365
 
314
- def _extract_content_from_paddle_grpc_response(
366
+ def _extract_content_from_ocr_grpc_response(
315
367
  self,
316
368
  response: np.ndarray,
317
369
  dimensions: List[Dict[str, Any]],
370
+ model_name: str = "paddle",
318
371
  ) -> List[Tuple[str, str]]:
319
372
  """
320
373
  Parse a gRPC response for one or more images. The response can have two possible shapes:
@@ -367,33 +420,41 @@ class PaddleOCRModelInterface(ModelInterface):
367
420
  texts_bytestr: bytes = response[1, i]
368
421
  text_predictions = json.loads(texts_bytestr.decode("utf8"))
369
422
 
370
- # 3) Log the third element (extra data/metadata) if needed
371
- extra_data_bytestr: bytes = response[2, i]
372
- logger.debug(f"Ignoring extra_data for image {i}: {extra_data_bytestr}")
423
+ # 3) Parse confidence scores
424
+ confs_bytestr: bytes = response[2, i]
425
+ conf_scores = json.loads(confs_bytestr.decode("utf8"))
373
426
 
374
427
  # Some gRPC responses nest single-item lists; flatten them if needed
375
428
  if isinstance(bounding_boxes, list) and len(bounding_boxes) == 1:
376
429
  bounding_boxes = bounding_boxes[0]
377
430
  if isinstance(text_predictions, list) and len(text_predictions) == 1:
378
431
  text_predictions = text_predictions[0]
432
+ if isinstance(conf_scores, list) and len(conf_scores) == 1:
433
+ conf_scores = conf_scores[0]
379
434
 
380
- bounding_boxes, text_predictions = self._postprocess_paddle_response(
435
+ # 4) Postprocess
436
+ bounding_boxes, text_predictions, conf_scores = self._postprocess_ocr_response(
381
437
  bounding_boxes,
382
438
  text_predictions,
439
+ conf_scores,
383
440
  dimensions,
384
441
  img_index=i,
442
+ scale_coordinates=True if model_name == "paddle" else False,
385
443
  )
386
444
 
387
- results.append([bounding_boxes, text_predictions])
445
+ results.append([bounding_boxes, text_predictions, conf_scores])
388
446
 
389
447
  return results
390
448
 
391
449
  @staticmethod
392
- def _postprocess_paddle_response(
450
+ def _postprocess_ocr_response(
393
451
  bounding_boxes: List[Any],
394
452
  text_predictions: List[str],
453
+ conf_scores: List[float],
395
454
  dims: Optional[List[Dict[str, Any]]] = None,
396
455
  img_index: int = 0,
456
+ scale_coordinates: bool = True,
457
+ shift_coordinates: bool = True,
397
458
  ) -> Tuple[List[Any], List[str]]:
398
459
  """
399
460
  Convert bounding boxes with normalized coordinates to pixel cooridnates by using
@@ -434,17 +495,18 @@ class PaddleOCRModelInterface(ModelInterface):
434
495
  logger.warning("Image index out of range for stored dimensions. Using first image dims by default.")
435
496
  img_index = 0
436
497
 
437
- max_width = dims[img_index]["new_width"]
438
- max_height = dims[img_index]["new_height"]
439
- pad_width = dims[img_index].get("pad_width", 0)
440
- pad_height = dims[img_index].get("pad_height", 0)
441
- scale_factor = dims[img_index].get("scale_factor", 1.0)
498
+ max_width = dims[img_index]["new_width"] if scale_coordinates else 1.0
499
+ max_height = dims[img_index]["new_height"] if scale_coordinates else 1.0
500
+ pad_width = dims[img_index].get("pad_width", 0) if shift_coordinates else 0.0
501
+ pad_height = dims[img_index].get("pad_height", 0) if shift_coordinates else 0.0
502
+ scale_factor = dims[img_index].get("scale_factor", 1.0) if scale_coordinates else 1.0
442
503
 
443
504
  bboxes: List[List[float]] = []
444
505
  texts: List[str] = []
506
+ confs: List[float] = []
445
507
 
446
508
  # Convert normalized coords back to actual pixel coords
447
- for box, txt in zip(bounding_boxes, text_predictions):
509
+ for box, txt, conf in zip(bounding_boxes, text_predictions, conf_scores):
448
510
  if box == "nan":
449
511
  continue
450
512
  points: List[List[float]] = []
@@ -458,5 +520,36 @@ class PaddleOCRModelInterface(ModelInterface):
458
520
  points.append([x_original, y_original])
459
521
  bboxes.append(points)
460
522
  texts.append(txt)
523
+ confs.append(conf)
524
+
525
+ return bboxes, texts, confs
461
526
 
462
- return bboxes, texts
527
+
528
+ @multiprocessing_cache(max_calls=100) # Cache results first to avoid redundant retries from backoff
529
+ @backoff.on_predicate(backoff.expo, max_time=30)
530
+ def get_ocr_model_name(ocr_grpc_endpoint=None, default_model_name=DEFAULT_OCR_MODEL_NAME):
531
+ """
532
+ Determines the OCR model name by checking the environment, querying the gRPC endpoint,
533
+ or falling back to a default.
534
+ """
535
+ # 1. Check for an explicit override from the environment variable first.
536
+ ocr_model_name = os.getenv("OCR_MODEL_NAME", None)
537
+ if ocr_model_name is not None:
538
+ return ocr_model_name
539
+
540
+ # 2. If no gRPC endpoint is provided, fall back to the default immediately.
541
+ if not ocr_grpc_endpoint:
542
+ logger.debug(f"No OCR gRPC endpoint provided. Falling back to default model name '{default_model_name}'.")
543
+ return default_model_name
544
+
545
+ # 3. Attempt to query the gRPC endpoint to discover the model name.
546
+ try:
547
+ client = grpcclient.InferenceServerClient(ocr_grpc_endpoint)
548
+ model_index = client.get_model_repository_index(as_json=True)
549
+ model_names = [x["name"] for x in model_index.get("models", [])]
550
+ ocr_model_name = model_names[0]
551
+ except Exception:
552
+ logger.warning(f"Failed to get ocr model name after 30 seconds. Falling back to '{default_model_name}'.")
553
+ ocr_model_name = default_model_name
554
+
555
+ return ocr_model_name