nv-ingest-api 2025.8.24.dev20250824__py3-none-any.whl → 2025.8.25.dev20250825__py3-none-any.whl

This diff shows the differences between the contents of two publicly available package versions, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions exactly as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

@@ -96,13 +96,23 @@ def _run_chart_inference(
96
96
  future_ocr_kwargs.update(
97
97
  model_name="paddle",
98
98
  )
99
- else:
99
+ elif ocr_model_name == "scene_text":
100
100
  future_ocr_kwargs.update(
101
- model_name="scene_text",
101
+ model_name=ocr_model_name,
102
102
  input_names=["input", "merge_levels"],
103
103
  dtypes=["FP32", "BYTES"],
104
104
  merge_level="paragraph",
105
105
  )
106
+ elif ocr_model_name == "scene_text_ensemble":
107
+ future_ocr_kwargs.update(
108
+ model_name=ocr_model_name,
109
+ input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
110
+ output_names=["OUTPUT"],
111
+ dtypes=["BYTES", "BYTES"],
112
+ merge_level="paragraph",
113
+ )
114
+ else:
115
+ raise ValueError(f"Unknown OCR model name: {ocr_model_name}")
106
116
 
107
117
  with ThreadPoolExecutor(max_workers=2) as executor:
108
118
  future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)
@@ -108,13 +108,24 @@ def _update_infographic_metadata(
108
108
  infer_kwargs.update(
109
109
  model_name="paddle",
110
110
  )
111
- else:
111
+ elif ocr_model_name == "scene_text":
112
112
  infer_kwargs.update(
113
- model_name="scene_text",
113
+ model_name=ocr_model_name,
114
114
  input_names=["input", "merge_levels"],
115
115
  dtypes=["FP32", "BYTES"],
116
116
  merge_level="paragraph",
117
117
  )
118
+ elif ocr_model_name == "scene_text_ensemble":
119
+ infer_kwargs.update(
120
+ model_name=ocr_model_name,
121
+ input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
122
+ output_names=["OUTPUT"],
123
+ dtypes=["BYTES", "BYTES"],
124
+ merge_level="paragraph",
125
+ )
126
+ else:
127
+ raise ValueError(f"Unknown OCR model name: {ocr_model_name}")
128
+
118
129
  try:
119
130
  ocr_results = ocr_client.infer(data_ocr, **infer_kwargs)
120
131
  except Exception as e:
@@ -96,13 +96,23 @@ def _run_inference(
96
96
  future_ocr_kwargs.update(
97
97
  model_name="paddle",
98
98
  )
99
- else:
99
+ elif ocr_model_name == "scene_text":
100
100
  future_ocr_kwargs.update(
101
- model_name="scene_text",
101
+ model_name=ocr_model_name,
102
102
  input_names=["input", "merge_levels"],
103
103
  dtypes=["FP32", "BYTES"],
104
104
  merge_level="word",
105
105
  )
106
+ elif ocr_model_name == "scene_text_ensemble":
107
+ future_ocr_kwargs.update(
108
+ model_name=ocr_model_name,
109
+ input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
110
+ output_names=["OUTPUT"],
111
+ dtypes=["BYTES", "BYTES"],
112
+ merge_level="word",
113
+ )
114
+ else:
115
+ raise ValueError(f"Unknown OCR model name: {ocr_model_name}")
106
116
 
107
117
  with ThreadPoolExecutor(max_workers=2) as executor:
108
118
  future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)
@@ -26,8 +26,11 @@ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import (
26
26
  preprocess_image_for_paddle,
27
27
  )
28
28
  from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
29
+ from nv_ingest_api.util.image_processing.transforms import numpy_to_base64
29
30
 
30
31
  DEFAULT_OCR_MODEL_NAME = "paddle"
32
+ NEMORETRIEVER_OCR_EA_MODEL_NAME = "scene_text"
33
+ NEMORETRIEVER_OCR_MODEL_NAME = "scene_text_ensemble"
31
34
 
32
35
  logger = logging.getLogger(__name__)
33
36
 
@@ -141,7 +144,7 @@ class OCRModelInterface(ModelInterface):
141
144
  images = data["image_arrays"]
142
145
  dims = data["image_dims"]
143
146
 
144
- model_name = kwargs.get("model_name", "paddle")
147
+ model_name = kwargs.get("model_name", DEFAULT_OCR_MODEL_NAME)
145
148
  merge_level = kwargs.get("merge_level", "paragraph")
146
149
 
147
150
  if protocol == "grpc":
@@ -149,21 +152,33 @@ class OCRModelInterface(ModelInterface):
149
152
  processed: List[np.ndarray] = []
150
153
 
151
154
  max_length = max(max(img.shape[:2]) for img in images)
155
+ max_length = min(max_length, 65500) # Maximum supported image dimension for JPEG is 65500 pixels.
152
156
 
153
157
  for img in images:
154
- if model_name == "paddle":
158
+ if model_name == DEFAULT_OCR_MODEL_NAME:
155
159
  arr, _dims = preprocess_image_for_paddle(img)
156
- else:
160
+ elif model_name == NEMORETRIEVER_OCR_EA_MODEL_NAME:
157
161
  arr, _dims = preprocess_image_for_ocr(
158
162
  img,
159
163
  target_height=max_length,
160
164
  target_width=max_length,
161
165
  pad_how="bottom_right",
162
166
  )
167
+ elif model_name == NEMORETRIEVER_OCR_MODEL_NAME:
168
+ arr = img
169
+ _dims = {"new_width": img.shape[1], "new_height": img.shape[0]}
170
+ else:
171
+ raise ValueError(f"Unknown model name: {model_name}")
163
172
 
164
173
  dims.append(_dims)
165
- arr = arr.astype(np.float32)
166
- arr = np.expand_dims(arr, axis=0) # => shape (1, H, W, C)
174
+
175
+ if model_name == NEMORETRIEVER_OCR_MODEL_NAME:
176
+ arr = np.array([numpy_to_base64(arr, format="JPEG")], dtype=np.object_)
177
+ else:
178
+ arr = arr.astype(np.float32)
179
+
180
+ arr = np.expand_dims(arr, axis=0)
181
+
167
182
  processed.append(arr)
168
183
 
169
184
  batches = []
@@ -175,7 +190,7 @@ class OCRModelInterface(ModelInterface):
175
190
  ):
176
191
  batched_input = np.concatenate(proc_chunk, axis=0)
177
192
 
178
- if model_name == "paddle":
193
+ if model_name == DEFAULT_OCR_MODEL_NAME:
179
194
  batches.append(batched_input)
180
195
  else:
181
196
  merge_levels = np.array([[merge_level] * len(batched_input)], dtype="object")
@@ -206,7 +221,7 @@ class OCRModelInterface(ModelInterface):
206
221
  chunk_list(images, max_batch_size),
207
222
  chunk_list(dims, max_batch_size),
208
223
  ):
209
- if model_name == "paddle":
224
+ if model_name == DEFAULT_OCR_MODEL_NAME:
210
225
  payload = {"input": input_chunk}
211
226
  else:
212
227
  payload = {
@@ -226,7 +241,7 @@ class OCRModelInterface(ModelInterface):
226
241
  response: Any,
227
242
  protocol: str,
228
243
  data: Optional[Dict[str, Any]] = None,
229
- model_name: str = "paddle",
244
+ model_name: str = DEFAULT_OCR_MODEL_NAME,
230
245
  **kwargs: Any,
231
246
  ) -> Any:
232
247
  """
@@ -367,7 +382,7 @@ class OCRModelInterface(ModelInterface):
367
382
  self,
368
383
  response: np.ndarray,
369
384
  dimensions: List[Dict[str, Any]],
370
- model_name: str = "paddle",
385
+ model_name: str = DEFAULT_OCR_MODEL_NAME,
371
386
  ) -> List[Tuple[str, str]]:
372
387
  """
373
388
  Parse a gRPC response for one or more images. The response can have two possible shapes:
@@ -402,12 +417,14 @@ class OCRModelInterface(ModelInterface):
402
417
  if not isinstance(response, np.ndarray):
403
418
  raise ValueError("Unexpected response format: response is not a NumPy array.")
404
419
 
420
+ if model_name == NEMORETRIEVER_OCR_MODEL_NAME:
421
+ response = response.transpose((1, 0))
422
+
405
423
  # If we have shape (3,), convert to (3, 1)
406
424
  if response.ndim == 1 and response.shape == (3,):
407
425
  response = response.reshape(3, 1)
408
426
  elif response.ndim != 2 or response.shape[0] != 3:
409
427
  raise ValueError(f"Unexpected response shape: {response.shape}. Expecting (3,) or (3, n).")
410
-
411
428
  batch_size = response.shape[1]
412
429
  results: List[Tuple[str, str]] = []
413
430
 
@@ -425,11 +442,17 @@ class OCRModelInterface(ModelInterface):
425
442
  conf_scores = json.loads(confs_bytestr.decode("utf8"))
426
443
 
427
444
  # Some gRPC responses nest single-item lists; flatten them if needed
428
- if isinstance(bounding_boxes, list) and len(bounding_boxes) == 1:
445
+ if (
446
+ (isinstance(bounding_boxes, list) and len(bounding_boxes) == 1 and isinstance(bounding_boxes[0], list))
447
+ and (
448
+ isinstance(text_predictions, list)
449
+ and len(text_predictions) == 1
450
+ and isinstance(text_predictions[0], list)
451
+ )
452
+ and (isinstance(conf_scores, list) and len(conf_scores) == 1 and isinstance(conf_scores[0], list))
453
+ ):
429
454
  bounding_boxes = bounding_boxes[0]
430
- if isinstance(text_predictions, list) and len(text_predictions) == 1:
431
455
  text_predictions = text_predictions[0]
432
- if isinstance(conf_scores, list) and len(conf_scores) == 1:
433
456
  conf_scores = conf_scores[0]
434
457
 
435
458
  # 4) Postprocess
@@ -439,7 +462,7 @@ class OCRModelInterface(ModelInterface):
439
462
  conf_scores,
440
463
  dimensions,
441
464
  img_index=i,
442
- scale_coordinates=True if model_name == "paddle" else False,
465
+ scale_coordinates=False if model_name == NEMORETRIEVER_OCR_EA_MODEL_NAME else True,
443
466
  )
444
467
 
445
468
  results.append([bounding_boxes, text_predictions, conf_scores])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.8.24.dev20250824
3
+ Version: 2025.8.25.dev20250825
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -20,10 +20,10 @@ nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha
20
20
  nv_ingest_api/internal/extract/html/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
21
21
  nv_ingest_api/internal/extract/html/html_extractor.py,sha256=I9oWfj6_As4898GDDh0zsSuKxO3lBsvyYzhvUotjzJI,3282
22
22
  nv_ingest_api/internal/extract/image/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
23
- nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=gk-O-9wjZBoaLVE_6Erb4gMwsSFk4UtPQ2QLpMCW4H4,13212
23
+ nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=HTu0mOIYMpK5JAAiyvlIHgUtmjQV44Gv8dE5hYIqeQE,13633
24
24
  nv_ingest_api/internal/extract/image/image_extractor.py,sha256=gBKjlx28hA_e-dupatu46YQgOHJ0DLpAWxREiLaZLyo,9039
25
- nv_ingest_api/internal/extract/image/infographic_extractor.py,sha256=i7zt_ow1gytU4hK2JCRg7T1wlbokaeuUpXX69LIQkzY,9687
26
- nv_ingest_api/internal/extract/image/table_extractor.py,sha256=O0m3N2Tz9W6X7TBI4o-rbBXc8dFOf9zSZq1v9qC1U4M,13780
25
+ nv_ingest_api/internal/extract/image/infographic_extractor.py,sha256=G5sRnyJ-8ToBbD0_7W6Vemq4a5SBNLtzhZKpuR26mlU,10104
26
+ nv_ingest_api/internal/extract/image/table_extractor.py,sha256=yjSehCTV43a35I_JrVNkgi7yV6RTAEvTeB3kGtM9ZTs,14196
27
27
  nv_ingest_api/internal/extract/image/image_helpers/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
28
28
  nv_ingest_api/internal/extract/image/image_helpers/common.py,sha256=80jRhGzisHvQ9Ky3MKUMM7soKUmvZ5LqRVzwNYjgdPY,14988
29
29
  nv_ingest_api/internal/extract/pdf/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -58,7 +58,7 @@ nv_ingest_api/internal/primitives/nim/model_interface/decorators.py,sha256=qwubk
58
58
  nv_ingest_api/internal/primitives/nim/model_interface/deplot.py,sha256=TvKdk6PTuI1WNhRmNNrvygaI_DIutkJkDL-XdtLZQac,10787
59
59
  nv_ingest_api/internal/primitives/nim/model_interface/helpers.py,sha256=iyGxAr4tG2UZ7LtXXoWO_kF-KsObhPrmZ46Nl0Mi-Ag,11592
60
60
  nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=WysjDZeegclO3mZgVcGOwzWbr8wSI4pWRiYD4iC2EXo,7098
61
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=Vhim3py_rc5jA0BoKubwfekEqOwxUUePzcmc59pRuOk,21458
61
+ nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=0Xfuf5_-7LoWnqzZlsJFI53ztneB7Rs-PHZQzDgR0mo,22679
62
62
  nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=5PqD2JuHY2rwd-6SSB4axr2Dd79vm95sAEkcmI3U7ME,12977
63
63
  nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py,sha256=lFhppNqrq5X_fzbCWKphvZQMzaJd3gHrkWsyJORzFrU,5010
64
64
  nv_ingest_api/internal/primitives/nim/model_interface/vlm.py,sha256=qJ382PU1ZrIM-SR3cqIhtY_W2rmHec2HIa2aUB2SvaU,6031
@@ -162,8 +162,8 @@ nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jf
162
162
  nv_ingest_api/util/string_processing/yaml.py,sha256=6SW2O6wbXRhGbhETMbtXjYCZn53HeCNOP6a96AaxlHs,1454
163
163
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
164
164
  nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
165
- nv_ingest_api-2025.8.24.dev20250824.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
166
- nv_ingest_api-2025.8.24.dev20250824.dist-info/METADATA,sha256=efa0KmmDvdvacCuWPIHCoYGvujzWjw-LIdWdnhSW3Cw,13947
167
- nv_ingest_api-2025.8.24.dev20250824.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
168
- nv_ingest_api-2025.8.24.dev20250824.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
169
- nv_ingest_api-2025.8.24.dev20250824.dist-info/RECORD,,
165
+ nv_ingest_api-2025.8.25.dev20250825.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
166
+ nv_ingest_api-2025.8.25.dev20250825.dist-info/METADATA,sha256=IBhn7pRL6SlKTG59w68Mo4Gets_IBm_rBOTyDf2aZXU,13947
167
+ nv_ingest_api-2025.8.25.dev20250825.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
168
+ nv_ingest_api-2025.8.25.dev20250825.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
169
+ nv_ingest_api-2025.8.25.dev20250825.dist-info/RECORD,,