PyPI - nv-ingest-api - Versions diffs - 2025.8.24.dev20250824__py3-none-any.whl → 2025.8.25.dev20250825__py3-none-any.whl - Mend

nv-ingest-api 2025.8.24.dev20250824__py3-none-any.whl → 2025.8.25.dev20250825__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (9) hide show

nv_ingest_api/internal/extract/image/chart_extractor.py CHANGED Viewed

@@ -96,13 +96,23 @@ def _run_chart_inference(
         future_ocr_kwargs.update(
             model_name="paddle",
         )
-    else:
+    elif ocr_model_name == "scene_text":
         future_ocr_kwargs.update(
-            model_name="scene_text",
+            model_name=ocr_model_name,
             input_names=["input", "merge_levels"],
             dtypes=["FP32", "BYTES"],
             merge_level="paragraph",
         )
+    elif ocr_model_name == "scene_text_ensemble":
+        future_ocr_kwargs.update(
+            model_name=ocr_model_name,
+            input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
+            output_names=["OUTPUT"],
+            dtypes=["BYTES", "BYTES"],
+            merge_level="paragraph",
+        )
+    else:
+        raise ValueError(f"Unknown OCR model name: {ocr_model_name}")
     with ThreadPoolExecutor(max_workers=2) as executor:
         future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)

nv_ingest_api/internal/extract/image/infographic_extractor.py CHANGED Viewed

@@ -108,13 +108,24 @@ def _update_infographic_metadata(
         infer_kwargs.update(
             model_name="paddle",
         )
-    else:
+    elif ocr_model_name == "scene_text":
         infer_kwargs.update(
-            model_name="scene_text",
+            model_name=ocr_model_name,
             input_names=["input", "merge_levels"],
             dtypes=["FP32", "BYTES"],
             merge_level="paragraph",
         )
+    elif ocr_model_name == "scene_text_ensemble":
+        infer_kwargs.update(
+            model_name=ocr_model_name,
+            input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
+            output_names=["OUTPUT"],
+            dtypes=["BYTES", "BYTES"],
+            merge_level="paragraph",
+        )
+    else:
+        raise ValueError(f"Unknown OCR model name: {ocr_model_name}")
     try:
         ocr_results = ocr_client.infer(data_ocr, **infer_kwargs)
     except Exception as e:

nv_ingest_api/internal/extract/image/table_extractor.py CHANGED Viewed

@@ -96,13 +96,23 @@ def _run_inference(
         future_ocr_kwargs.update(
             model_name="paddle",
         )
-    else:
+    elif ocr_model_name == "scene_text":
         future_ocr_kwargs.update(
-            model_name="scene_text",
+            model_name=ocr_model_name,
             input_names=["input", "merge_levels"],
             dtypes=["FP32", "BYTES"],
             merge_level="word",
         )
+    elif ocr_model_name == "scene_text_ensemble":
+        future_ocr_kwargs.update(
+            model_name=ocr_model_name,
+            input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
+            output_names=["OUTPUT"],
+            dtypes=["BYTES", "BYTES"],
+            merge_level="word",
+        )
+    else:
+        raise ValueError(f"Unknown OCR model name: {ocr_model_name}")
     with ThreadPoolExecutor(max_workers=2) as executor:
         future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)

nv_ingest_api/internal/primitives/nim/model_interface/ocr.py CHANGED Viewed

@@ -26,8 +26,11 @@ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import (
     preprocess_image_for_paddle,
 )
 from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
+from nv_ingest_api.util.image_processing.transforms import numpy_to_base64
 DEFAULT_OCR_MODEL_NAME = "paddle"
+NEMORETRIEVER_OCR_EA_MODEL_NAME = "scene_text"
+NEMORETRIEVER_OCR_MODEL_NAME = "scene_text_ensemble"
 logger = logging.getLogger(__name__)
@@ -141,7 +144,7 @@ class OCRModelInterface(ModelInterface):
         images = data["image_arrays"]
         dims = data["image_dims"]
-        model_name = kwargs.get("model_name", "paddle")
+        model_name = kwargs.get("model_name", DEFAULT_OCR_MODEL_NAME)
         merge_level = kwargs.get("merge_level", "paragraph")
         if protocol == "grpc":
@@ -149,21 +152,33 @@ class OCRModelInterface(ModelInterface):
             processed: List[np.ndarray] = []
             max_length = max(max(img.shape[:2]) for img in images)
+            max_length = min(max_length, 65500)  # Maximum supported image dimension for JPEG is 65500 pixels.
             for img in images:
-                if model_name == "paddle":
+                if model_name == DEFAULT_OCR_MODEL_NAME:
                     arr, _dims = preprocess_image_for_paddle(img)
-                else:
+                elif model_name == NEMORETRIEVER_OCR_EA_MODEL_NAME:
                     arr, _dims = preprocess_image_for_ocr(
                         img,
                         target_height=max_length,
                         target_width=max_length,
                         pad_how="bottom_right",
                     )
+                elif model_name == NEMORETRIEVER_OCR_MODEL_NAME:
+                    arr = img
+                    _dims = {"new_width": img.shape[1], "new_height": img.shape[0]}
+                else:
+                    raise ValueError(f"Unknown model name: {model_name}")
                 dims.append(_dims)
-                arr = arr.astype(np.float32)
-                arr = np.expand_dims(arr, axis=0)  # => shape (1, H, W, C)
+                if model_name == NEMORETRIEVER_OCR_MODEL_NAME:
+                    arr = np.array([numpy_to_base64(arr, format="JPEG")], dtype=np.object_)
+                else:
+                    arr = arr.astype(np.float32)
+                arr = np.expand_dims(arr, axis=0)
                 processed.append(arr)
             batches = []
@@ -175,7 +190,7 @@ class OCRModelInterface(ModelInterface):
             ):
                 batched_input = np.concatenate(proc_chunk, axis=0)
-                if model_name == "paddle":
+                if model_name == DEFAULT_OCR_MODEL_NAME:
                     batches.append(batched_input)
                 else:
                     merge_levels = np.array([[merge_level] * len(batched_input)], dtype="object")
@@ -206,7 +221,7 @@ class OCRModelInterface(ModelInterface):
                 chunk_list(images, max_batch_size),
                 chunk_list(dims, max_batch_size),
             ):
-                if model_name == "paddle":
+                if model_name == DEFAULT_OCR_MODEL_NAME:
                     payload = {"input": input_chunk}
                 else:
                     payload = {
@@ -226,7 +241,7 @@ class OCRModelInterface(ModelInterface):
         response: Any,
         protocol: str,
         data: Optional[Dict[str, Any]] = None,
-        model_name: str = "paddle",
+        model_name: str = DEFAULT_OCR_MODEL_NAME,
         **kwargs: Any,
     ) -> Any:
         """
@@ -367,7 +382,7 @@ class OCRModelInterface(ModelInterface):
         self,
         response: np.ndarray,
         dimensions: List[Dict[str, Any]],
-        model_name: str = "paddle",
+        model_name: str = DEFAULT_OCR_MODEL_NAME,
     ) -> List[Tuple[str, str]]:
         """
         Parse a gRPC response for one or more images. The response can have two possible shapes:
@@ -402,12 +417,14 @@ class OCRModelInterface(ModelInterface):
         if not isinstance(response, np.ndarray):
             raise ValueError("Unexpected response format: response is not a NumPy array.")
+        if model_name == NEMORETRIEVER_OCR_MODEL_NAME:
+            response = response.transpose((1, 0))
         # If we have shape (3,), convert to (3, 1)
         if response.ndim == 1 and response.shape == (3,):
             response = response.reshape(3, 1)
         elif response.ndim != 2 or response.shape[0] != 3:
             raise ValueError(f"Unexpected response shape: {response.shape}. Expecting (3,) or (3, n).")
         batch_size = response.shape[1]
         results: List[Tuple[str, str]] = []
@@ -425,11 +442,17 @@ class OCRModelInterface(ModelInterface):
             conf_scores = json.loads(confs_bytestr.decode("utf8"))
             # Some gRPC responses nest single-item lists; flatten them if needed
-            if isinstance(bounding_boxes, list) and len(bounding_boxes) == 1:
+            if (
+                (isinstance(bounding_boxes, list) and len(bounding_boxes) == 1 and isinstance(bounding_boxes[0], list))
+                and (
+                    isinstance(text_predictions, list)
+                    and len(text_predictions) == 1
+                    and isinstance(text_predictions[0], list)
+                )
+                and (isinstance(conf_scores, list) and len(conf_scores) == 1 and isinstance(conf_scores[0], list))
+            ):
                 bounding_boxes = bounding_boxes[0]
-            if isinstance(text_predictions, list) and len(text_predictions) == 1:
                 text_predictions = text_predictions[0]
-            if isinstance(conf_scores, list) and len(conf_scores) == 1:
                 conf_scores = conf_scores[0]
             # 4) Postprocess
@@ -439,7 +462,7 @@ class OCRModelInterface(ModelInterface):
                 conf_scores,
                 dimensions,
                 img_index=i,
-                scale_coordinates=True if model_name == "paddle" else False,
+                scale_coordinates=False if model_name == NEMORETRIEVER_OCR_EA_MODEL_NAME else True,
             )
             results.append([bounding_boxes, text_predictions, conf_scores])

{nv_ingest_api-2025.8.24.dev20250824.dist-info → nv_ingest_api-2025.8.25.dev20250825.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.8.24.dev20250824
+Version: 2025.8.25.dev20250825
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License:                                  Apache License

{nv_ingest_api-2025.8.24.dev20250824.dist-info → nv_ingest_api-2025.8.25.dev20250825.dist-info}/RECORD RENAMED Viewed

@@ -20,10 +20,10 @@ nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha
 nv_ingest_api/internal/extract/html/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/extract/html/html_extractor.py,sha256=I9oWfj6_As4898GDDh0zsSuKxO3lBsvyYzhvUotjzJI,3282
 nv_ingest_api/internal/extract/image/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=gk-O-9wjZBoaLVE_6Erb4gMwsSFk4UtPQ2QLpMCW4H4,13212
+nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=HTu0mOIYMpK5JAAiyvlIHgUtmjQV44Gv8dE5hYIqeQE,13633
 nv_ingest_api/internal/extract/image/image_extractor.py,sha256=gBKjlx28hA_e-dupatu46YQgOHJ0DLpAWxREiLaZLyo,9039
-nv_ingest_api/internal/extract/image/infographic_extractor.py,sha256=i7zt_ow1gytU4hK2JCRg7T1wlbokaeuUpXX69LIQkzY,9687
-nv_ingest_api/internal/extract/image/table_extractor.py,sha256=O0m3N2Tz9W6X7TBI4o-rbBXc8dFOf9zSZq1v9qC1U4M,13780
+nv_ingest_api/internal/extract/image/infographic_extractor.py,sha256=G5sRnyJ-8ToBbD0_7W6Vemq4a5SBNLtzhZKpuR26mlU,10104
+nv_ingest_api/internal/extract/image/table_extractor.py,sha256=yjSehCTV43a35I_JrVNkgi7yV6RTAEvTeB3kGtM9ZTs,14196
 nv_ingest_api/internal/extract/image/image_helpers/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/extract/image/image_helpers/common.py,sha256=80jRhGzisHvQ9Ky3MKUMM7soKUmvZ5LqRVzwNYjgdPY,14988
 nv_ingest_api/internal/extract/pdf/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -58,7 +58,7 @@ nv_ingest_api/internal/primitives/nim/model_interface/decorators.py,sha256=qwubk
 nv_ingest_api/internal/primitives/nim/model_interface/deplot.py,sha256=TvKdk6PTuI1WNhRmNNrvygaI_DIutkJkDL-XdtLZQac,10787
 nv_ingest_api/internal/primitives/nim/model_interface/helpers.py,sha256=iyGxAr4tG2UZ7LtXXoWO_kF-KsObhPrmZ46Nl0Mi-Ag,11592
 nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=WysjDZeegclO3mZgVcGOwzWbr8wSI4pWRiYD4iC2EXo,7098
-nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=Vhim3py_rc5jA0BoKubwfekEqOwxUUePzcmc59pRuOk,21458
+nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=0Xfuf5_-7LoWnqzZlsJFI53ztneB7Rs-PHZQzDgR0mo,22679
 nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=5PqD2JuHY2rwd-6SSB4axr2Dd79vm95sAEkcmI3U7ME,12977
 nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py,sha256=lFhppNqrq5X_fzbCWKphvZQMzaJd3gHrkWsyJORzFrU,5010
 nv_ingest_api/internal/primitives/nim/model_interface/vlm.py,sha256=qJ382PU1ZrIM-SR3cqIhtY_W2rmHec2HIa2aUB2SvaU,6031
@@ -162,8 +162,8 @@ nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jf
 nv_ingest_api/util/string_processing/yaml.py,sha256=6SW2O6wbXRhGbhETMbtXjYCZn53HeCNOP6a96AaxlHs,1454
 nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
-nv_ingest_api-2025.8.24.dev20250824.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest_api-2025.8.24.dev20250824.dist-info/METADATA,sha256=efa0KmmDvdvacCuWPIHCoYGvujzWjw-LIdWdnhSW3Cw,13947
-nv_ingest_api-2025.8.24.dev20250824.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest_api-2025.8.24.dev20250824.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
-nv_ingest_api-2025.8.24.dev20250824.dist-info/RECORD,,
+nv_ingest_api-2025.8.25.dev20250825.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_api-2025.8.25.dev20250825.dist-info/METADATA,sha256=IBhn7pRL6SlKTG59w68Mo4Gets_IBm_rBOTyDf2aZXU,13947
+nv_ingest_api-2025.8.25.dev20250825.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_api-2025.8.25.dev20250825.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
+nv_ingest_api-2025.8.25.dev20250825.dist-info/RECORD,,

{nv_ingest_api-2025.8.24.dev20250824.dist-info → nv_ingest_api-2025.8.25.dev20250825.dist-info}/WHEEL RENAMED Viewed

File without changes

{nv_ingest_api-2025.8.24.dev20250824.dist-info → nv_ingest_api-2025.8.25.dev20250825.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{nv_ingest_api-2025.8.24.dev20250824.dist-info → nv_ingest_api-2025.8.25.dev20250825.dist-info}/top_level.txt RENAMED Viewed

File without changes

nv-ingest-api 2025.8.24.dev20250824__py3-none-any.whl → 2025.8.25.dev20250825__py3-none-any.whl

Potentially problematic release.

nv-ingest-api 2025.8.24.dev20250824__py3-none-any.whl → 2025.8.25.dev20250825__py3-none-any.whl