nv-ingest-api 2025.7.16.dev20250716__py3-none-any.whl → 2025.7.17.dev20250717__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release. This version of nv-ingest-api might be problematic.

Files changed (22)
  1. nv_ingest_api/interface/extract.py +18 -18
  2. nv_ingest_api/internal/extract/image/chart_extractor.py +75 -55
  3. nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
  4. nv_ingest_api/internal/extract/image/table_extractor.py +81 -63
  5. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +7 -7
  6. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +9 -9
  7. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +58 -0
  8. nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
  9. nv_ingest_api/internal/primitives/nim/nim_client.py +46 -11
  10. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
  11. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
  12. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
  13. nv_ingest_api/internal/transform/split_text.py +13 -8
  14. nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
  15. nv_ingest_api/util/image_processing/transforms.py +16 -5
  16. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
  17. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
  18. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/METADATA +1 -1
  19. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/RECORD +22 -22
  20. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/WHEEL +0 -0
  21. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/licenses/LICENSE +0 -0
  22. {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py}

@@ -4,22 +4,37 @@

  import json
  import logging
- from typing import Any, List, Tuple
+ import os
+ from typing import Any
  from typing import Dict
+ from typing import List
  from typing import Optional
+ from typing import Tuple

+ import backoff
  import numpy as np
+ import tritonclient.grpc as grpcclient

  from nv_ingest_api.internal.primitives.nim import ModelInterface
- from nv_ingest_api.internal.primitives.nim.model_interface.helpers import preprocess_image_for_paddle
+ from nv_ingest_api.internal.primitives.nim.model_interface.decorators import (
+ multiprocessing_cache,
+ )
+ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import (
+ preprocess_image_for_ocr,
+ )
+ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import (
+ preprocess_image_for_paddle,
+ )
  from nv_ingest_api.util.image_processing.transforms import base64_to_numpy

+ DEFAULT_OCR_MODEL_NAME = "paddle"
+
  logger = logging.getLogger(__name__)


- class PaddleOCRModelInterface(ModelInterface):
+ class OCRModelInterface(ModelInterface):
  """
- An interface for handling inference with a PaddleOCR model, supporting both gRPC and HTTP protocols.
+ An interface for handling inference with a OCR model, supporting both gRPC and HTTP protocols.
  """

  def name(self) -> str:
@@ -31,7 +46,7 @@ class PaddleOCRModelInterface(ModelInterface):
  str
  The name of the model interface.
  """
- return "PaddleOCR"
+ return "OCR"

  def prepare_data_for_inference(self, data: Dict[str, Any]) -> Dict[str, Any]:
  """
@@ -126,11 +141,26 @@ class PaddleOCRModelInterface(ModelInterface):
  images = data["image_arrays"]
  dims = data["image_dims"]

+ model_name = kwargs.get("model_name", "paddle")
+ merge_level = kwargs.get("merge_level", "paragraph")
+
  if protocol == "grpc":
- logger.debug("Formatting input for gRPC PaddleOCR model (batched).")
+ logger.debug("Formatting input for gRPC OCR model (batched).")
  processed: List[np.ndarray] = []
+
+ max_length = max(max(img.shape[:2]) for img in images)
+
  for img in images:
- arr, _dims = preprocess_image_for_paddle(img)
+ if model_name == "paddle":
+ arr, _dims = preprocess_image_for_paddle(img)
+ else:
+ arr, _dims = preprocess_image_for_ocr(
+ img,
+ target_height=max_length,
+ target_width=max_length,
+ pad_how="bottom_right",
+ )
+
  dims.append(_dims)
  arr = arr.astype(np.float32)
  arr = np.expand_dims(arr, axis=0) # => shape (1, H, W, C)
@@ -144,12 +174,18 @@
  chunk_list(dims, max_batch_size),
  ):
  batched_input = np.concatenate(proc_chunk, axis=0)
- batches.append(batched_input)
+
+ if model_name == "paddle":
+ batches.append(batched_input)
+ else:
+ merge_levels = np.array([[merge_level] * len(batched_input)], dtype="object")
+ batches.append([batched_input, merge_levels])
+
  batch_data_list.append({"image_arrays": orig_chunk, "image_dims": dims_chunk})
  return batches, batch_data_list

  elif protocol == "http":
- logger.debug("Formatting input for HTTP PaddleOCR model (batched).")
+ logger.debug("Formatting input for HTTP OCR model (batched).")
  if "base64_images" in data:
  base64_list = data["base64_images"]
  else:
@@ -170,7 +206,13 @@
  chunk_list(images, max_batch_size),
  chunk_list(dims, max_batch_size),
  ):
- payload = {"input": input_chunk}
+ if model_name == "paddle":
+ payload = {"input": input_chunk}
+ else:
+ payload = {
+ "input": input_chunk,
+ "merge_levels": [merge_level] * len(input_chunk),
+ }
  batches.append(payload)
  batch_data_list.append({"image_arrays": orig_chunk, "image_dims": dims_chunk})

@@ -179,7 +221,14 @@
  else:
  raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")

- def parse_output(self, response: Any, protocol: str, data: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Any:
+ def parse_output(
+ self,
+ response: Any,
+ protocol: str,
+ data: Optional[Dict[str, Any]] = None,
+ model_name: str = "paddle",
+ **kwargs: Any,
+ ) -> Any:
  """
  Parse the model's inference response for the given protocol. The parsing
  may handle batched outputs for multiple images.
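The format_input and parse_output hunks above key the new behavior off a model_name keyword: the legacy "paddle" path keeps per-image preprocessing and the bare {"input": ...} payload, while any other OCR model gets square padding plus a merge level that rides along with the request. A minimal sketch of the two HTTP payload shapes this implies (the structure of each input_chunk entry is an assumption for illustration, not shown in these hunks):

    # Hedged sketch only: payload shapes implied by the format_input hunks above.
    # The exact per-image entry format is assumed for illustration.
    input_chunk = [{"type": "image_url", "url": "data:image/png;base64,<...>"}]
    merge_level = "paragraph"  # default taken from kwargs in the diff

    paddle_payload = {"input": input_chunk}  # model_name == "paddle"
    ocr_payload = {  # any other model name
        "input": input_chunk,
        "merge_levels": [merge_level] * len(input_chunk),
    }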
@@ -187,7 +236,7 @@ class PaddleOCRModelInterface(ModelInterface):
  Parameters
  ----------
  response : Any
- The raw response from the PaddleOCR model.
+ The raw response from the OCR model.
  protocol : str
  The protocol used for inference, "grpc" or "http".
  data : dict of str -> Any, optional
@@ -209,24 +258,24 @@
  dims: Optional[List[Tuple[int, int]]] = data.get("image_dims") if data else None

  if protocol == "grpc":
- logger.debug("Parsing output from gRPC PaddleOCR model (batched).")
- return self._extract_content_from_paddle_grpc_response(response, dims)
+ logger.debug("Parsing output from gRPC OCR model (batched).")
+ return self._extract_content_from_ocr_grpc_response(response, dims, model_name=model_name)

  elif protocol == "http":
- logger.debug("Parsing output from HTTP PaddleOCR model (batched).")
- return self._extract_content_from_paddle_http_response(response, dims)
+ logger.debug("Parsing output from HTTP OCR model (batched).")
+ return self._extract_content_from_ocr_http_response(response, dims)

  else:
  raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")

  def process_inference_results(self, output: Any, **kwargs: Any) -> Any:
  """
- Process inference results for the PaddleOCR model.
+ Process inference results for the OCR model.

  Parameters
  ----------
  output : Any
- The raw output parsed from the PaddleOCR model.
+ The raw output parsed from the OCR model.
  **kwargs : Any
  Additional keyword arguments for customization.

@@ -238,7 +287,7 @@
  """
  return output

- def _prepare_paddle_payload(self, base64_img: str) -> Dict[str, Any]:
+ def _prepare_ocr_payload(self, base64_img: str) -> Dict[str, Any]:
  """
  DEPRECATED by batch logic in format_input. Kept here if you need single-image direct calls.

@@ -250,7 +299,7 @@
  Returns
  -------
  dict of str -> Any
- The payload in either legacy or new format for PaddleOCR's HTTP endpoint.
+ The payload in either legacy or new format for OCR's HTTP endpoint.
  """
  image_url = f"data:image/png;base64,{base64_img}"

@@ -259,18 +308,18 @@

  return payload

- def _extract_content_from_paddle_http_response(
+ def _extract_content_from_ocr_http_response(
  self,
  json_response: Dict[str, Any],
  dimensions: List[Dict[str, Any]],
  ) -> List[Tuple[str, str]]:
  """
- Extract content from the JSON response of a PaddleOCR HTTP API request.
+ Extract content from the JSON response of a OCR HTTP API request.

  Parameters
  ----------
  json_response : dict of str -> Any
- The JSON response returned by the PaddleOCR endpoint.
+ The JSON response returned by the OCR endpoint.
  table_content_format : str or None
  The specified format for table content (e.g., 'simple' or 'pseudo_markdown').
  dimensions : list of dict, optional
@@ -296,25 +345,29 @@
  text_detections = item.get("text_detections", [])
  text_predictions = []
  bounding_boxes = []
+ conf_scores = []
  for td in text_detections:
  text_predictions.append(td["text_prediction"]["text"])
  bounding_boxes.append([[pt["x"], pt["y"]] for pt in td["bounding_box"]["points"]])
+ conf_scores.append(td["text_prediction"]["confidence"])

- bounding_boxes, text_predictions = self._postprocess_paddle_response(
+ bounding_boxes, text_predictions, conf_scores = self._postprocess_ocr_response(
  bounding_boxes,
  text_predictions,
+ conf_scores,
  dimensions,
  img_index=item_idx,
  )

- results.append([bounding_boxes, text_predictions])
+ results.append([bounding_boxes, text_predictions, conf_scores])

  return results

- def _extract_content_from_paddle_grpc_response(
+ def _extract_content_from_ocr_grpc_response(
  self,
  response: np.ndarray,
  dimensions: List[Dict[str, Any]],
+ model_name: str = "paddle",
  ) -> List[Tuple[str, str]]:
  """
  Parse a gRPC response for one or more images. The response can have two possible shapes:
@@ -367,33 +420,41 @@
  texts_bytestr: bytes = response[1, i]
  text_predictions = json.loads(texts_bytestr.decode("utf8"))

- # 3) Log the third element (extra data/metadata) if needed
- extra_data_bytestr: bytes = response[2, i]
- logger.debug(f"Ignoring extra_data for image {i}: {extra_data_bytestr}")
+ # 3) Parse confidence scores
+ confs_bytestr: bytes = response[2, i]
+ conf_scores = json.loads(confs_bytestr.decode("utf8"))

  # Some gRPC responses nest single-item lists; flatten them if needed
  if isinstance(bounding_boxes, list) and len(bounding_boxes) == 1:
  bounding_boxes = bounding_boxes[0]
  if isinstance(text_predictions, list) and len(text_predictions) == 1:
  text_predictions = text_predictions[0]
+ if isinstance(conf_scores, list) and len(conf_scores) == 1:
+ conf_scores = conf_scores[0]

- bounding_boxes, text_predictions = self._postprocess_paddle_response(
+ # 4) Postprocess
+ bounding_boxes, text_predictions, conf_scores = self._postprocess_ocr_response(
  bounding_boxes,
  text_predictions,
+ conf_scores,
  dimensions,
  img_index=i,
+ scale_coordinates=True if model_name == "paddle" else False,
  )

- results.append([bounding_boxes, text_predictions])
+ results.append([bounding_boxes, text_predictions, conf_scores])

  return results

  @staticmethod
- def _postprocess_paddle_response(
+ def _postprocess_ocr_response(
  bounding_boxes: List[Any],
  text_predictions: List[str],
+ conf_scores: List[float],
  dims: Optional[List[Dict[str, Any]]] = None,
  img_index: int = 0,
+ scale_coordinates: bool = True,
+ shift_coordinates: bool = True,
  ) -> Tuple[List[Any], List[str]]:
  """
  Convert bounding boxes with normalized coordinates to pixel cooridnates by using
@@ -434,17 +495,18 @@
  logger.warning("Image index out of range for stored dimensions. Using first image dims by default.")
  img_index = 0

- max_width = dims[img_index]["new_width"]
- max_height = dims[img_index]["new_height"]
- pad_width = dims[img_index].get("pad_width", 0)
- pad_height = dims[img_index].get("pad_height", 0)
- scale_factor = dims[img_index].get("scale_factor", 1.0)
+ max_width = dims[img_index]["new_width"] if scale_coordinates else 1.0
+ max_height = dims[img_index]["new_height"] if scale_coordinates else 1.0
+ pad_width = dims[img_index].get("pad_width", 0) if shift_coordinates else 0.0
+ pad_height = dims[img_index].get("pad_height", 0) if shift_coordinates else 0.0
+ scale_factor = dims[img_index].get("scale_factor", 1.0) if scale_coordinates else 1.0

  bboxes: List[List[float]] = []
  texts: List[str] = []
+ confs: List[float] = []

  # Convert normalized coords back to actual pixel coords
- for box, txt in zip(bounding_boxes, text_predictions):
+ for box, txt, conf in zip(bounding_boxes, text_predictions, conf_scores):
  if box == "nan":
  continue
  points: List[List[float]] = []
@@ -458,5 +520,36 @@
  points.append([x_original, y_original])
  bboxes.append(points)
  texts.append(txt)
+ confs.append(conf)
+
+ return bboxes, texts, confs

- return bboxes, texts
+
+ @multiprocessing_cache(max_calls=100) # Cache results first to avoid redundant retries from backoff
+ @backoff.on_predicate(backoff.expo, max_time=30)
+ def get_ocr_model_name(ocr_grpc_endpoint=None, default_model_name=DEFAULT_OCR_MODEL_NAME):
+ """
+ Determines the OCR model name by checking the environment, querying the gRPC endpoint,
+ or falling back to a default.
+ """
+ # 1. Check for an explicit override from the environment variable first.
+ ocr_model_name = os.getenv("OCR_MODEL_NAME", None)
+ if ocr_model_name is not None:
+ return ocr_model_name
+
+ # 2. If no gRPC endpoint is provided, fall back to the default immediately.
+ if not ocr_grpc_endpoint:
+ logger.debug(f"No OCR gRPC endpoint provided. Falling back to default model name '{default_model_name}'.")
+ return default_model_name
+
+ # 3. Attempt to query the gRPC endpoint to discover the model name.
+ try:
+ client = grpcclient.InferenceServerClient(ocr_grpc_endpoint)
+ model_index = client.get_model_repository_index(as_json=True)
+ model_names = [x["name"] for x in model_index.get("models", [])]
+ ocr_model_name = model_names[0]
+ except Exception:
+ logger.warning(f"Failed to get ocr model name after 30 seconds. Falling back to '{default_model_name}'.")
+ ocr_model_name = default_model_name
+
+ return ocr_model_name
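With the rename in place, the new module-level get_ocr_model_name helper resolves which OCR backend a gRPC endpoint is serving (environment override, then Triton model-repository query, then the "paddle" default) before any requests are formatted. A rough usage sketch, assuming the renamed module exposes both names as the diff indicates; the endpoint value is illustrative:

    # Hedged usage sketch; the endpoint is a placeholder.
    from nv_ingest_api.internal.primitives.nim.model_interface.ocr import (
        OCRModelInterface,
        get_ocr_model_name,
    )

    ocr_grpc_endpoint = "localhost:8001"  # illustrative
    model_name = get_ocr_model_name(ocr_grpc_endpoint)  # "paddle" unless OCR_MODEL_NAME or the server says otherwise

    model_interface = OCRModelInterface()
    # A caller such as NimClient passes model_name through to format_input and
    # parse_output so paddle-specific coordinate scaling is applied only when needed.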
nv_ingest_api/internal/primitives/nim/nim_client.py

@@ -33,6 +33,7 @@ class NimClient:
  auth_token: Optional[str] = None,
  timeout: float = 120.0,
  max_retries: int = 5,
+ max_429_retries: int = 5,
  ):
  """
  Initialize the NimClient with the specified model interface, protocol, and server endpoints.
@@ -49,6 +50,10 @@
  Authorization token for HTTP requests (default: None).
  timeout : float, optional
  Timeout for HTTP requests in seconds (default: 30.0).
+ max_retries : int, optional
+ The maximum number of retries for non-429 server-side errors (default: 5).
+ max_429_retries : int, optional
+ The maximum number of retries specifically for 429 errors (default: 10).

  Raises
  ------
@@ -62,6 +67,7 @@
  self.auth_token = auth_token
  self.timeout = timeout # Timeout for HTTP requests
  self.max_retries = max_retries
+ self.max_429_retries = max_429_retries
  self._grpc_endpoint, self._http_endpoint = endpoints
  self._max_batch_sizes = {}
  self._lock = threading.Lock()
@@ -138,7 +144,9 @@
  else:
  raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")

- parsed_output = self.model_interface.parse_output(response, protocol=self.protocol, data=batch_data, **kwargs)
+ parsed_output = self.model_interface.parse_output(
+ response, protocol=self.protocol, data=batch_data, model_name=model_name, **kwargs
+ )
  return parsed_output, batch_data

  def try_set_max_batch_size(self, model_name, model_version: str = ""):
@@ -167,8 +175,8 @@
  try:
  # 1. Retrieve or default to the model's maximum batch size.
  batch_size = self._fetch_max_batch_size(model_name)
- max_requested_batch_size = kwargs.get("max_batch_size", batch_size)
- force_requested_batch_size = kwargs.get("force_max_batch_size", False)
+ max_requested_batch_size = kwargs.pop("max_batch_size", batch_size)
+ force_requested_batch_size = kwargs.pop("force_max_batch_size", False)
  max_batch_size = (
  min(batch_size, max_requested_batch_size)
  if not force_requested_batch_size
@@ -180,7 +188,11 @@

  # 3. Format the input based on protocol.
  formatted_batches, formatted_batch_data = self.model_interface.format_input(
- data, protocol=self.protocol, max_batch_size=max_batch_size, model_name=model_name
+ data,
+ protocol=self.protocol,
+ max_batch_size=max_batch_size,
+ model_name=model_name,
+ **kwargs,
  )

  # Check for a custom maximum pool worker count, and remove it from kwargs.
@@ -237,19 +249,27 @@
  np.ndarray
  The output of the model as a numpy array.
  """
+ if not isinstance(formatted_input, list):
+ formatted_input = [formatted_input]

  parameters = kwargs.get("parameters", {})
- output_names = kwargs.get("outputs", ["output"])
- dtype = kwargs.get("dtype", "FP32")
- input_name = kwargs.get("input_name", "input")
+ output_names = kwargs.get("output_names", ["output"])
+ dtypes = kwargs.get("dtypes", ["FP32"])
+ input_names = kwargs.get("input_names", ["input"])
+
+ input_tensors = []
+ for input_name, input_data, dtype in zip(input_names, formatted_input, dtypes):
+ input_tensors.append(grpcclient.InferInput(input_name, input_data.shape, datatype=dtype))

- input_tensors = grpcclient.InferInput(input_name, formatted_input.shape, datatype=dtype)
- input_tensors.set_data_from_numpy(formatted_input)
+ for idx, input_data in enumerate(formatted_input):
+ input_tensors[idx].set_data_from_numpy(input_data)

  outputs = [grpcclient.InferRequestedOutput(output_name) for output_name in output_names]
+
  response = self.client.infer(
- model_name=model_name, parameters=parameters, inputs=[input_tensors], outputs=outputs
+ model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
  )
+
  logger.debug(f"gRPC inference response: {response}")

  if len(outputs) == 1:
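The gRPC path above now accepts a list of formatted inputs and builds one InferInput per named tensor, which is what lets the OCR interface ship the merge-level tensor alongside the image batch. A self-contained sketch of that multi-input Triton call pattern, with the endpoint, model name, tensor names, and dtypes all as illustrative assumptions:

    # Hedged sketch of a multi-input Triton gRPC call; all names and values are illustrative.
    import numpy as np
    import tritonclient.grpc as grpcclient

    client = grpcclient.InferenceServerClient(url="localhost:8001")

    images = np.random.rand(2, 1024, 1024, 3).astype(np.float32)
    merge_levels = np.array([["paragraph", "paragraph"]], dtype="object")

    inputs = [
        grpcclient.InferInput("input", list(images.shape), datatype="FP32"),
        grpcclient.InferInput("merge_levels", list(merge_levels.shape), datatype="BYTES"),
    ]
    inputs[0].set_data_from_numpy(images)
    inputs[1].set_data_from_numpy(merge_levels)

    outputs = [grpcclient.InferRequestedOutput("output")]
    response = client.infer(model_name="ocr", inputs=inputs, outputs=outputs)
    result = response.as_numpy("output")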
@@ -281,6 +301,7 @@

  base_delay = 2.0
  attempt = 0
+ retries_429 = 0

  while attempt < self.max_retries:
  try:
@@ -291,7 +312,21 @@

  # Check for server-side or rate-limit type errors
  # e.g. 5xx => server error, 429 => too many requests
- if status_code == 429 or status_code == 503 or (500 <= status_code < 600):
+ if status_code == 429:
+ retries_429 += 1
+ logger.warning(
+ f"Received HTTP 429 (Too Many Requests) from {self.model_interface.name()}. "
+ f"Attempt {retries_429} of {self.max_429_retries}."
+ )
+ if retries_429 >= self.max_429_retries:
+ logger.error("Max retries for HTTP 429 exceeded.")
+ response.raise_for_status()
+ else:
+ backoff_time = base_delay * (2**retries_429)
+ time.sleep(backoff_time)
+ continue # Retry without incrementing the main attempt counter
+
+ if status_code == 503 or (500 <= status_code < 600):
  logger.warning(
  f"Received HTTP {status_code} ({response.reason}) from "
  f"{self.model_interface.name()}. Attempt {attempt + 1} of {self.max_retries}."
nv_ingest_api/internal/schemas/extract/extract_chart_schema.py

@@ -24,8 +24,8 @@ class ChartExtractorConfigSchema(BaseModel):
  A tuple containing the gRPC and HTTP services for the yolox endpoint.
  Either the gRPC or HTTP service can be empty, but not both.

- paddle_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
- A tuple containing the gRPC and HTTP services for the paddle endpoint.
+ ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
+ A tuple containing the gRPC and HTTP services for the ocr endpoint.
  Either the gRPC or HTTP service can be empty, but not both.

  Methods
@@ -49,8 +49,8 @@ class ChartExtractorConfigSchema(BaseModel):
  yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
  yolox_infer_protocol: str = ""

- paddle_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
- paddle_infer_protocol: str = ""
+ ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+ ocr_infer_protocol: str = ""

  nim_batch_size: int = 2
  workers_per_progress_engine: int = 5
@@ -86,7 +86,7 @@ class ChartExtractorConfigSchema(BaseModel):
  return None
  return service

- for endpoint_name in ["yolox_endpoints", "paddle_endpoints"]:
+ for endpoint_name in ["yolox_endpoints", "ocr_endpoints"]:
  grpc_service, http_service = values.get(endpoint_name, (None, None))
  grpc_service = clean_service(grpc_service)
  http_service = clean_service(http_service)
@@ -117,7 +117,7 @@ class ChartExtractorSchema(BaseModel):
  A flag indicating whether to raise an exception if a failure occurs during chart extraction.

  extraction_config: Optional[ChartExtractorConfigSchema], default=None
- Configuration for the chart extraction stage, including yolox and paddle service endpoints.
+ Configuration for the chart extraction stage, including yolox and ocr service endpoints.
  """

  max_queue_size: int = 1
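The schema change is a mechanical rename of every paddle_* field to ocr_*; the validator loop now cleans the ocr_endpoints pair instead. A hedged sketch of constructing the chart extractor config against the new field names (endpoint values are placeholders); the infographic and table schemas below follow the same pattern:

    # Hedged sketch using the renamed fields; endpoint values are placeholders.
    from nv_ingest_api.internal.schemas.extract.extract_chart_schema import (
        ChartExtractorConfigSchema,
    )

    config = ChartExtractorConfigSchema(
        yolox_endpoints=("localhost:8001", "http://localhost:8000/v1/infer"),
        yolox_infer_protocol="grpc",
        ocr_endpoints=("localhost:8002", "http://localhost:8003/v1/infer"),
        ocr_infer_protocol="grpc",
    )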
nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py

@@ -20,8 +20,8 @@ class InfographicExtractorConfigSchema(BaseModel):
  auth_token : Optional[str], default=None
  Authentication token required for secure services.

- paddle_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
- A tuple containing the gRPC and HTTP services for the paddle endpoint.
+ ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
+ A tuple containing the gRPC and HTTP services for the ocr endpoint.
  Either the gRPC or HTTP service can be empty, but not both.

  Methods
@@ -42,8 +42,8 @@

  auth_token: Optional[str] = None

- paddle_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
- paddle_infer_protocol: str = ""
+ ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+ ocr_infer_protocol: str = ""

  nim_batch_size: int = 2
  workers_per_progress_engine: int = 5
@@ -79,7 +79,7 @@
  return None
  return service

- for endpoint_name in ["paddle_endpoints"]:
+ for endpoint_name in ["ocr_endpoints"]:
  grpc_service, http_service = values.get(endpoint_name, (None, None))
  grpc_service = clean_service(grpc_service)
  http_service = clean_service(http_service)
@@ -110,7 +110,7 @@
  A flag indicating whether to raise an exception if a failure occurs during infographic extraction.

  stage_config : Optional[InfographicExtractorConfigSchema], default=None
- Configuration for the infographic extraction stage, including yolox and paddle service endpoints.
+ Configuration for the infographic extraction stage, including yolox and ocr service endpoints.
  """

  max_queue_size: int = 1
nv_ingest_api/internal/schemas/extract/extract_table_schema.py

@@ -22,8 +22,8 @@ class TableExtractorConfigSchema(BaseModel):
  auth_token : Optional[str], default=None
  Authentication token required for secure services.

- paddle_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
- A tuple containing the gRPC and HTTP services for the paddle endpoint.
+ ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
+ A tuple containing the gRPC and HTTP services for the ocr endpoint.
  Either the gRPC or HTTP service can be empty, but not both.

  Methods
@@ -47,8 +47,8 @@
  yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
  yolox_infer_protocol: str = ""

- paddle_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
- paddle_infer_protocol: str = ""
+ ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+ ocr_infer_protocol: str = ""

  nim_batch_size: int = 2
  workers_per_progress_engine: int = 5
@@ -81,7 +81,7 @@
  return None
  return service

- for endpoint_name in ["yolox_endpoints", "paddle_endpoints"]:
+ for endpoint_name in ["yolox_endpoints", "ocr_endpoints"]:
  grpc_service, http_service = values.get(endpoint_name, (None, None))
  grpc_service = clean_service(grpc_service)
  http_service = clean_service(http_service)
nv_ingest_api/internal/transform/split_text.py

@@ -141,14 +141,19 @@ def transform_text_split_and_tokenize_internal(

  model_predownload_path = os.environ.get("MODEL_PREDOWNLOAD_PATH")

- if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
- tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
- ):
- tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
- elif os.path.exists(os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")) and (
- tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"
- ):
- tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
+ if model_predownload_path is not None:
+ if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
+ tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
+ ):
+ tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
+ elif os.path.exists(
+ os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")
+ ) and (tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"):
+ tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
+
+ # Defaulto to intfloat/e5-large-unsupervised if no tokenizer predownloaded or specified
+ if tokenizer_identifier is None:
+ tokenizer_identifier = "intfloat/e5-large-unsupervised"

  tokenizer_model = AutoTokenizer.from_pretrained(tokenizer_identifier, token=hf_access_token)
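The split_text hunk guards the predownload lookup against a missing MODEL_PREDOWNLOAD_PATH and adds an explicit default tokenizer when nothing was predownloaded or specified. A condensed sketch of the resolution order it implements (the helper name is illustrative; the paths mirror the diff):

    # Hedged sketch of the tokenizer resolution order introduced above.
    import os

    def resolve_tokenizer(tokenizer_identifier=None):
        predownload = os.environ.get("MODEL_PREDOWNLOAD_PATH")
        if predownload is not None:
            local = os.path.join(predownload, "llama-3.2-1b/tokenizer/")
            if os.path.exists(os.path.join(local, "tokenizer.json")) and tokenizer_identifier in (
                None,
                "meta-llama/Llama-3.2-1B",
            ):
                return local
            local = os.path.join(predownload, "e5-large-unsupervised/tokenizer/")
            if os.path.exists(os.path.join(local, "tokenizer.json")) and tokenizer_identifier in (
                None,
                "intfloat/e5-large-unsupervised",
            ):
                return local
        # Fall back to the hosted e5 tokenizer when nothing was predownloaded or specified.
        return tokenizer_identifier or "intfloat/e5-large-unsupervised"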