nv-ingest-api 2025.7.16.dev20250716__py3-none-any.whl → 2025.7.18.dev20250718__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_api/interface/extract.py +18 -18
- nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
- nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
- nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
- nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +10 -7
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +16 -29
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
- nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +37 -224
- nv_ingest_api/internal/primitives/nim/nim_client.py +55 -14
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
- nv_ingest_api/internal/transform/split_text.py +13 -8
- nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
- nv_ingest_api/util/image_processing/transforms.py +16 -5
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/RECORD +24 -24
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/extract/image/table_extractor.py

@@ -15,10 +15,11 @@ import pandas as pd
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskTableExtraction
 from nv_ingest_api.internal.enums.common import TableFormatEnum
-from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
+from nv_ingest_api.internal.primitives.nim.model_interface.ocr import OCRModelInterface
+from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
 from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
-from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_paddle_output
-from nv_ingest_api.util.image_processing.table_and_chart import convert_paddle_response_to_psuedo_markdown
+from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_ocr_output
+from nv_ingest_api.util.image_processing.table_and_chart import convert_ocr_response_to_psuedo_markdown
 from nv_ingest_api.internal.primitives.nim import NimClient
 from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxTableStructureModelInterface
 from nv_ingest_api.util.image_processing.transforms import base64_to_numpy

@@ -60,7 +61,8 @@ def _filter_valid_images(base64_images: List[str]) -> Tuple[List[str], List[np.ndarray], List[int]]:
 def _run_inference(
     enable_yolox: bool,
     yolox_client: Any,
-    paddle_client: Any,
+    ocr_client: Any,
+    ocr_model_name: str,
     valid_arrays: List[np.ndarray],
     valid_images: List[str],
     trace_info: Optional[Dict] = None,

@@ -68,32 +70,45 @@ def _run_inference(
     """
     Run inference concurrently for YOLOX (if enabled) and Paddle.

-    Returns a tuple of (yolox_results, paddle_results).
+    Returns a tuple of (yolox_results, ocr_results).
     """
-    data_paddle = {"base64_images": valid_images}
+    data_ocr = {"base64_images": valid_images}
     if enable_yolox:
         data_yolox = {"images": valid_arrays}
-
-    with ThreadPoolExecutor(max_workers=2) as executor:
-        future_yolox = None
-        if enable_yolox:
-            future_yolox = executor.submit(
-                yolox_client.infer,
-                data=data_yolox,
-                model_name="yolox",
-                stage_name="table_extraction",
-                max_batch_size=8,
-                trace_info=trace_info,
-            )
-        future_paddle = executor.submit(
-            paddle_client.infer,
-            data=data_paddle,
-            model_name="paddle",
-            stage_name="table_extraction",
-            max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
-            trace_info=trace_info,
-        )
+        future_yolox_kwargs = dict(
+            data=data_yolox,
+            model_name="yolox_ensemble",
+            stage_name="table_extraction",
+            max_batch_size=8,
+            input_names=["INPUT_IMAGES", "THRESHOLDS"],
+            dtypes=["BYTES", "FP32"],
+            output_names=["OUTPUT"],
+            trace_info=trace_info,
+        )

+    future_ocr_kwargs = dict(
+        data=data_ocr,
+        stage_name="table_extraction",
+        max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
+        trace_info=trace_info,
+    )
+    if ocr_model_name == "paddle":
+        future_ocr_kwargs.update(
+            model_name="paddle",
+        )
+    else:
+        future_ocr_kwargs.update(
+            model_name="scene_text",
+            input_names=["input", "merge_levels"],
+            dtypes=["FP32", "BYTES"],
+            merge_level="word",
+        )
+
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)
+        future_yolox = None
+        if enable_yolox:
+            future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)
     if enable_yolox:
         try:
             yolox_results = future_yolox.result()
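The restructured body separates request construction from submission: per-model kwargs are built first, then both `infer` calls are submitted to a two-worker pool so the YOLOX and OCR services run in parallel. A minimal sketch of that submission pattern (the `infer` callables and kwargs here are stand-ins for illustration, not the real NimClient API):

from concurrent.futures import ThreadPoolExecutor

def submit_concurrently(ocr_infer, yolox_infer, ocr_kwargs, yolox_kwargs, enable_yolox=True):
    # Submit both requests before blocking on either result so the two
    # services overlap instead of running back to back.
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_ocr = executor.submit(ocr_infer, **ocr_kwargs)
        future_yolox = executor.submit(yolox_infer, **yolox_kwargs) if enable_yolox else None

    # Exiting the context manager joins both workers; .result() then
    # re-raises any exception raised inside a worker thread.
    yolox_results = future_yolox.result() if future_yolox is not None else None
    return yolox_results, future_ocr.result()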
@@ -104,17 +119,17 @@ def _run_inference(
             yolox_results = [None] * len(valid_images)

     try:
-        paddle_results = future_paddle.result()
+        ocr_results = future_ocr.result()
     except Exception as e:
-        logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
+        logger.error(f"Error calling ocr_client.infer: {e}", exc_info=True)
         raise

-    return yolox_results, paddle_results
+    return yolox_results, ocr_results


 def _validate_inference_results(
     yolox_results: Any,
-    paddle_results: Any,
+    ocr_results: Any,
     valid_arrays: List[Any],
     valid_images: List[str],
 ) -> Tuple[List[Any], List[Any]]:

@@ -123,46 +138,47 @@ def _validate_inference_results(
     If not, default values are assigned. Raises a ValueError if the lengths do not match.
     """
-    if not isinstance(yolox_results, list) or not isinstance(paddle_results, list):
+    if not isinstance(yolox_results, list) or not isinstance(ocr_results, list):
         logger.warning(
-            "Unexpected result types from inference clients: yolox_results=%s, paddle_results=%s. "
+            "Unexpected result types from inference clients: yolox_results=%s, ocr_results=%s. "
             "Proceeding with available results.",
             type(yolox_results).__name__,
-            type(paddle_results).__name__,
+            type(ocr_results).__name__,
         )
         if not isinstance(yolox_results, list):
             yolox_results = [None] * len(valid_arrays)
-        if not isinstance(paddle_results, list):
-            paddle_results = [(None, None)] * len(valid_images)
+        if not isinstance(ocr_results, list):
+            ocr_results = [(None, None)] * len(valid_images)

     if len(yolox_results) != len(valid_arrays):
         raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
-    if len(paddle_results) != len(valid_images):
-        raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")
+    if len(ocr_results) != len(valid_images):
+        raise ValueError(f"Expected {len(valid_images)} ocr results, got {len(ocr_results)}")

-    return yolox_results, paddle_results
+    return yolox_results, ocr_results


 def _update_table_metadata(
     base64_images: List[str],
     yolox_client: Any,
-    paddle_client: Any,
+    ocr_client: Any,
+    ocr_model_name: str,
     worker_pool_size: int = 8,  # Not currently used
     enable_yolox: bool = False,
     trace_info: Optional[Dict] = None,
 ) -> List[Tuple[str, Any, Any, Any]]:
     """
     Given a list of base64-encoded images, this function filters out images that do not meet
-    the minimum size requirements and then calls the PaddleOCR model via paddle_client.infer
+    the minimum size requirements and then calls the OCR model via ocr_client.infer
     to extract table data.

     For each base64-encoded image, the result is a tuple:
-        (base64_image, yolox_result, paddle_text_predictions, paddle_bounding_boxes)
+        (base64_image, yolox_result, ocr_text_predictions, ocr_bounding_boxes)

     Images that do not meet the minimum size are skipped (resulting in placeholders).
-    The paddle_client is expected to handle any necessary batching and concurrency.
+    The ocr_client is expected to handle any necessary batching and concurrency.
     """
-    logger.debug(f"Running table extraction using protocol {paddle_client.protocol}")
+    logger.debug(f"Running table extraction using protocol {ocr_client.protocol}")

     # Initialize the results list with default placeholders.
     results: List[Tuple[str, Any, Any, Any]] = [("", None, None, None)] * len(base64_images)

@@ -174,24 +190,23 @@ def _update_table_metadata(
         return results

     # Run inference concurrently.
-    yolox_results, paddle_results = _run_inference(
+    yolox_results, ocr_results = _run_inference(
         enable_yolox=enable_yolox,
         yolox_client=yolox_client,
-        paddle_client=paddle_client,
+        ocr_client=ocr_client,
+        ocr_model_name=ocr_model_name,
         valid_arrays=valid_arrays,
         valid_images=valid_images,
         trace_info=trace_info,
     )

     # Validate that the inference results have the expected structure.
-    yolox_results, paddle_results = _validate_inference_results(
-        yolox_results, paddle_results, valid_arrays, valid_images
-    )
+    yolox_results, ocr_results = _validate_inference_results(yolox_results, ocr_results, valid_arrays, valid_images)

     # Combine results with the original order.
-    for idx, (yolox_res, paddle_res) in enumerate(zip(yolox_results, paddle_results)):
+    for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
         original_index = valid_indices[idx]
-        results[original_index] = (base64_images[original_index], yolox_res, paddle_res[0], paddle_res[1])
+        results[original_index] = (base64_images[original_index], yolox_res, ocr_res[0], ocr_res[1])

     return results
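`_update_table_metadata` pre-fills the output list with placeholders and scatters the paired results back by the saved indices, so images filtered out for being too small keep stable positions. The same pattern in isolation (names are illustrative):

def scatter_back(base64_images, valid_indices, yolox_results, ocr_results):
    # One placeholder per input image; filtered-out images keep this default.
    results = [("", None, None, None)] * len(base64_images)
    for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
        original_index = valid_indices[idx]
        # ocr_res is a (text_predictions, bounding_boxes) pair per the new code.
        results[original_index] = (base64_images[original_index], yolox_res, ocr_res[0], ocr_res[1])
    return results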
@@ -199,14 +214,14 @@ def _update_table_metadata(
 def _create_clients(
     yolox_endpoints: Tuple[str, str],
     yolox_protocol: str,
-    paddle_endpoints: Tuple[str, str],
-    paddle_protocol: str,
+    ocr_endpoints: Tuple[str, str],
+    ocr_protocol: str,
     auth_token: str,
 ) -> Tuple[NimClient, NimClient]:
     yolox_model_interface = YoloxTableStructureModelInterface()
-    paddle_model_interface = PaddleOCRModelInterface()
+    ocr_model_interface = OCRModelInterface()

-    logger.debug(f"Inference protocols: yolox={yolox_protocol}, paddle={paddle_protocol}")
+    logger.debug(f"Inference protocols: yolox={yolox_protocol}, ocr={ocr_protocol}")

     yolox_client = create_inference_client(
         endpoints=yolox_endpoints,

@@ -215,14 +230,14 @@ def _create_clients(
         infer_protocol=yolox_protocol,
     )

-    paddle_client = create_inference_client(
-        endpoints=paddle_endpoints,
-        model_interface=paddle_model_interface,
+    ocr_client = create_inference_client(
+        endpoints=ocr_endpoints,
+        model_interface=ocr_model_interface,
         auth_token=auth_token,
-        infer_protocol=paddle_protocol,
+        infer_protocol=ocr_protocol,
     )

-    return yolox_client, paddle_client
+    return yolox_client, ocr_client


 def extract_table_data_from_image_internal(

@@ -262,14 +277,18 @@ def extract_table_data_from_image_internal(
         return df_extraction_ledger, execution_trace_log

     endpoint_config = extraction_config.endpoint_config
-    yolox_client, paddle_client = _create_clients(
+    yolox_client, ocr_client = _create_clients(
         endpoint_config.yolox_endpoints,
         endpoint_config.yolox_infer_protocol,
-        endpoint_config.paddle_endpoints,
-        endpoint_config.paddle_infer_protocol,
+        endpoint_config.ocr_endpoints,
+        endpoint_config.ocr_infer_protocol,
         endpoint_config.auth_token,
     )

+    # Get the grpc endpoint to determine the model if needed
+    ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
+    ocr_model_name = get_ocr_model_name(ocr_grpc_endpoint)
+
     try:
         # 1) Identify rows that meet criteria (structured, subtype=table, table_metadata != None, content not empty)
         def meets_criteria(row):
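The model name returned by `get_ocr_model_name` later selects the request kwargs inside `_run_inference`. A condensed view of that dispatch, mirroring the hunks above (`choose_ocr_request_kwargs` is a hypothetical helper, not part of the package):

def choose_ocr_request_kwargs(ocr_model_name: str) -> dict:
    # The legacy PaddleOCR NIM keeps its simple request shape.
    if ocr_model_name == "paddle":
        return {"model_name": "paddle"}
    # The "scene_text" model needs explicit Triton-style tensor names,
    # dtypes, and a word-level merge policy.
    return {
        "model_name": "scene_text",
        "input_names": ["input", "merge_levels"],
        "dtypes": ["FP32", "BYTES"],
        "merge_level": "word",
    }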
@@ -309,7 +328,8 @@ def extract_table_data_from_image_internal(
         bulk_results = _update_table_metadata(
             base64_images=base64_images,
             yolox_client=yolox_client,
-            paddle_client=paddle_client,
+            ocr_client=ocr_client,
+            ocr_model_name=ocr_model_name,
             worker_pool_size=endpoint_config.workers_per_progress_engine,
             enable_yolox=enable_yolox,
             trace_info=execution_trace_log,

@@ -317,15 +337,15 @@ def extract_table_data_from_image_internal(
         # 4) Write the results (bounding_boxes, text_predictions) back
         for row_id, idx in enumerate(valid_indices):
-            # unpack (base64_image, (yolox_predictions, paddle_bounding_boxes, paddle_text_predictions))
+            # unpack (base64_image, (yolox_predictions, ocr_bounding_boxes, ocr_text_predictions))
             _, cell_predictions, bounding_boxes, text_predictions = bulk_results[row_id]

             if table_content_format == TableFormatEnum.SIMPLE:
                 table_content = " ".join(text_predictions)
             elif table_content_format == TableFormatEnum.PSEUDO_MARKDOWN:
-                table_content = convert_paddle_response_to_psuedo_markdown(bounding_boxes, text_predictions)
+                table_content = convert_ocr_response_to_psuedo_markdown(bounding_boxes, text_predictions)
             elif table_content_format == TableFormatEnum.MARKDOWN:
-                table_content = join_yolox_table_structure_and_paddle_output(
+                table_content = join_yolox_table_structure_and_ocr_output(
                     cell_predictions, bounding_boxes, text_predictions
                 )
             else:

@@ -341,4 +361,4 @@ def extract_table_data_from_image_internal(
         raise
     finally:
         yolox_client.close()
-        paddle_client.close()
+        ocr_client.close()
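For reference, the three output formats handled above reduce to a small dispatch. This sketch reuses the helpers imported at the top of table_extractor.py, with signatures assumed from their call sites in this diff:

from nv_ingest_api.internal.enums.common import TableFormatEnum
from nv_ingest_api.util.image_processing.table_and_chart import (
    convert_ocr_response_to_psuedo_markdown,
    join_yolox_table_structure_and_ocr_output,
)

def render_table_content(fmt, cell_predictions, bounding_boxes, text_predictions):
    if fmt == TableFormatEnum.SIMPLE:
        # Bag of words, no layout.
        return " ".join(text_predictions)
    if fmt == TableFormatEnum.PSEUDO_MARKDOWN:
        # Rows inferred from OCR box positions alone.
        return convert_ocr_response_to_psuedo_markdown(bounding_boxes, text_predictions)
    if fmt == TableFormatEnum.MARKDOWN:
        # Cell structure from YOLOX, text from OCR.
        return join_yolox_table_structure_and_ocr_output(cell_predictions, bounding_boxes, text_predictions)
    raise ValueError(f"Unexpected table format: {fmt}")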
nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py

@@ -101,7 +101,7 @@ def nemoretriever_parse_extractor(
         - text_depth : str, optional (default is "page")
         - extract_tables_method : str, optional (default is "yolox")
         - identify_nearby_objects : bool, optional (default is True)
-        - paddle_output_format : str, optional (default is "pseudo_markdown")
+        - table_output_format : str, optional (default is "pseudo_markdown")
         - pdfium_config : dict, optional (configuration for PDFium)
         - nemoretriever_parse_config : dict, optional (configuration for NemoRetrieverParse)
         - metadata_column : str, optional (default is "metadata")

@@ -146,14 +146,14 @@ def nemoretriever_parse_extractor(
     # Flag for identifying nearby objects.
     identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)

-    # Get and validate paddle_output_format.
-    paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+    # Get and validate table_output_format.
+    table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
     try:
-        paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+        table_output_format = TableFormatEnum[table_output_format_str.upper()]
     except KeyError:
         valid_options = [e.name.lower() for e in TableFormatEnum]
         raise ValueError(
-            f"Invalid paddle_output_format value: {paddle_output_format_str}. Expected one of: {valid_options}"
+            f"Invalid table_output_format value: {table_output_format_str}. Expected one of: {valid_options}"
         )

     # Process nemoretriever_parse configuration.
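The validation above relies on `Enum.__getitem__`, which looks members up by name and raises `KeyError` for anything else. The same logic extracted into a standalone helper (hypothetical function name):

from nv_ingest_api.internal.enums.common import TableFormatEnum

def parse_table_output_format(value: str) -> TableFormatEnum:
    try:
        # Name-based lookup: "pseudo_markdown" -> TableFormatEnum.PSEUDO_MARKDOWN.
        return TableFormatEnum[value.upper()]
    except KeyError:
        valid_options = [e.name.lower() for e in TableFormatEnum]
        raise ValueError(f"Invalid table_output_format value: {value}. Expected one of: {valid_options}")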
@@ -254,10 +254,13 @@ def nemoretriever_parse_extractor(
                 extract_tables,
                 extract_charts,
                 extract_infographics,
-                paddle_output_format,
+                table_output_format,
                 nemoretriever_parse_config.yolox_endpoints,
                 nemoretriever_parse_config.yolox_infer_protocol,
                 nemoretriever_parse_config.auth_token,
+                input_names=["INPUT_IMAGES", "THRESHOLDS"],
+                dtypes=["BYTES", "FP32"],
+                output_names=["OUTPUT"],
                 execution_trace_log=execution_trace_log,
             )
             futures.append(future_yolox)

@@ -288,7 +291,7 @@ def nemoretriever_parse_extractor(
             extract_tables,
             extract_charts,
             extract_infographics,
-            paddle_output_format,
+            table_output_format,
             nemoretriever_parse_config.yolox_endpoints,
             nemoretriever_parse_config.yolox_infer_protocol,
             nemoretriever_parse_config.auth_token,

nv_ingest_api/internal/extract/pdf/engines/pdfium.py

@@ -29,9 +29,8 @@ from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
 from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
     YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
     YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
-    YOLOX_PAGE_IMAGE_FORMAT,
-    get_yolox_model_name,
     YoloxPageElementsModelInterface,
+    YOLOX_PAGE_IMAGE_FORMAT,
 )
 from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
 from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum

@@ -58,7 +57,6 @@ logger = logging.getLogger(__name__)
 def _extract_page_elements_using_image_ensemble(
     pages: List[Tuple[int, np.ndarray, Tuple[int, int]]],
     yolox_client,
-    yolox_model_name: str = "yolox",
     execution_trace_log: Optional[List] = None,
 ) -> List[Tuple[int, object]]:
     """

@@ -72,8 +70,6 @@ def _extract_page_elements_using_image_ensemble(
         and optional padding offset information.
     yolox_client : object
         A pre-configured client instance for the YOLOX inference service.
-    yolox_model_name : str, default="yolox"
-        The name of the YOLOX model to use for inference.
     execution_trace_log : Optional[List], default=None
         List for accumulating execution trace information.

@@ -106,8 +102,11 @@ def _extract_page_elements_using_image_ensemble(
     # Perform inference using the NimClient.
     inference_results = yolox_client.infer(
         data,
-        model_name="yolox",
+        model_name="yolox_ensemble",
         max_batch_size=YOLOX_MAX_BATCH_SIZE,
+        input_names=["INPUT_IMAGES", "THRESHOLDS"],
+        dtypes=["BYTES", "FP32"],
+        output_names=["OUTPUT"],
         trace_info=execution_trace_log,
         stage_name="pdf_extraction",
     )
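The new `input_names`/`dtypes`/`output_names` arguments indicate the gRPC path now targets a Triton-style ensemble with two inputs. A rough sketch of what such a raw call looks like with tritonclient (the shapes and the serialization of `data` are assumptions; NimClient handles this internally):

import numpy as np
import tritonclient.grpc as grpcclient

def call_yolox_ensemble(url: str, encoded_images: list, thresholds: np.ndarray):
    client = grpcclient.InferenceServerClient(url=url)
    # BYTES tensors are carried as numpy object arrays, one element per image.
    images = np.array(encoded_images, dtype=np.object_).reshape(-1, 1)
    inputs = [
        grpcclient.InferInput("INPUT_IMAGES", list(images.shape), "BYTES"),
        grpcclient.InferInput("THRESHOLDS", list(thresholds.shape), "FP32"),
    ]
    inputs[0].set_data_from_numpy(images)
    inputs[1].set_data_from_numpy(thresholds.astype(np.float32))
    response = client.infer(
        model_name="yolox_ensemble",
        inputs=inputs,
        outputs=[grpcclient.InferRequestedOutput("OUTPUT")],
    )
    return response.as_numpy("OUTPUT")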
@@ -267,7 +266,7 @@ def _extract_page_elements(
     extract_tables: bool,
     extract_charts: bool,
     extract_infographics: bool,
-    paddle_output_format: str,
+    table_output_format: str,
     yolox_endpoints: Tuple[Optional[str], Optional[str]],
     yolox_infer_protocol: str = "http",
     auth_token: Optional[str] = None,

@@ -296,7 +295,7 @@ def _extract_page_elements(
         Flag indicating whether to extract charts.
     extract_infographics : bool
         Flag indicating whether to extract infographics.
-    paddle_output_format : str
+    table_output_format : str
         Format to use for table content.
     yolox_endpoints : Tuple[Optional[str], Optional[str]]
         A tuple containing the gRPC and HTTP endpoints for the YOLOX service.

@@ -317,19 +316,7 @@ def _extract_page_elements(
     try:
         # Default model name
-        yolox_model_name = "yolox"
-
-        # Get the HTTP endpoint to determine the model name if needed
-        yolox_http_endpoint = yolox_endpoints[1]
-        if yolox_http_endpoint:
-            try:
-                yolox_model_name = get_yolox_model_name(yolox_http_endpoint)
-            except Exception as e:
-                logger.warning(f"Failed to get YOLOX model name from endpoint: {e}. Using default.")
-
-        # Create the model interface
-        model_interface = YoloxPageElementsModelInterface(yolox_model_name=yolox_model_name)
-
+        model_interface = YoloxPageElementsModelInterface()
         # Create the inference client
         yolox_client = create_inference_client(
             yolox_endpoints,

@@ -340,7 +327,7 @@ def _extract_page_elements(

         # Extract page elements using the client
         page_element_results = _extract_page_elements_using_image_ensemble(
-            pages, yolox_client, yolox_model_name, execution_trace_log=execution_trace_log
+            pages, yolox_client, execution_trace_log=execution_trace_log
         )

         # Process each extracted element based on extraction flags

@@ -355,7 +342,7 @@ def _extract_page_elements(

         # Set content format for tables
         if page_element.type_string == "table":
-            page_element.content_format = paddle_output_format
+            page_element.content_format = table_output_format

         # Construct metadata for the page element
         page_element_meta = construct_page_element_metadata(

@@ -412,13 +399,13 @@ def pdfium_extractor(
             f"Invalid text_depth: {text_depth_str}. Valid options: {list(TextTypeEnum.__members__.keys())}"
         )

-    # Validate and extract paddle_output_format
-    paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+    # Validate and extract table_output_format
+    table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
     try:
-        paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+        table_output_format = TableFormatEnum[table_output_format_str.upper()]
     except KeyError:
         raise ValueError(
-            f"Invalid paddle_output_format: {paddle_output_format_str}. "
+            f"Invalid table_output_format: {table_output_format_str}. "
             f"Valid options: {list(TableFormatEnum.__members__.keys())}"
         )

@@ -568,7 +555,7 @@ def pdfium_extractor(
             extract_tables,
             extract_charts,
             extract_infographics,
-            paddle_output_format,
+            table_output_format,
             pdfium_config.yolox_endpoints,
             pdfium_config.yolox_infer_protocol,
             pdfium_config.auth_token,

@@ -590,7 +577,7 @@ def pdfium_extractor(
             extract_tables,
             extract_charts,
             extract_infographics,
-            paddle_output_format,
+            table_output_format,
             pdfium_config.yolox_endpoints,
             pdfium_config.yolox_infer_protocol,
             pdfium_config.auth_token,

nv_ingest_api/internal/primitives/nim/model_interface/helpers.py

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import logging
+from typing import Optional

 import backoff
 import cv2

@@ -13,6 +14,7 @@ from nv_ingest_api.internal.primitives.nim.model_interface.decorators import multiprocessing_cache
 from nv_ingest_api.util.image_processing.transforms import pad_image, normalize_image
 from nv_ingest_api.util.string_processing import generate_url, remove_url_endpoints

+cv2.setNumThreads(1)
 logger = logging.getLogger(__name__)
@@ -81,6 +83,63 @@ def preprocess_image_for_paddle(array: np.ndarray, image_max_dimension: int = 960):
     return transposed, metadata


+def preprocess_image_for_ocr(
+    array: np.ndarray,
+    target_height: Optional[int] = None,
+    target_width: Optional[int] = None,
+    pad_how: str = "bottom_right",
+) -> np.ndarray:
+    """
+    Preprocesses an input image to be suitable for use with NemoRetriever-OCR.
+
+    This function is intended for preprocessing images to be passed as input to NemoRetriever-OCR using GRPC.
+    It is not necessary when using the HTTP endpoint.
+
+    Parameters:
+    ----------
+    array : np.ndarray
+        The input image array of shape (height, width, channels). It should have pixel values in the range [0, 255].
+
+    Returns:
+    -------
+    np.ndarray
+        A preprocessed image with the shape (channels, height, width).
+    """
+    height, width = array.shape[:2]
+
+    if target_height is None:
+        target_height = height
+
+    if target_width is None:
+        target_width = width
+
+    padded, (pad_width, pad_height) = pad_image(
+        array,
+        target_height=target_height,
+        target_width=target_width,
+        background_color=255,
+        dtype=np.float32,
+        how=pad_how,
+    )
+
+    padded = padded / 255.0
+
+    # NemoRetriever-OCR NIM (GRPC) requires input to be (channel, height, width).
+    transposed = padded.transpose((2, 0, 1))
+
+    # Metadata can be used for inverting transformations on the resulting bounding boxes.
+    metadata = {
+        "original_height": height,
+        "original_width": width,
+        "new_height": target_height,
+        "new_width": target_width,
+        "pad_height": pad_height,
+        "pad_width": pad_width,
+    }
+
+    return transposed, metadata
+
+
 def is_ready(http_endpoint: str, ready_endpoint: str) -> bool:
     """
     Check if the server at the given endpoint is ready.