PyPI - nv-ingest-api - Versions diffs - 25.7.6.dev20250706__py3-none-any.whl → 25.8.0rc1__py3-none-any.whl - Mend

nv-ingest-api 25.7.6.dev20250706__py3-none-any.whl → 25.8.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (33) hide show

nv_ingest_api/internal/extract/image/table_extractor.py CHANGED Viewed

@@ -15,10 +15,11 @@ import pandas as pd
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskTableExtraction
 from nv_ingest_api.internal.enums.common import TableFormatEnum
-from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
+from nv_ingest_api.internal.primitives.nim.model_interface.ocr import OCRModelInterface
+from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
 from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
-from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_paddle_output
-from nv_ingest_api.util.image_processing.table_and_chart import convert_paddle_response_to_psuedo_markdown
+from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_ocr_output
+from nv_ingest_api.util.image_processing.table_and_chart import convert_ocr_response_to_psuedo_markdown
 from nv_ingest_api.internal.primitives.nim import NimClient
 from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxTableStructureModelInterface
 from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
@@ -60,7 +61,8 @@ def _filter_valid_images(base64_images: List[str]) -> Tuple[List[str], List[np.n
 def _run_inference(
     enable_yolox: bool,
     yolox_client: Any,
-    paddle_client: Any,
+    ocr_client: Any,
+    ocr_model_name: str,
     valid_arrays: List[np.ndarray],
     valid_images: List[str],
     trace_info: Optional[Dict] = None,
@@ -68,32 +70,45 @@ def _run_inference(
     """
     Run inference concurrently for YOLOX (if enabled) and Paddle.
-    Returns a tuple of (yolox_results, paddle_results).
+    Returns a tuple of (yolox_results, ocr_results).
     """
-    data_paddle = {"base64_images": valid_images}
+    data_ocr = {"base64_images": valid_images}
     if enable_yolox:
         data_yolox = {"images": valid_arrays}
-    with ThreadPoolExecutor(max_workers=2) as executor:
-        future_yolox = None
-        if enable_yolox:
-            future_yolox = executor.submit(
-                yolox_client.infer,
-                data=data_yolox,
-                model_name="yolox",
-                stage_name="table_extraction",
-                max_batch_size=8,
-                trace_info=trace_info,
-            )
-        future_paddle = executor.submit(
-            paddle_client.infer,
-            data=data_paddle,
-            model_name="paddle",
+        future_yolox_kwargs = dict(
+            data=data_yolox,
+            model_name="yolox_ensemble",
             stage_name="table_extraction",
-            max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
+            max_batch_size=8,
+            input_names=["INPUT_IMAGES", "THRESHOLDS"],
+            dtypes=["BYTES", "FP32"],
+            output_names=["OUTPUT"],
             trace_info=trace_info,
         )
+    future_ocr_kwargs = dict(
+        data=data_ocr,
+        stage_name="table_extraction",
+        max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
+        trace_info=trace_info,
+    )
+    if ocr_model_name == "paddle":
+        future_ocr_kwargs.update(
+            model_name="paddle",
+        )
+    else:
+        future_ocr_kwargs.update(
+            model_name="scene_text",
+            input_names=["input", "merge_levels"],
+            dtypes=["FP32", "BYTES"],
+            merge_level="word",
+        )
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)
+        future_yolox = None
+        if enable_yolox:
+            future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)
         if enable_yolox:
             try:
                 yolox_results = future_yolox.result()
@@ -104,17 +119,17 @@ def _run_inference(
             yolox_results = [None] * len(valid_images)
         try:
-            paddle_results = future_paddle.result()
+            ocr_results = future_ocr.result()
         except Exception as e:
-            logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
+            logger.error(f"Error calling ocr_client.infer: {e}", exc_info=True)
             raise
-    return yolox_results, paddle_results
+    return yolox_results, ocr_results
 def _validate_inference_results(
     yolox_results: Any,
-    paddle_results: Any,
+    ocr_results: Any,
     valid_arrays: List[Any],
     valid_images: List[str],
 ) -> Tuple[List[Any], List[Any]]:
@@ -123,46 +138,47 @@ def _validate_inference_results(
     If not, default values are assigned. Raises a ValueError if the lengths do not match.
     """
-    if not isinstance(yolox_results, list) or not isinstance(paddle_results, list):
+    if not isinstance(yolox_results, list) or not isinstance(ocr_results, list):
         logger.warning(
-            "Unexpected result types from inference clients: yolox_results=%s, paddle_results=%s. "
+            "Unexpected result types from inference clients: yolox_results=%s, ocr_results=%s. "
             "Proceeding with available results.",
             type(yolox_results).__name__,
-            type(paddle_results).__name__,
+            type(ocr_results).__name__,
         )
         if not isinstance(yolox_results, list):
             yolox_results = [None] * len(valid_arrays)
-        if not isinstance(paddle_results, list):
-            paddle_results = [(None, None)] * len(valid_images)
+        if not isinstance(ocr_results, list):
+            ocr_results = [(None, None)] * len(valid_images)
     if len(yolox_results) != len(valid_arrays):
         raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
-    if len(paddle_results) != len(valid_images):
-        raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")
+    if len(ocr_results) != len(valid_images):
+        raise ValueError(f"Expected {len(valid_images)} ocr results, got {len(ocr_results)}")
-    return yolox_results, paddle_results
+    return yolox_results, ocr_results
 def _update_table_metadata(
     base64_images: List[str],
     yolox_client: Any,
-    paddle_client: Any,
+    ocr_client: Any,
+    ocr_model_name: str,
     worker_pool_size: int = 8,  # Not currently used
     enable_yolox: bool = False,
     trace_info: Optional[Dict] = None,
 ) -> List[Tuple[str, Any, Any, Any]]:
     """
     Given a list of base64-encoded images, this function filters out images that do not meet
-    the minimum size requirements and then calls the PaddleOCR model via paddle_client.infer
+    the minimum size requirements and then calls the OCR model via ocr_client.infer
     to extract table data.
     For each base64-encoded image, the result is a tuple:
-        (base64_image, yolox_result, paddle_text_predictions, paddle_bounding_boxes)
+        (base64_image, yolox_result, ocr_text_predictions, ocr_bounding_boxes)
     Images that do not meet the minimum size are skipped (resulting in placeholders).
-    The paddle_client is expected to handle any necessary batching and concurrency.
+    The ocr_client is expected to handle any necessary batching and concurrency.
     """
-    logger.debug(f"Running table extraction using protocol {paddle_client.protocol}")
+    logger.debug(f"Running table extraction using protocol {ocr_client.protocol}")
     # Initialize the results list with default placeholders.
     results: List[Tuple[str, Any, Any, Any]] = [("", None, None, None)] * len(base64_images)
@@ -174,24 +190,23 @@ def _update_table_metadata(
         return results
     # Run inference concurrently.
-    yolox_results, paddle_results = _run_inference(
+    yolox_results, ocr_results = _run_inference(
         enable_yolox=enable_yolox,
         yolox_client=yolox_client,
-        paddle_client=paddle_client,
+        ocr_client=ocr_client,
+        ocr_model_name=ocr_model_name,
         valid_arrays=valid_arrays,
         valid_images=valid_images,
         trace_info=trace_info,
     )
     # Validate that the inference results have the expected structure.
-    yolox_results, paddle_results = _validate_inference_results(
-        yolox_results, paddle_results, valid_arrays, valid_images
-    )
+    yolox_results, ocr_results = _validate_inference_results(yolox_results, ocr_results, valid_arrays, valid_images)
     # Combine results with the original order.
-    for idx, (yolox_res, paddle_res) in enumerate(zip(yolox_results, paddle_results)):
+    for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
         original_index = valid_indices[idx]
-        results[original_index] = (base64_images[original_index], yolox_res, paddle_res[0], paddle_res[1])
+        results[original_index] = (base64_images[original_index], yolox_res, ocr_res[0], ocr_res[1])
     return results
@@ -199,14 +214,14 @@ def _update_table_metadata(
 def _create_clients(
     yolox_endpoints: Tuple[str, str],
     yolox_protocol: str,
-    paddle_endpoints: Tuple[str, str],
-    paddle_protocol: str,
+    ocr_endpoints: Tuple[str, str],
+    ocr_protocol: str,
     auth_token: str,
 ) -> Tuple[NimClient, NimClient]:
     yolox_model_interface = YoloxTableStructureModelInterface()
-    paddle_model_interface = PaddleOCRModelInterface()
+    ocr_model_interface = OCRModelInterface()
-    logger.debug(f"Inference protocols: yolox={yolox_protocol}, paddle={paddle_protocol}")
+    logger.debug(f"Inference protocols: yolox={yolox_protocol}, ocr={ocr_protocol}")
     yolox_client = create_inference_client(
         endpoints=yolox_endpoints,
@@ -215,14 +230,14 @@ def _create_clients(
         infer_protocol=yolox_protocol,
     )
-    paddle_client = create_inference_client(
-        endpoints=paddle_endpoints,
-        model_interface=paddle_model_interface,
+    ocr_client = create_inference_client(
+        endpoints=ocr_endpoints,
+        model_interface=ocr_model_interface,
         auth_token=auth_token,
-        infer_protocol=paddle_protocol,
+        infer_protocol=ocr_protocol,
     )
-    return yolox_client, paddle_client
+    return yolox_client, ocr_client
 def extract_table_data_from_image_internal(
@@ -262,14 +277,18 @@ def extract_table_data_from_image_internal(
         return df_extraction_ledger, execution_trace_log
     endpoint_config = extraction_config.endpoint_config
-    yolox_client, paddle_client = _create_clients(
+    yolox_client, ocr_client = _create_clients(
         endpoint_config.yolox_endpoints,
         endpoint_config.yolox_infer_protocol,
-        endpoint_config.paddle_endpoints,
-        endpoint_config.paddle_infer_protocol,
+        endpoint_config.ocr_endpoints,
+        endpoint_config.ocr_infer_protocol,
         endpoint_config.auth_token,
     )
+    # Get the grpc endpoint to determine the model if needed
+    ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
+    ocr_model_name = get_ocr_model_name(ocr_grpc_endpoint)
     try:
         # 1) Identify rows that meet criteria (structured, subtype=table, table_metadata != None, content not empty)
         def meets_criteria(row):
@@ -309,7 +328,8 @@ def extract_table_data_from_image_internal(
         bulk_results = _update_table_metadata(
             base64_images=base64_images,
             yolox_client=yolox_client,
-            paddle_client=paddle_client,
+            ocr_client=ocr_client,
+            ocr_model_name=ocr_model_name,
             worker_pool_size=endpoint_config.workers_per_progress_engine,
             enable_yolox=enable_yolox,
             trace_info=execution_trace_log,
@@ -317,15 +337,15 @@ def extract_table_data_from_image_internal(
         # 4) Write the results (bounding_boxes, text_predictions) back
         for row_id, idx in enumerate(valid_indices):
-            # unpack (base64_image, (yolox_predictions, paddle_bounding boxes, paddle_text_predictions))
+            # unpack (base64_image, (yolox_predictions, ocr_bounding boxes, ocr_text_predictions))
             _, cell_predictions, bounding_boxes, text_predictions = bulk_results[row_id]
             if table_content_format == TableFormatEnum.SIMPLE:
                 table_content = " ".join(text_predictions)
             elif table_content_format == TableFormatEnum.PSEUDO_MARKDOWN:
-                table_content = convert_paddle_response_to_psuedo_markdown(bounding_boxes, text_predictions)
+                table_content = convert_ocr_response_to_psuedo_markdown(bounding_boxes, text_predictions)
             elif table_content_format == TableFormatEnum.MARKDOWN:
-                table_content = join_yolox_table_structure_and_paddle_output(
+                table_content = join_yolox_table_structure_and_ocr_output(
                     cell_predictions, bounding_boxes, text_predictions
                 )
             else:
@@ -341,4 +361,4 @@ def extract_table_data_from_image_internal(
         raise
     finally:
         yolox_client.close()
-        paddle_client.close()
+        ocr_client.close()

nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py CHANGED Viewed

@@ -40,6 +40,7 @@ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadat
 from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
     YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
     YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
+    YOLOX_PAGE_IMAGE_FORMAT,
 )
 from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import NemoRetrieverParseConfigSchema
 from nv_ingest_api.util.metadata.aggregators import (
@@ -100,7 +101,7 @@ def nemoretriever_parse_extractor(
             - text_depth : str, optional (default is "page")
             - extract_tables_method : str, optional (default is "yolox")
             - identify_nearby_objects : bool, optional (default is True)
-            - paddle_output_format : str, optional (default is "pseudo_markdown")
+            - table_output_format : str, optional (default is "pseudo_markdown")
             - pdfium_config : dict, optional (configuration for PDFium)
             - nemoretriever_parse_config : dict, optional (configuration for NemoRetrieverParse)
             - metadata_column : str, optional (default is "metadata")
@@ -145,14 +146,14 @@ def nemoretriever_parse_extractor(
     # Flag for identifying nearby objects.
     identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)
-    # Get and validate paddle_output_format.
-    paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+    # Get and validate table_output_format.
+    table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
     try:
-        paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+        table_output_format = TableFormatEnum[table_output_format_str.upper()]
     except KeyError:
         valid_options = [e.name.lower() for e in TableFormatEnum]
         raise ValueError(
-            f"Invalid paddle_output_format value: {paddle_output_format_str}. Expected one of: {valid_options}"
+            f"Invalid table_output_format value: {table_output_format_str}. Expected one of: {valid_options}"
         )
     # Process nemoretriever_parse configuration.
@@ -253,7 +254,7 @@ def nemoretriever_parse_extractor(
                     extract_tables,
                     extract_charts,
                     extract_infographics,
-                    paddle_output_format,
+                    table_output_format,
                     nemoretriever_parse_config.yolox_endpoints,
                     nemoretriever_parse_config.yolox_infer_protocol,
                     nemoretriever_parse_config.auth_token,
@@ -287,7 +288,7 @@ def nemoretriever_parse_extractor(
                 extract_tables,
                 extract_charts,
                 extract_infographics,
-                paddle_output_format,
+                table_output_format,
                 nemoretriever_parse_config.yolox_endpoints,
                 nemoretriever_parse_config.yolox_infer_protocol,
                 nemoretriever_parse_config.auth_token,
@@ -355,7 +356,7 @@ def nemoretriever_parse_extractor(
                 img_numpy = crop_image(page_image, transformed_bbox)
                 if img_numpy is not None:
-                    base64_img = numpy_to_base64(img_numpy)
+                    base64_img = numpy_to_base64(img_numpy, format=YOLOX_PAGE_IMAGE_FORMAT)
                     image = Base64Image(
                         image=base64_img,
                         bbox=transformed_bbox,

nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py CHANGED Viewed

@@ -4,20 +4,21 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import base64
+import inspect
 import io
-import pandas as pd
-from typing import Any, Dict, List, Optional
 import logging
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
-from nv_ingest_api.internal.extract.pdf.engines import (
-    adobe_extractor,
-    llama_parse_extractor,
-    nemoretriever_parse_extractor,
-    pdfium_extractor,
-    tika_extractor,
-    unstructured_io_extractor,
-)
+import pandas as pd
+from nv_ingest_api.internal.extract.pdf.engines import adobe_extractor
+from nv_ingest_api.internal.extract.pdf.engines import llama_parse_extractor
+from nv_ingest_api.internal.extract.pdf.engines import nemoretriever_parse_extractor
+from nv_ingest_api.internal.extract.pdf.engines import pdfium_extractor
+from nv_ingest_api.internal.extract.pdf.engines import tika_extractor
+from nv_ingest_api.internal.extract.pdf.engines import unstructured_io_extractor
 from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
 # Import extraction functions for different engines.
@@ -43,6 +44,7 @@ def _work_extract_pdf(
     extract_infographics: bool,
     extract_tables: bool,
     extract_charts: bool,
+    extract_page_as_image: bool,
     extractor_config: dict,
     execution_trace_log=None,
 ) -> Any:
@@ -52,17 +54,25 @@ def _work_extract_pdf(
     extract_method = extractor_config["extract_method"]
     extractor_fn = EXTRACTOR_LOOKUP.get(extract_method, pdfium_extractor)
-    return extractor_fn(
-        pdf_stream,
-        extract_text,
-        extract_images,
-        extract_infographics,
-        extract_tables,
-        extract_charts,
-        extractor_config,
-        execution_trace_log,
+    extractor_fn_args = dict(
+        pdf_stream=pdf_stream,
+        extract_text=extract_text,
+        extract_images=extract_images,
+        extract_infographics=extract_infographics,
+        extract_tables=extract_tables,
+        extract_charts=extract_charts,
+        extractor_config=extractor_config,
+        execution_trace_log=execution_trace_log,
     )
+    if "extract_page_as_image" in inspect.signature(extractor_fn).parameters:
+        extractor_fn_args["extract_page_as_image"] = extract_page_as_image
+    elif extract_page_as_image:
+        logger.warning(f"`extract_page_as_image` is set to True, but {extract_method} does not support it.")
+    return extractor_fn(**extractor_fn_args)
 @unified_exception_handler
 def _orchestrate_row_extraction(
@@ -97,6 +107,7 @@ def _orchestrate_row_extraction(
         extract_tables = params.pop("extract_tables", False)
         extract_charts = params.pop("extract_charts", False)
         extract_infographics = params.pop("extract_infographics", False)
+        extract_page_as_image = params.pop("extract_page_as_image", False)
         extract_method = params.get("extract_method", "pdfium")
     except KeyError as e:
         raise ValueError(f"Missing required extraction flag: {e}")
@@ -137,6 +148,7 @@ def _orchestrate_row_extraction(
         extract_text=extract_text,
         extract_images=extract_images,
         extract_infographics=extract_infographics,
+        extract_page_as_image=extract_page_as_image,
         extract_tables=extract_tables,
         extract_charts=extract_charts,
         extractor_config=extractor_config,

nv_ingest_api/internal/extract/pdf/engines/pdfium.py CHANGED Viewed

@@ -24,16 +24,18 @@ import numpy as np
 import pandas as pd
 import pypdfium2 as libpdfium
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
 from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
 from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
     YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
     YOLOX_PAGE_IMAGE_PREPROC_HEIGHT,
-    get_yolox_model_name,
     YoloxPageElementsModelInterface,
+    YOLOX_PAGE_IMAGE_FORMAT,
 )
 from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
 from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum
 from nv_ingest_api.util.metadata.aggregators import (
+    construct_image_metadata_from_base64,
     construct_image_metadata_from_pdf_image,
     extract_pdf_metadata,
     construct_text_metadata,
@@ -46,6 +48,7 @@ from nv_ingest_api.util.pdf.pdfium import (
     extract_image_like_objects_from_pdfium_page,
 )
 from nv_ingest_api.util.pdf.pdfium import pdfium_pages_to_numpy
+from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
 from nv_ingest_api.util.image_processing.transforms import numpy_to_base64, crop_image
 logger = logging.getLogger(__name__)
@@ -54,7 +57,6 @@ logger = logging.getLogger(__name__)
 def _extract_page_elements_using_image_ensemble(
     pages: List[Tuple[int, np.ndarray, Tuple[int, int]]],
     yolox_client,
-    yolox_model_name: str = "yolox",
     execution_trace_log: Optional[List] = None,
 ) -> List[Tuple[int, object]]:
     """
@@ -68,8 +70,6 @@ def _extract_page_elements_using_image_ensemble(
         and optional padding offset information.
     yolox_client : object
         A pre-configured client instance for the YOLOX inference service.
-    yolox_model_name : str, default="yolox"
-        The name of the YOLOX model to use for inference.
     execution_trace_log : Optional[List], default=None
         List for accumulating execution trace information.
@@ -102,8 +102,11 @@ def _extract_page_elements_using_image_ensemble(
         # Perform inference using the NimClient.
         inference_results = yolox_client.infer(
             data,
-            model_name="yolox",
+            model_name="yolox_ensemble",
             max_batch_size=YOLOX_MAX_BATCH_SIZE,
+            input_names=["INPUT_IMAGES", "THRESHOLDS"],
+            dtypes=["BYTES", "FP32"],
+            output_names=["OUTPUT"],
             trace_info=execution_trace_log,
             stage_name="pdf_extraction",
         )
@@ -186,7 +189,7 @@ def _extract_page_element_images(
             if cropped is None:
                 continue
-            base64_img = numpy_to_base64(cropped)
+            base64_img = numpy_to_base64(cropped, format=YOLOX_PAGE_IMAGE_FORMAT)
             bbox_in_orig_coord = (
                 int(w1) - pad_width,
@@ -263,7 +266,7 @@ def _extract_page_elements(
     extract_tables: bool,
     extract_charts: bool,
     extract_infographics: bool,
-    paddle_output_format: str,
+    table_output_format: str,
     yolox_endpoints: Tuple[Optional[str], Optional[str]],
     yolox_infer_protocol: str = "http",
     auth_token: Optional[str] = None,
@@ -292,7 +295,7 @@ def _extract_page_elements(
         Flag indicating whether to extract charts.
     extract_infographics : bool
         Flag indicating whether to extract infographics.
-    paddle_output_format : str
+    table_output_format : str
         Format to use for table content.
     yolox_endpoints : Tuple[Optional[str], Optional[str]]
         A tuple containing the gRPC and HTTP endpoints for the YOLOX service.
@@ -313,19 +316,7 @@ def _extract_page_elements(
     try:
         # Default model name
-        yolox_model_name = "yolox"
-        # Get the HTTP endpoint to determine the model name if needed
-        yolox_http_endpoint = yolox_endpoints[1]
-        if yolox_http_endpoint:
-            try:
-                yolox_model_name = get_yolox_model_name(yolox_http_endpoint)
-            except Exception as e:
-                logger.warning(f"Failed to get YOLOX model name from endpoint: {e}. Using default.")
-        # Create the model interface
-        model_interface = YoloxPageElementsModelInterface(yolox_model_name=yolox_model_name)
+        model_interface = YoloxPageElementsModelInterface()
         # Create the inference client
         yolox_client = create_inference_client(
             yolox_endpoints,
@@ -336,7 +327,7 @@ def _extract_page_elements(
         # Extract page elements using the client
         page_element_results = _extract_page_elements_using_image_ensemble(
-            pages, yolox_client, yolox_model_name, execution_trace_log=execution_trace_log
+            pages, yolox_client, execution_trace_log=execution_trace_log
         )
         # Process each extracted element based on extraction flags
@@ -351,7 +342,7 @@ def _extract_page_elements(
             # Set content format for tables
             if page_element.type_string == "table":
-                page_element.content_format = paddle_output_format
+                page_element.content_format = table_output_format
             # Construct metadata for the page element
             page_element_meta = construct_page_element_metadata(
@@ -384,6 +375,7 @@ def pdfium_extractor(
     extract_infographics: bool,
     extract_tables: bool,
     extract_charts: bool,
+    extract_page_as_image: bool,
     extractor_config: dict,
     execution_trace_log: Optional[List[Any]] = None,
 ) -> pd.DataFrame:
@@ -407,13 +399,13 @@ def pdfium_extractor(
             f"Invalid text_depth: {text_depth_str}. Valid options: {list(TextTypeEnum.__members__.keys())}"
         )
-    # Validate and extract paddle_output_format
-    paddle_output_format_str = extractor_config.get("paddle_output_format", "pseudo_markdown")
+    # Validate and extract table_output_format
+    table_output_format_str = extractor_config.get("table_output_format", "pseudo_markdown")
     try:
-        paddle_output_format = TableFormatEnum[paddle_output_format_str.upper()]
+        table_output_format = TableFormatEnum[table_output_format_str.upper()]
     except KeyError:
         raise ValueError(
-            f"Invalid paddle_output_format: {paddle_output_format_str}. "
+            f"Invalid table_output_format: {table_output_format_str}. "
             f"Valid options: {list(TableFormatEnum.__members__.keys())}"
         )
@@ -524,6 +516,24 @@ def pdfium_extractor(
                 )
                 extracted_data.extend(image_data)
+            # Full page image extraction
+            if extract_page_as_image:
+                page_text = _extract_page_text(page)
+                image, _ = pdfium_pages_to_numpy([page], scale_tuple=(16384, 16384), trace_info=execution_trace_log)
+                base64_image = numpy_to_base64(image[0])
+                if len(base64_image) > 2**24 - 1:
+                    base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
+                image_meta = construct_image_metadata_from_base64(
+                    base64_image,
+                    page_idx,
+                    page_count,
+                    source_metadata,
+                    base_unified_metadata,
+                    subtype=ContentTypeEnum.PAGE_IMAGE,
+                    text=page_text,
+                )
+                extracted_data.append(image_meta)
             # If we want tables or charts, rasterize the page and store it
             if extract_tables or extract_charts or extract_infographics:
                 image, padding_offsets = pdfium_pages_to_numpy(
@@ -545,7 +555,7 @@ def pdfium_extractor(
                         extract_tables,
                         extract_charts,
                         extract_infographics,
-                        paddle_output_format,
+                        table_output_format,
                         pdfium_config.yolox_endpoints,
                         pdfium_config.yolox_infer_protocol,
                         pdfium_config.auth_token,
@@ -567,13 +577,14 @@ def pdfium_extractor(
                 extract_tables,
                 extract_charts,
                 extract_infographics,
-                paddle_output_format,
+                table_output_format,
                 pdfium_config.yolox_endpoints,
                 pdfium_config.yolox_infer_protocol,
                 pdfium_config.auth_token,
                 execution_trace_log=execution_trace_log,
             )
             futures.append(future)
             pages_for_tables.clear()
         # Wait for all asynchronous jobs to complete.

nv-ingest-api 25.7.6.dev20250706__py3-none-any.whl → 25.8.0rc1__py3-none-any.whl

Potentially problematic release.

nv-ingest-api 25.7.6.dev20250706__py3-none-any.whl → 25.8.0rc1__py3-none-any.whl