PyPI - nv-ingest-api - Versions diffs - 26.1.0rc4__py3-none-any.whl - Mend

nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show

nv_ingest_api/__init__.py +3 -0
nv_ingest_api/interface/__init__.py +218 -0
nv_ingest_api/interface/extract.py +977 -0
nv_ingest_api/interface/mutate.py +154 -0
nv_ingest_api/interface/store.py +200 -0
nv_ingest_api/interface/transform.py +382 -0
nv_ingest_api/interface/utility.py +186 -0
nv_ingest_api/internal/__init__.py +0 -0
nv_ingest_api/internal/enums/__init__.py +3 -0
nv_ingest_api/internal/enums/common.py +550 -0
nv_ingest_api/internal/extract/__init__.py +3 -0
nv_ingest_api/internal/extract/audio/__init__.py +3 -0
nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
nv_ingest_api/internal/extract/docx/__init__.py +5 -0
nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
nv_ingest_api/internal/extract/html/__init__.py +3 -0
nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
nv_ingest_api/internal/extract/image/__init__.py +3 -0
nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
nv_ingest_api/internal/meta/__init__.py +3 -0
nv_ingest_api/internal/meta/udf.py +232 -0
nv_ingest_api/internal/mutate/__init__.py +3 -0
nv_ingest_api/internal/mutate/deduplicate.py +110 -0
nv_ingest_api/internal/mutate/filter.py +133 -0
nv_ingest_api/internal/primitives/__init__.py +0 -0
nv_ingest_api/internal/primitives/control_message_task.py +16 -0
nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
nv_ingest_api/internal/schemas/__init__.py +3 -0
nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
nv_ingest_api/internal/schemas/meta/udf.py +23 -0
nv_ingest_api/internal/schemas/mixins.py +39 -0
nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
nv_ingest_api/internal/schemas/store/__init__.py +3 -0
nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
nv_ingest_api/internal/store/__init__.py +3 -0
nv_ingest_api/internal/store/embed_text_upload.py +236 -0
nv_ingest_api/internal/store/image_upload.py +251 -0
nv_ingest_api/internal/transform/__init__.py +3 -0
nv_ingest_api/internal/transform/caption_image.py +219 -0
nv_ingest_api/internal/transform/embed_text.py +702 -0
nv_ingest_api/internal/transform/split_text.py +182 -0
nv_ingest_api/util/__init__.py +3 -0
nv_ingest_api/util/control_message/__init__.py +0 -0
nv_ingest_api/util/control_message/validators.py +47 -0
nv_ingest_api/util/converters/__init__.py +0 -0
nv_ingest_api/util/converters/bytetools.py +78 -0
nv_ingest_api/util/converters/containers.py +65 -0
nv_ingest_api/util/converters/datetools.py +90 -0
nv_ingest_api/util/converters/dftools.py +127 -0
nv_ingest_api/util/converters/formats.py +64 -0
nv_ingest_api/util/converters/type_mappings.py +27 -0
nv_ingest_api/util/dataloader/__init__.py +9 -0
nv_ingest_api/util/dataloader/dataloader.py +409 -0
nv_ingest_api/util/detectors/__init__.py +5 -0
nv_ingest_api/util/detectors/language.py +38 -0
nv_ingest_api/util/exception_handlers/__init__.py +0 -0
nv_ingest_api/util/exception_handlers/converters.py +72 -0
nv_ingest_api/util/exception_handlers/decorators.py +429 -0
nv_ingest_api/util/exception_handlers/detectors.py +74 -0
nv_ingest_api/util/exception_handlers/pdf.py +116 -0
nv_ingest_api/util/exception_handlers/schemas.py +68 -0
nv_ingest_api/util/image_processing/__init__.py +5 -0
nv_ingest_api/util/image_processing/clustering.py +260 -0
nv_ingest_api/util/image_processing/processing.py +177 -0
nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
nv_ingest_api/util/image_processing/transforms.py +850 -0
nv_ingest_api/util/imports/__init__.py +3 -0
nv_ingest_api/util/imports/callable_signatures.py +108 -0
nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
nv_ingest_api/util/introspection/__init__.py +3 -0
nv_ingest_api/util/introspection/class_inspect.py +145 -0
nv_ingest_api/util/introspection/function_inspect.py +65 -0
nv_ingest_api/util/logging/__init__.py +0 -0
nv_ingest_api/util/logging/configuration.py +102 -0
nv_ingest_api/util/logging/sanitize.py +84 -0
nv_ingest_api/util/message_brokers/__init__.py +3 -0
nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
nv_ingest_api/util/metadata/__init__.py +5 -0
nv_ingest_api/util/metadata/aggregators.py +516 -0
nv_ingest_api/util/multi_processing/__init__.py +8 -0
nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
nv_ingest_api/util/nim/__init__.py +161 -0
nv_ingest_api/util/pdf/__init__.py +3 -0
nv_ingest_api/util/pdf/pdfium.py +428 -0
nv_ingest_api/util/schema/__init__.py +3 -0
nv_ingest_api/util/schema/schema_validator.py +10 -0
nv_ingest_api/util/service_clients/__init__.py +3 -0
nv_ingest_api/util/service_clients/client_base.py +86 -0
nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
nv_ingest_api/util/string_processing/__init__.py +51 -0
nv_ingest_api/util/string_processing/configuration.py +682 -0
nv_ingest_api/util/string_processing/yaml.py +109 -0
nv_ingest_api/util/system/__init__.py +0 -0
nv_ingest_api/util/system/hardware_info.py +594 -0
nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
udfs/__init__.py +5 -0
udfs/llm_summarizer_udf.py +259 -0

nv_ingest_api/internal/transform/caption_image.py ADDED Viewed

@@ -0,0 +1,219 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+import pandas as pd
+from pydantic import BaseModel
+from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
+from nv_ingest_api.util.nim import create_inference_client
+logger = logging.getLogger(__name__)
def _prepare_dataframes_mod(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
    """
    Split out the IMAGE rows of a DataFrame based on its "document_type" column.

    When the DataFrame is empty, or has no "document_type" column, nothing can be matched
    and empty placeholders are returned next to the untouched input.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame; expected to carry a "document_type" column.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame, pd.Series]
        A tuple containing:
          - The original DataFrame (the same object, not a copy).
          - The rows whose "document_type" equals ``ContentTypeEnum.IMAGE``; an empty
            DataFrame when no matching was possible.
          - The boolean mask over ``df`` that selected those rows; an empty boolean
            Series when no matching was possible.

    Raises
    ------
    Exception
        Any failure is logged and re-raised. Where the exception class can be rebuilt from
        a single message, a new instance of the same class carrying added context is raised
        (chained to the original); otherwise the original exception is re-raised unchanged.
    """
    try:
        if df.empty or "document_type" not in df.columns:
            return df, pd.DataFrame(), pd.Series(dtype=bool)
        bool_index: pd.Series = df["document_type"] == ContentTypeEnum.IMAGE
        df_matched: pd.DataFrame = df.loc[bool_index]
        return df, df_matched, bool_index
    except Exception as e:
        err_msg = f"_prepare_dataframes_mod: Error preparing dataframes. Original error: {e}"
        logger.error(err_msg, exc_info=True)
        # Not every exception class accepts a single message argument (for example
        # UnicodeDecodeError). Rebuilding blindly would raise a TypeError from inside this
        # handler and hide the real failure.
        try:
            contextual_error = type(e)(err_msg)
        except Exception:
            contextual_error = None
        if contextual_error is None:
            raise
        raise contextual_error from e
def _generate_captions(
    base64_images: List[str],
    prompt: str,
    system_prompt: Optional[str],
    api_key: str,
    endpoint_url: str,
    model_name: str,
) -> List[str]:
    """
    Generates captions for a list of base64-encoded images using the VLM model API.

    This function performs the following steps:
      1. Returns an empty list immediately when there are no images (no client is created).
      2. Scales each image using `scale_image_to_encoding_size` and keeps the first element
         of its result as the image to send.
      3. Builds the input payload from the scaled images, the prompt and, when provided,
         the system prompt.
      4. Creates an HTTP inference client around `VLMModelInterface`.
      5. Calls the client's `infer` method and returns its result.

    Parameters
    ----------
    base64_images : List[str]
        List of base64-encoded image strings.
    prompt : str
        Text prompt to guide caption generation.
    system_prompt : Optional[str]
        Optional system prompt. It is added to the payload only when truthy.
    api_key : str
        API key passed to the inference client as its auth token.
    endpoint_url : str
        URL of the VLM model HTTP endpoint.
    model_name : str
        The name of the model to use for inference.

    Returns
    -------
    List[str]
        The captions returned by the inference client; expected to hold one caption per
        input image. Empty when ``base64_images`` is empty.

    Raises
    ------
    Exception
        Any failure is logged and re-raised. Where the exception class can be rebuilt from
        a single message, a new instance of the same class carrying added context is raised
        (chained to the original); otherwise the original exception is re-raised unchanged.
    """
    # Nothing to caption: avoid building a client and issuing a request for no images.
    if not base64_images:
        return []

    try:
        # Scale each image to ensure it meets encoding size requirements.
        scaled_images: List[str] = [scale_image_to_encoding_size(b64)[0] for b64 in base64_images]

        # Build the input payload for the VLM model.
        data: Dict[str, Any] = {
            "base64_images": scaled_images,
            "prompt": prompt,
        }
        if system_prompt:
            data["system_prompt"] = system_prompt

        # Create the inference client using the VLMModelInterface (HTTP only, so the
        # first endpoint slot is left as None).
        nim_client = create_inference_client(
            model_interface=VLMModelInterface(),
            endpoints=(None, endpoint_url),
            auth_token=api_key,
            infer_protocol="http",
        )

        # Perform inference to generate captions.
        captions: List[str] = nim_client.infer(data, model_name=model_name)
        return captions
    except Exception as e:
        err_msg = f"_generate_captions: Error generating captions: {e}"
        logger.error(err_msg, exc_info=True)
        # Some exception classes (for example json.JSONDecodeError) cannot be constructed
        # from a single message. Rebuilding blindly would raise a TypeError from inside
        # this handler and hide the real failure.
        try:
            contextual_error = type(e)(err_msg)
        except Exception:
            contextual_error = None
        if contextual_error is None:
            raise
        raise contextual_error from e
@unified_exception_handler
def transform_image_create_vlm_caption_internal(
    df_transform_ledger: pd.DataFrame,
    task_config: Union[BaseModel, Dict[str, Any]],
    transform_config: Any,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Extracts and adds captions for image content in a DataFrame using the VLM model API.

    Rows are selected when ``metadata["content_metadata"]["type"] == "image"``. The
    ``metadata["content"]`` value of every selected row is sent for captioning in a single
    call, and each returned caption is stored under ``metadata["image_metadata"]["caption"]``.
    The metadata dictionaries, and therefore the input DataFrame, are updated in place.

    Parameters
    ----------
    df_transform_ledger : pd.DataFrame
        The input DataFrame. Each row must have a 'metadata' column holding a dictionary
        with at least the 'content' key for image rows and, for selection, the
        'content_metadata' key.
    task_config : Union[BaseModel, Dict[str, Any]]
        Configuration parameters for caption extraction. If provided as a Pydantic model,
        it is converted to a dictionary. Recognised keys are "api_key", "prompt",
        "system_prompt", "endpoint_url", and "model_name".
    transform_config : Any
        A configuration object providing default values. It must expose the attributes
        api_key, prompt, system_prompt, endpoint_url, and model_name. A default is used
        whenever the corresponding task_config value is missing or falsy.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pd.DataFrame
        The same DataFrame object that was passed in, with captions added to the
        'image_metadata.caption' field within the 'metadata' column of each image row.

    Raises
    ------
    Exception
        Errors raised here pass through the `unified_exception_handler` decorator.
    """
    _ = execution_trace_log  # Unused; kept so the signature matches sibling transforms.

    logger.debug("Attempting to caption image content")

    # Convert task_config to dictionary if it is a Pydantic model.
    if isinstance(task_config, BaseModel):
        task_config = task_config.model_dump()

    # Retrieve configuration values with fallback to transform_config defaults.
    api_key: str = task_config.get("api_key") or transform_config.api_key
    prompt: str = task_config.get("prompt") or transform_config.prompt
    system_prompt: Optional[str] = task_config.get("system_prompt") or transform_config.system_prompt
    endpoint_url: str = task_config.get("endpoint_url") or transform_config.endpoint_url
    model_name: str = task_config.get("model_name") or transform_config.model_name

    # Create a mask for rows where the content type is "image". `or {}` also covers a
    # 'content_metadata' key that is present but set to None.
    df_mask: pd.Series = df_transform_ledger["metadata"].apply(
        lambda meta: (meta.get("content_metadata") or {}).get("type") == "image"
    )

    # If no image rows exist, return the original DataFrame.
    if not df_mask.any():
        return df_transform_ledger

    # Collect base64-encoded images from the rows where the content type is "image".
    base64_images: List[str] = df_transform_ledger.loc[df_mask, "metadata"].apply(lambda meta: meta["content"]).tolist()

    # Generate captions for the collected images.
    captions: List[str] = _generate_captions(
        base64_images,
        prompt,
        system_prompt,
        api_key,
        endpoint_url,
        model_name,
    )

    # zip() below stops at the shorter sequence, so a short response would otherwise
    # leave trailing image rows uncaptioned without any trace.
    if len(captions) != len(base64_images):
        logger.warning(
            "Caption count (%d) does not match image count (%d); unmatched rows are left without a caption",
            len(captions),
            len(base64_images),
        )

    # Update the DataFrame: assign each generated caption to the corresponding row.
    for idx, caption in zip(df_transform_ledger.loc[df_mask].index, captions):
        meta: Dict[str, Any] = df_transform_ledger.at[idx, "metadata"]
        # `or {}` also covers an 'image_metadata' key that is present but set to None.
        image_meta: Dict[str, Any] = meta.get("image_metadata") or {}
        image_meta["caption"] = caption
        meta["image_metadata"] = image_meta
        df_transform_ledger.at[idx, "metadata"] = meta

    logger.debug("Image content captioning complete")
    return df_transform_ledger