PyPI - nv-ingest-api - Versions diffs - 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.23.dev20250423__py3-none-any.whl - Mend

nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.23.dev20250423__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show

nv_ingest_api/internal/enums/common.py ADDED Viewed

@@ -0,0 +1,494 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from enum import Enum
+from typing import Type, Any
+logger = logging.getLogger(__name__)
+class AccessLevelEnum(int, Enum):
+    """
+    Enum for representing different access levels.
+    Members mix in ``int``, so they compare equal to their plain integer values.
+    Note
+    ----
+    This is for future use, and currently has no functional use case.
+    Attributes
+    ----------
+    UNKNOWN : int
+        Represents an unknown access level (value -1).
+    LEVEL_1 : int
+        Represents access level 1.
+    LEVEL_2 : int
+        Represents access level 2.
+    LEVEL_3 : int
+        Represents access level 3.
+    """
+    UNKNOWN: int = -1
+    LEVEL_1: int = 1
+    LEVEL_2: int = 2
+    LEVEL_3: int = 3
+class ContentDescriptionEnum(str, Enum):
+    """
+    Enum for standard content descriptions extracted from different source types.
+    Each member's value is the human-readable description sentence itself, and
+    members mix in ``str``, so they compare equal to that plain string.
+    Attributes
+    ----------
+    DOCX_IMAGE : str
+        Description for image extracted from DOCX document.
+    DOCX_TABLE : str
+        Description for structured table extracted from DOCX document.
+    DOCX_TEXT : str
+        Description for unstructured text from DOCX document.
+    PDF_CHART : str
+        Description for structured chart extracted from PDF document.
+    PDF_IMAGE : str
+        Description for image extracted from PDF document.
+    PDF_INFOGRAPHIC : str
+        Description for structured infographic extracted from PDF document.
+    PDF_TABLE : str
+        Description for structured table extracted from PDF document.
+    PDF_TEXT : str
+        Description for unstructured text from PDF document.
+    PPTX_IMAGE : str
+        Description for image extracted from PPTX presentation.
+    PPTX_TABLE : str
+        Description for structured table extracted from PPTX presentation.
+    PPTX_TEXT : str
+        Description for unstructured text from PPTX presentation.
+    """
+    DOCX_IMAGE: str = "Image extracted from DOCX document."
+    DOCX_TABLE: str = "Structured table extracted from DOCX document."
+    DOCX_TEXT: str = "Unstructured text from DOCX document."
+    PDF_CHART: str = "Structured chart extracted from PDF document."
+    PDF_IMAGE: str = "Image extracted from PDF document."
+    PDF_INFOGRAPHIC: str = "Structured infographic extracted from PDF document."
+    PDF_TABLE: str = "Structured table extracted from PDF document."
+    PDF_TEXT: str = "Unstructured text from PDF document."
+    PPTX_IMAGE: str = "Image extracted from PPTX presentation."
+    PPTX_TABLE: str = "Structured table extracted from PPTX presentation."
+    PPTX_TEXT: str = "Unstructured text from PPTX presentation."
+class ContentTypeEnum(str, Enum):
+    """
+    Enum for representing various content types.
+    Note: Content type declares the broad category of the content, such as text, image, audio, etc.
+    This is not equivalent to the Document type, which is a specific file format.
+    Attributes
+    ----------
+    AUDIO : str
+        Represents audio content.
+    CHART : str
+        Represents chart content.
+    EMBEDDING : str
+        Represents embedding content.
+    IMAGE : str
+        Represents image content.
+    INFOGRAPHIC : str
+        Represents infographic content.
+    INFO_MSG : str
+        Represents an informational message (value ``"info_message"``).
+    NONE : str
+        Represents no content type (value ``"none"``).
+    STRUCTURED : str
+        Represents structured content.
+    TABLE : str
+        Represents table content.
+    TEXT : str
+        Represents text content.
+    UNKNOWN : str
+        Represents an unknown content type.
+    VIDEO : str
+        Represents video content.
+    """
+    AUDIO: str = "audio"
+    CHART: str = "chart"
+    EMBEDDING: str = "embedding"
+    IMAGE: str = "image"
+    INFOGRAPHIC: str = "infographic"
+    INFO_MSG: str = "info_message"
+    NONE: str = "none"
+    STRUCTURED: str = "structured"
+    TABLE: str = "table"
+    TEXT: str = "text"
+    UNKNOWN: str = "unknown"
+    VIDEO: str = "video"
+class DocumentTypeEnum(str, Enum):
+    """
+    Enum for representing various document file types.
+    Note: Document type refers to the specific file format of the content, such as PDF, DOCX, etc.
+    This is not equivalent to the Content type, which is a broad category of the content.
+    Note: ``TXT`` and ``MD`` share the value ``"text"``. Because ``Enum`` treats a repeated
+    value as an alias, ``DocumentTypeEnum.MD`` is the same member as ``DocumentTypeEnum.TXT``
+    and does not appear when iterating over the enum.
+    Attributes
+    ----------
+    BMP: str
+        BMP image format.
+    DOCX: str
+        Microsoft Word document format.
+    HTML: str
+        HTML document.
+    JPEG: str
+        JPEG image format.
+    PDF: str
+        PDF document format.
+    PNG: str
+        PNG image format.
+    PPTX: str
+        PowerPoint presentation format.
+    SVG: str
+        SVG image format.
+    TIFF: str
+        TIFF image format.
+    TXT: str
+        Plain text file (value ``"text"``, not ``"txt"``).
+    MD: str
+        Presumably a Markdown file; alias of ``TXT`` (same value ``"text"``).
+    MP3: str
+        MP3 audio format.
+    WAV: str
+        WAV audio format.
+    UNKNOWN: str
+        Represents an unknown document type.
+    """
+    BMP: str = "bmp"
+    DOCX: str = "docx"
+    HTML: str = "html"
+    JPEG: str = "jpeg"
+    PDF: str = "pdf"
+    PNG: str = "png"
+    PPTX: str = "pptx"
+    SVG: str = "svg"
+    TIFF: str = "tiff"
+    TXT: str = "text"
+    # Same value as TXT, so this defines an alias rather than a distinct member.
+    MD: str = "text"
+    MP3: str = "mp3"
+    WAV: str = "wav"
+    UNKNOWN: str = "unknown"
+class LanguageEnum(str, Enum):
+    """
+    Enum for representing various language codes.
+    Values are lowercase codes; the two Chinese variants use the hyphenated
+    forms ``"zh-cn"`` and ``"zh-tw"``, and ``UNKNOWN`` uses ``"unknown"``.
+    Attributes
+    ----------
+    AF : str
+        Afrikaans language code.
+    AR : str
+        Arabic language code.
+    BG : str
+        Bulgarian language code.
+    BN : str
+        Bengali language code.
+    CA : str
+        Catalan language code.
+    CS : str
+        Czech language code.
+    CY : str
+        Welsh language code.
+    DA : str
+        Danish language code.
+    DE : str
+        German language code.
+    EL : str
+        Greek language code.
+    EN : str
+        English language code.
+    ES : str
+        Spanish language code.
+    ET : str
+        Estonian language code.
+    FA : str
+        Persian language code.
+    FI : str
+        Finnish language code.
+    FR : str
+        French language code.
+    GU : str
+        Gujarati language code.
+    HE : str
+        Hebrew language code.
+    HI : str
+        Hindi language code.
+    HR : str
+        Croatian language code.
+    HU : str
+        Hungarian language code.
+    ID : str
+        Indonesian language code.
+    IT : str
+        Italian language code.
+    JA : str
+        Japanese language code.
+    KN : str
+        Kannada language code.
+    KO : str
+        Korean language code.
+    LT : str
+        Lithuanian language code.
+    LV : str
+        Latvian language code.
+    MK : str
+        Macedonian language code.
+    ML : str
+        Malayalam language code.
+    MR : str
+        Marathi language code.
+    NE : str
+        Nepali language code.
+    NL : str
+        Dutch language code.
+    NO : str
+        Norwegian language code.
+    PA : str
+        Punjabi language code.
+    PL : str
+        Polish language code.
+    PT : str
+        Portuguese language code.
+    RO : str
+        Romanian language code.
+    RU : str
+        Russian language code.
+    SK : str
+        Slovak language code.
+    SL : str
+        Slovenian language code.
+    SO : str
+        Somali language code.
+    SQ : str
+        Albanian language code.
+    SV : str
+        Swedish language code.
+    SW : str
+        Swahili language code.
+    TA : str
+        Tamil language code.
+    TE : str
+        Telugu language code.
+    TH : str
+        Thai language code.
+    TL : str
+        Tagalog language code.
+    TR : str
+        Turkish language code.
+    UK : str
+        Ukrainian language code.
+    UR : str
+        Urdu language code.
+    VI : str
+        Vietnamese language code.
+    ZH_CN : str
+        Chinese (Simplified) language code.
+    ZH_TW : str
+        Chinese (Traditional) language code.
+    UNKNOWN : str
+        Represents an unknown language.
+    """
+    AF: str = "af"
+    AR: str = "ar"
+    BG: str = "bg"
+    BN: str = "bn"
+    CA: str = "ca"
+    CS: str = "cs"
+    CY: str = "cy"
+    DA: str = "da"
+    DE: str = "de"
+    EL: str = "el"
+    EN: str = "en"
+    ES: str = "es"
+    ET: str = "et"
+    FA: str = "fa"
+    FI: str = "fi"
+    FR: str = "fr"
+    GU: str = "gu"
+    HE: str = "he"
+    HI: str = "hi"
+    HR: str = "hr"
+    HU: str = "hu"
+    ID: str = "id"
+    IT: str = "it"
+    JA: str = "ja"
+    KN: str = "kn"
+    KO: str = "ko"
+    LT: str = "lt"
+    LV: str = "lv"
+    MK: str = "mk"
+    ML: str = "ml"
+    MR: str = "mr"
+    NE: str = "ne"
+    NL: str = "nl"
+    NO: str = "no"
+    PA: str = "pa"
+    PL: str = "pl"
+    PT: str = "pt"
+    RO: str = "ro"
+    RU: str = "ru"
+    SK: str = "sk"
+    SL: str = "sl"
+    SO: str = "so"
+    SQ: str = "sq"
+    SV: str = "sv"
+    SW: str = "sw"
+    TA: str = "ta"
+    TE: str = "te"
+    TH: str = "th"
+    TL: str = "tl"
+    TR: str = "tr"
+    UK: str = "uk"
+    UR: str = "ur"
+    VI: str = "vi"
+    ZH_CN: str = "zh-cn"
+    ZH_TW: str = "zh-tw"
+    UNKNOWN: str = "unknown"
+    @classmethod
+    def has_value(cls: Type["LanguageEnum"], value: Any) -> bool:
+        """
+        Check if the enum contains the given value.
+        The check is an exact membership test against the enum's value-to-member
+        map, so it is case-sensitive (``"en"`` matches, ``"EN"`` does not).
+        Parameters
+        ----------
+        value : Any
+            The value to check against the enum members.
+        Returns
+        -------
+        bool
+            True if the value exists in the enum, False otherwise.
+        Raises
+        ------
+        TypeError
+            If ``value`` is unhashable (e.g. a list), because the lookup is a
+            dict membership test.
+        """
+        # NOTE(review): relies on the private Enum attribute `_value2member_map_`.
+        return value in cls._value2member_map_
+class StatusEnum(str, Enum):
+    """
+    Enum for representing status messages.
+    Members mix in ``str``, so they compare equal to their plain string values
+    (``"error"`` and ``"success"``).
+    Attributes
+    ----------
+    ERROR : str
+        Represents an error status.
+    SUCCESS : str
+        Represents a success status.
+    """
+    ERROR: str = "error"
+    SUCCESS: str = "success"
+class TableFormatEnum(str, Enum):
+    """
+    Enum for representing table formats.
+    Members mix in ``str``; each value is the lowercase form of the member name.
+    Attributes
+    ----------
+    HTML : str
+        Represents HTML table format.
+    IMAGE : str
+        Represents image table format.
+    LATEX : str
+        Represents LaTeX table format.
+    MARKDOWN : str
+        Represents Markdown table format.
+    PSEUDO_MARKDOWN : str
+        Represents pseudo Markdown table format.
+    SIMPLE : str
+        Represents simple table format.
+    """
+    HTML: str = "html"
+    IMAGE: str = "image"
+    LATEX: str = "latex"
+    MARKDOWN: str = "markdown"
+    PSEUDO_MARKDOWN: str = "pseudo_markdown"
+    SIMPLE: str = "simple"
+class TaskTypeEnum(str, Enum):
+    """
+    Enum for representing various task types.
+    Members mix in ``str``; each value is the lowercase form of the member name.
+    Attributes are listed below in definition order.
+    Attributes
+    ----------
+    AUDIO_DATA_EXTRACT : str
+        Represents a task for extracting audio data.
+    CAPTION : str
+        Represents a caption task.
+    CHART_DATA_EXTRACT : str
+        Represents a task for extracting chart data.
+    DEDUP : str
+        Represents a deduplication task.
+    EMBED : str
+        Represents an embedding task.
+    EXTRACT : str
+        Represents an extraction task.
+    FILTER : str
+        Represents a filtering task.
+    INFOGRAPHIC_DATA_EXTRACT : str
+        Represents a task for extracting infographic data.
+    SPLIT : str
+        Represents a splitting task.
+    STORE_EMBEDDING : str
+        Represents a task for storing embeddings.
+    STORE : str
+        Represents a storing task.
+    TABLE_DATA_EXTRACT : str
+        Represents a task for extracting table data.
+    VDB_UPLOAD : str
+        Represents a task for uploading to a vector database.
+    """
+    AUDIO_DATA_EXTRACT: str = "audio_data_extract"
+    CAPTION: str = "caption"
+    CHART_DATA_EXTRACT: str = "chart_data_extract"
+    DEDUP: str = "dedup"
+    EMBED: str = "embed"
+    EXTRACT: str = "extract"
+    FILTER: str = "filter"
+    INFOGRAPHIC_DATA_EXTRACT: str = "infographic_data_extract"
+    SPLIT: str = "split"
+    STORE_EMBEDDING: str = "store_embedding"
+    STORE: str = "store"
+    TABLE_DATA_EXTRACT: str = "table_data_extract"
+    VDB_UPLOAD: str = "vdb_upload"
+class TextTypeEnum(str, Enum):
+    """
+    Enum for representing different types of text segments.
+    Members mix in ``str``; each value is the lowercase form of the member name.
+    Attributes
+    ----------
+    BLOCK : str
+        Represents a text block.
+    BODY : str
+        Represents body text.
+    DOCUMENT : str
+        Represents an entire document.
+    HEADER : str
+        Represents a header text.
+    LINE : str
+        Represents a single line of text.
+    NEARBY_BLOCK : str
+        Represents a block of text in close proximity to another.
+    OTHER : str
+        Represents other unspecified text type.
+    PAGE : str
+        Represents a page of text.
+    SPAN : str
+        Represents an inline text span.
+    """
+    BLOCK: str = "block"
+    BODY: str = "body"
+    DOCUMENT: str = "document"
+    HEADER: str = "header"
+    LINE: str = "line"
+    NEARBY_BLOCK: str = "nearby_block"
+    OTHER: str = "other"
+    PAGE: str = "page"
+    SPAN: str = "span"

nv_ingest_api/internal/extract/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

nv_ingest_api/internal/extract/audio/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

nv_ingest_api/internal/extract/audio/audio_extraction.py ADDED Viewed

@@ -0,0 +1,149 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+import pandas as pd
+from typing import Any
+from typing import Dict
+from typing import Optional
+from typing import Tuple
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+from nv_ingest_api.internal.primitives.nim.model_interface.parakeet import create_audio_inference_client
+from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
+from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema, AudioMetadataSchema
+from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+from nv_ingest_api.util.schema.schema_validator import validate_schema
+logger = logging.getLogger(__name__)
+@unified_exception_handler
+def _update_audio_metadata(row: pd.Series, audio_client: Any, trace_info: Dict) -> Dict:
+    """
+    Modifies the metadata of a row if the conditions for table extraction are met.
+    Parameters
+    ----------
+    row : pd.Series
+        A row from the DataFrame containing metadata for the audio extraction.
+    audio_client : Any
+        The client used to call the audio inference model.
+    trace_info : Dict
+        Trace information used for logging or debugging.
+    Returns
+    -------
+    Dict
+        The modified metadata if conditions are met, otherwise the original metadata.
+    Raises
+    ------
+    ValueError
+        If critical information (such as metadata) is missing from the row.
+    """
+    metadata = row.get("metadata")
+    if metadata is None:
+        logger.error("Row does not contain 'metadata'.")
+        raise ValueError("Row does not contain 'metadata'.")
+    base64_audio = metadata.pop("content")
+    content_metadata = metadata.get("content_metadata", {})
+    # Only modify if content type is audio
+    if (content_metadata.get("type") != ContentTypeEnum.AUDIO) or (base64_audio in (None, "")):
+        return metadata
+    # Modify audio metadata with the result from the inference model
+    audio_result = audio_client.infer(
+        base64_audio,
+        model_name="parakeet",
+        trace_info=trace_info,  # traceable_func arg
+        stage_name="audio_extraction",
+    )
+    row["document_type"] = ContentTypeEnum.AUDIO
+    audio_metadata = {"audio_transcript": audio_result}
+    metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
+    row["metadata"] = validate_schema(metadata, MetadataSchema).model_dump()
+    return metadata
+def extract_text_from_audio_internal(
+    df_extraction_ledger: pd.DataFrame,
+    task_config: Dict[str, Any],
+    extraction_config: AudioExtractorSchema,
+    execution_trace_log: Optional[Dict] = None,
+) -> Tuple[pd.DataFrame, Dict]:
+    """
+    Extracts audio data from a DataFrame.
+    Parameters
+    ----------
+    df_extraction_ledger : pd.DataFrame
+        DataFrame containing the content from which audio data is to be extracted.
+    task_config : Dict[str, Any]
+        Dictionary containing task properties and configurations.
+    extraction_config : Any
+        The validated configuration object for audio extraction.
+    execution_trace_log : Optional[Dict], optional
+        Optional trace information for debugging or logging. Defaults to None.
+    Returns
+    -------
+    Tuple[pd.DataFrame, Dict]
+        A tuple containing the updated DataFrame and the trace information.
+    Raises
+    ------
+    Exception
+        If any error occurs during the audio data extraction process.
+    """
+    logger.debug(f"Entering audio extraction stage with {len(df_extraction_ledger)} rows.")
+    extract_params = task_config.get("params", {}).get("extract_audio_params", {})
+    audio_extraction_config = extraction_config.audio_extraction_config
+    grpc_endpoint = extract_params.get("grpc_endpoint") or audio_extraction_config.audio_endpoints[0]
+    http_endpoint = extract_params.get("http_endpoint") or audio_extraction_config.audio_endpoints[1]
+    infer_protocol = extract_params.get("infer_protocol") or audio_extraction_config.audio_infer_protocol
+    auth_token = extract_params.get("auth_token") or audio_extraction_config.auth_token
+    function_id = extract_params.get("function_id") or audio_extraction_config.function_id
+    use_ssl = extract_params.get("use_ssl") or audio_extraction_config.use_ssl
+    ssl_cert = extract_params.get("ssl_cert") or audio_extraction_config.ssl_cert
+    parakeet_client = create_audio_inference_client(
+        (grpc_endpoint, http_endpoint),
+        infer_protocol=infer_protocol,
+        auth_token=auth_token,
+        function_id=function_id,
+        use_ssl=use_ssl,
+        ssl_cert=ssl_cert,
+    )
+    if execution_trace_log is None:
+        execution_trace_log = {}
+        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
+    try:
+        # Apply the _update_metadata function to each row in the DataFrame
+        df_extraction_ledger["metadata"] = df_extraction_ledger.apply(
+            _update_audio_metadata, axis=1, args=(parakeet_client, execution_trace_log)
+        )
+        return df_extraction_ledger, execution_trace_log
+    except Exception as e:
+        logger.exception(f"Error occurred while extracting audio data: {e}", exc_info=True)
+        raise

nv_ingest_api/internal/extract/docx/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2024, NVIDIA CORPORATION.

nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.23.dev20250423__py3-none-any.whl

Potentially problematic release.

nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.23.dev20250423__py3-none-any.whl