nv-ingest-api 2025.4.15.dev20250415__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +72 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.15.dev20250415.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
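The most significant structural change in this release is the move of the control-message primitives from nv_ingest_api.primitives to nv_ingest_api.internal.primitives (see the renamed files above). A minimal migration sketch for downstream code follows; the IngestControlMessage name is inferred from the renamed module's filename and is illustrative, not confirmed by this diff.

# Hypothetical downstream import update for the primitives move in this release
# (IngestControlMessage is an assumed name, taken from ingest_control_message.py):

# before, with nv-ingest-api 2025.4.15.dev20250415:
#   from nv_ingest_api.primitives.ingest_control_message import IngestControlMessage

# after, with nv-ingest-api 2025.4.17.dev20250417:
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage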
nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py
@@ -0,0 +1,367 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import io
+import base64
+import logging
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import grpc
+import numpy as np
+import riva.client
+from scipy.io import wavfile
+
+from nv_ingest_api.internal.primitives.tracing.tagging import traceable_func
+
+try:
+    import librosa
+except ImportError:
+    librosa = None
+
+logger = logging.getLogger(__name__)
+
+
+class ParakeetClient:
+    """
+    A simple interface for handling inference with a Parakeet model (e.g., speech, audio-related).
+    """
+
+    def __init__(
+        self,
+        endpoint: str,
+        auth_token: Optional[str] = None,
+        function_id: Optional[str] = None,
+        use_ssl: Optional[bool] = None,
+        ssl_cert: Optional[str] = None,
+    ):
+        """
+        Initialize the ParakeetClient.
+
+        Parameters
+        ----------
+        endpoint : str
+            The URL of the Parakeet service endpoint.
+        auth_token : Optional[str], default=None
+            The authentication token for accessing the service.
+        function_id : Optional[str], default=None
+            The NVCF function ID for invoking the service.
+        use_ssl : Optional[bool], default=None
+            Whether to use SSL for the connection. If None, SSL is enabled automatically
+            when the endpoint is an NVCF endpoint (grpc.nvcf.nvidia.com) and a function ID
+            is provided.
+        ssl_cert : Optional[str], default=None
+            Path to the SSL certificate if required.
+        """
+        self.endpoint = endpoint
+        self.auth_token = auth_token
+        self.function_id = function_id
+        if use_ssl is None:
+            self.use_ssl = bool(("grpc.nvcf.nvidia.com" in self.endpoint) and self.function_id)
+        else:
+            self.use_ssl = use_ssl
+        self.ssl_cert = ssl_cert
+
+        self.auth_metadata = []
+        if self.auth_token:
+            self.auth_metadata.append(("authorization", f"Bearer {self.auth_token}"))
+        if self.function_id:
+            self.auth_metadata.append(("function-id", self.function_id))
+
+        # Create authentication and ASR service objects.
+        self._auth = riva.client.Auth(self.ssl_cert, self.use_ssl, self.endpoint, self.auth_metadata)
+        self._asr_service = riva.client.ASRService(self._auth)
+
+    @traceable_func(trace_name="{stage_name}::{model_name}")
+    def infer(self, data: dict, model_name: str, **kwargs) -> Any:
+        """
+        Perform inference using the specified model and input data.
+
+        Parameters
+        ----------
+        data : dict
+            The input data for inference; forwarded directly to `transcribe`.
+        model_name : str
+            The model name.
+        kwargs : dict
+            Additional parameters for inference.
+
+        Returns
+        -------
+        Any
+            The transcript extracted from the transcription response, or None if no
+            response is returned.
+        """
+
+        response = self.transcribe(data)
+        if response is None:
+            return None
+        segments, transcript = process_transcription_response(response)
+        logger.debug("Processing Parakeet inference results (pass-through).")
+
+        return transcript
+
+    def transcribe(
+        self,
+        audio_content: str,
+        language_code: str = "en-US",
+        automatic_punctuation: bool = True,
+        word_time_offsets: bool = True,
+        max_alternatives: int = 1,
+        profanity_filter: bool = False,
+        verbatim_transcripts: bool = True,
+        speaker_diarization: bool = False,
+        boosted_lm_words: Optional[List[str]] = None,
+        boosted_lm_score: float = 0.0,
+        diarization_max_speakers: int = 0,
+        start_history: float = 0.0,
+        start_threshold: float = 0.0,
+        stop_history: float = 0.0,
+        stop_history_eou: bool = False,
+        stop_threshold: float = 0.0,
+        stop_threshold_eou: bool = False,
+    ):
+        """
+        Transcribe an audio file using Riva ASR.
+
+        Parameters
+        ----------
+        audio_content : str
+            Base64-encoded audio content to be transcribed.
+        language_code : str, default="en-US"
+            The language code for transcription.
+        automatic_punctuation : bool, default=True
+            Whether to enable automatic punctuation in the transcript.
+        word_time_offsets : bool, default=True
+            Whether to include word-level timestamps in the transcript.
+        max_alternatives : int, default=1
+            The maximum number of alternative transcripts to return.
+        profanity_filter : bool, default=False
+            Whether to filter out profanity from the transcript.
+        verbatim_transcripts : bool, default=True
+            Whether to return verbatim transcripts without normalization.
+        speaker_diarization : bool, default=False
+            Whether to enable speaker diarization.
+        boosted_lm_words : Optional[List[str]], default=None
+            A list of words to boost for language modeling.
+        boosted_lm_score : float, default=0.0
+            The boosting score for language model words.
+        diarization_max_speakers : int, default=0
+            The maximum number of speakers to differentiate in speaker diarization.
+        start_history : float, default=0.0
+            History window size for endpoint detection.
+        start_threshold : float, default=0.0
+            The threshold for starting speech detection.
+        stop_history : float, default=0.0
+            History window size for stopping speech detection.
+        stop_history_eou : bool, default=False
+            Whether to use an end-of-utterance flag for stopping detection.
+        stop_threshold : float, default=0.0
+            The threshold for stopping speech detection.
+        stop_threshold_eou : bool, default=False
+            Whether to use an end-of-utterance flag for the stop threshold.
+
+        Returns
+        -------
+        riva.client.RecognitionResponse
+            The response containing the transcription results.
+
+        Raises
+        ------
+        grpc.RpcError
+            If the transcription request fails.
+        """
+        # Build the recognition configuration.
+        recognition_config = riva.client.RecognitionConfig(
+            language_code=language_code,
+            max_alternatives=max_alternatives,
+            profanity_filter=profanity_filter,
+            enable_automatic_punctuation=automatic_punctuation,
+            verbatim_transcripts=verbatim_transcripts,
+            enable_word_time_offsets=word_time_offsets,
+        )
+
+        # Add additional configuration parameters.
+        riva.client.add_word_boosting_to_config(
+            recognition_config,
+            boosted_lm_words or [],
+            boosted_lm_score,
+        )
+        riva.client.add_speaker_diarization_to_config(
+            recognition_config,
+            speaker_diarization,
+            diarization_max_speakers,
+        )
+        riva.client.add_endpoint_parameters_to_config(
+            recognition_config,
+            start_history,
+            start_threshold,
+            stop_history,
+            stop_history_eou,
+            stop_threshold,
+            stop_threshold_eou,
+        )
+        audio_bytes = base64.b64decode(audio_content)
+        mono_audio_bytes = convert_to_mono_wav(audio_bytes)
+
+        # Perform offline recognition.
+        try:
+            response = self._asr_service.offline_recognize(mono_audio_bytes, recognition_config)
+            return response
+        except grpc.RpcError as e:
+            logger.exception(f"Error transcribing audio file: {e.details()}")
+            raise
+
+
+def convert_to_mono_wav(audio_bytes):
+    """
+    Convert an audio file to mono WAV format using Librosa and SciPy.
+
+    Parameters
+    ----------
+    audio_bytes : bytes
+        The raw audio data in bytes.
+
+    Returns
+    -------
+    bytes
+        The processed audio in mono WAV format.
+    """
+
+    if librosa is None:
+        raise ImportError(
+            "Librosa is required for audio processing. "
+            "If you are running this code with the ingest container, it can be installed by "
+            "setting the environment variable INSTALL_AUDIO_EXTRACTION_DEPS=true."
+        )
+
+    # Create a BytesIO object from the audio bytes.
+    byte_io = io.BytesIO(audio_bytes)
+
+    # Load the audio file with librosa; librosa.load converts to mono by default.
+    audio_data, sample_rate = librosa.load(byte_io, sr=44100, mono=True)
+
+    # Ensure audio is properly scaled for 16-bit PCM.
+    # Librosa normalizes the data between -1 and 1.
+    if np.max(np.abs(audio_data)) > 0:
+        audio_data = audio_data / np.max(np.abs(audio_data)) * 0.9
+
+    # Convert to int16 format for 16-bit PCM WAV.
+    audio_data_int16 = (audio_data * 32767).astype(np.int16)
+
+    # Create a BytesIO buffer to write the WAV file.
+    output_io = io.BytesIO()
+
+    # Write the WAV data using scipy.
+    wavfile.write(output_io, sample_rate, audio_data_int16)
+
+    # Reset the file pointer to the beginning and read all contents.
+    output_io.seek(0)
+    wav_bytes = output_io.read()
+
+    return wav_bytes
+
+
+def process_transcription_response(response):
+    """
+    Process a Riva transcription response (a protobuf message) to extract:
+      - final_transcript: the complete transcript.
+      - segments: a list of segments with start/end times and text.
+
+    Parameters
+    ----------
+    response : riva.client.RecognitionResponse
+        The Riva transcription response message.
+
+    Returns
+    -------
+    segments : list
+        Each segment is a dict with keys "start", "end", and "text".
+    final_transcript : str
+        The overall transcript.
+    """
+    words_list = []
+    # Iterate directly over the results.
+    for result in response.results:
+        # Ensure there is at least one alternative.
+        if not result.alternatives:
+            continue
+        alternative = result.alternatives[0]
+        # Each alternative has a repeated field "words".
+        for word_info in alternative.words:
+            words_list.append(word_info)
+
+    # Build the overall transcript by joining the word strings.
+    final_transcript = " ".join(word.word for word in words_list)
+
+    # Now, segment the transcript based on punctuation.
+    segments = []
+    current_words = []
+    segment_start = None
+    segment_end = None
+    punctuation_marks = {".", "?", "!"}
+
+    for word in words_list:
+        # Mark the start of a segment if not already set.
+        if segment_start is None:
+            segment_start = word.start_time
+        segment_end = word.end_time
+        current_words.append(word.word)
+
+        # End the segment when a word ends with punctuation.
+        if word.word and word.word[-1] in punctuation_marks:
+            segments.append({"start": segment_start, "end": segment_end, "text": " ".join(current_words)})
+            current_words = []
+            segment_start = None
+            segment_end = None
+
+    # Add any remaining words as a segment.
+    if current_words:
+        segments.append({"start": segment_start, "end": segment_end, "text": " ".join(current_words)})
+
+    return segments, final_transcript
+
+
+def create_audio_inference_client(
+    endpoints: Tuple[str, str],
+    infer_protocol: Optional[str] = None,
+    auth_token: Optional[str] = None,
+    function_id: Optional[str] = None,
+    use_ssl: bool = False,
+    ssl_cert: Optional[str] = None,
+):
+    """
+    Create a ParakeetClient for interfacing with an audio model inference server.
+
+    Parameters
+    ----------
+    endpoints : tuple
+        A tuple containing the gRPC and HTTP endpoints. Only the gRPC endpoint is used.
+    infer_protocol : str, optional
+        The protocol to use ("grpc" or "http").
+        If not specified, defaults to "grpc" when a valid gRPC endpoint is provided.
+        HTTP endpoints are not supported for audio inference.
+    auth_token : str, optional
+        Authorization token for authentication (default: None).
+    function_id : str, optional
+        NVCF function ID of the invocation (default: None).
+    use_ssl : bool, optional
+        Whether to use SSL for secure communication (default: False).
+    ssl_cert : str, optional
+        Path to the SSL certificate file if `use_ssl` is enabled (default: None).
+
+    Returns
+    -------
+    ParakeetClient
+        The initialized ParakeetClient configured for audio inference over gRPC.
+
+    Raises
+    ------
+    ValueError
+        If an invalid `infer_protocol` is specified or if an HTTP endpoint is provided.
+    """
+    grpc_endpoint, http_endpoint = endpoints
+
+    if (infer_protocol is None) and (grpc_endpoint and grpc_endpoint.strip()):
+        infer_protocol = "grpc"
+
+    if infer_protocol == "http":
+        raise ValueError("`http` endpoints are not supported for audio. Use `grpc`.")
+
+    return ParakeetClient(
+        grpc_endpoint, auth_token=auth_token, function_id=function_id, use_ssl=use_ssl, ssl_cert=ssl_cert
+    )
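For orientation, a minimal usage sketch of the new audio path follows (not part of the diff). The endpoint address and sample.wav are placeholders; a reachable Parakeet/Riva gRPC service and an installed librosa are assumed.

import base64

# Build the client; only the gRPC endpoint in the (grpc, http) tuple is used.
client = create_audio_inference_client(("localhost:50051", ""), infer_protocol="grpc")

# transcribe() takes base64-encoded audio; convert_to_mono_wav() resamples it
# to mono 16-bit PCM internally before the offline_recognize call.
with open("sample.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

response = client.transcribe(audio_b64)
segments, transcript = process_transcription_response(response)
print(transcript)       # full transcript
print(segments[:1])     # e.g. [{"start": ..., "end": ..., "text": ...}]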
nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py
@@ -0,0 +1,132 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Dict, List, Optional, Tuple
+
+from nv_ingest_api.internal.primitives.nim import ModelInterface
+
+
+class EmbeddingModelInterface(ModelInterface):
+    """
+    An interface for handling inference with an embedding model endpoint.
+    This implementation supports HTTP inference for generating embeddings from text prompts.
+    """
+
+    def name(self) -> str:
+        """
+        Return the name of this model interface.
+        """
+        return "Embedding"
+
+    def prepare_data_for_inference(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Prepare input data for embedding inference. Ensures that a 'prompts' key is provided
+        and that its value is a list.
+
+        Raises
+        ------
+        KeyError
+            If the 'prompts' key is missing.
+        """
+        if "prompts" not in data:
+            raise KeyError("Input data must include 'prompts'.")
+        # Ensure the prompts are in list format.
+        if not isinstance(data["prompts"], list):
+            data["prompts"] = [data["prompts"]]
+        return data
+
+    def format_input(
+        self, data: Dict[str, Any], protocol: str, max_batch_size: int, **kwargs
+    ) -> Tuple[List[Any], List[Dict[str, Any]]]:
+        """
+        Format the input payload for the embedding endpoint. This method constructs one payload
+        per batch, where each payload includes a list of text prompts.
+        Additionally, it returns batch data that preserves the original order of prompts.
+
+        Parameters
+        ----------
+        data : dict
+            The input data containing "prompts" (a list of text prompts).
+        protocol : str
+            Only "http" is supported.
+        max_batch_size : int
+            Maximum number of prompts per payload.
+        kwargs : dict
+            Additional parameters including model_name, encoding_format, input_type, and truncate.
+
+        Returns
+        -------
+        tuple
+            A tuple (payloads, batch_data_list) where:
+              - payloads is a list of JSON-serializable payload dictionaries.
+              - batch_data_list is a list of dictionaries containing the key "prompts"
+                corresponding to each batch.
+        """
+        if protocol != "http":
+            raise ValueError("EmbeddingModelInterface only supports HTTP protocol.")
+
+        prompts = data.get("prompts", [])
+
+        def chunk_list(lst, chunk_size):
+            return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]
+
+        batches = chunk_list(prompts, max_batch_size)
+        payloads = []
+        batch_data_list = []
+        for batch in batches:
+            payload = {
+                "model": kwargs.get("model_name"),
+                "input": batch,
+                "encoding_format": kwargs.get("encoding_format", "float"),
+                "extra_body": {
+                    "input_type": kwargs.get("input_type", "query"),
+                    "truncate": kwargs.get("truncate", "NONE"),
+                },
+            }
+            payloads.append(payload)
+            batch_data_list.append({"prompts": batch})
+        return payloads, batch_data_list
+
+    def parse_output(self, response: Any, protocol: str, data: Optional[Dict[str, Any]] = None, **kwargs) -> Any:
+        """
+        Parse the HTTP response from the embedding endpoint. Expects a response structure
+        with a "data" key.
+
+        Parameters
+        ----------
+        response : Any
+            The raw HTTP response (assumed to be already decoded as JSON).
+        protocol : str
+            Only "http" is supported.
+        data : dict, optional
+            The original input data.
+        kwargs : dict
+            Additional keyword arguments.
+
+        Returns
+        -------
+        list
+            A list of generated embeddings extracted from the response.
+        """
+        if protocol != "http":
+            raise ValueError("EmbeddingModelInterface only supports HTTP protocol.")
+        if isinstance(response, dict):
+            embeddings = response.get("data")
+            if not embeddings:
+                raise RuntimeError("Unexpected response format: 'data' key is missing or empty.")
+            # Each item in embeddings is expected to have an 'embedding' field.
+            return [item.get("embedding", None) for item in embeddings]
+        else:
+            return [str(response)]
+
+    def process_inference_results(self, output: Any, protocol: str, **kwargs) -> Any:
+        """
+        Process inference results for the embedding model.
+        For this implementation, the output is expected to be a list of embeddings.
+
+        Returns
+        -------
+        list
+            The processed list of embeddings.
+        """
+        return output
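The batching and parsing behavior above can be checked without any endpoint; in the sketch below (not part of the diff), the model name is a placeholder and the response dict is a hand-written stand-in for an OpenAI-style embeddings reply.

iface = EmbeddingModelInterface()

# A scalar prompt is wrapped into a list.
data = iface.prepare_data_for_inference({"prompts": "a single prompt"})
assert data["prompts"] == ["a single prompt"]

# Three prompts with max_batch_size=2 produce two payloads (2 prompts, then 1).
payloads, batch_data = iface.format_input(
    {"prompts": ["p1", "p2", "p3"]},
    protocol="http",
    max_batch_size=2,
    model_name="placeholder-embedding-model",
)
assert len(payloads) == 2
assert payloads[0]["input"] == ["p1", "p2"]

# parse_output pulls the 'embedding' field from each item under 'data'.
fake_response = {"data": [{"embedding": [0.1, 0.2]}, {"embedding": [0.3, 0.4]}]}
assert iface.parse_output(fake_response, protocol="http") == [[0.1, 0.2], [0.3, 0.4]]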
nv_ingest_api/internal/primitives/nim/model_interface/vlm.py
@@ -0,0 +1,152 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from typing import Dict, Any, Optional, Tuple, List
+
+from nv_ingest_api.internal.primitives.nim import ModelInterface
+
+logger = logging.getLogger(__name__)
+
+
+class VLMModelInterface(ModelInterface):
+    """
+    An interface for handling inference with a VLM model endpoint (e.g., NVIDIA LLaMA-based VLM).
+    This implementation supports HTTP inference with one or more base64-encoded images and a
+    caption prompt.
+    """
+
+    def name(self) -> str:
+        """
+        Return the name of this model interface.
+        """
+        return "VLM"
+
+    def prepare_data_for_inference(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Prepare input data for VLM inference. Accepts either a single base64 image or a list
+        of images. Ensures that a 'prompt' is provided.
+
+        Raises
+        ------
+        KeyError
+            If neither "base64_image" nor "base64_images" is provided, or if "prompt" is missing.
+        ValueError
+            If "base64_images" exists but is not a list.
+        """
+        # Allow either a single image with "base64_image" or multiple images with "base64_images".
+        if "base64_images" in data:
+            if not isinstance(data["base64_images"], list):
+                raise ValueError("The 'base64_images' key must contain a list of base64-encoded strings.")
+        elif "base64_image" in data:
+            # Convert a single image into a list.
+            data["base64_images"] = [data["base64_image"]]
+        else:
+            raise KeyError("Input data must include 'base64_image' or 'base64_images'.")
+
+        if "prompt" not in data:
+            raise KeyError("Input data must include 'prompt'.")
+        return data
+
+    def format_input(
+        self, data: Dict[str, Any], protocol: str, max_batch_size: int, **kwargs
+    ) -> Tuple[List[Any], List[Dict[str, Any]]]:
+        """
+        Format the input payload for the VLM endpoint. This method constructs one payload per
+        batch, where each payload includes one message per image in the batch.
+        Additionally, it returns batch data that preserves the original order of images by
+        including the list of base64 images and the prompt for each batch.
+
+        Parameters
+        ----------
+        data : dict
+            The input data containing "base64_images" (a list of base64-encoded images) and "prompt".
+        protocol : str
+            Only "http" is supported.
+        max_batch_size : int
+            Maximum number of images per payload.
+        kwargs : dict
+            Additional parameters including model_name, max_tokens, temperature, top_p, and stream.
+
+        Returns
+        -------
+        tuple
+            A tuple (payloads, batch_data_list) where:
+              - payloads is a list of JSON-serializable payload dictionaries.
+              - batch_data_list is a list of dictionaries containing the keys "base64_images"
+                and "prompt" corresponding to each batch.
+        """
+        if protocol != "http":
+            raise ValueError("VLMModelInterface only supports HTTP protocol.")
+
+        images = data.get("base64_images", [])
+        prompt = data["prompt"]
+
+        # Helper function to chunk the list into batches.
+        def chunk_list(lst, chunk_size):
+            return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]
+
+        batches = chunk_list(images, max_batch_size)
+        payloads = []
+        batch_data_list = []
+        for batch in batches:
+            # Create one message per image in the batch.
+            messages = [
+                {"role": "user", "content": f'{prompt} <img src="data:image/png;base64,{img}" />'} for img in batch
+            ]
+            payload = {
+                "model": kwargs.get("model_name"),
+                "messages": messages,
+                "max_tokens": kwargs.get("max_tokens", 512),
+                "temperature": kwargs.get("temperature", 1.0),
+                "top_p": kwargs.get("top_p", 1.0),
+                "stream": kwargs.get("stream", False),
+            }
+            payloads.append(payload)
+            batch_data_list.append({"base64_images": batch, "prompt": prompt})
+        return payloads, batch_data_list
+
+    def parse_output(self, response: Any, protocol: str, data: Optional[Dict[str, Any]] = None, **kwargs) -> Any:
+        """
+        Parse the HTTP response from the VLM endpoint. Expects a response structure with a
+        "choices" key.
+
+        Parameters
+        ----------
+        response : Any
+            The raw HTTP response (assumed to be already decoded as JSON).
+        protocol : str
+            Only "http" is supported.
+        data : dict, optional
+            The original input data.
+        kwargs : dict
+            Additional keyword arguments.
+
+        Returns
+        -------
+        list
+            A list of generated captions extracted from the response.
+        """
+        if protocol != "http":
+            raise ValueError("VLMModelInterface only supports HTTP protocol.")
+        if isinstance(response, dict):
+            choices = response.get("choices", [])
+            if not choices:
+                raise RuntimeError("Unexpected response format: 'choices' key is missing or empty.")
+            # Return a list of captions, one per choice.
+            return [choice.get("message", {}).get("content", "No caption returned") for choice in choices]
+        else:
+            # If response is not a dict, return its string representation in a list.
+            return [str(response)]
+
+    def process_inference_results(self, output: Any, protocol: str, **kwargs) -> Any:
+        """
+        Process inference results for the VLM model.
+        For this implementation, the output is expected to be a list of captions.
+
+        Returns
+        -------
+        list
+            The processed list of captions.
+        """
+        return output
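The same offline check works for the VLM interface; in the sketch below (not part of the diff), the base64 image string, the model name, and the response dict are all placeholders.

iface = VLMModelInterface()

# A single image under "base64_image" is normalized into a one-element list.
data = iface.prepare_data_for_inference(
    {"base64_image": "<b64-png-bytes>", "prompt": "Caption this image."}
)
assert data["base64_images"] == ["<b64-png-bytes>"]

payloads, batch_data = iface.format_input(
    data, protocol="http", max_batch_size=8, model_name="placeholder-vlm-model"
)
# One chat message per image; the prompt and the image travel together as an
# inline <img src="data:image/png;base64,..."> tag in the message content.
assert len(payloads[0]["messages"]) == 1

# parse_output returns one caption per choice in an OpenAI-style chat reply.
fake_response = {"choices": [{"message": {"content": "A cat on a couch."}}]}
assert iface.parse_output(fake_response, protocol="http") == ["A cat on a couch."]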