nv-ingest-api 25.4.2__py3-none-any.whl → 25.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api might be problematic.
Files changed (46)
  1. nv_ingest_api/internal/extract/docx/docx_extractor.py +3 -3
  2. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +142 -86
  3. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  4. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  5. nv_ingest_api/internal/extract/image/chart_extractor.py +3 -3
  6. nv_ingest_api/internal/extract/image/image_extractor.py +5 -5
  7. nv_ingest_api/internal/extract/image/image_helpers/common.py +1 -1
  8. nv_ingest_api/internal/extract/image/infographic_extractor.py +1 -1
  9. nv_ingest_api/internal/extract/image/table_extractor.py +2 -2
  10. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +2 -2
  11. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +1 -1
  12. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +213 -187
  13. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +6 -9
  14. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +35 -38
  15. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +7 -1
  16. nv_ingest_api/internal/primitives/nim/nim_client.py +17 -9
  17. nv_ingest_api/internal/primitives/tracing/tagging.py +20 -16
  18. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +1 -1
  19. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  20. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +1 -1
  21. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +1 -1
  22. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +1 -1
  23. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +26 -12
  24. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +34 -23
  25. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +11 -10
  26. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +9 -7
  27. nv_ingest_api/internal/store/image_upload.py +1 -0
  28. nv_ingest_api/internal/transform/embed_text.py +75 -52
  29. nv_ingest_api/internal/transform/split_text.py +9 -3
  30. nv_ingest_api/util/__init__.py +3 -0
  31. nv_ingest_api/util/exception_handlers/converters.py +1 -1
  32. nv_ingest_api/util/exception_handlers/decorators.py +309 -51
  33. nv_ingest_api/util/image_processing/processing.py +1 -1
  34. nv_ingest_api/util/logging/configuration.py +15 -8
  35. nv_ingest_api/util/pdf/pdfium.py +2 -2
  36. nv_ingest_api/util/schema/__init__.py +3 -0
  37. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  38. nv_ingest_api/util/service_clients/redis/redis_client.py +1 -1
  39. nv_ingest_api/util/service_clients/rest/rest_client.py +2 -2
  40. nv_ingest_api/util/system/__init__.py +0 -0
  41. nv_ingest_api/util/system/hardware_info.py +430 -0
  42. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.0.dist-info}/METADATA +2 -1
  43. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.0.dist-info}/RECORD +46 -41
  44. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.0.dist-info}/WHEEL +1 -1
  45. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.0.dist-info}/licenses/LICENSE +0 -0
  46. {nv_ingest_api-25.4.2.dist-info → nv_ingest_api-25.6.0.dist-info}/top_level.txt +0 -0
@@ -709,7 +709,13 @@ def postprocess_results(
         raise ValueError(f"Error in postprocessing {result.shape} and {original_image_shape}: {e}")

     for box, score, label in zip(bboxes, scores, labels):
-        class_name = class_labels[int(label)]
+        # TODO(Devin): Sometimes we get back unexpected class labels?
+        if (label < 0) or (label >= len(class_labels)):
+            logger.warning(f"Invalid class label {label} found in postprocessing")
+            continue
+        else:
+            class_name = class_labels[int(label)]
+
         annotation_dict[class_name].append([round(float(x), 4) for x in np.concatenate((box, [score]))])

     out.append(annotation_dict)
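The guard is easy to exercise in isolation. A minimal sketch with made-up class labels and detection triples standing in for YOLOX output; the out-of-range label is skipped with a warning instead of raising an IndexError:

```python
import logging

import numpy as np

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

class_labels = ["table", "chart", "title"]
detections = [
    (np.array([0.1, 0.2, 0.5, 0.6]), 0.91, 1),  # valid -> recorded under "chart"
    (np.array([0.3, 0.3, 0.7, 0.8]), 0.42, 7),  # out of range -> warned and skipped
]

annotation_dict = {name: [] for name in class_labels}
for box, score, label in detections:
    if (label < 0) or (label >= len(class_labels)):
        logger.warning(f"Invalid class label {label} found in postprocessing")
        continue
    class_name = class_labels[int(label)]
    annotation_dict[class_name].append([round(float(x), 4) for x in np.concatenate((box, [score]))])

print(annotation_dict["chart"])  # [[0.1, 0.2, 0.5, 0.6, 0.91]]
```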
@@ -129,7 +129,7 @@ class NimClient:
         """
         if self.protocol == "grpc":
             logger.debug("Performing gRPC inference for a batch...")
-            response = self._grpc_infer(batch_input, model_name)
+            response = self._grpc_infer(batch_input, model_name, **kwargs)
             logger.debug("gRPC inference received response for a batch")
         elif self.protocol == "http":
             logger.debug("Performing HTTP inference for a batch...")
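Together with the `_grpc_infer` changes in the next two hunks, the forwarded `**kwargs` let a caller pick the input tensor name, dtype, requested outputs, and extra request parameters per call. A sketch, not runnable standalone: `client` is assumed to be an already-constructed `NimClient` in gRPC mode against a live Triton endpoint, and the model and tensor names are illustrative, not taken from the package:

```python
import numpy as np

batch = np.random.rand(2, 3, 1024, 1024).astype(np.float32)

# Legacy behavior is unchanged: one FP32 "input" tensor, one "output" tensor.
result = client._grpc_infer(batch, "yolox")

# New: override tensor names and request several outputs; a list of arrays
# comes back, one per requested output, in order.
boxes, scores = client._grpc_infer(
    batch,
    "yolox_ensemble",  # hypothetical model name
    input_name="images",
    dtype="FP32",
    outputs=["output_boxes", "output_scores"],
    parameters={"threshold": "0.1"},
)
```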
@@ -221,7 +221,7 @@ class NimClient:
 
         return all_results
 
-    def _grpc_infer(self, formatted_input: np.ndarray, model_name: str) -> np.ndarray:
+    def _grpc_infer(self, formatted_input: np.ndarray, model_name: str, **kwargs) -> np.ndarray:
         """
         Perform inference using the gRPC protocol.
 
@@ -238,16 +238,24 @@ class NimClient:
             The output of the model as a numpy array.
         """
 
-        input_tensors = [grpcclient.InferInput("input", formatted_input.shape, datatype="FP32")]
-        input_tensors[0].set_data_from_numpy(formatted_input)
+        parameters = kwargs.get("parameters", {})
+        output_names = kwargs.get("outputs", ["output"])
+        dtype = kwargs.get("dtype", "FP32")
+        input_name = kwargs.get("input_name", "input")
 
-        outputs = [grpcclient.InferRequestedOutput("output")]
-        response = self.client.infer(model_name=model_name, inputs=input_tensors, outputs=outputs)
-        logger.debug(f"gRPC inference response: {response}")
+        input_tensors = grpcclient.InferInput(input_name, formatted_input.shape, datatype=dtype)
+        input_tensors.set_data_from_numpy(formatted_input)
 
-        # TODO(self.client.has_error(response)) => raise error
+        outputs = [grpcclient.InferRequestedOutput(output_name) for output_name in output_names]
+        response = self.client.infer(
+            model_name=model_name, parameters=parameters, inputs=[input_tensors], outputs=outputs
+        )
+        logger.debug(f"gRPC inference response: {response}")
 
-        return response.as_numpy("output")
+        if len(outputs) == 1:
+            return response.as_numpy(outputs[0].name())
+        else:
+            return [response.as_numpy(output.name()) for output in outputs]
 
     def _http_infer(self, formatted_input: dict) -> dict:
         """
@@ -31,13 +31,15 @@ def traceable(trace_name=None):
 
     Notes
     -----
-    The decorated function must accept a IngestControlMessage object as its first argument. The
-    IngestControlMessage object must implement `has_metadata`, `get_metadata`, and `set_metadata`
-    methods used by the decorator to check for the trace tagging flag and to add trace metadata.
+    The decorated function must accept an IngestControlMessage object as one of its arguments.
+    For a regular function, this is expected to be the first argument; for a class method,
+    this is expected to be the second argument (after 'self'). The IngestControlMessage object
+    must implement `has_metadata`, `get_metadata`, and `set_metadata` methods used by the decorator
+    to check for the trace tagging flag and to add trace metadata.
 
     The trace metadata added by the decorator includes two entries:
-    - 'trace::entry::<trace_name>': The monotonic timestamp marking the function's entry.
-    - 'trace::exit::<trace_name>': The monotonic timestamp marking the function's exit.
+    - 'trace::entry::<trace_name>': The timestamp marking the function's entry.
+    - 'trace::exit::<trace_name>': The timestamp marking the function's exit.
 
     Example
     -------
@@ -47,23 +49,25 @@ def traceable(trace_name=None):
     ... def process_message(message):
     ...     pass
 
-    Applying the decorator with a custom trace name:
-
-    >>> @traceable(custom_trace_name="CustomTraceName")
-    ... def process_message(message):
-    ...     pass
-
-    In both examples, `process_message` will have entry and exit timestamps added to the
-    IngestControlMessage's metadata if 'config::add_trace_tagging' is True.
+    Applying the decorator with a custom trace name on a class method:
 
+    >>> class Processor:
+    ...     @traceable(trace_name="CustomTrace")
+    ...     def process(self, message):
+    ...         pass
     """
 
     def decorator_trace_tagging(func):
         @functools.wraps(func)
         def wrapper_trace_tagging(*args, **kwargs):
-            # Assuming the first argument is always the message
             ts_fetched = datetime.now()
-            message = args[0]
+            # Determine which argument is the message.
+            if hasattr(args[0], "has_metadata"):
+                message = args[0]
+            elif len(args) > 1 and hasattr(args[1], "has_metadata"):
+                message = args[1]
+            else:
+                raise ValueError("traceable decorator could not find a message argument with 'has_metadata()'")
 
             do_trace_tagging = (message.has_metadata("config::add_trace_tagging") is True) and (
                 message.get_metadata("config::add_trace_tagging") is True
@@ -79,7 +83,7 @@ def traceable(trace_name=None):
                 message.set_timestamp(f"trace::entry::{trace_prefix}_channel_in", ts_send)
                 message.set_timestamp(f"trace::exit::{trace_prefix}_channel_in", ts_fetched)
 
-            # Call the decorated function
+            # Call the decorated function.
             result = func(*args, **kwargs)
 
             if do_trace_tagging:
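The new lookup can be exercised end to end with a stub message. A minimal sketch, assuming `traceable` is importable from the tagging module above; the hypothetical stub implements only the metadata calls the disabled-tagging path appears to touch in this diff:

```python
from nv_ingest_api.internal.primitives.tracing.tagging import traceable


class StubMessage:
    """Hypothetical stand-in for IngestControlMessage (tagging disabled)."""

    def has_metadata(self, key):
        return False  # 'config::add_trace_tagging' unset -> no timestamps written

    def get_metadata(self, key):
        return None


@traceable(trace_name="CustomTrace")
def process_message(message):
    return "free function ok"


class Processor:
    @traceable(trace_name="CustomTrace")
    def process(self, message):
        return "method ok"


msg = StubMessage()
print(process_message(msg))      # message found at args[0]
print(Processor().process(msg))  # message found at args[1], after self
```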
@@ -129,7 +129,7 @@ class ChartExtractorSchema(BaseModel):
     @field_validator("max_queue_size", "n_workers")
     def check_positive(cls, v, field):
         if v <= 0:
-            raise ValueError(f"{field.field_name} must be greater than 10.")
+            raise ValueError(f"{field.field_name} must be greater than 0.")
         return v
 
     model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+
+logger = logging.getLogger(__name__)
+
+
+class HtmlExtractorSchema(BaseModel):
+    """
+    Configuration schema for the Html extractor settings.
+
+    Parameters
+    ----------
+    max_queue_size : int, default=1
+        The maximum number of items allowed in the processing queue.
+
+    n_workers : int, default=16
+        The number of worker threads to use for processing.
+
+    raise_on_failure : bool, default=False
+        A flag indicating whether to raise an exception on processing failure.
+
+    """
+
+    max_queue_size: int = 1
+    n_workers: int = 16
+    raise_on_failure: bool = False
+
+    model_config = ConfigDict(extra="forbid")
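Since the whole new file is shown above, its defaults and the extra="forbid" guard are easy to spot-check. A sketch assuming the installed package and the module path from the file list:

```python
from pydantic import ValidationError

from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema

config = HtmlExtractorSchema()
print(config.max_queue_size, config.n_workers, config.raise_on_failure)  # 1 16 False

try:
    HtmlExtractorSchema(n_workers=8, unknown_option=True)
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # extra_forbidden
```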
@@ -122,7 +122,7 @@ class InfographicExtractorSchema(BaseModel):
     @field_validator("max_queue_size", "n_workers")
     def check_positive(cls, v, field):
         if v <= 0:
-            raise ValueError(f"{field.field_name} must be greater than 10.")
+            raise ValueError(f"{field.field_name} must be greater than 0.")
         return v
 
     model_config = ConfigDict(extra="forbid")
@@ -131,7 +131,7 @@ class NemoRetrieverParseConfigSchema(BaseModel):
     nemoretriever_parse_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
     nemoretriever_parse_infer_protocol: str = ""
 
-    model_name: str = "nvidia/nemoretriever-parse"
+    nemoretriever_parse_model_name: str = "nvidia/nemoretriever-parse"
 
     timeout: float = 300.0
 
@@ -122,7 +122,7 @@ class TableExtractorSchema(BaseModel):
     @field_validator("max_queue_size", "n_workers")
     def check_positive(cls, v, field):
         if v <= 0:
-            raise ValueError(f"{field.field_name} must be greater than 10.")
+            raise ValueError(f"{field.field_name} must be greater than 0.")
         return v
 
     endpoint_config: Optional[TableExtractorConfigSchema] = None
@@ -2,22 +2,36 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+from pydantic import BaseModel, Field
+from typing import Optional, Literal, Annotated
 
-from typing import Optional, Literal
 
-from pydantic import Field, BaseModel
-from typing_extensions import Annotated
+class MessageBrokerClientSchema(BaseModel):
+    """
+    Configuration schema for message broker client connections.
+    Supports Redis or simple in-memory clients.
+    """
 
+    host: str = Field(default="redis", description="Hostname of the broker service.")
 
-class MessageBrokerClientSchema(BaseModel):
-    host: str = "redis"
-    port: Annotated[int, Field(gt=0, lt=65536)] = 6379
+    port: Annotated[int, Field(gt=0, lt=65536)] = Field(
+        default=6379, description="Port to connect to. Must be between 1 and 65535."
+    )
+
+    client_type: Literal["redis", "simple"] = Field(
+        default="redis", description="Type of broker client. Supported values: 'redis', 'simple'."
+    )
+
+    broker_params: Optional[dict] = Field(
+        default_factory=dict, description="Optional parameters passed to the broker client."
+    )
 
-    # Update this for new broker types
-    client_type: Literal["redis", "simple"] = "redis"  # Restrict to 'redis' or 'simple'
+    connection_timeout: Annotated[int, Field(ge=0)] = Field(
+        default=300, description="Connection timeout in seconds. Must be >= 0."
+    )
 
-    broker_params: Optional[dict] = Field(default_factory=dict)
+    max_backoff: Annotated[int, Field(ge=0)] = Field(
+        default=300, description="Maximum backoff time in seconds. Must be >= 0."
+    )
 
-    connection_timeout: Optional[Annotated[int, Field(ge=0)]] = 300
-    max_backoff: Optional[Annotated[int, Field(ge=0)]] = 300
-    max_retries: Optional[Annotated[int, Field(ge=0)]] = 0
+    max_retries: Annotated[int, Field(ge=0)] = Field(default=0, description="Maximum number of retries. Must be >= 0.")
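A quick sanity check of the reworked schema, assuming the installed package and the module path from the file list above:

```python
from pydantic import ValidationError

from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import (
    MessageBrokerClientSchema,
)

cfg = MessageBrokerClientSchema(host="localhost", client_type="simple")
print(cfg.port, cfg.connection_timeout, cfg.max_backoff, cfg.max_retries)  # 6379 300 300 0

try:
    MessageBrokerClientSchema(port=70000)  # outside the 1-65535 range
except ValidationError as exc:
    print(exc.errors()[0]["loc"])  # ('port',)
```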
@@ -160,29 +160,40 @@ class IngestTaskSchema(BaseModelNoExt):
     @model_validator(mode="before")
     @classmethod
     def check_task_properties_type(cls, values):
-        task_type, task_properties = values.get("type"), values.get("task_properties", {})
-        if task_type and task_properties:
-            expected_type = {
-                TaskTypeEnum.CAPTION: IngestTaskCaptionSchema,
-                TaskTypeEnum.DEDUP: IngestTaskDedupSchema,
-                TaskTypeEnum.EMBED: IngestTaskEmbedSchema,
-                TaskTypeEnum.EXTRACT: IngestTaskExtractSchema,
-                TaskTypeEnum.FILTER: IngestTaskFilterSchema,  # Extend mapping as necessary
-                TaskTypeEnum.SPLIT: IngestTaskSplitSchema,
-                TaskTypeEnum.STORE_EMBEDDING: IngestTaskStoreEmbedSchema,
-                TaskTypeEnum.STORE: IngestTaskStoreSchema,
-                TaskTypeEnum.VDB_UPLOAD: IngestTaskVdbUploadSchema,
-                TaskTypeEnum.AUDIO_DATA_EXTRACT: IngestTaskAudioExtraction,
-                TaskTypeEnum.TABLE_DATA_EXTRACT: IngestTaskTableExtraction,
-                TaskTypeEnum.CHART_DATA_EXTRACT: IngestTaskChartExtraction,
-                TaskTypeEnum.INFOGRAPHIC_DATA_EXTRACT: IngestTaskInfographicExtraction,
-            }.get(
-                task_type
-            )  # Removed .upper()
-
-            # Validate task_properties against the expected schema.
-            validated_task_properties = expected_type(**task_properties)
-            values["task_properties"] = validated_task_properties
+        task_type = values.get("type")
+        task_properties = values.get("task_properties", {})
+
+        # Ensure task_type is lowercased and converted to enum early
+        if isinstance(task_type, str):
+            task_type = task_type.lower()
+            try:
+                task_type = TaskTypeEnum(task_type)
+            except ValueError:
+                raise ValueError(f"{task_type} is not a valid TaskTypeEnum value")
+
+        task_type_to_schema = {
+            TaskTypeEnum.CAPTION: IngestTaskCaptionSchema,
+            TaskTypeEnum.DEDUP: IngestTaskDedupSchema,
+            TaskTypeEnum.EMBED: IngestTaskEmbedSchema,
+            TaskTypeEnum.EXTRACT: IngestTaskExtractSchema,
+            TaskTypeEnum.FILTER: IngestTaskFilterSchema,
+            TaskTypeEnum.SPLIT: IngestTaskSplitSchema,
+            TaskTypeEnum.STORE_EMBEDDING: IngestTaskStoreEmbedSchema,
+            TaskTypeEnum.STORE: IngestTaskStoreSchema,
+            TaskTypeEnum.VDB_UPLOAD: IngestTaskVdbUploadSchema,
+            TaskTypeEnum.AUDIO_DATA_EXTRACT: IngestTaskAudioExtraction,
+            TaskTypeEnum.TABLE_DATA_EXTRACT: IngestTaskTableExtraction,
+            TaskTypeEnum.CHART_DATA_EXTRACT: IngestTaskChartExtraction,
+            TaskTypeEnum.INFOGRAPHIC_DATA_EXTRACT: IngestTaskInfographicExtraction,
+        }
+
+        expected_schema_cls = task_type_to_schema.get(task_type)
+        if expected_schema_cls is None:
+            raise ValueError(f"Unsupported or missing task_type '{task_type}'")
+
+        validated_task_properties = expected_schema_cls(**task_properties)
+        values["type"] = task_type  # ensure type is now always the enum
+        values["task_properties"] = validated_task_properties
         return values
 
     @field_validator("type", mode="before")
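The package's individual task schemas aren't shown in this diff, so here is a standalone sketch of the same normalize-then-dispatch pattern with hypothetical stand-ins (`MiniTaskType`, `SplitProps`, `EmbedProps`) in place of `TaskTypeEnum` and the real task schemas:

```python
from enum import Enum
from typing import Union

from pydantic import BaseModel, model_validator


class MiniTaskType(str, Enum):
    SPLIT = "split"
    EMBED = "embed"


class SplitProps(BaseModel):
    chunk_size: int = 1024


class EmbedProps(BaseModel):
    embed_model: str = "example-model"


SCHEMA_MAP = {MiniTaskType.SPLIT: SplitProps, MiniTaskType.EMBED: EmbedProps}


class MiniTask(BaseModel):
    type: MiniTaskType
    task_properties: Union[SplitProps, EmbedProps]

    @model_validator(mode="before")
    @classmethod
    def check_task_properties_type(cls, values):
        task_type = values.get("type")
        if isinstance(task_type, str):
            # Lowercase early so "SPLIT" and "split" dispatch identically.
            try:
                task_type = MiniTaskType(task_type.lower())
            except ValueError:
                raise ValueError(f"{task_type} is not a valid MiniTaskType value")
        schema_cls = SCHEMA_MAP.get(task_type)
        if schema_cls is None:
            raise ValueError(f"Unsupported or missing task_type '{task_type}'")
        values["type"] = task_type
        values["task_properties"] = schema_cls(**values.get("task_properties", {}))
        return values


task = MiniTask(type="SPLIT", task_properties={"chunk_size": 256})
print(task.type, task.task_properties.chunk_size)  # MiniTaskType.SPLIT 256
```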
@@ -5,7 +5,7 @@
 
 import logging
 
-from pydantic import ConfigDict, BaseModel
+from pydantic import ConfigDict, BaseModel, Field
 
 from nv_ingest_api.util.logging.configuration import LogLevel
 
 
@@ -13,13 +13,14 @@ logger = logging.getLogger(__name__)
 
 
 class TextEmbeddingSchema(BaseModel):
-    api_key: str = "api_key"
-    batch_size: int = 4
-    embedding_model: str = "nvidia/nv-embedqa-e5-v5"
-    embedding_nim_endpoint: str = "http://embedding:8000/v1"
-    encoding_format: str = "float"
-    httpx_log_level: LogLevel = LogLevel.WARNING
-    input_type: str = "passage"
-    raise_on_failure: bool = False
-    truncate: str = "END"
+    api_key: str = Field(default="api_key")
+    batch_size: int = Field(default=4)
+    embedding_model: str = Field(default="nvidia/llama-3.2-nv-embedqa-1b-v2")
+    embedding_nim_endpoint: str = Field(default="http://embedding:8000/v1")
+    encoding_format: str = Field(default="float")
+    httpx_log_level: LogLevel = Field(default=LogLevel.WARNING)
+    input_type: str = Field(default="passage")
+    raise_on_failure: bool = Field(default=False)
+    truncate: str = Field(default="END")
+
     model_config = ConfigDict(extra="forbid")
@@ -2,21 +2,23 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from pydantic import Field, BaseModel, field_validator
+from pydantic import Field, BaseModel, field_validator, ConfigDict
 
 from typing import Optional
 
-from typing_extensions import Annotated
-
 
 class TextSplitterSchema(BaseModel):
     tokenizer: Optional[str] = None
-    chunk_size: Annotated[int, Field(gt=0)] = 1024
-    chunk_overlap: Annotated[int, Field(ge=0)] = 150
+    chunk_size: int = Field(default=1024, gt=0)
+    chunk_overlap: int = Field(default=150, ge=0)
     raise_on_failure: bool = False
 
     @field_validator("chunk_overlap")
-    def check_chunk_overlap(cls, v, values, **kwargs):
-        if v is not None and "chunk_size" in values.data and v >= values.data["chunk_size"]:
+    @classmethod
+    def check_chunk_overlap(cls, v, values):
+        chunk_size = values.data.get("chunk_size")
+        if chunk_size is not None and v >= chunk_size:
             raise ValueError("chunk_overlap must be less than chunk_size")
         return v
+
+    model_config = ConfigDict(extra="forbid")
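A sketch of the stricter splitter schema in action, assuming the installed package and the module path from the file list above; fields validate in definition order, so `chunk_size` is available when `chunk_overlap` is checked:

```python
from pydantic import ValidationError

from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema

print(TextSplitterSchema(chunk_size=512, chunk_overlap=64).chunk_overlap)  # 64

try:
    TextSplitterSchema(chunk_size=128, chunk_overlap=256)  # overlap >= size
except ValidationError as exc:
    print("chunk_overlap must be less than chunk_size" in str(exc))  # True

try:
    TextSplitterSchema(tokenizers="some/model")  # typo, now rejected by extra="forbid"
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # extra_forbidden
```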
@@ -116,6 +116,7 @@ def _upload_images_to_minio(df: pd.DataFrame, params: Dict[str, Any]) -> pd.Data
         if "content" not in metadata:
             logger.error("Row %s: missing 'content' in metadata", idx)
             continue
+
         if "source_metadata" not in metadata or not isinstance(metadata["source_metadata"], dict):
             logger.error("Row %s: missing or invalid 'source_metadata' in metadata", idx)
             continue
@@ -230,28 +230,35 @@ def _async_runner(
 def _add_embeddings(row, embeddings, info_msgs):
     """
     Updates a DataFrame row with embedding data and associated error info.
+    Ensures the 'embedding' field is always present, even if None.
 
     Parameters
     ----------
     row : pandas.Series
         A row of the DataFrame.
-    embeddings : list
-        List of embeddings corresponding to DataFrame rows.
-    info_msgs : list
-        List of info message dictionaries corresponding to DataFrame rows.
+    embeddings : dict
+        Dictionary mapping row indices to embeddings.
+    info_msgs : dict
+        Dictionary mapping row indices to info message dicts.
 
     Returns
     -------
     pandas.Series
-        The updated row with embedding and info message metadata added.
+        The updated row with 'embedding', 'info_message_metadata', and
+        '_contains_embeddings' appropriately set.
     """
-    row["metadata"]["embedding"] = embeddings[row.name]
-    if info_msgs[row.name] is not None:
-        row["metadata"]["info_message_metadata"] = info_msgs[row.name]
+    embedding = embeddings.get(row.name, None)
+    info_msg = info_msgs.get(row.name, None)
+
+    # Always set embedding, even if None
+    row["metadata"]["embedding"] = embedding
+
+    if info_msg:
+        row["metadata"]["info_message_metadata"] = info_msg
         row["document_type"] = ContentTypeEnum.INFO_MSG
         row["_contains_embeddings"] = False
     else:
-        row["_contains_embeddings"] = True
+        row["_contains_embeddings"] = embedding is not None
 
     return row
 
@@ -287,7 +294,7 @@ def _get_pandas_table_content(row):
     str
         The table/chart content from the row.
     """
-    return row["table_metadata"]["table_content"]
+    return row.get("table_metadata", {}).get("table_content")
 
 
 def _get_pandas_image_content(row):
@@ -304,7 +311,14 @@ def _get_pandas_image_content(row):
     str
         The image caption from the row.
     """
-    return row["image_metadata"]["caption"]
+    return row.get("image_metadata", {}).get("caption")
+
+
+def _get_pandas_audio_content(row):
+    """
+    A pandas UDF used to select extracted audio transcription to be used to create embeddings.
+    """
+    return row.get("audio_metadata", {}).get("audio_transcript")
 
 
 # ------------------------------------------------------------------------------
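The `.get` chains turn the getters into total functions over partial metadata; rows missing the expected keys now yield None instead of raising. A two-line illustration:

```python
rows = [
    {"table_metadata": {"table_content": "| a | b |"}},
    {},  # no table_metadata at all
]
for row in rows:
    print(row.get("table_metadata", {}).get("table_content"))
# prints '| a | b |' then None, where row["table_metadata"] would raise KeyError
```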
@@ -352,13 +366,6 @@ def _generate_batches(prompts: List[str], batch_size: int = 100) -> List[str]:
     return [batch for batch in _batch_generator(prompts, batch_size)]
 
 
-def _get_pandas_audio_content(row):
-    """
-    A pandas UDF used to select extracted audio transcription to be used to create embeddings.
-    """
-    return row["audio_metadata"]["audio_transcript"]
-
-
 # ------------------------------------------------------------------------------
 # DataFrame Concatenation Utility
 # ------------------------------------------------------------------------------
@@ -408,17 +415,20 @@ def transform_create_text_embeddings_internal(
     execution_trace_log: Optional[Dict] = None,
 ) -> Tuple[pd.DataFrame, Dict]:
     """
-    Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE)
+    Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE, AUDIO)
     from a pandas DataFrame using asynchronous requests.
 
+    This function ensures that even if the extracted content is empty or None,
+    the embedding field is explicitly created and set to None.
+
     Parameters
     ----------
     df_transform_ledger : pd.DataFrame
         The DataFrame containing content for embedding extraction.
     task_config : Dict[str, Any]
         Dictionary containing task properties (e.g., filter error flag).
-    transform_config : Any
-        Validated configuration for text embedding extraction (EmbedExtractionsSchema).
+    transform_config : TextEmbeddingSchema, optional
+        Validated configuration for text embedding extraction.
     execution_trace_log : Optional[Dict], optional
         Optional trace information for debugging or logging (default is None).
 
@@ -429,20 +439,20 @@ def transform_create_text_embeddings_internal(
         - The updated DataFrame with embeddings applied.
         - A dictionary with trace information.
     """
-    _ = task_config  # Currently unused.
+    api_key = task_config.get("api_key") or transform_config.api_key
+    endpoint_url = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
+    model_name = task_config.get("model_name") or transform_config.embedding_model
 
     if execution_trace_log is None:
         execution_trace_log = {}
         logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
 
-    # TODO(Devin)
     if df_transform_ledger.empty:
         return df_transform_ledger, {"trace_info": execution_trace_log}
 
     embedding_dataframes = []
-    content_masks = []  # List of pandas boolean Series
+    content_masks = []
 
-    # Define pandas content extractors for supported content types.
     pandas_content_extractor = {
         ContentTypeEnum.TEXT: _get_pandas_text_content,
         ContentTypeEnum.STRUCTURED: _get_pandas_table_content,
@@ -451,49 +461,62 @@ def transform_create_text_embeddings_internal(
         ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
     }
 
-    logger.debug("Generating text embeddings for supported content types: TEXT, STRUCTURED, IMAGE.")
-
     def _content_type_getter(row):
         return row["content_metadata"]["type"]
 
-    # Process each supported content type.
     for content_type, content_getter in pandas_content_extractor.items():
         if not content_getter:
             logger.debug(f"Skipping unsupported content type: {content_type}")
             continue
 
+        # Get rows matching the content type
         content_mask = df_transform_ledger["metadata"].apply(_content_type_getter) == content_type.value
         if not content_mask.any():
             continue
 
-        # Extract content from metadata and filter out rows with empty content.
-        extracted_content = df_transform_ledger.loc[content_mask, "metadata"].apply(content_getter)
-        non_empty_mask = extracted_content.notna() & (extracted_content.str.strip() != "")
-        final_mask = content_mask & non_empty_mask
-        if not final_mask.any():
-            continue
+        # Always include all content_mask rows and prepare them
+        df_content = df_transform_ledger.loc[content_mask].copy().reset_index(drop=True)
 
-        df_content = df_transform_ledger.loc[final_mask].copy().reset_index(drop=True)
-        filtered_content = df_content["metadata"].apply(content_getter)
-        filtered_content_batches = _generate_batches(filtered_content.tolist(), batch_size=transform_config.batch_size)
-        content_embeddings = _async_runner(
-            filtered_content_batches,
-            transform_config.api_key,
-            transform_config.embedding_nim_endpoint,
-            transform_config.embedding_model,
-            transform_config.encoding_format,
-            transform_config.input_type,
-            transform_config.truncate,
-            False,
+        # Extract content and normalize empty or non-str to None
+        extracted_content = (
+            df_content["metadata"]
+            .apply(content_getter)
+            .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
         )
-        # Apply the embeddings (and any error info) to each row.
-        df_content[["metadata", "document_type", "_contains_embeddings"]] = df_content.apply(
-            _add_embeddings, **content_embeddings, axis=1
-        )[["metadata", "document_type", "_contains_embeddings"]]
-        df_content["_content"] = filtered_content
+        df_content["_content"] = extracted_content
+
+        # Prepare batches for only valid (non-None) content
+        valid_content_mask = df_content["_content"].notna()
+        if valid_content_mask.any():
+            filtered_content_batches = _generate_batches(
+                df_content.loc[valid_content_mask, "_content"].tolist(), batch_size=transform_config.batch_size
+            )
+            content_embeddings = _async_runner(
+                filtered_content_batches,
+                api_key,
+                endpoint_url,
+                model_name,
+                transform_config.encoding_format,
+                transform_config.input_type,
+                transform_config.truncate,
+                False,
+            )
+            # Build a simple row index -> embedding map
+            embeddings_dict = dict(
+                zip(df_content.loc[valid_content_mask].index, content_embeddings.get("embeddings", []))
+            )
+            info_msgs_dict = dict(
+                zip(df_content.loc[valid_content_mask].index, content_embeddings.get("info_msgs", []))
+            )
+        else:
+            embeddings_dict = {}
+            info_msgs_dict = {}
+
+        # Apply embeddings or None to all rows
+        df_content = df_content.apply(_add_embeddings, embeddings=embeddings_dict, info_msgs=info_msgs_dict, axis=1)
 
         embedding_dataframes.append(df_content)
-        content_masks.append(final_mask)
+        content_masks.append(content_mask)
 
     combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)
     return combined_df, {"trace_info": execution_trace_log}
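The normalization step at the heart of this rework is easy to check in isolation; blank strings and non-strings collapse to None, so they skip the embedding request but still reach `_add_embeddings`:

```python
def normalize(x):
    return x.strip() if isinstance(x, str) and x.strip() else None

print([normalize(v) for v in ["  hello ", "", "   ", None, 42]])
# ['hello', None, None, None, None]
```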
@@ -118,9 +118,15 @@ def transform_text_split_and_tokenize_internal(
     )
 
     # Filter to documents with text content.
-    bool_index = (df_transform_ledger["document_type"] == ContentTypeEnum.TEXT) & (
-        pd.json_normalize(df_transform_ledger["metadata"])["source_metadata.source_type"].isin(split_source_types)
-    )
+    text_type_condition = df_transform_ledger["document_type"] == ContentTypeEnum.TEXT
+
+    normalized_meta_df = pd.json_normalize(df_transform_ledger["metadata"], errors="ignore")
+    if "source_metadata.source_type" in normalized_meta_df.columns:
+        source_type_condition = normalized_meta_df["source_metadata.source_type"].isin(split_source_types)
+    else:
+        source_type_condition = False
+
+    bool_index = text_type_condition & source_type_condition
     df_filtered: pd.DataFrame = df_transform_ledger.loc[bool_index]
 
     if df_filtered.empty:
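Why the column guard matters: when no row carries `source_metadata.source_type`, `pd.json_normalize` simply omits that column, and the old direct indexing raised a KeyError. A small demonstration with made-up metadata:

```python
import pandas as pd

metadata = pd.Series([{"content": "hello"}, {"content": "world"}])
normalized = pd.json_normalize(metadata)
print("source_metadata.source_type" in normalized.columns)  # False

if "source_metadata.source_type" in normalized.columns:
    cond = normalized["source_metadata.source_type"].isin(["text"])
else:
    cond = False
print(cond)  # False, so bool_index ends up all-False instead of crashing
```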
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0