nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (66)
  1. nv_ingest/api/__init__.py +6 -0
  2. nv_ingest/api/main.py +2 -0
  3. nv_ingest/api/tracing.py +82 -0
  4. nv_ingest/api/v2/README.md +203 -0
  5. nv_ingest/api/v2/__init__.py +3 -0
  6. nv_ingest/api/v2/ingest.py +1300 -0
  7. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  9. nv_ingest/framework/orchestration/execution/options.py +112 -0
  10. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
  12. nv_ingest/framework/orchestration/process/execution.py +495 -0
  13. nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
  14. nv_ingest/framework/orchestration/process/strategies.py +218 -0
  15. nv_ingest/framework/orchestration/process/termination.py +147 -0
  16. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
  17. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  18. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
  19. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  20. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
  21. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
  22. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
  23. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
  24. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
  25. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  26. nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
  28. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  29. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
  30. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
  31. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  32. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  33. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
  34. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
  35. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
  36. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  37. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  38. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
  39. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
  40. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
  41. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  42. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  43. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
  44. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
  45. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
  46. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  47. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  48. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  49. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  50. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  51. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  52. nv_ingest/pipeline/__init__.py +3 -0
  53. nv_ingest/pipeline/config/__init__.py +3 -0
  54. nv_ingest/pipeline/config/loaders.py +229 -0
  55. nv_ingest/pipeline/config/replica_resolver.py +237 -0
  56. nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
  57. nv_ingest/pipeline/default_pipeline_impl.py +557 -0
  58. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  59. nv_ingest/pipeline/pipeline_schema.py +398 -0
  60. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
  61. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
  62. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  63. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  64. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
  65. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
  66. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py
@@ -3,12 +3,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
-from typing import Dict, Any
+from typing import Dict, Any, Optional
 
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
 from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 from nv_ingest_api.internal.mutate.filter import filter_images_internal
 from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
 from nv_ingest_api.internal.primitives.tracing.tagging import traceable
@@ -16,6 +17,7 @@ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema impo
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
 
 logger = logging.getLogger(__name__)
 
@@ -31,18 +33,19 @@ class ImageFilterStage(RayActorStage):
     3. Updates the message payload with the filtered DataFrame.
     """
 
-    def __init__(self, config: ImageFilterSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: ImageFilterSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
-            logger.info("ImageFilterStage configuration validated successfully.")
+            logger.debug("ImageFilterStage configuration validated successfully.")
         except Exception as e:
             logger.exception(f"Error validating Image Filter config: {e}")
             raise
 
-    @traceable("image_filter")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["filter"])
-    @nv_ingest_node_failure_try_except(annotation_id="image_filter", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by filtering images.
@@ -57,7 +60,7 @@
         IngestControlMessage
             The updated message with filtered images in the payload.
         """
-        logger.info("ImageFilterStage.on_data: Starting image filtering process.")
+        logger.debug("ImageFilterStage.on_data: Starting image filtering process.")
 
         # Extract the DataFrame payload.
         df_ledger = control_message.payload()
@@ -65,7 +68,7 @@
 
         # Remove the "filter" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "filter")
-        logger.debug("Extracted task config: %s", task_config)
+        logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
 
         task_params: Dict[str, Any] = task_config.get("params", {})
 
@@ -76,7 +79,7 @@
             mutate_config=self.validated_config,
             execution_trace_log=None,
         )
-        logger.info("Image filtering completed. Resulting DataFrame has %d rows.", len(new_df))
+        logger.debug("Image filtering completed. Resulting DataFrame has %d rows.", len(new_df))
 
         # Update the message payload with the filtered DataFrame.
         control_message.payload(new_df)
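
Note the decorator reordering above, which recurs in the storage and embedding stages below: the failure guard moves from innermost (below @filter_by_task, with a hard-coded annotation_id) to outermost, so it now also covers errors raised by tracing and the new UDF hook. A minimal sketch of why order matters, using stand-in decorators rather than the nv_ingest implementations:

# Stand-in decorators only: this illustrates Python decorator ordering, not
# the real nv_ingest behavior. Decorators apply bottom-up, so the top-most
# wrapper runs first at call time and sees errors from everything beneath it.
def tag(name):
    def wrap(fn):
        def inner(*args, **kwargs):
            print(f"enter {name}")
            try:
                return fn(*args, **kwargs)
            finally:
                print(f"exit {name}")
        return inner
    return wrap

@tag("failure_guard")   # like nv_ingest_node_failure_try_except(): now outermost
@tag("traceable")
@tag("udf_hook")
@tag("filter_by_task")  # innermost: still gates whether the body runs
def on_data(message):
    return message

on_data({})
# Prints enter failure_guard, traceable, udf_hook, filter_by_task, then exits in reverse.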
nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py
@@ -2,7 +2,7 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_sink_stage_base import RayActorSinkStage
@@ -12,13 +12,13 @@ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_fail
 
 @ray.remote
 class DefaultDrainSink(RayActorSinkStage):
-    def __init__(self, config: Any) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: Any, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
 
         self._last_sunk_count = 0
         self._sunk_count = 0
 
-    @nv_ingest_node_failure_try_except(annotation_id="drain_sink", raise_on_failure=False)
+    @nv_ingest_node_failure_try_except()
     def on_data(self, message: IngestControlMessage) -> IngestControlMessage:
         self._sunk_count += 1
 
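As above, each constructor now accepts an optional stage_name and forwards it to the base class, and the no-argument @nv_ingest_node_failure_try_except() suggests annotations are derived from the stage itself rather than a hard-coded annotation_id. A sketch of that pattern under that assumption (simplified stand-in classes, not the shipped RayActorSinkStage):

from typing import Any, Optional

class StageBase:
    """Simplified stand-in for RayActorSinkStage; not the shipped class."""

    def __init__(self, config: Any, stage_name: Optional[str] = None) -> None:
        # Assumed behavior: fall back to the class name when the pipeline
        # does not supply an explicit stage name.
        self.stage_name = stage_name or type(self).__name__
        self.config = config

class DrainSinkSketch(StageBase):
    def __init__(self, config: Any, stage_name: Optional[str] = None) -> None:
        super().__init__(config, stage_name=stage_name)

print(DrainSinkSketch({}, stage_name="drain_sink").stage_name)  # drain_sink
print(DrainSinkSketch({}).stage_name)                           # DrainSinkSketch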
nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py
@@ -14,6 +14,8 @@ from nv_ingest_api.internal.primitives.tracing.logging import annotate_cm
 from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
 from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
 
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
+
 logger = logging.getLogger(__name__)
 
 
@@ -75,8 +77,8 @@ class MessageBrokerTaskSinkConfig(BaseModel):
 
 @ray.remote
 class MessageBrokerTaskSinkStage(RayActorStage):
-    def __init__(self, config: MessageBrokerTaskSinkConfig) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: MessageBrokerTaskSinkConfig, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
 
         self.config: MessageBrokerTaskSinkConfig
 
@@ -224,6 +226,7 @@
 
     # --- Public API Methods for message broker sink ---
 
+    @udf_intercept_hook()
    def on_data(self, control_message: Any) -> Any:
        """
        Processes the control message and pushes the resulting JSON payloads to the broker.
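
The @udf_intercept_hook() decorator comes from the new udf_intercept module (+352 lines, not shown in this diff), so its exact behavior is not visible here. As a rough, assumed illustration only, such a hook can look up a user-defined function attached to the control message and apply it before the stage's own on_data:

import functools

def udf_intercept_hook():
    """Assumed sketch only; the real hook lives in udf_intercept.py."""

    def wrap(fn):
        @functools.wraps(fn)
        def inner(self, control_message):
            # Hypothetical lookup: a UDF registered on the message for this stage.
            lookup = getattr(control_message, "get_udf", None)
            udf = lookup(getattr(self, "stage_name", None)) if callable(lookup) else None
            if callable(udf):
                control_message = udf(control_message)
            return fn(self, control_message)

        return inner

    return wrap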
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py
@@ -3,9 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
-import multiprocessing
 import uuid
-import socket
 from typing import Optional, Literal, Dict, Any, Union
 
 import ray
@@ -13,6 +11,7 @@ import json
 import copy
 import threading
 import time
+import random
 from datetime import datetime
 
 import pandas as pd
@@ -30,6 +29,8 @@ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_inges
 # Import clients
 from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
 from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+from nv_ingest_api.util.message_brokers.qos_scheduler import QosScheduler
 
 logger = logging.getLogger(__name__)
 
@@ -89,8 +90,10 @@ class MessageBrokerTaskSourceConfig(BaseModel):
 
     # Use the discriminated union for broker_client
     broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
-    task_queue: str = Field(..., description="The name of the queue to fetch tasks from.")
-    poll_interval: float = Field(default=0.1, gt=0, description="Polling interval in seconds.")
+    task_queue: str = Field(
+        ..., description="The base name of the queue to fetch tasks from. Derives sub-queues for fair scheduling."
+    )
+    poll_interval: float = Field(default=0.0, gt=0, description="Polling interval in seconds.")
 
 
 @ray.remote
@@ -102,11 +105,14 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
     """
 
     # Use the updated config type hint
-    def __init__(self, config: MessageBrokerTaskSourceConfig) -> None:
-        super().__init__(config, log_to_stdout=False)
-        self.config: MessageBrokerTaskSourceConfig  # Add type hint for self.config
+    def __init__(self, config: MessageBrokerTaskSourceConfig, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
+        self.config: MessageBrokerTaskSourceConfig  # Add a type hint for self.config
+
+        # Sanitize config before logging to avoid leaking secrets
+        _sanitized = sanitize_for_logging(config)
         self._logger.debug(
-            "Initializing MessageBrokerTaskSourceStage with config: %s", config.dict()
+            "Initializing MessageBrokerTaskSourceStage with config: %s", _sanitized
         )  # Log validated config
 
         # Access validated configuration directly via self.config
@@ -126,13 +132,40 @@
         self._pause_event = threading.Event()
         self._pause_event.set()  # Initially not paused
 
-        self._logger.debug("MessageBrokerTaskSourceStage initialized. Task queue: %s", self.task_queue)
+        # Backoff state for graceful retries when broker is unavailable
+        self._fetch_failure_count: int = 0
+        self._current_backoff_sleep: float = 0.0
+        self._last_backoff_log_time: float = 0.0
+
+        # Initialize QoS scheduler. Use a simple base-queue strategy for SimpleClient.
+        strategy = "simple" if isinstance(self.client, SimpleClient) else "lottery"
+        self.scheduler = QosScheduler(
+            self.task_queue,
+            num_prefetch_threads=6,  # one per category (no-op for simple strategy)
+            total_buffer_capacity=96,  # e.g., ~16 per thread
+            prefetch_poll_interval=0.002,  # faster polling for responsiveness
+            prefetch_non_immediate=True,  # enable prefetch for non-immediate categories
+            strategy=strategy,
+        )
+
+        self._logger.info(
+            "MessageBrokerTaskSourceStage initialized. Base task queue: %s | Derived queues: %s",
+            self.task_queue,
+            {
+                "immediate": f"{self.task_queue}_immediate",
+                "micro": f"{self.task_queue}_micro",
+                "small": f"{self.task_queue}_small",
+                "medium": f"{self.task_queue}_medium",
+                "large": f"{self.task_queue}_large",
+                "default": f"{self.task_queue}",
+            },
+        )
 
     # --- Private helper methods ---
     def _create_client(self):
         # Access broker config via self.config.broker_client
         broker_config = self.config.broker_client
-        self._logger.info("Creating client of type: %s", broker_config.client_type)
+        self._logger.debug("Creating client of type: %s", broker_config.client_type)
 
         if broker_config.client_type == "redis":
             client = RedisClient(
@@ -257,14 +290,24 @@
 
         return control_message
 
-    def _fetch_message(self, timeout=100):
+    def _fetch_message(self, timeout=0):
         """
-        Fetch a message from the message broker.
+        Fetch a message from the message broker using fair scheduling across derived queues.
+        This is a non-blocking sweep across all queues for the current scheduling cycle. If no
+        message is found across any queue, return None so the caller can sleep briefly.
         """
         try:
-            job = self.client.fetch_message(self.task_queue, timeout)
+            # Use scheduler to fetch next. In simple strategy this will block up to poll_interval on base queue.
+            job = self.scheduler.fetch_next(self.client, timeout=self.config.poll_interval)
             if job is None:
-                self._logger.debug("No message received from '%s'", self.task_queue)
+                self._logger.debug(
+                    "No message received from derived queues for base "
+                    "'%s' (immediate, micro, small, medium, large, default)",
+                    self.task_queue,
+                )
+                # Do not treat normal empty polls as failures
+                self._fetch_failure_count = 0
+                self._current_backoff_sleep = 0.0
                 return None
             self._logger.debug("Received message type: %s", type(job))
             if isinstance(job, BaseModel):
@@ -277,12 +320,46 @@
                     return None
                 job = json.loads(job.response)
             self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
+            # Success: reset backoff state
+            self._fetch_failure_count = 0
+            self._current_backoff_sleep = 0.0
             return job
         except TimeoutError:
             self._logger.debug("Timeout waiting for message")
+            # Timeout is not a connectivity failure; do not escalate backoff
            return None
        except Exception as err:
-            self._logger.exception("Error during message fetching: %s", err)
+            # Connectivity or other fetch issue: apply graceful backoff and avoid stacktrace spam
+            self._fetch_failure_count += 1
+
+            # Compute exponential backoff with jitter, capped by configured max_backoff
+            try:
+                max_backoff = getattr(self.config.broker_client, "max_backoff", 5.0)
+            except Exception:
+                max_backoff = 5.0
+            # Start from 0.5s, double each failure
+            base = 0.5
+            backoff_no_jitter = min(max_backoff, base * (2 ** (self._fetch_failure_count - 1)))
+            jitter = random.uniform(0, backoff_no_jitter * 0.2)
+            self._current_backoff_sleep = backoff_no_jitter + jitter
+
+            now = time.time()
+            # Throttle warning logs to at most once per 5 seconds to avoid spam
+            if now - self._last_backoff_log_time >= 5.0:
+                self._logger.warning(
+                    "Broker fetch failed (%d consecutive failures). Backing off for %.2fs. Error: %s",
+                    self._fetch_failure_count,
+                    self._current_backoff_sleep,
+                    err,
+                )
+                self._last_backoff_log_time = now
+            else:
+                self._logger.debug(
+                    "Broker fetch failed (%d). Backoff %.2fs. Error: %s",
+                    self._fetch_failure_count,
+                    self._current_backoff_sleep,
+                    err,
+                )
             return None
 
     def _read_input(self) -> any:
@@ -291,10 +368,20 @@
         Instead of reading from an input edge, fetch a message from the broker.
         """
         self._logger.debug("read_input: calling _fetch_message()")
-        job = self._fetch_message(timeout=100)
+        # Perform a non-blocking sweep across all queues for this cycle
+        job = self._fetch_message(timeout=0)
         if job is None:
-            self._logger.debug("read_input: No job received, sleeping for poll_interval: %s", self.config.poll_interval)
-            time.sleep(self.config.poll_interval)
+            # Sleep for either the configured poll interval or the current backoff, whichever is larger
+            sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
+            self._logger.debug(
+                "read_input: No job received; sleeping %.2fs (poll_interval=%.2fs, backoff=%.2fs)",
+                sleep_time,
+                self.config.poll_interval,
+                getattr(self, "_current_backoff_sleep", 0.0),
+            )
+            time.sleep(sleep_time)
+            # Reset one-shot backoff so that repeated failures recompute progressively
+            self._current_backoff_sleep = 0.0
 
             return None
 
@@ -314,7 +401,7 @@
         This loop fetches messages from the broker and writes them to the output queue,
         but blocks on the pause event when the stage is paused.
         """
-        self._logger.info("Processing loop started")
+        self._logger.debug("Processing loop started")
         iteration = 0
         while self._running:
             iteration += 1
@@ -381,25 +468,25 @@
                 self._active_processing = False
         self._shutdown_signal_complete = True
 
-        self._logger.info("Processing loop ending")
+        self._logger.debug("Processing loop ending")
 
     @ray.method(num_returns=1)
     def start(self) -> bool:
         if self._running:
-            self._logger.info("Start called but stage is already running.")
+            self._logger.warning("Start called but stage is already running.")
             return False
         self._running = True
         self.start_time = time.time()
         self._message_count = 0
-        self._logger.info("Starting processing loop thread.")
+        self._logger.debug("Starting processing loop thread.")
         threading.Thread(target=self._processing_loop, daemon=True).start()
-        self._logger.info("MessageBrokerTaskSourceStage started.")
+        self._logger.debug("MessageBrokerTaskSourceStage started.")
         return True
 
     @ray.method(num_returns=1)
     def stop(self) -> bool:
         self._running = False
-        self._logger.info("Stop called on MessageBrokerTaskSourceStage")
+        self._logger.debug("Stop called on MessageBrokerTaskSourceStage")
         return True
 
     @ray.method(num_returns=1)
@@ -425,7 +512,7 @@
     @ray.method(num_returns=1)
     def set_output_queue(self, queue_handle: any) -> bool:
         self.output_queue = queue_handle
-        self._logger.info("Output queue set: %s", queue_handle)
+        self._logger.debug("Output queue set: %s", queue_handle)
         return True
 
     @ray.method(num_returns=1)
@@ -440,7 +527,7 @@
         True after the stage is paused.
         """
         self._pause_event.clear()
-        self._logger.info("Stage paused.")
+        self._logger.debug("Stage paused.")
 
         return True
 
@@ -456,7 +543,7 @@
         True after the stage is resumed.
         """
         self._pause_event.set()
-        self._logger.info("Stage resumed.")
+        self._logger.debug("Stage resumed.")
         return True
 
     @ray.method(num_returns=1)
@@ -466,49 +553,9 @@
         This method pauses the stage, waits for any current processing to finish,
         replaces the output queue, and then resumes the stage.
         """
-        self._logger.info("Swapping output queue: pausing stage first.")
+        self._logger.debug("Swapping output queue: pausing stage first.")
         self.pause()
         self.set_output_queue(new_queue)
-        self._logger.info("Output queue swapped. Resuming stage.")
+        self._logger.debug("Output queue swapped. Resuming stage.")
         self.resume()
         return True
-
-
-def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
-    """
-    Starts a SimpleMessageBroker server in a separate process.
-
-    Parameters
-    ----------
-    broker_client : dict
-        Broker configuration. Expected keys include:
-        - "port": the port to bind the server to,
-        - "broker_params": optionally including "max_queue_size",
-        - and any other parameters required by SimpleMessageBroker.
-
-    Returns
-    -------
-    multiprocessing.Process
-        The process running the SimpleMessageBroker server.
-    """
-
-    def broker_server():
-        from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
-
-        # Use max_queue_size from broker_params or default to 10000.
-        broker_params = broker_client.get("broker_params", {})
-        max_queue_size = broker_params.get("max_queue_size", 10000)
-        server_host = broker_client.get("host", "0.0.0.0")
-        server_port = broker_client.get("port", 7671)
-        # Optionally, set socket options here for reuse.
-        server = SimpleMessageBroker(server_host, server_port, max_queue_size)
-        # Enable address reuse on the server socket.
-        server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        server.serve_forever()
-
-    p = multiprocessing.Process(target=broker_server)
-    p.daemon = False
-    p.start()
-    logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")
-
-    return p
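
The backoff added to _fetch_message starts at 0.5 s, doubles per consecutive failure, caps at the broker client's max_backoff (5.0 s when absent), and adds up to 20% uniform jitter. The same computation in isolation:

import random

def backoff_sleep(failure_count: int, max_backoff: float = 5.0) -> float:
    # Mirrors the constants in the diff: 0.5 s base, doubling, capped, plus up to 20% jitter.
    base = 0.5
    no_jitter = min(max_backoff, base * (2 ** (failure_count - 1)))
    return no_jitter + random.uniform(0, no_jitter * 0.2)

# Pre-jitter values for failures 1..5: 0.5, 1.0, 2.0, 4.0, 5.0 (8.0 capped);
# printed values include the jitter.
print([round(backoff_sleep(n), 2) for n in range(1, 6)])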
nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py
@@ -3,13 +3,16 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
-from typing import Dict, Any
+import os
+from typing import Dict, Any, Optional
+from urllib.parse import urlparse
 
 import pandas as pd
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
 from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 from nv_ingest_api.internal.enums.common import ContentTypeEnum
 from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
 from nv_ingest_api.internal.primitives.tracing.tagging import traceable
@@ -25,14 +28,15 @@ logger = logging.getLogger(__name__)
 
 @ray.remote
 class ImageStorageStage(RayActorStage):
     """
-    A Ray actor stage that stores images or structured content in MinIO and updates metadata with storage URLs.
+    A Ray actor stage that stores images or structured content using an fsspec-compatible backend and updates
+    metadata with storage URLs.
 
     This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
     payload and updates the control message accordingly.
     """
 
-    def __init__(self, config: ImageStorageModuleSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: ImageStorageModuleSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
             logger.info("ImageStorageStage configuration validated successfully.")
@@ -40,9 +44,10 @@
             logger.exception("Error validating image storage config")
             raise e
 
-    @traceable("image_storage")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["store"])
-    @nv_ingest_node_failure_try_except(annotation_id="image_storage", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by storing images or structured content.
@@ -67,8 +72,16 @@
         task_config = remove_task_by_type(control_message, "store")
         # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))
 
-        store_structured: bool = task_config.get("structured", True)
-        store_unstructured: bool = task_config.get("images", False)
+        stage_defaults = {
+            "structured": self.validated_config.structured,
+            "images": self.validated_config.images,
+            "storage_uri": self.validated_config.storage_uri,
+            "storage_options": self.validated_config.storage_options,
+            "public_base_url": self.validated_config.public_base_url,
+        }
+
+        store_structured: bool = task_config.get("structured", stage_defaults["structured"])
+        store_unstructured: bool = task_config.get("images", stage_defaults["images"])
 
         content_types: Dict[Any, Any] = {}
         if store_structured:
@@ -78,14 +91,34 @@
             content_types[ContentTypeEnum.IMAGE] = store_unstructured
 
         params: Dict[str, Any] = task_config.get("params", {})
-        params["content_types"] = content_types
 
-        logger.debug(f"Processing storage task with parameters: {params}")
+        storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
+        storage_options = {
+            **(stage_defaults["storage_options"] or {}),
+            **(task_config.get("storage_options") or {}),
+            **params.get("storage_options", {}),
+        }
+        if "public_base_url" in task_config:
+            public_base_url = task_config["public_base_url"]
+        else:
+            public_base_url = params.get("public_base_url", stage_defaults["public_base_url"])
+
+        storage_options = self._inject_storage_defaults(storage_uri, storage_options)
+
+        storage_params: Dict[str, Any] = {
+            "content_types": content_types,
+            "storage_uri": storage_uri,
+            "storage_options": storage_options,
+        }
+        if public_base_url:
+            storage_params["public_base_url"] = public_base_url
+
+        logger.debug("Processing storage task with parameters: %s", storage_params)
 
         # Store images or structured content.
         df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
             df_storage_ledger=df_payload,
-            task_config=params,
+            task_config=storage_params,
             storage_config={},
             execution_trace_log=None,
         )
@@ -96,3 +129,38 @@
 
         control_message.payload(df_storage_ledger)
 
         return control_message
+
+    @staticmethod
+    def _inject_storage_defaults(storage_uri: str, storage_options: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Populate storage options for common backends (e.g., MinIO/S3) using environment defaults.
+        """
+        parsed_scheme = urlparse(storage_uri).scheme.lower()
+        merged_options: Dict[str, Any] = {k: v for k, v in storage_options.items() if v is not None}
+
+        if parsed_scheme not in {"s3", "s3a", "s3n"}:
+            return merged_options
+
+        def _set_if_absent(key: str, env_var: str) -> None:
+            if key not in merged_options and env_var in os.environ:
+                merged_options[key] = os.environ[env_var]
+
+        _set_if_absent("key", "MINIO_ACCESS_KEY")
+        _set_if_absent("secret", "MINIO_SECRET_KEY")
+        if "token" not in merged_options and os.environ.get("MINIO_SESSION_TOKEN"):
+            merged_options["token"] = os.environ["MINIO_SESSION_TOKEN"]
+
+        client_kwargs = dict(merged_options.get("client_kwargs", {}))
+        endpoint = os.environ.get("MINIO_INTERNAL_ADDRESS")
+        if not endpoint:
+            endpoint = "http://minio:9000"
+        if endpoint and not endpoint.startswith(("http://", "https://")):
+            endpoint = f"http://{endpoint}"
+        client_kwargs.setdefault("endpoint_url", endpoint)
+        region = os.environ.get("MINIO_REGION")
+        if region:
+            client_kwargs.setdefault("region_name", region)
+        if client_kwargs:
+            merged_options["client_kwargs"] = client_kwargs
+
+        return merged_options
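
Storage settings are now resolved in three layers, with later layers winning: stage defaults, then the task's top-level storage_options, then params["storage_options"]; _inject_storage_defaults then drops None values and, for s3/s3a/s3n URIs, fills missing credentials and endpoint from MINIO_* environment variables. The dict-merge precedence in isolation, with made-up values:

# Made-up values; only the precedence is the point. Later unpacks win.
stage_defaults = {"key": "stage-access-key", "secret": "stage-secret"}
task_options = {"key": "task-access-key"}
params_options = {"secret": "param-secret"}

merged = {**(stage_defaults or {}), **(task_options or {}), **params_options}
assert merged == {"key": "task-access-key", "secret": "param-secret"}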
nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
+from typing import Optional
 
 import ray
 
@@ -15,6 +16,9 @@ from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 
 logger = logging.getLogger(__name__)
 
@@ -30,8 +34,8 @@
     3. Updates the message payload with the stored embeddings DataFrame.
     """
 
-    def __init__(self, config: EmbeddingStorageSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: EmbeddingStorageSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
             logger.info("EmbeddingStorageStage configuration validated successfully.")
@@ -39,9 +43,10 @@
             logger.exception(f"Error validating Embedding Storage config: {e}")
             raise
 
-    @traceable("embedding_storage")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["store_embedding"])
-    @nv_ingest_node_failure_try_except(annotation_id="embedding_storage", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by storing embeddings.
@@ -64,7 +69,7 @@
 
         # Remove the "store_embedding" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "store_embedding")
-        logger.debug("Extracted task config: %s", task_config)
+        logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
 
         # Perform embedding storage.
         new_df = store_text_embeddings_internal(
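
sanitize_for_logging now wraps task configs before they hit the debug log, here and in the filter and source stages above. Its implementation is not part of this diff; a typical redaction approach, offered only as an assumed illustration:

from typing import Any

# Assumed illustration; the shipped sanitize_for_logging may differ.
SENSITIVE_FRAGMENTS = ("password", "secret", "token", "api_key", "access_key")

def sanitize_for_logging_sketch(obj: Any) -> Any:
    if isinstance(obj, dict):
        return {
            key: "***REDACTED***"
            if any(frag in str(key).lower() for frag in SENSITIVE_FRAGMENTS)
            else sanitize_for_logging_sketch(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return type(obj)(sanitize_for_logging_sketch(item) for item in obj)
    return obj

print(sanitize_for_logging_sketch({"task": "store_embedding", "api_key": "abc123"}))
# {'task': 'store_embedding', 'api_key': '***REDACTED***'}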