nv-ingest 25.7.7.dev20250707__py3-none-any.whl → 25.8.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,6 +26,7 @@ class RayStatsCollector:
         interval: float = 30.0,
         actor_timeout: float = 5.0,
         queue_timeout: float = 2.0,
+        ema_alpha: float = 0.1,  # Alpha for EMA memory cost calculation
     ):
         """
         Initializes the RayStatsCollector.
@@ -45,6 +46,9 @@ class RayStatsCollector:
             Timeout in seconds for waiting for stats from a single actor, by default 5.0.
         queue_timeout : float, optional
             Timeout in seconds for waiting for qsize from a single queue, by default 2.0.
+        ema_alpha : float, optional
+            The smoothing factor for the Exponential Moving Average (EMA)
+            calculation of memory cost. Defaults to 0.1.
         """
         if not ray:
             logger.warning("RayStatsCollector initialized but Ray is not available.")
@@ -53,6 +57,7 @@ class RayStatsCollector:
         self._interval = interval
         self._actor_timeout = actor_timeout
         self._queue_timeout = queue_timeout
+        self.ema_alpha = ema_alpha

         self._lock: threading.Lock = threading.Lock()  # Protects access to collected stats and status
         self._running: bool = False
@@ -65,10 +70,12 @@ class RayStatsCollector:
         self._last_update_successful: bool = False

         self._cumulative_stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {"processed": 0})
+        self.ema_memory_per_replica: Dict[str, float] = {}  # EMA of memory per replica

         logger.info(
             f"RayStatsCollector initialized (Interval: {self._interval}s, "
-            f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s)"
+            f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s, "
+            f"EMA Alpha: {self.ema_alpha})"
         )

     # --- Helper function to be run in threads ---
@@ -243,6 +250,7 @@ class RayStatsCollector:
         stage_stats_updates: Dict[str, Dict[str, int]] = {}
         actor_tasks: Dict[ray.ObjectRef, Tuple[Any, str]] = {}
         queue_sizes: Dict[str, int] = {}
+        stage_memory_samples: Dict[str, list[float]] = defaultdict(list)

         try:
             current_stages = self._pipeline.get_stages_info()
@@ -257,7 +265,7 @@ class RayStatsCollector:
             # --- 1. Prepare Actor Stat Requests ---
             for stage_info in current_stages:
                 stage_name = stage_info.name
-                stage_stats_updates[stage_name] = {"processing": 0, "in_flight": 0}
+                stage_stats_updates[stage_name] = {"processing": 0, "in_flight": 0, "memory_mb": 0}

                 if stage_info.pending_shutdown:
                     logger.debug(f"[StatsCollectNow] Stage '{stage_name}' pending shutdown. Skipping actor queries.")
@@ -302,6 +310,8 @@ class RayStatsCollector:
                     stats = ray.get(ref)
                     active = int(stats.get("active_processing", 0))
                     delta = int(stats.get("delta_processed", 0))
+                    memory_mb = float(stats.get("memory_mb", 0.0))
+
                     processed = stage_stats_updates[stage_name].get("processed", 0)
                     processing = stage_stats_updates[stage_name].get("processing", 0)
                     stage_stats_updates[stage_name]["processing"] = processing + active
@@ -309,6 +319,7 @@ class RayStatsCollector:
                     stage_stats_updates[stage_name]["delta_processed"] = (
                         stage_stats_updates[stage_name].get("delta_processed", 0) + delta
                     )
+                    stage_memory_samples[stage_name].append(memory_mb)

                 except Exception as e:
                     logger.warning(
@@ -324,7 +335,23 @@ class RayStatsCollector:
             logger.error(f"[StatsCollectNow] Error during actor stats collection: {e}", exc_info=True)
             overall_success = False

-        # --- 4. Aggregate In-Flight Stats ---
+        # --- 4. Aggregate Memory and Update EMA ---
+        for stage_name, samples in stage_memory_samples.items():
+            if not samples:
+                continue
+
+            total_memory = sum(samples)
+            num_replicas = len(samples)
+            current_memory_per_replica = total_memory / num_replicas
+            stage_stats_updates[stage_name]["memory_mb"] = total_memory
+
+            # Update EMA
+            current_ema = self.ema_memory_per_replica.get(stage_name, current_memory_per_replica)
+            new_ema = (self.ema_alpha * current_memory_per_replica) + ((1 - self.ema_alpha) * current_ema)
+            self.ema_memory_per_replica[stage_name] = new_ema
+            stage_stats_updates[stage_name]["ema_memory_per_replica"] = new_ema
+
+        # --- 5. Aggregate In-Flight Stats ---
         _total_inflight = 0
         for stage_info in current_stages:
             stage_name = stage_info.name
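Note: the memory aggregation in the hunk above is standard exponential smoothing applied per stage. A minimal standalone sketch of that update rule (hypothetical helper, not part of the package):

    from typing import Optional

    def update_ema(prev_ema: Optional[float], sample: float, alpha: float = 0.1) -> float:
        """Seed the EMA with the first sample, then blend each new sample in with weight alpha."""
        if prev_ema is None:
            return sample
        return alpha * sample + (1 - alpha) * prev_ema

    # Example: a stage's replicas average 512 MB, then 600 MB on the next collection pass.
    ema = update_ema(None, 512.0)       # 512.0 (seeded with the first observation)
    ema = update_ema(ema, 600.0, 0.1)   # 0.1 * 600 + 0.9 * 512 = 520.8

With alpha = 0.1 the estimate reacts slowly to spikes, which is the point of smoothing the per-replica memory cost.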
@@ -7,6 +7,8 @@ import threading
 import time
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Optional
+import os
+import psutil

 import ray
 import ray.actor
@@ -29,49 +31,6 @@ def setup_stdout_logging(name: str = __name__, level: int = logging.INFO) -> log
     return logger


-@ray.remote
-def external_monitor_actor_shutdown(actor_handle: "RayActorStage", poll_interval: float = 0.1) -> bool:
-    """
-    Polls the provided actor's `is_shutdown_complete` method until it returns True
-    or the actor becomes unreachable.
-    """
-    logger = setup_stdout_logging("_external_monitor_actor_shutdown")  # Optional: for monitor's own logs
-
-    if actor_handle is None:
-        logger.error("Received null actor_handle. Cannot monitor shutdown.")
-        return False  # Or raise error
-
-    actor_id_to_monitor = None
-    try:
-        # Try to get a string representation for logging, might fail if already gone
-        actor_id_to_monitor = str(actor_handle)  # Basic representation
-    except Exception:
-        actor_id_to_monitor = "unknown_actor"
-
-    logger.debug(f"Monitoring shutdown for actor: {actor_id_to_monitor}")
-
-    while True:
-        try:
-            # Remotely call the actor's method
-            if ray.get(actor_handle.is_shutdown_complete.remote()):
-                logger.debug(f"Actor {actor_id_to_monitor} reported shutdown complete.")
-                actor_handle.request_actor_exit.remote()
-
-                return True
-        except ray.exceptions.RayActorError:
-            # Actor has died or is otherwise unreachable.
-            # Consider this as shutdown complete for the purpose of the future.
-            logger.warning(f"Actor {actor_id_to_monitor} became unreachable (RayActorError). Assuming shutdown.")
-            return True
-        except Exception as e:
-            # Catch other potential errors during the remote call
-            logger.error(f"Unexpected error while polling shutdown status for {actor_id_to_monitor}: {e}")
-            # Depending on policy, either continue polling or assume failure
-            return True  # Or True if any exit is "shutdown"
-
-        time.sleep(poll_interval)
-
-
 class RayActorStage(ABC):
     """
     Abstract base class for a stateful Ray actor stage in a processing pipeline.
@@ -163,12 +122,13 @@ class RayActorStage(ABC):
         # Lock specifically for coordinating the final shutdown sequence (_request_actor_exit)
         self._lock = threading.Lock()
         self._shutdown_signal_complete = False  # Initialize flag
-        self._shutdown_future: Optional[ray.ObjectRef] = None

         # --- Logging ---
         # Ray won't propagate logging to the root logger by default, so we set up a custom logger for debugging
         self._logger = setup_stdout_logging(self.__class__.__name__) if log_to_stdout else logging.getLogger(__name__)

+        self._actor_id_str = self._get_actor_id_str()
+
     @staticmethod
     def _get_actor_id_str() -> str:
         """
@@ -215,19 +175,36 @@ class RayActorStage(ABC):
         if self._input_queue is None:
             # This check should ideally not fail if start() is called after setup
             if self._running:
-                self._logger.error(f"{self._get_actor_id_str()}: Input queue not set while running")
+                self._logger.error(f"{self._actor_id_str}: Input queue not set while running")
                 # Indicate a programming error - queue should be set before starting
                 raise ValueError("Input queue not set while running")
             return None  # Should not happen if self._running is False, but defensive check

+        item: Optional[Any] = None
         try:
-            # Perform a non-blocking or short-blocking read from the queue
-            # The timeout allows the loop to check self._running periodically
-            return self._input_queue.get(timeout=1.0)
+            item = self._input_queue.get(timeout=1.0)
+
+            if item is None:
+                return None
+
+            if isinstance(item, ray.ObjectRef):
+                try:
+                    deserialized_object = ray.get(item)
+                except ray.exceptions.ObjectLostError:
+                    self._logger.error(
+                        f"[{self._actor_id_str}] Failed to retrieve object from Ray object store. "
+                        f"It has been lost and cannot be recovered."
+                    )
+                    raise  # Re-raise the exception to be handled by the processing loop
+
+                del item
+                return deserialized_object
+
+            return item
+
         except Exception:
-            # Common exceptions include queue.Empty in older Ray versions or
-            # custom queue implementations raising timeout errors.
-            # Return None to signify no item was retrieved this cycle.
+            if item is not None and isinstance(item, ray.ObjectRef):
+                del item
             return None

     @abstractmethod
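Note: the new read path above accepts either a raw payload or a ray.ObjectRef and dereferences the latter before returning it. A simplified sketch of that consumer-side pattern, without the stage's logging and stats bookkeeping (assumes a live Ray runtime; names are illustrative):

    import ray

    def resolve_queue_item(item):
        """Return the payload, dereferencing it first if the queue delivered an ObjectRef."""
        if item is None:
            return None
        if isinstance(item, ray.ObjectRef):
            # May raise ray.exceptions.ObjectLostError if the object was lost or evicted;
            # the caller decides whether to skip the message or fail.
            return ray.get(item)
        return item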
@@ -290,7 +267,7 @@ class RayActorStage(ABC):
           read from the input queue.
         - `errors`: Incremented if `on_data` returns `None` or if an
           exception occurs during `on_data` or output queuing.
-        - `processed`: Incremented after processing a control message
+        - `processed`: Incremented after successful processing and output (if any).
         - `successful_queue_writes`: Incremented when an item is successfully
           put onto the output queue.
         - `queue_full`: Incremented when an attempt to put to the output
@@ -305,8 +282,7 @@ class RayActorStage(ABC):
         - Thread safety for `self.stats` relies on the GIL for simple
           increment operations
         """
-        actor_id_str = self._get_actor_id_str()
-        self._logger.debug(f"{actor_id_str}: Processing loop thread starting.")
+        self._logger.debug(f"{self._actor_id_str}: Processing loop thread starting.")

         try:
             while self._running:
@@ -328,38 +304,58 @@ class RayActorStage(ABC):
                     self._active_processing = True

                     # Step 2: Process the retrieved message using subclass-specific logic.
-                    updated_cm: Optional[Any] = self.on_data(control_message)
+                    updated_cm = self.on_data(control_message)

                     # If there's a valid result and an output queue is configured, attempt to put.
-                    if self._output_queue is not None:
-                        # This loop will retry indefinitely until the item is put successfully
-                        # or an unrecoverable error occurs (which is not explicitly handled to break here).
-                        # TODO(Devin) -- This can be improved, should probably fail at some point?
-                        # Consider max retries or specific error handling for RayActorError
-                        # to prevent indefinite blocking if the queue actor is permanently dead.
-                        is_put_successful = False
-                        while not is_put_successful:  # Renamed loop variable for clarity
-                            try:
-                                self._output_queue.put(updated_cm)
-                                self.stats["successful_queue_writes"] += 1
-                                is_put_successful = True  # Exit retry loop on success
-                            except Exception as e_put:  # Broad exception catch for put failures
-                                self._logger.warning(
-                                    f"[{actor_id_str}] Output queue put failed (e.g., full, "
-                                    f"timeout, or actor error), retrying. Error: {e_put}"
-                                )
-                                self.stats["queue_full"] += 1  # Consider renaming if it catches more than "full"
-                                time.sleep(0.1)  # Brief pause before retrying
+                    if self._output_queue is not None and updated_cm is not None:
+                        object_ref_to_put = None  # Ensure var exists for the finally block
+                        try:
+                            # Get the handle of the queue actor to set it as the owner.
+                            # This decouples the object's lifetime from this actor.
+                            owner_actor = self._output_queue.actor
+
+                            # Put the object into Plasma, transferring ownership.
+                            object_ref_to_put = ray.put(updated_cm, _owner=owner_actor)
+
+                            # Now that the object is safely in Plasma, we can delete the large local copy.
+                            del updated_cm
+
+                            # This loop will retry until the ObjectRef is put successfully or shutdown is initiated.
+                            is_put_successful = False
+                            while not is_put_successful:
+                                try:
+                                    self._output_queue.put(object_ref_to_put)
+                                    self.stats["successful_queue_writes"] += 1
+                                    is_put_successful = True  # Exit retry loop on success
+                                except Exception as e_put:
+                                    self._logger.warning(
+                                        f"[{self._actor_id_str}] Output queue put failed (e.g., full, "
+                                        f"timeout, or actor error), retrying. Error: {e_put}"
+                                    )
+                                    self.stats["queue_full"] += 1
+                                    time.sleep(0.1)  # Brief pause before retrying
+                        finally:
+                            # After the operation, delete the local ObjectRef.
+                            # The primary reference is now held by the queue actor.
+                            if object_ref_to_put is not None:
+                                del object_ref_to_put

                     # Step 3: Increment "processed" count after successful processing and output (if any).
                     # This is the primary path for "successful processing".
                     self.stats["processed"] += 1

+                except ray.exceptions.ObjectLostError:
+                    # This error is handled inside the loop to prevent the actor from crashing.
+                    # We log it and continue to the next message.
+                    self._logger.error(f"[{self._actor_id_str}] CRITICAL: An object was lost in transit. Skipping.")
+                    # In a real-world scenario, you might want to increment a metric for monitoring.
+                    continue
+
                 except Exception as e_item_processing:
                     # Catch exceptions from on_data() or unexpected issues in the item handling block.
                     cm_info_str = f" (message type: {type(control_message).__name__})" if control_message else ""
                     self._logger.exception(
-                        f"[{actor_id_str}] Error during processing of item{cm_info_str}: {e_item_processing}"
+                        f"[{self._actor_id_str}] Error during processing of item{cm_info_str}: {e_item_processing}"
                     )
                     self.stats["errors"] += 1

@@ -370,180 +366,48 @@ class RayActorStage(ABC):
                     # Ensure _active_processing is reset after each item attempt (success, failure, or no item).
                     self._active_processing = False

+                    # Explicitly delete the reference to the control message to aid garbage collection.
+                    # This is important for large messages, as it helps release memory and ObjectRefs sooner.
+                    if control_message is not None:
+                        del control_message
+
             # --- Loop Exit Condition Met ---
             # This point is reached when self._running becomes False.
-            self._logger.debug(f"[{actor_id_str}] Graceful exit: self._running is False. Processing loop terminating.")
+            self._logger.debug(
+                f"[{self._actor_id_str}] Graceful exit: self._running is False. Processing loop terminating."
+            )

         except Exception as e_outer_loop:
             # Catches very unexpected errors in the structure of the while loop itself.
             self._logger.exception(
-                f"[{actor_id_str}] Unexpected critical error caused processing loop termination: {e_outer_loop}"
+                f"[{self._actor_id_str}] Unexpected critical error caused processing loop termination: {e_outer_loop}"
             )
         finally:
             # This block executes when the processing thread is about to exit,
             # either due to self._running becoming False or an unhandled critical exception.
-            self._logger.debug(f"[{actor_id_str}] Processing loop thread finished.")
+            self._logger.debug(f"[{self._actor_id_str}] Processing loop thread finished.")
             # Signal that this actor's processing duties are complete.
             # External monitors (e.g., via a future from stop()) can use this signal.
             self._shutdown_signal_complete = True

-    @staticmethod
-    @ray.remote
-    def _immediate_true() -> bool:
-        """
-        A tiny remote method that immediately returns True.
-        Used to create a resolved ObjectRef when shutdown is already complete.
-        """
-        return True
-
-    @ray.method(num_returns=1)
-    def _finalize_shutdown(self) -> None:
-        """
-        Internal Ray method called remotely by the processing thread to safely exit the actor.
-
-        This method runs in the main Ray actor thread context. It acquires a lock
-        to prevent multiple exit attempts and then calls `ray.actor.exit_actor()`
-        to terminate the actor process gracefully.
-
-        Note: Only necessary if running in a detached actor context.
-        """
-
-        actor_id_str = self._get_actor_id_str()
-        with self._lock:
-            if self._shutting_down:
-                return
-
-            self._shutting_down = True
-
-        self._logger.info(f"{actor_id_str}: Executing actor exit process.")
-
-        get_runtime_context().current_actor.request_actor_exit.remote()
-
-    @ray.method(num_returns=1)
-    def request_actor_exit(self) -> None:
-        """
-        Request the actor to exit gracefully.
-
-        This method is called from the main Ray actor thread to ensure a clean
-        shutdown of the actor. It should be called when the processing loop
-        has completed its work and is ready to exit.
-        """
-
-        if self._processing_thread:
-            self._processing_thread.join()
-
-        self._shutdown_signal_complete = True
-
-        self._logger.debug(f"{self._get_actor_id_str()}: Requesting actor exit.")
-        ray.actor.exit_actor()
-
-    @ray.method(num_returns=1)
-    def start(self) -> bool:
+    def _get_memory_usage_mb(self) -> float:
         """
-        Starts the actor's processing loop in a background thread.
-
-        Initializes state, resets statistics, and launches the `_processing_loop`
-        thread. Idempotent: if called while already running, it logs a warning
-        and returns False.
+        Gets the total memory usage of the current actor process (RSS).

         Returns
         -------
-        bool
-            True if the actor was successfully started, False if it was already running.
+        float
+            The memory usage in megabytes (MB).
         """
-        actor_id_str = self._get_actor_id_str()
-        # Prevent starting if already running
-        if self._running:
-            self._logger.warning(f"{actor_id_str}: Start called but actor is already running.")
-            return False
-
-        self._logger.info(f"{actor_id_str}: Starting actor...")
-        # --- Initialize Actor State ---
-        self._running = True
-        self._shutting_down = False  # Reset shutdown flag on start
-        self._shutdown_signal_complete = False
-        self.start_time = time.time()
-
-        # --- Reset Statistics ---
-        self._last_stats_time = self.start_time
-        self._last_processed_count = 0
-
-        # --- Start Background Processing Thread ---
-        self._logger.debug(f"{actor_id_str}: Creating and starting processing thread.")
-        self._processing_thread = threading.Thread(
-            target=self._processing_loop,
-            daemon=False,
-        )
-        self._processing_thread.start()
-
-        self._logger.info(f"{actor_id_str}: Actor started successfully.")
-
-        return True
-
-    @ray.method(num_returns=1)
-    def stop(self) -> ray.ObjectRef:
-        actor_id_str = self._get_actor_id_str()
-        self._logger.info(f"{actor_id_str}: Received external stop request.")
-
-        if self._shutdown_future is not None:
-            self._logger.debug(f"{actor_id_str}: Stop called again, returning existing shutdown future.")
-            return self._shutdown_future
-
-        if not self._running and self._shutdown_signal_complete:  # Check if already fully shutdown
-            self._logger.info(f"{actor_id_str}: Stop called, but actor was already shutdown and signal complete.")
-            if self._shutdown_future:  # Should have been set by the previous shutdown sequence
-                return self._shutdown_future
-            else:  # Should not happen if shutdown_signal_complete is true, but as a fallback
-                self._shutdown_future = self._immediate_true.remote()
-                return self._shutdown_future
-        elif not self._running:  # Was stopped but maybe not fully signaled (e.g. mid-shutdown)
-            self._logger.warning(
-                f"{actor_id_str}: Stop called but actor was not running (or already stopping). "
-                "Will create/return monitor future."
-            )
-            # If _shutdown_future is None here, it means stop wasn't called before OR a previous
-            # monitor didn't get stored. Proceed to create a new monitor.
-            # If it *was* already stopping and _shutdown_future exists, the first `if` catches it.
-
-        # --- Initiate Shutdown signal to internal loop (if still running) ---
-        if self._running:  # Only set self._running = False if it was actually running
-            self._running = False
-            self._logger.info(f"{actor_id_str}: Stop signal sent to processing loop. Shutdown initiated.")
-        else:
-            self._logger.info(
-                f"{actor_id_str}: Actor processing loop was already stopped. Monitoring for final shutdown signal."
-            )
-
-        # --- Spawn shutdown watcher task ---
-        # Get a handle to the current actor instance to pass to the monitor.
-        # This is crucial: the monitor needs to call methods on *this specific actor*.
         try:
-            self_handle = get_runtime_context().current_actor
+            pid = os.getpid()
+            process = psutil.Process(pid)
+            # rss is the Resident Set Size, which is the non-swapped physical memory a process has used.
+            memory_bytes = process.memory_info().rss
+            return memory_bytes / (1024 * 1024)
         except Exception as e:
-            self._logger.error(
-                f"{actor_id_str}: Failed to get current_actor handle for monitoring: {e}. Returning a failing future."
-            )
-
-            # Cannot proceed to monitor, return a future that resolves to False or raises
-            @ray.remote
-            def failed_future():
-                raise RuntimeError("Failed to initiate shutdown monitoring due to missing actor handle.")
-
-            return failed_future.remote()  # Or ray.put(False) directly
-
-        self._shutdown_future = external_monitor_actor_shutdown.remote(self_handle)
-
-        return self._shutdown_future
-
-    @ray.method(num_returns=1)
-    def is_shutdown_complete(self) -> bool:
-        """
-        Checks if the actor's processing loop has finished and signaled completion.
-        Raises RayActorError if the actor process has terminated.
-        """
-        return self._shutdown_signal_complete
-
-    # --- get_stats ---
+            self._logger.warning(f"[{self._actor_id_str}] Could not retrieve process memory usage: {e}")
+            return 0.0

     @ray.method(num_returns=1)
     def get_stats(self) -> Dict[str, Any]:
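Note: the new _get_memory_usage_mb helper reports the actor process's resident set size (RSS) via psutil. The same measurement in isolation (assumes psutil is installed; names are illustrative):

    import os
    import psutil

    def process_rss_mb() -> float:
        """Resident set size (physical memory) of the current process, in megabytes."""
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return rss_bytes / (1024 * 1024)

    print(f"current RSS: {process_rss_mb():.1f} MB")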
@@ -566,7 +430,16 @@ class RayActorStage(ABC):
               second during the last interval.
               Can be zero if no items were
               processed or the interval was too short.
+            - 'memory_mb' (float): The total memory usage of the current actor process (RSS) in megabytes (MB).
         """
+        # If the actor is not running, return the last known stats to ensure this
+        # call is non-blocking during shutdown.
+        if not self._running:
+            stats_copy = self.stats.copy()
+            stats_copy["active_processing"] = False  # It's not active if not running
+            stats_copy["memory_mb"] = self._get_memory_usage_mb()
+            return stats_copy
+
         current_time: float = time.time()
         current_processed: int = self.stats.get("processed", 0)
         is_active: bool = self._active_processing
@@ -605,8 +478,64 @@ class RayActorStage(ABC):
             "queue_full": self.stats.get("queue_full", 0),
             "successful_queue_reads": self.stats.get("successful_queue_reads", 0),
             "successful_queue_writes": self.stats.get("successful_queue_writes", 0),
+            "memory_mb": self._get_memory_usage_mb(),
         }

+    @ray.method(num_returns=1)
+    def start(self) -> bool:
+        """
+        Starts the actor's processing loop in a background thread.
+
+        Initializes state, resets statistics, and launches the `_processing_loop`
+        thread. Idempotent: if called while already running, it logs a warning
+        and returns False.
+
+        Returns
+        -------
+        bool
+            True if the actor was successfully started, False if it was already running.
+        """
+        # Prevent starting if already running
+        if self._running:
+            self._logger.warning(f"{self._actor_id_str}: Start called but actor is already running.")
+            return False
+
+        self._logger.info(f"{self._actor_id_str}: Starting actor...")
+        # --- Initialize Actor State ---
+        self._running = True
+        self._shutting_down = False  # Reset shutdown flag on start
+        self._shutdown_signal_complete = False
+        self.start_time = time.time()
+
+        # --- Reset Statistics ---
+        self._last_stats_time = self.start_time
+        self._last_processed_count = 0
+
+        # --- Start Background Processing Thread ---
+        self._logger.debug(f"{self._actor_id_str}: Creating and starting processing thread.")
+        self._processing_thread = threading.Thread(
+            target=self._processing_loop,
+            daemon=False,
+        )
+        self._processing_thread.start()
+
+        self._logger.info(f"{self._actor_id_str}: Actor started successfully.")
+
+        return True
+
+    @ray.method(num_returns=0)
+    def stop(self) -> None:
+        """Stops the actor's processing loop by setting the running flag to False."""
+        self._logger.info(f"[{self._actor_id_str}] Stop signal received. Initiating graceful shutdown.")
+        self._running = False
+
+    def is_shutdown_complete(self) -> bool:
+        """
+        Checks if the actor's processing loop has finished and signaled completion.
+        Raises RayActorError if the actor process has terminated.
+        """
+        return self._shutdown_signal_complete
+
     @ray.method(num_returns=1)
     def set_input_queue(self, queue_handle: Any) -> bool:
         """
@@ -625,7 +554,7 @@ class RayActorStage(ABC):
         bool
             True indicating the queue was set.
         """
-        self._logger.debug(f"{self._get_actor_id_str()}: Setting input queue.")
+        self._logger.debug(f"{self._actor_id_str}: Setting input queue.")
        self._input_queue = queue_handle
        return True

@@ -647,6 +576,6 @@ class RayActorStage(ABC):
         bool
             True indicating the queue was set.
         """
-        self._logger.debug(f"{self._get_actor_id_str()}: Setting output queue.")
+        self._logger.debug(f"{self._actor_id_str}: Setting output queue.")
         self._output_queue = queue_handle
         return True
@@ -269,8 +269,11 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         self._logger.debug("Received message type: %s", type(job))
         if isinstance(job, BaseModel):
             self._logger.debug("Message is a BaseModel with response_code: %s", job.response_code)
-            if job.response_code != 0:
-                self._logger.debug("Message response_code != 0, returning None")
+            if job.response_code not in (0, 2):
+                self._logger.debug("Message received with unhandled response_code, returning None")
+                return None
+            if job.response_code == 2:
+                self._logger.debug("Message response_code == 2, returning None")
                 return None
             job = json.loads(job.response)
         self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
@@ -338,15 +341,33 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         self._pause_event.wait()  # Block if paused
         self._active_processing = True

-        while True:
-            try:
-                self.output_queue.put(control_message)
-                self.stats["successful_queue_writes"] += 1
-                break
-            except Exception:
-                self._logger.warning("Output queue full, retrying put()...")
-                self.stats["queue_full"] += 1
-                time.sleep(0.1)
+        object_ref_to_put = None
+        try:
+            # Get the handle of the queue actor to set it as the owner.
+            owner_actor = self.output_queue.actor
+
+            # Put the object into Plasma, transferring ownership.
+            object_ref_to_put = ray.put(control_message, _owner=owner_actor)
+
+            # Now that the object is safely in Plasma, delete the large local copy.
+            del control_message
+
+            # This loop will retry indefinitely until the ObjectRef is put successfully.
+            is_put_successful = False
+            while not is_put_successful:
+                try:
+                    self.output_queue.put(object_ref_to_put)
+                    self.stats["successful_queue_writes"] += 1
+                    is_put_successful = True  # Exit retry loop on success
+                except Exception:
+                    self._logger.warning("Output queue full, retrying put()...")
+                    self.stats["queue_full"] += 1
+                    time.sleep(0.1)
+        finally:
+            # After the operation, delete the local ObjectRef.
+            # The primary reference is now held by the queue actor.
+            if object_ref_to_put is not None:
+                del object_ref_to_put

         self.stats["processed"] += 1
         self._message_count += 1
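Note: this hunk and the RayActorStage change above follow the same producer-side pattern: pin the payload in the object store with the queue actor as its owner (via Ray's experimental _owner argument to ray.put), enqueue only the small ObjectRef, and drop local references early. A rough standalone sketch under those assumptions (queue.actor and the retry policy mirror the diff; function and parameter names are illustrative):

    import time
    import ray

    def put_via_object_store(queue, payload, logger) -> None:
        """Store payload in Plasma owned by the queue actor, then enqueue its ObjectRef."""
        ref = ray.put(payload, _owner=queue.actor)  # Ownership transfers to the queue actor
        del payload                                 # Release the large local copy early
        try:
            while True:                             # Retry until the small ObjectRef is accepted
                try:
                    queue.put(ref)
                    return
                except Exception:
                    logger.warning("Output queue busy or full, retrying put()...")
                    time.sleep(0.1)
        finally:
            del ref                                 # The queue actor now holds the primary reference

Decoupling the object's lifetime from the producing actor is what lets downstream stages resolve the ObjectRef even after the producer exits.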