nv-ingest 25.7.7.dev20250707__py3-none-any.whl → 25.8.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,6 +26,7 @@ class RayStatsCollector:
         interval: float = 30.0,
         actor_timeout: float = 5.0,
         queue_timeout: float = 2.0,
+        ema_alpha: float = 0.1,  # Alpha for EMA memory cost calculation
     ):
         """
         Initializes the RayStatsCollector.
@@ -45,6 +46,9 @@ class RayStatsCollector:
             Timeout in seconds for waiting for stats from a single actor, by default 5.0.
         queue_timeout : float, optional
             Timeout in seconds for waiting for qsize from a single queue, by default 2.0.
+        ema_alpha : float, optional
+            The smoothing factor for the Exponential Moving Average (EMA)
+            calculation of memory cost. Defaults to 0.1.
         """
         if not ray:
             logger.warning("RayStatsCollector initialized but Ray is not available.")
@@ -53,6 +57,7 @@ class RayStatsCollector:
         self._interval = interval
         self._actor_timeout = actor_timeout
         self._queue_timeout = queue_timeout
+        self.ema_alpha = ema_alpha

         self._lock: threading.Lock = threading.Lock()  # Protects access to collected stats and status
         self._running: bool = False
@@ -65,10 +70,12 @@ class RayStatsCollector:
         self._last_update_successful: bool = False

         self._cumulative_stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {"processed": 0})
+        self.ema_memory_per_replica: Dict[str, float] = {}  # EMA of memory per replica

         logger.info(
             f"RayStatsCollector initialized (Interval: {self._interval}s, "
-            f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s)"
+            f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s, "
+            f"EMA Alpha: {self.ema_alpha})"
         )

     # --- Helper function to be run in threads ---
@@ -243,6 +250,7 @@ class RayStatsCollector:
         stage_stats_updates: Dict[str, Dict[str, int]] = {}
         actor_tasks: Dict[ray.ObjectRef, Tuple[Any, str]] = {}
         queue_sizes: Dict[str, int] = {}
+        stage_memory_samples: Dict[str, list[float]] = defaultdict(list)

         try:
             current_stages = self._pipeline.get_stages_info()
@@ -257,7 +265,7 @@ class RayStatsCollector:
             # --- 1. Prepare Actor Stat Requests ---
             for stage_info in current_stages:
                 stage_name = stage_info.name
-                stage_stats_updates[stage_name] = {"processing": 0, "in_flight": 0}
+                stage_stats_updates[stage_name] = {"processing": 0, "in_flight": 0, "memory_mb": 0}

                 if stage_info.pending_shutdown:
                     logger.debug(f"[StatsCollectNow] Stage '{stage_name}' pending shutdown. Skipping actor queries.")
@@ -302,6 +310,8 @@ class RayStatsCollector:
                     stats = ray.get(ref)
                     active = int(stats.get("active_processing", 0))
                     delta = int(stats.get("delta_processed", 0))
+                    memory_mb = float(stats.get("memory_mb", 0.0))
+
                     processed = stage_stats_updates[stage_name].get("processed", 0)
                     processing = stage_stats_updates[stage_name].get("processing", 0)
                     stage_stats_updates[stage_name]["processing"] = processing + active
@@ -309,6 +319,7 @@ class RayStatsCollector:
                     stage_stats_updates[stage_name]["delta_processed"] = (
                         stage_stats_updates[stage_name].get("delta_processed", 0) + delta
                     )
+                    stage_memory_samples[stage_name].append(memory_mb)

                 except Exception as e:
                     logger.warning(
@@ -324,7 +335,23 @@ class RayStatsCollector:
             logger.error(f"[StatsCollectNow] Error during actor stats collection: {e}", exc_info=True)
             overall_success = False

-        # --- 4. Aggregate In-Flight Stats ---
+        # --- 4. Aggregate Memory and Update EMA ---
+        for stage_name, samples in stage_memory_samples.items():
+            if not samples:
+                continue
+
+            total_memory = sum(samples)
+            num_replicas = len(samples)
+            current_memory_per_replica = total_memory / num_replicas
+            stage_stats_updates[stage_name]["memory_mb"] = total_memory
+
+            # Update EMA
+            current_ema = self.ema_memory_per_replica.get(stage_name, current_memory_per_replica)
+            new_ema = (self.ema_alpha * current_memory_per_replica) + ((1 - self.ema_alpha) * current_ema)
+            self.ema_memory_per_replica[stage_name] = new_ema
+            stage_stats_updates[stage_name]["ema_memory_per_replica"] = new_ema
+
+        # --- 5. Aggregate In-Flight Stats ---
         _total_inflight = 0
         for stage_info in current_stages:
             stage_name = stage_info.name
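Note: the memory aggregation in the hunk above is standard exponential smoothing applied per stage. A minimal standalone sketch of that update rule (hypothetical helper, not part of the package):

    from typing import Optional

    def update_ema(prev_ema: Optional[float], sample: float, alpha: float = 0.1) -> float:
        """Seed the EMA with the first sample, then blend each new sample in with weight alpha."""
        if prev_ema is None:
            return sample
        return alpha * sample + (1 - alpha) * prev_ema

    # Example: a stage's replicas average 512 MB, then 600 MB on the next collection pass.
    ema = update_ema(None, 512.0)       # 512.0 (seeded with the first observation)
    ema = update_ema(ema, 600.0, 0.1)   # 0.1 * 600 + 0.9 * 512 = 520.8

With alpha = 0.1 the estimate reacts slowly to spikes, which is the point of smoothing the per-replica memory cost.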
@@ -7,6 +7,8 @@ import threading
 import time
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Optional
+import os
+import psutil

 import ray
 import ray.actor
@@ -29,49 +31,6 @@ def setup_stdout_logging(name: str = __name__, level: int = logging.INFO) -> log
     return logger


-@ray.remote
-def external_monitor_actor_shutdown(actor_handle: "RayActorStage", poll_interval: float = 0.1) -> bool:
-    """
-    Polls the provided actor's `is_shutdown_complete` method until it returns True
-    or the actor becomes unreachable.
-    """
-    logger = setup_stdout_logging("_external_monitor_actor_shutdown")  # Optional: for monitor's own logs
-
-    if actor_handle is None:
-        logger.error("Received null actor_handle. Cannot monitor shutdown.")
-        return False  # Or raise error
-
-    actor_id_to_monitor = None
-    try:
-        # Try to get a string representation for logging, might fail if already gone
-        actor_id_to_monitor = str(actor_handle)  # Basic representation
-    except Exception:
-        actor_id_to_monitor = "unknown_actor"
-
-    logger.debug(f"Monitoring shutdown for actor: {actor_id_to_monitor}")
-
-    while True:
-        try:
-            # Remotely call the actor's method
-            if ray.get(actor_handle.is_shutdown_complete.remote()):
-                logger.debug(f"Actor {actor_id_to_monitor} reported shutdown complete.")
-                actor_handle.request_actor_exit.remote()
-
-                return True
-        except ray.exceptions.RayActorError:
-            # Actor has died or is otherwise unreachable.
-            # Consider this as shutdown complete for the purpose of the future.
-            logger.warning(f"Actor {actor_id_to_monitor} became unreachable (RayActorError). Assuming shutdown.")
-            return True
-        except Exception as e:
-            # Catch other potential errors during the remote call
-            logger.error(f"Unexpected error while polling shutdown status for {actor_id_to_monitor}: {e}")
-            # Depending on policy, either continue polling or assume failure
-            return True  # Or True if any exit is "shutdown"
-
-        time.sleep(poll_interval)
-
-
 class RayActorStage(ABC):
     """
     Abstract base class for a stateful Ray actor stage in a processing pipeline.
@@ -163,12 +122,13 @@ class RayActorStage(ABC):
         # Lock specifically for coordinating the final shutdown sequence (_request_actor_exit)
         self._lock = threading.Lock()
         self._shutdown_signal_complete = False  # Initialize flag
-        self._shutdown_future: Optional[ray.ObjectRef] = None

         # --- Logging ---
         # Ray won't propagate logging to the root logger by default, so we set up a custom logger for debugging
         self._logger = setup_stdout_logging(self.__class__.__name__) if log_to_stdout else logging.getLogger(__name__)

+        self._actor_id_str = self._get_actor_id_str()
+
     @staticmethod
     def _get_actor_id_str() -> str:
         """
@@ -215,19 +175,36 @@ class RayActorStage(ABC):
         if self._input_queue is None:
             # This check should ideally not fail if start() is called after setup
             if self._running:
-                self._logger.error(f"{self._get_actor_id_str()}: Input queue not set while running")
+                self._logger.error(f"{self._actor_id_str}: Input queue not set while running")
                 # Indicate a programming error - queue should be set before starting
                 raise ValueError("Input queue not set while running")
             return None  # Should not happen if self._running is False, but defensive check

+        item: Optional[Any] = None
         try:
-            # Perform a non-blocking or short-blocking read from the queue
-            # The timeout allows the loop to check self._running periodically
-            return self._input_queue.get(timeout=1.0)
+            item = self._input_queue.get(timeout=1.0)
+
+            if item is None:
+                return None
+
+            if isinstance(item, ray.ObjectRef):
+                try:
+                    deserialized_object = ray.get(item)
+                except ray.exceptions.ObjectLostError:
+                    self._logger.error(
+                        f"[{self._actor_id_str}] Failed to retrieve object from Ray object store. "
+                        f"It has been lost and cannot be recovered."
+                    )
+                    raise  # Re-raise the exception to be handled by the processing loop
+
+                del item
+                return deserialized_object
+
+            return item
+
         except Exception:
-            # Common exceptions include queue.Empty in older Ray versions or
-            # custom queue implementations raising timeout errors.
-            # Return None to signify no item was retrieved this cycle.
+            if item is not None and isinstance(item, ray.ObjectRef):
+                del item
             return None

     @abstractmethod
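Note: the new read path above accepts either a raw payload or a ray.ObjectRef and dereferences the latter before returning it. A simplified sketch of that consumer-side pattern, without the stage's logging and stats bookkeeping (assumes a live Ray runtime; names are illustrative):

    import ray

    def resolve_queue_item(item):
        """Return the payload, dereferencing it first if the queue delivered an ObjectRef."""
        if item is None:
            return None
        if isinstance(item, ray.ObjectRef):
            # May raise ray.exceptions.ObjectLostError if the object was lost or evicted;
            # the caller decides whether to skip the message or fail.
            return ray.get(item)
        return item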
@@ -290,7 +267,7 @@ class RayActorStage(ABC):
           read from the input queue.
         - `errors`: Incremented if `on_data` returns `None` or if an
           exception occurs during `on_data` or output queuing.
-        - `processed`: Incremented after processing a control message
+        - `processed`: Incremented after successful processing and output (if any).
         - `successful_queue_writes`: Incremented when an item is successfully
           put onto the output queue.
         - `queue_full`: Incremented when an attempt to put to the output
@@ -305,8 +282,7 @@ class RayActorStage(ABC):
         - Thread safety for `self.stats` relies on the GIL for simple
           increment operations
         """
-        actor_id_str = self._get_actor_id_str()
-        self._logger.debug(f"{actor_id_str}: Processing loop thread starting.")
+        self._logger.debug(f"{self._actor_id_str}: Processing loop thread starting.")

         try:
             while self._running:
@@ -328,38 +304,58 @@ class RayActorStage(ABC):
                     self._active_processing = True

                     # Step 2: Process the retrieved message using subclass-specific logic.
-                    updated_cm: Optional[Any] = self.on_data(control_message)
+                    updated_cm = self.on_data(control_message)

                     # If there's a valid result and an output queue is configured, attempt to put.
-                    if self._output_queue is not None:
-                        # This loop will retry indefinitely until the item is put successfully
-                        # or an unrecoverable error occurs (which is not explicitly handled to break here).
-                        # TODO(Devin) -- This can be improved, should probably fail at some point?
-                        # Consider max retries or specific error handling for RayActorError
-                        # to prevent indefinite blocking if the queue actor is permanently dead.
-                        is_put_successful = False
-                        while not is_put_successful:  # Renamed loop variable for clarity
-                            try:
-                                self._output_queue.put(updated_cm)
-                                self.stats["successful_queue_writes"] += 1
-                                is_put_successful = True  # Exit retry loop on success
-                            except Exception as e_put:  # Broad exception catch for put failures
-                                self._logger.warning(
-                                    f"[{actor_id_str}] Output queue put failed (e.g., full, "
-                                    f"timeout, or actor error), retrying. Error: {e_put}"
-                                )
-                                self.stats["queue_full"] += 1  # Consider renaming if it catches more than "full"
-                                time.sleep(0.1)  # Brief pause before retrying
+                    if self._output_queue is not None and updated_cm is not None:
+                        object_ref_to_put = None  # Ensure var exists for the finally block
+                        try:
+                            # Get the handle of the queue actor to set it as the owner.
+                            # This decouples the object's lifetime from this actor.
+                            owner_actor = self._output_queue.actor
+
+                            # Put the object into Plasma, transferring ownership.
+                            object_ref_to_put = ray.put(updated_cm, _owner=owner_actor)
+
+                            # Now that the object is safely in Plasma, we can delete the large local copy.
+                            del updated_cm
+
+                            # This loop will retry until the ObjectRef is put successfully or shutdown is initiated.
+                            is_put_successful = False
+                            while not is_put_successful:
+                                try:
+                                    self._output_queue.put(object_ref_to_put)
+                                    self.stats["successful_queue_writes"] += 1
+                                    is_put_successful = True  # Exit retry loop on success
+                                except Exception as e_put:
+                                    self._logger.warning(
+                                        f"[{self._actor_id_str}] Output queue put failed (e.g., full, "
+                                        f"timeout, or actor error), retrying. Error: {e_put}"
+                                    )
+                                    self.stats["queue_full"] += 1
+                                    time.sleep(0.1)  # Brief pause before retrying
+                        finally:
+                            # After the operation, delete the local ObjectRef.
+                            # The primary reference is now held by the queue actor.
+                            if object_ref_to_put is not None:
+                                del object_ref_to_put

                     # Step 3: Increment "processed" count after successful processing and output (if any).
                     # This is the primary path for "successful processing".
                     self.stats["processed"] += 1

+                except ray.exceptions.ObjectLostError:
+                    # This error is handled inside the loop to prevent the actor from crashing.
+                    # We log it and continue to the next message.
+                    self._logger.error(f"[{self._actor_id_str}] CRITICAL: An object was lost in transit. Skipping.")
+                    # In a real-world scenario, you might want to increment a metric for monitoring.
+                    continue
+
                 except Exception as e_item_processing:
                     # Catch exceptions from on_data() or unexpected issues in the item handling block.
                     cm_info_str = f" (message type: {type(control_message).__name__})" if control_message else ""
                     self._logger.exception(
-                        f"[{actor_id_str}] Error during processing of item{cm_info_str}: {e_item_processing}"
+                        f"[{self._actor_id_str}] Error during processing of item{cm_info_str}: {e_item_processing}"
                     )
                     self.stats["errors"] += 1

@@ -370,180 +366,48 @@ class RayActorStage(ABC):
                     # Ensure _active_processing is reset after each item attempt (success, failure, or no item).
                     self._active_processing = False

+                    # Explicitly delete the reference to the control message to aid garbage collection.
+                    # This is important for large messages, as it helps release memory and ObjectRefs sooner.
+                    if control_message is not None:
+                        del control_message
+
             # --- Loop Exit Condition Met ---
             # This point is reached when self._running becomes False.
-            self._logger.debug(f"[{actor_id_str}] Graceful exit: self._running is False. Processing loop terminating.")
+            self._logger.debug(
+                f"[{self._actor_id_str}] Graceful exit: self._running is False. Processing loop terminating."
+            )

         except Exception as e_outer_loop:
             # Catches very unexpected errors in the structure of the while loop itself.
             self._logger.exception(
-                f"[{actor_id_str}] Unexpected critical error caused processing loop termination: {e_outer_loop}"
+                f"[{self._actor_id_str}] Unexpected critical error caused processing loop termination: {e_outer_loop}"
             )
         finally:
             # This block executes when the processing thread is about to exit,
             # either due to self._running becoming False or an unhandled critical exception.
-            self._logger.debug(f"[{actor_id_str}] Processing loop thread finished.")
+            self._logger.debug(f"[{self._actor_id_str}] Processing loop thread finished.")
             # Signal that this actor's processing duties are complete.
             # External monitors (e.g., via a future from stop()) can use this signal.
             self._shutdown_signal_complete = True

-    @staticmethod
-    @ray.remote
-    def _immediate_true() -> bool:
-        """
-        A tiny remote method that immediately returns True.
-        Used to create a resolved ObjectRef when shutdown is already complete.
-        """
-        return True
-
-    @ray.method(num_returns=1)
-    def _finalize_shutdown(self) -> None:
-        """
-        Internal Ray method called remotely by the processing thread to safely exit the actor.
-
-        This method runs in the main Ray actor thread context. It acquires a lock
-        to prevent multiple exit attempts and then calls `ray.actor.exit_actor()`
-        to terminate the actor process gracefully.
-
-        Note: Only necessary if running in a detached actor context.
-        """
-
-        actor_id_str = self._get_actor_id_str()
-        with self._lock:
-            if self._shutting_down:
-                return
-
-            self._shutting_down = True
-
-        self._logger.info(f"{actor_id_str}: Executing actor exit process.")
-
-        get_runtime_context().current_actor.request_actor_exit.remote()
-
-    @ray.method(num_returns=1)
-    def request_actor_exit(self) -> None:
-        """
-        Request the actor to exit gracefully.
-
-        This method is called from the main Ray actor thread to ensure a clean
-        shutdown of the actor. It should be called when the processing loop
-        has completed its work and is ready to exit.
-        """
-
-        if self._processing_thread:
-            self._processing_thread.join()
-
-        self._shutdown_signal_complete = True
-
-        self._logger.debug(f"{self._get_actor_id_str()}: Requesting actor exit.")
-        ray.actor.exit_actor()
-
-    @ray.method(num_returns=1)
-    def start(self) -> bool:
+    def _get_memory_usage_mb(self) -> float:
         """
-        Starts the actor's processing loop in a background thread.
-
-        Initializes state, resets statistics, and launches the `_processing_loop`
-        thread. Idempotent: if called while already running, it logs a warning
-        and returns False.
+        Gets the total memory usage of the current actor process (RSS).

         Returns
         -------
-        bool
-            True if the actor was successfully started, False if it was already running.
+        float
+            The memory usage in megabytes (MB).
         """
-        actor_id_str = self._get_actor_id_str()
-        # Prevent starting if already running
-        if self._running:
-            self._logger.warning(f"{actor_id_str}: Start called but actor is already running.")
-            return False
-
-        self._logger.info(f"{actor_id_str}: Starting actor...")
-        # --- Initialize Actor State ---
-        self._running = True
-        self._shutting_down = False  # Reset shutdown flag on start
-        self._shutdown_signal_complete = False
-        self.start_time = time.time()
-
-        # --- Reset Statistics ---
-        self._last_stats_time = self.start_time
-        self._last_processed_count = 0
-
-        # --- Start Background Processing Thread ---
-        self._logger.debug(f"{actor_id_str}: Creating and starting processing thread.")
-        self._processing_thread = threading.Thread(
-            target=self._processing_loop,
-            daemon=False,
-        )
-        self._processing_thread.start()
-
-        self._logger.info(f"{actor_id_str}: Actor started successfully.")
-
-        return True
-
-    @ray.method(num_returns=1)
-    def stop(self) -> ray.ObjectRef:
-        actor_id_str = self._get_actor_id_str()
-        self._logger.info(f"{actor_id_str}: Received external stop request.")
-
-        if self._shutdown_future is not None:
-            self._logger.debug(f"{actor_id_str}: Stop called again, returning existing shutdown future.")
-            return self._shutdown_future
-
-        if not self._running and self._shutdown_signal_complete:  # Check if already fully shutdown
-            self._logger.info(f"{actor_id_str}: Stop called, but actor was already shutdown and signal complete.")
-            if self._shutdown_future:  # Should have been set by the previous shutdown sequence
-                return self._shutdown_future
-            else:  # Should not happen if shutdown_signal_complete is true, but as a fallback
-                self._shutdown_future = self._immediate_true.remote()
-                return self._shutdown_future
-        elif not self._running:  # Was stopped but maybe not fully signaled (e.g. mid-shutdown)
-            self._logger.warning(
-                f"{actor_id_str}: Stop called but actor was not running (or already stopping). "
-                "Will create/return monitor future."
-            )
-            # If _shutdown_future is None here, it means stop wasn't called before OR a previous
-            # monitor didn't get stored. Proceed to create a new monitor.
-            # If it *was* already stopping and _shutdown_future exists, the first `if` catches it.
-
-        # --- Initiate Shutdown signal to internal loop (if still running) ---
-        if self._running:  # Only set self._running = False if it was actually running
-            self._running = False
-            self._logger.info(f"{actor_id_str}: Stop signal sent to processing loop. Shutdown initiated.")
-        else:
-            self._logger.info(
-                f"{actor_id_str}: Actor processing loop was already stopped. Monitoring for final shutdown signal."
-            )
-
-        # --- Spawn shutdown watcher task ---
-        # Get a handle to the current actor instance to pass to the monitor.
-        # This is crucial: the monitor needs to call methods on *this specific actor*.
         try:
-            self_handle = get_runtime_context().current_actor
+            pid = os.getpid()
+            process = psutil.Process(pid)
+            # rss is the Resident Set Size, which is the non-swapped physical memory a process has used.
+            memory_bytes = process.memory_info().rss
+            return memory_bytes / (1024 * 1024)
         except Exception as e:
-            self._logger.error(
-                f"{actor_id_str}: Failed to get current_actor handle for monitoring: {e}. Returning a failing future."
-            )
-
-            # Cannot proceed to monitor, return a future that resolves to False or raises
-            @ray.remote
-            def failed_future():
-                raise RuntimeError("Failed to initiate shutdown monitoring due to missing actor handle.")
-
-            return failed_future.remote()  # Or ray.put(False) directly
-
-        self._shutdown_future = external_monitor_actor_shutdown.remote(self_handle)
-
-        return self._shutdown_future
-
-    @ray.method(num_returns=1)
-    def is_shutdown_complete(self) -> bool:
-        """
-        Checks if the actor's processing loop has finished and signaled completion.
-        Raises RayActorError if the actor process has terminated.
-        """
-        return self._shutdown_signal_complete
-
-    # --- get_stats ---
+            self._logger.warning(f"[{self._actor_id_str}] Could not retrieve process memory usage: {e}")
+            return 0.0

     @ray.method(num_returns=1)
     def get_stats(self) -> Dict[str, Any]:
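Note: the new _get_memory_usage_mb helper reports the actor process's resident set size (RSS) via psutil. The same measurement in isolation (assumes psutil is installed; names are illustrative):

    import os
    import psutil

    def process_rss_mb() -> float:
        """Resident set size (physical memory) of the current process, in megabytes."""
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return rss_bytes / (1024 * 1024)

    print(f"current RSS: {process_rss_mb():.1f} MB")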
@@ -566,7 +430,16 @@ class RayActorStage(ABC):
               second during the last interval.
               Can be zero if no items were
               processed or the interval was too short.
+            - 'memory_mb' (float): The total memory usage of the current actor process (RSS) in megabytes (MB).
         """
+        # If the actor is not running, return the last known stats to ensure this
+        # call is non-blocking during shutdown.
+        if not self._running:
+            stats_copy = self.stats.copy()
+            stats_copy["active_processing"] = False  # It's not active if not running
+            stats_copy["memory_mb"] = self._get_memory_usage_mb()
+            return stats_copy
+
         current_time: float = time.time()
         current_processed: int = self.stats.get("processed", 0)
         is_active: bool = self._active_processing
@@ -605,8 +478,64 @@ class RayActorStage(ABC):
             "queue_full": self.stats.get("queue_full", 0),
             "successful_queue_reads": self.stats.get("successful_queue_reads", 0),
             "successful_queue_writes": self.stats.get("successful_queue_writes", 0),
+            "memory_mb": self._get_memory_usage_mb(),
         }

+    @ray.method(num_returns=1)
+    def start(self) -> bool:
+        """
+        Starts the actor's processing loop in a background thread.
+
+        Initializes state, resets statistics, and launches the `_processing_loop`
+        thread. Idempotent: if called while already running, it logs a warning
+        and returns False.
+
+        Returns
+        -------
+        bool
+            True if the actor was successfully started, False if it was already running.
+        """
+        # Prevent starting if already running
+        if self._running:
+            self._logger.warning(f"{self._actor_id_str}: Start called but actor is already running.")
+            return False
+
+        self._logger.info(f"{self._actor_id_str}: Starting actor...")
+        # --- Initialize Actor State ---
+        self._running = True
+        self._shutting_down = False  # Reset shutdown flag on start
+        self._shutdown_signal_complete = False
+        self.start_time = time.time()
+
+        # --- Reset Statistics ---
+        self._last_stats_time = self.start_time
+        self._last_processed_count = 0
+
+        # --- Start Background Processing Thread ---
+        self._logger.debug(f"{self._actor_id_str}: Creating and starting processing thread.")
+        self._processing_thread = threading.Thread(
+            target=self._processing_loop,
+            daemon=False,
+        )
+        self._processing_thread.start()
+
+        self._logger.info(f"{self._actor_id_str}: Actor started successfully.")
+
+        return True
+
+    @ray.method(num_returns=0)
+    def stop(self) -> None:
+        """Stops the actor's processing loop by setting the running flag to False."""
+        self._logger.info(f"[{self._actor_id_str}] Stop signal received. Initiating graceful shutdown.")
+        self._running = False
+
+    def is_shutdown_complete(self) -> bool:
+        """
+        Checks if the actor's processing loop has finished and signaled completion.
+        Raises RayActorError if the actor process has terminated.
+        """
+        return self._shutdown_signal_complete
+
     @ray.method(num_returns=1)
     def set_input_queue(self, queue_handle: Any) -> bool:
         """
@@ -625,7 +554,7 @@ class RayActorStage(ABC):
         bool
             True indicating the queue was set.
         """
-        self._logger.debug(f"{self._get_actor_id_str()}: Setting input queue.")
+        self._logger.debug(f"{self._actor_id_str}: Setting input queue.")
        self._input_queue = queue_handle
        return True

@@ -647,6 +576,6 @@ class RayActorStage(ABC):
         bool
             True indicating the queue was set.
         """
-        self._logger.debug(f"{self._get_actor_id_str()}: Setting output queue.")
+        self._logger.debug(f"{self._actor_id_str}: Setting output queue.")
         self._output_queue = queue_handle
         return True
@@ -269,8 +269,11 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         self._logger.debug("Received message type: %s", type(job))
         if isinstance(job, BaseModel):
             self._logger.debug("Message is a BaseModel with response_code: %s", job.response_code)
-            if job.response_code != 0:
-                self._logger.debug("Message response_code != 0, returning None")
+            if job.response_code not in (0, 2):
+                self._logger.debug("Message received with unhandled response_code, returning None")
+                return None
+            if job.response_code == 2:
+                self._logger.debug("Message response_code == 2, returning None")
                 return None
             job = json.loads(job.response)
         self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
@@ -338,15 +341,33 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         self._pause_event.wait()  # Block if paused
         self._active_processing = True

-        while True:
-            try:
-                self.output_queue.put(control_message)
-                self.stats["successful_queue_writes"] += 1
-                break
-            except Exception:
-                self._logger.warning("Output queue full, retrying put()...")
-                self.stats["queue_full"] += 1
-                time.sleep(0.1)
+        object_ref_to_put = None
+        try:
+            # Get the handle of the queue actor to set it as the owner.
+            owner_actor = self.output_queue.actor
+
+            # Put the object into Plasma, transferring ownership.
+            object_ref_to_put = ray.put(control_message, _owner=owner_actor)
+
+            # Now that the object is safely in Plasma, delete the large local copy.
+            del control_message
+
+            # This loop will retry indefinitely until the ObjectRef is put successfully.
+            is_put_successful = False
+            while not is_put_successful:
+                try:
+                    self.output_queue.put(object_ref_to_put)
+                    self.stats["successful_queue_writes"] += 1
+                    is_put_successful = True  # Exit retry loop on success
+                except Exception:
+                    self._logger.warning("Output queue full, retrying put()...")
+                    self.stats["queue_full"] += 1
+                    time.sleep(0.1)
+        finally:
+            # After the operation, delete the local ObjectRef.
+            # The primary reference is now held by the queue actor.
+            if object_ref_to_put is not None:
+                del object_ref_to_put

         self.stats["processed"] += 1
         self._message_count += 1
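Note: this hunk and the RayActorStage change above follow the same producer-side pattern: pin the payload in the object store with the queue actor as its owner (via Ray's experimental _owner argument to ray.put), enqueue only the small ObjectRef, and drop local references early. A rough standalone sketch under those assumptions (queue.actor and the retry policy mirror the diff; function and parameter names are illustrative):

    import time
    import ray

    def put_via_object_store(queue, payload, logger) -> None:
        """Store payload in Plasma owned by the queue actor, then enqueue its ObjectRef."""
        ref = ray.put(payload, _owner=queue.actor)  # Ownership transfers to the queue actor
        del payload                                 # Release the large local copy early
        try:
            while True:                             # Retry until the small ObjectRef is accepted
                try:
                    queue.put(ref)
                    return
                except Exception:
                    logger.warning("Output queue busy or full, retrying put()...")
                    time.sleep(0.1)
        finally:
            del ref                                 # The queue actor now holds the primary reference

Decoupling the object's lifetime from the producing actor is what lets downstream stages resolve the ObjectRef even after the producer exits.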