nv-ingest 25.7.6.dev20250706__py3-none-any.whl → 25.8.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/v1/health.py +1 -1
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +10 -9
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +65 -303
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +438 -163
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +30 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +159 -230
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +32 -11
- nv_ingest/framework/orchestration/ray/util/env_config.py +75 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +7 -72
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +26 -13
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +55 -28
- {nv_ingest-25.7.6.dev20250706.dist-info → nv_ingest-25.8.0rc1.dist-info}/METADATA +2 -5
- {nv_ingest-25.7.6.dev20250706.dist-info → nv_ingest-25.8.0rc1.dist-info}/RECORD +16 -15
- {nv_ingest-25.7.6.dev20250706.dist-info → nv_ingest-25.8.0rc1.dist-info}/WHEEL +0 -0
- {nv_ingest-25.7.6.dev20250706.dist-info → nv_ingest-25.8.0rc1.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-25.7.6.dev20250706.dist-info → nv_ingest-25.8.0rc1.dist-info}/top_level.txt +0 -0
nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py

```diff
@@ -26,6 +26,7 @@ class RayStatsCollector:
         interval: float = 30.0,
         actor_timeout: float = 5.0,
         queue_timeout: float = 2.0,
+        ema_alpha: float = 0.1,  # Alpha for EMA memory cost calculation
     ):
         """
         Initializes the RayStatsCollector.
@@ -45,6 +46,9 @@ class RayStatsCollector:
             Timeout in seconds for waiting for stats from a single actor, by default 5.0.
         queue_timeout : float, optional
             Timeout in seconds for waiting for qsize from a single queue, by default 2.0.
+        ema_alpha : float, optional
+            The smoothing factor for the Exponential Moving Average (EMA)
+            calculation of memory cost. Defaults to 0.1.
         """
         if not ray:
             logger.warning("RayStatsCollector initialized but Ray is not available.")
@@ -53,6 +57,7 @@ class RayStatsCollector:
         self._interval = interval
         self._actor_timeout = actor_timeout
         self._queue_timeout = queue_timeout
+        self.ema_alpha = ema_alpha

         self._lock: threading.Lock = threading.Lock()  # Protects access to collected stats and status
         self._running: bool = False
@@ -65,10 +70,12 @@ class RayStatsCollector:
         self._last_update_successful: bool = False

         self._cumulative_stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {"processed": 0})
+        self.ema_memory_per_replica: Dict[str, float] = {}  # EMA of memory per replica

         logger.info(
             f"RayStatsCollector initialized (Interval: {self._interval}s, "
-            f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s
+            f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s, "
+            f"EMA Alpha: {self.ema_alpha})"
         )

         # --- Helper function to be run in threads ---
@@ -243,6 +250,7 @@ class RayStatsCollector:
         stage_stats_updates: Dict[str, Dict[str, int]] = {}
         actor_tasks: Dict[ray.ObjectRef, Tuple[Any, str]] = {}
         queue_sizes: Dict[str, int] = {}
+        stage_memory_samples: Dict[str, list[float]] = defaultdict(list)

         try:
             current_stages = self._pipeline.get_stages_info()
@@ -257,7 +265,7 @@ class RayStatsCollector:
             # --- 1. Prepare Actor Stat Requests ---
             for stage_info in current_stages:
                 stage_name = stage_info.name
-                stage_stats_updates[stage_name] = {"processing": 0, "in_flight": 0}
+                stage_stats_updates[stage_name] = {"processing": 0, "in_flight": 0, "memory_mb": 0}

                 if stage_info.pending_shutdown:
                     logger.debug(f"[StatsCollectNow] Stage '{stage_name}' pending shutdown. Skipping actor queries.")
@@ -302,6 +310,8 @@ class RayStatsCollector:
                 stats = ray.get(ref)
                 active = int(stats.get("active_processing", 0))
                 delta = int(stats.get("delta_processed", 0))
+                memory_mb = float(stats.get("memory_mb", 0.0))
+
                 processed = stage_stats_updates[stage_name].get("processed", 0)
                 processing = stage_stats_updates[stage_name].get("processing", 0)
                 stage_stats_updates[stage_name]["processing"] = processing + active
@@ -309,6 +319,7 @@ class RayStatsCollector:
                 stage_stats_updates[stage_name]["delta_processed"] = (
                     stage_stats_updates[stage_name].get("delta_processed", 0) + delta
                 )
+                stage_memory_samples[stage_name].append(memory_mb)

             except Exception as e:
                 logger.warning(
@@ -324,7 +335,23 @@ class RayStatsCollector:
             logger.error(f"[StatsCollectNow] Error during actor stats collection: {e}", exc_info=True)
             overall_success = False

-        # --- 4. Aggregate
+        # --- 4. Aggregate Memory and Update EMA ---
+        for stage_name, samples in stage_memory_samples.items():
+            if not samples:
+                continue
+
+            total_memory = sum(samples)
+            num_replicas = len(samples)
+            current_memory_per_replica = total_memory / num_replicas
+            stage_stats_updates[stage_name]["memory_mb"] = total_memory
+
+            # Update EMA
+            current_ema = self.ema_memory_per_replica.get(stage_name, current_memory_per_replica)
+            new_ema = (self.ema_alpha * current_memory_per_replica) + ((1 - self.ema_alpha) * current_ema)
+            self.ema_memory_per_replica[stage_name] = new_ema
+            stage_stats_updates[stage_name]["ema_memory_per_replica"] = new_ema
+
+        # --- 5. Aggregate In-Flight Stats ---
         _total_inflight = 0
         for stage_info in current_stages:
             stage_name = stage_info.name
```
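The hunk above adds exponential-moving-average smoothing of per-replica memory per stage. The following is a minimal standalone sketch of that smoothing step; the stage name and sample values are illustrative and not taken from the package:

```python
from typing import Dict, List


def update_ema(samples_by_stage: Dict[str, List[float]],
               ema_by_stage: Dict[str, float],
               alpha: float = 0.1) -> Dict[str, float]:
    """Fold one collection cycle of per-replica memory samples into a running EMA."""
    for stage, samples in samples_by_stage.items():
        if not samples:
            continue
        per_replica = sum(samples) / len(samples)
        # Seed the EMA with the first observation, then smooth subsequent cycles.
        previous = ema_by_stage.get(stage, per_replica)
        ema_by_stage[stage] = alpha * per_replica + (1 - alpha) * previous
    return ema_by_stage


ema: Dict[str, float] = {}
update_ema({"pdf_extractor": [512.0, 498.0]}, ema)  # seeds the EMA at 505.0
update_ema({"pdf_extractor": [650.0, 630.0]}, ema)  # smooths toward 640.0
print(ema)  # {'pdf_extractor': 518.5}
```

A small alpha (0.1 by default) makes the per-replica memory estimate resistant to transient spikes while still tracking sustained growth.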
nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py

```diff
@@ -7,6 +7,8 @@ import threading
 import time
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Optional
+import os
+import psutil

 import ray
 import ray.actor
@@ -29,49 +31,6 @@ def setup_stdout_logging(name: str = __name__, level: int = logging.INFO) -> log
     return logger


-@ray.remote
-def external_monitor_actor_shutdown(actor_handle: "RayActorStage", poll_interval: float = 0.1) -> bool:
-    """
-    Polls the provided actor's `is_shutdown_complete` method until it returns True
-    or the actor becomes unreachable.
-    """
-    logger = setup_stdout_logging("_external_monitor_actor_shutdown")  # Optional: for monitor's own logs
-
-    if actor_handle is None:
-        logger.error("Received null actor_handle. Cannot monitor shutdown.")
-        return False  # Or raise error
-
-    actor_id_to_monitor = None
-    try:
-        # Try to get a string representation for logging, might fail if already gone
-        actor_id_to_monitor = str(actor_handle)  # Basic representation
-    except Exception:
-        actor_id_to_monitor = "unknown_actor"
-
-    logger.debug(f"Monitoring shutdown for actor: {actor_id_to_monitor}")
-
-    while True:
-        try:
-            # Remotely call the actor's method
-            if ray.get(actor_handle.is_shutdown_complete.remote()):
-                logger.debug(f"Actor {actor_id_to_monitor} reported shutdown complete.")
-                actor_handle.request_actor_exit.remote()
-
-                return True
-        except ray.exceptions.RayActorError:
-            # Actor has died or is otherwise unreachable.
-            # Consider this as shutdown complete for the purpose of the future.
-            logger.warning(f"Actor {actor_id_to_monitor} became unreachable (RayActorError). Assuming shutdown.")
-            return True
-        except Exception as e:
-            # Catch other potential errors during the remote call
-            logger.error(f"Unexpected error while polling shutdown status for {actor_id_to_monitor}: {e}")
-            # Depending on policy, either continue polling or assume failure
-            return True  # Or True if any exit is "shutdown"
-
-        time.sleep(poll_interval)
-
-
 class RayActorStage(ABC):
     """
     Abstract base class for a stateful Ray actor stage in a processing pipeline.
@@ -163,12 +122,13 @@ class RayActorStage(ABC):
         # Lock specifically for coordinating the final shutdown sequence (_request_actor_exit)
         self._lock = threading.Lock()
         self._shutdown_signal_complete = False  # Initialize flag
-        self._shutdown_future: Optional[ray.ObjectRef] = None

         # --- Logging ---
         # Ray won't propagate logging to the root logger by default, so we set up a custom logger for debugging
         self._logger = setup_stdout_logging(self.__class__.__name__) if log_to_stdout else logging.getLogger(__name__)

+        self._actor_id_str = self._get_actor_id_str()
+
     @staticmethod
     def _get_actor_id_str() -> str:
         """
@@ -215,19 +175,36 @@ class RayActorStage(ABC):
         if self._input_queue is None:
             # This check should ideally not fail if start() is called after setup
             if self._running:
-                self._logger.error(f"{self.
+                self._logger.error(f"{self._actor_id_str}: Input queue not set while running")
                 # Indicate a programming error - queue should be set before starting
                 raise ValueError("Input queue not set while running")
             return None  # Should not happen if self._running is False, but defensive check

+        item: Optional[Any] = None
         try:
-
-
-
+            item = self._input_queue.get(timeout=1.0)
+
+            if item is None:
+                return None
+
+            if isinstance(item, ray.ObjectRef):
+                try:
+                    deserialized_object = ray.get(item)
+                except ray.exceptions.ObjectLostError:
+                    self._logger.error(
+                        f"[{self._actor_id_str}] Failed to retrieve object from Ray object store. "
+                        f"It has been lost and cannot be recovered."
+                    )
+                    raise  # Re-raise the exception to be handled by the processing loop
+
+                del item
+                return deserialized_object
+
+            return item
+
         except Exception:
-
-
-            # Return None to signify no item was retrieved this cycle.
+            if item is not None and isinstance(item, ray.ObjectRef):
+                del item
             return None

     @abstractmethod
```
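The rewritten read path above accepts either raw payloads or `ObjectRef` handles from the input queue and surfaces lost objects explicitly instead of swallowing them. Below is a self-contained sketch of that dequeue pattern, assuming a `ray.util.queue.Queue`-style handle; the helper name is illustrative:

```python
from typing import Any, Optional

import ray
from ray.util.queue import Empty, Queue


def read_next(queue: Queue, timeout: float = 1.0) -> Optional[Any]:
    """Return the next payload from the queue, resolving ObjectRefs, or None if nothing arrived."""
    try:
        item = queue.get(timeout=timeout)
    except Empty:
        # No item this cycle; the caller simply polls again.
        return None

    if isinstance(item, ray.ObjectRef):
        try:
            # Resolve the reference into the actual payload.
            return ray.get(item)
        except ray.exceptions.ObjectLostError:
            # The backing object was evicted or lost; let the caller decide whether to skip or fail.
            raise
        finally:
            # Drop the local reference promptly so the object store can reclaim it.
            del item

    return item
```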
```diff
@@ -290,7 +267,7 @@ class RayActorStage(ABC):
           read from the input queue.
         - `errors`: Incremented if `on_data` returns `None` or if an
           exception occurs during `on_data` or output queuing.
-        - `processed`: Incremented after processing
+        - `processed`: Incremented after successful processing and output (if any).
         - `successful_queue_writes`: Incremented when an item is successfully
           put onto the output queue.
         - `queue_full`: Incremented when an attempt to put to the output
@@ -305,8 +282,7 @@ class RayActorStage(ABC):
         - Thread safety for `self.stats` relies on the GIL for simple
           increment operations
         """
-
-        self._logger.debug(f"{actor_id_str}: Processing loop thread starting.")
+        self._logger.debug(f"{self._actor_id_str}: Processing loop thread starting.")

         try:
             while self._running:
@@ -328,38 +304,58 @@ class RayActorStage(ABC):
                     self._active_processing = True

                     # Step 2: Process the retrieved message using subclass-specific logic.
-                    updated_cm
+                    updated_cm = self.on_data(control_message)

                     # If there's a valid result and an output queue is configured, attempt to put.
-                    if self._output_queue is not None:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    if self._output_queue is not None and updated_cm is not None:
+                        object_ref_to_put = None  # Ensure var exists for the finally block
+                        try:
+                            # Get the handle of the queue actor to set it as the owner.
+                            # This decouples the object's lifetime from this actor.
+                            owner_actor = self._output_queue.actor
+
+                            # Put the object into Plasma, transferring ownership.
+                            object_ref_to_put = ray.put(updated_cm, _owner=owner_actor)
+
+                            # Now that the object is safely in Plasma, we can delete the large local copy.
+                            del updated_cm
+
+                            # This loop will retry until the ObjectRef is put successfully or shutdown is initiated.
+                            is_put_successful = False
+                            while not is_put_successful:
+                                try:
+                                    self._output_queue.put(object_ref_to_put)
+                                    self.stats["successful_queue_writes"] += 1
+                                    is_put_successful = True  # Exit retry loop on success
+                                except Exception as e_put:
+                                    self._logger.warning(
+                                        f"[{self._actor_id_str}] Output queue put failed (e.g., full, "
+                                        f"timeout, or actor error), retrying. Error: {e_put}"
+                                    )
+                                    self.stats["queue_full"] += 1
+                                    time.sleep(0.1)  # Brief pause before retrying
+                        finally:
+                            # After the operation, delete the local ObjectRef.
+                            # The primary reference is now held by the queue actor.
+                            if object_ref_to_put is not None:
+                                del object_ref_to_put

                     # Step 3: Increment "processed" count after successful processing and output (if any).
                     # This is the primary path for "successful processing".
                     self.stats["processed"] += 1

+                except ray.exceptions.ObjectLostError:
+                    # This error is handled inside the loop to prevent the actor from crashing.
+                    # We log it and continue to the next message.
+                    self._logger.error(f"[{self._actor_id_str}] CRITICAL: An object was lost in transit. Skipping.")
+                    # In a real-world scenario, you might want to increment a metric for monitoring.
+                    continue
+
                 except Exception as e_item_processing:
                     # Catch exceptions from on_data() or unexpected issues in the item handling block.
                     cm_info_str = f" (message type: {type(control_message).__name__})" if control_message else ""
                     self._logger.exception(
-                        f"[{
+                        f"[{self._actor_id_str}] Error during processing of item{cm_info_str}: {e_item_processing}"
                     )
                     self.stats["errors"] += 1

```
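The new enqueue logic above puts each result into the object store with the downstream queue's actor as owner before enqueuing the `ObjectRef`, so the payload outlives the producing actor. A hedged sketch of that pattern, assuming a queue handle that exposes its backing actor as `.actor` (as the stage code does); the helper name and retry delay are illustrative:

```python
import time
from typing import Any

import ray
from ray.util.queue import Queue


def put_with_owner(queue: Queue, payload: Any, retry_delay: float = 0.1) -> None:
    """Store `payload` in Plasma owned by the queue actor, then enqueue the ObjectRef, retrying on failure."""
    # Transfer ownership so the object is not fate-shared with this (possibly short-lived) producer.
    ref = ray.put(payload, _owner=queue.actor)
    del payload  # Drop the large local copy; the object now lives in the object store.

    while True:
        try:
            queue.put(ref)
            break
        except Exception:
            # Queue full or temporarily unavailable; pause briefly and retry.
            time.sleep(retry_delay)

    del ref  # The queue actor now holds the primary reference.
```

The design choice here is to trade a bounded retry delay for back-pressure: the producer blocks on a full queue rather than dropping work or growing unbounded local state.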
```diff
@@ -370,180 +366,48 @@ class RayActorStage(ABC):
                     # Ensure _active_processing is reset after each item attempt (success, failure, or no item).
                     self._active_processing = False

+                    # Explicitly delete the reference to the control message to aid garbage collection.
+                    # This is important for large messages, as it helps release memory and ObjectRefs sooner.
+                    if control_message is not None:
+                        del control_message
+
             # --- Loop Exit Condition Met ---
             # This point is reached when self._running becomes False.
-            self._logger.debug(
+            self._logger.debug(
+                f"[{self._actor_id_str}] Graceful exit: self._running is False. Processing loop terminating."
+            )

         except Exception as e_outer_loop:
             # Catches very unexpected errors in the structure of the while loop itself.
             self._logger.exception(
-                f"[{
+                f"[{self._actor_id_str}] Unexpected critical error caused processing loop termination: {e_outer_loop}"
             )
         finally:
             # This block executes when the processing thread is about to exit,
             # either due to self._running becoming False or an unhandled critical exception.
-            self._logger.debug(f"[{
+            self._logger.debug(f"[{self._actor_id_str}] Processing loop thread finished.")
             # Signal that this actor's processing duties are complete.
             # External monitors (e.g., via a future from stop()) can use this signal.
             self._shutdown_signal_complete = True

-
-    @ray.remote
-    def _immediate_true() -> bool:
-        """
-        A tiny remote method that immediately returns True.
-        Used to create a resolved ObjectRef when shutdown is already complete.
-        """
-        return True
-
-    @ray.method(num_returns=1)
-    def _finalize_shutdown(self) -> None:
-        """
-        Internal Ray method called remotely by the processing thread to safely exit the actor.
-
-        This method runs in the main Ray actor thread context. It acquires a lock
-        to prevent multiple exit attempts and then calls `ray.actor.exit_actor()`
-        to terminate the actor process gracefully.
-
-        Note: Only necessary if running in a detached actor context.
-        """
-
-        actor_id_str = self._get_actor_id_str()
-        with self._lock:
-            if self._shutting_down:
-                return
-
-            self._shutting_down = True
-
-        self._logger.info(f"{actor_id_str}: Executing actor exit process.")
-
-        get_runtime_context().current_actor.request_actor_exit.remote()
-
-    @ray.method(num_returns=1)
-    def request_actor_exit(self) -> None:
-        """
-        Request the actor to exit gracefully.
-
-        This method is called from the main Ray actor thread to ensure a clean
-        shutdown of the actor. It should be called when the processing loop
-        has completed its work and is ready to exit.
-        """
-
-        if self._processing_thread:
-            self._processing_thread.join()
-
-        self._shutdown_signal_complete = True
-
-        self._logger.debug(f"{self._get_actor_id_str()}: Requesting actor exit.")
-        ray.actor.exit_actor()
-
-    @ray.method(num_returns=1)
-    def start(self) -> bool:
+    def _get_memory_usage_mb(self) -> float:
         """
-
-
-        Initializes state, resets statistics, and launches the `_processing_loop`
-        thread. Idempotent: if called while already running, it logs a warning
-        and returns False.
+        Gets the total memory usage of the current actor process (RSS).

         Returns
         -------
-
-
+        float
+            The memory usage in megabytes (MB).
         """
-        actor_id_str = self._get_actor_id_str()
-        # Prevent starting if already running
-        if self._running:
-            self._logger.warning(f"{actor_id_str}: Start called but actor is already running.")
-            return False
-
-        self._logger.info(f"{actor_id_str}: Starting actor...")
-        # --- Initialize Actor State ---
-        self._running = True
-        self._shutting_down = False  # Reset shutdown flag on start
-        self._shutdown_signal_complete = False
-        self.start_time = time.time()
-
-        # --- Reset Statistics ---
-        self._last_stats_time = self.start_time
-        self._last_processed_count = 0
-
-        # --- Start Background Processing Thread ---
-        self._logger.debug(f"{actor_id_str}: Creating and starting processing thread.")
-        self._processing_thread = threading.Thread(
-            target=self._processing_loop,
-            daemon=False,
-        )
-        self._processing_thread.start()
-
-        self._logger.info(f"{actor_id_str}: Actor started successfully.")
-
-        return True
-
-    @ray.method(num_returns=1)
-    def stop(self) -> ray.ObjectRef:
-        actor_id_str = self._get_actor_id_str()
-        self._logger.info(f"{actor_id_str}: Received external stop request.")
-
-        if self._shutdown_future is not None:
-            self._logger.debug(f"{actor_id_str}: Stop called again, returning existing shutdown future.")
-            return self._shutdown_future
-
-        if not self._running and self._shutdown_signal_complete:  # Check if already fully shutdown
-            self._logger.info(f"{actor_id_str}: Stop called, but actor was already shutdown and signal complete.")
-            if self._shutdown_future:  # Should have been set by the previous shutdown sequence
-                return self._shutdown_future
-            else:  # Should not happen if shutdown_signal_complete is true, but as a fallback
-                self._shutdown_future = self._immediate_true.remote()
-                return self._shutdown_future
-        elif not self._running:  # Was stopped but maybe not fully signaled (e.g. mid-shutdown)
-            self._logger.warning(
-                f"{actor_id_str}: Stop called but actor was not running (or already stopping). "
-                "Will create/return monitor future."
-            )
-            # If _shutdown_future is None here, it means stop wasn't called before OR a previous
-            # monitor didn't get stored. Proceed to create a new monitor.
-            # If it *was* already stopping and _shutdown_future exists, the first `if` catches it.
-
-        # --- Initiate Shutdown signal to internal loop (if still running) ---
-        if self._running:  # Only set self._running = False if it was actually running
-            self._running = False
-            self._logger.info(f"{actor_id_str}: Stop signal sent to processing loop. Shutdown initiated.")
-        else:
-            self._logger.info(
-                f"{actor_id_str}: Actor processing loop was already stopped. Monitoring for final shutdown signal."
-            )
-
-        # --- Spawn shutdown watcher task ---
-        # Get a handle to the current actor instance to pass to the monitor.
-        # This is crucial: the monitor needs to call methods on *this specific actor*.
         try:
-
+            pid = os.getpid()
+            process = psutil.Process(pid)
+            # rss is the Resident Set Size, which is the non-swapped physical memory a process has used.
+            memory_bytes = process.memory_info().rss
+            return memory_bytes / (1024 * 1024)
         except Exception as e:
-            self._logger.
-
-            )
-
-            # Cannot proceed to monitor, return a future that resolves to False or raises
-            @ray.remote
-            def failed_future():
-                raise RuntimeError("Failed to initiate shutdown monitoring due to missing actor handle.")
-
-            return failed_future.remote()  # Or ray.put(False) directly
-
-        self._shutdown_future = external_monitor_actor_shutdown.remote(self_handle)
-
-        return self._shutdown_future
-
-    @ray.method(num_returns=1)
-    def is_shutdown_complete(self) -> bool:
-        """
-        Checks if the actor's processing loop has finished and signaled completion.
-        Raises RayActorError if the actor process has terminated.
-        """
-        return self._shutdown_signal_complete
-
-    # --- get_stats ---
+            self._logger.warning(f"[{self._actor_id_str}] Could not retrieve process memory usage: {e}")
+            return 0.0

     @ray.method(num_returns=1)
     def get_stats(self) -> Dict[str, Any]:
```
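The new `_get_memory_usage_mb` helper above reads the actor process's resident set size via psutil. A minimal standalone equivalent, assuming psutil is installed; the script wrapper is illustrative:

```python
import os

import psutil


def get_memory_usage_mb() -> float:
    """Return the resident set size (RSS) of the current process in megabytes."""
    try:
        process = psutil.Process(os.getpid())
        # RSS is the non-swapped physical memory currently held by the process.
        return process.memory_info().rss / (1024 * 1024)
    except Exception:
        # Mirror the defensive fallback used by the stage: report 0.0 on failure.
        return 0.0


if __name__ == "__main__":
    print(f"current RSS: {get_memory_usage_mb():.1f} MB")
```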
```diff
@@ -566,7 +430,16 @@ class RayActorStage(ABC):
               second during the last interval.
               Can be zero if no items were
               processed or the interval was too short.
+            - 'memory_mb' (float): The total memory usage of the current actor process (RSS) in megabytes (MB).
         """
+        # If the actor is not running, return the last known stats to ensure this
+        # call is non-blocking during shutdown.
+        if not self._running:
+            stats_copy = self.stats.copy()
+            stats_copy["active_processing"] = False  # It's not active if not running
+            stats_copy["memory_mb"] = self._get_memory_usage_mb()
+            return stats_copy
+
         current_time: float = time.time()
         current_processed: int = self.stats.get("processed", 0)
         is_active: bool = self._active_processing
@@ -605,8 +478,64 @@ class RayActorStage(ABC):
             "queue_full": self.stats.get("queue_full", 0),
             "successful_queue_reads": self.stats.get("successful_queue_reads", 0),
             "successful_queue_writes": self.stats.get("successful_queue_writes", 0),
+            "memory_mb": self._get_memory_usage_mb(),
         }

+    @ray.method(num_returns=1)
+    def start(self) -> bool:
+        """
+        Starts the actor's processing loop in a background thread.
+
+        Initializes state, resets statistics, and launches the `_processing_loop`
+        thread. Idempotent: if called while already running, it logs a warning
+        and returns False.
+
+        Returns
+        -------
+        bool
+            True if the actor was successfully started, False if it was already running.
+        """
+        # Prevent starting if already running
+        if self._running:
+            self._logger.warning(f"{self._actor_id_str}: Start called but actor is already running.")
+            return False
+
+        self._logger.info(f"{self._actor_id_str}: Starting actor...")
+        # --- Initialize Actor State ---
+        self._running = True
+        self._shutting_down = False  # Reset shutdown flag on start
+        self._shutdown_signal_complete = False
+        self.start_time = time.time()
+
+        # --- Reset Statistics ---
+        self._last_stats_time = self.start_time
+        self._last_processed_count = 0
+
+        # --- Start Background Processing Thread ---
+        self._logger.debug(f"{self._actor_id_str}: Creating and starting processing thread.")
+        self._processing_thread = threading.Thread(
+            target=self._processing_loop,
+            daemon=False,
+        )
+        self._processing_thread.start()
+
+        self._logger.info(f"{self._actor_id_str}: Actor started successfully.")
+
+        return True
+
+    @ray.method(num_returns=0)
+    def stop(self) -> None:
+        """Stops the actor's processing loop by setting the running flag to False."""
+        self._logger.info(f"[{self._actor_id_str}] Stop signal received. Initiating graceful shutdown.")
+        self._running = False
+
+    def is_shutdown_complete(self) -> bool:
+        """
+        Checks if the actor's processing loop has finished and signaled completion.
+        Raises RayActorError if the actor process has terminated.
+        """
+        return self._shutdown_signal_complete
+
     @ray.method(num_returns=1)
     def set_input_queue(self, queue_handle: Any) -> bool:
         """
@@ -625,7 +554,7 @@ class RayActorStage(ABC):
         bool
             True indicating the queue was set.
         """
-        self._logger.debug(f"{self.
+        self._logger.debug(f"{self._actor_id_str}: Setting input queue.")
         self._input_queue = queue_handle
         return True

@@ -647,6 +576,6 @@ class RayActorStage(ABC):
         bool
             True indicating the queue was set.
         """
-        self._logger.debug(f"{self.
+        self._logger.debug(f"{self._actor_id_str}: Setting output queue.")
         self._output_queue = queue_handle
         return True
```
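With the external monitor actor and shutdown futures removed, `stop()` is now a non-blocking flag flip and callers are expected to poll `is_shutdown_complete()` themselves. A sketch of how a driver might do that, assuming `actor_handle` is any `@ray.remote` actor built on `RayActorStage`; the helper name, poll interval, and timeout are illustrative:

```python
import time

import ray


def wait_for_stage_shutdown(actor_handle, poll_interval: float = 0.25, timeout: float = 30.0) -> bool:
    """Request a graceful stop, then poll until the stage reports its processing loop has drained."""
    actor_handle.stop.remote()  # Non-blocking: only flips the running flag.

    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if ray.get(actor_handle.is_shutdown_complete.remote()):
                return True
        except ray.exceptions.RayActorError:
            # The actor process already exited; treat that as shutdown complete.
            return True
        time.sleep(poll_interval)
    return False
```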
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py

```diff
@@ -269,8 +269,11 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         self._logger.debug("Received message type: %s", type(job))
         if isinstance(job, BaseModel):
             self._logger.debug("Message is a BaseModel with response_code: %s", job.response_code)
-            if job.response_code
-                self._logger.debug("Message
+            if job.response_code not in (0, 2):
+                self._logger.debug("Message received with unhandled response_code, returning None")
+                return None
+            if job.response_code == 2:
+                self._logger.debug("Message response_code == 2, returning None")
                 return None
             job = json.loads(job.response)
         self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
```
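A compact restatement of the response-code gate added above, as a standalone function: code 0 carries a payload, code 2 signals "nothing to fetch", and any other code is dropped. The message object and its fields are assumed here for illustration only:

```python
import json
from typing import Any, Dict, Optional


def extract_job(message) -> Optional[Dict[str, Any]]:
    """Return the decoded job dict for successful broker responses, else None."""
    if message.response_code not in (0, 2):
        return None  # Unhandled/error response codes are skipped.
    if message.response_code == 2:
        return None  # Empty-queue sentinel; nothing to process this cycle.
    return json.loads(message.response)
```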
```diff
@@ -338,15 +341,33 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
             self._pause_event.wait()  # Block if paused
             self._active_processing = True

-
-
-
-
-
-
-
-
-
+            object_ref_to_put = None
+            try:
+                # Get the handle of the queue actor to set it as the owner.
+                owner_actor = self.output_queue.actor
+
+                # Put the object into Plasma, transferring ownership.
+                object_ref_to_put = ray.put(control_message, _owner=owner_actor)
+
+                # Now that the object is safely in Plasma, delete the large local copy.
+                del control_message
+
+                # This loop will retry indefinitely until the ObjectRef is put successfully.
+                is_put_successful = False
+                while not is_put_successful:
+                    try:
+                        self.output_queue.put(object_ref_to_put)
+                        self.stats["successful_queue_writes"] += 1
+                        is_put_successful = True  # Exit retry loop on success
+                    except Exception:
+                        self._logger.warning("Output queue full, retrying put()...")
+                        self.stats["queue_full"] += 1
+                        time.sleep(0.1)
+            finally:
+                # After the operation, delete the local ObjectRef.
+                # The primary reference is now held by the queue actor.
+                if object_ref_to_put is not None:
+                    del object_ref_to_put

             self.stats["processed"] += 1
             self._message_count += 1
```