nv-ingest 2025.7.8.dev20250708__py3-none-any.whl → 2025.7.10.dev20250710__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest was flagged as potentially problematic in its registry listing.
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +65 -303
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +438 -163
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +30 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +159 -230
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +27 -9
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +7 -72
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +2 -1
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +22 -12
- {nv_ingest-2025.7.8.dev20250708.dist-info → nv_ingest-2025.7.10.dev20250710.dist-info}/METADATA +1 -1
- {nv_ingest-2025.7.8.dev20250708.dist-info → nv_ingest-2025.7.10.dev20250710.dist-info}/RECORD +13 -13
- {nv_ingest-2025.7.8.dev20250708.dist-info → nv_ingest-2025.7.10.dev20250710.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.7.8.dev20250708.dist-info → nv_ingest-2025.7.10.dev20250710.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.7.8.dev20250708.dist-info → nv_ingest-2025.7.10.dev20250710.dist-info}/top_level.txt +0 -0
--- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py (2025.7.8.dev20250708)
+++ nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py (2025.7.10.dev20250710)

@@ -7,7 +7,6 @@ import os
 import signal
 import threading
 from abc import ABC, abstractmethod
-from collections import defaultdict
 from dataclasses import dataclass
 from types import FunctionType
 
@@ -70,14 +69,13 @@ class ScalingConfig:
 
     dynamic_memory_scaling: bool = True
     dynamic_memory_threshold: float = 0.75
-    pid_kp: float = 0.
-    pid_ki: float = 0.
+    pid_kp: float = 0.2
+    pid_ki: float = 0.01
     pid_kd: float = 0.0
+    pid_ema_alpha: float = 0.1
     pid_target_queue_depth: int = 0
     pid_penalty_factor: float = 0.1
     pid_error_boost_factor: float = 1.5
-    pid_window_size: int = 10
-    rcm_estimated_edge_cost_mb: int = 5000
     rcm_memory_safety_buffer_fraction: float = 0.15
 
 
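The retuned defaults above (pid_kp=0.2, pid_ki=0.01, pid_kd=0.0) together with the new pid_ema_alpha replace the removed window_size and estimated-edge-cost knobs. For orientation while tuning, here is a textbook discrete PID step written with the new default gains; this is a generic illustration, not the project's actual PIDController:

def pid_step(error: float, integral: float, prev_error: float,
             kp: float = 0.2, ki: float = 0.01, kd: float = 0.0) -> tuple:
    """One discrete PID update; returns (control_output, new_integral).

    In this pipeline's terms, `error` would be roughly
    (observed queue depth - pid_target_queue_depth).
    """
    integral += error
    derivative = error - prev_error
    # With kd = 0.0 (the new default) the derivative term vanishes,
    # so the controller behaves as a PI controller out of the box.
    return kp * error + ki * integral + kd * derivative, integral


output, integral = pid_step(error=8.0, integral=0.0, prev_error=0.0)
print(output)  # 0.2*8 + 0.01*8 = 1.68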
@@ -88,6 +86,7 @@ class FlushingConfig:
     queue_flush_interval_seconds: int = 600
     queue_flush_drain_timeout_seconds: int = 300
     quiet_period_threshold: int = 0
+    consecutive_quiet_cycles_for_flush: int = 3
 
 
 @dataclass
@@ -197,6 +196,18 @@ class RayPipeline(PipelineInterface):
         flushing_config: FlushingConfig = FlushingConfig(),
         stats_config: StatsConfig = StatsConfig(),
     ) -> None:
+        """
+        Initializes the RayPipeline.
+
+        Parameters
+        ----------
+        scaling_config : ScalingConfig, optional
+            Configuration for PID and resource constraint-based scaling, by default ScalingConfig().
+        flushing_config : FlushingConfig, optional
+            Configuration for queue flushing behavior, by default FlushingConfig().
+        stats_config : StatsConfig, optional
+            Configuration for the RayStatsCollector, by default StatsConfig().
+        """
         # Store config objects
         self.scaling_config = scaling_config
         self.flushing_config = flushing_config
@@ -218,7 +229,6 @@ class RayPipeline(PipelineInterface):
         # Use scaling_config for these
         self.dynamic_memory_scaling = self.scaling_config.dynamic_memory_scaling
         self.dynamic_memory_threshold = self.scaling_config.dynamic_memory_threshold
-        self.stage_memory_overhead: Dict[str, float] = {}
 
         # --- Background Threads ---
         self._scaling_thread: Optional[threading.Thread] = None
@@ -229,6 +239,8 @@ class RayPipeline(PipelineInterface):
         self.queue_flush_interval_seconds = self.flushing_config.queue_flush_interval_seconds
         self.queue_flush_drain_timeout_seconds = self.flushing_config.queue_flush_drain_timeout_seconds
         self.quiet_period_threshold = self.flushing_config.quiet_period_threshold
+        self.consecutive_quiet_cycles_for_flush = self.flushing_config.consecutive_quiet_cycles_for_flush
+        self._consecutive_quiet_cycles = 0
 
         # --- Instantiate Autoscaling Controllers ---
         # Use scaling_config
@@ -236,9 +248,7 @@ class RayPipeline(PipelineInterface):
             kp=self.scaling_config.pid_kp,
             ki=self.scaling_config.pid_ki,
             kd=self.scaling_config.pid_kd,
-            stage_cost_estimates={},  # Populated during build
             target_queue_depth=self.scaling_config.pid_target_queue_depth,
-            window_size=self.scaling_config.pid_window_size,
             penalty_factor=self.scaling_config.pid_penalty_factor,
             error_boost_factor=self.scaling_config.pid_error_boost_factor,
         )
@@ -258,7 +268,6 @@ class RayPipeline(PipelineInterface):
         self.constraint_manager = ResourceConstraintManager(
             max_replicas=1,  # Updated during build
             memory_threshold=absolute_memory_threshold_mb,
-            estimated_edge_cost_mb=self.scaling_config.rcm_estimated_edge_cost_mb,
             memory_safety_buffer_fraction=self.scaling_config.rcm_memory_safety_buffer_fraction,
         )
         logger.info("ResourceConstraintManager initialized using ScalingConfig.")
@@ -270,6 +279,7 @@ class RayPipeline(PipelineInterface):
             interval=self.stats_config.collection_interval_seconds,
             actor_timeout=self.stats_config.actor_timeout_seconds,
             queue_timeout=self.stats_config.queue_timeout_seconds,
+            ema_alpha=self.scaling_config.pid_ema_alpha,
         )
 
         logger.info("RayStatsCollector initialized using StatsConfig.")
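The new pid_ema_alpha value is forwarded to the stats collector as ema_alpha. Assuming it drives a standard exponential moving average over collected samples (the usual meaning of such a parameter; the collector internals live in ray_stat_collector.py, which also changed in this release but is not shown here), the smoothing step looks like this sketch:

def ema_update(previous: float, sample: float, alpha: float = 0.1) -> float:
    """One exponential-moving-average step; smaller alpha = smoother, slower to react."""
    return alpha * sample + (1.0 - alpha) * previous


# Smoothing a noisy queue-depth series with the new default alpha of 0.1:
smoothed = 0.0
for depth in (0, 10, 4, 12, 3):
    smoothed = ema_update(smoothed, depth)
    print(round(smoothed, 3))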
@@ -277,21 +287,43 @@ class RayPipeline(PipelineInterface):
     # --- Accessor Methods for Stat Collector (and internal use) ---
 
     def __del__(self):
+        """Ensures the pipeline is stopped upon garbage collection."""
         try:
             self.stop()
         except Exception as e:
             logger.error(f"Exception during RayPipeline cleanup: {e}")
 
     def get_stages_info(self) -> List[StageInfo]:
-        """
+        """
+        Returns a snapshot of the current stage information.
+
+        Returns
+        -------
+        List[StageInfo]
+            A list of StageInfo objects from the topology.
+        """
         return self.topology.get_stages_info()
 
     def get_stage_actors(self) -> Dict[str, List[Any]]:
-        """
+        """
+        Returns a snapshot of the current actors per stage.
+
+        Returns
+        -------
+        Dict[str, List[Any]]
+            A dictionary mapping stage names to lists of actor handles.
+        """
         return self.topology.get_stage_actors()
 
     def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
-        """
+        """
+        Returns a snapshot of the current edge queues.
+
+        Returns
+        -------
+        Dict[str, Tuple[Any, int]]
+            A dictionary mapping queue names to tuples of (queue_handle, queue_size).
+        """
         return self.topology.get_edge_queues()
 
     def _configure_autoscalers(self) -> None:
@@ -310,9 +342,6 @@ class RayPipeline(PipelineInterface):
             # For now, let's store a dummy overhead in topology during build
             overhead_bytes = default_cost_bytes  # Simplification for now
             stage_overheads[stage.name] = overhead_bytes  # Store locally first
-            cost_mb = max(1, int(overhead_bytes / (1024 * 1024)))
-            # Update controller directly (or via dedicated method if preferred)
-            self.pid_controller.stage_cost_estimates[stage.name] = cost_mb
 
         # Update topology with collected overheads
         self.topology.set_stage_memory_overhead(stage_overheads)
@@ -321,7 +350,6 @@ class RayPipeline(PipelineInterface):
         self.constraint_manager.max_replicas = total_max_replicas
 
         logger.info(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
-        logger.debug(f"[Build-Configure] PID stage cost estimates (MB): {self.pid_controller.stage_cost_estimates}")
 
     def _instantiate_initial_actors(self) -> None:
         """Instantiates initial actors and updates topology."""
@@ -348,9 +376,9 @@ class RayPipeline(PipelineInterface):
                    f" for '{stage.name}'"
                )
                try:
-                    actor = stage.callable.options(
-
-                    )
+                    actor = stage.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
+                        config=stage.config
+                    )
                    replicas.append(actor)
                except Exception as e:
                    logger.error(f"[Build-Actors] Failed create actor '{actor_name}': {e}", exc_info=True)
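The rebuilt actor-creation call pins down three Ray actor options: a unique name, max_concurrency=1 (one task at a time per replica), and max_restarts=0 (failures surface to the autoscaler instead of being masked by automatic restarts). A self-contained sketch of the same pattern with a toy actor; EchoStage is a placeholder, not an nv-ingest class:

import uuid

import ray


@ray.remote
class EchoStage:
    """Toy stand-in for a pipeline stage actor."""

    def __init__(self, config: dict):
        self.config = config

    def ping(self) -> str:
        return f"ok:{self.config['name']}"


ray.init(ignore_reinit_error=True)
actor_name = f"EchoStage_{uuid.uuid4()}"
# Same creation pattern as the diff: named, single-threaded, never auto-restarted.
actor = EchoStage.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
    config={"name": "demo"}
)
print(ray.get(actor.ping.remote()))  # -> "ok:demo"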
@@ -363,7 +391,14 @@ class RayPipeline(PipelineInterface):
         logger.info("[Build-Actors] Initial actor instantiation complete.")
 
     def _create_and_wire_edges(self) -> List[ray.ObjectRef]:
-        """
+        """
+        Creates queues, wires actors (using topology), and updates topology.
+
+        Returns
+        -------
+        List[ray.ObjectRef]
+            A list of object references for the remote wiring calls.
+        """
         logger.info("[Build-Wiring] Creating and wiring edges...")
         wiring_refs = []
         new_edge_queues: Dict[str, Tuple[Any, int]] = {}
@@ -400,7 +435,14 @@ class RayPipeline(PipelineInterface):
 
     @staticmethod
     def _wait_for_wiring(wiring_refs: List[ray.ObjectRef]) -> None:
-        """
+        """
+        Waits for remote wiring calls to complete.
+
+        Parameters
+        ----------
+        wiring_refs : List[ray.ObjectRef]
+            A list of object references for the wiring calls.
+        """
         if not wiring_refs:
             logger.debug("[Build-WaitWiring] No wiring calls.")
             return
@@ -415,6 +457,27 @@ class RayPipeline(PipelineInterface):
     def add_source(
         self, *, name: str, source_actor: Any, config: BaseModel, min_replicas: int = 1, max_replicas: int = 1
     ) -> "RayPipeline":
+        """
+        Adds a source stage to the pipeline.
+
+        Parameters
+        ----------
+        name : str
+            The name of the source stage.
+        source_actor : Any
+            The actor or callable for the source stage.
+        config : BaseModel
+            The configuration for the source stage.
+        min_replicas : int, optional
+            The minimum number of replicas for the source stage, by default 1.
+        max_replicas : int, optional
+            The maximum number of replicas for the source stage, by default 1.
+
+        Returns
+        -------
+        RayPipeline
+            The pipeline instance.
+        """
         if min_replicas < 1:
             logger.warning(f"Source stage '{name}': min_replicas must be >= 1. Overriding.")
             min_replicas = 1
@@ -440,6 +503,27 @@ class RayPipeline(PipelineInterface):
         min_replicas: int = 0,
         max_replicas: int = 1,
     ) -> "RayPipeline":
+        """
+        Adds a stage to the pipeline.
+
+        Parameters
+        ----------
+        name : str
+            The name of the stage.
+        stage_actor : Any
+            The actor or callable for the stage.
+        config : BaseModel
+            The configuration for the stage.
+        min_replicas : int, optional
+            The minimum number of replicas for the stage, by default 0.
+        max_replicas : int, optional
+            The maximum number of replicas for the stage, by default 1.
+
+        Returns
+        -------
+        RayPipeline
+            The pipeline instance.
+        """
         if min_replicas < 0:
             logger.warning(f"Stage '{name}': min_replicas cannot be negative. Overriding to 0.")
             min_replicas = 0
@@ -471,6 +555,27 @@ class RayPipeline(PipelineInterface):
     def add_sink(
         self, *, name: str, sink_actor: Any, config: BaseModel, min_replicas: int = 1, max_replicas: int = 1
     ) -> "RayPipeline":
+        """
+        Adds a sink stage to the pipeline.
+
+        Parameters
+        ----------
+        name : str
+            The name of the sink stage.
+        sink_actor : Any
+            The actor or callable for the sink stage.
+        config : BaseModel
+            The configuration for the sink stage.
+        min_replicas : int, optional
+            The minimum number of replicas for the sink stage, by default 1.
+        max_replicas : int, optional
+            The maximum number of replicas for the sink stage, by default 1.
+
+        Returns
+        -------
+        RayPipeline
+            The pipeline instance.
+        """
         # Sink min_replicas can realistically be 0 if data drain is optional/best-effort? Let's allow 0.
         if min_replicas < 0:
             logger.warning(f"Sink stage '{name}': min_replicas cannot be negative. Overriding to 0.")
@@ -489,6 +594,23 @@ class RayPipeline(PipelineInterface):
 
     # --- Method for defining connections ---
     def make_edge(self, from_stage: str, to_stage: str, queue_size: int = 100) -> "RayPipeline":
+        """
+        Creates an edge between two stages in the pipeline.
+
+        Parameters
+        ----------
+        from_stage : str
+            The name of the source stage.
+        to_stage : str
+            The name of the destination stage.
+        queue_size : int, optional
+            The size of the queue between the stages, by default 100.
+
+        Returns
+        -------
+        RayPipeline
+            The pipeline instance.
+        """
         try:
             self.topology.add_connection(from_stage, to_stage, queue_size)  # Delegate (includes validation)
         except ValueError as e:
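Since add_source, add_stage, add_sink, and make_edge all return the pipeline object, construction chains fluently. A minimal toy illustrating that builder pattern; TinyPipeline is illustrative only, and the real methods also take actor classes and Pydantic configs:

from typing import Dict, List, Tuple


class TinyPipeline:
    """Toy fluent builder mirroring the add_*/make_edge shape above."""

    def __init__(self) -> None:
        self.stages: Dict[str, dict] = {}
        self.edges: List[Tuple[str, str, int]] = []

    def add_stage(self, *, name: str, min_replicas: int = 0, max_replicas: int = 1) -> "TinyPipeline":
        self.stages[name] = {"min": max(min_replicas, 0), "max": max_replicas}
        return self  # returning self is what enables chaining

    def make_edge(self, from_stage: str, to_stage: str, queue_size: int = 100) -> "TinyPipeline":
        if from_stage not in self.stages or to_stage not in self.stages:
            raise ValueError("both stages must exist before wiring an edge")
        self.edges.append((from_stage, to_stage, queue_size))
        return self


p = (
    TinyPipeline()
    .add_stage(name="reader", min_replicas=1)
    .add_stage(name="writer", min_replicas=1)
    .make_edge("reader", "writer", queue_size=100)
)
print(p.edges)  # [('reader', 'writer', 100)]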
@@ -498,7 +620,14 @@ class RayPipeline(PipelineInterface):
 
     # ----- Pipeline Build Process ---
     def build(self) -> Dict[str, List[Any]]:
-        """
+        """
+        Builds the pipeline: configures, instantiates, wires, using topology.
+
+        Returns
+        -------
+        Dict[str, List[Any]]
+            A dictionary mapping stage names to lists of actor handles.
+        """
         logger.info("--- Starting Pipeline Build Process ---")
         try:
             if not self.topology.get_stages_info():
@@ -527,13 +656,25 @@ class RayPipeline(PipelineInterface):
     # --- Scaling Logic ---
     @staticmethod
     def _create_single_replica(stage_info: StageInfo) -> Any:
-        """
+        """
+        Creates a single new Ray actor replica for the given stage.
+
+        Parameters
+        ----------
+        stage_info : StageInfo
+            The stage information.
+
+        Returns
+        -------
+        Any
+            The new actor handle.
+        """
         actor_name = f"{stage_info.name}_{uuid.uuid4()}"
         logger.debug(f"[ScaleUtil] Creating new actor '{actor_name}' for stage '{stage_info.name}'")
         try:
-            new_actor = stage_info.callable.options(
-
-            )
+            new_actor = stage_info.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
+                config=stage_info.config
+            )
 
             return new_actor
         except Exception as e:
@@ -546,7 +687,21 @@ class RayPipeline(PipelineInterface):
             raise RuntimeError(f"Actor creation failed for stage '{stage_info.name}' during scale up") from e
 
     def _get_wiring_refs_for_actor(self, actor: Any, stage_name: str) -> List[ray.ObjectRef]:
-        """
+        """
+        Gets wiring futures for a single actor using topology for queues/connections.
+
+        Parameters
+        ----------
+        actor : Any
+            The actor handle.
+        stage_name : str
+            The name of the stage.
+
+        Returns
+        -------
+        List[ray.ObjectRef]
+            A list of object references for the wiring calls.
+        """
         wiring_refs = []
 
         # Use topology accessors
@@ -574,7 +729,16 @@ class RayPipeline(PipelineInterface):
 
     @staticmethod
     def _start_actors(actors_to_start: List[Any], stage_name: str) -> None:
-        """
+        """
+        Starts a list of actors if they have a 'start' method and waits for completion.
+
+        Parameters
+        ----------
+        actors_to_start : List[Any]
+            A list of actor handles.
+        stage_name : str
+            The name of the stage.
+        """
         start_refs = []
         for actor in actors_to_start:
             if hasattr(actor, "start"):
@@ -598,7 +762,18 @@ class RayPipeline(PipelineInterface):
             raise RuntimeError(f"Error confirming actor starts for stage '{stage_name}'") from e
 
     def _handle_scale_up(self, stage_info: StageInfo, current_count: int, target_count: int) -> None:
-        """
+        """
+        Handles scaling up, interacting with topology.
+
+        Parameters
+        ----------
+        stage_info : StageInfo
+            The stage information.
+        current_count : int
+            The current number of replicas.
+        target_count : int
+            The target number of replicas.
+        """
         stage_name = stage_info.name
         num_to_add = target_count - current_count
         logger.debug(f"[ScaleUp-{stage_name}] Scaling up from {current_count} to {target_count} (+{num_to_add}).")
@@ -662,8 +837,17 @@ class RayPipeline(PipelineInterface):
 
     def _handle_scale_down(self, stage_name: str, current_replicas: List[Any], target_count: int) -> None:
         """
-        Handles scaling down: initiates stop on actors
-        the topology
+        Handles scaling down: initiates stop on actors and marks them for removal
+        by the topology's garbage collection mechanism.
+
+        Parameters
+        ----------
+        stage_name : str
+            The name of the stage.
+        current_replicas : List[Any]
+            A list of actor handles.
+        target_count : int
+            The target number of replicas.
         """
         current_count = len(current_replicas)
         num_to_remove = current_count - target_count
@@ -671,59 +855,36 @@ class RayPipeline(PipelineInterface):
             f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove})."
         )
 
-        # Basic validation
         if num_to_remove <= 0:
-            logger.warning(f"[ScaleDown-{stage_name}] Invalid num_to_remove {num_to_remove}. Aborting.")
             return
 
-        #
+        # Select actors to remove (e.g., the most recently added)
         actors_to_remove = current_replicas[-num_to_remove:]
-        logger.debug(f"[ScaleDown-{stage_name}] Identified {len(actors_to_remove)} actors for removal.")
 
-
-        stop_initiation_failures = 0
+        logger.info(f"[ScaleDown-{stage_name}] Selected {len(actors_to_remove)} actors for removal.")
 
+        # Signal each actor to stop and mark it for removal by the topology.
+        # The topology's cleanup thread will handle polling and final removal.
         for actor in actors_to_remove:
-            actor_id_str = str(actor)
             try:
-
-
-                actors_to_register_map[stage_name].append((actor, shutdown_future))
-                logger.debug(f"[ScaleDown-{stage_name}] Submitted stop() call for actor '{actor_id_str}'.")
+                actor.stop.remote()  # Signal the actor's loop to stop
+                self.topology.mark_actor_for_removal(stage_name, actor)
             except Exception as e:
-                logger.error(
-                    f"[ScaleDown-{stage_name}] Error submitting stop() for actor '{actor_id_str}': "
-                    f"{e}. Cannot register.",
-                    exc_info=False,
-                )
-                stop_initiation_failures += 1
-
-        # Register actors pending removal (with their shutdown futures)
-        if actors_to_register_map:
-            num_registered = sum(len(v) for v in actors_to_register_map.values())
-            logger.debug(
-                f"[ScaleDown-{stage_name}] Registering {num_registered} "
-                f"actor handles with topology for shutdown monitoring."
-            )
-            try:
-                self.topology.register_actors_pending_removal(actors_to_register_map)
-            except Exception as e:
-                logger.error(
-                    f"[ScaleDown-{stage_name}] CRITICAL - Failed to register actors pending removal with topology: {e}",
-                    exc_info=True,
-                )
-                self.topology.update_scaling_state(stage_name, "Error")
-        elif actors_to_remove:
-            logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")
+                logger.error(f"[ScaleDown-{stage_name}] Failed to initiate stop for actor {actor}: {e}")
 
-
-        logger.debug(
-            f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
-            f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
-        )
+        logger.debug(f"[ScaleDown-{stage_name}] Scale down initiation complete for {len(actors_to_remove)} actors.")
 
     def _scale_stage(self, stage_name: str, new_replica_count: int) -> None:
-        """
+        """
+        Orchestrates scaling using topology for state and info.
+
+        Parameters
+        ----------
+        stage_name : str
+            The name of the stage.
+        new_replica_count : int
+            The new number of replicas.
+        """
         logger.debug(f"[ScaleStage-{stage_name}] Request for target count: {new_replica_count}")
 
         # --- Use Topology Accessors ---
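The rewritten _handle_scale_down no longer tracks shutdown futures itself: it fire-and-forgets actor.stop.remote() and hands the handle to topology.mark_actor_for_removal, leaving final reaping to the topology's cleanup thread. A toy sketch of that mark-and-sweep split; ToyTopology is illustrative only, the real logic lives in pipeline_topology.py:

import threading
import time
from typing import List, Optional


class ToyTopology:
    """Toy mark-and-sweep: callers mark actors, a daemon thread sweeps them."""

    def __init__(self) -> None:
        self._pending: List[object] = []
        self._lock = threading.Lock()
        self._running = False
        self._thread: Optional[threading.Thread] = None

    def mark_actor_for_removal(self, stage_name: str, actor: object) -> None:
        with self._lock:
            self._pending.append(actor)  # no blocking on shutdown here

    def _sweep_loop(self) -> None:
        while self._running:
            with self._lock:
                done, self._pending = self._pending, []
            for actor in done:
                print(f"reaped {actor}")  # real code would poll shutdown state first
            time.sleep(0.05)

    def start_cleanup_thread(self) -> None:
        self._running = True
        self._thread = threading.Thread(target=self._sweep_loop, daemon=True)
        self._thread.start()

    def stop_cleanup_thread(self) -> None:
        self._running = False


topo = ToyTopology()
topo.start_cleanup_thread()
topo.mark_actor_for_removal("extract", "actor-1")
time.sleep(0.1)
topo.stop_cleanup_thread()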
@@ -764,47 +925,45 @@ class RayPipeline(PipelineInterface):
         self.topology.update_scaling_state(stage_name, "Error")  # Ensure error state
 
     def _is_pipeline_quiet(self) -> bool:
-        """
+        """
+        Checks if pipeline is quiet using topology state and stats collector.
+
+        Returns
+        -------
+        bool
+            True if the pipeline is quiet, False otherwise.
+        """
+        return False  # TODO: disabled for debugging
 
         # Check topology state first
         if self.topology.get_is_flushing():
             logger.debug("Pipeline quiet check: False (Flush in progress via topology state)")
             return False
 
-        #
-
-        if
-
-
-        # Stats check (same as before)
-        current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
-            self.stats_collector.get_latest_stats()
-        )
-        last_update_age = time.time() - last_update_time
-        max_stats_age_for_quiet = max(10.0, self._stats_collection_interval_seconds * 2.5)
-
-        if not stats_were_successful:
-            logger.warning(f"Pipeline quiet check: False (Stats failed {last_update_age:.1f}s ago).")
-            return False
-
-        if last_update_age > max_stats_age_for_quiet:
-            logger.warning(
-                f"Pipeline quiet check: False (Stats too old: {last_update_age:.1f}s > {max_stats_age_for_quiet:.1f}s)."
-            )
-            return False
-
-        if not current_stage_stats:
-            logger.warning("Pipeline quiet check: False (No stats currently available).")
-            return False
+        # Check stats collector for recent activity
+        latest_stats = self.stats_collector.get_latest_stats()
+        if not latest_stats:
+            logger.debug("Pipeline quiet check: True (No stats available, assuming quiet)")
+            return True  # No stats could mean it's idle
 
-
-
+        total_in_flight = latest_stats.get("total_items_in_flight", 0)
+        logger.debug(f"Pipeline quiet check: Total in-flight items: {total_in_flight}")
 
-        return
+        return total_in_flight <= self.quiet_period_threshold
 
     def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
         """
         Actively monitors pipeline drain using direct calls to the stats collector.
+
+        Parameters
+        ----------
+        timeout_seconds : int
+            The timeout in seconds.
+
+        Returns
+        -------
+        bool
+            True if the pipeline drained successfully, False otherwise.
         """
         start_time = time.time()
         logger.info(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
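A note on the rewritten _is_pipeline_quiet: it begins with `return False  # TODO: disabled for debugging`, so the topology and stats checks after it are unreachable, and the pipeline never reports quiet in this build; the consecutive-quiet-cycle flush added in _perform_scaling_and_maintenance further down is therefore effectively switched off until that early return is removed. The new (unreachable) logic also treats stats_collector.get_latest_stats() as returning a single mapping, where the old code unpacked a 4-tuple from the same call, consistent with the ray_stat_collector.py changes listed at the top of this diff.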
@@ -860,7 +1019,14 @@ class RayPipeline(PipelineInterface):
             time.sleep(sleep_duration)
 
     def _execute_queue_flush(self) -> bool:
-        """
+        """
+        Executes queue flush, using topology for state and structure.
+
+        Returns
+        -------
+        bool
+            True if the flush was successful, False otherwise.
+        """
         if self.topology.get_is_flushing() or self._stopping:  # Check topology state
             logger.warning("Queue flush requested but already in progress or pipeline is stopping. Ignoring.")
             return False
@@ -893,6 +1059,7 @@ class RayPipeline(PipelineInterface):
                     source_actors_paused.append(actor)
                 except Exception as e:
                     logger.error(f"Failed sending pause to {actor}: {e}")
+
         if pause_refs:
             logger.debug(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
             try:
@@ -997,7 +1164,14 @@ class RayPipeline(PipelineInterface):
         return overall_success
 
     def request_queue_flush(self, force: bool = False) -> None:
-        """
+        """
+        Requests a queue flush, checking topology state.
+
+        Parameters
+        ----------
+        force : bool, optional
+            Whether to force the flush, by default False.
+        """
         logger.info(f"Manual queue flush requested (force={force}).")
 
         if self.topology.get_is_flushing() or self._stopping:  # Check topology
@@ -1014,7 +1188,21 @@ class RayPipeline(PipelineInterface):
     def _gather_controller_metrics(
         self, current_stage_stats: Dict[str, Dict[str, int]], global_in_flight: int
     ) -> Dict[str, Dict[str, Any]]:
-        """
+        """
+        Gathers metrics using provided stats and topology.
+
+        Parameters
+        ----------
+        current_stage_stats : Dict[str, Dict[str, int]]
+            The current stage statistics.
+        global_in_flight : int
+            The global in-flight count.
+
+        Returns
+        -------
+        Dict[str, Dict[str, Any]]
+            A dictionary of metrics for the controllers.
+        """
         logger.debug("[ScalingMetrics] Gathering metrics for controllers...")
         current_stage_metrics = {}
 
@@ -1046,11 +1234,11 @@ class RayPipeline(PipelineInterface):
     def _get_current_global_memory(self) -> int:
         """
         Safely retrieves the current global system memory usage (used, not free) in MB.
-        Uses the previous measurement as a fallback only if the current read fails.
 
-        Returns
-
-
+        Returns
+        -------
+        int
+            The current global memory usage (RSS/used) in MB.
         """
         try:
             # psutil.virtual_memory().used provides total RAM used by processes
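_get_current_global_memory relies on psutil's system-wide counter, as the surviving comment notes. A minimal equivalent of the reading it performs:

import psutil


def used_memory_mb() -> int:
    """Global used RAM in MB, from the same counter the method above reads."""
    return int(psutil.virtual_memory().used / (1024 * 1024))


print(used_memory_mb())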
@@ -1073,7 +1261,23 @@ class RayPipeline(PipelineInterface):
     def _calculate_scaling_adjustments(
         self, current_stage_metrics: Dict[str, Dict[str, Any]], global_in_flight: int, current_global_memory_mb: int
     ) -> Dict[str, int]:
-        """
+        """
+        Runs controllers to get target replica counts using topology for edge count.
+
+        Parameters
+        ----------
+        current_stage_metrics : Dict[str, Dict[str, Any]]
+            The current stage metrics.
+        global_in_flight : int
+            The global in-flight count.
+        current_global_memory_mb : int
+            The current global memory usage in MB.
+
+        Returns
+        -------
+        Dict[str, int]
+            A dictionary of target replica counts for the stages.
+        """
         logger.debug("[ScalingCalc] Calculating adjustments via PID and RCM...")
         # Get edge count from topology
         num_edges = len(self.topology.get_edge_queues())
@@ -1099,7 +1303,14 @@ class RayPipeline(PipelineInterface):
         return {name: metrics.get("replicas", 0) for name, metrics in current_stage_metrics.items()}
 
     def _apply_scaling_actions(self, final_adjustments: Dict[str, int]) -> None:
-        """
+        """
+        Applies scaling by calling _scale_stage, using topology for validation.
+
+        Parameters
+        ----------
+        final_adjustments : Dict[str, int]
+            A dictionary of target replica counts for the stages.
+        """
         stages_needing_action = []
         current_actors_map = self.topology.get_stage_actors()  # Snapshot
 
@@ -1166,8 +1377,9 @@ class RayPipeline(PipelineInterface):
         logger.debug(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
 
     def _perform_scaling_and_maintenance(self) -> None:
-        """
-
+        """
+        Orchestrates scaling/maintenance using topology and stats collector.
+        """
         if self._stopping:
             logger.debug("Pipeline is stopping. Skipping scaling cycle.")
             return
@@ -1194,10 +1406,29 @@ class RayPipeline(PipelineInterface):
         logger.debug("--- Performing Scaling & Maintenance Cycle ---")
 
         if self._is_pipeline_quiet():
-
-
-
-
+            self._consecutive_quiet_cycles += 1
+            logger.debug(f"Pipeline is quiet. Consecutive quiet cycles: {self._consecutive_quiet_cycles}")
+            if self._consecutive_quiet_cycles >= self.consecutive_quiet_cycles_for_flush:
+                logger.info(
+                    f"Pipeline has been quiet for {self._consecutive_quiet_cycles} cycles. "
+                    "Initiating queue flush."
+                )
+                if self._execute_queue_flush():
+                    self._last_queue_flush_time = time.time()
+                self._consecutive_quiet_cycles = 0  # Reset after attempting flush
+            else:
+                logger.debug(
+                    f"Pipeline is quiet, but waiting for {self.consecutive_quiet_cycles_for_flush} "
+                    "consecutive quiet cycles before flushing."
+                )
+        else:
+            if self._consecutive_quiet_cycles > 0:
+                logger.info(
+                    f"Pipeline is no longer quiet. Resetting consecutive quiet cycle count "
+                    f"from {self._consecutive_quiet_cycles} to 0."
+                )
+                self._consecutive_quiet_cycles = 0
+            logger.debug("Queue flush interval reached, but pipeline is not quiet. Deferring.")
 
         # Fast return check if stopping occurred while flushing or checking flush status
         if self._stopping:
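The quiet-cycle logic above is a debounce: a flush fires only after consecutive_quiet_cycles_for_flush quiet scaling cycles in a row, and any activity resets the counter. A standalone restatement of that control flow as a pure function, with the threshold of 3 from FlushingConfig:

from typing import Tuple


def quiet_cycle_step(is_quiet: bool, quiet_cycles: int, threshold: int = 3) -> Tuple[bool, int]:
    """One scaling cycle of the debounce; returns (flush_now, new_counter)."""
    if not is_quiet:
        return False, 0  # any activity resets the counter
    quiet_cycles += 1
    if quiet_cycles >= threshold:
        return True, 0  # flush, then start counting again
    return False, quiet_cycles


count = 0
for quiet in (True, True, False, True, True, True):
    flush, count = quiet_cycle_step(quiet, count)
    print(f"quiet={quiet} flush={flush} count={count}")
# Only the final cycle (the third consecutive quiet one) triggers a flush.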
@@ -1240,7 +1471,14 @@ class RayPipeline(PipelineInterface):
 
     # --- Lifecycle Methods for Monitoring/Scaling Threads ---
     def _scaling_loop(self, interval: float) -> None:
-        """
+        """
+        Main loop for the scaling thread.
+
+        Parameters
+        ----------
+        interval : float
+            The interval in seconds.
+        """
         logger.info(f"Scaling loop started. Interval: {interval}s")
         while self._scaling_monitoring:
             try:
@@ -1255,6 +1493,14 @@ class RayPipeline(PipelineInterface):
         logger.info("Scaling loop finished.")
 
     def _start_scaling(self, poll_interval: float = 10.0) -> None:
+        """
+        Starts the scaling thread.
+
+        Parameters
+        ----------
+        poll_interval : float, optional
+            The interval in seconds, by default 10.0.
+        """
         if not self._scaling_monitoring:
             self._scaling_monitoring = True
             self._scaling_thread = threading.Thread(target=self._scaling_loop, args=(poll_interval,), daemon=True)
@@ -1262,6 +1508,9 @@ class RayPipeline(PipelineInterface):
             logger.info(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
 
     def _stop_scaling(self) -> None:
+        """
+        Stops the scaling thread.
+        """
         if self._scaling_monitoring:
             logger.debug("Stopping scaling/maintenance thread...")
             self._scaling_monitoring = False
@@ -1274,77 +1523,103 @@ class RayPipeline(PipelineInterface):
 
     # --- Pipeline Start/Stop ---
     def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
-        """
-
-
+        """
+        Starts the pipeline actors and background monitoring threads.
+
+        Assumes the pipeline has already been built via the `build()` method.
+
+        Parameters
+        ----------
+        monitor_poll_interval : float, optional
+            This parameter is currently unused but is kept for interface compatibility.
+        scaling_poll_interval : float, optional
+            The interval in seconds for the autoscaling and maintenance thread to run, by default 30.0.
+        """
+        if not self.topology.get_all_actors():
             logger.error("Cannot start: Pipeline not built or has no actors.")
             return
 
         logger.info("Starting pipeline execution...")
-        start_refs = []
-        # Get actors from topology
-        actors_to_start = [actor for actors in self.topology.get_stage_actors().values() for actor in actors]
-
-        for actor in actors_to_start:
-            start_refs.append(actor.start.remote())
 
-
-
+        # Start all actors
+        actors_to_start = self.topology.get_all_actors()
+        start_futures = [actor.start.remote() for actor in actors_to_start]
+        if start_futures:
+            logger.debug(f"Waiting for {len(start_futures)} actors to start...")
             try:
-                ray.get(
-                logger.info(f"{len(
+                ray.get(start_futures, timeout=60.0)
+                logger.info(f"{len(start_futures)} actors started.")
             except Exception as e:
                 logger.error(f"Error/Timeout starting actors: {e}", exc_info=True)
                 self.stop()  # Attempt cleanup
-
                 raise RuntimeError("Pipeline start failed: actors did not start.") from e
 
+        # Start background threads
+        self.topology.start_cleanup_thread()
         self.stats_collector.start()
         self._start_scaling(poll_interval=scaling_poll_interval)
+
         logger.info("Pipeline started successfully.")
 
     def stop(self) -> None:
-        """
+        """
+        Stops the pipeline and all associated actors and threads.
+
+        This method performs a graceful shutdown by:
+        1. Stopping the autoscaling and statistics collection threads.
+        2. Signaling all actors to stop and waiting for confirmation.
+        3. Stopping the topology cleanup thread.
+        4. Clearing all runtime state from the topology.
+        """
         logger.info("Stopping pipeline...")
 
         if self._stopping:
+            logger.warning("Stop already in progress.")
             return
         self._stopping = True
 
-        # 1. Stop background threads first
-
-
-        self.stats_collector.stop()
+        # 1. Stop background threads first to prevent new actions
+        self._stop_scaling()
+        self.stats_collector.stop()
 
-
-
-
+        # 2. Stop all actors and wait for them
+        logger.debug("Stopping all actors in the topology...")
+        all_actors = self.topology.get_all_actors()
+        if all_actors:
+            stop_futures = [actor.stop.remote() for actor in all_actors]
+            try:
+                ready, not_ready = ray.wait(stop_futures, num_returns=len(stop_futures), timeout=60.0)
+                if not_ready:
+                    logger.warning(
+                        f"Timeout waiting for {len(not_ready)} actors to stop. " f"Proceeding with shutdown."
+                    )
+                logger.info(f"{len(ready)} actors confirmed stop.")
+            except Exception as e:
+                logger.error(f"An unexpected error occurred during actor shutdown: {e}", exc_info=True)
 
-
-
+        # 3. Stop the topology cleanup thread now that actors are stopped
+        self.topology.stop_cleanup_thread()
 
-
-
-
-                    stop_refs_map[actor.stop.remote()] = actor
-                except Exception as e:
-                    logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Skipping.")
+        # 4. Clear all state and references
+        logger.debug("Clearing pipeline topology runtime state.")
+        self.topology.clear_runtime_state()
 
-
-
-        logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
-        try:
-            ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
-            if not_ready:
-                logger.warning(
-                    f"Timeout waiting for {len(not_ready)} actors to stop. Allowing Ray to clean up."
-                )
-            logger.info(f"{len(ready)} actors stopped via stop().")
-        except Exception as e:
-            logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
+        self._stopping = False
+        logger.info("Pipeline stopped successfully.")
 
-
-
-
+    def __enter__(self) -> "RayPipeline":
+        """
+        Enter the runtime context related to this object.
 
-
+        Returns
+        -------
+        RayPipeline
+            The pipeline instance.
+        """
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """
+        Exit the runtime context related to this object.
+        """
+        self.stop()