nv-ingest 25.7.7.dev20250707__py3-none-any.whl → 25.8.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. (The link to further details was not preserved in this text export.)

@@ -7,7 +7,6 @@ import os
7
7
  import signal
8
8
  import threading
9
9
  from abc import ABC, abstractmethod
10
- from collections import defaultdict
11
10
  from dataclasses import dataclass
12
11
  from types import FunctionType
13
12
 
@@ -70,14 +69,13 @@ class ScalingConfig:
70
69
 
71
70
  dynamic_memory_scaling: bool = True
72
71
  dynamic_memory_threshold: float = 0.75
73
- pid_kp: float = 0.1
74
- pid_ki: float = 0.001
72
+ pid_kp: float = 0.2
73
+ pid_ki: float = 0.01
75
74
  pid_kd: float = 0.0
75
+ pid_ema_alpha: float = 0.1
76
76
  pid_target_queue_depth: int = 0
77
77
  pid_penalty_factor: float = 0.1
78
78
  pid_error_boost_factor: float = 1.5
79
- pid_window_size: int = 10
80
- rcm_estimated_edge_cost_mb: int = 5000
81
79
  rcm_memory_safety_buffer_fraction: float = 0.15
82
80
 
83
81
 
@@ -88,6 +86,7 @@ class FlushingConfig:
88
86
  queue_flush_interval_seconds: int = 600
89
87
  queue_flush_drain_timeout_seconds: int = 300
90
88
  quiet_period_threshold: int = 0
89
+ consecutive_quiet_cycles_for_flush: int = 3
91
90
 
92
91
 
93
92
  @dataclass
@@ -197,6 +196,18 @@ class RayPipeline(PipelineInterface):
197
196
  flushing_config: FlushingConfig = FlushingConfig(),
198
197
  stats_config: StatsConfig = StatsConfig(),
199
198
  ) -> None:
199
+ """
200
+ Initializes the RayPipeline.
201
+
202
+ Parameters
203
+ ----------
204
+ scaling_config : ScalingConfig, optional
205
+ Configuration for PID and resource constraint-based scaling, by default ScalingConfig().
206
+ flushing_config : FlushingConfig, optional
207
+ Configuration for queue flushing behavior, by default FlushingConfig().
208
+ stats_config : StatsConfig, optional
209
+ Configuration for the RayStatsCollector, by default StatsConfig().
210
+ """
200
211
  # Store config objects
201
212
  self.scaling_config = scaling_config
202
213
  self.flushing_config = flushing_config
@@ -218,7 +229,6 @@ class RayPipeline(PipelineInterface):
218
229
  # Use scaling_config for these
219
230
  self.dynamic_memory_scaling = self.scaling_config.dynamic_memory_scaling
220
231
  self.dynamic_memory_threshold = self.scaling_config.dynamic_memory_threshold
221
- self.stage_memory_overhead: Dict[str, float] = {}
222
232
 
223
233
  # --- Background Threads ---
224
234
  self._scaling_thread: Optional[threading.Thread] = None
@@ -229,6 +239,8 @@ class RayPipeline(PipelineInterface):
229
239
  self.queue_flush_interval_seconds = self.flushing_config.queue_flush_interval_seconds
230
240
  self.queue_flush_drain_timeout_seconds = self.flushing_config.queue_flush_drain_timeout_seconds
231
241
  self.quiet_period_threshold = self.flushing_config.quiet_period_threshold
242
+ self.consecutive_quiet_cycles_for_flush = self.flushing_config.consecutive_quiet_cycles_for_flush
243
+ self._consecutive_quiet_cycles = 0
232
244
 
233
245
  # --- Instantiate Autoscaling Controllers ---
234
246
  # Use scaling_config
@@ -236,9 +248,7 @@ class RayPipeline(PipelineInterface):
236
248
  kp=self.scaling_config.pid_kp,
237
249
  ki=self.scaling_config.pid_ki,
238
250
  kd=self.scaling_config.pid_kd,
239
- stage_cost_estimates={}, # Populated during build
240
251
  target_queue_depth=self.scaling_config.pid_target_queue_depth,
241
- window_size=self.scaling_config.pid_window_size,
242
252
  penalty_factor=self.scaling_config.pid_penalty_factor,
243
253
  error_boost_factor=self.scaling_config.pid_error_boost_factor,
244
254
  )
@@ -258,7 +268,6 @@ class RayPipeline(PipelineInterface):
258
268
  self.constraint_manager = ResourceConstraintManager(
259
269
  max_replicas=1, # Updated during build
260
270
  memory_threshold=absolute_memory_threshold_mb,
261
- estimated_edge_cost_mb=self.scaling_config.rcm_estimated_edge_cost_mb,
262
271
  memory_safety_buffer_fraction=self.scaling_config.rcm_memory_safety_buffer_fraction,
263
272
  )
264
273
  logger.info("ResourceConstraintManager initialized using ScalingConfig.")
@@ -270,6 +279,7 @@ class RayPipeline(PipelineInterface):
270
279
  interval=self.stats_config.collection_interval_seconds,
271
280
  actor_timeout=self.stats_config.actor_timeout_seconds,
272
281
  queue_timeout=self.stats_config.queue_timeout_seconds,
282
+ ema_alpha=self.scaling_config.pid_ema_alpha,
273
283
  )
274
284
 
275
285
  logger.info("RayStatsCollector initialized using StatsConfig.")
@@ -277,21 +287,43 @@ class RayPipeline(PipelineInterface):
277
287
  # --- Accessor Methods for Stat Collector (and internal use) ---
278
288
 
279
289
  def __del__(self):
290
+ """Ensures the pipeline is stopped upon garbage collection."""
280
291
  try:
281
292
  self.stop()
282
293
  except Exception as e:
283
294
  logger.error(f"Exception during RayPipeline cleanup: {e}")
284
295
 
285
296
  def get_stages_info(self) -> List[StageInfo]:
286
- """Returns a snapshot of the current stage information."""
297
+ """
298
+ Returns a snapshot of the current stage information.
299
+
300
+ Returns
301
+ -------
302
+ List[StageInfo]
303
+ A list of StageInfo objects from the topology.
304
+ """
287
305
  return self.topology.get_stages_info()
288
306
 
289
307
  def get_stage_actors(self) -> Dict[str, List[Any]]:
290
- """Returns a snapshot of the current actors per stage."""
308
+ """
309
+ Returns a snapshot of the current actors per stage.
310
+
311
+ Returns
312
+ -------
313
+ Dict[str, List[Any]]
314
+ A dictionary mapping stage names to lists of actor handles.
315
+ """
291
316
  return self.topology.get_stage_actors()
292
317
 
293
318
  def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
294
- """Returns a snapshot of the current edge queues."""
319
+ """
320
+ Returns a snapshot of the current edge queues.
321
+
322
+ Returns
323
+ -------
324
+ Dict[str, Tuple[Any, int]]
325
+ A dictionary mapping queue names to tuples of (queue_handle, queue_size).
326
+ """
295
327
  return self.topology.get_edge_queues()
296
328
 
297
329
  def _configure_autoscalers(self) -> None:
@@ -310,9 +342,6 @@ class RayPipeline(PipelineInterface):
310
342
  # For now, let's store a dummy overhead in topology during build
311
343
  overhead_bytes = default_cost_bytes # Simplification for now
312
344
  stage_overheads[stage.name] = overhead_bytes # Store locally first
313
- cost_mb = max(1, int(overhead_bytes / (1024 * 1024)))
314
- # Update controller directly (or via dedicated method if preferred)
315
- self.pid_controller.stage_cost_estimates[stage.name] = cost_mb
316
345
 
317
346
  # Update topology with collected overheads
318
347
  self.topology.set_stage_memory_overhead(stage_overheads)
@@ -321,7 +350,6 @@ class RayPipeline(PipelineInterface):
321
350
  self.constraint_manager.max_replicas = total_max_replicas
322
351
 
323
352
  logger.info(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
324
- logger.debug(f"[Build-Configure] PID stage cost estimates (MB): {self.pid_controller.stage_cost_estimates}")
325
353
 
326
354
  def _instantiate_initial_actors(self) -> None:
327
355
  """Instantiates initial actors and updates topology."""
@@ -348,9 +376,9 @@ class RayPipeline(PipelineInterface):
348
376
  f" for '{stage.name}'"
349
377
  )
350
378
  try:
351
- actor = stage.callable.options(
352
- name=actor_name, max_concurrency=10, max_restarts=0, lifetime="detached"
353
- ).remote(config=stage.config)
379
+ actor = stage.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
380
+ config=stage.config
381
+ )
354
382
  replicas.append(actor)
355
383
  except Exception as e:
356
384
  logger.error(f"[Build-Actors] Failed create actor '{actor_name}': {e}", exc_info=True)
@@ -363,7 +391,14 @@ class RayPipeline(PipelineInterface):
363
391
  logger.info("[Build-Actors] Initial actor instantiation complete.")
364
392
 
365
393
  def _create_and_wire_edges(self) -> List[ray.ObjectRef]:
366
- """Creates queues, wires actors (using topology), and updates topology."""
394
+ """
395
+ Creates queues, wires actors (using topology), and updates topology.
396
+
397
+ Returns
398
+ -------
399
+ List[ray.ObjectRef]
400
+ A list of object references for the remote wiring calls.
401
+ """
367
402
  logger.info("[Build-Wiring] Creating and wiring edges...")
368
403
  wiring_refs = []
369
404
  new_edge_queues: Dict[str, Tuple[Any, int]] = {}
@@ -400,7 +435,14 @@ class RayPipeline(PipelineInterface):
400
435
 
401
436
  @staticmethod
402
437
  def _wait_for_wiring(wiring_refs: List[ray.ObjectRef]) -> None:
403
- """Waits for remote wiring calls to complete. (Static, no changes needed)."""
438
+ """
439
+ Waits for remote wiring calls to complete.
440
+
441
+ Parameters
442
+ ----------
443
+ wiring_refs : List[ray.ObjectRef]
444
+ A list of object references for the wiring calls.
445
+ """
404
446
  if not wiring_refs:
405
447
  logger.debug("[Build-WaitWiring] No wiring calls.")
406
448
  return
@@ -415,6 +457,27 @@ class RayPipeline(PipelineInterface):
415
457
  def add_source(
416
458
  self, *, name: str, source_actor: Any, config: BaseModel, min_replicas: int = 1, max_replicas: int = 1
417
459
  ) -> "RayPipeline":
460
+ """
461
+ Adds a source stage to the pipeline.
462
+
463
+ Parameters
464
+ ----------
465
+ name : str
466
+ The name of the source stage.
467
+ source_actor : Any
468
+ The actor or callable for the source stage.
469
+ config : BaseModel
470
+ The configuration for the source stage.
471
+ min_replicas : int, optional
472
+ The minimum number of replicas for the source stage, by default 1.
473
+ max_replicas : int, optional
474
+ The maximum number of replicas for the source stage, by default 1.
475
+
476
+ Returns
477
+ -------
478
+ RayPipeline
479
+ The pipeline instance.
480
+ """
418
481
  if min_replicas < 1:
419
482
  logger.warning(f"Source stage '{name}': min_replicas must be >= 1. Overriding.")
420
483
  min_replicas = 1
@@ -440,6 +503,27 @@ class RayPipeline(PipelineInterface):
440
503
  min_replicas: int = 0,
441
504
  max_replicas: int = 1,
442
505
  ) -> "RayPipeline":
506
+ """
507
+ Adds a stage to the pipeline.
508
+
509
+ Parameters
510
+ ----------
511
+ name : str
512
+ The name of the stage.
513
+ stage_actor : Any
514
+ The actor or callable for the stage.
515
+ config : BaseModel
516
+ The configuration for the stage.
517
+ min_replicas : int, optional
518
+ The minimum number of replicas for the stage, by default 0.
519
+ max_replicas : int, optional
520
+ The maximum number of replicas for the stage, by default 1.
521
+
522
+ Returns
523
+ -------
524
+ RayPipeline
525
+ The pipeline instance.
526
+ """
443
527
  if min_replicas < 0:
444
528
  logger.warning(f"Stage '{name}': min_replicas cannot be negative. Overriding to 0.")
445
529
  min_replicas = 0
@@ -471,6 +555,27 @@ class RayPipeline(PipelineInterface):
471
555
  def add_sink(
472
556
  self, *, name: str, sink_actor: Any, config: BaseModel, min_replicas: int = 1, max_replicas: int = 1
473
557
  ) -> "RayPipeline":
558
+ """
559
+ Adds a sink stage to the pipeline.
560
+
561
+ Parameters
562
+ ----------
563
+ name : str
564
+ The name of the sink stage.
565
+ sink_actor : Any
566
+ The actor or callable for the sink stage.
567
+ config : BaseModel
568
+ The configuration for the sink stage.
569
+ min_replicas : int, optional
570
+ The minimum number of replicas for the sink stage, by default 1.
571
+ max_replicas : int, optional
572
+ The maximum number of replicas for the sink stage, by default 1.
573
+
574
+ Returns
575
+ -------
576
+ RayPipeline
577
+ The pipeline instance.
578
+ """
474
579
  # Sink min_replicas can realistically be 0 if data drain is optional/best-effort? Let's allow 0.
475
580
  if min_replicas < 0:
476
581
  logger.warning(f"Sink stage '{name}': min_replicas cannot be negative. Overriding to 0.")
@@ -489,6 +594,23 @@ class RayPipeline(PipelineInterface):
489
594
 
490
595
  # --- Method for defining connections ---
491
596
  def make_edge(self, from_stage: str, to_stage: str, queue_size: int = 100) -> "RayPipeline":
597
+ """
598
+ Creates an edge between two stages in the pipeline.
599
+
600
+ Parameters
601
+ ----------
602
+ from_stage : str
603
+ The name of the source stage.
604
+ to_stage : str
605
+ The name of the destination stage.
606
+ queue_size : int, optional
607
+ The size of the queue between the stages, by default 100.
608
+
609
+ Returns
610
+ -------
611
+ RayPipeline
612
+ The pipeline instance.
613
+ """
492
614
  try:
493
615
  self.topology.add_connection(from_stage, to_stage, queue_size) # Delegate (includes validation)
494
616
  except ValueError as e:
@@ -498,7 +620,14 @@ class RayPipeline(PipelineInterface):
498
620
 
499
621
  # ----- Pipeline Build Process ---
500
622
  def build(self) -> Dict[str, List[Any]]:
501
- """Builds the pipeline: configures, instantiates, wires, using topology."""
623
+ """
624
+ Builds the pipeline: configures, instantiates, wires, using topology.
625
+
626
+ Returns
627
+ -------
628
+ Dict[str, List[Any]]
629
+ A dictionary mapping stage names to lists of actor handles.
630
+ """
502
631
  logger.info("--- Starting Pipeline Build Process ---")
503
632
  try:
504
633
  if not self.topology.get_stages_info():
@@ -527,13 +656,25 @@ class RayPipeline(PipelineInterface):
527
656
  # --- Scaling Logic ---
528
657
  @staticmethod
529
658
  def _create_single_replica(stage_info: StageInfo) -> Any:
530
- """Creates a single new Ray actor replica for the given stage."""
659
+ """
660
+ Creates a single new Ray actor replica for the given stage.
661
+
662
+ Parameters
663
+ ----------
664
+ stage_info : StageInfo
665
+ The stage information.
666
+
667
+ Returns
668
+ -------
669
+ Any
670
+ The new actor handle.
671
+ """
531
672
  actor_name = f"{stage_info.name}_{uuid.uuid4()}"
532
673
  logger.debug(f"[ScaleUtil] Creating new actor '{actor_name}' for stage '{stage_info.name}'")
533
674
  try:
534
- new_actor = stage_info.callable.options(
535
- name=actor_name, max_concurrency=10, max_restarts=0, lifetime="detached"
536
- ).remote(config=stage_info.config)
675
+ new_actor = stage_info.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
676
+ config=stage_info.config
677
+ )
537
678
 
538
679
  return new_actor
539
680
  except Exception as e:
@@ -546,7 +687,21 @@ class RayPipeline(PipelineInterface):
546
687
  raise RuntimeError(f"Actor creation failed for stage '{stage_info.name}' during scale up") from e
547
688
 
548
689
  def _get_wiring_refs_for_actor(self, actor: Any, stage_name: str) -> List[ray.ObjectRef]:
549
- """Gets wiring futures for a single actor using topology for queues/connections."""
690
+ """
691
+ Gets wiring futures for a single actor using topology for queues/connections.
692
+
693
+ Parameters
694
+ ----------
695
+ actor : Any
696
+ The actor handle.
697
+ stage_name : str
698
+ The name of the stage.
699
+
700
+ Returns
701
+ -------
702
+ List[ray.ObjectRef]
703
+ A list of object references for the wiring calls.
704
+ """
550
705
  wiring_refs = []
551
706
 
552
707
  # Use topology accessors
@@ -574,7 +729,16 @@ class RayPipeline(PipelineInterface):
574
729
 
575
730
  @staticmethod
576
731
  def _start_actors(actors_to_start: List[Any], stage_name: str) -> None:
577
- """Starts a list of actors if they have a 'start' method and waits for completion."""
732
+ """
733
+ Starts a list of actors if they have a 'start' method and waits for completion.
734
+
735
+ Parameters
736
+ ----------
737
+ actors_to_start : List[Any]
738
+ A list of actor handles.
739
+ stage_name : str
740
+ The name of the stage.
741
+ """
578
742
  start_refs = []
579
743
  for actor in actors_to_start:
580
744
  if hasattr(actor, "start"):
@@ -598,7 +762,18 @@ class RayPipeline(PipelineInterface):
598
762
  raise RuntimeError(f"Error confirming actor starts for stage '{stage_name}'") from e
599
763
 
600
764
  def _handle_scale_up(self, stage_info: StageInfo, current_count: int, target_count: int) -> None:
601
- """Handles scaling up, interacting with topology."""
765
+ """
766
+ Handles scaling up, interacting with topology.
767
+
768
+ Parameters
769
+ ----------
770
+ stage_info : StageInfo
771
+ The stage information.
772
+ current_count : int
773
+ The current number of replicas.
774
+ target_count : int
775
+ The target number of replicas.
776
+ """
602
777
  stage_name = stage_info.name
603
778
  num_to_add = target_count - current_count
604
779
  logger.debug(f"[ScaleUp-{stage_name}] Scaling up from {current_count} to {target_count} (+{num_to_add}).")
@@ -662,8 +837,17 @@ class RayPipeline(PipelineInterface):
662
837
 
663
838
  def _handle_scale_down(self, stage_name: str, current_replicas: List[Any], target_count: int) -> None:
664
839
  """
665
- Handles scaling down: initiates stop on actors, registers handles with
666
- the topology for pending removal if stop was successfully initiated.
840
+ Handles scaling down: initiates stop on actors and marks them for removal
841
+ by the topology's garbage collection mechanism.
842
+
843
+ Parameters
844
+ ----------
845
+ stage_name : str
846
+ The name of the stage.
847
+ current_replicas : List[Any]
848
+ A list of actor handles.
849
+ target_count : int
850
+ The target number of replicas.
667
851
  """
668
852
  current_count = len(current_replicas)
669
853
  num_to_remove = current_count - target_count
@@ -671,59 +855,36 @@ class RayPipeline(PipelineInterface):
671
855
  f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove})."
672
856
  )
673
857
 
674
- # Basic validation
675
858
  if num_to_remove <= 0:
676
- logger.warning(f"[ScaleDown-{stage_name}] Invalid num_to_remove {num_to_remove}. Aborting.")
677
859
  return
678
860
 
679
- # Identify actors to remove (last N)
861
+ # Select actors to remove (e.g., the most recently added)
680
862
  actors_to_remove = current_replicas[-num_to_remove:]
681
- logger.debug(f"[ScaleDown-{stage_name}] Identified {len(actors_to_remove)} actors for removal.")
682
863
 
683
- actors_to_register_map: Dict[str, List[Tuple[Any, ray.ObjectRef]]] = defaultdict(list)
684
- stop_initiation_failures = 0
864
+ logger.info(f"[ScaleDown-{stage_name}] Selected {len(actors_to_remove)} actors for removal.")
685
865
 
866
+ # Signal each actor to stop and mark it for removal by the topology.
867
+ # The topology's cleanup thread will handle polling and final removal.
686
868
  for actor in actors_to_remove:
687
- actor_id_str = str(actor)
688
869
  try:
689
- # Call stop(), which now returns shutdown future
690
- shutdown_future = actor.stop.remote()
691
- actors_to_register_map[stage_name].append((actor, shutdown_future))
692
- logger.debug(f"[ScaleDown-{stage_name}] Submitted stop() call for actor '{actor_id_str}'.")
870
+ actor.stop.remote() # Signal the actor's loop to stop
871
+ self.topology.mark_actor_for_removal(stage_name, actor)
693
872
  except Exception as e:
694
- logger.error(
695
- f"[ScaleDown-{stage_name}] Error submitting stop() for actor '{actor_id_str}': "
696
- f"{e}. Cannot register.",
697
- exc_info=False,
698
- )
699
- stop_initiation_failures += 1
700
-
701
- # Register actors pending removal (with their shutdown futures)
702
- if actors_to_register_map:
703
- num_registered = sum(len(v) for v in actors_to_register_map.values())
704
- logger.debug(
705
- f"[ScaleDown-{stage_name}] Registering {num_registered} "
706
- f"actor handles with topology for shutdown monitoring."
707
- )
708
- try:
709
- self.topology.register_actors_pending_removal(actors_to_register_map)
710
- except Exception as e:
711
- logger.error(
712
- f"[ScaleDown-{stage_name}] CRITICAL - Failed to register actors pending removal with topology: {e}",
713
- exc_info=True,
714
- )
715
- self.topology.update_scaling_state(stage_name, "Error")
716
- elif actors_to_remove:
717
- logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")
873
+ logger.error(f"[ScaleDown-{stage_name}] Failed to initiate stop for actor {actor}: {e}")
718
874
 
719
- total_attempted = len(actors_to_remove)
720
- logger.debug(
721
- f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
722
- f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
723
- )
875
+ logger.debug(f"[ScaleDown-{stage_name}] Scale down initiation complete for {len(actors_to_remove)} actors.")
724
876
 
725
877
  def _scale_stage(self, stage_name: str, new_replica_count: int) -> None:
726
- """Orchestrates scaling using topology for state and info."""
878
+ """
879
+ Orchestrates scaling using topology for state and info.
880
+
881
+ Parameters
882
+ ----------
883
+ stage_name : str
884
+ The name of the stage.
885
+ new_replica_count : int
886
+ The new number of replicas.
887
+ """
727
888
  logger.debug(f"[ScaleStage-{stage_name}] Request for target count: {new_replica_count}")
728
889
 
729
890
  # --- Use Topology Accessors ---
@@ -764,47 +925,45 @@ class RayPipeline(PipelineInterface):
764
925
  self.topology.update_scaling_state(stage_name, "Error") # Ensure error state
765
926
 
766
927
  def _is_pipeline_quiet(self) -> bool:
767
- """Checks if pipeline is quiet using topology state and stats collector."""
928
+ """
929
+ Checks if pipeline is quiet using topology state and stats collector.
930
+
931
+ Returns
932
+ -------
933
+ bool
934
+ True if the pipeline is quiet, False otherwise.
935
+ """
936
+ return False # TODO: disabled for debugging
768
937
 
769
938
  # Check topology state first
770
939
  if self.topology.get_is_flushing():
771
940
  logger.debug("Pipeline quiet check: False (Flush in progress via topology state)")
772
941
  return False
773
942
 
774
- # Time check
775
- time_since_last_flush = time.time() - self._last_queue_flush_time
776
- if time_since_last_flush < self.queue_flush_interval_seconds:
777
- return False
778
-
779
- # Stats check (same as before)
780
- current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
781
- self.stats_collector.get_latest_stats()
782
- )
783
- last_update_age = time.time() - last_update_time
784
- max_stats_age_for_quiet = max(10.0, self._stats_collection_interval_seconds * 2.5)
785
-
786
- if not stats_were_successful:
787
- logger.warning(f"Pipeline quiet check: False (Stats failed {last_update_age:.1f}s ago).")
788
- return False
789
-
790
- if last_update_age > max_stats_age_for_quiet:
791
- logger.warning(
792
- f"Pipeline quiet check: False (Stats too old: {last_update_age:.1f}s > {max_stats_age_for_quiet:.1f}s)."
793
- )
794
- return False
795
-
796
- if not current_stage_stats:
797
- logger.warning("Pipeline quiet check: False (No stats currently available).")
798
- return False
943
+ # Check stats collector for recent activity
944
+ latest_stats = self.stats_collector.get_latest_stats()
945
+ if not latest_stats:
946
+ logger.debug("Pipeline quiet check: True (No stats available, assuming quiet)")
947
+ return True # No stats could mean it's idle
799
948
 
800
- # Activity check
801
- is_quiet = global_in_flight <= self.quiet_period_threshold
949
+ total_in_flight = latest_stats.get("total_items_in_flight", 0)
950
+ logger.debug(f"Pipeline quiet check: Total in-flight items: {total_in_flight}")
802
951
 
803
- return is_quiet
952
+ return total_in_flight <= self.quiet_period_threshold
804
953
 
805
954
  def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
806
955
  """
807
956
  Actively monitors pipeline drain using direct calls to the stats collector.
957
+
958
+ Parameters
959
+ ----------
960
+ timeout_seconds : int
961
+ The timeout in seconds.
962
+
963
+ Returns
964
+ -------
965
+ bool
966
+ True if the pipeline drained successfully, False otherwise.
808
967
  """
809
968
  start_time = time.time()
810
969
  logger.info(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
@@ -860,7 +1019,14 @@ class RayPipeline(PipelineInterface):
860
1019
  time.sleep(sleep_duration)
861
1020
 
862
1021
  def _execute_queue_flush(self) -> bool:
863
- """Executes queue flush, using topology for state and structure."""
1022
+ """
1023
+ Executes queue flush, using topology for state and structure.
1024
+
1025
+ Returns
1026
+ -------
1027
+ bool
1028
+ True if the flush was successful, False otherwise.
1029
+ """
864
1030
  if self.topology.get_is_flushing() or self._stopping: # Check topology state
865
1031
  logger.warning("Queue flush requested but already in progress or pipeline is stopping. Ignoring.")
866
1032
  return False
@@ -893,6 +1059,7 @@ class RayPipeline(PipelineInterface):
893
1059
  source_actors_paused.append(actor)
894
1060
  except Exception as e:
895
1061
  logger.error(f"Failed sending pause to {actor}: {e}")
1062
+
896
1063
  if pause_refs:
897
1064
  logger.debug(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
898
1065
  try:
@@ -997,7 +1164,14 @@ class RayPipeline(PipelineInterface):
997
1164
  return overall_success
998
1165
 
999
1166
  def request_queue_flush(self, force: bool = False) -> None:
1000
- """Requests a queue flush, checking topology state."""
1167
+ """
1168
+ Requests a queue flush, checking topology state.
1169
+
1170
+ Parameters
1171
+ ----------
1172
+ force : bool, optional
1173
+ Whether to force the flush, by default False.
1174
+ """
1001
1175
  logger.info(f"Manual queue flush requested (force={force}).")
1002
1176
 
1003
1177
  if self.topology.get_is_flushing() or self._stopping: # Check topology
@@ -1014,7 +1188,21 @@ class RayPipeline(PipelineInterface):
1014
1188
  def _gather_controller_metrics(
1015
1189
  self, current_stage_stats: Dict[str, Dict[str, int]], global_in_flight: int
1016
1190
  ) -> Dict[str, Dict[str, Any]]:
1017
- """Gathers metrics using provided stats and topology."""
1191
+ """
1192
+ Gathers metrics using provided stats and topology.
1193
+
1194
+ Parameters
1195
+ ----------
1196
+ current_stage_stats : Dict[str, Dict[str, int]]
1197
+ The current stage statistics.
1198
+ global_in_flight : int
1199
+ The global in-flight count.
1200
+
1201
+ Returns
1202
+ -------
1203
+ Dict[str, Dict[str, Any]]
1204
+ A dictionary of metrics for the controllers.
1205
+ """
1018
1206
  logger.debug("[ScalingMetrics] Gathering metrics for controllers...")
1019
1207
  current_stage_metrics = {}
1020
1208
 
@@ -1046,11 +1234,11 @@ class RayPipeline(PipelineInterface):
1046
1234
  def _get_current_global_memory(self) -> int:
1047
1235
  """
1048
1236
  Safely retrieves the current global system memory usage (used, not free) in MB.
1049
- Uses the previous measurement as a fallback only if the current read fails.
1050
1237
 
1051
- Returns:
1052
- int: Current global memory usage (RSS/used) in MB. Returns previous value
1053
- or 0 if the read fails and no previous value exists.
1238
+ Returns
1239
+ -------
1240
+ int
1241
+ The current global memory usage (RSS/used) in MB.
1054
1242
  """
1055
1243
  try:
1056
1244
  # psutil.virtual_memory().used provides total RAM used by processes
@@ -1073,7 +1261,23 @@ class RayPipeline(PipelineInterface):
1073
1261
  def _calculate_scaling_adjustments(
1074
1262
  self, current_stage_metrics: Dict[str, Dict[str, Any]], global_in_flight: int, current_global_memory_mb: int
1075
1263
  ) -> Dict[str, int]:
1076
- """Runs controllers to get target replica counts using topology for edge count."""
1264
+ """
1265
+ Runs controllers to get target replica counts using topology for edge count.
1266
+
1267
+ Parameters
1268
+ ----------
1269
+ current_stage_metrics : Dict[str, Dict[str, Any]]
1270
+ The current stage metrics.
1271
+ global_in_flight : int
1272
+ The global in-flight count.
1273
+ current_global_memory_mb : int
1274
+ The current global memory usage in MB.
1275
+
1276
+ Returns
1277
+ -------
1278
+ Dict[str, int]
1279
+ A dictionary of target replica counts for the stages.
1280
+ """
1077
1281
  logger.debug("[ScalingCalc] Calculating adjustments via PID and RCM...")
1078
1282
  # Get edge count from topology
1079
1283
  num_edges = len(self.topology.get_edge_queues())
@@ -1099,7 +1303,14 @@ class RayPipeline(PipelineInterface):
1099
1303
  return {name: metrics.get("replicas", 0) for name, metrics in current_stage_metrics.items()}
1100
1304
 
1101
1305
  def _apply_scaling_actions(self, final_adjustments: Dict[str, int]) -> None:
1102
- """Applies scaling by calling _scale_stage, using topology for validation."""
1306
+ """
1307
+ Applies scaling by calling _scale_stage, using topology for validation.
1308
+
1309
+ Parameters
1310
+ ----------
1311
+ final_adjustments : Dict[str, int]
1312
+ A dictionary of target replica counts for the stages.
1313
+ """
1103
1314
  stages_needing_action = []
1104
1315
  current_actors_map = self.topology.get_stage_actors() # Snapshot
1105
1316
 
@@ -1166,8 +1377,9 @@ class RayPipeline(PipelineInterface):
1166
1377
  logger.debug(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
1167
1378
 
1168
1379
  def _perform_scaling_and_maintenance(self) -> None:
1169
- """Orchestrates scaling/maintenance using topology and stats collector."""
1170
-
1380
+ """
1381
+ Orchestrates scaling/maintenance using topology and stats collector.
1382
+ """
1171
1383
  if self._stopping:
1172
1384
  logger.debug("Pipeline is stopping. Skipping scaling cycle.")
1173
1385
  return
@@ -1194,10 +1406,29 @@ class RayPipeline(PipelineInterface):
1194
1406
  logger.debug("--- Performing Scaling & Maintenance Cycle ---")
1195
1407
 
1196
1408
  if self._is_pipeline_quiet():
1197
- logger.info("[Drain] Pipeline quiet, initiating queue flush.")
1198
- flush_success = self._execute_queue_flush()
1199
- logger.info(f"[Drain] Automatic queue flush completed. Success: {flush_success}")
1200
- return
1409
+ self._consecutive_quiet_cycles += 1
1410
+ logger.debug(f"Pipeline is quiet. Consecutive quiet cycles: {self._consecutive_quiet_cycles}")
1411
+ if self._consecutive_quiet_cycles >= self.consecutive_quiet_cycles_for_flush:
1412
+ logger.info(
1413
+ f"Pipeline has been quiet for {self._consecutive_quiet_cycles} cycles. "
1414
+ "Initiating queue flush."
1415
+ )
1416
+ if self._execute_queue_flush():
1417
+ self._last_queue_flush_time = time.time()
1418
+ self._consecutive_quiet_cycles = 0 # Reset after attempting flush
1419
+ else:
1420
+ logger.debug(
1421
+ f"Pipeline is quiet, but waiting for {self.consecutive_quiet_cycles_for_flush} "
1422
+ "consecutive quiet cycles before flushing."
1423
+ )
1424
+ else:
1425
+ if self._consecutive_quiet_cycles > 0:
1426
+ logger.info(
1427
+ f"Pipeline is no longer quiet. Resetting consecutive quiet cycle count "
1428
+ f"from {self._consecutive_quiet_cycles} to 0."
1429
+ )
1430
+ self._consecutive_quiet_cycles = 0
1431
+ logger.debug("Queue flush interval reached, but pipeline is not quiet. Deferring.")
1201
1432
 
1202
1433
  # Fast return check if stopping occurred while flushing or checking flush status
1203
1434
  if self._stopping:
@@ -1240,7 +1471,14 @@ class RayPipeline(PipelineInterface):
1240
1471
 
1241
1472
  # --- Lifecycle Methods for Monitoring/Scaling Threads ---
1242
1473
  def _scaling_loop(self, interval: float) -> None:
1243
- """Main loop for the scaling thread."""
1474
+ """
1475
+ Main loop for the scaling thread.
1476
+
1477
+ Parameters
1478
+ ----------
1479
+ interval : float
1480
+ The interval in seconds.
1481
+ """
1244
1482
  logger.info(f"Scaling loop started. Interval: {interval}s")
1245
1483
  while self._scaling_monitoring:
1246
1484
  try:
@@ -1255,6 +1493,14 @@ class RayPipeline(PipelineInterface):
1255
1493
  logger.info("Scaling loop finished.")
1256
1494
 
1257
1495
  def _start_scaling(self, poll_interval: float = 10.0) -> None:
1496
+ """
1497
+ Starts the scaling thread.
1498
+
1499
+ Parameters
1500
+ ----------
1501
+ poll_interval : float, optional
1502
+ The interval in seconds, by default 10.0.
1503
+ """
1258
1504
  if not self._scaling_monitoring:
1259
1505
  self._scaling_monitoring = True
1260
1506
  self._scaling_thread = threading.Thread(target=self._scaling_loop, args=(poll_interval,), daemon=True)
@@ -1262,6 +1508,9 @@ class RayPipeline(PipelineInterface):
1262
1508
  logger.info(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
1263
1509
 
1264
1510
  def _stop_scaling(self) -> None:
1511
+ """
1512
+ Stops the scaling thread.
1513
+ """
1265
1514
  if self._scaling_monitoring:
1266
1515
  logger.debug("Stopping scaling/maintenance thread...")
1267
1516
  self._scaling_monitoring = False
@@ -1274,77 +1523,103 @@ class RayPipeline(PipelineInterface):
1274
1523
 
1275
1524
  # --- Pipeline Start/Stop ---
1276
1525
  def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
1277
- """Starts actors (via topology) and background threads."""
1278
- # Check topology for actors (indicates built)
1279
- if not self.topology.get_stage_actors():
1526
+ """
1527
+ Starts the pipeline actors and background monitoring threads.
1528
+
1529
+ Assumes the pipeline has already been built via the `build()` method.
1530
+
1531
+ Parameters
1532
+ ----------
1533
+ monitor_poll_interval : float, optional
1534
+ This parameter is currently unused but is kept for interface compatibility.
1535
+ scaling_poll_interval : float, optional
1536
+ The interval in seconds for the autoscaling and maintenance thread to run, by default 30.0.
1537
+ """
1538
+ if not self.topology.get_all_actors():
1280
1539
  logger.error("Cannot start: Pipeline not built or has no actors.")
1281
1540
  return
1282
1541
 
1283
1542
  logger.info("Starting pipeline execution...")
1284
- start_refs = []
1285
- # Get actors from topology
1286
- actors_to_start = [actor for actors in self.topology.get_stage_actors().values() for actor in actors]
1287
-
1288
- for actor in actors_to_start:
1289
- start_refs.append(actor.start.remote())
1290
1543
 
1291
- if start_refs:
1292
- logger.debug(f"Waiting for {len(start_refs)} actors to start...")
1544
+ # Start all actors
1545
+ actors_to_start = self.topology.get_all_actors()
1546
+ start_futures = [actor.start.remote() for actor in actors_to_start]
1547
+ if start_futures:
1548
+ logger.debug(f"Waiting for {len(start_futures)} actors to start...")
1293
1549
  try:
1294
- ray.get(start_refs, timeout=60.0)
1295
- logger.info(f"{len(start_refs)} actors started.")
1550
+ ray.get(start_futures, timeout=60.0)
1551
+ logger.info(f"{len(start_futures)} actors started.")
1296
1552
  except Exception as e:
1297
1553
  logger.error(f"Error/Timeout starting actors: {e}", exc_info=True)
1298
1554
  self.stop() # Attempt cleanup
1299
-
1300
1555
  raise RuntimeError("Pipeline start failed: actors did not start.") from e
1301
1556
 
1557
+ # Start background threads
1558
+ self.topology.start_cleanup_thread()
1302
1559
  self.stats_collector.start()
1303
1560
  self._start_scaling(poll_interval=scaling_poll_interval)
1561
+
1304
1562
  logger.info("Pipeline started successfully.")
1305
1563
 
1306
1564
  def stop(self) -> None:
1307
- """Stops background threads and actors (via topology)."""
1565
+ """
1566
+ Stops the pipeline and all associated actors and threads.
1567
+
1568
+ This method performs a graceful shutdown by:
1569
+ 1. Stopping the autoscaling and statistics collection threads.
1570
+ 2. Signaling all actors to stop and waiting for confirmation.
1571
+ 3. Stopping the topology cleanup thread.
1572
+ 4. Clearing all runtime state from the topology.
1573
+ """
1308
1574
  logger.info("Stopping pipeline...")
1309
1575
 
1310
1576
  if self._stopping:
1577
+ logger.warning("Stop already in progress.")
1311
1578
  return
1312
1579
  self._stopping = True
1313
1580
 
1314
- # 1. Stop background threads first
1315
- with self._state_lock:
1316
- self._stop_scaling()
1317
- self.stats_collector.stop()
1581
+ # 1. Stop background threads first to prevent new actions
1582
+ self._stop_scaling()
1583
+ self.stats_collector.stop()
1318
1584
 
1319
- # 2. Stop actors (using topology)
1320
- logger.debug("Stopping all stage actors...")
1321
- stop_refs_map: Dict[ray.ObjectRef, Any] = {}
1585
+ # 2. Stop all actors and wait for them
1586
+ logger.debug("Stopping all actors in the topology...")
1587
+ all_actors = self.topology.get_all_actors()
1588
+ if all_actors:
1589
+ stop_futures = [actor.stop.remote() for actor in all_actors]
1590
+ try:
1591
+ ready, not_ready = ray.wait(stop_futures, num_returns=len(stop_futures), timeout=60.0)
1592
+ if not_ready:
1593
+ logger.warning(
1594
+ f"Timeout waiting for {len(not_ready)} actors to stop. " f"Proceeding with shutdown."
1595
+ )
1596
+ logger.info(f"{len(ready)} actors confirmed stop.")
1597
+ except Exception as e:
1598
+ logger.error(f"An unexpected error occurred during actor shutdown: {e}", exc_info=True)
1322
1599
 
1323
- # Get actors snapshot from topology
1324
- current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}
1600
+ # 3. Stop the topology cleanup thread now that actors are stopped
1601
+ self.topology.stop_cleanup_thread()
1325
1602
 
1326
- for stage_name, actors in current_actors.items():
1327
- for actor in actors:
1328
- try:
1329
- stop_refs_map[actor.stop.remote()] = actor
1330
- except Exception as e:
1331
- logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Skipping.")
1603
+ # 4. Clear all state and references
1604
+ logger.debug("Clearing pipeline topology runtime state.")
1605
+ self.topology.clear_runtime_state()
1332
1606
 
1333
- if stop_refs_map:
1334
- stop_refs = list(stop_refs_map.keys())
1335
- logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
1336
- try:
1337
- ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
1338
- if not_ready:
1339
- logger.warning(
1340
- f"Timeout waiting for {len(not_ready)} actors to stop. Allowing Ray to clean up."
1341
- )
1342
- logger.info(f"{len(ready)} actors stopped via stop().")
1343
- except Exception as e:
1344
- logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
1607
+ self._stopping = False
1608
+ logger.info("Pipeline stopped successfully.")
1345
1609
 
1346
- # Clear runtime state in topology
1347
- self.topology.clear_runtime_state()
1348
- del self.topology
1610
+ def __enter__(self) -> "RayPipeline":
1611
+ """
1612
+ Enter the runtime context related to this object.
1349
1613
 
1350
- logger.info("Pipeline stopped.")
1614
+ Returns
1615
+ -------
1616
+ RayPipeline
1617
+ The pipeline instance.
1618
+ """
1619
+ return self
1620
+
1621
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
1622
+ """
1623
+ Exit the runtime context related to this object.
1624
+ """
1625
+ self.stop()