nv-ingest 25.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest might be problematic.

Files changed (102)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +45 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/api/v1/metrics.py +29 -0
  8. nv_ingest/framework/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  12. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  13. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  14. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  15. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  16. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  18. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  19. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  20. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  22. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +591 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1322 -0
  24. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  25. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +82 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  34. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  35. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  36. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  41. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  42. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  44. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  45. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  47. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  48. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  49. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  52. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  53. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  56. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  60. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  61. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  62. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  64. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +200 -0
  68. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +376 -0
  69. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +624 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  71. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  72. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  73. nv_ingest/framework/schemas/__init__.py +0 -0
  74. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  75. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  76. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  77. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  78. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  79. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  80. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  81. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  82. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  83. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  84. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  85. nv_ingest/framework/util/__init__.py +3 -0
  86. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  87. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  88. nv_ingest/framework/util/service/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  90. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  91. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  92. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  93. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  94. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  95. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  96. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  97. nv_ingest/version.py +38 -0
  98. nv_ingest-25.6.0.dist-info/METADATA +266 -0
  99. nv_ingest-25.6.0.dist-info/RECORD +102 -0
  100. nv_ingest-25.6.0.dist-info/WHEEL +5 -0
  101. nv_ingest-25.6.0.dist-info/licenses/LICENSE +201 -0
  102. nv_ingest-25.6.0.dist-info/top_level.txt +1 -0
nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py
@@ -0,0 +1,1322 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import multiprocessing
+ import os
+ import signal
+ import threading
+ from abc import ABC, abstractmethod
+ from collections import defaultdict
+ from dataclasses import dataclass
+
+ import psutil
+ import uuid
+ import ray
+ from ray.exceptions import GetTimeoutError
+ from ray.util.queue import Queue as RayQueue
+ from typing import Dict, Optional, List, Tuple, Any
+ from pydantic import BaseModel
+ import concurrent.futures
+ import logging
+ import time
+
+ from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import PipelineTopology, StageInfo
+ from nv_ingest.framework.orchestration.ray.primitives.ray_stat_collector import RayStatsCollector
+ from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import PIDController, ResourceConstraintManager
+
+ logger = logging.getLogger(__name__)
+
+
+ class PipelineInterface(ABC):
+     """
+     Abstract base class for pipeline implementations.
+
+     Any concrete pipeline must implement start and stop methods.
+     """
+
+     @abstractmethod
+     def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+         """
+         Start the pipeline.
+
+         Parameters
+         ----------
+         monitor_poll_interval : float
+             Interval in seconds for monitoring poll (default: 5.0).
+         scaling_poll_interval : float
+             Interval in seconds for scaling decisions (default: 30.0).
+         """
+         pass
+
+     @abstractmethod
+     def stop(self) -> None:
+         """
+         Stop the pipeline and perform any necessary cleanup.
+         """
+         pass
+
+
+ # --- Configuration Objects ---
+
+
+ @dataclass
+ class ScalingConfig:
+     """Configuration for PID and Resource Constraint Manager based scaling."""
+
+     dynamic_memory_scaling: bool = True
+     dynamic_memory_threshold: float = 0.75
+     pid_kp: float = 0.1
+     pid_ki: float = 0.001
+     pid_kd: float = 0.0
+     pid_target_queue_depth: int = 0
+     pid_penalty_factor: float = 0.1
+     pid_error_boost_factor: float = 1.5
+     pid_window_size: int = 10
+     rcm_estimated_edge_cost_mb: int = 5000
+     rcm_memory_safety_buffer_fraction: float = 0.15
+
+
+ @dataclass
+ class FlushingConfig:
+     """Configuration for queue flushing behavior."""
+
+     queue_flush_interval_seconds: int = 600
+     queue_flush_drain_timeout_seconds: int = 300
+     quiet_period_threshold: int = 0
+
+
+ @dataclass
+ class StatsConfig:
+     """Configuration for the RayStatsCollector."""
+
+     collection_interval_seconds: float = 10.0
+     actor_timeout_seconds: float = 5.0
+     queue_timeout_seconds: float = 2.0
+
+
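For orientation, a minimal sketch (not part of the packaged file) of constructing a `RayPipeline` with non-default tuning via the three config objects above; the import path follows this wheel's module layout.

```python
# Illustrative only -- not part of ray_pipeline.py as shipped.
from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
    FlushingConfig,
    RayPipeline,
    ScalingConfig,
    StatsConfig,
)

# Tighten the memory ceiling, flush idle queues more often, and sample stats faster.
pipeline = RayPipeline(
    scaling_config=ScalingConfig(dynamic_memory_threshold=0.6, pid_kp=0.2),
    flushing_config=FlushingConfig(queue_flush_interval_seconds=300),
    stats_config=StatsConfig(collection_interval_seconds=5.0),
)
```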
+ class RayPipelineSubprocessInterface(PipelineInterface):
+     """
+     Pipeline interface implementation for a subprocess-based Ray pipeline.
+     """
+
+     def __init__(self, process: multiprocessing.Process):
+         """
+         Parameters
+         ----------
+         process : multiprocessing.Process
+             A handle to the running subprocess.
+         """
+         self._process: multiprocessing.Process = process
+
+     def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+         """
+         Start is not supported because the subprocess is assumed to already be running.
+         """
+         pass
+
+     def stop(self) -> None:
+         """
+         Stops the subprocess pipeline. Tries terminate(), then escalates to SIGKILL on the process group if needed.
+         """
+         if not self._process.is_alive():
+             return
+
+         try:
+             self._process.terminate()
+             self._process.join(timeout=5.0)
+         except Exception as e:
+             logger.warning(f"Failed to terminate process cleanly: {e}")
+
+         if self._process.is_alive():
+             try:
+                 pgid = os.getpgid(self._process.pid)
+                 os.killpg(pgid, signal.SIGKILL)
+             except Exception as e:
+                 logger.error(f"Failed to force-kill process group: {e}")
+             self._process.join(timeout=3.0)
+
+
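A hedged usage sketch for the wrapper above: `run_pipeline_entrypoint` is a hypothetical target function, and the SIGKILL escalation in `stop()` assumes the child runs in its own process group (e.g. started via `os.setsid`).

```python
# Illustrative only; run_pipeline_entrypoint is a hypothetical stand-in.
import multiprocessing

proc = multiprocessing.Process(target=run_pipeline_entrypoint)
proc.start()

interface = RayPipelineSubprocessInterface(proc)
# ... later: terminate() with a 5s join, then SIGKILL of the process group if still alive.
interface.stop()
```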
+ class RayPipelineInterface(PipelineInterface):
+     """
+     Pipeline interface for an in-process RayPipeline instance.
+     """
+
+     def __init__(self, pipeline: "RayPipeline"):
+         """
+         Parameters
+         ----------
+         pipeline : RayPipeline
+             The instantiated pipeline to control.
+         """
+         self._pipeline = pipeline
+
+     def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+         """
+         Starts the RayPipeline.
+
+         Parameters
+         ----------
+         monitor_poll_interval : float
+             Unused here; provided for interface compatibility.
+         scaling_poll_interval : float
+             Unused here; provided for interface compatibility.
+         """
+         self._pipeline.start(monitor_poll_interval, scaling_poll_interval)
+
+     def stop(self) -> None:
+         """
+         Stops the RayPipeline and shuts down Ray.
+         """
+         self._pipeline.stop()
+
+         try:
+             import ray
+
+             ray.shutdown()
+         except Exception:
+             pass
+
+
+ class RayPipeline(PipelineInterface):
+     """
+     A structured pipeline supporting dynamic scaling and queue flushing.
+     Uses PIDController and ResourceConstraintManager. Supports optional GUI display.
+     Delegates statistics collection to RayStatsCollector.
+
+     Configuration is managed via dedicated config objects (ScalingConfig, etc.).
+     """
+
+     def __init__(
+         self,
+         scaling_config: ScalingConfig = ScalingConfig(),
+         flushing_config: FlushingConfig = FlushingConfig(),
+         stats_config: StatsConfig = StatsConfig(),
+     ) -> None:
+         # Store config objects
+         self.scaling_config = scaling_config
+         self.flushing_config = flushing_config
+         self.stats_config = stats_config
+
+         # --- Instantiate Topology ---
+         self.topology = PipelineTopology()
+
+         # --- Structure Lock ---
+         self._structure_lock: threading.Lock = threading.Lock()
+
+         # --- State ---
+         # self.scaling_state: Dict[str, str] = {}
+         self.prev_global_memory_usage: Optional[int] = None
+         self._state_lock: threading.Lock = threading.Lock()
+         self._stopping = False
+
+         # --- Build Time Config & State ---
+         # Use scaling_config for these
+         self.dynamic_memory_scaling = self.scaling_config.dynamic_memory_scaling
+         self.dynamic_memory_threshold = self.scaling_config.dynamic_memory_threshold
+         self.stage_memory_overhead: Dict[str, float] = {}
+
+         # --- Background Threads ---
+         self._scaling_thread: Optional[threading.Thread] = None
+         self._scaling_monitoring = False
+
+         # --- Queue Flushing ---
+         self._last_queue_flush_time: float = time.time()
+         self.queue_flush_interval_seconds = self.flushing_config.queue_flush_interval_seconds
+         self.queue_flush_drain_timeout_seconds = self.flushing_config.queue_flush_drain_timeout_seconds
+         self.quiet_period_threshold = self.flushing_config.quiet_period_threshold
+
+         # --- Instantiate Autoscaling Controllers ---
+         # Use scaling_config
+         self.pid_controller = PIDController(
+             kp=self.scaling_config.pid_kp,
+             ki=self.scaling_config.pid_ki,
+             kd=self.scaling_config.pid_kd,
+             stage_cost_estimates={},  # Populated during build
+             target_queue_depth=self.scaling_config.pid_target_queue_depth,
+             window_size=self.scaling_config.pid_window_size,
+             penalty_factor=self.scaling_config.pid_penalty_factor,
+             error_boost_factor=self.scaling_config.pid_error_boost_factor,
+         )
+         logger.info("PIDController initialized using ScalingConfig.")
+
+         try:
+             total_system_memory_bytes = psutil.virtual_memory().total
+             # Use scaling_config for dynamic_memory_threshold
+             absolute_memory_threshold_mb = int(
+                 self.scaling_config.dynamic_memory_threshold * total_system_memory_bytes / (1024 * 1024)
+             )
+         except Exception as e:
+             logger.error(f"Failed to get system memory: {e}. Using high limit.")
+             absolute_memory_threshold_mb = 1_000_000  # Fallback value
+
+         # Use scaling_config
+         self.constraint_manager = ResourceConstraintManager(
+             max_replicas=1,  # Updated during build
+             memory_threshold=absolute_memory_threshold_mb,
+             estimated_edge_cost_mb=self.scaling_config.rcm_estimated_edge_cost_mb,
+             memory_safety_buffer_fraction=self.scaling_config.rcm_memory_safety_buffer_fraction,
+         )
+         logger.info("ResourceConstraintManager initialized using ScalingConfig.")
+
+         # --- Instantiate Stats Collector ---
+         self._stats_collection_interval_seconds = self.stats_config.collection_interval_seconds
+         self.stats_collector = RayStatsCollector(
+             pipeline_accessor=self,  # This dependency remains for now
+             interval=self.stats_config.collection_interval_seconds,
+             actor_timeout=self.stats_config.actor_timeout_seconds,
+             queue_timeout=self.stats_config.queue_timeout_seconds,
+         )
+
+         logger.info("RayStatsCollector initialized using StatsConfig.")
+
+     # --- Accessor Methods for Stats Collector (and internal use) ---
+
+     def __del__(self):
+         try:
+             self.stop()
+         except Exception as e:
+             logger.error(f"Exception during RayPipeline cleanup: {e}")
+
+     def get_stages_info(self) -> List[StageInfo]:
+         """Returns a snapshot of the current stage information."""
+         return self.topology.get_stages_info()
+
+     def get_stage_actors(self) -> Dict[str, List[Any]]:
+         """Returns a snapshot of the current actors per stage."""
+         return self.topology.get_stage_actors()
+
+     def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
+         """Returns a snapshot of the current edge queues."""
+         return self.topology.get_edge_queues()
+
+     def _configure_autoscalers(self) -> None:
+         """Updates controllers based on current pipeline configuration via topology."""
+         logger.debug("[Build-Configure] Configuring autoscalers...")
+         total_max_replicas = 0
+         default_cost_bytes = 100 * 1024 * 1024
+         stage_overheads = {}  # Collect locally
+
+         # Use topology accessor
+         current_stages = self.topology.get_stages_info()
+
+         for stage in current_stages:
+             total_max_replicas += stage.max_replicas
+             # Use estimated overhead if available (Assume it's calculated elsewhere or default)
+             # For now, let's store a dummy overhead in topology during build
+             overhead_bytes = default_cost_bytes  # Simplification for now
+             stage_overheads[stage.name] = overhead_bytes  # Store locally first
+             cost_mb = max(1, int(overhead_bytes / (1024 * 1024)))
+             # Update controller directly (or via dedicated method if preferred)
+             self.pid_controller.stage_cost_estimates[stage.name] = cost_mb
+
+         # Update topology with collected overheads
+         self.topology.set_stage_memory_overhead(stage_overheads)
+
+         # Update constraint manager
+         self.constraint_manager.max_replicas = total_max_replicas
+
+         logger.info(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
+         logger.debug(f"[Build-Configure] PID stage cost estimates (MB): {self.pid_controller.stage_cost_estimates}")
+
+     def _instantiate_initial_actors(self) -> None:
+         """Instantiates initial actors and updates topology."""
+         logger.info("[Build-Actors] Instantiating initial stage actors (min_replicas)...")
+         # Use topology accessor
+         current_stages = self.topology.get_stages_info()
+
+         for stage in current_stages:
+             replicas = []
+
+             if not self.dynamic_memory_scaling:
+                 num_initial_actors = stage.max_replicas
+             else:
+                 num_initial_actors = (
+                     max(stage.min_replicas, 1) if stage.is_source or stage.is_sink else stage.min_replicas
+                 )
+
+             if num_initial_actors > 0:
+                 logger.debug(f"[Build-Actors] Stage '{stage.name}' creating {num_initial_actors} initial actor(s).")
+                 for i in range(num_initial_actors):
+                     actor_name = f"{stage.name}_{uuid.uuid4()}"
+                     logger.debug(
+                         f"[Build-Actors] Creating actor '{actor_name}' ({i + 1}/{num_initial_actors})"
+                         f" for '{stage.name}'"
+                     )
+                     try:
+                         actor = stage.callable.options(
+                             name=actor_name, max_concurrency=10, max_restarts=0, lifetime="detached"
+                         ).remote(config=stage.config)
+                         replicas.append(actor)
+                     except Exception as e:
+                         logger.error(f"[Build-Actors] Failed create actor '{actor_name}': {e}", exc_info=True)
+                         raise RuntimeError(f"Build failed: actor creation error for stage '{stage.name}'") from e
+
+             # Update topology for this stage
+             self.topology.set_actors_for_stage(stage.name, replicas)
+             logger.debug(f"[Build-Actors] Stage '{stage.name}' initial actors set in topology: count={len(replicas)}")
+
+         logger.info("[Build-Actors] Initial actor instantiation complete.")
+
+     def _create_and_wire_edges(self) -> List[ray.ObjectRef]:
+         """Creates queues, wires actors (using topology), and updates topology."""
+         logger.info("[Build-Wiring] Creating and wiring edges...")
+         wiring_refs = []
+         new_edge_queues: Dict[str, Tuple[Any, int]] = {}
+
+         current_connections = self.topology.get_connections()
+         current_stage_actors = self.topology.get_stage_actors()  # Gets copy
+
+         for from_stage_name, connections_list in current_connections.items():
+             for to_stage_name, queue_size in connections_list:
+                 queue_name = f"{from_stage_name}_to_{to_stage_name}"
+                 logger.debug(f"[Build-Wiring] Creating queue '{queue_name}' (size {queue_size}) and wiring.")
+                 try:
+                     edge_queue = RayQueue(maxsize=queue_size, actor_options={"max_restarts": 0})
+                     new_edge_queues[queue_name] = (edge_queue, queue_size)
+
+                     # Wire using current actors from topology snapshot
+                     source_actors = current_stage_actors.get(from_stage_name, [])
+                     for actor in source_actors:
+                         wiring_refs.append(actor.set_output_queue.remote(edge_queue))
+
+                     dest_actors = current_stage_actors.get(to_stage_name, [])
+                     for actor in dest_actors:
+                         wiring_refs.append(actor.set_input_queue.remote(edge_queue))
+
+                 except Exception as e:
+                     logger.error(f"[Build-Wiring] Failed create/wire queue '{queue_name}': {e}", exc_info=True)
+                     raise RuntimeError(f"Build failed: queue wiring error for '{queue_name}'") from e
+
+         # Update topology with the new queues
+         self.topology.set_edge_queues(new_edge_queues)
+
+         logger.debug(f"[Build-Wiring] Submitted {len(wiring_refs)} wiring calls. Queues set in topology.")
+         return wiring_refs
+
+     @staticmethod
+     def _wait_for_wiring(wiring_refs: List[ray.ObjectRef]) -> None:
+         """Waits for remote wiring calls to complete. (Static, no changes needed)."""
+         if not wiring_refs:
+             logger.debug("[Build-WaitWiring] No wiring calls.")
+             return
+         logger.debug(f"[Build-WaitWiring] Waiting for {len(wiring_refs)} wiring calls...")
+         try:
+             ray.get(wiring_refs)
+             logger.debug("[Build-WaitWiring] All wiring calls completed.")
+         except Exception as e:
+             logger.error(f"[Build-WaitWiring] Error during wiring confirmation: {e}", exc_info=True)
+             raise RuntimeError("Build failed: error confirming initial wiring") from e
+
+     def add_source(
+         self, *, name: str, source_actor: Any, config: BaseModel, min_replicas: int = 1, max_replicas: int = 1
+     ) -> "RayPipeline":
+         if min_replicas < 1:
+             logger.warning(f"Source stage '{name}': min_replicas must be >= 1. Overriding.")
+             min_replicas = 1
+
+         stage_info = StageInfo(
+             name=name,
+             callable=source_actor,
+             config=config,
+             is_source=True,
+             min_replicas=min_replicas,
+             max_replicas=max_replicas,
+         )
+         self.topology.add_stage(stage_info)  # Delegate
+
+         return self
+
+     def add_stage(
+         self, *, name: str, stage_actor: Any, config: BaseModel, min_replicas: int = 0, max_replicas: int = 1
+     ) -> "RayPipeline":
+         if min_replicas < 0:
+             logger.warning(f"Stage '{name}': min_replicas cannot be negative. Overriding to 0.")
+             min_replicas = 0
+         stage_info = StageInfo(
+             name=name, callable=stage_actor, config=config, min_replicas=min_replicas, max_replicas=max_replicas
+         )
+         self.topology.add_stage(stage_info)  # Delegate
+
+         return self
+
+     def add_sink(
+         self, *, name: str, sink_actor: Any, config: BaseModel, min_replicas: int = 1, max_replicas: int = 1
+     ) -> "RayPipeline":
+         # Sink min_replicas can realistically be 0 if data drain is optional/best-effort? Let's allow 0.
+         if min_replicas < 0:
+             logger.warning(f"Sink stage '{name}': min_replicas cannot be negative. Overriding to 0.")
+             min_replicas = 0
+         stage_info = StageInfo(
+             name=name,
+             callable=sink_actor,
+             config=config,
+             is_sink=True,
+             min_replicas=min_replicas,
+             max_replicas=max_replicas,
+         )
+         self.topology.add_stage(stage_info)  # Delegate
+
+         return self
+
+     # --- Method for defining connections ---
+     def make_edge(self, from_stage: str, to_stage: str, queue_size: int = 100) -> "RayPipeline":
+         try:
+             self.topology.add_connection(from_stage, to_stage, queue_size)  # Delegate (includes validation)
+         except ValueError as e:
+             logger.error(f"make_edge failed: {e}")
+             raise  # Re-raise the error
+         return self
+
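Taken together, the builder methods above allow a fluent topology definition before `build()`. A sketch, assuming hypothetical Ray actor classes (`MySource`, `MyStage`, `MySink`) and pydantic config objects:

```python
# Illustrative only; the actor classes and configs are hypothetical stand-ins.
pipeline = (
    RayPipeline()
    .add_source(name="source", source_actor=MySource, config=source_cfg)
    .add_stage(name="transform", stage_actor=MyStage, config=stage_cfg, min_replicas=0, max_replicas=4)
    .add_sink(name="sink", sink_actor=MySink, config=sink_cfg)
    .make_edge("source", "transform", queue_size=100)
    .make_edge("transform", "sink", queue_size=100)
)
stage_actors = pipeline.build()  # {stage_name: [actor, ...]}, or {} if the build failed
```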
+     # ----- Pipeline Build Process ---
+     def build(self) -> Dict[str, List[Any]]:
+         """Builds the pipeline: configures, instantiates, wires, using topology."""
+         logger.info("--- Starting Pipeline Build Process ---")
+         try:
+             if not self.topology.get_stages_info():
+                 logger.error("Build failed: No stages defined in topology.")
+                 return {}
+
+             # Steps interact with self.topology
+             self._configure_autoscalers()
+             self._instantiate_initial_actors()
+             wiring_futures = self._create_and_wire_edges()
+             self._wait_for_wiring(wiring_futures)
+
+             logger.info("--- Pipeline Build Completed Successfully ---")
+             return self.topology.get_stage_actors()  # Return actors from topology
+
+         except RuntimeError as e:
+             logger.critical(f"Pipeline build failed: {e}", exc_info=False)
+             # Clean up topology runtime state?
+             self.topology.clear_runtime_state()
+             return {}
+         except Exception as e:
+             logger.critical(f"Unexpected error during pipeline build: {e}", exc_info=True)
+             self.topology.clear_runtime_state()
+             return {}
+
+     # --- Scaling Logic ---
+     @staticmethod
+     def _create_single_replica(stage_info: StageInfo) -> Any:
+         """Creates a single new Ray actor replica for the given stage."""
+         actor_name = f"{stage_info.name}_{uuid.uuid4()}"
+         logger.debug(f"[ScaleUtil] Creating new actor '{actor_name}' for stage '{stage_info.name}'")
+         try:
+             new_actor = stage_info.callable.options(
+                 name=actor_name, max_concurrency=10, max_restarts=0, lifetime="detached"
+             ).remote(config=stage_info.config)
+
+             return new_actor
+         except Exception as e:
+             logger.error(
+                 f"[ScaleUtil] Failed to create actor '{actor_name}' for stage '{stage_info.name}':" f" {e}",
+                 exc_info=True,
+             )
+
+             # Propagate error to halt the scaling operation
+             raise RuntimeError(f"Actor creation failed for stage '{stage_info.name}' during scale up") from e
+
+     def _get_wiring_refs_for_actor(self, actor: Any, stage_name: str) -> List[ray.ObjectRef]:
+         """Gets wiring futures for a single actor using topology for queues/connections."""
+         wiring_refs = []
+
+         # Use topology accessors
+         connections = self.topology.get_connections()
+         edge_queues = self.topology.get_edge_queues()
+
+         # Wire outputs
+         if stage_name in connections:
+             for to_stage, _ in connections[stage_name]:
+                 queue_name = f"{stage_name}_to_{to_stage}"
+                 if queue_name in edge_queues:
+                     edge_queue, _ = edge_queues[queue_name]
+                     wiring_refs.append(actor.set_output_queue.remote(edge_queue))
+
+         # Wire inputs
+         for from_stage, conns in connections.items():
+             for to_stage, _ in conns:
+                 if to_stage == stage_name:
+                     queue_name = f"{from_stage}_to_{stage_name}"
+                     if queue_name in edge_queues:
+                         edge_queue, _ = edge_queues[queue_name]
+                         wiring_refs.append(actor.set_input_queue.remote(edge_queue))
+
+         return wiring_refs
+
+     @staticmethod
+     def _start_actors(actors_to_start: List[Any], stage_name: str) -> None:
+         """Starts a list of actors if they have a 'start' method and waits for completion."""
+         start_refs = []
+         for actor in actors_to_start:
+             if hasattr(actor, "start"):
+                 logger.debug(f"[ScaleUtil] Starting actor '{actor}' for stage '{stage_name}'")
+                 start_refs.append(actor.start.remote())
+
+         if not start_refs:
+             logger.debug(f"[ScaleUtil] No actors with start() method found for stage '{stage_name}'.")
+             return
+
+         logger.debug(f"[ScaleUtil] Waiting for {len(start_refs)} actor starts for stage '{stage_name}'...")
+         try:
+             ray.get(start_refs)
+             logger.debug(f"[ScaleUtil] {len(start_refs)} actors started successfully for stage '{stage_name}'.")
+         except Exception as e:
+             logger.error(
+                 f"[ScaleUtil] Error waiting for actors to start for stage '{stage_name}':" f" {e}", exc_info=True
+             )
+             # Note: Actors might be started but confirmation failed. State might be inconsistent.
+             # Consider raising an error to signal potential inconsistency?
+             raise RuntimeError(f"Error confirming actor starts for stage '{stage_name}'") from e
+
+     def _handle_scale_up(self, stage_info: StageInfo, current_count: int, target_count: int) -> None:
+         """Handles scaling up, interacting with topology."""
+         stage_name = stage_info.name
+         num_to_add = target_count - current_count
+         logger.debug(f"[ScaleUp-{stage_name}] Scaling up from {current_count} to {target_count} (+{num_to_add}).")
+         # Update topology state
+         self.topology.update_scaling_state(stage_name, "Scaling Up")
+
+         new_actors = []
+         all_wiring_refs = []
+         successfully_added_actors = []
+
+         try:
+             # 1. Create actors
+             for _ in range(num_to_add):
+                 new_actor = self._create_single_replica(stage_info)
+                 new_actors.append(new_actor)
+
+             # 2. Get wiring refs (uses topology internally)
+             for actor in new_actors:
+                 all_wiring_refs.extend(self._get_wiring_refs_for_actor(actor, stage_name))
+
+             # 3. Wait for wiring (static helper)
+             self._wait_for_wiring(all_wiring_refs)  # Handles errors
+
+             # 4. Start actors (static helper)
+             self._start_actors(new_actors, stage_name)  # Handles errors
+
+             # 5. Add successfully created/wired/started actors to topology
+             for actor in new_actors:
+                 self.topology.add_actor_to_stage(stage_name, actor)
+                 successfully_added_actors.append(actor)  # Keep track
+
+             final_count = self.topology.get_actor_count(stage_name)
+             logger.debug(
+                 f"[ScaleUp-{stage_name}] Scale up complete. Added {len(successfully_added_actors)}. "
+                 f"New count: {final_count}"
+             )
+
+         except Exception as e:
+             logger.error(f"[ScaleUp-{stage_name}] Error during scale up: {e}", exc_info=False)
+             self.topology.update_scaling_state(stage_name, "Error")
+             # --- Cleanup Attempt ---
+             # Actors created but potentially not wired/started/added to topology.
+             # Only kill actors that were definitely *not* added to the topology.
+             actors_to_kill = [a for a in new_actors if a not in successfully_added_actors]
+             if actors_to_kill:
+                 logger.warning(
+                     f"[ScaleUp-{stage_name}] Attempting to kill {len(actors_to_kill)} partially created actors."
+                 )
+                 for actor in actors_to_kill:
+                     try:
+                         ray.kill(actor, no_restart=True)
+                     except Exception as kill_e:
+                         logger.warning(f"Failed to kill actor {actor}: {kill_e}")
+             logger.critical(f"[ScaleUp-{stage_name}] Scale up failed. State potentially inconsistent.")
+
+         finally:
+             # Reset state only if it was Scaling Up and didn't end in Error
+             current_state = self.topology.get_scaling_state().get(stage_name)
+             if current_state == "Scaling Up":
+                 self.topology.update_scaling_state(stage_name, "Idle")
+
+     def _handle_scale_down(self, stage_name: str, current_replicas: List[Any], target_count: int) -> None:
+         """
+         Handles scaling down: initiates stop on actors, registers handles with
+         the topology for pending removal if stop was successfully initiated.
+         """
+         current_count = len(current_replicas)
+         num_to_remove = current_count - target_count
+         logger.debug(
+             f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove})."
+         )
+
+         # Basic validation
+         if num_to_remove <= 0:
+             logger.warning(f"[ScaleDown-{stage_name}] Invalid num_to_remove {num_to_remove}. Aborting.")
+             return
+
+         # Identify actors to remove (last N)
+         actors_to_remove = current_replicas[-num_to_remove:]
+         logger.debug(f"[ScaleDown-{stage_name}] Identified {len(actors_to_remove)} actors for removal.")
+
+         actors_to_register_map: Dict[str, List[Tuple[Any, ray.ObjectRef]]] = defaultdict(list)
+         stop_initiation_failures = 0
+
+         for actor in actors_to_remove:
+             actor_id_str = str(actor)
+             try:
+                 # Call stop(), which now returns shutdown future
+                 shutdown_future = actor.stop.remote()
+                 actors_to_register_map[stage_name].append((actor, shutdown_future))
+                 logger.debug(f"[ScaleDown-{stage_name}] Submitted stop() call for actor '{actor_id_str}'.")
+             except Exception as e:
+                 logger.error(
+                     f"[ScaleDown-{stage_name}] Error submitting stop() for actor '{actor_id_str}': "
+                     f"{e}. Cannot register.",
+                     exc_info=False,
+                 )
+                 stop_initiation_failures += 1
+
+         # Register actors pending removal (with their shutdown futures)
+         if actors_to_register_map:
+             num_registered = sum(len(v) for v in actors_to_register_map.values())
+             logger.debug(
+                 f"[ScaleDown-{stage_name}] Registering {num_registered} "
+                 f"actor handles with topology for shutdown monitoring."
+             )
+             try:
+                 self.topology.register_actors_pending_removal(actors_to_register_map)
+             except Exception as e:
+                 logger.error(
+                     f"[ScaleDown-{stage_name}] CRITICAL - Failed to register actors pending removal with topology: {e}",
+                     exc_info=True,
+                 )
+                 self.topology.update_scaling_state(stage_name, "Error")
+         elif actors_to_remove:
+             logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")
+
+         total_attempted = len(actors_to_remove)
+         logger.debug(
+             f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
+             f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
+         )
+
+     def _scale_stage(self, stage_name: str, new_replica_count: int) -> None:
+         """Orchestrates scaling using topology for state and info."""
+         logger.debug(f"[ScaleStage-{stage_name}] Request for target count: {new_replica_count}")
+
+         # --- Use Topology Accessors ---
+         stage_info = self.topology.get_stage_info(stage_name)
+         current_replicas = self.topology.get_stage_actors().get(stage_name, [])  # Get current actors safely
+         current_count = len(current_replicas)
+
+         if stage_info is None:
+             logger.error(f"[ScaleStage-{stage_name}] Stage info not found. Cannot scale.")
+             return
+
+         target_count = max(stage_info.min_replicas, min(new_replica_count, stage_info.max_replicas))
+         if target_count != new_replica_count:
+             logger.debug(
+                 f"[ScaleStage-{stage_name}] Count {new_replica_count} adjusted to {target_count} "
+                 f"by bounds ({stage_info.min_replicas}/{stage_info.max_replicas})."
+             )
+
+         if target_count == current_count:
+             logger.debug(f"[ScaleStage-{stage_name}] Already at target count ({current_count}). No action.")
+             # Reset state if needed
+             if self.topology.get_scaling_state().get(stage_name) != "Idle":
+                 self.topology.update_scaling_state(stage_name, "Idle")
+             return
+
+         # --- Delegate ---
+         try:
+             if target_count > current_count:
+                 self._handle_scale_up(stage_info, current_count, target_count)
+             else:  # target_count < current_count
+                 # Pass the list of actors we know about *now*
+                 self._handle_scale_down(stage_name, current_replicas, target_count)
+         except RuntimeError as e:  # Catch specific errors from handlers
+             logger.error(f"[ScaleStage-{stage_name}] Scaling failed: {e}", exc_info=False)
+             # State should have been set to "Error" within the handler
+         except Exception as e:
+             logger.error(f"[ScaleStage-{stage_name}] Unexpected error: {e}", exc_info=True)
+             self.topology.update_scaling_state(stage_name, "Error")  # Ensure error state
+
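The clamp in `_scale_stage` above bounds any requested replica count by the stage's limits, i.e. `target = max(min_replicas, min(requested, max_replicas))`. A worked example (editor's illustration), with `min_replicas=1` and `max_replicas=4`:

```python
# Worked example of the bound applied in _scale_stage:
assert max(1, min(7, 4)) == 4  # a request of 7 is capped at max_replicas
assert max(1, min(0, 4)) == 1  # a request of 0 is raised to min_replicas
```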
+     def _is_pipeline_quiet(self) -> bool:
+         """Checks if pipeline is quiet using topology state and stats collector."""
+
+         # Check topology state first
+         if self.topology.get_is_flushing():
+             logger.debug("Pipeline quiet check: False (Flush in progress via topology state)")
+             return False
+
+         # Time check
+         time_since_last_flush = time.time() - self._last_queue_flush_time
+         if time_since_last_flush < self.queue_flush_interval_seconds:
+             return False
+
+         # Stats check (same as before)
+         current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
+             self.stats_collector.get_latest_stats()
+         )
+         last_update_age = time.time() - last_update_time
+         max_stats_age_for_quiet = max(10.0, self._stats_collection_interval_seconds * 2.5)
+
+         if not stats_were_successful:
+             logger.warning(f"Pipeline quiet check: False (Stats failed {last_update_age:.1f}s ago).")
+             return False
+
+         if last_update_age > max_stats_age_for_quiet:
+             logger.warning(
+                 f"Pipeline quiet check: False (Stats too old: {last_update_age:.1f}s > {max_stats_age_for_quiet:.1f}s)."
+             )
+             return False
+
+         if not current_stage_stats:
+             logger.warning("Pipeline quiet check: False (No stats currently available).")
+             return False
+
+         # Activity check
+         is_quiet = global_in_flight <= self.quiet_period_threshold
+
+         return is_quiet
+
+     def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
+         """
+         Actively monitors pipeline drain using direct calls to the stats collector.
+         """
+         start_time = time.time()
+         logger.info(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
+         last_in_flight = -1
+         drain_check_interval = 1.0  # Check every second
+
+         while True:
+             current_time = time.time()
+             elapsed_time = current_time - start_time
+
+             if elapsed_time >= timeout_seconds:
+                 logger.warning(f"Pipeline drain timed out after {elapsed_time:.1f}s. Last In-Flight: {last_in_flight}")
+                 return False
+
+             # --- Trigger immediate stats collection via the collector instance ---
+             drain_success = False
+             collection_error = None
+
+             global_in_flight = -1
+             try:
+                 # Use the collector's method for a one-off, blocking collection
+                 drain_stats, global_in_flight, drain_success = self.stats_collector.collect_stats_now()
+             except Exception as e:
+                 logger.error(f"[DrainWait] Critical error during direct stats collection call: {e}.", exc_info=True)
+                 collection_error = e  # Indicate failure to even run collection
+
+             # --- Process collection results ---
+             if global_in_flight != last_in_flight:
+                 status_msg = (
+                     f"Collection Success: {drain_success}"
+                     if not collection_error
+                     else f"Collection Error: {type(collection_error).__name__}"
+                 )
+                 logger.debug(
+                     f"[Drain] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
+                 )
+                 last_in_flight = global_in_flight
+
+             # --- Check for successful drain ---
+             # Requires BOTH in-flight=0 AND the collection reporting it was successful
+             if global_in_flight == 0 and drain_success and not collection_error:
+                 return True
+             elif global_in_flight == 0:  # Saw zero, but collection wasn't fully successful
+                 logger.warning(
+                     "[Drain] In-Flight reached 0, but stats collection had errors/timeouts."
+                     " Cannot confirm drain yet."
+                 )
+
+             # --- Wait ---
+             remaining_time = timeout_seconds - elapsed_time
+             sleep_duration = min(drain_check_interval, remaining_time, 1.0)  # Ensure positive sleep
+             if sleep_duration > 0:
+                 time.sleep(sleep_duration)
+
+     def _execute_queue_flush(self) -> bool:
+         """Executes queue flush, using topology for state and structure."""
+         if self.topology.get_is_flushing() or self._stopping:  # Check topology state
+             logger.warning("Queue flush requested but already in progress or pipeline is stopping. Ignoring.")
+             return False
+
+         # Set flushing state in topology
+         self.topology.set_flushing(True)
+         overall_success = False
+         source_actors_paused = []
+         pause_refs = []
+         new_edge_queues_map: Optional[Dict[str, Tuple[Any, int]]] = None
+
+         try:
+             # --- Get structure snapshots from topology ---
+             # Use lock context for multiple reads if needed, but individual accessors are locked too
+             current_stages = self.topology.get_stages_info()
+             current_stage_actors = self.topology.get_stage_actors()
+             current_edge_queues = self.topology.get_edge_queues()
+             current_connections = self.topology.get_connections()
+
+             # --- 1. Pause Source Stages (using snapshots) ---
+             logger.debug("Pausing source stages...")
+             pause_timeout = 60.0
+             for stage in current_stages:
+                 if stage.is_source:
+                     actors = current_stage_actors.get(stage.name, [])
+                     for actor in actors:
+                         if hasattr(actor, "pause") and hasattr(actor.pause, "remote"):
+                             try:
+                                 pause_refs.append(actor.pause.remote())
+                                 source_actors_paused.append(actor)
+                             except Exception as e:
+                                 logger.error(f"Failed sending pause to {actor}: {e}")
+             if pause_refs:
+                 logger.debug(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
+                 try:
+                     ray.get(pause_refs, timeout=pause_timeout)
+                     logger.debug(f"{len(pause_refs)} sources acknowledged pause.")
+                 except GetTimeoutError:
+                     logger.warning(f"Timeout waiting for {len(pause_refs)} sources to pause.")
+                 except Exception as e:
+                     logger.error(f"Error waiting for sources pause: {e}. Proceeding cautiously.")
+
+             # --- 2. Wait for Drain ---
+             logger.debug("Waiting for pipeline to drain...")
+             if not self._wait_for_pipeline_drain(self.queue_flush_drain_timeout_seconds):
+                 raise RuntimeError("Pipeline drain failed or timed out, aborting flush.")
+
+             # --- 3. Create New Queues (using snapshot) ---
+             logger.debug("Creating new replacement queues...")
+             new_edge_queues_map = {}
+             for queue_name, (_, queue_size) in current_edge_queues.items():
+                 try:
+                     new_edge_queues_map[queue_name] = (
+                         RayQueue(maxsize=queue_size, actor_options={"max_restarts": 0}),
+                         queue_size,
+                     )
+                     logger.debug(f"Created new queue: {queue_name}")
+                 except Exception as e:
+                     raise RuntimeError(f"Failed to create new queue '{queue_name}'.") from e
+
+             # --- 4. Re-wire Actors to New Queues (using snapshots) ---
+             logger.debug("Re-wiring actors to new queues...")
+             wiring_refs = []
+             wiring_timeout = 120.0
+             for from_stage_name, conns in current_connections.items():
+                 for to_stage_name, _ in conns:
+                     queue_name = f"{from_stage_name}_to_{to_stage_name}"
+                     if queue_name not in new_edge_queues_map:
+                         raise RuntimeError(f"New queue missing for {queue_name}")
+                     new_queue_actor, _ = new_edge_queues_map[queue_name]
+
+                     # Re-wire sources outputs
+                     for actor in current_stage_actors.get(from_stage_name, []):
+                         try:
+                             wiring_refs.append(actor.set_output_queue.remote(new_queue_actor))
+                         except Exception as e:
+                             logger.error(f"Failed sending set_output_queue to {actor}: {e}")
+
+                     # Re-wire destinations inputs
+                     for actor in current_stage_actors.get(to_stage_name, []):
+                         try:
+                             wiring_refs.append(actor.set_input_queue.remote(new_queue_actor))
+                         except Exception as e:
+                             logger.error(f"Failed sending set_input_queue to {actor}: {e}")
+
+             if wiring_refs:
+                 logger.debug(f"Waiting up to {wiring_timeout}s for {len(wiring_refs)} actors to re-wire...")
+                 try:
+                     ready, not_ready = ray.wait(wiring_refs, num_returns=len(wiring_refs), timeout=wiring_timeout)
+                     if not_ready:
+                         raise RuntimeError("Actor re-wiring timed out or failed.")
+                     ray.get(ready)  # Check for internal errors
+                     logger.debug(f"{len(ready)} actors re-wired successfully.")
+                 except Exception as e:
+                     raise RuntimeError("Actor re-wiring failed.") from e
+
+             # --- 5. Update Topology State (Commit Point) ---
+             logger.debug("Committing new queues to pipeline topology.")
+             self.topology.set_edge_queues(new_edge_queues_map)  # Commit the change
+             overall_success = True
+
+         except Exception as e:
+             logger.error(f"Error during queue flush: {e}", exc_info=True)
+             overall_success = False
+
+         finally:
+             # --- 6. Resume Source Stages (Always attempt) ---
+             if source_actors_paused:
+                 logger.debug(f"Attempting to resume {len(source_actors_paused)} source actors...")
+                 resume_timeout = 30.0
+                 resume_refs = []
+                 for actor in source_actors_paused:
+                     try:
+                         resume_refs.append(actor.resume.remote())
+                     except Exception as e:
+                         logger.error(f"Failed sending resume to {actor}: {e}")
+                 if resume_refs:
+                     logger.debug(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
+                     try:
+                         ray.get(resume_refs, timeout=resume_timeout)
+                         logger.debug(f"{len(resume_refs)} sources resumed.")
+                     except GetTimeoutError:
+                         logger.warning(f"Timeout waiting for {len(resume_refs)} sources to resume.")
+                     except Exception as e:
+                         logger.error(f"Error waiting for sources resume: {e}")
+
+             # Update flush timestamp only on success
+             if overall_success:
+                 self._last_queue_flush_time = time.time()
+
+             # Reset flushing state in topology
+             self.topology.set_flushing(False)
+
+         return overall_success
+
+     def request_queue_flush(self, force: bool = False) -> None:
+         """Requests a queue flush, checking topology state."""
+         logger.info(f"Manual queue flush requested (force={force}).")
+
+         if self.topology.get_is_flushing() or self._stopping:  # Check topology
+             logger.warning("Flush already in progress or pipeline is stopping.")
+             return
+         if force or self._is_pipeline_quiet():
+             # Consider running _execute_queue_flush in a separate thread
+             # to avoid blocking the caller, especially if 'force=True'.
+             # For now, run synchronously:
+             self._execute_queue_flush()
+         else:
+             logger.info("Manual flush denied: pipeline not quiet or interval not met.")
+
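Usage sketch for the manual flush hook above; `force=True` skips the quiet-period and interval checks but still yields to an in-progress flush or shutdown:

```python
# Illustrative only.
pipeline.request_queue_flush(force=True)  # flush now, synchronously
pipeline.request_queue_flush()            # flush only if quiet and the interval has elapsed
```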
+     def _gather_controller_metrics(
+         self, current_stage_stats: Dict[str, Dict[str, int]], global_in_flight: int
+     ) -> Dict[str, Dict[str, Any]]:
+         """Gathers metrics using provided stats and topology."""
+         logger.debug("[ScalingMetrics] Gathering metrics for controllers...")
+         current_stage_metrics = {}
+
+         # Use topology accessors
+         current_stages = self.topology.get_stages_info()
+         current_actors = self.topology.get_stage_actors()  # Snapshot
+
+         for stage in current_stages:
+             stage_name = stage.name
+             replicas = len(current_actors.get(stage_name, []))
+             stats = current_stage_stats.get(stage_name, {"processing": 0, "in_flight": 0})
+             processing = stats.get("processing", 0)
+             in_flight = stats.get("in_flight", 0)
+             queue_depth = max(0, in_flight - processing)
+
+             current_stage_metrics[stage_name] = {
+                 "replicas": replicas,
+                 "queue_depth": queue_depth,
+                 "processing": processing,
+                 "in_flight": in_flight,
+                 "min_replicas": stage.min_replicas,
+                 "max_replicas": stage.max_replicas,
+                 "pipeline_in_flight": global_in_flight,
+             }
+
+         logger.debug(f"[ScalingMetrics] Gathered metrics for {len(current_stage_metrics)} stages.")
+         return current_stage_metrics
+
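The per-stage dict built above has the shape sketched below (values illustrative); note that `queue_depth` is derived as `max(0, in_flight - processing)`:

```python
# Illustrative entry in current_stage_metrics:
{
    "transform": {
        "replicas": 2,
        "queue_depth": 5,  # max(0, 8 - 3)
        "processing": 3,
        "in_flight": 8,
        "min_replicas": 0,
        "max_replicas": 4,
        "pipeline_in_flight": 17,
    }
}
```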
+     def _get_current_global_memory(self) -> int:
+         """
+         Safely retrieves the current global system memory usage (used, not free) in MB.
+         Uses the previous measurement as a fallback only if the current read fails.
+
+         Returns:
+             int: Current global memory usage (RSS/used) in MB. Returns previous value
+                 or 0 if the read fails and no previous value exists.
+         """
+         try:
+             # psutil.virtual_memory().used provides total RAM used by processes
+             current_global_memory_bytes = psutil.virtual_memory().used
+             current_global_memory_mb = int(current_global_memory_bytes / (1024 * 1024))
+             logger.debug(f"[ScalingMemCheck] Current global memory usage (used): {current_global_memory_mb} MB")
+
+             return current_global_memory_mb
+         except Exception as e:
+             logger.error(
+                 f"[ScalingMemCheck] Failed to get current system memory usage: {e}. "
+                 f"Attempting to use previous value ({self.prev_global_memory_usage} MB).",
+                 exc_info=False,
+             )
+
+             # Use previous value if available, otherwise default to 0 (less ideal, but avoids None)
+             # Returning 0 might incorrectly signal low memory usage if it's the first read that fails.
+             return self.prev_global_memory_usage if self.prev_global_memory_usage is not None else 0
+
+     def _calculate_scaling_adjustments(
+         self, current_stage_metrics: Dict[str, Dict[str, Any]], global_in_flight: int, current_global_memory_mb: int
+     ) -> Dict[str, int]:
+         """Runs controllers to get target replica counts using topology for edge count."""
+         logger.debug("[ScalingCalc] Calculating adjustments via PID and RCM...")
+         # Get edge count from topology
+         num_edges = len(self.topology.get_edge_queues())
+
+         try:
+             initial_proposals = self.pid_controller.calculate_initial_proposals(current_stage_metrics)
+             logger.debug(
+                 "[ScalingCalc] PID Initial Proposals:"
+                 f" { {n: p.proposed_replicas for n, p in initial_proposals.items()} }"  # noqa E201,E202
+             )
+
+             final_adjustments = self.constraint_manager.apply_constraints(
+                 initial_proposals=initial_proposals,
+                 global_in_flight=global_in_flight,
+                 current_global_memory_usage_mb=current_global_memory_mb,
+                 num_edges=num_edges,
+             )
+             logger.debug(f"[ScalingCalc] RCM Final Adjustments: {final_adjustments}")
+             return final_adjustments
+         except Exception as e:
+             logger.error(f"[ScalingCalc] Error during controller execution: {e}", exc_info=True)
+             logger.warning("[ScalingCalc] Falling back to current replica counts.")
+             return {name: metrics.get("replicas", 0) for name, metrics in current_stage_metrics.items()}
+
+     def _apply_scaling_actions(self, final_adjustments: Dict[str, int]) -> None:
+         """Applies scaling by calling _scale_stage, using topology for validation."""
+         stages_needing_action = []
+         current_actors_map = self.topology.get_stage_actors()  # Snapshot
+
+         for stage_name, target_replica_count in final_adjustments.items():
+             current_count = len(current_actors_map.get(stage_name, []))
+             stage_info = self.topology.get_stage_info(stage_name)  # Get info from topology
+
+             if not stage_info:
+                 logger.warning(f"[ScalingApply] Cannot apply scaling for unknown stage '{stage_name}'. Skipping.")
+                 continue
+
+             # Clamp target using StageInfo from topology
+             clamped_target = max(stage_info.min_replicas, min(stage_info.max_replicas, target_replica_count))
+             if clamped_target != target_replica_count:
+                 logger.warning(
+                     f"[ScalingApply-{stage_name}] Target {target_replica_count} clamped to {clamped_target} by bounds."
+                 )
+                 target_replica_count = clamped_target
+
+             if target_replica_count != current_count:
+                 stages_needing_action.append((stage_name, target_replica_count))
+                 logger.debug(
+                     f"[ScalingApply-{stage_name}] Action: Current={current_count}, "
+                     f"Target={target_replica_count} (Min={stage_info.min_replicas}, Max={stage_info.max_replicas})"
+                 )
+
+         if not stages_needing_action:
+             logger.debug("[ScalingApply] No scaling actions required.")
+             return
+
+         max_workers = min(len(stages_needing_action), 8)
+         logger.debug(
+             f"[ScalingApply] Submitting {len(stages_needing_action)} scaling actions ({max_workers} workers)..."
+         )
+         action_results = {}
+
+         with concurrent.futures.ThreadPoolExecutor(
+             max_workers=max_workers, thread_name_prefix="ScalingAction"
+         ) as executor:
+             future_to_stage = {
+                 executor.submit(self._scale_stage, stage_name, target_count): stage_name
+                 for stage_name, target_count in stages_needing_action
+             }
+             wait_timeout = 180.0
+             logger.debug(f"[ScalingApply] Waiting up to {wait_timeout}s for actions...")
+             for future in concurrent.futures.as_completed(future_to_stage, timeout=wait_timeout):
+                 stage_name = future_to_stage[future]
+                 try:
+                     result = future.result()  # Raises exception if _scale_stage failed internally
+                     action_results[stage_name] = {"status": "completed", "result": result}
+                     logger.debug(f"[ScalingApply-{stage_name}] Action completed.")
+                 except TimeoutError:
+                     logger.error(f"[ScalingApply-{stage_name}] Action timed out ({wait_timeout}s).")
+                     action_results[stage_name] = {"status": "timeout"}
+                     self.topology.update_scaling_state(stage_name, "Error")  # Mark as error on timeout
+                 except Exception as exc:
+                     logger.error(f"[ScalingApply-{stage_name}] Action failed: {exc}", exc_info=True)
+                     action_results[stage_name] = {"status": "error", "exception": exc}
+                     # State should be set to Error inside _scale_stage or its handlers on failure
+
+         completed = sum(1 for r in action_results.values() if r["status"] == "completed")
+         errors = sum(1 for r in action_results.values() if r["status"] == "error")
+         timeouts = sum(1 for r in action_results.values() if r["status"] == "timeout")
+         logger.debug(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
+
+     def _perform_scaling_and_maintenance(self) -> None:
+         """Orchestrates scaling/maintenance using topology and stats collector."""
+
+         if self._stopping:
+             logger.debug("Pipeline is stopping. Skipping scaling cycle.")
+             return
+
+         if not self.dynamic_memory_scaling:
+             logger.debug("Dynamic memory scaling disabled. Skipping cycle.")
+             return
+
+         if self.topology.get_is_flushing():
+             logger.debug("Skipping scaling cycle: Queue flush in progress (topology state).")
+             return
+
+         got_lock = self._state_lock.acquire(timeout=0.1)
+         if not got_lock:
+             logger.debug("Could not acquire lock for maintenance; skipping cycle.")
+             return
+
+         cycle_start_time = time.time()
+         try:
+             if self._stopping:
+                 logger.debug("Pipeline began stopping after acquiring lock. Skipping maintenance logic.")
+                 return
+
+             logger.debug("--- Performing Scaling & Maintenance Cycle ---")
+
+             if self._is_pipeline_quiet():
+                 logger.info("[Drain] Pipeline quiet, initiating queue flush.")
+                 flush_success = self._execute_queue_flush()
+                 logger.info(f"[Drain] Automatic queue flush completed. Success: {flush_success}")
+                 return
+
+             # Fast return check if stopping occurred while flushing or checking flush status
+             if self._stopping:
+                 return
+
+             current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
+                 self.stats_collector.get_latest_stats()
+             )
+
+             last_update_age = time.time() - last_update_time
+             max_age = max(15.0, self._stats_collection_interval_seconds)
+             if not current_stage_stats or not stats_were_successful or last_update_age > max_age:
+                 status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
+                 logger.warning(
+                     f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+                 )
+                 return
+
+             current_stage_metrics = self._gather_controller_metrics(current_stage_stats, global_in_flight)
+             if not current_stage_metrics:
+                 logger.error("[Scaling] Failed to gather metrics. Skipping.")
+                 return
+
+             current_global_memory_mb = self._get_current_global_memory()
+             final_adjustments = self._calculate_scaling_adjustments(
+                 current_stage_metrics, global_in_flight, current_global_memory_mb
+             )
+             self.prev_global_memory_usage = current_global_memory_mb
+             self._apply_scaling_actions(final_adjustments)
+
+             logger.debug(
+                 f"--- Scaling & Maintenance Cycle Complete (Duration: {time.time() - cycle_start_time:.2f}s) ---"
+             )
+
+         except Exception as e:  # noqa
+             logger.error("Exception during maintenance cycle", exc_info=True)
+
+         finally:
+             self._state_lock.release()
+
+     # --- Lifecycle Methods for Monitoring/Scaling Threads ---
+     def _scaling_loop(self, interval: float) -> None:
+         """Main loop for the scaling thread."""
+         logger.info(f"Scaling loop started. Interval: {interval}s")
+         while self._scaling_monitoring:
+             try:
+                 self._perform_scaling_and_maintenance()
+             except Exception as e:
+                 logger.error(f"Error in scaling loop: {e}", exc_info=True)
+
+             sleep_time = interval
+             if not self._scaling_monitoring:
+                 break
+             time.sleep(sleep_time)
+         logger.info("Scaling loop finished.")
+
+     def _start_scaling(self, poll_interval: float = 10.0) -> None:
+         if not self._scaling_monitoring:
+             self._scaling_monitoring = True
+             self._scaling_thread = threading.Thread(target=self._scaling_loop, args=(poll_interval,), daemon=True)
+             self._scaling_thread.start()
+             logger.info(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
+
+     def _stop_scaling(self) -> None:
+         if self._scaling_monitoring:
+             logger.debug("Stopping scaling/maintenance thread...")
+             self._scaling_monitoring = False
+             if self._scaling_thread is not None:
+                 self._scaling_thread.join(timeout=15)  # Allow more time for scaling actions
+                 if self._scaling_thread.is_alive():
+                     logger.warning("Scaling thread did not exit cleanly.")
+                 self._scaling_thread = None
+             logger.info("Scaling/Maintenance stopped.")
+
+     # --- Pipeline Start/Stop ---
+     def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+         """Starts actors (via topology) and background threads."""
+         # Check topology for actors (indicates built)
+         if not self.topology.get_stage_actors():
+             logger.error("Cannot start: Pipeline not built or has no actors.")
+             return
+
+         logger.info("Starting pipeline execution...")
+         start_refs = []
+         # Get actors from topology
+         actors_to_start = [actor for actors in self.topology.get_stage_actors().values() for actor in actors]
+
+         for actor in actors_to_start:
+             start_refs.append(actor.start.remote())
+
+         if start_refs:
+             logger.debug(f"Waiting for {len(start_refs)} actors to start...")
+             try:
+                 ray.get(start_refs, timeout=60.0)
+                 logger.info(f"{len(start_refs)} actors started.")
+             except Exception as e:
+                 logger.error(f"Error/Timeout starting actors: {e}", exc_info=True)
+                 self.stop()  # Attempt cleanup
+
+                 raise RuntimeError("Pipeline start failed: actors did not start.") from e
+
+         self.stats_collector.start()
+         self._start_scaling(poll_interval=scaling_poll_interval)
+         logger.info("Pipeline started successfully.")
+
+     def stop(self) -> None:
+         """Stops background threads and actors (via topology)."""
+         logger.info("Stopping pipeline...")
+
+         if self._stopping:
+             return
+         self._stopping = True
+
+         # 1. Stop background threads first
+         with self._state_lock:
+             self._stop_scaling()
+             self.stats_collector.stop()
+
+         # 2. Stop actors (using topology)
+         logger.debug("Stopping all stage actors...")
+         stop_refs_map: Dict[ray.ObjectRef, Any] = {}
+
+         # Get actors snapshot from topology
+         current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}
+
+         for stage_name, actors in current_actors.items():
+             for actor in actors:
+                 try:
+                     stop_refs_map[actor.stop.remote()] = actor
+                 except Exception as e:
+                     logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Skipping.")
+
+         if stop_refs_map:
+             stop_refs = list(stop_refs_map.keys())
+             logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
+             try:
+                 ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
+                 if not_ready:
+                     logger.warning(
+                         f"Timeout waiting for {len(not_ready)} actors to stop. Allowing Ray to clean up."
+                     )
+                 logger.info(f"{len(ready)} actors stopped via stop().")
+             except Exception as e:
+                 logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
+
+         # Clear runtime state in topology
+         self.topology.clear_runtime_state()
+         del self.topology
+
+         logger.info("Pipeline stopped.")