nv-ingest 2025.5.21.dev20250521 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest might be problematic.

Files changed (100)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1187 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import threading
6
+ from collections import defaultdict
7
+ from dataclasses import dataclass
8
+
9
+ import psutil
10
+ import uuid
11
+ import ray
12
+ from ray.exceptions import GetTimeoutError
13
+ from ray.util.queue import Queue as RayQueue
14
+ from typing import Dict, Optional, List, Tuple, Any
15
+ from pydantic import BaseModel
16
+ import concurrent.futures
17
+ import logging
18
+ import time
19
+
20
+ from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import PipelineTopology, StageInfo
21
+ from nv_ingest.framework.orchestration.ray.primitives.ray_stat_collector import RayStatsCollector
22
+ from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import PIDController, ResourceConstraintManager
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ # --- Configuration Objects ---
28
+
29
+
30
+ @dataclass
31
+ class ScalingConfig:
32
+ """Configuration for PID and Resource Constraint Manager based scaling."""
33
+
34
+ dynamic_memory_scaling: bool = True
35
+ dynamic_memory_threshold: float = 0.75
36
+ pid_kp: float = 0.1
37
+ pid_ki: float = 0.001
38
+ pid_kd: float = 0.0
39
+ pid_target_queue_depth: int = 0
40
+ pid_penalty_factor: float = 0.1
41
+ pid_error_boost_factor: float = 1.5
42
+ pid_window_size: int = 10
43
+ rcm_estimated_edge_cost_mb: int = 5000
44
+ rcm_memory_safety_buffer_fraction: float = 0.15
45
+
46
+
47
+ @dataclass
48
+ class FlushingConfig:
49
+ """Configuration for queue flushing behavior."""
50
+
51
+ queue_flush_interval_seconds: int = 600
52
+ queue_flush_drain_timeout_seconds: int = 300
53
+ quiet_period_threshold: int = 0
54
+
55
+
56
+ @dataclass
57
+ class StatsConfig:
58
+ """Configuration for the RayStatsCollector."""
59
+
60
+ collection_interval_seconds: float = 10.0
61
+ actor_timeout_seconds: float = 5.0
62
+ queue_timeout_seconds: float = 2.0
63
+
64
+
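For illustration, here is a minimal sketch of overriding the three config dataclasses defined above. The field names come from the definitions in this diff; the specific values are arbitrary assumptions, not recommended settings.

    # Hypothetical overrides; any field omitted keeps the default shown above.
    scaling = ScalingConfig(dynamic_memory_threshold=0.8, pid_kp=0.2, pid_window_size=20)
    flushing = FlushingConfig(queue_flush_interval_seconds=300)
    stats = StatsConfig(collection_interval_seconds=5.0)

These objects are passed to RayPipeline (defined next), which stores them and derives its runtime settings from them.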
65
+ class RayPipeline:
66
+ """
67
+ A structured pipeline supporting dynamic scaling and queue flushing.
68
+ Uses PIDController and ResourceConstraintManager. Supports optional GUI display.
69
+ Delegates statistics collection to RayStatsCollector.
70
+
71
+ Configuration is managed via dedicated config objects (ScalingConfig, etc.).
72
+ """
73
+
74
+ def __init__(
75
+ self,
76
+ scaling_config: ScalingConfig = ScalingConfig(),
77
+ flushing_config: FlushingConfig = FlushingConfig(),
78
+ stats_config: StatsConfig = StatsConfig(),
79
+ ) -> None:
80
+ # Store config objects
81
+ self.scaling_config = scaling_config
82
+ self.flushing_config = flushing_config
83
+ self.stats_config = stats_config
84
+
85
+ # --- Instantiate Topology ---
86
+ self.topology = PipelineTopology()
87
+
88
+ # --- Structure Lock ---
89
+ self._structure_lock: threading.Lock = threading.Lock()
90
+
91
+ # --- State ---
92
+ # self.scaling_state: Dict[str, str] = {}
93
+ self.prev_global_memory_usage: Optional[int] = None
94
+
95
+ # --- Build Time Config & State ---
96
+ # Use scaling_config for these
97
+ self.dynamic_memory_scaling = self.scaling_config.dynamic_memory_scaling
98
+ self.dynamic_memory_threshold = self.scaling_config.dynamic_memory_threshold
99
+ self.stage_memory_overhead: Dict[str, float] = {}
100
+
101
+ # --- Background Threads ---
102
+ self._scaling_thread: Optional[threading.Thread] = None
103
+ self._scaling_monitoring = False
104
+
105
+ # --- Queue Flushing ---
106
+ self._last_queue_flush_time: float = time.time()
107
+ self.queue_flush_interval_seconds = self.flushing_config.queue_flush_interval_seconds
108
+ self.queue_flush_drain_timeout_seconds = self.flushing_config.queue_flush_drain_timeout_seconds
109
+ self.quiet_period_threshold = self.flushing_config.quiet_period_threshold
110
+
111
+ # --- Instantiate Autoscaling Controllers ---
112
+ # Use scaling_config
113
+ self.pid_controller = PIDController(
114
+ kp=self.scaling_config.pid_kp,
115
+ ki=self.scaling_config.pid_ki,
116
+ kd=self.scaling_config.pid_kd,
117
+ stage_cost_estimates={}, # Populated during build
118
+ target_queue_depth=self.scaling_config.pid_target_queue_depth,
119
+ window_size=self.scaling_config.pid_window_size,
120
+ penalty_factor=self.scaling_config.pid_penalty_factor,
121
+ error_boost_factor=self.scaling_config.pid_error_boost_factor,
122
+ )
123
+ logger.info("PIDController initialized using ScalingConfig.")
124
+
125
+ try:
126
+ total_system_memory_bytes = psutil.virtual_memory().total
127
+ # Use scaling_config for dynamic_memory_threshold
128
+ absolute_memory_threshold_mb = int(
129
+ self.scaling_config.dynamic_memory_threshold * total_system_memory_bytes / (1024 * 1024)
130
+ )
131
+ except Exception as e:
132
+ logger.error(f"Failed to get system memory: {e}. Using high limit.")
133
+ absolute_memory_threshold_mb = 1_000_000 # Fallback value
134
+
135
+ # Use scaling_config
136
+ self.constraint_manager = ResourceConstraintManager(
137
+ max_replicas=1, # Updated during build
138
+ memory_threshold=absolute_memory_threshold_mb,
139
+ estimated_edge_cost_mb=self.scaling_config.rcm_estimated_edge_cost_mb,
140
+ memory_safety_buffer_fraction=self.scaling_config.rcm_memory_safety_buffer_fraction,
141
+ )
142
+ logger.info("ResourceConstraintManager initialized using ScalingConfig.")
143
+
144
+ # --- Instantiate Stats Collector ---
145
+ self._stats_collection_interval_seconds = self.stats_config.collection_interval_seconds
146
+ self.stats_collector = RayStatsCollector(
147
+ pipeline_accessor=self, # This dependency remains for now
148
+ interval=self.stats_config.collection_interval_seconds,
149
+ actor_timeout=self.stats_config.actor_timeout_seconds,
150
+ queue_timeout=self.stats_config.queue_timeout_seconds,
151
+ )
152
+ logger.info("RayStatsCollector initialized using StatsConfig.")
153
+
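As a worked example of the absolute memory threshold computed in the constructor above (the 64 GiB host is an assumption for illustration only):

    # default dynamic_memory_threshold = 0.75 on a hypothetical 64 GiB host
    total_system_memory_bytes = 64 * 1024**3
    absolute_memory_threshold_mb = int(0.75 * total_system_memory_bytes / (1024 * 1024))  # 49152 MB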
154
+ # --- Accessor Methods for Stats Collector (and internal use) ---
155
+
156
+ def get_stages_info(self) -> List[StageInfo]:
157
+ """Returns a snapshot of the current stage information."""
158
+ return self.topology.get_stages_info()
159
+
160
+ def get_stage_actors(self) -> Dict[str, List[Any]]:
161
+ """Returns a snapshot of the current actors per stage."""
162
+ return self.topology.get_stage_actors()
163
+
164
+ def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
165
+ """Returns a snapshot of the current edge queues."""
166
+ return self.topology.get_edge_queues()
167
+
168
+ def _configure_autoscalers(self) -> None:
169
+ """Updates controllers based on current pipeline configuration via topology."""
170
+ logger.debug("[Build-Configure] Configuring autoscalers...")
171
+ total_max_replicas = 0
172
+ default_cost_bytes = 100 * 1024 * 1024
173
+ stage_overheads = {} # Collect locally
174
+
175
+ # Use topology accessor
176
+ current_stages = self.topology.get_stages_info()
177
+
178
+ for stage in current_stages:
179
+ total_max_replicas += stage.max_replicas
180
+ # Use estimated overhead if available (Assume it's calculated elsewhere or default)
181
+ # For now, let's store a dummy overhead in topology during build
182
+ overhead_bytes = default_cost_bytes # Simplification for now
183
+ stage_overheads[stage.name] = overhead_bytes # Store locally first
184
+ cost_mb = max(1, int(overhead_bytes / (1024 * 1024)))
185
+ # Update controller directly (or via dedicated method if preferred)
186
+ self.pid_controller.stage_cost_estimates[stage.name] = cost_mb
187
+
188
+ # Update topology with collected overheads
189
+ self.topology.set_stage_memory_overhead(stage_overheads)
190
+
191
+ # Update constraint manager
192
+ self.constraint_manager.max_replicas = total_max_replicas
193
+
194
+ logger.info(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
195
+ logger.debug(f"[Build-Configure] PID stage cost estimates (MB): {self.pid_controller.stage_cost_estimates}")
196
+
197
+ def _instantiate_initial_actors(self) -> None:
198
+ """Instantiates initial actors and updates topology."""
199
+ logger.info("[Build-Actors] Instantiating initial stage actors (min_replicas)...")
200
+ # Use topology accessor
201
+ current_stages = self.topology.get_stages_info()
202
+
203
+ for stage in current_stages:
204
+ replicas = []
205
+
206
+ if not self.dynamic_memory_scaling:
207
+ num_initial_actors = stage.max_replicas
208
+ else:
209
+ num_initial_actors = (
210
+ max(stage.min_replicas, 1) if stage.is_source or stage.is_sink else stage.min_replicas
211
+ )
212
+
213
+ if num_initial_actors > 0:
214
+ logger.debug(f"[Build-Actors] Stage '{stage.name}' creating {num_initial_actors} initial actor(s).")
215
+ for i in range(num_initial_actors):
216
+ actor_name = f"{stage.name}_{uuid.uuid4()}"
217
+ logger.debug(
218
+ f"[Build-Actors] Creating actor '{actor_name}' ({i + 1}/{num_initial_actors})"
219
+ f" for '{stage.name}'"
220
+ )
221
+ try:
222
+ actor = stage.callable.options(
223
+ name=actor_name, max_concurrency=10, max_restarts=0, lifetime="detached"
224
+ ).remote(config=stage.config)
225
+ replicas.append(actor)
226
+ except Exception as e:
227
+ logger.error(f"[Build-Actors] Failed create actor '{actor_name}': {e}", exc_info=True)
228
+ raise RuntimeError(f"Build failed: actor creation error for stage '{stage.name}'") from e
229
+
230
+ # Update topology for this stage
231
+ self.topology.set_actors_for_stage(stage.name, replicas)
232
+ logger.debug(f"[Build-Actors] Stage '{stage.name}' initial actors set in topology: count={len(replicas)}")
233
+
234
+ logger.info("[Build-Actors] Initial actor instantiation complete.")
235
+
236
+ def _create_and_wire_edges(self) -> List[ray.ObjectRef]:
237
+ """Creates queues, wires actors (using topology), and updates topology."""
238
+ logger.info("[Build-Wiring] Creating and wiring edges...")
239
+ wiring_refs = []
240
+ new_edge_queues: Dict[str, Tuple[Any, int]] = {}
241
+
242
+ current_connections = self.topology.get_connections()
243
+ current_stage_actors = self.topology.get_stage_actors() # Gets copy
244
+
245
+ for from_stage_name, connections_list in current_connections.items():
246
+ for to_stage_name, queue_size in connections_list:
247
+ queue_name = f"{from_stage_name}_to_{to_stage_name}"
248
+ logger.debug(f"[Build-Wiring] Creating queue '{queue_name}' (size {queue_size}) and wiring.")
249
+ try:
250
+ edge_queue = RayQueue(maxsize=queue_size, actor_options={"max_restarts": 0})
251
+ new_edge_queues[queue_name] = (edge_queue, queue_size)
252
+
253
+ # Wire using current actors from topology snapshot
254
+ source_actors = current_stage_actors.get(from_stage_name, [])
255
+ for actor in source_actors:
256
+ wiring_refs.append(actor.set_output_queue.remote(edge_queue))
257
+
258
+ dest_actors = current_stage_actors.get(to_stage_name, [])
259
+ for actor in dest_actors:
260
+ wiring_refs.append(actor.set_input_queue.remote(edge_queue))
261
+
262
+ except Exception as e:
263
+ logger.error(f"[Build-Wiring] Failed create/wire queue '{queue_name}': {e}", exc_info=True)
264
+ raise RuntimeError(f"Build failed: queue wiring error for '{queue_name}'") from e
265
+
266
+ # Update topology with the new queues
267
+ self.topology.set_edge_queues(new_edge_queues)
268
+
269
+ logger.debug(f"[Build-Wiring] Submitted {len(wiring_refs)} wiring calls. Queues set in topology.")
270
+ return wiring_refs
271
+
272
+ @staticmethod
273
+ def _wait_for_wiring(wiring_refs: List[ray.ObjectRef]) -> None:
274
+ """Waits for remote wiring calls to complete. (Static, no changes needed)."""
275
+ if not wiring_refs:
276
+ logger.debug("[Build-WaitWiring] No wiring calls.")
277
+ return
278
+ logger.debug(f"[Build-WaitWiring] Waiting for {len(wiring_refs)} wiring calls...")
279
+ try:
280
+ ray.get(wiring_refs)
281
+ logger.debug("[Build-WaitWiring] All wiring calls completed.")
282
+ except Exception as e:
283
+ logger.error(f"[Build-WaitWiring] Error during wiring confirmation: {e}", exc_info=True)
284
+ raise RuntimeError("Build failed: error confirming initial wiring") from e
285
+
286
+ def add_source(
287
+ self, *, name: str, source_actor: Any, config: BaseModel, min_replicas: int = 1, max_replicas: int = 1
288
+ ) -> "RayPipeline":
289
+ if min_replicas < 1:
290
+ logger.warning(f"Source stage '{name}': min_replicas must be >= 1. Overriding.")
291
+ min_replicas = 1
292
+
293
+ stage_info = StageInfo(
294
+ name=name,
295
+ callable=source_actor,
296
+ config=config,
297
+ is_source=True,
298
+ min_replicas=min_replicas,
299
+ max_replicas=max_replicas,
300
+ )
301
+ self.topology.add_stage(stage_info) # Delegate
302
+
303
+ return self
304
+
305
+ def add_stage(
306
+ self, *, name: str, stage_actor: Any, config: BaseModel, min_replicas: int = 0, max_replicas: int = 1
307
+ ) -> "RayPipeline":
308
+ if min_replicas < 0:
309
+ logger.warning(f"Stage '{name}': min_replicas cannot be negative. Overriding to 0.")
310
+ min_replicas = 0
311
+ stage_info = StageInfo(
312
+ name=name, callable=stage_actor, config=config, min_replicas=min_replicas, max_replicas=max_replicas
313
+ )
314
+ self.topology.add_stage(stage_info) # Delegate
315
+
316
+ return self
317
+
318
+ def add_sink(
319
+ self, *, name: str, sink_actor: Any, config: BaseModel, min_replicas: int = 1, max_replicas: int = 1
320
+ ) -> "RayPipeline":
321
+ # Sink min_replicas may be 0 when draining data is optional/best-effort, so 0 is allowed here.
322
+ if min_replicas < 0:
323
+ logger.warning(f"Sink stage '{name}': min_replicas cannot be negative. Overriding to 0.")
324
+ min_replicas = 0
325
+ stage_info = StageInfo(
326
+ name=name,
327
+ callable=sink_actor,
328
+ config=config,
329
+ is_sink=True,
330
+ min_replicas=min_replicas,
331
+ max_replicas=max_replicas,
332
+ )
333
+ self.topology.add_stage(stage_info) # Delegate
334
+
335
+ return self
336
+
337
+ # --- Method for defining connections ---
338
+ def make_edge(self, from_stage: str, to_stage: str, queue_size: int = 100) -> "RayPipeline":
339
+ try:
340
+ self.topology.add_connection(from_stage, to_stage, queue_size) # Delegate (includes validation)
341
+ except ValueError as e:
342
+ logger.error(f"make_edge failed: {e}")
343
+ raise # Re-raise the error
344
+ return self
345
+
346
+ # --- Pipeline Build Process ---
347
+ def build(self) -> Dict[str, List[Any]]:
348
+ """Builds the pipeline: configures, instantiates, wires, using topology."""
349
+ logger.info("--- Starting Pipeline Build Process ---")
350
+ try:
351
+ if not self.topology.get_stages_info():
352
+ logger.error("Build failed: No stages defined in topology.")
353
+ return {}
354
+
355
+ # Steps interact with self.topology
356
+ self._configure_autoscalers()
357
+ self._instantiate_initial_actors()
358
+ wiring_futures = self._create_and_wire_edges()
359
+ self._wait_for_wiring(wiring_futures)
360
+
361
+ logger.info("--- Pipeline Build Completed Successfully ---")
362
+ return self.topology.get_stage_actors() # Return actors from topology
363
+
364
+ except RuntimeError as e:
365
+ logger.critical(f"Pipeline build failed: {e}", exc_info=False)
366
+ # Clean up topology runtime state after the failed build.
367
+ self.topology.clear_runtime_state()
368
+ return {}
369
+ except Exception as e:
370
+ logger.critical(f"Unexpected error during pipeline build: {e}", exc_info=True)
371
+ self.topology.clear_runtime_state()
372
+ return {}
373
+
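For illustration, a minimal end-to-end sketch of the builder API defined above. The stage actor classes and config models (MySource, MyWorker, MySink and their *Config counterparts) are placeholders, not part of this package; only the RayPipeline methods shown in this file are assumed.

    # Hypothetical stages; every RayPipeline call below appears in this file.
    pipeline = (
        RayPipeline()
        .add_source(name="source", source_actor=MySource, config=MySourceConfig(), max_replicas=2)
        .add_stage(name="worker", stage_actor=MyWorker, config=MyWorkerConfig(), max_replicas=8)
        .add_sink(name="sink", sink_actor=MySink, config=MySinkConfig())
        .make_edge("source", "worker", queue_size=100)
        .make_edge("worker", "sink", queue_size=100)
    )
    stage_actors = pipeline.build()  # returns {} and logs a critical error on failure
    if stage_actors:
        pipeline.start()  # starts the actors, the stats collector, and the scaling thread
        # ... run workload ...
        pipeline.stop()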
374
+ # --- Scaling Logic ---
375
+ @staticmethod
376
+ def _create_single_replica(stage_info: StageInfo) -> Any:
377
+ """Creates a single new Ray actor replica for the given stage."""
378
+ actor_name = f"{stage_info.name}_{uuid.uuid4()}"
379
+ logger.debug(f"[ScaleUtil] Creating new actor '{actor_name}' for stage '{stage_info.name}'")
380
+ try:
381
+ new_actor = stage_info.callable.options(
382
+ name=actor_name, max_concurrency=10, max_restarts=0, lifetime="detached"
383
+ ).remote(config=stage_info.config)
384
+
385
+ return new_actor
386
+ except Exception as e:
387
+ logger.error(
388
+ f"[ScaleUtil] Failed to create actor '{actor_name}' for stage '{stage_info.name}':" f" {e}",
389
+ exc_info=True,
390
+ )
391
+
392
+ # Propagate error to halt the scaling operation
393
+ raise RuntimeError(f"Actor creation failed for stage '{stage_info.name}' during scale up") from e
394
+
395
+ def _get_wiring_refs_for_actor(self, actor: Any, stage_name: str) -> List[ray.ObjectRef]:
396
+ """Gets wiring futures for a single actor using topology for queues/connections."""
397
+ wiring_refs = []
398
+
399
+ # Use topology accessors
400
+ connections = self.topology.get_connections()
401
+ edge_queues = self.topology.get_edge_queues()
402
+
403
+ # Wire outputs
404
+ if stage_name in connections:
405
+ for to_stage, _ in connections[stage_name]:
406
+ queue_name = f"{stage_name}_to_{to_stage}"
407
+ if queue_name in edge_queues:
408
+ edge_queue, _ = edge_queues[queue_name]
409
+ wiring_refs.append(actor.set_output_queue.remote(edge_queue))
410
+
411
+ # Wire inputs
412
+ for from_stage, conns in connections.items():
413
+ for to_stage, _ in conns:
414
+ if to_stage == stage_name:
415
+ queue_name = f"{from_stage}_to_{stage_name}"
416
+ if queue_name in edge_queues:
417
+ edge_queue, _ = edge_queues[queue_name]
418
+ wiring_refs.append(actor.set_input_queue.remote(edge_queue))
419
+
420
+ return wiring_refs
421
+
422
+ @staticmethod
423
+ def _start_actors(actors_to_start: List[Any], stage_name: str) -> None:
424
+ """Starts a list of actors if they have a 'start' method and waits for completion."""
425
+ start_refs = []
426
+ for actor in actors_to_start:
427
+ if hasattr(actor, "start"):
428
+ logger.debug(f"[ScaleUtil] Starting actor '{actor}' for stage '{stage_name}'")
429
+ start_refs.append(actor.start.remote())
430
+
431
+ if not start_refs:
432
+ logger.debug(f"[ScaleUtil] No actors with start() method found for stage '{stage_name}'.")
433
+ return
434
+
435
+ logger.debug(f"[ScaleUtil] Waiting for {len(start_refs)} actor starts for stage '{stage_name}'...")
436
+ try:
437
+ ray.get(start_refs)
438
+ logger.debug(f"[ScaleUtil] {len(start_refs)} actors started successfully for stage '{stage_name}'.")
439
+ except Exception as e:
440
+ logger.error(
441
+ f"[ScaleUtil] Error waiting for actors to start for stage '{stage_name}':" f" {e}", exc_info=True
442
+ )
443
+ # Note: Actors might be started but confirmation failed. State might be inconsistent.
444
+ # Raise so the caller can react to the potential inconsistency.
445
+ raise RuntimeError(f"Error confirming actor starts for stage '{stage_name}'") from e
446
+
447
+ def _handle_scale_up(self, stage_info: StageInfo, current_count: int, target_count: int) -> None:
448
+ """Handles scaling up, interacting with topology."""
449
+ stage_name = stage_info.name
450
+ num_to_add = target_count - current_count
451
+ logger.debug(f"[ScaleUp-{stage_name}] Scaling up from {current_count} to {target_count} (+{num_to_add}).")
452
+ # Update topology state
453
+ self.topology.update_scaling_state(stage_name, "Scaling Up")
454
+
455
+ new_actors = []
456
+ all_wiring_refs = []
457
+ successfully_added_actors = []
458
+
459
+ try:
460
+ # 1. Create actors
461
+ for _ in range(num_to_add):
462
+ new_actor = self._create_single_replica(stage_info)
463
+ new_actors.append(new_actor)
464
+
465
+ # 2. Get wiring refs (uses topology internally)
466
+ for actor in new_actors:
467
+ all_wiring_refs.extend(self._get_wiring_refs_for_actor(actor, stage_name))
468
+
469
+ # 3. Wait for wiring (static helper)
470
+ self._wait_for_wiring(all_wiring_refs) # Handles errors
471
+
472
+ # 4. Start actors (static helper)
473
+ self._start_actors(new_actors, stage_name) # Handles errors
474
+
475
+ # 5. Add successfully created/wired/started actors to topology
476
+ for actor in new_actors:
477
+ self.topology.add_actor_to_stage(stage_name, actor)
478
+ successfully_added_actors.append(actor) # Keep track
479
+
480
+ final_count = self.topology.get_actor_count(stage_name)
481
+ logger.debug(
482
+ f"[ScaleUp-{stage_name}] Scale up complete. Added {len(successfully_added_actors)}. "
483
+ f"New count: {final_count}"
484
+ )
485
+
486
+ except Exception as e:
487
+ logger.error(f"[ScaleUp-{stage_name}] Error during scale up: {e}", exc_info=False)
488
+ self.topology.update_scaling_state(stage_name, "Error")
489
+ # --- Cleanup Attempt ---
490
+ # Actors created but potentially not wired/started/added to topology.
491
+ # Only kill actors that were definitely *not* added to the topology.
492
+ actors_to_kill = [a for a in new_actors if a not in successfully_added_actors]
493
+ if actors_to_kill:
494
+ logger.warning(
495
+ f"[ScaleUp-{stage_name}] Attempting to kill {len(actors_to_kill)} partially created actors."
496
+ )
497
+ for actor in actors_to_kill:
498
+ try:
499
+ ray.kill(actor, no_restart=True)
500
+ except Exception as kill_e:
501
+ logger.warning(f"Failed to kill actor {actor}: {kill_e}")
502
+ logger.critical(f"[ScaleUp-{stage_name}] Scale up failed. State potentially inconsistent.")
503
+
504
+ finally:
505
+ # Reset state only if it was Scaling Up and didn't end in Error
506
+ current_state = self.topology.get_scaling_state().get(stage_name)
507
+ if current_state == "Scaling Up":
508
+ self.topology.update_scaling_state(stage_name, "Idle")
509
+
510
+ def _handle_scale_down(self, stage_name: str, current_replicas: List[Any], target_count: int) -> None:
511
+ """
512
+ Handles scaling down: initiates stop on actors, registers handles with
513
+ the topology for pending removal if stop was successfully initiated.
514
+ """
515
+ current_count = len(current_replicas)
516
+ num_to_remove = current_count - target_count
517
+ logger.info(f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove}).")
518
+
519
+ # Basic validation
520
+ if num_to_remove <= 0:
521
+ logger.warning(f"[ScaleDown-{stage_name}] Invalid num_to_remove {num_to_remove}. Aborting.")
522
+ return
523
+
524
+ # Identify actors to remove (last N)
525
+ actors_to_remove = current_replicas[-num_to_remove:]
526
+ logger.debug(f"[ScaleDown-{stage_name}] Identified {len(actors_to_remove)} actors for removal.")
527
+
528
+ actors_to_register_map: Dict[str, List[Tuple[Any, ray.ObjectRef]]] = defaultdict(list)
529
+ stop_initiation_failures = 0
530
+
531
+ for actor in actors_to_remove:
532
+ actor_id_str = str(actor)
533
+ try:
534
+ # Call stop(), which now returns shutdown future
535
+ shutdown_future = actor.stop.remote()
536
+ actors_to_register_map[stage_name].append((actor, shutdown_future))
537
+ logger.debug(f"[ScaleDown-{stage_name}] Submitted stop() call for actor '{actor_id_str}'.")
538
+ except Exception as e:
539
+ logger.error(
540
+ f"[ScaleDown-{stage_name}] Error submitting stop() for actor '{actor_id_str}': "
541
+ f"{e}. Cannot register.",
542
+ exc_info=False,
543
+ )
544
+ stop_initiation_failures += 1
545
+
546
+ # Register actors pending removal (with their shutdown futures)
547
+ if actors_to_register_map:
548
+ num_registered = sum(len(v) for v in actors_to_register_map.values())
549
+ logger.debug(
550
+ f"[ScaleDown-{stage_name}] Registering {num_registered} "
551
+ f"actor handles with topology for shutdown monitoring."
552
+ )
553
+ try:
554
+ self.topology.register_actors_pending_removal(actors_to_register_map)
555
+ except Exception as e:
556
+ logger.error(
557
+ f"[ScaleDown-{stage_name}] CRITICAL - Failed to register actors pending removal with topology: {e}",
558
+ exc_info=True,
559
+ )
560
+ self.topology.update_scaling_state(stage_name, "Error")
561
+ elif actors_to_remove:
562
+ logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")
563
+
564
+ total_attempted = len(actors_to_remove)
565
+ logger.info(
566
+ f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
567
+ f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
568
+ )
569
+
570
+ def _scale_stage(self, stage_name: str, new_replica_count: int) -> None:
571
+ """Orchestrates scaling using topology for state and info."""
572
+ logger.debug(f"[ScaleStage-{stage_name}] Request for target count: {new_replica_count}")
573
+
574
+ # --- Use Topology Accessors ---
575
+ stage_info = self.topology.get_stage_info(stage_name)
576
+ current_replicas = self.topology.get_stage_actors().get(stage_name, []) # Get current actors safely
577
+ current_count = len(current_replicas)
578
+
579
+ if stage_info is None:
580
+ logger.error(f"[ScaleStage-{stage_name}] Stage info not found. Cannot scale.")
581
+ return
582
+
583
+ target_count = max(stage_info.min_replicas, min(new_replica_count, stage_info.max_replicas))
584
+ if target_count != new_replica_count:
585
+ logger.debug(
586
+ f"[ScaleStage-{stage_name}] Count {new_replica_count} adjusted to {target_count} "
587
+ f"by bounds ({stage_info.min_replicas}/{stage_info.max_replicas})."
588
+ )
589
+
590
+ if target_count == current_count:
591
+ logger.debug(f"[ScaleStage-{stage_name}] Already at target count ({current_count}). No action.")
592
+ # Reset state if needed
593
+ if self.topology.get_scaling_state().get(stage_name) != "Idle":
594
+ self.topology.update_scaling_state(stage_name, "Idle")
595
+ return
596
+
597
+ # --- Delegate ---
598
+ try:
599
+ if target_count > current_count:
600
+ self._handle_scale_up(stage_info, current_count, target_count)
601
+ else: # target_count < current_count
602
+ # Pass the list of actors we know about *now*
603
+ self._handle_scale_down(stage_name, current_replicas, target_count)
604
+ except RuntimeError as e: # Catch specific errors from handlers
605
+ logger.error(f"[ScaleStage-{stage_name}] Scaling failed: {e}", exc_info=False)
606
+ # State should have been set to "Error" within the handler
607
+ except Exception as e:
608
+ logger.error(f"[ScaleStage-{stage_name}] Unexpected error: {e}", exc_info=True)
609
+ self.topology.update_scaling_state(stage_name, "Error") # Ensure error state
610
+
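A small worked example of the bounds clamping performed in _scale_stage above (the replica bounds are arbitrary):

    # target_count = max(min_replicas, min(new_replica_count, max_replicas))
    max(1, min(7, 4))  # requested 7 with bounds (1, 4) -> clamped to 4
    max(1, min(0, 4))  # requested 0 with bounds (1, 4) -> raised to 1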
611
+ def _is_pipeline_quiet(self) -> bool:
612
+ """Checks if pipeline is quiet using topology state and stats collector."""
613
+
614
+ # Check topology state first
615
+ if self.topology.get_is_flushing():
616
+ logger.debug("Pipeline quiet check: False (Flush in progress via topology state)")
617
+ return False
618
+
619
+ # Time check
620
+ time_since_last_flush = time.time() - self._last_queue_flush_time
621
+ if time_since_last_flush < self.queue_flush_interval_seconds:
622
+ return False
623
+
624
+ # Stats check (same as before)
625
+ current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
626
+ self.stats_collector.get_latest_stats()
627
+ )
628
+ last_update_age = time.time() - last_update_time
629
+ max_stats_age_for_quiet = max(10.0, self._stats_collection_interval_seconds * 2.5)
630
+
631
+ if not stats_were_successful:
632
+ logger.warning(f"Pipeline quiet check: False (Stats failed {last_update_age:.1f}s ago).")
633
+ return False
634
+
635
+ if last_update_age > max_stats_age_for_quiet:
636
+ logger.warning(
637
+ f"Pipeline quiet check: False (Stats too old: {last_update_age:.1f}s > {max_stats_age_for_quiet:.1f}s)."
638
+ )
639
+ return False
640
+
641
+ if not current_stage_stats:
642
+ logger.warning("Pipeline quiet check: False (No stats currently available).")
643
+ return False
644
+
645
+ # Activity check
646
+ is_quiet = global_in_flight <= self.quiet_period_threshold
647
+
648
+ if is_quiet:
649
+ logger.info(f"Pipeline IS quiet. In-Flight: {global_in_flight} <= Threshold: {self.quiet_period_threshold}")
650
+
651
+ return is_quiet
652
+
653
+ def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
654
+ """
655
+ Actively monitors pipeline drain using direct calls to the stats collector.
656
+ """
657
+ start_time = time.time()
658
+ logger.info(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
659
+ last_in_flight = -1
660
+ drain_check_interval = 1.0 # Check every second
661
+
662
+ while True:
663
+ current_time = time.time()
664
+ elapsed_time = current_time - start_time
665
+
666
+ if elapsed_time >= timeout_seconds:
667
+ logger.warning(f"Pipeline drain timed out after {elapsed_time:.1f}s. Last In-Flight: {last_in_flight}")
668
+ return False
669
+
670
+ # --- Trigger immediate stats collection via the collector instance ---
671
+ drain_stats = {}
672
+ drain_success = False
673
+ collection_error = None
674
+
675
+ global_in_flight = -1
676
+ try:
677
+ # Use the collector's method for a one-off, blocking collection
678
+ drain_stats, global_in_flight, drain_success = self.stats_collector.collect_stats_now()
679
+ except Exception as e:
680
+ logger.error(f"[DrainWait] Critical error during direct stats collection call: {e}.", exc_info=True)
681
+ collection_error = e # Indicate failure to even run collection
682
+
683
+ # --- Process collection results ---
684
+ if global_in_flight != last_in_flight:
685
+ status_msg = (
686
+ f"Collection Success: {drain_success}"
687
+ if not collection_error
688
+ else f"Collection Error: {type(collection_error).__name__}"
689
+ )
690
+ logger.info(
691
+ f"[DrainWait] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
692
+ )
693
+ last_in_flight = global_in_flight
694
+
695
+ # --- Check for successful drain ---
696
+ # Requires BOTH in-flight=0 AND the collection reporting it was successful
697
+ if global_in_flight == 0 and drain_success and not collection_error:
698
+ logger.info(f"Pipeline confirmed drained (In-Flight=0) in {elapsed_time:.1f}s.")
699
+ return True
700
+ elif global_in_flight == 0: # Saw zero, but collection wasn't fully successful
701
+ logger.warning(
702
+ "[DrainWait] In-Flight reached 0, but stats collection had errors/timeouts."
703
+ " Cannot confirm drain yet."
704
+ )
705
+
706
+ # --- Wait ---
707
+ remaining_time = timeout_seconds - elapsed_time
708
+ sleep_duration = min(drain_check_interval, remaining_time, 1.0)  # cap the sleep; the guard below skips non-positive values
709
+ if sleep_duration > 0:
710
+ time.sleep(sleep_duration)
711
+
712
+ def _execute_queue_flush(self) -> bool:
713
+ """Executes queue flush, using topology for state and structure."""
714
+ if self.topology.get_is_flushing(): # Check topology state
715
+ logger.warning("Queue flush requested but already in progress. Ignoring.")
716
+ return False
717
+
718
+ # Set flushing state in topology
719
+ self.topology.set_flushing(True)
720
+ logger.info("--- Starting Queue Flush ---")
721
+ overall_success = False
722
+ source_actors_paused = []
723
+ pause_refs = []
724
+ new_edge_queues_map: Optional[Dict[str, Tuple[Any, int]]] = None
725
+
726
+ try:
727
+ # --- Get structure snapshots from topology ---
728
+ # Use lock context for multiple reads if needed, but individual accessors are locked too
729
+ current_stages = self.topology.get_stages_info()
730
+ current_stage_actors = self.topology.get_stage_actors()
731
+ current_edge_queues = self.topology.get_edge_queues()
732
+ current_connections = self.topology.get_connections()
733
+
734
+ # --- 1. Pause Source Stages (using snapshots) ---
735
+ logger.info("Pausing source stages...")
736
+ pause_timeout = 60.0
737
+ for stage in current_stages:
738
+ if stage.is_source:
739
+ actors = current_stage_actors.get(stage.name, [])
740
+ for actor in actors:
741
+ if hasattr(actor, "pause") and hasattr(actor.pause, "remote"):
742
+ try:
743
+ pause_refs.append(actor.pause.remote())
744
+ source_actors_paused.append(actor)
745
+ except Exception as e:
746
+ logger.error(f"Failed sending pause to {actor}: {e}")
747
+ if pause_refs:
748
+ logger.info(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
749
+ try:
750
+ ray.get(pause_refs, timeout=pause_timeout)
751
+ logger.info(f"{len(pause_refs)} sources acknowledged pause.")
752
+ except GetTimeoutError:
753
+ logger.warning(f"Timeout waiting for {len(pause_refs)} sources to pause.")
754
+ except Exception as e:
755
+ logger.error(f"Error waiting for sources pause: {e}. Proceeding cautiously.")
756
+
757
+ # --- 2. Wait for Drain ---
758
+ logger.info("Waiting for pipeline to drain...")
759
+ if not self._wait_for_pipeline_drain(self.queue_flush_drain_timeout_seconds):
760
+ raise RuntimeError("Pipeline drain failed or timed out, aborting flush.")
761
+
762
+ # --- 3. Create New Queues (using snapshot) ---
763
+ logger.info("Creating new replacement queues...")
764
+ new_edge_queues_map = {}
765
+ for queue_name, (_, queue_size) in current_edge_queues.items():
766
+ try:
767
+ new_edge_queues_map[queue_name] = (
768
+ RayQueue(maxsize=queue_size, actor_options={"max_restarts": 0}),
769
+ queue_size,
770
+ )
771
+ logger.debug(f"Created new queue: {queue_name}")
772
+ except Exception as e:
773
+ raise RuntimeError(f"Failed to create new queue '{queue_name}'.") from e
774
+
775
+ # --- 4. Re-wire Actors to New Queues (using snapshots) ---
776
+ logger.info("Re-wiring actors to new queues...")
777
+ wiring_refs = []
778
+ wiring_timeout = 120.0
779
+ for from_stage_name, conns in current_connections.items():
780
+ for to_stage_name, _ in conns:
781
+ queue_name = f"{from_stage_name}_to_{to_stage_name}"
782
+ if queue_name not in new_edge_queues_map:
783
+ raise RuntimeError(f"New queue missing for {queue_name}")
784
+ new_queue_actor, _ = new_edge_queues_map[queue_name]
785
+
786
+ # Re-wire sources outputs
787
+ for actor in current_stage_actors.get(from_stage_name, []):
788
+ try:
789
+ wiring_refs.append(actor.set_output_queue.remote(new_queue_actor))
790
+ except Exception as e:
791
+ logger.error(f"Failed sending set_output_queue to {actor}: {e}")
792
+
793
+ # Re-wire destinations inputs
794
+ for actor in current_stage_actors.get(to_stage_name, []):
795
+ try:
796
+ wiring_refs.append(actor.set_input_queue.remote(new_queue_actor))
797
+ except Exception as e:
798
+ logger.error(f"Failed sending set_input_queue to {actor}: {e}")
799
+
800
+ if wiring_refs:
801
+ logger.debug(f"Waiting up to {wiring_timeout}s for {len(wiring_refs)} actors to re-wire...")
802
+ try:
803
+ ready, not_ready = ray.wait(wiring_refs, num_returns=len(wiring_refs), timeout=wiring_timeout)
804
+ if not_ready:
805
+ raise RuntimeError("Actor re-wiring timed out or failed.")
806
+ ray.get(ready) # Check for internal errors
807
+ logger.debug(f"{len(ready)} actors re-wired successfully.")
808
+ except Exception as e:
809
+ raise RuntimeError("Actor re-wiring failed.") from e
810
+
811
+ # --- 5. Update Topology State (Commit Point) ---
812
+ logger.info("Committing new queues to pipeline topology.")
813
+ self.topology.set_edge_queues(new_edge_queues_map) # Commit the change
814
+ overall_success = True
815
+
816
+ except Exception as e:
817
+ logger.error(f"Error during queue flush: {e}", exc_info=True)
818
+ overall_success = False
819
+
820
+ finally:
821
+ # --- 6. Resume Source Stages (Always attempt) ---
822
+ if source_actors_paused:
823
+ logger.info(f"Attempting to resume {len(source_actors_paused)} source actors...")
824
+ resume_timeout = 30.0
825
+ resume_refs = []
826
+ for actor in source_actors_paused:
827
+ try:
828
+ resume_refs.append(actor.resume.remote())
829
+ except Exception as e:
830
+ logger.error(f"Failed sending resume to {actor}: {e}")
831
+ if resume_refs:
832
+ logger.info(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
833
+ try:
834
+ ray.get(resume_refs, timeout=resume_timeout)
835
+ logger.info(f"{len(resume_refs)} sources resumed.")
836
+ except GetTimeoutError:
837
+ logger.warning(f"Timeout waiting for {len(resume_refs)} sources to resume.")
838
+ except Exception as e:
839
+ logger.error(f"Error waiting for sources resume: {e}")
840
+
841
+ # Update flush timestamp only on success
842
+ if overall_success:
843
+ self._last_queue_flush_time = time.time()
844
+ logger.info("--- Queue Flush Completed Successfully ---")
845
+ else:
846
+ logger.error("--- Queue Flush Failed ---")
847
+
848
+ # Reset flushing state in topology
849
+ self.topology.set_flushing(False)
850
+
851
+ return overall_success
852
+
853
+ def request_queue_flush(self, force: bool = False) -> None:
854
+ """Requests a queue flush, checking topology state."""
855
+ logger.info(f"Manual queue flush requested (force={force}).")
856
+ if self.topology.get_is_flushing(): # Check topology
857
+ logger.warning("Flush already in progress.")
858
+ return
859
+ if force or self._is_pipeline_quiet():
860
+ # Consider running _execute_queue_flush in a separate thread
861
+ # to avoid blocking the caller, especially if 'force=True'.
862
+ # For now, run synchronously:
863
+ self._execute_queue_flush()
864
+ else:
865
+ logger.info("Manual flush denied: pipeline not quiet or interval not met.")
866
+
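Assuming a running pipeline instance named pipeline, the public flush entry point above could be used as follows (illustrative only):

    pipeline.request_queue_flush()            # flushes only if the pipeline is quiet and the flush interval has elapsed
    pipeline.request_queue_flush(force=True)  # bypasses the quiet check; note it still runs synchronously in the caller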
867
+ def _gather_controller_metrics(
868
+ self, current_stage_stats: Dict[str, Dict[str, int]], global_in_flight: int
869
+ ) -> Dict[str, Dict[str, Any]]:
870
+ """Gathers metrics using provided stats and topology."""
871
+ logger.debug("[ScalingMetrics] Gathering metrics for controllers...")
872
+ current_stage_metrics = {}
873
+
874
+ # Use topology accessors
875
+ current_stages = self.topology.get_stages_info()
876
+ current_actors = self.topology.get_stage_actors() # Snapshot
877
+
878
+ for stage in current_stages:
879
+ stage_name = stage.name
880
+ replicas = len(current_actors.get(stage_name, []))
881
+ stats = current_stage_stats.get(stage_name, {"processing": 0, "in_flight": 0})
882
+ processing = stats.get("processing", 0)
883
+ in_flight = stats.get("in_flight", 0)
884
+ queue_depth = max(0, in_flight - processing)
885
+
886
+ current_stage_metrics[stage_name] = {
887
+ "replicas": replicas,
888
+ "queue_depth": queue_depth,
889
+ "processing": processing,
890
+ "in_flight": in_flight,
891
+ "min_replicas": stage.min_replicas,
892
+ "max_replicas": stage.max_replicas,
893
+ "pipeline_in_flight": global_in_flight,
894
+ }
895
+
896
+ logger.debug(f"[ScalingMetrics] Gathered metrics for {len(current_stage_metrics)} stages.")
897
+ return current_stage_metrics
898
+
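For example (hypothetical numbers), a stage reporting {"processing": 4, "in_flight": 12} yields queue_depth = max(0, 12 - 4) = 8, i.e. eight items waiting in edge queues rather than being actively processed.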
899
+ def _get_current_global_memory(self) -> int:
900
+ """
901
+ Safely retrieves the current global system memory usage (used, not free) in MB.
902
+ Uses the previous measurement as a fallback only if the current read fails.
903
+
904
+ Returns:
905
+ int: Current global memory usage (RSS/used) in MB. Returns previous value
906
+ or 0 if the read fails and no previous value exists.
907
+ """
908
+ try:
909
+ # psutil.virtual_memory().used provides total RAM used by processes
910
+ current_global_memory_bytes = psutil.virtual_memory().used
911
+ current_global_memory_mb = int(current_global_memory_bytes / (1024 * 1024))
912
+ logger.debug(f"[ScalingMemCheck] Current global memory usage (used): {current_global_memory_mb} MB")
913
+
914
+ return current_global_memory_mb
915
+ except Exception as e:
916
+ logger.error(
917
+ f"[ScalingMemCheck] Failed to get current system memory usage: {e}. "
918
+ f"Attempting to use previous value ({self.prev_global_memory_usage} MB).",
919
+ exc_info=False,
920
+ )
921
+
922
+ # Use previous value if available, otherwise default to 0 (less ideal, but avoids None)
923
+ # Returning 0 might incorrectly signal low memory usage if it's the first read that fails.
924
+ return self.prev_global_memory_usage if self.prev_global_memory_usage is not None else 0
925
+
926
+ def _calculate_scaling_adjustments(
927
+ self, current_stage_metrics: Dict[str, Dict[str, Any]], global_in_flight: int, current_global_memory_mb: int
928
+ ) -> Dict[str, int]:
929
+ """Runs controllers to get target replica counts using topology for edge count."""
930
+ logger.debug("[ScalingCalc] Calculating adjustments via PID and RCM...")
931
+ # Get edge count from topology
932
+ num_edges = len(self.topology.get_edge_queues())
933
+
934
+ try:
935
+ initial_proposals = self.pid_controller.calculate_initial_proposals(current_stage_metrics)
936
+ logger.debug(
937
+ "[ScalingCalc] PID Initial Proposals:"
938
+ f" { {n: p.proposed_replicas for n, p in initial_proposals.items()} }" # noqa E201,E202
939
+ )
940
+
941
+ final_adjustments = self.constraint_manager.apply_constraints(
942
+ initial_proposals=initial_proposals,
943
+ global_in_flight=global_in_flight,
944
+ current_global_memory_usage_mb=current_global_memory_mb,
945
+ num_edges=num_edges,
946
+ )
947
+ logger.debug(f"[ScalingCalc] RCM Final Adjustments: {final_adjustments}")
948
+ return final_adjustments
949
+ except Exception as e:
950
+ logger.error(f"[ScalingCalc] Error during controller execution: {e}", exc_info=True)
951
+ logger.warning("[ScalingCalc] Falling back to current replica counts.")
952
+ return {name: metrics.get("replicas", 0) for name, metrics in current_stage_metrics.items()}
953
+
954
+ def _apply_scaling_actions(self, final_adjustments: Dict[str, int]) -> None:
955
+ """Applies scaling by calling _scale_stage, using topology for validation."""
956
+ stages_needing_action = []
957
+ current_actors_map = self.topology.get_stage_actors() # Snapshot
958
+
959
+ for stage_name, target_replica_count in final_adjustments.items():
960
+ current_count = len(current_actors_map.get(stage_name, []))
961
+ stage_info = self.topology.get_stage_info(stage_name) # Get info from topology
962
+
963
+ if not stage_info:
964
+ logger.warning(f"[ScalingApply] Cannot apply scaling for unknown stage '{stage_name}'. Skipping.")
965
+ continue
966
+
967
+ # Clamp target using StageInfo from topology
968
+ clamped_target = max(stage_info.min_replicas, min(stage_info.max_replicas, target_replica_count))
969
+ if clamped_target != target_replica_count:
970
+ logger.warning(
971
+ f"[ScalingApply-{stage_name}] Target {target_replica_count} clamped to {clamped_target} by bounds."
972
+ )
973
+ target_replica_count = clamped_target
974
+
975
+ if target_replica_count != current_count:
976
+ stages_needing_action.append((stage_name, target_replica_count))
977
+ logger.info(
978
+ f"[ScalingApply-{stage_name}] Action: Current={current_count}, "
979
+ f"Target={target_replica_count} (Min={stage_info.min_replicas}, Max={stage_info.max_replicas})"
980
+ )
981
+
982
+ if not stages_needing_action:
983
+ logger.debug("[ScalingApply] No scaling actions required.")
984
+ return
985
+
986
+ max_workers = min(len(stages_needing_action), 8)
987
+ logger.debug(
988
+ f"[ScalingApply] Submitting {len(stages_needing_action)} scaling actions ({max_workers} workers)..."
989
+ )
990
+ action_results = {}
991
+
992
+ with concurrent.futures.ThreadPoolExecutor(
993
+ max_workers=max_workers, thread_name_prefix="ScalingAction"
994
+ ) as executor:
995
+ future_to_stage = {
996
+ executor.submit(self._scale_stage, stage_name, target_count): stage_name
997
+ for stage_name, target_count in stages_needing_action
998
+ }
999
+ wait_timeout = 180.0
1000
+ logger.debug(f"[ScalingApply] Waiting up to {wait_timeout}s for actions...")
1001
+ for future in concurrent.futures.as_completed(future_to_stage, timeout=wait_timeout):
1002
+ stage_name = future_to_stage[future]
1003
+ try:
1004
+ result = future.result() # Raises exception if _scale_stage failed internally
1005
+ action_results[stage_name] = {"status": "completed", "result": result}
1006
+ logger.debug(f"[ScalingApply-{stage_name}] Action completed.")
1007
+ except TimeoutError:
1008
+ logger.error(f"[ScalingApply-{stage_name}] Action timed out ({wait_timeout}s).")
1009
+ action_results[stage_name] = {"status": "timeout"}
1010
+ self.topology.update_scaling_state(stage_name, "Error") # Mark as error on timeout
1011
+ except Exception as exc:
1012
+ logger.error(f"[ScalingApply-{stage_name}] Action failed: {exc}", exc_info=True)
1013
+ action_results[stage_name] = {"status": "error", "exception": exc}
1014
+ # State should be set to Error inside _scale_stage or its handlers on failure
1015
+
1016
+ completed = sum(1 for r in action_results.values() if r["status"] == "completed")
1017
+ errors = sum(1 for r in action_results.values() if r["status"] == "error")
1018
+ timeouts = sum(1 for r in action_results.values() if r["status"] == "timeout")
1019
+ logger.info(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
1020
+
1021
+ def _perform_scaling_and_maintenance(self) -> None:
1022
+ """Orchestrates scaling/maintenance using topology and stats collector."""
1023
+ logger.debug("--- Performing Scaling & Maintenance Cycle ---")
1024
+
1025
+ if not self.dynamic_memory_scaling:
1026
+ logger.debug("Dynamic memory scaling disabled. Skipping cycle.")
1027
+ return
1028
+
1029
+ cycle_start_time = time.time()
1030
+
1031
+ # Check flushing state via topology
1032
+ if self.topology.get_is_flushing():
1033
+ logger.debug("Skipping scaling cycle: Queue flush in progress (topology state).")
1034
+ return
1035
+
1036
+ # --- Check for quietness for flushing (uses topology state via helper) ---
1037
+ try:
1038
+ if self._is_pipeline_quiet():
1039
+ logger.info("Pipeline quiet, initiating queue flush.")
1040
+ flush_success = self._execute_queue_flush() # Uses topology internally
1041
+ logger.info(f"Automatic queue flush completed. Success: {flush_success}")
1042
+ return # Skip scaling if flush occurred
1043
+ except Exception as e:
1044
+ logger.error(f"Error during quiet check or flush: {e}. Skipping cycle.", exc_info=True)
1045
+ return
1046
+
1047
+ # --- Get & Validate Stats ---
1048
+ current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
1049
+ self.stats_collector.get_latest_stats()
1050
+ )
1051
+
1052
+ last_update_age = time.time() - last_update_time
1053
+ max_stats_age_for_scaling = max(15.0, self._stats_collection_interval_seconds)
1054
+ if not current_stage_stats or not stats_were_successful or last_update_age > max_stats_age_for_scaling:
1055
+ status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
1056
+ logger.warning(
1057
+ f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
1058
+ )
1059
+ return
1060
+
1061
+ # --- Gather Metrics (uses topology via helper) ---
1062
+ current_stage_metrics = self._gather_controller_metrics(current_stage_stats, global_in_flight)
1063
+ if not current_stage_metrics:
1064
+ logger.error("[Scaling] Failed gather metrics. Skipping.")
1065
+ return
1066
+
1067
+ # --- Get Memory Usage ---
1068
+ current_global_memory_mb = self._get_current_global_memory()
1069
+
1070
+ # --- Calculate Scaling Adjustments (uses topology via helper) ---
1071
+ final_adjustments = self._calculate_scaling_adjustments(
1072
+ current_stage_metrics, global_in_flight, current_global_memory_mb
1073
+ )
1074
+
1075
+ # --- Update Memory Usage *After* Decision ---
1076
+ self.prev_global_memory_usage = current_global_memory_mb
1077
+
1078
+ # --- Apply Scaling Actions (uses topology via helper) ---
1079
+ self._apply_scaling_actions(final_adjustments)
1080
+
1081
+ logger.debug(f"--- Scaling & Maintenance Cycle Complete (Duration: {time.time() - cycle_start_time:.2f}s) ---")
1082
+
1083
+ # --- Lifecycle Methods for Monitoring/Scaling Threads ---
1084
+ def _scaling_loop(self, interval: float) -> None:
1085
+ """Main loop for the scaling thread."""
1086
+ logger.info(f"Scaling loop started. Interval: {interval}s")
1087
+ while self._scaling_monitoring:
1088
+ try:
1089
+ self._perform_scaling_and_maintenance()
1090
+ except Exception as e:
1091
+ logger.error(f"Error in scaling loop: {e}", exc_info=True)
1092
+
1093
+ sleep_time = interval
1094
+ if not self._scaling_monitoring:
1095
+ break
1096
+ time.sleep(sleep_time)
1097
+ logger.info("Scaling loop finished.")
1098
+
1099
+ def _start_scaling(self, poll_interval: float = 10.0) -> None:
1100
+ if not self._scaling_monitoring:
1101
+ self._scaling_monitoring = True
1102
+ self._scaling_thread = threading.Thread(target=self._scaling_loop, args=(poll_interval,), daemon=True)
1103
+ self._scaling_thread.start()
1104
+ logger.info(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
1105
+
1106
+ def _stop_scaling(self) -> None:
1107
+ if self._scaling_monitoring:
1108
+ logger.debug("Stopping scaling/maintenance thread...")
1109
+ self._scaling_monitoring = False
1110
+ if self._scaling_thread is not None:
1111
+ self._scaling_thread.join(timeout=15) # Allow more time for scaling actions
1112
+ if self._scaling_thread.is_alive():
1113
+ logger.warning("Scaling thread did not exit cleanly.")
1114
+ self._scaling_thread = None
1115
+ logger.info("Scaling/Maintenance stopped.")
1116
+
1117
+ # --- Pipeline Start/Stop ---
1118
+ def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
1119
+ """Starts actors (via topology) and background threads."""
1120
+ # Check topology for actors (indicates built)
1121
+ if not self.topology.get_stage_actors():
1122
+ logger.error("Cannot start: Pipeline not built or has no actors.")
1123
+ return
1124
+
1125
+ logger.info("Starting pipeline execution...")
1126
+ start_refs = []
1127
+ # Get actors from topology
1128
+ actors_to_start = [actor for actors in self.topology.get_stage_actors().values() for actor in actors]
1129
+
1130
+ for actor in actors_to_start:
1131
+ start_refs.append(actor.start.remote())
1132
+
1133
+ if start_refs:
1134
+ logger.debug(f"Waiting for {len(start_refs)} actors to start...")
1135
+ try:
1136
+ ray.get(start_refs, timeout=60.0)
1137
+ logger.info(f"{len(start_refs)} actors started.")
1138
+ except Exception as e:
1139
+ logger.error(f"Error/Timeout starting actors: {e}", exc_info=True)
1140
+ self.stop() # Attempt cleanup
1141
+
1142
+ raise RuntimeError("Pipeline start failed: actors did not start.") from e
1143
+
1144
+ self.stats_collector.start()
1145
+ self._start_scaling(poll_interval=scaling_poll_interval)
1146
+ logger.info("Pipeline started successfully.")
1147
+
1148
+ def stop(self) -> None:
1149
+ """Stops background threads and actors (via topology)."""
1150
+ logger.info("Stopping pipeline...")
1151
+
1152
+ # 1. Stop background threads first
1153
+ self._stop_scaling()
1154
+ self.stats_collector.stop()
1155
+
1156
+ # 2. Stop actors (using topology)
1157
+ logger.debug("Stopping all stage actors...")
1158
+ stop_refs_map: Dict[ray.ObjectRef, Any] = {}
1159
+ actors_to_kill = []
1160
+
1161
+ # Get actors snapshot from topology
1162
+ current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}
1163
+
1164
+ for stage_name, actors in current_actors.items():
1165
+ for actor in actors:
1166
+ try:
1167
+ stop_refs_map[actor.stop.remote()] = actor
1168
+ except Exception as e:
1169
+ logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Will kill.")
1170
+
1171
+ if stop_refs_map:
1172
+ stop_refs = list(stop_refs_map.keys())
1173
+ logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
1174
+ try:
1175
+ ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
1176
+ if not_ready:
1177
+ logger.warning(f"Timeout waiting for {len(not_ready)} actors to stop. Will kill.")
1178
+ actors_to_kill.extend(stop_refs_map.get(ref) for ref in not_ready if stop_refs_map.get(ref))
1179
+ logger.info(f"{len(ready)} actors stopped via stop().")
1180
+ except Exception as e:
1181
+ logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
1182
+ actors_to_kill.extend(a for a in stop_refs_map.values() if a not in actors_to_kill) # Add all on error
1183
+
1184
+ # Clear runtime state in topology
1185
+ self.topology.clear_runtime_state()
1186
+
1187
+ logger.info("Pipeline stopped.")