nv-ingest 2025.5.21.dev20250521__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic.
- nv_ingest/__init__.py +20 -0
- nv_ingest/api/__init__.py +3 -0
- nv_ingest/api/main.py +43 -0
- nv_ingest/api/v1/__init__.py +3 -0
- nv_ingest/api/v1/health.py +114 -0
- nv_ingest/api/v1/ingest.py +454 -0
- nv_ingest/framework/__init__.py +3 -0
- nv_ingest/framework/orchestration/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
- nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
- nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
- nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
- nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
- nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
- nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
- nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
- nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
- nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
- nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
- nv_ingest/framework/schemas/__init__.py +0 -0
- nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
- nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
- nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
- nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
- nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
- nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
- nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
- nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
- nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
- nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
- nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
- nv_ingest/framework/util/__init__.py +3 -0
- nv_ingest/framework/util/flow_control/__init__.py +8 -0
- nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
- nv_ingest/framework/util/service/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
- nv_ingest/framework/util/service/meta/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
- nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest/framework/util/telemetry/global_stats.py +145 -0
- nv_ingest/version.py +38 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py
@@ -0,0 +1,1187 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import threading
from collections import defaultdict
from dataclasses import dataclass

import psutil
import uuid
import ray
from ray.exceptions import GetTimeoutError
from ray.util.queue import Queue as RayQueue
from typing import Dict, Optional, List, Tuple, Any
from pydantic import BaseModel
import concurrent.futures
import logging
import time

from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import PipelineTopology, StageInfo
from nv_ingest.framework.orchestration.ray.primitives.ray_stat_collector import RayStatsCollector
from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import PIDController, ResourceConstraintManager

logger = logging.getLogger(__name__)


# --- Configuration Objects ---


@dataclass
class ScalingConfig:
    """Configuration for PID and Resource Constraint Manager based scaling."""

    dynamic_memory_scaling: bool = True
    dynamic_memory_threshold: float = 0.75
    pid_kp: float = 0.1
    pid_ki: float = 0.001
    pid_kd: float = 0.0
    pid_target_queue_depth: int = 0
    pid_penalty_factor: float = 0.1
    pid_error_boost_factor: float = 1.5
    pid_window_size: int = 10
    rcm_estimated_edge_cost_mb: int = 5000
    rcm_memory_safety_buffer_fraction: float = 0.15


@dataclass
class FlushingConfig:
    """Configuration for queue flushing behavior."""

    queue_flush_interval_seconds: int = 600
    queue_flush_drain_timeout_seconds: int = 300
    quiet_period_threshold: int = 0


@dataclass
class StatsConfig:
    """Configuration for the RayStatsCollector."""

    collection_interval_seconds: float = 10.0
    actor_timeout_seconds: float = 5.0
    queue_timeout_seconds: float = 2.0

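# Illustrative sketch (not part of the packaged file): the three config objects
# are plain dataclasses, so a caller can override individual knobs and keep the
# rest at their defaults. The values below are arbitrary examples.
#
#     custom_scaling = ScalingConfig(pid_kp=0.2, dynamic_memory_threshold=0.6)
#     custom_flushing = FlushingConfig(queue_flush_interval_seconds=300)
#     pipeline = RayPipeline(scaling_config=custom_scaling, flushing_config=custom_flushing)
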
class RayPipeline:
    """
    A structured pipeline supporting dynamic scaling and queue flushing.
    Uses PIDController and ResourceConstraintManager. Supports optional GUI display.
    Delegates statistics collection to RayStatsCollector.

    Configuration is managed via dedicated config objects (ScalingConfig, etc.).
    """

    def __init__(
        self,
        scaling_config: ScalingConfig = ScalingConfig(),
        flushing_config: FlushingConfig = FlushingConfig(),
        stats_config: StatsConfig = StatsConfig(),
    ) -> None:
        # Store config objects
        self.scaling_config = scaling_config
        self.flushing_config = flushing_config
        self.stats_config = stats_config

        # --- Instantiate Topology ---
        self.topology = PipelineTopology()

        # --- Structure Lock ---
        self._structure_lock: threading.Lock = threading.Lock()

        # --- State ---
        # self.scaling_state: Dict[str, str] = {}
        self.prev_global_memory_usage: Optional[int] = None

        # --- Build Time Config & State ---
        # Use scaling_config for these
        self.dynamic_memory_scaling = self.scaling_config.dynamic_memory_scaling
        self.dynamic_memory_threshold = self.scaling_config.dynamic_memory_threshold
        self.stage_memory_overhead: Dict[str, float] = {}

        # --- Background Threads ---
        self._scaling_thread: Optional[threading.Thread] = None
        self._scaling_monitoring = False

        # --- Queue Flushing ---
        self._last_queue_flush_time: float = time.time()
        self.queue_flush_interval_seconds = self.flushing_config.queue_flush_interval_seconds
        self.queue_flush_drain_timeout_seconds = self.flushing_config.queue_flush_drain_timeout_seconds
        self.quiet_period_threshold = self.flushing_config.quiet_period_threshold

        # --- Instantiate Autoscaling Controllers ---
        # Use scaling_config
        self.pid_controller = PIDController(
            kp=self.scaling_config.pid_kp,
            ki=self.scaling_config.pid_ki,
            kd=self.scaling_config.pid_kd,
            stage_cost_estimates={},  # Populated during build
            target_queue_depth=self.scaling_config.pid_target_queue_depth,
            window_size=self.scaling_config.pid_window_size,
            penalty_factor=self.scaling_config.pid_penalty_factor,
            error_boost_factor=self.scaling_config.pid_error_boost_factor,
        )
        logger.info("PIDController initialized using ScalingConfig.")

        try:
            total_system_memory_bytes = psutil.virtual_memory().total
            # Use scaling_config for dynamic_memory_threshold
            absolute_memory_threshold_mb = int(
                self.scaling_config.dynamic_memory_threshold * total_system_memory_bytes / (1024 * 1024)
            )
        except Exception as e:
            logger.error(f"Failed to get system memory: {e}. Using high limit.")
            absolute_memory_threshold_mb = 1_000_000  # Fallback value

        # Use scaling_config
        self.constraint_manager = ResourceConstraintManager(
            max_replicas=1,  # Updated during build
            memory_threshold=absolute_memory_threshold_mb,
            estimated_edge_cost_mb=self.scaling_config.rcm_estimated_edge_cost_mb,
            memory_safety_buffer_fraction=self.scaling_config.rcm_memory_safety_buffer_fraction,
        )
        logger.info("ResourceConstraintManager initialized using ScalingConfig.")

        # --- Instantiate Stats Collector ---
        self._stats_collection_interval_seconds = self.stats_config.collection_interval_seconds
        self.stats_collector = RayStatsCollector(
            pipeline_accessor=self,  # This dependency remains for now
            interval=self.stats_config.collection_interval_seconds,
            actor_timeout=self.stats_config.actor_timeout_seconds,
            queue_timeout=self.stats_config.queue_timeout_seconds,
        )
        logger.info("RayStatsCollector initialized using StatsConfig.")

    # --- Accessor Methods for Stats Collector (and internal use) ---

    def get_stages_info(self) -> List[StageInfo]:
        """Returns a snapshot of the current stage information."""
        return self.topology.get_stages_info()

    def get_stage_actors(self) -> Dict[str, List[Any]]:
        """Returns a snapshot of the current actors per stage."""
        return self.topology.get_stage_actors()

    def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
        """Returns a snapshot of the current edge queues."""
        return self.topology.get_edge_queues()

    def _configure_autoscalers(self) -> None:
        """Updates controllers based on current pipeline configuration via topology."""
        logger.debug("[Build-Configure] Configuring autoscalers...")
        total_max_replicas = 0
        default_cost_bytes = 100 * 1024 * 1024
        stage_overheads = {}  # Collect locally

        # Use topology accessor
        current_stages = self.topology.get_stages_info()

        for stage in current_stages:
            total_max_replicas += stage.max_replicas
            # Use estimated overhead if available (Assume it's calculated elsewhere or default)
            # For now, let's store a dummy overhead in topology during build
            overhead_bytes = default_cost_bytes  # Simplification for now
            stage_overheads[stage.name] = overhead_bytes  # Store locally first
            cost_mb = max(1, int(overhead_bytes / (1024 * 1024)))
            # Update controller directly (or via dedicated method if preferred)
            self.pid_controller.stage_cost_estimates[stage.name] = cost_mb

        # Update topology with collected overheads
        self.topology.set_stage_memory_overhead(stage_overheads)

        # Update constraint manager
        self.constraint_manager.max_replicas = total_max_replicas

        logger.info(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
        logger.debug(f"[Build-Configure] PID stage cost estimates (MB): {self.pid_controller.stage_cost_estimates}")

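    # Illustrative arithmetic (not in the packaged file): on a host with 128 GB of
    # RAM and the default dynamic_memory_threshold of 0.75, the absolute threshold
    # handed to the ResourceConstraintManager is
    # int(0.75 * 128 * 1024**3 / 1024**2) = 98304 MB. Likewise, the default
    # per-stage cost estimate of 100 * 1024 * 1024 bytes reduces to 100 MB per replica.
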
    def _instantiate_initial_actors(self) -> None:
        """Instantiates initial actors and updates topology."""
        logger.info("[Build-Actors] Instantiating initial stage actors (min_replicas)...")
        # Use topology accessor
        current_stages = self.topology.get_stages_info()

        for stage in current_stages:
            replicas = []

            if not self.dynamic_memory_scaling:
                num_initial_actors = stage.max_replicas
            else:
                num_initial_actors = (
                    max(stage.min_replicas, 1) if stage.is_source or stage.is_sink else stage.min_replicas
                )

            if num_initial_actors > 0:
                logger.debug(f"[Build-Actors] Stage '{stage.name}' creating {num_initial_actors} initial actor(s).")
                for i in range(num_initial_actors):
                    actor_name = f"{stage.name}_{uuid.uuid4()}"
                    logger.debug(
                        f"[Build-Actors] Creating actor '{actor_name}' ({i + 1}/{num_initial_actors})"
                        f" for '{stage.name}'"
                    )
                    try:
                        actor = stage.callable.options(
                            name=actor_name, max_concurrency=10, max_restarts=0, lifetime="detached"
                        ).remote(config=stage.config)
                        replicas.append(actor)
                    except Exception as e:
                        logger.error(f"[Build-Actors] Failed create actor '{actor_name}': {e}", exc_info=True)
                        raise RuntimeError(f"Build failed: actor creation error for stage '{stage.name}'") from e

            # Update topology for this stage
            self.topology.set_actors_for_stage(stage.name, replicas)
            logger.debug(f"[Build-Actors] Stage '{stage.name}' initial actors set in topology: count={len(replicas)}")

        logger.info("[Build-Actors] Initial actor instantiation complete.")

    def _create_and_wire_edges(self) -> List[ray.ObjectRef]:
        """Creates queues, wires actors (using topology), and updates topology."""
        logger.info("[Build-Wiring] Creating and wiring edges...")
        wiring_refs = []
        new_edge_queues: Dict[str, Tuple[Any, int]] = {}

        current_connections = self.topology.get_connections()
        current_stage_actors = self.topology.get_stage_actors()  # Gets copy

        for from_stage_name, connections_list in current_connections.items():
            for to_stage_name, queue_size in connections_list:
                queue_name = f"{from_stage_name}_to_{to_stage_name}"
                logger.debug(f"[Build-Wiring] Creating queue '{queue_name}' (size {queue_size}) and wiring.")
                try:
                    edge_queue = RayQueue(maxsize=queue_size, actor_options={"max_restarts": 0})
                    new_edge_queues[queue_name] = (edge_queue, queue_size)

                    # Wire using current actors from topology snapshot
                    source_actors = current_stage_actors.get(from_stage_name, [])
                    for actor in source_actors:
                        wiring_refs.append(actor.set_output_queue.remote(edge_queue))

                    dest_actors = current_stage_actors.get(to_stage_name, [])
                    for actor in dest_actors:
                        wiring_refs.append(actor.set_input_queue.remote(edge_queue))

                except Exception as e:
                    logger.error(f"[Build-Wiring] Failed create/wire queue '{queue_name}': {e}", exc_info=True)
                    raise RuntimeError(f"Build failed: queue wiring error for '{queue_name}'") from e

        # Update topology with the new queues
        self.topology.set_edge_queues(new_edge_queues)

        logger.debug(f"[Build-Wiring] Submitted {len(wiring_refs)} wiring calls. Queues set in topology.")
        return wiring_refs

    @staticmethod
    def _wait_for_wiring(wiring_refs: List[ray.ObjectRef]) -> None:
        """Waits for remote wiring calls to complete. (Static, no changes needed)."""
        if not wiring_refs:
            logger.debug("[Build-WaitWiring] No wiring calls.")
            return
        logger.debug(f"[Build-WaitWiring] Waiting for {len(wiring_refs)} wiring calls...")
        try:
            ray.get(wiring_refs)
            logger.debug("[Build-WaitWiring] All wiring calls completed.")
        except Exception as e:
            logger.error(f"[Build-WaitWiring] Error during wiring confirmation: {e}", exc_info=True)
            raise RuntimeError("Build failed: error confirming initial wiring") from e

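    # Naming sketch (not in the packaged file): an edge between stages named, say,
    # "pdf_extractor" and "text_splitter" gets a queue keyed
    # "pdf_extractor_to_text_splitter"; every replica of the upstream stage receives
    # set_output_queue(<that queue>) and every replica of the downstream stage
    # receives set_input_queue(<that queue>).
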
    def add_source(
        self, *, name: str, source_actor: Any, config: BaseModel, min_replicas: int = 1, max_replicas: int = 1
    ) -> "RayPipeline":
        if min_replicas < 1:
            logger.warning(f"Source stage '{name}': min_replicas must be >= 1. Overriding.")
            min_replicas = 1

        stage_info = StageInfo(
            name=name,
            callable=source_actor,
            config=config,
            is_source=True,
            min_replicas=min_replicas,
            max_replicas=max_replicas,
        )
        self.topology.add_stage(stage_info)  # Delegate

        return self

    def add_stage(
        self, *, name: str, stage_actor: Any, config: BaseModel, min_replicas: int = 0, max_replicas: int = 1
    ) -> "RayPipeline":
        if min_replicas < 0:
            logger.warning(f"Stage '{name}': min_replicas cannot be negative. Overriding to 0.")
            min_replicas = 0
        stage_info = StageInfo(
            name=name, callable=stage_actor, config=config, min_replicas=min_replicas, max_replicas=max_replicas
        )
        self.topology.add_stage(stage_info)  # Delegate

        return self

    def add_sink(
        self, *, name: str, sink_actor: Any, config: BaseModel, min_replicas: int = 1, max_replicas: int = 1
    ) -> "RayPipeline":
        # Sink min_replicas can realistically be 0 if data drain is optional/best-effort? Let's allow 0.
        if min_replicas < 0:
            logger.warning(f"Sink stage '{name}': min_replicas cannot be negative. Overriding to 0.")
            min_replicas = 0
        stage_info = StageInfo(
            name=name,
            callable=sink_actor,
            config=config,
            is_sink=True,
            min_replicas=min_replicas,
            max_replicas=max_replicas,
        )
        self.topology.add_stage(stage_info)  # Delegate

        return self

    # --- Method for defining connections ---
    def make_edge(self, from_stage: str, to_stage: str, queue_size: int = 100) -> "RayPipeline":
        try:
            self.topology.add_connection(from_stage, to_stage, queue_size)  # Delegate (includes validation)
        except ValueError as e:
            logger.error(f"make_edge failed: {e}")
            raise  # Re-raise the error
        return self

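    # Illustrative sketch (not in the packaged file): assembling a three-stage
    # pipeline with the fluent API above. `MySourceActor`, `MyWorkerActor`,
    # `MySinkActor`, and the `*Config` pydantic models are hypothetical stand-ins
    # for real stage implementations.
    #
    #     pipeline = (
    #         RayPipeline()
    #         .add_source(name="source", source_actor=MySourceActor, config=SourceConfig(), max_replicas=2)
    #         .add_stage(name="worker", stage_actor=MyWorkerActor, config=WorkerConfig(), max_replicas=8)
    #         .add_sink(name="sink", sink_actor=MySinkActor, config=SinkConfig())
    #         .make_edge("source", "worker", queue_size=100)
    #         .make_edge("worker", "sink", queue_size=100)
    #     )
    #     stage_actors = pipeline.build()
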
    # ----- Pipeline Build Process ---
    def build(self) -> Dict[str, List[Any]]:
        """Builds the pipeline: configures, instantiates, wires, using topology."""
        logger.info("--- Starting Pipeline Build Process ---")
        try:
            if not self.topology.get_stages_info():
                logger.error("Build failed: No stages defined in topology.")
                return {}

            # Steps interact with self.topology
            self._configure_autoscalers()
            self._instantiate_initial_actors()
            wiring_futures = self._create_and_wire_edges()
            self._wait_for_wiring(wiring_futures)

            logger.info("--- Pipeline Build Completed Successfully ---")
            return self.topology.get_stage_actors()  # Return actors from topology

        except RuntimeError as e:
            logger.critical(f"Pipeline build failed: {e}", exc_info=False)
            # Clean up topology runtime state?
            self.topology.clear_runtime_state()
            return {}
        except Exception as e:
            logger.critical(f"Unexpected error during pipeline build: {e}", exc_info=True)
            self.topology.clear_runtime_state()
            return {}

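    # Usage sketch (not in the packaged file): build() returns {} rather than
    # raising on failure, so callers should treat an empty mapping as a failed build.
    #
    #     actors = pipeline.build()
    #     if not actors:
    #         raise SystemExit("pipeline build failed; see logs")
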
    # --- Scaling Logic ---
    @staticmethod
    def _create_single_replica(stage_info: StageInfo) -> Any:
        """Creates a single new Ray actor replica for the given stage."""
        actor_name = f"{stage_info.name}_{uuid.uuid4()}"
        logger.debug(f"[ScaleUtil] Creating new actor '{actor_name}' for stage '{stage_info.name}'")
        try:
            new_actor = stage_info.callable.options(
                name=actor_name, max_concurrency=10, max_restarts=0, lifetime="detached"
            ).remote(config=stage_info.config)

            return new_actor
        except Exception as e:
            logger.error(
                f"[ScaleUtil] Failed to create actor '{actor_name}' for stage '{stage_info.name}':" f" {e}",
                exc_info=True,
            )

            # Propagate error to halt the scaling operation
            raise RuntimeError(f"Actor creation failed for stage '{stage_info.name}' during scale up") from e

    def _get_wiring_refs_for_actor(self, actor: Any, stage_name: str) -> List[ray.ObjectRef]:
        """Gets wiring futures for a single actor using topology for queues/connections."""
        wiring_refs = []

        # Use topology accessors
        connections = self.topology.get_connections()
        edge_queues = self.topology.get_edge_queues()

        # Wire outputs
        if stage_name in connections:
            for to_stage, _ in connections[stage_name]:
                queue_name = f"{stage_name}_to_{to_stage}"
                if queue_name in edge_queues:
                    edge_queue, _ = edge_queues[queue_name]
                    wiring_refs.append(actor.set_output_queue.remote(edge_queue))

        # Wire inputs
        for from_stage, conns in connections.items():
            for to_stage, _ in conns:
                if to_stage == stage_name:
                    queue_name = f"{from_stage}_to_{stage_name}"
                    if queue_name in edge_queues:
                        edge_queue, _ = edge_queues[queue_name]
                        wiring_refs.append(actor.set_input_queue.remote(edge_queue))

        return wiring_refs

    @staticmethod
    def _start_actors(actors_to_start: List[Any], stage_name: str) -> None:
        """Starts a list of actors if they have a 'start' method and waits for completion."""
        start_refs = []
        for actor in actors_to_start:
            if hasattr(actor, "start"):
                logger.debug(f"[ScaleUtil] Starting actor '{actor}' for stage '{stage_name}'")
                start_refs.append(actor.start.remote())

        if not start_refs:
            logger.debug(f"[ScaleUtil] No actors with start() method found for stage '{stage_name}'.")
            return

        logger.debug(f"[ScaleUtil] Waiting for {len(start_refs)} actor starts for stage '{stage_name}'...")
        try:
            ray.get(start_refs)
            logger.debug(f"[ScaleUtil] {len(start_refs)} actors started successfully for stage '{stage_name}'.")
        except Exception as e:
            logger.error(
                f"[ScaleUtil] Error waiting for actors to start for stage '{stage_name}':" f" {e}", exc_info=True
            )
            # Note: Actors might be started but confirmation failed. State might be inconsistent.
            # Consider raising an error to signal potential inconsistency?
            raise RuntimeError(f"Error confirming actor starts for stage '{stage_name}'") from e

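    # Design note (not in the packaged file): replicas are created with
    # max_restarts=0 and lifetime="detached", so Ray never auto-restarts a failed
    # replica; replacement presumably happens through the scaling path above,
    # which re-creates an actor via _create_single_replica and re-wires it before
    # calling start().
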
    def _handle_scale_up(self, stage_info: StageInfo, current_count: int, target_count: int) -> None:
        """Handles scaling up, interacting with topology."""
        stage_name = stage_info.name
        num_to_add = target_count - current_count
        logger.debug(f"[ScaleUp-{stage_name}] Scaling up from {current_count} to {target_count} (+{num_to_add}).")
        # Update topology state
        self.topology.update_scaling_state(stage_name, "Scaling Up")

        new_actors = []
        all_wiring_refs = []
        successfully_added_actors = []

        try:
            # 1. Create actors
            for _ in range(num_to_add):
                new_actor = self._create_single_replica(stage_info)
                new_actors.append(new_actor)

            # 2. Get wiring refs (uses topology internally)
            for actor in new_actors:
                all_wiring_refs.extend(self._get_wiring_refs_for_actor(actor, stage_name))

            # 3. Wait for wiring (static helper)
            self._wait_for_wiring(all_wiring_refs)  # Handles errors

            # 4. Start actors (static helper)
            self._start_actors(new_actors, stage_name)  # Handles errors

            # 5. Add successfully created/wired/started actors to topology
            for actor in new_actors:
                self.topology.add_actor_to_stage(stage_name, actor)
                successfully_added_actors.append(actor)  # Keep track

            final_count = self.topology.get_actor_count(stage_name)
            logger.debug(
                f"[ScaleUp-{stage_name}] Scale up complete. Added {len(successfully_added_actors)}. "
                f"New count: {final_count}"
            )

        except Exception as e:
            logger.error(f"[ScaleUp-{stage_name}] Error during scale up: {e}", exc_info=False)
            self.topology.update_scaling_state(stage_name, "Error")
            # --- Cleanup Attempt ---
            # Actors created but potentially not wired/started/added to topology.
            # Only kill actors that were definitely *not* added to the topology.
            actors_to_kill = [a for a in new_actors if a not in successfully_added_actors]
            if actors_to_kill:
                logger.warning(
                    f"[ScaleUp-{stage_name}] Attempting to kill {len(actors_to_kill)} partially created actors."
                )
                for actor in actors_to_kill:
                    try:
                        ray.kill(actor, no_restart=True)
                    except Exception as kill_e:
                        logger.warning(f"Failed to kill actor {actor}: {kill_e}")
            logger.critical(f"[ScaleUp-{stage_name}] Scale up failed. State potentially inconsistent.")

        finally:
            # Reset state only if it was Scaling Up and didn't end in Error
            current_state = self.topology.get_scaling_state().get(stage_name)
            if current_state == "Scaling Up":
                self.topology.update_scaling_state(stage_name, "Idle")

    def _handle_scale_down(self, stage_name: str, current_replicas: List[Any], target_count: int) -> None:
        """
        Handles scaling down: initiates stop on actors, registers handles with
        the topology for pending removal if stop was successfully initiated.
        """
        current_count = len(current_replicas)
        num_to_remove = current_count - target_count
        logger.info(f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove}).")

        # Basic validation
        if num_to_remove <= 0:
            logger.warning(f"[ScaleDown-{stage_name}] Invalid num_to_remove {num_to_remove}. Aborting.")
            return

        # Identify actors to remove (last N)
        actors_to_remove = current_replicas[-num_to_remove:]
        logger.debug(f"[ScaleDown-{stage_name}] Identified {len(actors_to_remove)} actors for removal.")

        actors_to_register_map: Dict[str, List[Tuple[Any, ray.ObjectRef]]] = defaultdict(list)
        stop_initiation_failures = 0

        for actor in actors_to_remove:
            actor_id_str = str(actor)
            try:
                # Call stop(), which now returns shutdown future
                shutdown_future = actor.stop.remote()
                actors_to_register_map[stage_name].append((actor, shutdown_future))
                logger.debug(f"[ScaleDown-{stage_name}] Submitted stop() call for actor '{actor_id_str}'.")
            except Exception as e:
                logger.error(
                    f"[ScaleDown-{stage_name}] Error submitting stop() for actor '{actor_id_str}': "
                    f"{e}. Cannot register.",
                    exc_info=False,
                )
                stop_initiation_failures += 1

        # Register actors pending removal (with their shutdown futures)
        if actors_to_register_map:
            num_registered = sum(len(v) for v in actors_to_register_map.values())
            logger.debug(
                f"[ScaleDown-{stage_name}] Registering {num_registered} "
                f"actor handles with topology for shutdown monitoring."
            )
            try:
                self.topology.register_actors_pending_removal(actors_to_register_map)
            except Exception as e:
                logger.error(
                    f"[ScaleDown-{stage_name}] CRITICAL - Failed to register actors pending removal with topology: {e}",
                    exc_info=True,
                )
                self.topology.update_scaling_state(stage_name, "Error")
        elif actors_to_remove:
            logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")

        total_attempted = len(actors_to_remove)
        logger.info(
            f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
            f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
        )

    def _scale_stage(self, stage_name: str, new_replica_count: int) -> None:
        """Orchestrates scaling using topology for state and info."""
        logger.debug(f"[ScaleStage-{stage_name}] Request for target count: {new_replica_count}")

        # --- Use Topology Accessors ---
        stage_info = self.topology.get_stage_info(stage_name)
        current_replicas = self.topology.get_stage_actors().get(stage_name, [])  # Get current actors safely
        current_count = len(current_replicas)

        if stage_info is None:
            logger.error(f"[ScaleStage-{stage_name}] Stage info not found. Cannot scale.")
            return

        target_count = max(stage_info.min_replicas, min(new_replica_count, stage_info.max_replicas))
        if target_count != new_replica_count:
            logger.debug(
                f"[ScaleStage-{stage_name}] Count {new_replica_count} adjusted to {target_count} "
                f"by bounds ({stage_info.min_replicas}/{stage_info.max_replicas})."
            )

        if target_count == current_count:
            logger.debug(f"[ScaleStage-{stage_name}] Already at target count ({current_count}). No action.")
            # Reset state if needed
            if self.topology.get_scaling_state().get(stage_name) != "Idle":
                self.topology.update_scaling_state(stage_name, "Idle")
            return

        # --- Delegate ---
        try:
            if target_count > current_count:
                self._handle_scale_up(stage_info, current_count, target_count)
            else:  # target_count < current_count
                # Pass the list of actors we know about *now*
                self._handle_scale_down(stage_name, current_replicas, target_count)
        except RuntimeError as e:  # Catch specific errors from handlers
            logger.error(f"[ScaleStage-{stage_name}] Scaling failed: {e}", exc_info=False)
            # State should have been set to "Error" within the handler
        except Exception as e:
            logger.error(f"[ScaleStage-{stage_name}] Unexpected error: {e}", exc_info=True)
            self.topology.update_scaling_state(stage_name, "Error")  # Ensure error state

    def _is_pipeline_quiet(self) -> bool:
        """Checks if pipeline is quiet using topology state and stats collector."""

        # Check topology state first
        if self.topology.get_is_flushing():
            logger.debug("Pipeline quiet check: False (Flush in progress via topology state)")
            return False

        # Time check
        time_since_last_flush = time.time() - self._last_queue_flush_time
        if time_since_last_flush < self.queue_flush_interval_seconds:
            return False

        # Stats check (same as before)
        current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
            self.stats_collector.get_latest_stats()
        )
        last_update_age = time.time() - last_update_time
        max_stats_age_for_quiet = max(10.0, self._stats_collection_interval_seconds * 2.5)

        if not stats_were_successful:
            logger.warning(f"Pipeline quiet check: False (Stats failed {last_update_age:.1f}s ago).")
            return False

        if last_update_age > max_stats_age_for_quiet:
            logger.warning(
                f"Pipeline quiet check: False (Stats too old: {last_update_age:.1f}s > {max_stats_age_for_quiet:.1f}s)."
            )
            return False

        if not current_stage_stats:
            logger.warning("Pipeline quiet check: False (No stats currently available).")
            return False

        # Activity check
        is_quiet = global_in_flight <= self.quiet_period_threshold

        if is_quiet:
            logger.info(f"Pipeline IS quiet. In-Flight: {global_in_flight} <= Threshold: {self.quiet_period_threshold}")

        return is_quiet

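    # Worked example (not in the packaged file): with the default StatsConfig
    # interval of 10.0 s, max_stats_age_for_quiet = max(10.0, 10.0 * 2.5) = 25.0 s,
    # so a stats snapshot older than 25 s blocks the quiet verdict. With the
    # default quiet_period_threshold of 0, the pipeline only counts as quiet when
    # the global in-flight total is exactly zero.
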
    def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
        """
        Actively monitors pipeline drain using direct calls to the stats collector.
        """
        start_time = time.time()
        logger.info(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
        last_in_flight = -1
        drain_check_interval = 1.0  # Check every second

        while True:
            current_time = time.time()
            elapsed_time = current_time - start_time

            if elapsed_time >= timeout_seconds:
                logger.warning(f"Pipeline drain timed out after {elapsed_time:.1f}s. Last In-Flight: {last_in_flight}")
                return False

            # --- Trigger immediate stats collection via the collector instance ---
            drain_stats = {}
            drain_success = False
            collection_error = None

            global_in_flight = -1
            try:
                # Use the collector's method for a one-off, blocking collection
                drain_stats, global_in_flight, drain_success = self.stats_collector.collect_stats_now()
            except Exception as e:
                logger.error(f"[DrainWait] Critical error during direct stats collection call: {e}.", exc_info=True)
                collection_error = e  # Indicate failure to even run collection

            # --- Process collection results ---
            if global_in_flight != last_in_flight:
                status_msg = (
                    f"Collection Success: {drain_success}"
                    if not collection_error
                    else f"Collection Error: {type(collection_error).__name__}"
                )
                logger.info(
                    f"[DrainWait] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
                )
                last_in_flight = global_in_flight

            # --- Check for successful drain ---
            # Requires BOTH in-flight=0 AND the collection reporting it was successful
            if global_in_flight == 0 and drain_success and not collection_error:
                logger.info(f"Pipeline confirmed drained (In-Flight=0) in {elapsed_time:.1f}s.")
                return True
            elif global_in_flight == 0:  # Saw zero, but collection wasn't fully successful
                logger.warning(
                    "[DrainWait] In-Flight reached 0, but stats collection had errors/timeouts."
                    " Cannot confirm drain yet."
                )

            # --- Wait ---
            remaining_time = timeout_seconds - elapsed_time
            sleep_duration = min(drain_check_interval, remaining_time, 1.0)  # Ensure positive sleep
            if sleep_duration > 0:
                time.sleep(sleep_duration)

    def _execute_queue_flush(self) -> bool:
        """Executes queue flush, using topology for state and structure."""
        if self.topology.get_is_flushing():  # Check topology state
            logger.warning("Queue flush requested but already in progress. Ignoring.")
            return False

        # Set flushing state in topology
        self.topology.set_flushing(True)
        logger.info("--- Starting Queue Flush ---")
        overall_success = False
        source_actors_paused = []
        pause_refs = []
        new_edge_queues_map: Optional[Dict[str, Tuple[Any, int]]] = None

        try:
            # --- Get structure snapshots from topology ---
            # Use lock context for multiple reads if needed, but individual accessors are locked too
            current_stages = self.topology.get_stages_info()
            current_stage_actors = self.topology.get_stage_actors()
            current_edge_queues = self.topology.get_edge_queues()
            current_connections = self.topology.get_connections()

            # --- 1. Pause Source Stages (using snapshots) ---
            logger.info("Pausing source stages...")
            pause_timeout = 60.0
            for stage in current_stages:
                if stage.is_source:
                    actors = current_stage_actors.get(stage.name, [])
                    for actor in actors:
                        if hasattr(actor, "pause") and hasattr(actor.pause, "remote"):
                            try:
                                pause_refs.append(actor.pause.remote())
                                source_actors_paused.append(actor)
                            except Exception as e:
                                logger.error(f"Failed sending pause to {actor}: {e}")
            if pause_refs:
                logger.info(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
                try:
                    ray.get(pause_refs, timeout=pause_timeout)
                    logger.info(f"{len(pause_refs)} sources acknowledged pause.")
                except GetTimeoutError:
                    logger.warning(f"Timeout waiting for {len(pause_refs)} sources to pause.")
                except Exception as e:
                    logger.error(f"Error waiting for sources pause: {e}. Proceeding cautiously.")

            # --- 2. Wait for Drain ---
            logger.info("Waiting for pipeline to drain...")
            if not self._wait_for_pipeline_drain(self.queue_flush_drain_timeout_seconds):
                raise RuntimeError("Pipeline drain failed or timed out, aborting flush.")

            # --- 3. Create New Queues (using snapshot) ---
            logger.info("Creating new replacement queues...")
            new_edge_queues_map = {}
            for queue_name, (_, queue_size) in current_edge_queues.items():
                try:
                    new_edge_queues_map[queue_name] = (
                        RayQueue(maxsize=queue_size, actor_options={"max_restarts": 0}),
                        queue_size,
                    )
                    logger.debug(f"Created new queue: {queue_name}")
                except Exception as e:
                    raise RuntimeError(f"Failed to create new queue '{queue_name}'.") from e

            # --- 4. Re-wire Actors to New Queues (using snapshots) ---
            logger.info("Re-wiring actors to new queues...")
            wiring_refs = []
            wiring_timeout = 120.0
            for from_stage_name, conns in current_connections.items():
                for to_stage_name, _ in conns:
                    queue_name = f"{from_stage_name}_to_{to_stage_name}"
                    if queue_name not in new_edge_queues_map:
                        raise RuntimeError(f"New queue missing for {queue_name}")
                    new_queue_actor, _ = new_edge_queues_map[queue_name]

                    # Re-wire sources outputs
                    for actor in current_stage_actors.get(from_stage_name, []):
                        try:
                            wiring_refs.append(actor.set_output_queue.remote(new_queue_actor))
                        except Exception as e:
                            logger.error(f"Failed sending set_output_queue to {actor}: {e}")

                    # Re-wire destinations inputs
                    for actor in current_stage_actors.get(to_stage_name, []):
                        try:
                            wiring_refs.append(actor.set_input_queue.remote(new_queue_actor))
                        except Exception as e:
                            logger.error(f"Failed sending set_input_queue to {actor}: {e}")

            if wiring_refs:
                logger.debug(f"Waiting up to {wiring_timeout}s for {len(wiring_refs)} actors to re-wire...")
                try:
                    ready, not_ready = ray.wait(wiring_refs, num_returns=len(wiring_refs), timeout=wiring_timeout)
                    if not_ready:
                        raise RuntimeError("Actor re-wiring timed out or failed.")
                    ray.get(ready)  # Check for internal errors
                    logger.debug(f"{len(ready)} actors re-wired successfully.")
                except Exception as e:
                    raise RuntimeError("Actor re-wiring failed.") from e

            # --- 5. Update Topology State (Commit Point) ---
            logger.info("Committing new queues to pipeline topology.")
            self.topology.set_edge_queues(new_edge_queues_map)  # Commit the change
            overall_success = True

        except Exception as e:
            logger.error(f"Error during queue flush: {e}", exc_info=True)
            overall_success = False

        finally:
            # --- 6. Resume Source Stages (Always attempt) ---
            if source_actors_paused:
                logger.info(f"Attempting to resume {len(source_actors_paused)} source actors...")
                resume_timeout = 30.0
                resume_refs = []
                for actor in source_actors_paused:
                    try:
                        resume_refs.append(actor.resume.remote())
                    except Exception as e:
                        logger.error(f"Failed sending resume to {actor}: {e}")
                if resume_refs:
                    logger.info(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
                    try:
                        ray.get(resume_refs, timeout=resume_timeout)
                        logger.info(f"{len(resume_refs)} sources resumed.")
                    except GetTimeoutError:
                        logger.warning(f"Timeout waiting for {len(resume_refs)} sources to resume.")
                    except Exception as e:
                        logger.error(f"Error waiting for sources resume: {e}")

            # Update flush timestamp only on success
            if overall_success:
                self._last_queue_flush_time = time.time()
                logger.info("--- Queue Flush Completed Successfully ---")
            else:
                logger.error("--- Queue Flush Failed ---")

            # Reset flushing state in topology
            self.topology.set_flushing(False)

        return overall_success

    def request_queue_flush(self, force: bool = False) -> None:
        """Requests a queue flush, checking topology state."""
        logger.info(f"Manual queue flush requested (force={force}).")
        if self.topology.get_is_flushing():  # Check topology
            logger.warning("Flush already in progress.")
            return
        if force or self._is_pipeline_quiet():
            # Consider running _execute_queue_flush in a separate thread
            # to avoid blocking the caller, especially if 'force=True'.
            # For now, run synchronously:
            self._execute_queue_flush()
        else:
            logger.info("Manual flush denied: pipeline not quiet or interval not met.")

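    # Usage sketch (not in the packaged file): a caller holding a built pipeline
    # can force a flush regardless of the quiet/interval checks; note that the
    # call blocks until pause, drain, re-wire, and resume all finish.
    #
    #     pipeline.request_queue_flush(force=True)
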
    def _gather_controller_metrics(
        self, current_stage_stats: Dict[str, Dict[str, int]], global_in_flight: int
    ) -> Dict[str, Dict[str, Any]]:
        """Gathers metrics using provided stats and topology."""
        logger.debug("[ScalingMetrics] Gathering metrics for controllers...")
        current_stage_metrics = {}

        # Use topology accessors
        current_stages = self.topology.get_stages_info()
        current_actors = self.topology.get_stage_actors()  # Snapshot

        for stage in current_stages:
            stage_name = stage.name
            replicas = len(current_actors.get(stage_name, []))
            stats = current_stage_stats.get(stage_name, {"processing": 0, "in_flight": 0})
            processing = stats.get("processing", 0)
            in_flight = stats.get("in_flight", 0)
            queue_depth = max(0, in_flight - processing)

            current_stage_metrics[stage_name] = {
                "replicas": replicas,
                "queue_depth": queue_depth,
                "processing": processing,
                "in_flight": in_flight,
                "min_replicas": stage.min_replicas,
                "max_replicas": stage.max_replicas,
                "pipeline_in_flight": global_in_flight,
            }

        logger.debug(f"[ScalingMetrics] Gathered metrics for {len(current_stage_metrics)} stages.")
        return current_stage_metrics

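    # Worked example (not in the packaged file): a stage reporting in_flight=12
    # with processing=4 yields queue_depth = max(0, 12 - 4) = 8 items waiting.
    # queue_depth is the quantity the PID controller compares against
    # pid_target_queue_depth (0 by default).
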
    def _get_current_global_memory(self) -> int:
        """
        Safely retrieves the current global system memory usage (used, not free) in MB.
        Uses the previous measurement as a fallback only if the current read fails.

        Returns:
            int: Current global memory usage (RSS/used) in MB. Returns previous value
                 or 0 if the read fails and no previous value exists.
        """
        try:
            # psutil.virtual_memory().used provides total RAM used by processes
            current_global_memory_bytes = psutil.virtual_memory().used
            current_global_memory_mb = int(current_global_memory_bytes / (1024 * 1024))
            logger.debug(f"[ScalingMemCheck] Current global memory usage (used): {current_global_memory_mb} MB")

            return current_global_memory_mb
        except Exception as e:
            logger.error(
                f"[ScalingMemCheck] Failed to get current system memory usage: {e}. "
                f"Attempting to use previous value ({self.prev_global_memory_usage} MB).",
                exc_info=False,
            )

            # Use previous value if available, otherwise default to 0 (less ideal, but avoids None)
            # Returning 0 might incorrectly signal low memory usage if it's the first read that fails.
            return self.prev_global_memory_usage if self.prev_global_memory_usage is not None else 0

    def _calculate_scaling_adjustments(
        self, current_stage_metrics: Dict[str, Dict[str, Any]], global_in_flight: int, current_global_memory_mb: int
    ) -> Dict[str, int]:
        """Runs controllers to get target replica counts using topology for edge count."""
        logger.debug("[ScalingCalc] Calculating adjustments via PID and RCM...")
        # Get edge count from topology
        num_edges = len(self.topology.get_edge_queues())

        try:
            initial_proposals = self.pid_controller.calculate_initial_proposals(current_stage_metrics)
            logger.debug(
                "[ScalingCalc] PID Initial Proposals:"
                f" { {n: p.proposed_replicas for n, p in initial_proposals.items()} }"  # noqa E201,E202
            )

            final_adjustments = self.constraint_manager.apply_constraints(
                initial_proposals=initial_proposals,
                global_in_flight=global_in_flight,
                current_global_memory_usage_mb=current_global_memory_mb,
                num_edges=num_edges,
            )
            logger.debug(f"[ScalingCalc] RCM Final Adjustments: {final_adjustments}")
            return final_adjustments
        except Exception as e:
            logger.error(f"[ScalingCalc] Error during controller execution: {e}", exc_info=True)
            logger.warning("[ScalingCalc] Falling back to current replica counts.")
            return {name: metrics.get("replicas", 0) for name, metrics in current_stage_metrics.items()}

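    # Data-flow sketch (not in the packaged file, toy numbers only): the PID pass
    # maps per-stage metrics to raw proposals, e.g. {"worker": 6, "sink": 2}; the
    # constraint manager then clamps those against total max_replicas, the
    # absolute memory threshold, and the per-edge cost estimate, so the committed
    # result might come back as {"worker": 4, "sink": 2} on a memory-constrained host.
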
def _apply_scaling_actions(self, final_adjustments: Dict[str, int]) -> None:
|
|
955
|
+
"""Applies scaling by calling _scale_stage, using topology for validation."""
|
|
956
|
+
stages_needing_action = []
|
|
957
|
+
current_actors_map = self.topology.get_stage_actors() # Snapshot
|
|
958
|
+
|
|
959
|
+
for stage_name, target_replica_count in final_adjustments.items():
|
|
960
|
+
current_count = len(current_actors_map.get(stage_name, []))
|
|
961
|
+
stage_info = self.topology.get_stage_info(stage_name) # Get info from topology
|
|
962
|
+
|
|
963
|
+
if not stage_info:
|
|
964
|
+
logger.warning(f"[ScalingApply] Cannot apply scaling for unknown stage '{stage_name}'. Skipping.")
|
|
965
|
+
continue
|
|
966
|
+
|
|
967
|
+
# Clamp target using StageInfo from topology
|
|
968
|
+
clamped_target = max(stage_info.min_replicas, min(stage_info.max_replicas, target_replica_count))
|
|
969
|
+
if clamped_target != target_replica_count:
|
|
970
|
+
logger.warning(
|
|
971
|
+
f"[ScalingApply-{stage_name}] Target {target_replica_count} clamped to {clamped_target} by bounds."
|
|
972
|
+
)
|
|
973
|
+
target_replica_count = clamped_target
|
|
974
|
+
|
|
975
|
+
if target_replica_count != current_count:
|
|
976
|
+
stages_needing_action.append((stage_name, target_replica_count))
|
|
977
|
+
logger.info(
|
|
978
|
+
f"[ScalingApply-{stage_name}] Action: Current={current_count}, "
|
|
979
|
+
f"Target={target_replica_count} (Min={stage_info.min_replicas}, Max={stage_info.max_replicas})"
|
|
980
|
+
)
|
|
981
|
+
|
|
982
|
+
if not stages_needing_action:
|
|
983
|
+
logger.debug("[ScalingApply] No scaling actions required.")
|
|
984
|
+
return
|
|
985
|
+
|
|
986
|
+
max_workers = min(len(stages_needing_action), 8)
|
|
987
|
+
logger.debug(
|
|
988
|
+
f"[ScalingApply] Submitting {len(stages_needing_action)} scaling actions ({max_workers} workers)..."
|
|
989
|
+
)
|
|
990
|
+
action_results = {}
|
|
991
|
+
|
|
992
|
+
with concurrent.futures.ThreadPoolExecutor(
|
|
993
|
+
max_workers=max_workers, thread_name_prefix="ScalingAction"
|
|
994
|
+
) as executor:
|
|
995
|
+
future_to_stage = {
|
|
996
|
+
executor.submit(self._scale_stage, stage_name, target_count): stage_name
|
|
997
|
+
for stage_name, target_count in stages_needing_action
|
|
998
|
+
}
|
|
999
|
+
wait_timeout = 180.0
|
|
1000
|
+
logger.debug(f"[ScalingApply] Waiting up to {wait_timeout}s for actions...")
|
|
1001
|
+
            for future in concurrent.futures.as_completed(future_to_stage, timeout=wait_timeout):
                stage_name = future_to_stage[future]
                try:
                    result = future.result()  # Raises exception if _scale_stage failed internally
                    action_results[stage_name] = {"status": "completed", "result": result}
                    logger.debug(f"[ScalingApply-{stage_name}] Action completed.")
                except TimeoutError:
                    logger.error(f"[ScalingApply-{stage_name}] Action timed out ({wait_timeout}s).")
                    action_results[stage_name] = {"status": "timeout"}
                    self.topology.update_scaling_state(stage_name, "Error")  # Mark as error on timeout
                except Exception as exc:
                    logger.error(f"[ScalingApply-{stage_name}] Action failed: {exc}", exc_info=True)
                    action_results[stage_name] = {"status": "error", "exception": exc}
                    # State should be set to Error inside _scale_stage or its handlers on failure

        completed = sum(1 for r in action_results.values() if r["status"] == "completed")
        errors = sum(1 for r in action_results.values() if r["status"] == "error")
        timeouts = sum(1 for r in action_results.values() if r["status"] == "timeout")
        logger.info(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")

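    # Cycle order, as implemented below: flush-in-progress check -> quiet check and
    # optional queue flush -> stats freshness validation -> metric gathering ->
    # memory sampling -> adjustment calculation -> applying scaling actions.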
    def _perform_scaling_and_maintenance(self) -> None:
        """Orchestrates scaling/maintenance using topology and stats collector."""
        logger.debug("--- Performing Scaling & Maintenance Cycle ---")

        if not self.dynamic_memory_scaling:
            logger.debug("Dynamic memory scaling disabled. Skipping cycle.")
            return

        cycle_start_time = time.time()

        # Check flushing state via topology
        if self.topology.get_is_flushing():
            logger.debug("Skipping scaling cycle: Queue flush in progress (topology state).")
            return

        # --- Check for quietness for flushing (uses topology state via helper) ---
        try:
            if self._is_pipeline_quiet():
                logger.info("Pipeline quiet, initiating queue flush.")
                flush_success = self._execute_queue_flush()  # Uses topology internally
                logger.info(f"Automatic queue flush completed. Success: {flush_success}")
                return  # Skip scaling if flush occurred
        except Exception as e:
            logger.error(f"Error during quiet check or flush: {e}. Skipping cycle.", exc_info=True)
            return

        # --- Get & Validate Stats ---
        current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
            self.stats_collector.get_latest_stats()
        )

        last_update_age = time.time() - last_update_time
        max_stats_age_for_scaling = max(15.0, self._stats_collection_interval_seconds)
        if not current_stage_stats or not stats_were_successful or last_update_age > max_stats_age_for_scaling:
            status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
            logger.warning(
                f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
            )
            return

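        # Staleness cutoff example (illustrative): with a 10 s collection interval
        # the cutoff is max(15.0, 10.0) = 15 s; with a 30 s interval it is 30 s.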
        # --- Gather Metrics (uses topology via helper) ---
        current_stage_metrics = self._gather_controller_metrics(current_stage_stats, global_in_flight)
        if not current_stage_metrics:
            logger.error("[Scaling] Failed to gather metrics. Skipping.")
            return

        # --- Get Memory Usage ---
        current_global_memory_mb = self._get_current_global_memory()

        # --- Calculate Scaling Adjustments (uses topology via helper) ---
        final_adjustments = self._calculate_scaling_adjustments(
            current_stage_metrics, global_in_flight, current_global_memory_mb
        )

        # --- Update Memory Usage *After* Decision ---
        self.prev_global_memory_usage = current_global_memory_mb

        # --- Apply Scaling Actions (uses topology via helper) ---
        self._apply_scaling_actions(final_adjustments)

        logger.debug(f"--- Scaling & Maintenance Cycle Complete (Duration: {time.time() - cycle_start_time:.2f}s) ---")

    # --- Lifecycle Methods for Monitoring/Scaling Threads ---
    def _scaling_loop(self, interval: float) -> None:
        """Main loop for the scaling thread."""
        logger.info(f"Scaling loop started. Interval: {interval}s")
        while self._scaling_monitoring:
            try:
                self._perform_scaling_and_maintenance()
            except Exception as e:
                logger.error(f"Error in scaling loop: {e}", exc_info=True)

            sleep_time = interval
            if not self._scaling_monitoring:
                break
            time.sleep(sleep_time)
        logger.info("Scaling loop finished.")

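    # The loop sleeps the full interval after each cycle, so the effective cadence
    # is interval + cycle duration, and a stop request set mid-sleep is only
    # observed when the sleep returns.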
    def _start_scaling(self, poll_interval: float = 10.0) -> None:
        if not self._scaling_monitoring:
            self._scaling_monitoring = True
            self._scaling_thread = threading.Thread(target=self._scaling_loop, args=(poll_interval,), daemon=True)
            self._scaling_thread.start()
            logger.info(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")

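    # Note: because _scaling_loop may be mid-sleep, the join(timeout=15) below can
    # expire before the thread exits; the thread is a daemon, so a lingering loop
    # will not block interpreter shutdown.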
    def _stop_scaling(self) -> None:
        if self._scaling_monitoring:
            logger.debug("Stopping scaling/maintenance thread...")
            self._scaling_monitoring = False
            if self._scaling_thread is not None:
                self._scaling_thread.join(timeout=15)  # Allow more time for scaling actions
                if self._scaling_thread.is_alive():
                    logger.warning("Scaling thread did not exit cleanly.")
                self._scaling_thread = None
            logger.info("Scaling/Maintenance stopped.")

    # --- Pipeline Start/Stop ---
    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
        """Starts actors (via topology) and background threads."""
        # Check topology for actors (indicates built)
        if not self.topology.get_stage_actors():
            logger.error("Cannot start: Pipeline not built or has no actors.")
            return

        logger.info("Starting pipeline execution...")
        start_refs = []
        # Get actors from topology
        actors_to_start = [actor for actors in self.topology.get_stage_actors().values() for actor in actors]

        for actor in actors_to_start:
            start_refs.append(actor.start.remote())

        if start_refs:
            logger.debug(f"Waiting for {len(start_refs)} actors to start...")
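            # ray.get(..., timeout=60.0) raises ray.exceptions.GetTimeoutError if the
            # actors do not start in time; the broad except below treats that the same
            # as any other startup failure.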
            try:
                ray.get(start_refs, timeout=60.0)
                logger.info(f"{len(start_refs)} actors started.")
            except Exception as e:
                logger.error(f"Error/Timeout starting actors: {e}", exc_info=True)
                self.stop()  # Attempt cleanup
                raise RuntimeError("Pipeline start failed: actors did not start.") from e

        self.stats_collector.start()
        self._start_scaling(poll_interval=scaling_poll_interval)
        logger.info("Pipeline started successfully.")

    def stop(self) -> None:
        """Stops background threads and actors (via topology)."""
        logger.info("Stopping pipeline...")

        # 1. Stop background threads first
        self._stop_scaling()
        self.stats_collector.stop()

        # 2. Stop actors (using topology)
        logger.debug("Stopping all stage actors...")
        stop_refs_map: Dict[ray.ObjectRef, Any] = {}
        actors_to_kill = []

        # Get actors snapshot from topology
        current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}

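        # Initiate stop() on every actor concurrently; the ref -> actor map lets
        # stragglers be identified later for forced termination.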
        for stage_name, actors in current_actors.items():
            for actor in actors:
                try:
                    stop_refs_map[actor.stop.remote()] = actor
                except Exception as e:
                    logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Will kill.")

        if stop_refs_map:
            stop_refs = list(stop_refs_map.keys())
            logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
            try:
                ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
                if not_ready:
                    logger.warning(f"Timeout waiting for {len(not_ready)} actors to stop. Will kill.")
                    actors_to_kill.extend(stop_refs_map.get(ref) for ref in not_ready if stop_refs_map.get(ref))
                logger.info(f"{len(ready)} actors stopped via stop().")
            except Exception as e:
                logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
                actors_to_kill.extend(a for a in stop_refs_map.values() if a not in actors_to_kill)  # Add all on error

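        # Note: actors_to_kill is populated above, but no forced-kill step appears in
        # this method body. A minimal sketch of what that step might look like
        # (an assumption, not part of the source):
        #     for actor in actors_to_kill:
        #         if actor is not None:
        #             ray.kill(actor, no_restart=True)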
        # Clear runtime state in topology
        self.topology.clear_runtime_state()

        logger.info("Pipeline stopped.")