nv-ingest 2025.7.8.dev20250708__py3-none-any.whl → 2025.7.9.dev20250709__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +65 -303
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +438 -163
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +30 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +159 -230
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +27 -9
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +7 -72
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +2 -1
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +22 -12
- {nv_ingest-2025.7.8.dev20250708.dist-info → nv_ingest-2025.7.9.dev20250709.dist-info}/METADATA +1 -1
- {nv_ingest-2025.7.8.dev20250708.dist-info → nv_ingest-2025.7.9.dev20250709.dist-info}/RECORD +13 -13
- {nv_ingest-2025.7.8.dev20250708.dist-info → nv_ingest-2025.7.9.dev20250709.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.7.8.dev20250708.dist-info → nv_ingest-2025.7.9.dev20250709.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.7.8.dev20250708.dist-info → nv_ingest-2025.7.9.dev20250709.dist-info}/top_level.txt +0 -0
nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py

@@ -6,7 +6,6 @@ import threading
 import logging
 import contextlib
 import time
-from collections import defaultdict
 from typing import List, Dict, Tuple, Any, Optional, Iterator, Set
 
 import ray
@@ -59,19 +58,15 @@ class PipelineTopology:
         self._edge_queues: Dict[str, Tuple[Any, int]] = {}  # Map: q_name -> (QueueHandle, Capacity)
         self._scaling_state: Dict[str, str] = {}  # Map: stage_name -> "Idle" | "Scaling Up" | "Scaling Down" | "Error"
         self._stage_memory_overhead: Dict[str, float] = {}  # Populated during build/config
-        self.
+        self._actors_pending_removal: Set[Tuple[str, Any]] = set()
 
         # --- Operational State ---
         self._is_flushing: bool = False
 
         # --- Synchronization & Threading ---
         self._lock: threading.Lock = threading.Lock()
-        self.
-        self.
-        self._stop_event: threading.Event = threading.Event()  # For interruptible sleep
-
-        logger.debug("PipelineTopology initialized.")
-        self._start_cleanup_thread()  # Start background cleanup on init
+        self._stop_cleanup = threading.Event()
+        self._cleanup_thread = None
 
     def __del__(self):
         """Ensure cleanup thread is stopped and internal actor references are released."""
@@ -79,7 +74,7 @@ class PipelineTopology:
 
         # Stop the background cleanup thread
         try:
-            self.
+            self.stop_cleanup_thread()
         except Exception as e:
             logger.warning(f"Error stopping cleanup thread during __del__: {e}")
 
@@ -89,7 +84,7 @@ class PipelineTopology:
             self._edge_queues.clear()
             self._scaling_state.clear()
             self._stage_memory_overhead.clear()
-            self.
+            self._actors_pending_removal.clear()
             self._stages.clear()
             self._connections.clear()
         except Exception as e:
@@ -184,303 +179,65 @@ class PipelineTopology:
 
         return removed
 
-    def
-        """
-        Registers actor handles that have been told to stop, along with their shutdown futures.
-        The topology's background thread will monitor these futures for completion.
-
-        Parameters
-        ----------
-        registration_info : Dict[str, List[Tuple[Any, ObjectRef]]]
-            Dictionary mapping stage names to a list of (actor_handle, shutdown_future) tuples.
-        """
-        added_count = 0
-        time_registered = time.time()
-        stages_updated = set()
-
+    def mark_actor_for_removal(self, stage_name: str, actor: Any) -> None:
+        """Marks an actor as pending removal, to be cleaned up by the background thread."""
         with self._lock:
-
-
-
-
+            self._actors_pending_removal.add((stage_name, actor))
+            logger.info(f"Marked actor {actor} from stage {stage_name} for removal.")
+
+    def start_cleanup_thread(self, interval: int = 5) -> None:
+        """Starts the background thread for periodic cleanup tasks."""
+        if self._cleanup_thread is None or not self._cleanup_thread.is_alive():
+            self._stop_cleanup.clear()
+            self._cleanup_thread = threading.Thread(target=self._cleanup_loop, args=(interval,), daemon=True)
+            self._cleanup_thread.start()
+            logger.info("Topology cleanup thread started.")
+
+    def stop_cleanup_thread(self) -> None:
+        """Stops the background cleanup thread."""
+        if self._cleanup_thread and self._cleanup_thread.is_alive():
+            self._stop_cleanup.set()
+            self._cleanup_thread.join(timeout=5)
+            logger.info("Topology cleanup thread stopped.")
+
+    def _cleanup_loop(self, interval: int) -> None:
+        """Periodically checks for and removes actors that have completed shutdown."""
+        while not self._stop_cleanup.is_set():
+            actors_to_remove_finally = []
+            if not self._actors_pending_removal:
+                time.sleep(interval)
+                continue
+
+            # Check the status of actors pending removal
+            # Create a copy for safe iteration, as the set might be modified elsewhere
+            pending_actors_copy = set()
+            with self._lock:
+                pending_actors_copy = set(self._actors_pending_removal)
+
+            for stage_name, actor in pending_actors_copy:
+                try:
+                    if ray.get(actor.is_shutdown_complete.remote()):
+                        actors_to_remove_finally.append((stage_name, actor))
+                except ray.exceptions.RayActorError:
                     logger.warning(
-                        f"
-                        f"
+                        f"Actor {actor} from stage {stage_name} is no longer available (RayActorError). "
+                        f"Assuming it has shut down and marking for removal."
                     )
-
-
-
-
-                for actor_handle, shutdown_future in actor_list:
-                    if not actor_handle or not shutdown_future:
-                        logger.warning(
-                            f"[TopologyRegister-{stage_name}] "
-                            f"Received invalid (actor, future) in registration list. Skipping."
-                        )
-                        continue
-
-                    actor_id_str = str(actor_handle)
-                    actor_tuple = (actor_handle, actor_id_str, time_registered, shutdown_future)
-
-                    if actor_tuple not in stage_pending_set:
-                        stage_pending_set.add(actor_tuple)
-                        added_count += 1
-                        logger.debug(
-                            f"[TopologyRegister-{stage_name}] "
-                            f"Registered actor '{actor_id_str}' pending shutdown monitoring."
-                        )
-                    else:
-                        logger.debug(
-                            f"[TopologyRegister-{stage_name}] "
-                            f"Actor '{actor_id_str}' already registered pending removal."
-                        )
-
-                if actor_list:
-                    self._scaling_state[stage_name] = "Scaling Down Pending"
-                    stages_updated.add(stage_name)
-
-        if added_count > 0:
-            logger.debug(
-                f"[TopologyRegister] Registered {added_count} "
-                f"actors across {len(stages_updated)} stages pending removal."
-            )
-        elif registration_info:
-            logger.debug("[TopologyRegister] No new actors registered pending removal (likely duplicates).")
-
-    def _start_cleanup_thread(self) -> None:
-        """Starts the background thread for cleaning up terminated actors."""
-        with self._lock:  # Protect thread state modification
-            if self._cleanup_thread is not None and self._cleanup_thread.is_alive():
-                logger.warning("[TopologyCleanup] Cleanup thread already started.")
-                return
-
-            logger.info("[TopologyCleanup] Starting background cleanup thread...")
-            self._cleanup_thread_running = True
-            self._stop_event.clear()  # Ensure event is not set initially
-            self._cleanup_thread = threading.Thread(
-                target=self._cleanup_loop,
-                daemon=True,  # Allows program exit even if this thread hangs (though join tries)
-                name="TopologyActorCleanup",
-            )
-            self._cleanup_thread.start()
-
-    def _stop_cleanup_thread(self) -> None:
-        """Signals the background cleanup thread to stop and waits for it."""
-        if not self._cleanup_thread_running or self._cleanup_thread is None:
-            logger.debug("[TopologyCleanup] Cleanup thread not running or already stopped.")
-            return
-
-        with self._lock:  # Protect thread state read/write
-            if not self._cleanup_thread_running or self._cleanup_thread is None:
-                return  # Double check inside lock
-            logger.info("[TopologyCleanup] Stopping background cleanup thread...")
-            self._cleanup_thread_running = False
-            self._stop_event.set()  # Signal the loop to wake up and exit
-
-        # Wait for the thread to finish outside the lock
-        join_timeout = CLEANUP_INTERVAL_SECONDS + 5.0  # Give it time to finish last cycle
-        self._cleanup_thread.join(timeout=join_timeout)
-
-        if self._cleanup_thread.is_alive():
-            logger.warning(f"[TopologyCleanup] Cleanup thread did not exit gracefully after {join_timeout}s.")
-        else:
-            logger.info("[TopologyCleanup] Cleanup thread stopped and joined.")
-        self._cleanup_thread = None  # Clear thread object
-
-    @staticmethod
-    def _delayed_actor_release(self, actor_handle_to_release: Any, actor_id_str: str, delay_seconds: int = 60):
-        """
-        Holds a reference to an actor handle for a specified delay, then releases it.
-        This function is intended to be run in a daemon thread.
-
-        Note: this is a bit of a hack
-        """
-        logger.debug(f"[DelayedRelease-{actor_id_str}] Thread started. Holding actor reference for {delay_seconds}s.")
-        # The actor_handle_to_release is kept in scope by being a parameter to this function,
-        # and this function's frame existing for delay_seconds.
-        time.sleep(delay_seconds)
-        logger.info(
-            f"[DelayedRelease-{actor_id_str}] Delay complete. Releasing reference. Actor should now be GC'd by Ray "
-            f"if this was the last ref."
-        )
-        # When this function exits, actor_handle_to_release goes out of scope, dropping the reference.
-
-    def _cleanup_loop(self) -> None:
-        """
-        Background thread for periodically checking shutdown status of actors pending removal.
+                    actors_to_remove_finally.append((stage_name, actor))
+                except Exception as e:
+                    logger.error(f"Error checking shutdown status for actor {actor}: {e}", exc_info=True)
 
-
-
-        logger.info("[TopologyCleanupLoop] Cleanup thread started.")
-
-        while self._cleanup_thread_running:
-            cycle_start_time = time.time()
-            actors_removed_this_cycle = 0
-            processed_actor_ids_this_cycle = set()
-            actors_to_remove_from_pending: Dict[str, List[Tuple[Any, str, float, ray.ObjectRef]]] = defaultdict(list)
-            stages_potentially_idle: Set[str] = set()
-
-            try:
+            # Remove the fully shut-down actors from the topology
+            if actors_to_remove_finally:
                 with self._lock:
-
-
-
-                    )
-
-
-
-
-                        if not pending_set:
-                            continue
-
-                        pending_set_copy = pending_set.copy()
-
-                        for actor_tuple in pending_set_copy:
-                            actor_handle, actor_id_str, time_registered, shutdown_future = actor_tuple
-
-                            if actor_id_str in processed_actor_ids_this_cycle:
-                                continue
-
-                            remove_from_topology = False
-                            mark_for_pending_removal = False
-                            actor_status = "PENDING"
-
-                            # 1. Check for overall shutdown timeout
-                            if time.time() - time_registered > PENDING_SHUTDOWN_TIMEOUT_SECONDS:
-                                logger.warning(
-                                    f"[TopologyCleanupLoop-{stage_name}] Actor '{actor_id_str}' "
-                                    f"timed out after {PENDING_SHUTDOWN_TIMEOUT_SECONDS}s. Forcing removal."
-                                )
-                                remove_from_topology = True
-                                mark_for_pending_removal = True
-                                actor_status = "TIMEOUT"
-
-                            # 2. Otherwise, check if shutdown future completed
-                            if not remove_from_topology:
-                                try:
-                                    ready, _ = ray.wait([shutdown_future], timeout=PENDING_CHECK_ACTOR_METHOD_TIMEOUT)
-                                    if ready:
-                                        logger.debug(
-                                            f"[TopologyCleanupLoop-{stage_name}] "
-                                            f"Actor '{actor_id_str}' shutdown future completed. Marking for removal."
-                                        )
-                                        remove_from_topology = True
-                                        mark_for_pending_removal = True
-                                        actor_status = "COMPLETED"
-                                    else:
-                                        logger.debug(
-                                            f"[TopologyCleanupLoop-{stage_name}] "
-                                            f"Actor '{actor_id_str}' shutdown future still pending."
-                                        )
-                                        actor_status = "PENDING"
-                                except Exception as e:
-                                    logger.error(
-                                        f"[TopologyCleanupLoop-{stage_name}] "
-                                        f"Error checking shutdown future for actor '{actor_id_str}': {e}",
-                                        exc_info=False,
-                                    )
-                                    actor_status = "ERROR"
-
-                            # 3. Perform removal actions
-                            if remove_from_topology:
-                                logger.debug(
-                                    f"[TopologyCleanupLoop-{stage_name}] Removing actor '{actor_id_str}' "
-                                    f"from topology (Reason: {actor_status})."
-                                )
-                                removed_list = self.remove_actors_from_stage(stage_name, [actor_handle])
-                                if removed_list:
-                                    actors_removed_this_cycle += 1
-                                else:
-                                    logger.debug(
-                                        f"[TopologyCleanupLoop-{stage_name}] Actor '{actor_id_str}' "
-                                        f"was already removed from main list."
-                                    )
-
-                            if mark_for_pending_removal:
-                                actors_to_remove_from_pending[stage_name].append(actor_tuple)
-                                processed_actor_ids_this_cycle.add(actor_id_str)
-                                stages_potentially_idle.add(stage_name)
-
-                # --- Update pending lists ---
-                for stage_to_update, removal_list in actors_to_remove_from_pending.items():
-                    if stage_to_update in self._pending_removal_actors:
-                        current_pending_set = self._pending_removal_actors[stage_to_update]
-                        for removal_tuple in removal_list:  # removal_list contains actor_tuples
-                            # Extract actor_handle and actor_id_str from the tuple being removed
-                            actor_handle_to_delay, actor_id_str_to_delay, _, _ = removal_tuple
-
-                            if current_pending_set.discard(
-                                removal_tuple
-                            ):  # If discard was successful (element was present)
-                                logger.debug(
-                                    f"[TopologyCleanupLoop-{stage_to_update}] Actor tuple for "
-                                    f"'{actor_id_str_to_delay}' discarded from pending set."
-                                )
-                                try:
-                                    # This is a bit of a hack. For some reason Ray likes to cause exceptions on
-                                    # the actor when we let it auto GCS just after pushing to the output queue, and
-                                    # mysteriously lose control messages.
-                                    # This lets the shutdown future complete, but leaves the actor to be killed off
-                                    # by ray.actor_exit()
-                                    delay_thread = threading.Thread(
-                                        target=self._delayed_actor_release,
-                                        args=(actor_handle_to_delay, actor_id_str_to_delay, 60),  # 60s delay
-                                        daemon=True,
-                                    )
-                                    delay_thread.start()
-                                    logger.debug(
-                                        f"[TopologyCleanupLoop-{stage_to_update}] Started delayed release thread "
-                                        f"for '{actor_id_str_to_delay}'."
-                                    )
-                                except Exception as e_thread:
-                                    logger.error(
-                                        f"[TopologyCleanupLoop-{stage_to_update}] Failed to start delayed release "
-                                        f"thread for '{actor_id_str_to_delay}': {e_thread}"
-                                    )
-
-                        # After processing all removals for this stage's list, check if the set is empty
-                        if not self._pending_removal_actors[stage_to_update]:
-                            logger.debug(
-                                f"[TopologyCleanupLoop-{stage_to_update}] Pending set empty. Deleting key."
-                            )
-                            del self._pending_removal_actors[stage_to_update]
-
-                # --- Update stage scaling states if pending list is empty ---
-                stages_with_empty_pending = []
-                stages_with_empty_pending = []
-                for stage_to_check in stages_potentially_idle:
-                    if stage_to_check not in self._pending_removal_actors:
-                        stages_with_empty_pending.append(stage_to_check)
-                        if self._scaling_state.get(stage_to_check) == "Scaling Down Pending":
-                            logger.debug(  # Your original log level
-                                f"[TopologyCleanupLoop-{stage_to_check}] All pending actors cleared. "
-                                f"Setting scaling state to Idle."
-                            )
-                            self._scaling_state[stage_to_check] = "Idle"
-
-                # --- Log cycle summary ---
-                cycle_duration = time.time() - cycle_start_time
-                if actors_removed_this_cycle > 0:
-                    logger.debug(
-                        f"[TopologyCleanupLoop] Cleanup cycle finished in {cycle_duration:.3f}s. "
-                        f"Removed {actors_removed_this_cycle} actors."
-                    )
-                else:
-                    logger.debug(
-                        f"[TopologyCleanupLoop] Cleanup cycle finished in {cycle_duration:.3f}s. "
-                        f"No actors removed."
-                    )
-
-            except Exception as e:
-                logger.error(f"[TopologyCleanupLoop] Unhandled error in cleanup loop iteration: " f"{e}", exc_info=True)
-
-            # --- Wait until next cycle ---
-            woken_by_stop = self._stop_event.wait(timeout=CLEANUP_INTERVAL_SECONDS)
-            if woken_by_stop:
-                logger.info("[TopologyCleanupLoop] Stop event received during sleep. Exiting loop.")
-                break
-
-        logger.info("[TopologyCleanupLoop] Cleanup thread finished.")
+                    for stage_name, actor in actors_to_remove_finally:
+                        if (stage_name, actor) in self._actors_pending_removal:
+                            self._actors_pending_removal.remove((stage_name, actor))
+                        if actor in self._stage_actors.get(stage_name, []):
+                            self._stage_actors[stage_name].remove(actor)
+                            logger.info(f"Successfully removed actor {actor} from stage {stage_name} in topology.")
+
+            time.sleep(interval)
 
     def set_edge_queues(self, queues: Dict[str, Tuple[Any, int]]) -> None:
         """Sets the dictionary of edge queues."""
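The net effect of this hunk: registration of (actor, shutdown-future) pairs, per-actor timeouts, and the delayed-release worker threads are all replaced by a single polling loop. A caller hands an actor to `mark_actor_for_removal()`, and the thread started by `start_cleanup_thread()` drops it from the topology once the actor's `is_shutdown_complete()` reports completion (or the actor is already gone and raises `RayActorError`). The sketch below reproduces that polling pattern in isolation as a hedged illustration only; `ToyStageActor` and the local `pending` set are stand-ins for nv-ingest's stage actors and `_actors_pending_removal`, not code from this package.

```python
# Minimal sketch of the polling pattern used by the new _cleanup_loop.
# ToyStageActor is a hypothetical stand-in; only the is_shutdown_complete()
# protocol and the RayActorError handling mirror the diff above.
import time

import ray


@ray.remote
class ToyStageActor:
    def __init__(self) -> None:
        self._done = False

    def stop(self) -> None:
        # A real stage would drain its queues before flipping this flag.
        self._done = True

    def is_shutdown_complete(self) -> bool:
        return self._done


ray.init(ignore_reinit_error=True)

actor = ToyStageActor.remote()
pending = {("toy_stage", actor)}  # analogous to topology._actors_pending_removal
actor.stop.remote()               # analogous to the controller asking the actor to wind down

while pending:                    # one iteration corresponds to one cleanup cycle
    for stage_name, handle in list(pending):
        try:
            if ray.get(handle.is_shutdown_complete.remote()):
                pending.discard((stage_name, handle))
                print(f"removed actor from {stage_name}")
        except ray.exceptions.RayActorError:
            # Actor already died; treat it as shut down, like _cleanup_loop does.
            pending.discard((stage_name, handle))
    time.sleep(0.5)

ray.shutdown()
```

Note that `start_cleanup_thread()` is no longer invoked from `__init__`, so it is presumably started explicitly elsewhere in the pipeline (ray_pipeline.py is also heavily modified in this release), with `__del__` still calling `stop_cleanup_thread()` as a fallback.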
@@ -526,12 +283,17 @@ class PipelineTopology:
 
     # --- Accessor Methods (Read Operations - Use Lock, Return Copies) ---
 
+    def get_all_actors(self) -> List[Any]:
+        """Returns a list of all actors across all stages."""
+        with self._lock:
+            return [actor for actors in self._stage_actors.values() for actor in actors]
+
     def get_stages_info(self) -> List[StageInfo]:
         """Returns a copy of stage info with pending_shutdown flags updated."""
         with self._lock:
             updated_stages = []
             for stage in self._stages:
-                pending_shutdown = bool(self.
+                pending_shutdown = bool(self._actors_pending_removal)
                 # Make a shallow copy with updated pending_shutdown
                 stage_copy = StageInfo(
                     name=stage.name,