nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic.

Files changed (100)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py
@@ -0,0 +1,574 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import threading
+ import logging
+ import contextlib
+ import time
+ from collections import defaultdict
+ from typing import List, Dict, Tuple, Any, Optional, Iterator, Set
+
+ import ray
+
+ # --- Constants ---
+ CLEANUP_INTERVAL_SECONDS = 15.0
+ PENDING_SHUTDOWN_TIMEOUT_SECONDS = 60.0 * 60
+ PENDING_CHECK_ACTOR_METHOD_TIMEOUT = 5.0
+
+ logger = logging.getLogger(__name__)
+
+
+ class StageInfo:
+     def __init__(
+         self,
+         name,
+         callable,
+         config,
+         is_source=False,
+         is_sink=False,
+         min_replicas=0,
+         max_replicas=1,
+         pending_shutdown=False,
+     ):
+         self.name = name
+         self.callable = callable
+         self.config = config
+         self.is_source = is_source
+         self.is_sink = is_sink
+         self.min_replicas = min_replicas
+         self.max_replicas = max_replicas
+         self.pending_shutdown = pending_shutdown
+
+
+ class PipelineTopology:
+     """
+     Holds the structural definition and runtime state of the pipeline.
+
+     Encapsulates stages, connections, actors, queues, and associated state
+     with thread-safe access via internal locking.
+     """
+
+     def __init__(self):
+         # --- Definition ---
+         self._stages: List[StageInfo] = []
+         self._connections: Dict[str, List[Tuple[str, int]]] = {}
+
+         # --- Runtime State ---
+         self._stage_actors: Dict[str, List[Any]] = {}
+         self._edge_queues: Dict[str, Tuple[Any, int]] = {}  # Map: q_name -> (QueueHandle, Capacity)
+         self._scaling_state: Dict[str, str] = {}  # Map: stage_name -> "Idle" | "Scaling Up" | "Scaling Down" | "Error"
+         self._stage_memory_overhead: Dict[str, float] = {}  # Populated during build/config
+         self._pending_removal_actors: Dict[str, Set[Tuple[Any, str, float, ray.ObjectRef]]] = defaultdict(set)
+
+         # --- Operational State ---
+         self._is_flushing: bool = False
+
+         # --- Synchronization & Threading ---
+         self._lock: threading.Lock = threading.Lock()
+         self._cleanup_thread: Optional[threading.Thread] = None
+         self._cleanup_thread_running: bool = False
+         self._stop_event: threading.Event = threading.Event()  # For interruptible sleep
+
+         logger.debug("PipelineTopology initialized.")
+         self._start_cleanup_thread()  # Start background cleanup on init
+
+     def __del__(self):
+         """Ensure cleanup thread is stopped when topology object is destroyed."""
+         logger.debug("PipelineTopology destructor called, ensuring cleanup thread is stopped.")
+         self._stop_cleanup_thread()
+
+     # --- Lock Context Manager ---
+     @contextlib.contextmanager
+     def lock_context(self) -> Iterator["PipelineTopology"]:
+         """Provides safe access to the topology under lock for complex operations."""
+         with self._lock:
+             yield self
+
+     # --- Mutator Methods (Write Operations - Use Lock) ---
+
+     def add_stage(self, stage_info: StageInfo) -> None:
+         """Adds a stage definition."""
+         with self._lock:
+             # Prevent duplicate stage names?
+             if any(s.name == stage_info.name for s in self._stages):
+                 logger.error(f"Attempted to add duplicate stage name: {stage_info.name}")
+                 raise ValueError(f"Stage name '{stage_info.name}' already exists.")
+             self._stages.append(stage_info)
+             logger.debug(f"Added stage definition: {stage_info.name}")
+
+     def add_connection(self, from_stage: str, to_stage: str, queue_size: int) -> None:
+         """Adds a connection definition between two stages."""
+         with self._lock:
+             # Basic validation (more can be added in Pipeline class)
+             stage_names = {s.name for s in self._stages}
+             if from_stage not in stage_names:
+                 raise ValueError(f"Source stage '{from_stage}' for connection not found.")
+             if to_stage not in stage_names:
+                 raise ValueError(f"Destination stage '{to_stage}' for connection not found.")
+
+             self._connections.setdefault(from_stage, []).append((to_stage, queue_size))
+             logger.debug(f"Added connection definition: {from_stage} -> {to_stage} (q_size={queue_size})")
+
+     def set_actors_for_stage(self, stage_name: str, actors: List[Any]) -> None:
+         """Sets the list of actors for a given stage, resetting scaling state."""
+         with self._lock:
+             if stage_name not in {s.name for s in self._stages}:
+                 logger.warning(f"Attempted to set actors for unknown stage: {stage_name}")
+                 return  # Or raise error?
+             self._stage_actors[stage_name] = actors
+             self._scaling_state[stage_name] = "Idle"  # Initialize/reset state
+             logger.debug(f"Set {len(actors)} actors for stage '{stage_name}'. State set to Idle.")
+
+     def add_actor_to_stage(self, stage_name: str, actor: Any) -> None:
+         """Adds a single actor to a stage's list."""
+         with self._lock:
+             if stage_name not in self._stage_actors:
+                 # This might happen if stage has 0 min_replicas and is scaled up first time
+                 self._stage_actors[stage_name] = []
+                 self._scaling_state[stage_name] = "Idle"  # Ensure state exists
+                 logger.debug(f"Initialized actor list for stage '{stage_name}' during add.")
+             self._stage_actors[stage_name].append(actor)
+             logger.debug(f"Added actor to stage '{stage_name}'. New count: {len(self._stage_actors[stage_name])}")
+
+     def remove_actors_from_stage(self, stage_name: str, actors_to_remove: List[Any]) -> List[Any]:
+         """
+         Removes specific actors from a stage's list immediately.
+         Called by the cleanup thread or potentially for forced removal.
+         """
+         removed = []
+         # Assumes lock is already held by caller (e.g., cleanup thread or lock_context)
+         if stage_name not in self._stage_actors:
+             logger.warning(
+                 f"[Topology-InternalRemove] Attempted to remove actors from non-existent stage entry: {stage_name}"
+             )
+             return []
+         current_actors = self._stage_actors.get(stage_name, [])
+
+         # Create sets for efficient lookup
+         current_actor_set = set(current_actors)
+         to_remove_set = set(actors_to_remove)
+
+         # Actors remaining are those in current set but not in removal set
+         actors_remaining = list(current_actor_set - to_remove_set)
+         # Actors actually removed are the intersection
+         actors_actually_removed = list(current_actor_set.intersection(to_remove_set))
+
+         if actors_actually_removed:
+             self._stage_actors[stage_name] = actors_remaining
+             removed = actors_actually_removed
+             logger.debug(
+                 f"[Topology-InternalRemove] Removed {len(removed)} actors from stage '{stage_name}'. "
+                 f"Remaining: {len(actors_remaining)}"
+             )
+         elif to_remove_set:
+             # This might happen if called twice for the same actor
+             logger.debug(f"[Topology-InternalRemove] No actors matching removal list found in stage '{stage_name}'.")
+
+         return removed
+
+     def register_actors_pending_removal(self, registration_info: Dict[str, List[Tuple[Any, ray.ObjectRef]]]) -> None:
+         """
+         Registers actor handles that have been told to stop, along with their shutdown futures.
+         The topology's background thread will monitor these futures for completion.
+
+         Parameters
+         ----------
+         registration_info : Dict[str, List[Tuple[Any, ObjectRef]]]
+             Dictionary mapping stage names to a list of (actor_handle, shutdown_future) tuples.
+         """
+         added_count = 0
+         time_registered = time.time()
+         stages_updated = set()
+
+         with self._lock:
+             all_known_stages = {s.name for s in self._stages}
+
+             for stage_name, actor_list in registration_info.items():
+                 if stage_name not in all_known_stages:
+                     logger.warning(
+                         f"[TopologyRegister] Received pending removal registration for unknown stage "
+                         f"'{stage_name}'. Skipping."
+                     )
+                     continue
+
+                 stage_pending_set = self._pending_removal_actors[stage_name]
+
+                 for actor_handle, shutdown_future in actor_list:
+                     if not actor_handle or not shutdown_future:
+                         logger.warning(
+                             f"[TopologyRegister-{stage_name}] "
+                             f"Received invalid (actor, future) in registration list. Skipping."
+                         )
+                         continue
+
+                     actor_id_str = str(actor_handle)
+                     actor_tuple = (actor_handle, actor_id_str, time_registered, shutdown_future)
+
+                     if actor_tuple not in stage_pending_set:
+                         stage_pending_set.add(actor_tuple)
+                         added_count += 1
+                         logger.debug(
+                             f"[TopologyRegister-{stage_name}] "
+                             f"Registered actor '{actor_id_str}' pending shutdown monitoring."
+                         )
+                     else:
+                         logger.debug(
+                             f"[TopologyRegister-{stage_name}] "
+                             f"Actor '{actor_id_str}' already registered pending removal."
+                         )
+
+                 if actor_list:
+                     self._scaling_state[stage_name] = "Scaling Down Pending"
+                     stages_updated.add(stage_name)
+
+         if added_count > 0:
+             logger.debug(
+                 f"[TopologyRegister] Registered {added_count} "
+                 f"actors across {len(stages_updated)} stages pending removal."
+             )
+         elif registration_info:
+             logger.debug("[TopologyRegister] No new actors registered pending removal (likely duplicates).")
+
+     def _start_cleanup_thread(self) -> None:
+         """Starts the background thread for cleaning up terminated actors."""
+         with self._lock:  # Protect thread state modification
+             if self._cleanup_thread is not None and self._cleanup_thread.is_alive():
+                 logger.warning("[TopologyCleanup] Cleanup thread already started.")
+                 return
+
+             logger.info("[TopologyCleanup] Starting background cleanup thread...")
+             self._cleanup_thread_running = True
+             self._stop_event.clear()  # Ensure event is not set initially
+             self._cleanup_thread = threading.Thread(
+                 target=self._cleanup_loop,
+                 daemon=True,  # Allows program exit even if this thread hangs (though join tries)
+                 name="TopologyActorCleanup",
+             )
+             self._cleanup_thread.start()
+
+     def _stop_cleanup_thread(self) -> None:
+         """Signals the background cleanup thread to stop and waits for it."""
+         if not self._cleanup_thread_running or self._cleanup_thread is None:
+             logger.debug("[TopologyCleanup] Cleanup thread not running or already stopped.")
+             return
+
+         with self._lock:  # Protect thread state read/write
+             if not self._cleanup_thread_running or self._cleanup_thread is None:
+                 return  # Double check inside lock
+             logger.info("[TopologyCleanup] Stopping background cleanup thread...")
+             self._cleanup_thread_running = False
+             self._stop_event.set()  # Signal the loop to wake up and exit
+
+         # Wait for the thread to finish outside the lock
+         join_timeout = CLEANUP_INTERVAL_SECONDS + 5.0  # Give it time to finish last cycle
+         self._cleanup_thread.join(timeout=join_timeout)
+
+         if self._cleanup_thread.is_alive():
+             logger.warning(f"[TopologyCleanup] Cleanup thread did not exit gracefully after {join_timeout}s.")
+         else:
+             logger.info("[TopologyCleanup] Cleanup thread stopped and joined.")
+             self._cleanup_thread = None  # Clear thread object
+
+     @staticmethod
+     def _delayed_actor_release(actor_handle_to_release: Any, actor_id_str: str, delay_seconds: int = 60):
+         """
+         Holds a reference to an actor handle for a specified delay, then releases it.
+         This function is intended to be run in a daemon thread.
+
+         Note: this is a bit of a hack.
+         """
+         logger.debug(f"[DelayedRelease-{actor_id_str}] Thread started. Holding actor reference for {delay_seconds}s.")
+         # The actor_handle_to_release is kept in scope by being a parameter to this function,
+         # and this function's frame existing for delay_seconds.
+         time.sleep(delay_seconds)
+         logger.info(
+             f"[DelayedRelease-{actor_id_str}] Delay complete. Releasing reference. Actor should now be GC'd by Ray "
+             f"if this was the last ref."
+         )
+         # When this function exits, actor_handle_to_release goes out of scope, dropping the reference.
+
+     def _cleanup_loop(self) -> None:
+         """
+         Background thread for periodically checking shutdown status of actors pending removal.
+
+         Actors are removed from the topology once their shutdown futures complete or they time out.
+         """
+         logger.info("[TopologyCleanupLoop] Cleanup thread started.")
+
+         while self._cleanup_thread_running:
+             cycle_start_time = time.time()
+             actors_removed_this_cycle = 0
+             processed_actor_ids_this_cycle = set()
+             actors_to_remove_from_pending: Dict[str, List[Tuple[Any, str, float, ray.ObjectRef]]] = defaultdict(list)
+             stages_potentially_idle: Set[str] = set()
+
+             try:
+                 with self._lock:
+                     if not self._cleanup_thread_running:
+                         logger.debug(
+                             "[TopologyCleanupLoop] Stop signal received after lock acquisition. Exiting loop."
+                         )
+                         break
+
+                     for stage_name in list(self._pending_removal_actors.keys()):
+                         pending_set = self._pending_removal_actors[stage_name]
+                         if not pending_set:
+                             continue
+
+                         pending_set_copy = pending_set.copy()
+
+                         for actor_tuple in pending_set_copy:
+                             actor_handle, actor_id_str, time_registered, shutdown_future = actor_tuple
+
+                             if actor_id_str in processed_actor_ids_this_cycle:
+                                 continue
+
+                             remove_from_topology = False
+                             mark_for_pending_removal = False
+                             actor_status = "PENDING"
+
+                             # 1. Check for overall shutdown timeout
+                             if time.time() - time_registered > PENDING_SHUTDOWN_TIMEOUT_SECONDS:
+                                 logger.warning(
+                                     f"[TopologyCleanupLoop-{stage_name}] Actor '{actor_id_str}' "
+                                     f"timed out after {PENDING_SHUTDOWN_TIMEOUT_SECONDS}s. Forcing removal."
+                                 )
+                                 remove_from_topology = True
+                                 mark_for_pending_removal = True
+                                 actor_status = "TIMEOUT"
+
+                             # 2. Otherwise, check if shutdown future completed
+                             if not remove_from_topology:
+                                 try:
+                                     ready, _ = ray.wait([shutdown_future], timeout=PENDING_CHECK_ACTOR_METHOD_TIMEOUT)
+                                     if ready:
+                                         logger.debug(
+                                             f"[TopologyCleanupLoop-{stage_name}] "
+                                             f"Actor '{actor_id_str}' shutdown future completed. Marking for removal."
+                                         )
+                                         remove_from_topology = True
+                                         mark_for_pending_removal = True
+                                         actor_status = "COMPLETED"
+                                     else:
+                                         logger.debug(
+                                             f"[TopologyCleanupLoop-{stage_name}] "
+                                             f"Actor '{actor_id_str}' shutdown future still pending."
+                                         )
+                                         actor_status = "PENDING"
+                                 except Exception as e:
+                                     logger.error(
+                                         f"[TopologyCleanupLoop-{stage_name}] "
+                                         f"Error checking shutdown future for actor '{actor_id_str}': {e}",
+                                         exc_info=False,
+                                     )
+                                     actor_status = "ERROR"
+
+                             # 3. Perform removal actions
+                             if remove_from_topology:
+                                 logger.debug(
+                                     f"[TopologyCleanupLoop-{stage_name}] Removing actor '{actor_id_str}' "
+                                     f"from topology (Reason: {actor_status})."
+                                 )
+                                 removed_list = self.remove_actors_from_stage(stage_name, [actor_handle])
+                                 if removed_list:
+                                     actors_removed_this_cycle += 1
+                                 else:
+                                     logger.debug(
+                                         f"[TopologyCleanupLoop-{stage_name}] Actor '{actor_id_str}' "
+                                         f"was already removed from main list."
+                                     )
+
+                             if mark_for_pending_removal:
+                                 actors_to_remove_from_pending[stage_name].append(actor_tuple)
+                                 processed_actor_ids_this_cycle.add(actor_id_str)
+                                 stages_potentially_idle.add(stage_name)
+
+                     # --- Update pending lists ---
+                     for stage_to_update, removal_list in actors_to_remove_from_pending.items():
+                         if stage_to_update in self._pending_removal_actors:
+                             current_pending_set = self._pending_removal_actors[stage_to_update]
+                             for removal_tuple in removal_list:  # removal_list contains actor_tuples
+                                 # Extract actor_handle and actor_id_str from the tuple being removed
+                                 actor_handle_to_delay, actor_id_str_to_delay, _, _ = removal_tuple
+
+                                 if removal_tuple in current_pending_set:
+                                     # Tuple is present; drop it (set.discard() returns None, so test membership first)
+                                     current_pending_set.discard(removal_tuple)
+                                     logger.debug(
+                                         f"[TopologyCleanupLoop-{stage_to_update}] Actor tuple for "
+                                         f"'{actor_id_str_to_delay}' discarded from pending set."
+                                     )
+                                     try:
+                                         # This is a bit of a hack. For some reason Ray likes to raise exceptions on
+                                         # the actor when we let it be garbage-collected automatically just after
+                                         # pushing to the output queue, and mysteriously lose control messages.
+                                         # This lets the shutdown future complete, but leaves the actor to be killed
+                                         # off by ray.actor.exit_actor().
+                                         delay_thread = threading.Thread(
+                                             target=self._delayed_actor_release,
+                                             args=(actor_handle_to_delay, actor_id_str_to_delay, 60),  # 60s delay
+                                             daemon=True,
+                                         )
+                                         delay_thread.start()
+                                         logger.debug(
+                                             f"[TopologyCleanupLoop-{stage_to_update}] Started delayed release thread "
+                                             f"for '{actor_id_str_to_delay}'."
+                                         )
+                                     except Exception as e_thread:
+                                         logger.error(
+                                             f"[TopologyCleanupLoop-{stage_to_update}] Failed to start delayed release "
+                                             f"thread for '{actor_id_str_to_delay}': {e_thread}"
+                                         )
+
+                             # After processing all removals for this stage's list, check if the set is empty
+                             if not self._pending_removal_actors[stage_to_update]:
+                                 logger.debug(
+                                     f"[TopologyCleanupLoop-{stage_to_update}] Pending set empty. Deleting key."
+                                 )
+                                 del self._pending_removal_actors[stage_to_update]
+
+                     # --- Update stage scaling states if pending list is empty ---
+                     stages_with_empty_pending = []
+                     for stage_to_check in stages_potentially_idle:
+                         if stage_to_check not in self._pending_removal_actors:
+                             stages_with_empty_pending.append(stage_to_check)
+                             if self._scaling_state.get(stage_to_check) == "Scaling Down Pending":
+                                 logger.debug(
+                                     f"[TopologyCleanupLoop-{stage_to_check}] All pending actors cleared. "
+                                     f"Setting scaling state to Idle."
+                                 )
+                                 self._scaling_state[stage_to_check] = "Idle"
+
+                 # --- Log cycle summary ---
+                 cycle_duration = time.time() - cycle_start_time
+                 if actors_removed_this_cycle > 0:
+                     logger.debug(
+                         f"[TopologyCleanupLoop] Cleanup cycle finished in {cycle_duration:.3f}s. "
+                         f"Removed {actors_removed_this_cycle} actors."
+                     )
+                 else:
+                     logger.debug(
+                         f"[TopologyCleanupLoop] Cleanup cycle finished in {cycle_duration:.3f}s. "
+                         f"No actors removed."
+                     )
+
+             except Exception as e:
+                 logger.error(f"[TopologyCleanupLoop] Unhandled error in cleanup loop iteration: {e}", exc_info=True)
+
+             # --- Wait until next cycle ---
+             woken_by_stop = self._stop_event.wait(timeout=CLEANUP_INTERVAL_SECONDS)
+             if woken_by_stop:
+                 logger.info("[TopologyCleanupLoop] Stop event received during sleep. Exiting loop.")
+                 break
+
+         logger.info("[TopologyCleanupLoop] Cleanup thread finished.")
+
+     def set_edge_queues(self, queues: Dict[str, Tuple[Any, int]]) -> None:
+         """Sets the dictionary of edge queues."""
+         with self._lock:
+             self._edge_queues = queues
+             logger.debug(f"Set {len(queues)} edge queues.")
+
+     def update_scaling_state(self, stage_name: str, state: str) -> None:
+         """Updates the scaling state for a stage."""
+         with self._lock:
+             # Add validation for state values?
+             valid_states = {"Idle", "Scaling Up", "Scaling Down", "Error"}
+             if state not in valid_states:
+                 logger.error(f"Invalid scaling state '{state}' for stage '{stage_name}'. Ignoring.")
+                 return
+             if stage_name not in {s.name for s in self._stages}:
+                 logger.warning(f"Attempted to set scaling state for unknown stage: {stage_name}")
+                 return
+             self._scaling_state[stage_name] = state
+             logger.debug(f"Updated scaling state for '{stage_name}' to '{state}'.")
+
+     def set_flushing(self, is_flushing: bool) -> None:
+         """Sets the pipeline flushing state."""
+         with self._lock:
+             self._is_flushing = is_flushing
+             logger.debug(f"Pipeline flushing state set to: {is_flushing}")
+
+     def set_stage_memory_overhead(self, overheads: Dict[str, float]) -> None:
+         """Sets the estimated memory overhead for stages."""
+         with self._lock:
+             self._stage_memory_overhead = overheads
+             logger.debug(f"Set memory overheads for {len(overheads)} stages.")
+
+     def clear_runtime_state(self) -> None:
+         """Clears actors, queues, and scaling state. Keeps definitions."""
+         with self._lock:
+             self._stage_actors.clear()
+             self._edge_queues.clear()
+             self._scaling_state.clear()
+             self._is_flushing = False  # Reset flushing state too
+
+             logger.debug("Cleared runtime state (actors, queues, scaling state, flushing flag).")
+
+     # --- Accessor Methods (Read Operations - Use Lock, Return Copies) ---
+
+     def get_stages_info(self) -> List[StageInfo]:
+         """Returns a copy of stage info with pending_shutdown flags updated."""
+         with self._lock:
+             updated_stages = []
+             for stage in self._stages:
+                 pending_shutdown = bool(self._pending_removal_actors.get(stage.name))
+                 # Make a shallow copy with updated pending_shutdown
+                 stage_copy = StageInfo(
+                     name=stage.name,
+                     callable=stage.callable,
+                     config=stage.config,
+                     is_source=stage.is_source,
+                     is_sink=stage.is_sink,
+                     min_replicas=stage.min_replicas,
+                     max_replicas=stage.max_replicas,
+                     pending_shutdown=pending_shutdown,
+                 )
+                 updated_stages.append(stage_copy)
+             return updated_stages
+
+     def get_stage_info(self, stage_name: str) -> Optional[StageInfo]:
+         """Returns the StageInfo for a specific stage, or None if not found."""
+         with self._lock:
+             for stage in self._stages:
+                 if stage.name == stage_name:
+                     return stage
+             return None
+
+     def get_connections(self) -> Dict[str, List[Tuple[str, int]]]:
+         """Returns a shallow copy of the connections dictionary."""
+         with self._lock:
+             # Shallow copy is usually sufficient here as tuples are immutable
+             return self._connections.copy()
+
+     def get_stage_actors(self) -> Dict[str, List[Any]]:
+         """Returns a copy of the stage actors dictionary (with copies of actor lists)."""
+         with self._lock:
+             return {name: list(actors) for name, actors in self._stage_actors.items()}
+
+     def get_actor_count(self, stage_name: str) -> int:
+         """Returns the number of actors for a specific stage."""
+         with self._lock:
+             return len(self._stage_actors.get(stage_name, []))
+
+     def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
+         """Returns a shallow copy of the edge queues dictionary."""
+         with self._lock:
+             return self._edge_queues.copy()
+
+     def get_scaling_state(self) -> Dict[str, str]:
+         """Returns a copy of the scaling state dictionary."""
+         with self._lock:
+             return self._scaling_state.copy()
+
+     def get_is_flushing(self) -> bool:
+         """Returns the current flushing state."""
+         with self._lock:
+             return self._is_flushing
+
+     def get_stage_memory_overhead(self) -> Dict[str, float]:
+         """Returns a copy of the stage memory overhead dictionary."""
+         with self._lock:
+             return self._stage_memory_overhead.copy()
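
For orientation, the sketch below exercises the definitional half of the API shown above: adding stages and connections, setting actor lists, and reading state back. It is a minimal illustration rather than code from the package; the import path follows the RECORD listing, and the stage callables and "actor handles" are placeholder objects, not real nv-ingest stages or Ray actors.

import logging

from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import (
    PipelineTopology,
    StageInfo,
)

logging.basicConfig(level=logging.INFO)

topology = PipelineTopology()  # also starts the background cleanup thread

# Define two stages and wire them together with a bounded edge queue.
topology.add_stage(StageInfo(name="source", callable=object, config={}, is_source=True))
topology.add_stage(StageInfo(name="sink", callable=object, config={}, is_sink=True, max_replicas=2))
topology.add_connection("source", "sink", queue_size=32)

# Hashable placeholders stand in for Ray actor handles.
a1, a2 = object(), object()
topology.set_actors_for_stage("sink", [a1, a2])

print(topology.get_connections())                     # {'source': [('sink', 32)]}
print(topology.get_actor_count("sink"))                # 2
print([s.name for s in topology.get_stages_info()])    # ['source', 'sink']

# For multi-step updates, hold the lock once and call only the helpers that
# assume the caller already holds it (e.g. remove_actors_from_stage).
with topology.lock_context() as topo:
    topo.remove_actors_from_stage("sink", [a2])
print(topology.get_actor_count("sink"))                # 1

topology._stop_cleanup_thread()  # private helper; __del__ falls back to this

Note that the mutators (add_stage, set_actors_for_stage, and friends) take the internal lock themselves, and the lock is a plain threading.Lock rather than an RLock, so they must not be called from inside lock_context().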
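
The scale-down handshake is the less obvious part: a controller asks an actor to stop, then hands the (handle, future) pair to register_actors_pending_removal so the cleanup thread can retire the actor once the future resolves or the timeout expires. Below is a hedged sketch assuming a locally started Ray; the Worker actor and its stop() method are illustrative stand-ins, not the package's real stage actors or their shutdown protocol.

import ray

from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import (
    PipelineTopology,
    StageInfo,
)


@ray.remote
class Worker:
    """Illustrative stand-in for a stage actor."""

    def stop(self) -> bool:
        # A real stage actor would drain in-flight work before returning.
        return True


ray.init(ignore_reinit_error=True)

topology = PipelineTopology()
topology.add_stage(StageInfo(name="extract", callable=Worker, config={}))

worker = Worker.remote()
topology.set_actors_for_stage("extract", [worker])

# Ask the actor to shut down and register (handle, future) for monitoring.
shutdown_future = worker.stop.remote()
topology.register_actors_pending_removal({"extract": [(worker, shutdown_future)]})

print(topology.get_scaling_state())                     # {'extract': 'Scaling Down Pending'}
print(topology.get_stages_info()[0].pending_shutdown)   # True until the cleanup loop runs

# Within roughly CLEANUP_INTERVAL_SECONDS the cleanup loop sees the completed
# future, removes the actor from the stage list, and resets the state to "Idle".

Actors whose futures never resolve are force-removed after PENDING_SHUTDOWN_TIMEOUT_SECONDS, and in either case the handle is parked in a delayed-release thread before Ray is allowed to reclaim the actor.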