nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic.

Files changed (100)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
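
The listing above can be reproduced locally from the wheel itself, for example with Python's built-in zipfile CLI (the wheel filename below is inferred from the dist-info entries in this list):

    python -m zipfile -l nv_ingest-2025.5.21.dev20250521-py3-none-any.whl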
nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py
@@ -0,0 +1,652 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import sys
+ import threading
+ import time
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, Optional
+
+ import ray
+ import ray.actor
+ from pydantic import BaseModel
+ import logging
+
+ from ray import get_runtime_context
+
+
+ def setup_stdout_logging(name: str = __name__, level: int = logging.INFO) -> logging.Logger:
+     logger = logging.getLogger(name)
+     logger.setLevel(level)
+
+     if not any(isinstance(h, logging.StreamHandler) for h in logger.handlers):
+         handler = logging.StreamHandler(sys.stdout)
+         handler.setLevel(level)
+         handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
+         logger.addHandler(handler)
+
+     return logger
+
+
+ @ray.remote
+ def external_monitor_actor_shutdown(actor_handle: "RayActorStage", poll_interval: float = 0.1) -> bool:
+     """
+     Polls the provided actor's `is_shutdown_complete` method until it returns True
+     or the actor becomes unreachable.
+     """
+     logger = setup_stdout_logging("_external_monitor_actor_shutdown")  # Optional: for monitor's own logs
+
+     if actor_handle is None:
+         logger.error("Received null actor_handle. Cannot monitor shutdown.")
+         return False  # Or raise error
+
+     actor_id_to_monitor = None
+     try:
+         # Try to get a string representation for logging; might fail if already gone
+         actor_id_to_monitor = str(actor_handle)  # Basic representation
+     except Exception:
+         actor_id_to_monitor = "unknown_actor"
+
+     logger.debug(f"Monitoring shutdown for actor: {actor_id_to_monitor}")
+
+     while True:
+         try:
+             # Remotely call the actor's method
+             if ray.get(actor_handle.is_shutdown_complete.remote()):
+                 logger.debug(f"Actor {actor_id_to_monitor} reported shutdown complete.")
+                 actor_handle.request_actor_exit.remote()
+
+                 return True
+         except ray.exceptions.RayActorError:
+             # Actor has died or is otherwise unreachable.
+             # Consider this as shutdown complete for the purpose of the future.
+             logger.warning(f"Actor {actor_id_to_monitor} became unreachable (RayActorError). Assuming shutdown.")
+             return True
+         except Exception as e:
+             # Catch other potential errors during the remote call.
+             logger.error(f"Unexpected error while polling shutdown status for {actor_id_to_monitor}: {e}")
+             # Depending on policy we could keep polling; here any polling
+             # failure is treated as a completed shutdown so the future resolves.
+             return True
+
+         time.sleep(poll_interval)
+
+
+ class RayActorStage(ABC):
+     """
+     Abstract base class for a stateful Ray actor stage in a processing pipeline.
+
+     This class provides a common structure for actors that consume items from
+     an input queue, process them, and potentially place results onto an output
+     queue. It utilizes a background thread for the main processing loop to
+     avoid blocking the main Ray actor thread. It includes basic statistics
+     tracking (processed count, elapsed time, processing rate) and mechanisms
+     for graceful shutdown.
+
+     Subclasses must implement the `on_data` method to define the specific
+     processing logic for each item.
+
+     Attributes
+     ----------
+     config : BaseModel
+         Configuration object for the stage.
+     _input_queue : Optional[Any]
+         Handle to the Ray queue from which input items are read.
+         Expected to be set via `set_input_queue`.
+     _output_queue : Optional[Any]
+         Handle to the Ray queue where processed items are placed.
+         Expected to be set via `set_output_queue`.
+     _running : bool
+         Flag indicating if the processing loop should be actively running.
+         Set to True by `start()` and False by `stop()`. Controls the main loop.
+     _active_processing : bool
+         Flag indicating if the `on_data` method is currently executing.
+         Useful for understanding if the actor is busy at a given moment.
+     stats : Dict[str, Any]
+         Dictionary of basic operational statistics: processed/error counts,
+         queue read/write counts, elapsed time, and processing rate.
+     start_time : Optional[float]
+         Timestamp (from time.time()) when the `start()` method was called.
+         Used for calculating total elapsed time.
+     _last_processed_count : int
+         Internal state variable storing the processed count at the last `get_stats` call.
+         Used for calculating interval processing rate.
+     _last_stats_time : Optional[float]
+         Internal state variable storing the timestamp of the last `get_stats` call.
+         Used for calculating interval processing rate.
+     _processing_thread : Optional[threading.Thread]
+         Handle to the background thread running the `_processing_loop`.
+     _shutting_down : bool
+         Internal flag to prevent redundant shutdown actions, protected by _lock.
+     _lock : threading.Lock
+         Lock to protect access to shutdown-related state (`_shutting_down`).
+     """
+
+     def __init__(self, config: BaseModel, log_to_stdout: bool = False) -> None:
+         """
+         Initialize the RayActorStage.
+
+         Parameters
+         ----------
+         config : BaseModel
+             Configuration object specific to the stage's behavior. Passed by
+             the orchestrator during actor creation.
+         log_to_stdout : bool, optional
+             If True, attach a stdout handler to this stage's logger; useful for
+             debugging, since Ray does not propagate actor logs to the root
+             logger by default. Defaults to False.
+         """
+         self.config: BaseModel = config
+         self._input_queue: Optional[Any] = None  # Ray Queue handle expected
+         self._output_queue: Optional[Any] = None  # Ray Queue handle expected
+         self._running: bool = False
+         self._active_processing: bool = False
+
+         # --- Core statistics ---
+         self.stats: Dict[str, Any] = {
+             "active_processing": False,
+             "delta_processed": 0,
+             "elapsed": 0.0,
+             "errors": 0,
+             "failed": 0,
+             "processed": 0,
+             "processing_rate_cps": 0.0,
+             "successful_queue_reads": 0,
+             "successful_queue_writes": 0,
+             "queue_full": 0,
+         }
+         self.start_time: Optional[float] = None
+
+         # --- State for processing rate calculation ---
+         self._last_processed_count: int = 0
+         self._last_stats_time: Optional[float] = None
+
+         # --- Threading and shutdown management ---
+         self._processing_thread: Optional[threading.Thread] = None
+         self._shutting_down: bool = False
+
+         # Lock specifically for coordinating the final shutdown sequence (_finalize_shutdown)
+         self._lock = threading.Lock()
+         self._shutdown_signal_complete = False  # Initialize flag
+         self._shutdown_future: Optional[ray.ObjectRef] = None
+
+         # --- Logging ---
+         # Ray won't propagate logging to the root logger by default, so we set up a custom logger for debugging
+         self._logger = setup_stdout_logging(self.__class__.__name__) if log_to_stdout else logging.getLogger(__name__)
+
+     @staticmethod
+     def _get_actor_id_str() -> str:
+         """
+         Helper method to safely get the current Ray actor ID string for logging.
+
+         Handles cases where the runtime context or actor ID might not be available.
+
+         Returns
+         -------
+         str
+             A formatted string representing the actor ID or a fallback message.
+         """
+         try:
+             # Attempt to get the full actor ID from Ray's runtime context
+             return f"Actor {get_runtime_context().get_actor_id()}"
+         except Exception:
+             # Fallback if running outside a Ray actor context or if context fails
+             return "Actor (ID unavailable)"
+
+     def _read_input(self) -> Optional[Any]:
+         """
+         Reads an item from the input queue with a timeout.
+
+         This method attempts to get an item from the configured `_input_queue`.
+         It uses a timeout to prevent indefinite blocking, allowing the
+         processing loop to remain responsive to the `_running` flag.
+
+         Returns
+         -------
+         Optional[Any]
+             The item read from the queue, or None if the queue is empty after
+             the timeout, the queue is not set, or the actor is not running.
+
+         Raises
+         ------
+         ValueError
+             If `_input_queue` is None while the actor's `_running` flag is True.
+             This indicates a configuration error.
+         """
+         if not self._running:
+             return None
+
+         # Ensure the input queue has been configured before attempting to read
+         if self._input_queue is None:
+             # This check should ideally not fail if start() is called after setup
+             if self._running:
+                 self._logger.error(f"{self._get_actor_id_str()}: Input queue not set while running")
+                 # Indicate a programming error - queue should be set before starting
+                 raise ValueError("Input queue not set while running")
+             return None  # Should not happen if self._running is False, but defensive check
+
+         try:
+             # Perform a non-blocking or short-blocking read from the queue.
+             # The timeout allows the loop to check self._running periodically.
+             return self._input_queue.get(timeout=1.0)
+         except Exception:
+             # Common exceptions include queue.Empty in older Ray versions or
+             # custom queue implementations raising timeout errors.
+             # Return None to signify no item was retrieved this cycle.
+             return None
+
+     @abstractmethod
+     def on_data(self, control_message: Any) -> Optional[Any]:
+         """
+         Process a single data item (control message).
+
+         This is the core logic method that must be implemented by subclasses.
+         It receives an item dequeued by `_read_input` and performs the
+         stage-specific processing.
+
+         Parameters
+         ----------
+         control_message : Any
+             The data item retrieved from the input queue.
+
+         Returns
+         -------
+         Optional[Any]
+             The result of the processing. If a result is returned (not None),
+             it will be placed onto the `_output_queue`. Return None if this
+             stage does not produce output or if this specific message yields
+             no result.
+         """
+         pass  # Must be implemented by concrete subclasses
+
+     def _processing_loop(self) -> None:
+         """Core processing routine executed in a dedicated background thread.
+
+         This loop performs the primary work of the actor:
+         1. Continuously attempts to retrieve a `control_message` from the
+            `_input_queue`.
+         2. If a message is obtained, it is processed by the `on_data` method.
+         3. If `on_data` yields a result (`updated_cm`), the loop retries
+            indefinitely to `put` that result onto the `_output_queue`.
+         4. The loop continues as long as the `self._running` flag is `True`.
+            This flag is typically controlled by external calls to the actor's
+            `start()` and `stop()` methods.
+         5. Upon exiting the main `while` loop (i.e., when `self._running`
+            becomes `False`), this method sets `self._shutdown_signal_complete`
+            to `True`, indicating to external monitors that the actor's
+            processing work is finished.
+
+         Error Handling
+         --------------
+         - Exceptions raised during `on_data` or the `_output_queue.put`
+           sequence are caught, logged, and relevant error statistics are
+           incremented. The loop then continues to the next iteration if
+           `self._running` is still `True`.
+         - If `on_data` returns `None`, it is treated as a recoverable incident;
+           a warning is logged, stats are updated, and the loop continues.
+           No output is produced for that specific input message.
+         - A critical failure in the `_output_queue.put` (e.g., `RayActorError`
+           if the queue actor is dead) will currently lead to indefinite retries.
+
+         Statistics
+         ----------
+         This method updates various keys in `self.stats`, including:
+         - `successful_queue_reads`: Incremented when an item is successfully
+           read from the input queue.
+         - `errors`: Incremented if `on_data` returns `None` or if an
+           exception occurs during `on_data` or output queuing.
+         - `processed`: Incremented after a control message is processed.
+         - `successful_queue_writes`: Incremented when an item is successfully
+           put onto the output queue.
+         - `queue_full`: Incremented when an attempt to put to the output
+           queue fails (e.g., due to being full or other transient errors),
+           triggering a retry.
+
+         Notes
+         -----
+         - The `self._active_processing` flag is managed to reflect whether
+           the `on_data` method is currently (or about to be) active.
+         - This method is intended to be the target of a `threading.Thread`.
+         - Thread safety for `self.stats` relies on the GIL for simple
+           increment operations.
+         """
+         actor_id_str = self._get_actor_id_str()
+         self._logger.debug(f"{actor_id_str}: Processing loop thread starting.")
+
+         try:
+             while self._running:
+                 control_message: Optional[Any] = None
+                 try:
+                     # Step 1: Attempt to get work from the input queue.
+                     # _read_input() is expected to handle its own timeouts and
+                     # return None if no message is available or if self._running became False.
+                     control_message = self._read_input()
+
+                     if control_message is None:
+                         # No message from input queue (e.g., timeout or shutting down);
+                         # loop back to check self._running again.
+                         continue
+
+                     self.stats["successful_queue_reads"] += 1
+
+                     # Mark as busy only when a message is retrieved and about to be processed.
+                     self._active_processing = True
+
+                     # Step 2: Process the retrieved message using subclass-specific logic.
+                     updated_cm: Optional[Any] = self.on_data(control_message)
+
+                     if updated_cm is None:
+                         # Recoverable incident per the docstring: warn, count it,
+                         # and produce no output for this message.
+                         self._logger.warning(f"[{actor_id_str}] on_data returned None; no output produced.")
+                         self.stats["errors"] += 1
+                     elif self._output_queue is not None:
+                         # There is a valid result and an output queue is configured; attempt to put.
+                         # This loop will retry indefinitely until the item is put successfully.
+                         # TODO(Devin) -- This can be improved, should probably fail at some point?
+                         # Consider max retries or specific error handling for RayActorError
+                         # to prevent indefinite blocking if the queue actor is permanently dead.
+                         is_put_successful = False
+                         while not is_put_successful:
+                             try:
+                                 self._output_queue.put(updated_cm)
+                                 self.stats["successful_queue_writes"] += 1
+                                 is_put_successful = True  # Exit retry loop on success
+                             except Exception as e_put:  # Broad exception catch for put failures
+                                 self._logger.warning(
+                                     f"[{actor_id_str}] Output queue put failed (e.g., full, "
+                                     f"timeout, or actor error), retrying. Error: {e_put}"
+                                 )
+                                 self.stats["queue_full"] += 1  # Consider renaming if it catches more than "full"
+                                 time.sleep(0.1)  # Brief pause before retrying
+
+                     # Step 3: Increment "processed" count after successful processing and output (if any).
+                     # This is the primary path for "successful processing".
+                     self.stats["processed"] += 1
+
+                 except Exception as e_item_processing:
+                     # Catch exceptions from on_data() or unexpected issues in the item handling block.
+                     cm_info_str = f" (message type: {type(control_message).__name__})" if control_message is not None else ""
+                     self._logger.exception(
+                         f"[{actor_id_str}] Error during processing of item{cm_info_str}: {e_item_processing}"
+                     )
+                     self.stats["errors"] += 1
+
+                     # If still running, pause briefly to prevent rapid spinning on persistent errors.
+                     if self._running:
+                         time.sleep(0.1)
+                 finally:
+                     # Ensure _active_processing is reset after each item attempt (success, failure, or no item).
+                     self._active_processing = False
+
+             # --- Loop Exit Condition Met ---
+             # This point is reached when self._running becomes False.
+             self._logger.debug(f"[{actor_id_str}] Graceful exit: self._running is False. Processing loop terminating.")
+
+         except Exception as e_outer_loop:
+             # Catches very unexpected errors in the structure of the while loop itself.
+             self._logger.exception(
+                 f"[{actor_id_str}] Unexpected critical error caused processing loop termination: {e_outer_loop}"
+             )
+         finally:
+             # This block executes when the processing thread is about to exit,
+             # either due to self._running becoming False or an unhandled critical exception.
+             self._logger.debug(f"[{actor_id_str}] Processing loop thread finished.")
+             # Signal that this actor's processing duties are complete.
+             # External monitors (e.g., via a future from stop()) can use this signal.
+             self._shutdown_signal_complete = True
+
+     @staticmethod
+     @ray.remote
+     def _immediate_true() -> bool:
+         """
+         A tiny remote function that immediately returns True.
+         Used to create a resolved ObjectRef when shutdown is already complete.
+         """
+         return True
+
+     @ray.method(num_returns=1)
+     def _finalize_shutdown(self) -> None:
+         """
+         Internal Ray method called remotely by the processing thread to safely exit the actor.
+
+         This method runs in the main Ray actor thread context. It acquires a lock
+         to prevent multiple exit attempts and then requests a graceful exit via
+         `request_actor_exit`, which ultimately calls `ray.actor.exit_actor()`.
+
+         Note: Only necessary if running in a detached actor context.
+         """
+         actor_id_str = self._get_actor_id_str()
+         with self._lock:
+             if self._shutting_down:
+                 return
+
+             self._shutting_down = True
+
+         self._logger.info(f"{actor_id_str}: Executing actor exit process.")
+
+         get_runtime_context().current_actor.request_actor_exit.remote()
+
+     @ray.method(num_returns=1)
+     def request_actor_exit(self) -> None:
+         """
+         Request the actor to exit gracefully.
+
+         This method is called from the main Ray actor thread to ensure a clean
+         shutdown of the actor. It should be called when the processing loop
+         has completed its work and is ready to exit.
+         """
+         if self._processing_thread:
+             self._processing_thread.join()
+
+         self._shutdown_signal_complete = True
+
+         self._logger.debug(f"{self._get_actor_id_str()}: Requesting actor exit.")
+         ray.actor.exit_actor()
+
+     @ray.method(num_returns=1)
+     def start(self) -> bool:
+         """
+         Starts the actor's processing loop in a background thread.
+
+         Initializes state, resets statistics, and launches the `_processing_loop`
+         thread. Idempotent: if called while already running, it logs a warning
+         and returns False.
+
+         Returns
+         -------
+         bool
+             True if the actor was successfully started, False if it was already running.
+         """
+         actor_id_str = self._get_actor_id_str()
+         # Prevent starting if already running
+         if self._running:
+             self._logger.warning(f"{actor_id_str}: Start called but actor is already running.")
+             return False
+
+         self._logger.info(f"{actor_id_str}: Starting actor...")
+         # --- Initialize Actor State ---
+         self._running = True
+         self._shutting_down = False  # Reset shutdown flag on start
+         self._shutdown_signal_complete = False
+         self.start_time = time.time()
+
+         # --- Reset Statistics ---
+         self._last_stats_time = self.start_time
+         self._last_processed_count = 0
+
+         # --- Start Background Processing Thread ---
+         self._logger.debug(f"{actor_id_str}: Creating and starting processing thread.")
+         self._processing_thread = threading.Thread(
+             target=self._processing_loop,
+             daemon=False,
+         )
+         self._processing_thread.start()
+
+         self._logger.info(f"{actor_id_str}: Actor started successfully.")
+
+         return True
+
+     @ray.method(num_returns=1)
+     def stop(self) -> ray.ObjectRef:
+         """
+         Signal the processing loop to stop and return an ObjectRef that
+         resolves to True once the actor has completed shutdown.
+         """
+         actor_id_str = self._get_actor_id_str()
+         self._logger.info(f"{actor_id_str}: Received external stop request.")
+
+         if self._shutdown_future is not None:
+             self._logger.debug(f"{actor_id_str}: Stop called again, returning existing shutdown future.")
+             return self._shutdown_future
+
+         if not self._running and self._shutdown_signal_complete:  # Check if already fully shut down
+             self._logger.info(f"{actor_id_str}: Stop called, but actor was already shut down and signal complete.")
+             if self._shutdown_future:  # Should have been set by the previous shutdown sequence
+                 return self._shutdown_future
+             else:  # Should not happen if _shutdown_signal_complete is True, but as a fallback
+                 self._shutdown_future = self._immediate_true.remote()
+                 return self._shutdown_future
+         elif not self._running:  # Was stopped but maybe not fully signaled (e.g., mid-shutdown)
+             self._logger.warning(
+                 f"{actor_id_str}: Stop called but actor was not running (or already stopping). "
+                 "Will create/return monitor future."
+             )
+             # If _shutdown_future is None here, it means stop wasn't called before OR a previous
+             # monitor didn't get stored. Proceed to create a new monitor.
+             # If it *was* already stopping and _shutdown_future exists, the first `if` catches it.
+
+         # --- Initiate shutdown signal to internal loop (if still running) ---
+         if self._running:  # Only set self._running = False if it was actually running
+             self._running = False
+             self._logger.info(f"{actor_id_str}: Stop signal sent to processing loop. Shutdown initiated.")
+         else:
+             self._logger.info(
+                 f"{actor_id_str}: Actor processing loop was already stopped. Monitoring for final shutdown signal."
+             )
+
+         # --- Spawn shutdown watcher task ---
+         # Get a handle to the current actor instance to pass to the monitor.
+         # This is crucial: the monitor needs to call methods on *this specific actor*.
+         try:
+             self_handle = get_runtime_context().current_actor
+         except Exception as e:
+             self._logger.error(
+                 f"{actor_id_str}: Failed to get current_actor handle for monitoring: {e}. Returning a failing future."
+             )

+             # Cannot proceed to monitor; return a future that raises when awaited.
+             @ray.remote
+             def failed_future():
+                 raise RuntimeError("Failed to initiate shutdown monitoring due to missing actor handle.")
+
+             return failed_future.remote()  # Or ray.put(False) directly
+
+         self._shutdown_future = external_monitor_actor_shutdown.remote(self_handle)
+
+         return self._shutdown_future
+
+     @ray.method(num_returns=1)
+     def is_shutdown_complete(self) -> bool:
+         """
+         Checks if the actor's processing loop has finished and signaled completion.
+         The remote call raises RayActorError if the actor process has already terminated.
+         """
+         return self._shutdown_signal_complete
+
+     # --- get_stats ---
+
+     @ray.method(num_returns=1)
+     def get_stats(self) -> Dict[str, Any]:
+         """
+         Retrieves performance statistics for the actor.
+
+         Calculates the approximate processing rate since the last call to
+         `get_stats` or since `start()`.
+
+         Returns
+         -------
+         Dict[str, Any]
+             A dictionary containing statistics:
+             - 'processed' (int): Total items processed since the actor started.
+             - 'elapsed' (float): Total time in seconds since the actor started.
+             - 'active_processing' (bool): Whether the actor was actively
+               processing an item in `on_data` at the moment this method was called.
+             - 'processing_rate_cps' (float): Calculated items processed per
+               second during the last interval. Can be zero if no items were
+               processed or the interval was too short.
+         """
+         current_time: float = time.time()
+         current_processed: int = self.stats.get("processed", 0)
+         is_active: bool = self._active_processing
+         delta_processed: int = 0
+
+         processing_rate_cps: float = 0.0  # Default rate
+
+         # Calculate rate only if actor has started and stats have been initialized
+         if self._last_stats_time is not None and self.start_time is not None:
+             delta_time: float = current_time - self._last_stats_time
+             # Use the processed count captured at the start of this method call
+             delta_processed = current_processed - self._last_processed_count
+
+             # Calculate rate if time has advanced and items were processed.
+             # Use a small epsilon for delta_time to avoid division by zero.
+             if delta_time > 0.001 and delta_processed >= 0:
+                 processing_rate_cps = delta_processed / delta_time
+             # If delta_processed is negative (e.g., due to counter reset or race), report 0 rate.
+
+             # Update state for the *next* interval calculation AFTER computing the current rate
+             self._last_stats_time = current_time
+             self._last_processed_count = current_processed  # Store the count used in *this* interval calculation
+
+         # Calculate total elapsed time
+         elapsed: float = (current_time - self.start_time) if self.start_time else 0.0
+
+         # Compile and return the statistics dictionary
+         return {
+             "active_processing": is_active,  # Return the state captured at the beginning
+             "delta_processed": delta_processed,
+             "elapsed": elapsed,
+             "errors": self.stats.get("errors", 0),
+             "failed": self.stats.get("failed", 0),
+             "processed": current_processed,
+             "processing_rate_cps": processing_rate_cps,
+             "queue_full": self.stats.get("queue_full", 0),
+             "successful_queue_reads": self.stats.get("successful_queue_reads", 0),
+             "successful_queue_writes": self.stats.get("successful_queue_writes", 0),
+         }
+
+     @ray.method(num_returns=1)
+     def set_input_queue(self, queue_handle: Any) -> bool:
+         """
+         Sets the input queue handle for this actor stage.
+
+         Should be called before `start()`.
+
+         Parameters
+         ----------
+         queue_handle : Any
+             The Ray queue handle (e.g., `ray.util.queue.Queue`) from which
+             this actor should read input items.
+
+         Returns
+         -------
+         bool
+             True indicating the queue was set.
+         """
+         self._logger.debug(f"{self._get_actor_id_str()}: Setting input queue.")
+         self._input_queue = queue_handle
+         return True
+
+     @ray.method(num_returns=1)
+     def set_output_queue(self, queue_handle: Any) -> bool:
+         """
+         Sets the output queue handle for this actor stage.
+
+         Should be called before `start()`.
+
+         Parameters
+         ----------
+         queue_handle : Any
+             The Ray queue handle (e.g., `ray.util.queue.Queue`) to which
+             this actor should write output items.
+
+         Returns
+         -------
+         bool
+             True indicating the queue was set.
+         """
+         self._logger.debug(f"{self._get_actor_id_str()}: Setting output queue.")
+         self._output_queue = queue_handle
+         return True
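
For orientation, here is a minimal, hypothetical usage sketch (not part of the wheel) of how a concrete stage built on RayActorStage might be wired up. It assumes the class is importable from the ray_actor_stage_base.py path listed above and uses `ray.util.queue.Queue` handles, as the `set_input_queue`/`set_output_queue` docstrings suggest; the `UppercaseStage` and `UppercaseConfig` names are invented for illustration.

    import ray
    from pydantic import BaseModel
    from ray.util.queue import Queue

    from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage


    class UppercaseConfig(BaseModel):  # hypothetical stage config
        pass


    @ray.remote
    class UppercaseStage(RayActorStage):
        def on_data(self, control_message):
            # Stage-specific logic; a non-None result is forwarded downstream.
            return str(control_message).upper()


    ray.init()
    in_q, out_q = Queue(maxsize=8), Queue(maxsize=8)

    stage = UppercaseStage.remote(UppercaseConfig(), log_to_stdout=True)
    ray.get(stage.set_input_queue.remote(in_q))
    ray.get(stage.set_output_queue.remote(out_q))
    ray.get(stage.start.remote())

    in_q.put("hello")
    print(out_q.get())  # -> "HELLO"

    # stop() returns an ObjectRef that resolves once the external monitor task
    # observes shutdown; two gets because the method itself returns an ObjectRef.
    monitor_ref = ray.get(stage.stop.remote())
    ray.get(monitor_ref)

Note that `stop()` only flips the `_running` flag and spawns the watcher task; the background thread finishes its current item before `_shutdown_signal_complete` is set and the actor exits.
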
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0