nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (100)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
@@ -0,0 +1,346 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import time
6
+ import threading
7
+ import logging
8
+ from collections import defaultdict
9
+ from typing import Tuple, Dict, Any, Optional
10
+
11
+ import ray
12
+ from ray.exceptions import RayActorError
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class RayStatsCollector:
    """
    Collects statistics from a RayPipeline's actors and queues in parallel
    using a dedicated background thread.

    A daemon thread wakes every ``interval`` seconds, requests stats from each
    stage actor via ``get_stats.remote()`` and each edge queue via ``qsize()``,
    then publishes the aggregated per-stage snapshot under a lock. Callers read
    the most recent snapshot with :meth:`get_latest_stats`; a one-shot
    collection is available via :meth:`collect_stats_now`.
    """

    def __init__(
        self,
        pipeline_accessor: Any,  # Object providing access to pipeline structure
        interval: float = 30.0,
        actor_timeout: float = 5.0,
        queue_timeout: float = 2.0,
    ):
        """
        Initializes the RayStatsCollector.

        Parameters
        ----------
        pipeline_accessor : Any
            An object (typically the RayPipeline instance) that provides methods
            to access the pipeline's structure safely:
            - `get_stages_info() -> List[StageInfo]`
            - `get_stage_actors() -> Dict[str, List[Any]]`
            - `get_edge_queues() -> Dict[str, Tuple[Any, int]]`
            These methods should return snapshots suitable for iteration.
        interval : float, optional
            The interval in seconds between stats collection attempts, by default 30.0.
        actor_timeout : float, optional
            Timeout in seconds for waiting for stats from a single actor, by default 5.0.
        queue_timeout : float, optional
            Timeout in seconds for waiting for qsize from a single queue, by default 2.0.
            NOTE(review): stored but not applied — the current `qsize()` calls in
            `collect_stats_now` are synchronous and untimed; confirm intended use.
        """
        # Guard for environments where `ray` resolves to a falsy placeholder;
        # after a normal successful import this never fires.
        if not ray:
            logger.warning("RayStatsCollector initialized but Ray is not available.")

        self._pipeline = pipeline_accessor
        self._interval = interval
        self._actor_timeout = actor_timeout
        self._queue_timeout = queue_timeout

        self._lock: threading.Lock = threading.Lock()  # Protects access to collected stats and status
        self._running: bool = False
        self._thread: Optional[threading.Thread] = None

        # Internal state holding the latest results (read under self._lock).
        self._collected_stats: Dict[str, Dict[str, int]] = {}
        self._total_inflight: int = 0
        self._last_update_time: float = 0.0
        self._last_update_successful: bool = False

        # Monotonically accumulates each stage's "delta_processed" counts
        # across collection cycles; used for debug reporting only.
        self._cumulative_stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {"processed": 0})

        logger.info(
            f"RayStatsCollector initialized (Interval: {self._interval}s, "
            f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s)"
        )

    # --- Helper function to be run in threads ---
    # NOTE(review): not invoked by collect_stats_now as currently written —
    # queue sizes are fetched inline there; confirm whether this helper is
    # still needed by external callers before removing.

    def _get_qsize_sync(self, q_name: str, queue_actor: Any) -> Tuple[str, int]:
        """Safely calls qsize() on a queue actor and returns (name, size); size is -1 on error."""
        try:
            # Check right before calling - actor might have become invalid
            if queue_actor is None:
                logger.warning(f"[ThreadPool-qsize] Queue actor for '{q_name}' is None.")
                return q_name, -1
            if hasattr(queue_actor, "qsize") and callable(getattr(queue_actor, "qsize")):
                # Direct, synchronous call
                q_size_val = queue_actor.qsize()
                return q_name, int(q_size_val)
            else:
                logger.warning(f"[ThreadPool-qsize] Queue actor for '{q_name}' lacks qsize method in thread.")
                return q_name, 0  # A missing qsize method is reported as size 0 (not the -1 error marker).
        except RayActorError as e:
            logger.error(f"[ThreadPool-qsize] Actor error calling qsize for queue {q_name}: {e}")
            return q_name, -1
        except Exception as e:
            logger.error(f"[ThreadPool-qsize] Error calling qsize for queue {q_name}: {e}", exc_info=True)
            return q_name, -1

    def start(self) -> None:
        """Starts the dedicated background statistics collection thread (no-op if already running)."""
        if self._thread is not None and self._thread.is_alive():
            logger.warning("Stats collector thread already started and alive.")
            return
        if self._running and (self._thread is None or not self._thread.is_alive()):
            logger.warning("Stats collector flag was true but thread not running. Resetting flag.")
            self._running = False  # Correct inconsistent state

        if not self._running:
            logger.info("Starting stats collector thread...")
            self._running = True
            with self._lock:
                self._last_update_successful = False  # Mark as stale until first collection
                self._last_update_time = time.time()

            self._thread = threading.Thread(
                target=self._collection_loop,
                daemon=True,  # Ensure thread exits if main program exits
                name="PipelineStatsCollector",
            )
            self._thread.start()
        # else: # Should not happen due to checks above
        #     logger.error("Logic error: Attempted to start stats collector when flag is already True.")

    def stop(self) -> None:
        """Signals the background stats collection thread to stop and waits for it to join."""
        if self._running:
            logger.info("Stopping stats collector thread...")
            self._running = False  # Signal loop to stop

            if self._thread is not None:
                # Join timeout covers one full cycle: interval plus generous
                # allowance for actor/queue timeouts, with a 10s floor.
                join_timeout = max(10.0, self._interval + self._actor_timeout * 2 + self._queue_timeout * 2 + 5.0)
                logger.debug(f"Waiting up to {join_timeout:.1f}s for stats thread to join...")
                self._thread.join(timeout=join_timeout)

                if self._thread.is_alive():
                    logger.warning(f"Stats collector thread did not stop gracefully after {join_timeout:.1f}s.")
                else:
                    logger.debug("Stats collector thread joined successfully.")
                self._thread = None
            else:
                logger.warning("Stop called for stats collector, but thread object was None.")

            # Reset status flags after stopping
            with self._lock:
                self._last_update_successful = False
                self._collected_stats = {}  # Clear last collected stats
            logger.info("Stats collector thread stopped.")
        else:
            logger.debug("Stats collector thread already stopped or never started.")

    def get_latest_stats(self) -> Tuple[Dict[str, Dict[str, int]], int, float, bool]:
        """
        Returns the most recently collected statistics, total in-flight count,
        update time, and success status.

        Returns
        -------
        Tuple[Dict[str, Dict[str, int]], int, float, bool]
            A tuple containing:
            - A dictionary mapping stage names to their statistics (or empty if none collected).
            - The total number of in-flight jobs across all stages.
            - The timestamp (time.time()) of the last update attempt.
            - A boolean indicating if the last collection was successful.
        """
        with self._lock:
            # Return copies to prevent external modification
            stats_copy = self._collected_stats.copy()
            total_inflight = self._total_inflight
            update_time = self._last_update_time
            success = self._last_update_successful
        return stats_copy, total_inflight, update_time, success

    def _collection_loop(self) -> None:
        """
        Main loop for the statistics collection thread. Periodically calls
        collect_stats_now and updates shared state under the lock.
        """
        logger.debug(f"Stats collector loop started. Interval: {self._interval}s.")
        while self._running:
            start_time = time.time()
            new_stats = {}
            success = False
            collection_duration = 0.0

            try:
                # Collect stats using the core logic method
                new_stats, total_inflight, success = self.collect_stats_now()
                collection_duration = time.time() - start_time

                # Update shared state under lock
                with self._lock:
                    self._collected_stats = new_stats
                    self._total_inflight = total_inflight

                    # Fold per-cycle deltas into the lifetime processed totals.
                    for stage, stats in new_stats.items():
                        if "delta_processed" in stats:
                            self._cumulative_stats[stage]["processed"] += stats["delta_processed"]

                    self._last_update_time = time.time()
                    self._last_update_successful = success

            except Exception as e:
                # Catch critical errors within the collection call itself
                logger.error(f"Critical error during collect_stats_now call: {e}", exc_info=True)
                collection_duration = time.time() - start_time
                with self._lock:  # Ensure flags are updated on critical error
                    # NOTE(review): _total_inflight is intentionally(?) left at its
                    # previous value here while the stats dict is cleared — confirm.
                    self._collected_stats = {}  # Clear potentially inconsistent stats
                    self._last_update_successful = False
                    self._last_update_time = time.time()

            # --- Logging ---
            log_level = logging.DEBUG if success else logging.WARNING
            logger.log(
                log_level, f"Stats collection cycle finished (Success: {success}) in {collection_duration:.3f}s."
            )

            # --- Sleep ---
            elapsed = time.time() - start_time
            sleep_time = max(0.1, self._interval - elapsed)

            # Check running flag *before* sleeping to allow faster exit
            if not self._running:
                break

            # Using Event for interruptible sleep might be slightly better for immediate stops,
            # but time.sleep is simpler for now (stop() may wait up to `sleep_time` extra).
            time.sleep(sleep_time)

        logger.info("Stats collector loop finished.")

    def collect_stats_now(self) -> Tuple[Dict[str, Dict[str, int]], int, bool]:
        """
        Performs a single collection cycle of statistics from pipeline actors/queues.

        Returns
        -------
        Tuple[Dict[str, Dict[str, int]], int, bool]
            A tuple containing:
            - A dictionary mapping stage names to their collected statistics.
            - The total number of in-flight jobs (queued + processing) across stages.
            - A boolean indicating if the overall collection was successful.
        """
        if not ray:
            logger.error("[StatsCollectNow] Ray is not available. Cannot collect stats.")
            return {}, 0, False

        overall_success = True
        stage_stats_updates: Dict[str, Dict[str, int]] = {}
        actor_tasks: Dict[ray.ObjectRef, Tuple[Any, str]] = {}
        queue_sizes: Dict[str, int] = {}

        # Snapshot the pipeline structure once so the rest of the cycle works
        # on a consistent view even if the pipeline scales concurrently.
        try:
            current_stages = self._pipeline.get_stages_info()
            current_stage_actors = self._pipeline.get_stage_actors()
            current_edge_queues = self._pipeline.get_edge_queues()
        except Exception as e:
            logger.error(f"[StatsCollectNow] Failed to get pipeline structure: {e}", exc_info=True)
            return {}, 0, False

        logger.debug(f"[StatsCollectNow] Starting collection for {len(current_stages)} stages.")

        # --- 1. Prepare Actor Stat Requests ---
        for stage_info in current_stages:
            stage_name = stage_info.name
            stage_stats_updates[stage_name] = {"processing": 0, "in_flight": 0}

            if stage_info.pending_shutdown:
                logger.debug(f"[StatsCollectNow] Stage '{stage_name}' pending shutdown. Skipping actor queries.")
                # Assume stage has 1 active job to prevent premature scale-down
                stage_stats_updates[stage_name]["processing"] = 1
                stage_stats_updates[stage_name]["in_flight"] = 0
                continue

            actors = current_stage_actors.get(stage_name, [])
            for actor in actors:
                try:
                    # Fire off all requests first; resolved together in step 3.
                    stats_ref = actor.get_stats.remote()
                    actor_tasks[stats_ref] = (actor, stage_name)
                except Exception as e:
                    logger.error(
                        f"[StatsCollectNow] Failed to initiate get_stats for actor {actor}: {e}", exc_info=True
                    )
                    overall_success = False

        logger.debug(f"[StatsCollectNow] Initiated {len(actor_tasks)} actor stat requests.")

        # --- 2. Collect Queue Stats (Synchronous Threaded Calls) ---
        # NOTE(review): despite the heading, these calls are plain synchronous
        # qsize() calls with no thread pool and no timeout.
        for q_name, (queue_actor, _) in current_edge_queues.items():
            try:
                q_size_val = queue_actor.qsize()
                queue_sizes[q_name] = int(q_size_val)
            except Exception as e:
                logger.error(f"[StatsCollectNow] Failed to get queue size for '{q_name}': {e}", exc_info=True)
                queue_sizes[q_name] = 0
                overall_success = False

        # --- 3. Resolve Actor Stats ---
        if actor_tasks:
            try:
                # Wait once for the whole batch, bounded by the actor timeout.
                ready_refs, remaining_refs = ray.wait(
                    list(actor_tasks.keys()), num_returns=len(actor_tasks), timeout=self._actor_timeout
                )

                for ref in ready_refs:
                    actor, stage_name = actor_tasks[ref]
                    try:
                        stats = ray.get(ref)
                        active = int(stats.get("active_processing", 0))
                        delta = int(stats.get("delta_processed", 0))
                        processed = stage_stats_updates[stage_name].get("processed", 0)
                        processing = stage_stats_updates[stage_name].get("processing", 0)
                        stage_stats_updates[stage_name]["processing"] = processing + active
                        stage_stats_updates[stage_name]["processed"] = processed + delta
                        stage_stats_updates[stage_name]["delta_processed"] = (
                            stage_stats_updates[stage_name].get("delta_processed", 0) + delta
                        )

                    except Exception as e:
                        logger.warning(
                            f"[StatsCollectNow] Error getting stats for actor {actor} (Stage '{stage_name}'): {e}"
                        )
                        overall_success = False

                if remaining_refs:
                    logger.warning(f"[StatsCollectNow] {len(remaining_refs)} actor stats requests timed out.")
                    overall_success = False

            except Exception as e:
                logger.error(f"[StatsCollectNow] Error during actor stats collection: {e}", exc_info=True)
                overall_success = False

        # --- 4. Aggregate In-Flight Stats ---
        # A stage's input queues are identified purely by the "_to_<stage>"
        # naming convention of edge queue keys.
        _total_inflight = 0
        for stage_info in current_stages:
            stage_name = stage_info.name
            input_queues = [q_name for q_name in current_edge_queues.keys() if q_name.endswith(f"_to_{stage_name}")]
            total_queued = sum(queue_sizes.get(q, 0) for q in input_queues)
            stage_stats_updates[stage_name]["in_flight"] += total_queued

            _total_inflight += total_queued + stage_stats_updates[stage_name]["processing"]

        logger.debug(f"[StatsCollectNow] Collected stats for {len(stage_stats_updates)} stages.")
        for stage, stats in stage_stats_updates.items():
            flat_stats = ", ".join(f"{k}={v}" for k, v in stats.items())
            total = self._cumulative_stats.get(stage, {}).get("processed", 0)
            logger.debug(f"[StatsCollectNow] {stage}: {flat_stats}, total_processed={total}")

        logger.debug(f"[StatsCollectNow] Total in-flight jobs: {_total_inflight}")
        logger.debug(f"[StatsCollectNow] Stats collection complete. Overall success: {overall_success}")

        return stage_stats_updates, _total_inflight, overall_success
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,82 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ import ray
9
+
10
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
+ from nv_ingest.framework.util.flow_control import filter_by_task
12
+ from nv_ingest_api.internal.extract.audio.audio_extraction import extract_text_from_audio_internal
13
+ from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
15
+ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
16
+ from nv_ingest_api.util.exception_handlers.decorators import (
17
+ nv_ingest_node_failure_try_except,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@ray.remote
class AudioExtractorStage(RayActorStage):
    """
    Ray actor stage that transcribes audio payloads into text.

    For each incoming IngestControlMessage whose DataFrame payload carries
    audio rows, the stage:
    1. Pops the matching "extract" task off the message (which also yields the
       task's parameters).
    2. Runs extract_text_from_audio_internal with the stage's validated
       configuration.
    3. Replaces the message payload with the transcription DataFrame and
       attaches "audio_extraction_info" metadata.
    """

    def __init__(self, config: AudioExtractorSchema) -> None:
        super().__init__(config, log_to_stdout=False)
        try:
            self.validated_config = config
            self._logger.info("AudioExtractorStage configuration validated successfully.")
        except Exception as e:
            self._logger.exception(f"Error validating Audio Extractor config: {e}")
            raise

    @traceable("audio_extractor")
    @filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(mp3|wav)$"})])
    @nv_ingest_node_failure_try_except(annotation_id="audio_extractor", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Extract text from the audio rows carried by *control_message*.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose DataFrame payload contains audio data.

        Returns
        -------
        IngestControlMessage
            The same message, its payload replaced by the extracted-text
            DataFrame and annotated with "audio_extraction_info" metadata.
        """
        self._logger.debug("AudioExtractorStage.on_data: Starting audio extraction process.")

        # Pull the ledger DataFrame off the message.
        ledger_df = control_message.payload()
        self._logger.debug("Extracted payload with %d rows.", len(ledger_df))

        # Removing the task both consumes it and exposes its configuration.
        extract_params = remove_task_by_type(control_message, "extract")
        self._logger.debug("Extracted task config: %s", extract_params)

        # Delegate the actual transcription to the internal extraction routine.
        transcribed_df, extraction_info = extract_text_from_audio_internal(
            df_extraction_ledger=ledger_df,
            task_config=extract_params,
            extraction_config=self.validated_config,
            execution_trace_log=None,
        )

        # Publish results back onto the control message.
        control_message.payload(transcribed_df)
        control_message.set_metadata("audio_extraction_info", extraction_info)

        return control_message
@@ -0,0 +1,92 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ import ray
9
+
10
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
+ from nv_ingest.framework.util.flow_control import filter_by_task
12
+ from nv_ingest_api.internal.extract.image.chart_extractor import extract_chart_data_from_image_internal
13
+ from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
15
+ from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
16
+ from nv_ingest_api.util.exception_handlers.decorators import (
17
+ nv_ingest_node_failure_try_except,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@ray.remote
class ChartExtractorStage(RayActorStage):
    """
    A Ray actor stage that extracts chart data from image content
    (e.g., chart images rendered from document pages).

    It expects an IngestControlMessage containing a DataFrame payload.
    The stage removes the "chart_data_extract" task from the message, calls the
    internal extraction function using a validated ChartExtractorSchema,
    updates the message payload, and annotates the message metadata with
    extraction info. When trace tagging is enabled on the message, any
    timestamps produced by the extraction are copied onto the message.
    """

    def __init__(self, config: ChartExtractorSchema) -> None:
        super().__init__(config)
        try:
            # Config arrives pre-validated as a ChartExtractorSchema; keep a
            # reference for use in on_data.
            self.validated_config = config
        except Exception:
            logger.exception("Error validating chart extractor config")
            # Bare `raise` preserves the original traceback (previously `raise e`,
            # which re-raises by value and obscures the raise site).
            raise

    @traceable("chart_extraction")
    @filter_by_task(required_tasks=["chart_data_extract"])
    @nv_ingest_node_failure_try_except(annotation_id="chart_extraction", raise_on_failure=False)
    def on_data(self, control_message: Any) -> Any:
        """
        Process the control message by extracting chart data.

        Parameters
        ----------
        control_message : IngestControlMessage
            The incoming message containing the DataFrame payload.

        Returns
        -------
        IngestControlMessage
            The updated message with the extracted chart data and extraction
            info in metadata (plus trace timestamps when enabled).
        """
        logger.info("ChartExtractorStage.on_data: Starting chart extraction.")
        # Extract the DataFrame payload.
        df_payload = control_message.payload()
        logger.debug("ChartExtractorStage: Extracted payload with %d rows.", len(df_payload))

        # Remove the "chart_data_extract" task to obtain task-specific configuration.
        task_config = remove_task_by_type(control_message, "chart_data_extract")
        logger.debug("ChartExtractorStage: Task config extracted: %s", task_config)

        # Perform chart data extraction; the extraction routine may populate
        # execution_trace_log with timestamp entries.
        execution_trace_log = {}
        new_df, extraction_info = extract_chart_data_from_image_internal(
            df_extraction_ledger=df_payload,
            task_config=task_config,
            extraction_config=self.validated_config,
            execution_trace_log=execution_trace_log,
        )
        logger.info("ChartExtractorStage: Chart extraction completed. New payload has %d rows.", len(new_df))

        # Update the control message with the new DataFrame.
        control_message.payload(new_df)
        # Annotate the message with extraction info.
        control_message.set_metadata("chart_extraction_info", extraction_info)
        logger.info("ChartExtractorStage: Metadata injection complete. Returning updated control message.")

        # Copy extraction timestamps onto the message only when trace tagging
        # is explicitly enabled on the message config.
        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
        if do_trace_tagging and execution_trace_log:
            for key, ts in execution_trace_log.items():
                control_message.set_timestamp(key, ts)

        return control_message
@@ -0,0 +1,81 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+
7
+ import ray
8
+
9
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
10
+ from nv_ingest.framework.util.flow_control import filter_by_task
11
+ from nv_ingest_api.internal.extract.docx.docx_extractor import extract_primitives_from_docx_internal
12
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
13
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
14
+ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
15
+ from nv_ingest_api.util.exception_handlers.decorators import (
16
+ nv_ingest_node_failure_try_except,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@ray.remote
class DocxExtractorStage(RayActorStage):
    """
    A Ray actor stage that extracts content from DOCX documents.

    It expects an IngestControlMessage containing a DataFrame with DOCX document data. It then:
    1. Removes the "extract" task (filtered to document_type "docx") from the message.
    2. Calls the DOCX extraction logic (via extract_primitives_from_docx_internal) using a validated configuration.
    3. Updates the message payload with the extracted content DataFrame.
    """

    def __init__(self, config: DocxExtractorSchema) -> None:
        super().__init__(config, log_to_stdout=False)
        try:
            self.validated_config = config
            # Use the stage's own logger for consistency with on_data and with
            # sibling stages (e.g. AudioExtractorStage), instead of the module logger.
            self._logger.info("DocxExtractorStage configuration validated successfully.")
        except Exception as e:
            self._logger.exception(f"Error validating DOCX Extractor config: {e}")
            raise

    @traceable("docx_extractor")
    @filter_by_task(required_tasks=[("extract", {"document_type": "docx"})])
    @nv_ingest_node_failure_try_except(annotation_id="docx_extractor", raise_on_failure=True)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Process the control message by extracting content from DOCX documents.

        Parameters
        ----------
        control_message : IngestControlMessage
            The message containing a DataFrame payload with DOCX document data.

        Returns
        -------
        IngestControlMessage
            The updated message with extracted DOCX content and
            "docx_extraction_info" metadata attached.
        """
        self._logger.debug("DocxExtractorStage.on_data: Starting DOCX extraction process.")

        # Extract the DataFrame payload.
        df_ledger = control_message.payload()
        self._logger.debug("Extracted payload with %d rows.", len(df_ledger))

        # Remove the "extract" task from the message to obtain task-specific configuration.
        task_config = remove_task_by_type(control_message, "extract")
        self._logger.debug("Extracted task config: %s", task_config)

        # Perform DOCX content extraction.
        new_df, extraction_info = extract_primitives_from_docx_internal(
            df_extraction_ledger=df_ledger,
            task_config=task_config,
            extraction_config=self.validated_config,
            execution_trace_log=None,
        )

        # Update the message payload with the extracted DOCX content DataFrame.
        control_message.payload(new_df)
        control_message.set_metadata("docx_extraction_info", extraction_info)

        return control_message