nv-ingest 2025.5.21.dev20250521__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/__init__.py +20 -0
- nv_ingest/api/__init__.py +3 -0
- nv_ingest/api/main.py +43 -0
- nv_ingest/api/v1/__init__.py +3 -0
- nv_ingest/api/v1/health.py +114 -0
- nv_ingest/api/v1/ingest.py +454 -0
- nv_ingest/framework/__init__.py +3 -0
- nv_ingest/framework/orchestration/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
- nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
- nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
- nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
- nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
- nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
- nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
- nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
- nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
- nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
- nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
- nv_ingest/framework/schemas/__init__.py +0 -0
- nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
- nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
- nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
- nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
- nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
- nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
- nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
- nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
- nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
- nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
- nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
- nv_ingest/framework/util/__init__.py +3 -0
- nv_ingest/framework/util/flow_control/__init__.py +8 -0
- nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
- nv_ingest/framework/util/service/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
- nv_ingest/framework/util/service/meta/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
- nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest/framework/util/telemetry/global_stats.py +145 -0
- nv_ingest/version.py +38 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
import threading
|
|
7
|
+
import logging
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from typing import Tuple, Dict, Any, Optional
|
|
10
|
+
|
|
11
|
+
import ray
|
|
12
|
+
from ray.exceptions import RayActorError
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class RayStatsCollector:
    """
    Collects statistics from a RayPipeline's actors and queues in parallel
    using a dedicated background thread.

    A daemon thread started via ``start()`` periodically calls
    ``collect_stats_now()`` and caches the result; readers fetch the latest
    snapshot via ``get_latest_stats()``. Shared state is guarded by an
    internal lock.
    """

    def __init__(
        self,
        pipeline_accessor: Any,  # Object providing access to pipeline structure
        interval: float = 30.0,
        actor_timeout: float = 5.0,
        queue_timeout: float = 2.0,
    ):
        """
        Initializes the RayStatsCollector.

        Parameters
        ----------
        pipeline_accessor : Any
            An object (typically the RayPipeline instance) that provides methods
            to access the pipeline's structure safely:
            - `get_stages_info() -> List[StageInfo]`
            - `get_stage_actors() -> Dict[str, List[Any]]`
            - `get_edge_queues() -> Dict[str, Tuple[Any, int]]`
            These methods should return snapshots suitable for iteration.
        interval : float, optional
            The interval in seconds between stats collection attempts, by default 30.0.
        actor_timeout : float, optional
            Timeout in seconds for waiting for stats from a single actor, by default 5.0.
        queue_timeout : float, optional
            Timeout in seconds for waiting for qsize from a single queue, by default 2.0.
            NOTE(review): currently only stored; `collect_stats_now` queries
            queues synchronously without applying this timeout.
        """
        if not ray:
            logger.warning("RayStatsCollector initialized but Ray is not available.")

        self._pipeline = pipeline_accessor
        self._interval = interval
        self._actor_timeout = actor_timeout
        self._queue_timeout = queue_timeout

        self._lock: threading.Lock = threading.Lock()  # Protects access to collected stats and status
        self._running: bool = False
        self._thread: Optional[threading.Thread] = None

        # Internal state holding the latest results (read/written under self._lock)
        self._collected_stats: Dict[str, Dict[str, int]] = {}
        self._total_inflight: int = 0
        self._last_update_time: float = 0.0
        self._last_update_successful: bool = False

        # Running totals of "processed" counts per stage, accumulated from the
        # per-cycle "delta_processed" values reported by actors.
        self._cumulative_stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {"processed": 0})

        logger.info(
            f"RayStatsCollector initialized (Interval: {self._interval}s, "
            f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s)"
        )

    # --- Helper function to be run in threads ---
    # NOTE(review): this helper is not called from within this class as shown;
    # it appears to be retained for external/threaded use — confirm callers.

    def _get_qsize_sync(self, q_name: str, queue_actor: Any) -> Tuple[str, int]:
        """Safely calls qsize() on a queue actor and returns name + size/-1."""
        try:
            # Check right before calling - actor might have become invalid
            if queue_actor is None:
                logger.warning(f"[ThreadPool-qsize] Queue actor for '{q_name}' is None.")
                return q_name, -1
            if hasattr(queue_actor, "qsize") and callable(getattr(queue_actor, "qsize")):
                # Direct, synchronous call
                q_size_val = queue_actor.qsize()
                return q_name, int(q_size_val)
            else:
                logger.warning(f"[ThreadPool-qsize] Queue actor for '{q_name}' lacks qsize method in thread.")
                return q_name, 0  # Treat lack of method as size 0? Or -1? Let's use 0.
        except RayActorError as e:
            # Actor died or became unreachable between the check and the call.
            logger.error(f"[ThreadPool-qsize] Actor error calling qsize for queue {q_name}: {e}")
            return q_name, -1
        except Exception as e:
            logger.error(f"[ThreadPool-qsize] Error calling qsize for queue {q_name}: {e}", exc_info=True)
            return q_name, -1

    def start(self) -> None:
        """Starts the dedicated background statistics collection thread.

        Idempotent: a second call while the thread is alive is a no-op. Also
        repairs the inconsistent state where the running flag is set but the
        thread has died.
        """
        if self._thread is not None and self._thread.is_alive():
            logger.warning("Stats collector thread already started and alive.")
            return
        if self._running and (self._thread is None or not self._thread.is_alive()):
            logger.warning("Stats collector flag was true but thread not running. Resetting flag.")
            self._running = False  # Correct inconsistent state

        if not self._running:
            logger.info("Starting stats collector thread...")
            self._running = True
            with self._lock:
                self._last_update_successful = False  # Mark as stale until first collection
                self._last_update_time = time.time()

            self._thread = threading.Thread(
                target=self._collection_loop,
                daemon=True,  # Ensure thread exits if main program exits
                name="PipelineStatsCollector",
            )
            self._thread.start()

    def stop(self) -> None:
        """Signals the background stats collection thread to stop and waits for it.

        Blocks up to a timeout derived from the collection interval and the
        actor/queue timeouts; logs a warning if the thread does not join in time.
        """
        if self._running:
            logger.info("Stopping stats collector thread...")
            self._running = False  # Signal loop to stop

            if self._thread is not None:
                # Calculate a reasonable join timeout: worst-case one full cycle
                # (interval plus actor/queue waits) plus slack.
                join_timeout = max(10.0, self._interval + self._actor_timeout * 2 + self._queue_timeout * 2 + 5.0)
                logger.debug(f"Waiting up to {join_timeout:.1f}s for stats thread to join...")
                self._thread.join(timeout=join_timeout)

                if self._thread.is_alive():
                    logger.warning(f"Stats collector thread did not stop gracefully after {join_timeout:.1f}s.")
                else:
                    logger.debug("Stats collector thread joined successfully.")
                self._thread = None
            else:
                logger.warning("Stop called for stats collector, but thread object was None.")

            # Reset status flags after stopping
            with self._lock:
                self._last_update_successful = False
                self._collected_stats = {}  # Clear last collected stats
            logger.info("Stats collector thread stopped.")
        else:
            logger.debug("Stats collector thread already stopped or never started.")

    def get_latest_stats(self) -> Tuple[Dict[str, Dict[str, int]], int, float, bool]:
        """
        Returns the most recently collected statistics, total in-flight count,
        update time, and success status.

        Returns
        -------
        Tuple[Dict[str, Dict[str, int]], int, float, bool]
            A tuple containing:
            - A dictionary mapping stage names to their statistics (or empty if none collected).
            - The total number of in-flight jobs at the last collection.
            - The timestamp (time.time()) of the last update attempt.
            - A boolean indicating if the last collection was successful.
        """
        with self._lock:
            # Return copies to prevent external modification
            stats_copy = self._collected_stats.copy()
            total_inflight = self._total_inflight
            update_time = self._last_update_time
            success = self._last_update_successful
            return stats_copy, total_inflight, update_time, success

    def _collection_loop(self) -> None:
        """
        Main loop for the statistics collection thread. Periodically calls
        collect_stats_now and updates shared state.
        """
        logger.debug(f"Stats collector loop started. Interval: {self._interval}s.")
        while self._running:
            start_time = time.time()
            new_stats = {}
            success = False
            collection_duration = 0.0

            try:
                # Collect stats using the core logic method
                new_stats, total_inflight, success = self.collect_stats_now()
                collection_duration = time.time() - start_time

                # Update shared state under lock
                with self._lock:
                    self._collected_stats = new_stats
                    self._total_inflight = total_inflight

                    # Fold each stage's per-cycle delta into the running totals.
                    for stage, stats in new_stats.items():
                        if "delta_processed" in stats:
                            self._cumulative_stats[stage]["processed"] += stats["delta_processed"]

                    self._last_update_time = time.time()
                    self._last_update_successful = success

            except Exception as e:
                # Catch critical errors within the collection call itself
                logger.error(f"Critical error during collect_stats_now call: {e}", exc_info=True)
                collection_duration = time.time() - start_time
                with self._lock:  # Ensure flags are updated on critical error
                    self._collected_stats = {}  # Clear potentially inconsistent stats
                    self._last_update_successful = False
                    self._last_update_time = time.time()

            # --- Logging ---
            log_level = logging.DEBUG if success else logging.WARNING
            logger.log(
                log_level, f"Stats collection cycle finished (Success: {success}) in {collection_duration:.3f}s."
            )

            # --- Sleep --- (subtract elapsed time so cycles stay near the interval)
            elapsed = time.time() - start_time
            sleep_time = max(0.1, self._interval - elapsed)

            # Check running flag *before* sleeping to allow faster exit
            if not self._running:
                break

            # Using Event for interruptible sleep might be slightly better for immediate stops,
            # but time.sleep is simpler for now.
            time.sleep(sleep_time)

        logger.info("Stats collector loop finished.")

    def collect_stats_now(self) -> Tuple[Dict[str, Dict[str, int]], int, bool]:
        """
        Performs a single collection cycle of statistics from pipeline actors/queues.

        Returns
        -------
        Tuple[Dict[str, Dict[str, int]], int, bool]
            A tuple containing:
            - A dictionary mapping stage names to their collected statistics.
            - The total number of in-flight jobs (queued + actively processing).
            - A boolean indicating if the overall collection was successful.
        """
        if not ray:
            logger.error("[StatsCollectNow] Ray is not available. Cannot collect stats.")
            return {}, 0, False

        overall_success = True
        stage_stats_updates: Dict[str, Dict[str, int]] = {}
        actor_tasks: Dict[ray.ObjectRef, Tuple[Any, str]] = {}
        queue_sizes: Dict[str, int] = {}

        # Snapshot the pipeline structure; a failure here aborts the whole cycle.
        try:
            current_stages = self._pipeline.get_stages_info()
            current_stage_actors = self._pipeline.get_stage_actors()
            current_edge_queues = self._pipeline.get_edge_queues()
        except Exception as e:
            logger.error(f"[StatsCollectNow] Failed to get pipeline structure: {e}", exc_info=True)
            return {}, 0, False

        logger.debug(f"[StatsCollectNow] Starting collection for {len(current_stages)} stages.")

        # --- 1. Prepare Actor Stat Requests (fire off get_stats remotes) ---
        for stage_info in current_stages:
            stage_name = stage_info.name
            stage_stats_updates[stage_name] = {"processing": 0, "in_flight": 0}

            if stage_info.pending_shutdown:
                logger.debug(f"[StatsCollectNow] Stage '{stage_name}' pending shutdown. Skipping actor queries.")
                # Assume stage has 1 active job to prevent premature scale-down
                stage_stats_updates[stage_name]["processing"] = 1
                stage_stats_updates[stage_name]["in_flight"] = 0
                continue

            actors = current_stage_actors.get(stage_name, [])
            for actor in actors:
                try:
                    stats_ref = actor.get_stats.remote()
                    actor_tasks[stats_ref] = (actor, stage_name)
                except Exception as e:
                    logger.error(
                        f"[StatsCollectNow] Failed to initiate get_stats for actor {actor}: {e}", exc_info=True
                    )
                    overall_success = False

        logger.debug(f"[StatsCollectNow] Initiated {len(actor_tasks)} actor stat requests.")

        # --- 2. Collect Queue Stats (synchronous, inline qsize() calls) ---
        # NOTE(review): these calls block this thread and do not honor
        # self._queue_timeout; a failed queue reports size 0.
        for q_name, (queue_actor, _) in current_edge_queues.items():
            try:
                q_size_val = queue_actor.qsize()
                queue_sizes[q_name] = int(q_size_val)
            except Exception as e:
                logger.error(f"[StatsCollectNow] Failed to get queue size for '{q_name}': {e}", exc_info=True)
                queue_sizes[q_name] = 0
                overall_success = False

        # --- 3. Resolve Actor Stats (wait up to actor_timeout for all refs) ---
        if actor_tasks:
            try:
                ready_refs, remaining_refs = ray.wait(
                    list(actor_tasks.keys()), num_returns=len(actor_tasks), timeout=self._actor_timeout
                )

                for ref in ready_refs:
                    actor, stage_name = actor_tasks[ref]
                    try:
                        stats = ray.get(ref)
                        # Per-actor counters; sum them into the stage's totals.
                        active = int(stats.get("active_processing", 0))
                        delta = int(stats.get("delta_processed", 0))
                        processed = stage_stats_updates[stage_name].get("processed", 0)
                        processing = stage_stats_updates[stage_name].get("processing", 0)
                        stage_stats_updates[stage_name]["processing"] = processing + active
                        stage_stats_updates[stage_name]["processed"] = processed + delta
                        stage_stats_updates[stage_name]["delta_processed"] = (
                            stage_stats_updates[stage_name].get("delta_processed", 0) + delta
                        )

                    except Exception as e:
                        logger.warning(
                            f"[StatsCollectNow] Error getting stats for actor {actor} (Stage '{stage_name}'): {e}"
                        )
                        overall_success = False

                if remaining_refs:
                    logger.warning(f"[StatsCollectNow] {len(remaining_refs)} actor stats requests timed out.")
                    overall_success = False

            except Exception as e:
                logger.error(f"[StatsCollectNow] Error during actor stats collection: {e}", exc_info=True)
                overall_success = False

        # --- 4. Aggregate In-Flight Stats ---
        # A stage's in-flight count is the depth of its input queues (queues
        # named "<src>_to_<stage>"); total in-flight also counts active jobs.
        _total_inflight = 0
        for stage_info in current_stages:
            stage_name = stage_info.name
            input_queues = [q_name for q_name in current_edge_queues.keys() if q_name.endswith(f"_to_{stage_name}")]
            total_queued = sum(queue_sizes.get(q, 0) for q in input_queues)
            stage_stats_updates[stage_name]["in_flight"] += total_queued

            _total_inflight += total_queued + stage_stats_updates[stage_name]["processing"]

        logger.debug(f"[StatsCollectNow] Collected stats for {len(stage_stats_updates)} stages.")
        for stage, stats in stage_stats_updates.items():
            flat_stats = ", ".join(f"{k}={v}" for k, v in stats.items())
            total = self._cumulative_stats.get(stage, {}).get("processed", 0)
            logger.debug(f"[StatsCollectNow] {stage}: {flat_stats}, total_processed={total}")

        logger.debug(f"[StatsCollectNow] Total in-flight jobs: {_total_inflight}")
        logger.debug(f"[StatsCollectNow] Stats collection complete. Overall success: {overall_success}")

        return stage_stats_updates, _total_inflight, overall_success
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
import ray
|
|
9
|
+
|
|
10
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
12
|
+
from nv_ingest_api.internal.extract.audio.audio_extraction import extract_text_from_audio_internal
|
|
13
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
15
|
+
from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
|
|
16
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
|
+
nv_ingest_node_failure_try_except,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@ray.remote
class AudioExtractorStage(RayActorStage):
    """
    Ray actor stage that transcribes audio content into text.

    Given an IngestControlMessage whose payload DataFrame holds audio data,
    this stage pops the "extract" task from the message, runs the internal
    audio extraction routine with the stage's validated configuration, and
    replaces the payload with the resulting text DataFrame. Extraction
    details are attached under the "audio_extraction_info" metadata key.
    """

    def __init__(self, config: AudioExtractorSchema) -> None:
        super().__init__(config, log_to_stdout=False)
        try:
            self.validated_config = config
        except Exception as e:
            self._logger.exception(f"Error validating Audio Extractor config: {e}")
            raise
        else:
            self._logger.info("AudioExtractorStage configuration validated successfully.")

    @traceable("audio_extractor")
    @filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(mp3|wav)$"})])
    @nv_ingest_node_failure_try_except(annotation_id="audio_extractor", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Run audio-to-text extraction for one control message.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose DataFrame payload contains audio rows.

        Returns
        -------
        IngestControlMessage
            The same message, with its payload replaced by the extracted text
            DataFrame and extraction info stored in metadata.
        """
        self._logger.debug("AudioExtractorStage.on_data: Starting audio extraction process.")

        # Pull the ledger DataFrame out of the message.
        ledger_df = control_message.payload()
        self._logger.debug("Extracted payload with %d rows.", len(ledger_df))

        # Pop the "extract" task; its payload carries the per-job settings.
        task_config = remove_task_by_type(control_message, "extract")
        self._logger.debug("Extracted task config: %s", task_config)

        # Delegate the actual transcription to the internal API.
        extracted_df, extraction_info = extract_text_from_audio_internal(
            df_extraction_ledger=ledger_df,
            task_config=task_config,
            extraction_config=self.validated_config,
            execution_trace_log=None,
        )

        # Swap in the new payload and record what happened.
        control_message.payload(extracted_df)
        control_message.set_metadata("audio_extraction_info", extraction_info)

        return control_message
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import ray
|
|
9
|
+
|
|
10
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
12
|
+
from nv_ingest_api.internal.extract.image.chart_extractor import extract_chart_data_from_image_internal
|
|
13
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
15
|
+
from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
|
|
16
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
|
+
nv_ingest_node_failure_try_except,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@ray.remote
class ChartExtractorStage(RayActorStage):
    """
    A Ray actor stage that extracts chart data from image content.

    It expects an IngestControlMessage containing a DataFrame payload.
    The stage removes the "chart_data_extract" task from the message, calls the
    internal extraction function using a validated ChartExtractorSchema, updates
    the message payload, and annotates the message metadata with extraction info.
    When trace tagging is enabled ("config::add_trace_tagging" metadata), the
    collected execution trace timestamps are copied onto the message.
    """

    def __init__(self, config: ChartExtractorSchema) -> None:
        """
        Parameters
        ----------
        config : ChartExtractorSchema
            Pre-validated configuration for chart extraction.
        """
        super().__init__(config)
        try:
            self.validated_config = config
        except Exception:
            logger.exception("Error validating chart extractor config")
            # Bare raise preserves the original traceback and exception context.
            raise

    @traceable("chart_extraction")
    @filter_by_task(required_tasks=["chart_data_extract"])
    @nv_ingest_node_failure_try_except(annotation_id="chart_extraction", raise_on_failure=False)
    def on_data(self, control_message: Any) -> Any:
        """
        Process the control message by extracting chart data.

        Parameters
        ----------
        control_message : IngestControlMessage
            The incoming message containing the image payload.

        Returns
        -------
        IngestControlMessage
            The updated message with the extracted chart data and extraction info in metadata.
        """
        logger.info("ChartExtractorStage.on_data: Starting chart extraction.")
        # Extract the DataFrame payload.
        df_payload = control_message.payload()
        logger.debug("ChartExtractorStage: Extracted payload with %d rows.", len(df_payload))

        # Remove the "chart_data_extract" task to obtain task-specific configuration.
        task_config = remove_task_by_type(control_message, "chart_data_extract")
        logger.debug("ChartExtractorStage: Task config extracted: %s", task_config)

        # Perform chart data extraction; the trace log dict is filled in-place
        # by the internal API with per-step timestamps.
        execution_trace_log = {}
        new_df, extraction_info = extract_chart_data_from_image_internal(
            df_extraction_ledger=df_payload,
            task_config=task_config,
            extraction_config=self.validated_config,
            execution_trace_log=execution_trace_log,
        )
        logger.info("ChartExtractorStage: Chart extraction completed. New payload has %d rows.", len(new_df))

        # Update the control message with the new DataFrame.
        control_message.payload(new_df)
        # Annotate the message with extraction info.
        control_message.set_metadata("chart_extraction_info", extraction_info)
        logger.info("ChartExtractorStage: Metadata injection complete. Returning updated control message.")

        # Propagate trace timestamps only when explicitly enabled on the message.
        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
        if do_trace_tagging and execution_trace_log:
            for key, ts in execution_trace_log.items():
                control_message.set_timestamp(key, ts)

        return control_message
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import ray
|
|
8
|
+
|
|
9
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
10
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
11
|
+
from nv_ingest_api.internal.extract.docx.docx_extractor import extract_primitives_from_docx_internal
|
|
12
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
13
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
14
|
+
from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
|
|
15
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
16
|
+
nv_ingest_node_failure_try_except,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@ray.remote
class DocxExtractorStage(RayActorStage):
    """
    Ray actor stage that extracts primitives from DOCX documents.

    Expects an IngestControlMessage whose payload is a DataFrame of DOCX
    document rows. Processing steps:

    1. Pop the "extract" task from the message to obtain task-specific config.
    2. Run ``extract_primitives_from_docx_internal`` with the validated
       stage configuration.
    3. Replace the message payload with the extracted-content DataFrame and
       record extraction info in metadata under ``"docx_extraction_info"``.
    """

    def __init__(self, config: DocxExtractorSchema) -> None:
        super().__init__(config, log_to_stdout=False)
        try:
            # Config arrives as a DocxExtractorSchema instance; keep a handle
            # on it for use during extraction.
            self.validated_config = config
            logger.info("DocxExtractorStage configuration validated successfully.")
        except Exception as e:
            logger.exception(f"Error validating DOCX Extractor config: {e}")
            raise

    @traceable("docx_extractor")
    @filter_by_task(required_tasks=[("extract", {"document_type": "docx"})])
    @nv_ingest_node_failure_try_except(annotation_id="docx_extractor", raise_on_failure=True)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Extract DOCX content from the incoming control message.

        Parameters
        ----------
        control_message : IngestControlMessage
            The message containing a DataFrame payload with DOCX document data.

        Returns
        -------
        IngestControlMessage
            The same message, its payload replaced by the extracted content
            DataFrame and ``"docx_extraction_info"`` set in metadata.
        """
        self._logger.debug("DocxExtractorStage.on_data: Starting DOCX extraction process.")

        # Pull the DataFrame ledger out of the message payload.
        ledger_df = control_message.payload()
        self._logger.debug("Extracted payload with %d rows.", len(ledger_df))

        # Pop the "extract" task so its configuration drives this extraction.
        extract_task_config = remove_task_by_type(control_message, "extract")
        self._logger.debug("Extracted task config: %s", extract_task_config)

        # Delegate the actual DOCX primitive extraction to the internal API.
        extracted_df, extraction_info = extract_primitives_from_docx_internal(
            df_extraction_ledger=ledger_df,
            task_config=extract_task_config,
            extraction_config=self.validated_config,
            execution_trace_log=None,
        )

        # Write results back onto the message: new payload plus extraction info.
        control_message.payload(extracted_df)
        control_message.set_metadata("docx_extraction_info", extraction_info)

        return control_message