nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +203 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +1300 -0
- nv_ingest/framework/orchestration/execution/__init__.py +3 -0
- nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest/framework/orchestration/process/__init__.py +3 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
- nv_ingest/framework/orchestration/process/execution.py +495 -0
- nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
- nv_ingest/framework/orchestration/process/strategies.py +218 -0
- nv_ingest/framework/orchestration/process/termination.py +147 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
- nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest/pipeline/config/loaders.py +229 -0
- nv_ingest/pipeline/config/replica_resolver.py +237 -0
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
- nv_ingest/pipeline/default_pipeline_impl.py +557 -0
- nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
|
@@ -3,8 +3,6 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import multiprocessing
|
|
6
|
-
import os
|
|
7
|
-
import signal
|
|
8
6
|
import threading
|
|
9
7
|
from abc import ABC, abstractmethod
|
|
10
8
|
from dataclasses import dataclass
|
|
@@ -22,6 +20,7 @@ import logging
|
|
|
22
20
|
import time
|
|
23
21
|
|
|
24
22
|
from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import PipelineTopology, StageInfo
|
|
23
|
+
from nv_ingest.framework.orchestration.process.termination import kill_pipeline_process_group
|
|
25
24
|
from nv_ingest.framework.orchestration.ray.primitives.ray_stat_collector import RayStatsCollector
|
|
26
25
|
from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import PIDController, ResourceConstraintManager
|
|
27
26
|
from nv_ingest.framework.orchestration.ray.util.pipeline.tools import wrap_callable_as_stage
|
|
@@ -120,24 +119,19 @@ class RayPipelineSubprocessInterface(PipelineInterface):
|
|
|
120
119
|
|
|
121
120
|
def stop(self) -> None:
|
|
122
121
|
"""
|
|
123
|
-
Stops the subprocess pipeline
|
|
122
|
+
Stops the subprocess pipeline and its entire process group to ensure
|
|
123
|
+
any child processes (e.g., the simple message broker) are terminated.
|
|
124
124
|
"""
|
|
125
|
-
|
|
125
|
+
try:
|
|
126
|
+
pid = int(self._process.pid)
|
|
127
|
+
except Exception:
|
|
126
128
|
return
|
|
127
129
|
|
|
130
|
+
# Always attempt to terminate the entire process group
|
|
128
131
|
try:
|
|
129
|
-
|
|
130
|
-
self._process.join(timeout=5.0)
|
|
132
|
+
kill_pipeline_process_group(pid)
|
|
131
133
|
except Exception as e:
|
|
132
|
-
logger.warning(f"
|
|
133
|
-
|
|
134
|
-
if self._process.is_alive():
|
|
135
|
-
try:
|
|
136
|
-
pgid = os.getpgid(self._process.pid)
|
|
137
|
-
os.killpg(pgid, signal.SIGKILL)
|
|
138
|
-
except Exception as e:
|
|
139
|
-
logger.error(f"Failed to force-kill process group: {e}")
|
|
140
|
-
self._process.join(timeout=3.0)
|
|
134
|
+
logger.warning(f"kill_pipeline_process_group failed: {e}")
|
|
141
135
|
|
|
142
136
|
|
|
143
137
|
class RayPipelineInterface(PipelineInterface):
|
|
@@ -252,7 +246,7 @@ class RayPipeline(PipelineInterface):
|
|
|
252
246
|
penalty_factor=self.scaling_config.pid_penalty_factor,
|
|
253
247
|
error_boost_factor=self.scaling_config.pid_error_boost_factor,
|
|
254
248
|
)
|
|
255
|
-
logger.
|
|
249
|
+
logger.debug("PIDController initialized using ScalingConfig.")
|
|
256
250
|
|
|
257
251
|
try:
|
|
258
252
|
total_system_memory_bytes = psutil.virtual_memory().total
|
|
@@ -270,7 +264,7 @@ class RayPipeline(PipelineInterface):
|
|
|
270
264
|
memory_threshold=absolute_memory_threshold_mb,
|
|
271
265
|
memory_safety_buffer_fraction=self.scaling_config.rcm_memory_safety_buffer_fraction,
|
|
272
266
|
)
|
|
273
|
-
logger.
|
|
267
|
+
logger.debug("ResourceConstraintManager initialized using ScalingConfig.")
|
|
274
268
|
|
|
275
269
|
# --- Instantiate Stats Collector ---
|
|
276
270
|
self._stats_collection_interval_seconds = self.stats_config.collection_interval_seconds
|
|
@@ -282,7 +276,7 @@ class RayPipeline(PipelineInterface):
|
|
|
282
276
|
ema_alpha=self.scaling_config.pid_ema_alpha,
|
|
283
277
|
)
|
|
284
278
|
|
|
285
|
-
logger.
|
|
279
|
+
logger.debug("RayStatsCollector initialized using StatsConfig.")
|
|
286
280
|
|
|
287
281
|
# --- Accessor Methods for Stat Collector (and internal use) ---
|
|
288
282
|
|
|
@@ -349,11 +343,11 @@ class RayPipeline(PipelineInterface):
|
|
|
349
343
|
# Update constraint manager
|
|
350
344
|
self.constraint_manager.max_replicas = total_max_replicas
|
|
351
345
|
|
|
352
|
-
logger.
|
|
346
|
+
logger.debug(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
|
|
353
347
|
|
|
354
348
|
def _instantiate_initial_actors(self) -> None:
|
|
355
349
|
"""Instantiates initial actors and updates topology."""
|
|
356
|
-
logger.
|
|
350
|
+
logger.debug("[Build-Actors] Instantiating initial stage actors (min_replicas)...")
|
|
357
351
|
# Use topology accessor
|
|
358
352
|
current_stages = self.topology.get_stages_info()
|
|
359
353
|
|
|
@@ -377,7 +371,7 @@ class RayPipeline(PipelineInterface):
|
|
|
377
371
|
)
|
|
378
372
|
try:
|
|
379
373
|
actor = stage.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
|
|
380
|
-
config=stage.config
|
|
374
|
+
config=stage.config, stage_name=stage.name
|
|
381
375
|
)
|
|
382
376
|
replicas.append(actor)
|
|
383
377
|
except Exception as e:
|
|
@@ -388,7 +382,7 @@ class RayPipeline(PipelineInterface):
|
|
|
388
382
|
self.topology.set_actors_for_stage(stage.name, replicas)
|
|
389
383
|
logger.debug(f"[Build-Actors] Stage '{stage.name}' initial actors set in topology: count={len(replicas)}")
|
|
390
384
|
|
|
391
|
-
logger.
|
|
385
|
+
logger.debug("[Build-Actors] Initial actor instantiation complete.")
|
|
392
386
|
|
|
393
387
|
def _create_and_wire_edges(self) -> List[ray.ObjectRef]:
|
|
394
388
|
"""
|
|
@@ -399,7 +393,7 @@ class RayPipeline(PipelineInterface):
|
|
|
399
393
|
List[ray.ObjectRef]
|
|
400
394
|
A list of object references for the remote wiring calls.
|
|
401
395
|
"""
|
|
402
|
-
logger.
|
|
396
|
+
logger.debug("[Build-Wiring] Creating and wiring edges...")
|
|
403
397
|
wiring_refs = []
|
|
404
398
|
new_edge_queues: Dict[str, Tuple[Any, int]] = {}
|
|
405
399
|
|
|
@@ -628,7 +622,7 @@ class RayPipeline(PipelineInterface):
|
|
|
628
622
|
Dict[str, List[Any]]
|
|
629
623
|
A dictionary mapping stage names to lists of actor handles.
|
|
630
624
|
"""
|
|
631
|
-
logger.
|
|
625
|
+
logger.debug("--- Starting Pipeline Build Process ---")
|
|
632
626
|
try:
|
|
633
627
|
if not self.topology.get_stages_info():
|
|
634
628
|
logger.error("Build failed: No stages defined in topology.")
|
|
@@ -640,7 +634,7 @@ class RayPipeline(PipelineInterface):
|
|
|
640
634
|
wiring_futures = self._create_and_wire_edges()
|
|
641
635
|
self._wait_for_wiring(wiring_futures)
|
|
642
636
|
|
|
643
|
-
logger.
|
|
637
|
+
logger.debug("--- Pipeline Build Completed Successfully ---")
|
|
644
638
|
return self.topology.get_stage_actors() # Return actors from topology
|
|
645
639
|
|
|
646
640
|
except RuntimeError as e:
|
|
@@ -673,7 +667,7 @@ class RayPipeline(PipelineInterface):
|
|
|
673
667
|
logger.debug(f"[ScaleUtil] Creating new actor '{actor_name}' for stage '{stage_info.name}'")
|
|
674
668
|
try:
|
|
675
669
|
new_actor = stage_info.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
|
|
676
|
-
config=stage_info.config
|
|
670
|
+
config=stage_info.config, stage_name=stage_info.name
|
|
677
671
|
)
|
|
678
672
|
|
|
679
673
|
return new_actor
|
|
@@ -861,7 +855,7 @@ class RayPipeline(PipelineInterface):
|
|
|
861
855
|
# Select actors to remove (e.g., the most recently added)
|
|
862
856
|
actors_to_remove = current_replicas[-num_to_remove:]
|
|
863
857
|
|
|
864
|
-
logger.
|
|
858
|
+
logger.debug(f"[ScaleDown-{stage_name}] Selected {len(actors_to_remove)} actors for removal.")
|
|
865
859
|
|
|
866
860
|
# Signal each actor to stop and mark it for removal by the topology.
|
|
867
861
|
# The topology's cleanup thread will handle polling and final removal.
|
|
@@ -966,7 +960,7 @@ class RayPipeline(PipelineInterface):
|
|
|
966
960
|
True if the pipeline drained successfully, False otherwise.
|
|
967
961
|
"""
|
|
968
962
|
start_time = time.time()
|
|
969
|
-
logger.
|
|
963
|
+
logger.debug(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
|
|
970
964
|
last_in_flight = -1
|
|
971
965
|
drain_check_interval = 1.0 # Check every second
|
|
972
966
|
|
|
@@ -1172,7 +1166,7 @@ class RayPipeline(PipelineInterface):
|
|
|
1172
1166
|
force : bool, optional
|
|
1173
1167
|
Whether to force the flush, by default False.
|
|
1174
1168
|
"""
|
|
1175
|
-
logger.
|
|
1169
|
+
logger.debug(f"Manual queue flush requested (force={force}).")
|
|
1176
1170
|
|
|
1177
1171
|
if self.topology.get_is_flushing() or self._stopping: # Check topology
|
|
1178
1172
|
logger.warning("Flush already in progress or pipeline is stopping.")
|
|
@@ -1183,7 +1177,7 @@ class RayPipeline(PipelineInterface):
|
|
|
1183
1177
|
# For now, run synchronously:
|
|
1184
1178
|
self._execute_queue_flush()
|
|
1185
1179
|
else:
|
|
1186
|
-
logger.
|
|
1180
|
+
logger.debug("Manual flush denied: pipeline not quiet or interval not met.")
|
|
1187
1181
|
|
|
1188
1182
|
def _gather_controller_metrics(
|
|
1189
1183
|
self, current_stage_stats: Dict[str, Dict[str, int]], global_in_flight: int
|
|
@@ -1409,7 +1403,7 @@ class RayPipeline(PipelineInterface):
|
|
|
1409
1403
|
self._consecutive_quiet_cycles += 1
|
|
1410
1404
|
logger.debug(f"Pipeline is quiet. Consecutive quiet cycles: {self._consecutive_quiet_cycles}")
|
|
1411
1405
|
if self._consecutive_quiet_cycles >= self.consecutive_quiet_cycles_for_flush:
|
|
1412
|
-
logger.
|
|
1406
|
+
logger.debug(
|
|
1413
1407
|
f"Pipeline has been quiet for {self._consecutive_quiet_cycles} cycles. "
|
|
1414
1408
|
"Initiating queue flush."
|
|
1415
1409
|
)
|
|
@@ -1423,7 +1417,7 @@ class RayPipeline(PipelineInterface):
|
|
|
1423
1417
|
)
|
|
1424
1418
|
else:
|
|
1425
1419
|
if self._consecutive_quiet_cycles > 0:
|
|
1426
|
-
logger.
|
|
1420
|
+
logger.debug(
|
|
1427
1421
|
f"Pipeline is no longer quiet. Resetting consecutive quiet cycle count "
|
|
1428
1422
|
f"from {self._consecutive_quiet_cycles} to 0."
|
|
1429
1423
|
)
|
|
@@ -1479,7 +1473,7 @@ class RayPipeline(PipelineInterface):
|
|
|
1479
1473
|
interval : float
|
|
1480
1474
|
The interval in seconds.
|
|
1481
1475
|
"""
|
|
1482
|
-
logger.
|
|
1476
|
+
logger.debug(f"Scaling loop started. Interval: {interval}s")
|
|
1483
1477
|
while self._scaling_monitoring:
|
|
1484
1478
|
try:
|
|
1485
1479
|
self._perform_scaling_and_maintenance()
|
|
@@ -1490,7 +1484,7 @@ class RayPipeline(PipelineInterface):
|
|
|
1490
1484
|
if not self._scaling_monitoring:
|
|
1491
1485
|
break
|
|
1492
1486
|
time.sleep(sleep_time)
|
|
1493
|
-
logger.
|
|
1487
|
+
logger.debug("Scaling loop finished.")
|
|
1494
1488
|
|
|
1495
1489
|
def _start_scaling(self, poll_interval: float = 10.0) -> None:
|
|
1496
1490
|
"""
|
|
@@ -1505,7 +1499,7 @@ class RayPipeline(PipelineInterface):
|
|
|
1505
1499
|
self._scaling_monitoring = True
|
|
1506
1500
|
self._scaling_thread = threading.Thread(target=self._scaling_loop, args=(poll_interval,), daemon=True)
|
|
1507
1501
|
self._scaling_thread.start()
|
|
1508
|
-
logger.
|
|
1502
|
+
logger.debug(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
|
|
1509
1503
|
|
|
1510
1504
|
def _stop_scaling(self) -> None:
|
|
1511
1505
|
"""
|
|
@@ -1519,7 +1513,7 @@ class RayPipeline(PipelineInterface):
|
|
|
1519
1513
|
if self._scaling_thread.is_alive():
|
|
1520
1514
|
logger.warning("Scaling thread did not exit cleanly.")
|
|
1521
1515
|
self._scaling_thread = None
|
|
1522
|
-
logger.
|
|
1516
|
+
logger.debug("Scaling/Maintenance stopped.")
|
|
1523
1517
|
|
|
1524
1518
|
# --- Pipeline Start/Stop ---
|
|
1525
1519
|
def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
|
|
@@ -1548,7 +1542,7 @@ class RayPipeline(PipelineInterface):
|
|
|
1548
1542
|
logger.debug(f"Waiting for {len(start_futures)} actors to start...")
|
|
1549
1543
|
try:
|
|
1550
1544
|
ray.get(start_futures, timeout=60.0)
|
|
1551
|
-
logger.
|
|
1545
|
+
logger.debug(f"{len(start_futures)} actors started.")
|
|
1552
1546
|
except Exception as e:
|
|
1553
1547
|
logger.error(f"Error/Timeout starting actors: {e}", exc_info=True)
|
|
1554
1548
|
self.stop() # Attempt cleanup
|
|
@@ -1593,7 +1587,7 @@ class RayPipeline(PipelineInterface):
|
|
|
1593
1587
|
logger.warning(
|
|
1594
1588
|
f"Timeout waiting for {len(not_ready)} actors to stop. " f"Proceeding with shutdown."
|
|
1595
1589
|
)
|
|
1596
|
-
logger.
|
|
1590
|
+
logger.debug(f"{len(ready)} actors confirmed stop.")
|
|
1597
1591
|
except Exception as e:
|
|
1598
1592
|
logger.error(f"An unexpected error occurred during actor shutdown: {e}", exc_info=True)
|
|
1599
1593
|
|
|
@@ -72,7 +72,7 @@ class RayStatsCollector:
|
|
|
72
72
|
self._cumulative_stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {"processed": 0})
|
|
73
73
|
self.ema_memory_per_replica: Dict[str, float] = {} # EMA of memory per replica
|
|
74
74
|
|
|
75
|
-
logger.
|
|
75
|
+
logger.debug(
|
|
76
76
|
f"RayStatsCollector initialized (Interval: {self._interval}s, "
|
|
77
77
|
f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s, "
|
|
78
78
|
f"EMA Alpha: {self.ema_alpha})"
|
|
@@ -111,7 +111,7 @@ class RayStatsCollector:
|
|
|
111
111
|
self._running = False # Correct inconsistent state
|
|
112
112
|
|
|
113
113
|
if not self._running:
|
|
114
|
-
logger.
|
|
114
|
+
logger.debug("Starting stats collector thread...")
|
|
115
115
|
self._running = True
|
|
116
116
|
with self._lock:
|
|
117
117
|
self._last_update_successful = False # Mark as stale until first collection
|
|
@@ -129,7 +129,7 @@ class RayStatsCollector:
|
|
|
129
129
|
def stop(self) -> None:
|
|
130
130
|
"""Signals the background stats collection thread to stop and waits for it."""
|
|
131
131
|
if self._running:
|
|
132
|
-
logger.
|
|
132
|
+
logger.debug("Stopping stats collector thread...")
|
|
133
133
|
self._running = False # Signal loop to stop
|
|
134
134
|
|
|
135
135
|
if self._thread is not None:
|
|
@@ -150,7 +150,7 @@ class RayStatsCollector:
|
|
|
150
150
|
with self._lock:
|
|
151
151
|
self._last_update_successful = False
|
|
152
152
|
self._collected_stats = {} # Clear last collected stats
|
|
153
|
-
logger.
|
|
153
|
+
logger.debug("Stats collector thread stopped.")
|
|
154
154
|
else:
|
|
155
155
|
logger.debug("Stats collector thread already stopped or never started.")
|
|
156
156
|
|
|
@@ -230,7 +230,7 @@ class RayStatsCollector:
|
|
|
230
230
|
# but time.sleep is simpler for now.
|
|
231
231
|
time.sleep(sleep_time)
|
|
232
232
|
|
|
233
|
-
logger.
|
|
233
|
+
logger.debug("Stats collector loop finished.")
|
|
234
234
|
|
|
235
235
|
def collect_stats_now(self) -> Tuple[Dict[str, Dict[str, int]], int, bool]:
|
|
236
236
|
"""
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
|
-
|
|
7
|
+
from typing import Optional
|
|
8
8
|
import ray
|
|
9
9
|
|
|
10
10
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
@@ -16,6 +16,9 @@ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExt
|
|
|
16
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
17
|
nv_ingest_node_failure_try_except,
|
|
18
18
|
)
|
|
19
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
20
|
+
|
|
21
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
19
22
|
|
|
20
23
|
logger = logging.getLogger(__name__)
|
|
21
24
|
|
|
@@ -31,8 +34,8 @@ class AudioExtractorStage(RayActorStage):
|
|
|
31
34
|
3. Updates the message payload with the extracted text DataFrame.
|
|
32
35
|
"""
|
|
33
36
|
|
|
34
|
-
def __init__(self, config: AudioExtractorSchema) -> None:
|
|
35
|
-
super().__init__(config, log_to_stdout=False)
|
|
37
|
+
def __init__(self, config: AudioExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
38
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
36
39
|
try:
|
|
37
40
|
self.validated_config = config
|
|
38
41
|
self._logger.info("AudioExtractorStage configuration validated successfully.")
|
|
@@ -40,9 +43,10 @@ class AudioExtractorStage(RayActorStage):
|
|
|
40
43
|
self._logger.exception(f"Error validating Audio Extractor config: {e}")
|
|
41
44
|
raise
|
|
42
45
|
|
|
43
|
-
@
|
|
46
|
+
@nv_ingest_node_failure_try_except()
|
|
47
|
+
@traceable()
|
|
48
|
+
@udf_intercept_hook()
|
|
44
49
|
@filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(mp3|wav)$"})])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="audio_extractor", raise_on_failure=False)
|
|
46
50
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
51
|
"""
|
|
48
52
|
Process the control message by extracting text from audio.
|
|
@@ -62,10 +66,9 @@ class AudioExtractorStage(RayActorStage):
|
|
|
62
66
|
# Extract the DataFrame payload.
|
|
63
67
|
df_ledger = control_message.payload()
|
|
64
68
|
self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
|
|
65
|
-
|
|
66
69
|
# Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
|
|
67
70
|
task_config = remove_task_by_type(control_message, "extract")
|
|
68
|
-
self._logger.debug("Extracted task config: %s", task_config)
|
|
71
|
+
self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
69
72
|
|
|
70
73
|
# Perform audio text extraction.
|
|
71
74
|
new_df, extraction_info = extract_text_from_audio_internal(
|
|
@@ -3,19 +3,21 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Optional
|
|
7
7
|
|
|
8
8
|
import ray
|
|
9
|
-
|
|
10
|
-
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
|
-
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
12
9
|
from nv_ingest_api.internal.extract.image.chart_extractor import extract_chart_data_from_image_internal
|
|
13
10
|
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
11
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import set_trace_timestamps_with_parent_context
|
|
12
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
14
14
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
15
15
|
from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
|
|
16
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
16
17
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
18
|
nv_ingest_node_failure_try_except,
|
|
18
19
|
)
|
|
20
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
19
21
|
|
|
20
22
|
logger = logging.getLogger(__name__)
|
|
21
23
|
|
|
@@ -31,8 +33,8 @@ class ChartExtractorStage(RayActorStage):
|
|
|
31
33
|
and annotates the message metadata with extraction info.
|
|
32
34
|
"""
|
|
33
35
|
|
|
34
|
-
def __init__(self, config: ChartExtractorSchema) -> None:
|
|
35
|
-
super().__init__(config)
|
|
36
|
+
def __init__(self, config: ChartExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
37
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
38
|
try:
|
|
37
39
|
self.validated_config = config
|
|
38
40
|
# logger.warning(
|
|
@@ -42,9 +44,10 @@ class ChartExtractorStage(RayActorStage):
|
|
|
42
44
|
logger.exception("Error validating chart extractor config")
|
|
43
45
|
raise e
|
|
44
46
|
|
|
45
|
-
@
|
|
47
|
+
@nv_ingest_node_failure_try_except()
|
|
48
|
+
@traceable()
|
|
49
|
+
@udf_intercept_hook()
|
|
46
50
|
@filter_by_task(required_tasks=["chart_data_extract"])
|
|
47
|
-
@nv_ingest_node_failure_try_except(annotation_id="chart_extraction", raise_on_failure=False)
|
|
48
51
|
def on_data(self, control_message: Any) -> Any:
|
|
49
52
|
"""
|
|
50
53
|
Process the control message by extracting chart data.
|
|
@@ -59,14 +62,14 @@ class ChartExtractorStage(RayActorStage):
|
|
|
59
62
|
IngestControlMessage
|
|
60
63
|
The updated message with the extracted chart data and extraction info in metadata.
|
|
61
64
|
"""
|
|
62
|
-
logger.
|
|
65
|
+
logger.debug("ChartExtractorStage.on_data: Starting chart extraction.")
|
|
63
66
|
# Extract the DataFrame payload.
|
|
64
67
|
df_payload = control_message.payload()
|
|
65
68
|
logger.debug("ChartExtractorStage: Extracted payload with %d rows.", len(df_payload))
|
|
66
69
|
|
|
67
70
|
# Remove the "chart_data_extract" task to obtain task-specific configuration.
|
|
68
71
|
task_config = remove_task_by_type(control_message, "chart_data_extract")
|
|
69
|
-
logger.debug("ChartExtractorStage: Task config extracted: %s", task_config)
|
|
72
|
+
logger.debug("ChartExtractorStage: Task config extracted: %s", sanitize_for_logging(task_config))
|
|
70
73
|
|
|
71
74
|
# Perform chart data extraction.
|
|
72
75
|
execution_trace_log = {}
|
|
@@ -76,17 +79,17 @@ class ChartExtractorStage(RayActorStage):
|
|
|
76
79
|
extraction_config=self.validated_config,
|
|
77
80
|
execution_trace_log=execution_trace_log,
|
|
78
81
|
)
|
|
79
|
-
logger.
|
|
82
|
+
logger.debug("ChartExtractorStage: Chart extraction completed. New payload has %d rows.", len(new_df))
|
|
80
83
|
|
|
81
84
|
# Update the control message with the new DataFrame.
|
|
82
85
|
control_message.payload(new_df)
|
|
83
86
|
# Annotate the message with extraction info.
|
|
84
87
|
control_message.set_metadata("chart_extraction_info", extraction_info)
|
|
85
|
-
logger.
|
|
88
|
+
logger.debug("ChartExtractorStage: Metadata injection complete. Returning updated control message.")
|
|
86
89
|
|
|
87
90
|
do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
|
|
88
91
|
if do_trace_tagging and execution_trace_log:
|
|
89
|
-
|
|
90
|
-
|
|
92
|
+
parent_name = self.stage_name if self.stage_name else "chart_extractor"
|
|
93
|
+
set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
|
|
91
94
|
|
|
92
95
|
return control_message
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
from typing import Optional
|
|
6
7
|
|
|
7
8
|
import ray
|
|
8
9
|
|
|
@@ -15,6 +16,9 @@ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtra
|
|
|
15
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
16
17
|
nv_ingest_node_failure_try_except,
|
|
17
18
|
)
|
|
19
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
20
|
+
|
|
21
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
22
|
|
|
19
23
|
logger = logging.getLogger(__name__)
|
|
20
24
|
|
|
@@ -26,12 +30,12 @@ class DocxExtractorStage(RayActorStage):
|
|
|
26
30
|
|
|
27
31
|
It expects an IngestControlMessage containing a DataFrame with DOCX document data. It then:
|
|
28
32
|
1. Removes the "docx-extract" task from the message.
|
|
29
|
-
2. Calls the DOCX extraction logic (via
|
|
33
|
+
2. Calls the DOCX extraction logic (via extract_docx_internal) using a validated configuration.
|
|
30
34
|
3. Updates the message payload with the extracted content DataFrame.
|
|
31
35
|
"""
|
|
32
36
|
|
|
33
|
-
def __init__(self, config: DocxExtractorSchema) -> None:
|
|
34
|
-
super().__init__(config, log_to_stdout=False)
|
|
37
|
+
def __init__(self, config: DocxExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
38
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
35
39
|
try:
|
|
36
40
|
self.validated_config = config
|
|
37
41
|
logger.info("DocxExtractorStage configuration validated successfully.")
|
|
@@ -39,9 +43,10 @@ class DocxExtractorStage(RayActorStage):
|
|
|
39
43
|
logger.exception(f"Error validating DOCX Extractor config: {e}")
|
|
40
44
|
raise
|
|
41
45
|
|
|
42
|
-
@
|
|
46
|
+
@nv_ingest_node_failure_try_except()
|
|
47
|
+
@traceable()
|
|
48
|
+
@udf_intercept_hook()
|
|
43
49
|
@filter_by_task(required_tasks=[("extract", {"document_type": "docx"})])
|
|
44
|
-
@nv_ingest_node_failure_try_except(annotation_id="docx_extractor", raise_on_failure=True)
|
|
45
50
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
46
51
|
"""
|
|
47
52
|
Process the control message by extracting content from DOCX documents.
|
|
@@ -64,7 +69,7 @@ class DocxExtractorStage(RayActorStage):
|
|
|
64
69
|
|
|
65
70
|
# Remove the "docx-extract" task from the message to obtain task-specific configuration.
|
|
66
71
|
task_config = remove_task_by_type(control_message, "extract")
|
|
67
|
-
self._logger.debug("Extracted task config: %s", task_config)
|
|
72
|
+
self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
68
73
|
|
|
69
74
|
# Perform DOCX content extraction.
|
|
70
75
|
new_df, extraction_info = extract_primitives_from_docx_internal(
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
|
+
from typing import Optional
|
|
7
8
|
|
|
8
9
|
import ray
|
|
9
10
|
|
|
@@ -16,6 +17,9 @@ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtra
|
|
|
16
17
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
18
|
nv_ingest_node_failure_try_except,
|
|
18
19
|
)
|
|
20
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
21
|
+
|
|
22
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
19
23
|
|
|
20
24
|
logger = logging.getLogger(__name__)
|
|
21
25
|
|
|
@@ -31,8 +35,8 @@ class HtmlExtractorStage(RayActorStage):
|
|
|
31
35
|
3. Updates the message payload with the extracted text DataFrame.
|
|
32
36
|
"""
|
|
33
37
|
|
|
34
|
-
def __init__(self, config: HtmlExtractorSchema) -> None:
|
|
35
|
-
super().__init__(config, log_to_stdout=False)
|
|
38
|
+
def __init__(self, config: HtmlExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
39
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
36
40
|
try:
|
|
37
41
|
self.validated_config = config
|
|
38
42
|
self._logger.info("HtmlExtractorStage configuration validated successfully.")
|
|
@@ -40,9 +44,10 @@ class HtmlExtractorStage(RayActorStage):
|
|
|
40
44
|
self._logger.exception(f"Error validating Html Extractor config: {e}")
|
|
41
45
|
raise
|
|
42
46
|
|
|
43
|
-
@
|
|
47
|
+
@nv_ingest_node_failure_try_except()
|
|
48
|
+
@traceable()
|
|
49
|
+
@udf_intercept_hook()
|
|
44
50
|
@filter_by_task(required_tasks=[("extract", {"document_type": "html"})])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="html_extractor", raise_on_failure=False)
|
|
46
51
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
52
|
"""
|
|
48
53
|
Process the control message by extracting content from html.
|
|
@@ -65,7 +70,7 @@ class HtmlExtractorStage(RayActorStage):
|
|
|
65
70
|
|
|
66
71
|
# Remove the "html_content_extract" task from the message to obtain task-specific configuration.
|
|
67
72
|
task_config = remove_task_by_type(control_message, "extract")
|
|
68
|
-
self._logger.debug("Extracted task config: %s", task_config)
|
|
73
|
+
self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
69
74
|
|
|
70
75
|
# Perform html content extraction.
|
|
71
76
|
new_df, extraction_info = extract_markdown_from_html_internal(
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
from typing import Optional
|
|
6
7
|
|
|
7
8
|
import ray
|
|
8
9
|
|
|
@@ -15,6 +16,9 @@ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExt
|
|
|
15
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
16
17
|
nv_ingest_node_failure_try_except,
|
|
17
18
|
)
|
|
19
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
20
|
+
|
|
21
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
22
|
|
|
19
23
|
logger = logging.getLogger(__name__)
|
|
20
24
|
|
|
@@ -30,18 +34,19 @@ class ImageExtractorStage(RayActorStage):
|
|
|
30
34
|
3. Updates the message payload with the extracted primitives DataFrame.
|
|
31
35
|
"""
|
|
32
36
|
|
|
33
|
-
def __init__(self, config: ImageExtractorSchema) -> None:
|
|
34
|
-
super().__init__(config)
|
|
37
|
+
def __init__(self, config: ImageExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
38
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
35
39
|
try:
|
|
36
40
|
self.validated_config = config
|
|
37
|
-
|
|
41
|
+
self._logger.info("ImageExtractorStage configuration validated successfully.")
|
|
38
42
|
except Exception as e:
|
|
39
|
-
|
|
43
|
+
self._logger.exception(f"Error validating Image Extractor config: {e}")
|
|
40
44
|
raise
|
|
41
45
|
|
|
42
|
-
@
|
|
46
|
+
@nv_ingest_node_failure_try_except()
|
|
47
|
+
@traceable()
|
|
48
|
+
@udf_intercept_hook()
|
|
43
49
|
@filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(png|jpeg|jpg|tiff|bmp)$"})])
|
|
44
|
-
@nv_ingest_node_failure_try_except(annotation_id="image_extractor", raise_on_failure=False)
|
|
45
50
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
46
51
|
"""
|
|
47
52
|
Process the control message by extracting primitives from images.
|
|
@@ -64,7 +69,7 @@ class ImageExtractorStage(RayActorStage):
|
|
|
64
69
|
|
|
65
70
|
# Remove the "extract" task from the message to obtain task-specific configuration.
|
|
66
71
|
task_config = remove_task_by_type(control_message, "extract")
|
|
67
|
-
logger.debug("Extracted task config: %s", task_config)
|
|
72
|
+
logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
68
73
|
|
|
69
74
|
# Perform image primitives extraction.
|
|
70
75
|
new_df, extraction_info = extract_primitives_from_image_internal(
|
|
@@ -5,32 +5,44 @@
|
|
|
5
5
|
import logging
|
|
6
6
|
import ray
|
|
7
7
|
|
|
8
|
-
from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
|
|
9
8
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
10
9
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
11
10
|
from nv_ingest_api.internal.extract.image.infographic_extractor import extract_infographic_data_from_image_internal
|
|
12
11
|
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
13
|
-
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
12
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
|
|
13
|
+
from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
|
|
14
14
|
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
15
18
|
|
|
16
19
|
logger = logging.getLogger(__name__)
|
|
17
20
|
|
|
18
21
|
|
|
19
22
|
@ray.remote
|
|
20
23
|
class InfographicExtractorStage(RayActorStage):
|
|
21
|
-
|
|
22
|
-
|
|
24
|
+
"""
|
|
25
|
+
A Ray actor stage that extracts infographic data from image content.
|
|
26
|
+
|
|
27
|
+
It expects an IngestControlMessage containing a DataFrame with image data. It then:
|
|
28
|
+
1. Removes the "infographic_data_extract" task from the message.
|
|
29
|
+
2. Calls the infographic extraction logic using a validated configuration.
|
|
30
|
+
3. Updates the message payload with the extracted infographic DataFrame.
|
|
31
|
+
"""
|
|
23
32
|
|
|
33
|
+
def __init__(self, config: InfographicExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
34
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
24
35
|
try:
|
|
25
36
|
self.validated_config = config
|
|
26
|
-
|
|
37
|
+
self._logger.info("InfographicExtractorStage configuration validated successfully.")
|
|
27
38
|
except Exception as e:
|
|
28
|
-
|
|
39
|
+
self._logger.exception(f"Error validating Infographic extractor config: {e}")
|
|
29
40
|
raise
|
|
30
41
|
|
|
31
|
-
@
|
|
42
|
+
@nv_ingest_node_failure_try_except()
|
|
43
|
+
@traceable()
|
|
44
|
+
@udf_intercept_hook()
|
|
32
45
|
@filter_by_task(required_tasks=["infographic_data_extract"])
|
|
33
|
-
@nv_ingest_node_failure_try_except(annotation_id="infographic_extraction", raise_on_failure=False)
|
|
34
46
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
35
47
|
# Extract DataFrame payload
|
|
36
48
|
df_ledger = control_message.payload()
|
|
@@ -51,7 +63,7 @@ class InfographicExtractorStage(RayActorStage):
|
|
|
51
63
|
|
|
52
64
|
do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
|
|
53
65
|
if do_trace_tagging and execution_trace_log:
|
|
54
|
-
|
|
55
|
-
|
|
66
|
+
parent_name = self.stage_name if self.stage_name else "infographic_extractor"
|
|
67
|
+
set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
|
|
56
68
|
|
|
57
69
|
return control_message
|