nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (56)
  1. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  2. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  3. nv_ingest/framework/orchestration/execution/options.py +112 -0
  4. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  5. nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
  6. nv_ingest/framework/orchestration/process/execution.py +497 -0
  7. nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
  8. nv_ingest/framework/orchestration/process/strategies.py +182 -0
  9. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
  10. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  11. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
  12. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  13. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
  14. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
  15. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
  16. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
  17. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
  18. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  19. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
  20. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  21. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
  22. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
  23. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  24. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  25. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
  26. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
  27. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
  28. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  29. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  30. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
  31. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
  32. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
  33. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  34. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  35. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
  36. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
  37. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
  38. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  39. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  40. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  41. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  42. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  43. nv_ingest/pipeline/__init__.py +3 -0
  44. nv_ingest/pipeline/config/__init__.py +3 -0
  45. nv_ingest/pipeline/config/loaders.py +198 -0
  46. nv_ingest/pipeline/config/replica_resolver.py +227 -0
  47. nv_ingest/pipeline/default_pipeline_impl.py +517 -0
  48. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  49. nv_ingest/pipeline/pipeline_schema.py +398 -0
  50. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/METADATA +1 -1
  51. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/RECORD +54 -40
  52. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  53. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  54. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/WHEEL +0 -0
  55. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/licenses/LICENSE +0 -0
  56. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,182 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Process execution strategies for pipeline deployment.
7
+
8
+ This module defines abstract and concrete strategies for executing pipelines
9
+ in different process contexts (in-process vs subprocess), implementing the
10
+ Strategy pattern for clean separation of execution concerns.
11
+ """
12
+
13
+ import atexit
14
+ import logging
15
+ import multiprocessing
16
+ import time
17
+ from abc import ABC, abstractmethod
18
+
19
+ from nv_ingest.pipeline.pipeline_schema import PipelineConfigSchema
20
+ from nv_ingest.framework.orchestration.execution.options import ExecutionOptions, ExecutionResult
21
+ from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
22
+ RayPipelineInterface,
23
+ RayPipelineSubprocessInterface,
24
+ )
25
+ from nv_ingest.framework.orchestration.process.execution import (
26
+ launch_pipeline,
27
+ run_pipeline_process,
28
+ kill_pipeline_process_group,
29
+ )
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ class ProcessExecutionStrategy(ABC):
35
+ """
36
+ Abstract base class for pipeline execution strategies.
37
+
38
+ This class defines the interface for different ways of executing
39
+ a pipeline (in-process, subprocess, etc.) using the Strategy pattern.
40
+ """
41
+
42
+ @abstractmethod
43
+ def execute(self, config: PipelineConfigSchema, options: ExecutionOptions) -> ExecutionResult:
44
+ """
45
+ Execute a pipeline using this strategy.
46
+
47
+ Parameters
48
+ ----------
49
+ config : PipelineConfigSchema
50
+ Validated pipeline configuration to execute.
51
+ options : ExecutionOptions
52
+ Execution options controlling blocking behavior and output redirection.
53
+
54
+ Returns
55
+ -------
56
+ ExecutionResult
57
+ Result containing pipeline interface and/or timing information.
58
+ """
59
+ pass
60
+
61
+
62
+ class InProcessStrategy(ProcessExecutionStrategy):
63
+ """
64
+ Strategy for executing pipelines in the current process.
65
+
66
+ This strategy runs the pipeline directly in the current Python process,
67
+ providing the most direct execution path with minimal overhead.
68
+ """
69
+
70
+ def execute(self, config: PipelineConfigSchema, options: ExecutionOptions) -> ExecutionResult:
71
+ """
72
+ Execute pipeline in the current process.
73
+
74
+ Parameters
75
+ ----------
76
+ config : PipelineConfigSchema
77
+ Pipeline configuration to execute.
78
+ options : ExecutionOptions
79
+ Execution options. stdout/stderr are ignored for in-process execution.
80
+
81
+ Returns
82
+ -------
83
+ ExecutionResult
84
+ Result with pipeline interface (non-blocking) or elapsed time (blocking).
85
+ """
86
+ logger.info("Executing pipeline in current process")
87
+
88
+ # Execute the pipeline using existing launch_pipeline function
89
+ # launch_pipeline returns raw RayPipeline object (not wrapped in interface)
90
+ pipeline, total_elapsed = launch_pipeline(
91
+ config,
92
+ block=options.block,
93
+ disable_dynamic_scaling=None, # Already applied in config
94
+ )
95
+
96
+ if options.block:
97
+ logger.debug(f"Pipeline execution completed successfully in {total_elapsed:.2f} seconds.")
98
+ return ExecutionResult(interface=None, elapsed_time=total_elapsed)
99
+ else:
100
+ # Wrap the raw RayPipeline in RayPipelineInterface
101
+ interface = RayPipelineInterface(pipeline)
102
+ return ExecutionResult(interface=interface, elapsed_time=None)
103
+
104
+
105
+ class SubprocessStrategy(ProcessExecutionStrategy):
106
+ """
107
+ Strategy for executing pipelines in a separate subprocess.
108
+
109
+ This strategy launches the pipeline in a separate Python process using
110
+ multiprocessing, providing process isolation and output redirection.
111
+ """
112
+
113
+ def execute(self, config: PipelineConfigSchema, options: ExecutionOptions) -> ExecutionResult:
114
+ """
115
+ Execute pipeline in a separate subprocess.
116
+
117
+ Parameters
118
+ ----------
119
+ config : PipelineConfigSchema
120
+ Pipeline configuration to execute.
121
+ options : ExecutionOptions
122
+ Execution options including output redirection streams.
123
+
124
+ Returns
125
+ -------
126
+ ExecutionResult
127
+ Result with subprocess interface (non-blocking) or elapsed time (blocking).
128
+ """
129
+ logger.info("Launching pipeline in Python subprocess using multiprocessing.")
130
+
131
+ # Create subprocess using fork context
132
+ ctx = multiprocessing.get_context("fork")
133
+ process = ctx.Process(
134
+ target=run_pipeline_process,
135
+ args=(
136
+ config,
137
+ options.stdout, # raw_stdout
138
+ options.stderr, # raw_stderr
139
+ ),
140
+ daemon=False,
141
+ )
142
+
143
+ process.start()
144
+ interface = RayPipelineSubprocessInterface(process)
145
+
146
+ if options.block:
147
+ # Block until subprocess completes
148
+ start_time = time.time()
149
+ logger.info("Waiting for subprocess pipeline to complete...")
150
+ process.join()
151
+ logger.info("Pipeline subprocess completed.")
152
+ elapsed_time = time.time() - start_time
153
+ return ExecutionResult(interface=None, elapsed_time=elapsed_time)
154
+ else:
155
+ # Return interface for non-blocking execution
156
+ logger.info(f"Pipeline subprocess started (PID={process.pid})")
157
+ # Register the raw integer PID (not the Process object) with the atexit cleanup hook.
158
+ # NOTE(review): confirm kill_pipeline_process_group accepts a PID rather than a multiprocessing.Process instance.
159
+ # Capture raw PID to avoid using multiprocessing APIs during interpreter shutdown
160
+ pid = int(process.pid)
161
+ atexit.register(kill_pipeline_process_group, pid)
162
+ return ExecutionResult(interface=interface, elapsed_time=None)
163
+
164
+
165
+ def create_execution_strategy(run_in_subprocess: bool) -> ProcessExecutionStrategy:
166
+ """
167
+ Factory function to create the appropriate execution strategy.
168
+
169
+ Parameters
170
+ ----------
171
+ run_in_subprocess : bool
172
+ If True, creates SubprocessStrategy. If False, creates InProcessStrategy.
173
+
174
+ Returns
175
+ -------
176
+ ProcessExecutionStrategy
177
+ Configured execution strategy instance.
178
+ """
179
+ if run_in_subprocess:
180
+ return SubprocessStrategy()
181
+ else:
182
+ return InProcessStrategy()
@@ -29,8 +29,8 @@ from nv_ingest.framework.orchestration.ray.stages.sinks.message_broker_task_sink
29
29
  from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
30
30
  MessageBrokerTaskSourceStage,
31
31
  MessageBrokerTaskSourceConfig,
32
- start_simple_message_broker,
33
32
  )
33
+ from nv_ingest.framework.orchestration.process.dependent_services import start_simple_message_broker
34
34
  from nv_ingest.framework.orchestration.ray.stages.storage.image_storage import ImageStorageStage
35
35
  from nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings import EmbeddingStorageStage
36
36
  from nv_ingest.framework.orchestration.ray.stages.transforms.image_caption import ImageCaptionTransformStage
@@ -183,7 +183,7 @@ class PipelineTopology:
183
183
  """Marks an actor as pending removal, to be cleaned up by the background thread."""
184
184
  with self._lock:
185
185
  self._actors_pending_removal.add((stage_name, actor))
186
- logger.info(f"Marked actor {actor} from stage {stage_name} for removal.")
186
+ logger.debug(f"Marked actor {actor} from stage {stage_name} for removal.")
187
187
 
188
188
  def start_cleanup_thread(self, interval: int = 5) -> None:
189
189
  """Starts the background thread for periodic cleanup tasks."""
@@ -191,14 +191,14 @@ class PipelineTopology:
191
191
  self._stop_cleanup.clear()
192
192
  self._cleanup_thread = threading.Thread(target=self._cleanup_loop, args=(interval,), daemon=True)
193
193
  self._cleanup_thread.start()
194
- logger.info("Topology cleanup thread started.")
194
+ logger.debug("Topology cleanup thread started.")
195
195
 
196
196
  def stop_cleanup_thread(self) -> None:
197
197
  """Stops the background cleanup thread."""
198
198
  if self._cleanup_thread and self._cleanup_thread.is_alive():
199
199
  self._stop_cleanup.set()
200
200
  self._cleanup_thread.join(timeout=5)
201
- logger.info("Topology cleanup thread stopped.")
201
+ logger.debug("Topology cleanup thread stopped.")
202
202
 
203
203
  def _cleanup_loop(self, interval: int) -> None:
204
204
  """Periodically checks for and removes actors that have completed shutdown."""
@@ -235,7 +235,7 @@ class PipelineTopology:
235
235
  self._actors_pending_removal.remove((stage_name, actor))
236
236
  if actor in self._stage_actors.get(stage_name, []):
237
237
  self._stage_actors[stage_name].remove(actor)
238
- logger.info(f"Successfully removed actor {actor} from stage {stage_name} in topology.")
238
+ logger.debug(f"Successfully removed actor {actor} from stage {stage_name} in topology.")
239
239
 
240
240
  time.sleep(interval)
241
241
 
@@ -252,7 +252,7 @@ class RayPipeline(PipelineInterface):
252
252
  penalty_factor=self.scaling_config.pid_penalty_factor,
253
253
  error_boost_factor=self.scaling_config.pid_error_boost_factor,
254
254
  )
255
- logger.info("PIDController initialized using ScalingConfig.")
255
+ logger.debug("PIDController initialized using ScalingConfig.")
256
256
 
257
257
  try:
258
258
  total_system_memory_bytes = psutil.virtual_memory().total
@@ -270,7 +270,7 @@ class RayPipeline(PipelineInterface):
270
270
  memory_threshold=absolute_memory_threshold_mb,
271
271
  memory_safety_buffer_fraction=self.scaling_config.rcm_memory_safety_buffer_fraction,
272
272
  )
273
- logger.info("ResourceConstraintManager initialized using ScalingConfig.")
273
+ logger.debug("ResourceConstraintManager initialized using ScalingConfig.")
274
274
 
275
275
  # --- Instantiate Stats Collector ---
276
276
  self._stats_collection_interval_seconds = self.stats_config.collection_interval_seconds
@@ -282,7 +282,7 @@ class RayPipeline(PipelineInterface):
282
282
  ema_alpha=self.scaling_config.pid_ema_alpha,
283
283
  )
284
284
 
285
- logger.info("RayStatsCollector initialized using StatsConfig.")
285
+ logger.debug("RayStatsCollector initialized using StatsConfig.")
286
286
 
287
287
  # --- Accessor Methods for Stat Collector (and internal use) ---
288
288
 
@@ -349,11 +349,11 @@ class RayPipeline(PipelineInterface):
349
349
  # Update constraint manager
350
350
  self.constraint_manager.max_replicas = total_max_replicas
351
351
 
352
- logger.info(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
352
+ logger.debug(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
353
353
 
354
354
  def _instantiate_initial_actors(self) -> None:
355
355
  """Instantiates initial actors and updates topology."""
356
- logger.info("[Build-Actors] Instantiating initial stage actors (min_replicas)...")
356
+ logger.debug("[Build-Actors] Instantiating initial stage actors (min_replicas)...")
357
357
  # Use topology accessor
358
358
  current_stages = self.topology.get_stages_info()
359
359
 
@@ -377,7 +377,7 @@ class RayPipeline(PipelineInterface):
377
377
  )
378
378
  try:
379
379
  actor = stage.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
380
- config=stage.config
380
+ config=stage.config, stage_name=stage.name
381
381
  )
382
382
  replicas.append(actor)
383
383
  except Exception as e:
@@ -388,7 +388,7 @@ class RayPipeline(PipelineInterface):
388
388
  self.topology.set_actors_for_stage(stage.name, replicas)
389
389
  logger.debug(f"[Build-Actors] Stage '{stage.name}' initial actors set in topology: count={len(replicas)}")
390
390
 
391
- logger.info("[Build-Actors] Initial actor instantiation complete.")
391
+ logger.debug("[Build-Actors] Initial actor instantiation complete.")
392
392
 
393
393
  def _create_and_wire_edges(self) -> List[ray.ObjectRef]:
394
394
  """
@@ -399,7 +399,7 @@ class RayPipeline(PipelineInterface):
399
399
  List[ray.ObjectRef]
400
400
  A list of object references for the remote wiring calls.
401
401
  """
402
- logger.info("[Build-Wiring] Creating and wiring edges...")
402
+ logger.debug("[Build-Wiring] Creating and wiring edges...")
403
403
  wiring_refs = []
404
404
  new_edge_queues: Dict[str, Tuple[Any, int]] = {}
405
405
 
@@ -628,7 +628,7 @@ class RayPipeline(PipelineInterface):
628
628
  Dict[str, List[Any]]
629
629
  A dictionary mapping stage names to lists of actor handles.
630
630
  """
631
- logger.info("--- Starting Pipeline Build Process ---")
631
+ logger.debug("--- Starting Pipeline Build Process ---")
632
632
  try:
633
633
  if not self.topology.get_stages_info():
634
634
  logger.error("Build failed: No stages defined in topology.")
@@ -640,7 +640,7 @@ class RayPipeline(PipelineInterface):
640
640
  wiring_futures = self._create_and_wire_edges()
641
641
  self._wait_for_wiring(wiring_futures)
642
642
 
643
- logger.info("--- Pipeline Build Completed Successfully ---")
643
+ logger.debug("--- Pipeline Build Completed Successfully ---")
644
644
  return self.topology.get_stage_actors() # Return actors from topology
645
645
 
646
646
  except RuntimeError as e:
@@ -673,7 +673,7 @@ class RayPipeline(PipelineInterface):
673
673
  logger.debug(f"[ScaleUtil] Creating new actor '{actor_name}' for stage '{stage_info.name}'")
674
674
  try:
675
675
  new_actor = stage_info.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
676
- config=stage_info.config
676
+ config=stage_info.config, stage_name=stage_info.name
677
677
  )
678
678
 
679
679
  return new_actor
@@ -861,7 +861,7 @@ class RayPipeline(PipelineInterface):
861
861
  # Select actors to remove (e.g., the most recently added)
862
862
  actors_to_remove = current_replicas[-num_to_remove:]
863
863
 
864
- logger.info(f"[ScaleDown-{stage_name}] Selected {len(actors_to_remove)} actors for removal.")
864
+ logger.debug(f"[ScaleDown-{stage_name}] Selected {len(actors_to_remove)} actors for removal.")
865
865
 
866
866
  # Signal each actor to stop and mark it for removal by the topology.
867
867
  # The topology's cleanup thread will handle polling and final removal.
@@ -966,7 +966,7 @@ class RayPipeline(PipelineInterface):
966
966
  True if the pipeline drained successfully, False otherwise.
967
967
  """
968
968
  start_time = time.time()
969
- logger.info(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
969
+ logger.debug(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
970
970
  last_in_flight = -1
971
971
  drain_check_interval = 1.0 # Check every second
972
972
 
@@ -1172,7 +1172,7 @@ class RayPipeline(PipelineInterface):
1172
1172
  force : bool, optional
1173
1173
  Whether to force the flush, by default False.
1174
1174
  """
1175
- logger.info(f"Manual queue flush requested (force={force}).")
1175
+ logger.debug(f"Manual queue flush requested (force={force}).")
1176
1176
 
1177
1177
  if self.topology.get_is_flushing() or self._stopping: # Check topology
1178
1178
  logger.warning("Flush already in progress or pipeline is stopping.")
@@ -1183,7 +1183,7 @@ class RayPipeline(PipelineInterface):
1183
1183
  # For now, run synchronously:
1184
1184
  self._execute_queue_flush()
1185
1185
  else:
1186
- logger.info("Manual flush denied: pipeline not quiet or interval not met.")
1186
+ logger.debug("Manual flush denied: pipeline not quiet or interval not met.")
1187
1187
 
1188
1188
  def _gather_controller_metrics(
1189
1189
  self, current_stage_stats: Dict[str, Dict[str, int]], global_in_flight: int
@@ -1409,7 +1409,7 @@ class RayPipeline(PipelineInterface):
1409
1409
  self._consecutive_quiet_cycles += 1
1410
1410
  logger.debug(f"Pipeline is quiet. Consecutive quiet cycles: {self._consecutive_quiet_cycles}")
1411
1411
  if self._consecutive_quiet_cycles >= self.consecutive_quiet_cycles_for_flush:
1412
- logger.info(
1412
+ logger.debug(
1413
1413
  f"Pipeline has been quiet for {self._consecutive_quiet_cycles} cycles. "
1414
1414
  "Initiating queue flush."
1415
1415
  )
@@ -1423,7 +1423,7 @@ class RayPipeline(PipelineInterface):
1423
1423
  )
1424
1424
  else:
1425
1425
  if self._consecutive_quiet_cycles > 0:
1426
- logger.info(
1426
+ logger.debug(
1427
1427
  f"Pipeline is no longer quiet. Resetting consecutive quiet cycle count "
1428
1428
  f"from {self._consecutive_quiet_cycles} to 0."
1429
1429
  )
@@ -1479,7 +1479,7 @@ class RayPipeline(PipelineInterface):
1479
1479
  interval : float
1480
1480
  The interval in seconds.
1481
1481
  """
1482
- logger.info(f"Scaling loop started. Interval: {interval}s")
1482
+ logger.debug(f"Scaling loop started. Interval: {interval}s")
1483
1483
  while self._scaling_monitoring:
1484
1484
  try:
1485
1485
  self._perform_scaling_and_maintenance()
@@ -1490,7 +1490,7 @@ class RayPipeline(PipelineInterface):
1490
1490
  if not self._scaling_monitoring:
1491
1491
  break
1492
1492
  time.sleep(sleep_time)
1493
- logger.info("Scaling loop finished.")
1493
+ logger.debug("Scaling loop finished.")
1494
1494
 
1495
1495
  def _start_scaling(self, poll_interval: float = 10.0) -> None:
1496
1496
  """
@@ -1505,7 +1505,7 @@ class RayPipeline(PipelineInterface):
1505
1505
  self._scaling_monitoring = True
1506
1506
  self._scaling_thread = threading.Thread(target=self._scaling_loop, args=(poll_interval,), daemon=True)
1507
1507
  self._scaling_thread.start()
1508
- logger.info(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
1508
+ logger.debug(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
1509
1509
 
1510
1510
  def _stop_scaling(self) -> None:
1511
1511
  """
@@ -1519,7 +1519,7 @@ class RayPipeline(PipelineInterface):
1519
1519
  if self._scaling_thread.is_alive():
1520
1520
  logger.warning("Scaling thread did not exit cleanly.")
1521
1521
  self._scaling_thread = None
1522
- logger.info("Scaling/Maintenance stopped.")
1522
+ logger.debug("Scaling/Maintenance stopped.")
1523
1523
 
1524
1524
  # --- Pipeline Start/Stop ---
1525
1525
  def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
@@ -1548,7 +1548,7 @@ class RayPipeline(PipelineInterface):
1548
1548
  logger.debug(f"Waiting for {len(start_futures)} actors to start...")
1549
1549
  try:
1550
1550
  ray.get(start_futures, timeout=60.0)
1551
- logger.info(f"{len(start_futures)} actors started.")
1551
+ logger.debug(f"{len(start_futures)} actors started.")
1552
1552
  except Exception as e:
1553
1553
  logger.error(f"Error/Timeout starting actors: {e}", exc_info=True)
1554
1554
  self.stop() # Attempt cleanup
@@ -1593,7 +1593,7 @@ class RayPipeline(PipelineInterface):
1593
1593
  logger.warning(
1594
1594
  f"Timeout waiting for {len(not_ready)} actors to stop. " f"Proceeding with shutdown."
1595
1595
  )
1596
- logger.info(f"{len(ready)} actors confirmed stop.")
1596
+ logger.debug(f"{len(ready)} actors confirmed stop.")
1597
1597
  except Exception as e:
1598
1598
  logger.error(f"An unexpected error occurred during actor shutdown: {e}", exc_info=True)
1599
1599
 
@@ -72,7 +72,7 @@ class RayStatsCollector:
72
72
  self._cumulative_stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {"processed": 0})
73
73
  self.ema_memory_per_replica: Dict[str, float] = {} # EMA of memory per replica
74
74
 
75
- logger.info(
75
+ logger.debug(
76
76
  f"RayStatsCollector initialized (Interval: {self._interval}s, "
77
77
  f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s, "
78
78
  f"EMA Alpha: {self.ema_alpha})"
@@ -111,7 +111,7 @@ class RayStatsCollector:
111
111
  self._running = False # Correct inconsistent state
112
112
 
113
113
  if not self._running:
114
- logger.info("Starting stats collector thread...")
114
+ logger.debug("Starting stats collector thread...")
115
115
  self._running = True
116
116
  with self._lock:
117
117
  self._last_update_successful = False # Mark as stale until first collection
@@ -129,7 +129,7 @@ class RayStatsCollector:
129
129
  def stop(self) -> None:
130
130
  """Signals the background stats collection thread to stop and waits for it."""
131
131
  if self._running:
132
- logger.info("Stopping stats collector thread...")
132
+ logger.debug("Stopping stats collector thread...")
133
133
  self._running = False # Signal loop to stop
134
134
 
135
135
  if self._thread is not None:
@@ -150,7 +150,7 @@ class RayStatsCollector:
150
150
  with self._lock:
151
151
  self._last_update_successful = False
152
152
  self._collected_stats = {} # Clear last collected stats
153
- logger.info("Stats collector thread stopped.")
153
+ logger.debug("Stats collector thread stopped.")
154
154
  else:
155
155
  logger.debug("Stats collector thread already stopped or never started.")
156
156
 
@@ -230,7 +230,7 @@ class RayStatsCollector:
230
230
  # but time.sleep is simpler for now.
231
231
  time.sleep(sleep_time)
232
232
 
233
- logger.info("Stats collector loop finished.")
233
+ logger.debug("Stats collector loop finished.")
234
234
 
235
235
  def collect_stats_now(self) -> Tuple[Dict[str, Dict[str, int]], int, bool]:
236
236
  """
@@ -4,6 +4,7 @@
4
4
 
5
5
 
6
6
  import logging
7
+ from typing import Optional
7
8
 
8
9
  import ray
9
10
 
@@ -17,6 +18,8 @@ from nv_ingest_api.util.exception_handlers.decorators import (
17
18
  nv_ingest_node_failure_try_except,
18
19
  )
19
20
 
21
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
22
+
20
23
  logger = logging.getLogger(__name__)
21
24
 
22
25
 
@@ -31,8 +34,8 @@ class AudioExtractorStage(RayActorStage):
31
34
  3. Updates the message payload with the extracted text DataFrame.
32
35
  """
33
36
 
34
- def __init__(self, config: AudioExtractorSchema) -> None:
35
- super().__init__(config, log_to_stdout=False)
37
+ def __init__(self, config: AudioExtractorSchema, stage_name: Optional[str] = None) -> None:
38
+ super().__init__(config, log_to_stdout=False, stage_name=stage_name)
36
39
  try:
37
40
  self.validated_config = config
38
41
  self._logger.info("AudioExtractorStage configuration validated successfully.")
@@ -40,9 +43,10 @@ class AudioExtractorStage(RayActorStage):
40
43
  self._logger.exception(f"Error validating Audio Extractor config: {e}")
41
44
  raise
42
45
 
43
- @traceable("audio_extractor")
46
+ @nv_ingest_node_failure_try_except()
47
+ @traceable()
48
+ @udf_intercept_hook()
44
49
  @filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(mp3|wav)$"})])
45
- @nv_ingest_node_failure_try_except(annotation_id="audio_extractor", raise_on_failure=False)
46
50
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
47
51
  """
48
52
  Process the control message by extracting text from audio.
@@ -3,19 +3,18 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
- from typing import Any
6
+ from typing import Any, Optional
7
7
 
8
8
  import ray
9
-
10
- from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
- from nv_ingest.framework.util.flow_control import filter_by_task
12
9
  from nv_ingest_api.internal.extract.image.chart_extractor import extract_chart_data_from_image_internal
13
10
  from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
11
+ from nv_ingest_api.internal.primitives.tracing.tagging import set_trace_timestamps_with_parent_context
12
+ from nv_ingest.framework.util.flow_control import filter_by_task
13
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
14
14
  from nv_ingest_api.internal.primitives.tracing.tagging import traceable
15
15
  from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
16
- from nv_ingest_api.util.exception_handlers.decorators import (
17
- nv_ingest_node_failure_try_except,
18
- )
16
+ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
17
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
19
18
 
20
19
  logger = logging.getLogger(__name__)
21
20
 
@@ -31,8 +30,8 @@ class ChartExtractorStage(RayActorStage):
31
30
  and annotates the message metadata with extraction info.
32
31
  """
33
32
 
34
- def __init__(self, config: ChartExtractorSchema) -> None:
35
- super().__init__(config)
33
+ def __init__(self, config: ChartExtractorSchema, stage_name: Optional[str] = None) -> None:
34
+ super().__init__(config, stage_name=stage_name)
36
35
  try:
37
36
  self.validated_config = config
38
37
  # logger.warning(
@@ -42,9 +41,10 @@ class ChartExtractorStage(RayActorStage):
42
41
  logger.exception("Error validating chart extractor config")
43
42
  raise e
44
43
 
45
- @traceable("chart_extraction")
44
+ @nv_ingest_node_failure_try_except()
45
+ @traceable()
46
+ @udf_intercept_hook()
46
47
  @filter_by_task(required_tasks=["chart_data_extract"])
47
- @nv_ingest_node_failure_try_except(annotation_id="chart_extraction", raise_on_failure=False)
48
48
  def on_data(self, control_message: Any) -> Any:
49
49
  """
50
50
  Process the control message by extracting chart data.
@@ -59,7 +59,7 @@ class ChartExtractorStage(RayActorStage):
59
59
  IngestControlMessage
60
60
  The updated message with the extracted chart data and extraction info in metadata.
61
61
  """
62
- logger.info("ChartExtractorStage.on_data: Starting chart extraction.")
62
+ logger.debug("ChartExtractorStage.on_data: Starting chart extraction.")
63
63
  # Extract the DataFrame payload.
64
64
  df_payload = control_message.payload()
65
65
  logger.debug("ChartExtractorStage: Extracted payload with %d rows.", len(df_payload))
@@ -76,17 +76,17 @@ class ChartExtractorStage(RayActorStage):
76
76
  extraction_config=self.validated_config,
77
77
  execution_trace_log=execution_trace_log,
78
78
  )
79
- logger.info("ChartExtractorStage: Chart extraction completed. New payload has %d rows.", len(new_df))
79
+ logger.debug("ChartExtractorStage: Chart extraction completed. New payload has %d rows.", len(new_df))
80
80
 
81
81
  # Update the control message with the new DataFrame.
82
82
  control_message.payload(new_df)
83
83
  # Annotate the message with extraction info.
84
84
  control_message.set_metadata("chart_extraction_info", extraction_info)
85
- logger.info("ChartExtractorStage: Metadata injection complete. Returning updated control message.")
85
+ logger.debug("ChartExtractorStage: Metadata injection complete. Returning updated control message.")
86
86
 
87
87
  do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
88
88
  if do_trace_tagging and execution_trace_log:
89
- for key, ts in execution_trace_log.items():
90
- control_message.set_timestamp(key, ts)
89
+ parent_name = self.stage_name if self.stage_name else "chart_extractor"
90
+ set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
91
91
 
92
92
  return control_message
@@ -3,6 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
+ from typing import Optional
6
7
 
7
8
  import ray
8
9
 
@@ -16,6 +17,8 @@ from nv_ingest_api.util.exception_handlers.decorators import (
16
17
  nv_ingest_node_failure_try_except,
17
18
  )
18
19
 
20
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
21
+
19
22
  logger = logging.getLogger(__name__)
20
23
 
21
24
 
@@ -26,12 +29,12 @@ class DocxExtractorStage(RayActorStage):
26
29
 
27
30
  It expects an IngestControlMessage containing a DataFrame with DOCX document data. It then:
28
31
  1. Removes the "docx-extract" task from the message.
29
- 2. Calls the DOCX extraction logic (via extract_primitives_from_docx_internal) using a validated configuration.
32
+ 2. Calls the DOCX extraction logic (via extract_docx_internal) using a validated configuration.
30
33
  3. Updates the message payload with the extracted content DataFrame.
31
34
  """
32
35
 
33
- def __init__(self, config: DocxExtractorSchema) -> None:
34
- super().__init__(config, log_to_stdout=False)
36
+ def __init__(self, config: DocxExtractorSchema, stage_name: Optional[str] = None) -> None:
37
+ super().__init__(config, log_to_stdout=False, stage_name=stage_name)
35
38
  try:
36
39
  self.validated_config = config
37
40
  logger.info("DocxExtractorStage configuration validated successfully.")
@@ -39,9 +42,10 @@ class DocxExtractorStage(RayActorStage):
39
42
  logger.exception(f"Error validating DOCX Extractor config: {e}")
40
43
  raise
41
44
 
42
- @traceable("docx_extractor")
45
+ @nv_ingest_node_failure_try_except()
46
+ @traceable()
47
+ @udf_intercept_hook()
43
48
  @filter_by_task(required_tasks=[("extract", {"document_type": "docx"})])
44
- @nv_ingest_node_failure_try_except(annotation_id="docx_extractor", raise_on_failure=True)
45
49
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
46
50
  """
47
51
  Process the control message by extracting content from DOCX documents.