PyPI - nv-ingest - Versions diffs - 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl - Mend

nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest might be problematic. Click here for more details.

Files changed (56) hide show

nv_ingest/framework/orchestration/process/strategies.py ADDED Viewed

@@ -0,0 +1,182 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Process execution strategies for pipeline deployment.
+This module defines abstract and concrete strategies for executing pipelines
+in different process contexts (in-process vs subprocess), implementing the
+Strategy pattern for clean separation of execution concerns.
+"""
+import atexit
+import logging
+import multiprocessing
+import time
+from abc import ABC, abstractmethod
+from nv_ingest.pipeline.pipeline_schema import PipelineConfigSchema
+from nv_ingest.framework.orchestration.execution.options import ExecutionOptions, ExecutionResult
+from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
+    RayPipelineInterface,
+    RayPipelineSubprocessInterface,
+)
+from nv_ingest.framework.orchestration.process.execution import (
+    launch_pipeline,
+    run_pipeline_process,
+    kill_pipeline_process_group,
+)
+logger = logging.getLogger(__name__)
+class ProcessExecutionStrategy(ABC):
+    """
+    Abstract base class for pipeline execution strategies.
+    This class defines the interface for different ways of executing
+    a pipeline (in-process, subprocess, etc.) using the Strategy pattern.
+    """
+    @abstractmethod
+    def execute(self, config: PipelineConfigSchema, options: ExecutionOptions) -> ExecutionResult:
+        """
+        Execute a pipeline using this strategy.
+        Parameters
+        ----------
+        config : PipelineConfigSchema
+            Validated pipeline configuration to execute.
+        options : ExecutionOptions
+            Execution options controlling blocking behavior and output redirection.
+        Returns
+        -------
+        ExecutionResult
+            Result containing pipeline interface and/or timing information.
+        """
+        pass
+class InProcessStrategy(ProcessExecutionStrategy):
+    """
+    Strategy for executing pipelines in the current process.
+    This strategy runs the pipeline directly in the current Python process,
+    providing the most direct execution path with minimal overhead.
+    """
+    def execute(self, config: PipelineConfigSchema, options: ExecutionOptions) -> ExecutionResult:
+        """
+        Execute pipeline in the current process.
+        Parameters
+        ----------
+        config : PipelineConfigSchema
+            Pipeline configuration to execute.
+        options : ExecutionOptions
+            Execution options. stdout/stderr are ignored for in-process execution.
+        Returns
+        -------
+        ExecutionResult
+            Result with pipeline interface (non-blocking) or elapsed time (blocking).
+        """
+        logger.info("Executing pipeline in current process")
+        # Execute the pipeline using existing launch_pipeline function
+        # launch_pipeline returns raw RayPipeline object (not wrapped in interface)
+        pipeline, total_elapsed = launch_pipeline(
+            config,
+            block=options.block,
+            disable_dynamic_scaling=None,  # Already applied in config
+        )
+        if options.block:
+            logger.debug(f"Pipeline execution completed successfully in {total_elapsed:.2f} seconds.")
+            return ExecutionResult(interface=None, elapsed_time=total_elapsed)
+        else:
+            # Wrap the raw RayPipeline in RayPipelineInterface
+            interface = RayPipelineInterface(pipeline)
+            return ExecutionResult(interface=interface, elapsed_time=None)
+class SubprocessStrategy(ProcessExecutionStrategy):
+    """
+    Strategy for executing pipelines in a separate subprocess.
+    This strategy launches the pipeline in a separate Python process using
+    multiprocessing, providing process isolation and output redirection.
+    """
+    def execute(self, config: PipelineConfigSchema, options: ExecutionOptions) -> ExecutionResult:
+        """
+        Execute pipeline in a separate subprocess.
+        Parameters
+        ----------
+        config : PipelineConfigSchema
+            Pipeline configuration to execute.
+        options : ExecutionOptions
+            Execution options including output redirection streams.
+        Returns
+        -------
+        ExecutionResult
+            Result with subprocess interface (non-blocking) or elapsed time (blocking).
+        """
+        logger.info("Launching pipeline in Python subprocess using multiprocessing.")
+        # Create subprocess using fork context
+        ctx = multiprocessing.get_context("fork")
+        process = ctx.Process(
+            target=run_pipeline_process,
+            args=(
+                config,
+                options.stdout,  # raw_stdout
+                options.stderr,  # raw_stderr
+            ),
+            daemon=False,
+        )
+        process.start()
+        interface = RayPipelineSubprocessInterface(process)
+        if options.block:
+            # Block until subprocess completes
+            start_time = time.time()
+            logger.info("Waiting for subprocess pipeline to complete...")
+            process.join()
+            logger.info("Pipeline subprocess completed.")
+            elapsed_time = time.time() - start_time
+            return ExecutionResult(interface=None, elapsed_time=elapsed_time)
+        else:
+            # Return interface for non-blocking execution
+            logger.info(f"Pipeline subprocess started (PID={process.pid})")
+            # Ensure we pass the Process object, not just the PID, to avoid AttributeError
+            # kill_pipeline_process_group expects a multiprocessing.Process instance
+            # Capture raw PID to avoid using multiprocessing APIs during interpreter shutdown
+            pid = int(process.pid)
+            atexit.register(kill_pipeline_process_group, pid)
+            return ExecutionResult(interface=interface, elapsed_time=None)
+def create_execution_strategy(run_in_subprocess: bool) -> ProcessExecutionStrategy:
+    """
+    Factory function to create the appropriate execution strategy.
+    Parameters
+    ----------
+    run_in_subprocess : bool
+        If True, creates SubprocessStrategy. If False, creates InProcessStrategy.
+    Returns
+    -------
+    ProcessExecutionStrategy
+        Configured execution strategy instance.
+    """
+    if run_in_subprocess:
+        return SubprocessStrategy()
+    else:
+        return InProcessStrategy()

nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py CHANGED Viewed

@@ -29,8 +29,8 @@ from nv_ingest.framework.orchestration.ray.stages.sinks.message_broker_task_sink
 from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
     MessageBrokerTaskSourceStage,
     MessageBrokerTaskSourceConfig,
-    start_simple_message_broker,
 )
+from nv_ingest.framework.orchestration.process.dependent_services import start_simple_message_broker
 from nv_ingest.framework.orchestration.ray.stages.storage.image_storage import ImageStorageStage
 from nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings import EmbeddingStorageStage
 from nv_ingest.framework.orchestration.ray.stages.transforms.image_caption import ImageCaptionTransformStage

nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py CHANGED Viewed

@@ -183,7 +183,7 @@ class PipelineTopology:
         """Marks an actor as pending removal, to be cleaned up by the background thread."""
         with self._lock:
             self._actors_pending_removal.add((stage_name, actor))
-            logger.info(f"Marked actor {actor} from stage {stage_name} for removal.")
+            logger.debug(f"Marked actor {actor} from stage {stage_name} for removal.")
     def start_cleanup_thread(self, interval: int = 5) -> None:
         """Starts the background thread for periodic cleanup tasks."""
@@ -191,14 +191,14 @@ class PipelineTopology:
             self._stop_cleanup.clear()
             self._cleanup_thread = threading.Thread(target=self._cleanup_loop, args=(interval,), daemon=True)
             self._cleanup_thread.start()
-            logger.info("Topology cleanup thread started.")
+            logger.debug("Topology cleanup thread started.")
     def stop_cleanup_thread(self) -> None:
         """Stops the background cleanup thread."""
         if self._cleanup_thread and self._cleanup_thread.is_alive():
             self._stop_cleanup.set()
             self._cleanup_thread.join(timeout=5)
-            logger.info("Topology cleanup thread stopped.")
+            logger.debug("Topology cleanup thread stopped.")
     def _cleanup_loop(self, interval: int) -> None:
         """Periodically checks for and removes actors that have completed shutdown."""
@@ -235,7 +235,7 @@ class PipelineTopology:
                             self._actors_pending_removal.remove((stage_name, actor))
                         if actor in self._stage_actors.get(stage_name, []):
                             self._stage_actors[stage_name].remove(actor)
-                            logger.info(f"Successfully removed actor {actor} from stage {stage_name} in topology.")
+                            logger.debug(f"Successfully removed actor {actor} from stage {stage_name} in topology.")
             time.sleep(interval)

nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py CHANGED Viewed

@@ -252,7 +252,7 @@ class RayPipeline(PipelineInterface):
             penalty_factor=self.scaling_config.pid_penalty_factor,
             error_boost_factor=self.scaling_config.pid_error_boost_factor,
         )
-        logger.info("PIDController initialized using ScalingConfig.")
+        logger.debug("PIDController initialized using ScalingConfig.")
         try:
             total_system_memory_bytes = psutil.virtual_memory().total
@@ -270,7 +270,7 @@ class RayPipeline(PipelineInterface):
             memory_threshold=absolute_memory_threshold_mb,
             memory_safety_buffer_fraction=self.scaling_config.rcm_memory_safety_buffer_fraction,
         )
-        logger.info("ResourceConstraintManager initialized using ScalingConfig.")
+        logger.debug("ResourceConstraintManager initialized using ScalingConfig.")
         # --- Instantiate Stats Collector ---
         self._stats_collection_interval_seconds = self.stats_config.collection_interval_seconds
@@ -282,7 +282,7 @@ class RayPipeline(PipelineInterface):
             ema_alpha=self.scaling_config.pid_ema_alpha,
         )
-        logger.info("RayStatsCollector initialized using StatsConfig.")
+        logger.debug("RayStatsCollector initialized using StatsConfig.")
     # --- Accessor Methods for Stat Collector (and internal use) ---
@@ -349,11 +349,11 @@ class RayPipeline(PipelineInterface):
         # Update constraint manager
         self.constraint_manager.max_replicas = total_max_replicas
-        logger.info(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
+        logger.debug(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
     def _instantiate_initial_actors(self) -> None:
         """Instantiates initial actors and updates topology."""
-        logger.info("[Build-Actors] Instantiating initial stage actors (min_replicas)...")
+        logger.debug("[Build-Actors] Instantiating initial stage actors (min_replicas)...")
         # Use topology accessor
         current_stages = self.topology.get_stages_info()
@@ -377,7 +377,7 @@ class RayPipeline(PipelineInterface):
                     )
                     try:
                         actor = stage.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
-                            config=stage.config
+                            config=stage.config, stage_name=stage.name
                         )
                         replicas.append(actor)
                     except Exception as e:
@@ -388,7 +388,7 @@ class RayPipeline(PipelineInterface):
             self.topology.set_actors_for_stage(stage.name, replicas)
             logger.debug(f"[Build-Actors] Stage '{stage.name}' initial actors set in topology: count={len(replicas)}")
-        logger.info("[Build-Actors] Initial actor instantiation complete.")
+        logger.debug("[Build-Actors] Initial actor instantiation complete.")
     def _create_and_wire_edges(self) -> List[ray.ObjectRef]:
         """
@@ -399,7 +399,7 @@ class RayPipeline(PipelineInterface):
         List[ray.ObjectRef]
             A list of object references for the remote wiring calls.
         """
-        logger.info("[Build-Wiring] Creating and wiring edges...")
+        logger.debug("[Build-Wiring] Creating and wiring edges...")
         wiring_refs = []
         new_edge_queues: Dict[str, Tuple[Any, int]] = {}
@@ -628,7 +628,7 @@ class RayPipeline(PipelineInterface):
         Dict[str, List[Any]]
             A dictionary mapping stage names to lists of actor handles.
         """
-        logger.info("--- Starting Pipeline Build Process ---")
+        logger.debug("--- Starting Pipeline Build Process ---")
         try:
             if not self.topology.get_stages_info():
                 logger.error("Build failed: No stages defined in topology.")
@@ -640,7 +640,7 @@ class RayPipeline(PipelineInterface):
             wiring_futures = self._create_and_wire_edges()
             self._wait_for_wiring(wiring_futures)
-            logger.info("--- Pipeline Build Completed Successfully ---")
+            logger.debug("--- Pipeline Build Completed Successfully ---")
             return self.topology.get_stage_actors()  # Return actors from topology
         except RuntimeError as e:
@@ -673,7 +673,7 @@ class RayPipeline(PipelineInterface):
         logger.debug(f"[ScaleUtil] Creating new actor '{actor_name}' for stage '{stage_info.name}'")
         try:
             new_actor = stage_info.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
-                config=stage_info.config
+                config=stage_info.config, stage_name=stage_info.name
             )
             return new_actor
@@ -861,7 +861,7 @@ class RayPipeline(PipelineInterface):
         # Select actors to remove (e.g., the most recently added)
         actors_to_remove = current_replicas[-num_to_remove:]
-        logger.info(f"[ScaleDown-{stage_name}] Selected {len(actors_to_remove)} actors for removal.")
+        logger.debug(f"[ScaleDown-{stage_name}] Selected {len(actors_to_remove)} actors for removal.")
         # Signal each actor to stop and mark it for removal by the topology.
         # The topology's cleanup thread will handle polling and final removal.
@@ -966,7 +966,7 @@ class RayPipeline(PipelineInterface):
             True if the pipeline drained successfully, False otherwise.
         """
         start_time = time.time()
-        logger.info(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
+        logger.debug(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
         last_in_flight = -1
         drain_check_interval = 1.0  # Check every second
@@ -1172,7 +1172,7 @@ class RayPipeline(PipelineInterface):
         force : bool, optional
             Whether to force the flush, by default False.
         """
-        logger.info(f"Manual queue flush requested (force={force}).")
+        logger.debug(f"Manual queue flush requested (force={force}).")
         if self.topology.get_is_flushing() or self._stopping:  # Check topology
             logger.warning("Flush already in progress or pipeline is stopping.")
@@ -1183,7 +1183,7 @@ class RayPipeline(PipelineInterface):
             # For now, run synchronously:
             self._execute_queue_flush()
         else:
-            logger.info("Manual flush denied: pipeline not quiet or interval not met.")
+            logger.debug("Manual flush denied: pipeline not quiet or interval not met.")
     def _gather_controller_metrics(
         self, current_stage_stats: Dict[str, Dict[str, int]], global_in_flight: int
@@ -1409,7 +1409,7 @@ class RayPipeline(PipelineInterface):
                 self._consecutive_quiet_cycles += 1
                 logger.debug(f"Pipeline is quiet. Consecutive quiet cycles: {self._consecutive_quiet_cycles}")
                 if self._consecutive_quiet_cycles >= self.consecutive_quiet_cycles_for_flush:
-                    logger.info(
+                    logger.debug(
                         f"Pipeline has been quiet for {self._consecutive_quiet_cycles} cycles. "
                         "Initiating queue flush."
                     )
@@ -1423,7 +1423,7 @@ class RayPipeline(PipelineInterface):
                     )
             else:
                 if self._consecutive_quiet_cycles > 0:
-                    logger.info(
+                    logger.debug(
                         f"Pipeline is no longer quiet. Resetting consecutive quiet cycle count "
                         f"from {self._consecutive_quiet_cycles} to 0."
                     )
@@ -1479,7 +1479,7 @@ class RayPipeline(PipelineInterface):
         interval : float
             The interval in seconds.
         """
-        logger.info(f"Scaling loop started. Interval: {interval}s")
+        logger.debug(f"Scaling loop started. Interval: {interval}s")
         while self._scaling_monitoring:
             try:
                 self._perform_scaling_and_maintenance()
@@ -1490,7 +1490,7 @@ class RayPipeline(PipelineInterface):
             if not self._scaling_monitoring:
                 break
             time.sleep(sleep_time)
-        logger.info("Scaling loop finished.")
+        logger.debug("Scaling loop finished.")
     def _start_scaling(self, poll_interval: float = 10.0) -> None:
         """
@@ -1505,7 +1505,7 @@ class RayPipeline(PipelineInterface):
             self._scaling_monitoring = True
             self._scaling_thread = threading.Thread(target=self._scaling_loop, args=(poll_interval,), daemon=True)
             self._scaling_thread.start()
-            logger.info(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
+            logger.debug(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
     def _stop_scaling(self) -> None:
         """
@@ -1519,7 +1519,7 @@ class RayPipeline(PipelineInterface):
                 if self._scaling_thread.is_alive():
                     logger.warning("Scaling thread did not exit cleanly.")
             self._scaling_thread = None
-            logger.info("Scaling/Maintenance stopped.")
+            logger.debug("Scaling/Maintenance stopped.")
     # --- Pipeline Start/Stop ---
     def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
@@ -1548,7 +1548,7 @@ class RayPipeline(PipelineInterface):
             logger.debug(f"Waiting for {len(start_futures)} actors to start...")
             try:
                 ray.get(start_futures, timeout=60.0)
-                logger.info(f"{len(start_futures)} actors started.")
+                logger.debug(f"{len(start_futures)} actors started.")
             except Exception as e:
                 logger.error(f"Error/Timeout starting actors: {e}", exc_info=True)
                 self.stop()  # Attempt cleanup
@@ -1593,7 +1593,7 @@ class RayPipeline(PipelineInterface):
                     logger.warning(
                         f"Timeout waiting for {len(not_ready)} actors to stop. " f"Proceeding with shutdown."
                     )
-                logger.info(f"{len(ready)} actors confirmed stop.")
+                logger.debug(f"{len(ready)} actors confirmed stop.")
             except Exception as e:
                 logger.error(f"An unexpected error occurred during actor shutdown: {e}", exc_info=True)

nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py CHANGED Viewed

@@ -72,7 +72,7 @@ class RayStatsCollector:
         self._cumulative_stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {"processed": 0})
         self.ema_memory_per_replica: Dict[str, float] = {}  # EMA of memory per replica
-        logger.info(
+        logger.debug(
             f"RayStatsCollector initialized (Interval: {self._interval}s, "
             f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s, "
             f"EMA Alpha: {self.ema_alpha})"
@@ -111,7 +111,7 @@ class RayStatsCollector:
             self._running = False  # Correct inconsistent state
         if not self._running:
-            logger.info("Starting stats collector thread...")
+            logger.debug("Starting stats collector thread...")
             self._running = True
             with self._lock:
                 self._last_update_successful = False  # Mark as stale until first collection
@@ -129,7 +129,7 @@ class RayStatsCollector:
     def stop(self) -> None:
         """Signals the background stats collection thread to stop and waits for it."""
         if self._running:
-            logger.info("Stopping stats collector thread...")
+            logger.debug("Stopping stats collector thread...")
             self._running = False  # Signal loop to stop
             if self._thread is not None:
@@ -150,7 +150,7 @@ class RayStatsCollector:
             with self._lock:
                 self._last_update_successful = False
                 self._collected_stats = {}  # Clear last collected stats
-            logger.info("Stats collector thread stopped.")
+            logger.debug("Stats collector thread stopped.")
         else:
             logger.debug("Stats collector thread already stopped or never started.")
@@ -230,7 +230,7 @@ class RayStatsCollector:
             # but time.sleep is simpler for now.
             time.sleep(sleep_time)
-        logger.info("Stats collector loop finished.")
+        logger.debug("Stats collector loop finished.")
     def collect_stats_now(self) -> Tuple[Dict[str, Dict[str, int]], int, bool]:
         """

nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py CHANGED Viewed

@@ -4,6 +4,7 @@
 import logging
+from typing import Optional
 import ray
@@ -17,6 +18,8 @@ from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 logger = logging.getLogger(__name__)
@@ -31,8 +34,8 @@ class AudioExtractorStage(RayActorStage):
       3. Updates the message payload with the extracted text DataFrame.
     """
-    def __init__(self, config: AudioExtractorSchema) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: AudioExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
         try:
             self.validated_config = config
             self._logger.info("AudioExtractorStage configuration validated successfully.")
@@ -40,9 +43,10 @@ class AudioExtractorStage(RayActorStage):
             self._logger.exception(f"Error validating Audio Extractor config: {e}")
             raise
-    @traceable("audio_extractor")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(mp3|wav)$"})])
-    @nv_ingest_node_failure_try_except(annotation_id="audio_extractor", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by extracting text from audio.

nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py CHANGED Viewed

@@ -3,19 +3,18 @@
 # SPDX-License-Identifier: Apache-2.0
 import logging
-from typing import Any
+from typing import Any, Optional
 import ray
-from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
-from nv_ingest.framework.util.flow_control import filter_by_task
 from nv_ingest_api.internal.extract.image.chart_extractor import extract_chart_data_from_image_internal
 from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
+from nv_ingest_api.internal.primitives.tracing.tagging import set_trace_timestamps_with_parent_context
+from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 from nv_ingest_api.internal.primitives.tracing.tagging import traceable
 from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
-from nv_ingest_api.util.exception_handlers.decorators import (
-    nv_ingest_node_failure_try_except,
-)
+from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
 logger = logging.getLogger(__name__)
@@ -31,8 +30,8 @@ class ChartExtractorStage(RayActorStage):
     and annotates the message metadata with extraction info.
     """
-    def __init__(self, config: ChartExtractorSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: ChartExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
             # logger.warning(
@@ -42,9 +41,10 @@ class ChartExtractorStage(RayActorStage):
             logger.exception("Error validating chart extractor config")
             raise e
-    @traceable("chart_extraction")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["chart_data_extract"])
-    @nv_ingest_node_failure_try_except(annotation_id="chart_extraction", raise_on_failure=False)
     def on_data(self, control_message: Any) -> Any:
         """
         Process the control message by extracting chart data.
@@ -59,7 +59,7 @@ class ChartExtractorStage(RayActorStage):
         IngestControlMessage
             The updated message with the extracted chart data and extraction info in metadata.
         """
-        logger.info("ChartExtractorStage.on_data: Starting chart extraction.")
+        logger.debug("ChartExtractorStage.on_data: Starting chart extraction.")
         # Extract the DataFrame payload.
         df_payload = control_message.payload()
         logger.debug("ChartExtractorStage: Extracted payload with %d rows.", len(df_payload))
@@ -76,17 +76,17 @@ class ChartExtractorStage(RayActorStage):
             extraction_config=self.validated_config,
             execution_trace_log=execution_trace_log,
         )
-        logger.info("ChartExtractorStage: Chart extraction completed. New payload has %d rows.", len(new_df))
+        logger.debug("ChartExtractorStage: Chart extraction completed. New payload has %d rows.", len(new_df))
         # Update the control message with the new DataFrame.
         control_message.payload(new_df)
         # Annotate the message with extraction info.
         control_message.set_metadata("chart_extraction_info", extraction_info)
-        logger.info("ChartExtractorStage: Metadata injection complete. Returning updated control message.")
+        logger.debug("ChartExtractorStage: Metadata injection complete. Returning updated control message.")
         do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
         if do_trace_tagging and execution_trace_log:
-            for key, ts in execution_trace_log.items():
-                control_message.set_timestamp(key, ts)
+            parent_name = self.stage_name if self.stage_name else "chart_extractor"
+            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
         return control_message

nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py CHANGED Viewed

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import logging
+from typing import Optional
 import ray
@@ -16,6 +17,8 @@ from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 logger = logging.getLogger(__name__)
@@ -26,12 +29,12 @@ class DocxExtractorStage(RayActorStage):
     It expects an IngestControlMessage containing a DataFrame with DOCX document data. It then:
       1. Removes the "docx-extract" task from the message.
-      2. Calls the DOCX extraction logic (via extract_primitives_from_docx_internal) using a validated configuration.
+      2. Calls the DOCX extraction logic (via extract_docx_internal) using a validated configuration.
       3. Updates the message payload with the extracted content DataFrame.
     """
-    def __init__(self, config: DocxExtractorSchema) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: DocxExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
         try:
             self.validated_config = config
             logger.info("DocxExtractorStage configuration validated successfully.")
@@ -39,9 +42,10 @@ class DocxExtractorStage(RayActorStage):
             logger.exception(f"Error validating DOCX Extractor config: {e}")
             raise
-    @traceable("docx_extractor")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=[("extract", {"document_type": "docx"})])
-    @nv_ingest_node_failure_try_except(annotation_id="docx_extractor", raise_on_failure=True)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by extracting content from DOCX documents.

nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl

Potentially problematic release.

nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl