nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. nv_ingest/api/__init__.py +6 -0
  2. nv_ingest/api/main.py +2 -0
  3. nv_ingest/api/tracing.py +82 -0
  4. nv_ingest/api/v2/README.md +203 -0
  5. nv_ingest/api/v2/__init__.py +3 -0
  6. nv_ingest/api/v2/ingest.py +1300 -0
  7. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  9. nv_ingest/framework/orchestration/execution/options.py +112 -0
  10. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
  12. nv_ingest/framework/orchestration/process/execution.py +495 -0
  13. nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
  14. nv_ingest/framework/orchestration/process/strategies.py +218 -0
  15. nv_ingest/framework/orchestration/process/termination.py +147 -0
  16. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
  17. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  18. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
  19. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  20. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
  21. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
  22. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
  23. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
  24. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
  25. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  26. nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
  28. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  29. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
  30. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
  31. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  32. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  33. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
  34. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
  35. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
  36. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  37. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  38. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
  39. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
  40. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
  41. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  42. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  43. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
  44. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
  45. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
  46. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  47. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  48. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  49. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  50. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  51. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  52. nv_ingest/pipeline/__init__.py +3 -0
  53. nv_ingest/pipeline/config/__init__.py +3 -0
  54. nv_ingest/pipeline/config/loaders.py +229 -0
  55. nv_ingest/pipeline/config/replica_resolver.py +237 -0
  56. nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
  57. nv_ingest/pipeline/default_pipeline_impl.py +557 -0
  58. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  59. nv_ingest/pipeline/pipeline_schema.py +398 -0
  60. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
  61. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
  62. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  63. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  64. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
  65. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
  66. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0

nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py

@@ -3,8 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import multiprocessing
-import os
-import signal
 import threading
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -22,6 +20,7 @@ import logging
 import time
 
 from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import PipelineTopology, StageInfo
+from nv_ingest.framework.orchestration.process.termination import kill_pipeline_process_group
 from nv_ingest.framework.orchestration.ray.primitives.ray_stat_collector import RayStatsCollector
 from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import PIDController, ResourceConstraintManager
 from nv_ingest.framework.orchestration.ray.util.pipeline.tools import wrap_callable_as_stage
@@ -120,24 +119,19 @@ class RayPipelineSubprocessInterface(PipelineInterface):
 
     def stop(self) -> None:
         """
-        Stops the subprocess pipeline. Tries terminate(), then escalates to SIGKILL on the process group if needed.
+        Stops the subprocess pipeline and its entire process group to ensure
+        any child processes (e.g., the simple message broker) are terminated.
         """
-        if not self._process.is_alive():
+        try:
+            pid = int(self._process.pid)
+        except Exception:
             return
 
+        # Always attempt to terminate the entire process group
         try:
-            self._process.terminate()
-            self._process.join(timeout=5.0)
+            kill_pipeline_process_group(pid)
         except Exception as e:
-            logger.warning(f"Failed to terminate process cleanly: {e}")
-
-        if self._process.is_alive():
-            try:
-                pgid = os.getpgid(self._process.pid)
-                os.killpg(pgid, signal.SIGKILL)
-            except Exception as e:
-                logger.error(f"Failed to force-kill process group: {e}")
-            self._process.join(timeout=3.0)
+            logger.warning(f"kill_pipeline_process_group failed: {e}")
 
 
 class RayPipelineInterface(PipelineInterface):
@@ -252,7 +246,7 @@
             penalty_factor=self.scaling_config.pid_penalty_factor,
             error_boost_factor=self.scaling_config.pid_error_boost_factor,
         )
-        logger.info("PIDController initialized using ScalingConfig.")
+        logger.debug("PIDController initialized using ScalingConfig.")
 
         try:
            total_system_memory_bytes = psutil.virtual_memory().total
@@ -270,7 +264,7 @@
             memory_threshold=absolute_memory_threshold_mb,
             memory_safety_buffer_fraction=self.scaling_config.rcm_memory_safety_buffer_fraction,
         )
-        logger.info("ResourceConstraintManager initialized using ScalingConfig.")
+        logger.debug("ResourceConstraintManager initialized using ScalingConfig.")
 
         # --- Instantiate Stats Collector ---
         self._stats_collection_interval_seconds = self.stats_config.collection_interval_seconds
@@ -282,7 +276,7 @@
             ema_alpha=self.scaling_config.pid_ema_alpha,
         )
 
-        logger.info("RayStatsCollector initialized using StatsConfig.")
+        logger.debug("RayStatsCollector initialized using StatsConfig.")
 
     # --- Accessor Methods for Stat Collector (and internal use) ---
 
@@ -349,11 +343,11 @@
         # Update constraint manager
         self.constraint_manager.max_replicas = total_max_replicas
 
-        logger.info(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
+        logger.debug(f"[Build-Configure] Autoscalers configured. Total Max Replicas: {total_max_replicas}")
 
     def _instantiate_initial_actors(self) -> None:
         """Instantiates initial actors and updates topology."""
-        logger.info("[Build-Actors] Instantiating initial stage actors (min_replicas)...")
+        logger.debug("[Build-Actors] Instantiating initial stage actors (min_replicas)...")
         # Use topology accessor
         current_stages = self.topology.get_stages_info()
 
@@ -377,7 +371,7 @@
             )
             try:
                 actor = stage.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
-                    config=stage.config
+                    config=stage.config, stage_name=stage.name
                 )
                 replicas.append(actor)
             except Exception as e:
@@ -388,7 +382,7 @@
             self.topology.set_actors_for_stage(stage.name, replicas)
             logger.debug(f"[Build-Actors] Stage '{stage.name}' initial actors set in topology: count={len(replicas)}")
 
-        logger.info("[Build-Actors] Initial actor instantiation complete.")
+        logger.debug("[Build-Actors] Initial actor instantiation complete.")
 
     def _create_and_wire_edges(self) -> List[ray.ObjectRef]:
         """
@@ -399,7 +393,7 @@
         List[ray.ObjectRef]
             A list of object references for the remote wiring calls.
         """
-        logger.info("[Build-Wiring] Creating and wiring edges...")
+        logger.debug("[Build-Wiring] Creating and wiring edges...")
         wiring_refs = []
         new_edge_queues: Dict[str, Tuple[Any, int]] = {}
 
@@ -628,7 +622,7 @@
         Dict[str, List[Any]]
             A dictionary mapping stage names to lists of actor handles.
         """
-        logger.info("--- Starting Pipeline Build Process ---")
+        logger.debug("--- Starting Pipeline Build Process ---")
         try:
             if not self.topology.get_stages_info():
                 logger.error("Build failed: No stages defined in topology.")
@@ -640,7 +634,7 @@
             wiring_futures = self._create_and_wire_edges()
             self._wait_for_wiring(wiring_futures)
 
-            logger.info("--- Pipeline Build Completed Successfully ---")
+            logger.debug("--- Pipeline Build Completed Successfully ---")
             return self.topology.get_stage_actors() # Return actors from topology
 
         except RuntimeError as e:
@@ -673,7 +667,7 @@
         logger.debug(f"[ScaleUtil] Creating new actor '{actor_name}' for stage '{stage_info.name}'")
         try:
             new_actor = stage_info.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
-                config=stage_info.config
+                config=stage_info.config, stage_name=stage_info.name
             )
 
             return new_actor
@@ -861,7 +855,7 @@
         # Select actors to remove (e.g., the most recently added)
         actors_to_remove = current_replicas[-num_to_remove:]
 
-        logger.info(f"[ScaleDown-{stage_name}] Selected {len(actors_to_remove)} actors for removal.")
+        logger.debug(f"[ScaleDown-{stage_name}] Selected {len(actors_to_remove)} actors for removal.")
 
         # Signal each actor to stop and mark it for removal by the topology.
         # The topology's cleanup thread will handle polling and final removal.
@@ -966,7 +960,7 @@
             True if the pipeline drained successfully, False otherwise.
         """
         start_time = time.time()
-        logger.info(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
+        logger.debug(f"Waiting for pipeline drain (Timeout: {timeout_seconds}s)...")
         last_in_flight = -1
         drain_check_interval = 1.0 # Check every second
 
@@ -1172,7 +1166,7 @@
         force : bool, optional
             Whether to force the flush, by default False.
         """
-        logger.info(f"Manual queue flush requested (force={force}).")
+        logger.debug(f"Manual queue flush requested (force={force}).")
 
         if self.topology.get_is_flushing() or self._stopping: # Check topology
             logger.warning("Flush already in progress or pipeline is stopping.")
@@ -1183,7 +1177,7 @@
             # For now, run synchronously:
             self._execute_queue_flush()
         else:
-            logger.info("Manual flush denied: pipeline not quiet or interval not met.")
+            logger.debug("Manual flush denied: pipeline not quiet or interval not met.")
 
     def _gather_controller_metrics(
         self, current_stage_stats: Dict[str, Dict[str, int]], global_in_flight: int
@@ -1409,7 +1403,7 @@
            self._consecutive_quiet_cycles += 1
            logger.debug(f"Pipeline is quiet. Consecutive quiet cycles: {self._consecutive_quiet_cycles}")
            if self._consecutive_quiet_cycles >= self.consecutive_quiet_cycles_for_flush:
-                logger.info(
+                logger.debug(
                    f"Pipeline has been quiet for {self._consecutive_quiet_cycles} cycles. "
                    "Initiating queue flush."
                )
@@ -1423,7 +1417,7 @@
                )
        else:
            if self._consecutive_quiet_cycles > 0:
-                logger.info(
+                logger.debug(
                    f"Pipeline is no longer quiet. Resetting consecutive quiet cycle count "
                    f"from {self._consecutive_quiet_cycles} to 0."
                )
@@ -1479,7 +1473,7 @@
        interval : float
            The interval in seconds.
        """
-        logger.info(f"Scaling loop started. Interval: {interval}s")
+        logger.debug(f"Scaling loop started. Interval: {interval}s")
        while self._scaling_monitoring:
            try:
                self._perform_scaling_and_maintenance()
@@ -1490,7 +1484,7 @@
            if not self._scaling_monitoring:
                break
            time.sleep(sleep_time)
-        logger.info("Scaling loop finished.")
+        logger.debug("Scaling loop finished.")
 
    def _start_scaling(self, poll_interval: float = 10.0) -> None:
        """
@@ -1505,7 +1499,7 @@
        self._scaling_monitoring = True
        self._scaling_thread = threading.Thread(target=self._scaling_loop, args=(poll_interval,), daemon=True)
        self._scaling_thread.start()
-        logger.info(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
+        logger.debug(f"Scaling/Maintenance thread launched (Interval: {poll_interval}s).")
 
    def _stop_scaling(self) -> None:
        """
@@ -1519,7 +1513,7 @@
            if self._scaling_thread.is_alive():
                logger.warning("Scaling thread did not exit cleanly.")
            self._scaling_thread = None
-        logger.info("Scaling/Maintenance stopped.")
+        logger.debug("Scaling/Maintenance stopped.")
 
    # --- Pipeline Start/Stop ---
    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
@@ -1548,7 +1542,7 @@
        logger.debug(f"Waiting for {len(start_futures)} actors to start...")
        try:
            ray.get(start_futures, timeout=60.0)
-            logger.info(f"{len(start_futures)} actors started.")
+            logger.debug(f"{len(start_futures)} actors started.")
        except Exception as e:
            logger.error(f"Error/Timeout starting actors: {e}", exc_info=True)
            self.stop() # Attempt cleanup
@@ -1593,7 +1587,7 @@
                logger.warning(
                    f"Timeout waiting for {len(not_ready)} actors to stop. " f"Proceeding with shutdown."
                )
-            logger.info(f"{len(ready)} actors confirmed stop.")
+            logger.debug(f"{len(ready)} actors confirmed stop.")
        except Exception as e:
            logger.error(f"An unexpected error occurred during actor shutdown: {e}", exc_info=True)
 
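
The rewritten stop() above no longer escalates inline; it delegates to kill_pipeline_process_group, newly imported from nv_ingest.framework.orchestration.process.termination (one of the modules added in this release, file 15 in the list). That helper's implementation is not shown in this diff; purely as a mental model, a process-group kill of this kind usually follows the SIGTERM-then-SIGKILL pattern sketched below (the function name and timeout are illustrative, not the actual nv-ingest code):

import os
import signal
import time


def kill_process_group_sketch(pid: int, term_timeout: float = 5.0) -> None:
    """Illustrative only: terminate the whole process group of `pid`,
    escalating from SIGTERM to SIGKILL if it does not exit in time."""
    try:
        pgid = os.getpgid(pid)
    except ProcessLookupError:
        return  # Process already gone; nothing to clean up.

    os.killpg(pgid, signal.SIGTERM)  # Polite request to every process in the group.

    deadline = time.monotonic() + term_timeout
    while time.monotonic() < deadline:
        try:
            os.killpg(pgid, 0)  # Signal 0 only probes for existence.
        except ProcessLookupError:
            return  # Group exited cleanly.
        time.sleep(0.1)

    try:
        os.killpg(pgid, signal.SIGKILL)  # Escalate: force-kill any stragglers.
    except ProcessLookupError:
        pass

Centralizing this logic in a shared helper is what lets the `import os` and `import signal` lines disappear from ray_pipeline.py, and, per the new docstring, it means child processes such as the simple message broker are covered by the same shutdown path as the pipeline subprocess itself.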

nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py

@@ -72,7 +72,7 @@ class RayStatsCollector:
         self._cumulative_stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {"processed": 0})
         self.ema_memory_per_replica: Dict[str, float] = {} # EMA of memory per replica
 
-        logger.info(
+        logger.debug(
             f"RayStatsCollector initialized (Interval: {self._interval}s, "
             f"Actor Timeout: {self._actor_timeout}s, Queue Timeout: {self._queue_timeout}s, "
             f"EMA Alpha: {self.ema_alpha})"
@@ -111,7 +111,7 @@
             self._running = False # Correct inconsistent state
 
         if not self._running:
-            logger.info("Starting stats collector thread...")
+            logger.debug("Starting stats collector thread...")
             self._running = True
             with self._lock:
                 self._last_update_successful = False # Mark as stale until first collection
@@ -129,7 +129,7 @@
     def stop(self) -> None:
         """Signals the background stats collection thread to stop and waits for it."""
         if self._running:
-            logger.info("Stopping stats collector thread...")
+            logger.debug("Stopping stats collector thread...")
             self._running = False # Signal loop to stop
 
             if self._thread is not None:
@@ -150,7 +150,7 @@
             with self._lock:
                 self._last_update_successful = False
                 self._collected_stats = {} # Clear last collected stats
-            logger.info("Stats collector thread stopped.")
+            logger.debug("Stats collector thread stopped.")
         else:
             logger.debug("Stats collector thread already stopped or never started.")
 
@@ -230,7 +230,7 @@
             # but time.sleep is simpler for now.
             time.sleep(sleep_time)
 
-        logger.info("Stats collector loop finished.")
+        logger.debug("Stats collector loop finished.")
 
     def collect_stats_now(self) -> Tuple[Dict[str, Dict[str, int]], int, bool]:
         """

nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py

@@ -4,7 +4,7 @@
 
 
 import logging
-
+from typing import Optional
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
@@ -16,6 +16,9 @@ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExt
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 
 logger = logging.getLogger(__name__)
 
@@ -31,8 +34,8 @@ class AudioExtractorStage(RayActorStage):
     3. Updates the message payload with the extracted text DataFrame.
     """
 
-    def __init__(self, config: AudioExtractorSchema) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: AudioExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
         try:
             self.validated_config = config
             self._logger.info("AudioExtractorStage configuration validated successfully.")
@@ -40,9 +43,10 @@
             self._logger.exception(f"Error validating Audio Extractor config: {e}")
             raise
 
-    @traceable("audio_extractor")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(mp3|wav)$"})])
-    @nv_ingest_node_failure_try_except(annotation_id="audio_extractor", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by extracting text from audio.
@@ -62,10 +66,9 @@
         # Extract the DataFrame payload.
         df_ledger = control_message.payload()
         self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
-
         # Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "extract")
-        self._logger.debug("Extracted task config: %s", task_config)
+        self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
 
         # Perform audio text extraction.
         new_df, extraction_info = extract_text_from_audio_internal(
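
The sanitize_for_logging(task_config) substitution seen here repeats in every extractor stage below (chart, docx, html, image), so raw task configurations are no longer logged verbatim at debug level. The function comes from nv_ingest_api.util.logging.sanitize and its exact rules are not part of this diff; the sketch below only illustrates the general shape of such a helper, and both the key names and the truncation limit are assumptions:

from typing import Any

SENSITIVE_KEYS = {"api_key", "auth_token", "password"}  # assumed key names, not nv-ingest's actual list
MAX_VALUE_LEN = 256  # assumed truncation limit for bulky values


def sanitize_for_logging_sketch(obj: Any) -> Any:
    """Illustrative only: redact secret-looking keys and truncate very large
    values (e.g., base64-encoded payloads) before they reach the logs."""
    if isinstance(obj, dict):
        return {
            key: "***REDACTED***" if str(key).lower() in SENSITIVE_KEYS else sanitize_for_logging_sketch(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return type(obj)(sanitize_for_logging_sketch(item) for item in obj)
    if isinstance(obj, str) and len(obj) > MAX_VALUE_LEN:
        return obj[:MAX_VALUE_LEN] + f"... <{len(obj) - MAX_VALUE_LEN} chars truncated>"
    return obj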

nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py

@@ -3,19 +3,21 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
-from typing import Any
+from typing import Any, Optional
 
 import ray
-
-from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
-from nv_ingest.framework.util.flow_control import filter_by_task
 from nv_ingest_api.internal.extract.image.chart_extractor import extract_chart_data_from_image_internal
 from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
+from nv_ingest_api.internal.primitives.tracing.tagging import set_trace_timestamps_with_parent_context
+from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 from nv_ingest_api.internal.primitives.tracing.tagging import traceable
 from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
 
 logger = logging.getLogger(__name__)
 
@@ -31,8 +33,8 @@ class ChartExtractorStage(RayActorStage):
     and annotates the message metadata with extraction info.
     """
 
-    def __init__(self, config: ChartExtractorSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: ChartExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
             # logger.warning(
@@ -42,9 +44,10 @@
             logger.exception("Error validating chart extractor config")
             raise e
 
-    @traceable("chart_extraction")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["chart_data_extract"])
-    @nv_ingest_node_failure_try_except(annotation_id="chart_extraction", raise_on_failure=False)
     def on_data(self, control_message: Any) -> Any:
         """
         Process the control message by extracting chart data.
@@ -59,14 +62,14 @@
         IngestControlMessage
             The updated message with the extracted chart data and extraction info in metadata.
         """
-        logger.info("ChartExtractorStage.on_data: Starting chart extraction.")
+        logger.debug("ChartExtractorStage.on_data: Starting chart extraction.")
         # Extract the DataFrame payload.
         df_payload = control_message.payload()
         logger.debug("ChartExtractorStage: Extracted payload with %d rows.", len(df_payload))
 
         # Remove the "chart_data_extract" task to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "chart_data_extract")
-        logger.debug("ChartExtractorStage: Task config extracted: %s", task_config)
+        logger.debug("ChartExtractorStage: Task config extracted: %s", sanitize_for_logging(task_config))
 
         # Perform chart data extraction.
         execution_trace_log = {}
@@ -76,17 +79,17 @@
             extraction_config=self.validated_config,
             execution_trace_log=execution_trace_log,
         )
-        logger.info("ChartExtractorStage: Chart extraction completed. New payload has %d rows.", len(new_df))
+        logger.debug("ChartExtractorStage: Chart extraction completed. New payload has %d rows.", len(new_df))
 
         # Update the control message with the new DataFrame.
         control_message.payload(new_df)
         # Annotate the message with extraction info.
         control_message.set_metadata("chart_extraction_info", extraction_info)
-        logger.info("ChartExtractorStage: Metadata injection complete. Returning updated control message.")
+        logger.debug("ChartExtractorStage: Metadata injection complete. Returning updated control message.")
 
         do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
         if do_trace_tagging and execution_trace_log:
-            for key, ts in execution_trace_log.items():
-                control_message.set_timestamp(key, ts)
+            parent_name = self.stage_name if self.stage_name else "chart_extractor"
+            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
 
         return control_message
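
The chart stage shows the full shape that every extractor converges on in this release: failure handling moved to the outermost decorator (its annotation_id and raise_on_failure arguments dropped), @traceable() left argument-free, the new @udf_intercept_hook() inserted above task filtering, stage_name threaded through the constructor, and per-stage trace timestamps attributed via set_trace_timestamps_with_parent_context. Condensed into one skeleton, where the class name, task name, and extraction body are placeholders while the imports and decorator order are the ones used in the hunks:

import logging
from typing import Any, Optional

import ray

from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
from nv_ingest.framework.util.flow_control import filter_by_task
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except

logger = logging.getLogger(__name__)


@ray.remote
class ExampleExtractorStage(RayActorStage):  # placeholder stage
    def __init__(self, config: Any, stage_name: Optional[str] = None) -> None:
        # stage_name is forwarded by the pipeline so traces and failure
        # annotations use the configured stage name rather than a hard-coded id.
        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
        self.validated_config = config

    @nv_ingest_node_failure_try_except()  # outermost: wraps everything below in the stage's failure handling
    @traceable()                          # entry/exit timestamps for this stage
    @udf_intercept_hook()                 # new hook for user-defined function (UDF) interception
    @filter_by_task(required_tasks=["example_task"])  # placeholder task name
    def on_data(self, control_message):
        execution_trace_log = {}
        # ... task-specific extraction that fills execution_trace_log ...
        if control_message.get_metadata("config::add_trace_tagging") is True and execution_trace_log:
            parent_name = self.stage_name if self.stage_name else "example_extractor"
            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
        return control_message

Because decorators apply bottom-up, nv_ingest_node_failure_try_except now wraps tracing, the UDF hook, and the task filter, so the reordering is not cosmetic: exceptions raised anywhere in that chain pass through the same handler as extraction errors.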

nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
+from typing import Optional
 
 import ray
 
@@ -15,6 +16,9 @@ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtra
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 
 logger = logging.getLogger(__name__)
 
@@ -26,12 +30,12 @@ class DocxExtractorStage(RayActorStage):
 
     It expects an IngestControlMessage containing a DataFrame with DOCX document data. It then:
     1. Removes the "docx-extract" task from the message.
-    2. Calls the DOCX extraction logic (via extract_primitives_from_docx_internal) using a validated configuration.
+    2. Calls the DOCX extraction logic (via extract_docx_internal) using a validated configuration.
     3. Updates the message payload with the extracted content DataFrame.
     """
 
-    def __init__(self, config: DocxExtractorSchema) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: DocxExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
         try:
             self.validated_config = config
             logger.info("DocxExtractorStage configuration validated successfully.")
@@ -39,9 +43,10 @@
             logger.exception(f"Error validating DOCX Extractor config: {e}")
             raise
 
-    @traceable("docx_extractor")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=[("extract", {"document_type": "docx"})])
-    @nv_ingest_node_failure_try_except(annotation_id="docx_extractor", raise_on_failure=True)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by extracting content from DOCX documents.
@@ -64,7 +69,7 @@
 
         # Remove the "docx-extract" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "extract")
-        self._logger.debug("Extracted task config: %s", task_config)
+        self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
 
         # Perform DOCX content extraction.
         new_df, extraction_info = extract_primitives_from_docx_internal(

nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py

@@ -4,6 +4,7 @@
 
 
 import logging
+from typing import Optional
 
 import ray
 
@@ -16,6 +17,9 @@ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtra
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 
 logger = logging.getLogger(__name__)
 
@@ -31,8 +35,8 @@ class HtmlExtractorStage(RayActorStage):
     3. Updates the message payload with the extracted text DataFrame.
     """
 
-    def __init__(self, config: HtmlExtractorSchema) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: HtmlExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
         try:
             self.validated_config = config
             self._logger.info("HtmlExtractorStage configuration validated successfully.")
@@ -40,9 +44,10 @@
             self._logger.exception(f"Error validating Html Extractor config: {e}")
             raise
 
-    @traceable("html_extractor")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=[("extract", {"document_type": "html"})])
-    @nv_ingest_node_failure_try_except(annotation_id="html_extractor", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by extracting content from html.
@@ -65,7 +70,7 @@
 
         # Remove the "html_content_extract" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "extract")
-        self._logger.debug("Extracted task config: %s", task_config)
+        self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
 
         # Perform html content extraction.
         new_df, extraction_info = extract_markdown_from_html_internal(

nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
+from typing import Optional
 
 import ray
 
@@ -15,6 +16,9 @@ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExt
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 
 logger = logging.getLogger(__name__)
 
@@ -30,18 +34,19 @@ class ImageExtractorStage(RayActorStage):
     3. Updates the message payload with the extracted primitives DataFrame.
     """
 
-    def __init__(self, config: ImageExtractorSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: ImageExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
         try:
             self.validated_config = config
-            logger.info("ImageExtractorStage configuration validated successfully.")
+            self._logger.info("ImageExtractorStage configuration validated successfully.")
         except Exception as e:
-            logger.exception(f"Error validating Image Extractor config: {e}")
+            self._logger.exception(f"Error validating Image Extractor config: {e}")
             raise
 
-    @traceable("image_extraction")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(png|jpeg|jpg|tiff|bmp)$"})])
-    @nv_ingest_node_failure_try_except(annotation_id="image_extractor", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by extracting primitives from images.
@@ -64,7 +69,7 @@
 
         # Remove the "extract" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "extract")
-        logger.debug("Extracted task config: %s", task_config)
+        logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
 
         # Perform image primitives extraction.
         new_df, extraction_info = extract_primitives_from_image_internal(

nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py

@@ -5,32 +5,44 @@
 import logging
 import ray
 
-from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
 from nv_ingest.framework.util.flow_control import filter_by_task
 from nv_ingest_api.internal.extract.image.infographic_extractor import extract_infographic_data_from_image_internal
 from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
-from nv_ingest_api.internal.primitives.tracing.tagging import traceable
+from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
+from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
 from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
+from typing import Optional
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 
 logger = logging.getLogger(__name__)
 
 
 @ray.remote
 class InfographicExtractorStage(RayActorStage):
-    def __init__(self, config: InfographicExtractorSchema) -> None:
-        super().__init__(config)
+    """
+    A Ray actor stage that extracts infographic data from image content.
+
+    It expects an IngestControlMessage containing a DataFrame with image data. It then:
+    1. Removes the "infographic_data_extract" task from the message.
+    2. Calls the infographic extraction logic using a validated configuration.
+    3. Updates the message payload with the extracted infographic DataFrame.
+    """
 
+    def __init__(self, config: InfographicExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
        try:
            self.validated_config = config
-            logger.info("ImageExtractorStage configuration validated successfully.")
+            self._logger.info("InfographicExtractorStage configuration validated successfully.")
        except Exception as e:
-            logger.exception(f"Error validating Image Extractor config: {e}")
+            self._logger.exception(f"Error validating Infographic extractor config: {e}")
            raise
 
-    @traceable("infographic_extraction")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
    @filter_by_task(required_tasks=["infographic_data_extract"])
-    @nv_ingest_node_failure_try_except(annotation_id="infographic_extraction", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        # Extract DataFrame payload
        df_ledger = control_message.payload()
@@ -51,7 +63,7 @@ class InfographicExtractorStage(RayActorStage):
 
        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
        if do_trace_tagging and execution_trace_log:
-            for key, ts in execution_trace_log.items():
-                control_message.set_timestamp(key, ts)
+            parent_name = self.stage_name if self.stage_name else "infographic_extractor"
+            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
 
        return control_message
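
The other half of the stage_name plumbing is on the pipeline side: the ray_pipeline.py hunks earlier pass stage_name=stage.name (and stage_name=stage_info.name) into each actor's .remote(...) call, and the stages above fall back to a hard-coded name such as "infographic_extractor" only when nothing was passed. A minimal sketch of that call site, with a stand-in for the StageInfo records that PipelineTopology holds (the replica naming scheme is illustrative):

from dataclasses import dataclass
from typing import Any


@dataclass
class StageInfoLike:
    """Stand-in for the StageInfo records held by PipelineTopology."""
    name: str
    callable: Any  # a @ray.remote actor class, e.g. ChartExtractorStage
    config: Any


def spawn_replica(stage_info: StageInfoLike, replica_index: int) -> Any:
    """Mirrors the two actor-creation call sites changed in ray_pipeline.py:
    the stage's configured name is now forwarded into the actor."""
    actor_name = f"{stage_info.name}_{replica_index}"  # illustrative naming
    return stage_info.callable.options(name=actor_name, max_concurrency=1, max_restarts=0).remote(
        config=stage_info.config,
        stage_name=stage_info.name,  # the new keyword added in both hunks
    )

With both ends in place, a stage renamed in the pipeline configuration shows up under that name in trace timestamps, instead of the fixed ids that the old @traceable("...") and annotation_id arguments pinned in code.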