nv-ingest 2025.8.16.dev20250816__py3-none-any.whl → 2025.11.21.dev20251121__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. nv_ingest/api/__init__.py +6 -0
  2. nv_ingest/api/main.py +2 -0
  3. nv_ingest/api/tracing.py +82 -0
  4. nv_ingest/api/v2/README.md +203 -0
  5. nv_ingest/api/v2/__init__.py +3 -0
  6. nv_ingest/api/v2/ingest.py +1300 -0
  7. nv_ingest/framework/orchestration/process/dependent_services.py +43 -14
  8. nv_ingest/framework/orchestration/process/execution.py +92 -94
  9. nv_ingest/framework/orchestration/process/lifecycle.py +98 -6
  10. nv_ingest/framework/orchestration/process/strategies.py +41 -5
  11. nv_ingest/framework/orchestration/process/termination.py +147 -0
  12. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
  13. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +9 -15
  14. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +2 -3
  15. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +5 -2
  16. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +2 -1
  17. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +2 -1
  18. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +2 -1
  19. nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  20. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +5 -2
  21. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +2 -1
  22. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +2 -1
  23. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +2 -1
  24. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +2 -1
  25. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +46 -9
  26. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +2 -1
  27. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +5 -1
  28. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +5 -1
  29. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +4 -3
  30. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  31. nv_ingest/pipeline/config/loaders.py +33 -2
  32. nv_ingest/pipeline/default_libmode_pipeline_impl.py +514 -0
  33. nv_ingest/pipeline/default_pipeline_impl.py +111 -88
  34. {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/METADATA +4 -3
  35. {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/RECORD +38 -31
  36. {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/WHEEL +0 -0
  37. {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,147 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Process termination utilities, isolated to avoid circular imports.
7
+
8
+ This module provides functions to terminate a process and its entire process
9
+ group safely, without depending on pipeline construction or Ray types.
10
+ """
11
+
12
+ import logging
13
+ import os
14
+ import signal
15
+ import time
16
+ from typing import Optional
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def _safe_log(level: int, msg: str) -> None:
    """Log ``msg`` at ``level`` on a best-effort basis; never raises.

    While the record is emitted, ``logging.raiseExceptions`` is switched off so
    that a handler whose stream is already closed (common during process
    teardown) does not make the logging module print "--- Logging error ---"
    to stderr. The previous value is restored afterwards.

    If no handler is reachable from the module logger, or anything goes wrong
    while logging, the message is written to ``sys.__stderr__`` instead (when
    that stream is available).

    Parameters
    ----------
    level : int
        Numeric logging level (e.g. ``logging.DEBUG``).
    msg : str
        Fully formatted message to emit.
    """
    logging_mod = None
    previous = True
    try:
        # Imported inside the try on purpose: the whole body must stay
        # exception-free even if the import machinery is unavailable.
        import logging as logging_mod

        previous = getattr(logging_mod, "raiseExceptions", True)
        # Suppress handler errors being printed to stderr.
        logging_mod.raiseExceptions = False

        # hasHandlers() also considers handlers attached to ancestor loggers
        # (e.g. the root logger). Checking ``logger.handlers`` alone misses the
        # usual setup where the module logger has no handler of its own, which
        # would route every message, whatever its level, to the stderr fallback.
        if logger.hasHandlers():
            logger.log(level, msg)
            return
    except Exception:
        # Intentionally ignored: fall through to the stderr fallback below.
        pass
    finally:
        # Only restore the flag if we actually got hold of the logging module.
        if logging_mod is not None:
            try:
                logging_mod.raiseExceptions = previous
            except Exception:
                pass

    # Fallback to stderr if available.
    try:
        import sys

        stream = getattr(sys, "__stderr__", None)
        if stream:
            stream.write(msg + "\n")
            stream.flush()
    except Exception:
        pass
60
+
61
+
62
def kill_pipeline_process_group(
    process,
    *,
    term_timeout: float = 5.0,
    grace_period: float = 2.0,
    kill_timeout: float = 3.0,
) -> None:
    """
    Kill a process and its entire process group.

    Accepts either a multiprocessing.Process-like object exposing a ``pid`` attribute
    or a raw PID integer. Sends SIGTERM to the process group first, and escalates
    to SIGKILL if it does not terminate within a short grace period.

    Parameters
    ----------
    process : multiprocessing.Process | int
        Process handle (or a raw PID int) for the process whose process group should be terminated.
    term_timeout : float, optional
        Seconds to ``join()`` a process handle after SIGTERM before checking liveness.
    grace_period : float, optional
        Seconds to sleep after SIGTERM when only a raw PID is available.
    kill_timeout : float, optional
        Seconds to ``join()`` a process handle after the SIGKILL phase.

    Raises
    ------
    AttributeError
        If ``process`` is neither an int PID nor an object with a ``pid`` attribute
        convertible to ``int`` (booleans are rejected as well).
    ValueError
        If the resolved PID is not a positive integer.
    """
    proc: Optional[object] = None
    pid: Optional[int] = None

    # bool is a subclass of int: without the explicit exclusion, True/False
    # would be silently interpreted as PID 1 / PID 0.
    if isinstance(process, int) and not isinstance(process, bool):
        pid = process
    elif hasattr(process, "pid"):
        proc = process
        try:
            pid = int(getattr(proc, "pid"))
        except Exception as e:
            # Chain the cause so the original conversion error stays visible.
            raise AttributeError(f"Invalid process-like object without usable pid: {e}") from e
    else:
        raise AttributeError(
            "kill_pipeline_process_group expects a multiprocessing.Process or a PID int (process-like object with .pid)"
        )

    if proc is not None and hasattr(proc, "is_alive") and not proc.is_alive():
        _safe_log(logging.DEBUG, "Process already terminated")
        return

    # os.getpgid(0) resolves to the *caller's* process group, so a zero PID
    # would make this function signal the calling process itself.
    if pid <= 0:
        raise ValueError(f"Refusing to terminate process group for non-positive PID: {pid}")

    _safe_log(logging.INFO, f"Terminating pipeline process group (PID: {pid})")

    can_join = proc is not None and hasattr(proc, "join")

    try:
        # Send graceful termination to the entire process group.
        try:
            pgid = os.getpgid(pid)
        except (OSError, OverflowError):
            # Process already gone (or the PID cannot exist on this platform).
            _safe_log(logging.DEBUG, f"Process group for PID {pid} not found during SIGTERM phase")
            return
        try:
            os.killpg(pgid, signal.SIGTERM)
        except ProcessLookupError:
            _safe_log(logging.DEBUG, f"Process group for PID {pid} no longer exists (SIGTERM)")
            return

        if can_join:
            # With a Process handle, give it a chance to exit cleanly.
            try:
                proc.join(timeout=term_timeout)
            except Exception:
                # Best effort: a failing join() must not prevent escalation.
                pass
            # A handle without is_alive() is conservatively treated as alive.
            still_alive = getattr(proc, "is_alive", lambda: True)()
        else:
            # Without a handle, provide a small grace period and then probe
            # whether the PID still resolves to a process group.
            # NOTE(review): an exited-but-unreaped child presumably still
            # resolves here, which would trigger the SIGKILL phase -- confirm.
            time.sleep(grace_period)
            try:
                os.getpgid(pid)
            except (OSError, OverflowError):
                still_alive = False
            else:
                still_alive = True

        if still_alive:
            _safe_log(logging.WARNING, "Process group did not terminate gracefully, using SIGKILL")
            try:
                try:
                    pgid_now = os.getpgid(pid)
                except (OSError, OverflowError):
                    _safe_log(logging.DEBUG, f"Process group for PID {pid} vanished before SIGKILL")
                    return
                os.killpg(pgid_now, signal.SIGKILL)
            finally:
                # Runs on every exit from the SIGKILL phase, including the
                # early return above and errors raised by killpg().
                if can_join:
                    try:
                        proc.join(timeout=kill_timeout)
                    except Exception:
                        pass

    except OSError as e:
        # ProcessLookupError is a subclass of OSError, so one clause covers both.
        _safe_log(logging.DEBUG, f"Process group already terminated or not found: {e}")
@@ -152,11 +152,11 @@ if __name__ == "__main__":
152
152
  os.environ["OCR_MODEL_NAME"] = "paddle"
153
153
  os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
154
154
  os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
155
- os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
155
+ os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/nemotron-nano-12b-v2-vl"
156
156
  logger.info("Environment variables set.")
157
157
 
158
158
  image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
159
- model_name = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
159
+ model_name = "nvidia/nemotron-nano-12b-v2-vl"
160
160
  yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
161
161
  (
162
162
  yolox_table_structure_grpc,
@@ -3,8 +3,6 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import multiprocessing
6
- import os
7
- import signal
8
6
  import threading
9
7
  from abc import ABC, abstractmethod
10
8
  from dataclasses import dataclass
@@ -22,6 +20,7 @@ import logging
22
20
  import time
23
21
 
24
22
  from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import PipelineTopology, StageInfo
23
+ from nv_ingest.framework.orchestration.process.termination import kill_pipeline_process_group
25
24
  from nv_ingest.framework.orchestration.ray.primitives.ray_stat_collector import RayStatsCollector
26
25
  from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import PIDController, ResourceConstraintManager
27
26
  from nv_ingest.framework.orchestration.ray.util.pipeline.tools import wrap_callable_as_stage
@@ -120,24 +119,19 @@ class RayPipelineSubprocessInterface(PipelineInterface):
120
119
 
121
120
  def stop(self) -> None:
122
121
  """
123
- Stops the subprocess pipeline. Tries terminate(), then escalates to SIGKILL on the process group if needed.
122
+ Stops the subprocess pipeline and its entire process group to ensure
123
+ any child processes (e.g., the simple message broker) are terminated.
124
124
  """
125
- if not self._process.is_alive():
125
+ try:
126
+ pid = int(self._process.pid)
127
+ except Exception:
126
128
  return
127
129
 
130
+ # Always attempt to terminate the entire process group
128
131
  try:
129
- self._process.terminate()
130
- self._process.join(timeout=5.0)
132
+ kill_pipeline_process_group(pid)
131
133
  except Exception as e:
132
- logger.warning(f"Failed to terminate process cleanly: {e}")
133
-
134
- if self._process.is_alive():
135
- try:
136
- pgid = os.getpgid(self._process.pid)
137
- os.killpg(pgid, signal.SIGKILL)
138
- except Exception as e:
139
- logger.error(f"Failed to force-kill process group: {e}")
140
- self._process.join(timeout=3.0)
134
+ logger.warning(f"kill_pipeline_process_group failed: {e}")
141
135
 
142
136
 
143
137
  class RayPipelineInterface(PipelineInterface):
@@ -5,7 +5,6 @@
5
5
 
6
6
  import logging
7
7
  from typing import Optional
8
-
9
8
  import ray
10
9
 
11
10
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
@@ -17,6 +16,7 @@ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExt
17
16
  from nv_ingest_api.util.exception_handlers.decorators import (
18
17
  nv_ingest_node_failure_try_except,
19
18
  )
19
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
20
20
 
21
21
  from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
22
22
 
@@ -66,10 +66,9 @@ class AudioExtractorStage(RayActorStage):
66
66
  # Extract the DataFrame payload.
67
67
  df_ledger = control_message.payload()
68
68
  self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
69
-
70
69
  # Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
71
70
  task_config = remove_task_by_type(control_message, "extract")
72
- self._logger.debug("Extracted task config: %s", task_config)
71
+ self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
73
72
 
74
73
  # Perform audio text extraction.
75
74
  new_df, extraction_info = extract_text_from_audio_internal(
@@ -13,8 +13,11 @@ from nv_ingest.framework.util.flow_control import filter_by_task
13
13
  from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
14
14
  from nv_ingest_api.internal.primitives.tracing.tagging import traceable
15
15
  from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
16
- from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
17
16
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
17
+ from nv_ingest_api.util.exception_handlers.decorators import (
18
+ nv_ingest_node_failure_try_except,
19
+ )
20
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
18
21
 
19
22
  logger = logging.getLogger(__name__)
20
23
 
@@ -66,7 +69,7 @@ class ChartExtractorStage(RayActorStage):
66
69
 
67
70
  # Remove the "chart_data_extract" task to obtain task-specific configuration.
68
71
  task_config = remove_task_by_type(control_message, "chart_data_extract")
69
- logger.debug("ChartExtractorStage: Task config extracted: %s", task_config)
72
+ logger.debug("ChartExtractorStage: Task config extracted: %s", sanitize_for_logging(task_config))
70
73
 
71
74
  # Perform chart data extraction.
72
75
  execution_trace_log = {}
@@ -16,6 +16,7 @@ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtra
16
16
  from nv_ingest_api.util.exception_handlers.decorators import (
17
17
  nv_ingest_node_failure_try_except,
18
18
  )
19
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
19
20
 
20
21
  from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
21
22
 
@@ -68,7 +69,7 @@ class DocxExtractorStage(RayActorStage):
68
69
 
69
70
  # Remove the "docx-extract" task from the message to obtain task-specific configuration.
70
71
  task_config = remove_task_by_type(control_message, "extract")
71
- self._logger.debug("Extracted task config: %s", task_config)
72
+ self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
72
73
 
73
74
  # Perform DOCX content extraction.
74
75
  new_df, extraction_info = extract_primitives_from_docx_internal(
@@ -17,6 +17,7 @@ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtra
17
17
  from nv_ingest_api.util.exception_handlers.decorators import (
18
18
  nv_ingest_node_failure_try_except,
19
19
  )
20
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
20
21
 
21
22
  from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
22
23
 
@@ -69,7 +70,7 @@ class HtmlExtractorStage(RayActorStage):
69
70
 
70
71
  # Remove the "html_content_extract" task from the message to obtain task-specific configuration.
71
72
  task_config = remove_task_by_type(control_message, "extract")
72
- self._logger.debug("Extracted task config: %s", task_config)
73
+ self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
73
74
 
74
75
  # Perform html content extraction.
75
76
  new_df, extraction_info = extract_markdown_from_html_internal(
@@ -16,6 +16,7 @@ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExt
16
16
  from nv_ingest_api.util.exception_handlers.decorators import (
17
17
  nv_ingest_node_failure_try_except,
18
18
  )
19
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
19
20
 
20
21
  from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
21
22
 
@@ -68,7 +69,7 @@ class ImageExtractorStage(RayActorStage):
68
69
 
69
70
  # Remove the "extract" task from the message to obtain task-specific configuration.
70
71
  task_config = remove_task_by_type(control_message, "extract")
71
- logger.debug("Extracted task config: %s", task_config)
72
+ logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
72
73
 
73
74
  # Perform image primitives extraction.
74
75
  new_df, extraction_info = extract_primitives_from_image_internal(
@@ -0,0 +1,71 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ import ray
7
+
8
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
9
+ from nv_ingest.framework.util.flow_control import filter_by_task
10
+ from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
11
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
12
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
13
+ from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
14
+ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
15
+ from typing import Optional
16
+
17
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@ray.remote
class OCRExtractorStage(RayActorStage):
    """
    A Ray actor stage that extracts text data from image content.

    It expects an IngestControlMessage containing a DataFrame with image data. It then:
      1. Removes the "ocr_data_extract" task from the message.
      2. Calls the text extraction logic using the stage configuration.
      3. Updates the message payload with the extracted text DataFrame.
    """

    def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
        """Store the stage configuration after initializing the base actor stage."""
        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
        try:
            self.validated_config = config
            self._logger.info("OCRExtractorStage configuration validated successfully.")
        except Exception as e:
            # Lazy %-formatting; logger.exception() also records the traceback.
            self._logger.exception("Error validating OCR extractor config: %s", e)
            raise

    @nv_ingest_node_failure_try_except()
    @traceable()
    @udf_intercept_hook()
    @filter_by_task(required_tasks=["ocr_data_extract"])
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Run OCR text extraction on the message payload.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose payload is the DataFrame to process.

        Returns
        -------
        IngestControlMessage
            The same message. When the payload is empty it is returned untouched
            (the "ocr_data_extract" task is not removed in that case); otherwise its
            payload is replaced with the extraction result and the
            "ocr_extraction_info" metadata entry is set.
        """
        # Extract DataFrame payload; nothing to do for an empty ledger.
        df_ledger = control_message.payload()
        if df_ledger.empty:
            return control_message

        # Remove the "ocr_data_extract" task from the message to obtain its configuration.
        task_config = remove_task_by_type(control_message, "ocr_data_extract")

        # Passed to the extraction call, which may populate it with trace entries.
        execution_trace_log = {}
        new_df, extraction_info = extract_text_data_from_image_internal(
            df_extraction_ledger=df_ledger,
            task_config=task_config,
            extraction_config=self.validated_config,
            execution_trace_log=execution_trace_log,
        )

        control_message.payload(new_df)
        control_message.set_metadata("ocr_extraction_info", extraction_info)

        # Trace timestamps are only attached when tagging is explicitly enabled
        # on the message and the extraction call recorded something.
        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
        if do_trace_tagging and execution_trace_log:
            parent_name = self.stage_name if self.stage_name else "ocr_extractor"
            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)

        return control_message
@@ -15,7 +15,10 @@ from nv_ingest_api.internal.primitives.tracing.tagging import set_trace_timestam
15
15
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
16
16
  from nv_ingest.framework.util.flow_control import filter_by_task
17
17
  from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
18
- from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
18
+ from nv_ingest_api.util.exception_handlers.decorators import (
19
+ nv_ingest_node_failure_try_except,
20
+ )
21
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
19
22
 
20
23
  logger = logging.getLogger(__name__)
21
24
 
@@ -87,7 +90,7 @@ class PDFExtractorStage(RayActorStage):
87
90
 
88
91
  # Remove the "extract" task from the message to obtain task-specific configuration.
89
92
  task_config = remove_task_by_type(control_message, "extract")
90
- logger.debug("Extracted task config: %s", task_config)
93
+ logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
91
94
 
92
95
  # Perform PDF extraction.
93
96
  execution_trace_log = {}
@@ -16,6 +16,7 @@ from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExt
16
16
  from nv_ingest_api.util.exception_handlers.decorators import (
17
17
  nv_ingest_node_failure_try_except,
18
18
  )
19
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
19
20
 
20
21
  logger = logging.getLogger(__name__)
21
22
 
@@ -65,7 +66,7 @@ class TableExtractorStage(RayActorStage):
65
66
 
66
67
  # Remove the "table_data_extract" task to obtain task-specific configuration.
67
68
  task_config = remove_task_by_type(control_message, "table_data_extract")
68
- logger.debug("Extracted task configuration: %s", task_config)
69
+ logger.debug("Extracted task configuration: %s", sanitize_for_logging(task_config))
69
70
 
70
71
  # Perform table data extraction.
71
72
  execution_trace_log = {}
@@ -25,6 +25,7 @@ from nv_ingest_api.util.exception_handlers.decorators import (
25
25
  nv_ingest_node_failure_try_except,
26
26
  )
27
27
  from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
28
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
28
29
 
29
30
  logger = logging.getLogger(__name__)
30
31
 
@@ -42,7 +43,7 @@ class MetadataInjectionStage(RayActorStage):
42
43
  # Call the base initializer to set attributes like self._running.
43
44
  super().__init__(config, stage_name=stage_name)
44
45
  # Additional initialization can be added here if necessary.
45
- self._logger.debug("MetadataInjectionStage initialized with config: %s", config)
46
+ self._logger.debug("MetadataInjectionStage initialized with config: %s", sanitize_for_logging(config))
46
47
 
47
48
  @nv_ingest_node_failure_try_except()
48
49
  @traceable()
@@ -18,6 +18,7 @@ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import Imag
18
18
  from nv_ingest_api.util.exception_handlers.decorators import (
19
19
  nv_ingest_node_failure_try_except,
20
20
  )
21
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
21
22
 
22
23
  logger = logging.getLogger(__name__)
23
24
 
@@ -68,7 +69,7 @@ class ImageDedupStage(RayActorStage):
68
69
 
69
70
  # Remove the "dedup" task from the message to obtain task-specific configuration.
70
71
  task_config = remove_task_by_type(control_message, "dedup")
71
- logger.debug("Extracted task config: %s", task_config)
72
+ logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
72
73
 
73
74
  # Perform image deduplication.
74
75
  new_df = deduplicate_images_internal(
@@ -17,6 +17,7 @@ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema impo
17
17
  from nv_ingest_api.util.exception_handlers.decorators import (
18
18
  nv_ingest_node_failure_try_except,
19
19
  )
20
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
20
21
 
21
22
  logger = logging.getLogger(__name__)
22
23
 
@@ -67,7 +68,7 @@ class ImageFilterStage(RayActorStage):
67
68
 
68
69
  # Remove the "filter" task from the message to obtain task-specific configuration.
69
70
  task_config = remove_task_by_type(control_message, "filter")
70
- logger.debug("Extracted task config: %s", task_config)
71
+ logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
71
72
 
72
73
  task_params: Dict[str, Any] = task_config.get("params", {})
73
74
 
@@ -29,6 +29,8 @@ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_inges
29
29
  # Import clients
30
30
  from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
31
31
  from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
32
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
33
+ from nv_ingest_api.util.message_brokers.qos_scheduler import QosScheduler
32
34
 
33
35
  logger = logging.getLogger(__name__)
34
36
 
@@ -88,8 +90,10 @@ class MessageBrokerTaskSourceConfig(BaseModel):
88
90
 
89
91
  # Use the discriminated union for broker_client
90
92
  broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
91
- task_queue: str = Field(..., description="The name of the queue to fetch tasks from.")
92
- poll_interval: float = Field(default=0.1, gt=0, description="Polling interval in seconds.")
93
+ task_queue: str = Field(
94
+ ..., description="The base name of the queue to fetch tasks from. Derives sub-queues for fair scheduling."
95
+ )
96
+ poll_interval: float = Field(default=0.0, gt=0, description="Polling interval in seconds.")
93
97
 
94
98
 
95
99
  @ray.remote
@@ -104,8 +108,11 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
104
108
  def __init__(self, config: MessageBrokerTaskSourceConfig, stage_name: Optional[str] = None) -> None:
105
109
  super().__init__(config, log_to_stdout=False, stage_name=stage_name)
106
110
  self.config: MessageBrokerTaskSourceConfig # Add a type hint for self.config
111
+
112
+ # Sanitize config before logging to avoid leaking secrets
113
+ _sanitized = sanitize_for_logging(config)
107
114
  self._logger.debug(
108
- "Initializing MessageBrokerTaskSourceStage with config: %s", config.model_dump()
115
+ "Initializing MessageBrokerTaskSourceStage with config: %s", _sanitized
109
116
  ) # Log validated config
110
117
 
111
118
  # Access validated configuration directly via self.config
@@ -130,7 +137,29 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
130
137
  self._current_backoff_sleep: float = 0.0
131
138
  self._last_backoff_log_time: float = 0.0
132
139
 
133
- self._logger.debug("MessageBrokerTaskSourceStage initialized. Task queue: %s", self.task_queue)
140
+ # Initialize QoS scheduler. Use a simple base-queue strategy for SimpleClient.
141
+ strategy = "simple" if isinstance(self.client, SimpleClient) else "lottery"
142
+ self.scheduler = QosScheduler(
143
+ self.task_queue,
144
+ num_prefetch_threads=6, # one per category (no-op for simple strategy)
145
+ total_buffer_capacity=96, # e.g., ~16 per thread
146
+ prefetch_poll_interval=0.002, # faster polling for responsiveness
147
+ prefetch_non_immediate=True, # enable prefetch for non-immediate categories
148
+ strategy=strategy,
149
+ )
150
+
151
+ self._logger.info(
152
+ "MessageBrokerTaskSourceStage initialized. Base task queue: %s | Derived queues: %s",
153
+ self.task_queue,
154
+ {
155
+ "immediate": f"{self.task_queue}_immediate",
156
+ "micro": f"{self.task_queue}_micro",
157
+ "small": f"{self.task_queue}_small",
158
+ "medium": f"{self.task_queue}_medium",
159
+ "large": f"{self.task_queue}_large",
160
+ "default": f"{self.task_queue}",
161
+ },
162
+ )
134
163
 
135
164
  # --- Private helper methods ---
136
165
  def _create_client(self):
@@ -261,14 +290,21 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
261
290
 
262
291
  return control_message
263
292
 
264
- def _fetch_message(self, timeout=100):
293
+ def _fetch_message(self, timeout=0):
265
294
  """
266
- Fetch a message from the message broker.
295
+ Fetch a message from the message broker using fair scheduling across derived queues.
296
+ This is a non-blocking sweep across all queues for the current scheduling cycle. If no
297
+ message is found across any queue, return None so the caller can sleep briefly.
267
298
  """
268
299
  try:
269
- job = self.client.fetch_message(self.task_queue, timeout)
300
+ # Use scheduler to fetch next. In simple strategy this will block up to poll_interval on base queue.
301
+ job = self.scheduler.fetch_next(self.client, timeout=self.config.poll_interval)
270
302
  if job is None:
271
- self._logger.debug("No message received from '%s'", self.task_queue)
303
+ self._logger.debug(
304
+ "No message received from derived queues for base "
305
+ "'%s' (immediate, micro, small, medium, large, default)",
306
+ self.task_queue,
307
+ )
272
308
  # Do not treat normal empty polls as failures
273
309
  self._fetch_failure_count = 0
274
310
  self._current_backoff_sleep = 0.0
@@ -332,7 +368,8 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
332
368
  Instead of reading from an input edge, fetch a message from the broker.
333
369
  """
334
370
  self._logger.debug("read_input: calling _fetch_message()")
335
- job = self._fetch_message(timeout=100)
371
+ # Perform a non-blocking sweep across all queues for this cycle
372
+ job = self._fetch_message(timeout=0)
336
373
  if job is None:
337
374
  # Sleep for either the configured poll interval or the current backoff, whichever is larger
338
375
  sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
@@ -16,6 +16,7 @@ from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings
16
16
  from nv_ingest_api.util.exception_handlers.decorators import (
17
17
  nv_ingest_node_failure_try_except,
18
18
  )
19
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
19
20
 
20
21
  from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
21
22
 
@@ -68,7 +69,7 @@ class EmbeddingStorageStage(RayActorStage):
68
69
 
69
70
  # Remove the "store_embedding" task from the message to obtain task-specific configuration.
70
71
  task_config = remove_task_by_type(control_message, "store_embedding")
71
- logger.debug("Extracted task config: %s", task_config)
72
+ logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
72
73
 
73
74
  # Perform embedding storage.
74
75
  new_df = store_text_embeddings_internal(
@@ -18,6 +18,7 @@ from nv_ingest_api.internal.transform.caption_image import transform_image_creat
18
18
  from nv_ingest_api.util.exception_handlers.decorators import (
19
19
  nv_ingest_node_failure_try_except,
20
20
  )
21
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
21
22
 
22
23
  logger = logging.getLogger(__name__)
23
24
 
@@ -67,7 +68,10 @@ class ImageCaptionTransformStage(RayActorStage):
67
68
 
68
69
  # Remove the "caption" task to obtain task-specific configuration.
69
70
  task_config = remove_task_by_type(control_message, "caption")
70
- logger.debug("ImageCaptionTransformStage: Task configuration extracted: %s", pprint.pformat(task_config))
71
+ logger.debug(
72
+ "ImageCaptionTransformStage: Task configuration extracted: %s",
73
+ pprint.pformat(sanitize_for_logging(task_config)),
74
+ )
71
75
 
72
76
  # Call the caption extraction function.
73
77
  new_df = transform_image_create_vlm_caption_internal(
@@ -15,6 +15,7 @@ from nv_ingest_api.internal.transform.embed_text import transform_create_text_em
15
15
  from nv_ingest_api.util.exception_handlers.decorators import (
16
16
  nv_ingest_node_failure_try_except,
17
17
  )
18
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
18
19
 
19
20
  from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
20
21
 
@@ -62,7 +63,10 @@ class TextEmbeddingTransformStage(RayActorStage):
62
63
 
63
64
  # Remove the "embed" task to obtain task-specific configuration.
64
65
  task_config = remove_task_by_type(control_message, "embed")
65
- self._logger.debug("TextEmbeddingTransformStage: Task configuration extracted: %s", pprint.pformat(task_config))
66
+ self._logger.debug(
67
+ "TextEmbeddingTransformStage: Task configuration extracted: %s",
68
+ pprint.pformat(sanitize_for_logging(task_config)),
69
+ )
66
70
 
67
71
  # Call the text embedding extraction function.
68
72
  new_df, execution_trace_log = transform_create_text_embeddings_internal(