nv-ingest 2025.9.15.dev20250915__py3-none-any.whl → 2025.12.13.dev20251213__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. nv_ingest/api/__init__.py +6 -0
  2. nv_ingest/api/main.py +2 -0
  3. nv_ingest/api/tracing.py +82 -0
  4. nv_ingest/api/v2/README.md +203 -0
  5. nv_ingest/api/v2/__init__.py +3 -0
  6. nv_ingest/api/v2/ingest.py +1300 -0
  7. nv_ingest/framework/orchestration/process/dependent_services.py +17 -10
  8. nv_ingest/framework/orchestration/process/execution.py +6 -0
  9. nv_ingest/framework/orchestration/process/strategies.py +6 -2
  10. nv_ingest/framework/orchestration/process/termination.py +49 -9
  11. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +11 -11
  12. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -2
  13. nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  14. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +41 -8
  15. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +72 -6
  16. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +40 -0
  17. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  18. nv_ingest/pipeline/config/replica_resolver.py +12 -2
  19. nv_ingest/pipeline/default_libmode_pipeline_impl.py +40 -25
  20. nv_ingest/pipeline/default_pipeline_impl.py +83 -40
  21. {nv_ingest-2025.9.15.dev20250915.dist-info → nv_ingest-2025.12.13.dev20251213.dist-info}/METADATA +5 -2
  22. {nv_ingest-2025.9.15.dev20250915.dist-info → nv_ingest-2025.12.13.dev20251213.dist-info}/RECORD +25 -20
  23. {nv_ingest-2025.9.15.dev20250915.dist-info → nv_ingest-2025.12.13.dev20251213.dist-info}/WHEEL +0 -0
  24. {nv_ingest-2025.9.15.dev20250915.dist-info → nv_ingest-2025.12.13.dev20251213.dist-info}/licenses/LICENSE +0 -0
  25. {nv_ingest-2025.9.15.dev20250915.dist-info → nv_ingest-2025.12.13.dev20251213.dist-info}/top_level.txt +0 -0
@@ -18,6 +18,18 @@ from nv_ingest_api.util.message_brokers.simple_message_broker.broker import Simp
 logger = logging.getLogger(__name__)


+def _broker_server_target(host, port, max_queue_size):
+    """
+    Target function to be run in a separate process for the SimpleMessageBroker.
+    """
+    server = SimpleMessageBroker(host, port, max_queue_size)
+    try:
+        server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    except Exception:
+        pass
+    server.serve_forever()
+
+
 def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
     """
     Starts a SimpleMessageBroker server in a separate process.
@@ -58,16 +70,11 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
         f"continuing to spawn a broker process (tests expect a Process to be returned)"
     )

-    def broker_server():
-        # Optionally, set socket options here for reuse (note: binding occurs in server __init__).
-        server = SimpleMessageBroker(server_host, server_port, max_queue_size)
-        try:
-            server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        except Exception:
-            pass
-        server.serve_forever()
-
-    p = multiprocessing.Process(target=broker_server)
+    p = multiprocessing.Process(
+        target=_broker_server_target,
+        args=(server_host, server_port, max_queue_size),
+        daemon=True,
+    )
     # If we're launching from inside the pipeline subprocess, mark daemon so the
     # broker dies automatically when the subprocess exits.
     p.daemon = os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1"
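
Note: the broker target moved from a nested closure to the module-level _broker_server_target so that it stays picklable; the "spawn" start method (used elsewhere in this release on macOS) serializes the target, and nested functions cannot be pickled. A minimal standard-library sketch of that constraint, with illustrative names:

import multiprocessing

def module_level_target(msg):
    # Picklable because it is defined at module scope, so "spawn" can import it.
    print(msg)

if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    # A closure defined inside this block would fail to pickle under "spawn";
    # the module-level function starts cleanly on every platform.
    p = ctx.Process(target=module_level_target, args=("broker up",), daemon=True)
    p.start()
    p.join()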
@@ -162,6 +162,11 @@ def build_logging_config_from_env() -> LoggingConfig:
             if key not in os.environ:
                 os.environ[key] = default_value

+        # For PRODUCTION mode, also suppress nv-ingest module INFO logs
+        if preset_level == "PRODUCTION":
+            logging.getLogger("nv_ingest").setLevel(logging.WARNING)
+            logging.getLogger("nv_ingest_api").setLevel(logging.WARNING)
+
         logger.info(f"Applied Ray logging preset: {preset_level}")

     # Get log level from environment, default to INFO
@@ -324,6 +329,7 @@ def launch_pipeline(
     pipeline_config = resolve_static_replicas(pipeline_config)

     # Pretty print the final pipeline configuration (after replica resolution)
+    # INFO level so it shows in docker/helm deployments; quiet mode suppresses in library mode
     pretty_output = pretty_print_pipeline_config(pipeline_config, config_path=None)
     logger.info("\n" + pretty_output)

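Note: setting WARNING on the "nv_ingest" and "nv_ingest_api" parent loggers silences every module beneath them, because Python child loggers inherit the nearest ancestor's effective level. A quick standard-library illustration:

import logging

logging.basicConfig(level=logging.INFO)
logging.getLogger("nv_ingest").setLevel(logging.WARNING)

# Any nv_ingest.* child logger now has an effective level of WARNING.
logging.getLogger("nv_ingest.pipeline").info("suppressed")
logging.getLogger("nv_ingest.pipeline").warning("still emitted")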
@@ -11,9 +11,10 @@ Strategy pattern for clean separation of execution concerns.
 """

 import atexit
-import os
 import logging
 import multiprocessing
+import os
+import sys
 import time
 from abc import ABC, abstractmethod

@@ -132,7 +133,10 @@ class SubprocessStrategy(ProcessExecutionStrategy):
         logger.info("Launching pipeline in Python subprocess using multiprocessing.")

         # Create subprocess using fork context
-        ctx = multiprocessing.get_context("fork")
+        start_method = "fork"
+        if sys.platform.lower() == "darwin":
+            start_method = "spawn"
+        ctx = multiprocessing.get_context(start_method)
         process = ctx.Process(
             target=run_pipeline_process,
             args=(
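
Note: the switch above avoids fork() on macOS, where forking after threads have started is unreliable (CPython made "spawn" the macOS default in 3.8). A self-contained sketch of the same selection, with an illustrative child function:

import multiprocessing
import sys

def child():
    print("hello from", multiprocessing.current_process().name)

if __name__ == "__main__":
    # "fork" is cheap on Linux; "spawn" sidesteps fork-after-thread crashes on
    # macOS at the cost of re-importing this module in the child process.
    method = "spawn" if sys.platform == "darwin" else "fork"
    ctx = multiprocessing.get_context(method)
    p = ctx.Process(target=child)
    p.start()
    p.join()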
@@ -19,20 +19,45 @@ logger = logging.getLogger(__name__)


 def _safe_log(level: int, msg: str) -> None:
-    """Best-effort logging that won't crash during interpreter shutdown."""
+    """Best-effort logging that won't emit handler tracebacks on closed streams.
+
+    Temporarily disables logging.raiseExceptions to prevent the logging module
+    from printing "--- Logging error ---" to stderr if a handler's stream is
+    already closed (common during process teardown). Falls back to writing to
+    sys.__stderr__ if available.
+    """
     try:
-        logger.log(level, msg)
+        import logging as _logging
+
+        prev = getattr(_logging, "raiseExceptions", True)
+        # Suppress handler errors being printed to stderr
+        _logging.raiseExceptions = False
+
+        # If there are no handlers, skip and use stderr fallback
+        if logger.handlers:
+            logger.log(level, msg)
+            return
     except Exception:
+        # Intentionally ignore and try stderr fallback
+        pass
+    finally:
         try:
-            # Fallback to stderr if available
-            import sys
+            import logging as _logging  # re-import safe even if earlier failed

-            if hasattr(sys, "__stderr__") and sys.__stderr__:
-                sys.__stderr__.write(msg + "\n")
-                sys.__stderr__.flush()
+            _logging.raiseExceptions = prev  # type: ignore[name-defined]
         except Exception:
             pass

+    # Fallback to stderr if available
+    try:
+        import sys
+
+        if hasattr(sys, "__stderr__") and sys.__stderr__:
+            sys.__stderr__.write(msg + "\n")
+            sys.__stderr__.flush()
+    except Exception:
+        pass
+

 def kill_pipeline_process_group(process) -> None:
     """
@@ -74,7 +99,17 @@ def kill_pipeline_process_group(process) -> None:

     try:
         # Send graceful termination to the entire process group
-        os.killpg(os.getpgid(pid), signal.SIGTERM)
+        try:
+            pgid = os.getpgid(pid)
+        except Exception:
+            # Process already gone
+            _safe_log(logging.DEBUG, f"Process group for PID {pid} not found during SIGTERM phase")
+            return
+        try:
+            os.killpg(pgid, signal.SIGTERM)
+        except ProcessLookupError:
+            _safe_log(logging.DEBUG, f"Process group for PID {pid} no longer exists (SIGTERM)")
+            return

         # If we have a Process handle, give it a chance to exit cleanly
         if proc is not None and hasattr(proc, "join"):
@@ -95,7 +130,12 @@ def kill_pipeline_process_group(process) -> None:
         if still_alive:
             _safe_log(logging.WARNING, "Process group did not terminate gracefully, using SIGKILL")
             try:
-                os.killpg(os.getpgid(pid), signal.SIGKILL)
+                try:
+                    pgid2 = os.getpgid(pid)
+                except Exception:
+                    _safe_log(logging.DEBUG, f"Process group for PID {pid} vanished before SIGKILL")
+                    return
+                os.killpg(pgid2, signal.SIGKILL)
             finally:
                 if proc is not None and hasattr(proc, "join"):
                     try:
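
Note: both hunks guard the same race: the target process can exit between os.getpgid() and os.killpg(), so each call gets its own handler and a quiet DEBUG exit path. The pattern condensed into a hypothetical helper (not part of the package):

import os
import signal

def signal_group(pid: int, sig: int = signal.SIGTERM) -> bool:
    """Best-effort signal to pid's process group; False if it is already gone."""
    try:
        pgid = os.getpgid(pid)  # may raise ProcessLookupError if pid already exited
        os.killpg(pgid, sig)    # may raise if the group vanished in between
        return True
    except ProcessLookupError:
        return False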
@@ -150,13 +150,13 @@ if __name__ == "__main__":
     os.environ["OCR_GRPC_ENDPOINT"] = "localhost:8010"
     os.environ["OCR_INFER_PROTOCOL"] = "grpc"
     os.environ["OCR_MODEL_NAME"] = "paddle"
-    os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
+    os.environ["NEMOTRON_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
     os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
-    os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
+    os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/nemotron-nano-12b-v2-vl"
     logger.info("Environment variables set.")

     image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
-    model_name = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
+    model_name = "nvidia/nemotron-nano-12b-v2-vl"
     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
     (
         yolox_table_structure_grpc,
@@ -170,23 +170,23 @@
         yolox_graphic_elements_auth,
         yolox_graphic_elements_protocol,
     ) = get_nim_service("yolox_graphic_elements")
-    nemoretriever_parse_grpc, nemoretriever_parse_http, nemoretriever_parse_auth, nemoretriever_parse_protocol = (
-        get_nim_service("nemoretriever_parse")
+    nemotron_parse_grpc, nemotron_parse_http, nemotron_parse_auth, nemotron_parse_protocol = get_nim_service(
+        "nemotron_parse"
     )
     ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")

-    model_name = os.environ.get("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")
+    model_name = os.environ.get("NEMOTRON_PARSE_MODEL_NAME", "nvidia/nemotron-parse")
     pdf_extractor_config = {
         "pdfium_config": {
             "auth_token": yolox_auth,  # All auth tokens are the same for the moment
             "yolox_endpoints": (yolox_grpc, yolox_http),
             "yolox_infer_protocol": yolox_protocol,
         },
-        "nemoretriever_parse_config": {
-            "auth_token": nemoretriever_parse_auth,
-            "nemoretriever_parse_endpoints": (nemoretriever_parse_grpc, nemoretriever_parse_http),
-            "nemoretriever_parse_infer_protocol": nemoretriever_parse_protocol,
-            "nemoretriever_parse_model_name": model_name,
+        "nemotron_parse_config": {
+            "auth_token": nemotron_parse_auth,
+            "nemotron_parse_endpoints": (nemotron_parse_grpc, nemotron_parse_http),
+            "nemotron_parse_infer_protocol": nemotron_parse_protocol,
+            "nemotron_parse_model_name": model_name,
             "yolox_endpoints": (yolox_grpc, yolox_http),
             "yolox_infer_protocol": yolox_protocol,
         },
@@ -5,7 +5,6 @@

 import logging
 from typing import Optional
-
 import ray

 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
@@ -67,7 +66,6 @@ class AudioExtractorStage(RayActorStage):
         # Extract the DataFrame payload.
         df_ledger = control_message.payload()
         self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
-
         # Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "extract")
         self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
@@ -0,0 +1,71 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import ray
+
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
+from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
+from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
+from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
+from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
+from typing import Optional
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote
+class OCRExtractorStage(RayActorStage):
+    """
+    A Ray actor stage that extracts text data from image content.
+
+    It expects an IngestControlMessage containing a DataFrame with image data. It then:
+    1. Removes the "text_data_extract" task from the message.
+    2. Calls the text extraction logic using a validated configuration.
+    3. Updates the message payload with the extracted text DataFrame.
+    """
+
+    def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
+        try:
+            self.validated_config = config
+            self._logger.info("OCRExtractorStage configuration validated successfully.")
+        except Exception as e:
+            self._logger.exception(f"Error validating Text extractor config: {e}")
+            raise
+
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
+    @filter_by_task(required_tasks=["ocr_data_extract"])
+    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+        # Extract DataFrame payload
+        df_ledger = control_message.payload()
+        if df_ledger.empty:
+            return control_message
+
+        # Remove the "text_data_extract" task from the message
+        task_config = remove_task_by_type(control_message, "ocr_data_extract")
+
+        execution_trace_log = {}
+        new_df, extraction_info = extract_text_data_from_image_internal(
+            df_extraction_ledger=df_ledger,
+            task_config=task_config,
+            extraction_config=self.validated_config,
+            execution_trace_log=execution_trace_log,
+        )
+
+        control_message.payload(new_df)
+        control_message.set_metadata("ocr_extraction_info", extraction_info)
+
+        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
+        if do_trace_tagging and execution_trace_log:
+            parent_name = self.stage_name if self.stage_name else "ocr_extractor"
+            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
+
+        return control_message
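
Note: OCRExtractorStage follows the same actor-stage shape as the other extractors: a @ray.remote class whose on_data method receives a control message, replaces its DataFrame payload, and returns it, with failure handling, tracing, UDF interception, and task filtering layered on as decorators. A stripped-down sketch of just the actor mechanics, using illustrative names rather than nv-ingest APIs:

import ray

@ray.remote
class PassthroughStage:
    def on_data(self, message: dict) -> dict:
        # A real stage pulls a DataFrame payload, runs extraction, and
        # re-attaches the result; this stub only annotates the message.
        message["seen"] = True
        return message

if __name__ == "__main__":
    stage = PassthroughStage.remote()
    print(ray.get(stage.on_data.remote({"payload": "doc"})))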
@@ -30,6 +30,7 @@ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_inges
 from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
 from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
 from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+from nv_ingest_api.util.message_brokers.qos_scheduler import QosScheduler

 logger = logging.getLogger(__name__)

@@ -89,8 +90,10 @@ class MessageBrokerTaskSourceConfig(BaseModel):

     # Use the discriminated union for broker_client
     broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
-    task_queue: str = Field(..., description="The name of the queue to fetch tasks from.")
-    poll_interval: float = Field(default=0.1, gt=0, description="Polling interval in seconds.")
+    task_queue: str = Field(
+        ..., description="The base name of the queue to fetch tasks from. Derives sub-queues for fair scheduling."
+    )
+    poll_interval: float = Field(default=0.0, gt=0, description="Polling interval in seconds.")


 @ray.remote
@@ -134,7 +137,29 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         self._current_backoff_sleep: float = 0.0
         self._last_backoff_log_time: float = 0.0

-        self._logger.debug("MessageBrokerTaskSourceStage initialized. Task queue: %s", self.task_queue)
+        # Initialize QoS scheduler. Use a simple base-queue strategy for SimpleClient.
+        strategy = "simple" if isinstance(self.client, SimpleClient) else "lottery"
+        self.scheduler = QosScheduler(
+            self.task_queue,
+            num_prefetch_threads=6,  # one per category (no-op for simple strategy)
+            total_buffer_capacity=96,  # e.g., ~16 per thread
+            prefetch_poll_interval=0.002,  # faster polling for responsiveness
+            prefetch_non_immediate=True,  # enable prefetch for non-immediate categories
+            strategy=strategy,
+        )
+
+        self._logger.info(
+            "MessageBrokerTaskSourceStage initialized. Base task queue: %s | Derived queues: %s",
+            self.task_queue,
+            {
+                "immediate": f"{self.task_queue}_immediate",
+                "micro": f"{self.task_queue}_micro",
+                "small": f"{self.task_queue}_small",
+                "medium": f"{self.task_queue}_medium",
+                "large": f"{self.task_queue}_large",
+                "default": f"{self.task_queue}",
+            },
+        )

     # --- Private helper methods ---
     def _create_client(self):
@@ -265,14 +290,21 @@

         return control_message

-    def _fetch_message(self, timeout=100):
+    def _fetch_message(self, timeout=0):
         """
-        Fetch a message from the message broker.
+        Fetch a message from the message broker using fair scheduling across derived queues.
+        This is a non-blocking sweep across all queues for the current scheduling cycle. If no
+        message is found across any queue, return None so the caller can sleep briefly.
         """
         try:
-            job = self.client.fetch_message(self.task_queue, timeout)
+            # Use scheduler to fetch next. In simple strategy this will block up to poll_interval on base queue.
+            job = self.scheduler.fetch_next(self.client, timeout=self.config.poll_interval)
             if job is None:
-                self._logger.debug("No message received from '%s'", self.task_queue)
+                self._logger.debug(
+                    "No message received from derived queues for base "
+                    "'%s' (immediate, micro, small, medium, large, default)",
+                    self.task_queue,
+                )
                 # Do not treat normal empty polls as failures
                 self._fetch_failure_count = 0
                 self._current_backoff_sleep = 0.0
@@ -336,7 +368,8 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         Instead of reading from an input edge, fetch a message from the broker.
         """
         self._logger.debug("read_input: calling _fetch_message()")
-        job = self._fetch_message(timeout=100)
+        # Perform a non-blocking sweep across all queues for this cycle
+        job = self._fetch_message(timeout=0)
         if job is None:
             # Sleep for either the configured poll interval or the current backoff, whichever is larger
             sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
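
Note: the source stage now sweeps a family of queues derived from the base task_queue name (<base>_immediate, <base>_micro, <base>_small, <base>_medium, <base>_large, plus the base itself as the default). A simplified sketch of one non-blocking sweep; the real QosScheduler layers prefetch threads and the "lottery" strategy on top of this idea:

CATEGORIES = ("immediate", "micro", "small", "medium", "large")

def derived_queues(base):
    # e.g. "ingest" -> ["ingest_immediate", ..., "ingest_large", "ingest"]
    return [f"{base}_{c}" for c in CATEGORIES] + [base]

def sweep_once(client, base):
    """One fair pass over the derived queues; None when every queue is empty."""
    for queue in derived_queues(base):
        job = client.fetch_message(queue, 0)  # timeout=0: never block
        if job is not None:
            return job
    return None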
@@ -3,7 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0

 import logging
+import os
 from typing import Dict, Any, Optional
+from urllib.parse import urlparse

 import pandas as pd
 import ray
@@ -26,7 +28,8 @@ logger = logging.getLogger(__name__)
 @ray.remote
 class ImageStorageStage(RayActorStage):
     """
-    A Ray actor stage that stores images or structured content in MinIO and updates metadata with storage URLs.
+    A Ray actor stage that stores images or structured content using an fsspec-compatible backend and updates
+    metadata with storage URLs.

     This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
     payload and updates the control message accordingly.
@@ -69,8 +72,16 @@
         task_config = remove_task_by_type(control_message, "store")
         # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))

-        store_structured: bool = task_config.get("structured", True)
-        store_unstructured: bool = task_config.get("images", False)
+        stage_defaults = {
+            "structured": self.validated_config.structured,
+            "images": self.validated_config.images,
+            "storage_uri": self.validated_config.storage_uri,
+            "storage_options": self.validated_config.storage_options,
+            "public_base_url": self.validated_config.public_base_url,
+        }
+
+        store_structured: bool = task_config.get("structured", stage_defaults["structured"])
+        store_unstructured: bool = task_config.get("images", stage_defaults["images"])

         content_types: Dict[Any, Any] = {}
         if store_structured:
@@ -80,14 +91,34 @@
             content_types[ContentTypeEnum.IMAGE] = store_unstructured

         params: Dict[str, Any] = task_config.get("params", {})
-        params["content_types"] = content_types

-        logger.debug(f"Processing storage task with parameters: {params}")
+        storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
+        storage_options = {
+            **(stage_defaults["storage_options"] or {}),
+            **(task_config.get("storage_options") or {}),
+            **params.get("storage_options", {}),
+        }
+        if "public_base_url" in task_config:
+            public_base_url = task_config["public_base_url"]
+        else:
+            public_base_url = params.get("public_base_url", stage_defaults["public_base_url"])
+
+        storage_options = self._inject_storage_defaults(storage_uri, storage_options)
+
+        storage_params: Dict[str, Any] = {
+            "content_types": content_types,
+            "storage_uri": storage_uri,
+            "storage_options": storage_options,
+        }
+        if public_base_url:
+            storage_params["public_base_url"] = public_base_url
+
+        logger.debug("Processing storage task with parameters: %s", storage_params)

         # Store images or structured content.
         df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
             df_storage_ledger=df_payload,
-            task_config=params,
+            task_config=storage_params,
             storage_config={},
             execution_trace_log=None,
         )
@@ -98,3 +129,38 @@
         control_message.payload(df_storage_ledger)

         return control_message
+
+    @staticmethod
+    def _inject_storage_defaults(storage_uri: str, storage_options: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Populate storage options for common backends (e.g., MinIO/S3) using environment defaults.
+        """
+        parsed_scheme = urlparse(storage_uri).scheme.lower()
+        merged_options: Dict[str, Any] = {k: v for k, v in storage_options.items() if v is not None}
+
+        if parsed_scheme not in {"s3", "s3a", "s3n"}:
+            return merged_options
+
+        def _set_if_absent(key: str, env_var: str) -> None:
+            if key not in merged_options and env_var in os.environ:
+                merged_options[key] = os.environ[env_var]
+
+        _set_if_absent("key", "MINIO_ACCESS_KEY")
+        _set_if_absent("secret", "MINIO_SECRET_KEY")
+        if "token" not in merged_options and os.environ.get("MINIO_SESSION_TOKEN"):
+            merged_options["token"] = os.environ["MINIO_SESSION_TOKEN"]
+
+        client_kwargs = dict(merged_options.get("client_kwargs", {}))
+        endpoint = os.environ.get("MINIO_INTERNAL_ADDRESS")
+        if not endpoint:
+            endpoint = "http://minio:9000"
+        if endpoint and not endpoint.startswith(("http://", "https://")):
+            endpoint = f"http://{endpoint}"
+        client_kwargs.setdefault("endpoint_url", endpoint)
+        region = os.environ.get("MINIO_REGION")
+        if region:
+            client_kwargs.setdefault("region_name", region)
+        if client_kwargs:
+            merged_options["client_kwargs"] = client_kwargs
+
+        return merged_options
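
Note: the merged options follow the fsspec/s3fs convention ("key", "secret", "token", and client_kwargs["endpoint_url"] for the MinIO endpoint), so the result can be handed directly to an fsspec filesystem. A hedged sketch with made-up credentials and a hypothetical bucket, assuming the s3fs extra is installed:

import fsspec

storage_options = {
    "key": "minioadmin",     # stands in for MINIO_ACCESS_KEY
    "secret": "minioadmin",  # stands in for MINIO_SECRET_KEY
    "client_kwargs": {"endpoint_url": "http://minio:9000"},
}

fs = fsspec.filesystem("s3", **storage_options)
with fs.open("my-bucket/images/page_0.png", "wb") as f:  # hypothetical path
    f.write(b"...image bytes...")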
@@ -3,6 +3,7 @@

 import logging
+import os
 from typing import Union, Optional, TextIO


@@ -23,6 +24,34 @@ from nv_ingest.framework.orchestration.execution.helpers import (
 logger = logging.getLogger(__name__)


+def _configure_quiet_mode():
+    """
+    Configure environment for quiet/production logging in library mode.
+
+    Sets INGEST_RAY_LOG_LEVEL=PRODUCTION if not already set by user, which:
+    - Sets Ray logging to ERROR level (suppresses INFO/WARNING)
+    - Disables Ray usage stats collection
+    - Disables Ray import warnings
+
+    Also silences other common warnings that are noisy in library mode.
+    """
+    # Only set if user hasn't explicitly configured
+    if "INGEST_RAY_LOG_LEVEL" not in os.environ:
+        os.environ["INGEST_RAY_LOG_LEVEL"] = "PRODUCTION"
+
+    # Silence Ray accelerator env var warning
+    if "RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO" not in os.environ:
+        os.environ["RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO"] = "0"
+
+    # Disable OTEL tracing export errors (no collector expected in library mode)
+    if "OTEL_SDK_DISABLED" not in os.environ:
+        os.environ["OTEL_SDK_DISABLED"] = "true"
+
+    # Set nv-ingest module loggers to WARNING to suppress INFO level startup messages
+    logging.getLogger("nv_ingest").setLevel(logging.WARNING)
+    logging.getLogger("nv_ingest_api").setLevel(logging.WARNING)
+
+
 def run_pipeline(
     pipeline_config: Optional[PipelineConfigSchema] = None,
     block: bool = True,
@@ -32,6 +61,7 @@ def run_pipeline(
     stdout: Optional[TextIO] = None,
     stderr: Optional[TextIO] = None,
     libmode: bool = True,
+    quiet: Optional[bool] = None,
 ) -> Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]:
     """
     Launch and manage a pipeline using configuration.
@@ -65,6 +95,10 @@
     libmode : bool, default=True
         If True and pipeline_config is None, loads the default libmode pipeline configuration.
         If False, requires pipeline_config to be provided.
+    quiet : Optional[bool], default=None
+        If True, configures logging for minimal output (PRODUCTION preset, suppresses
+        INFO-level startup messages). If None, defaults to True when libmode=True.
+        Set to False to see verbose startup logs even in library mode.

     Returns
     -------
@@ -83,6 +117,12 @@
     Exception
         Any other exceptions raised during pipeline launch or configuration.
     """
+    # Configure quiet mode for library mode by default (unless explicitly disabled)
+    if quiet is None:
+        quiet = libmode
+    if quiet:
+        _configure_quiet_mode()
+
     # Resolve configuration
     config = resolve_pipeline_config(pipeline_config, libmode)
     overrides = create_runtime_overrides(disable_dynamic_scaling, dynamic_memory_threshold)
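
Note: with this change, library-mode callers get quiet startup by default, since quiet=None resolves to quiet=libmode. A usage sketch, assuming run_pipeline is imported from the pipeline_runners module listed above:

from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_runners import run_pipeline

# libmode defaults to True, so quiet resolves to True: PRODUCTION logging
# preset, OTEL export disabled, nv-ingest loggers at WARNING.
pipeline = run_pipeline(block=False)

# Opt back into verbose startup logs while staying in library mode:
pipeline_verbose = run_pipeline(block=False, quiet=False)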