nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (66)
  1. nv_ingest/api/__init__.py +6 -0
  2. nv_ingest/api/main.py +2 -0
  3. nv_ingest/api/tracing.py +82 -0
  4. nv_ingest/api/v2/README.md +203 -0
  5. nv_ingest/api/v2/__init__.py +3 -0
  6. nv_ingest/api/v2/ingest.py +1300 -0
  7. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  9. nv_ingest/framework/orchestration/execution/options.py +112 -0
  10. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
  12. nv_ingest/framework/orchestration/process/execution.py +495 -0
  13. nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
  14. nv_ingest/framework/orchestration/process/strategies.py +218 -0
  15. nv_ingest/framework/orchestration/process/termination.py +147 -0
  16. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
  17. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  18. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
  19. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  20. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
  21. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
  22. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
  23. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
  24. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
  25. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  26. nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
  28. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  29. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
  30. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
  31. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  32. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  33. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
  34. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
  35. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
  36. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  37. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  38. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
  39. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
  40. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
  41. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  42. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  43. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
  44. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
  45. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
  46. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  47. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  48. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  49. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  50. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  51. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  52. nv_ingest/pipeline/__init__.py +3 -0
  53. nv_ingest/pipeline/config/__init__.py +3 -0
  54. nv_ingest/pipeline/config/loaders.py +229 -0
  55. nv_ingest/pipeline/config/replica_resolver.py +237 -0
  56. nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
  57. nv_ingest/pipeline/default_pipeline_impl.py +557 -0
  58. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  59. nv_ingest/pipeline/pipeline_schema.py +398 -0
  60. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
  61. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
  62. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  63. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  64. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
  65. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
  66. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py
@@ -3,12 +3,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
-from typing import Dict, Any
+from typing import Dict, Any, Optional
 
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
 from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 from nv_ingest_api.internal.mutate.filter import filter_images_internal
 from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
 from nv_ingest_api.internal.primitives.tracing.tagging import traceable
@@ -16,6 +17,7 @@ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema impo
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
 
 logger = logging.getLogger(__name__)
 
@@ -31,18 +33,19 @@ class ImageFilterStage(RayActorStage):
     3. Updates the message payload with the filtered DataFrame.
     """
 
-    def __init__(self, config: ImageFilterSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: ImageFilterSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
-            logger.info("ImageFilterStage configuration validated successfully.")
+            logger.debug("ImageFilterStage configuration validated successfully.")
         except Exception as e:
             logger.exception(f"Error validating Image Filter config: {e}")
             raise
 
-    @traceable("image_filter")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["filter"])
-    @nv_ingest_node_failure_try_except(annotation_id="image_filter", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by filtering images.
@@ -57,7 +60,7 @@
         IngestControlMessage
             The updated message with filtered images in the payload.
         """
-        logger.info("ImageFilterStage.on_data: Starting image filtering process.")
+        logger.debug("ImageFilterStage.on_data: Starting image filtering process.")
 
         # Extract the DataFrame payload.
         df_ledger = control_message.payload()
@@ -65,7 +68,7 @@
 
         # Remove the "filter" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "filter")
-        logger.debug("Extracted task config: %s", task_config)
+        logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
 
         task_params: Dict[str, Any] = task_config.get("params", {})
 
@@ -76,7 +79,7 @@
             mutate_config=self.validated_config,
             execution_trace_log=None,
         )
-        logger.info("Image filtering completed. Resulting DataFrame has %d rows.", len(new_df))
+        logger.debug("Image filtering completed. Resulting DataFrame has %d rows.", len(new_df))
 
         # Update the message payload with the filtered DataFrame.
         control_message.payload(new_df)
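
Note the decorator reordering above, which recurs in the storage and embedding stages below: the failure guard moves from innermost (below @filter_by_task, with a hard-coded annotation_id) to outermost, so it now also covers errors raised by tracing and the new UDF hook. A minimal sketch of why order matters, using stand-in decorators rather than the nv_ingest implementations:

# Stand-in decorators only: this illustrates Python decorator ordering, not
# the real nv_ingest behavior. Decorators apply bottom-up, so the top-most
# wrapper runs first at call time and sees errors from everything beneath it.
def tag(name):
    def wrap(fn):
        def inner(*args, **kwargs):
            print(f"enter {name}")
            try:
                return fn(*args, **kwargs)
            finally:
                print(f"exit {name}")
        return inner
    return wrap

@tag("failure_guard")   # like nv_ingest_node_failure_try_except(): now outermost
@tag("traceable")
@tag("udf_hook")
@tag("filter_by_task")  # innermost: still gates whether the body runs
def on_data(message):
    return message

on_data({})
# Prints enter failure_guard, traceable, udf_hook, filter_by_task, then exits in reverse.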
nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py
@@ -2,7 +2,7 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_sink_stage_base import RayActorSinkStage
@@ -12,13 +12,13 @@ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_fail
 
 @ray.remote
 class DefaultDrainSink(RayActorSinkStage):
-    def __init__(self, config: Any) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: Any, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
 
         self._last_sunk_count = 0
         self._sunk_count = 0
 
-    @nv_ingest_node_failure_try_except(annotation_id="drain_sink", raise_on_failure=False)
+    @nv_ingest_node_failure_try_except()
     def on_data(self, message: IngestControlMessage) -> IngestControlMessage:
         self._sunk_count += 1
 
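As above, each constructor now accepts an optional stage_name and forwards it to the base class, and the no-argument @nv_ingest_node_failure_try_except() suggests annotations are derived from the stage itself rather than a hard-coded annotation_id. A sketch of that pattern under that assumption (simplified stand-in classes, not the shipped RayActorSinkStage):

from typing import Any, Optional

class StageBase:
    """Simplified stand-in for RayActorSinkStage; not the shipped class."""

    def __init__(self, config: Any, stage_name: Optional[str] = None) -> None:
        # Assumed behavior: fall back to the class name when the pipeline
        # does not supply an explicit stage name.
        self.stage_name = stage_name or type(self).__name__
        self.config = config

class DrainSinkSketch(StageBase):
    def __init__(self, config: Any, stage_name: Optional[str] = None) -> None:
        super().__init__(config, stage_name=stage_name)

print(DrainSinkSketch({}, stage_name="drain_sink").stage_name)  # drain_sink
print(DrainSinkSketch({}).stage_name)                           # DrainSinkSketch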
nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py
@@ -14,6 +14,8 @@ from nv_ingest_api.internal.primitives.tracing.logging import annotate_cm
 from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
 from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
 
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
+
 logger = logging.getLogger(__name__)
 
 
@@ -75,8 +77,8 @@ class MessageBrokerTaskSinkConfig(BaseModel):
 
 @ray.remote
 class MessageBrokerTaskSinkStage(RayActorStage):
-    def __init__(self, config: MessageBrokerTaskSinkConfig) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: MessageBrokerTaskSinkConfig, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
 
         self.config: MessageBrokerTaskSinkConfig
 
@@ -224,6 +226,7 @@
 
     # --- Public API Methods for message broker sink ---
 
+    @udf_intercept_hook()
    def on_data(self, control_message: Any) -> Any:
        """
        Processes the control message and pushes the resulting JSON payloads to the broker.
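
The @udf_intercept_hook() decorator comes from the new udf_intercept module (+352 lines, not shown in this diff), so its exact behavior is not visible here. As a rough, assumed illustration only, such a hook can look up a user-defined function attached to the control message and apply it before the stage's own on_data:

import functools

def udf_intercept_hook():
    """Assumed sketch only; the real hook lives in udf_intercept.py."""

    def wrap(fn):
        @functools.wraps(fn)
        def inner(self, control_message):
            # Hypothetical lookup: a UDF registered on the message for this stage.
            lookup = getattr(control_message, "get_udf", None)
            udf = lookup(getattr(self, "stage_name", None)) if callable(lookup) else None
            if callable(udf):
                control_message = udf(control_message)
            return fn(self, control_message)

        return inner

    return wrap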
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py
@@ -3,9 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
-import multiprocessing
 import uuid
-import socket
 from typing import Optional, Literal, Dict, Any, Union
 
 import ray
@@ -13,6 +11,7 @@ import json
 import copy
 import threading
 import time
+import random
 from datetime import datetime
 
 import pandas as pd
@@ -30,6 +29,8 @@ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_inges
 # Import clients
 from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
 from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+from nv_ingest_api.util.message_brokers.qos_scheduler import QosScheduler
 
 logger = logging.getLogger(__name__)
 
@@ -89,8 +90,10 @@ class MessageBrokerTaskSourceConfig(BaseModel):
 
     # Use the discriminated union for broker_client
     broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
-    task_queue: str = Field(..., description="The name of the queue to fetch tasks from.")
-    poll_interval: float = Field(default=0.1, gt=0, description="Polling interval in seconds.")
+    task_queue: str = Field(
+        ..., description="The base name of the queue to fetch tasks from. Derives sub-queues for fair scheduling."
+    )
+    poll_interval: float = Field(default=0.0, gt=0, description="Polling interval in seconds.")
 
 
 @ray.remote
@@ -102,11 +105,14 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
     """
 
     # Use the updated config type hint
-    def __init__(self, config: MessageBrokerTaskSourceConfig) -> None:
-        super().__init__(config, log_to_stdout=False)
-        self.config: MessageBrokerTaskSourceConfig  # Add type hint for self.config
+    def __init__(self, config: MessageBrokerTaskSourceConfig, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
+        self.config: MessageBrokerTaskSourceConfig  # Add a type hint for self.config
+
+        # Sanitize config before logging to avoid leaking secrets
+        _sanitized = sanitize_for_logging(config)
         self._logger.debug(
-            "Initializing MessageBrokerTaskSourceStage with config: %s", config.dict()
+            "Initializing MessageBrokerTaskSourceStage with config: %s", _sanitized
         )  # Log validated config
 
         # Access validated configuration directly via self.config
@@ -126,13 +132,40 @@
         self._pause_event = threading.Event()
         self._pause_event.set()  # Initially not paused
 
-        self._logger.debug("MessageBrokerTaskSourceStage initialized. Task queue: %s", self.task_queue)
+        # Backoff state for graceful retries when broker is unavailable
+        self._fetch_failure_count: int = 0
+        self._current_backoff_sleep: float = 0.0
+        self._last_backoff_log_time: float = 0.0
+
+        # Initialize QoS scheduler. Use a simple base-queue strategy for SimpleClient.
+        strategy = "simple" if isinstance(self.client, SimpleClient) else "lottery"
+        self.scheduler = QosScheduler(
+            self.task_queue,
+            num_prefetch_threads=6,  # one per category (no-op for simple strategy)
+            total_buffer_capacity=96,  # e.g., ~16 per thread
+            prefetch_poll_interval=0.002,  # faster polling for responsiveness
+            prefetch_non_immediate=True,  # enable prefetch for non-immediate categories
+            strategy=strategy,
+        )
+
+        self._logger.info(
+            "MessageBrokerTaskSourceStage initialized. Base task queue: %s | Derived queues: %s",
+            self.task_queue,
+            {
+                "immediate": f"{self.task_queue}_immediate",
+                "micro": f"{self.task_queue}_micro",
+                "small": f"{self.task_queue}_small",
+                "medium": f"{self.task_queue}_medium",
+                "large": f"{self.task_queue}_large",
+                "default": f"{self.task_queue}",
+            },
+        )
 
     # --- Private helper methods ---
     def _create_client(self):
         # Access broker config via self.config.broker_client
         broker_config = self.config.broker_client
-        self._logger.info("Creating client of type: %s", broker_config.client_type)
+        self._logger.debug("Creating client of type: %s", broker_config.client_type)
 
         if broker_config.client_type == "redis":
             client = RedisClient(
@@ -257,14 +290,24 @@
 
         return control_message
 
-    def _fetch_message(self, timeout=100):
+    def _fetch_message(self, timeout=0):
         """
-        Fetch a message from the message broker.
+        Fetch a message from the message broker using fair scheduling across derived queues.
+        This is a non-blocking sweep across all queues for the current scheduling cycle. If no
+        message is found across any queue, return None so the caller can sleep briefly.
         """
         try:
-            job = self.client.fetch_message(self.task_queue, timeout)
+            # Use scheduler to fetch next. In simple strategy this will block up to poll_interval on base queue.
+            job = self.scheduler.fetch_next(self.client, timeout=self.config.poll_interval)
             if job is None:
-                self._logger.debug("No message received from '%s'", self.task_queue)
+                self._logger.debug(
+                    "No message received from derived queues for base "
+                    "'%s' (immediate, micro, small, medium, large, default)",
+                    self.task_queue,
+                )
+                # Do not treat normal empty polls as failures
+                self._fetch_failure_count = 0
+                self._current_backoff_sleep = 0.0
                 return None
             self._logger.debug("Received message type: %s", type(job))
             if isinstance(job, BaseModel):
@@ -277,12 +320,46 @@
                     return None
                 job = json.loads(job.response)
             self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
+            # Success: reset backoff state
+            self._fetch_failure_count = 0
+            self._current_backoff_sleep = 0.0
             return job
         except TimeoutError:
             self._logger.debug("Timeout waiting for message")
+            # Timeout is not a connectivity failure; do not escalate backoff
            return None
        except Exception as err:
-            self._logger.exception("Error during message fetching: %s", err)
+            # Connectivity or other fetch issue: apply graceful backoff and avoid stacktrace spam
+            self._fetch_failure_count += 1
+
+            # Compute exponential backoff with jitter, capped by configured max_backoff
+            try:
+                max_backoff = getattr(self.config.broker_client, "max_backoff", 5.0)
+            except Exception:
+                max_backoff = 5.0
+            # Start from 0.5s, double each failure
+            base = 0.5
+            backoff_no_jitter = min(max_backoff, base * (2 ** (self._fetch_failure_count - 1)))
+            jitter = random.uniform(0, backoff_no_jitter * 0.2)
+            self._current_backoff_sleep = backoff_no_jitter + jitter
+
+            now = time.time()
+            # Throttle warning logs to at most once per 5 seconds to avoid spam
+            if now - self._last_backoff_log_time >= 5.0:
+                self._logger.warning(
+                    "Broker fetch failed (%d consecutive failures). Backing off for %.2fs. Error: %s",
+                    self._fetch_failure_count,
+                    self._current_backoff_sleep,
+                    err,
+                )
+                self._last_backoff_log_time = now
+            else:
+                self._logger.debug(
+                    "Broker fetch failed (%d). Backoff %.2fs. Error: %s",
+                    self._fetch_failure_count,
+                    self._current_backoff_sleep,
+                    err,
+                )
             return None
 
     def _read_input(self) -> any:
@@ -291,10 +368,20 @@
         Instead of reading from an input edge, fetch a message from the broker.
         """
         self._logger.debug("read_input: calling _fetch_message()")
-        job = self._fetch_message(timeout=100)
+        # Perform a non-blocking sweep across all queues for this cycle
+        job = self._fetch_message(timeout=0)
         if job is None:
-            self._logger.debug("read_input: No job received, sleeping for poll_interval: %s", self.config.poll_interval)
-            time.sleep(self.config.poll_interval)
+            # Sleep for either the configured poll interval or the current backoff, whichever is larger
+            sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
+            self._logger.debug(
+                "read_input: No job received; sleeping %.2fs (poll_interval=%.2fs, backoff=%.2fs)",
+                sleep_time,
+                self.config.poll_interval,
+                getattr(self, "_current_backoff_sleep", 0.0),
+            )
+            time.sleep(sleep_time)
+            # Reset one-shot backoff so that repeated failures recompute progressively
+            self._current_backoff_sleep = 0.0
 
             return None
 
@@ -314,7 +401,7 @@
         This loop fetches messages from the broker and writes them to the output queue,
         but blocks on the pause event when the stage is paused.
         """
-        self._logger.info("Processing loop started")
+        self._logger.debug("Processing loop started")
         iteration = 0
         while self._running:
             iteration += 1
@@ -381,25 +468,25 @@
                 self._active_processing = False
         self._shutdown_signal_complete = True
 
-        self._logger.info("Processing loop ending")
+        self._logger.debug("Processing loop ending")
 
     @ray.method(num_returns=1)
     def start(self) -> bool:
         if self._running:
-            self._logger.info("Start called but stage is already running.")
+            self._logger.warning("Start called but stage is already running.")
             return False
         self._running = True
         self.start_time = time.time()
         self._message_count = 0
-        self._logger.info("Starting processing loop thread.")
+        self._logger.debug("Starting processing loop thread.")
         threading.Thread(target=self._processing_loop, daemon=True).start()
-        self._logger.info("MessageBrokerTaskSourceStage started.")
+        self._logger.debug("MessageBrokerTaskSourceStage started.")
         return True
 
     @ray.method(num_returns=1)
     def stop(self) -> bool:
         self._running = False
-        self._logger.info("Stop called on MessageBrokerTaskSourceStage")
+        self._logger.debug("Stop called on MessageBrokerTaskSourceStage")
         return True
 
     @ray.method(num_returns=1)
@@ -425,7 +512,7 @@
     @ray.method(num_returns=1)
     def set_output_queue(self, queue_handle: any) -> bool:
         self.output_queue = queue_handle
-        self._logger.info("Output queue set: %s", queue_handle)
+        self._logger.debug("Output queue set: %s", queue_handle)
         return True
 
     @ray.method(num_returns=1)
@@ -440,7 +527,7 @@
         True after the stage is paused.
         """
         self._pause_event.clear()
-        self._logger.info("Stage paused.")
+        self._logger.debug("Stage paused.")
 
         return True
 
@@ -456,7 +543,7 @@
         True after the stage is resumed.
         """
         self._pause_event.set()
-        self._logger.info("Stage resumed.")
+        self._logger.debug("Stage resumed.")
         return True
 
     @ray.method(num_returns=1)
@@ -466,49 +553,9 @@
         This method pauses the stage, waits for any current processing to finish,
         replaces the output queue, and then resumes the stage.
         """
-        self._logger.info("Swapping output queue: pausing stage first.")
+        self._logger.debug("Swapping output queue: pausing stage first.")
         self.pause()
         self.set_output_queue(new_queue)
-        self._logger.info("Output queue swapped. Resuming stage.")
+        self._logger.debug("Output queue swapped. Resuming stage.")
         self.resume()
         return True
-
-
-def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
-    """
-    Starts a SimpleMessageBroker server in a separate process.
-
-    Parameters
-    ----------
-    broker_client : dict
-        Broker configuration. Expected keys include:
-        - "port": the port to bind the server to,
-        - "broker_params": optionally including "max_queue_size",
-        - and any other parameters required by SimpleMessageBroker.
-
-    Returns
-    -------
-    multiprocessing.Process
-        The process running the SimpleMessageBroker server.
-    """
-
-    def broker_server():
-        from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
-
-        # Use max_queue_size from broker_params or default to 10000.
-        broker_params = broker_client.get("broker_params", {})
-        max_queue_size = broker_params.get("max_queue_size", 10000)
-        server_host = broker_client.get("host", "0.0.0.0")
-        server_port = broker_client.get("port", 7671)
-        # Optionally, set socket options here for reuse.
-        server = SimpleMessageBroker(server_host, server_port, max_queue_size)
-        # Enable address reuse on the server socket.
-        server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        server.serve_forever()
-
-    p = multiprocessing.Process(target=broker_server)
-    p.daemon = False
-    p.start()
-    logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")
-
-    return p
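
The backoff added to _fetch_message starts at 0.5 s, doubles per consecutive failure, caps at the broker client's max_backoff (5.0 s when absent), and adds up to 20% uniform jitter. The same computation in isolation:

import random

def backoff_sleep(failure_count: int, max_backoff: float = 5.0) -> float:
    # Mirrors the constants in the diff: 0.5 s base, doubling, capped, plus up to 20% jitter.
    base = 0.5
    no_jitter = min(max_backoff, base * (2 ** (failure_count - 1)))
    return no_jitter + random.uniform(0, no_jitter * 0.2)

# Pre-jitter values for failures 1..5: 0.5, 1.0, 2.0, 4.0, 5.0 (8.0 capped);
# printed values include the jitter.
print([round(backoff_sleep(n), 2) for n in range(1, 6)])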
nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py
@@ -3,13 +3,16 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
-from typing import Dict, Any
+import os
+from typing import Dict, Any, Optional
+from urllib.parse import urlparse
 
 import pandas as pd
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
 from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 from nv_ingest_api.internal.enums.common import ContentTypeEnum
 from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
 from nv_ingest_api.internal.primitives.tracing.tagging import traceable
@@ -25,14 +28,15 @@ logger = logging.getLogger(__name__)
 
 @ray.remote
 class ImageStorageStage(RayActorStage):
     """
-    A Ray actor stage that stores images or structured content in MinIO and updates metadata with storage URLs.
+    A Ray actor stage that stores images or structured content using an fsspec-compatible backend and updates
+    metadata with storage URLs.
 
     This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
     payload and updates the control message accordingly.
     """
 
-    def __init__(self, config: ImageStorageModuleSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: ImageStorageModuleSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
             logger.info("ImageStorageStage configuration validated successfully.")
@@ -40,9 +44,10 @@
             logger.exception("Error validating image storage config")
             raise e
 
-    @traceable("image_storage")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["store"])
-    @nv_ingest_node_failure_try_except(annotation_id="image_storage", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by storing images or structured content.
@@ -67,8 +72,16 @@
         task_config = remove_task_by_type(control_message, "store")
         # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))
 
-        store_structured: bool = task_config.get("structured", True)
-        store_unstructured: bool = task_config.get("images", False)
+        stage_defaults = {
+            "structured": self.validated_config.structured,
+            "images": self.validated_config.images,
+            "storage_uri": self.validated_config.storage_uri,
+            "storage_options": self.validated_config.storage_options,
+            "public_base_url": self.validated_config.public_base_url,
+        }
+
+        store_structured: bool = task_config.get("structured", stage_defaults["structured"])
+        store_unstructured: bool = task_config.get("images", stage_defaults["images"])
 
         content_types: Dict[Any, Any] = {}
         if store_structured:
@@ -78,14 +91,34 @@
             content_types[ContentTypeEnum.IMAGE] = store_unstructured
 
         params: Dict[str, Any] = task_config.get("params", {})
-        params["content_types"] = content_types
 
-        logger.debug(f"Processing storage task with parameters: {params}")
+        storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
+        storage_options = {
+            **(stage_defaults["storage_options"] or {}),
+            **(task_config.get("storage_options") or {}),
+            **params.get("storage_options", {}),
+        }
+        if "public_base_url" in task_config:
+            public_base_url = task_config["public_base_url"]
+        else:
+            public_base_url = params.get("public_base_url", stage_defaults["public_base_url"])
+
+        storage_options = self._inject_storage_defaults(storage_uri, storage_options)
+
+        storage_params: Dict[str, Any] = {
+            "content_types": content_types,
+            "storage_uri": storage_uri,
+            "storage_options": storage_options,
+        }
+        if public_base_url:
+            storage_params["public_base_url"] = public_base_url
+
+        logger.debug("Processing storage task with parameters: %s", storage_params)
 
         # Store images or structured content.
         df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
             df_storage_ledger=df_payload,
-            task_config=params,
+            task_config=storage_params,
             storage_config={},
             execution_trace_log=None,
         )
@@ -96,3 +129,38 @@
 
         control_message.payload(df_storage_ledger)
 
         return control_message
+
+    @staticmethod
+    def _inject_storage_defaults(storage_uri: str, storage_options: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Populate storage options for common backends (e.g., MinIO/S3) using environment defaults.
+        """
+        parsed_scheme = urlparse(storage_uri).scheme.lower()
+        merged_options: Dict[str, Any] = {k: v for k, v in storage_options.items() if v is not None}
+
+        if parsed_scheme not in {"s3", "s3a", "s3n"}:
+            return merged_options
+
+        def _set_if_absent(key: str, env_var: str) -> None:
+            if key not in merged_options and env_var in os.environ:
+                merged_options[key] = os.environ[env_var]
+
+        _set_if_absent("key", "MINIO_ACCESS_KEY")
+        _set_if_absent("secret", "MINIO_SECRET_KEY")
+        if "token" not in merged_options and os.environ.get("MINIO_SESSION_TOKEN"):
+            merged_options["token"] = os.environ["MINIO_SESSION_TOKEN"]
+
+        client_kwargs = dict(merged_options.get("client_kwargs", {}))
+        endpoint = os.environ.get("MINIO_INTERNAL_ADDRESS")
+        if not endpoint:
+            endpoint = "http://minio:9000"
+        if endpoint and not endpoint.startswith(("http://", "https://")):
+            endpoint = f"http://{endpoint}"
+        client_kwargs.setdefault("endpoint_url", endpoint)
+        region = os.environ.get("MINIO_REGION")
+        if region:
+            client_kwargs.setdefault("region_name", region)
+        if client_kwargs:
+            merged_options["client_kwargs"] = client_kwargs
+
+        return merged_options
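
Storage settings are now resolved in three layers, with later layers winning: stage defaults, then the task's top-level storage_options, then params["storage_options"]; _inject_storage_defaults then drops None values and, for s3/s3a/s3n URIs, fills missing credentials and endpoint from MINIO_* environment variables. The dict-merge precedence in isolation, with made-up values:

# Made-up values; only the precedence is the point. Later unpacks win.
stage_defaults = {"key": "stage-access-key", "secret": "stage-secret"}
task_options = {"key": "task-access-key"}
params_options = {"secret": "param-secret"}

merged = {**(stage_defaults or {}), **(task_options or {}), **params_options}
assert merged == {"key": "task-access-key", "secret": "param-secret"}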
nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
+from typing import Optional
 
 import ray
 
@@ -15,6 +16,9 @@ from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 
 logger = logging.getLogger(__name__)
 
@@ -30,8 +34,8 @@
     3. Updates the message payload with the stored embeddings DataFrame.
     """
 
-    def __init__(self, config: EmbeddingStorageSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: EmbeddingStorageSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
             logger.info("EmbeddingStorageStage configuration validated successfully.")
@@ -39,9 +43,10 @@
             logger.exception(f"Error validating Embedding Storage config: {e}")
             raise
 
-    @traceable("embedding_storage")
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["store_embedding"])
-    @nv_ingest_node_failure_try_except(annotation_id="embedding_storage", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by storing embeddings.
@@ -64,7 +69,7 @@
 
         # Remove the "store_embedding" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "store_embedding")
-        logger.debug("Extracted task config: %s", task_config)
+        logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
 
         # Perform embedding storage.
         new_df = store_text_embeddings_internal(
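
sanitize_for_logging now wraps task configs before they hit the debug log, here and in the filter and source stages above. Its implementation is not part of this diff; a typical redaction approach, offered only as an assumed illustration:

from typing import Any

# Assumed illustration; the shipped sanitize_for_logging may differ.
SENSITIVE_FRAGMENTS = ("password", "secret", "token", "api_key", "access_key")

def sanitize_for_logging_sketch(obj: Any) -> Any:
    if isinstance(obj, dict):
        return {
            key: "***REDACTED***"
            if any(frag in str(key).lower() for frag in SENSITIVE_FRAGMENTS)
            else sanitize_for_logging_sketch(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return type(obj)(sanitize_for_logging_sketch(item) for item in obj)
    return obj

print(sanitize_for_logging_sketch({"task": "store_embedding", "api_key": "abc123"}))
# {'task': 'store_embedding', 'api_key': '***REDACTED***'}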