nv-ingest 2025.10.4.dev20251004__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +203 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +1300 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +17 -10
- nv_ingest/framework/orchestration/process/strategies.py +6 -2
- nv_ingest/framework/orchestration/process/termination.py +49 -9
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +41 -8
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +72 -6
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest/pipeline/config/replica_resolver.py +12 -2
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +32 -18
- nv_ingest/pipeline/default_pipeline_impl.py +75 -33
- {nv_ingest-2025.10.4.dev20251004.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +4 -2
- {nv_ingest-2025.10.4.dev20251004.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +23 -18
- {nv_ingest-2025.10.4.dev20251004.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.10.4.dev20251004.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.10.4.dev20251004.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
nv_ingest/framework/orchestration/process/dependent_services.py

@@ -18,6 +18,18 @@ from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
 logger = logging.getLogger(__name__)
 
 
+def _broker_server_target(host, port, max_queue_size):
+    """
+    Target function to be run in a separate process for the SimpleMessageBroker.
+    """
+    server = SimpleMessageBroker(host, port, max_queue_size)
+    try:
+        server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    except Exception:
+        pass
+    server.serve_forever()
+
+
 def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
     """
     Starts a SimpleMessageBroker server in a separate process.
@@ -58,16 +70,11 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
            f"continuing to spawn a broker process (tests expect a Process to be returned)"
        )
 
-
-
-
-
-
-        except Exception:
-            pass
-        server.serve_forever()
-
-    p = multiprocessing.Process(target=broker_server)
+    p = multiprocessing.Process(
+        target=_broker_server_target,
+        args=(server_host, server_port, max_queue_size),
+        daemon=True,
+    )
     # If we're launching from inside the pipeline subprocess, mark daemon so the
     # broker dies automatically when the subprocess exits.
     p.daemon = os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1"
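
The broker's serve loop now lives in the module-level `_broker_server_target`, so the `multiprocessing.Process` target is picklable and the child can be marked `daemon=True` to die with its parent. A minimal sketch of the same pattern, using a stand-in `socketserver` echo server rather than nv-ingest's `SimpleMessageBroker`; host, port, and class names below are placeholders:

```python
import multiprocessing
import socketserver


class _ReusableEchoServer(socketserver.TCPServer):
    allow_reuse_address = True  # mirrors the SO_REUSEADDR setsockopt in the diff


class _EchoHandler(socketserver.BaseRequestHandler):
    def handle(self):
        # Echo one request back; stands in for the broker's real work.
        self.request.sendall(self.request.recv(4096))


def _server_target(host: str, port: int) -> None:
    # Module-level target: picklable, so it works with both "fork" and "spawn" start methods.
    with _ReusableEchoServer((host, port), _EchoHandler) as server:
        server.serve_forever()


if __name__ == "__main__":
    p = multiprocessing.Process(target=_server_target, args=("127.0.0.1", 7671), daemon=True)
    p.start()
    # ... interact with the server, then let it die with the parent or stop it explicitly:
    p.terminate()
    p.join()
```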
nv_ingest/framework/orchestration/process/strategies.py

@@ -11,9 +11,10 @@ Strategy pattern for clean separation of execution concerns.
 """
 
 import atexit
-import os
 import logging
 import multiprocessing
+import os
+import sys
 import time
 from abc import ABC, abstractmethod
 
@@ -132,7 +133,10 @@ class SubprocessStrategy(ProcessExecutionStrategy):
         logger.info("Launching pipeline in Python subprocess using multiprocessing.")
 
         # Create subprocess using fork context
-        ctx = multiprocessing.get_context("fork")
+        start_method = "fork"
+        if sys.platform.lower() == "darwin":
+            start_method = "spawn"
+        ctx = multiprocessing.get_context(start_method)
         process = ctx.Process(
             target=run_pipeline_process,
             args=(
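
`SubprocessStrategy` now selects the start method per platform instead of hard-coding fork. A small sketch of the same decision with a placeholder `worker` function; note that on Windows only "spawn" exists, a case neither the diff nor this snippet handles:

```python
import multiprocessing
import sys


def worker(name: str) -> None:
    print(f"pipeline subprocess started for {name}")


def launch_pipeline_subprocess() -> multiprocessing.Process:
    # macOS forks poorly in multi-threaded programs, so fall back to "spawn" there.
    start_method = "spawn" if sys.platform == "darwin" else "fork"
    ctx = multiprocessing.get_context(start_method)
    process = ctx.Process(target=worker, args=("demo",))
    process.start()
    return process


if __name__ == "__main__":
    launch_pipeline_subprocess().join()
```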
nv_ingest/framework/orchestration/process/termination.py

@@ -19,20 +19,45 @@ logger = logging.getLogger(__name__)
 
 
 def _safe_log(level: int, msg: str) -> None:
-    """Best-effort logging that won't
+    """Best-effort logging that won't emit handler tracebacks on closed streams.
+
+    Temporarily disables logging.raiseExceptions to prevent the logging module
+    from printing "--- Logging error ---" to stderr if a handler's stream is
+    already closed (common during process teardown). Falls back to writing to
+    sys.__stderr__ if available.
+    """
     try:
-
+        import logging as _logging
+
+        prev = getattr(_logging, "raiseExceptions", True)
+        # Suppress handler errors being printed to stderr
+        _logging.raiseExceptions = False
+
+        # If there are no handlers, skip and use stderr fallback
+        if logger.handlers:
+            logger.log(level, msg)
+            return
     except Exception:
+        # Intentionally ignore and try stderr fallback
+        pass
+    finally:
         try:
-            #
-            import sys
+            import logging as _logging  # re-import safe even if earlier failed
 
-
-            sys.__stderr__.write(msg + "\n")
-            sys.__stderr__.flush()
+            _logging.raiseExceptions = prev  # type: ignore[name-defined]
         except Exception:
             pass
 
+    # Fallback to stderr if available
+    try:
+        import sys
+
+        if hasattr(sys, "__stderr__") and sys.__stderr__:
+            sys.__stderr__.write(msg + "\n")
+            sys.__stderr__.flush()
+    except Exception:
+        pass
+
 
 def kill_pipeline_process_group(process) -> None:
     """
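
The core of `_safe_log` is toggling `logging.raiseExceptions` off so a closed handler stream during teardown cannot print "--- Logging error ---" tracebacks. A condensed, standalone sketch of that suppression pattern; this is a generic helper, not the nv-ingest function itself:

```python
import logging
import sys

logger = logging.getLogger(__name__)


def log_quietly(level: int, msg: str) -> None:
    # While this flag is False, the logging module swallows handler errors instead of
    # printing "--- Logging error ---" tracebacks to stderr.
    prev = logging.raiseExceptions
    logging.raiseExceptions = False
    try:
        logger.log(level, msg)
    except Exception:
        # Last resort: the original, unredirected stderr, if it still exists.
        if sys.__stderr__:
            sys.__stderr__.write(msg + "\n")
    finally:
        logging.raiseExceptions = prev
```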
@@ -74,7 +99,17 @@ def kill_pipeline_process_group(process) -> None:
 
     try:
         # Send graceful termination to the entire process group
-
+        try:
+            pgid = os.getpgid(pid)
+        except Exception:
+            # Process already gone
+            _safe_log(logging.DEBUG, f"Process group for PID {pid} not found during SIGTERM phase")
+            return
+        try:
+            os.killpg(pgid, signal.SIGTERM)
+        except ProcessLookupError:
+            _safe_log(logging.DEBUG, f"Process group for PID {pid} no longer exists (SIGTERM)")
+            return
 
         # If we have a Process handle, give it a chance to exit cleanly
         if proc is not None and hasattr(proc, "join"):
@@ -95,7 +130,12 @@ def kill_pipeline_process_group(process) -> None:
         if still_alive:
             _safe_log(logging.WARNING, "Process group did not terminate gracefully, using SIGKILL")
             try:
-
+                try:
+                    pgid2 = os.getpgid(pid)
+                except Exception:
+                    _safe_log(logging.DEBUG, f"Process group for PID {pid} vanished before SIGKILL")
+                    return
+                os.killpg(pgid2, signal.SIGKILL)
             finally:
                 if proc is not None and hasattr(proc, "join"):
                     try:
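
Both the SIGTERM and SIGKILL phases now resolve the process group id separately and treat a missing group as "already exited" rather than letting `ProcessLookupError` escape. A condensed sketch of that escalation on POSIX, assuming `pid` leads its own process group; the grace period and helper name are illustrative:

```python
import logging
import os
import signal
import time

logger = logging.getLogger(__name__)


def terminate_process_group(pid: int, grace_seconds: float = 5.0) -> None:
    """Best-effort SIGTERM, then SIGKILL, of the process group led by ``pid``."""
    try:
        pgid = os.getpgid(pid)
    except ProcessLookupError:
        logger.debug("Process group for PID %s not found; nothing to do", pid)
        return

    try:
        os.killpg(pgid, signal.SIGTERM)
    except ProcessLookupError:
        return

    deadline = time.monotonic() + grace_seconds
    while time.monotonic() < deadline:
        try:
            os.killpg(pgid, 0)  # signal 0: existence check only
        except ProcessLookupError:
            return  # graceful shutdown worked
        time.sleep(0.1)

    try:
        os.killpg(pgid, signal.SIGKILL)
    except ProcessLookupError:
        pass
```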
nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py

@@ -152,11 +152,11 @@ if __name__ == "__main__":
     os.environ["OCR_MODEL_NAME"] = "paddle"
     os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
     os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
-    os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/
+    os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/nemotron-nano-12b-v2-vl"
     logger.info("Environment variables set.")
 
     image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
-    model_name = "nvidia/
+    model_name = "nvidia/nemotron-nano-12b-v2-vl"
     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
     (
         yolox_table_structure_grpc,
nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py

@@ -5,7 +5,6 @@
 
 import logging
 from typing import Optional
-
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage

@@ -67,7 +66,6 @@ class AudioExtractorStage(RayActorStage):
         # Extract the DataFrame payload.
         df_ledger = control_message.payload()
         self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
-
         # Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "extract")
         self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py (new file)

@@ -0,0 +1,71 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import ray
+
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
+from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
+from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
+from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
+from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
+from typing import Optional
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote
+class OCRExtractorStage(RayActorStage):
+    """
+    A Ray actor stage that extracts text data from image content.
+
+    It expects an IngestControlMessage containing a DataFrame with image data. It then:
+      1. Removes the "text_data_extract" task from the message.
+      2. Calls the text extraction logic using a validated configuration.
+      3. Updates the message payload with the extracted text DataFrame.
+    """
+
+    def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
+        try:
+            self.validated_config = config
+            self._logger.info("OCRExtractorStage configuration validated successfully.")
+        except Exception as e:
+            self._logger.exception(f"Error validating Text extractor config: {e}")
+            raise
+
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
+    @filter_by_task(required_tasks=["ocr_data_extract"])
+    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+        # Extract DataFrame payload
+        df_ledger = control_message.payload()
+        if df_ledger.empty:
+            return control_message
+
+        # Remove the "text_data_extract" task from the message
+        task_config = remove_task_by_type(control_message, "ocr_data_extract")
+
+        execution_trace_log = {}
+        new_df, extraction_info = extract_text_data_from_image_internal(
+            df_extraction_ledger=df_ledger,
+            task_config=task_config,
+            extraction_config=self.validated_config,
+            execution_trace_log=execution_trace_log,
+        )
+
+        control_message.payload(new_df)
+        control_message.set_metadata("ocr_extraction_info", extraction_info)
+
+        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
+        if do_trace_tagging and execution_trace_log:
+            parent_name = self.stage_name if self.stage_name else "ocr_extractor"
+            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
+
+        return control_message
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py

@@ -30,6 +30,7 @@ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_ingest_job
 from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
 from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
 from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+from nv_ingest_api.util.message_brokers.qos_scheduler import QosScheduler
 
 logger = logging.getLogger(__name__)
 
@@ -89,8 +90,10 @@ class MessageBrokerTaskSourceConfig(BaseModel):
 
     # Use the discriminated union for broker_client
     broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
-    task_queue: str = Field(
-
+    task_queue: str = Field(
+        ..., description="The base name of the queue to fetch tasks from. Derives sub-queues for fair scheduling."
+    )
+    poll_interval: float = Field(default=0.0, gt=0, description="Polling interval in seconds.")
 
 
 @ray.remote
@@ -134,7 +137,29 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         self._current_backoff_sleep: float = 0.0
         self._last_backoff_log_time: float = 0.0
 
-
+        # Initialize QoS scheduler. Use a simple base-queue strategy for SimpleClient.
+        strategy = "simple" if isinstance(self.client, SimpleClient) else "lottery"
+        self.scheduler = QosScheduler(
+            self.task_queue,
+            num_prefetch_threads=6,  # one per category (no-op for simple strategy)
+            total_buffer_capacity=96,  # e.g., ~16 per thread
+            prefetch_poll_interval=0.002,  # faster polling for responsiveness
+            prefetch_non_immediate=True,  # enable prefetch for non-immediate categories
+            strategy=strategy,
+        )
+
+        self._logger.info(
+            "MessageBrokerTaskSourceStage initialized. Base task queue: %s | Derived queues: %s",
+            self.task_queue,
+            {
+                "immediate": f"{self.task_queue}_immediate",
+                "micro": f"{self.task_queue}_micro",
+                "small": f"{self.task_queue}_small",
+                "medium": f"{self.task_queue}_medium",
+                "large": f"{self.task_queue}_large",
+                "default": f"{self.task_queue}",
+            },
+        )
 
     # --- Private helper methods ---
     def _create_client(self):
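
The initialization log above documents the queue-derivation convention the QoS scheduler builds on: each size category maps to `<base>_<category>`, and the base queue itself serves as the default bucket. A small sketch of just that naming rule; the category names come from the log statement, while the helper itself is illustrative:

```python
from typing import Dict

QOS_CATEGORIES = ("immediate", "micro", "small", "medium", "large")


def derive_queue_names(base_queue: str) -> Dict[str, str]:
    """Map QoS categories to the derived queue names used for fair scheduling."""
    queues = {category: f"{base_queue}_{category}" for category in QOS_CATEGORIES}
    queues["default"] = base_queue  # the base queue doubles as the default bucket
    return queues


print(derive_queue_names("ingest_task_queue"))
# {'immediate': 'ingest_task_queue_immediate', ..., 'default': 'ingest_task_queue'}
```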
@@ -265,14 +290,21 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
 
         return control_message
 
-    def _fetch_message(self, timeout=
+    def _fetch_message(self, timeout=0):
         """
-        Fetch a message from the message broker.
+        Fetch a message from the message broker using fair scheduling across derived queues.
+        This is a non-blocking sweep across all queues for the current scheduling cycle. If no
+        message is found across any queue, return None so the caller can sleep briefly.
         """
         try:
-
+            # Use scheduler to fetch next. In simple strategy this will block up to poll_interval on base queue.
+            job = self.scheduler.fetch_next(self.client, timeout=self.config.poll_interval)
             if job is None:
-                self._logger.debug(
+                self._logger.debug(
+                    "No message received from derived queues for base "
+                    "'%s' (immediate, micro, small, medium, large, default)",
+                    self.task_queue,
+                )
                 # Do not treat normal empty polls as failures
                 self._fetch_failure_count = 0
                 self._current_backoff_sleep = 0.0
@@ -336,7 +368,8 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         Instead of reading from an input edge, fetch a message from the broker.
         """
         self._logger.debug("read_input: calling _fetch_message()")
-
+        # Perform a non-blocking sweep across all queues for this cycle
+        job = self._fetch_message(timeout=0)
         if job is None:
             # Sleep for either the configured poll interval or the current backoff, whichever is larger
             sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
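
With `_fetch_message(timeout=0)` acting as a non-blocking sweep, `read_input` treats an empty poll as normal and simply sleeps for the larger of the poll interval and the current backoff. A minimal sketch of that caller pattern; the function names here are illustrative, not the stage's API:

```python
import time
from typing import Callable, Optional


def poll_until_job(
    fetch: Callable[[], Optional[dict]],
    poll_interval: float,
    current_backoff: Callable[[], float],
) -> dict:
    """Spin until ``fetch`` yields a job; empty sweeps sleep instead of counting as failures."""
    while True:
        job = fetch()  # non-blocking sweep across all derived queues
        if job is not None:
            return job
        time.sleep(max(poll_interval, current_backoff()))
```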
nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py

@@ -3,7 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
+import os
 from typing import Dict, Any, Optional
+from urllib.parse import urlparse
 
 import pandas as pd
 import ray
@@ -26,7 +28,8 @@ logger = logging.getLogger(__name__)
 @ray.remote
 class ImageStorageStage(RayActorStage):
     """
-    A Ray actor stage that stores images or structured content
+    A Ray actor stage that stores images or structured content using an fsspec-compatible backend and updates
+    metadata with storage URLs.
 
     This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
     payload and updates the control message accordingly.
@@ -69,8 +72,16 @@ class ImageStorageStage(RayActorStage):
         task_config = remove_task_by_type(control_message, "store")
         # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))
 
-
-
+        stage_defaults = {
+            "structured": self.validated_config.structured,
+            "images": self.validated_config.images,
+            "storage_uri": self.validated_config.storage_uri,
+            "storage_options": self.validated_config.storage_options,
+            "public_base_url": self.validated_config.public_base_url,
+        }
+
+        store_structured: bool = task_config.get("structured", stage_defaults["structured"])
+        store_unstructured: bool = task_config.get("images", stage_defaults["images"])
 
         content_types: Dict[Any, Any] = {}
         if store_structured:
@@ -80,14 +91,34 @@ class ImageStorageStage(RayActorStage):
             content_types[ContentTypeEnum.IMAGE] = store_unstructured
 
         params: Dict[str, Any] = task_config.get("params", {})
-        params["content_types"] = content_types
 
-
+        storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
+        storage_options = {
+            **(stage_defaults["storage_options"] or {}),
+            **(task_config.get("storage_options") or {}),
+            **params.get("storage_options", {}),
+        }
+        if "public_base_url" in task_config:
+            public_base_url = task_config["public_base_url"]
+        else:
+            public_base_url = params.get("public_base_url", stage_defaults["public_base_url"])
+
+        storage_options = self._inject_storage_defaults(storage_uri, storage_options)
+
+        storage_params: Dict[str, Any] = {
+            "content_types": content_types,
+            "storage_uri": storage_uri,
+            "storage_options": storage_options,
+        }
+        if public_base_url:
+            storage_params["public_base_url"] = public_base_url
+
+        logger.debug("Processing storage task with parameters: %s", storage_params)
 
         # Store images or structured content.
         df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
             df_storage_ledger=df_payload,
-            task_config=
+            task_config=storage_params,
             storage_config={},
             execution_trace_log=None,
         )
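
The storage task now resolves its settings with an explicit precedence: stage defaults are overridden by top-level task config, which is in turn overridden by values nested under `params`. A tiny sketch of that merge order using plain dict unpacking; the values are made up:

```python
stage_defaults = {"storage_options": {"anon": False, "key": "stage-default-key"}}
task_config = {"storage_options": {"key": "task-level-key"}}
params = {"storage_options": {"secret": "param-level-secret"}}

# Later sources win: stage defaults < task_config < params.
storage_options = {
    **(stage_defaults.get("storage_options") or {}),
    **(task_config.get("storage_options") or {}),
    **params.get("storage_options", {}),
}
print(storage_options)
# {'anon': False, 'key': 'task-level-key', 'secret': 'param-level-secret'}
```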
@@ -98,3 +129,38 @@
         control_message.payload(df_storage_ledger)
 
         return control_message
+
+    @staticmethod
+    def _inject_storage_defaults(storage_uri: str, storage_options: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Populate storage options for common backends (e.g., MinIO/S3) using environment defaults.
+        """
+        parsed_scheme = urlparse(storage_uri).scheme.lower()
+        merged_options: Dict[str, Any] = {k: v for k, v in storage_options.items() if v is not None}
+
+        if parsed_scheme not in {"s3", "s3a", "s3n"}:
+            return merged_options
+
+        def _set_if_absent(key: str, env_var: str) -> None:
+            if key not in merged_options and env_var in os.environ:
+                merged_options[key] = os.environ[env_var]
+
+        _set_if_absent("key", "MINIO_ACCESS_KEY")
+        _set_if_absent("secret", "MINIO_SECRET_KEY")
+        if "token" not in merged_options and os.environ.get("MINIO_SESSION_TOKEN"):
+            merged_options["token"] = os.environ["MINIO_SESSION_TOKEN"]
+
+        client_kwargs = dict(merged_options.get("client_kwargs", {}))
+        endpoint = os.environ.get("MINIO_INTERNAL_ADDRESS")
+        if not endpoint:
+            endpoint = "http://minio:9000"
+        if endpoint and not endpoint.startswith(("http://", "https://")):
+            endpoint = f"http://{endpoint}"
+        client_kwargs.setdefault("endpoint_url", endpoint)
+        region = os.environ.get("MINIO_REGION")
+        if region:
+            client_kwargs.setdefault("region_name", region)
+        if client_kwargs:
+            merged_options["client_kwargs"] = client_kwargs
+
+        return merged_options
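
The options `_inject_storage_defaults` produces are shaped like the keyword arguments fsspec's S3 backend accepts (`key`, `secret`, `token`, and `client_kwargs.endpoint_url`). A hedged usage sketch, assuming `s3fs` is installed and using placeholder MinIO credentials and bucket name:

```python
import fsspec

storage_options = {
    "key": "minioadmin",        # e.g. resolved from MINIO_ACCESS_KEY
    "secret": "minioadmin",     # e.g. resolved from MINIO_SECRET_KEY
    "client_kwargs": {"endpoint_url": "http://minio:9000"},
}

# fsspec forwards these kwargs to s3fs.S3FileSystem.
fs = fsspec.filesystem("s3", **storage_options)
print(fs.ls("my-ingest-bucket"))  # list objects written by the image storage stage
```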