nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registry.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +203 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +1300 -0
- nv_ingest/framework/orchestration/execution/__init__.py +3 -0
- nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest/framework/orchestration/process/__init__.py +3 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
- nv_ingest/framework/orchestration/process/execution.py +495 -0
- nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
- nv_ingest/framework/orchestration/process/strategies.py +218 -0
- nv_ingest/framework/orchestration/process/termination.py +147 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
- nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest/pipeline/config/loaders.py +229 -0
- nv_ingest/pipeline/config/replica_resolver.py +237 -0
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
- nv_ingest/pipeline/default_pipeline_impl.py +557 -0
- nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py

@@ -3,12 +3,13 @@
 # SPDX-License-Identifier: Apache-2.0

 import logging
-from typing import Dict, Any
+from typing import Dict, Any, Optional

 import ray

 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
 from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 from nv_ingest_api.internal.mutate.filter import filter_images_internal
 from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
 from nv_ingest_api.internal.primitives.tracing.tagging import traceable
@@ -16,6 +17,7 @@ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema impo
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging

 logger = logging.getLogger(__name__)

@@ -31,18 +33,19 @@ class ImageFilterStage(RayActorStage):
     3. Updates the message payload with the filtered DataFrame.
     """

-    def __init__(self, config: ImageFilterSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: ImageFilterSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
-            logger.
+            logger.debug("ImageFilterStage configuration validated successfully.")
         except Exception as e:
             logger.exception(f"Error validating Image Filter config: {e}")
             raise

-    @
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["filter"])
-    @nv_ingest_node_failure_try_except(annotation_id="image_filter", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by filtering images.
@@ -57,7 +60,7 @@ class ImageFilterStage(RayActorStage):
         IngestControlMessage
             The updated message with filtered images in the payload.
         """
-        logger.
+        logger.debug("ImageFilterStage.on_data: Starting image filtering process.")

         # Extract the DataFrame payload.
         df_ledger = control_message.payload()
@@ -65,7 +68,7 @@ class ImageFilterStage(RayActorStage):

         # Remove the "filter" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "filter")
-        logger.debug("Extracted task config: %s", task_config)
+        logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))

         task_params: Dict[str, Any] = task_config.get("params", {})

@@ -76,7 +79,7 @@ class ImageFilterStage(RayActorStage):
             mutate_config=self.validated_config,
             execution_trace_log=None,
         )
-        logger.
+        logger.debug("Image filtering completed. Resulting DataFrame has %d rows.", len(new_df))

         # Update the message payload with the filtered DataFrame.
         control_message.payload(new_df)
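The image_filter.py hunks above rework the decorator stack on `on_data`: failure handling moves to a no-argument `@nv_ingest_node_failure_try_except()` applied outermost, with `@traceable()` and the new `@udf_intercept_hook()` sitting above `@filter_by_task(...)`. Below is a minimal sketch of how such a stack composes; the wrapper bodies are illustrative stand-ins, not the real nv-ingest decorators.

```python
# Decorators apply bottom-up, so the decorator listed first ends up as the
# outermost wrapper and sees exceptions raised by every inner layer.
import functools


def layer(label):
    """Stand-in for decorators such as @traceable() or @udf_intercept_hook()."""
    def wrap(fn):
        @functools.wraps(fn)
        def inner(*args, **kwargs):
            print(f"enter {label}")
            try:
                return fn(*args, **kwargs)
            finally:
                print(f"exit {label}")
        return inner
    return wrap


class ExampleStage:
    @layer("failure_guard")    # outermost, like @nv_ingest_node_failure_try_except()
    @layer("traceable")
    @layer("udf_intercept")
    @layer("filter_by_task")   # innermost, runs closest to on_data
    def on_data(self, control_message):
        return control_message


ExampleStage().on_data({"payload": "..."})
# enters failure_guard -> traceable -> udf_intercept -> filter_by_task, then exits in reverse
```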
nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py

@@ -2,7 +2,7 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-from typing import Any, Dict
+from typing import Any, Dict, Optional
 import ray

 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_sink_stage_base import RayActorSinkStage
@@ -12,13 +12,13 @@ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_fail

 @ray.remote
 class DefaultDrainSink(RayActorSinkStage):
-    def __init__(self, config: Any) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: Any, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)

         self._last_sunk_count = 0
         self._sunk_count = 0

-    @nv_ingest_node_failure_try_except(
+    @nv_ingest_node_failure_try_except()
     def on_data(self, message: IngestControlMessage) -> IngestControlMessage:
         self._sunk_count += 1

nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py

@@ -14,6 +14,8 @@ from nv_ingest_api.internal.primitives.tracing.logging import annotate_cm
 from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
 from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient

+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
+
 logger = logging.getLogger(__name__)


@@ -75,8 +77,8 @@ class MessageBrokerTaskSinkConfig(BaseModel):

 @ray.remote
 class MessageBrokerTaskSinkStage(RayActorStage):
-    def __init__(self, config: MessageBrokerTaskSinkConfig) -> None:
-        super().__init__(config, log_to_stdout=False)
+    def __init__(self, config: MessageBrokerTaskSinkConfig, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)

         self.config: MessageBrokerTaskSinkConfig

@@ -224,6 +226,7 @@ class MessageBrokerTaskSinkStage(RayActorStage):

     # --- Public API Methods for message broker sink ---

+    @udf_intercept_hook()
     def on_data(self, control_message: Any) -> Any:
         """
         Processes the control message and pushes the resulting JSON payloads to the broker.
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py

@@ -3,9 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import logging
-import multiprocessing
 import uuid
-import socket
 from typing import Optional, Literal, Dict, Any, Union

 import ray
@@ -13,6 +11,7 @@ import json
 import copy
 import threading
 import time
+import random
 from datetime import datetime

 import pandas as pd
@@ -30,6 +29,8 @@ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_inges
 # Import clients
 from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
 from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+from nv_ingest_api.util.message_brokers.qos_scheduler import QosScheduler

 logger = logging.getLogger(__name__)

@@ -89,8 +90,10 @@ class MessageBrokerTaskSourceConfig(BaseModel):

     # Use the discriminated union for broker_client
     broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
-    task_queue: str = Field(
-
+    task_queue: str = Field(
+        ..., description="The base name of the queue to fetch tasks from. Derives sub-queues for fair scheduling."
+    )
+    poll_interval: float = Field(default=0.0, gt=0, description="Polling interval in seconds.")


 @ray.remote
@@ -102,11 +105,14 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
     """

     # Use the updated config type hint
-    def __init__(self, config: MessageBrokerTaskSourceConfig) -> None:
-        super().__init__(config, log_to_stdout=False)
-        self.config: MessageBrokerTaskSourceConfig  # Add type hint for self.config
+    def __init__(self, config: MessageBrokerTaskSourceConfig, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
+        self.config: MessageBrokerTaskSourceConfig  # Add a type hint for self.config
+
+        # Sanitize config before logging to avoid leaking secrets
+        _sanitized = sanitize_for_logging(config)
         self._logger.debug(
-            "Initializing MessageBrokerTaskSourceStage with config: %s",
+            "Initializing MessageBrokerTaskSourceStage with config: %s", _sanitized
         )  # Log validated config

         # Access validated configuration directly via self.config
@@ -126,13 +132,40 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         self._pause_event = threading.Event()
         self._pause_event.set()  # Initially not paused

-
+        # Backoff state for graceful retries when broker is unavailable
+        self._fetch_failure_count: int = 0
+        self._current_backoff_sleep: float = 0.0
+        self._last_backoff_log_time: float = 0.0
+
+        # Initialize QoS scheduler. Use a simple base-queue strategy for SimpleClient.
+        strategy = "simple" if isinstance(self.client, SimpleClient) else "lottery"
+        self.scheduler = QosScheduler(
+            self.task_queue,
+            num_prefetch_threads=6,  # one per category (no-op for simple strategy)
+            total_buffer_capacity=96,  # e.g., ~16 per thread
+            prefetch_poll_interval=0.002,  # faster polling for responsiveness
+            prefetch_non_immediate=True,  # enable prefetch for non-immediate categories
+            strategy=strategy,
+        )
+
+        self._logger.info(
+            "MessageBrokerTaskSourceStage initialized. Base task queue: %s | Derived queues: %s",
+            self.task_queue,
+            {
+                "immediate": f"{self.task_queue}_immediate",
+                "micro": f"{self.task_queue}_micro",
+                "small": f"{self.task_queue}_small",
+                "medium": f"{self.task_queue}_medium",
+                "large": f"{self.task_queue}_large",
+                "default": f"{self.task_queue}",
+            },
+        )

     # --- Private helper methods ---
     def _create_client(self):
         # Access broker config via self.config.broker_client
         broker_config = self.config.broker_client
-        self._logger.
+        self._logger.debug("Creating client of type: %s", broker_config.client_type)

         if broker_config.client_type == "redis":
             client = RedisClient(
@@ -257,14 +290,24 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):

         return control_message

-    def _fetch_message(self, timeout=
+    def _fetch_message(self, timeout=0):
         """
-        Fetch a message from the message broker.
+        Fetch a message from the message broker using fair scheduling across derived queues.
+
+        This is a non-blocking sweep across all queues for the current scheduling cycle. If no
+        message is found across any queue, return None so the caller can sleep briefly.
         """
         try:
-
+            # Use scheduler to fetch next. In simple strategy this will block up to poll_interval on base queue.
+            job = self.scheduler.fetch_next(self.client, timeout=self.config.poll_interval)
             if job is None:
-                self._logger.debug(
+                self._logger.debug(
+                    "No message received from derived queues for base "
+                    "'%s' (immediate, micro, small, medium, large, default)",
+                    self.task_queue,
+                )
+                # Do not treat normal empty polls as failures
+                self._fetch_failure_count = 0
+                self._current_backoff_sleep = 0.0
                 return None
             self._logger.debug("Received message type: %s", type(job))
             if isinstance(job, BaseModel):
@@ -277,12 +320,46 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
                 return None
             job = json.loads(job.response)
             self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
+            # Success: reset backoff state
+            self._fetch_failure_count = 0
+            self._current_backoff_sleep = 0.0
             return job
         except TimeoutError:
             self._logger.debug("Timeout waiting for message")
+            # Timeout is not a connectivity failure; do not escalate backoff
            return None
         except Exception as err:
-
+            # Connectivity or other fetch issue: apply graceful backoff and avoid stacktrace spam
+            self._fetch_failure_count += 1
+
+            # Compute exponential backoff with jitter, capped by configured max_backoff
+            try:
+                max_backoff = getattr(self.config.broker_client, "max_backoff", 5.0)
+            except Exception:
+                max_backoff = 5.0
+            # Start from 0.5s, double each failure
+            base = 0.5
+            backoff_no_jitter = min(max_backoff, base * (2 ** (self._fetch_failure_count - 1)))
+            jitter = random.uniform(0, backoff_no_jitter * 0.2)
+            self._current_backoff_sleep = backoff_no_jitter + jitter
+
+            now = time.time()
+            # Throttle warning logs to at most once per 5 seconds to avoid spam
+            if now - self._last_backoff_log_time >= 5.0:
+                self._logger.warning(
+                    "Broker fetch failed (%d consecutive failures). Backing off for %.2fs. Error: %s",
+                    self._fetch_failure_count,
+                    self._current_backoff_sleep,
+                    err,
+                )
+                self._last_backoff_log_time = now
+            else:
+                self._logger.debug(
+                    "Broker fetch failed (%d). Backoff %.2fs. Error: %s",
+                    self._fetch_failure_count,
+                    self._current_backoff_sleep,
+                    err,
+                )
             return None

     def _read_input(self) -> any:
@@ -291,10 +368,20 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         Instead of reading from an input edge, fetch a message from the broker.
         """
         self._logger.debug("read_input: calling _fetch_message()")
-
+        # Perform a non-blocking sweep across all queues for this cycle
+        job = self._fetch_message(timeout=0)
         if job is None:
-
-
+            # Sleep for either the configured poll interval or the current backoff, whichever is larger
+            sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
+            self._logger.debug(
+                "read_input: No job received; sleeping %.2fs (poll_interval=%.2fs, backoff=%.2fs)",
+                sleep_time,
+                self.config.poll_interval,
+                getattr(self, "_current_backoff_sleep", 0.0),
+            )
+            time.sleep(sleep_time)
+            # Reset one-shot backoff so that repeated failures recompute progressively
+            self._current_backoff_sleep = 0.0

             return None

@@ -314,7 +401,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         This loop fetches messages from the broker and writes them to the output queue,
         but blocks on the pause event when the stage is paused.
         """
-        self._logger.
+        self._logger.debug("Processing loop started")
         iteration = 0
         while self._running:
             iteration += 1
@@ -381,25 +468,25 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
             self._active_processing = False
             self._shutdown_signal_complete = True

-            self._logger.
+            self._logger.debug("Processing loop ending")

     @ray.method(num_returns=1)
     def start(self) -> bool:
         if self._running:
-            self._logger.
+            self._logger.warning("Start called but stage is already running.")
             return False
         self._running = True
         self.start_time = time.time()
         self._message_count = 0
-        self._logger.
+        self._logger.debug("Starting processing loop thread.")
         threading.Thread(target=self._processing_loop, daemon=True).start()
-        self._logger.
+        self._logger.debug("MessageBrokerTaskSourceStage started.")
         return True

     @ray.method(num_returns=1)
     def stop(self) -> bool:
         self._running = False
-        self._logger.
+        self._logger.debug("Stop called on MessageBrokerTaskSourceStage")
         return True

     @ray.method(num_returns=1)
@@ -425,7 +512,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
     @ray.method(num_returns=1)
     def set_output_queue(self, queue_handle: any) -> bool:
         self.output_queue = queue_handle
-        self._logger.
+        self._logger.debug("Output queue set: %s", queue_handle)
         return True

     @ray.method(num_returns=1)
@@ -440,7 +527,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         True after the stage is paused.
         """
         self._pause_event.clear()
-        self._logger.
+        self._logger.debug("Stage paused.")

         return True

@@ -456,7 +543,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         True after the stage is resumed.
         """
         self._pause_event.set()
-        self._logger.
+        self._logger.debug("Stage resumed.")
         return True

     @ray.method(num_returns=1)
@@ -466,49 +553,9 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         This method pauses the stage, waits for any current processing to finish,
         replaces the output queue, and then resumes the stage.
         """
-        self._logger.
+        self._logger.debug("Swapping output queue: pausing stage first.")
         self.pause()
         self.set_output_queue(new_queue)
-        self._logger.
+        self._logger.debug("Output queue swapped. Resuming stage.")
         self.resume()
         return True
-
-
-def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
-    """
-    Starts a SimpleMessageBroker server in a separate process.
-
-    Parameters
-    ----------
-    broker_client : dict
-        Broker configuration. Expected keys include:
-          - "port": the port to bind the server to,
-          - "broker_params": optionally including "max_queue_size",
-          - and any other parameters required by SimpleMessageBroker.
-
-    Returns
-    -------
-    multiprocessing.Process
-        The process running the SimpleMessageBroker server.
-    """
-
-    def broker_server():
-        from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
-
-        # Use max_queue_size from broker_params or default to 10000.
-        broker_params = broker_client.get("broker_params", {})
-        max_queue_size = broker_params.get("max_queue_size", 10000)
-        server_host = broker_client.get("host", "0.0.0.0")
-        server_port = broker_client.get("port", 7671)
-        # Optionally, set socket options here for reuse.
-        server = SimpleMessageBroker(server_host, server_port, max_queue_size)
-        # Enable address reuse on the server socket.
-        server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        server.serve_forever()
-
-    p = multiprocessing.Process(target=broker_server)
-    p.daemon = False
-    p.start()
-    logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")
-
-    return p
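In the message_broker_task_source.py changes above, `_fetch_message` now retries broker failures with exponential backoff: the delay starts at 0.5 s, doubles with each consecutive failure, is capped at the broker client's `max_backoff` (5.0 s when unset), and gains up to 20% random jitter. A standalone sketch of that calculation follows; the helper name is mine and not part of the package.

```python
import random


def backoff_sleep(failure_count: int, max_backoff: float = 5.0) -> float:
    """Exponential backoff with jitter, mirroring the formula used in _fetch_message."""
    base = 0.5
    backoff_no_jitter = min(max_backoff, base * (2 ** (failure_count - 1)))
    jitter = random.uniform(0, backoff_no_jitter * 0.2)
    return backoff_no_jitter + jitter


# Five consecutive failures yield roughly 0.5, 1.0, 2.0, 4.0, and 5.0 seconds
# before jitter; jitter adds at most 20% on top of each value.
for n in range(1, 6):
    print(n, round(backoff_sleep(n), 2))
```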
nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py

@@ -3,13 +3,16 @@
 # SPDX-License-Identifier: Apache-2.0

 import logging
-
+import os
+from typing import Dict, Any, Optional
+from urllib.parse import urlparse

 import pandas as pd
 import ray

 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
 from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
 from nv_ingest_api.internal.enums.common import ContentTypeEnum
 from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
 from nv_ingest_api.internal.primitives.tracing.tagging import traceable
@@ -25,14 +28,15 @@ logger = logging.getLogger(__name__)
 @ray.remote
 class ImageStorageStage(RayActorStage):
     """
-    A Ray actor stage that stores images or structured content
+    A Ray actor stage that stores images or structured content using an fsspec-compatible backend and updates
+    metadata with storage URLs.

     This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
     payload and updates the control message accordingly.
     """

-    def __init__(self, config: ImageStorageModuleSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: ImageStorageModuleSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
             logger.info("ImageStorageStage configuration validated successfully.")
@@ -40,9 +44,10 @@ class ImageStorageStage(RayActorStage):
             logger.exception("Error validating image storage config")
             raise e

-    @
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["store"])
-    @nv_ingest_node_failure_try_except(annotation_id="image_storage", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by storing images or structured content.
@@ -67,8 +72,16 @@ class ImageStorageStage(RayActorStage):
         task_config = remove_task_by_type(control_message, "store")
         # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))

-
-
+        stage_defaults = {
+            "structured": self.validated_config.structured,
+            "images": self.validated_config.images,
+            "storage_uri": self.validated_config.storage_uri,
+            "storage_options": self.validated_config.storage_options,
+            "public_base_url": self.validated_config.public_base_url,
+        }
+
+        store_structured: bool = task_config.get("structured", stage_defaults["structured"])
+        store_unstructured: bool = task_config.get("images", stage_defaults["images"])

         content_types: Dict[Any, Any] = {}
         if store_structured:
@@ -78,14 +91,34 @@ class ImageStorageStage(RayActorStage):
             content_types[ContentTypeEnum.IMAGE] = store_unstructured

         params: Dict[str, Any] = task_config.get("params", {})
-        params["content_types"] = content_types

-
+        storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
+        storage_options = {
+            **(stage_defaults["storage_options"] or {}),
+            **(task_config.get("storage_options") or {}),
+            **params.get("storage_options", {}),
+        }
+        if "public_base_url" in task_config:
+            public_base_url = task_config["public_base_url"]
+        else:
+            public_base_url = params.get("public_base_url", stage_defaults["public_base_url"])
+
+        storage_options = self._inject_storage_defaults(storage_uri, storage_options)
+
+        storage_params: Dict[str, Any] = {
+            "content_types": content_types,
+            "storage_uri": storage_uri,
+            "storage_options": storage_options,
+        }
+        if public_base_url:
+            storage_params["public_base_url"] = public_base_url
+
+        logger.debug("Processing storage task with parameters: %s", storage_params)

         # Store images or structured content.
         df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
             df_storage_ledger=df_payload,
-            task_config=
+            task_config=storage_params,
             storage_config={},
             execution_trace_log=None,
         )
@@ -96,3 +129,38 @@ class ImageStorageStage(RayActorStage):
         control_message.payload(df_storage_ledger)

         return control_message
+
+    @staticmethod
+    def _inject_storage_defaults(storage_uri: str, storage_options: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Populate storage options for common backends (e.g., MinIO/S3) using environment defaults.
+        """
+        parsed_scheme = urlparse(storage_uri).scheme.lower()
+        merged_options: Dict[str, Any] = {k: v for k, v in storage_options.items() if v is not None}
+
+        if parsed_scheme not in {"s3", "s3a", "s3n"}:
+            return merged_options
+
+        def _set_if_absent(key: str, env_var: str) -> None:
+            if key not in merged_options and env_var in os.environ:
+                merged_options[key] = os.environ[env_var]
+
+        _set_if_absent("key", "MINIO_ACCESS_KEY")
+        _set_if_absent("secret", "MINIO_SECRET_KEY")
+        if "token" not in merged_options and os.environ.get("MINIO_SESSION_TOKEN"):
+            merged_options["token"] = os.environ["MINIO_SESSION_TOKEN"]
+
+        client_kwargs = dict(merged_options.get("client_kwargs", {}))
+        endpoint = os.environ.get("MINIO_INTERNAL_ADDRESS")
+        if not endpoint:
+            endpoint = "http://minio:9000"
+        if endpoint and not endpoint.startswith(("http://", "https://")):
+            endpoint = f"http://{endpoint}"
+        client_kwargs.setdefault("endpoint_url", endpoint)
+        region = os.environ.get("MINIO_REGION")
+        if region:
+            client_kwargs.setdefault("region_name", region)
+        if client_kwargs:
+            merged_options["client_kwargs"] = client_kwargs
+
+        return merged_options
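The new image_storage.py `on_data` logic merges storage options with per-call `params` taking precedence over the task config, which in turn overrides the stage defaults, before `_inject_storage_defaults` fills in missing S3/MinIO settings from environment variables such as MINIO_ACCESS_KEY and MINIO_INTERNAL_ADDRESS. A minimal sketch of that precedence merge, using made-up option names and values:

```python
# Later dicts win in a ** merge, so per-call params override the task config,
# and both override the stage defaults. All keys/values here are examples only.
stage_defaults = {"storage_options": {"key": "default-key", "use_ssl": False}}
task_config = {"storage_options": {"key": "task-key"}}
params = {"storage_options": {"use_ssl": True}}

storage_options = {
    **(stage_defaults["storage_options"] or {}),
    **(task_config.get("storage_options") or {}),
    **params.get("storage_options", {}),
}
print(storage_options)
# {'key': 'task-key', 'use_ssl': True}
```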
nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import logging
+from typing import Optional

 import ray

@@ -15,6 +16,9 @@ from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook

 logger = logging.getLogger(__name__)

@@ -30,8 +34,8 @@ class EmbeddingStorageStage(RayActorStage):
     3. Updates the message payload with the stored embeddings DataFrame.
     """

-    def __init__(self, config: EmbeddingStorageSchema) -> None:
-        super().__init__(config)
+    def __init__(self, config: EmbeddingStorageSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
         try:
             self.validated_config = config
             logger.info("EmbeddingStorageStage configuration validated successfully.")
@@ -39,9 +43,10 @@ class EmbeddingStorageStage(RayActorStage):
             logger.exception(f"Error validating Embedding Storage config: {e}")
             raise

-    @
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
     @filter_by_task(required_tasks=["store_embedding"])
-    @nv_ingest_node_failure_try_except(annotation_id="embedding_storage", raise_on_failure=False)
     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
         """
         Process the control message by storing embeddings.
@@ -64,7 +69,7 @@ class EmbeddingStorageStage(RayActorStage):

         # Remove the "store_embedding" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "store_embedding")
-        logger.debug("Extracted task config: %s", task_config)
+        logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))

         # Perform embedding storage.
         new_df = store_text_embeddings_internal(
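Several of the stages above now pass task configs through `sanitize_for_logging` before debug logging so broker or storage credentials do not end up in logs. The sketch below shows the general pattern with an illustrative redaction helper; it is not the nv_ingest_api implementation.

```python
import logging

logger = logging.getLogger(__name__)

# Keys treated as sensitive in this illustrative helper.
SENSITIVE_KEYS = {"password", "secret", "token", "access_key", "api_key"}


def redact_for_logging(obj):
    """Recursively mask values whose keys look like credentials (example only)."""
    if isinstance(obj, dict):
        return {
            k: "***" if k.lower() in SENSITIVE_KEYS else redact_for_logging(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return type(obj)(redact_for_logging(v) for v in obj)
    return obj


task_config = {"params": {"access_key": "AKIA-EXAMPLE", "bucket": "ingest-artifacts"}}
logger.debug("Extracted task config: %s", redact_for_logging(task_config))
# -> Extracted task config: {'params': {'access_key': '***', 'bucket': 'ingest-artifacts'}}
```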