nv-ingest 2025.5.21.dev20250521__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Note: this version of nv-ingest has been flagged as a potentially problematic release.
- nv_ingest/__init__.py +20 -0
- nv_ingest/api/__init__.py +3 -0
- nv_ingest/api/main.py +43 -0
- nv_ingest/api/v1/__init__.py +3 -0
- nv_ingest/api/v1/health.py +114 -0
- nv_ingest/api/v1/ingest.py +454 -0
- nv_ingest/framework/__init__.py +3 -0
- nv_ingest/framework/orchestration/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
- nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
- nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
- nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
- nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
- nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
- nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
- nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
- nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
- nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
- nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
- nv_ingest/framework/schemas/__init__.py +0 -0
- nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
- nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
- nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
- nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
- nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
- nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
- nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
- nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
- nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
- nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
- nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
- nv_ingest/framework/util/__init__.py +3 -0
- nv_ingest/framework/util/flow_control/__init__.py +8 -0
- nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
- nv_ingest/framework/util/service/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
- nv_ingest/framework/util/service/meta/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
- nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest/framework/util/telemetry/global_stats.py +145 -0
- nv_ingest/version.py +38 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py

@@ -0,0 +1,502 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import multiprocessing
import uuid
import socket
from typing import Optional, Literal, Dict, Any, Union

import ray
import json
import copy
import threading
import time
from datetime import datetime

import pandas as pd
from opentelemetry.trace.span import format_trace_id
from pydantic import BaseModel, Field

from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_source_stage_base import RayActorSourceStage

# Import from nv_ingest_api
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask
from nv_ingest_api.internal.primitives.tracing.logging import annotate_cm
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_ingest_job

# Import clients
from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient

logger = logging.getLogger(__name__)


class BrokerParamsRedis(BaseModel):
    """Specific parameters for Redis broker_params."""

    db: int = 0
    use_ssl: bool = False


class BaseBrokerClientConfig(BaseModel):
    """Base configuration common to all broker clients."""

    host: str = Field(..., description="Hostname or IP address of the message broker.")
    port: int = Field(..., description="Port number of the message broker.")
    max_retries: int = Field(default=5, ge=0, description="Maximum number of connection retries.")
    max_backoff: float = Field(default=5.0, gt=0, description="Maximum backoff delay in seconds between retries.")
    connection_timeout: float = Field(default=30.0, gt=0, description="Connection timeout in seconds.")


class RedisClientConfig(BaseBrokerClientConfig):
    """Configuration specific to the Redis client."""

    client_type: Literal["redis"] = Field(..., description="Specifies the client type as Redis.")
    broker_params: BrokerParamsRedis = Field(
        default_factory=BrokerParamsRedis, description="Redis-specific parameters like db and ssl."
    )


class SimpleClientConfig(BaseBrokerClientConfig):
    """Configuration specific to the Simple client."""

    client_type: Literal["simple"] = Field(..., description="Specifies the client type as Simple.")
    broker_params: Optional[Dict[str, Any]] = Field(
        default={}, description="Optional parameters for Simple client (currently unused)."
    )


# --- Define Updated Source Configuration ---


class MessageBrokerTaskSourceConfig(BaseModel):
    """
    Configuration for the MessageBrokerTaskSourceStage.

    Attributes
    ----------
    broker_client : Union[RedisClientConfig, SimpleClientConfig]
        Configuration parameters for connecting to the message broker.
        The specific schema is determined by the 'client_type' field.
    task_queue : str
        The name of the queue to fetch tasks from.
    poll_interval : float, optional
        The polling interval (in seconds) for fetching messages. Defaults to 0.1.
    """

    # Use the discriminated union for broker_client
    broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
    task_queue: str = Field(..., description="The name of the queue to fetch tasks from.")
    poll_interval: float = Field(default=0.1, gt=0, description="Polling interval in seconds.")

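# --- Editorial example (not part of the packaged file) ----------------------
# A minimal sketch of building the config above; the host, port, and queue
# name are illustrative. The "client_type" value is the Pydantic
# discriminator that routes broker_client to RedisClientConfig or
# SimpleClientConfig during validation.
example_config = MessageBrokerTaskSourceConfig(
    broker_client={
        "client_type": "redis",
        "host": "localhost",
        "port": 6379,
        "broker_params": {"db": 0, "use_ssl": False},
    },
    task_queue="ingest_task_queue",
)
assert isinstance(example_config.broker_client, RedisClientConfig)
assert example_config.poll_interval == 0.1  # default applied
# -----------------------------------------------------------------------------
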
@ray.remote
class MessageBrokerTaskSourceStage(RayActorSourceStage):
    """
    Ray actor source stage for a message broker task source.

    Fetches messages from a broker, processes them, and writes to the output queue.
    """

    # Use the updated config type hint
    def __init__(self, config: MessageBrokerTaskSourceConfig) -> None:
        super().__init__(config, log_to_stdout=False)
        self.config: MessageBrokerTaskSourceConfig  # Add type hint for self.config
        self._logger.debug(
            "Initializing MessageBrokerTaskSourceStage with config: %s", config.dict()
        )  # Log validated config

        # Access validated configuration directly via self.config
        self.poll_interval = self.config.poll_interval
        self.task_queue = self.config.task_queue

        # Create the client using validated config
        self.client = self._create_client()

        # Other initializations
        self._message_count = 0
        self._last_message_count = 0
        self.output_queue = None  # Presumably set later or via base class
        self.start_time = None

        # Threading event remains the same
        self._pause_event = threading.Event()
        self._pause_event.set()  # Initially not paused

        self._logger.debug("MessageBrokerTaskSourceStage initialized. Task queue: %s", self.task_queue)

    # --- Private helper methods ---
    def _create_client(self):
        # Access broker config via self.config.broker_client
        broker_config = self.config.broker_client
        self._logger.info("Creating client of type: %s", broker_config.client_type)

        if broker_config.client_type == "redis":
            client = RedisClient(
                host=broker_config.host,
                port=broker_config.port,
                db=broker_config.broker_params.db,  # Use nested model attribute access
                max_retries=broker_config.max_retries,
                max_backoff=broker_config.max_backoff,
                connection_timeout=broker_config.connection_timeout,
                use_ssl=broker_config.broker_params.use_ssl,  # Use nested model attribute access
            )
            self._logger.debug("RedisClient created: %s", client)  # Consider logging non-sensitive parts if needed
            return client
        elif broker_config.client_type == "simple":
            server_host = broker_config.host
            client = SimpleClient(
                host=server_host,  # Use the configured host
                port=broker_config.port,
                max_retries=broker_config.max_retries,
                max_backoff=broker_config.max_backoff,
                connection_timeout=broker_config.connection_timeout,
            )
            self._logger.debug("SimpleClient created: %s", client)
            return client
        # The discriminated union should make this unreachable, but fail loudly
        # rather than silently returning None.
        raise ValueError(f"Unsupported broker client_type: {broker_config.client_type}")

    def _process_message(self, job: dict, ts_fetched: datetime) -> Any:
        """
        Process a raw job fetched from the message broker into an IngestControlMessage.
        """
        control_message = IngestControlMessage()
        job_id = None

        try:
            # Log the payload (with content redacted) if in debug mode
            if self._logger.isEnabledFor(logging.DEBUG):
                no_payload = copy.deepcopy(job)
                if "content" in no_payload.get("job_payload", {}):
                    no_payload["job_payload"]["content"] = ["[...]"]
                self._logger.debug("Processed job payload for logging: %s", json.dumps(no_payload, indent=2))

            # Validate incoming job structure
            validate_ingest_job(job)

            ts_entry = datetime.now()
            job_id = job.pop("job_id")

            job_payload = job.get("job_payload", {})
            job_tasks = job.get("tasks", [])
            tracing_options = job.pop("tracing_options", {})

            # Extract tracing options; normalize the flag to a strict boolean
            do_trace_tagging = tracing_options.get("trace", True) in (True, "True", "true", "1")

            ts_send = tracing_options.get("ts_send")
            if ts_send is not None:
                ts_send = datetime.fromtimestamp(ts_send / 1e9)
            trace_id = tracing_options.get("trace_id")

            # Create response channel and load payload
            response_channel = f"{job_id}"
            df = pd.DataFrame(job_payload)
            control_message.payload(df)
            annotate_cm(control_message, message="Created")

            # Add basic metadata
            control_message.set_metadata("response_channel", response_channel)
            control_message.set_metadata("job_id", job_id)
            control_message.set_metadata("timestamp", datetime.now().timestamp())

            # Add task definitions to the control message
            for task in job_tasks:
                task_id = task.get("id", str(uuid.uuid4()))
                task_type = task.get("type", "unknown")
                task_props = task.get("task_properties", {})

                if not isinstance(task_props, dict):
                    task_props = task_props.model_dump()

                task_obj = ControlMessageTask(
                    id=task_id,
                    type=task_type,
                    properties=task_props,
                )
                control_message.add_task(task_obj)

            # Apply tracing metadata and timestamps if enabled
            control_message.set_metadata("config::add_trace_tagging", do_trace_tagging)
            if do_trace_tagging:
                ts_exit = datetime.now()

                control_message.set_timestamp("trace::entry::message_broker_task_source", ts_entry)
                control_message.set_timestamp("trace::exit::message_broker_task_source", ts_exit)

                if ts_send is not None:
                    control_message.set_timestamp("trace::entry::broker_source_network_in", ts_send)
                    control_message.set_timestamp("trace::exit::broker_source_network_in", ts_fetched)

                if trace_id is not None:
                    if isinstance(trace_id, int):
                        trace_id = format_trace_id(trace_id)
                    control_message.set_metadata("trace_id", trace_id)

                control_message.set_timestamp("latency::ts_send", datetime.now())

            self._logger.debug("Message processed successfully with job_id: %s", job_id)

        except Exception as e:
            self._logger.exception("Failed to process job submission: %s", e)

            if job_id is not None:
                response_channel = f"{job_id}"
                control_message.set_metadata("job_id", job_id)
                control_message.set_metadata("response_channel", response_channel)
                control_message.set_metadata("cm_failed", True)

                annotate_cm(control_message, message="Failed to process job submission", error=str(e))
            else:
                raise

        return control_message

    def _fetch_message(self, timeout=100):
        """
        Fetch a message from the message broker.
        """
        try:
            job = self.client.fetch_message(self.task_queue, timeout)
            if job is None:
                self._logger.debug("No message received from '%s'", self.task_queue)
                return None
            self._logger.debug("Received message type: %s", type(job))
            if isinstance(job, BaseModel):
                self._logger.debug("Message is a BaseModel with response_code: %s", job.response_code)
                if job.response_code != 0:
                    self._logger.debug("Message response_code != 0, returning None")
                    return None
                job = json.loads(job.response)
            self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
            return job
        except TimeoutError:
            self._logger.debug("Timeout waiting for message")
            return None
        except Exception as err:
            self._logger.exception("Error during message fetching: %s", err)
            return None

    def _read_input(self) -> Any:
        """
        Source stage's implementation of get_input.
        Instead of reading from an input edge, fetch a message from the broker.
        """
        self._logger.debug("read_input: calling _fetch_message()")
        job = self._fetch_message(timeout=100)
        if job is None:
            self._logger.debug("read_input: No job received, sleeping for poll_interval: %s", self.config.poll_interval)
            time.sleep(self.config.poll_interval)

            return None

        self.stats["successful_queue_reads"] += 1

        ts_fetched = datetime.now()
        self._logger.debug("read_input: Job fetched, processing message")
        control_message = self._process_message(job, ts_fetched)
        self._logger.debug("read_input: Message processed, returning control message")

        return control_message

    def on_data(self, control_message: Any) -> Any:
        """
        Process the control message.
        For this source stage, no additional processing is done, so simply return it.
        """
        self._logger.debug("on_data: Received control message for processing")
        return control_message

    # In the processing loop, instead of checking a boolean, we wait on the event.
    def _processing_loop(self) -> None:
        """
        Custom processing loop for a source stage.
        This loop fetches messages from the broker and writes them to the output queue,
        but blocks on the pause event when the stage is paused.
        """
        self._logger.info("Processing loop started")
        iteration = 0
        while self._running:
            iteration += 1
            try:
                self._logger.debug("Processing loop iteration: %s", iteration)
                control_message = self._read_input()
                if control_message is None:
                    self._logger.debug(
                        "No control message received; sleeping for poll_interval: %s", self.config.poll_interval
                    )
                    time.sleep(self.config.poll_interval)
                    continue

                self._active_processing = True

                self._logger.debug("Control message received; processing data")
                updated_cm = self.on_data(control_message)

                # Block until not paused using the pause event.
                if self.output_queue is not None:
                    self._logger.debug("Waiting for stage to resume if paused...")

                    if not self._pause_event.is_set():
                        self._active_processing = False
                        self._pause_event.wait()  # Block if paused
                        self._active_processing = True

                    while True:
                        try:
                            self.output_queue.put(updated_cm)
                            self.stats["successful_queue_writes"] += 1
                            break
                        except Exception:
                            self._logger.warning("Output queue full, retrying put()...")
                            self.stats["queue_full"] += 1
                            time.sleep(0.1)

                self.stats["processed"] += 1
                self._message_count += 1

                self._logger.debug(f"Sourced message_count: {self._message_count}")
                self._logger.debug("Iteration %s complete. Total processed: %s", iteration, self.stats["processed"])
            except Exception as e:
                self._logger.exception("Error in processing loop at iteration %s: %s", iteration, e)
                time.sleep(self.config.poll_interval)
            finally:
                self._active_processing = False
        self._shutdown_signal_complete = True

        self._logger.info("Processing loop ending")

    @ray.method(num_returns=1)
    def start(self) -> bool:
        if self._running:
            self._logger.info("Start called but stage is already running.")
            return False
        self._running = True
        self.start_time = time.time()
        self._message_count = 0
        self._logger.info("Starting processing loop thread.")
        threading.Thread(target=self._processing_loop, daemon=True).start()
        self._logger.info("MessageBrokerTaskSourceStage started.")
        return True

    @ray.method(num_returns=1)
    def stop(self) -> bool:
        self._running = False
        self._logger.info("Stop called on MessageBrokerTaskSourceStage")
        return True

    @ray.method(num_returns=1)
    def get_stats(self) -> dict:
        elapsed = time.time() - self.start_time if self.start_time else 0
        delta = self._message_count - self._last_message_count
        self._last_message_count = self._message_count
        stats = {
            "active_processing": 1 if self._active_processing else 0,
            "delta_processed": delta,
            "elapsed": elapsed,
            "errors": self.stats.get("errors", 0),
            "failed": 0,
            "processed": self._message_count,
            "processing_rate_cps": self._message_count / elapsed if elapsed > 0 else 0,
            "successful_queue_reads": self.stats.get("successful_queue_reads", 0),
            "successful_queue_writes": self.stats.get("successful_queue_writes", 0),
            "queue_full": self.stats.get("queue_full", 0),
        }

        return stats

    @ray.method(num_returns=1)
    def set_output_queue(self, queue_handle: Any) -> bool:
        self.output_queue = queue_handle
        self._logger.info("Output queue set: %s", queue_handle)
        return True

    @ray.method(num_returns=1)
    def pause(self) -> bool:
        """
        Pause the stage. This clears the pause event, causing the processing loop
        to block before writing to the output queue.

        Returns
        -------
        bool
            True after the stage is paused.
        """
        self._pause_event.clear()
        self._logger.info("Stage paused.")

        return True

    @ray.method(num_returns=1)
    def resume(self) -> bool:
        """
        Resume the stage. This sets the pause event, allowing the processing loop
        to proceed with writing to the output queue.

        Returns
        -------
        bool
            True after the stage is resumed.
        """
        self._pause_event.set()
        self._logger.info("Stage resumed.")
        return True

    @ray.method(num_returns=1)
    def swap_queues(self, new_queue: Any) -> bool:
        """
        Swap in a new output queue for this stage.
        This method pauses the stage, waits for any current processing to finish,
        replaces the output queue, and then resumes the stage.
        """
        self._logger.info("Swapping output queue: pausing stage first.")
        self.pause()
        self.set_output_queue(new_queue)
        self._logger.info("Output queue swapped. Resuming stage.")
        self.resume()
        return True

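# --- Editorial usage sketch (not part of the packaged file) ------------------
# Drives the @ray.method interface above. Assumes a queue handle exposing
# .put(); ray.util.queue.Queue is one such option, though in practice the
# pipeline wires in its own queue/edge primitives via set_output_queue().
# Reuses example_config from the earlier editorial sketch.
from ray.util.queue import Queue

ray.init(ignore_reinit_error=True)
queue_handle = Queue(maxsize=100)
stage = MessageBrokerTaskSourceStage.remote(example_config)
ray.get(stage.set_output_queue.remote(queue_handle))
ray.get(stage.start.remote())    # spawns the daemon processing-loop thread
stats = ray.get(stage.get_stats.remote())
ray.get(stage.pause.remote())    # loop now blocks before output-queue writes
ray.get(stage.resume.remote())
ray.get(stage.stop.remote())     # loop exits on its next iteration
# -----------------------------------------------------------------------------
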
def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
    """
    Starts a SimpleMessageBroker server in a separate process.

    Parameters
    ----------
    broker_client : dict
        Broker configuration. Expected keys include:
        - "port": the port to bind the server to,
        - "broker_params": optionally including "max_queue_size",
        - and any other parameters required by SimpleMessageBroker.

    Returns
    -------
    multiprocessing.Process
        The process running the SimpleMessageBroker server.
    """

    def broker_server():
        from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker

        # Use max_queue_size from broker_params or default to 10000.
        broker_params = broker_client.get("broker_params", {})
        max_queue_size = broker_params.get("max_queue_size", 10000)
        server_host = broker_client.get("host", "0.0.0.0")
        server_port = broker_client.get("port", 7671)
        # Optionally, set socket options here for reuse.
        server = SimpleMessageBroker(server_host, server_port, max_queue_size)
        # Enable address reuse on the server socket.
        server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        server.serve_forever()

    p = multiprocessing.Process(target=broker_server)
    p.daemon = True
    p.start()
    logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")

    return p
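Editorial usage sketch for the helper above (values illustrative; the child process is a daemon, so terminate it explicitly when done):

broker_proc = start_simple_message_broker(
    {
        "host": "0.0.0.0",
        "port": 7671,
        "broker_params": {"max_queue_size": 10000},
    }
)
# ... point a SimpleClientConfig at the same host/port and run the pipeline ...
broker_proc.terminate()
broker_proc.join()
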
nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py

@@ -0,0 +1,98 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
from typing import Dict, Any

import pandas as pd
import ray

from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
from nv_ingest.framework.util.flow_control import filter_by_task
from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
from nv_ingest_api.internal.store.image_upload import store_images_to_minio_internal
from nv_ingest_api.util.exception_handlers.decorators import (
    nv_ingest_node_failure_try_except,
)

logger = logging.getLogger(__name__)


@ray.remote
class ImageStorageStage(RayActorStage):
    """
    A Ray actor stage that stores images or structured content in MinIO and updates metadata with storage URLs.

    This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
    payload and updates the control message accordingly.
    """

    def __init__(self, config: ImageStorageModuleSchema) -> None:
        super().__init__(config)
        try:
            self.validated_config = config
            logger.info("ImageStorageStage configuration validated successfully.")
        except Exception:
            logger.exception("Error validating image storage config")
            raise

    @traceable("image_storage")
    @filter_by_task(required_tasks=["store"])
    @nv_ingest_node_failure_try_except(annotation_id="image_storage", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Process the control message by storing images or structured content.

        Parameters
        ----------
        control_message : IngestControlMessage
            The incoming message containing the DataFrame payload.

        Returns
        -------
        IngestControlMessage
            The updated message with storage URLs and trace info added.
        """
        logger.info("ImageStorageStage.on_data: Starting storage operation.")

        # Extract DataFrame payload.
        df_payload = control_message.payload()
        logger.debug("ImageStorageStage: Extracted payload with %d rows.", len(df_payload))

        # Remove the "store" task to obtain task-specific configuration.
        task_config = remove_task_by_type(control_message, "store")
        # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))

        store_structured: bool = task_config.get("structured", True)
        store_unstructured: bool = task_config.get("images", False)

        content_types: Dict[Any, Any] = {}
        if store_structured:
            content_types[ContentTypeEnum.STRUCTURED] = store_structured

        if store_unstructured:
            content_types[ContentTypeEnum.IMAGE] = store_unstructured

        params: Dict[str, Any] = task_config.get("params", {})
        params["content_types"] = content_types

        logger.debug(f"Processing storage task with parameters: {params}")

        # Store images or structured content.
        df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
            df_storage_ledger=df_payload,
            task_config=params,
            storage_config={},
            execution_trace_log=None,
        )

        logger.info("Image storage operation completed. Updated payload has %d rows.", len(df_storage_ledger))

        # Update the control message payload.
        control_message.payload(df_storage_ledger)

        return control_message
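Editorial note (not part of the diff): inferred from the task_config.get(...) calls above, a job-level "store" task that this stage consumes looks roughly like the following; the "params" contents (bucket, credentials, etc.) are whatever store_images_to_minio_internal accepts and are not spelled out here.

store_task = {
    "type": "store",
    "task_properties": {
        "structured": True,  # store ContentTypeEnum.STRUCTURED rows
        "images": True,      # store ContentTypeEnum.IMAGE rows
        "params": {},        # forwarded; the stage injects "content_types"
    },
}
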
nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py

@@ -0,0 +1,81 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging

import ray

from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
from nv_ingest.framework.util.flow_control import filter_by_task
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings_internal
from nv_ingest_api.util.exception_handlers.decorators import (
    nv_ingest_node_failure_try_except,
)

logger = logging.getLogger(__name__)


@ray.remote
class EmbeddingStorageStage(RayActorStage):
    """
    A Ray actor stage that stores text embeddings in MinIO.

    It expects an IngestControlMessage containing a DataFrame with embedding data. It then:
      1. Removes the "store_embedding" task from the message.
      2. Calls the embedding storage logic (via store_text_embeddings_internal) using a validated configuration.
      3. Updates the message payload with the stored embeddings DataFrame.
    """

    def __init__(self, config: EmbeddingStorageSchema) -> None:
        super().__init__(config)
        try:
            self.validated_config = config
            logger.info("EmbeddingStorageStage configuration validated successfully.")
        except Exception as e:
            logger.exception(f"Error validating Embedding Storage config: {e}")
            raise

    @traceable("embedding_storage")
    @filter_by_task(required_tasks=["store_embedding"])
    @nv_ingest_node_failure_try_except(annotation_id="embedding_storage", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Process the control message by storing embeddings.

        Parameters
        ----------
        control_message : IngestControlMessage
            The message containing a DataFrame payload with embedding data.

        Returns
        -------
        IngestControlMessage
            The updated message with embeddings stored in MinIO.
        """
        logger.info("EmbeddingStorageStage.on_data: Starting embedding storage process.")

        # Extract the DataFrame payload.
        df_ledger = control_message.payload()
        logger.debug("Extracted payload with %d rows.", len(df_ledger))

        # Remove the "store_embedding" task from the message to obtain task-specific configuration.
        task_config = remove_task_by_type(control_message, "store_embedding")
        logger.debug("Extracted task config: %s", task_config)

        # Perform embedding storage.
        new_df = store_text_embeddings_internal(
            df_store_ledger=df_ledger,
            task_config=task_config,
            store_config=self.validated_config,
            execution_trace_log=None,
        )
        logger.info("Embedding storage completed. Resulting DataFrame has %d rows.", len(new_df))

        # Update the message payload with the stored embeddings DataFrame.
        control_message.payload(new_df)

        return control_message
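Editorial sketch (illustrative, not part of the package): both storage stages share one contract -- pop their task, call the internal store function, and write the resulting DataFrame back as the payload. Assuming EmbeddingStorageSchema is default-constructible and control_message already carries a "store_embedding" task, driving the stage directly might look like:

import ray
stage = EmbeddingStorageStage.remote(EmbeddingStorageSchema())  # assumes schema defaults
updated_message = ray.get(stage.on_data.remote(control_message))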