nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (100)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py
@@ -0,0 +1,502 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ import multiprocessing
+ import uuid
+ import socket
+ from typing import Optional, Literal, Dict, Any, Union
+
+ import ray
+ import json
+ import copy
+ import threading
+ import time
+ from datetime import datetime
+
+ import pandas as pd
+ from opentelemetry.trace.span import format_trace_id
+ from pydantic import BaseModel, Field
+
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_source_stage_base import RayActorSourceStage
+
+ # Import from nv_ingest_api
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
+ from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask
+ from nv_ingest_api.internal.primitives.tracing.logging import annotate_cm
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_ingest_job
+
+ # Import clients
+ from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
+ from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
+
+ logger = logging.getLogger(__name__)
+
+
+ class BrokerParamsRedis(BaseModel):
+     """Specific parameters for Redis broker_params."""
+
+     db: int = 0
+     use_ssl: bool = False
+
+
+ class BaseBrokerClientConfig(BaseModel):
+     """Base configuration common to all broker clients."""
+
+     host: str = Field(..., description="Hostname or IP address of the message broker.")
+     port: int = Field(..., description="Port number of the message broker.")
+     max_retries: int = Field(default=5, ge=0, description="Maximum number of connection retries.")
+     max_backoff: float = Field(default=5.0, gt=0, description="Maximum backoff delay in seconds between retries.")
+     connection_timeout: float = Field(default=30.0, gt=0, description="Connection timeout in seconds.")
+
+
+ class RedisClientConfig(BaseBrokerClientConfig):
+     """Configuration specific to the Redis client."""
+
+     client_type: Literal["redis"] = Field(..., description="Specifies the client type as Redis.")
+     broker_params: BrokerParamsRedis = Field(
+         default_factory=BrokerParamsRedis, description="Redis-specific parameters like db and ssl."
+     )
+
+
+ class SimpleClientConfig(BaseBrokerClientConfig):
+     """Configuration specific to the Simple client."""
+
+     client_type: Literal["simple"] = Field(..., description="Specifies the client type as Simple.")
+     broker_params: Optional[Dict[str, Any]] = Field(
+         default={}, description="Optional parameters for Simple client (currently unused)."
+     )
+
+
+ # --- Define Updated Source Configuration ---
+
+
+ class MessageBrokerTaskSourceConfig(BaseModel):
+     """
+     Configuration for the MessageBrokerTaskSourceStage.
+
+     Attributes
+     ----------
+     broker_client : Union[RedisClientConfig, SimpleClientConfig]
+         Configuration parameters for connecting to the message broker.
+         The specific schema is determined by the 'client_type' field.
+     task_queue : str
+         The name of the queue to fetch tasks from.
+     poll_interval : float, optional
+         The polling interval (in seconds) for fetching messages. Defaults to 0.1.
+     """
+
+     # Use the discriminated union for broker_client
+     broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
+     task_queue: str = Field(..., description="The name of the queue to fetch tasks from.")
+     poll_interval: float = Field(default=0.1, gt=0, description="Polling interval in seconds.")
+
+
+ @ray.remote
+ class MessageBrokerTaskSourceStage(RayActorSourceStage):
+     """
+     Ray actor source stage for a message broker task source.
+
+     Fetches messages from a broker, processes them, and writes to the output queue.
+     """
+
+     # Use the updated config type hint
+     def __init__(self, config: MessageBrokerTaskSourceConfig) -> None:
+         super().__init__(config, log_to_stdout=False)
+         self.config: MessageBrokerTaskSourceConfig  # Add type hint for self.config
+         self._logger.debug(
+             "Initializing MessageBrokerTaskSourceStage with config: %s", config.dict()
+         )  # Log validated config
+
+         # Access validated configuration directly via self.config
+         self.poll_interval = self.config.poll_interval
+         self.task_queue = self.config.task_queue
+
+         # Create the client using validated config
+         self.client = self._create_client()
+
+         # Other initializations
+         self._message_count = 0
+         self._last_message_count = 0
+         self.output_queue = None  # Presumably set later or via base class
+         self.start_time = None
+
+         # Threading event remains the same
+         self._pause_event = threading.Event()
+         self._pause_event.set()  # Initially not paused
+
+         self._logger.debug("MessageBrokerTaskSourceStage initialized. Task queue: %s", self.task_queue)
+
+     # --- Private helper methods ---
+     def _create_client(self):
+         # Access broker config via self.config.broker_client
+         broker_config = self.config.broker_client
+         self._logger.info("Creating client of type: %s", broker_config.client_type)
+
+         if broker_config.client_type == "redis":
+             client = RedisClient(
+                 host=broker_config.host,
+                 port=broker_config.port,
+                 db=broker_config.broker_params.db,  # Use nested model attribute access
+                 max_retries=broker_config.max_retries,
+                 max_backoff=broker_config.max_backoff,
+                 connection_timeout=broker_config.connection_timeout,
+                 use_ssl=broker_config.broker_params.use_ssl,  # Use nested model attribute access
+             )
+             self._logger.debug("RedisClient created: %s", client)  # Consider logging non-sensitive parts if needed
+             return client
+         elif broker_config.client_type == "simple":
+             # The configured host is overridden here: the SimpleMessageBroker is
+             # assumed to be reachable on all interfaces of the local node.
+             server_host = "0.0.0.0"
+             client = SimpleClient(
+                 host=server_host,
+                 port=broker_config.port,
+                 max_retries=broker_config.max_retries,
+                 max_backoff=broker_config.max_backoff,
+                 connection_timeout=broker_config.connection_timeout,
+             )
+             self._logger.debug("SimpleClient created: %s", client)
+             return client
+         else:
+             raise ValueError(f"Unsupported client_type: {broker_config.client_type}")
+
+     def _process_message(self, job: dict, ts_fetched: datetime) -> Any:
+         """
+         Process a raw job fetched from the message broker into an IngestControlMessage.
+         """
+         control_message = IngestControlMessage()
+         job_id = None
+
+         try:
+             # Log the payload (with content redacted) if in debug mode
+             if self._logger.isEnabledFor(logging.DEBUG):
+                 no_payload = copy.deepcopy(job)
+                 if "content" in no_payload.get("job_payload", {}):
+                     no_payload["job_payload"]["content"] = ["[...]"]
+                 self._logger.debug("Processed job payload for logging: %s", json.dumps(no_payload, indent=2))
+
+             # Validate incoming job structure
+             validate_ingest_job(job)
+
+             ts_entry = datetime.now()
+             job_id = job.pop("job_id")
+
+             job_payload = job.get("job_payload", {})
+             job_tasks = job.get("tasks", [])
+             tracing_options = job.pop("tracing_options", {})
+
+             # Extract tracing options
+             do_trace_tagging = tracing_options.get("trace", True)
+             # Normalize bools and common truthy strings to a strict boolean.
+             do_trace_tagging = do_trace_tagging in (True, "True", "true", "1")
+
+             ts_send = tracing_options.get("ts_send")
+             if ts_send is not None:
+                 ts_send = datetime.fromtimestamp(ts_send / 1e9)
+             trace_id = tracing_options.get("trace_id")
+
+             # Create response channel and load payload
+             response_channel = f"{job_id}"
+             df = pd.DataFrame(job_payload)
+             control_message.payload(df)
+             annotate_cm(control_message, message="Created")
+
+             # Add basic metadata
+             control_message.set_metadata("response_channel", response_channel)
+             control_message.set_metadata("job_id", job_id)
+             control_message.set_metadata("timestamp", datetime.now().timestamp())
+
+             # Add task definitions to the control message
+             for task in job_tasks:
+                 task_id = task.get("id", str(uuid.uuid4()))
+                 task_type = task.get("type", "unknown")
+                 task_props = task.get("task_properties", {})
+
+                 if not isinstance(task_props, dict):
+                     task_props = task_props.model_dump()
+
+                 task_obj = ControlMessageTask(
+                     id=task_id,
+                     type=task_type,
+                     properties=task_props,
+                 )
+                 control_message.add_task(task_obj)
+
+             # Apply tracing metadata and timestamps if enabled
+             control_message.set_metadata("config::add_trace_tagging", do_trace_tagging)
+             if do_trace_tagging:
+                 ts_exit = datetime.now()
+
+                 control_message.set_timestamp("trace::entry::message_broker_task_source", ts_entry)
+                 control_message.set_timestamp("trace::exit::message_broker_task_source", ts_exit)
+
+                 if ts_send is not None:
+                     control_message.set_timestamp("trace::entry::broker_source_network_in", ts_send)
+                     control_message.set_timestamp("trace::exit::broker_source_network_in", ts_fetched)
+
+                 if trace_id is not None:
+                     if isinstance(trace_id, int):
+                         trace_id = format_trace_id(trace_id)
+                     control_message.set_metadata("trace_id", trace_id)
+
+                 control_message.set_timestamp("latency::ts_send", datetime.now())
+
+             self._logger.debug("Message processed successfully with job_id: %s", job_id)
+
+         except Exception as e:
+             self._logger.exception("Failed to process job submission: %s", e)
+
+             if job_id is not None:
+                 response_channel = f"{job_id}"
+                 control_message.set_metadata("job_id", job_id)
+                 control_message.set_metadata("response_channel", response_channel)
+                 control_message.set_metadata("cm_failed", True)
+
+                 annotate_cm(control_message, message="Failed to process job submission", error=str(e))
+             else:
+                 raise
+
+         return control_message
+
+     def _fetch_message(self, timeout=100):
+         """
+         Fetch a message from the message broker.
+         """
+         try:
+             job = self.client.fetch_message(self.task_queue, timeout)
+             if job is None:
+                 self._logger.debug("No message received from '%s'", self.task_queue)
+                 return None
+             self._logger.debug("Received message type: %s", type(job))
+             if isinstance(job, BaseModel):
+                 self._logger.debug("Message is a BaseModel with response_code: %s", job.response_code)
+                 if job.response_code != 0:
+                     self._logger.debug("Message response_code != 0, returning None")
+                     return None
+                 job = json.loads(job.response)
+             self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
+             return job
+         except TimeoutError:
+             self._logger.debug("Timeout waiting for message")
+             return None
+         except Exception as err:
+             self._logger.exception("Error during message fetching: %s", err)
+             return None
+
+     def _read_input(self) -> Any:
+ """
287
+ Source stage's implementation of get_input.
288
+ Instead of reading from an input edge, fetch a message from the broker.
289
+ """
290
+ self._logger.debug("read_input: calling _fetch_message()")
291
+ job = self._fetch_message(timeout=100)
292
+ if job is None:
293
+ self._logger.debug("read_input: No job received, sleeping for poll_interval: %s", self.config.poll_interval)
294
+ time.sleep(self.config.poll_interval)
295
+
296
+ return None
297
+
298
+ self.stats["successful_queue_reads"] += 1
299
+
300
+ ts_fetched = datetime.now()
301
+ self._logger.debug("read_input: Job fetched, processing message")
302
+ control_message = self._process_message(job, ts_fetched)
303
+ self._logger.debug("read_input: Message processed, returning control message")
304
+
305
+ return control_message
306
+
307
+     def on_data(self, control_message: Any) -> Any:
+ """
309
+ Process the control message.
310
+ For this source stage, no additional processing is done, so simply return it.
311
+ """
312
+ self._logger.debug("on_data: Received control message for processing")
313
+ return control_message
314
+
315
+ # In the processing loop, instead of checking a boolean, we wait on the event.
316
+ def _processing_loop(self) -> None:
317
+ """
318
+ Custom processing loop for a source stage.
319
+ This loop fetches messages from the broker and writes them to the output queue,
320
+ but blocks on the pause event when the stage is paused.
321
+ """
322
+ self._logger.info("Processing loop started")
323
+ iteration = 0
324
+ while self._running:
325
+ iteration += 1
326
+ try:
327
+ self._logger.debug("Processing loop iteration: %s", iteration)
328
+ control_message = self._read_input()
329
+ if control_message is None:
330
+ self._logger.debug(
331
+ "No control message received; sleeping for poll_interval: %s", self.config.poll_interval
332
+ )
333
+ time.sleep(self.config.poll_interval)
334
+ continue
335
+
336
+ self._active_processing = True
337
+
338
+ self._logger.debug("Control message received; processing data")
339
+ updated_cm = self.on_data(control_message)
340
+
341
+ # Block until not paused using the pause event.
342
+ if self.output_queue is not None:
343
+ self._logger.debug("Waiting for stage to resume if paused...")
344
+
345
+ if not self._pause_event.is_set():
346
+ self._active_processing = False
347
+ self._pause_event.wait() # Block if paused
348
+ self._active_processing = True
349
+
350
+ while True:
351
+ try:
352
+ self.output_queue.put(updated_cm)
353
+ self.stats["successful_queue_writes"] += 1
354
+ break
355
+ except Exception:
356
+ self._logger.warning("Output queue full, retrying put()...")
357
+ self.stats["queue_full"] += 1
358
+ time.sleep(0.1)
359
+
360
+ self.stats["processed"] += 1
361
+ self._message_count += 1
362
+
363
+ self._logger.debug(f"Sourced message_count: {self._message_count}")
364
+ self._logger.debug("Iteration %s complete. Total processed: %s", iteration, self.stats["processed"])
365
+ except Exception as e:
366
+ self._logger.exception("Error in processing loop at iteration %s: %s", iteration, e)
367
+ time.sleep(self.config.poll_interval)
368
+ finally:
369
+ self._active_processing = False
370
+ self._shutdown_signal_complete = True
371
+
372
+ self._logger.info("Processing loop ending")
373
+
374
+ @ray.method(num_returns=1)
375
+ def start(self) -> bool:
376
+ if self._running:
377
+ self._logger.info("Start called but stage is already running.")
378
+ return False
379
+ self._running = True
380
+ self.start_time = time.time()
381
+ self._message_count = 0
382
+ self._logger.info("Starting processing loop thread.")
383
+ threading.Thread(target=self._processing_loop, daemon=True).start()
384
+ self._logger.info("MessageBrokerTaskSourceStage started.")
385
+ return True
386
+
387
+ @ray.method(num_returns=1)
388
+ def stop(self) -> bool:
389
+ self._running = False
390
+ self._logger.info("Stop called on MessageBrokerTaskSourceStage")
391
+ return True
392
+
393
+ @ray.method(num_returns=1)
394
+ def get_stats(self) -> dict:
395
+ elapsed = time.time() - self.start_time if self.start_time else 0
396
+ delta = self._message_count - self._last_message_count
397
+ self._last_message_count = self._message_count
398
+ stats = {
399
+ "active_processing": 1 if self._active_processing else 0,
400
+ "delta_processed": delta,
401
+ "elapsed": elapsed,
402
+ "errors": self.stats.get("errors", 0),
403
+ "failed": 0,
404
+ "processed": self._message_count,
405
+ "processing_rate_cps": self._message_count / elapsed if elapsed > 0 else 0,
406
+ "successful_queue_reads": self.stats.get("successful_queue_reads", 0),
407
+ "successful_queue_writes": self.stats.get("successful_queue_writes", 0),
408
+ "queue_full": self.stats.get("queue_full", 0),
409
+ }
410
+
411
+ return stats
412
+
413
+ @ray.method(num_returns=1)
414
+     def set_output_queue(self, queue_handle: Any) -> bool:
+         self.output_queue = queue_handle
+         self._logger.info("Output queue set: %s", queue_handle)
+         return True
+
+     @ray.method(num_returns=1)
+     def pause(self) -> bool:
+         """
+         Pause the stage. This clears the pause event, causing the processing loop
+         to block before writing to the output queue.
+
+         Returns
+         -------
+         bool
+             True after the stage is paused.
+         """
+         self._pause_event.clear()
+         self._logger.info("Stage paused.")
+
+         return True
+
+     @ray.method(num_returns=1)
+     def resume(self) -> bool:
+         """
+         Resume the stage. This sets the pause event, allowing the processing loop
+         to proceed with writing to the output queue.
+
+         Returns
+         -------
+         bool
+             True after the stage is resumed.
+         """
+         self._pause_event.set()
+         self._logger.info("Stage resumed.")
+         return True
+
+     @ray.method(num_returns=1)
+     def swap_queues(self, new_queue: Any) -> bool:
+ """
453
+ Swap in a new output queue for this stage.
454
+ This method pauses the stage, waits for any current processing to finish,
455
+ replaces the output queue, and then resumes the stage.
456
+ """
457
+         self._logger.info("Swapping output queue: pausing stage first.")
+         self.pause()
+         self.set_output_queue(new_queue)
+         self._logger.info("Output queue swapped. Resuming stage.")
+         self.resume()
+         return True
+
+
+ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
+     """
+     Starts a SimpleMessageBroker server in a separate process.
+
+     Parameters
+     ----------
+     broker_client : dict
+         Broker configuration. Expected keys include:
+         - "port": the port to bind the server to,
+         - "broker_params": optionally including "max_queue_size",
+         - and any other parameters required by SimpleMessageBroker.
+
+     Returns
+     -------
+     multiprocessing.Process
+         The process running the SimpleMessageBroker server.
+     """
+
+     def broker_server():
+         from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
+
+         # Use max_queue_size from broker_params or default to 10000.
+         broker_params = broker_client.get("broker_params", {})
+         max_queue_size = broker_params.get("max_queue_size", 10000)
+         server_host = broker_client.get("host", "0.0.0.0")
+         server_port = broker_client.get("port", 7671)
+         # Optionally, set socket options here for reuse.
+         server = SimpleMessageBroker(server_host, server_port, max_queue_size)
+         # Enable address reuse on the server socket.
+         server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+         server.serve_forever()
+
+     p = multiprocessing.Process(target=broker_server)
+     p.daemon = True
+     p.start()
+     logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")
+
+     return p
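
The following usage sketch is editorial, not part of the package diff. It shows how the discriminated-union config above might be constructed from plain dicts (the client_type field selects the schema) and how the helper could launch a local broker; the host, port, and queue name are illustrative assumptions.

    from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
        MessageBrokerTaskSourceConfig,
        start_simple_message_broker,
    )

    # Launch an in-process broker (port 7671 mirrors the helper's default; size is illustrative).
    broker_proc = start_simple_message_broker({"port": 7671, "broker_params": {"max_queue_size": 1000}})

    # "client_type": "simple" discriminates to SimpleClientConfig during validation.
    config = MessageBrokerTaskSourceConfig(
        broker_client={"client_type": "simple", "host": "localhost", "port": 7671},
        task_queue="ingest_task_queue",  # assumed queue name
        poll_interval=0.1,
    )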
nv_ingest/framework/orchestration/ray/stages/storage/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py
@@ -0,0 +1,98 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ from typing import Dict, Any
+
+ import pandas as pd
+ import ray
+
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+ from nv_ingest.framework.util.flow_control import filter_by_task
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
+ from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
+ from nv_ingest_api.internal.store.image_upload import store_images_to_minio_internal
+ from nv_ingest_api.util.exception_handlers.decorators import (
+     nv_ingest_node_failure_try_except,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @ray.remote
+ class ImageStorageStage(RayActorStage):
+     """
+     A Ray actor stage that stores images or structured content in MinIO and updates metadata with storage URLs.
+
+     This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
+     payload and updates the control message accordingly.
+     """
+
+     def __init__(self, config: ImageStorageModuleSchema) -> None:
+         super().__init__(config)
+         try:
+             self.validated_config = config
+             logger.info("ImageStorageStage configuration validated successfully.")
+         except Exception:
+             logger.exception("Error validating image storage config")
+             raise
+
+     @traceable("image_storage")
+     @filter_by_task(required_tasks=["store"])
+     @nv_ingest_node_failure_try_except(annotation_id="image_storage", raise_on_failure=False)
+     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+         """
+         Process the control message by storing images or structured content.
+
+         Parameters
+         ----------
+         control_message : IngestControlMessage
+             The incoming message containing the DataFrame payload.
+
+         Returns
+         -------
+         IngestControlMessage
+             The updated message with storage URLs and trace info added.
+         """
+         logger.info("ImageStorageStage.on_data: Starting storage operation.")
+
+         # Extract DataFrame payload.
+         df_payload = control_message.payload()
+         logger.debug("ImageStorageStage: Extracted payload with %d rows.", len(df_payload))
+
+         # Remove the "store" task to obtain task-specific configuration.
+         task_config = remove_task_by_type(control_message, "store")
+         # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))
+
+         store_structured: bool = task_config.get("structured", True)
+         store_unstructured: bool = task_config.get("images", False)
+
+         content_types: Dict[Any, Any] = {}
+         if store_structured:
+             content_types[ContentTypeEnum.STRUCTURED] = store_structured
+
+         if store_unstructured:
+             content_types[ContentTypeEnum.IMAGE] = store_unstructured
+
+         params: Dict[str, Any] = task_config.get("params", {})
+         params["content_types"] = content_types
+
+         logger.debug(f"Processing storage task with parameters: {params}")
+
+         # Store images or structured content.
+         df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
+             df_storage_ledger=df_payload,
+             task_config=params,
+             storage_config={},
+             execution_trace_log=None,
+         )
+
+         logger.info("Image storage operation completed. Updated payload has %d rows.", len(df_storage_ledger))
+
+         # Update the control message payload.
+         control_message.payload(df_storage_ledger)
+
+         return control_message
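
As a reference for what this stage consumes, here is an editorial sketch (not part of the diff) of a "store" task entry; the task shape follows _process_message in message_broker_task_source.py above, and all values are illustrative.

    import uuid

    # A "store" task as it would appear in a job's "tasks" list; on_data reads
    # "structured", "images", and "params" from the task's properties.
    store_task = {
        "id": str(uuid.uuid4()),
        "type": "store",
        "task_properties": {
            "structured": True,  # store structured content (ContentTypeEnum.STRUCTURED)
            "images": True,      # store extracted images (ContentTypeEnum.IMAGE)
            "params": {},        # forwarded to store_images_to_minio_internal
        },
    }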
nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py
@@ -0,0 +1,81 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+
+ import ray
+
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+ from nv_ingest.framework.util.flow_control import filter_by_task
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
+ from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
+ from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings_internal
+ from nv_ingest_api.util.exception_handlers.decorators import (
+     nv_ingest_node_failure_try_except,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @ray.remote
+ class EmbeddingStorageStage(RayActorStage):
+     """
+     A Ray actor stage that stores text embeddings in MinIO.
+
+     It expects an IngestControlMessage containing a DataFrame with embedding data. It then:
+       1. Removes the "store_embedding" task from the message.
+       2. Calls the embedding storage logic (via store_text_embeddings_internal) using a validated configuration.
+       3. Updates the message payload with the stored embeddings DataFrame.
+     """
+
+     def __init__(self, config: EmbeddingStorageSchema) -> None:
+         super().__init__(config)
+         try:
+             self.validated_config = config
+             logger.info("EmbeddingStorageStage configuration validated successfully.")
+         except Exception as e:
+             logger.exception(f"Error validating Embedding Storage config: {e}")
+             raise
+
+     @traceable("embedding_storage")
+     @filter_by_task(required_tasks=["store_embedding"])
+     @nv_ingest_node_failure_try_except(annotation_id="embedding_storage", raise_on_failure=False)
+     def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+         """
+         Process the control message by storing embeddings.
+
+         Parameters
+         ----------
+         control_message : IngestControlMessage
+             The message containing a DataFrame payload with embedding data.
+
+         Returns
+         -------
+         IngestControlMessage
+             The updated message with embeddings stored in MinIO.
+         """
+         logger.info("EmbeddingStorageStage.on_data: Starting embedding storage process.")
+
+         # Extract the DataFrame payload.
+         df_ledger = control_message.payload()
+         logger.debug("Extracted payload with %d rows.", len(df_ledger))
+
+         # Remove the "store_embedding" task from the message to obtain task-specific configuration.
+         task_config = remove_task_by_type(control_message, "store_embedding")
+         logger.debug("Extracted task config: %s", task_config)
+
+         # Perform embedding storage.
+         new_df = store_text_embeddings_internal(
+             df_store_ledger=df_ledger,
+             task_config=task_config,
+             store_config=self.validated_config,
+             execution_trace_log=None,
+         )
+         logger.info("Embedding storage completed. Resulting DataFrame has %d rows.", len(new_df))
+
+         # Update the message payload with the stored embeddings DataFrame.
+         control_message.payload(new_df)
+
+         return control_message
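
A minimal editorial sketch (not part of the diff) of driving this stage as a Ray actor. It assumes a local Ray runtime, that EmbeddingStorageSchema validates with defaults, and a prepared IngestControlMessage msg carrying embedding rows plus a queued "store_embedding" task.

    import ray

    ray.init(ignore_reinit_error=True)

    # EmbeddingStorageStage is the @ray.remote class defined above.
    stage = EmbeddingStorageStage.remote(EmbeddingStorageSchema())
    updated_msg = ray.get(stage.on_data.remote(msg))  # payload now holds the stored embeddings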
nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0