nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/framework/orchestration/execution/__init__.py +3 -0
- nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest/framework/orchestration/process/__init__.py +3 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
- nv_ingest/framework/orchestration/process/execution.py +497 -0
- nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
- nv_ingest/framework/orchestration/process/strategies.py +182 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
- nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest/pipeline/config/loaders.py +198 -0
- nv_ingest/pipeline/config/replica_resolver.py +227 -0
- nv_ingest/pipeline/default_pipeline_impl.py +517 -0
- nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/METADATA +1 -1
- {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/RECORD +54 -40
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/top_level.txt +0 -0
|
@@ -3,12 +3,13 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Dict, Any
|
|
6
|
+
from typing import Dict, Any, Optional
|
|
7
7
|
|
|
8
8
|
import ray
|
|
9
9
|
|
|
10
10
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
11
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
12
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
12
13
|
from nv_ingest_api.internal.mutate.filter import filter_images_internal
|
|
13
14
|
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
14
15
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
@@ -31,18 +32,19 @@ class ImageFilterStage(RayActorStage):
|
|
|
31
32
|
3. Updates the message payload with the filtered DataFrame.
|
|
32
33
|
"""
|
|
33
34
|
|
|
34
|
-
def __init__(self, config: ImageFilterSchema) -> None:
|
|
35
|
-
super().__init__(config)
|
|
35
|
+
def __init__(self, config: ImageFilterSchema, stage_name: Optional[str] = None) -> None:
|
|
36
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
37
|
try:
|
|
37
38
|
self.validated_config = config
|
|
38
|
-
logger.
|
|
39
|
+
logger.debug("ImageFilterStage configuration validated successfully.")
|
|
39
40
|
except Exception as e:
|
|
40
41
|
logger.exception(f"Error validating Image Filter config: {e}")
|
|
41
42
|
raise
|
|
42
43
|
|
|
43
|
-
@
|
|
44
|
+
@nv_ingest_node_failure_try_except()
|
|
45
|
+
@traceable()
|
|
46
|
+
@udf_intercept_hook()
|
|
44
47
|
@filter_by_task(required_tasks=["filter"])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="image_filter", raise_on_failure=False)
|
|
46
48
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
49
|
"""
|
|
48
50
|
Process the control message by filtering images.
|
|
@@ -57,7 +59,7 @@ class ImageFilterStage(RayActorStage):
|
|
|
57
59
|
IngestControlMessage
|
|
58
60
|
The updated message with filtered images in the payload.
|
|
59
61
|
"""
|
|
60
|
-
logger.
|
|
62
|
+
logger.debug("ImageFilterStage.on_data: Starting image filtering process.")
|
|
61
63
|
|
|
62
64
|
# Extract the DataFrame payload.
|
|
63
65
|
df_ledger = control_message.payload()
|
|
@@ -76,7 +78,7 @@ class ImageFilterStage(RayActorStage):
|
|
|
76
78
|
mutate_config=self.validated_config,
|
|
77
79
|
execution_trace_log=None,
|
|
78
80
|
)
|
|
79
|
-
logger.
|
|
81
|
+
logger.debug("Image filtering completed. Resulting DataFrame has %d rows.", len(new_df))
|
|
80
82
|
|
|
81
83
|
# Update the message payload with the filtered DataFrame.
|
|
82
84
|
control_message.payload(new_df)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
-
from typing import Any, Dict
|
|
5
|
+
from typing import Any, Dict, Optional
|
|
6
6
|
import ray
|
|
7
7
|
|
|
8
8
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_sink_stage_base import RayActorSinkStage
|
|
@@ -12,13 +12,13 @@ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_fail
|
|
|
12
12
|
|
|
13
13
|
@ray.remote
|
|
14
14
|
class DefaultDrainSink(RayActorSinkStage):
|
|
15
|
-
def __init__(self, config: Any) -> None:
|
|
16
|
-
super().__init__(config, log_to_stdout=False)
|
|
15
|
+
def __init__(self, config: Any, stage_name: Optional[str] = None) -> None:
|
|
16
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
17
17
|
|
|
18
18
|
self._last_sunk_count = 0
|
|
19
19
|
self._sunk_count = 0
|
|
20
20
|
|
|
21
|
-
@nv_ingest_node_failure_try_except(
|
|
21
|
+
@nv_ingest_node_failure_try_except()
|
|
22
22
|
def on_data(self, message: IngestControlMessage) -> IngestControlMessage:
|
|
23
23
|
self._sunk_count += 1
|
|
24
24
|
|
|
@@ -14,6 +14,8 @@ from nv_ingest_api.internal.primitives.tracing.logging import annotate_cm
|
|
|
14
14
|
from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
|
|
15
15
|
from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
|
|
16
16
|
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
|
+
|
|
17
19
|
logger = logging.getLogger(__name__)
|
|
18
20
|
|
|
19
21
|
|
|
@@ -75,8 +77,8 @@ class MessageBrokerTaskSinkConfig(BaseModel):
|
|
|
75
77
|
|
|
76
78
|
@ray.remote
|
|
77
79
|
class MessageBrokerTaskSinkStage(RayActorStage):
|
|
78
|
-
def __init__(self, config: MessageBrokerTaskSinkConfig) -> None:
|
|
79
|
-
super().__init__(config, log_to_stdout=False)
|
|
80
|
+
def __init__(self, config: MessageBrokerTaskSinkConfig, stage_name: Optional[str] = None) -> None:
|
|
81
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
80
82
|
|
|
81
83
|
self.config: MessageBrokerTaskSinkConfig
|
|
82
84
|
|
|
@@ -224,6 +226,7 @@ class MessageBrokerTaskSinkStage(RayActorStage):
|
|
|
224
226
|
|
|
225
227
|
# --- Public API Methods for message broker sink ---
|
|
226
228
|
|
|
229
|
+
@udf_intercept_hook()
|
|
227
230
|
def on_data(self, control_message: Any) -> Any:
|
|
228
231
|
"""
|
|
229
232
|
Processes the control message and pushes the resulting JSON payloads to the broker.
|
|
@@ -3,9 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
import multiprocessing
|
|
7
6
|
import uuid
|
|
8
|
-
import socket
|
|
9
7
|
from typing import Optional, Literal, Dict, Any, Union
|
|
10
8
|
|
|
11
9
|
import ray
|
|
@@ -13,6 +11,7 @@ import json
|
|
|
13
11
|
import copy
|
|
14
12
|
import threading
|
|
15
13
|
import time
|
|
14
|
+
import random
|
|
16
15
|
from datetime import datetime
|
|
17
16
|
|
|
18
17
|
import pandas as pd
|
|
@@ -102,11 +101,11 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
102
101
|
"""
|
|
103
102
|
|
|
104
103
|
# Use the updated config type hint
|
|
105
|
-
def __init__(self, config: MessageBrokerTaskSourceConfig) -> None:
|
|
106
|
-
super().__init__(config, log_to_stdout=False)
|
|
107
|
-
self.config: MessageBrokerTaskSourceConfig # Add type hint for self.config
|
|
104
|
+
def __init__(self, config: MessageBrokerTaskSourceConfig, stage_name: Optional[str] = None) -> None:
|
|
105
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
106
|
+
self.config: MessageBrokerTaskSourceConfig # Add a type hint for self.config
|
|
108
107
|
self._logger.debug(
|
|
109
|
-
"Initializing MessageBrokerTaskSourceStage with config: %s", config.
|
|
108
|
+
"Initializing MessageBrokerTaskSourceStage with config: %s", config.model_dump()
|
|
110
109
|
) # Log validated config
|
|
111
110
|
|
|
112
111
|
# Access validated configuration directly via self.config
|
|
@@ -126,13 +125,18 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
126
125
|
self._pause_event = threading.Event()
|
|
127
126
|
self._pause_event.set() # Initially not paused
|
|
128
127
|
|
|
128
|
+
# Backoff state for graceful retries when broker is unavailable
|
|
129
|
+
self._fetch_failure_count: int = 0
|
|
130
|
+
self._current_backoff_sleep: float = 0.0
|
|
131
|
+
self._last_backoff_log_time: float = 0.0
|
|
132
|
+
|
|
129
133
|
self._logger.debug("MessageBrokerTaskSourceStage initialized. Task queue: %s", self.task_queue)
|
|
130
134
|
|
|
131
135
|
# --- Private helper methods ---
|
|
132
136
|
def _create_client(self):
|
|
133
137
|
# Access broker config via self.config.broker_client
|
|
134
138
|
broker_config = self.config.broker_client
|
|
135
|
-
self._logger.
|
|
139
|
+
self._logger.debug("Creating client of type: %s", broker_config.client_type)
|
|
136
140
|
|
|
137
141
|
if broker_config.client_type == "redis":
|
|
138
142
|
client = RedisClient(
|
|
@@ -265,6 +269,9 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
265
269
|
job = self.client.fetch_message(self.task_queue, timeout)
|
|
266
270
|
if job is None:
|
|
267
271
|
self._logger.debug("No message received from '%s'", self.task_queue)
|
|
272
|
+
# Do not treat normal empty polls as failures
|
|
273
|
+
self._fetch_failure_count = 0
|
|
274
|
+
self._current_backoff_sleep = 0.0
|
|
268
275
|
return None
|
|
269
276
|
self._logger.debug("Received message type: %s", type(job))
|
|
270
277
|
if isinstance(job, BaseModel):
|
|
@@ -277,12 +284,46 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
277
284
|
return None
|
|
278
285
|
job = json.loads(job.response)
|
|
279
286
|
self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
|
|
287
|
+
# Success: reset backoff state
|
|
288
|
+
self._fetch_failure_count = 0
|
|
289
|
+
self._current_backoff_sleep = 0.0
|
|
280
290
|
return job
|
|
281
291
|
except TimeoutError:
|
|
282
292
|
self._logger.debug("Timeout waiting for message")
|
|
293
|
+
# Timeout is not a connectivity failure; do not escalate backoff
|
|
283
294
|
return None
|
|
284
295
|
except Exception as err:
|
|
285
|
-
|
|
296
|
+
# Connectivity or other fetch issue: apply graceful backoff and avoid stacktrace spam
|
|
297
|
+
self._fetch_failure_count += 1
|
|
298
|
+
|
|
299
|
+
# Compute exponential backoff with jitter, capped by configured max_backoff
|
|
300
|
+
try:
|
|
301
|
+
max_backoff = getattr(self.config.broker_client, "max_backoff", 5.0)
|
|
302
|
+
except Exception:
|
|
303
|
+
max_backoff = 5.0
|
|
304
|
+
# Start from 0.5s, double each failure
|
|
305
|
+
base = 0.5
|
|
306
|
+
backoff_no_jitter = min(max_backoff, base * (2 ** (self._fetch_failure_count - 1)))
|
|
307
|
+
jitter = random.uniform(0, backoff_no_jitter * 0.2)
|
|
308
|
+
self._current_backoff_sleep = backoff_no_jitter + jitter
|
|
309
|
+
|
|
310
|
+
now = time.time()
|
|
311
|
+
# Throttle warning logs to at most once per 5 seconds to avoid spam
|
|
312
|
+
if now - self._last_backoff_log_time >= 5.0:
|
|
313
|
+
self._logger.warning(
|
|
314
|
+
"Broker fetch failed (%d consecutive failures). Backing off for %.2fs. Error: %s",
|
|
315
|
+
self._fetch_failure_count,
|
|
316
|
+
self._current_backoff_sleep,
|
|
317
|
+
err,
|
|
318
|
+
)
|
|
319
|
+
self._last_backoff_log_time = now
|
|
320
|
+
else:
|
|
321
|
+
self._logger.debug(
|
|
322
|
+
"Broker fetch failed (%d). Backoff %.2fs. Error: %s",
|
|
323
|
+
self._fetch_failure_count,
|
|
324
|
+
self._current_backoff_sleep,
|
|
325
|
+
err,
|
|
326
|
+
)
|
|
286
327
|
return None
|
|
287
328
|
|
|
288
329
|
def _read_input(self) -> any:
|
|
@@ -293,8 +334,17 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
293
334
|
self._logger.debug("read_input: calling _fetch_message()")
|
|
294
335
|
job = self._fetch_message(timeout=100)
|
|
295
336
|
if job is None:
|
|
296
|
-
|
|
297
|
-
|
|
337
|
+
# Sleep for either the configured poll interval or the current backoff, whichever is larger
|
|
338
|
+
sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
|
|
339
|
+
self._logger.debug(
|
|
340
|
+
"read_input: No job received; sleeping %.2fs (poll_interval=%.2fs, backoff=%.2fs)",
|
|
341
|
+
sleep_time,
|
|
342
|
+
self.config.poll_interval,
|
|
343
|
+
getattr(self, "_current_backoff_sleep", 0.0),
|
|
344
|
+
)
|
|
345
|
+
time.sleep(sleep_time)
|
|
346
|
+
# Reset one-shot backoff so that repeated failures recompute progressively
|
|
347
|
+
self._current_backoff_sleep = 0.0
|
|
298
348
|
|
|
299
349
|
return None
|
|
300
350
|
|
|
@@ -314,7 +364,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
314
364
|
This loop fetches messages from the broker and writes them to the output queue,
|
|
315
365
|
but blocks on the pause event when the stage is paused.
|
|
316
366
|
"""
|
|
317
|
-
self._logger.
|
|
367
|
+
self._logger.debug("Processing loop started")
|
|
318
368
|
iteration = 0
|
|
319
369
|
while self._running:
|
|
320
370
|
iteration += 1
|
|
@@ -381,25 +431,25 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
381
431
|
self._active_processing = False
|
|
382
432
|
self._shutdown_signal_complete = True
|
|
383
433
|
|
|
384
|
-
self._logger.
|
|
434
|
+
self._logger.debug("Processing loop ending")
|
|
385
435
|
|
|
386
436
|
@ray.method(num_returns=1)
|
|
387
437
|
def start(self) -> bool:
|
|
388
438
|
if self._running:
|
|
389
|
-
self._logger.
|
|
439
|
+
self._logger.warning("Start called but stage is already running.")
|
|
390
440
|
return False
|
|
391
441
|
self._running = True
|
|
392
442
|
self.start_time = time.time()
|
|
393
443
|
self._message_count = 0
|
|
394
|
-
self._logger.
|
|
444
|
+
self._logger.debug("Starting processing loop thread.")
|
|
395
445
|
threading.Thread(target=self._processing_loop, daemon=True).start()
|
|
396
|
-
self._logger.
|
|
446
|
+
self._logger.debug("MessageBrokerTaskSourceStage started.")
|
|
397
447
|
return True
|
|
398
448
|
|
|
399
449
|
@ray.method(num_returns=1)
|
|
400
450
|
def stop(self) -> bool:
|
|
401
451
|
self._running = False
|
|
402
|
-
self._logger.
|
|
452
|
+
self._logger.debug("Stop called on MessageBrokerTaskSourceStage")
|
|
403
453
|
return True
|
|
404
454
|
|
|
405
455
|
@ray.method(num_returns=1)
|
|
@@ -425,7 +475,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
425
475
|
@ray.method(num_returns=1)
|
|
426
476
|
def set_output_queue(self, queue_handle: any) -> bool:
|
|
427
477
|
self.output_queue = queue_handle
|
|
428
|
-
self._logger.
|
|
478
|
+
self._logger.debug("Output queue set: %s", queue_handle)
|
|
429
479
|
return True
|
|
430
480
|
|
|
431
481
|
@ray.method(num_returns=1)
|
|
@@ -440,7 +490,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
440
490
|
True after the stage is paused.
|
|
441
491
|
"""
|
|
442
492
|
self._pause_event.clear()
|
|
443
|
-
self._logger.
|
|
493
|
+
self._logger.debug("Stage paused.")
|
|
444
494
|
|
|
445
495
|
return True
|
|
446
496
|
|
|
@@ -456,7 +506,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
456
506
|
True after the stage is resumed.
|
|
457
507
|
"""
|
|
458
508
|
self._pause_event.set()
|
|
459
|
-
self._logger.
|
|
509
|
+
self._logger.debug("Stage resumed.")
|
|
460
510
|
return True
|
|
461
511
|
|
|
462
512
|
@ray.method(num_returns=1)
|
|
@@ -466,49 +516,9 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
466
516
|
This method pauses the stage, waits for any current processing to finish,
|
|
467
517
|
replaces the output queue, and then resumes the stage.
|
|
468
518
|
"""
|
|
469
|
-
self._logger.
|
|
519
|
+
self._logger.debug("Swapping output queue: pausing stage first.")
|
|
470
520
|
self.pause()
|
|
471
521
|
self.set_output_queue(new_queue)
|
|
472
|
-
self._logger.
|
|
522
|
+
self._logger.debug("Output queue swapped. Resuming stage.")
|
|
473
523
|
self.resume()
|
|
474
524
|
return True
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
|
|
478
|
-
"""
|
|
479
|
-
Starts a SimpleMessageBroker server in a separate process.
|
|
480
|
-
|
|
481
|
-
Parameters
|
|
482
|
-
----------
|
|
483
|
-
broker_client : dict
|
|
484
|
-
Broker configuration. Expected keys include:
|
|
485
|
-
- "port": the port to bind the server to,
|
|
486
|
-
- "broker_params": optionally including "max_queue_size",
|
|
487
|
-
- and any other parameters required by SimpleMessageBroker.
|
|
488
|
-
|
|
489
|
-
Returns
|
|
490
|
-
-------
|
|
491
|
-
multiprocessing.Process
|
|
492
|
-
The process running the SimpleMessageBroker server.
|
|
493
|
-
"""
|
|
494
|
-
|
|
495
|
-
def broker_server():
|
|
496
|
-
from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
|
|
497
|
-
|
|
498
|
-
# Use max_queue_size from broker_params or default to 10000.
|
|
499
|
-
broker_params = broker_client.get("broker_params", {})
|
|
500
|
-
max_queue_size = broker_params.get("max_queue_size", 10000)
|
|
501
|
-
server_host = broker_client.get("host", "0.0.0.0")
|
|
502
|
-
server_port = broker_client.get("port", 7671)
|
|
503
|
-
# Optionally, set socket options here for reuse.
|
|
504
|
-
server = SimpleMessageBroker(server_host, server_port, max_queue_size)
|
|
505
|
-
# Enable address reuse on the server socket.
|
|
506
|
-
server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
|
507
|
-
server.serve_forever()
|
|
508
|
-
|
|
509
|
-
p = multiprocessing.Process(target=broker_server)
|
|
510
|
-
p.daemon = False
|
|
511
|
-
p.start()
|
|
512
|
-
logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")
|
|
513
|
-
|
|
514
|
-
return p
|
|
@@ -3,13 +3,14 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Dict, Any
|
|
6
|
+
from typing import Dict, Any, Optional
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
import ray
|
|
10
10
|
|
|
11
11
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
12
12
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
13
14
|
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
14
15
|
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
15
16
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
@@ -31,8 +32,8 @@ class ImageStorageStage(RayActorStage):
|
|
|
31
32
|
payload and updates the control message accordingly.
|
|
32
33
|
"""
|
|
33
34
|
|
|
34
|
-
def __init__(self, config: ImageStorageModuleSchema) -> None:
|
|
35
|
-
super().__init__(config)
|
|
35
|
+
def __init__(self, config: ImageStorageModuleSchema, stage_name: Optional[str] = None) -> None:
|
|
36
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
37
|
try:
|
|
37
38
|
self.validated_config = config
|
|
38
39
|
logger.info("ImageStorageStage configuration validated successfully.")
|
|
@@ -40,9 +41,10 @@ class ImageStorageStage(RayActorStage):
|
|
|
40
41
|
logger.exception("Error validating image storage config")
|
|
41
42
|
raise e
|
|
42
43
|
|
|
43
|
-
@
|
|
44
|
+
@nv_ingest_node_failure_try_except()
|
|
45
|
+
@traceable()
|
|
46
|
+
@udf_intercept_hook()
|
|
44
47
|
@filter_by_task(required_tasks=["store"])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="image_storage", raise_on_failure=False)
|
|
46
48
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
49
|
"""
|
|
48
50
|
Process the control message by storing images or structured content.
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
from typing import Optional
|
|
6
7
|
|
|
7
8
|
import ray
|
|
8
9
|
|
|
@@ -16,6 +17,8 @@ from nv_ingest_api.util.exception_handlers.decorators import (
|
|
|
16
17
|
nv_ingest_node_failure_try_except,
|
|
17
18
|
)
|
|
18
19
|
|
|
20
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
21
|
+
|
|
19
22
|
logger = logging.getLogger(__name__)
|
|
20
23
|
|
|
21
24
|
|
|
@@ -30,8 +33,8 @@ class EmbeddingStorageStage(RayActorStage):
|
|
|
30
33
|
3. Updates the message payload with the stored embeddings DataFrame.
|
|
31
34
|
"""
|
|
32
35
|
|
|
33
|
-
def __init__(self, config: EmbeddingStorageSchema) -> None:
|
|
34
|
-
super().__init__(config)
|
|
36
|
+
def __init__(self, config: EmbeddingStorageSchema, stage_name: Optional[str] = None) -> None:
|
|
37
|
+
super().__init__(config, stage_name=stage_name)
|
|
35
38
|
try:
|
|
36
39
|
self.validated_config = config
|
|
37
40
|
logger.info("EmbeddingStorageStage configuration validated successfully.")
|
|
@@ -39,9 +42,10 @@ class EmbeddingStorageStage(RayActorStage):
|
|
|
39
42
|
logger.exception(f"Error validating Embedding Storage config: {e}")
|
|
40
43
|
raise
|
|
41
44
|
|
|
42
|
-
@
|
|
45
|
+
@nv_ingest_node_failure_try_except()
|
|
46
|
+
@traceable()
|
|
47
|
+
@udf_intercept_hook()
|
|
43
48
|
@filter_by_task(required_tasks=["store_embedding"])
|
|
44
|
-
@nv_ingest_node_failure_try_except(annotation_id="embedding_storage", raise_on_failure=False)
|
|
45
49
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
46
50
|
"""
|
|
47
51
|
Process the control message by storing embeddings.
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Optional
|
|
7
7
|
from pydantic import BaseModel
|
|
8
8
|
import ray
|
|
9
9
|
|
|
@@ -14,6 +14,8 @@ from nv_ingest.framework.util.telemetry.global_stats import GlobalStats
|
|
|
14
14
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
15
15
|
nv_ingest_node_failure_try_except,
|
|
16
16
|
)
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
17
19
|
|
|
18
20
|
# Import the JobCounter schema and global stats singleton.
|
|
19
21
|
|
|
@@ -30,15 +32,17 @@ class JobCounterStage(RayActorStage):
|
|
|
30
32
|
statistic each time it processes a message.
|
|
31
33
|
"""
|
|
32
34
|
|
|
33
|
-
def __init__(self, config: BaseModel) -> None:
|
|
35
|
+
def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
|
|
34
36
|
# Ensure base attributes (e.g. self._running) are initialized.
|
|
35
|
-
super().__init__(config)
|
|
37
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
38
|
# The validated config should be a JobCounterSchema instance.
|
|
37
39
|
self.validated_config: JobCounterSchema = config
|
|
38
40
|
# Obtain the global stats' singleton.
|
|
39
41
|
self.stats = GlobalStats.get_instance()
|
|
40
42
|
|
|
41
|
-
@nv_ingest_node_failure_try_except(
|
|
43
|
+
@nv_ingest_node_failure_try_except()
|
|
44
|
+
@traceable()
|
|
45
|
+
@udf_intercept_hook()
|
|
42
46
|
async def on_data(self, message: Any) -> Any:
|
|
43
47
|
"""
|
|
44
48
|
Process an incoming IngestControlMessage by counting jobs.
|
|
@@ -24,6 +24,7 @@ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_fail
|
|
|
24
24
|
|
|
25
25
|
from nv_ingest_api.internal.primitives.tracing.logging import TaskResultStatus
|
|
26
26
|
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
|
|
27
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
@ray.remote
|
|
@@ -35,8 +36,8 @@ class OpenTelemetryTracerStage(RayActorStage):
|
|
|
35
36
|
It creates spans for tasks and exports them to a configured OpenTelemetry endpoint.
|
|
36
37
|
"""
|
|
37
38
|
|
|
38
|
-
def __init__(self, config: OpenTelemetryTracerSchema) -> None:
|
|
39
|
-
super().__init__(config)
|
|
39
|
+
def __init__(self, config: OpenTelemetryTracerSchema, stage_name: Optional[str] = None) -> None:
|
|
40
|
+
super().__init__(config, stage_name=stage_name)
|
|
40
41
|
|
|
41
42
|
# self._logger.info(f"[Telemetry] Initializing OpenTelemetry tracer stage with config: {config}")
|
|
42
43
|
|
|
@@ -81,7 +82,7 @@ class OpenTelemetryTracerStage(RayActorStage):
|
|
|
81
82
|
parent_ctx = trace.set_span_in_context(NonRecordingSpan(span_context))
|
|
82
83
|
parent_span = self.tracer.start_span(str(job_id), context=parent_ctx, start_time=start_time)
|
|
83
84
|
|
|
84
|
-
event_count = create_span_with_timestamps(self.tracer, parent_span, message)
|
|
85
|
+
event_count = create_span_with_timestamps(self.tracer, parent_span, message, self._logger)
|
|
85
86
|
|
|
86
87
|
if message.has_metadata("cm_failed") and message.get_metadata("cm_failed"):
|
|
87
88
|
parent_span.set_status(Status(StatusCode.ERROR))
|
|
@@ -96,7 +97,8 @@ class OpenTelemetryTracerStage(RayActorStage):
|
|
|
96
97
|
|
|
97
98
|
self._logger.debug(f"[Telemetry] Exported spans for message {job_id} with {event_count} total events.")
|
|
98
99
|
|
|
99
|
-
@nv_ingest_node_failure_try_except(
|
|
100
|
+
@nv_ingest_node_failure_try_except()
|
|
101
|
+
@udf_intercept_hook()
|
|
100
102
|
def on_data(self, control_message: IngestControlMessage) -> Optional[Any]:
|
|
101
103
|
try:
|
|
102
104
|
do_trace_tagging = bool(control_message.get_metadata("config::add_trace_tagging"))
|
|
@@ -160,7 +162,7 @@ def extract_annotated_task_results(message):
|
|
|
160
162
|
return task_results
|
|
161
163
|
|
|
162
164
|
|
|
163
|
-
def create_span_with_timestamps(tracer, parent_span, message) -> int:
|
|
165
|
+
def create_span_with_timestamps(tracer, parent_span, message, logger) -> int:
|
|
164
166
|
timestamps = extract_timestamps_from_message(message)
|
|
165
167
|
task_results = extract_annotated_task_results(message)
|
|
166
168
|
|
|
@@ -175,8 +177,16 @@ def create_span_with_timestamps(tracer, parent_span, message) -> int:
|
|
|
175
177
|
if not subtask:
|
|
176
178
|
span = tracer.start_span(main_task, context=child_ctx, start_time=ts_entry)
|
|
177
179
|
else:
|
|
178
|
-
|
|
179
|
-
|
|
180
|
+
# Check if parent context exists, otherwise create standalone span with warning
|
|
181
|
+
if main_task in ctx_store:
|
|
182
|
+
subtask_ctx = trace.set_span_in_context(ctx_store[main_task][0])
|
|
183
|
+
span = tracer.start_span(subtask, context=subtask_ctx, start_time=ts_entry)
|
|
184
|
+
else:
|
|
185
|
+
logger.warning(
|
|
186
|
+
f"Missing parent context for subtask '{subtask}'"
|
|
187
|
+
f" (expected parent: '{main_task}'). Creating standalone span."
|
|
188
|
+
)
|
|
189
|
+
span = tracer.start_span(f"{main_task}::{subtask}", context=child_ctx, start_time=ts_entry)
|
|
180
190
|
|
|
181
191
|
span.add_event("entry", timestamp=ts_entry)
|
|
182
192
|
span.add_event("exit", timestamp=ts_exit)
|
|
@@ -4,12 +4,13 @@
|
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
6
|
import pprint
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, Optional
|
|
8
8
|
|
|
9
9
|
import ray
|
|
10
10
|
|
|
11
11
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
12
12
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
13
14
|
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
14
15
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
15
16
|
from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
|
|
@@ -31,8 +32,8 @@ class ImageCaptionTransformStage(RayActorStage):
|
|
|
31
32
|
are stored in the control message.
|
|
32
33
|
"""
|
|
33
34
|
|
|
34
|
-
def __init__(self, config: ImageCaptionExtractionSchema) -> None:
|
|
35
|
-
super().__init__(config)
|
|
35
|
+
def __init__(self, config: ImageCaptionExtractionSchema, stage_name: Optional[str] = None) -> None:
|
|
36
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
37
|
try:
|
|
37
38
|
self.validated_config = config
|
|
38
39
|
logger.info("ImageCaptionTransformStage configuration validated.")
|
|
@@ -40,9 +41,10 @@ class ImageCaptionTransformStage(RayActorStage):
|
|
|
40
41
|
logger.exception("Error validating caption extraction config")
|
|
41
42
|
raise e
|
|
42
43
|
|
|
43
|
-
@
|
|
44
|
+
@nv_ingest_node_failure_try_except()
|
|
45
|
+
@traceable()
|
|
46
|
+
@udf_intercept_hook()
|
|
44
47
|
@filter_by_task(required_tasks=["caption"])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="image_captioning", raise_on_failure=False)
|
|
46
48
|
def on_data(self, control_message: Any) -> Any:
|
|
47
49
|
"""
|
|
48
50
|
Process the control message by extracting image captions.
|
|
@@ -2,12 +2,10 @@
|
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
-
import logging
|
|
6
5
|
import pprint
|
|
7
|
-
from typing import
|
|
6
|
+
from typing import Optional
|
|
8
7
|
import ray
|
|
9
8
|
|
|
10
|
-
# Assume these imports come from your project:
|
|
11
9
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
12
10
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
11
|
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
|
|
@@ -18,7 +16,7 @@ from nv_ingest_api.util.exception_handlers.decorators import (
|
|
|
18
16
|
nv_ingest_node_failure_try_except,
|
|
19
17
|
)
|
|
20
18
|
|
|
21
|
-
|
|
19
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
22
20
|
|
|
23
21
|
|
|
24
22
|
@ray.remote
|
|
@@ -31,19 +29,20 @@ class TextEmbeddingTransformStage(RayActorStage):
|
|
|
31
29
|
trace or extraction metadata is added.
|
|
32
30
|
"""
|
|
33
31
|
|
|
34
|
-
def __init__(self, config: TextEmbeddingSchema) -> None:
|
|
35
|
-
super().__init__(config,
|
|
32
|
+
def __init__(self, config: TextEmbeddingSchema, stage_name: Optional[str] = None) -> None:
|
|
33
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
34
|
try:
|
|
37
35
|
self.validated_config = config
|
|
38
|
-
|
|
36
|
+
self._logger.info("TextEmbeddingTransformStage configuration validated successfully.")
|
|
39
37
|
except Exception as e:
|
|
40
|
-
|
|
41
|
-
raise
|
|
38
|
+
self._logger.exception(f"Error validating text embedding config: {e}")
|
|
39
|
+
raise
|
|
42
40
|
|
|
43
|
-
@
|
|
41
|
+
@nv_ingest_node_failure_try_except()
|
|
42
|
+
@traceable()
|
|
43
|
+
@udf_intercept_hook()
|
|
44
44
|
@filter_by_task(required_tasks=["embed"])
|
|
45
|
-
|
|
46
|
-
def on_data(self, control_message: IngestControlMessage) -> Any:
|
|
45
|
+
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
46
|
"""
|
|
48
47
|
Process the control message by generating text embeddings.
|
|
49
48
|
|
|
@@ -59,11 +58,11 @@ class TextEmbeddingTransformStage(RayActorStage):
|
|
|
59
58
|
"""
|
|
60
59
|
# Get the DataFrame payload.
|
|
61
60
|
df_payload = control_message.payload()
|
|
62
|
-
|
|
61
|
+
self._logger.debug("TextEmbeddingTransformStage: Extracted payload with %d rows.", len(df_payload))
|
|
63
62
|
|
|
64
63
|
# Remove the "embed" task to obtain task-specific configuration.
|
|
65
64
|
task_config = remove_task_by_type(control_message, "embed")
|
|
66
|
-
|
|
65
|
+
self._logger.debug("TextEmbeddingTransformStage: Task configuration extracted: %s", pprint.pformat(task_config))
|
|
67
66
|
|
|
68
67
|
# Call the text embedding extraction function.
|
|
69
68
|
new_df, execution_trace_log = transform_create_text_embeddings_internal(
|