nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (56)
  1. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  2. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  3. nv_ingest/framework/orchestration/execution/options.py +112 -0
  4. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  5. nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
  6. nv_ingest/framework/orchestration/process/execution.py +497 -0
  7. nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
  8. nv_ingest/framework/orchestration/process/strategies.py +182 -0
  9. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
  10. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  11. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
  12. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  13. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
  14. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
  15. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
  16. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
  17. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
  18. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  19. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
  20. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  21. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
  22. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
  23. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  24. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  25. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
  26. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
  27. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
  28. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  29. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  30. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
  31. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
  32. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
  33. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  34. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  35. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
  36. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
  37. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
  38. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  39. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  40. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  41. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  42. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  43. nv_ingest/pipeline/__init__.py +3 -0
  44. nv_ingest/pipeline/config/__init__.py +3 -0
  45. nv_ingest/pipeline/config/loaders.py +198 -0
  46. nv_ingest/pipeline/config/replica_resolver.py +227 -0
  47. nv_ingest/pipeline/default_pipeline_impl.py +517 -0
  48. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  49. nv_ingest/pipeline/pipeline_schema.py +398 -0
  50. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/METADATA +1 -1
  51. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/RECORD +54 -40
  52. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  53. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  54. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/WHEEL +0 -0
  55. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/licenses/LICENSE +0 -0
  56. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/top_level.txt +0 -0
@@ -3,12 +3,13 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
- from typing import Dict, Any
6
+ from typing import Dict, Any, Optional
7
7
 
8
8
  import ray
9
9
 
10
10
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
11
  from nv_ingest.framework.util.flow_control import filter_by_task
12
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
12
13
  from nv_ingest_api.internal.mutate.filter import filter_images_internal
13
14
  from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
14
15
  from nv_ingest_api.internal.primitives.tracing.tagging import traceable
@@ -31,18 +32,19 @@ class ImageFilterStage(RayActorStage):
31
32
  3. Updates the message payload with the filtered DataFrame.
32
33
  """
33
34
 
34
- def __init__(self, config: ImageFilterSchema) -> None:
35
- super().__init__(config)
35
+ def __init__(self, config: ImageFilterSchema, stage_name: Optional[str] = None) -> None:
36
+ super().__init__(config, stage_name=stage_name)
36
37
  try:
37
38
  self.validated_config = config
38
- logger.info("ImageFilterStage configuration validated successfully.")
39
+ logger.debug("ImageFilterStage configuration validated successfully.")
39
40
  except Exception as e:
40
41
  logger.exception(f"Error validating Image Filter config: {e}")
41
42
  raise
42
43
 
43
- @traceable("image_filter")
44
+ @nv_ingest_node_failure_try_except()
45
+ @traceable()
46
+ @udf_intercept_hook()
44
47
  @filter_by_task(required_tasks=["filter"])
45
- @nv_ingest_node_failure_try_except(annotation_id="image_filter", raise_on_failure=False)
46
48
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
47
49
  """
48
50
  Process the control message by filtering images.
@@ -57,7 +59,7 @@ class ImageFilterStage(RayActorStage):
57
59
  IngestControlMessage
58
60
  The updated message with filtered images in the payload.
59
61
  """
60
- logger.info("ImageFilterStage.on_data: Starting image filtering process.")
62
+ logger.debug("ImageFilterStage.on_data: Starting image filtering process.")
61
63
 
62
64
  # Extract the DataFrame payload.
63
65
  df_ledger = control_message.payload()
@@ -76,7 +78,7 @@ class ImageFilterStage(RayActorStage):
76
78
  mutate_config=self.validated_config,
77
79
  execution_trace_log=None,
78
80
  )
79
- logger.info("Image filtering completed. Resulting DataFrame has %d rows.", len(new_df))
81
+ logger.debug("Image filtering completed. Resulting DataFrame has %d rows.", len(new_df))
80
82
 
81
83
  # Update the message payload with the filtered DataFrame.
82
84
  control_message.payload(new_df)
@@ -2,7 +2,7 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
- from typing import Any, Dict
5
+ from typing import Any, Dict, Optional
6
6
  import ray
7
7
 
8
8
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_sink_stage_base import RayActorSinkStage
@@ -12,13 +12,13 @@ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_fail
12
12
 
13
13
  @ray.remote
14
14
  class DefaultDrainSink(RayActorSinkStage):
15
- def __init__(self, config: Any) -> None:
16
- super().__init__(config, log_to_stdout=False)
15
+ def __init__(self, config: Any, stage_name: Optional[str] = None) -> None:
16
+ super().__init__(config, log_to_stdout=False, stage_name=stage_name)
17
17
 
18
18
  self._last_sunk_count = 0
19
19
  self._sunk_count = 0
20
20
 
21
- @nv_ingest_node_failure_try_except(annotation_id="drain_sink", raise_on_failure=False)
21
+ @nv_ingest_node_failure_try_except()
22
22
  def on_data(self, message: IngestControlMessage) -> IngestControlMessage:
23
23
  self._sunk_count += 1
24
24
 
@@ -14,6 +14,8 @@ from nv_ingest_api.internal.primitives.tracing.logging import annotate_cm
14
14
  from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
15
15
  from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
16
16
 
17
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
18
+
17
19
  logger = logging.getLogger(__name__)
18
20
 
19
21
 
@@ -75,8 +77,8 @@ class MessageBrokerTaskSinkConfig(BaseModel):
75
77
 
76
78
  @ray.remote
77
79
  class MessageBrokerTaskSinkStage(RayActorStage):
78
- def __init__(self, config: MessageBrokerTaskSinkConfig) -> None:
79
- super().__init__(config, log_to_stdout=False)
80
+ def __init__(self, config: MessageBrokerTaskSinkConfig, stage_name: Optional[str] = None) -> None:
81
+ super().__init__(config, log_to_stdout=False, stage_name=stage_name)
80
82
 
81
83
  self.config: MessageBrokerTaskSinkConfig
82
84
 
@@ -224,6 +226,7 @@ class MessageBrokerTaskSinkStage(RayActorStage):
224
226
 
225
227
  # --- Public API Methods for message broker sink ---
226
228
 
229
+ @udf_intercept_hook()
227
230
  def on_data(self, control_message: Any) -> Any:
228
231
  """
229
232
  Processes the control message and pushes the resulting JSON payloads to the broker.
@@ -3,9 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
- import multiprocessing
7
6
  import uuid
8
- import socket
9
7
  from typing import Optional, Literal, Dict, Any, Union
10
8
 
11
9
  import ray
@@ -13,6 +11,7 @@ import json
13
11
  import copy
14
12
  import threading
15
13
  import time
14
+ import random
16
15
  from datetime import datetime
17
16
 
18
17
  import pandas as pd
@@ -102,11 +101,11 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
102
101
  """
103
102
 
104
103
  # Use the updated config type hint
105
- def __init__(self, config: MessageBrokerTaskSourceConfig) -> None:
106
- super().__init__(config, log_to_stdout=False)
107
- self.config: MessageBrokerTaskSourceConfig # Add type hint for self.config
104
+ def __init__(self, config: MessageBrokerTaskSourceConfig, stage_name: Optional[str] = None) -> None:
105
+ super().__init__(config, log_to_stdout=False, stage_name=stage_name)
106
+ self.config: MessageBrokerTaskSourceConfig # Add a type hint for self.config
108
107
  self._logger.debug(
109
- "Initializing MessageBrokerTaskSourceStage with config: %s", config.dict()
108
+ "Initializing MessageBrokerTaskSourceStage with config: %s", config.model_dump()
110
109
  ) # Log validated config
111
110
 
112
111
  # Access validated configuration directly via self.config
@@ -126,13 +125,18 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
126
125
  self._pause_event = threading.Event()
127
126
  self._pause_event.set() # Initially not paused
128
127
 
128
+ # Backoff state for graceful retries when broker is unavailable
129
+ self._fetch_failure_count: int = 0
130
+ self._current_backoff_sleep: float = 0.0
131
+ self._last_backoff_log_time: float = 0.0
132
+
129
133
  self._logger.debug("MessageBrokerTaskSourceStage initialized. Task queue: %s", self.task_queue)
130
134
 
131
135
  # --- Private helper methods ---
132
136
  def _create_client(self):
133
137
  # Access broker config via self.config.broker_client
134
138
  broker_config = self.config.broker_client
135
- self._logger.info("Creating client of type: %s", broker_config.client_type)
139
+ self._logger.debug("Creating client of type: %s", broker_config.client_type)
136
140
 
137
141
  if broker_config.client_type == "redis":
138
142
  client = RedisClient(
@@ -265,6 +269,9 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
265
269
  job = self.client.fetch_message(self.task_queue, timeout)
266
270
  if job is None:
267
271
  self._logger.debug("No message received from '%s'", self.task_queue)
272
+ # Do not treat normal empty polls as failures
273
+ self._fetch_failure_count = 0
274
+ self._current_backoff_sleep = 0.0
268
275
  return None
269
276
  self._logger.debug("Received message type: %s", type(job))
270
277
  if isinstance(job, BaseModel):
@@ -277,12 +284,46 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
277
284
  return None
278
285
  job = json.loads(job.response)
279
286
  self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
287
+ # Success: reset backoff state
288
+ self._fetch_failure_count = 0
289
+ self._current_backoff_sleep = 0.0
280
290
  return job
281
291
  except TimeoutError:
282
292
  self._logger.debug("Timeout waiting for message")
293
+ # Timeout is not a connectivity failure; do not escalate backoff
283
294
  return None
284
295
  except Exception as err:
285
- self._logger.exception("Error during message fetching: %s", err)
296
+ # Connectivity or other fetch issue: apply graceful backoff and avoid stacktrace spam
297
+ self._fetch_failure_count += 1
298
+
299
+ # Compute exponential backoff with jitter, capped by configured max_backoff
300
+ try:
301
+ max_backoff = getattr(self.config.broker_client, "max_backoff", 5.0)
302
+ except Exception:
303
+ max_backoff = 5.0
304
+ # Start from 0.5s, double each failure
305
+ base = 0.5
306
+ backoff_no_jitter = min(max_backoff, base * (2 ** (self._fetch_failure_count - 1)))
307
+ jitter = random.uniform(0, backoff_no_jitter * 0.2)
308
+ self._current_backoff_sleep = backoff_no_jitter + jitter
309
+
310
+ now = time.time()
311
+ # Throttle warning logs to at most once per 5 seconds to avoid spam
312
+ if now - self._last_backoff_log_time >= 5.0:
313
+ self._logger.warning(
314
+ "Broker fetch failed (%d consecutive failures). Backing off for %.2fs. Error: %s",
315
+ self._fetch_failure_count,
316
+ self._current_backoff_sleep,
317
+ err,
318
+ )
319
+ self._last_backoff_log_time = now
320
+ else:
321
+ self._logger.debug(
322
+ "Broker fetch failed (%d). Backoff %.2fs. Error: %s",
323
+ self._fetch_failure_count,
324
+ self._current_backoff_sleep,
325
+ err,
326
+ )
286
327
  return None
287
328
 
288
329
  def _read_input(self) -> any:
@@ -293,8 +334,17 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
293
334
  self._logger.debug("read_input: calling _fetch_message()")
294
335
  job = self._fetch_message(timeout=100)
295
336
  if job is None:
296
- self._logger.debug("read_input: No job received, sleeping for poll_interval: %s", self.config.poll_interval)
297
- time.sleep(self.config.poll_interval)
337
+ # Sleep for either the configured poll interval or the current backoff, whichever is larger
338
+ sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
339
+ self._logger.debug(
340
+ "read_input: No job received; sleeping %.2fs (poll_interval=%.2fs, backoff=%.2fs)",
341
+ sleep_time,
342
+ self.config.poll_interval,
343
+ getattr(self, "_current_backoff_sleep", 0.0),
344
+ )
345
+ time.sleep(sleep_time)
346
+ # Reset one-shot backoff so that repeated failures recompute progressively
347
+ self._current_backoff_sleep = 0.0
298
348
 
299
349
  return None
300
350
 
@@ -314,7 +364,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
314
364
  This loop fetches messages from the broker and writes them to the output queue,
315
365
  but blocks on the pause event when the stage is paused.
316
366
  """
317
- self._logger.info("Processing loop started")
367
+ self._logger.debug("Processing loop started")
318
368
  iteration = 0
319
369
  while self._running:
320
370
  iteration += 1
@@ -381,25 +431,25 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
381
431
  self._active_processing = False
382
432
  self._shutdown_signal_complete = True
383
433
 
384
- self._logger.info("Processing loop ending")
434
+ self._logger.debug("Processing loop ending")
385
435
 
386
436
  @ray.method(num_returns=1)
387
437
  def start(self) -> bool:
388
438
  if self._running:
389
- self._logger.info("Start called but stage is already running.")
439
+ self._logger.warning("Start called but stage is already running.")
390
440
  return False
391
441
  self._running = True
392
442
  self.start_time = time.time()
393
443
  self._message_count = 0
394
- self._logger.info("Starting processing loop thread.")
444
+ self._logger.debug("Starting processing loop thread.")
395
445
  threading.Thread(target=self._processing_loop, daemon=True).start()
396
- self._logger.info("MessageBrokerTaskSourceStage started.")
446
+ self._logger.debug("MessageBrokerTaskSourceStage started.")
397
447
  return True
398
448
 
399
449
  @ray.method(num_returns=1)
400
450
  def stop(self) -> bool:
401
451
  self._running = False
402
- self._logger.info("Stop called on MessageBrokerTaskSourceStage")
452
+ self._logger.debug("Stop called on MessageBrokerTaskSourceStage")
403
453
  return True
404
454
 
405
455
  @ray.method(num_returns=1)
@@ -425,7 +475,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
425
475
  @ray.method(num_returns=1)
426
476
  def set_output_queue(self, queue_handle: any) -> bool:
427
477
  self.output_queue = queue_handle
428
- self._logger.info("Output queue set: %s", queue_handle)
478
+ self._logger.debug("Output queue set: %s", queue_handle)
429
479
  return True
430
480
 
431
481
  @ray.method(num_returns=1)
@@ -440,7 +490,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
440
490
  True after the stage is paused.
441
491
  """
442
492
  self._pause_event.clear()
443
- self._logger.info("Stage paused.")
493
+ self._logger.debug("Stage paused.")
444
494
 
445
495
  return True
446
496
 
@@ -456,7 +506,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
456
506
  True after the stage is resumed.
457
507
  """
458
508
  self._pause_event.set()
459
- self._logger.info("Stage resumed.")
509
+ self._logger.debug("Stage resumed.")
460
510
  return True
461
511
 
462
512
  @ray.method(num_returns=1)
@@ -466,49 +516,9 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
466
516
  This method pauses the stage, waits for any current processing to finish,
467
517
  replaces the output queue, and then resumes the stage.
468
518
  """
469
- self._logger.info("Swapping output queue: pausing stage first.")
519
+ self._logger.debug("Swapping output queue: pausing stage first.")
470
520
  self.pause()
471
521
  self.set_output_queue(new_queue)
472
- self._logger.info("Output queue swapped. Resuming stage.")
522
+ self._logger.debug("Output queue swapped. Resuming stage.")
473
523
  self.resume()
474
524
  return True
475
-
476
-
477
- def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
478
- """
479
- Starts a SimpleMessageBroker server in a separate process.
480
-
481
- Parameters
482
- ----------
483
- broker_client : dict
484
- Broker configuration. Expected keys include:
485
- - "port": the port to bind the server to,
486
- - "broker_params": optionally including "max_queue_size",
487
- - and any other parameters required by SimpleMessageBroker.
488
-
489
- Returns
490
- -------
491
- multiprocessing.Process
492
- The process running the SimpleMessageBroker server.
493
- """
494
-
495
- def broker_server():
496
- from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
497
-
498
- # Use max_queue_size from broker_params or default to 10000.
499
- broker_params = broker_client.get("broker_params", {})
500
- max_queue_size = broker_params.get("max_queue_size", 10000)
501
- server_host = broker_client.get("host", "0.0.0.0")
502
- server_port = broker_client.get("port", 7671)
503
- # Optionally, set socket options here for reuse.
504
- server = SimpleMessageBroker(server_host, server_port, max_queue_size)
505
- # Enable address reuse on the server socket.
506
- server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
507
- server.serve_forever()
508
-
509
- p = multiprocessing.Process(target=broker_server)
510
- p.daemon = False
511
- p.start()
512
- logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")
513
-
514
- return p
@@ -3,13 +3,14 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
- from typing import Dict, Any
6
+ from typing import Dict, Any, Optional
7
7
 
8
8
  import pandas as pd
9
9
  import ray
10
10
 
11
11
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
12
  from nv_ingest.framework.util.flow_control import filter_by_task
13
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
13
14
  from nv_ingest_api.internal.enums.common import ContentTypeEnum
14
15
  from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
15
16
  from nv_ingest_api.internal.primitives.tracing.tagging import traceable
@@ -31,8 +32,8 @@ class ImageStorageStage(RayActorStage):
31
32
  payload and updates the control message accordingly.
32
33
  """
33
34
 
34
- def __init__(self, config: ImageStorageModuleSchema) -> None:
35
- super().__init__(config)
35
+ def __init__(self, config: ImageStorageModuleSchema, stage_name: Optional[str] = None) -> None:
36
+ super().__init__(config, stage_name=stage_name)
36
37
  try:
37
38
  self.validated_config = config
38
39
  logger.info("ImageStorageStage configuration validated successfully.")
@@ -40,9 +41,10 @@ class ImageStorageStage(RayActorStage):
40
41
  logger.exception("Error validating image storage config")
41
42
  raise e
42
43
 
43
- @traceable("image_storage")
44
+ @nv_ingest_node_failure_try_except()
45
+ @traceable()
46
+ @udf_intercept_hook()
44
47
  @filter_by_task(required_tasks=["store"])
45
- @nv_ingest_node_failure_try_except(annotation_id="image_storage", raise_on_failure=False)
46
48
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
47
49
  """
48
50
  Process the control message by storing images or structured content.
@@ -3,6 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
+ from typing import Optional
6
7
 
7
8
  import ray
8
9
 
@@ -16,6 +17,8 @@ from nv_ingest_api.util.exception_handlers.decorators import (
16
17
  nv_ingest_node_failure_try_except,
17
18
  )
18
19
 
20
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
21
+
19
22
  logger = logging.getLogger(__name__)
20
23
 
21
24
 
@@ -30,8 +33,8 @@ class EmbeddingStorageStage(RayActorStage):
30
33
  3. Updates the message payload with the stored embeddings DataFrame.
31
34
  """
32
35
 
33
- def __init__(self, config: EmbeddingStorageSchema) -> None:
34
- super().__init__(config)
36
+ def __init__(self, config: EmbeddingStorageSchema, stage_name: Optional[str] = None) -> None:
37
+ super().__init__(config, stage_name=stage_name)
35
38
  try:
36
39
  self.validated_config = config
37
40
  logger.info("EmbeddingStorageStage configuration validated successfully.")
@@ -39,9 +42,10 @@ class EmbeddingStorageStage(RayActorStage):
39
42
  logger.exception(f"Error validating Embedding Storage config: {e}")
40
43
  raise
41
44
 
42
- @traceable("embedding_storage")
45
+ @nv_ingest_node_failure_try_except()
46
+ @traceable()
47
+ @udf_intercept_hook()
43
48
  @filter_by_task(required_tasks=["store_embedding"])
44
- @nv_ingest_node_failure_try_except(annotation_id="embedding_storage", raise_on_failure=False)
45
49
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
46
50
  """
47
51
  Process the control message by storing embeddings.
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
- from typing import Any
6
+ from typing import Any, Optional
7
7
  from pydantic import BaseModel
8
8
  import ray
9
9
 
@@ -14,6 +14,8 @@ from nv_ingest.framework.util.telemetry.global_stats import GlobalStats
14
14
  from nv_ingest_api.util.exception_handlers.decorators import (
15
15
  nv_ingest_node_failure_try_except,
16
16
  )
17
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
18
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
17
19
 
18
20
  # Import the JobCounter schema and global stats singleton.
19
21
 
@@ -30,15 +32,17 @@ class JobCounterStage(RayActorStage):
30
32
  statistic each time it processes a message.
31
33
  """
32
34
 
33
- def __init__(self, config: BaseModel) -> None:
35
+ def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
34
36
  # Ensure base attributes (e.g. self._running) are initialized.
35
- super().__init__(config)
37
+ super().__init__(config, stage_name=stage_name)
36
38
  # The validated config should be a JobCounterSchema instance.
37
39
  self.validated_config: JobCounterSchema = config
38
40
  # Obtain the global stats' singleton.
39
41
  self.stats = GlobalStats.get_instance()
40
42
 
41
- @nv_ingest_node_failure_try_except(annotation_id="job_counter", raise_on_failure=False)
43
+ @nv_ingest_node_failure_try_except()
44
+ @traceable()
45
+ @udf_intercept_hook()
42
46
  async def on_data(self, message: Any) -> Any:
43
47
  """
44
48
  Process an incoming IngestControlMessage by counting jobs.
@@ -24,6 +24,7 @@ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_fail
24
24
 
25
25
  from nv_ingest_api.internal.primitives.tracing.logging import TaskResultStatus
26
26
  from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
27
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
27
28
 
28
29
 
29
30
  @ray.remote
@@ -35,8 +36,8 @@ class OpenTelemetryTracerStage(RayActorStage):
35
36
  It creates spans for tasks and exports them to a configured OpenTelemetry endpoint.
36
37
  """
37
38
 
38
- def __init__(self, config: OpenTelemetryTracerSchema) -> None:
39
- super().__init__(config)
39
+ def __init__(self, config: OpenTelemetryTracerSchema, stage_name: Optional[str] = None) -> None:
40
+ super().__init__(config, stage_name=stage_name)
40
41
 
41
42
  # self._logger.info(f"[Telemetry] Initializing OpenTelemetry tracer stage with config: {config}")
42
43
 
@@ -81,7 +82,7 @@ class OpenTelemetryTracerStage(RayActorStage):
81
82
  parent_ctx = trace.set_span_in_context(NonRecordingSpan(span_context))
82
83
  parent_span = self.tracer.start_span(str(job_id), context=parent_ctx, start_time=start_time)
83
84
 
84
- event_count = create_span_with_timestamps(self.tracer, parent_span, message)
85
+ event_count = create_span_with_timestamps(self.tracer, parent_span, message, self._logger)
85
86
 
86
87
  if message.has_metadata("cm_failed") and message.get_metadata("cm_failed"):
87
88
  parent_span.set_status(Status(StatusCode.ERROR))
@@ -96,7 +97,8 @@ class OpenTelemetryTracerStage(RayActorStage):
96
97
 
97
98
  self._logger.debug(f"[Telemetry] Exported spans for message {job_id} with {event_count} total events.")
98
99
 
99
- @nv_ingest_node_failure_try_except(annotation_id="otel_tracer", raise_on_failure=False)
100
+ @nv_ingest_node_failure_try_except()
101
+ @udf_intercept_hook()
100
102
  def on_data(self, control_message: IngestControlMessage) -> Optional[Any]:
101
103
  try:
102
104
  do_trace_tagging = bool(control_message.get_metadata("config::add_trace_tagging"))
@@ -160,7 +162,7 @@ def extract_annotated_task_results(message):
160
162
  return task_results
161
163
 
162
164
 
163
- def create_span_with_timestamps(tracer, parent_span, message) -> int:
165
+ def create_span_with_timestamps(tracer, parent_span, message, logger) -> int:
164
166
  timestamps = extract_timestamps_from_message(message)
165
167
  task_results = extract_annotated_task_results(message)
166
168
 
@@ -175,8 +177,16 @@ def create_span_with_timestamps(tracer, parent_span, message) -> int:
175
177
  if not subtask:
176
178
  span = tracer.start_span(main_task, context=child_ctx, start_time=ts_entry)
177
179
  else:
178
- subtask_ctx = trace.set_span_in_context(ctx_store[main_task][0])
179
- span = tracer.start_span(subtask, context=subtask_ctx, start_time=ts_entry)
180
+ # Check if parent context exists, otherwise create standalone span with warning
181
+ if main_task in ctx_store:
182
+ subtask_ctx = trace.set_span_in_context(ctx_store[main_task][0])
183
+ span = tracer.start_span(subtask, context=subtask_ctx, start_time=ts_entry)
184
+ else:
185
+ logger.warning(
186
+ f"Missing parent context for subtask '{subtask}'"
187
+ f" (expected parent: '{main_task}'). Creating standalone span."
188
+ )
189
+ span = tracer.start_span(f"{main_task}::{subtask}", context=child_ctx, start_time=ts_entry)
180
190
 
181
191
  span.add_event("entry", timestamp=ts_entry)
182
192
  span.add_event("exit", timestamp=ts_exit)
@@ -4,12 +4,13 @@
4
4
 
5
5
  import logging
6
6
  import pprint
7
- from typing import Any
7
+ from typing import Any, Optional
8
8
 
9
9
  import ray
10
10
 
11
11
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
12
  from nv_ingest.framework.util.flow_control import filter_by_task
13
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
13
14
  from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
14
15
  from nv_ingest_api.internal.primitives.tracing.tagging import traceable
15
16
  from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
@@ -31,8 +32,8 @@ class ImageCaptionTransformStage(RayActorStage):
31
32
  are stored in the control message.
32
33
  """
33
34
 
34
- def __init__(self, config: ImageCaptionExtractionSchema) -> None:
35
- super().__init__(config)
35
+ def __init__(self, config: ImageCaptionExtractionSchema, stage_name: Optional[str] = None) -> None:
36
+ super().__init__(config, stage_name=stage_name)
36
37
  try:
37
38
  self.validated_config = config
38
39
  logger.info("ImageCaptionTransformStage configuration validated.")
@@ -40,9 +41,10 @@ class ImageCaptionTransformStage(RayActorStage):
40
41
  logger.exception("Error validating caption extraction config")
41
42
  raise e
42
43
 
43
- @traceable("image_captioning")
44
+ @nv_ingest_node_failure_try_except()
45
+ @traceable()
46
+ @udf_intercept_hook()
44
47
  @filter_by_task(required_tasks=["caption"])
45
- @nv_ingest_node_failure_try_except(annotation_id="image_captioning", raise_on_failure=False)
46
48
  def on_data(self, control_message: Any) -> Any:
47
49
  """
48
50
  Process the control message by extracting image captions.
@@ -2,12 +2,10 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
- import logging
6
5
  import pprint
7
- from typing import Any
6
+ from typing import Optional
8
7
  import ray
9
8
 
10
- # Assume these imports come from your project:
11
9
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
10
  from nv_ingest.framework.util.flow_control import filter_by_task
13
11
  from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
@@ -18,7 +16,7 @@ from nv_ingest_api.util.exception_handlers.decorators import (
18
16
  nv_ingest_node_failure_try_except,
19
17
  )
20
18
 
21
- logger = logging.getLogger(__name__)
19
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
22
20
 
23
21
 
24
22
  @ray.remote
@@ -31,19 +29,20 @@ class TextEmbeddingTransformStage(RayActorStage):
31
29
  trace or extraction metadata is added.
32
30
  """
33
31
 
34
- def __init__(self, config: TextEmbeddingSchema) -> None:
35
- super().__init__(config, log_to_stdout=False)
32
+ def __init__(self, config: TextEmbeddingSchema, stage_name: Optional[str] = None) -> None:
33
+ super().__init__(config, stage_name=stage_name)
36
34
  try:
37
35
  self.validated_config = config
38
- logger.info("TextEmbeddingTransformStage configuration validated successfully.")
36
+ self._logger.info("TextEmbeddingTransformStage configuration validated successfully.")
39
37
  except Exception as e:
40
- logger.exception("Error validating text embedding extractor config")
41
- raise e
38
+ self._logger.exception(f"Error validating text embedding config: {e}")
39
+ raise
42
40
 
43
- @traceable("text_embedding")
41
+ @nv_ingest_node_failure_try_except()
42
+ @traceable()
43
+ @udf_intercept_hook()
44
44
  @filter_by_task(required_tasks=["embed"])
45
- @nv_ingest_node_failure_try_except(annotation_id="text_embedding", raise_on_failure=False)
46
- def on_data(self, control_message: IngestControlMessage) -> Any:
45
+ def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
47
46
  """
48
47
  Process the control message by generating text embeddings.
49
48
 
@@ -59,11 +58,11 @@ class TextEmbeddingTransformStage(RayActorStage):
59
58
  """
60
59
  # Get the DataFrame payload.
61
60
  df_payload = control_message.payload()
62
- logger.debug("TextEmbeddingTransformStage: Extracted payload with %d rows.", len(df_payload))
61
+ self._logger.debug("TextEmbeddingTransformStage: Extracted payload with %d rows.", len(df_payload))
63
62
 
64
63
  # Remove the "embed" task to obtain task-specific configuration.
65
64
  task_config = remove_task_by_type(control_message, "embed")
66
- logger.debug("TextEmbeddingTransformStage: Task configuration extracted: %s", pprint.pformat(task_config))
65
+ self._logger.debug("TextEmbeddingTransformStage: Task configuration extracted: %s", pprint.pformat(task_config))
67
66
 
68
67
  # Call the text embedding extraction function.
69
68
  new_df, execution_trace_log = transform_create_text_embeddings_internal(