nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (56)
  1. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  2. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  3. nv_ingest/framework/orchestration/execution/options.py +112 -0
  4. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  5. nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
  6. nv_ingest/framework/orchestration/process/execution.py +497 -0
  7. nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
  8. nv_ingest/framework/orchestration/process/strategies.py +182 -0
  9. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
  10. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  11. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
  12. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  13. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
  14. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
  15. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
  16. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
  17. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
  18. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  19. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
  20. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  21. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
  22. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
  23. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  24. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  25. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
  26. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
  27. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
  28. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  29. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  30. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
  31. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
  32. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
  33. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  34. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  35. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
  36. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
  37. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
  38. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  39. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  40. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  41. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  42. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  43. nv_ingest/pipeline/__init__.py +3 -0
  44. nv_ingest/pipeline/config/__init__.py +3 -0
  45. nv_ingest/pipeline/config/loaders.py +198 -0
  46. nv_ingest/pipeline/config/replica_resolver.py +227 -0
  47. nv_ingest/pipeline/default_pipeline_impl.py +517 -0
  48. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  49. nv_ingest/pipeline/pipeline_schema.py +398 -0
  50. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/METADATA +1 -1
  51. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/RECORD +54 -40
  52. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  53. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  54. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/WHEEL +0 -0
  55. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/licenses/LICENSE +0 -0
  56. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.16.dev20250816.dist-info}/top_level.txt +0 -0
@@ -4,6 +4,7 @@
4
4
 
5
5
 
6
6
  import logging
7
+ from typing import Optional
7
8
 
8
9
  import ray
9
10
 
@@ -17,6 +18,8 @@ from nv_ingest_api.util.exception_handlers.decorators import (
17
18
  nv_ingest_node_failure_try_except,
18
19
  )
19
20
 
21
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
22
+
20
23
  logger = logging.getLogger(__name__)
21
24
 
22
25
 
@@ -31,8 +34,8 @@ class HtmlExtractorStage(RayActorStage):
31
34
  3. Updates the message payload with the extracted text DataFrame.
32
35
  """
33
36
 
34
- def __init__(self, config: HtmlExtractorSchema) -> None:
35
- super().__init__(config, log_to_stdout=False)
37
+ def __init__(self, config: HtmlExtractorSchema, stage_name: Optional[str] = None) -> None:
38
+ super().__init__(config, log_to_stdout=False, stage_name=stage_name)
36
39
  try:
37
40
  self.validated_config = config
38
41
  self._logger.info("HtmlExtractorStage configuration validated successfully.")
@@ -40,9 +43,10 @@ class HtmlExtractorStage(RayActorStage):
40
43
  self._logger.exception(f"Error validating Html Extractor config: {e}")
41
44
  raise
42
45
 
43
- @traceable("html_extractor")
46
+ @nv_ingest_node_failure_try_except()
47
+ @traceable()
48
+ @udf_intercept_hook()
44
49
  @filter_by_task(required_tasks=[("extract", {"document_type": "html"})])
45
- @nv_ingest_node_failure_try_except(annotation_id="html_extractor", raise_on_failure=False)
46
50
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
47
51
  """
48
52
  Process the control message by extracting content from html.
@@ -3,6 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
+ from typing import Optional
6
7
 
7
8
  import ray
8
9
 
@@ -16,6 +17,8 @@ from nv_ingest_api.util.exception_handlers.decorators import (
16
17
  nv_ingest_node_failure_try_except,
17
18
  )
18
19
 
20
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
21
+
19
22
  logger = logging.getLogger(__name__)
20
23
 
21
24
 
@@ -30,18 +33,19 @@ class ImageExtractorStage(RayActorStage):
30
33
  3. Updates the message payload with the extracted primitives DataFrame.
31
34
  """
32
35
 
33
- def __init__(self, config: ImageExtractorSchema) -> None:
34
- super().__init__(config)
36
+ def __init__(self, config: ImageExtractorSchema, stage_name: Optional[str] = None) -> None:
37
+ super().__init__(config, log_to_stdout=False, stage_name=stage_name)
35
38
  try:
36
39
  self.validated_config = config
37
- logger.info("ImageExtractorStage configuration validated successfully.")
40
+ self._logger.info("ImageExtractorStage configuration validated successfully.")
38
41
  except Exception as e:
39
- logger.exception(f"Error validating Image Extractor config: {e}")
42
+ self._logger.exception(f"Error validating Image Extractor config: {e}")
40
43
  raise
41
44
 
42
- @traceable("image_extraction")
45
+ @nv_ingest_node_failure_try_except()
46
+ @traceable()
47
+ @udf_intercept_hook()
43
48
  @filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(png|jpeg|jpg|tiff|bmp)$"})])
44
- @nv_ingest_node_failure_try_except(annotation_id="image_extractor", raise_on_failure=False)
45
49
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
46
50
  """
47
51
  Process the control message by extracting primitives from images.
@@ -5,32 +5,44 @@
5
5
  import logging
6
6
  import ray
7
7
 
8
- from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
9
8
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
10
9
  from nv_ingest.framework.util.flow_control import filter_by_task
11
10
  from nv_ingest_api.internal.extract.image.infographic_extractor import extract_infographic_data_from_image_internal
12
11
  from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
13
- from nv_ingest_api.internal.primitives.tracing.tagging import traceable
12
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
13
+ from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
14
14
  from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
15
+ from typing import Optional
16
+
17
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
15
18
 
16
19
  logger = logging.getLogger(__name__)
17
20
 
18
21
 
19
22
  @ray.remote
20
23
  class InfographicExtractorStage(RayActorStage):
21
- def __init__(self, config: InfographicExtractorSchema) -> None:
22
- super().__init__(config)
24
+ """
25
+ A Ray actor stage that extracts infographic data from image content.
26
+
27
+ It expects an IngestControlMessage containing a DataFrame with image data. It then:
28
+ 1. Removes the "infographic_data_extract" task from the message.
29
+ 2. Calls the infographic extraction logic using a validated configuration.
30
+ 3. Updates the message payload with the extracted infographic DataFrame.
31
+ """
23
32
 
33
+ def __init__(self, config: InfographicExtractorSchema, stage_name: Optional[str] = None) -> None:
34
+ super().__init__(config, log_to_stdout=False, stage_name=stage_name)
24
35
  try:
25
36
  self.validated_config = config
26
- logger.info("ImageExtractorStage configuration validated successfully.")
37
+ self._logger.info("InfographicExtractorStage configuration validated successfully.")
27
38
  except Exception as e:
28
- logger.exception(f"Error validating Image Extractor config: {e}")
39
+ self._logger.exception(f"Error validating Infographic extractor config: {e}")
29
40
  raise
30
41
 
31
- @traceable("infographic_extraction")
42
+ @nv_ingest_node_failure_try_except()
43
+ @traceable()
44
+ @udf_intercept_hook()
32
45
  @filter_by_task(required_tasks=["infographic_data_extract"])
33
- @nv_ingest_node_failure_try_except(annotation_id="infographic_extraction", raise_on_failure=False)
34
46
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
35
47
  # Extract DataFrame payload
36
48
  df_ledger = control_message.payload()
@@ -51,7 +63,7 @@ class InfographicExtractorStage(RayActorStage):
51
63
 
52
64
  do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
53
65
  if do_trace_tagging and execution_trace_log:
54
- for key, ts in execution_trace_log.items():
55
- control_message.set_timestamp(key, ts)
66
+ parent_name = self.stage_name if self.stage_name else "infographic_extractor"
67
+ set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
56
68
 
57
69
  return control_message
@@ -7,16 +7,15 @@ import pandas as pd
7
7
  from typing import Any, Dict, Tuple, Optional
8
8
  import ray
9
9
 
10
- # Assume these imports come from your project:
11
- from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
- from nv_ingest.framework.util.flow_control import filter_by_task
13
10
  from nv_ingest_api.internal.extract.pdf.pdf_extractor import extract_primitives_from_pdf_internal
14
11
  from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
15
- from nv_ingest_api.internal.primitives.tracing.tagging import traceable
16
12
  from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
17
- from nv_ingest_api.util.exception_handlers.decorators import (
18
- nv_ingest_node_failure_try_except,
19
- )
13
+
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import set_trace_timestamps_with_parent_context, traceable
15
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
16
+ from nv_ingest.framework.util.flow_control import filter_by_task
17
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
18
+ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
20
19
 
21
20
  logger = logging.getLogger(__name__)
22
21
 
@@ -51,19 +50,20 @@ class PDFExtractorStage(RayActorStage):
51
50
  4. Optionally, stores additional extraction info in the message metadata.
52
51
  """
53
52
 
54
- def __init__(self, config: PDFExtractorSchema) -> None:
55
- super().__init__(config)
53
+ def __init__(self, config: PDFExtractorSchema, stage_name: Optional[str] = None) -> None:
54
+ super().__init__(config, stage_name=stage_name)
56
55
  try:
57
56
  # Validate and store the PDF extractor configuration.
58
57
  self.validated_config = config
59
- logger.info("PDFExtractorStage configuration validated successfully.")
58
+ logger.debug("PDFExtractorStage configuration validated successfully.")
60
59
  except Exception as e:
61
60
  logger.exception(f"Error validating PDF extractor config: {e}")
62
61
  raise
63
62
 
64
- @traceable("pdf_extraction")
63
+ @nv_ingest_node_failure_try_except()
64
+ @traceable()
65
+ @udf_intercept_hook()
65
66
  @filter_by_task(required_tasks=[("extract", {"document_type": "pdf"})])
66
- @nv_ingest_node_failure_try_except(annotation_id="pdf_extractor", raise_on_failure=False)
67
67
  def on_data(self, control_message: Any) -> Any:
68
68
  """
69
69
  Process the control message by extracting PDF content.
@@ -79,7 +79,7 @@ class PDFExtractorStage(RayActorStage):
79
79
  The updated message with the extracted DataFrame and extraction info in metadata.
80
80
  """
81
81
 
82
- logger.info("PDFExtractorStage.on_data: Starting PDF extraction process.")
82
+ logger.debug("PDFExtractorStage.on_data: Starting PDF extraction process.")
83
83
 
84
84
  # Extract the DataFrame payload.
85
85
  df_extraction_ledger = control_message.payload()
@@ -97,17 +97,18 @@ class PDFExtractorStage(RayActorStage):
97
97
  execution_trace_log=execution_trace_log,
98
98
  validated_config=self.validated_config,
99
99
  )
100
- logger.info("PDF extraction completed. Extracted %d rows.", len(new_df))
100
+ logger.debug("PDF extraction completed. Extracted %d rows.", len(new_df))
101
101
 
102
102
  # Update the message payload with the extracted DataFrame.
103
103
  control_message.payload(new_df)
104
104
  # Optionally, annotate the message with extraction info.
105
105
  control_message.set_metadata("pdf_extraction_info", extraction_info)
106
- logger.info("PDF extraction metadata injected successfully.")
106
+ logger.debug("PDF extraction metadata injected successfully.")
107
107
 
108
108
  do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
109
109
  if do_trace_tagging and execution_trace_log:
110
- for key, ts in execution_trace_log.items():
111
- control_message.set_timestamp(key, ts)
110
+ # Use utility function to set trace timestamps with proper parent-child context
111
+ parent_name = self.stage_name or "pdf_extractor"
112
+ set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
112
113
 
113
114
  return control_message
@@ -3,8 +3,10 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
+ from typing import Optional
6
7
 
7
8
  import ray
9
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
8
10
 
9
11
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
10
12
  from nv_ingest.framework.util.flow_control import filter_by_task
@@ -28,7 +30,7 @@ class PPTXExtractorStage(RayActorStage):
28
30
  3. Updates the message payload with the extracted content DataFrame.
29
31
  """
30
32
 
31
- def __init__(self, config: PPTXExtractorSchema) -> None:
33
+ def __init__(self, config: PPTXExtractorSchema, stage_name: Optional[str] = None) -> None:
32
34
  """
33
35
  Initializes the PptxExtractorStage.
34
36
 
@@ -36,8 +38,10 @@ class PPTXExtractorStage(RayActorStage):
36
38
  ----------
37
39
  config : PPTXExtractorSchema
38
40
  The validated configuration object for PPTX extraction.
41
+ stage_name : Optional[str]
42
+ Name of the stage from YAML pipeline configuration.
39
43
  """
40
- super().__init__(config)
44
+ super().__init__(config, stage_name=stage_name)
41
45
  try:
42
46
  # The config passed in should already be validated, but storing it.
43
47
  self.validated_config = config
@@ -47,9 +51,10 @@ class PPTXExtractorStage(RayActorStage):
47
51
  logger.exception(f"Error initializing or validating PPTX Extractor config: {e}")
48
52
  raise
49
53
 
50
- @traceable("pptx_extractor")
54
+ @nv_ingest_node_failure_try_except()
55
+ @traceable()
56
+ @udf_intercept_hook()
51
57
  @filter_by_task(required_tasks=[("extract", {"document_type": "pptx"})])
52
- @nv_ingest_node_failure_try_except(annotation_id="pptx_extractor", raise_on_failure=False)
53
58
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
54
59
  """
55
60
  Process the control message by extracting content from PPTX documents.
@@ -80,6 +85,6 @@ class PPTXExtractorStage(RayActorStage):
80
85
 
81
86
  # Update the message payload with the extracted PPTX content DataFrame.
82
87
  control_message.payload(new_df)
83
- control_message.set_metadata("pptx_extraction_info", extraction_info) # <-- Changed metadata key
88
+ control_message.set_metadata("pptx_extraction_info", extraction_info)
84
89
 
85
90
  return control_message
@@ -3,15 +3,15 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
- from typing import Any
6
+ from typing import Any, Optional
7
7
  import ray
8
8
 
9
- # These imports are assumed from your project.
10
9
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
10
  from nv_ingest.framework.util.flow_control import filter_by_task
11
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
12
12
  from nv_ingest_api.internal.extract.image.table_extractor import extract_table_data_from_image_internal
13
13
  from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
14
- from nv_ingest_api.internal.primitives.tracing.tagging import traceable
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
15
15
  from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
16
16
  from nv_ingest_api.util.exception_handlers.decorators import (
17
17
  nv_ingest_node_failure_try_except,
@@ -31,18 +31,19 @@ class TableExtractorStage(RayActorStage):
31
31
  and annotates the message metadata with extraction info.
32
32
  """
33
33
 
34
- def __init__(self, config: TableExtractorSchema) -> None:
35
- super().__init__(config)
34
+ def __init__(self, config: TableExtractorSchema, stage_name: Optional[str] = None) -> None:
35
+ super().__init__(config, stage_name=stage_name)
36
36
  try:
37
37
  self.validated_config = config
38
- logger.info("TableExtractorStage configuration validated successfully.")
38
+ logger.debug("TableExtractorStage configuration validated successfully.")
39
39
  except Exception as e:
40
40
  logger.exception("Error validating table extractor config")
41
41
  raise e
42
42
 
43
- @traceable("table_extraction")
43
+ @nv_ingest_node_failure_try_except()
44
+ @traceable()
45
+ @udf_intercept_hook()
44
46
  @filter_by_task(required_tasks=["table_data_extract"])
45
- @nv_ingest_node_failure_try_except(annotation_id="table_extraction", raise_on_failure=False)
46
47
  def on_data(self, control_message: Any) -> Any:
47
48
  """
48
49
  Process the control message by extracting table data from the PDF payload.
@@ -57,7 +58,7 @@ class TableExtractorStage(RayActorStage):
57
58
  IngestControlMessage
58
59
  The updated message with the extracted table data and extraction info in metadata.
59
60
  """
60
- logger.info("TableExtractorStage.on_data: Starting table extraction.")
61
+ logger.debug("TableExtractorStage.on_data: Starting table extraction.")
61
62
  # Extract the DataFrame payload.
62
63
  df_payload = control_message.payload()
63
64
  logger.debug("Extracted payload with %d rows.", len(df_payload))
@@ -74,17 +75,17 @@ class TableExtractorStage(RayActorStage):
74
75
  extraction_config=self.validated_config,
75
76
  execution_trace_log=execution_trace_log,
76
77
  )
77
- logger.info("Table extraction completed. Extracted %d rows.", len(new_df))
78
+ logger.debug("Table extraction completed. Extracted %d rows.", len(new_df))
78
79
 
79
80
  # Update the control message with the new DataFrame.
80
81
  control_message.payload(new_df)
81
82
  # Annotate the message with extraction info.
82
83
  control_message.set_metadata("table_extraction_info", extraction_info)
83
- logger.info("Table extraction metadata injected successfully.")
84
+ logger.debug("Table extraction metadata injected successfully.")
84
85
 
85
86
  do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
86
87
  if do_trace_tagging and execution_trace_log:
87
- for key, ts in execution_trace_log.items():
88
- control_message.set_timestamp(key, ts)
88
+ parent_name = self.stage_name if self.stage_name else "table_extractor"
89
+ set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
89
90
 
90
91
  return control_message
@@ -4,12 +4,14 @@
4
4
 
5
5
  from datetime import datetime
6
6
  import logging
7
+ from typing import Optional
7
8
  import pandas as pd
8
- from typing import Any
9
9
  from pydantic import BaseModel
10
10
  import ray
11
11
 
12
12
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
13
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
13
15
  from nv_ingest_api.internal.enums.common import (
14
16
  DocumentTypeEnum,
15
17
  ContentTypeEnum,
@@ -17,14 +19,13 @@ from nv_ingest_api.internal.enums.common import (
17
19
  TextTypeEnum,
18
20
  LanguageEnum,
19
21
  )
20
- from nv_ingest_api.internal.primitives.tracing.tagging import traceable
21
22
  from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentHierarchySchema
22
23
  from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
23
24
  from nv_ingest_api.util.exception_handlers.decorators import (
24
25
  nv_ingest_node_failure_try_except,
25
26
  )
27
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
26
28
 
27
- # logging.basicConfig(level=logging.DEBUG)
28
29
  logger = logging.getLogger(__name__)
29
30
 
30
31
 
@@ -37,15 +38,16 @@ class MetadataInjectionStage(RayActorStage):
37
38
  injection is required, and if so, injects the appropriate metadata.
38
39
  """
39
40
 
40
- def __init__(self, config: BaseModel) -> None:
41
+ def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
41
42
  # Call the base initializer to set attributes like self._running.
42
- super().__init__(config)
43
+ super().__init__(config, stage_name=stage_name)
43
44
  # Additional initialization can be added here if necessary.
44
- logger.info("MetadataInjectionStage initialized with config: %s", config)
45
+ self._logger.debug("MetadataInjectionStage initialized with config: %s", config)
45
46
 
46
- @traceable("metadata_injector")
47
- @nv_ingest_node_failure_try_except(annotation_id="metadata_injector", raise_on_failure=False)
48
- def on_data(self, message: Any) -> Any:
47
+ @nv_ingest_node_failure_try_except()
48
+ @traceable()
49
+ @udf_intercept_hook()
50
+ def on_data(self, message: IngestControlMessage) -> IngestControlMessage:
49
51
  """
50
52
  Process an incoming IngestControlMessage by injecting metadata into its DataFrame payload.
51
53
 
@@ -62,7 +64,7 @@ class MetadataInjectionStage(RayActorStage):
62
64
  df = message.payload()
63
65
  update_required = False
64
66
  rows = []
65
- logger.info("Starting metadata injection on DataFrame with %d rows", len(df))
67
+ logger.debug("Starting metadata injection on DataFrame with %d rows", len(df))
66
68
 
67
69
  for _, row in df.iterrows():
68
70
  try:
@@ -141,7 +143,7 @@ class MetadataInjectionStage(RayActorStage):
141
143
  "source_metadata": default_source_metadata,
142
144
  "text_metadata": default_text_metadata,
143
145
  }
144
- logger.info(
146
+ logger.debug(
145
147
  f"METADATA_INJECTOR_DEBUG: Rebuilt metadata for source_id='{row.get('source_id', 'N/A')}'. "
146
148
  f"Metadata keys: {list(row['metadata'].keys())}."
147
149
  f"'content' present: {'content' in row['metadata']}"
@@ -154,8 +156,8 @@ class MetadataInjectionStage(RayActorStage):
154
156
  if update_required:
155
157
  docs = pd.DataFrame(rows)
156
158
  message.payload(docs)
157
- logger.info("Metadata injection updated payload with %d rows", len(docs))
159
+ logger.debug("Metadata injection updated payload with %d rows", len(docs))
158
160
  else:
159
- logger.info("No metadata update was necessary during metadata injection")
161
+ logger.debug("No metadata update was necessary during metadata injection")
160
162
 
161
163
  return message
@@ -21,6 +21,9 @@ class RayActorSinkStage(RayActorStage, ABC):
21
21
  to deliver their final processed messages.
22
22
  """
23
23
 
24
+ def __init__(self, config: Any, log_to_stdout=False, stage_name: Optional[str] = None) -> None:
25
+ super().__init__(config, log_to_stdout=log_to_stdout, stage_name=stage_name)
26
+
24
27
  @ray.method(num_returns=1)
25
28
  def set_output_queue(self, queue_handle: any) -> bool:
26
29
  raise NotImplementedError("Sink stages do not support an output queue.")
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  from abc import ABC, abstractmethod
6
- from typing import Any
6
+ from typing import Any, Optional
7
7
  import ray
8
8
  import logging
9
9
 
@@ -19,8 +19,8 @@ class RayActorSourceStage(RayActorStage, ABC):
19
19
  Instead, they must implement get_input() to fetch control messages from an external source.
20
20
  """
21
21
 
22
- def __init__(self, config: Any, log_to_stdout=False) -> None:
23
- super().__init__(config, log_to_stdout=log_to_stdout)
22
+ def __init__(self, config: Any, log_to_stdout=False, stage_name: Optional[str] = None) -> None:
23
+ super().__init__(config, log_to_stdout=log_to_stdout, stage_name=stage_name)
24
24
  self.paused = False
25
25
 
26
26
  def on_data(self, IngestControlMessage):
@@ -2,6 +2,7 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ import gc
5
6
  import sys
6
7
  import threading
7
8
  import time
@@ -9,12 +10,12 @@ from abc import ABC, abstractmethod
9
10
  from typing import Any, Dict, Optional
10
11
  import os
11
12
  import psutil
12
- import gc
13
13
 
14
14
  import ray
15
15
  import ray.actor
16
16
  from pydantic import BaseModel
17
17
  import logging
18
+ import pyarrow as pa
18
19
 
19
20
  from ray import get_runtime_context
20
21
 
@@ -50,6 +51,9 @@ class RayActorStage(ABC):
50
51
  ----------
51
52
  config : BaseModel
52
53
  Configuration object for the stage.
54
+ stage_name : Optional[str]
55
+ Name of the stage from YAML pipeline configuration. Used by
56
+ stage-aware decorators for consistent naming.
53
57
  _input_queue : Optional[Any]
54
58
  Handle to the Ray queue from which input items are read.
55
59
  Expected to be set via `set_input_queue`.
@@ -81,7 +85,7 @@ class RayActorStage(ABC):
81
85
  Lock to protect access to shutdown-related state (`_shutting_down`).
82
86
  """
83
87
 
84
- def __init__(self, config: BaseModel, log_to_stdout=False) -> None:
88
+ def __init__(self, config: BaseModel, stage_name: Optional[str] = None, log_to_stdout=False) -> None:
85
89
  """
86
90
  Initialize the RayActorStage.
87
91
 
@@ -90,8 +94,14 @@ class RayActorStage(ABC):
90
94
  config : BaseModel
91
95
  Configuration object specific to the stage's behavior. Passed by
92
96
  the orchestrator during actor creation.
97
+ stage_name : Optional[str]
98
+ Name of the stage from YAML pipeline configuration. Used by
99
+ stage-aware decorators for consistent naming.
100
+ log_to_stdout : bool
101
+ Whether to enable stdout logging.
93
102
  """
94
103
  self.config: BaseModel = config
104
+ self.stage_name: Optional[str] = stage_name
95
105
  self._input_queue: Optional[Any] = None # Ray Queue handle expected
96
106
  self._output_queue: Optional[Any] = None # Ray Queue handle expected
97
107
  self._running: bool = False
@@ -130,12 +140,13 @@ class RayActorStage(ABC):
130
140
 
131
141
  self._actor_id_str = self._get_actor_id_str()
132
142
 
133
- # --- PyArrow memory cleanup configuration/state ---
134
- # Allow stages to configure the cleanup interval (seconds) via their config.
135
- # Defaults to 5 minutes if not provided.
136
- self._memory_cleanup_interval_seconds: int = int(getattr(self.config, "memory_cleanup_interval_seconds", 300))
137
- self._last_memory_cleanup_time: float = time.time()
138
- self._memory_cleanups_performed: int = 0
143
+ # --- PyArrow Memory Management ---
144
+ # Time-based periodic cleanup to prevent long-term memory accumulation
145
+ self._memory_cleanup_interval_seconds = getattr(
146
+ config, "memory_cleanup_interval_seconds", 300
147
+ ) # 5 minutes default
148
+ self._last_memory_cleanup_time = time.time()
149
+ self._memory_cleanups_performed = 0
139
150
 
140
151
  @staticmethod
141
152
  def _get_actor_id_str() -> str:
@@ -429,8 +440,6 @@ class RayActorStage(ABC):
429
440
  gc.collect()
430
441
 
431
442
  try:
432
- import pyarrow as pa # Local import to avoid hard dependency at import time
433
-
434
443
  pool = pa.default_memory_pool()
435
444
  try:
436
445
  before_bytes = getattr(pool, "bytes_allocated", lambda: 0)()
@@ -579,7 +588,7 @@ class RayActorStage(ABC):
579
588
  self._logger.warning(f"{self._actor_id_str}: Start called but actor is already running.")
580
589
  return False
581
590
 
582
- self._logger.info(f"{self._actor_id_str}: Starting actor...")
591
+ self._logger.debug(f"{self._actor_id_str}: Starting actor...")
583
592
  # --- Initialize Actor State ---
584
593
  self._running = True
585
594
  self._shutting_down = False # Reset shutdown flag on start
@@ -598,14 +607,14 @@ class RayActorStage(ABC):
598
607
  )
599
608
  self._processing_thread.start()
600
609
 
601
- self._logger.info(f"{self._actor_id_str}: Actor started successfully.")
610
+ self._logger.debug(f"{self._actor_id_str}: Actor started successfully.")
602
611
 
603
612
  return True
604
613
 
605
614
  @ray.method(num_returns=0)
606
615
  def stop(self) -> None:
607
616
  """Stops the actor's processing loop by setting the running flag to False."""
608
- self._logger.info(f"[{self._actor_id_str}] Stop signal received. Initiating graceful shutdown.")
617
+ self._logger.debug(f"[{self._actor_id_str}] Stop signal received. Initiating graceful shutdown.")
609
618
  self._running = False
610
619
 
611
620
  def is_shutdown_complete(self) -> bool:
@@ -4,11 +4,13 @@
4
4
 
5
5
 
6
6
  import logging
7
+ from typing import Optional
7
8
 
8
9
  import ray
9
10
 
10
11
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
12
  from nv_ingest.framework.util.flow_control import filter_by_task
13
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
12
14
  from nv_ingest_api.internal.mutate.deduplicate import deduplicate_images_internal
13
15
  from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
14
16
  from nv_ingest_api.internal.primitives.tracing.tagging import traceable
@@ -31,18 +33,19 @@ class ImageDedupStage(RayActorStage):
31
33
  3. Updates the message payload with the deduplicated DataFrame.
32
34
  """
33
35
 
34
- def __init__(self, config: ImageDedupSchema) -> None:
35
- super().__init__(config)
36
+ def __init__(self, config: ImageDedupSchema, stage_name: Optional[str] = None) -> None:
37
+ super().__init__(config, stage_name=stage_name)
36
38
  try:
37
39
  self.validated_config = config
38
- logger.info("ImageDedupStage configuration validated successfully.")
40
+ logger.debug("ImageDedupStage configuration validated successfully.")
39
41
  except Exception as e:
40
42
  logger.exception(f"Error validating Image Deduplication config: {e}")
41
43
  raise
42
44
 
43
- @traceable("image_deduplication")
45
+ @nv_ingest_node_failure_try_except()
46
+ @traceable()
47
+ @udf_intercept_hook()
44
48
  @filter_by_task(required_tasks=["dedup"])
45
- @nv_ingest_node_failure_try_except(annotation_id="image_dedup", raise_on_failure=False)
46
49
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
47
50
  """
48
51
  Process the control message by deduplicating images.
@@ -57,7 +60,7 @@ class ImageDedupStage(RayActorStage):
57
60
  IngestControlMessage
58
61
  The updated message with deduplicated images in the payload.
59
62
  """
60
- logger.info("ImageDedupStage.on_data: Starting image deduplication process.")
63
+ logger.debug("ImageDedupStage.on_data: Starting image deduplication process.")
61
64
  try:
62
65
  # Extract the DataFrame payload.
63
66
  df_ledger = control_message.payload()
@@ -74,7 +77,7 @@ class ImageDedupStage(RayActorStage):
74
77
  mutate_config=self.validated_config,
75
78
  execution_trace_log=None,
76
79
  )
77
- logger.info("Image deduplication completed. Resulting DataFrame has %d rows.", len(new_df))
80
+ logger.debug("Image deduplication completed. Resulting DataFrame has %d rows.", len(new_df))
78
81
 
79
82
  # Update the message payload with the deduplicated DataFrame.
80
83
  control_message.payload(new_df)