nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. nv_ingest/api/__init__.py +6 -0
  2. nv_ingest/api/main.py +2 -0
  3. nv_ingest/api/tracing.py +82 -0
  4. nv_ingest/api/v2/README.md +203 -0
  5. nv_ingest/api/v2/__init__.py +3 -0
  6. nv_ingest/api/v2/ingest.py +1300 -0
  7. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  9. nv_ingest/framework/orchestration/execution/options.py +112 -0
  10. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
  12. nv_ingest/framework/orchestration/process/execution.py +495 -0
  13. nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
  14. nv_ingest/framework/orchestration/process/strategies.py +218 -0
  15. nv_ingest/framework/orchestration/process/termination.py +147 -0
  16. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
  17. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  18. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
  19. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  20. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
  21. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
  22. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
  23. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
  24. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
  25. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  26. nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
  28. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  29. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
  30. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
  31. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  32. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  33. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
  34. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
  35. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
  36. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  37. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  38. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
  39. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
  40. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
  41. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  42. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  43. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
  44. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
  45. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
  46. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  47. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  48. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  49. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  50. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  51. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  52. nv_ingest/pipeline/__init__.py +3 -0
  53. nv_ingest/pipeline/config/__init__.py +3 -0
  54. nv_ingest/pipeline/config/loaders.py +229 -0
  55. nv_ingest/pipeline/config/replica_resolver.py +237 -0
  56. nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
  57. nv_ingest/pipeline/default_pipeline_impl.py +557 -0
  58. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  59. nv_ingest/pipeline/pipeline_schema.py +398 -0
  60. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
  61. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
  62. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  63. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  64. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
  65. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
  66. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,71 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ import ray
7
+
8
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
9
+ from nv_ingest.framework.util.flow_control import filter_by_task
10
+ from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
11
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
12
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
13
+ from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
14
+ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
15
+ from typing import Optional
16
+
17
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ @ray.remote
23
+ class OCRExtractorStage(RayActorStage):
24
+ """
25
+ A Ray actor stage that extracts text data from image content.
26
+
27
+ It expects an IngestControlMessage containing a DataFrame with image data. It then:
28
+ 1. Removes the "ocr_data_extract" task from the message.
29
+ 2. Calls the text extraction logic using a validated configuration.
30
+ 3. Updates the message payload with the extracted text DataFrame.
31
+ """
32
+
33
+ def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
34
+ super().__init__(config, log_to_stdout=False, stage_name=stage_name)
35
+ try:
36
+ self.validated_config = config
37
+ self._logger.info("OCRExtractorStage configuration validated successfully.")
38
+ except Exception as e:
39
+ self._logger.exception(f"Error validating Text extractor config: {e}")
40
+ raise
41
+
42
+ @nv_ingest_node_failure_try_except()
43
+ @traceable()
44
+ @udf_intercept_hook()
45
+ @filter_by_task(required_tasks=["ocr_data_extract"])
46
+ def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
47
+ # Extract DataFrame payload
48
+ df_ledger = control_message.payload()
49
+ if df_ledger.empty:
50
+ return control_message
51
+
52
+ # Remove the "ocr_data_extract" task from the message
53
+ task_config = remove_task_by_type(control_message, "ocr_data_extract")
54
+
55
+ execution_trace_log = {}
56
+ new_df, extraction_info = extract_text_data_from_image_internal(
57
+ df_extraction_ledger=df_ledger,
58
+ task_config=task_config,
59
+ extraction_config=self.validated_config,
60
+ execution_trace_log=execution_trace_log,
61
+ )
62
+
63
+ control_message.payload(new_df)
64
+ control_message.set_metadata("ocr_extraction_info", extraction_info)
65
+
66
+ do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
67
+ if do_trace_tagging and execution_trace_log:
68
+ parent_name = self.stage_name if self.stage_name else "ocr_extractor"
69
+ set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
70
+
71
+ return control_message
@@ -7,16 +7,18 @@ import pandas as pd
7
7
  from typing import Any, Dict, Tuple, Optional
8
8
  import ray
9
9
 
10
- # Assume these imports come from your project:
11
- from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
- from nv_ingest.framework.util.flow_control import filter_by_task
13
10
  from nv_ingest_api.internal.extract.pdf.pdf_extractor import extract_primitives_from_pdf_internal
14
11
  from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
15
- from nv_ingest_api.internal.primitives.tracing.tagging import traceable
16
12
  from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
13
+
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import set_trace_timestamps_with_parent_context, traceable
15
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
16
+ from nv_ingest.framework.util.flow_control import filter_by_task
17
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
17
18
  from nv_ingest_api.util.exception_handlers.decorators import (
18
19
  nv_ingest_node_failure_try_except,
19
20
  )
21
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
20
22
 
21
23
  logger = logging.getLogger(__name__)
22
24
 
@@ -51,19 +53,20 @@ class PDFExtractorStage(RayActorStage):
51
53
  4. Optionally, stores additional extraction info in the message metadata.
52
54
  """
53
55
 
54
- def __init__(self, config: PDFExtractorSchema) -> None:
55
- super().__init__(config)
56
+ def __init__(self, config: PDFExtractorSchema, stage_name: Optional[str] = None) -> None:
57
+ super().__init__(config, stage_name=stage_name)
56
58
  try:
57
59
  # Validate and store the PDF extractor configuration.
58
60
  self.validated_config = config
59
- logger.info("PDFExtractorStage configuration validated successfully.")
61
+ logger.debug("PDFExtractorStage configuration validated successfully.")
60
62
  except Exception as e:
61
63
  logger.exception(f"Error validating PDF extractor config: {e}")
62
64
  raise
63
65
 
64
- @traceable("pdf_extraction")
66
+ @nv_ingest_node_failure_try_except()
67
+ @traceable()
68
+ @udf_intercept_hook()
65
69
  @filter_by_task(required_tasks=[("extract", {"document_type": "pdf"})])
66
- @nv_ingest_node_failure_try_except(annotation_id="pdf_extractor", raise_on_failure=False)
67
70
  def on_data(self, control_message: Any) -> Any:
68
71
  """
69
72
  Process the control message by extracting PDF content.
@@ -79,7 +82,7 @@ class PDFExtractorStage(RayActorStage):
79
82
  The updated message with the extracted DataFrame and extraction info in metadata.
80
83
  """
81
84
 
82
- logger.info("PDFExtractorStage.on_data: Starting PDF extraction process.")
85
+ logger.debug("PDFExtractorStage.on_data: Starting PDF extraction process.")
83
86
 
84
87
  # Extract the DataFrame payload.
85
88
  df_extraction_ledger = control_message.payload()
@@ -87,7 +90,7 @@ class PDFExtractorStage(RayActorStage):
87
90
 
88
91
  # Remove the "extract" task from the message to obtain task-specific configuration.
89
92
  task_config = remove_task_by_type(control_message, "extract")
90
- logger.debug("Extracted task config: %s", task_config)
93
+ logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
91
94
 
92
95
  # Perform PDF extraction.
93
96
  execution_trace_log = {}
@@ -97,17 +100,18 @@ class PDFExtractorStage(RayActorStage):
97
100
  execution_trace_log=execution_trace_log,
98
101
  validated_config=self.validated_config,
99
102
  )
100
- logger.info("PDF extraction completed. Extracted %d rows.", len(new_df))
103
+ logger.debug("PDF extraction completed. Extracted %d rows.", len(new_df))
101
104
 
102
105
  # Update the message payload with the extracted DataFrame.
103
106
  control_message.payload(new_df)
104
107
  # Optionally, annotate the message with extraction info.
105
108
  control_message.set_metadata("pdf_extraction_info", extraction_info)
106
- logger.info("PDF extraction metadata injected successfully.")
109
+ logger.debug("PDF extraction metadata injected successfully.")
107
110
 
108
111
  do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
109
112
  if do_trace_tagging and execution_trace_log:
110
- for key, ts in execution_trace_log.items():
111
- control_message.set_timestamp(key, ts)
113
+ # Use utility function to set trace timestamps with proper parent-child context
114
+ parent_name = self.stage_name or "pdf_extractor"
115
+ set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
112
116
 
113
117
  return control_message
@@ -3,8 +3,10 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
+ from typing import Optional
6
7
 
7
8
  import ray
9
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
8
10
 
9
11
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
10
12
  from nv_ingest.framework.util.flow_control import filter_by_task
@@ -28,7 +30,7 @@ class PPTXExtractorStage(RayActorStage):
28
30
  3. Updates the message payload with the extracted content DataFrame.
29
31
  """
30
32
 
31
- def __init__(self, config: PPTXExtractorSchema) -> None:
33
+ def __init__(self, config: PPTXExtractorSchema, stage_name: Optional[str] = None) -> None:
32
34
  """
33
35
  Initializes the PptxExtractorStage.
34
36
 
@@ -36,8 +38,10 @@ class PPTXExtractorStage(RayActorStage):
36
38
  ----------
37
39
  config : PPTXExtractorSchema
38
40
  The validated configuration object for PPTX extraction.
41
+ stage_name : Optional[str]
42
+ Name of the stage from YAML pipeline configuration.
39
43
  """
40
- super().__init__(config)
44
+ super().__init__(config, stage_name=stage_name)
41
45
  try:
42
46
  # The config passed in should already be validated, but storing it.
43
47
  self.validated_config = config
@@ -47,9 +51,10 @@ class PPTXExtractorStage(RayActorStage):
47
51
  logger.exception(f"Error initializing or validating PPTX Extractor config: {e}")
48
52
  raise
49
53
 
50
- @traceable("pptx_extractor")
54
+ @nv_ingest_node_failure_try_except()
55
+ @traceable()
56
+ @udf_intercept_hook()
51
57
  @filter_by_task(required_tasks=[("extract", {"document_type": "pptx"})])
52
- @nv_ingest_node_failure_try_except(annotation_id="pptx_extractor", raise_on_failure=False)
53
58
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
54
59
  """
55
60
  Process the control message by extracting content from PPTX documents.
@@ -80,6 +85,6 @@ class PPTXExtractorStage(RayActorStage):
80
85
 
81
86
  # Update the message payload with the extracted PPTX content DataFrame.
82
87
  control_message.payload(new_df)
83
- control_message.set_metadata("pptx_extraction_info", extraction_info) # <-- Changed metadata key
88
+ control_message.set_metadata("pptx_extraction_info", extraction_info)
84
89
 
85
90
  return control_message
@@ -3,19 +3,20 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
- from typing import Any
6
+ from typing import Any, Optional
7
7
  import ray
8
8
 
9
- # These imports are assumed from your project.
10
9
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
10
  from nv_ingest.framework.util.flow_control import filter_by_task
11
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
12
12
  from nv_ingest_api.internal.extract.image.table_extractor import extract_table_data_from_image_internal
13
13
  from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
14
- from nv_ingest_api.internal.primitives.tracing.tagging import traceable
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
15
15
  from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
16
16
  from nv_ingest_api.util.exception_handlers.decorators import (
17
17
  nv_ingest_node_failure_try_except,
18
18
  )
19
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
19
20
 
20
21
  logger = logging.getLogger(__name__)
21
22
 
@@ -31,18 +32,19 @@ class TableExtractorStage(RayActorStage):
31
32
  and annotates the message metadata with extraction info.
32
33
  """
33
34
 
34
- def __init__(self, config: TableExtractorSchema) -> None:
35
- super().__init__(config)
35
+ def __init__(self, config: TableExtractorSchema, stage_name: Optional[str] = None) -> None:
36
+ super().__init__(config, stage_name=stage_name)
36
37
  try:
37
38
  self.validated_config = config
38
- logger.info("TableExtractorStage configuration validated successfully.")
39
+ logger.debug("TableExtractorStage configuration validated successfully.")
39
40
  except Exception as e:
40
41
  logger.exception("Error validating table extractor config")
41
42
  raise e
42
43
 
43
- @traceable("table_extraction")
44
+ @nv_ingest_node_failure_try_except()
45
+ @traceable()
46
+ @udf_intercept_hook()
44
47
  @filter_by_task(required_tasks=["table_data_extract"])
45
- @nv_ingest_node_failure_try_except(annotation_id="table_extraction", raise_on_failure=False)
46
48
  def on_data(self, control_message: Any) -> Any:
47
49
  """
48
50
  Process the control message by extracting table data from the PDF payload.
@@ -57,14 +59,14 @@ class TableExtractorStage(RayActorStage):
57
59
  IngestControlMessage
58
60
  The updated message with the extracted table data and extraction info in metadata.
59
61
  """
60
- logger.info("TableExtractorStage.on_data: Starting table extraction.")
62
+ logger.debug("TableExtractorStage.on_data: Starting table extraction.")
61
63
  # Extract the DataFrame payload.
62
64
  df_payload = control_message.payload()
63
65
  logger.debug("Extracted payload with %d rows.", len(df_payload))
64
66
 
65
67
  # Remove the "table_data_extract" task to obtain task-specific configuration.
66
68
  task_config = remove_task_by_type(control_message, "table_data_extract")
67
- logger.debug("Extracted task configuration: %s", task_config)
69
+ logger.debug("Extracted task configuration: %s", sanitize_for_logging(task_config))
68
70
 
69
71
  # Perform table data extraction.
70
72
  execution_trace_log = {}
@@ -74,17 +76,17 @@ class TableExtractorStage(RayActorStage):
74
76
  extraction_config=self.validated_config,
75
77
  execution_trace_log=execution_trace_log,
76
78
  )
77
- logger.info("Table extraction completed. Extracted %d rows.", len(new_df))
79
+ logger.debug("Table extraction completed. Extracted %d rows.", len(new_df))
78
80
 
79
81
  # Update the control message with the new DataFrame.
80
82
  control_message.payload(new_df)
81
83
  # Annotate the message with extraction info.
82
84
  control_message.set_metadata("table_extraction_info", extraction_info)
83
- logger.info("Table extraction metadata injected successfully.")
85
+ logger.debug("Table extraction metadata injected successfully.")
84
86
 
85
87
  do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
86
88
  if do_trace_tagging and execution_trace_log:
87
- for key, ts in execution_trace_log.items():
88
- control_message.set_timestamp(key, ts)
89
+ parent_name = self.stage_name if self.stage_name else "table_extractor"
90
+ set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
89
91
 
90
92
  return control_message
@@ -4,12 +4,14 @@
4
4
 
5
5
  from datetime import datetime
6
6
  import logging
7
+ from typing import Optional
7
8
  import pandas as pd
8
- from typing import Any
9
9
  from pydantic import BaseModel
10
10
  import ray
11
11
 
12
12
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
13
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
13
15
  from nv_ingest_api.internal.enums.common import (
14
16
  DocumentTypeEnum,
15
17
  ContentTypeEnum,
@@ -17,14 +19,14 @@ from nv_ingest_api.internal.enums.common import (
17
19
  TextTypeEnum,
18
20
  LanguageEnum,
19
21
  )
20
- from nv_ingest_api.internal.primitives.tracing.tagging import traceable
21
22
  from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentHierarchySchema
22
23
  from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
23
24
  from nv_ingest_api.util.exception_handlers.decorators import (
24
25
  nv_ingest_node_failure_try_except,
25
26
  )
27
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
28
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
26
29
 
27
- # logging.basicConfig(level=logging.DEBUG)
28
30
  logger = logging.getLogger(__name__)
29
31
 
30
32
 
@@ -37,15 +39,16 @@ class MetadataInjectionStage(RayActorStage):
37
39
  injection is required, and if so, injects the appropriate metadata.
38
40
  """
39
41
 
40
- def __init__(self, config: BaseModel) -> None:
42
+ def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
41
43
  # Call the base initializer to set attributes like self._running.
42
- super().__init__(config)
44
+ super().__init__(config, stage_name=stage_name)
43
45
  # Additional initialization can be added here if necessary.
44
- logger.info("MetadataInjectionStage initialized with config: %s", config)
46
+ self._logger.debug("MetadataInjectionStage initialized with config: %s", sanitize_for_logging(config))
45
47
 
46
- @traceable("metadata_injector")
47
- @nv_ingest_node_failure_try_except(annotation_id="metadata_injector", raise_on_failure=False)
48
- def on_data(self, message: Any) -> Any:
48
+ @nv_ingest_node_failure_try_except()
49
+ @traceable()
50
+ @udf_intercept_hook()
51
+ def on_data(self, message: IngestControlMessage) -> IngestControlMessage:
49
52
  """
50
53
  Process an incoming IngestControlMessage by injecting metadata into its DataFrame payload.
51
54
 
@@ -62,7 +65,7 @@ class MetadataInjectionStage(RayActorStage):
62
65
  df = message.payload()
63
66
  update_required = False
64
67
  rows = []
65
- logger.info("Starting metadata injection on DataFrame with %d rows", len(df))
68
+ logger.debug("Starting metadata injection on DataFrame with %d rows", len(df))
66
69
 
67
70
  for _, row in df.iterrows():
68
71
  try:
@@ -141,7 +144,7 @@ class MetadataInjectionStage(RayActorStage):
141
144
  "source_metadata": default_source_metadata,
142
145
  "text_metadata": default_text_metadata,
143
146
  }
144
- logger.info(
147
+ logger.debug(
145
148
  f"METADATA_INJECTOR_DEBUG: Rebuilt metadata for source_id='{row.get('source_id', 'N/A')}'. "
146
149
  f"Metadata keys: {list(row['metadata'].keys())}."
147
150
  f"'content' present: {'content' in row['metadata']}"
@@ -154,8 +157,8 @@ class MetadataInjectionStage(RayActorStage):
154
157
  if update_required:
155
158
  docs = pd.DataFrame(rows)
156
159
  message.payload(docs)
157
- logger.info("Metadata injection updated payload with %d rows", len(docs))
160
+ logger.debug("Metadata injection updated payload with %d rows", len(docs))
158
161
  else:
159
- logger.info("No metadata update was necessary during metadata injection")
162
+ logger.debug("No metadata update was necessary during metadata injection")
160
163
 
161
164
  return message
@@ -21,6 +21,9 @@ class RayActorSinkStage(RayActorStage, ABC):
21
21
  to deliver their final processed messages.
22
22
  """
23
23
 
24
+ def __init__(self, config: Any, log_to_stdout=False, stage_name: Optional[str] = None) -> None:
25
+ super().__init__(config, log_to_stdout=log_to_stdout, stage_name=stage_name)
26
+
24
27
  @ray.method(num_returns=1)
25
28
  def set_output_queue(self, queue_handle: any) -> bool:
26
29
  raise NotImplementedError("Sink stages do not support an output queue.")
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  from abc import ABC, abstractmethod
6
- from typing import Any
6
+ from typing import Any, Optional
7
7
  import ray
8
8
  import logging
9
9
 
@@ -19,8 +19,8 @@ class RayActorSourceStage(RayActorStage, ABC):
19
19
  Instead, they must implement get_input() to fetch control messages from an external source.
20
20
  """
21
21
 
22
- def __init__(self, config: Any, log_to_stdout=False) -> None:
23
- super().__init__(config, log_to_stdout=log_to_stdout)
22
+ def __init__(self, config: Any, log_to_stdout=False, stage_name: Optional[str] = None) -> None:
23
+ super().__init__(config, log_to_stdout=log_to_stdout, stage_name=stage_name)
24
24
  self.paused = False
25
25
 
26
26
  def on_data(self, IngestControlMessage):
@@ -2,6 +2,7 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ import gc
5
6
  import sys
6
7
  import threading
7
8
  import time
@@ -14,6 +15,7 @@ import ray
14
15
  import ray.actor
15
16
  from pydantic import BaseModel
16
17
  import logging
18
+ import pyarrow as pa
17
19
 
18
20
  from ray import get_runtime_context
19
21
 
@@ -49,6 +51,9 @@ class RayActorStage(ABC):
49
51
  ----------
50
52
  config : BaseModel
51
53
  Configuration object for the stage.
54
+ stage_name : Optional[str]
55
+ Name of the stage from YAML pipeline configuration. Used by
56
+ stage-aware decorators for consistent naming.
52
57
  _input_queue : Optional[Any]
53
58
  Handle to the Ray queue from which input items are read.
54
59
  Expected to be set via `set_input_queue`.
@@ -80,7 +85,7 @@ class RayActorStage(ABC):
80
85
  Lock to protect access to shutdown-related state (`_shutting_down`).
81
86
  """
82
87
 
83
- def __init__(self, config: BaseModel, log_to_stdout=False) -> None:
88
+ def __init__(self, config: BaseModel, stage_name: Optional[str] = None, log_to_stdout=False) -> None:
84
89
  """
85
90
  Initialize the RayActorStage.
86
91
 
@@ -89,8 +94,14 @@ class RayActorStage(ABC):
89
94
  config : BaseModel
90
95
  Configuration object specific to the stage's behavior. Passed by
91
96
  the orchestrator during actor creation.
97
+ stage_name : Optional[str]
98
+ Name of the stage from YAML pipeline configuration. Used by
99
+ stage-aware decorators for consistent naming.
100
+ log_to_stdout : bool
101
+ Whether to enable stdout logging.
92
102
  """
93
103
  self.config: BaseModel = config
104
+ self.stage_name: Optional[str] = stage_name
94
105
  self._input_queue: Optional[Any] = None # Ray Queue handle expected
95
106
  self._output_queue: Optional[Any] = None # Ray Queue handle expected
96
107
  self._running: bool = False
@@ -129,6 +140,14 @@ class RayActorStage(ABC):
129
140
 
130
141
  self._actor_id_str = self._get_actor_id_str()
131
142
 
143
+ # --- PyArrow Memory Management ---
144
+ # Time-based periodic cleanup to prevent long-term memory accumulation
145
+ self._memory_cleanup_interval_seconds = getattr(
146
+ config, "memory_cleanup_interval_seconds", 300
147
+ ) # 5 minutes default
148
+ self._last_memory_cleanup_time = time.time()
149
+ self._memory_cleanups_performed = 0
150
+
132
151
  @staticmethod
133
152
  def _get_actor_id_str() -> str:
134
153
  """
@@ -344,6 +363,16 @@ class RayActorStage(ABC):
344
363
  # This is the primary path for "successful processing".
345
364
  self.stats["processed"] += 1
346
365
 
366
+ # Time-based PyArrow memory cleanup check (best-effort, low overhead)
367
+ try:
368
+ current_time = time.time()
369
+ if (current_time - self._last_memory_cleanup_time) >= self._memory_cleanup_interval_seconds:
370
+ self._force_arrow_memory_cleanup()
371
+ self._last_memory_cleanup_time = current_time
372
+ except Exception:
373
+ # Never allow cleanup issues to interfere with processing
374
+ pass
375
+
347
376
  except ray.exceptions.ObjectLostError:
348
377
  # This error is handled inside the loop to prevent the actor from crashing.
349
378
  # We log it and continue to the next message.
@@ -386,10 +415,69 @@ class RayActorStage(ABC):
386
415
  # This block executes when the processing thread is about to exit,
387
416
  # either due to self._running becoming False or an unhandled critical exception.
388
417
  self._logger.debug(f"[{self._actor_id_str}] Processing loop thread finished.")
418
+ # Perform a best-effort final memory cleanup on exit
419
+ try:
420
+ self._force_arrow_memory_cleanup()
421
+ except Exception:
422
+ pass
389
423
  # Signal that this actor's processing duties are complete.
390
424
  # External monitors (e.g., via a future from stop()) can use this signal.
391
425
  self._shutdown_signal_complete = True
392
426
 
427
+ def _force_arrow_memory_cleanup(self) -> None:
428
+ """
429
+ Best-effort memory cleanup for PyArrow allocations.
430
+
431
+ - Runs Python garbage collection to drop unreachable references.
432
+ - If PyArrow is available and its default memory pool supports
433
+ release_unused(), request it to return free pages to the OS.
434
+
435
+ Designed to be safe to call periodically; any failures are logged at
436
+ debug level and are non-fatal.
437
+ """
438
+ try:
439
+ # First, trigger Python GC to maximize reclaimable memory
440
+ gc.collect()
441
+
442
+ try:
443
+ pool = pa.default_memory_pool()
444
+ try:
445
+ before_bytes = getattr(pool, "bytes_allocated", lambda: 0)()
446
+ except Exception:
447
+ before_bytes = 0
448
+
449
+ released = False
450
+ if hasattr(pool, "release_unused"):
451
+ try:
452
+ pool.release_unused()
453
+ released = True
454
+ except Exception as e_release:
455
+ self._logger.debug(f"[{self._actor_id_str}] Arrow pool release_unused() failed: {e_release}")
456
+
457
+ try:
458
+ after_bytes = getattr(pool, "bytes_allocated", lambda: before_bytes)()
459
+ except Exception:
460
+ after_bytes = before_bytes
461
+
462
+ if released:
463
+ delta_mb = max(0, (before_bytes - after_bytes) / (1024 * 1024))
464
+ if delta_mb > 0:
465
+ self._logger.debug(
466
+ f"[{self._actor_id_str}] Arrow cleanup released ~{delta_mb:.2f}"
467
+ f" MB (pool now {after_bytes/(1024*1024):.2f} MB)."
468
+ )
469
+ self._memory_cleanups_performed += 1
470
+ except ModuleNotFoundError:
471
+ # PyArrow not present; nothing to do beyond GC.
472
+ self._memory_cleanups_performed += 1
473
+ except Exception as e_pa:
474
+ # Any other PyArrow-related issues are non-fatal.
475
+ self._logger.debug(f"[{self._actor_id_str}] Arrow cleanup skipped due to error: {e_pa}")
476
+ self._memory_cleanups_performed += 1
477
+ except Exception as e:
478
+ # As a last resort, swallow any errors to avoid interfering with the actor loop.
479
+ self._logger.debug(f"[{self._actor_id_str}] Memory cleanup encountered an error: {e}")
480
+
393
481
  def _get_memory_usage_mb(self) -> float:
394
482
  """
395
483
  Gets the total memory usage of the current actor process (RSS).
@@ -500,7 +588,7 @@ class RayActorStage(ABC):
500
588
  self._logger.warning(f"{self._actor_id_str}: Start called but actor is already running.")
501
589
  return False
502
590
 
503
- self._logger.info(f"{self._actor_id_str}: Starting actor...")
591
+ self._logger.debug(f"{self._actor_id_str}: Starting actor...")
504
592
  # --- Initialize Actor State ---
505
593
  self._running = True
506
594
  self._shutting_down = False # Reset shutdown flag on start
@@ -519,14 +607,14 @@ class RayActorStage(ABC):
519
607
  )
520
608
  self._processing_thread.start()
521
609
 
522
- self._logger.info(f"{self._actor_id_str}: Actor started successfully.")
610
+ self._logger.debug(f"{self._actor_id_str}: Actor started successfully.")
523
611
 
524
612
  return True
525
613
 
526
614
  @ray.method(num_returns=0)
527
615
  def stop(self) -> None:
528
616
  """Stops the actor's processing loop by setting the running flag to False."""
529
- self._logger.info(f"[{self._actor_id_str}] Stop signal received. Initiating graceful shutdown.")
617
+ self._logger.debug(f"[{self._actor_id_str}] Stop signal received. Initiating graceful shutdown.")
530
618
  self._running = False
531
619
 
532
620
  def is_shutdown_complete(self) -> bool:
@@ -4,11 +4,13 @@
4
4
 
5
5
 
6
6
  import logging
7
+ from typing import Optional
7
8
 
8
9
  import ray
9
10
 
10
11
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
12
  from nv_ingest.framework.util.flow_control import filter_by_task
13
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
12
14
  from nv_ingest_api.internal.mutate.deduplicate import deduplicate_images_internal
13
15
  from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
14
16
  from nv_ingest_api.internal.primitives.tracing.tagging import traceable
@@ -16,6 +18,7 @@ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import Imag
16
18
  from nv_ingest_api.util.exception_handlers.decorators import (
17
19
  nv_ingest_node_failure_try_except,
18
20
  )
21
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
19
22
 
20
23
  logger = logging.getLogger(__name__)
21
24
 
@@ -31,18 +34,19 @@ class ImageDedupStage(RayActorStage):
31
34
  3. Updates the message payload with the deduplicated DataFrame.
32
35
  """
33
36
 
34
- def __init__(self, config: ImageDedupSchema) -> None:
35
- super().__init__(config)
37
+ def __init__(self, config: ImageDedupSchema, stage_name: Optional[str] = None) -> None:
38
+ super().__init__(config, stage_name=stage_name)
36
39
  try:
37
40
  self.validated_config = config
38
- logger.info("ImageDedupStage configuration validated successfully.")
41
+ logger.debug("ImageDedupStage configuration validated successfully.")
39
42
  except Exception as e:
40
43
  logger.exception(f"Error validating Image Deduplication config: {e}")
41
44
  raise
42
45
 
43
- @traceable("image_deduplication")
46
+ @nv_ingest_node_failure_try_except()
47
+ @traceable()
48
+ @udf_intercept_hook()
44
49
  @filter_by_task(required_tasks=["dedup"])
45
- @nv_ingest_node_failure_try_except(annotation_id="image_dedup", raise_on_failure=False)
46
50
  def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
47
51
  """
48
52
  Process the control message by deduplicating images.
@@ -57,7 +61,7 @@ class ImageDedupStage(RayActorStage):
57
61
  IngestControlMessage
58
62
  The updated message with deduplicated images in the payload.
59
63
  """
60
- logger.info("ImageDedupStage.on_data: Starting image deduplication process.")
64
+ logger.debug("ImageDedupStage.on_data: Starting image deduplication process.")
61
65
  try:
62
66
  # Extract the DataFrame payload.
63
67
  df_ledger = control_message.payload()
@@ -65,7 +69,7 @@ class ImageDedupStage(RayActorStage):
65
69
 
66
70
  # Remove the "dedup" task from the message to obtain task-specific configuration.
67
71
  task_config = remove_task_by_type(control_message, "dedup")
68
- logger.debug("Extracted task config: %s", task_config)
72
+ logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
69
73
 
70
74
  # Perform image deduplication.
71
75
  new_df = deduplicate_images_internal(
@@ -74,7 +78,7 @@ class ImageDedupStage(RayActorStage):
74
78
  mutate_config=self.validated_config,
75
79
  execution_trace_log=None,
76
80
  )
77
- logger.info("Image deduplication completed. Resulting DataFrame has %d rows.", len(new_df))
81
+ logger.debug("Image deduplication completed. Resulting DataFrame has %d rows.", len(new_df))
78
82
 
79
83
  # Update the message payload with the deduplicated DataFrame.
80
84
  control_message.payload(new_df)