nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +203 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +1300 -0
- nv_ingest/framework/orchestration/execution/__init__.py +3 -0
- nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest/framework/orchestration/process/__init__.py +3 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
- nv_ingest/framework/orchestration/process/execution.py +495 -0
- nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
- nv_ingest/framework/orchestration/process/strategies.py +218 -0
- nv_ingest/framework/orchestration/process/termination.py +147 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
- nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest/pipeline/config/loaders.py +229 -0
- nv_ingest/pipeline/config/replica_resolver.py +237 -0
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
- nv_ingest/pipeline/default_pipeline_impl.py +557 -0
- nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import ray
|
|
7
|
+
|
|
8
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
9
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
10
|
+
from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
|
|
11
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
12
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
|
|
13
|
+
from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
|
|
14
|
+
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@ray.remote
|
|
23
|
+
class OCRExtractorStage(RayActorStage):
|
|
24
|
+
"""
|
|
25
|
+
A Ray actor stage that extracts text data from image content.
|
|
26
|
+
|
|
27
|
+
It expects an IngestControlMessage containing a DataFrame with image data. It then:
|
|
28
|
+
1. Removes the "ocr_data_extract" task from the message.
|
|
29
|
+
2. Calls the text extraction logic using a validated configuration.
|
|
30
|
+
3. Updates the message payload with the extracted text DataFrame.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
34
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
35
|
+
try:
|
|
36
|
+
self.validated_config = config
|
|
37
|
+
self._logger.info("OCRExtractorStage configuration validated successfully.")
|
|
38
|
+
except Exception as e:
|
|
39
|
+
self._logger.exception(f"Error validating Text extractor config: {e}")
|
|
40
|
+
raise
|
|
41
|
+
|
|
42
|
+
@nv_ingest_node_failure_try_except()
|
|
43
|
+
@traceable()
|
|
44
|
+
@udf_intercept_hook()
|
|
45
|
+
@filter_by_task(required_tasks=["ocr_data_extract"])
|
|
46
|
+
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
|
+
# Extract DataFrame payload
|
|
48
|
+
df_ledger = control_message.payload()
|
|
49
|
+
if df_ledger.empty:
|
|
50
|
+
return control_message
|
|
51
|
+
|
|
52
|
+
# Remove the "ocr_data_extract" task from the message
|
|
53
|
+
task_config = remove_task_by_type(control_message, "ocr_data_extract")
|
|
54
|
+
|
|
55
|
+
execution_trace_log = {}
|
|
56
|
+
new_df, extraction_info = extract_text_data_from_image_internal(
|
|
57
|
+
df_extraction_ledger=df_ledger,
|
|
58
|
+
task_config=task_config,
|
|
59
|
+
extraction_config=self.validated_config,
|
|
60
|
+
execution_trace_log=execution_trace_log,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
control_message.payload(new_df)
|
|
64
|
+
control_message.set_metadata("ocr_extraction_info", extraction_info)
|
|
65
|
+
|
|
66
|
+
do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
|
|
67
|
+
if do_trace_tagging and execution_trace_log:
|
|
68
|
+
parent_name = self.stage_name if self.stage_name else "ocr_extractor"
|
|
69
|
+
set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
|
|
70
|
+
|
|
71
|
+
return control_message
|
|
@@ -7,16 +7,18 @@ import pandas as pd
|
|
|
7
7
|
from typing import Any, Dict, Tuple, Optional
|
|
8
8
|
import ray
|
|
9
9
|
|
|
10
|
-
# Assume these imports come from your project:
|
|
11
|
-
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
12
|
-
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
10
|
from nv_ingest_api.internal.extract.pdf.pdf_extractor import extract_primitives_from_pdf_internal
|
|
14
11
|
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
15
|
-
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
16
12
|
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
|
|
13
|
+
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import set_trace_timestamps_with_parent_context, traceable
|
|
15
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
16
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
17
18
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
18
19
|
nv_ingest_node_failure_try_except,
|
|
19
20
|
)
|
|
21
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
20
22
|
|
|
21
23
|
logger = logging.getLogger(__name__)
|
|
22
24
|
|
|
@@ -51,19 +53,20 @@ class PDFExtractorStage(RayActorStage):
|
|
|
51
53
|
4. Optionally, stores additional extraction info in the message metadata.
|
|
52
54
|
"""
|
|
53
55
|
|
|
54
|
-
def __init__(self, config: PDFExtractorSchema) -> None:
|
|
55
|
-
super().__init__(config)
|
|
56
|
+
def __init__(self, config: PDFExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
57
|
+
super().__init__(config, stage_name=stage_name)
|
|
56
58
|
try:
|
|
57
59
|
# Validate and store the PDF extractor configuration.
|
|
58
60
|
self.validated_config = config
|
|
59
|
-
logger.
|
|
61
|
+
logger.debug("PDFExtractorStage configuration validated successfully.")
|
|
60
62
|
except Exception as e:
|
|
61
63
|
logger.exception(f"Error validating PDF extractor config: {e}")
|
|
62
64
|
raise
|
|
63
65
|
|
|
64
|
-
@
|
|
66
|
+
@nv_ingest_node_failure_try_except()
|
|
67
|
+
@traceable()
|
|
68
|
+
@udf_intercept_hook()
|
|
65
69
|
@filter_by_task(required_tasks=[("extract", {"document_type": "pdf"})])
|
|
66
|
-
@nv_ingest_node_failure_try_except(annotation_id="pdf_extractor", raise_on_failure=False)
|
|
67
70
|
def on_data(self, control_message: Any) -> Any:
|
|
68
71
|
"""
|
|
69
72
|
Process the control message by extracting PDF content.
|
|
@@ -79,7 +82,7 @@ class PDFExtractorStage(RayActorStage):
|
|
|
79
82
|
The updated message with the extracted DataFrame and extraction info in metadata.
|
|
80
83
|
"""
|
|
81
84
|
|
|
82
|
-
logger.
|
|
85
|
+
logger.debug("PDFExtractorStage.on_data: Starting PDF extraction process.")
|
|
83
86
|
|
|
84
87
|
# Extract the DataFrame payload.
|
|
85
88
|
df_extraction_ledger = control_message.payload()
|
|
@@ -87,7 +90,7 @@ class PDFExtractorStage(RayActorStage):
|
|
|
87
90
|
|
|
88
91
|
# Remove the "extract" task from the message to obtain task-specific configuration.
|
|
89
92
|
task_config = remove_task_by_type(control_message, "extract")
|
|
90
|
-
logger.debug("Extracted task config: %s", task_config)
|
|
93
|
+
logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
91
94
|
|
|
92
95
|
# Perform PDF extraction.
|
|
93
96
|
execution_trace_log = {}
|
|
@@ -97,17 +100,18 @@ class PDFExtractorStage(RayActorStage):
|
|
|
97
100
|
execution_trace_log=execution_trace_log,
|
|
98
101
|
validated_config=self.validated_config,
|
|
99
102
|
)
|
|
100
|
-
logger.
|
|
103
|
+
logger.debug("PDF extraction completed. Extracted %d rows.", len(new_df))
|
|
101
104
|
|
|
102
105
|
# Update the message payload with the extracted DataFrame.
|
|
103
106
|
control_message.payload(new_df)
|
|
104
107
|
# Optionally, annotate the message with extraction info.
|
|
105
108
|
control_message.set_metadata("pdf_extraction_info", extraction_info)
|
|
106
|
-
logger.
|
|
109
|
+
logger.debug("PDF extraction metadata injected successfully.")
|
|
107
110
|
|
|
108
111
|
do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
|
|
109
112
|
if do_trace_tagging and execution_trace_log:
|
|
110
|
-
|
|
111
|
-
|
|
113
|
+
# Use utility function to set trace timestamps with proper parent-child context
|
|
114
|
+
parent_name = self.stage_name or "pdf_extractor"
|
|
115
|
+
set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
|
|
112
116
|
|
|
113
117
|
return control_message
|
|
@@ -3,8 +3,10 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
from typing import Optional
|
|
6
7
|
|
|
7
8
|
import ray
|
|
9
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
8
10
|
|
|
9
11
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
10
12
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
@@ -28,7 +30,7 @@ class PPTXExtractorStage(RayActorStage):
|
|
|
28
30
|
3. Updates the message payload with the extracted content DataFrame.
|
|
29
31
|
"""
|
|
30
32
|
|
|
31
|
-
def __init__(self, config: PPTXExtractorSchema) -> None:
|
|
33
|
+
def __init__(self, config: PPTXExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
32
34
|
"""
|
|
33
35
|
Initializes the PPTXExtractorStage.
|
|
34
36
|
|
|
@@ -36,8 +38,10 @@ class PPTXExtractorStage(RayActorStage):
|
|
|
36
38
|
----------
|
|
37
39
|
config : PPTXExtractorSchema
|
|
38
40
|
The validated configuration object for PPTX extraction.
|
|
41
|
+
stage_name : Optional[str]
|
|
42
|
+
Name of the stage from YAML pipeline configuration.
|
|
39
43
|
"""
|
|
40
|
-
super().__init__(config)
|
|
44
|
+
super().__init__(config, stage_name=stage_name)
|
|
41
45
|
try:
|
|
42
46
|
# The config passed in should already be validated, but storing it.
|
|
43
47
|
self.validated_config = config
|
|
@@ -47,9 +51,10 @@ class PPTXExtractorStage(RayActorStage):
|
|
|
47
51
|
logger.exception(f"Error initializing or validating PPTX Extractor config: {e}")
|
|
48
52
|
raise
|
|
49
53
|
|
|
50
|
-
@
|
|
54
|
+
@nv_ingest_node_failure_try_except()
|
|
55
|
+
@traceable()
|
|
56
|
+
@udf_intercept_hook()
|
|
51
57
|
@filter_by_task(required_tasks=[("extract", {"document_type": "pptx"})])
|
|
52
|
-
@nv_ingest_node_failure_try_except(annotation_id="pptx_extractor", raise_on_failure=False)
|
|
53
58
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
54
59
|
"""
|
|
55
60
|
Process the control message by extracting content from PPTX documents.
|
|
@@ -80,6 +85,6 @@ class PPTXExtractorStage(RayActorStage):
|
|
|
80
85
|
|
|
81
86
|
# Update the message payload with the extracted PPTX content DataFrame.
|
|
82
87
|
control_message.payload(new_df)
|
|
83
|
-
control_message.set_metadata("pptx_extraction_info", extraction_info)
|
|
88
|
+
control_message.set_metadata("pptx_extraction_info", extraction_info)
|
|
84
89
|
|
|
85
90
|
return control_message
|
|
@@ -3,19 +3,20 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Optional
|
|
7
7
|
import ray
|
|
8
8
|
|
|
9
|
-
# These imports are assumed from your project.
|
|
10
9
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
10
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
11
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
12
12
|
from nv_ingest_api.internal.extract.image.table_extractor import extract_table_data_from_image_internal
|
|
13
13
|
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
14
|
-
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
|
|
15
15
|
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
|
|
16
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
17
|
nv_ingest_node_failure_try_except,
|
|
18
18
|
)
|
|
19
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
19
20
|
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
21
22
|
|
|
@@ -31,18 +32,19 @@ class TableExtractorStage(RayActorStage):
|
|
|
31
32
|
and annotates the message metadata with extraction info.
|
|
32
33
|
"""
|
|
33
34
|
|
|
34
|
-
def __init__(self, config: TableExtractorSchema) -> None:
|
|
35
|
-
super().__init__(config)
|
|
35
|
+
def __init__(self, config: TableExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
36
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
37
|
try:
|
|
37
38
|
self.validated_config = config
|
|
38
|
-
logger.
|
|
39
|
+
logger.debug("TableExtractorStage configuration validated successfully.")
|
|
39
40
|
except Exception as e:
|
|
40
41
|
logger.exception("Error validating table extractor config")
|
|
41
42
|
raise e
|
|
42
43
|
|
|
43
|
-
@
|
|
44
|
+
@nv_ingest_node_failure_try_except()
|
|
45
|
+
@traceable()
|
|
46
|
+
@udf_intercept_hook()
|
|
44
47
|
@filter_by_task(required_tasks=["table_data_extract"])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="table_extraction", raise_on_failure=False)
|
|
46
48
|
def on_data(self, control_message: Any) -> Any:
|
|
47
49
|
"""
|
|
48
50
|
Process the control message by extracting table data from the PDF payload.
|
|
@@ -57,14 +59,14 @@ class TableExtractorStage(RayActorStage):
|
|
|
57
59
|
IngestControlMessage
|
|
58
60
|
The updated message with the extracted table data and extraction info in metadata.
|
|
59
61
|
"""
|
|
60
|
-
logger.
|
|
62
|
+
logger.debug("TableExtractorStage.on_data: Starting table extraction.")
|
|
61
63
|
# Extract the DataFrame payload.
|
|
62
64
|
df_payload = control_message.payload()
|
|
63
65
|
logger.debug("Extracted payload with %d rows.", len(df_payload))
|
|
64
66
|
|
|
65
67
|
# Remove the "table_data_extract" task to obtain task-specific configuration.
|
|
66
68
|
task_config = remove_task_by_type(control_message, "table_data_extract")
|
|
67
|
-
logger.debug("Extracted task configuration: %s", task_config)
|
|
69
|
+
logger.debug("Extracted task configuration: %s", sanitize_for_logging(task_config))
|
|
68
70
|
|
|
69
71
|
# Perform table data extraction.
|
|
70
72
|
execution_trace_log = {}
|
|
@@ -74,17 +76,17 @@ class TableExtractorStage(RayActorStage):
|
|
|
74
76
|
extraction_config=self.validated_config,
|
|
75
77
|
execution_trace_log=execution_trace_log,
|
|
76
78
|
)
|
|
77
|
-
logger.
|
|
79
|
+
logger.debug("Table extraction completed. Extracted %d rows.", len(new_df))
|
|
78
80
|
|
|
79
81
|
# Update the control message with the new DataFrame.
|
|
80
82
|
control_message.payload(new_df)
|
|
81
83
|
# Annotate the message with extraction info.
|
|
82
84
|
control_message.set_metadata("table_extraction_info", extraction_info)
|
|
83
|
-
logger.
|
|
85
|
+
logger.debug("Table extraction metadata injected successfully.")
|
|
84
86
|
|
|
85
87
|
do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
|
|
86
88
|
if do_trace_tagging and execution_trace_log:
|
|
87
|
-
|
|
88
|
-
|
|
89
|
+
parent_name = self.stage_name if self.stage_name else "table_extractor"
|
|
90
|
+
set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
|
|
89
91
|
|
|
90
92
|
return control_message
|
|
@@ -4,12 +4,14 @@
|
|
|
4
4
|
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
import logging
|
|
7
|
+
from typing import Optional
|
|
7
8
|
import pandas as pd
|
|
8
|
-
from typing import Any
|
|
9
9
|
from pydantic import BaseModel
|
|
10
10
|
import ray
|
|
11
11
|
|
|
12
12
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
13
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
13
15
|
from nv_ingest_api.internal.enums.common import (
|
|
14
16
|
DocumentTypeEnum,
|
|
15
17
|
ContentTypeEnum,
|
|
@@ -17,14 +19,14 @@ from nv_ingest_api.internal.enums.common import (
|
|
|
17
19
|
TextTypeEnum,
|
|
18
20
|
LanguageEnum,
|
|
19
21
|
)
|
|
20
|
-
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
21
22
|
from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentHierarchySchema
|
|
22
23
|
from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
|
|
23
24
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
24
25
|
nv_ingest_node_failure_try_except,
|
|
25
26
|
)
|
|
27
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
28
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
26
29
|
|
|
27
|
-
# logging.basicConfig(level=logging.DEBUG)
|
|
28
30
|
logger = logging.getLogger(__name__)
|
|
29
31
|
|
|
30
32
|
|
|
@@ -37,15 +39,16 @@ class MetadataInjectionStage(RayActorStage):
|
|
|
37
39
|
injection is required, and if so, injects the appropriate metadata.
|
|
38
40
|
"""
|
|
39
41
|
|
|
40
|
-
def __init__(self, config: BaseModel) -> None:
|
|
42
|
+
def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
|
|
41
43
|
# Call the base initializer to set attributes like self._running.
|
|
42
|
-
super().__init__(config)
|
|
44
|
+
super().__init__(config, stage_name=stage_name)
|
|
43
45
|
# Additional initialization can be added here if necessary.
|
|
44
|
-
|
|
46
|
+
self._logger.debug("MetadataInjectionStage initialized with config: %s", sanitize_for_logging(config))
|
|
45
47
|
|
|
46
|
-
@
|
|
47
|
-
@
|
|
48
|
-
|
|
48
|
+
@nv_ingest_node_failure_try_except()
|
|
49
|
+
@traceable()
|
|
50
|
+
@udf_intercept_hook()
|
|
51
|
+
def on_data(self, message: IngestControlMessage) -> IngestControlMessage:
|
|
49
52
|
"""
|
|
50
53
|
Process an incoming IngestControlMessage by injecting metadata into its DataFrame payload.
|
|
51
54
|
|
|
@@ -62,7 +65,7 @@ class MetadataInjectionStage(RayActorStage):
|
|
|
62
65
|
df = message.payload()
|
|
63
66
|
update_required = False
|
|
64
67
|
rows = []
|
|
65
|
-
logger.
|
|
68
|
+
logger.debug("Starting metadata injection on DataFrame with %d rows", len(df))
|
|
66
69
|
|
|
67
70
|
for _, row in df.iterrows():
|
|
68
71
|
try:
|
|
@@ -141,7 +144,7 @@ class MetadataInjectionStage(RayActorStage):
|
|
|
141
144
|
"source_metadata": default_source_metadata,
|
|
142
145
|
"text_metadata": default_text_metadata,
|
|
143
146
|
}
|
|
144
|
-
logger.
|
|
147
|
+
logger.debug(
|
|
145
148
|
f"METADATA_INJECTOR_DEBUG: Rebuilt metadata for source_id='{row.get('source_id', 'N/A')}'. "
|
|
146
149
|
f"Metadata keys: {list(row['metadata'].keys())}."
|
|
147
150
|
f"'content' present: {'content' in row['metadata']}"
|
|
@@ -154,8 +157,8 @@ class MetadataInjectionStage(RayActorStage):
|
|
|
154
157
|
if update_required:
|
|
155
158
|
docs = pd.DataFrame(rows)
|
|
156
159
|
message.payload(docs)
|
|
157
|
-
logger.
|
|
160
|
+
logger.debug("Metadata injection updated payload with %d rows", len(docs))
|
|
158
161
|
else:
|
|
159
|
-
logger.
|
|
162
|
+
logger.debug("No metadata update was necessary during metadata injection")
|
|
160
163
|
|
|
161
164
|
return message
|
|
@@ -21,6 +21,9 @@ class RayActorSinkStage(RayActorStage, ABC):
|
|
|
21
21
|
to deliver their final processed messages.
|
|
22
22
|
"""
|
|
23
23
|
|
|
24
|
+
def __init__(self, config: Any, log_to_stdout=False, stage_name: Optional[str] = None) -> None:
|
|
25
|
+
super().__init__(config, log_to_stdout=log_to_stdout, stage_name=stage_name)
|
|
26
|
+
|
|
24
27
|
@ray.method(num_returns=1)
|
|
25
28
|
def set_output_queue(self, queue_handle: any) -> bool:
|
|
26
29
|
raise NotImplementedError("Sink stages do not support an output queue.")
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
from abc import ABC, abstractmethod
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Optional
|
|
7
7
|
import ray
|
|
8
8
|
import logging
|
|
9
9
|
|
|
@@ -19,8 +19,8 @@ class RayActorSourceStage(RayActorStage, ABC):
|
|
|
19
19
|
Instead, they must implement get_input() to fetch control messages from an external source.
|
|
20
20
|
"""
|
|
21
21
|
|
|
22
|
-
def __init__(self, config: Any, log_to_stdout=False) -> None:
|
|
23
|
-
super().__init__(config, log_to_stdout=log_to_stdout)
|
|
22
|
+
def __init__(self, config: Any, log_to_stdout=False, stage_name: Optional[str] = None) -> None:
|
|
23
|
+
super().__init__(config, log_to_stdout=log_to_stdout, stage_name=stage_name)
|
|
24
24
|
self.paused = False
|
|
25
25
|
|
|
26
26
|
def on_data(self, IngestControlMessage):
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
import gc
|
|
5
6
|
import sys
|
|
6
7
|
import threading
|
|
7
8
|
import time
|
|
@@ -14,6 +15,7 @@ import ray
|
|
|
14
15
|
import ray.actor
|
|
15
16
|
from pydantic import BaseModel
|
|
16
17
|
import logging
|
|
18
|
+
import pyarrow as pa
|
|
17
19
|
|
|
18
20
|
from ray import get_runtime_context
|
|
19
21
|
|
|
@@ -49,6 +51,9 @@ class RayActorStage(ABC):
|
|
|
49
51
|
----------
|
|
50
52
|
config : BaseModel
|
|
51
53
|
Configuration object for the stage.
|
|
54
|
+
stage_name : Optional[str]
|
|
55
|
+
Name of the stage from YAML pipeline configuration. Used by
|
|
56
|
+
stage-aware decorators for consistent naming.
|
|
52
57
|
_input_queue : Optional[Any]
|
|
53
58
|
Handle to the Ray queue from which input items are read.
|
|
54
59
|
Expected to be set via `set_input_queue`.
|
|
@@ -80,7 +85,7 @@ class RayActorStage(ABC):
|
|
|
80
85
|
Lock to protect access to shutdown-related state (`_shutting_down`).
|
|
81
86
|
"""
|
|
82
87
|
|
|
83
|
-
def __init__(self, config: BaseModel, log_to_stdout=False) -> None:
|
|
88
|
+
def __init__(self, config: BaseModel, stage_name: Optional[str] = None, log_to_stdout=False) -> None:
|
|
84
89
|
"""
|
|
85
90
|
Initialize the RayActorStage.
|
|
86
91
|
|
|
@@ -89,8 +94,14 @@ class RayActorStage(ABC):
|
|
|
89
94
|
config : BaseModel
|
|
90
95
|
Configuration object specific to the stage's behavior. Passed by
|
|
91
96
|
the orchestrator during actor creation.
|
|
97
|
+
stage_name : Optional[str]
|
|
98
|
+
Name of the stage from YAML pipeline configuration. Used by
|
|
99
|
+
stage-aware decorators for consistent naming.
|
|
100
|
+
log_to_stdout : bool
|
|
101
|
+
Whether to enable stdout logging.
|
|
92
102
|
"""
|
|
93
103
|
self.config: BaseModel = config
|
|
104
|
+
self.stage_name: Optional[str] = stage_name
|
|
94
105
|
self._input_queue: Optional[Any] = None # Ray Queue handle expected
|
|
95
106
|
self._output_queue: Optional[Any] = None # Ray Queue handle expected
|
|
96
107
|
self._running: bool = False
|
|
@@ -129,6 +140,14 @@ class RayActorStage(ABC):
|
|
|
129
140
|
|
|
130
141
|
self._actor_id_str = self._get_actor_id_str()
|
|
131
142
|
|
|
143
|
+
# --- PyArrow Memory Management ---
|
|
144
|
+
# Time-based periodic cleanup to prevent long-term memory accumulation
|
|
145
|
+
self._memory_cleanup_interval_seconds = getattr(
|
|
146
|
+
config, "memory_cleanup_interval_seconds", 300
|
|
147
|
+
) # 5 minutes default
|
|
148
|
+
self._last_memory_cleanup_time = time.time()
|
|
149
|
+
self._memory_cleanups_performed = 0
|
|
150
|
+
|
|
132
151
|
@staticmethod
|
|
133
152
|
def _get_actor_id_str() -> str:
|
|
134
153
|
"""
|
|
@@ -344,6 +363,16 @@ class RayActorStage(ABC):
|
|
|
344
363
|
# This is the primary path for "successful processing".
|
|
345
364
|
self.stats["processed"] += 1
|
|
346
365
|
|
|
366
|
+
# Time-based PyArrow memory cleanup check (best-effort, low overhead)
|
|
367
|
+
try:
|
|
368
|
+
current_time = time.time()
|
|
369
|
+
if (current_time - self._last_memory_cleanup_time) >= self._memory_cleanup_interval_seconds:
|
|
370
|
+
self._force_arrow_memory_cleanup()
|
|
371
|
+
self._last_memory_cleanup_time = current_time
|
|
372
|
+
except Exception:
|
|
373
|
+
# Never allow cleanup issues to interfere with processing
|
|
374
|
+
pass
|
|
375
|
+
|
|
347
376
|
except ray.exceptions.ObjectLostError:
|
|
348
377
|
# This error is handled inside the loop to prevent the actor from crashing.
|
|
349
378
|
# We log it and continue to the next message.
|
|
@@ -386,10 +415,69 @@ class RayActorStage(ABC):
|
|
|
386
415
|
# This block executes when the processing thread is about to exit,
|
|
387
416
|
# either due to self._running becoming False or an unhandled critical exception.
|
|
388
417
|
self._logger.debug(f"[{self._actor_id_str}] Processing loop thread finished.")
|
|
418
|
+
# Perform a best-effort final memory cleanup on exit
|
|
419
|
+
try:
|
|
420
|
+
self._force_arrow_memory_cleanup()
|
|
421
|
+
except Exception:
|
|
422
|
+
pass
|
|
389
423
|
# Signal that this actor's processing duties are complete.
|
|
390
424
|
# External monitors (e.g., via a future from stop()) can use this signal.
|
|
391
425
|
self._shutdown_signal_complete = True
|
|
392
426
|
|
|
427
|
+
def _force_arrow_memory_cleanup(self) -> None:
|
|
428
|
+
"""
|
|
429
|
+
Best-effort memory cleanup for PyArrow allocations.
|
|
430
|
+
|
|
431
|
+
- Runs Python garbage collection to drop unreachable references.
|
|
432
|
+
- If PyArrow is available and its default memory pool supports
|
|
433
|
+
release_unused(), request it to return free pages to the OS.
|
|
434
|
+
|
|
435
|
+
Designed to be safe to call periodically; any failures are logged at
|
|
436
|
+
debug level and are non-fatal.
|
|
437
|
+
"""
|
|
438
|
+
try:
|
|
439
|
+
# First, trigger Python GC to maximize reclaimable memory
|
|
440
|
+
gc.collect()
|
|
441
|
+
|
|
442
|
+
try:
|
|
443
|
+
pool = pa.default_memory_pool()
|
|
444
|
+
try:
|
|
445
|
+
before_bytes = getattr(pool, "bytes_allocated", lambda: 0)()
|
|
446
|
+
except Exception:
|
|
447
|
+
before_bytes = 0
|
|
448
|
+
|
|
449
|
+
released = False
|
|
450
|
+
if hasattr(pool, "release_unused"):
|
|
451
|
+
try:
|
|
452
|
+
pool.release_unused()
|
|
453
|
+
released = True
|
|
454
|
+
except Exception as e_release:
|
|
455
|
+
self._logger.debug(f"[{self._actor_id_str}] Arrow pool release_unused() failed: {e_release}")
|
|
456
|
+
|
|
457
|
+
try:
|
|
458
|
+
after_bytes = getattr(pool, "bytes_allocated", lambda: before_bytes)()
|
|
459
|
+
except Exception:
|
|
460
|
+
after_bytes = before_bytes
|
|
461
|
+
|
|
462
|
+
if released:
|
|
463
|
+
delta_mb = max(0, (before_bytes - after_bytes) / (1024 * 1024))
|
|
464
|
+
if delta_mb > 0:
|
|
465
|
+
self._logger.debug(
|
|
466
|
+
f"[{self._actor_id_str}] Arrow cleanup released ~{delta_mb:.2f}"
|
|
467
|
+
f" MB (pool now {after_bytes/(1024*1024):.2f} MB)."
|
|
468
|
+
)
|
|
469
|
+
self._memory_cleanups_performed += 1
|
|
470
|
+
except ModuleNotFoundError:
|
|
471
|
+
# PyArrow not present; nothing to do beyond GC.
|
|
472
|
+
self._memory_cleanups_performed += 1
|
|
473
|
+
except Exception as e_pa:
|
|
474
|
+
# Any other PyArrow-related issues are non-fatal.
|
|
475
|
+
self._logger.debug(f"[{self._actor_id_str}] Arrow cleanup skipped due to error: {e_pa}")
|
|
476
|
+
self._memory_cleanups_performed += 1
|
|
477
|
+
except Exception as e:
|
|
478
|
+
# As a last resort, swallow any errors to avoid interfering with the actor loop.
|
|
479
|
+
self._logger.debug(f"[{self._actor_id_str}] Memory cleanup encountered an error: {e}")
|
|
480
|
+
|
|
393
481
|
def _get_memory_usage_mb(self) -> float:
|
|
394
482
|
"""
|
|
395
483
|
Gets the total memory usage of the current actor process (RSS).
|
|
@@ -500,7 +588,7 @@ class RayActorStage(ABC):
|
|
|
500
588
|
self._logger.warning(f"{self._actor_id_str}: Start called but actor is already running.")
|
|
501
589
|
return False
|
|
502
590
|
|
|
503
|
-
self._logger.
|
|
591
|
+
self._logger.debug(f"{self._actor_id_str}: Starting actor...")
|
|
504
592
|
# --- Initialize Actor State ---
|
|
505
593
|
self._running = True
|
|
506
594
|
self._shutting_down = False # Reset shutdown flag on start
|
|
@@ -519,14 +607,14 @@ class RayActorStage(ABC):
|
|
|
519
607
|
)
|
|
520
608
|
self._processing_thread.start()
|
|
521
609
|
|
|
522
|
-
self._logger.
|
|
610
|
+
self._logger.debug(f"{self._actor_id_str}: Actor started successfully.")
|
|
523
611
|
|
|
524
612
|
return True
|
|
525
613
|
|
|
526
614
|
@ray.method(num_returns=0)
def stop(self) -> None:
    """
    Signal the actor to stop by clearing its running flag.

    Emits a debug-level log entry announcing the stop request and then sets
    ``self._running`` to ``False``. Nothing else is torn down here; the
    processing loop is presumably what observes the flag and winds down
    (that loop is not part of this method -- confirm against its
    implementation).

    Returns
    -------
    None
        The method is declared with ``num_returns=0`` on the Ray decorator.
    """
    # Build the message first so the logging call itself stays short.
    shutdown_notice = f"[{self._actor_id_str}] Stop signal received. Initiating graceful shutdown."
    self._logger.debug(shutdown_notice)

    # Clearing the flag is the only state change made by a stop request.
    self._running = False
|
|
531
619
|
|
|
532
620
|
def is_shutdown_complete(self) -> bool:
|
|
@@ -4,11 +4,13 @@
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
|
+
from typing import Optional
|
|
7
8
|
|
|
8
9
|
import ray
|
|
9
10
|
|
|
10
11
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
12
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
12
14
|
from nv_ingest_api.internal.mutate.deduplicate import deduplicate_images_internal
|
|
13
15
|
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
14
16
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
@@ -16,6 +18,7 @@ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import Imag
|
|
|
16
18
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
19
|
nv_ingest_node_failure_try_except,
|
|
18
20
|
)
|
|
21
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
19
22
|
|
|
20
23
|
logger = logging.getLogger(__name__)
|
|
21
24
|
|
|
@@ -31,18 +34,19 @@ class ImageDedupStage(RayActorStage):
|
|
|
31
34
|
3. Updates the message payload with the deduplicated DataFrame.
|
|
32
35
|
"""
|
|
33
36
|
|
|
34
|
-
def __init__(self, config: ImageDedupSchema) -> None:
|
|
35
|
-
super().__init__(config)
|
|
37
|
+
def __init__(self, config: ImageDedupSchema, stage_name: Optional[str] = None) -> None:
    """
    Set up the image deduplication stage.

    Parameters
    ----------
    config : ImageDedupSchema
        Stage configuration. It is passed to the base-class initializer and
        kept on the instance as ``validated_config``. No additional
        validation happens in this method; the object is presumably already
        validated by whoever constructs it (TODO confirm).
    stage_name : Optional[str], optional
        Forwarded unchanged to the base-class initializer. Defaults to None.

    Raises
    ------
    Exception
        Any exception raised while storing the configuration or emitting the
        success log is logged with its traceback and re-raised unchanged.
    """
    super().__init__(config, stage_name=stage_name)

    try:
        # Keep the configuration on the instance for later use by the stage.
        self.validated_config = config
        logger.debug("ImageDedupStage configuration validated successfully.")
    except Exception as e:
        # Record the failure (with traceback) before propagating it.
        failure_message = f"Error validating Image Deduplication config: {e}"
        logger.exception(failure_message)
        raise
|
|
42
45
|
|
|
43
|
-
@
|
|
46
|
+
@nv_ingest_node_failure_try_except()
|
|
47
|
+
@traceable()
|
|
48
|
+
@udf_intercept_hook()
|
|
44
49
|
@filter_by_task(required_tasks=["dedup"])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="image_dedup", raise_on_failure=False)
|
|
46
50
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
51
|
"""
|
|
48
52
|
Process the control message by deduplicating images.
|
|
@@ -57,7 +61,7 @@ class ImageDedupStage(RayActorStage):
|
|
|
57
61
|
IngestControlMessage
|
|
58
62
|
The updated message with deduplicated images in the payload.
|
|
59
63
|
"""
|
|
60
|
-
logger.
|
|
64
|
+
logger.debug("ImageDedupStage.on_data: Starting image deduplication process.")
|
|
61
65
|
try:
|
|
62
66
|
# Extract the DataFrame payload.
|
|
63
67
|
df_ledger = control_message.payload()
|
|
@@ -65,7 +69,7 @@ class ImageDedupStage(RayActorStage):
|
|
|
65
69
|
|
|
66
70
|
# Remove the "dedup" task from the message to obtain task-specific configuration.
|
|
67
71
|
task_config = remove_task_by_type(control_message, "dedup")
|
|
68
|
-
logger.debug("Extracted task config: %s", task_config)
|
|
72
|
+
logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
69
73
|
|
|
70
74
|
# Perform image deduplication.
|
|
71
75
|
new_df = deduplicate_images_internal(
|
|
@@ -74,7 +78,7 @@ class ImageDedupStage(RayActorStage):
|
|
|
74
78
|
mutate_config=self.validated_config,
|
|
75
79
|
execution_trace_log=None,
|
|
76
80
|
)
|
|
77
|
-
logger.
|
|
81
|
+
logger.debug("Image deduplication completed. Resulting DataFrame has %d rows.", len(new_df))
|
|
78
82
|
|
|
79
83
|
# Update the message payload with the deduplicated DataFrame.
|
|
80
84
|
control_message.payload(new_df)
|