nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/framework/orchestration/execution/__init__.py +3 -0
- nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest/framework/orchestration/process/__init__.py +3 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
- nv_ingest/framework/orchestration/process/execution.py +497 -0
- nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
- nv_ingest/framework/orchestration/process/strategies.py +182 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
- nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest/pipeline/config/loaders.py +198 -0
- nv_ingest/pipeline/config/replica_resolver.py +227 -0
- nv_ingest/pipeline/default_pipeline_impl.py +517 -0
- nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/METADATA +1 -1
- {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/RECORD +54 -40
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/top_level.txt +0 -0
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
|
+
from typing import Optional
|
|
7
8
|
|
|
8
9
|
import ray
|
|
9
10
|
|
|
@@ -17,6 +18,8 @@ from nv_ingest_api.util.exception_handlers.decorators import (
|
|
|
17
18
|
nv_ingest_node_failure_try_except,
|
|
18
19
|
)
|
|
19
20
|
|
|
21
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
22
|
+
|
|
20
23
|
logger = logging.getLogger(__name__)
|
|
21
24
|
|
|
22
25
|
|
|
@@ -31,8 +34,8 @@ class HtmlExtractorStage(RayActorStage):
|
|
|
31
34
|
3. Updates the message payload with the extracted text DataFrame.
|
|
32
35
|
"""
|
|
33
36
|
|
|
34
|
-
def __init__(self, config: HtmlExtractorSchema) -> None:
|
|
35
|
-
super().__init__(config, log_to_stdout=False)
|
|
37
|
+
def __init__(self, config: HtmlExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
38
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
36
39
|
try:
|
|
37
40
|
self.validated_config = config
|
|
38
41
|
self._logger.info("HtmlExtractorStage configuration validated successfully.")
|
|
@@ -40,9 +43,10 @@ class HtmlExtractorStage(RayActorStage):
|
|
|
40
43
|
self._logger.exception(f"Error validating Html Extractor config: {e}")
|
|
41
44
|
raise
|
|
42
45
|
|
|
43
|
-
@
|
|
46
|
+
@nv_ingest_node_failure_try_except()
|
|
47
|
+
@traceable()
|
|
48
|
+
@udf_intercept_hook()
|
|
44
49
|
@filter_by_task(required_tasks=[("extract", {"document_type": "html"})])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="html_extractor", raise_on_failure=False)
|
|
46
50
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
51
|
"""
|
|
48
52
|
Process the control message by extracting content from html.
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
from typing import Optional
|
|
6
7
|
|
|
7
8
|
import ray
|
|
8
9
|
|
|
@@ -16,6 +17,8 @@ from nv_ingest_api.util.exception_handlers.decorators import (
|
|
|
16
17
|
nv_ingest_node_failure_try_except,
|
|
17
18
|
)
|
|
18
19
|
|
|
20
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
21
|
+
|
|
19
22
|
logger = logging.getLogger(__name__)
|
|
20
23
|
|
|
21
24
|
|
|
@@ -30,18 +33,19 @@ class ImageExtractorStage(RayActorStage):
|
|
|
30
33
|
3. Updates the message payload with the extracted primitives DataFrame.
|
|
31
34
|
"""
|
|
32
35
|
|
|
33
|
-
def __init__(self, config: ImageExtractorSchema) -> None:
|
|
34
|
-
super().__init__(config)
|
|
36
|
+
def __init__(self, config: ImageExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
37
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
35
38
|
try:
|
|
36
39
|
self.validated_config = config
|
|
37
|
-
|
|
40
|
+
self._logger.info("ImageExtractorStage configuration validated successfully.")
|
|
38
41
|
except Exception as e:
|
|
39
|
-
|
|
42
|
+
self._logger.exception(f"Error validating Image Extractor config: {e}")
|
|
40
43
|
raise
|
|
41
44
|
|
|
42
|
-
@
|
|
45
|
+
@nv_ingest_node_failure_try_except()
|
|
46
|
+
@traceable()
|
|
47
|
+
@udf_intercept_hook()
|
|
43
48
|
@filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(png|jpeg|jpg|tiff|bmp)$"})])
|
|
44
|
-
@nv_ingest_node_failure_try_except(annotation_id="image_extractor", raise_on_failure=False)
|
|
45
49
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
46
50
|
"""
|
|
47
51
|
Process the control message by extracting primitives from images.
|
|
@@ -5,32 +5,44 @@
|
|
|
5
5
|
import logging
|
|
6
6
|
import ray
|
|
7
7
|
|
|
8
|
-
from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
|
|
9
8
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
10
9
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
11
10
|
from nv_ingest_api.internal.extract.image.infographic_extractor import extract_infographic_data_from_image_internal
|
|
12
11
|
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
13
|
-
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
12
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
|
|
13
|
+
from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
|
|
14
14
|
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
15
18
|
|
|
16
19
|
logger = logging.getLogger(__name__)
|
|
17
20
|
|
|
18
21
|
|
|
19
22
|
@ray.remote
|
|
20
23
|
class InfographicExtractorStage(RayActorStage):
|
|
21
|
-
|
|
22
|
-
|
|
24
|
+
"""
|
|
25
|
+
A Ray actor stage that extracts infographic data from image content.
|
|
26
|
+
|
|
27
|
+
It expects an IngestControlMessage containing a DataFrame with image data. It then:
|
|
28
|
+
1. Removes the "infographic_data_extract" task from the message.
|
|
29
|
+
2. Calls the infographic extraction logic using a validated configuration.
|
|
30
|
+
3. Updates the message payload with the extracted infographic DataFrame.
|
|
31
|
+
"""
|
|
23
32
|
|
|
33
|
+
def __init__(self, config: InfographicExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
34
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
24
35
|
try:
|
|
25
36
|
self.validated_config = config
|
|
26
|
-
|
|
37
|
+
self._logger.info("InfographicExtractorStage configuration validated successfully.")
|
|
27
38
|
except Exception as e:
|
|
28
|
-
|
|
39
|
+
self._logger.exception(f"Error validating Infographic extractor config: {e}")
|
|
29
40
|
raise
|
|
30
41
|
|
|
31
|
-
@
|
|
42
|
+
@nv_ingest_node_failure_try_except()
|
|
43
|
+
@traceable()
|
|
44
|
+
@udf_intercept_hook()
|
|
32
45
|
@filter_by_task(required_tasks=["infographic_data_extract"])
|
|
33
|
-
@nv_ingest_node_failure_try_except(annotation_id="infographic_extraction", raise_on_failure=False)
|
|
34
46
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
35
47
|
# Extract DataFrame payload
|
|
36
48
|
df_ledger = control_message.payload()
|
|
@@ -51,7 +63,7 @@ class InfographicExtractorStage(RayActorStage):
|
|
|
51
63
|
|
|
52
64
|
do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
|
|
53
65
|
if do_trace_tagging and execution_trace_log:
|
|
54
|
-
|
|
55
|
-
|
|
66
|
+
parent_name = self.stage_name if self.stage_name else "infographic_extractor"
|
|
67
|
+
set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
|
|
56
68
|
|
|
57
69
|
return control_message
|
|
@@ -7,16 +7,15 @@ import pandas as pd
|
|
|
7
7
|
from typing import Any, Dict, Tuple, Optional
|
|
8
8
|
import ray
|
|
9
9
|
|
|
10
|
-
# Assume these imports come from your project:
|
|
11
|
-
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
12
|
-
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
10
|
from nv_ingest_api.internal.extract.pdf.pdf_extractor import extract_primitives_from_pdf_internal
|
|
14
11
|
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
15
|
-
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
16
12
|
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
13
|
+
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import set_trace_timestamps_with_parent_context, traceable
|
|
15
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
16
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
|
+
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
20
19
|
|
|
21
20
|
logger = logging.getLogger(__name__)
|
|
22
21
|
|
|
@@ -51,19 +50,20 @@ class PDFExtractorStage(RayActorStage):
|
|
|
51
50
|
4. Optionally, stores additional extraction info in the message metadata.
|
|
52
51
|
"""
|
|
53
52
|
|
|
54
|
-
def __init__(self, config: PDFExtractorSchema) -> None:
|
|
55
|
-
super().__init__(config)
|
|
53
|
+
def __init__(self, config: PDFExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
54
|
+
super().__init__(config, stage_name=stage_name)
|
|
56
55
|
try:
|
|
57
56
|
# Validate and store the PDF extractor configuration.
|
|
58
57
|
self.validated_config = config
|
|
59
|
-
logger.
|
|
58
|
+
logger.debug("PDFExtractorStage configuration validated successfully.")
|
|
60
59
|
except Exception as e:
|
|
61
60
|
logger.exception(f"Error validating PDF extractor config: {e}")
|
|
62
61
|
raise
|
|
63
62
|
|
|
64
|
-
@
|
|
63
|
+
@nv_ingest_node_failure_try_except()
|
|
64
|
+
@traceable()
|
|
65
|
+
@udf_intercept_hook()
|
|
65
66
|
@filter_by_task(required_tasks=[("extract", {"document_type": "pdf"})])
|
|
66
|
-
@nv_ingest_node_failure_try_except(annotation_id="pdf_extractor", raise_on_failure=False)
|
|
67
67
|
def on_data(self, control_message: Any) -> Any:
|
|
68
68
|
"""
|
|
69
69
|
Process the control message by extracting PDF content.
|
|
@@ -79,7 +79,7 @@ class PDFExtractorStage(RayActorStage):
|
|
|
79
79
|
The updated message with the extracted DataFrame and extraction info in metadata.
|
|
80
80
|
"""
|
|
81
81
|
|
|
82
|
-
logger.
|
|
82
|
+
logger.debug("PDFExtractorStage.on_data: Starting PDF extraction process.")
|
|
83
83
|
|
|
84
84
|
# Extract the DataFrame payload.
|
|
85
85
|
df_extraction_ledger = control_message.payload()
|
|
@@ -97,17 +97,18 @@ class PDFExtractorStage(RayActorStage):
|
|
|
97
97
|
execution_trace_log=execution_trace_log,
|
|
98
98
|
validated_config=self.validated_config,
|
|
99
99
|
)
|
|
100
|
-
logger.
|
|
100
|
+
logger.debug("PDF extraction completed. Extracted %d rows.", len(new_df))
|
|
101
101
|
|
|
102
102
|
# Update the message payload with the extracted DataFrame.
|
|
103
103
|
control_message.payload(new_df)
|
|
104
104
|
# Optionally, annotate the message with extraction info.
|
|
105
105
|
control_message.set_metadata("pdf_extraction_info", extraction_info)
|
|
106
|
-
logger.
|
|
106
|
+
logger.debug("PDF extraction metadata injected successfully.")
|
|
107
107
|
|
|
108
108
|
do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
|
|
109
109
|
if do_trace_tagging and execution_trace_log:
|
|
110
|
-
|
|
111
|
-
|
|
110
|
+
# Use utility function to set trace timestamps with proper parent-child context
|
|
111
|
+
parent_name = self.stage_name or "pdf_extractor"
|
|
112
|
+
set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
|
|
112
113
|
|
|
113
114
|
return control_message
|
|
@@ -3,8 +3,10 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
from typing import Optional
|
|
6
7
|
|
|
7
8
|
import ray
|
|
9
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
8
10
|
|
|
9
11
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
10
12
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
@@ -28,7 +30,7 @@ class PPTXExtractorStage(RayActorStage):
|
|
|
28
30
|
3. Updates the message payload with the extracted content DataFrame.
|
|
29
31
|
"""
|
|
30
32
|
|
|
31
|
-
def __init__(self, config: PPTXExtractorSchema) -> None:
|
|
33
|
+
def __init__(self, config: PPTXExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
32
34
|
"""
|
|
33
35
|
Initializes the PptxExtractorStage.
|
|
34
36
|
|
|
@@ -36,8 +38,10 @@ class PPTXExtractorStage(RayActorStage):
|
|
|
36
38
|
----------
|
|
37
39
|
config : PPTXExtractorSchema
|
|
38
40
|
The validated configuration object for PPTX extraction.
|
|
41
|
+
stage_name : Optional[str]
|
|
42
|
+
Name of the stage from YAML pipeline configuration.
|
|
39
43
|
"""
|
|
40
|
-
super().__init__(config)
|
|
44
|
+
super().__init__(config, stage_name=stage_name)
|
|
41
45
|
try:
|
|
42
46
|
# The config passed in should already be validated, but storing it.
|
|
43
47
|
self.validated_config = config
|
|
@@ -47,9 +51,10 @@ class PPTXExtractorStage(RayActorStage):
|
|
|
47
51
|
logger.exception(f"Error initializing or validating PPTX Extractor config: {e}")
|
|
48
52
|
raise
|
|
49
53
|
|
|
50
|
-
@
|
|
54
|
+
@nv_ingest_node_failure_try_except()
|
|
55
|
+
@traceable()
|
|
56
|
+
@udf_intercept_hook()
|
|
51
57
|
@filter_by_task(required_tasks=[("extract", {"document_type": "pptx"})])
|
|
52
|
-
@nv_ingest_node_failure_try_except(annotation_id="pptx_extractor", raise_on_failure=False)
|
|
53
58
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
54
59
|
"""
|
|
55
60
|
Process the control message by extracting content from PPTX documents.
|
|
@@ -80,6 +85,6 @@ class PPTXExtractorStage(RayActorStage):
|
|
|
80
85
|
|
|
81
86
|
# Update the message payload with the extracted PPTX content DataFrame.
|
|
82
87
|
control_message.payload(new_df)
|
|
83
|
-
control_message.set_metadata("pptx_extraction_info", extraction_info)
|
|
88
|
+
control_message.set_metadata("pptx_extraction_info", extraction_info)
|
|
84
89
|
|
|
85
90
|
return control_message
|
|
@@ -3,15 +3,15 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Optional
|
|
7
7
|
import ray
|
|
8
8
|
|
|
9
|
-
# These imports are assumed from your project.
|
|
10
9
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
10
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
11
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
12
12
|
from nv_ingest_api.internal.extract.image.table_extractor import extract_table_data_from_image_internal
|
|
13
13
|
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
14
|
-
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
|
|
15
15
|
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
|
|
16
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
17
|
nv_ingest_node_failure_try_except,
|
|
@@ -31,18 +31,19 @@ class TableExtractorStage(RayActorStage):
|
|
|
31
31
|
and annotates the message metadata with extraction info.
|
|
32
32
|
"""
|
|
33
33
|
|
|
34
|
-
def __init__(self, config: TableExtractorSchema) -> None:
|
|
35
|
-
super().__init__(config)
|
|
34
|
+
def __init__(self, config: TableExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
35
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
36
|
try:
|
|
37
37
|
self.validated_config = config
|
|
38
|
-
logger.
|
|
38
|
+
logger.debug("TableExtractorStage configuration validated successfully.")
|
|
39
39
|
except Exception as e:
|
|
40
40
|
logger.exception("Error validating table extractor config")
|
|
41
41
|
raise e
|
|
42
42
|
|
|
43
|
-
@
|
|
43
|
+
@nv_ingest_node_failure_try_except()
|
|
44
|
+
@traceable()
|
|
45
|
+
@udf_intercept_hook()
|
|
44
46
|
@filter_by_task(required_tasks=["table_data_extract"])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="table_extraction", raise_on_failure=False)
|
|
46
47
|
def on_data(self, control_message: Any) -> Any:
|
|
47
48
|
"""
|
|
48
49
|
Process the control message by extracting table data from the PDF payload.
|
|
@@ -57,7 +58,7 @@ class TableExtractorStage(RayActorStage):
|
|
|
57
58
|
IngestControlMessage
|
|
58
59
|
The updated message with the extracted table data and extraction info in metadata.
|
|
59
60
|
"""
|
|
60
|
-
logger.
|
|
61
|
+
logger.debug("TableExtractorStage.on_data: Starting table extraction.")
|
|
61
62
|
# Extract the DataFrame payload.
|
|
62
63
|
df_payload = control_message.payload()
|
|
63
64
|
logger.debug("Extracted payload with %d rows.", len(df_payload))
|
|
@@ -74,17 +75,17 @@ class TableExtractorStage(RayActorStage):
|
|
|
74
75
|
extraction_config=self.validated_config,
|
|
75
76
|
execution_trace_log=execution_trace_log,
|
|
76
77
|
)
|
|
77
|
-
logger.
|
|
78
|
+
logger.debug("Table extraction completed. Extracted %d rows.", len(new_df))
|
|
78
79
|
|
|
79
80
|
# Update the control message with the new DataFrame.
|
|
80
81
|
control_message.payload(new_df)
|
|
81
82
|
# Annotate the message with extraction info.
|
|
82
83
|
control_message.set_metadata("table_extraction_info", extraction_info)
|
|
83
|
-
logger.
|
|
84
|
+
logger.debug("Table extraction metadata injected successfully.")
|
|
84
85
|
|
|
85
86
|
do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
|
|
86
87
|
if do_trace_tagging and execution_trace_log:
|
|
87
|
-
|
|
88
|
-
|
|
88
|
+
parent_name = self.stage_name if self.stage_name else "table_extractor"
|
|
89
|
+
set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
|
|
89
90
|
|
|
90
91
|
return control_message
|
|
@@ -4,12 +4,14 @@
|
|
|
4
4
|
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
import logging
|
|
7
|
+
from typing import Optional
|
|
7
8
|
import pandas as pd
|
|
8
|
-
from typing import Any
|
|
9
9
|
from pydantic import BaseModel
|
|
10
10
|
import ray
|
|
11
11
|
|
|
12
12
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
13
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
13
15
|
from nv_ingest_api.internal.enums.common import (
|
|
14
16
|
DocumentTypeEnum,
|
|
15
17
|
ContentTypeEnum,
|
|
@@ -17,14 +19,13 @@ from nv_ingest_api.internal.enums.common import (
|
|
|
17
19
|
TextTypeEnum,
|
|
18
20
|
LanguageEnum,
|
|
19
21
|
)
|
|
20
|
-
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
21
22
|
from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentHierarchySchema
|
|
22
23
|
from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
|
|
23
24
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
24
25
|
nv_ingest_node_failure_try_except,
|
|
25
26
|
)
|
|
27
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
26
28
|
|
|
27
|
-
# logging.basicConfig(level=logging.DEBUG)
|
|
28
29
|
logger = logging.getLogger(__name__)
|
|
29
30
|
|
|
30
31
|
|
|
@@ -37,15 +38,16 @@ class MetadataInjectionStage(RayActorStage):
|
|
|
37
38
|
injection is required, and if so, injects the appropriate metadata.
|
|
38
39
|
"""
|
|
39
40
|
|
|
40
|
-
def __init__(self, config: BaseModel) -> None:
|
|
41
|
+
def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
|
|
41
42
|
# Call the base initializer to set attributes like self._running.
|
|
42
|
-
super().__init__(config)
|
|
43
|
+
super().__init__(config, stage_name=stage_name)
|
|
43
44
|
# Additional initialization can be added here if necessary.
|
|
44
|
-
|
|
45
|
+
self._logger.debug("MetadataInjectionStage initialized with config: %s", config)
|
|
45
46
|
|
|
46
|
-
@
|
|
47
|
-
@
|
|
48
|
-
|
|
47
|
+
@nv_ingest_node_failure_try_except()
|
|
48
|
+
@traceable()
|
|
49
|
+
@udf_intercept_hook()
|
|
50
|
+
def on_data(self, message: IngestControlMessage) -> IngestControlMessage:
|
|
49
51
|
"""
|
|
50
52
|
Process an incoming IngestControlMessage by injecting metadata into its DataFrame payload.
|
|
51
53
|
|
|
@@ -62,7 +64,7 @@ class MetadataInjectionStage(RayActorStage):
|
|
|
62
64
|
df = message.payload()
|
|
63
65
|
update_required = False
|
|
64
66
|
rows = []
|
|
65
|
-
logger.
|
|
67
|
+
logger.debug("Starting metadata injection on DataFrame with %d rows", len(df))
|
|
66
68
|
|
|
67
69
|
for _, row in df.iterrows():
|
|
68
70
|
try:
|
|
@@ -141,7 +143,7 @@ class MetadataInjectionStage(RayActorStage):
|
|
|
141
143
|
"source_metadata": default_source_metadata,
|
|
142
144
|
"text_metadata": default_text_metadata,
|
|
143
145
|
}
|
|
144
|
-
logger.
|
|
146
|
+
logger.debug(
|
|
145
147
|
f"METADATA_INJECTOR_DEBUG: Rebuilt metadata for source_id='{row.get('source_id', 'N/A')}'. "
|
|
146
148
|
f"Metadata keys: {list(row['metadata'].keys())}."
|
|
147
149
|
f"'content' present: {'content' in row['metadata']}"
|
|
@@ -154,8 +156,8 @@ class MetadataInjectionStage(RayActorStage):
|
|
|
154
156
|
if update_required:
|
|
155
157
|
docs = pd.DataFrame(rows)
|
|
156
158
|
message.payload(docs)
|
|
157
|
-
logger.
|
|
159
|
+
logger.debug("Metadata injection updated payload with %d rows", len(docs))
|
|
158
160
|
else:
|
|
159
|
-
logger.
|
|
161
|
+
logger.debug("No metadata update was necessary during metadata injection")
|
|
160
162
|
|
|
161
163
|
return message
|
|
@@ -21,6 +21,9 @@ class RayActorSinkStage(RayActorStage, ABC):
|
|
|
21
21
|
to deliver their final processed messages.
|
|
22
22
|
"""
|
|
23
23
|
|
|
24
|
+
def __init__(self, config: Any, log_to_stdout=False, stage_name: Optional[str] = None) -> None:
|
|
25
|
+
super().__init__(config, log_to_stdout=log_to_stdout, stage_name=stage_name)
|
|
26
|
+
|
|
24
27
|
@ray.method(num_returns=1)
|
|
25
28
|
def set_output_queue(self, queue_handle: any) -> bool:
|
|
26
29
|
raise NotImplementedError("Sink stages do not support an output queue.")
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
from abc import ABC, abstractmethod
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Optional
|
|
7
7
|
import ray
|
|
8
8
|
import logging
|
|
9
9
|
|
|
@@ -19,8 +19,8 @@ class RayActorSourceStage(RayActorStage, ABC):
|
|
|
19
19
|
Instead, they must implement get_input() to fetch control messages from an external source.
|
|
20
20
|
"""
|
|
21
21
|
|
|
22
|
-
def __init__(self, config: Any, log_to_stdout=False) -> None:
|
|
23
|
-
super().__init__(config, log_to_stdout=log_to_stdout)
|
|
22
|
+
def __init__(self, config: Any, log_to_stdout=False, stage_name: Optional[str] = None) -> None:
|
|
23
|
+
super().__init__(config, log_to_stdout=log_to_stdout, stage_name=stage_name)
|
|
24
24
|
self.paused = False
|
|
25
25
|
|
|
26
26
|
def on_data(self, IngestControlMessage):
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
import gc
|
|
5
6
|
import sys
|
|
6
7
|
import threading
|
|
7
8
|
import time
|
|
@@ -9,12 +10,12 @@ from abc import ABC, abstractmethod
|
|
|
9
10
|
from typing import Any, Dict, Optional
|
|
10
11
|
import os
|
|
11
12
|
import psutil
|
|
12
|
-
import gc
|
|
13
13
|
|
|
14
14
|
import ray
|
|
15
15
|
import ray.actor
|
|
16
16
|
from pydantic import BaseModel
|
|
17
17
|
import logging
|
|
18
|
+
import pyarrow as pa
|
|
18
19
|
|
|
19
20
|
from ray import get_runtime_context
|
|
20
21
|
|
|
@@ -50,6 +51,9 @@ class RayActorStage(ABC):
|
|
|
50
51
|
----------
|
|
51
52
|
config : BaseModel
|
|
52
53
|
Configuration object for the stage.
|
|
54
|
+
stage_name : Optional[str]
|
|
55
|
+
Name of the stage from YAML pipeline configuration. Used by
|
|
56
|
+
stage-aware decorators for consistent naming.
|
|
53
57
|
_input_queue : Optional[Any]
|
|
54
58
|
Handle to the Ray queue from which input items are read.
|
|
55
59
|
Expected to be set via `set_input_queue`.
|
|
@@ -81,7 +85,7 @@ class RayActorStage(ABC):
|
|
|
81
85
|
Lock to protect access to shutdown-related state (`_shutting_down`).
|
|
82
86
|
"""
|
|
83
87
|
|
|
84
|
-
def __init__(self, config: BaseModel, log_to_stdout=False) -> None:
|
|
88
|
+
def __init__(self, config: BaseModel, stage_name: Optional[str] = None, log_to_stdout=False) -> None:
|
|
85
89
|
"""
|
|
86
90
|
Initialize the RayActorStage.
|
|
87
91
|
|
|
@@ -90,8 +94,14 @@ class RayActorStage(ABC):
|
|
|
90
94
|
config : BaseModel
|
|
91
95
|
Configuration object specific to the stage's behavior. Passed by
|
|
92
96
|
the orchestrator during actor creation.
|
|
97
|
+
stage_name : Optional[str]
|
|
98
|
+
Name of the stage from YAML pipeline configuration. Used by
|
|
99
|
+
stage-aware decorators for consistent naming.
|
|
100
|
+
log_to_stdout : bool
|
|
101
|
+
Whether to enable stdout logging.
|
|
93
102
|
"""
|
|
94
103
|
self.config: BaseModel = config
|
|
104
|
+
self.stage_name: Optional[str] = stage_name
|
|
95
105
|
self._input_queue: Optional[Any] = None # Ray Queue handle expected
|
|
96
106
|
self._output_queue: Optional[Any] = None # Ray Queue handle expected
|
|
97
107
|
self._running: bool = False
|
|
@@ -130,12 +140,13 @@ class RayActorStage(ABC):
|
|
|
130
140
|
|
|
131
141
|
self._actor_id_str = self._get_actor_id_str()
|
|
132
142
|
|
|
133
|
-
# --- PyArrow
|
|
134
|
-
#
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
self.
|
|
143
|
+
# --- PyArrow Memory Management ---
|
|
144
|
+
# Time-based periodic cleanup to prevent long-term memory accumulation
|
|
145
|
+
self._memory_cleanup_interval_seconds = getattr(
|
|
146
|
+
config, "memory_cleanup_interval_seconds", 300
|
|
147
|
+
) # 5 minutes default
|
|
148
|
+
self._last_memory_cleanup_time = time.time()
|
|
149
|
+
self._memory_cleanups_performed = 0
|
|
139
150
|
|
|
140
151
|
@staticmethod
|
|
141
152
|
def _get_actor_id_str() -> str:
|
|
@@ -429,8 +440,6 @@ class RayActorStage(ABC):
|
|
|
429
440
|
gc.collect()
|
|
430
441
|
|
|
431
442
|
try:
|
|
432
|
-
import pyarrow as pa # Local import to avoid hard dependency at import time
|
|
433
|
-
|
|
434
443
|
pool = pa.default_memory_pool()
|
|
435
444
|
try:
|
|
436
445
|
before_bytes = getattr(pool, "bytes_allocated", lambda: 0)()
|
|
@@ -579,7 +588,7 @@ class RayActorStage(ABC):
|
|
|
579
588
|
self._logger.warning(f"{self._actor_id_str}: Start called but actor is already running.")
|
|
580
589
|
return False
|
|
581
590
|
|
|
582
|
-
self._logger.
|
|
591
|
+
self._logger.debug(f"{self._actor_id_str}: Starting actor...")
|
|
583
592
|
# --- Initialize Actor State ---
|
|
584
593
|
self._running = True
|
|
585
594
|
self._shutting_down = False # Reset shutdown flag on start
|
|
@@ -598,14 +607,14 @@ class RayActorStage(ABC):
|
|
|
598
607
|
)
|
|
599
608
|
self._processing_thread.start()
|
|
600
609
|
|
|
601
|
-
self._logger.
|
|
610
|
+
self._logger.debug(f"{self._actor_id_str}: Actor started successfully.")
|
|
602
611
|
|
|
603
612
|
return True
|
|
604
613
|
|
|
605
614
|
@ray.method(num_returns=0)
|
|
606
615
|
def stop(self) -> None:
|
|
607
616
|
"""Stops the actor's processing loop by setting the running flag to False."""
|
|
608
|
-
self._logger.
|
|
617
|
+
self._logger.debug(f"[{self._actor_id_str}] Stop signal received. Initiating graceful shutdown.")
|
|
609
618
|
self._running = False
|
|
610
619
|
|
|
611
620
|
def is_shutdown_complete(self) -> bool:
|
|
@@ -4,11 +4,13 @@
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
|
+
from typing import Optional
|
|
7
8
|
|
|
8
9
|
import ray
|
|
9
10
|
|
|
10
11
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
12
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
12
14
|
from nv_ingest_api.internal.mutate.deduplicate import deduplicate_images_internal
|
|
13
15
|
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
14
16
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
@@ -31,18 +33,19 @@ class ImageDedupStage(RayActorStage):
|
|
|
31
33
|
3. Updates the message payload with the deduplicated DataFrame.
|
|
32
34
|
"""
|
|
33
35
|
|
|
34
|
-
def __init__(self, config: ImageDedupSchema) -> None:
|
|
35
|
-
super().__init__(config)
|
|
36
|
+
def __init__(self, config: ImageDedupSchema, stage_name: Optional[str] = None) -> None:
|
|
37
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
38
|
try:
|
|
37
39
|
self.validated_config = config
|
|
38
|
-
logger.
|
|
40
|
+
logger.debug("ImageDedupStage configuration validated successfully.")
|
|
39
41
|
except Exception as e:
|
|
40
42
|
logger.exception(f"Error validating Image Deduplication config: {e}")
|
|
41
43
|
raise
|
|
42
44
|
|
|
43
|
-
@
|
|
45
|
+
@nv_ingest_node_failure_try_except()
|
|
46
|
+
@traceable()
|
|
47
|
+
@udf_intercept_hook()
|
|
44
48
|
@filter_by_task(required_tasks=["dedup"])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="image_dedup", raise_on_failure=False)
|
|
46
49
|
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
50
|
"""
|
|
48
51
|
Process the control message by deduplicating images.
|
|
@@ -57,7 +60,7 @@ class ImageDedupStage(RayActorStage):
|
|
|
57
60
|
IngestControlMessage
|
|
58
61
|
The updated message with deduplicated images in the payload.
|
|
59
62
|
"""
|
|
60
|
-
logger.
|
|
63
|
+
logger.debug("ImageDedupStage.on_data: Starting image deduplication process.")
|
|
61
64
|
try:
|
|
62
65
|
# Extract the DataFrame payload.
|
|
63
66
|
df_ledger = control_message.payload()
|
|
@@ -74,7 +77,7 @@ class ImageDedupStage(RayActorStage):
|
|
|
74
77
|
mutate_config=self.validated_config,
|
|
75
78
|
execution_trace_log=None,
|
|
76
79
|
)
|
|
77
|
-
logger.
|
|
80
|
+
logger.debug("Image deduplication completed. Resulting DataFrame has %d rows.", len(new_df))
|
|
78
81
|
|
|
79
82
|
# Update the message payload with the deduplicated DataFrame.
|
|
80
83
|
control_message.payload(new_df)
|