nv-ingest 2025.5.21.dev20250521__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/__init__.py +20 -0
- nv_ingest/api/__init__.py +3 -0
- nv_ingest/api/main.py +43 -0
- nv_ingest/api/v1/__init__.py +3 -0
- nv_ingest/api/v1/health.py +114 -0
- nv_ingest/api/v1/ingest.py +454 -0
- nv_ingest/framework/__init__.py +3 -0
- nv_ingest/framework/orchestration/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
- nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
- nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
- nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
- nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
- nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
- nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
- nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
- nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
- nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
- nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
- nv_ingest/framework/schemas/__init__.py +0 -0
- nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
- nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
- nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
- nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
- nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
- nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
- nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
- nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
- nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
- nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
- nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
- nv_ingest/framework/util/__init__.py +3 -0
- nv_ingest/framework/util/flow_control/__init__.py +8 -0
- nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
- nv_ingest/framework/util/service/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
- nv_ingest/framework/util/service/meta/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
- nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest/framework/util/telemetry/global_stats.py +145 -0
- nv_ingest/version.py +38 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import ray
|
|
8
|
+
|
|
9
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
10
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
11
|
+
from nv_ingest_api.internal.extract.image.image_extractor import extract_primitives_from_image_internal
|
|
12
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
13
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
14
|
+
from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExtractorSchema
|
|
15
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
16
|
+
nv_ingest_node_failure_try_except,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@ray.remote
class ImageExtractorStage(RayActorStage):
    """
    Ray actor stage that turns image payloads into extracted primitives.

    For every IngestControlMessage handled by ``on_data`` the stage:
      1. Reads the DataFrame payload from the message.
      2. Removes the "extract" task from the message and uses it as the task configuration.
      3. Runs ``extract_primitives_from_image_internal`` with the stored stage configuration.
      4. Writes the returned DataFrame back as the payload and stores the returned extraction
         info under the "image_extraction_info" metadata key.
    """

    def __init__(self, config: ImageExtractorSchema) -> None:
        """
        Create the stage and keep the supplied configuration.

        Parameters
        ----------
        config : ImageExtractorSchema
            Extractor configuration. It is forwarded to the base class and stored unchanged as
            ``self.validated_config``; no additional validation is performed here.
        """
        super().__init__(config)
        try:
            self.validated_config = config
            logger.info("ImageExtractorStage configuration validated successfully.")
        except Exception as e:
            logger.exception(f"Error validating Image Extractor config: {e}")
            raise

    # NOTE(review): filter_by_task presumably lets only messages carrying an "extract" task whose
    # document_type matches the regex reach this method -- confirm against filter_by_task.
    @traceable("image_extraction")
    @filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(png|jpeg|jpg|tiff|bmp)$"})])
    @nv_ingest_node_failure_try_except(annotation_id="image_extractor", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Extract image primitives from the message payload and store them back on the message.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose payload is the DataFrame handed to the image extractor.

        Returns
        -------
        IngestControlMessage
            The same message object, with its payload replaced by the extractor output and
            "image_extraction_info" metadata set.

        Raises
        ------
        Exception
            Any exception raised while processing is logged and re-raised from this method.
        """
        logger.info("ImageExtractorStage.on_data: Starting image extraction process.")
        try:
            ledger = control_message.payload()
            logger.debug("Extracted payload with %d rows.", len(ledger))

            # Removing the task also yields its task-specific configuration.
            extract_task = remove_task_by_type(control_message, "extract")
            logger.debug("Extracted task config: %s", extract_task)

            # No execution trace log is collected by this stage.
            extractor_kwargs = {
                "df_extraction_ledger": ledger,
                "task_config": extract_task,
                "extraction_config": self.validated_config,
                "execution_trace_log": None,
            }
            primitives_df, info = extract_primitives_from_image_internal(**extractor_kwargs)
            logger.info("Image extraction completed. Resulting DataFrame has %d rows.", len(primitives_df))

            control_message.payload(primitives_df)
            control_message.set_metadata("image_extraction_info", info)

            return control_message
        except Exception as e:
            logger.exception(f"ImageExtractorStage failed processing control message: {e}")
            raise
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import ray
|
|
7
|
+
|
|
8
|
+
from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
|
|
9
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
10
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
11
|
+
from nv_ingest_api.internal.extract.image.infographic_extractor import extract_infographic_data_from_image_internal
|
|
12
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
13
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
14
|
+
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@ray.remote
class InfographicExtractorStage(RayActorStage):
    """
    A Ray actor stage that extracts infographic data from a DataFrame payload.

    For every IngestControlMessage handled by ``on_data`` the stage:
      1. Reads the DataFrame payload from the message.
      2. Removes the "infographic_data_extract" task and uses it as the task configuration.
      3. Calls ``extract_infographic_data_from_image_internal`` with the stored configuration.
      4. Replaces the payload with the returned DataFrame and stores the returned extraction
         info under the "infographic_extraction_info" metadata key.
      5. Copies collected trace timestamps onto the message when trace tagging is enabled.
    """

    def __init__(self, config: InfographicExtractorSchema) -> None:
        """
        Create the stage and keep the supplied configuration.

        Parameters
        ----------
        config : InfographicExtractorSchema
            Extractor configuration. It is forwarded to the base class and stored unchanged as
            ``self.validated_config``; no additional validation is performed here.
        """
        super().__init__(config)

        try:
            self.validated_config = config
            # Log messages name this stage; they previously named the image extractor.
            logger.info("InfographicExtractorStage configuration validated successfully.")
        except Exception as e:
            logger.exception(f"Error validating Infographic Extractor config: {e}")
            raise

    @traceable("infographic_extraction")
    @filter_by_task(required_tasks=["infographic_data_extract"])
    @nv_ingest_node_failure_try_except(annotation_id="infographic_extraction", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Extract infographic data from the message payload and store it back on the message.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose payload is the DataFrame handed to the infographic extractor.

        Returns
        -------
        IngestControlMessage
            The same message object, with its payload replaced by the extractor output and
            "infographic_extraction_info" metadata set.
        """
        # Extract DataFrame payload
        df_ledger = control_message.payload()

        # Remove the "infographic_data_extract" task from the message
        task_config = remove_task_by_type(control_message, "infographic_data_extract")

        # Handed to the extractor so it can record timestamps for optional trace tagging below.
        execution_trace_log = {}
        new_df, extraction_info = extract_infographic_data_from_image_internal(
            df_extraction_ledger=df_ledger,
            task_config=task_config,
            extraction_config=self.validated_config,
            execution_trace_log=execution_trace_log,
        )

        control_message.payload(new_df)
        control_message.set_metadata("infographic_extraction_info", extraction_info)

        # Only an explicit True enables tagging; any other metadata value leaves it off.
        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
        if do_trace_tagging and execution_trace_log:
            for key, ts in execution_trace_log.items():
                control_message.set_timestamp(key, ts)

        return control_message
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from typing import Any, Dict, Tuple, Optional
|
|
8
|
+
import ray
|
|
9
|
+
|
|
10
|
+
# Project-local imports:
|
|
11
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
12
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
|
+
from nv_ingest_api.internal.extract.pdf.pdf_extractor import extract_primitives_from_pdf_internal
|
|
14
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
15
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
16
|
+
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
|
|
17
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
18
|
+
nv_ingest_node_failure_try_except,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _inject_validated_config(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict,
    execution_trace_log: Optional[Any] = None,
    validated_config: Any = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Call ``extract_primitives_from_pdf_internal`` with the stage configuration attached.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        DataFrame forwarded to the PDF extractor.
    task_config : Dict
        Task-specific configuration forwarded to the PDF extractor.
    execution_trace_log : Optional[Any]
        Trace log object forwarded to the PDF extractor unchanged.
    validated_config : Any
        Stage configuration; passed to the extractor under its ``extractor_config`` keyword.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        Whatever ``extract_primitives_from_pdf_internal`` returns.
    """
    extractor_kwargs = {
        "df_extraction_ledger": df_extraction_ledger,
        "task_config": task_config,
        "extractor_config": validated_config,
        "execution_trace_log": execution_trace_log,
    }
    return extract_primitives_from_pdf_internal(**extractor_kwargs)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@ray.remote
class PDFExtractorStage(RayActorStage):
    """
    Ray actor stage that runs PDF primitive extraction on a DataFrame payload.

    For every IngestControlMessage handled by ``on_data`` the stage:
      1. Reads the DataFrame payload from the message.
      2. Removes the "extract" task from the message and uses it as the task configuration.
      3. Runs the PDF extraction logic through ``_inject_validated_config``.
      4. Replaces the payload with the returned DataFrame and stores the returned extraction
         info under the "pdf_extraction_info" metadata key.
      5. Copies collected trace timestamps onto the message when trace tagging is enabled.
    """

    def __init__(self, config: PDFExtractorSchema) -> None:
        """
        Create the stage and keep the supplied configuration.

        Parameters
        ----------
        config : PDFExtractorSchema
            Extractor configuration. It is forwarded to the base class and stored unchanged as
            ``self.validated_config``; no additional validation is performed here.
        """
        super().__init__(config)
        try:
            self.validated_config = config
            logger.info("PDFExtractorStage configuration validated successfully.")
        except Exception as e:
            logger.exception(f"Error validating PDF extractor config: {e}")
            raise

    @traceable("pdf_extraction")
    @filter_by_task(required_tasks=[("extract", {"document_type": "pdf"})])
    @nv_ingest_node_failure_try_except(annotation_id="pdf_extractor", raise_on_failure=False)
    def on_data(self, control_message: Any) -> Any:
        """
        Extract PDF primitives from the message payload and store them back on the message.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose payload is the DataFrame handed to the PDF extractor.

        Returns
        -------
        IngestControlMessage
            The same message object, with its payload replaced by the extractor output and
            "pdf_extraction_info" metadata set.
        """
        logger.info("PDFExtractorStage.on_data: Starting PDF extraction process.")

        ledger = control_message.payload()
        logger.debug("Extracted payload with %d rows.", len(ledger))

        # Removing the task also yields its task-specific configuration.
        extract_task = remove_task_by_type(control_message, "extract")
        logger.debug("Extracted task config: %s", extract_task)

        # Handed to the extractor so it can record timestamps for optional trace tagging below.
        trace_log = {}
        extracted_df, info = _inject_validated_config(
            ledger,
            extract_task,
            execution_trace_log=trace_log,
            validated_config=self.validated_config,
        )
        logger.info("PDF extraction completed. Extracted %d rows.", len(extracted_df))

        control_message.payload(extracted_df)
        control_message.set_metadata("pdf_extraction_info", info)
        logger.info("PDF extraction metadata injected successfully.")

        # Only an explicit True enables tagging; any other metadata value leaves it off.
        tagging_enabled = control_message.get_metadata("config::add_trace_tagging") is True
        if not (tagging_enabled and trace_log):
            return control_message

        for trace_key, trace_ts in trace_log.items():
            control_message.set_timestamp(trace_key, trace_ts)

        return control_message
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import ray
|
|
8
|
+
|
|
9
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
10
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
11
|
+
from nv_ingest_api.internal.extract.pptx.pptx_extractor import extract_primitives_from_pptx_internal
|
|
12
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
13
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
14
|
+
from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
|
|
15
|
+
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@ray.remote
class PPTXExtractorStage(RayActorStage):
    """
    Ray actor stage that runs PPTX content extraction on a DataFrame payload.

    For every IngestControlMessage handled by ``on_data`` the stage:
      1. Reads the DataFrame payload from the message.
      2. Removes the "extract" task from the message and uses it as the task configuration.
      3. Runs ``extract_primitives_from_pptx_internal`` with the stored stage configuration.
      4. Replaces the payload with the returned DataFrame and stores the returned extraction
         info under the "pptx_extraction_info" metadata key.
    """

    def __init__(self, config: PPTXExtractorSchema) -> None:
        """
        Create the stage and keep the supplied configuration.

        Parameters
        ----------
        config : PPTXExtractorSchema
            Extractor configuration. It is forwarded to the base class and stored unchanged as
            ``self.validated_config``; no additional validation is performed here.
        """
        super().__init__(config)
        try:
            # Only the assignment and the log call are guarded; the base-class
            # initializer above runs outside this try block.
            self.validated_config = config
            logger.info("PptxExtractorStage configuration validated successfully.")
        except Exception as e:
            logger.exception(f"Error initializing or validating PPTX Extractor config: {e}")
            raise

    @traceable("pptx_extractor")
    @filter_by_task(required_tasks=[("extract", {"document_type": "pptx"})])
    @nv_ingest_node_failure_try_except(annotation_id="pptx_extractor", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Extract PPTX content from the message payload and store it back on the message.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose payload is the DataFrame handed to the PPTX extractor.

        Returns
        -------
        IngestControlMessage
            The same message object, with its payload replaced by the extractor output and
            "pptx_extraction_info" metadata set.
        """
        ledger = control_message.payload()

        # The task removed here is named "extract"; it doubles as the task-specific configuration.
        extract_task = remove_task_by_type(control_message, "extract")

        # No execution trace log is collected by this stage.
        extractor_kwargs = {
            "df_extraction_ledger": ledger,
            "task_config": extract_task,
            "extraction_config": self.validated_config,
            "execution_trace_log": None,
        }
        content_df, info = extract_primitives_from_pptx_internal(**extractor_kwargs)

        control_message.payload(content_df)
        control_message.set_metadata("pptx_extraction_info", info)

        return control_message
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
import ray
|
|
8
|
+
|
|
9
|
+
# Project-local imports.
|
|
10
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
12
|
+
from nv_ingest_api.internal.extract.image.table_extractor import extract_table_data_from_image_internal
|
|
13
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
15
|
+
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
|
|
16
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
|
+
nv_ingest_node_failure_try_except,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@ray.remote
class TableExtractorStage(RayActorStage):
    """
    Ray actor stage that runs table data extraction on a DataFrame payload.

    For every IngestControlMessage handled by ``on_data`` the stage:
      1. Reads the DataFrame payload from the message.
      2. Removes the "table_data_extract" task and uses it as the task configuration.
      3. Runs ``extract_table_data_from_image_internal`` with the stored TableExtractorSchema.
      4. Replaces the payload with the returned DataFrame and stores the returned extraction
         info under the "table_extraction_info" metadata key.
      5. Copies collected trace timestamps onto the message when trace tagging is enabled.
    """

    def __init__(self, config: TableExtractorSchema) -> None:
        """
        Create the stage and keep the supplied configuration.

        Parameters
        ----------
        config : TableExtractorSchema
            Extractor configuration. It is forwarded to the base class and stored unchanged as
            ``self.validated_config``; no additional validation is performed here.
        """
        super().__init__(config)
        try:
            self.validated_config = config
            logger.info("TableExtractorStage configuration validated successfully.")
        except Exception as e:
            logger.exception("Error validating table extractor config")
            raise e

    @traceable("table_extraction")
    @filter_by_task(required_tasks=["table_data_extract"])
    @nv_ingest_node_failure_try_except(annotation_id="table_extraction", raise_on_failure=False)
    def on_data(self, control_message: Any) -> Any:
        """
        Extract table data from the message payload and store it back on the message.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose payload is the DataFrame handed to the table extractor.

        Returns
        -------
        IngestControlMessage
            The same message object, with its payload replaced by the extractor output and
            "table_extraction_info" metadata set.
        """
        logger.info("TableExtractorStage.on_data: Starting table extraction.")

        payload_df = control_message.payload()
        logger.debug("Extracted payload with %d rows.", len(payload_df))

        # Removing the task also yields its task-specific configuration.
        table_task = remove_task_by_type(control_message, "table_data_extract")
        logger.debug("Extracted task configuration: %s", table_task)

        # Handed to the extractor so it can record timestamps for optional trace tagging below.
        trace_log = {}
        extractor_kwargs = {
            "df_extraction_ledger": payload_df,
            "task_config": table_task,
            "extraction_config": self.validated_config,
            "execution_trace_log": trace_log,
        }
        tables_df, info = extract_table_data_from_image_internal(**extractor_kwargs)
        logger.info("Table extraction completed. Extracted %d rows.", len(tables_df))

        control_message.payload(tables_df)
        control_message.set_metadata("table_extraction_info", info)
        logger.info("Table extraction metadata injected successfully.")

        # Only an explicit True enables tagging; any other metadata value leaves it off.
        tagging_enabled = control_message.get_metadata("config::add_trace_tagging") is True
        if not (tagging_enabled and trace_log):
            return control_message

        for trace_key, trace_ts in trace_log.items():
            control_message.set_timestamp(trace_key, trace_ts)

        return control_message
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from typing import Any
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
import ray
|
|
10
|
+
|
|
11
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
12
|
+
from nv_ingest_api.internal.enums.common import DocumentTypeEnum, ContentTypeEnum
|
|
13
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
14
|
+
from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
|
|
15
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
16
|
+
nv_ingest_node_failure_try_except,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# logging.basicConfig(level=logging.DEBUG)
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@ray.remote
class MetadataInjectionStage(RayActorStage):
    """
    Ray actor stage that fills in missing row metadata on an IngestControlMessage payload.

    Each row of the DataFrame payload is inspected; rows without a usable "metadata"
    dictionary get a freshly built one derived from the row's document type and source fields.
    """

    def __init__(self, config: BaseModel) -> None:
        """
        Create the stage.

        Parameters
        ----------
        config : BaseModel
            Stage configuration; forwarded to the base class initializer and logged.
        """
        super().__init__(config)
        logger.info("MetadataInjectionStage initialized with config: %s", config)

    @traceable("metadata_injector")
    @nv_ingest_node_failure_try_except(annotation_id="metadata_injector", raise_on_failure=False)
    def on_data(self, message: Any) -> Any:
        """
        Inject metadata into payload rows that do not already carry it.

        A row is rewritten when it has no "metadata" entry, when that entry is not a dict,
        or when the dict has no "content" key. The payload is replaced only if at least one
        row was rewritten.

        Parameters
        ----------
        message : IngestControlMessage
            Incoming message whose payload is a DataFrame with a "document_type" column.

        Returns
        -------
        IngestControlMessage
            The same message object, with an updated payload when injection was required.

        Raises
        ------
        Exception
            Any error raised while processing a row is logged and re-raised.
        """
        payload_df = message.payload()
        any_row_updated = False
        processed_rows = []
        logger.info("Starting metadata injection on DataFrame with %d rows", len(payload_df))

        for _, row in payload_df.iterrows():
            try:
                # The enum conversion runs for every row, including rows that keep their metadata.
                doc_type = row["document_type"]
                content_type = doc_type_to_content_type(DocumentTypeEnum(doc_type))

                has_content_metadata = (
                    "metadata" in row and isinstance(row["metadata"], dict) and "content" in row["metadata"]
                )
                if not has_content_metadata:
                    any_row_updated = True

                    # Type-specific sections are populated only for the matching content type.
                    audio_section = {"audio_type": doc_type} if content_type == ContentTypeEnum.AUDIO else None
                    image_section = {"image_type": doc_type} if content_type == ContentTypeEnum.IMAGE else None
                    text_section = {"text_type": "document"} if content_type == ContentTypeEnum.TEXT else None

                    row["metadata"] = {
                        "content": row.get("content"),
                        "content_metadata": {
                            "type": content_type.name.lower(),
                        },
                        "error_metadata": None,
                        "audio_metadata": audio_section,
                        "image_metadata": image_section,
                        "source_metadata": {
                            "source_id": row.get("source_id"),
                            "source_name": row.get("source_name"),
                            "source_type": doc_type,
                        },
                        "text_metadata": text_section,
                    }
            except Exception as inner_e:
                logger.exception("Failed to process row during metadata injection")
                raise inner_e
            processed_rows.append(row)

        if not any_row_updated:
            logger.info("No metadata update was necessary during metadata injection")
            return message

        # Rebuild the payload from the per-row Series collected above.
        updated_df = pd.DataFrame(processed_rows)
        message.payload(updated_df)
        logger.info("Metadata injection updated payload with %d rows", len(updated_df))

        return message
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import Any, Dict
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# TODO(Devin): Early prototype. Not currently used anywhere
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RayActorEdge(ABC):
    """
    Interface for an edge connecting stages in a RayPipeline.

    Concrete edges must implement ``write``, ``read`` and ``get_stats``; this base class
    only records the construction options.

    Parameters
    ----------
    max_size : int
        The maximum size of the edge's internal queue.
    multi_reader : bool
        Whether the edge supports multiple concurrent readers. Defaults to False.
    multi_writer : bool
        Whether the edge supports multiple concurrent writers. Defaults to False.
    """

    def __init__(self, max_size: int, multi_reader: bool = False, multi_writer: bool = False) -> None:
        # Options are stored as given; no validation happens here.
        self.multi_writer = multi_writer
        self.multi_reader = multi_reader
        self.max_size = max_size

    @abstractmethod
    def write(self, item: Any) -> bool:
        """
        Enqueue an item on the edge.

        Parameters
        ----------
        item : Any
            The item to enqueue.

        Returns
        -------
        bool
            True if the item was enqueued successfully.
        """

    @abstractmethod
    def read(self) -> Any:
        """
        Dequeue an item from the edge.

        Returns
        -------
        Any
            The next item in the edge.
        """

    @abstractmethod
    def get_stats(self) -> Dict[str, int]:
        """
        Report current statistics for the edge.

        Returns
        -------
        Dict[str, int]
            A dictionary containing statistics (e.g. write_count, read_count, queue_full_count,
            current_size).
        """
|