nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (100) hide show
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
@@ -0,0 +1,85 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+
7
+ import ray
8
+
9
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
10
+ from nv_ingest.framework.util.flow_control import filter_by_task
11
+ from nv_ingest_api.internal.extract.image.image_extractor import extract_primitives_from_image_internal
12
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
13
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
14
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExtractorSchema
15
+ from nv_ingest_api.util.exception_handlers.decorators import (
16
+ nv_ingest_node_failure_try_except,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@ray.remote
class ImageExtractorStage(RayActorStage):
    """
    Ray actor stage that runs primitive extraction over image documents.

    The ``filter_by_task`` decorator on ``on_data`` is configured with an "extract" task whose
    ``document_type`` matches png, jpeg, jpg, tiff or bmp. For each message handled, the stage:
      1. Reads the DataFrame payload from the IngestControlMessage.
      2. Removes the "extract" task from the message and uses it as the task configuration.
      3. Calls extract_primitives_from_image_internal with the payload, the task configuration
         and the stored stage configuration.
      4. Writes the returned DataFrame back as the payload and stores the returned extraction
         info under the "image_extraction_info" metadata key.
    """

    def __init__(self, config: ImageExtractorSchema) -> None:
        """
        Keep the extractor configuration on the actor instance.

        Parameters
        ----------
        config : ImageExtractorSchema
            Configuration passed to the base stage and stored as ``self.validated_config``.
        """
        super().__init__(config)
        try:
            # No validation is performed here; the config object is stored as received.
            self.validated_config = config
            logger.info("ImageExtractorStage configuration validated successfully.")
        except Exception as exc:
            logger.exception(f"Error validating Image Extractor config: {exc}")
            raise

    @traceable("image_extraction")
    @filter_by_task(required_tasks=[("extract", {"document_type": "regex:^(png|jpeg|jpg|tiff|bmp)$"})])
    @nv_ingest_node_failure_try_except(annotation_id="image_extractor", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Extract image primitives for one control message.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose payload is a DataFrame of image documents.

        Returns
        -------
        IngestControlMessage
            The same message, carrying the extracted DataFrame as payload and the
            extraction info in its metadata.

        Raises
        ------
        Exception
            Any error raised while processing is logged here and re-raised to the
            surrounding decorators.
        """
        logger.info("ImageExtractorStage.on_data: Starting image extraction process.")
        try:
            # Current payload of the message.
            payload_df = control_message.payload()
            logger.debug("Extracted payload with %d rows.", len(payload_df))

            # Popping the task yields the per-job extraction settings.
            extract_task = remove_task_by_type(control_message, "extract")
            logger.debug("Extracted task config: %s", extract_task)

            # No trace log is collected for image extraction (None is passed through).
            result = extract_primitives_from_image_internal(
                df_extraction_ledger=payload_df,
                task_config=extract_task,
                extraction_config=self.validated_config,
                execution_trace_log=None,
            )
            primitives_df, info = result
            logger.info("Image extraction completed. Resulting DataFrame has %d rows.", len(primitives_df))

            # Publish the results on the message.
            control_message.payload(primitives_df)
            control_message.set_metadata("image_extraction_info", info)
        except Exception as exc:
            logger.exception(f"ImageExtractorStage failed processing control message: {exc}")
            raise

        return control_message
@@ -0,0 +1,57 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ import ray
7
+
8
+ from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
9
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
10
+ from nv_ingest.framework.util.flow_control import filter_by_task
11
+ from nv_ingest_api.internal.extract.image.infographic_extractor import extract_infographic_data_from_image_internal
12
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
13
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
14
+ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@ray.remote
class InfographicExtractorStage(RayActorStage):
    """
    A Ray actor stage that extracts infographic data from a DataFrame payload.

    For each IngestControlMessage it handles, the stage:
      1. Removes the "infographic_data_extract" task from the message.
      2. Calls extract_infographic_data_from_image_internal with the payload, the task
         configuration and the stored stage configuration.
      3. Replaces the message payload with the returned DataFrame and stores the returned
         extraction info under the "infographic_extraction_info" metadata key.
      4. Copies collected trace entries onto the message as timestamps when the
         "config::add_trace_tagging" metadata value is True.
    """

    def __init__(self, config: InfographicExtractorSchema) -> None:
        """
        Store the infographic extractor configuration on the actor.

        Parameters
        ----------
        config : InfographicExtractorSchema
            Configuration passed to the base stage and kept as ``self.validated_config``.
        """
        super().__init__(config)

        try:
            self.validated_config = config
            # Log messages name this stage (they previously named the image extractor).
            logger.info("InfographicExtractorStage configuration validated successfully.")
        except Exception as e:
            logger.exception("Error validating Infographic Extractor config: %s", e)
            raise

    @traceable("infographic_extraction")
    @filter_by_task(required_tasks=["infographic_data_extract"])
    @nv_ingest_node_failure_try_except(annotation_id="infographic_extraction", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Process the control message by extracting infographic data.

        Parameters
        ----------
        control_message : IngestControlMessage
            The message containing the DataFrame payload to process.

        Returns
        -------
        IngestControlMessage
            The same message with the updated payload, the extraction info in metadata and,
            when trace tagging is enabled, the collected timestamps.
        """
        # Extract DataFrame payload
        df_ledger = control_message.payload()

        # Remove the "infographic_data_extract" task from the message
        task_config = remove_task_by_type(control_message, "infographic_data_extract")

        # The extraction call receives this dict so it can record trace entries in it.
        execution_trace_log = {}
        new_df, extraction_info = extract_infographic_data_from_image_internal(
            df_extraction_ledger=df_ledger,
            task_config=task_config,
            extraction_config=self.validated_config,
            execution_trace_log=execution_trace_log,
        )

        control_message.payload(new_df)
        control_message.set_metadata("infographic_extraction_info", extraction_info)

        # Only an explicit True enables tagging; other truthy values do not.
        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
        if do_trace_tagging and execution_trace_log:
            for key, ts in execution_trace_log.items():
                control_message.set_timestamp(key, ts)

        return control_message
@@ -0,0 +1,113 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ import pandas as pd
7
+ from typing import Any, Dict, Tuple, Optional
8
+ import ray
9
+
10
+ # Project-local (nv_ingest) and nv_ingest_api imports used by the PDF extractor stage.
11
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
+ from nv_ingest.framework.util.flow_control import filter_by_task
13
+ from nv_ingest_api.internal.extract.pdf.pdf_extractor import extract_primitives_from_pdf_internal
14
+ from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
15
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
16
+ from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
17
+ from nv_ingest_api.util.exception_handlers.decorators import (
18
+ nv_ingest_node_failure_try_except,
19
+ )
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def _inject_validated_config(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict,
    execution_trace_log: Optional[Any] = None,
    validated_config: Any = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Forward a PDF extraction request to extract_primitives_from_pdf_internal.

    The only translation performed is the keyword name: ``validated_config`` is passed
    to the internal function as ``extractor_config``. The remaining arguments are
    forwarded under their own names and the internal function's result is returned
    unchanged.
    """
    extraction_kwargs = {
        "df_extraction_ledger": df_extraction_ledger,
        "task_config": task_config,
        "extractor_config": validated_config,
        "execution_trace_log": execution_trace_log,
    }
    return extract_primitives_from_pdf_internal(**extraction_kwargs)
40
+
41
+
42
@ray.remote
class PDFExtractorStage(RayActorStage):
    """
    Ray actor stage that runs PDF primitive extraction over a DataFrame payload.

    The ``filter_by_task`` decorator on ``on_data`` is configured with an "extract" task whose
    ``document_type`` is "pdf". For each message handled, the stage:
      1. Removes the "extract" task from the message and uses it as the task configuration.
      2. Runs the extraction through ``_inject_validated_config`` with the stored configuration.
      3. Replaces the message payload with the returned DataFrame.
      4. Stores the returned extraction info under the "pdf_extraction_info" metadata key.
      5. Copies collected trace entries onto the message as timestamps when the
         "config::add_trace_tagging" metadata value is True.
    """

    def __init__(self, config: PDFExtractorSchema) -> None:
        """
        Keep the PDF extractor configuration on the actor instance.

        Parameters
        ----------
        config : PDFExtractorSchema
            Configuration passed to the base stage and stored as ``self.validated_config``.
        """
        super().__init__(config)
        try:
            # The config object is stored as received.
            self.validated_config = config
            logger.info("PDFExtractorStage configuration validated successfully.")
        except Exception as exc:
            logger.exception(f"Error validating PDF extractor config: {exc}")
            raise

    @traceable("pdf_extraction")
    @filter_by_task(required_tasks=[("extract", {"document_type": "pdf"})])
    @nv_ingest_node_failure_try_except(annotation_id="pdf_extractor", raise_on_failure=False)
    def on_data(self, control_message: Any) -> Any:
        """
        Extract PDF content for one control message.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose payload is a DataFrame of PDF documents.

        Returns
        -------
        IngestControlMessage
            The same message with the extracted DataFrame as payload and the extraction
            info in its metadata.
        """

        logger.info("PDFExtractorStage.on_data: Starting PDF extraction process.")

        # Current payload of the message.
        ledger_df = control_message.payload()
        logger.debug("Extracted payload with %d rows.", len(ledger_df))

        # Popping the task yields the per-job extraction settings.
        extract_task = remove_task_by_type(control_message, "extract")
        logger.debug("Extracted task config: %s", extract_task)

        # The helper receives this dict so the extraction call can record trace entries in it.
        trace_log = {}
        extracted_df, info = _inject_validated_config(
            df_extraction_ledger=ledger_df,
            task_config=extract_task,
            execution_trace_log=trace_log,
            validated_config=self.validated_config,
        )
        logger.info("PDF extraction completed. Extracted %d rows.", len(extracted_df))

        # Publish the results on the message.
        control_message.payload(extracted_df)
        control_message.set_metadata("pdf_extraction_info", info)
        logger.info("PDF extraction metadata injected successfully.")

        # Only an explicit True enables tagging; other truthy values do not.
        tagging_enabled = control_message.get_metadata("config::add_trace_tagging") is True
        if tagging_enabled and trace_log:
            for trace_key, stamp in trace_log.items():
                control_message.set_timestamp(trace_key, stamp)

        return control_message
@@ -0,0 +1,85 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+
7
+ import ray
8
+
9
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
10
+ from nv_ingest.framework.util.flow_control import filter_by_task
11
+ from nv_ingest_api.internal.extract.pptx.pptx_extractor import extract_primitives_from_pptx_internal
12
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
13
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
14
+ from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
15
+ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@ray.remote
class PPTXExtractorStage(RayActorStage):
    """
    Ray actor stage that runs primitive extraction over PPTX documents.

    The ``filter_by_task`` decorator on ``on_data`` is configured with an "extract" task whose
    ``document_type`` is "pptx". For each message handled, the stage:
      1. Removes the "extract" task from the message and uses it as the task configuration.
      2. Calls extract_primitives_from_pptx_internal with the payload, the task configuration
         and the stored stage configuration.
      3. Replaces the message payload with the returned DataFrame and stores the returned
         extraction info under the "pptx_extraction_info" metadata key.
    """

    def __init__(self, config: PPTXExtractorSchema) -> None:
        """
        Keep the PPTX extractor configuration on the actor instance.

        Parameters
        ----------
        config : PPTXExtractorSchema
            Configuration passed to the base stage and stored as ``self.validated_config``.
        """
        super().__init__(config)
        try:
            # The config object is stored as received.
            self.validated_config = config
            logger.info("PptxExtractorStage configuration validated successfully.")
        except Exception as exc:
            logger.exception(f"Error initializing or validating PPTX Extractor config: {exc}")
            raise

    @traceable("pptx_extractor")
    @filter_by_task(required_tasks=[("extract", {"document_type": "pptx"})])
    @nv_ingest_node_failure_try_except(annotation_id="pptx_extractor", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Extract PPTX content for one control message.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose payload is a DataFrame of PPTX documents.

        Returns
        -------
        IngestControlMessage
            The same message with the extracted DataFrame as payload and the extraction
            info in its metadata.
        """

        # Current payload of the message.
        payload_df = control_message.payload()

        # Popping the "extract" task yields the per-job extraction settings.
        extract_task = remove_task_by_type(control_message, "extract")

        # No trace log is collected for PPTX extraction (None is passed through).
        result = extract_primitives_from_pptx_internal(
            df_extraction_ledger=payload_df,
            task_config=extract_task,
            extraction_config=self.validated_config,
            execution_trace_log=None,
        )
        extracted_df, info = result

        # Publish the results on the message.
        control_message.payload(extracted_df)
        control_message.set_metadata("pptx_extraction_info", info)

        return control_message
@@ -0,0 +1,90 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Any
7
+ import ray
8
+
9
+ # Project-local (nv_ingest) and nv_ingest_api imports used by the table extractor stage.
10
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
+ from nv_ingest.framework.util.flow_control import filter_by_task
12
+ from nv_ingest_api.internal.extract.image.table_extractor import extract_table_data_from_image_internal
13
+ from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
15
+ from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
16
+ from nv_ingest_api.util.exception_handlers.decorators import (
17
+ nv_ingest_node_failure_try_except,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@ray.remote
class TableExtractorStage(RayActorStage):
    """
    A Ray actor stage that extracts table data from a DataFrame payload.

    For each IngestControlMessage it handles, the stage:
      1. Removes the "table_data_extract" task from the message.
      2. Calls extract_table_data_from_image_internal with the payload, the task
         configuration and the stored TableExtractorSchema configuration.
      3. Replaces the message payload with the returned DataFrame and stores the returned
         extraction info under the "table_extraction_info" metadata key.
      4. Copies collected trace entries onto the message as timestamps when the
         "config::add_trace_tagging" metadata value is True.
    """

    def __init__(self, config: TableExtractorSchema) -> None:
        """
        Store the table extractor configuration on the actor.

        Parameters
        ----------
        config : TableExtractorSchema
            Configuration passed to the base stage and kept as ``self.validated_config``.
        """
        super().__init__(config)
        try:
            self.validated_config = config
            logger.info("TableExtractorStage configuration validated successfully.")
        except Exception:
            logger.exception("Error validating table extractor config")
            # Bare raise re-raises the active exception without rebinding it.
            raise

    @traceable("table_extraction")
    @filter_by_task(required_tasks=["table_data_extract"])
    @nv_ingest_node_failure_try_except(annotation_id="table_extraction", raise_on_failure=False)
    def on_data(self, control_message: Any) -> Any:
        """
        Process the control message by extracting table data from its payload.

        Parameters
        ----------
        control_message : IngestControlMessage
            The incoming message containing the DataFrame payload.

        Returns
        -------
        IngestControlMessage
            The updated message with the extracted table data and extraction info in metadata.
        """
        logger.info("TableExtractorStage.on_data: Starting table extraction.")
        # Extract the DataFrame payload.
        df_payload = control_message.payload()
        logger.debug("Extracted payload with %d rows.", len(df_payload))

        # Remove the "table_data_extract" task to obtain task-specific configuration.
        task_config = remove_task_by_type(control_message, "table_data_extract")
        logger.debug("Extracted task configuration: %s", task_config)

        # The extraction call receives this dict so it can record trace entries in it.
        execution_trace_log = {}
        new_df, extraction_info = extract_table_data_from_image_internal(
            df_extraction_ledger=df_payload,
            task_config=task_config,
            extraction_config=self.validated_config,
            execution_trace_log=execution_trace_log,
        )
        logger.info("Table extraction completed. Extracted %d rows.", len(new_df))

        # Update the control message with the new DataFrame.
        control_message.payload(new_df)
        # Annotate the message with extraction info.
        control_message.set_metadata("table_extraction_info", extraction_info)
        logger.info("Table extraction metadata injected successfully.")

        # Only an explicit True enables tagging; other truthy values do not.
        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
        if do_trace_tagging and execution_trace_log:
            for key, ts in execution_trace_log.items():
                control_message.set_timestamp(key, ts)

        return control_message
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,97 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ import pandas as pd
7
+ from typing import Any
8
+ from pydantic import BaseModel
9
+ import ray
10
+
11
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
+ from nv_ingest_api.internal.enums.common import DocumentTypeEnum, ContentTypeEnum
13
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
14
+ from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
15
+ from nv_ingest_api.util.exception_handlers.decorators import (
16
+ nv_ingest_node_failure_try_except,
17
+ )
18
+
19
+ # logging.basicConfig(level=logging.DEBUG)
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@ray.remote
class MetadataInjectionStage(RayActorStage):
    """
    A Ray actor stage that performs metadata injection on IngestControlMessages.

    The stage walks the rows of the DataFrame payload. A row whose "metadata" value is
    missing, is not a dict, or has no "content" key receives a freshly built metadata
    dict. The payload is replaced only when at least one row was changed.
    """

    def __init__(self, config: BaseModel) -> None:
        """
        Initialize the stage.

        Parameters
        ----------
        config : BaseModel
            Stage configuration; it is passed to the base initializer and logged.
        """
        super().__init__(config)
        logger.info("MetadataInjectionStage initialized with config: %s", config)

    @traceable("metadata_injector")
    @nv_ingest_node_failure_try_except(annotation_id="metadata_injector", raise_on_failure=False)
    def on_data(self, message: Any) -> Any:
        """
        Process an incoming IngestControlMessage by injecting metadata into its DataFrame payload.

        Parameters
        ----------
        message : IngestControlMessage
            The incoming message containing the payload DataFrame.

        Returns
        -------
        IngestControlMessage
            The message with updated metadata if injection was required.

        Raises
        ------
        Exception
            Any error raised while handling a row is logged and re-raised. The document
            type is converted for every row, so an invalid "document_type" raises even
            when the row already carries metadata.
        """
        df = message.payload()
        update_required = False
        rows = []
        logger.info("Starting metadata injection on DataFrame with %d rows", len(df))

        for _, row in df.iterrows():
            try:
                doc_type = row["document_type"]
                # Convert document type to content type using enums.
                content_type = doc_type_to_content_type(DocumentTypeEnum(doc_type))

                # A missing "metadata" entry comes back as None and is handled by the
                # same isinstance check as any other non-dict value.
                metadata = row.get("metadata")
                if not isinstance(metadata, dict) or "content" not in metadata:
                    update_required = True
                    row["metadata"] = {
                        "content": row.get("content"),
                        "content_metadata": {
                            "type": content_type.name.lower(),
                        },
                        "error_metadata": None,
                        # Type-specific sections are filled only for the matching content type.
                        "audio_metadata": (
                            {"audio_type": doc_type} if content_type == ContentTypeEnum.AUDIO else None
                        ),
                        "image_metadata": (
                            {"image_type": doc_type} if content_type == ContentTypeEnum.IMAGE else None
                        ),
                        "source_metadata": {
                            "source_id": row.get("source_id"),
                            "source_name": row.get("source_name"),
                            "source_type": doc_type,
                        },
                        "text_metadata": ({"text_type": "document"} if content_type == ContentTypeEnum.TEXT else None),
                    }
            except Exception:
                logger.exception("Failed to process row during metadata injection")
                # Bare raise re-raises the active exception without rebinding it.
                raise
            rows.append(row)

        if update_required:
            docs = pd.DataFrame(rows)
            message.payload(docs)
            logger.info("Metadata injection updated payload with %d rows", len(docs))
        else:
            logger.info("No metadata update was necessary during metadata injection")

        return message
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,70 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import Any, Dict
7
+
8
+
9
+ # TODO(Devin): Early prototype. Not currently used anywhere
10
+
11
+
12
class RayActorEdge(ABC):
    """
    Interface for an edge between Ray actors in a RayPipeline.

    Concrete edges must implement ``write``, ``read`` and ``get_stats``; this base class
    only records the construction parameters.

    Parameters
    ----------
    max_size : int
        The maximum size of the edge's internal queue.
    multi_reader : bool
        Whether the edge supports multiple concurrent readers.
    multi_writer : bool
        Whether the edge supports multiple concurrent writers.
    """

    def __init__(self, max_size: int, multi_reader: bool = False, multi_writer: bool = False) -> None:
        # Stored as given; no checks are applied to any of the three values.
        self.max_size, self.multi_reader, self.multi_writer = max_size, multi_reader, multi_writer

    @abstractmethod
    def write(self, item: Any) -> bool:
        """
        Enqueue an item on the edge.

        Parameters
        ----------
        item : Any
            The item to enqueue.

        Returns
        -------
        bool
            True if the item was enqueued successfully.
        """

    @abstractmethod
    def read(self) -> Any:
        """
        Take the next item from the edge.

        Returns
        -------
        Any
            The next item in the edge.
        """

    @abstractmethod
    def get_stats(self) -> Dict[str, int]:
        """
        Report current statistics for the edge.

        Returns
        -------
        Dict[str, int]
            A dictionary containing statistics (e.g. write_count, read_count, queue_full_count, current_size).
        """