nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (100) hide show
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
@@ -0,0 +1,63 @@
1
+ import asyncio
2
+ import ray
3
+ import logging
4
+ from typing import Any, Dict
5
+
6
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_edge_base import RayActorEdge
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ # TODO(Devin): Early prototype. Not currently used anywhere
11
+
12
+
13
@ray.remote
class AsyncQueueEdge(RayActorEdge):
    """
    RayActorEdge variant whose buffer is an ``asyncio.Queue``.

    Every method is a coroutine. Counters for writes, reads and "queue was full"
    observations are kept in ``self.stats`` and updated under ``self.stats_lock``.
    """

    def __init__(self, max_size: int, multi_reader: bool = False, multi_writer: bool = False) -> None:
        """
        Create the bounded queue and zeroed counters.

        ``max_size`` bounds the queue; ``multi_reader`` / ``multi_writer`` are
        forwarded to the base class and echoed in the startup log line.
        """
        super().__init__(max_size, multi_reader, multi_writer)
        self.queue = asyncio.Queue(maxsize=max_size)
        self.stats = dict.fromkeys(("write_count", "read_count", "queue_full_count"), 0)
        # asyncio lock dedicated to the stats dictionary.
        self.stats_lock = asyncio.Lock()
        startup_message = (
            f"AsyncQueueEdge initialized with max_size={max_size},"
            f" multi_reader={multi_reader}, multi_writer={multi_writer}"
        )
        logger.info(startup_message)

    async def _bump(self, counter: str) -> int:
        """Increment ``self.stats[counter]`` under the stats lock and return the new value."""
        async with self.stats_lock:
            updated = self.stats[counter] + 1
            self.stats[counter] = updated
        return updated

    async def write(self, item: Any) -> bool:
        """
        Put ``item`` on the queue, waiting for free space if necessary.

        A full queue is recorded in ``queue_full_count`` before the put is awaited.
        Always returns True once the item has been enqueued.
        """
        if self.queue.full():
            await self._bump("queue_full_count")
            logger.info("Queue is full. Incrementing queue_full_count.")

        logger.info("Attempting to put item into the queue.")
        await self.queue.put(item)

        total_writes = await self._bump("write_count")
        logger.info(f"Item written to queue. New write_count: {total_writes}")
        return True

    async def read(self) -> Any:
        """
        Await the next item from the queue and return it.

        ``read_count`` is incremented after the item has been retrieved.
        """
        logger.info("Attempting to get item from the queue.")
        fetched = await self.queue.get()

        total_reads = await self._bump("read_count")
        logger.info(f"Item read from queue. New read_count: {total_reads}")
        return fetched

    async def get_stats(self) -> Dict[str, int]:
        """
        Return a copy of the counters, refreshed with the current queue size.

        The ``current_size`` entry is written into ``self.stats`` itself before
        the snapshot is taken.
        """
        async with self.stats_lock:
            self.stats["current_size"] = self.queue.qsize()
            logger.info(f"Getting stats: {self.stats}")
            snapshot = self.stats.copy()
        return snapshot
@@ -0,0 +1,73 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # TODO(Devin): Early prototype. Not currently used anywhere
6
+
7
+ import logging
8
+ from typing import Any, Dict
9
+ from threading import Lock
10
+
11
+ import ray
12
+ from ray.util.queue import Queue
13
+
14
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_edge_base import RayActorEdge
15
+
16
+ logger = logging.getLogger(__name__)
17
+ logger.setLevel(logging.INFO)
18
+
19
+
20
@ray.remote
class RayQueueEdge(RayActorEdge):
    """
    A RayActorEdge implementation backed by ``ray.util.queue.Queue``.

    All methods are synchronous. Write/read/queue-full counters are kept in
    ``self.stats`` and updated under ``self.stats_lock`` (a ``threading.Lock``).
    """

    def __init__(self, max_size: int, multi_reader: bool = False, multi_writer: bool = False) -> None:
        """
        Create the queue and zeroed counters.

        Parameters
        ----------
        max_size : int
            Passed to the queue as ``maxsize`` and to the base class.
        multi_reader : bool
            Forwarded to the base class; also reported in the startup log line.
        multi_writer : bool
            Forwarded to the base class; also reported in the startup log line.
        """
        super().__init__(max_size, multi_reader, multi_writer)
        # Use Ray's distributed queue
        self.queue = Queue(maxsize=max_size)
        self.stats = {"write_count": 0, "read_count": 0, "queue_full_count": 0}
        # Dedicated lock for stats updates
        self.stats_lock = Lock()
        # Report this class's own name so log lines identify the right edge type.
        logger.info(
            f"RayQueueEdge initialized with max_size={max_size}, "
            f"multi_reader={multi_reader}, multi_writer={multi_writer}"
        )

    # TODO(Devin): Think about adding timeouts to queue read/writes here. Stage loops already have timeouts, but
    #  adding timeouts here would allow for more graceful handling of queue issues.
    def write(self, item: Any) -> bool:
        """
        Write an item into the queue synchronously.

        If the queue reports full beforehand, ``queue_full_count`` is incremented;
        the put is attempted regardless, using the queue's default ``put`` arguments.

        Returns
        -------
        bool
            Always True once ``put`` has returned.
        """
        if self.queue.full():
            with self.stats_lock:
                self.stats["queue_full_count"] += 1
            logger.info("Queue is full. Incrementing queue_full_count.")
        logger.info("Attempting to put item into the queue.")
        self.queue.put(item)
        with self.stats_lock:
            self.stats["write_count"] += 1
            logger.info(f"Item written to queue. New write_count: {self.stats['write_count']}")
        return True

    def read(self) -> Any:
        """
        Read an item from the queue synchronously.

        Uses the queue's default ``get`` arguments and increments ``read_count``
        after the item has been retrieved.

        Returns
        -------
        Any
            The item taken from the queue.
        """
        logger.info("Attempting to get item from the queue.")
        item = self.queue.get()
        with self.stats_lock:
            self.stats["read_count"] += 1
            logger.info(f"Item read from queue. New read_count: {self.stats['read_count']}")
        return item

    def get_stats(self) -> Dict[str, int]:
        """
        Get current statistics for the queue.

        ``current_size`` is refreshed from ``queue.qsize()`` and stored in
        ``self.stats`` before a copy of the dictionary is returned.
        """
        with self.stats_lock:
            self.stats["current_size"] = self.queue.qsize()
            logger.info(f"Getting stats: {self.stats}")
            return self.stats.copy()
@@ -0,0 +1,72 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Any, Dict
7
+ from queue import Queue
8
+ from threading import Lock
9
+
10
+ import ray
11
+
12
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_edge_base import RayActorEdge
13
+
14
+ logger = logging.getLogger(__name__)
15
+ logger.setLevel(logging.INFO) # Set logger level to INFO
16
+
17
+
18
+ # TODO(Devin): Early prototype. Not currently used anywhere
19
+
20
+
21
@ray.remote
class ThreadedQueueEdge(RayActorEdge):
    """
    A threaded implementation of RayActorEdge using queue.Queue for thread safety.

    Write/read/queue-full counters are kept in ``self.stats`` and updated under
    ``self.stats_lock``; the queue itself is accessed without that lock.
    """

    def __init__(self, max_size: int, multi_reader: bool = False, multi_writer: bool = False) -> None:
        """
        Create the bounded queue and zeroed counters.

        Parameters
        ----------
        max_size : int
            Passed to ``queue.Queue`` as ``maxsize`` and to the base class.
        multi_reader : bool
            Forwarded to the base class; also reported in the startup log line.
        multi_writer : bool
            Forwarded to the base class; also reported in the startup log line.
        """
        super().__init__(max_size, multi_reader, multi_writer)
        self.queue = Queue(maxsize=max_size)
        self.stats = {"write_count": 0, "read_count": 0, "queue_full_count": 0}
        # Use a dedicated lock for updating stats only.
        self.stats_lock = Lock()
        # Keep a space after the comma so the two f-string fragments do not run
        # together ("max_size=16,multi_reader=..."), matching the sibling edges.
        logger.info(
            f"ThreadedQueueEdge initialized with max_size={max_size}, "
            f"multi_reader={multi_reader}, multi_writer={multi_writer}"
        )

    def write(self, item: Any) -> bool:
        """
        Write an item into the edge synchronously.

        If the queue is full beforehand, ``queue_full_count`` is incremented; the
        put is then attempted with ``queue.Queue.put`` defaults, which block until
        a slot is free.

        Returns
        -------
        bool
            Always True once ``put`` has returned.
        """
        if self.queue.full():
            with self.stats_lock:
                self.stats["queue_full_count"] += 1
            logger.info("Queue is full. Incrementing queue_full_count.")
        logger.info("Attempting to put item into the queue.")
        # Queue operations are thread-safe; no additional lock is needed.
        self.queue.put(item)
        with self.stats_lock:
            self.stats["write_count"] += 1
            logger.info(f"Item written to queue. New write_count: {self.stats['write_count']}")
        return True

    def read(self) -> Any:
        """
        Read an item from the edge synchronously.

        Uses ``queue.Queue.get`` defaults, which block until an item is available,
        then increments ``read_count``.

        Returns
        -------
        Any
            The item taken from the queue.
        """
        logger.info("Attempting to get item from the queue.")
        item = self.queue.get()
        with self.stats_lock:
            self.stats["read_count"] += 1
            logger.info(f"Item read from queue. New read_count: {self.stats['read_count']}")
        return item

    def get_stats(self) -> Dict[str, int]:
        """
        Get current statistics for the queue.

        ``current_size`` is refreshed from ``queue.qsize()`` and stored in
        ``self.stats`` before a copy of the dictionary is returned.
        """
        with self.stats_lock:
            self.stats["current_size"] = self.queue.qsize()
            logger.info(f"Getting stats: {self.stats}")
            return self.stats.copy()
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,408 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import json
6
+ import os
7
+ import ray
8
+ import logging
9
+ import time
10
+ from typing import Dict, Any
11
+
12
+ # Import our new pipeline class.
13
+ from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline
14
+ from nv_ingest.framework.orchestration.ray.stages.extractors.audio_extractor import AudioExtractorStage
15
+ from nv_ingest.framework.orchestration.ray.stages.extractors.chart_extractor import ChartExtractorStage
16
+ from nv_ingest.framework.orchestration.ray.stages.extractors.docx_extractor import DocxExtractorStage
17
+ from nv_ingest.framework.orchestration.ray.stages.extractors.image_extractor import ImageExtractorStage
18
+ from nv_ingest.framework.orchestration.ray.stages.extractors.pdf_extractor import PDFExtractorStage
19
+ from nv_ingest.framework.orchestration.ray.stages.extractors.table_extractor import TableExtractorStage
20
+
21
+ # Import stage implementations and configuration models.
22
+ from nv_ingest.framework.orchestration.ray.stages.injectors.metadata_injector import MetadataInjectionStage
23
+ from nv_ingest.framework.orchestration.ray.stages.mutate.image_dedup import ImageDedupStage
24
+ from nv_ingest.framework.orchestration.ray.stages.mutate.image_filter import ImageFilterStage
25
+ from nv_ingest.framework.orchestration.ray.stages.sinks.message_broker_task_sink import (
26
+ MessageBrokerTaskSinkStage,
27
+ MessageBrokerTaskSinkConfig,
28
+ )
29
+ from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
30
+ MessageBrokerTaskSourceStage,
31
+ MessageBrokerTaskSourceConfig,
32
+ start_simple_message_broker,
33
+ )
34
+ from nv_ingest.framework.orchestration.ray.stages.storage.image_storage import ImageStorageStage
35
+ from nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings import EmbeddingStorageStage
36
+ from nv_ingest.framework.orchestration.ray.stages.transforms.image_caption import ImageCaptionTransformStage
37
+ from nv_ingest.framework.orchestration.ray.stages.transforms.text_embed import TextEmbeddingTransformStage
38
+ from nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter import TextSplitterStage
39
+ from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
40
+ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
41
+ from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
42
+ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
43
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExtractorSchema
44
+ from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
45
+ from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
46
+ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
47
+ from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
48
+ from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
49
+ from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
50
+ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
51
+ from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
52
+ from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
53
+
54
+
55
def get_nim_service(env_var_prefix):
    """
    Read the endpoint settings for one NIM service from environment variables.

    Parameters
    ----------
    env_var_prefix : str
        Service prefix; it is upper-cased and used to build the variable names
        ``<PREFIX>_GRPC_ENDPOINT``, ``<PREFIX>_HTTP_ENDPOINT`` and
        ``<PREFIX>_INFER_PROTOCOL``.

    Returns
    -------
    tuple
        ``(grpc_endpoint, http_endpoint, auth_token, infer_protocol)``. Unset
        variables yield empty strings.
    """
    # The module only binds a global `logger` inside its ``__main__`` guard, so
    # relying on it raises NameError when this function is called after a plain
    # import. Resolve a logger locally instead.
    log = logging.getLogger(__name__)

    prefix = env_var_prefix.upper()
    grpc_endpoint = os.environ.get(f"{prefix}_GRPC_ENDPOINT", "")
    http_endpoint = os.environ.get(f"{prefix}_HTTP_ENDPOINT", "")

    # NVIDIA_BUILD_API_KEY wins; NGC_API_KEY is used when it is unset or empty.
    auth_token = os.environ.get("NVIDIA_BUILD_API_KEY", "") or os.environ.get("NGC_API_KEY", "")

    # Without an explicit protocol, prefer HTTP when an HTTP endpoint is set,
    # then gRPC, otherwise leave the protocol empty.
    infer_protocol = os.environ.get(
        f"{prefix}_INFER_PROTOCOL",
        "http" if http_endpoint else "grpc" if grpc_endpoint else "",
    )

    log.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
    log.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
    log.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")

    return grpc_endpoint, http_endpoint, auth_token, infer_protocol
83
+
84
+
85
# Settings for the "simple" message-broker client: a localhost broker on a
# fixed port, with retry/backoff limits and a bounded broker queue.
simple_config: Dict[str, Any] = dict(
    client_type="simple",
    host="localhost",
    port=7671,
    max_retries=3,
    max_backoff=2,
    connection_timeout=5,
    broker_params=dict(max_queue_size=1000),
)
95
+
96
# Script entry point: start Ray and a SimpleMessageBroker, assemble a linear
# multi-stage RayPipeline (source -> extractors -> mutators -> transforms ->
# storage -> sink), start it, and idle until Ctrl+C.
if __name__ == "__main__":
    # Initialize Ray with object spilling spread over four /tmp directories.
    ray.init(
        ignore_reinit_error=True,
        _system_config={
            "local_fs_capacity_threshold": 0.9,
            "object_spilling_config": json.dumps(
                {
                    "type": "filesystem",
                    "params": {
                        "directory_path": [
                            "/tmp/ray_spill_testing_0",
                            "/tmp/ray_spill_testing_1",
                            "/tmp/ray_spill_testing_2",
                            "/tmp/ray_spill_testing_3",
                        ],
                        "buffer_size": 100_000_000,
                    },
                },
            ),
        },
    )
    logging.basicConfig(level=logging.INFO)
    # NOTE: this binds the module-global `logger` that get_nim_service() also uses.
    logger = logging.getLogger("RayPipelineHarness")
    logger.info("Starting multi-stage pipeline test.")

    # Start the SimpleMessageBroker server externally.
    # NOTE(review): `broker_process` is not referenced again in this script, so
    # it is never explicitly stopped here - confirm that is intended.
    logger.info("Starting SimpleMessageBroker server.")
    broker_process = start_simple_message_broker(simple_config)
    logger.info("SimpleMessageBroker server started.")

    # Build the pipeline.
    pipeline = RayPipeline()
    logger.info("Created RayPipeline instance.")

    # Create configuration instances for the source and sink stages.
    # Both use the same broker client settings (simple_config).
    source_config = MessageBrokerTaskSourceConfig(
        broker_client=simple_config,
        task_queue="ingest_task_queue",
        poll_interval=0.1,
    )
    sink_config = MessageBrokerTaskSinkConfig(
        broker_client=simple_config,
        poll_interval=0.1,
    )
    logger.info("Source and sink configurations created.")

    # Set environment variables for various services.
    # These assignments overwrite any values already present in the environment
    # and must precede the get_nim_service() calls below, which read them.
    os.environ["YOLOX_GRPC_ENDPOINT"] = "localhost:8001"
    os.environ["YOLOX_INFER_PROTOCOL"] = "grpc"
    os.environ["YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT"] = "127.0.0.1:8007"
    os.environ["YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL"] = "grpc"
    os.environ["YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT"] = "127.0.0.1:8004"
    os.environ["YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT"] = "http://localhost:8003/v1/infer"
    os.environ["YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL"] = "http"
    os.environ["PADDLE_GRPC_ENDPOINT"] = "localhost:8010"
    os.environ["PADDLE_INFER_PROTOCOL"] = "grpc"
    os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
    os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
    os.environ["VLM_CAPTION_MODEL_NAME"] = "meta/llama-3.2-11b-vision-instruct"
    logger.info("Environment variables set.")

    # The caption endpoint and model are hard-coded here rather than read back
    # from the VLM_CAPTION_* variables set above.
    image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
    image_caption_model_name = "meta/llama-3.2-11b-vision-instruct"
    # Resolve (grpc, http, auth_token, protocol) for each service prefix.
    yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
    (
        yolox_table_structure_grpc,
        yolox_table_structure_http,
        yolox_table_structure_auth,
        yolox_table_structure_protocol,
    ) = get_nim_service("yolox_table_structure")
    (
        yolox_graphic_elements_grpc,
        yolox_graphic_elements_http,
        yolox_graphic_elements_auth,
        yolox_graphic_elements_protocol,
    ) = get_nim_service("yolox_graphic_elements")
    nemoretriever_parse_grpc, nemoretriever_parse_http, nemoretriever_parse_auth, nemoretriever_parse_protocol = (
        get_nim_service("nemoretriever_parse")
    )
    paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")

    # Keyword arguments for each stage's schema; endpoints are passed as
    # (grpc, http) tuples.
    model_name = os.environ.get("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")
    pdf_extractor_config = {
        "pdfium_config": {
            "auth_token": yolox_auth,  # All auth tokens are the same for the moment
            "yolox_endpoints": (yolox_grpc, yolox_http),
            "yolox_infer_protocol": yolox_protocol,
        },
        "nemoretriever_parse_config": {
            "auth_token": nemoretriever_parse_auth,
            "nemoretriever_parse_endpoints": (nemoretriever_parse_grpc, nemoretriever_parse_http),
            "nemoretriever_parse_infer_protocol": nemoretriever_parse_protocol,
            "nemoretriever_parse_model_name": model_name,
            "yolox_endpoints": (yolox_grpc, yolox_http),
            "yolox_infer_protocol": yolox_protocol,
        },
    }
    docx_extractor_config = {
        "docx_extraction_config": {
            "yolox_endpoints": (yolox_grpc, yolox_http),
            "yolox_infer_protocol": yolox_protocol,
            "auth_token": yolox_auth,
        }
    }
    # Charts use the graphic-elements YOLOX service plus Paddle.
    chart_extractor_config = {
        "endpoint_config": {
            "yolox_endpoints": (yolox_graphic_elements_grpc, yolox_graphic_elements_http),
            "yolox_infer_protocol": yolox_graphic_elements_protocol,
            "paddle_endpoints": (paddle_grpc, paddle_http),
            "paddle_infer_protocol": paddle_protocol,
            "auth_token": yolox_auth,
        }
    }
    # Tables use the table-structure YOLOX service plus Paddle.
    table_extractor_config = {
        "endpoint_config": {
            "yolox_endpoints": (yolox_table_structure_grpc, yolox_table_structure_http),
            "yolox_infer_protocol": yolox_table_structure_protocol,
            "paddle_endpoints": (paddle_grpc, paddle_http),
            "paddle_infer_protocol": paddle_protocol,
            "auth_token": yolox_auth,
        }
    }
    text_embedding_config = {
        "api_key": yolox_auth,
        "embedding_nim_endpoint": "http://localhost:8012/v1",
        "embedding_model": "nvidia/llama-3.2-nv-embedqa-1b-v2",
    }
    image_extraction_config = {
        "yolox_endpoints": (yolox_grpc, yolox_http),
        "yolox_infer_protocol": yolox_protocol,
        "auth_token": yolox_auth,  # All auth tokens are the same for the moment
    }
    image_caption_config = {
        "api_key": yolox_auth,
        "endpoint_url": image_caption_endpoint_url,
        "image_caption_model_name": image_caption_model_name,
        "prompt": "Caption the content of this image:",
    }
    logger.info("Service configuration retrieved from get_nim_service and environment variables.")

    # Add stages:
    # Each stage is registered under the name that make_edge() refers to below.
    pipeline.add_source(
        name="source",
        source_actor=MessageBrokerTaskSourceStage,
        config=source_config,
    )
    # TODO(Job_Counter): Utilizes a global that isn't compatible with Ray, will need to make it a shared object
    # pipeline.add_stage(
    #    name="job_counter",
    #    stage_actor=JobCounterStage,
    #    config=JobCounterSchema(),
    #    min_replicas=1,
    #    max_replicas=1,
    # )
    pipeline.add_stage(
        name="metadata_injection",
        stage_actor=MetadataInjectionStage,
        config=MetadataInjectorSchema(),  # Use stage-specific config if needed.
        min_replicas=0,
        max_replicas=2,
    )
    pipeline.add_stage(
        name="pdf_extractor",
        stage_actor=PDFExtractorStage,
        config=PDFExtractorSchema(**pdf_extractor_config),
        min_replicas=0,
        max_replicas=16,
    )
    pipeline.add_stage(
        name="docx_extractor",
        stage_actor=DocxExtractorStage,
        config=DocxExtractorSchema(**docx_extractor_config),
        min_replicas=0,
        max_replicas=8,
    )
    pipeline.add_stage(
        name="audio_extractor",
        stage_actor=AudioExtractorStage,
        config=AudioExtractorSchema(),
        min_replicas=0,
        max_replicas=8,
    )
    pipeline.add_stage(
        name="image_extractor",
        stage_actor=ImageExtractorStage,
        config=ImageExtractorSchema(**image_extraction_config),
        min_replicas=0,
        max_replicas=8,
    )
    pipeline.add_stage(
        name="table_extractor",
        stage_actor=TableExtractorStage,
        config=TableExtractorSchema(**table_extractor_config),
        min_replicas=0,
        max_replicas=8,
    )
    pipeline.add_stage(
        name="chart_extractor",
        stage_actor=ChartExtractorStage,
        config=ChartExtractorSchema(**chart_extractor_config),
        min_replicas=0,
        max_replicas=8,
    )
    pipeline.add_stage(
        name="text_embedding",
        stage_actor=TextEmbeddingTransformStage,
        config=TextEmbeddingSchema(**text_embedding_config),
        min_replicas=0,
        max_replicas=8,
    )
    pipeline.add_stage(
        name="image_filter",
        stage_actor=ImageFilterStage,
        config=ImageFilterSchema(),
        min_replicas=0,
        max_replicas=4,
    )
    pipeline.add_stage(
        name="image_dedup",
        stage_actor=ImageDedupStage,
        config=ImageDedupSchema(),
        min_replicas=0,
        max_replicas=4,
    )
    pipeline.add_stage(
        name="image_storage",
        stage_actor=ImageStorageStage,
        config=ImageStorageModuleSchema(),
        min_replicas=0,
        max_replicas=4,
    )
    pipeline.add_stage(
        name="embedding_storage",
        stage_actor=EmbeddingStorageStage,
        config=EmbeddingStorageSchema(),
        min_replicas=0,
        max_replicas=4,
    )
    pipeline.add_stage(
        name="text_splitter",
        stage_actor=TextSplitterStage,
        config=TextSplitterSchema(),
        min_replicas=0,
        max_replicas=4,
    )
    pipeline.add_stage(
        name="image_caption",
        stage_actor=ImageCaptionTransformStage,
        config=ImageCaptionExtractionSchema(**image_caption_config),
        min_replicas=0,
        max_replicas=4,
    )
    pipeline.add_sink(
        name="sink",
        sink_actor=MessageBrokerTaskSinkStage,
        config=sink_config,
        min_replicas=0,
        max_replicas=2,
    )
    logger.info("Added sink stage to pipeline.")

    # Wire the stages together into a single linear chain with make_edge().
    # NOTE(review): the edge implementation make_edge() uses is not visible
    # here - confirm before naming a specific edge class in this comment.
    ###### INTAKE STAGES ########
    pipeline.make_edge("source", "metadata_injection", queue_size=16)
    # pipeline.make_edge("job_counter", "metadata_injection", queue_size=16)
    pipeline.make_edge("metadata_injection", "pdf_extractor", queue_size=128)  # to limit memory pressure

    ###### Document Extractors ########
    pipeline.make_edge("pdf_extractor", "audio_extractor", queue_size=16)
    pipeline.make_edge("audio_extractor", "docx_extractor", queue_size=16)
    pipeline.make_edge("docx_extractor", "image_extractor", queue_size=16)
    pipeline.make_edge("image_extractor", "table_extractor", queue_size=16)

    ###### Primitive Extractors ########
    pipeline.make_edge("table_extractor", "chart_extractor", queue_size=16)
    pipeline.make_edge("chart_extractor", "image_filter", queue_size=16)

    ###### Primitive Mutators ########
    pipeline.make_edge("image_filter", "image_dedup", queue_size=16)
    pipeline.make_edge("image_dedup", "text_splitter", queue_size=16)

    ###### Primitive Transforms ########
    pipeline.make_edge("text_splitter", "text_embedding", queue_size=16)
    pipeline.make_edge("text_embedding", "image_caption", queue_size=16)
    pipeline.make_edge("image_caption", "image_storage", queue_size=16)

    ###### Primitive Storage ########
    pipeline.make_edge("image_storage", "embedding_storage", queue_size=16)
    pipeline.make_edge("embedding_storage", "sink", queue_size=16)

    logger.info("Completed wiring of pipeline edges.")

    # Build the pipeline (this instantiates actors and wires edges).
    logger.info("Building pipeline...")
    pipeline.build()
    logger.info("Pipeline build complete.")

    # Optionally, visualize the pipeline graph.
    # pipeline.visualize(mode="text", verbose=True, max_width=120)

    # Start the pipeline.
    logger.info("Starting pipeline...")
    pipeline.start()
    logger.info("Pipeline started successfully.")

    # Keep the driver alive; the pipeline is stopped and Ray shut down only
    # when a KeyboardInterrupt (Ctrl+C) is received.
    try:
        while True:
            time.sleep(5)
    except KeyboardInterrupt:
        logger.info("Interrupt received, shutting down pipeline.")
        pipeline.stop()
        ray.shutdown()
        logger.info("Ray shutdown complete.")
@@ -0,0 +1,63 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import ray
6
+ import logging
7
+ import time
8
+
9
+ # Import the new source stage and its configuration
10
+ from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
11
+ MessageBrokerTaskSourceStage,
12
+ MessageBrokerTaskSourceConfig,
13
+ )
14
+
15
+
16
def main():
    """
    Run a standalone MessageBrokerTaskSourceStage actor against a local Redis.

    Starts Ray, launches the source actor, then idles until Ctrl+C, at which
    point the actor is stopped and its stats are logged. Ray is shut down on
    the way out regardless of how the wait loop ends.
    """
    # Bring up Ray first, then logging.
    ray.init(ignore_reinit_error=True)

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("RayTestHarness")

    # Redis client settings handed to the source stage.
    broker_settings = dict(
        client_type="redis",
        host="localhost",  # Adjust host if needed
        port=6379,  # Default Redis port
        max_retries=3,
        max_backoff=2,
        connection_timeout=5,
        broker_params=dict(db=0, use_ssl=False),
    )

    # Source-stage configuration built from the settings above.
    source_cfg = MessageBrokerTaskSourceConfig(
        broker_client=broker_settings,
        task_queue="ingest_task_queue",
        poll_interval=0.1,
    )

    source_actor = MessageBrokerTaskSourceStage.remote(source_cfg)

    # Block until the actor reports that it has started.
    start_ref = source_actor.start.remote()
    ray.get(start_ref)
    log.info("MessageBrokerTaskSource actor started. Listening for messages...")

    try:
        # Idle in one-second naps until Ctrl+C arrives.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        log.info("Ctrl+C detected. Stopping actor...")
        ray.get(source_actor.stop.remote())
        actor_stats = ray.get(source_actor.get_stats.remote())
        log.info(f"Actor processing stats: {actor_stats}")
    finally:
        ray.shutdown()
        log.info("Ray shutdown complete.")


if __name__ == "__main__":
    main()