nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (100) hide show
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
@@ -0,0 +1,94 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import ray
6
+ import logging
7
+ import time
8
+
9
+ # Import the source and sink stages and their configuration models.
10
+ from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
11
+ MessageBrokerTaskSourceStage,
12
+ MessageBrokerTaskSourceConfig,
13
+ )
14
+ from nv_ingest.framework.orchestration.ray.stages.sinks.message_broker_task_sink import (
15
+ MessageBrokerTaskSinkStage,
16
+ MessageBrokerTaskSinkConfig,
17
+ )
18
+
19
+ # Import the async queue edge.
20
+ from nv_ingest.framework.orchestration.ray.edges.async_queue_edge import AsyncQueueEdge
21
+
22
+
23
def main():
    """
    Run a minimal source -> queue -> sink harness on Ray until Ctrl+C.

    A message-broker source actor and a message-broker sink actor are wired
    together through a single AsyncQueueEdge actor, started, and left running.
    On KeyboardInterrupt both actors are stopped and their stats are logged.
    Ray is shut down when the wait loop is left, whatever the reason.
    """
    ray.init(ignore_reinit_error=True)

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("RayPipelineHarness")

    # One Redis broker definition shared by the source and the sink.
    broker_settings = {
        "client_type": "redis",
        "host": "localhost",  # Adjust as needed.
        "port": 6379,
        "max_retries": 3,
        "max_backoff": 2,
        "connection_timeout": 5,
        "broker_params": {"db": 0, "use_ssl": False},
    }

    # Stage configurations.
    src_cfg = MessageBrokerTaskSourceConfig(
        broker_client=broker_settings,
        task_queue="ingest_task_queue",
        poll_interval=0.1,
        batch_size=10,
    )
    snk_cfg = MessageBrokerTaskSinkConfig(
        broker_client=broker_settings,
        poll_interval=0.1,  # Same interval as the source; adjust as needed.
    )

    # Actors: the connecting edge (max size 100) first, then source and sink.
    edge = AsyncQueueEdge.remote(max_size=100, multi_reader=True, multi_writer=True)
    source = MessageBrokerTaskSourceStage.remote(src_cfg, 1)
    sink = MessageBrokerTaskSinkStage.remote(snk_cfg, 1)

    # The same edge is the source's output and the sink's input.
    ray.get(source.set_output_edge.remote(edge))
    ray.get(sink.set_input_edge.remote(edge))

    # Start the source before the sink, waiting for each call to complete.
    for stage_actor in (source, sink):
        ray.get(stage_actor.start.remote())
    log.info("Source and Sink actors started, connected via AsyncQueueEdge.")

    try:
        # Idle here; the actors do the work until Ctrl+C arrives.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        log.info("KeyboardInterrupt received. Stopping actors...")
        for stage_actor in (source, sink):
            ray.get(stage_actor.stop.remote())
        source_stats = ray.get(source.get_stats.remote())
        sink_stats = ray.get(sink.get_stats.remote())
        log.info(f"Source stats: {source_stats}")
        log.info(f"Sink stats: {sink_stats}")
    finally:
        ray.shutdown()
        log.info("Ray shutdown complete.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,239 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ import threading
7
+ import time
8
+ from dataclasses import dataclass
9
+ from typing import Optional, Any, List, Tuple, Dict
10
+
11
+ from nv_ingest.framework.orchestration.ray.util.system_tools.visualizers import GuiUtilizationDisplay
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
@dataclass
class DisplayConfig:
    """
    Display options for monitoring.

    Attributes:
        use_gui: GUI display flag; ``False`` unless set by the caller.
    """

    use_gui: bool = False
21
+
22
+
23
@dataclass
class MonitorConfig:
    """
    Settings consumed by PipelineMonitor.

    Attributes:
        use_gui: When True the monitor thread runs the GUI display; this takes
            precedence over ``use_console``.
        poll_interval: Seconds between monitor refreshes (also converted to
            milliseconds for the GUI refresh rate).
        use_console: When True, and ``use_gui`` is False, the monitor takes the
            console display path. With both flags False the monitor does not
            start at all.
    """

    use_gui: bool = False
    poll_interval: float = 5.0
    # Room for further console display options (e.g. enable/disable switches).
    use_console: bool = True  # Console is the default when the GUI is off.
31
+
32
+
33
class PipelineMonitor:
    """
    Monitors a RayPipeline instance and manages its display (GUI or Console).

    Runs in a separate daemon thread, periodically fetching data from the
    pipeline and updating the display based on its own configuration.
    Decoupled from the RayPipeline lifecycle.

    The monitor only reads from the pipeline: it uses ``pipeline.stats_collector``,
    ``pipeline.stats_config`` and ``pipeline.topology``.
    """

    # Upper bound (seconds) for one sleep slice in the console loop, so that a
    # stop() request is noticed quickly even with a long poll interval.
    _SLEEP_SLICE_SECONDS = 0.25

    def __init__(self, pipeline: Any, config: "MonitorConfig"):
        """
        Initializes the monitor.

        Args:
            pipeline: The RayPipeline instance to monitor.
            config: Configuration for the monitoring behavior and display type.

        Raises:
            TypeError: If ``config`` is not a MonitorConfig instance.
        """
        if not isinstance(config, MonitorConfig):
            raise TypeError("config argument must be an instance of MonitorConfig")

        self.pipeline = pipeline
        self.config = config
        self._thread: Optional[threading.Thread] = None
        self._running: bool = False
        self._display_instance: Optional[Any] = None
        logger.debug("PipelineMonitor initialized.")

    def start(self) -> None:
        """
        Starts the monitoring thread and display.

        Does nothing if the monitor is already running, or if neither the GUI
        nor the console display is enabled in the configuration.
        """
        if self._running:
            return

        if not self.config.use_gui and not self.config.use_console:
            logger.info("PipelineMonitor not starting: No display (GUI or Console) enabled in MonitorConfig.")
            return

        self._running = True
        self._thread = threading.Thread(
            target=self._loop,
            args=(self.config.poll_interval,),  # Use interval from MonitorConfig
            name="PipelineMonitorThread",
            daemon=True,
        )
        self._thread.start()
        logger.info(f"PipelineMonitor thread launched (Interval: {self.config.poll_interval}s).")

    def stop(self) -> None:
        """
        Stops the monitoring thread and cleans up the display.

        Safe to call when the monitor was never started (no-op). It also cleans
        up when the monitor loop has already ended by itself (for example after
        the GUI loop returned): the finished thread is joined and the thread and
        display references are released.
        """
        was_running = self._running
        if not was_running and self._thread is None and self._display_instance is None:
            return  # Never started, or already fully stopped.

        logger.debug("Stopping PipelineMonitor thread...")
        self._running = False

        # Ask the display to stop only if the loop was still active; a loop
        # that ended by itself has already finished with its display.
        display_type = "GUI" if self.config.use_gui else "Console" if self.config.use_console else "None"
        display = self._display_instance
        if was_running and display and hasattr(display, "stop"):
            logger.debug(f"Requesting {display_type} display stop...")
            try:
                # GUI stop might need special handling depending on library
                display.stop()
            except Exception as e:
                logger.error(f"Error stopping {display_type} display instance: {e}", exc_info=True)

        # Join the thread, unless stop() is being called from that thread
        # itself (joining the current thread raises RuntimeError).
        thread = self._thread
        if thread is not None and thread is not threading.current_thread():
            # Timeout might depend on display type shutdown time
            join_timeout = 10.0 if self.config.use_gui else 5.0
            thread.join(timeout=join_timeout)
            if thread.is_alive():
                logger.warning("PipelineMonitor thread did not exit cleanly.")

        self._thread = None
        self._display_instance = None
        logger.info("PipelineMonitor stopped.")

    @staticmethod
    def _format_stage_row(
        stage: Any,
        stage_actors: Dict,
        stage_stats: Dict,
        edge_queues: Dict,
        scaling_states: Dict,
    ) -> Tuple:
        """Builds the six-column display row for a single pipeline stage."""
        stage_name = stage.name

        replicas = stage_actors.get(stage_name, [])
        replicas_str = f"{len(replicas)}/{stage.max_replicas}"
        if stage.min_replicas > 0:
            replicas_str += f" (min {stage.min_replicas})"

        stats = stage_stats.get(stage_name, {})
        processing = stats.get("processing", 0)
        in_flight = stats.get("in_flight", 0)
        # Items counted as in flight but not currently being processed.
        queue_depth = max(0, in_flight - processing)

        # Input edges are identified by the "<something>_to_<stage>" naming.
        input_edges = [ename for ename in edge_queues if ename.endswith(f"_to_{stage_name}")]
        occupancy_str = "N/A"
        if input_edges:
            try:
                # Only the first input edge's capacity is shown; "(multi)"
                # flags that the stage has more than one input edge.
                _, max_q = edge_queues[input_edges[0]]
                occupancy_str = f"{queue_depth}/{max_q}" + (" (multi)" if len(input_edges) > 1 else "")
            except Exception:
                occupancy_str = f"{queue_depth}/ERR"
        elif stage.is_source:
            occupancy_str = "(Source)"

        scaling_state = scaling_states.get(stage_name, "Idle")
        return (stage_name, replicas_str, occupancy_str, scaling_state, str(processing), str(in_flight))

    def _get_monitor_data(self) -> List[Tuple]:
        """
        Fetches stats and topology data from the associated RayPipeline
        and formats it for display.

        Returns:
            A list of six-element tuples: an optional warning row when the
            stats are failed or stale, one row per stage, and a final totals
            row. If anything goes wrong while gathering or formatting, the
            error is logged and an error row is appended instead of raising.
        """
        output_rows: List[Tuple] = []

        try:
            # Access pipeline components via self.pipeline. This is done inside
            # the try block so that a misconfigured pipeline yields an error
            # row instead of raising into the display callback.
            stats_collector = self.pipeline.stats_collector
            stats_config = self.pipeline.stats_config  # Need interval for staleness check
            topology = self.pipeline.topology

            current_stage_stats, last_update_time, stats_were_successful = stats_collector.get_latest_stats()
            last_update_age = time.time() - last_update_time

            # Get snapshots from topology
            current_stages = topology.get_stages_info()
            current_stage_actors = topology.get_stage_actors()
            current_edge_queues = topology.get_edge_queues()
            current_scaling_state = topology.get_scaling_state()
            current_is_flushing = topology.get_is_flushing()

            # --- Check stats staleness/failure ---
            max_stats_age_display = max(10.0, stats_config.collection_interval_seconds * 2.5)
            stats_stale = last_update_age > max_stats_age_display
            if not stats_were_successful or stats_stale:
                status = "Failed" if not stats_were_successful else "Stale"
                warning_msg = f"[bold red]Stats {status} ({last_update_age:.1f}s ago)[/bold red]"
                output_rows.append((warning_msg, "", "", "", "", ""))

            # --- Format data using topology snapshots ---
            for stage in current_stages:
                output_rows.append(
                    self._format_stage_row(
                        stage,
                        current_stage_actors,
                        current_stage_stats,
                        current_edge_queues,
                        current_scaling_state,
                    )
                )

            # --- Add Total Summary Row ---
            per_stage = [s for s in current_stage_stats.values() if isinstance(s, dict)]
            global_processing = sum(s.get("processing", 0) for s in per_stage)
            global_in_flight = sum(s.get("in_flight", 0) for s in per_stage)
            output_rows.append(
                (
                    "[bold]Total Pipeline[/bold]",
                    "",
                    "",
                    f"Flushing: {current_is_flushing}",
                    f"[bold]{global_processing}[/bold]",
                    f"[bold]{global_in_flight}[/bold]",
                )
            )

        except Exception as e:
            logger.error(f"Error gathering monitor data: {e}", exc_info=True)
            output_rows.append(("[bold red]Error gathering data[/bold red]", "", "", "", "", ""))

        return output_rows

    def _sleep_while_running(self, duration: float) -> None:
        """Sleeps up to ``duration`` seconds, returning early once stop is requested."""
        deadline = time.monotonic() + duration
        while self._running:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(remaining, self._SLEEP_SLICE_SECONDS))

    def _run_console_loop(self, poll_interval: float, thread_name: str) -> None:
        """Polls monitor data and pushes it to the console display until stopped."""
        while self._running:
            loop_start = time.time()
            try:
                monitor_data = self._get_monitor_data()
                display = self._display_instance
                if display and hasattr(display, "update"):
                    display.update(monitor_data)
                elif display is None:  # Display was cleared externally
                    logger.warning(f"{thread_name}: Console display instance gone. Stopping loop.")
                    break
            except Exception as e:
                logger.error(f"{thread_name}: Error in console monitor loop: {e}", exc_info=True)

            elapsed = time.time() - loop_start
            # Sleep at least 0.1s; the sleep is sliced so a stop() request
            # does not have to wait for the full poll interval.
            self._sleep_while_running(max(0.1, poll_interval - elapsed))

    def _loop(self, poll_interval: float) -> None:
        """
        Main loop for the monitoring thread.

        With the GUI enabled this blocks inside the GUI display's own loop.
        Otherwise, with the console enabled, it runs the polling console loop,
        provided a console display instance exists.

        Args:
            poll_interval: Seconds between refreshes.
        """
        thread_name = threading.current_thread().name
        logger.debug(f"{thread_name}: Monitor loop started.")
        display_type = "None"
        try:
            # --- Initialize Display based on MonitorConfig ---
            if self.config.use_gui:
                display_type = "GUI"
                logger.info(f"{thread_name}: Initializing GUI display...")
                self._display_instance = GuiUtilizationDisplay(refresh_rate_ms=int(poll_interval * 1000))
                logger.info(f"{thread_name}: Starting blocking GUI display loop...")
                self._display_instance.start(self._get_monitor_data)
                logger.info(f"{thread_name}: GUI display loop finished.")
                self._running = False  # GUI loop finished, so monitoring stops
            elif self.config.use_console:
                display_type = "Console"
                logger.info(f"{thread_name}: Initializing Console display...")
                # TODO: Console display is disabled; no console display object
                # is created here yet. Once one is available it should be
                # created, assigned to self._display_instance and started
                # before the loop below.
                if self._display_instance is None:
                    # Without a display there is nothing to update, so finish
                    # cleanly instead of reporting a display that "went away".
                    logger.warning(f"{thread_name}: Console display is not available. Monitor loop will not run.")
                    self._running = False
                else:
                    logger.info(f"{thread_name}: Console display started.")
                    self._run_console_loop(poll_interval, thread_name)
            # else: No display enabled - loop finishes immediately

        except Exception as e:
            logger.error(f"{thread_name}: {display_type} Display setup or execution failed: {e}", exc_info=True)
        finally:
            if self._running:  # Loop exited unexpectedly
                logger.warning(f"{thread_name}: Monitoring loop exited prematurely.")
                self._running = False
            logger.debug(f"{thread_name}: Monitor loop finished.")