nv-ingest 2025.5.21.dev20250521__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/__init__.py +20 -0
- nv_ingest/api/__init__.py +3 -0
- nv_ingest/api/main.py +43 -0
- nv_ingest/api/v1/__init__.py +3 -0
- nv_ingest/api/v1/health.py +114 -0
- nv_ingest/api/v1/ingest.py +454 -0
- nv_ingest/framework/__init__.py +3 -0
- nv_ingest/framework/orchestration/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
- nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
- nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
- nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
- nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
- nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
- nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
- nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
- nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
- nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
- nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
- nv_ingest/framework/schemas/__init__.py +0 -0
- nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
- nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
- nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
- nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
- nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
- nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
- nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
- nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
- nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
- nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
- nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
- nv_ingest/framework/util/__init__.py +3 -0
- nv_ingest/framework/util/flow_control/__init__.py +8 -0
- nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
- nv_ingest/framework/util/service/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
- nv_ingest/framework/util/service/meta/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
- nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest/framework/util/telemetry/global_stats.py +145 -0
- nv_ingest/version.py +38 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.live import Live
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
import tkinter as tk
|
|
10
|
+
from tkinter import ttk
|
|
11
|
+
from tkinter.ttk import Style
|
|
12
|
+
import logging
|
|
13
|
+
import time
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# --- Utilization Display Class (Rich Console) ---
|
|
19
|
+
class UtilizationDisplay:
    """
    Render pipeline utilization snapshots in place on the console via Rich.

    A single ``rich.live.Live`` region is created lazily and re-rendered with a
    freshly built table on every ``update`` call.
    """

    def __init__(self, refresh_rate: float = 2):
        """
        Parameters
        ----------
        refresh_rate : float
            Divisor used to derive Live's ``refresh_per_second``
            (``1.0 / refresh_rate``) when the display is started.
        """
        self.console = Console()
        self.live: Optional[Live] = None  # created on first start()/update()
        self.refresh_rate = refresh_rate

    def _create_table(self):
        """Return an empty status table whose caption carries the current local time."""
        stamp = time.strftime("%Y-%m-%d %H:%M:%S")
        snapshot = Table(title="Pipeline Status Snapshot", caption=f"Updated: {stamp}")

        # (header, keyword options) for each of the six columns, in display order.
        column_specs = (
            ("Stage", {"justify": "left", "style": "cyan", "no_wrap": True}),
            ("Replicas (cur/max [min])", {"justify": "right", "style": "magenta"}),
            ("Input Queues (occ/max)", {"justify": "left", "style": "green"}),
            ("State", {"justify": "left", "style": "yellow"}),
            ("Processing", {"justify": "right", "style": "red"}),
            ("In Flight (proc+queued)", {"justify": "right", "style": "bright_blue"}),
        )
        for header, options in column_specs:
            snapshot.add_column(header, **options)
        return snapshot

    def start(self):
        """Create and start the Live region; does nothing if one already exists."""
        if self.live is not None:
            return

        initial_table = self._create_table()
        self.live = Live(
            initial_table,
            console=self.console,
            refresh_per_second=1.0 / self.refresh_rate,
            transient=False,
            vertical_overflow="visible",
        )
        self.live.start(refresh=True)
        logger.debug("Rich Utilization display started.")

    def update(self, output_rows):
        """
        Replace the displayed table with one built from ``output_rows``.

        Rows that do not have exactly six entries are skipped with a warning.
        Errors raised by the Live refresh are logged and not propagated.
        """
        if self.live is None:
            self.start()
        if not self.live:
            return

        snapshot = self._create_table()
        for row in output_rows:
            if len(row) != 6:
                logger.warning(f"Skipping invalid Rich row for display: {row}")
                continue
            snapshot.add_row(*row)

        try:
            self.live.update(snapshot, refresh=True)
        except Exception as e:
            logger.error(f"Error updating Rich Live display: {e}", exc_info=False)

    def stop(self):
        """Stop the Live region if one is active and always drop the reference to it."""
        if self.live is None:
            return

        try:
            self.live.stop()
            logger.debug("Rich Utilization display stopped.")
        except Exception as e:
            logger.error(f"Error stopping Rich Live display: {e}")
        finally:
            self.live = None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class GuiUtilizationDisplay:
    """
    Displays pipeline status in a Tkinter GUI window using a Treeview.

    Attempts to mimic console colors with a black background using ttk.Style.

    The class behaves as a singleton: ``__new__`` returns the cached instance
    (or ``None`` when a Tk window cannot be opened), ``__init__`` builds the
    widgets only once per instance, and ``stop()`` clears the cached instance.
    """

    # Cached singleton; None until the first successful construction and again
    # after stop().
    _instance = None

    def __new__(cls, *args, **kwargs):
        """
        Return the singleton instance, creating it on first use.

        Returns
        -------
        GuiUtilizationDisplay or None
            ``None`` when Tk raises ``TclError`` during the availability probe
            (for example when no display is available); ``__init__`` is not run
            in that case, so callers must check for ``None``.
        """
        if cls._instance is None:
            try:
                # Check for display availability before creating the main window
                root_test = tk.Tk()
                root_test.withdraw()
                root_test.destroy()
                cls._instance = super(GuiUtilizationDisplay, cls).__new__(cls)
                cls._instance._initialized = False
                logger.info("GUI mode enabled. Tkinter seems available.")
            except tk.TclError as e:
                logger.error(
                    f"Cannot initialize Tkinter GUI (maybe no display available?): {e}. Falling back to console."
                )
                cls._instance = None  # Explicitly set to None on failure
                return None  # Signal failure
        return cls._instance

    def __init__(self, title="Pipeline Status", refresh_rate_ms=5000):
        """
        Build the main window, styles, Treeview and scrollbar.

        Parameters
        ----------
        title : str
            Title of the Tk root window.
        refresh_rate_ms : int
            Delay, in milliseconds, passed to ``root.after`` between periodic updates.

        Raises
        ------
        RuntimeError
            If the Tk root window cannot be created.
        """
        # Prevent re-initialization for singleton
        if hasattr(self, "_initialized") and self._initialized:
            return

        # Ensure root window exists before proceeding
        if not hasattr(self, "root") or self.root is None:
            try:
                self.root = tk.Tk()
                self.root.title(title)
                # Closing the window goes through stop() so state is reset consistently.
                self.root.protocol("WM_DELETE_WINDOW", self.stop)
                self.root.geometry("1024x400")  # Set initial size
            except tk.TclError as e:
                logger.error(f"Failed to create main Tkinter window: {e}")
                self.root = None
                raise RuntimeError("Failed to initialize GUI window") from e

        self.refresh_rate_ms = refresh_rate_ms
        self._update_callback = None
        self._running = False

        # --- Style Configuration ---
        self.style = Style()
        try:
            self.style.theme_use("clam")
        except tk.TclError:
            logger.warning("Failed to set 'clam' theme, using default ttk theme.")

        # Define colors
        BG_COLOR = "black"
        FG_COLOR = "white"
        HEADING_FG = "white"

        # Configure Treeview style
        self.style.configure(
            "Treeview", background=BG_COLOR, fieldbackground=BG_COLOR, foreground=FG_COLOR, borderwidth=0
        )

        # Configure Heading style
        self.style.configure(
            "Treeview.Heading",
            background=BG_COLOR,
            foreground=HEADING_FG,
            font=("Helvetica", 10, "bold"),
            relief="flat",
        )

        # Improve selected item appearance
        self.style.map("Treeview", background=[("selected", "#222222")], foreground=[("selected", FG_COLOR)])
        self.style.map("Treeview.Heading", relief=[("active", "flat"), ("pressed", "flat")])

        # --- SCROLLBAR STYLE CONFIGURATION ---
        # The ttk style name for vertical scrollbars is "Vertical.TScrollbar".
        # The previous name ("Vertical.TtkScrollbar") matched no widget, so none
        # of these options were ever applied.
        self.style.configure(
            "Vertical.TScrollbar",
            gripcount=0,
            background="#444444",  # Color of the slider handle
            darkcolor="#555555",  # Shading color (theme dependent)
            lightcolor="#555555",  # Shading color (theme dependent)
            troughcolor=BG_COLOR,  # Background of the scrollbar track
            bordercolor=BG_COLOR,  # Border color (try to match background)
            arrowcolor=FG_COLOR,  # Color of the arrows
            relief="flat",
            arrowsize=12,  # Adjust arrow size if needed
        )

        # Define columns
        self.columns = (
            "Stage",
            "Replicas (cur/max [min])",
            "Input Queues (occ/max)",
            "State",
            "Processing",
            "In Flight (proc+queued)",
        )

        # Create Treeview
        self.tree = ttk.Treeview(self.root, columns=self.columns, show="headings", style="Treeview")

        # Per-column (width, anchor, stretch); columns not listed use the default.
        column_layout = {
            "Stage": (180, tk.W, tk.NO),
            "Input Queues (occ/max)": (180, tk.W, tk.NO),
            "Replicas (cur/max [min])": (150, tk.CENTER, tk.NO),
            "State": (100, tk.CENTER, tk.NO),
        }
        default_layout = (100, tk.CENTER, tk.YES)

        # Configure headings and column properties
        for col in self.columns:
            self.tree.heading(col, text=col, anchor=tk.CENTER)
            width, anchor, stretch = column_layout.get(col, default_layout)
            self.tree.column(col, width=width, anchor=anchor, stretch=stretch)

        # --- SCROLLBAR INSTANTIATION ---
        # No explicit 'style' argument: a vertical ttk.Scrollbar uses the
        # "Vertical.TScrollbar" style configured above by default.
        scrollbar = ttk.Scrollbar(self.root, orient=tk.VERTICAL, command=self.tree.yview)

        self.tree.configure(yscrollcommand=scrollbar.set)

        # Layout
        self.tree.grid(row=0, column=0, sticky="nsew")
        scrollbar.grid(row=0, column=1, sticky="ns")
        self.root.grid_rowconfigure(0, weight=1)
        self.root.grid_columnconfigure(0, weight=1)

        self._initialized = True
        logger.debug("GUIUtilizationDisplay initialized with custom styles.")

    def _window_alive(self) -> bool:
        """Return True only if the root window is set and still exists; never raises TclError."""
        root = getattr(self, "root", None)
        if root is None:
            return False
        try:
            return bool(root.winfo_exists())
        except tk.TclError:
            # winfo_exists() itself fails once the Tk application has been destroyed.
            return False

    def _periodic_update(self):
        """Internal method called by Tkinter's 'after' mechanism."""
        if not self._running or self._update_callback is None:
            return

        try:
            # Check if root window still exists before proceeding
            if not self._window_alive():
                self._running = False  # Stop if window is gone
                return

            output_rows = self._update_callback()
            self._update_table_data(output_rows)

        except Exception as e:
            # Avoid logging excessively if window closed during update
            if self._running and self._window_alive():
                logger.error(f"Error during GUI periodic update: {e}", exc_info=True)

        # Schedule the next update only if still running and window exists
        if self._running and self._window_alive():
            try:
                self.root.after(self.refresh_rate_ms, self._periodic_update)
            except tk.TclError:  # Handle race condition where root is destroyed between check and call
                logger.warning("GUI window closed during periodic update scheduling.")
                self._running = False

    def _update_table_data(self, output_rows):
        """
        Populates the Treeview with new data.

        Every existing row is removed and one row is inserted per entry of
        ``output_rows``. The literal markers ``[bold]`` and ``[/bold]`` are
        stripped from each cell; rows whose length differs from the number of
        columns are skipped with a warning.
        """
        tree = getattr(self, "tree", None)
        if not tree:
            return  # Don't update if treeview is gone

        try:
            # The existence check sits inside the try block because it raises
            # TclError when the widget's application has been destroyed.
            if not tree.winfo_exists():
                return

            # Clear existing data
            # Using get_children() can be slow on very large trees, but ok here
            for item in tree.get_children():
                tree.delete(item)

            # Insert new data
            for row_data in output_rows:
                cleaned_row = [str(item).replace("[bold]", "").replace("[/bold]", "") for item in row_data]
                if len(cleaned_row) == len(self.columns):
                    tree.insert("", tk.END, values=cleaned_row)
                else:
                    logger.warning(f"Skipping invalid GUI row data: {row_data}")
        except tk.TclError as e:
            logger.warning(f"TclError updating Treeview (likely widget destroyed): {e}")
        except Exception as e:
            logger.error(f"Unexpected error updating Treeview data: {e}", exc_info=True)

    def start(self, update_callback: callable):
        """
        Starts the GUI event loop and periodic updates.

        Parameters
        ----------
        update_callback : callable
            Called with no arguments on every refresh; its return value is
            passed to ``_update_table_data`` as the rows to display.

        Notes
        -----
        This call blocks in ``root.mainloop()`` until the window is closed.
        """
        if not self._window_alive():
            logger.error("Cannot start GUI: Root window not initialized or already destroyed.")
            return
        if self._running:
            logger.warning("GUI already running.")
            return

        logger.info("Starting GUI display loop...")
        self._update_callback = update_callback
        self._running = True
        try:
            # Schedule the first update slightly delayed to allow window to draw
            self.root.after(200, self._periodic_update)
            self.root.mainloop()  # BLOCKS HERE
        except tk.TclError as e:
            # Catch errors related to application destruction gracefully
            if "application has been destroyed" not in str(e):
                logger.error(f"Tkinter error during GUI startup or mainloop: {e}")
        except Exception as e:
            logger.error(f"Unexpected error in GUI main loop: {e}", exc_info=True)
        finally:
            logger.info("GUI mainloop finished.")
            self._running = False  # Ensure state is updated on exit

    def stop(self):
        """Stops the GUI update loop and destroys the window."""
        logger.debug("GUI stop requested.")
        self._running = False  # Signal periodic update to stop
        if hasattr(self, "root") and self.root:
            try:
                # Check if window exists before destroying
                if self.root.winfo_exists():
                    logger.debug("Destroying GUI root window.")
                    self.root.destroy()
            except tk.TclError as e:
                # Ignore error if window is already destroyed
                logger.debug(f"TclError during GUI stop (likely already destroyed): {e}")
            except Exception as e:
                logger.error(f"Error destroying GUI window: {e}", exc_info=True)
            finally:
                self.root = None  # Clear reference
        # Reset singleton instance if this is the active one
        if GuiUtilizationDisplay._instance is self:
            GuiUtilizationDisplay._instance = None
|
|
File without changes
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from pydantic import ConfigDict, BaseModel
|
|
8
|
+
|
|
9
|
+
from nv_ingest.framework.schemas.framework_job_counter_schema import JobCounterSchema
|
|
10
|
+
from nv_ingest.framework.schemas.framework_message_broker_sink_schema import MessageBrokerTaskSinkSchema
|
|
11
|
+
from nv_ingest.framework.schemas.framework_message_broker_source_schema import MessageBrokerTaskSourceSchema
|
|
12
|
+
from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
|
|
13
|
+
from nv_ingest.framework.schemas.framework_otel_meter_schema import OpenTelemetryMeterSchema
|
|
14
|
+
from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
|
|
15
|
+
from nv_ingest.framework.schemas.framework_vdb_task_sink_schema import VdbTaskSinkSchema
|
|
16
|
+
from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
|
|
17
|
+
from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
|
|
18
|
+
from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
|
|
19
|
+
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
|
|
20
|
+
from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
|
|
21
|
+
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
|
|
22
|
+
from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
|
|
23
|
+
from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
|
|
24
|
+
from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
|
|
25
|
+
from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
|
|
26
|
+
from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
|
|
27
|
+
from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
|
|
28
|
+
from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class PipelineConfigSchema(BaseModel):
    """
    Top-level pipeline configuration that aggregates one sub-schema per stage.

    Every field defaults to an instance of its schema built with no arguments,
    so ``PipelineConfigSchema()`` is valid without any input. The defaults are
    constructed when this class body is executed. Unknown top-level keys are
    rejected because ``extra="forbid"`` is set.
    """

    # NOTE(review): this is the only field suffixed "_schema" rather than
    # "_module"; renaming it would change the accepted config key.
    audio_extractor_schema: AudioExtractorSchema = AudioExtractorSchema()
    chart_extractor_module: ChartExtractorSchema = ChartExtractorSchema()
    text_splitter_module: TextSplitterSchema = TextSplitterSchema()
    embedding_storage_module: EmbeddingStorageSchema = EmbeddingStorageSchema()
    embed_extractions_module: TextEmbeddingSchema = TextEmbeddingSchema()
    image_caption_extraction_module: ImageCaptionExtractionSchema = ImageCaptionExtractionSchema()
    image_dedup_module: ImageDedupSchema = ImageDedupSchema()
    image_filter_module: ImageFilterSchema = ImageFilterSchema()
    image_storage_module: ImageStorageModuleSchema = ImageStorageModuleSchema()
    infographic_extractor_module: InfographicExtractorSchema = InfographicExtractorSchema()
    job_counter_module: JobCounterSchema = JobCounterSchema()
    metadata_injection_module: MetadataInjectorSchema = MetadataInjectorSchema()
    otel_meter_module: OpenTelemetryMeterSchema = OpenTelemetryMeterSchema()
    otel_tracer_module: OpenTelemetryTracerSchema = OpenTelemetryTracerSchema()
    pdf_extractor_module: PDFExtractorSchema = PDFExtractorSchema()
    pptx_extractor_module: PPTXExtractorSchema = PPTXExtractorSchema()
    redis_task_sink: MessageBrokerTaskSinkSchema = MessageBrokerTaskSinkSchema()
    redis_task_source: MessageBrokerTaskSourceSchema = MessageBrokerTaskSourceSchema()
    table_extractor_module: TableExtractorSchema = TableExtractorSchema()
    vdb_task_sink: VdbTaskSinkSchema = VdbTaskSinkSchema()
    # Reject any key that is not one of the fields declared above.
    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from pydantic import ConfigDict, BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class JobCounterSchema(BaseModel):
    """Configuration for the job counter stage; unknown keys are rejected."""

    # Name associated with the counter; defaults to "job_counter".
    name: str = "job_counter"
    # Presumably controls whether stage errors are re-raised rather than
    # recorded; the consumer is not visible here, so verify against the stage.
    raise_on_failure: bool = False
    # Reject any key that is not a declared field.
    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, BaseModel
|
|
7
|
+
|
|
8
|
+
from typing_extensions import Annotated
|
|
9
|
+
|
|
10
|
+
from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MessageBrokerTaskSinkSchema(BaseModel):
    """
    Configuration for the message-broker task sink.

    No ``model_config`` is declared on this model, so pydantic's default
    handling of unknown keys applies.
    """

    # Connection settings for the broker client; defaults to the client
    # schema built with no arguments.
    broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema()

    # Presumably controls whether stage errors are re-raised; verify against
    # the consuming stage.
    raise_on_failure: bool = False

    # Integer constrained to >= 1, default 6.
    # NOTE(review): the meaning of "progress engines" is not shown here.
    progress_engines: Annotated[int, Field(ge=1)] = 6
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, BaseModel
|
|
7
|
+
|
|
8
|
+
from typing_extensions import Annotated
|
|
9
|
+
|
|
10
|
+
from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MessageBrokerTaskSourceSchema(BaseModel):
    """
    Configuration for the message-broker task source.

    No ``model_config`` is declared on this model, so pydantic's default
    handling of unknown keys applies.
    """

    # Connection settings for the broker client; defaults to the client
    # schema built with no arguments.
    broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema()

    # Queue name; defaults to "ingest_task_queue".
    task_queue: str = "ingest_task_queue"
    # Presumably controls whether stage errors are re-raised; verify against
    # the consuming stage.
    raise_on_failure: bool = False

    # Integer constrained to >= 1, default 6.
    # NOTE(review): the meaning of "progress engines" is not shown here.
    progress_engines: Annotated[int, Field(ge=1)] = 6
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from pydantic import ConfigDict, BaseModel
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MetadataInjectorSchema(BaseModel):
    """Configuration for the metadata injector stage; unknown keys are rejected."""

    # Presumably controls whether stage errors are re-raised; verify against
    # the consuming stage.
    raise_on_failure: bool = False
    # Reject any key that is not a declared field.
    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from pydantic import ConfigDict, BaseModel
|
|
7
|
+
|
|
8
|
+
from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class OpenTelemetryMeterSchema(BaseModel):
    """Configuration for the OpenTelemetry meter stage; unknown keys are rejected."""

    # Connection settings for the broker client; defaults to the client
    # schema built with no arguments.
    broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema()

    # Endpoint string in host:port form; defaults to "localhost:4317".
    otel_endpoint: str = "localhost:4317"
    # Presumably controls whether stage errors are re-raised; verify against
    # the consuming stage.
    raise_on_failure: bool = False
    # Reject any key that is not a declared field.
    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from pydantic import ConfigDict, BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class OpenTelemetryTracerSchema(BaseModel):
    """Configuration for the OpenTelemetry tracer stage; unknown keys are rejected."""

    # Endpoint string in host:port form; defaults to "localhost:4317".
    otel_endpoint: str = "localhost:4317"
    # Presumably controls whether stage errors are re-raised; verify against
    # the consuming stage.
    raise_on_failure: bool = False
    # Reject any key that is not a declared field.
    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict
|
|
6
|
+
from enum import Enum
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ConversionStatus(str, Enum):
    """
    Status values for a processing job.

    Members are ``str`` subclasses, so they compare equal to their plain
    string values (for example ``ConversionStatus.SUCCESS == "success"``).

    This class is an ``Enum``, not a pydantic model, so it must not carry a
    ``model_config`` assignment: inside an Enum body such an assignment defines
    an extra enum member instead of configuring anything, which would make
    that bogus value an accepted status.
    """

    IN_PROGRESS = "in_progress"
    SUCCESS = "success"
    FAILED = "failed"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ProcessingJob(BaseModel):
    """
    Record for one submitted job and its processing outcome.

    ``submitted_job_id``, ``filename`` and ``status`` are required; the
    remaining fields have defaults. Unknown keys are rejected.
    """

    # Identifier of the submitted job (required).
    submitted_job_id: str
    # File name associated with the job (required).
    filename: str
    # Result payload as a string; defaults to empty.
    raw_result: str = ""
    # Content as a string; defaults to empty.
    content: str = ""
    # Current status of the job (required).
    status: ConversionStatus
    # Error text, or None when no error is recorded.
    error: str | None = None

    # Reject any key that is not a declared field.
    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from pydantic import ConfigDict, BaseModel
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TaskInjectionSchema(BaseModel):
    """Configuration for the task injection stage; unknown keys are rejected."""

    # Presumably controls whether stage errors are re-raised; verify against
    # the consuming stage.
    raise_on_failure: bool = False
    # Reject any key that is not a declared field.
    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import typing
|
|
8
|
+
|
|
9
|
+
import pymilvus
|
|
10
|
+
from pydantic import field_validator, ConfigDict, BaseModel
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
from typing_extensions import Annotated
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def build_default_milvus_config(embedding_size: int = 1024) -> typing.Dict[str, typing.Any]:
    """
    Build the default Milvus resource configuration for an NV-Ingest collection.

    The result has two sections:

    * ``index_conf`` - a ``GPU_CAGRA`` index with the ``L2`` metric on the
      ``vector`` field.
    * ``schema_conf`` - a dynamic-field schema whose fields are ``pk``
      (auto-generated INT64 primary key), ``text`` (VARCHAR), ``vector``
      (FLOAT_VECTOR), ``source`` (JSON) and ``content_metadata`` (JSON), each
      serialized with ``FieldSchema.to_dict()``.

    Parameters
    ----------
    embedding_size : int
        Dimension of the ``vector`` field.

    Returns
    -------
    typing.Dict[str, Any]
        A dictionary containing the configuration settings for Milvus.
    """
    data_type = pymilvus.DataType

    index_conf = {
        "field_name": "vector",
        "metric_type": "L2",
        "index_type": "GPU_CAGRA",
        "params": {
            "intermediate_graph_degree": 128,
            "graph_degree": 64,
            "build_algo": "NN_DESCENT",
        },
    }

    # Keyword arguments for each FieldSchema, in collection order.
    field_specs = [
        {
            "name": "pk",
            "dtype": data_type.INT64,
            "description": "Primary key for the collection",
            "is_primary": True,
            "auto_id": True,
        },
        {
            "name": "text",
            "dtype": data_type.VARCHAR,
            "description": "Extracted content",
            "max_length": 65_535,
        },
        {
            "name": "vector",
            "dtype": data_type.FLOAT_VECTOR,
            "description": "Embedding vectors",
            "dim": embedding_size,
        },
        {
            "name": "source",
            "dtype": data_type.JSON,
            "description": "Source document and raw data extracted content",
        },
        {
            "name": "content_metadata",
            "dtype": data_type.JSON,
            "description": "Content metadata",
        },
    ]

    schema_conf = {
        "enable_dynamic_field": True,
        "schema_fields": [pymilvus.FieldSchema(**spec).to_dict() for spec in field_specs],
        "description": "NV-INGEST collection schema",
    }

    return {"index_conf": index_conf, "schema_conf": schema_conf}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class VdbTaskSinkSchema(BaseModel):
    """
    Configuration for the vector-database task sink.

    All fields have defaults. ``service`` and ``default_resource_name`` must
    be non-empty, which is enforced by the "before" validators below. Unknown
    keys are rejected.
    """

    recreate: bool = False
    # Service name; the validator's error message says a serialized
    # VectorDBService instance is also accepted.
    service: str = "milvus"
    is_service_serialized: bool = False
    default_resource_name: str = "nv_ingest_collection"
    # Built once when the class body executes: the key is the literal default
    # name assigned on the previous line, and the value is the default Milvus
    # config. Overriding default_resource_name on an instance does not change
    # this key.
    resource_schemas: dict = {default_resource_name: build_default_milvus_config()}
    resource_kwargs: dict = Field(default_factory=dict)
    service_kwargs: dict = {}
    batch_size: int = 5120
    # NOTE(review): units of the two intervals are not shown here; presumably seconds.
    write_time_interval: float = 1.0
    retry_interval: float = 60.0
    raise_on_failure: bool = False
    # Integer constrained to >= 1, default 1.
    progress_engines: Annotated[int, Field(ge=1)] = 1

    @field_validator("service", mode="before")
    @classmethod
    def validate_service(cls, to_validate):  # pylint: disable=no-self-argument
        """Reject a falsy ``service`` value (None, empty string, ...) before type validation."""
        if not to_validate:
            raise ValueError("Service must be a service name or a serialized instance of VectorDBService")
        return to_validate

    @field_validator("default_resource_name", mode="before")
    @classmethod
    def validate_resource_name(cls, to_validate):  # pylint: disable=no-self-argument
        """Reject a falsy ``default_resource_name`` value before type validation."""
        if not to_validate:
            raise ValueError("Resource name must not be None or Empty.")
        return to_validate

    # Reject any key that is not a declared field.
    model_config = ConfigDict(extra="forbid")
|