nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (100) hide show
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
@@ -0,0 +1,309 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from typing import Optional
6
+ from rich.console import Console
7
+ from rich.live import Live
8
+ from rich.table import Table
9
+ import tkinter as tk
10
+ from tkinter import ttk
11
+ from tkinter.ttk import Style
12
+ import logging
13
+ import time
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ # --- Utilization Display Class (Rich Console) ---
19
class UtilizationDisplay:
    """
    Render pipeline utilization snapshots in place on the console via Rich.

    A single ``rich.live.Live`` region is created lazily and re-rendered with a
    freshly built table on every ``update`` call.

    Parameters
    ----------
    refresh_rate : float
        Divisor used to derive Live's ``refresh_per_second`` (``1.0 / refresh_rate``),
        so it effectively acts as the number of seconds between automatic refreshes.
    """

    def __init__(self, refresh_rate: float = 2):
        self.console = Console()
        self.live: Optional[Live] = None
        self.refresh_rate = refresh_rate

    def _create_table(self):
        """Build an empty, timestamp-captioned table with the six status columns."""
        snapshot = Table(title="Pipeline Status Snapshot", caption=f"Updated: {time.strftime('%Y-%m-%d %H:%M:%S')}")
        # (header, add_column keyword options) in display order.
        column_specs = (
            ("Stage", {"justify": "left", "style": "cyan", "no_wrap": True}),
            ("Replicas (cur/max [min])", {"justify": "right", "style": "magenta"}),
            ("Input Queues (occ/max)", {"justify": "left", "style": "green"}),
            ("State", {"justify": "left", "style": "yellow"}),
            ("Processing", {"justify": "right", "style": "red"}),
            ("In Flight (proc+queued)", {"justify": "right", "style": "bright_blue"}),
        )
        for header, options in column_specs:
            snapshot.add_column(header, **options)
        return snapshot

    def start(self):
        """Create and start the Live region; a no-op when one is already active."""
        if self.live is not None:
            return

        self.live = Live(
            self._create_table(),
            console=self.console,
            # Live expects a frequency, so the configured value is inverted.
            refresh_per_second=1.0 / self.refresh_rate,
            transient=False,
            vertical_overflow="visible",
        )
        self.live.start(refresh=True)
        logger.debug("Rich Utilization display started.")

    def update(self, output_rows):
        """
        Redraw the display with ``output_rows``, starting it first if needed.

        Rows that do not have exactly six entries are skipped with a warning.
        Errors raised while pushing the table to Rich are logged, not re-raised.
        """
        if self.live is None:
            self.start()
        if not self.live:
            return

        snapshot = self._create_table()
        for row in output_rows:
            if len(row) != 6:
                logger.warning(f"Skipping invalid Rich row for display: {row}")
                continue
            snapshot.add_row(*row)

        try:
            self.live.update(snapshot, refresh=True)
        except Exception as e:
            logger.error(f"Error updating Rich Live display: {e}", exc_info=False)

    def stop(self):
        """Stop the Live region (if any) and always drop the reference to it."""
        if self.live is None:
            return

        try:
            self.live.stop()
            logger.debug("Rich Utilization display stopped.")
        except Exception as e:
            logger.error(f"Error stopping Rich Live display: {e}")
        finally:
            self.live = None
76
+
77
+
78
class GuiUtilizationDisplay:
    """
    Displays pipeline status in a Tkinter GUI window using a Treeview.
    Attempts to mimic console colors with a black background using ttk.Style.

    The class behaves as a singleton: ``__new__`` hands back the cached instance
    when one exists. When a Tk root cannot be created, ``__new__`` returns ``None``
    instead of raising, so callers must check the constructed value before use.
    ``stop()`` clears the cached instance so a later construction builds a new window.
    """

    _instance = None  # Cached singleton; cleared again by stop().

    def __new__(cls, *args, **kwargs):
        """
        Return the singleton instance, creating it on first use.

        Returns
        -------
        GuiUtilizationDisplay or None
            ``None`` when probing Tk raises ``tk.TclError``. Because ``None`` is not an
            instance of the class, Python skips ``__init__`` in that case.
        """
        if cls._instance is None:
            try:
                # Check for display availability before creating the main window
                root_test = tk.Tk()
                root_test.withdraw()
                root_test.destroy()
                cls._instance = super(GuiUtilizationDisplay, cls).__new__(cls)
                cls._instance._initialized = False
                logger.info("GUI mode enabled. Tkinter seems available.")
            except tk.TclError as e:
                logger.error(
                    f"Cannot initialize Tkinter GUI (maybe no display available?): {e}. Falling back to console."
                )
                cls._instance = None  # Explicitly set to None on failure
                return None  # Signal failure
        return cls._instance

    def __init__(self, title="Pipeline Status", refresh_rate_ms=5000):
        """
        Build the window, styles, Treeview and scrollbar (first construction only).

        Parameters
        ----------
        title : str
            Window title.
        refresh_rate_ms : int
            Delay, in milliseconds, passed to ``root.after`` between periodic updates.

        Raises
        ------
        RuntimeError
            If the main Tk window cannot be created.
        """
        # Prevent re-initialization for singleton
        if hasattr(self, "_initialized") and self._initialized:
            return

        # Ensure root window exists before proceeding
        if not hasattr(self, "root") or self.root is None:
            try:
                self.root = tk.Tk()
                self.root.title(title)
                self.root.protocol("WM_DELETE_WINDOW", self.stop)
                self.root.geometry("1024x400")  # Set initial size
            except tk.TclError as e:
                logger.error(f"Failed to create main Tkinter window: {e}")
                self.root = None
                raise RuntimeError("Failed to initialize GUI window") from e

        self.refresh_rate_ms = refresh_rate_ms
        self._update_callback = None
        self._running = False

        # --- Style Configuration ---
        self.style = Style()
        try:
            self.style.theme_use("clam")
        except tk.TclError:
            logger.warning("Failed to set 'clam' theme, using default ttk theme.")

        # Define colors
        BG_COLOR = "black"
        FG_COLOR = "white"
        HEADING_FG = "white"

        # Configure Treeview style
        self.style.configure(
            "Treeview", background=BG_COLOR, fieldbackground=BG_COLOR, foreground=FG_COLOR, borderwidth=0
        )

        # Configure Heading style
        self.style.configure(
            "Treeview.Heading",
            background=BG_COLOR,
            foreground=HEADING_FG,
            font=("Helvetica", 10, "bold"),
            relief="flat",
        )

        # Improve selected item appearance
        self.style.map("Treeview", background=[("selected", "#222222")], foreground=[("selected", FG_COLOR)])
        self.style.map("Treeview.Heading", relief=[("active", "flat"), ("pressed", "flat")])

        # --- SCROLLBAR STYLE CONFIGURATION ---
        # ttk names the scrollbar style "TScrollbar"; a vertical scrollbar looks up
        # "Vertical.TScrollbar". Any other name is silently ignored by the widget.
        self.style.configure(
            "Vertical.TScrollbar",
            gripcount=0,
            background="#444444",  # Color of the slider handle
            darkcolor="#555555",  # Shading color (theme dependent)
            lightcolor="#555555",  # Shading color (theme dependent)
            troughcolor=BG_COLOR,  # Background of the scrollbar track
            bordercolor=BG_COLOR,  # Border color (try to match background)
            arrowcolor=FG_COLOR,  # Color of the arrows
            relief="flat",
            arrowsize=12,  # Adjust arrow size if needed
        )

        # Define columns
        self.columns = (
            "Stage",
            "Replicas (cur/max [min])",
            "Input Queues (occ/max)",
            "State",
            "Processing",
            "In Flight (proc+queued)",
        )

        # Create Treeview
        self.tree = ttk.Treeview(self.root, columns=self.columns, show="headings", style="Treeview")

        # Configure headings and column properties
        for col in self.columns:
            self.tree.heading(col, text=col, anchor=tk.CENTER)
            # Set column widths and alignment
            if col == "Stage":
                self.tree.column(col, width=180, anchor=tk.W, stretch=tk.NO)
            elif col == "Input Queues (occ/max)":
                self.tree.column(col, width=180, anchor=tk.W, stretch=tk.NO)
            elif col == "Replicas (cur/max [min])":
                self.tree.column(col, width=150, anchor=tk.CENTER, stretch=tk.NO)
            elif col == "State":
                self.tree.column(col, width=100, anchor=tk.CENTER, stretch=tk.NO)
            else:
                self.tree.column(col, width=100, anchor=tk.CENTER, stretch=tk.YES)

        # --- SCROLLBAR INSTANTIATION ---
        # No explicit 'style' argument: with orient=VERTICAL the widget uses the
        # "Vertical.TScrollbar" style configured above.
        scrollbar = ttk.Scrollbar(self.root, orient=tk.VERTICAL, command=self.tree.yview)

        self.tree.configure(yscrollcommand=scrollbar.set)

        # Layout
        self.tree.grid(row=0, column=0, sticky="nsew")
        scrollbar.grid(row=0, column=1, sticky="ns")
        self.root.grid_rowconfigure(0, weight=1)
        self.root.grid_columnconfigure(0, weight=1)

        self._initialized = True
        logger.debug("GUIUtilizationDisplay initialized with custom styles.")

    def _periodic_update(self):
        """Internal method called by Tkinter's 'after' mechanism."""
        if not self._running or self._update_callback is None:
            return

        try:
            # Check if root window still exists before proceeding
            if not (hasattr(self, "root") and self.root and self.root.winfo_exists()):
                self._running = False  # Stop if window is gone
                return

            output_rows = self._update_callback()
            self._update_table_data(output_rows)

        except Exception as e:
            # Avoid logging excessively if window closed during update
            if self._running and self.root and self.root.winfo_exists():
                logger.error(f"Error during GUI periodic update: {e}", exc_info=True)

        # Schedule the next update only if still running and window exists
        if self._running and self.root and self.root.winfo_exists():
            try:
                self.root.after(self.refresh_rate_ms, self._periodic_update)
            except tk.TclError:  # Handle race condition where root is destroyed between check and call
                logger.warning("GUI window closed during periodic update scheduling.")
                self._running = False

    def _update_table_data(self, output_rows):
        """
        Populates the Treeview with new data.

        Existing rows are removed first. Each incoming row is stringified with the
        ``[bold]``/``[/bold]`` console markup stripped; rows whose length differs from
        the number of columns are skipped with a warning.
        """
        if not (hasattr(self, "tree") and self.tree and self.tree.winfo_exists()):
            return  # Don't update if treeview is gone

        try:
            # Clear existing data
            # Using get_children() can be slow on very large trees, but ok here
            for item in self.tree.get_children():
                self.tree.delete(item)

            # Insert new data
            for row_data in output_rows:
                cleaned_row = [str(item).replace("[bold]", "").replace("[/bold]", "") for item in row_data]
                if len(cleaned_row) == len(self.columns):
                    self.tree.insert("", tk.END, values=cleaned_row)
                else:
                    logger.warning(f"Skipping invalid GUI row data: {row_data}")
        except tk.TclError as e:
            logger.warning(f"TclError updating Treeview (likely widget destroyed): {e}")
        except Exception as e:
            logger.error(f"Unexpected error updating Treeview data: {e}", exc_info=True)

    def start(self, update_callback: callable):
        """
        Starts the GUI event loop and periodic updates.

        Blocks in ``mainloop()`` until the window is closed or destroyed.

        Parameters
        ----------
        update_callback : callable
            Zero-argument callable whose return value is passed to
            ``_update_table_data`` on every refresh.
        """
        if not (hasattr(self, "root") and self.root and self.root.winfo_exists()):
            logger.error("Cannot start GUI: Root window not initialized or already destroyed.")
            return
        if self._running:
            logger.warning("GUI already running.")
            return

        logger.info("Starting GUI display loop...")
        self._update_callback = update_callback
        self._running = True
        try:
            # Schedule the first update slightly delayed to allow window to draw
            self.root.after(200, self._periodic_update)
            self.root.mainloop()  # BLOCKS HERE
        except tk.TclError as e:
            # Catch errors related to application destruction gracefully
            if "application has been destroyed" not in str(e):
                logger.error(f"Tkinter error during GUI startup or mainloop: {e}")
        except Exception as e:
            logger.error(f"Unexpected error in GUI main loop: {e}", exc_info=True)
        finally:
            logger.info("GUI mainloop finished.")
            self._running = False  # Ensure state is updated on exit

    def stop(self):
        """Stops the GUI update loop and destroys the window."""
        logger.debug("GUI stop requested.")
        self._running = False  # Signal periodic update to stop
        if hasattr(self, "root") and self.root:
            try:
                # Check if window exists before destroying
                if self.root.winfo_exists():
                    logger.debug("Destroying GUI root window.")
                    self.root.destroy()
            except tk.TclError as e:
                # Ignore error if window is already destroyed
                logger.debug(f"TclError during GUI stop (likely already destroyed): {e}")
            except Exception as e:
                logger.error(f"Error destroying GUI window: {e}", exc_info=True)
            finally:
                self.root = None  # Clear reference
        # Reset singleton instance if this is the active one
        if GuiUtilizationDisplay._instance is self:
            GuiUtilizationDisplay._instance = None
File without changes
@@ -0,0 +1,54 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+
7
+ from pydantic import ConfigDict, BaseModel
8
+
9
+ from nv_ingest.framework.schemas.framework_job_counter_schema import JobCounterSchema
10
+ from nv_ingest.framework.schemas.framework_message_broker_sink_schema import MessageBrokerTaskSinkSchema
11
+ from nv_ingest.framework.schemas.framework_message_broker_source_schema import MessageBrokerTaskSourceSchema
12
+ from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
13
+ from nv_ingest.framework.schemas.framework_otel_meter_schema import OpenTelemetryMeterSchema
14
+ from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
15
+ from nv_ingest.framework.schemas.framework_vdb_task_sink_schema import VdbTaskSinkSchema
16
+ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
17
+ from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
18
+ from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
19
+ from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
20
+ from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
21
+ from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
22
+ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
23
+ from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
24
+ from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
25
+ from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
26
+ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
27
+ from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
28
+ from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class PipelineConfigSchema(BaseModel):
    """
    Aggregate pipeline configuration with one field per stage/module schema.

    Every field defaults to a default-constructed instance of its schema type
    (built once, when this class body is executed). Unknown keys are rejected
    because ``extra="forbid"`` is set.
    """

    # NOTE: this field is suffixed ``_schema`` while its siblings use ``_module``.
    audio_extractor_schema: AudioExtractorSchema = AudioExtractorSchema()
    chart_extractor_module: ChartExtractorSchema = ChartExtractorSchema()
    text_splitter_module: TextSplitterSchema = TextSplitterSchema()
    embedding_storage_module: EmbeddingStorageSchema = EmbeddingStorageSchema()
    embed_extractions_module: TextEmbeddingSchema = TextEmbeddingSchema()
    image_caption_extraction_module: ImageCaptionExtractionSchema = ImageCaptionExtractionSchema()
    image_dedup_module: ImageDedupSchema = ImageDedupSchema()
    image_filter_module: ImageFilterSchema = ImageFilterSchema()
    image_storage_module: ImageStorageModuleSchema = ImageStorageModuleSchema()
    infographic_extractor_module: InfographicExtractorSchema = InfographicExtractorSchema()
    job_counter_module: JobCounterSchema = JobCounterSchema()
    metadata_injection_module: MetadataInjectorSchema = MetadataInjectorSchema()
    otel_meter_module: OpenTelemetryMeterSchema = OpenTelemetryMeterSchema()
    otel_tracer_module: OpenTelemetryTracerSchema = OpenTelemetryTracerSchema()
    pdf_extractor_module: PDFExtractorSchema = PDFExtractorSchema()
    pptx_extractor_module: PPTXExtractorSchema = PPTXExtractorSchema()
    # The two ``redis_*`` fields are typed with the generic message-broker schemas.
    redis_task_sink: MessageBrokerTaskSinkSchema = MessageBrokerTaskSinkSchema()
    redis_task_source: MessageBrokerTaskSourceSchema = MessageBrokerTaskSourceSchema()
    table_extractor_module: TableExtractorSchema = TableExtractorSchema()
    vdb_task_sink: VdbTaskSinkSchema = VdbTaskSinkSchema()
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,12 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ from pydantic import ConfigDict, BaseModel
7
+
8
+
9
class JobCounterSchema(BaseModel):
    """Configuration for the job counter stage; unknown keys are rejected."""

    # Identifier for the counter; defaults to "job_counter".
    name: str = "job_counter"
    # Presumably controls whether stage failures propagate as exceptions -- confirm in the consuming stage.
    raise_on_failure: bool = False
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,18 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ from pydantic import Field, BaseModel
7
+
8
+ from typing_extensions import Annotated
9
+
10
+ from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema
11
+
12
+
13
class MessageBrokerTaskSinkSchema(BaseModel):
    """
    Configuration for the message-broker task sink.

    NOTE(review): unlike most sibling schemas, no ``model_config`` with
    ``extra="forbid"`` is declared here, so pydantic's default handling of
    unknown keys applies -- confirm whether that is intentional.
    """

    # Connection settings for the broker client; defaults to a default-constructed schema.
    broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema()

    raise_on_failure: bool = False

    # Must be an integer >= 1 (enforced via ``Field(ge=1)``); defaults to 6.
    progress_engines: Annotated[int, Field(ge=1)] = 6
@@ -0,0 +1,19 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ from pydantic import Field, BaseModel
7
+
8
+ from typing_extensions import Annotated
9
+
10
+ from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema
11
+
12
+
13
class MessageBrokerTaskSourceSchema(BaseModel):
    """
    Configuration for the message-broker task source.

    NOTE(review): no ``model_config`` with ``extra="forbid"`` is declared here,
    so pydantic's default handling of unknown keys applies -- confirm whether
    that is intentional.
    """

    # Connection settings for the broker client; defaults to a default-constructed schema.
    broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema()

    # Name of the queue to read tasks from; defaults to "ingest_task_queue".
    task_queue: str = "ingest_task_queue"
    raise_on_failure: bool = False

    # Must be an integer >= 1 (enforced via ``Field(ge=1)``); defaults to 6.
    progress_engines: Annotated[int, Field(ge=1)] = 6
@@ -0,0 +1,5 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class MessageWrapper(BaseModel):
    """Minimal envelope model carrying a single required string ``payload``."""

    payload: str
@@ -0,0 +1,15 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from pydantic import ConfigDict, BaseModel
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class MetadataInjectorSchema(BaseModel):
    """Configuration for the metadata injector stage; unknown keys are rejected."""

    raise_on_failure: bool = False
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,16 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ from pydantic import ConfigDict, BaseModel
7
+
8
+ from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema
9
+
10
+
11
class OpenTelemetryMeterSchema(BaseModel):
    """Configuration for the OpenTelemetry meter stage; unknown keys are rejected."""

    # Connection settings for the broker client; defaults to a default-constructed schema.
    broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema()

    # Endpoint given as "host:port"; defaults to "localhost:4317".
    otel_endpoint: str = "localhost:4317"
    raise_on_failure: bool = False
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,12 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ from pydantic import ConfigDict, BaseModel
7
+
8
+
9
class OpenTelemetryTracerSchema(BaseModel):
    """Configuration for the OpenTelemetry tracer stage; unknown keys are rejected."""

    # Endpoint given as "host:port"; defaults to "localhost:4317".
    otel_endpoint: str = "localhost:4317"
    raise_on_failure: bool = False
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,25 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from pydantic import BaseModel, ConfigDict
6
+ from enum import Enum
7
+
8
+
9
class ConversionStatus(str, Enum):
    """
    Lifecycle states of a conversion job, as string-valued enum members.

    The ``str`` mixin makes every member compare equal to its plain string value.
    """

    IN_PROGRESS = "in_progress"
    SUCCESS = "success"
    FAILED = "failed"

    # NOTE: deliberately no ``model_config`` here. This is an Enum, not a pydantic
    # model, so any extra class-level assignment is registered as an additional
    # enum member (and would then be accepted as a valid status value).
15
+
16
+
17
class ProcessingJob(BaseModel):
    """
    Record describing a single processing job and its outcome.

    ``submitted_job_id``, ``filename`` and ``status`` are required; the result
    fields default to empty strings and ``error`` defaults to ``None``.
    Unknown keys are rejected.
    """

    submitted_job_id: str
    filename: str
    raw_result: str = ""
    content: str = ""
    status: ConversionStatus
    # Optional error text; ``None`` when no error has been recorded.
    error: str | None = None

    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,15 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from pydantic import ConfigDict, BaseModel
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class TaskInjectionSchema(BaseModel):
    """Configuration for the task injection stage; unknown keys are rejected."""

    raise_on_failure: bool = False
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,112 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ import typing
8
+
9
+ import pymilvus
10
+ from pydantic import field_validator, ConfigDict, BaseModel
11
+ from pydantic import Field
12
+ from typing_extensions import Annotated
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def build_default_milvus_config(embedding_size: int = 1024) -> typing.Dict[str, typing.Any]:
    """
    Assemble the default Milvus collection configuration.

    The result has two top-level entries: ``index_conf``, describing a
    ``GPU_CAGRA`` index with the ``L2`` metric on the ``vector`` field, and
    ``schema_conf``, a dynamic-field-enabled schema whose fields are ``pk``,
    ``text``, ``vector``, ``source`` and ``content_metadata``. Each field is a
    ``pymilvus.FieldSchema`` serialized with ``to_dict()``.

    Parameters
    ----------
    embedding_size : int
        Dimension (``dim``) of the ``vector`` field.

    Returns
    -------
    typing.Dict[str, Any]
        A dictionary containing the configuration settings for Milvus.
    """

    index_conf = {
        "field_name": "vector",
        "metric_type": "L2",
        "index_type": "GPU_CAGRA",
        "params": {
            "intermediate_graph_degree": 128,
            "graph_degree": 64,
            "build_algo": "NN_DESCENT",
        },
    }

    # Keyword arguments for each FieldSchema, in collection order.
    data_type = pymilvus.DataType
    field_specs = (
        {
            "name": "pk",
            "dtype": data_type.INT64,
            "description": "Primary key for the collection",
            "is_primary": True,
            "auto_id": True,
        },
        {
            "name": "text",
            "dtype": data_type.VARCHAR,
            "description": "Extracted content",
            "max_length": 65_535,
        },
        {
            "name": "vector",
            "dtype": data_type.FLOAT_VECTOR,
            "description": "Embedding vectors",
            "dim": embedding_size,
        },
        {
            "name": "source",
            "dtype": data_type.JSON,
            "description": "Source document and raw data extracted content",
        },
        {
            "name": "content_metadata",
            "dtype": data_type.JSON,
            "description": "Content metadata",
        },
    )
    schema_fields = [pymilvus.FieldSchema(**spec).to_dict() for spec in field_specs]

    return {
        "index_conf": index_conf,
        "schema_conf": {
            "enable_dynamic_field": True,
            "schema_fields": schema_fields,
            "description": "NV-INGEST collection schema",
        },
    }
82
+
83
+
84
class VdbTaskSinkSchema(BaseModel):
    """
    Configuration for the vector-database task sink; unknown keys are rejected.

    ``service`` and ``default_resource_name`` are checked by "before" validators
    that reject falsy values.
    """

    recreate: bool = False
    service: str = "milvus"
    is_service_serialized: bool = False
    default_resource_name: str = "nv_ingest_collection"
    # Evaluated once, when the class body runs: the key is the literal default of
    # ``default_resource_name`` above, and the value is the default Milvus config.
    # Overriding ``default_resource_name`` on an instance does not rename this key.
    resource_schemas: dict = {default_resource_name: build_default_milvus_config()}
    resource_kwargs: dict = Field(default_factory=dict)
    service_kwargs: dict = {}
    batch_size: int = 5120
    # NOTE(review): the two intervals below look like seconds -- confirm against the consumer.
    write_time_interval: float = 1.0
    retry_interval: float = 60.0
    raise_on_failure: bool = False
    # Must be an integer >= 1 (enforced via ``Field(ge=1)``); defaults to 1.
    progress_engines: Annotated[int, Field(ge=1)] = 1

    @field_validator("service", mode="before")
    @classmethod
    def validate_service(cls, to_validate):  # pylint: disable=no-self-argument
        """Reject a falsy ``service`` value; otherwise return the input unchanged."""
        if not to_validate:
            raise ValueError("Service must be a service name or a serialized instance of VectorDBService")
        return to_validate

    @field_validator("default_resource_name", mode="before")
    @classmethod
    def validate_resource_name(cls, to_validate):  # pylint: disable=no-self-argument
        """Reject a falsy ``default_resource_name``; otherwise return the input unchanged."""
        if not to_validate:
            raise ValueError("Resource name must not be None or Empty.")
        return to_validate

    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,8 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ from .filter_by_task import filter_by_task
7
+
8
+ __all__ = ["filter_by_task"]