ignis 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis.rb +94 -0
- data/lib/nnw/platform.rb +304 -0
- data/lib/nnw/shared/event_bus.rb +240 -0
- data/lib/nnw/shared/ffi_loader.rb +63 -0
- data/lib/nnw/shared/memory_contract.rb +204 -0
- data/lib/nnw/shared/nv_array.rb +710 -0
- data/lib/nnw/shared/recovery_protocol.rb +307 -0
- data/lib/nvruby/configuration.rb +217 -0
- data/lib/nvruby/cuda/device.rb +275 -0
- data/lib/nvruby/cuda/device_props.rb +202 -0
- data/lib/nvruby/cuda/graph.rb +265 -0
- data/lib/nvruby/cuda/graph_bindings.rb +119 -0
- data/lib/nvruby/cuda/library_loader.rb +285 -0
- data/lib/nvruby/cuda/memory.rb +410 -0
- data/lib/nvruby/cuda/runtime_api.rb +804 -0
- data/lib/nvruby/cuda/stream.rb +234 -0
- data/lib/nvruby/dtype.rb +139 -0
- data/lib/nvruby/epilogues.rb +438 -0
- data/lib/nvruby/errors.rb +303 -0
- data/lib/nvruby/half.rb +97 -0
- data/lib/nvruby/jit/compiled_kernel.rb +80 -0
- data/lib/nvruby/jit/compiler.rb +231 -0
- data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
- data/lib/nvruby/jit/kernel.rb +240 -0
- data/lib/nvruby/jit/kernel_module.rb +133 -0
- data/lib/nvruby/jit/kernels/activations.rb +179 -0
- data/lib/nvruby/jit/kernels/attention.rb +504 -0
- data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
- data/lib/nvruby/jit/kernels/loss.rb +213 -0
- data/lib/nvruby/jit/kernels/normalization.rb +200 -0
- data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
- data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
- data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
- data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
- data/lib/nvruby/linalg/epilog.rb +67 -0
- data/lib/nvruby/linalg/matmul.rb +247 -0
- data/lib/nvruby/linalg/matmul_plan.rb +229 -0
- data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
- data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
- data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
- data/lib/nvruby/memory/device_memory_resource.rb +106 -0
- data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
- data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
- data/lib/nvruby/memory/stats.rb +107 -0
- data/lib/nvruby/memory.rb +124 -0
- data/lib/nvruby/version.rb +5 -0
- metadata +108 -0
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'graph_bindings'
|
|
4
|
+
require 'fiddle'
|
|
5
|
+
|
|
6
|
+
module Ignis
|
|
7
|
+
module CUDA
|
|
8
|
+
# CUDA Graph for capturing and replaying GPU operations.
|
|
9
|
+
# Provides reduced kernel launch overhead for repetitive workloads.
|
|
10
|
+
#
|
|
11
|
+
# Uses GraphBindings (FFI-based) since graph operations are NOT hot-path.
|
|
12
|
+
# Stream handles are Fiddle::Pointer — we convert for FFI interop.
|
|
13
|
+
class Graph
|
|
14
|
+
# @return [FFI::Pointer] Native CUDA graph handle
|
|
15
|
+
attr_reader :handle
|
|
16
|
+
|
|
17
|
+
# @return [Boolean] Whether the graph was created via stream capture
|
|
18
|
+
attr_reader :captured
|
|
19
|
+
|
|
20
|
+
# @return [Integer, nil] device_id for recovery coordinator invalidation
|
|
21
|
+
attr_accessor :device_id
|
|
22
|
+
|
|
23
|
+
# Build an empty native graph through cudaGraphCreate.
#
# Loads the graph bindings on demand, asks the runtime for a fresh graph
# handle and records this wrapper as live and not produced by capture.
#
# @param flags [Integer] creation flags forwarded to cudaGraphCreate (default: 0)
# @raise [CudaRuntimeError] when cudaGraphCreate returns a non-zero status
def initialize(flags: 0)
  GraphBindings.ensure_loaded!

  out = FFI::MemoryPointer.new(:pointer)
  rc = GraphBindings.cudaGraphCreate(out, flags)
  GraphBindings.check_status!(rc, 'cudaGraphCreate')

  @handle = out.read_pointer
  @captured = false
  @destroyed = false
  @device_id = nil
end
|
|
37
|
+
|
|
38
|
+
# Capture GPU operations issued on a stream into a new graph.
#
# The stream is switched into capture mode, the block is run with the
# stream, and capture is always ended afterwards. If the block raises, the
# block's own exception propagates (a failing end-of-capture status does not
# replace it) and any graph handed back by the runtime is destroyed. A
# temporary stream created here is destroyed on every exit path.
#
# @param stream [Stream, nil] Stream to capture (a temporary one is created if nil)
# @param mode [Symbol] Capture mode (:global, :thread_local, :relaxed);
#   any other value falls back to global capture
# @yield [Stream] Block containing GPU operations to capture
# @return [Graph]
# @raise [ArgumentError] when no block is given
# @raise [CudaRuntimeError] when beginning or ending the capture fails
def self.capture(stream: nil, mode: :global, &block)
  # Fail before touching the stream: without a block there is nothing to capture.
  raise ArgumentError, 'Graph.capture requires a block' unless block

  GraphBindings.ensure_loaded!

  own_stream = stream.nil?
  stream ||= Stream.new

  begin
    capture_mode = case mode
                   when :thread_local then GraphBindings::CUDA_STREAM_CAPTURE_MODE_THREAD_LOCAL
                   when :relaxed then GraphBindings::CUDA_STREAM_CAPTURE_MODE_RELAXED
                   else GraphBindings::CUDA_STREAM_CAPTURE_MODE_GLOBAL
                   end

    # Convert Fiddle::Pointer to FFI::Pointer for GraphBindings
    stream_ffi = to_ffi_ptr(stream.to_ptr)

    status = GraphBindings.cudaStreamBeginCapture(stream_ffi, capture_mode)
    GraphBindings.check_status!(status, 'cudaStreamBeginCapture')

    graph_ptr = FFI::MemoryPointer.new(:pointer)
    completed = false
    begin
      block.call(stream)
      completed = true
    ensure
      # Always leave capture mode, but never raise from here: raising inside
      # ensure would hide the exception that interrupted the block.
      end_status = GraphBindings.cudaStreamEndCapture(stream_ffi, graph_ptr)
      unless completed
        partial = graph_ptr.read_pointer
        # The caller never receives this graph, so release it (best effort).
        GraphBindings.cudaGraphDestroy(partial) if end_status.zero? && !partial.null?
      end
    end

    GraphBindings.check_status!(end_status, 'cudaStreamEndCapture')
  ensure
    # Runs on success and on every failure path above.
    stream.destroy! if own_stream
  end

  graph = allocate
  graph.instance_variable_set(:@handle, graph_ptr.read_pointer)
  graph.instance_variable_set(:@captured, true)
  graph.instance_variable_set(:@destroyed, false)
  graph.instance_variable_set(:@device_id, nil)

  graph
end
|
|
80
|
+
|
|
81
|
+
# Turn this graph into a launchable GraphExecutable.
#
# @param flags [Integer] instantiation flags handed to GraphExecutable.new
# @return [GraphExecutable]
# @raise [InvalidOperationError] when the graph has already been destroyed
def instantiate(flags: 0)
  if @destroyed
    raise InvalidOperationError, 'Graph already destroyed'
  end

  GraphExecutable.new(self, flags: flags)
end
|
|
89
|
+
|
|
90
|
+
# Duplicate the native graph with cudaGraphClone and wrap the copy.
#
# The wrapper is built with Graph.allocate (so no second cudaGraphCreate is
# issued) and inherits this graph's captured flag and device_id.
#
# @return [Graph]
# @raise [InvalidOperationError] when the graph has already been destroyed
# @raise [CudaRuntimeError] when cudaGraphClone returns a non-zero status
def clone
  raise InvalidOperationError, 'Graph already destroyed' if @destroyed

  out = FFI::MemoryPointer.new(:pointer)
  rc = GraphBindings.cudaGraphClone(out, @handle)
  GraphBindings.check_status!(rc, 'cudaGraphClone')

  state = {
    :@handle => out.read_pointer,
    :@captured => @captured,
    :@destroyed => false,
    :@device_id => @device_id
  }

  Graph.allocate.tap do |copy|
    state.each { |ivar, value| copy.instance_variable_set(ivar, value) }
  end
end
|
|
106
|
+
|
|
107
|
+
# Number of nodes in the graph.
#
# cudaGraphGetNodes is called with a NULL node array; only the count it
# writes to the size_t out-parameter is read back.
#
# @return [Integer]
# @raise [InvalidOperationError] when the graph has already been destroyed
# @raise [CudaRuntimeError] when cudaGraphGetNodes returns a non-zero status
def node_count
  raise InvalidOperationError, 'Graph already destroyed' if @destroyed

  total = FFI::MemoryPointer.new(:size_t)
  rc = GraphBindings.cudaGraphGetNodes(@handle, FFI::Pointer::NULL, total)
  GraphBindings.check_status!(rc, 'cudaGraphGetNodes')

  total.read(:size_t)
end
|
|
118
|
+
|
|
119
|
+
# Number of root nodes (nodes with no dependencies).
#
# cudaGraphGetRootNodes is called with a NULL node array; only the count it
# writes to the size_t out-parameter is read back.
#
# @return [Integer]
# @raise [InvalidOperationError] when the graph has already been destroyed
# @raise [CudaRuntimeError] when cudaGraphGetRootNodes returns a non-zero status
def root_node_count
  raise InvalidOperationError, 'Graph already destroyed' if @destroyed

  total = FFI::MemoryPointer.new(:size_t)
  rc = GraphBindings.cudaGraphGetRootNodes(@handle, FFI::Pointer::NULL, total)
  GraphBindings.check_status!(rc, 'cudaGraphGetRootNodes')

  total.read(:size_t)
end
|
|
130
|
+
|
|
131
|
+
# Whether this wrapper's native graph has been released.
# The flag starts false and is set to true by #destroy!.
# @return [Boolean]
def destroyed?
  @destroyed
end
|
|
135
|
+
|
|
136
|
+
# Release the native graph via cudaGraphDestroy.
#
# Calling it again after a successful destroy is a no-op. When the runtime
# reports a failure the error is raised and the wrapper stays marked live.
#
# @return [void]
# @raise [CudaRuntimeError] when cudaGraphDestroy returns a non-zero status
def destroy!
  unless @destroyed
    rc = GraphBindings.cudaGraphDestroy(@handle)
    GraphBindings.check_status!(rc, 'cudaGraphDestroy')

    @destroyed = true
    @handle = nil
  end
end
|
|
147
|
+
|
|
148
|
+
# Re-wrap the address held by a Fiddle::Pointer as an FFI::Pointer.
#
# nil and zero addresses both map to FFI::Pointer::NULL.
#
# @param fiddle_ptr [Fiddle::Pointer, nil]
# @return [FFI::Pointer]
def self.to_ffi_ptr(fiddle_ptr)
  address = fiddle_ptr.nil? ? 0 : fiddle_ptr.to_i
  return FFI::Pointer::NULL if address.zero?

  FFI::Pointer.new(:pointer, address)
end
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
# Executable CUDA Graph ready for launch.
|
|
159
|
+
class GraphExecutable
|
|
160
|
+
# @return [FFI::Pointer] Native executable graph handle
|
|
161
|
+
attr_reader :handle
|
|
162
|
+
|
|
163
|
+
# @return [Graph] Source graph
|
|
164
|
+
attr_reader :source_graph
|
|
165
|
+
|
|
166
|
+
# @return [Integer] Number of times this graph has been launched
|
|
167
|
+
attr_reader :launch_count
|
|
168
|
+
|
|
169
|
+
# Instantiate +graph+ into an executable graph via cudaGraphInstantiate.
#
# Keeps a reference to the source graph and starts the launch counter at zero.
#
# @param graph [Graph] Source graph
# @param flags [Integer] Instantiation flags
# @raise [CudaRuntimeError] when cudaGraphInstantiate returns a non-zero status
def initialize(graph, flags: 0)
  GraphBindings.ensure_loaded!

  out = FFI::MemoryPointer.new(:pointer)
  rc = GraphBindings.cudaGraphInstantiate(out, graph.handle, flags)
  GraphBindings.check_status!(rc, 'cudaGraphInstantiate')

  @handle = out.read_pointer
  @source_graph = graph
  @launch_count = 0
  @destroyed = false
end
|
|
183
|
+
|
|
184
|
+
# Enqueue the executable graph with cudaGraphLaunch.
#
# A NULL stream handle is passed when no stream is given. The launch counter
# is incremented only after the runtime reports success.
#
# @param stream [Stream, nil] Stream to launch on (NULL handle if nil)
# @return [self]
# @raise [InvalidOperationError] when the executable has been destroyed
# @raise [CudaRuntimeError] when cudaGraphLaunch returns a non-zero status
def launch(stream: nil)
  raise InvalidOperationError, 'GraphExecutable already destroyed' if @destroyed

  target = FFI::Pointer::NULL
  target = Graph.to_ffi_ptr(stream.to_ptr) if stream

  rc = GraphBindings.cudaGraphLaunch(@handle, target)
  GraphBindings.check_status!(rc, 'cudaGraphLaunch')

  @launch_count += 1
  self
end
|
|
197
|
+
|
|
198
|
+
# Upload the executable graph ahead of its first launch via cudaGraphUpload.
#
# @param stream [Stream, nil] Stream to upload on (NULL handle if nil)
# @return [self]
# @raise [InvalidOperationError] when the executable has been destroyed
# @raise [CudaRuntimeError] when cudaGraphUpload returns a non-zero status
def upload(stream: nil)
  raise InvalidOperationError, 'GraphExecutable already destroyed' if @destroyed

  target = FFI::Pointer::NULL
  target = Graph.to_ffi_ptr(stream.to_ptr) if stream

  rc = GraphBindings.cudaGraphUpload(@handle, target)
  GraphBindings.check_status!(rc, 'cudaGraphUpload')

  self
end
|
|
210
|
+
|
|
211
|
+
# Update the executable graph in place from its source graph.
#
# The three-argument cudaGraphExecUpdate bound in GraphBindings (the CUDA 12+
# form) writes a cudaGraphExecUpdateResultInfo struct (one enum plus two
# node handles) through its last argument, so the out-buffer must be large
# enough for the whole struct, not a single int.
#
# @return [Boolean] True if update succeeded
# @raise [InvalidOperationError] when the executable has been destroyed
def update!
  raise InvalidOperationError, 'GraphExecutable already destroyed' if @destroyed
  # A destroyed source graph has no handle to update from.
  return false if @source_graph.destroyed?

  # Three pointer-sized slots cover the result struct including padding.
  result_info = FFI::MemoryPointer.new(:pointer, 3)
  status = GraphBindings.cudaGraphExecUpdate(@handle, @source_graph.handle, result_info)

  status.zero?
end
|
|
221
|
+
|
|
222
|
+
# Whether this executable graph has been released.
# The flag starts false and is set to true by #destroy!.
# @return [Boolean]
def destroyed?
  @destroyed
end
|
|
226
|
+
|
|
227
|
+
# Release the executable graph via cudaGraphExecDestroy.
#
# Calling it again after a successful destroy is a no-op. When the runtime
# reports a failure the error is raised and the wrapper stays marked live.
#
# @return [void]
# @raise [CudaRuntimeError] when cudaGraphExecDestroy returns a non-zero status
def destroy!
  unless @destroyed
    rc = GraphBindings.cudaGraphExecDestroy(@handle)
    GraphBindings.check_status!(rc, 'cudaGraphExecDestroy')

    @destroyed = true
    @handle = nil
  end
end
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
# Shortcuts that combine stream capture, instantiation and launch.
module GraphCapture
  # Capture the block's GPU work and instantiate the result right away.
  # Capture uses Graph.capture's default mode.
  #
  # @param stream [Stream, nil]
  # @yield [Stream]
  # @return [GraphExecutable]
  def self.capture(stream: nil, &block)
    Graph.capture(stream: stream, &block).instantiate
  end

  # Capture the block, then launch the executable +repeat+ times on +stream+.
  #
  # @param stream [Stream, nil]
  # @param repeat [Integer]
  # @yield [Stream]
  # @return [GraphExecutable]
  def self.capture_and_launch(stream: nil, repeat: 1, &block)
    capture(stream: stream, &block).tap do |exec|
      repeat.times { exec.launch(stream: stream) }
    end
  end
end
|
|
264
|
+
end
|
|
265
|
+
end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ffi"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module CUDA
|
|
7
|
+
# CUDA Graphs FFI bindings
|
|
8
|
+
# Provides stream capture and graph execution for reduced launch overhead
|
|
9
|
+
module GraphBindings
|
|
10
|
+
extend FFI::Library
|
|
11
|
+
|
|
12
|
+
# CUDA Stream Capture Mode
# Integer codes handed to cudaStreamBeginCapture as its mode argument
# (Graph.capture maps :global / :thread_local / :relaxed onto these).
CUDA_STREAM_CAPTURE_MODE_GLOBAL = 0
CUDA_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1
CUDA_STREAM_CAPTURE_MODE_RELAXED = 2

# CUDA Graph Instantiate Flags
# Bit values (1, 2, 4, 8) so they can be OR-ed together.
# NOTE(review): presumably mirror the runtime's cudaGraphInstantiateFlag* enum — confirm.
CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1
CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD = 2
CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH = 4
CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY = 8

# CUDA Graph Node Types
# NOTE(review): presumably the values reported by cudaGraphNodeGetType — confirm.
CUDA_GRAPH_NODE_TYPE_KERNEL = 0
CUDA_GRAPH_NODE_TYPE_MEMCPY = 1
CUDA_GRAPH_NODE_TYPE_MEMSET = 2
CUDA_GRAPH_NODE_TYPE_HOST = 3
CUDA_GRAPH_NODE_TYPE_GRAPH = 4
CUDA_GRAPH_NODE_TYPE_EMPTY = 5
CUDA_GRAPH_NODE_TYPE_WAIT_EVENT = 6
CUDA_GRAPH_NODE_TYPE_EVENT_RECORD = 7

# Set to true by ensure_loaded! once the library is bound and functions attached.
@loaded = false
# Serializes ensure_loaded! so the binding work happens once.
@mutex = Mutex.new
|
|
35
|
+
|
|
36
|
+
class << self
|
|
37
|
+
# Ensure CUDA runtime is loaded and graph functions are attached.
#
# Runs once; later calls return immediately. The runtime is first loaded
# through LibraryLoader, and the path LibraryLoader resolved is reused as the
# fallback for the FFI binding so both loaders refer to the same library file
# instead of a hard-coded soname that may not exist on this machine.
#
# @return [void]
# @raise [LibraryNotFoundError] when no cudart path can be determined
def ensure_loaded!
  @mutex.synchronize do
    return if @loaded

    LibraryLoader.load_library(:cuda_runtime)
    # Path of the runtime LibraryLoader just located (nil if it recorded none).
    located = LibraryLoader.library_paths[:cuda_runtime]

    # Resolve cudart path per platform
    dll_path =
      if defined?(Ignis::Platform)
        Ignis::Platform.cudart_path || located
      elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
        cuda_bin = begin
          Ignis.configuration.cuda_bin_path
        rescue StandardError
          # Configuration is optional here; fall through to the other candidates.
          nil
        end
        newest = cuda_bin && Dir.glob(File.join(cuda_bin, 'cudart64_*.dll')).max
        newest || located || 'cudart64_130'
      else
        located || 'libcudart.so.13'
      end

    raise LibraryNotFoundError, 'cudart' unless dll_path

    ffi_lib dll_path
    attach_graph_functions!

    @loaded = true
  end
end
|
|
63
|
+
|
|
64
|
+
# Raise unless a CUDA call reported success (status 0).
#
# @param status [Integer] CUDA error code
# @param context [String] Name of the operation, used as the message prefix
# @return [void]
# @raise [CudaRuntimeError] carrying the status as +cuda_code+ when non-zero
def check_status!(status, context = "CUDA Graph operation")
  unless status.zero?
    raise CudaRuntimeError.new("#{context} failed", cuda_code: status)
  end
end
|
|
73
|
+
|
|
74
|
+
private
|
|
75
|
+
|
|
76
|
+
# Bind every CUDA graph entry point used by this gem.
#
# Each table entry maps a runtime symbol to its argument types; all of them
# are attached with an :int return type (the CUDA status code). Entries are
# attached in table order.
#
# @return [void]
def attach_graph_functions!
  ptr = :pointer

  signatures = {
    # Graph creation and destruction
    cudaGraphCreate: [ptr, :uint],
    cudaGraphDestroy: [ptr],
    # Stream capture for graph creation
    cudaStreamBeginCapture: [ptr, :int],
    cudaStreamEndCapture: [ptr, ptr],
    cudaStreamIsCapturing: [ptr, ptr],
    # Graph instantiation (create executable graph)
    cudaGraphInstantiate: [ptr, ptr, :uint64],
    cudaGraphInstantiateWithFlags: [ptr, ptr, :uint64],
    # Graph execution
    cudaGraphLaunch: [ptr, ptr],
    # Executable graph destruction
    cudaGraphExecDestroy: [ptr],
    # Graph update (for parameter changes without re-instantiation)
    cudaGraphExecUpdate: [ptr, ptr, ptr],
    # Graph node management
    cudaGraphGetNodes: [ptr, ptr, ptr],
    cudaGraphGetRootNodes: [ptr, ptr, ptr],
    cudaGraphNodeGetType: [ptr, ptr],
    cudaGraphGetEdges: [ptr, ptr, ptr, ptr],
    # Graph cloning
    cudaGraphClone: [ptr, ptr],
    # Empty nodes and dependency edits
    cudaGraphAddEmptyNode: [ptr, ptr, ptr, :size_t],
    cudaGraphAddDependencies: [ptr, ptr, ptr, :size_t],
    cudaGraphRemoveDependencies: [ptr, ptr, ptr, :size_t],
    # Graph upload to device (for faster first launch)
    cudaGraphUpload: [ptr, ptr]
  }

  signatures.each do |symbol, arg_types|
    attach_function symbol, arg_types, :int
  end
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'fiddle'
|
|
4
|
+
require 'fiddle/import'
|
|
5
|
+
|
|
6
|
+
module Ignis
|
|
7
|
+
module CUDA
|
|
8
|
+
# Handles dynamic loading of NVIDIA CUDA libraries.
|
|
9
|
+
# Cross-platform: Windows (DLLs via Kernel32) and Linux (.so via dlopen).
|
|
10
|
+
# Uses Ignis::Platform for path resolution when available.
|
|
11
|
+
module LibraryLoader
|
|
12
|
+
# Windows Kernel32 API for DLL search path management (Fiddle-based).
# Only loaded on Windows: the module is defined when RUBY_PLATFORM looks like
# Windows, or when Ignis::Platform is already defined and reports windows?.
# Elsewhere the constant does not exist, which ensure_dll_search_path!
# checks with defined?(Kernel32).
if RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i) || (defined?(Ignis::Platform) && Ignis::Platform.windows?)
  module Kernel32
    extend Fiddle::Importer
    dlload 'kernel32.dll'

    # Both take one pointer argument; ensure_dll_search_path! passes
    # SetDllDirectoryW a NUL-terminated UTF-16LE path.
    extern 'int SetDllDirectoryW(void*)'
    # Declared but not called anywhere in the visible loader code.
    extern 'void* AddDllDirectory(void*)'
  end
end
|
|
23
|
+
|
|
24
|
+
# Glob patterns for each known library, chosen for the current platform.
#
# With Ignis::Platform loaded, its Windows or Linux table is copied into a new
# Hash; otherwise the frozen fallback constants of this module are returned
# directly.
#
# @return [Hash{Symbol => String}]
def self.lib_patterns
  unless defined?(Ignis::Platform)
    return RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i) ? WIN_LIB_PATTERNS : LINUX_LIB_PATTERNS
  end

  table = Ignis::Platform.windows? ? Ignis::Platform::WIN_LIB_PATTERNS : Ignis::Platform::LINUX_LIB_PATTERNS
  table.each_with_object({}) { |(key, glob), copy| copy[key] = glob }
end
|
|
39
|
+
|
|
40
|
+
# Fallback patterns if Platform module not loaded.
# Keys are the library symbols accepted by load_library; values are file-name
# globs that find_library joins onto each search directory.

# Windows DLL names. Most carry a version suffix matched by `*`; cutensor,
# mathdx and the driver (nvcuda.dll) are fixed names.
WIN_LIB_PATTERNS = {
  cuda_runtime: 'cudart64_*.dll',
  cublas: 'cublas64_*.dll',
  cublaslt: 'cublasLt64_*.dll',
  cufft: 'cufft64_*.dll',
  curand: 'curand64_*.dll',
  cusparse: 'cusparse64_*.dll',
  cusolver: 'cusolver64_*.dll',
  cudnn: 'cudnn64_*.dll',
  nvrtc: 'nvrtc64_*.dll',
  cutensor: 'cutensor.dll',
  cudss: 'cudss64_*.dll',
  mathdx: 'mathdx64_0.dll',
  cuda_driver: 'nvcuda.dll'
}.freeze

# Linux shared-object names. The trailing `*` also matches versioned files
# such as `.so.13`; find_library keeps the lexicographically greatest match.
LINUX_LIB_PATTERNS = {
  cuda_runtime: 'libcudart.so*',
  cublas: 'libcublas.so*',
  cublaslt: 'libcublasLt.so*',
  cufft: 'libcufft.so*',
  curand: 'libcurand.so*',
  cusparse: 'libcusparse.so*',
  cusolver: 'libcusolver.so*',
  cudnn: 'libcudnn.so*',
  nvrtc: 'libnvrtc.so*',
  cutensor: 'libcutensor.so*',
  cudss: 'libcudss.so*',
  mathdx: 'libmathdx.so*',
  cuda_driver: 'libcuda.so*'
}.freeze
|
|
72
|
+
|
|
73
|
+
# Extra install directories for libraries that live outside the CUDA toolkit.
#
# Delegates to Ignis::Platform.custom_lib_paths when that module is loaded;
# otherwise returns built-in default locations for cuTENSOR and cuDSS.
#
# @return [Hash{Symbol => String}]
def self.custom_paths
  return Ignis::Platform.custom_lib_paths if defined?(Ignis::Platform)

  cutensor_dir, cudss_dir =
    if windows?
      ['C:/Program Files/NVIDIA cuTENSOR/v2.4/bin/13', 'C:/Program Files/NVIDIA cuDSS/v0.7/bin/13']
    else
      ['/usr/local/cutensor/lib', '/usr/local/cudss/lib']
    end

  { cutensor: cutensor_dir, cudss: cudss_dir }
end
|
|
89
|
+
|
|
90
|
+
# Only ever cleared (in reset!) by the visible code; nothing here writes to it.
@loaded_libraries = {}
# library symbol => path of the file that was opened (filled by load_library).
@library_paths = {}
# library symbol => Fiddle::Handle (filled by load_library).
@fiddle_handles = {}
# Guards the tables above.
@mutex = Mutex.new
# Set once SetDllDirectoryW has succeeded (see ensure_dll_search_path!).
@dll_directory_set = false
|
|
95
|
+
|
|
96
|
+
class << self
|
|
97
|
+
# @return [Hash{Symbol => String}]
|
|
98
|
+
attr_reader :library_paths
|
|
99
|
+
|
|
100
|
+
# @return [Hash{Symbol => Fiddle::Handle}]
|
|
101
|
+
attr_reader :fiddle_handles
|
|
102
|
+
|
|
103
|
+
# Snapshot of the libraries opened so far, keyed by library symbol.
# The result is a copy of the path table, so callers may mutate it freely.
#
# @return [Hash{Symbol => String}]
def loaded_libraries
  @mutex.synchronize do
    snapshot = @library_paths.dup
    snapshot
  end
end
|
|
107
|
+
|
|
108
|
+
# Whether a handle for +library+ is already held.
#
# @param library [Symbol]
# @return [Boolean]
def loaded?(library)
  @mutex.synchronize do
    @fiddle_handles.key?(library)
  end
end
|
|
113
|
+
|
|
114
|
+
# Open one CUDA library through Fiddle::Handle, caching the result.
#
# A cached handle is returned as-is. Otherwise the DLL search path is prepared
# (Windows only), the library file is located with find_library, opened, and
# both the handle and its path are recorded. A line is written to stderr for
# every newly opened library.
#
# @param library [Symbol]
# @return [Fiddle::Handle]
# @raise [ArgumentError] when +library+ has no pattern for this platform
# @raise [LibraryNotFoundError] when no matching file is found
def load_library(library)
  @mutex.synchronize do
    cached = @fiddle_handles[library]
    return cached if cached

    ensure_dll_search_path! if windows?

    pattern = lib_patterns[library]
    raise ArgumentError, "Unknown library: #{library}" unless pattern

    dll_path = find_library(library, pattern)
    raise LibraryNotFoundError, library.to_s unless dll_path

    Fiddle::Handle.new(dll_path).tap do |handle|
      @fiddle_handles[library] = handle
      @library_paths[library] = dll_path

      $stderr.puts "[LibraryLoader] Loaded #{library}: #{dll_path}"
    end
  end
end
|
|
138
|
+
|
|
139
|
+
# Open several libraries, skipping the ones that cannot be found.
#
# A missing library is reported on stderr and left out of the result; any
# other error propagates.
#
# @param libraries [Array<Symbol>]
# @return [Hash{Symbol => Fiddle::Handle}] handles of the libraries that loaded
def load_all(libraries: %i[cuda_runtime cublas cufft curand])
  loaded = {}

  libraries.each do |lib|
    begin
      loaded[lib] = load_library(lib)
    rescue LibraryNotFoundError => e
      $stderr.puts "[LibraryLoader] Optional library not found: #{e.message}"
    end
  end

  loaded
end
|
|
149
|
+
|
|
150
|
+
# Make sure the CUDA runtime library is open and return its handle.
# Thin wrapper over load_library(:cuda_runtime), so it shares that method's
# caching and its errors.
# @return [Fiddle::Handle]
def ensure_cuda_runtime!
  load_library(:cuda_runtime)
end
|
|
154
|
+
|
|
155
|
+
# Look up the cached handle for +library+ without attempting to load it.
#
# @param library [Symbol]
# @return [Fiddle::Handle, nil] nil when the library has not been loaded
def handle_for(library)
  @mutex.synchronize do
    @fiddle_handles[library]
  end
end
|
|
160
|
+
|
|
161
|
+
# Ask the loaded runtime for its version via cudaRuntimeGetVersion.
#
# The runtime reports one integer that is decoded here as
# major = value / 1000 and minor = (value % 1000) / 10.
#
# @return [String] e.g. "13.0"
# @raise [CudaRuntimeError] when cudaRuntimeGetVersion returns a non-zero status
def cuda_version
  # Use the handle returned by the (mutex-guarded) loader rather than reading
  # the handle table directly.
  runtime = ensure_cuda_runtime!

  get_version = Fiddle::Function.new(
    runtime['cudaRuntimeGetVersion'],
    [Fiddle::TYPE_VOIDP],
    Fiddle::TYPE_INT
  )

  # Attach a free function so the out-buffer is released when collected;
  # malloc without one is never freed.
  version_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
  result = get_version.call(version_ptr)
  raise CudaRuntimeError.new('Failed to get CUDA version', cuda_code: result) unless result.zero?

  version = version_ptr[0, Fiddle::SIZEOF_INT].unpack1('l')
  major = version / 1000
  minor = (version % 1000) / 10

  "#{major}.#{minor}"
end
|
|
182
|
+
|
|
183
|
+
# Forget every cached handle and path.
# Only the bookkeeping tables are emptied; the DLL-directory flag is left as is.
#
# @return [void]
def reset!
  @mutex.synchronize do
    [@fiddle_handles, @loaded_libraries].each(&:clear)
    @library_paths.clear
  end
end
|
|
191
|
+
|
|
192
|
+
# Whether the loader should behave as on Windows.
# Defers to Ignis::Platform when it is loaded, otherwise inspects RUBY_PLATFORM.
#
# @return [Boolean]
def windows?
  return Ignis::Platform.windows? if defined?(Ignis::Platform)

  RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
end
|
|
200
|
+
|
|
201
|
+
private
|
|
202
|
+
|
|
203
|
+
# Ensure CUDA bin path is added to the Windows DLL search path.
#
# Does nothing when the directory was already set, when not on Windows, when
# the Kernel32 bindings are not defined, or when no CUDA bin directory can be
# resolved. A non-zero result from SetDllDirectoryW is treated as success and
# remembered; a zero result only produces a warning on stderr, so the next
# call tries again.
#
# @return [void]
def ensure_dll_search_path!
  return if @dll_directory_set
  return unless windows? && defined?(Kernel32)

  cuda_bin = resolve_cuda_bin
  return unless cuda_bin

  win_path = cuda_bin.tr('/', '\\')
  # The W (wide) API expects a NUL-terminated UTF-16LE string.
  utf16_path = "#{win_path}\0".encode('UTF-16LE')
  # Attach a free function so the buffer is released when collected; malloc
  # without one is never freed, and a failing call repeats the allocation.
  ptr = Fiddle::Pointer.malloc(utf16_path.bytesize, Fiddle::RUBY_FREE)
  ptr[0, utf16_path.bytesize] = utf16_path

  result = Kernel32.SetDllDirectoryW(ptr)
  if result != 0
    @dll_directory_set = true
  else
    $stderr.puts "[LibraryLoader] WARNING: Failed to set DLL directory: #{win_path}"
  end
end
|
|
225
|
+
|
|
226
|
+
# Directory expected to contain the CUDA libraries.
#
# Resolution order: Ignis::Platform.cuda_bin_path when Platform is loaded;
# else Ignis.configuration.cuda_bin_path (nil if that raises); else a
# built-in default install location for the current OS.
#
# @return [String, nil] CUDA binary directory
def resolve_cuda_bin
  return Ignis::Platform.cuda_bin_path if defined?(Ignis::Platform)

  unless defined?(Ignis) && Ignis.respond_to?(:configuration)
    return windows? ? 'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin' : '/usr/local/cuda/lib64'
  end

  begin
    Ignis.configuration.cuda_bin_path
  rescue StandardError
    nil
  end
end
|
|
236
|
+
|
|
237
|
+
# Find a library by key and pattern.
#
# Search order: Ignis::Platform.find_cuda_lib (when Platform is loaded), the
# custom directory registered for +key+, the CUDA bin directory, then the
# environment search path. On non-Windows systems that path is
# LD_LIBRARY_PATH, PATH and a few standard library directories; on Windows it
# is PATH alone. The first directory containing a match wins.
#
# @param key [Symbol]
# @param pattern [String] file-name glob, e.g. 'libcudart.so*'
# @return [String, nil] path of the match, or nil when nothing is found
def find_library(key, pattern)
  # Check Platform-aware search first
  if defined?(Ignis::Platform)
    found = Ignis::Platform.find_cuda_lib(key)
    return found if found
  end

  # Custom path for this key, then the CUDA bin path (only resolved if needed).
  hit = newest_library_in(custom_paths[key], pattern) ||
        newest_library_in(resolve_cuda_bin, pattern)
  return hit if hit

  # Check system PATH / LD_LIBRARY_PATH
  on_windows = windows?
  dirs = ENV.fetch('PATH', '').split(on_windows ? ';' : ':')
  unless on_windows
    dirs = ENV.fetch('LD_LIBRARY_PATH', '').split(':') + dirs
    # Standard Linux library locations
    dirs += ['/usr/lib/x86_64-linux-gnu', '/usr/lib64', '/usr/local/lib']
  end

  dirs.each do |dir|
    hit = newest_library_in(dir, pattern)
    return hit if hit
  end

  nil
end

# Lexicographically greatest file in +dir+ matching +pattern+, or nil.
#
# The directory is passed as the glob base instead of being spliced into the
# pattern, so backslashes (Windows PATH entries) and glob metacharacters in
# directory names are taken literally. Blank entries are skipped rather than
# turned into a search of the filesystem root.
def newest_library_in(dir, pattern)
  return nil if dir.nil? || dir.empty?

  Dir.glob(pattern, base: dir).map { |name| File.join(dir, name) }.max
end
|
|
282
|
+
end
|
|
283
|
+
end
|
|
284
|
+
end
|
|
285
|
+
end
|