ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,265 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'graph_bindings'
4
+ require 'fiddle'
5
+
6
+ module Ignis
7
+ module CUDA
8
+ # CUDA Graph for capturing and replaying GPU operations.
9
+ # Provides reduced kernel launch overhead for repetitive workloads.
10
+ #
11
+ # Uses GraphBindings (FFI-based) since graph operations are NOT hot-path.
12
+ # Stream handles are Fiddle::Pointer — we convert for FFI interop.
13
+ class Graph
14
+ # @return [FFI::Pointer] Native CUDA graph handle
15
+ attr_reader :handle
16
+
17
+ # @return [Boolean] Whether the graph was created via stream capture
18
+ attr_reader :captured
19
+
20
+ # @return [Integer, nil] device_id for recovery coordinator invalidation
21
+ attr_accessor :device_id
22
+
23
# Allocates a fresh, empty native graph through cudaGraphCreate.
# Errors are reported by GraphBindings.check_status!.
# @param flags [Integer] Flags forwarded to cudaGraphCreate (default: 0)
def initialize(flags: 0)
  GraphBindings.ensure_loaded!

  # Out-parameter that receives the native graph handle.
  out_handle = FFI::MemoryPointer.new(:pointer)
  GraphBindings.check_status!(
    GraphBindings.cudaGraphCreate(out_handle, flags),
    'cudaGraphCreate'
  )

  @handle = out_handle.read_pointer
  @device_id = nil
  @captured = false
  @destroyed = false
end
37
+
38
# Capture GPU operations issued on a stream into a new graph.
#
# Once capture has begun it is always ended with cudaStreamEndCapture, even
# when the block raises. In that case the block's exception is propagated
# unchanged (it is not replaced by an end-capture status error) and any graph
# produced by the aborted capture is destroyed. A stream created here is
# destroyed on every exit path.
#
# @param stream [Stream, nil] Stream to capture on (a temporary one is created when nil)
# @param mode [Symbol] :global, :thread_local or :relaxed; any other value falls back to global
# @yield [Stream] Block containing the GPU operations to capture
# @return [Graph]
# @raise [ArgumentError] if no block is given
def self.capture(stream: nil, mode: :global, &block)
  # Fail before touching the stream: without a block there is nothing to capture.
  raise ArgumentError, 'Graph.capture requires a block' unless block

  GraphBindings.ensure_loaded!

  own_stream = stream.nil?
  stream ||= Stream.new
  graph_ptr = FFI::MemoryPointer.new(:pointer)

  begin
    capture_mode = case mode
                   when :thread_local then GraphBindings::CUDA_STREAM_CAPTURE_MODE_THREAD_LOCAL
                   when :relaxed then GraphBindings::CUDA_STREAM_CAPTURE_MODE_RELAXED
                   else GraphBindings::CUDA_STREAM_CAPTURE_MODE_GLOBAL
                   end

    # Convert Fiddle::Pointer to FFI::Pointer for GraphBindings
    stream_ffi = to_ffi_ptr(stream.to_ptr)

    status = GraphBindings.cudaStreamBeginCapture(stream_ffi, capture_mode)
    GraphBindings.check_status!(status, 'cudaStreamBeginCapture')

    completed = false
    begin
      block.call(stream)
      completed = true
    ensure
      end_status = GraphBindings.cudaStreamEndCapture(stream_ffi, graph_ptr)
      if completed
        GraphBindings.check_status!(end_status, 'cudaStreamEndCapture')
      elsif end_status.zero? && !graph_ptr.read_pointer.null?
        # The block failed: drop the partial graph and let the original
        # exception propagate instead of raising a second one from here.
        GraphBindings.cudaGraphDestroy(graph_ptr.read_pointer)
      end
    end
  ensure
    stream.destroy! if own_stream
  end

  graph = allocate
  graph.instance_variable_set(:@handle, graph_ptr.read_pointer)
  graph.instance_variable_set(:@captured, true)
  graph.instance_variable_set(:@destroyed, false)
  graph.instance_variable_set(:@device_id, nil)

  graph
end
80
+
81
# Build an executable graph from this graph.
# @param flags [Integer] Instantiation flags handed to GraphExecutable.new
# @return [GraphExecutable]
# @raise [InvalidOperationError] if this graph has already been destroyed
def instantiate(flags: 0)
  if @destroyed
    raise InvalidOperationError, 'Graph already destroyed'
  end

  GraphExecutable.new(self, flags: flags)
end
89
+
90
# Duplicate the native graph with cudaGraphClone and wrap the copy.
# The copy carries over this graph's captured flag and device_id.
# @return [Graph]
# @raise [InvalidOperationError] if this graph has already been destroyed
def clone
  raise InvalidOperationError, 'Graph already destroyed' if @destroyed

  out_handle = FFI::MemoryPointer.new(:pointer)
  GraphBindings.check_status!(
    GraphBindings.cudaGraphClone(out_handle, @handle),
    'cudaGraphClone'
  )

  # Graph.allocate skips #initialize, which would create a second native graph.
  copy = Graph.allocate
  {
    :@handle => out_handle.read_pointer,
    :@captured => @captured,
    :@destroyed => false,
    :@device_id => @device_id
  }.each { |ivar, value| copy.instance_variable_set(ivar, value) }
  copy
end
106
+
107
# Count the nodes in the graph.
# Calls cudaGraphGetNodes with a NULL node array and reads back the count.
# @return [Integer]
# @raise [InvalidOperationError] if this graph has already been destroyed
def node_count
  if @destroyed
    raise InvalidOperationError, 'Graph already destroyed'
  end

  total = FFI::MemoryPointer.new(:size_t)
  GraphBindings.check_status!(
    GraphBindings.cudaGraphGetNodes(@handle, FFI::Pointer::NULL, total),
    'cudaGraphGetNodes'
  )
  total.read(:size_t)
end
118
+
119
# Count the root nodes (nodes with no dependencies) of the graph.
# Calls cudaGraphGetRootNodes with a NULL node array and reads back the count.
# @return [Integer]
# @raise [InvalidOperationError] if this graph has already been destroyed
def root_node_count
  if @destroyed
    raise InvalidOperationError, 'Graph already destroyed'
  end

  total = FFI::MemoryPointer.new(:size_t)
  GraphBindings.check_status!(
    GraphBindings.cudaGraphGetRootNodes(@handle, FFI::Pointer::NULL, total),
    'cudaGraphGetRootNodes'
  )
  total.read(:size_t)
end
130
+
131
# Reports whether #destroy! has already released this graph.
# @return [Boolean] the current value of the destroyed flag
def destroyed?
  @destroyed
end
135
+
136
# Release the native graph via cudaGraphDestroy.
# Calling it again after a successful destroy does nothing.
# @return [void]
def destroy!
  unless @destroyed
    GraphBindings.check_status!(
      GraphBindings.cudaGraphDestroy(@handle),
      'cudaGraphDestroy'
    )

    # Only mark as destroyed once the native call has succeeded.
    @destroyed = true
    @handle = nil
  end
end
147
+
148
# Re-wrap the address held by a Fiddle::Pointer as an FFI::Pointer.
# nil and zero addresses both map to FFI::Pointer::NULL.
# @param fiddle_ptr [Fiddle::Pointer, nil]
# @return [FFI::Pointer]
def self.to_ffi_ptr(fiddle_ptr)
  address = fiddle_ptr.nil? ? 0 : fiddle_ptr.to_i
  return FFI::Pointer::NULL if address.zero?

  FFI::Pointer.new(:pointer, address)
end
156
+ end
157
+
158
+ # Executable CUDA Graph ready for launch.
159
+ class GraphExecutable
160
+ # @return [FFI::Pointer] Native executable graph handle
161
+ attr_reader :handle
162
+
163
+ # @return [Graph] Source graph
164
+ attr_reader :source_graph
165
+
166
+ # @return [Integer] Number of times this graph has been launched
167
+ attr_reader :launch_count
168
+
169
# Instantiate an executable graph from a source graph via cudaGraphInstantiate.
# @param graph [Graph] Source graph; kept as #source_graph
# @param flags [Integer] Instantiation flags
def initialize(graph, flags: 0)
  GraphBindings.ensure_loaded!

  # Out-parameter that receives the native executable handle.
  out_exec = FFI::MemoryPointer.new(:pointer)
  GraphBindings.check_status!(
    GraphBindings.cudaGraphInstantiate(out_exec, graph.handle, flags),
    'cudaGraphInstantiate'
  )

  @source_graph = graph
  @handle = out_exec.read_pointer
  @destroyed = false
  @launch_count = 0
end
183
+
184
# Launch the executable graph and bump #launch_count.
# @param stream [Stream, nil] Stream to launch on (a NULL stream handle is passed when nil)
# @return [self]
# @raise [InvalidOperationError] if this executable has already been destroyed
def launch(stream: nil)
  if @destroyed
    raise InvalidOperationError, 'GraphExecutable already destroyed'
  end

  target = FFI::Pointer::NULL
  target = Graph.to_ffi_ptr(stream.to_ptr) if stream

  GraphBindings.check_status!(
    GraphBindings.cudaGraphLaunch(@handle, target),
    'cudaGraphLaunch'
  )

  @launch_count += 1
  self
end
197
+
198
# Upload the executable graph via cudaGraphUpload (intended to make the first launch faster).
# @param stream [Stream, nil] Stream to upload on (a NULL stream handle is passed when nil)
# @return [self]
# @raise [InvalidOperationError] if this executable has already been destroyed
def upload(stream: nil)
  if @destroyed
    raise InvalidOperationError, 'GraphExecutable already destroyed'
  end

  target = FFI::Pointer::NULL
  target = Graph.to_ffi_ptr(stream.to_ptr) if stream

  GraphBindings.check_status!(
    GraphBindings.cudaGraphUpload(@handle, target),
    'cudaGraphUpload'
  )

  self
end
210
+
211
# Update the executable graph in place from its source graph.
# @return [Boolean] True if cudaGraphExecUpdate reported success
# @raise [InvalidOperationError] if this executable has already been destroyed
def update!
  raise InvalidOperationError, 'GraphExecutable already destroyed' if @destroyed

  # The three-argument cudaGraphExecUpdate bound in GraphBindings fills in a
  # cudaGraphExecUpdateResultInfo struct (result enum plus two node handles),
  # not a single int. Reserve three pointer-sized slots so the runtime never
  # writes past the end of the buffer.
  result_info = FFI::MemoryPointer.new(:pointer, 3)
  status = GraphBindings.cudaGraphExecUpdate(@handle, @source_graph.handle, result_info)

  status.zero?
end
221
+
222
# Reports whether #destroy! has already released this executable graph.
# @return [Boolean] the current value of the destroyed flag
def destroyed?
  @destroyed
end
226
+
227
# Release the native executable graph via cudaGraphExecDestroy.
# Calling it again after a successful destroy does nothing.
# @return [void]
def destroy!
  unless @destroyed
    GraphBindings.check_status!(
      GraphBindings.cudaGraphExecDestroy(@handle),
      'cudaGraphExecDestroy'
    )

    # Only mark as destroyed once the native call has succeeded.
    @destroyed = true
    @handle = nil
  end
end
238
+ end
239
+
240
+ # Convenience module for graph-based operations.
241
+ module GraphCapture
242
+ class << self
243
# Capture the block's GPU operations with Graph.capture and instantiate the result.
# @param stream [Stream, nil] Forwarded to Graph.capture
# @yield [Stream]
# @return [GraphExecutable]
def capture(stream: nil, &block)
  Graph.capture(stream: stream, &block).instantiate
end
251
+
252
# Capture the block into an executable graph, then launch it `repeat` times.
# @param stream [Stream, nil] Used for both the capture and every launch
# @param repeat [Integer] Number of launches
# @yield [Stream]
# @return [GraphExecutable]
def capture_and_launch(stream: nil, repeat: 1, &block)
  capture(stream: stream, &block).tap do |executable|
    repeat.times { executable.launch(stream: stream) }
  end
end
262
+ end
263
+ end
264
+ end
265
+ end
@@ -0,0 +1,119 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ffi"
4
+
5
+ module Ignis
6
+ module CUDA
7
+ # CUDA Graphs FFI bindings
8
+ # Provides stream capture and graph execution for reduced launch overhead
9
+ module GraphBindings
10
+ extend FFI::Library
11
+
12
+ # CUDA Stream Capture Mode
13
+ CUDA_STREAM_CAPTURE_MODE_GLOBAL = 0
14
+ CUDA_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1
15
+ CUDA_STREAM_CAPTURE_MODE_RELAXED = 2
16
+
17
+ # CUDA Graph Instantiate Flags
18
+ CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1
19
+ CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD = 2
20
+ CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH = 4
21
+ CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY = 8
22
+
23
+ # CUDA Graph Node Types
24
+ CUDA_GRAPH_NODE_TYPE_KERNEL = 0
25
+ CUDA_GRAPH_NODE_TYPE_MEMCPY = 1
26
+ CUDA_GRAPH_NODE_TYPE_MEMSET = 2
27
+ CUDA_GRAPH_NODE_TYPE_HOST = 3
28
+ CUDA_GRAPH_NODE_TYPE_GRAPH = 4
29
+ CUDA_GRAPH_NODE_TYPE_EMPTY = 5
30
+ CUDA_GRAPH_NODE_TYPE_WAIT_EVENT = 6
31
+ CUDA_GRAPH_NODE_TYPE_EVENT_RECORD = 7
32
+
33
+ @loaded = false
34
+ @mutex = Mutex.new
35
+
36
+ class << self
37
# Load the CUDA runtime and attach the graph functions, once.
# The whole sequence runs under @mutex; later calls return early.
# @return [void]
# @raise [LibraryNotFoundError] when no cudart path can be resolved
def ensure_loaded!
  @mutex.synchronize do
    next if @loaded

    LibraryLoader.load_library(:cuda_runtime)

    # Resolve the cudart path: Platform first, then per-OS fallbacks.
    dll_path =
      if defined?(Ignis::Platform)
        Ignis::Platform.cudart_path
      elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
        cuda_bin = begin
          Ignis.configuration.cuda_bin_path
        rescue StandardError
          nil
        end

        if cuda_bin
          Dir.glob(File.join(cuda_bin, 'cudart64_*.dll')).max
        else
          'cudart64_130'
        end
      else
        'libcudart.so.13'
      end

    raise LibraryNotFoundError, 'cudart' unless dll_path

    ffi_lib dll_path
    attach_graph_functions!

    @loaded = true
  end
end
63
+
64
# Raise unless a CUDA call returned the success code (zero).
# @param status [Integer] CUDA error code
# @param context [String] Name of the operation, used as the message prefix
# @return [void]
# @raise [CudaRuntimeError] when status is non-zero
def check_status!(status, context = "CUDA Graph operation")
  unless status.zero?
    raise CudaRuntimeError.new("#{context} failed", cuda_code: status)
  end
end
73
+
74
+ private
75
+
76
# Attach every CUDA graph entry point used by this module.
# All functions are bound with an :int return type (the CUDA status code).
# @return [Object] whatever attach_function returns for the last binding
def attach_graph_functions!
  ptr = :pointer

  signatures = [
    # Graph creation and destruction
    [:cudaGraphCreate, [ptr, :uint]],
    [:cudaGraphDestroy, [ptr]],
    # Stream capture for graph creation
    [:cudaStreamBeginCapture, [ptr, :int]],
    [:cudaStreamEndCapture, [ptr, ptr]],
    [:cudaStreamIsCapturing, [ptr, ptr]],
    # Graph instantiation (create executable graph)
    [:cudaGraphInstantiate, [ptr, ptr, :uint64]],
    [:cudaGraphInstantiateWithFlags, [ptr, ptr, :uint64]],
    # Graph execution
    [:cudaGraphLaunch, [ptr, ptr]],
    # Executable graph destruction
    [:cudaGraphExecDestroy, [ptr]],
    # Graph update (for parameter changes without re-instantiation)
    [:cudaGraphExecUpdate, [ptr, ptr, ptr]],
    # Graph node management
    [:cudaGraphGetNodes, [ptr, ptr, ptr]],
    [:cudaGraphGetRootNodes, [ptr, ptr, ptr]],
    [:cudaGraphNodeGetType, [ptr, ptr]],
    [:cudaGraphGetEdges, [ptr, ptr, ptr, ptr]],
    # Graph cloning
    [:cudaGraphClone, [ptr, ptr]],
    # Manual graph construction: empty nodes and dependency edges
    [:cudaGraphAddEmptyNode, [ptr, ptr, ptr, :size_t]],
    [:cudaGraphAddDependencies, [ptr, ptr, ptr, :size_t]],
    [:cudaGraphRemoveDependencies, [ptr, ptr, ptr, :size_t]],
    # Graph upload to device (for faster first launch)
    [:cudaGraphUpload, [ptr, ptr]]
  ]

  signatures.map { |name, arg_types| attach_function(name, arg_types, :int) }.last
end
116
+ end
117
+ end
118
+ end
119
+ end
@@ -0,0 +1,285 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fiddle'
4
+ require 'fiddle/import'
5
+
6
+ module Ignis
7
+ module CUDA
8
+ # Handles dynamic loading of NVIDIA CUDA libraries.
9
+ # Cross-platform: Windows (DLLs via Kernel32) and Linux (.so via dlopen).
10
+ # Uses Ignis::Platform for path resolution when available.
11
+ module LibraryLoader
12
# Fiddle bindings for the Kernel32 DLL-search-path functions.
# The module is only defined when running on Windows, so callers must
# guard its use with defined?(Kernel32).
on_windows_host = RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i) ||
                  (defined?(Ignis::Platform) && Ignis::Platform.windows?)

if on_windows_host
  module Kernel32
    extend Fiddle::Importer

    dlload 'kernel32.dll'

    extern 'int SetDllDirectoryW(void*)'
    extern 'void* AddDllDirectory(void*)'
  end
end
23
+
24
# Library keys mapped to file-name glob patterns for the current platform.
# With Ignis::Platform loaded, its table is copied into a fresh Hash;
# otherwise this module's own fallback constant is returned as-is.
# @return [Hash{Symbol => String}]
def self.lib_patterns
  unless defined?(Ignis::Platform)
    return RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i) ? WIN_LIB_PATTERNS : LINUX_LIB_PATTERNS
  end

  source = if Ignis::Platform.windows?
             Ignis::Platform::WIN_LIB_PATTERNS
           else
             Ignis::Platform::LINUX_LIB_PATTERNS
           end

  source.each_with_object({}) { |(name, glob), copy| copy[name] = glob }
end
39
+
40
+ # Fallback patterns if Platform module not loaded
41
+ WIN_LIB_PATTERNS = {
42
+ cuda_runtime: 'cudart64_*.dll',
43
+ cublas: 'cublas64_*.dll',
44
+ cublaslt: 'cublasLt64_*.dll',
45
+ cufft: 'cufft64_*.dll',
46
+ curand: 'curand64_*.dll',
47
+ cusparse: 'cusparse64_*.dll',
48
+ cusolver: 'cusolver64_*.dll',
49
+ cudnn: 'cudnn64_*.dll',
50
+ nvrtc: 'nvrtc64_*.dll',
51
+ cutensor: 'cutensor.dll',
52
+ cudss: 'cudss64_*.dll',
53
+ mathdx: 'mathdx64_0.dll',
54
+ cuda_driver: 'nvcuda.dll'
55
+ }.freeze
56
+
57
+ LINUX_LIB_PATTERNS = {
58
+ cuda_runtime: 'libcudart.so*',
59
+ cublas: 'libcublas.so*',
60
+ cublaslt: 'libcublasLt.so*',
61
+ cufft: 'libcufft.so*',
62
+ curand: 'libcurand.so*',
63
+ cusparse: 'libcusparse.so*',
64
+ cusolver: 'libcusolver.so*',
65
+ cudnn: 'libcudnn.so*',
66
+ nvrtc: 'libnvrtc.so*',
67
+ cutensor: 'libcutensor.so*',
68
+ cudss: 'libcudss.so*',
69
+ mathdx: 'libmathdx.so*',
70
+ cuda_driver: 'libcuda.so*'
71
+ }.freeze
72
+
73
# Extra search directories for libraries installed outside the CUDA toolkit.
# Delegates to Ignis::Platform when it is loaded; otherwise returns
# hard-coded per-OS defaults for cuTENSOR and cuDSS.
# @return [Hash{Symbol => String}]
def self.custom_paths
  return Ignis::Platform.custom_lib_paths if defined?(Ignis::Platform)

  cutensor_dir, cudss_dir =
    if windows?
      ['C:/Program Files/NVIDIA cuTENSOR/v2.4/bin/13', 'C:/Program Files/NVIDIA cuDSS/v0.7/bin/13']
    else
      ['/usr/local/cutensor/lib', '/usr/local/cudss/lib']
    end

  { cutensor: cutensor_dir, cudss: cudss_dir }
end
89
+
90
+ @loaded_libraries = {}
91
+ @library_paths = {}
92
+ @fiddle_handles = {}
93
+ @mutex = Mutex.new
94
+ @dll_directory_set = false
95
+
96
+ class << self
97
+ # @return [Hash{Symbol => String}]
98
+ attr_reader :library_paths
99
+
100
+ # @return [Hash{Symbol => Fiddle::Handle}]
101
+ attr_reader :fiddle_handles
102
+
103
# Snapshot of the loaded libraries and the paths they were loaded from.
# @return [Hash{Symbol => String}] a copy taken under the mutex
def loaded_libraries
  @mutex.synchronize do
    @library_paths.dup
  end
end
107
+
108
# Whether a handle is registered for the given library key.
# @param library [Symbol]
# @return [Boolean]
def loaded?(library)
  @mutex.synchronize do
    @fiddle_handles.key?(library)
  end
end
113
+
114
# Open a CUDA library with Fiddle::Handle, caching the handle per key.
# The lookup, load and cache update all happen under the mutex.
# @param library [Symbol]
# @return [Fiddle::Handle]
# @raise [ArgumentError] when the key has no file-name pattern
# @raise [LibraryNotFoundError] when no matching file is found
def load_library(library)
  @mutex.synchronize do
    cached = @fiddle_handles[library]
    next cached if cached

    ensure_dll_search_path! if windows?

    pattern = lib_patterns[library]
    raise ArgumentError, "Unknown library: #{library}" unless pattern

    dll_path = find_library(library, pattern)
    raise LibraryNotFoundError, library.to_s unless dll_path

    Fiddle::Handle.new(dll_path).tap do |handle|
      @fiddle_handles[library] = handle
      @library_paths[library] = dll_path
      $stderr.puts "[LibraryLoader] Loaded #{library}: #{dll_path}"
    end
  end
end
138
+
139
# Load several CUDA libraries, skipping the ones that cannot be found.
# A missing library is reported on stderr and left out of the result.
# @param libraries [Array<Symbol>]
# @return [Hash{Symbol => Fiddle::Handle}] handles for the libraries that loaded
def load_all(libraries: %i[cuda_runtime cublas cufft curand])
  handles = {}

  libraries.each do |name|
    begin
      handles[name] = load_library(name)
    rescue LibraryNotFoundError => e
      $stderr.puts "[LibraryLoader] Optional library not found: #{e.message}"
    end
  end

  handles
end
149
+
150
# Make sure the CUDA runtime library is loaded.
# @return [Fiddle::Handle] the handle returned by load_library(:cuda_runtime)
def ensure_cuda_runtime!
  load_library(:cuda_runtime)
end
154
+
155
# Look up the cached handle for a library without loading it.
# @param library [Symbol]
# @return [Fiddle::Handle, nil] nil when no handle is registered for the key
def handle_for(library)
  @mutex.synchronize do
    @fiddle_handles[library]
  end
end
160
+
161
# Query the CUDA runtime version through cudaRuntimeGetVersion.
# @return [String] "major.minor", e.g. "13.0"
# @raise [CudaRuntimeError] when cudaRuntimeGetVersion returns a non-zero status
def cuda_version
  # Use the handle returned by the loader rather than reading the handle
  # table directly outside the mutex.
  runtime = ensure_cuda_runtime!

  fn = Fiddle::Function.new(
    runtime['cudaRuntimeGetVersion'],
    [Fiddle::TYPE_VOIDP],
    Fiddle::TYPE_INT
  )

  # Pass a free function so the buffer is released together with the pointer
  # object; a bare malloc here would leak on every call.
  version_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
  result = fn.call(version_ptr)
  raise CudaRuntimeError.new('Failed to get CUDA version', cuda_code: result) unless result.zero?

  # The integer is decoded as major * 1000 + minor * 10.
  version = version_ptr[0, Fiddle::SIZEOF_INT].unpack1('l')
  major = version / 1000
  minor = (version % 1000) / 10

  "#{major}.#{minor}"
end
182
+
183
# Forget every cached handle and library path.
# Only the bookkeeping hashes are emptied, under the mutex.
# @return [void]
def reset!
  @mutex.synchronize do
    [@fiddle_handles, @loaded_libraries].each(&:clear)
    @library_paths.clear
  end
end
191
+
192
# Whether the current platform is Windows.
# Asks Ignis::Platform when it is loaded, otherwise matches RUBY_PLATFORM.
# @return [Boolean]
def windows?
  return Ignis::Platform.windows? if defined?(Ignis::Platform)

  RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
end
200
+
201
+ private
202
+
203
# Ensure the CUDA bin directory is on the Windows DLL search path.
# Does nothing when not on Windows, when Kernel32 is not defined, when no CUDA
# bin directory can be resolved, or once the directory has been set successfully.
# A failed SetDllDirectoryW call is reported on stderr and retried on the next call.
# @return [void]
def ensure_dll_search_path!
  return if @dll_directory_set
  return unless windows? && defined?(Kernel32)

  cuda_bin = resolve_cuda_bin
  return unless cuda_bin

  win_path = cuda_bin.tr('/', '\\')
  # SetDllDirectoryW expects a NUL-terminated UTF-16LE string.
  utf16_path = (win_path + "\0").encode('UTF-16LE')
  # Pass a free function so the buffer is released with the pointer object;
  # a bare malloc here would never be freed.
  ptr = Fiddle::Pointer.malloc(utf16_path.bytesize, Fiddle::RUBY_FREE)
  ptr[0, utf16_path.bytesize] = utf16_path

  result = Kernel32.SetDllDirectoryW(ptr)
  if result != 0
    @dll_directory_set = true
  else
    $stderr.puts "[LibraryLoader] WARNING: Failed to set DLL directory: #{win_path}"
  end
end
225
+
226
# Resolve the directory that holds the CUDA binaries.
# Order: Ignis::Platform, then Ignis.configuration (any StandardError from
# it yields nil), then a hard-coded per-OS default.
# @return [String, nil] CUDA binary directory
def resolve_cuda_bin
  return Ignis::Platform.cuda_bin_path if defined?(Ignis::Platform)

  if defined?(Ignis) && Ignis.respond_to?(:configuration)
    begin
      Ignis.configuration.cuda_bin_path
    rescue StandardError
      nil
    end
  elsif windows?
    'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin'
  else
    '/usr/local/cuda/lib64'
  end
end
236
+
237
# Find a library file by key and file-name pattern.
#
# Search order: Ignis::Platform (when loaded), the custom path for the key,
# the CUDA bin directory, then LD_LIBRARY_PATH (non-Windows), PATH and the
# standard Linux library directories (non-Windows). The first directory with
# a match wins; within it the highest-versioned file is chosen.
#
# @param key [Symbol]
# @param pattern [String] glob pattern for the file name only
# @return [String, nil] path of the chosen library, or nil when nothing matches
def find_library(key, pattern)
  # Check Platform-aware search first
  if defined?(Ignis::Platform)
    found = Ignis::Platform.find_cuda_lib(key)
    return found if found
  end

  # Check custom paths
  found = newest_library_in(custom_paths[key], pattern)
  return found if found

  # Check CUDA bin path
  found = newest_library_in(resolve_cuda_bin, pattern)
  return found if found

  # Check system PATH / LD_LIBRARY_PATH
  search_dirs = (ENV['PATH'] || '').split(windows? ? ';' : ':')
  unless windows?
    search_dirs = (ENV['LD_LIBRARY_PATH'] || '').split(':') + search_dirs
    # Standard Linux library locations
    search_dirs += ['/usr/lib/x86_64-linux-gnu', '/usr/lib64', '/usr/local/lib']
  end

  search_dirs.each do |dir|
    found = newest_library_in(dir, pattern)
    return found if found
  end

  nil
end

# Pick the highest-versioned file matching +pattern+ directly inside +dir+.
#
# The directory is passed as the glob base instead of being spliced into the
# pattern, so backslashes and glob metacharacters in directory names are taken
# literally. Candidates are ordered by the integers in their file names, so
# "libcudart.so.13" ranks above "libcudart.so.9".
#
# @param dir [String, nil] nil and empty entries are skipped
# @param pattern [String]
# @return [String, nil]
def newest_library_in(dir, pattern)
  return nil if dir.nil? || dir.empty?

  best = Dir.glob(pattern, base: dir).max_by do |name|
    [name.scan(/\d+/).map(&:to_i), name]
  end
  best && File.join(dir, best)
end
282
+ end
283
+ end
284
+ end
285
+ end