ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,266 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+ require_relative "../vmm_bindings"
5
+
6
+ module Ignis
7
+ module Collective
8
+ module Transport
9
+ # VMM IPC Transport - Modern CUDA Virtual Memory Management IPC
10
+ #
11
+ # Uses cuMemExportToShareableHandle with CU_MEM_HANDLE_TYPE_WIN32 for
12
+ # cross-process GPU memory sharing on Windows. Required for sharing
13
+ # memory allocated with cudaMallocAsync.
14
+ #
15
+ # Workflow:
16
+ # 1. Sender: cuMemCreate -> cuMemExportToShareableHandle -> send HANDLE
17
+ # 2. Receiver: cuMemImportFromShareableHandle -> cuMemAddressReserve -> cuMemMap -> cuMemSetAccess
18
+ class VMMIPCTransport < Base
19
# Symbol that identifies this transport implementation.
#
# @return [Symbol] always :vmm_ipc
def self.transport_type
  :vmm_ipc
end
23
+
24
# Fixed bandwidth estimate reported for this transport.
#
# @return [Float] estimated bandwidth in GB/s
def estimated_bandwidth
  # PCIe-limited with IPC overhead
  25.0
end
28
+
29
# Fixed latency estimate reported for this transport.
#
# @return [Float] estimated latency in microseconds
def estimated_latency
  # IPC overhead higher than P2P
  10.0
end
33
+
34
# Prepare the transport for use: make sure the VMM bindings are loaded and
# create the two (empty) handle tables. A second call is a no-op, so handles
# that are already tracked are never discarded.
#
# @return [void]
def initialize!
  unless @initialized
    VMMBindings.ensure_loaded!

    # va_ptr => { alloc_handle:, win32_handle:, size: }
    @exported_handles = {}
    # win32_handle address => { alloc_handle:, va_ptr:, size: }
    @imported_handles = {}

    @initialized = true
  end
end
44
+
45
# Create a shareable VMM allocation for @src_device, map it into this process
# and export an OS handle that another process can import.
#
# Steps: query the minimum allocation granularity, round +size+ up to it,
# then cuMemCreate -> cuMemAddressReserve -> cuMemMap -> cuMemSetAccess
# (read/write for @src_device) -> cuMemExportToShareableHandle.
#
# If any step after cuMemCreate fails, everything acquired so far (mapping,
# address reservation, allocation handle) is released before the error is
# re-raised, so a failed call does not leak driver resources.
#
# @param size [Integer] Requested size in bytes (rounded up to the granularity)
# @return [Hash] {:device_ptr, :alloc_handle, :win32_handle, :size}; :size is
#   the rounded-up size and :device_ptr wraps the reserved virtual address
# @raise whatever VMMBindings.check_status! raises for a non-zero driver status
def create_shareable_allocation(size)
  ensure_initialized!

  prop = VMMBindings.create_allocation_prop(device_id: @src_device, shareable: true)

  # 0 = CU_MEM_ALLOC_GRANULARITY_MINIMUM
  granularity_ptr = FFI::MemoryPointer.new(:size_t)
  status = VMMBindings.cuMemGetAllocationGranularity(granularity_ptr, prop, 0)
  VMMBindings.check_status!(status, "Get allocation granularity")

  granularity = granularity_ptr.read_size_t
  aligned_size = ((size + granularity - 1) / granularity) * granularity

  # Track what has been acquired so the rescue below can undo exactly that.
  alloc_handle = nil
  va_ptr = nil
  mapped = false

  begin
    handle_ptr = FFI::MemoryPointer.new(:uint64)
    status = VMMBindings.cuMemCreate(handle_ptr, aligned_size, prop, 0)
    VMMBindings.check_status!(status, "VMM cuMemCreate")
    alloc_handle = handle_ptr.read_uint64

    va_ptr_ptr = FFI::MemoryPointer.new(:uint64)
    status = VMMBindings.cuMemAddressReserve(va_ptr_ptr, aligned_size, 0, 0, 0)
    VMMBindings.check_status!(status, "VMM cuMemAddressReserve")
    va_ptr = va_ptr_ptr.read_uint64

    status = VMMBindings.cuMemMap(va_ptr, aligned_size, 0, alloc_handle, 0)
    VMMBindings.check_status!(status, "VMM cuMemMap")
    mapped = true

    access_desc = VMMBindings.create_access_desc(device_id: @src_device, read_write: true)
    status = VMMBindings.cuMemSetAccess(va_ptr, aligned_size, access_desc, 1)
    VMMBindings.check_status!(status, "VMM cuMemSetAccess")

    # NOTE(review): create_allocation_prop requests a platform-dependent handle
    # type, while the export below always asks for CU_MEM_HANDLE_TYPE_WIN32 -
    # confirm this transport is only ever used on Windows.
    win32_handle_ptr = FFI::MemoryPointer.new(:pointer)
    status = VMMBindings.cuMemExportToShareableHandle(
      win32_handle_ptr,
      alloc_handle,
      VMMBindings::CU_MEM_HANDLE_TYPE_WIN32,
      0
    )
    VMMBindings.check_status!(status, "VMM cuMemExportToShareableHandle")
    win32_handle = win32_handle_ptr.read_pointer
  rescue StandardError
    # Roll back in reverse order of acquisition, then surface the original error.
    VMMBindings.cuMemUnmap(va_ptr, aligned_size) if mapped
    VMMBindings.cuMemAddressFree(va_ptr, aligned_size) if va_ptr
    VMMBindings.cuMemRelease(alloc_handle) if alloc_handle
    raise
  end

  # Remember the allocation so destroy! can release it later.
  @exported_handles[va_ptr] = {
    alloc_handle: alloc_handle,
    win32_handle: win32_handle,
    size: aligned_size
  }

  {
    device_ptr: FFI::Pointer.new(:uint8, va_ptr),
    alloc_handle: alloc_handle,
    win32_handle: win32_handle,
    size: aligned_size
  }
end
114
+
115
# Import an allocation exported by another process and map it for @dst_device.
#
# Steps: cuMemImportFromShareableHandle -> read the allocation's properties ->
# query the minimum granularity for those properties and round +size+ up to
# it -> cuMemAddressReserve -> cuMemMap -> cuMemSetAccess (read/write for
# @dst_device).
#
# The rounding matters because the reserve and map calls are given the same
# size; a caller that passes the already-aligned size returned by
# create_shareable_allocation is unaffected. If any step after the import
# fails, the mapping, reservation and imported handle acquired so far are
# released before the error is re-raised.
#
# @param win32_handle [FFI::Pointer] Windows HANDLE from sender
# @param size [Integer] Size in bytes (rounded up to the granularity)
# @return [FFI::Pointer] Pointer wrapping the virtual address of the mapping
# @raise whatever VMMBindings.check_status! raises for a non-zero driver status
def import_shareable_allocation(win32_handle, size)
  ensure_initialized!

  # Track what has been acquired so the rescue below can undo exactly that.
  alloc_handle = nil
  va_ptr = nil
  mapped = false
  mapped_size = size

  begin
    handle_ptr = FFI::MemoryPointer.new(:uint64)
    status = VMMBindings.cuMemImportFromShareableHandle(
      handle_ptr,
      win32_handle,
      VMMBindings::CU_MEM_HANDLE_TYPE_WIN32
    )
    VMMBindings.check_status!(status, "VMM cuMemImportFromShareableHandle")
    alloc_handle = handle_ptr.read_uint64

    prop = VMMBindings::CUmemAllocationProp.new
    status = VMMBindings.cuMemGetAllocationPropertiesFromHandle(prop, alloc_handle)
    VMMBindings.check_status!(status, "Get allocation properties")

    # 0 = CU_MEM_ALLOC_GRANULARITY_MINIMUM
    granularity_ptr = FFI::MemoryPointer.new(:size_t)
    status = VMMBindings.cuMemGetAllocationGranularity(granularity_ptr, prop, 0)
    VMMBindings.check_status!(status, "Get allocation granularity (import)")
    granularity = granularity_ptr.read_size_t
    mapped_size = ((size + granularity - 1) / granularity) * granularity

    va_ptr_ptr = FFI::MemoryPointer.new(:uint64)
    status = VMMBindings.cuMemAddressReserve(va_ptr_ptr, mapped_size, 0, 0, 0)
    VMMBindings.check_status!(status, "VMM cuMemAddressReserve (import)")
    va_ptr = va_ptr_ptr.read_uint64

    status = VMMBindings.cuMemMap(va_ptr, mapped_size, 0, alloc_handle, 0)
    VMMBindings.check_status!(status, "VMM cuMemMap (import)")
    mapped = true

    access_desc = VMMBindings.create_access_desc(device_id: @dst_device, read_write: true)
    status = VMMBindings.cuMemSetAccess(va_ptr, mapped_size, access_desc, 1)
    VMMBindings.check_status!(status, "VMM cuMemSetAccess (import)")
  rescue StandardError
    # Roll back in reverse order of acquisition, then surface the original error.
    VMMBindings.cuMemUnmap(va_ptr, mapped_size) if mapped
    VMMBindings.cuMemAddressFree(va_ptr, mapped_size) if va_ptr
    VMMBindings.cuMemRelease(alloc_handle) if alloc_handle
    raise
  end

  # Remember the mapping (with the size actually mapped) for later cleanup.
  @imported_handles[win32_handle.address] = {
    alloc_handle: alloc_handle,
    va_ptr: va_ptr,
    size: mapped_size
  }

  FFI::Pointer.new(:uint8, va_ptr)
end
163
+
164
# Close an imported allocation: unmap it, free its address range, release the
# imported allocation handle and stop tracking it.
#
# The entry is looked up by the mapped virtual address. Calling this for a
# pointer that is not tracked, or before initialize! has created the table,
# is a no-op. The driver status codes of the three calls are not checked.
#
# @param device_ptr [FFI::Pointer] The mapped device pointer
# @return [void]
def close_imported_handle(device_ptr)
  return if @imported_handles.nil?

  va_ptr = device_ptr.address

  # Locate the entry first, so the table is not modified while it is iterated.
  handle_addr, info = @imported_handles.find { |_, entry| entry[:va_ptr] == va_ptr }
  return if info.nil?

  VMMBindings.cuMemUnmap(va_ptr, info[:size])
  VMMBindings.cuMemAddressFree(va_ptr, info[:size])
  VMMBindings.cuMemRelease(info[:alloc_handle])

  @imported_handles.delete(handle_addr)
  nil
end
187
+
188
# Enqueue a device-to-device copy between two buffers.
#
# The original author's note: once both sides have mapped the same VMM
# allocation, a regular cudaMemcpyAsync is sufficient.
#
# @param dst_buffer [FFI::Pointer] Destination (on dst_device)
# @param src_buffer [FFI::Pointer] Source (on src_device)
# @param size [Integer] Size in bytes
# @param stream [FFI::Pointer] CUDA stream
# @return the result of CUDA::RuntimeAPI.check_status!
def copy_async(dst_buffer, src_buffer, size, stream)
  ensure_initialized!

  runtime = CUDA::RuntimeAPI
  runtime.ensure_loaded!

  copy_status = runtime.cudaMemcpyAsync(
    dst_buffer, src_buffer, size, runtime::MEMCPY_DEVICE_TO_DEVICE, stream
  )
  runtime.check_status!(copy_status, "VMM IPC copy")
end
209
+
210
# Async send - intentionally unsupported by this transport.
#
# Callers are expected to create a shareable allocation and copy explicitly.
# Note that NotImplementedError descends from ScriptError, so a bare
# `rescue` will not catch it.
#
# @param buffer [FFI::Pointer] Source buffer (unused)
# @param size [Integer] Size in bytes (unused)
# @param stream [FFI::Pointer] CUDA stream (unused)
# @raise [NotImplementedError] always
def send_async(buffer, size, stream)
  guidance = "Use create_shareable_allocation and copy_async"
  raise NotImplementedError, guidance
end
220
+
221
# Async receive - intentionally unsupported by this transport.
#
# Callers are expected to import the shareable allocation and copy explicitly.
# Note that NotImplementedError descends from ScriptError, so a bare
# `rescue` will not catch it.
#
# @param buffer [FFI::Pointer] Destination buffer (unused)
# @param size [Integer] Size (unused)
# @param stream [FFI::Pointer] CUDA stream (unused)
# @raise [NotImplementedError] always
def recv_async(buffer, size, stream)
  guidance = "Use import_shareable_allocation and copy_async"
  raise NotImplementedError, guidance
end
229
+
230
# Report whether the VMM bindings can be loaded in this process.
#
# Only LoadError is treated as "not available"; any other error raised by
# VMMBindings.ensure_loaded! propagates to the caller.
#
# @return [Boolean] true when the bindings loaded, false on LoadError
def self.available?
  VMMBindings.ensure_loaded!
  true
rescue LoadError
  false
end
240
+
241
# Release every exported and imported mapping and mark the transport as
# uninitialized.
#
# Cleanup is best-effort: each driver call is attempted even if an earlier
# one raised. The handle tables only exist after initialize!, so calling
# destroy! on a transport that was never initialized is a harmless no-op
# instead of a NoMethodError on nil.
#
# Exported Win32 handles are NOT closed here; that is left to the caller.
#
# @return [void]
def destroy!
  exported = @exported_handles || {}
  imported = @imported_handles || {}

  exported.each do |va_ptr, info|
    release_vmm_mapping(va_ptr, info[:size], info[:alloc_handle])
  end
  exported.clear

  imported.each_value do |info|
    release_vmm_mapping(info[:va_ptr], info[:size], info[:alloc_handle])
  end
  imported.clear

  @initialized = false
end

# Unmap, free and release one mapping, ignoring errors from individual steps
# so that the remaining steps still run.
private def release_vmm_mapping(va_ptr, size, alloc_handle)
  steps = [
    -> { VMMBindings.cuMemUnmap(va_ptr, size) },
    -> { VMMBindings.cuMemAddressFree(va_ptr, size) },
    -> { VMMBindings.cuMemRelease(alloc_handle) }
  ]
  steps.each do |step|
    begin
      step.call
    rescue StandardError
      nil
    end
  end
end
263
+ end
264
+ end
265
+ end
266
+ end
@@ -0,0 +1,200 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "topology"
4
+ require_relative "transport/base"
5
+ require_relative "transport/p2p_transport"
6
+ require_relative "transport/ipc_transport"
7
+
8
+ module Ignis
9
+ module Collective
10
+ # Automatic transport selector - NCCL-style
11
+ # Detects topology at init time and selects optimal transport for each GPU pair
12
+ class TransportSelector
13
+ # Transport types ranked by performance (highest first)
14
+ TRANSPORT_PRIORITY = [
15
+ :nvlink, # NVLink - 900 GB/s
16
+ :pcie_p2p, # PCIe P2P - 32 GB/s
17
+ :cuda_vmm_ipc, # cuMem VMM IPC - 25 GB/s
18
+ :cuda_ipc, # Legacy CUDA IPC - 20 GB/s
19
+ :host_staged, # Host staging - 12 GB/s
20
+ :rio_network, # Windows RIO - 100 Gbps
21
+ :tcp, # TCP fallback - variable
22
+ ].freeze
23
+
24
+ # Map interconnect types to transport classes
25
+ TRANSPORT_CLASSES = {
26
+ nvlink: Transport::P2PTransport,
27
+ pcie_p2p: Transport::P2PTransport,
28
+ cuda_ipc: Transport::IPCTransport,
29
+ cuda_vmm_ipc: Transport::IPCTransport,
30
+ host_staged: nil, # TODO: Implement SHMTransport
31
+ rio_network: nil, # TODO: Implement RIOTransport
32
+ tcp: nil, # TODO: Implement TCPTransport
33
+ }.freeze
34
+
35
+ # @return [Array<Integer>] GPU device IDs in this communicator
36
+ attr_reader :device_ids
37
+
38
+ # @return [Topology::Detector] Topology detector
39
+ attr_reader :topology
40
+
41
+ # @return [Hash<Array<Integer>, Transport::Base>] Transport matrix
42
+ attr_reader :transport_matrix
43
+
44
# Set up a selector for the given GPUs. Nothing is probed or created yet
# beyond the topology detector object; transports are built by initialize!.
#
# @param device_ids [Array<Integer>] GPU device IDs (copied and frozen)
def initialize(device_ids)
  frozen_ids = device_ids.dup.freeze

  @device_ids = frozen_ids
  @topology = Topology::Detector.new(device_ids: frozen_ids)
  @transport_matrix = {}
  @initialized = false
end
52
+
53
# Build the transport matrix and initialize every transport in it.
# Subsequent calls do nothing once the first one has completed.
#
# @return [void]
def initialize!
  unless @initialized
    build_transport_matrix!
    initialize_transports!
    @initialized = true
  end
end
62
+
63
# Look up the transport stored for an ordered GPU pair.
#
# @param src [Integer] Source GPU
# @param dst [Integer] Destination GPU
# @return [Transport::Base, nil] The stored transport; nil when src and dst
#   are the same GPU or when the pair has no entry in the matrix
def select_transport(src, dst)
  src == dst ? nil : @transport_matrix[[src, dst]]
end
72
+
73
# Report the type identifier of the transport chosen for a GPU pair.
#
# @param src [Integer] Source GPU
# @param dst [Integer] Destination GPU
# @return [Symbol, nil] The transport class's transport_type, or nil when
#   select_transport returns nil for the pair
def transport_type(src, dst)
  chosen = select_transport(src, dst)
  chosen.nil? ? nil : chosen.class.transport_type
end
81
+
82
# Ring ordering as computed by the topology detector; this method only
# forwards the request.
#
# @return [Array<Integer>] Ordered GPU IDs for ring algorithm
def optimal_ring_order
  @topology.optimal_ring_order
end
87
+
88
# Tell whether the selector has been initialized and every transport in the
# matrix reports ready.
#
# @return [Boolean] True if initialized and all transports are ready
#   (the falsy value of @initialized is returned as-is when not initialized)
def ready?
  @initialized && @transport_matrix.each_value.all?(&:ready?)
end
93
+
94
# Summarize the transport matrix for logging.
#
# Each transport is counted in exactly one bucket: P2P transports whose
# interconnect_type is :nvlink count as NVLink, other P2P transports as P2P,
# IPC transports as IPC, and anything else as staged.
#
# The average bandwidth is always computed in floating point, so transports
# that report Integer bandwidths no longer get a truncated average. An empty
# matrix yields 0.0.
#
# Note that :total_paths is derived from the number of devices
# (n * (n - 1)), not from the number of entries in the matrix.
#
# @return [Hash] :total_paths, :nvlink_paths, :p2p_paths, :ipc_paths,
#   :staged_paths and :avg_bandwidth_gbps (rounded to one decimal)
def performance_summary
  transports = @transport_matrix.values

  counts = Hash.new(0)
  transports.each do |transport|
    bucket =
      case transport
      when Transport::P2PTransport
        transport.interconnect_type == :nvlink ? :nvlink : :p2p
      when Transport::IPCTransport
        :ipc
      else
        :staged
      end
    counts[bucket] += 1
  end

  gpu_count = @device_ids.size
  avg_bandwidth =
    if transports.empty?
      0.0
    else
      # Seed with 0.0 so Integer bandwidths are averaged without truncation.
      transports.sum(0.0, &:estimated_bandwidth) / transports.size
    end

  {
    total_paths: gpu_count * (gpu_count - 1),
    nvlink_paths: counts[:nvlink],
    p2p_paths: counts[:p2p],
    ipc_paths: counts[:ipc],
    staged_paths: counts[:staged],
    avg_bandwidth_gbps: avg_bandwidth.round(1),
  }
end
129
+
130
# Destroy every transport, empty the matrix and mark the selector as
# uninitialized.
#
# Every transport gets its destroy! call even when an earlier one raises; the
# matrix is cleared and the flag reset in all cases. The first error that
# occurred (if any) is re-raised afterwards so failures are still visible.
#
# @return [void]
# @raise [StandardError] the first error raised by a transport's destroy!
def destroy!
  first_error = nil

  @transport_matrix.each_value do |transport|
    begin
      transport.destroy!
    rescue StandardError => e
      first_error ||= e
    end
  end

  @transport_matrix.clear
  @initialized = false

  raise first_error if first_error

  nil
end
137
+
138
# One-line description built from performance_summary: GPU count, the number
# of NVLink / P2P / IPC paths and the average bandwidth.
#
# @return [String] Human-readable summary
def to_s
  stats = performance_summary
  path_counts = "#{stats[:nvlink_paths]} NVLink, #{stats[:p2p_paths]} P2P, #{stats[:ipc_paths]} IPC"

  "TransportSelector[#{@device_ids.size} GPUs]: #{path_counts} (avg #{stats[:avg_bandwidth_gbps]} GB/s)"
end
145
+
146
+ private
147
+
148
# Fill @transport_matrix with one transport per ordered pair of distinct
# devices, keyed by [src, dst]. Pairs are visited source-major, in the order
# the IDs appear in @device_ids.
#
# @return [Array<Integer>] @device_ids
def build_transport_matrix!
  @device_ids.product(@device_ids).each do |src, dst|
    @transport_matrix[[src, dst]] = create_transport(src, dst) unless src == dst
  end

  @device_ids
end
158
+
159
# Instantiate the transport for one ordered GPU pair, based on the
# interconnect type the topology matrix reports for it.
#
# :nvlink and :pcie_p2p paths get a P2PTransport tagged with that
# interconnect type. Everything else - :cuda_ipc, :cuda_vmm_ipc, a missing
# path (treated as :host_staged) and any other type - gets an IPCTransport,
# which the original code uses as its fallback.
#
# @param src [Integer] Source GPU
# @param dst [Integer] Destination GPU
# @return [Transport::Base] Transport instance
def create_transport(src, dst)
  route = @topology.matrix.path(src, dst)
  link = (route.nil? ? nil : route.interconnect_type) || :host_staged

  if %i[nvlink pcie_p2p].include?(link)
    Transport::P2PTransport.new(src_device: src, dst_device: dst, interconnect_type: link)
  else
    Transport::IPCTransport.new(src_device: src, dst_device: dst)
  end
end
193
+
194
# Call initialize! on every transport currently stored in the matrix.
#
# @return [Hash] @transport_matrix
def initialize_transports!
  @transport_matrix.each_value { |transport| transport.initialize! }
end
198
+ end
199
+ end
200
+ end
@@ -0,0 +1,212 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ffi"
4
+
5
+ module Ignis
6
+ module Collective
7
+ # cuMem VMM (Virtual Memory Management) API bindings for modern IPC
8
+ # Required for sharing cudaMallocAsync allocations on Windows
9
+ # Uses cuMemExportToShareableHandle with CU_MEM_HANDLE_TYPE_WIN32
10
+ module VMMBindings
11
+ extend FFI::Library
12
+
13
+ # Handle types for shareable handles
14
+ CU_MEM_HANDLE_TYPE_NONE = 0
15
+ CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 1
16
+ CU_MEM_HANDLE_TYPE_WIN32 = 2
17
+ CU_MEM_HANDLE_TYPE_WIN32_KMT = 4
18
+
19
+ # Memory allocation types
20
+ CU_MEM_ALLOCATION_TYPE_INVALID = 0
21
+ CU_MEM_ALLOCATION_TYPE_PINNED = 1 # Pinned memory, can be shared
22
+ CU_MEM_ALLOCATION_TYPE_MAX = 0xFFFFFFFF
23
+
24
+ # Allocation location types
25
+ CU_MEM_LOCATION_TYPE_INVALID = 0
26
+ CU_MEM_LOCATION_TYPE_DEVICE = 1
27
+ CU_MEM_LOCATION_TYPE_HOST = 2
28
+ CU_MEM_LOCATION_TYPE_HOST_NUMA = 3
29
+ CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT = 4
30
+ CU_MEM_LOCATION_TYPE_MAX = 0xFFFFFFFF
31
+
32
+ # Memory access flags
33
+ CU_MEM_ACCESS_FLAGS_PROT_NONE = 0
34
+ CU_MEM_ACCESS_FLAGS_PROT_READ = 1
35
+ CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 3
36
+
37
+ # CUmemLocation structure: a location-type tag (:type) plus an id (:id); embedded by value in CUmemAllocationProp and CUmemAccessDesc
38
+ class CUmemLocation < FFI::Struct
39
+ layout :type, :int, # CU_MEM_LOCATION_TYPE_*
40
+ :id, :int # Device ordinal for DEVICE type
41
+ end
42
+
43
+ # CUmemAllocationProp structure for cuMemCreate; instances are also passed to cuMemGetAllocationGranularity and filled in by cuMemGetAllocationPropertiesFromHandle. NOTE(review): allocFlags is declared here as a single uint16 followed by 14 reserved bytes - confirm the field layout and total size against the CUmemAllocationProp definition in the installed cuda.h
44
+ class CUmemAllocationProp < FFI::Struct
45
+ layout :type, :int, # CU_MEM_ALLOCATION_TYPE_*
46
+ :requestedHandleTypes, :int, # OR of CU_MEM_HANDLE_TYPE_*
47
+ :location, CUmemLocation, # Memory location
48
+ :win32HandleMetaData, :pointer, # LPSECURITY_ATTRIBUTES for Win32
49
+ :allocFlags, :uint16, # Reserved flags
50
+ :reserved, [:uint8, 14] # Padding
51
+ end
52
+
53
+ # CUmemAccessDesc structure for cuMemSetAccess: the location that is granted access (:location) and the CU_MEM_ACCESS_FLAGS_* protection to apply (:flags)
54
+ class CUmemAccessDesc < FFI::Struct
55
+ layout :location, CUmemLocation, # Target device/host
56
+ :flags, :int # CU_MEM_ACCESS_FLAGS_*
57
+ end
58
+
59
+ # CUmemGenericAllocationHandle (unsigned long long)
60
+ typedef :uint64, :CUmemGenericAllocationHandle
61
+
62
# Load the CUDA driver library and attach the VMM functions, once.
#
# The preferred library name comes from Ignis::Platform when that constant is
# defined (its find_cuda_lib result, or a per-OS default); otherwise it is
# chosen from RUBY_PLATFORM. "nvcuda" and "libcuda" are always passed to
# ffi_lib as additional candidates.
#
# @return [void]
# @raise [LoadError] when a VMM function cannot be found in the library
#   (FFI::NotFoundError is converted to LoadError)
def self.ensure_loaded!
  return if @loaded

  driver_lib =
    if defined?(Ignis::Platform)
      platform = Ignis::Platform
      platform.find_cuda_lib(:cuda_driver) || (platform.windows? ? 'nvcuda' : 'libcuda')
    elsif RUBY_PLATFORM =~ /mswin|mingw|cygwin/
      'nvcuda'
    else
      'libcuda'
    end

  ffi_lib [driver_lib, "nvcuda", "libcuda"]
  attach_vmm_functions!
  @loaded = true
rescue FFI::NotFoundError => e
  raise LoadError, "Could not load CUDA driver library: #{e.message}"
end
79
+
80
# Attach the cuMem* VMM driver entry points used by this module. Every
# function returns an :int driver status code. The table below lists each
# function with its argument types, in the order they are attached.
#
# @return the result of the last attach_function call
def self.attach_vmm_functions!
  signatures = [
    # (CUmemGenericAllocationHandle* handle, size, const CUmemAllocationProp* prop, flags = 0)
    [:cuMemCreate, %i[pointer size_t pointer uint64]],
    # (size_t* granularity, const CUmemAllocationProp* prop, CUmemAllocationGranularity_flags)
    [:cuMemGetAllocationGranularity, %i[pointer pointer int]],
    # (CUmemGenericAllocationHandle handle)
    [:cuMemRelease, %i[uint64]],
    # (void* shareableHandle (HANDLE* or int*), handle, CUmemAllocationHandleType, flags = 0)
    [:cuMemExportToShareableHandle, %i[pointer uint64 int uint64]],
    # (CUmemGenericAllocationHandle* handle, void* osHandle (HANDLE or int), CUmemAllocationHandleType)
    [:cuMemImportFromShareableHandle, %i[pointer pointer int]],
    # (CUdeviceptr* ptr, size, alignment (0 = default), addr (0 = any), flags = 0)
    [:cuMemAddressReserve, %i[pointer size_t size_t uint64 uint64]],
    # (CUdeviceptr ptr, size)
    [:cuMemAddressFree, %i[uint64 size_t]],
    # (CUdeviceptr ptr, size, offset, CUmemGenericAllocationHandle handle, flags = 0)
    [:cuMemMap, %i[uint64 size_t size_t uint64 uint64]],
    # (CUdeviceptr ptr, size)
    [:cuMemUnmap, %i[uint64 size_t]],
    # (CUdeviceptr ptr, size, const CUmemAccessDesc* desc, count)
    [:cuMemSetAccess, %i[uint64 size_t pointer size_t]],
    # (CUmemAllocationProp* prop, CUmemGenericAllocationHandle handle)
    [:cuMemGetAllocationPropertiesFromHandle, %i[pointer uint64]]
  ]

  signatures.map { |name, arg_types| attach_function(name, arg_types, :int) }.last
end
160
+
161
# Turn a non-zero CUDA driver status into an exception.
#
# The numeric driver code is reported as-is: this module does not attach
# cuGetErrorName, so no symbolic error name is looked up.
#
# @param status [Integer] CUDA driver error code (0 = CUDA_SUCCESS)
# @param context [String] Operation description, used as the message prefix
# @return [nil] when status is zero
# @raise [CudaRuntimeError] when status is non-zero; carries cuda_code: status
def self.check_status!(status, context = "VMM operation")
  unless status.zero?
    raise CudaRuntimeError.new("#{context}: CUDA driver error #{status}", cuda_code: status)
  end
end
172
+
173
# Helper: build the allocation properties for pinned device memory.
#
# When +shareable+ is true the requested handle type depends on the platform:
# Win32 on Windows, POSIX file descriptor elsewhere. Windows is detected via
# Ignis::Platform.windows? when that constant is defined, otherwise from
# RUBY_PLATFORM. When +shareable+ is false no handle type is requested.
#
# @param device_id [Integer] Target GPU device
# @param shareable [Boolean] Whether to allow IPC sharing
# @return [CUmemAllocationProp] Allocation properties
def self.create_allocation_prop(device_id:, shareable: true)
  CUmemAllocationProp.new.tap do |prop|
    prop[:type] = CU_MEM_ALLOCATION_TYPE_PINNED

    prop[:requestedHandleTypes] =
      if !shareable
        0
      elsif defined?(Ignis::Platform) ? Ignis::Platform.windows? : RUBY_PLATFORM =~ /mswin|mingw|cygwin/
        CU_MEM_HANDLE_TYPE_WIN32
      else
        CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
      end

    prop[:location][:type] = CU_MEM_LOCATION_TYPE_DEVICE
    prop[:location][:id] = device_id
    prop[:win32HandleMetaData] = FFI::Pointer::NULL
    prop[:allocFlags] = 0
  end
end
198
+
199
# Helper: build an access descriptor that targets one device.
#
# @param device_id [Integer] Target device
# @param read_write [Boolean] true grants read-write access, false read-only
# @return [CUmemAccessDesc] Access descriptor
def self.create_access_desc(device_id:, read_write: true)
  protection = read_write ? CU_MEM_ACCESS_FLAGS_PROT_READWRITE : CU_MEM_ACCESS_FLAGS_PROT_READ

  CUmemAccessDesc.new.tap do |desc|
    desc[:location][:type] = CU_MEM_LOCATION_TYPE_DEVICE
    desc[:location][:id] = device_id
    desc[:flags] = protection
  end
end
210
+ end
211
+ end
212
+ end