ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,366 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "nd_bindings"
4
+ require_relative "nd_adapter"
5
+ require_relative "../transport/base"
6
+
7
+ module Ignis
8
+ module Collective
9
+ module NetworkDirect
10
+ # RDMA Transport for multi-node GPU communication
11
+ #
12
+ # Uses Windows NetworkDirect for kernel-bypass transfers; GPU buffers are staged through pinned host memory.
13
+ # Integrates with CUDA for GPU memory registration.
14
+ #
15
+ # Workflow:
16
+ # 1. Discover RDMA adapters
17
+ # 2. Create queue pairs and completion queues
18
+ # 3. Register GPU memory for RDMA
19
+ # 4. Connect to remote peers
20
+ # 5. Perform RDMA Read/Write operations
21
+ class RDMATransport < Transport::Base
22
# Identifier under which this transport class is known.
# @return [Symbol] always :rdma
def self.transport_type
  kind = :rdma
  kind
end
26
+
27
# Nominal throughput figure reported for this transport.
#
# NOTE(review): the value mirrors a 100 Gbps Mellanox ConnectX-6 link rate,
# while the documented unit is GB/s (100 Gbps is 12.5 GB/s) - confirm which
# unit callers expect.
# @return [Float] Estimated bandwidth (GB/s)
def estimated_bandwidth
  nominal_rate = 100.0
  nominal_rate
end
31
+
32
# Nominal one-way latency figure reported for this transport
# (RDMA latency is typically below 2 microseconds).
# @return [Float] Estimated latency (microseconds)
def estimated_latency
  nominal_latency_us = 1.5
  nominal_latency_us
end
36
+
37
# Build an unconnected transport. No RDMA resources are created here;
# that happens in #initialize!.
# @param local_address [String, nil] Local IP for RDMA bind
# @param local_port [Integer, nil] Local port (0 when omitted)
# @param opts [Hash] Remaining options, forwarded to the base transport
def initialize(local_address: nil, local_port: nil, **opts)
  super(**opts)

  @local_address = local_address
  @local_port = local_port || 0

  # RDMA handles, all populated by #initialize!
  @adapter = @send_cq = @recv_cq = @qp = @connector = nil

  # gpu_ptr address -> registration record (memory region + staging buffer)
  @memory_regions = {}
end
51
+
52
# Initialize RDMA resources: start NetworkDirect, open the first adapter,
# create the send/receive completion queues, the queue pair and the
# connector, and bind the connector when a local address was given.
#
# Calling this on an already initialized transport is a no-op. If any step
# after a successful NdStartup fails, everything created so far is torn
# down through #destroy! before the original error is re-raised, so a
# failed initialization does not leak handles or leave NetworkDirect
# started.
# @return [void]
# @raise [RDMAError] If NetworkDirect is not available
def initialize!
  return if @initialized

  Bindings.ensure_loaded!

  unless Bindings.available?
    raise RDMAError, "NetworkDirect not available: #{Bindings.load_error}"
  end

  # Start NetworkDirect
  status = Bindings.NdStartup(2) # Version 2
  Bindings.check_status!(status, "NdStartup")

  begin
    # Open first available adapter
    @adapter = open_adapter

    # Create completion queues
    @send_cq = @adapter.create_completion_queue(depth: 256)
    @recv_cq = @adapter.create_completion_queue(depth: 256)

    # Create queue pair
    @qp = @adapter.create_queue_pair(
      send_cq: @send_cq,
      recv_cq: @recv_cq,
      send_depth: 64,
      recv_depth: 64,
      sge_count: 4
    )

    # Create connector
    @connector = @adapter.create_connector

    # Bind to local address
    @connector.bind(address: @local_address, port: @local_port) if @local_address
  rescue StandardError => e
    # Best-effort teardown; the setup failure is the error worth reporting.
    begin
      destroy!
    rescue StandardError
      nil
    end
    raise e
  end

  @initialized = true
end
93
+
94
# Connect to remote peer (client mode) using this transport's queue pair.
# @param remote_address [String] Remote IP address
# @param remote_port [Integer] Remote port
# @param private_data [String, nil] Connection private data
# @return [void]
def connect(remote_address:, remote_port:, private_data: nil)
  ensure_initialized!

  request = {
    qp: @qp,
    remote_address: remote_address,
    remote_port: remote_port,
    private_data: private_data
  }
  @connector.connect(**request)
end
109
+
110
# Accept incoming connection (server mode) on this transport's queue pair.
# @param private_data [String, nil] Response private data
# @return [void]
def accept(private_data: nil)
  ensure_initialized!

  reply = { qp: @qp, private_data: private_data }
  @connector.accept(**reply)
end
118
+
119
# Register GPU memory for RDMA.
#
# The GPU buffer itself is not registered with the adapter. A pinned host
# buffer of the same size is allocated and registered instead, and the
# transfer methods stage data through it. (Mapping the GPU allocation via
# cuMemExportToShareableHandle would be the alternative; staging is the
# more portable option.)
#
# The registration record is keyed by gpu_ptr.address, which is resolved
# before anything is allocated. If adapter registration fails, the pinned
# buffer is released before the error propagates.
#
# NOTE(review): registering the same gpu_ptr twice replaces the earlier
# record without deregistering it - callers should deregister first.
# @param gpu_ptr [FFI::Pointer] GPU device pointer
# @param size [Integer] Size in bytes
# @return [Hash] Remote access info {:address, :token, :size}
def register_gpu_memory(gpu_ptr, size)
  ensure_initialized!

  key = gpu_ptr.address
  host_ptr = allocate_pinned_host(size)

  begin
    # Register the host buffer with RDMA
    mr = @adapter.register_memory(host_ptr, size)
  rescue StandardError
    free_pinned_host(host_ptr)
    raise
  end

  @memory_regions[key] = {
    memory_region: mr,
    host_buffer: host_ptr,
    size: size,
    gpu_ptr: gpu_ptr
  }

  mr.remote_access_info
end
144
+
145
# Deregister GPU memory: drop the registration record for this pointer,
# deregister its memory region and release the pinned staging buffer.
# Does nothing when the pointer was never registered.
# @param gpu_ptr [FFI::Pointer] GPU device pointer
# @return [void]
def deregister_gpu_memory(gpu_ptr)
  record = @memory_regions.delete(gpu_ptr.address)

  if record
    record[:memory_region].deregister!
    free_pinned_host(record[:host_buffer])
  end
end
155
+
156
# RDMA Send (two-sided).
#
# Copies `size` bytes from the GPU buffer into its pinned staging buffer,
# posts a send for the staging buffer and blocks until the send completion
# queue reports a completion.
#
# The staging buffer was allocated with the size given at registration, so
# a larger transfer is rejected before any copy is attempted.
#
# NOTE(review): the cudaMemcpy return value is not checked here - confirm
# whether the RuntimeAPI wrapper raises on failure.
# @param buffer [FFI::Pointer] Source buffer (GPU), previously registered
# @param size [Integer] Size in bytes
# @return [void]
# @raise [RDMAError] If the buffer is not registered, the size exceeds the
#   registered size, or the completion poll times out
def send(buffer, size)
  ensure_initialized!

  info = @memory_regions[buffer.address]
  raise RDMAError, "Buffer not registered" unless info

  if size > info[:size]
    raise RDMAError, "Transfer size #{size} exceeds registered size #{info[:size]}"
  end

  # Copy GPU -> Host (staging)
  CUDA::RuntimeAPI.cudaMemcpy(
    info[:host_buffer],
    buffer,
    size,
    CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST
  )

  # Post RDMA send
  @qp.post_send(
    sge_list: [{
      buffer: info[:host_buffer],
      length: size,
      token: info[:memory_region].token
    }]
  )

  # Wait for completion
  poll_completion(@send_cq)
end
186
+
187
# RDMA Receive (two-sided).
#
# Posts a receive into the pinned staging buffer of the registered GPU
# buffer, blocks until the receive completion queue reports a completion,
# then copies `size` bytes from the staging buffer to the GPU buffer.
#
# The staging buffer was allocated with the size given at registration, so
# a larger transfer is rejected before anything is posted.
#
# NOTE(review): the cudaMemcpy return value is not checked here - confirm
# whether the RuntimeAPI wrapper raises on failure.
# @param buffer [FFI::Pointer] Destination buffer (GPU), previously registered
# @param size [Integer] Size in bytes
# @return [void]
# @raise [RDMAError] If the buffer is not registered, the size exceeds the
#   registered size, or the completion poll times out
def receive(buffer, size)
  ensure_initialized!

  info = @memory_regions[buffer.address]
  raise RDMAError, "Buffer not registered" unless info

  if size > info[:size]
    raise RDMAError, "Transfer size #{size} exceeds registered size #{info[:size]}"
  end

  # Post RDMA receive
  @qp.post_receive(
    sge_list: [{
      buffer: info[:host_buffer],
      length: size,
      token: info[:memory_region].token
    }]
  )

  # Wait for completion
  poll_completion(@recv_cq)

  # Copy Host -> GPU
  CUDA::RuntimeAPI.cudaMemcpy(
    buffer,
    info[:host_buffer],
    size,
    CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE
  )
end
217
+
218
# RDMA Write (one-sided).
#
# Copies `size` bytes from the local GPU buffer into its pinned staging
# buffer, issues an RDMA write of the staging buffer to the remote
# address/token and blocks until the send completion queue reports a
# completion.
#
# The staging buffer was allocated with the size given at registration, so
# a larger transfer is rejected before any copy is attempted.
#
# NOTE(review): the cudaMemcpy return value is not checked here - confirm
# whether the RuntimeAPI wrapper raises on failure.
# @param local_buffer [FFI::Pointer] Local GPU buffer, previously registered
# @param remote_address [Integer] Remote buffer address
# @param remote_token [Integer] Remote memory token
# @param size [Integer] Size in bytes
# @return [void]
# @raise [RDMAError] If the buffer is not registered, the size exceeds the
#   registered size, or the completion poll times out
def rdma_write(local_buffer:, remote_address:, remote_token:, size:)
  ensure_initialized!

  info = @memory_regions[local_buffer.address]
  raise RDMAError, "Buffer not registered" unless info

  if size > info[:size]
    raise RDMAError, "Transfer size #{size} exceeds registered size #{info[:size]}"
  end

  # Stage to host
  CUDA::RuntimeAPI.cudaMemcpy(
    info[:host_buffer],
    local_buffer,
    size,
    CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST
  )

  # RDMA Write
  @qp.rdma_write(
    remote_address: remote_address,
    remote_token: remote_token,
    sge_list: [{
      buffer: info[:host_buffer],
      length: size,
      token: info[:memory_region].token
    }]
  )

  poll_completion(@send_cq)
end
251
+
252
# RDMA Read (one-sided).
#
# Issues an RDMA read from the remote address/token into the pinned staging
# buffer of the local GPU buffer, blocks until the send completion queue
# reports a completion, then copies `size` bytes from the staging buffer to
# the GPU buffer.
#
# The staging buffer was allocated with the size given at registration, so
# a larger transfer is rejected before anything is posted.
#
# NOTE(review): the cudaMemcpy return value is not checked here - confirm
# whether the RuntimeAPI wrapper raises on failure.
# @param local_buffer [FFI::Pointer] Local GPU buffer, previously registered
# @param remote_address [Integer] Remote buffer address
# @param remote_token [Integer] Remote memory token
# @param size [Integer] Size in bytes
# @return [void]
# @raise [RDMAError] If the buffer is not registered, the size exceeds the
#   registered size, or the completion poll times out
def rdma_read(local_buffer:, remote_address:, remote_token:, size:)
  ensure_initialized!

  info = @memory_regions[local_buffer.address]
  raise RDMAError, "Buffer not registered" unless info

  if size > info[:size]
    raise RDMAError, "Transfer size #{size} exceeds registered size #{info[:size]}"
  end

  # RDMA Read
  @qp.rdma_read(
    remote_address: remote_address,
    remote_token: remote_token,
    sge_list: [{
      buffer: info[:host_buffer],
      length: size,
      token: info[:memory_region].token
    }]
  )

  poll_completion(@send_cq)

  # Stage to GPU
  CUDA::RuntimeAPI.cudaMemcpy(
    local_buffer,
    info[:host_buffer],
    size,
    CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE
  )
end
285
+
286
# Check if RDMA is available. Delegates to the NetworkDirect bindings'
# own availability check.
# @return [Boolean] Whatever Bindings.available? reports
def self.available?
  bindings_ready = Bindings.available?
  bindings_ready
end
291
+
292
# Cleanup: deregister every registered buffer, close the connector, queue
# pair, completion queues and adapter, and shut NetworkDirect down.
#
# Every resource is attempted even if an earlier one fails to close, and
# each handle is cleared afterwards so calling destroy! twice does not
# close anything a second time. Memory deregistration and NdCleanup stay
# best-effort (their errors are ignored); the first close! failure, if any,
# is re-raised once all cleanup has been attempted.
# @return [void]
def destroy!
  first_error = nil

  # Work on a snapshot: deregister_gpu_memory deletes from @memory_regions.
  @memory_regions.values.each do |info|
    begin
      deregister_gpu_memory(info[:gpu_ptr])
    rescue StandardError
      nil
    end
  end

  # Same order as before: connector, queue pair, CQs, then the adapter.
  %i[@connector @qp @send_cq @recv_cq @adapter].each do |ivar|
    handle = instance_variable_get(ivar)
    next unless handle

    begin
      handle.close!
    rescue StandardError => e
      first_error ||= e
    ensure
      instance_variable_set(ivar, nil)
    end
  end

  begin
    Bindings.NdCleanup
  rescue StandardError
    nil
  end

  @initialized = false
  raise first_error if first_error

  @initialized
end
311
+
312
+ private
313
+
314
# Open the first RDMA adapter reported by NetworkDirect.
#
# NdQueryAddressList is called twice: once with a NULL buffer to learn the
# required byte size, then again with a buffer of that size. The status of
# the sizing call is deliberately not checked. A reported size of zero
# means there is nothing to open, so that case fails with an explicit
# error instead of handing an empty list to NdOpenAdapter.
#
# NOTE(review): the size is exchanged through a 32-bit slot - confirm this
# matches the width the NdQueryAddressList binding expects on 64-bit
# builds.
# @return [Adapter] Wrapper around the opened adapter, labelled "RDMA0"
# @raise [RDMAError] If no RDMA addresses are reported
def open_adapter
  # Query available adapters
  address_list_size = FFI::MemoryPointer.new(:uint32)
  address_list_size.write_uint32(0)

  # Get size first
  Bindings.NdQueryAddressList(0, FFI::Pointer::NULL, address_list_size)

  size = address_list_size.read_uint32
  raise RDMAError, "NdQueryAddressList reported no RDMA addresses" if size.zero?

  # Allocate and query
  address_list = FFI::MemoryPointer.new(:uint8, size)
  status = Bindings.NdQueryAddressList(0, address_list, address_list_size)
  Bindings.check_status!(status, "NdQueryAddressList")

  # Open first adapter
  adapter_ptr = FFI::MemoryPointer.new(:pointer)
  status = Bindings.NdOpenAdapter(address_list, size, adapter_ptr)
  Bindings.check_status!(status, "NdOpenAdapter")

  Adapter.new(adapter_ptr.read_pointer, "RDMA0")
end
335
+
336
# Allocate a pinned (page-locked) host buffer through cudaHostAlloc.
#
# NOTE(review): the flag value passed is 1. The CUDA runtime defines
# cudaHostAllocDefault as 0 and cudaHostAllocPortable as 1, so this
# appears to request portable pinned memory - confirm that is intended.
# @param size [Integer] Size in bytes
# @return [FFI::Pointer] Pointer written by cudaHostAlloc
def allocate_pinned_host(size)
  out_slot = FFI::MemoryPointer.new(:pointer)
  alloc_flags = 1
  rc = CUDA::RuntimeAPI.cudaHostAlloc(out_slot, size, alloc_flags)
  CUDA::RuntimeAPI.check_status!(rc, "cudaHostAlloc for RDMA")
  out_slot.read_pointer
end
342
+
343
# Release a pinned host buffer via cudaFreeHost. Best-effort: any
# StandardError raised by the call is swallowed and nil is returned.
# @param ptr [FFI::Pointer] Buffer obtained from allocate_pinned_host
# @return [Object, nil] Result of cudaFreeHost, or nil if it raised
def free_pinned_host(ptr)
  begin
    CUDA::RuntimeAPI.cudaFreeHost(ptr)
  rescue StandardError
    nil
  end
end
346
+
347
# Poll a completion queue until it yields a result or the timeout expires.
# Elapsed time is measured on the monotonic clock, and the loop sleeps
# 0.1 ms between polls to avoid a busy-wait.
# @param cq [#poll] Completion queue; polled with max_results: 1
# @param timeout_ms [Numeric] Maximum time to wait, in milliseconds
# @return [Object] First completion returned by the queue
# @raise [RDMAError] If nothing completes within timeout_ms
def poll_completion(cq, timeout_ms: 5000)
  began_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  until (completions = cq.poll(max_results: 1)).any?
    waited_ms = (Process.clock_gettime(Process::CLOCK_MONOTONIC) - began_at) * 1000
    raise RDMAError, "Completion poll timeout" if waited_ms > timeout_ms

    # Brief sleep to avoid busy-wait
    sleep(0.0001)
  end

  completions.first
end
363
+ end
364
+ end
365
+ end
366
+ end
@@ -0,0 +1,230 @@
1
+ # frozen_string_literal: true
2
+
3
module Ignis
  module Collective
    # Adapter for seamless NvArray integration with collective operations.
    # Handles dtype detection, shape validation, and buffer extraction.
    #
    # Inputs are duck-typed: anything exposing data_ptr, device_ptr or
    # pointer is treated as a tensor, and raw Fiddle/FFI pointers are
    # accepted when those libraries are loaded.
    #
    # @example Direct tensor usage
    #   tensors = gpus.map { |gpu| NvArray.zeros([1000, 128], device: gpu) }
    #   comm.all_reduce(tensors)  # NvArrayAdapter handles conversion
    #
    module NvArrayAdapter
      # Supported dtypes and their byte sizes
      DTYPE_SIZES = {
        float32: 4,
        float64: 8,
        float16: 2,
        bfloat16: 2,
        int32: 4,
        int64: 8,
        int16: 2,
        int8: 1,
        uint8: 1,
        uint32: 4,
        uint64: 8
      }.freeze

      # CUDA dtype codes for kernel dispatch
      DTYPE_CUDA_CODES = {
        float32: 0,
        float64: 1,
        float16: 2,
        bfloat16: 3,
        int32: 4,
        int64: 5,
        int16: 6,
        int8: 7,
        uint8: 8,
        uint32: 9,
        uint64: 10
      }.freeze

      class << self
        # Normalize input to array of NvArrays with validated properties
        #
        # @param inputs [Array<NvArray>, Array<FFI::Pointer>, NvArray] Input tensors
        # @param expected_count [Integer, nil] Expected number of tensors (optional)
        # @return [Array<NvArray>] Normalized tensor array
        # @raise [ArgumentError] If inputs are invalid
        def normalize(inputs, expected_count: nil)
          tensors = wrap_array(inputs)

          if expected_count && tensors.size != expected_count
            raise ArgumentError,
                  "Expected #{expected_count} tensors, got #{tensors.size}"
          end

          validate_tensors!(tensors)
          tensors
        end

        # Extract device pointers from tensors for raw operations.
        # Accessors are tried in the order data_ptr, device_ptr, pointer;
        # raw pointers are passed through unchanged.
        #
        # @param tensors [Array<NvArray>] Input tensors
        # @return [Array<FFI::Pointer>] Device pointers
        # @raise [ArgumentError] If an element exposes no pointer
        def extract_pointers(tensors)
          tensors.map do |t|
            if t.respond_to?(:data_ptr)
              t.data_ptr
            elsif t.respond_to?(:device_ptr)
              t.device_ptr
            elsif t.respond_to?(:pointer)
              t.pointer
            elsif raw_pointer?(t)
              t
            else
              raise ArgumentError, "Cannot extract pointer from #{t.class}"
            end
          end
        end

        # Get common dtype from tensor array
        #
        # @param tensors [Array<NvArray>] Input tensors
        # @return [Symbol] Common dtype (:float32 for an empty array)
        # @raise [ArgumentError] If dtypes don't match
        def common_dtype(tensors)
          return :float32 if tensors.empty?

          dtypes = tensors.map { |t| extract_dtype(t) }.uniq
          if dtypes.size > 1
            raise ArgumentError,
                  "All tensors must have same dtype, got: #{dtypes.join(', ')}"
          end

          dtypes.first
        end

        # Get element count from tensors (must match)
        #
        # @param tensors [Array<NvArray>] Input tensors
        # @return [Integer] Element count (0 for an empty array)
        # @raise [ArgumentError] If element counts don't match
        def common_element_count(tensors)
          return 0 if tensors.empty?

          counts = tensors.map { |t| extract_element_count(t) }.uniq
          if counts.size > 1
            raise ArgumentError,
                  "All tensors must have same element count, got: #{counts.join(', ')}"
          end

          counts.first
        end

        # Get byte size for a dtype
        #
        # @param dtype [Symbol] Data type
        # @return [Integer] Bytes per element
        # @raise [ArgumentError] If the dtype is not in DTYPE_SIZES
        def dtype_size(dtype)
          DTYPE_SIZES[dtype] || raise(ArgumentError, "Unknown dtype: #{dtype}")
        end

        # Get CUDA type code for kernel dispatch.
        # Unlike dtype_size, an unknown dtype does not raise here: it maps
        # to code 0, the same code as :float32.
        #
        # @param dtype [Symbol] Data type
        # @return [Integer] CUDA type code
        def dtype_cuda_code(dtype)
          DTYPE_CUDA_CODES[dtype] || 0
        end

        # Calculate total byte size for tensors
        #
        # @param tensors [Array<NvArray>] Input tensors
        # @return [Integer] Total bytes
        def total_byte_size(tensors)
          tensors.sum do |t|
            extract_element_count(t) * dtype_size(extract_dtype(t))
          end
        end

        # Validate shape compatibility for broadcast operations.
        # Objects without a shape compare as nil, so two shapeless objects
        # are considered compatible.
        #
        # @param src_tensor [NvArray] Source tensor
        # @param dst_tensors [Array<NvArray>] Destination tensors
        # @return [Boolean] True if compatible
        def broadcast_compatible?(src_tensor, dst_tensors)
          src_shape = extract_shape(src_tensor)
          dst_tensors.all? { |t| extract_shape(t) == src_shape }
        end

        # Create buffer info for collective ops.
        # The common dtype and element count are each computed once and
        # reused for the derived fields.
        #
        # @param tensors [Array<NvArray>] Input tensors
        # @return [Hash] Buffer information
        # @raise [ArgumentError] If pointers cannot be extracted, or dtypes
        #   or element counts are inconsistent or unknown
        def buffer_info(tensors)
          pointers = extract_pointers(tensors)
          dtype = common_dtype(tensors)
          element_count = common_element_count(tensors)

          {
            pointers: pointers,
            dtype: dtype,
            dtype_code: dtype_cuda_code(dtype),
            element_count: element_count,
            byte_size: element_count * dtype_size(dtype),
            tensor_count: tensors.size
          }
        end

        private

        # Wrap a single NvArray in an array; arrays pass through untouched.
        # The NvArray constant is only consulted when it is defined, so
        # unsupported input always surfaces as ArgumentError.
        def wrap_array(inputs)
          return inputs if inputs.is_a?(Array)
          return [inputs] if defined?(Ignis::NvArray) && inputs.is_a?(Ignis::NvArray)

          raise ArgumentError, "Expected Array or NvArray, got #{inputs.class}"
        end

        # Ensure every element exposes a pointer and that dtypes and element
        # counts agree across the array.
        def validate_tensors!(tensors)
          return if tensors.empty?

          # All tensors must have extractable properties
          tensors.each_with_index do |t, i|
            next if pointer_source?(t)

            raise ArgumentError, "Tensor #{i} has no data_ptr/device_ptr method"
          end

          # Check dtype consistency
          common_dtype(tensors)

          # Check element count consistency
          common_element_count(tensors)
        end

        # True when extract_pointers can obtain a pointer from the object.
        def pointer_source?(obj)
          obj.respond_to?(:data_ptr) || obj.respond_to?(:device_ptr) ||
            obj.respond_to?(:pointer) || raw_pointer?(obj)
        end

        # True for raw Fiddle/FFI pointers. Each library is only consulted
        # when it has been loaded, so this never raises NameError.
        def raw_pointer?(obj)
          (defined?(Fiddle::Pointer) && obj.is_a?(Fiddle::Pointer)) ||
            (defined?(FFI::Pointer) && obj.is_a?(FFI::Pointer)) ||
            false
        end

        # dtype reported by the tensor, or :float32 when it has none
        # (the default assumption for raw pointers).
        def extract_dtype(tensor)
          tensor.respond_to?(:dtype) ? tensor.dtype : :float32
        end

        # Element count taken from numel, then the product of shape, then
        # size, in that order of preference.
        # NOTE(review): for raw pointers `size` is presumably a byte count
        # rather than an element count - confirm against callers.
        def extract_element_count(tensor)
          if tensor.respond_to?(:numel)
            tensor.numel
          elsif tensor.respond_to?(:shape)
            tensor.shape.reduce(1, :*)
          elsif tensor.respond_to?(:size)
            tensor.size
          else
            raise ArgumentError, "Cannot determine element count for #{tensor.class}"
          end
        end

        # Shape reported by the tensor, or nil when it has none.
        def extract_shape(tensor)
          tensor.shape if tensor.respond_to?(:shape)
        end
      end
    end
  end
end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ffi"
4
+
5
module Ignis
  module Collective
    # P2P-specific CUDA bindings for multi-GPU communication.
    # Attaches the CUDA runtime's peer-to-peer and IPC functions via FFI.
    module P2PBindings
      extend FFI::Library

      # P2P attribute constants (cudaDeviceP2PAttr)
      P2P_ATTR_PERFORMANCE_RANK = 1
      P2P_ATTR_ACCESS_SUPPORTED = 2
      P2P_ATTR_NATIVE_ATOMIC_SUPPORTED = 3
      P2P_ATTR_CUDA_ARRAY_ACCESS_SUPPORTED = 4

      # IPC memory flags
      IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 1

      # CUDA IPC memory handle size (64 bytes)
      IPC_HANDLE_SIZE = 64

      # Library name used on Windows when no cudart DLL can be located
      DEFAULT_WINDOWS_CUDART = 'cudart64_130'

      # cudaIpcMemHandle_t structure
      class CudaIpcMemHandle < FFI::Struct
        layout :reserved, [:uint8, IPC_HANDLE_SIZE]
      end

      class << self
        # Ensure P2P bindings are loaded. Resolves the CUDA runtime library
        # for the current platform, attaches the P2P/IPC functions and
        # remembers that this was done; later calls are no-ops.
        # @return [void]
        def ensure_loaded!
          return if @loaded

          CUDA::LibraryLoader.ensure_cuda_runtime!

          # Resolve cudart path per platform
          dll_path = if defined?(Ignis::Platform)
                       Ignis::Platform.cudart_path
                     elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
                       windows_cudart_path
                     else
                       'libcudart.so.13'
                     end

          ffi_lib dll_path if dll_path

          attach_p2p_functions!
          @loaded = true
        end

        private

        # Locate a cudart DLL in the configured CUDA bin directory.
        # Falls back to the default library name when no directory is
        # configured, the configuration lookup fails, or the directory
        # holds no matching DLL, so ffi_lib always receives a name.
        # @return [String] DLL path or library name
        def windows_cudart_path
          cuda_bin = begin
            Ignis.configuration.cuda_bin_path
          rescue StandardError
            nil
          end
          return DEFAULT_WINDOWS_CUDART unless cuda_bin

          # Dir.glob treats "\" as an escape character, so a native Windows
          # path has to be converted to forward slashes to match anything.
          pattern = File.join(cuda_bin.to_s.tr('\\', '/'), 'cudart64_*.dll')
          Dir.glob(pattern).max || DEFAULT_WINDOWS_CUDART
        end

        # Attach the peer-to-peer and IPC entry points of the CUDA runtime.
        # Every function returns an int status code.
        def attach_p2p_functions!
          # Peer-to-peer access
          attach_function :cudaDeviceCanAccessPeer,
                          [:pointer, :int, :int],
                          :int

          attach_function :cudaDeviceEnablePeerAccess,
                          [:int, :uint],
                          :int

          attach_function :cudaDeviceDisablePeerAccess,
                          [:int],
                          :int

          attach_function :cudaDeviceGetP2PAttribute,
                          [:pointer, :int, :int, :int],
                          :int

          # Peer-to-peer memory copy
          attach_function :cudaMemcpyPeer,
                          [:pointer, :int, :pointer, :int, :size_t],
                          :int

          attach_function :cudaMemcpyPeerAsync,
                          [:pointer, :int, :pointer, :int, :size_t, :pointer],
                          :int

          # IPC memory handle functions
          attach_function :cudaIpcGetMemHandle,
                          [CudaIpcMemHandle.ptr, :pointer],
                          :int

          attach_function :cudaIpcOpenMemHandle,
                          [:pointer, CudaIpcMemHandle.by_value, :uint],
                          :int

          attach_function :cudaIpcCloseMemHandle,
                          [:pointer],
                          :int

          # IPC event handle functions
          attach_function :cudaIpcGetEventHandle,
                          [:pointer, :pointer],
                          :int

          # NOTE(review): the CUDA runtime declares the handle argument of
          # cudaIpcOpenEventHandle as a struct passed by value (as with
          # cudaIpcOpenMemHandle above), while this binding passes a
          # pointer - confirm this is correct on every target ABI.
          attach_function :cudaIpcOpenEventHandle,
                          [:pointer, :pointer],
                          :int
        end
      end

      # Check CUDA status and raise error if not success.
      # The error name and description are looked up through
      # CUDA::RuntimeAPI, which is loaded on demand.
      # @param status [Integer] CUDA status code
      # @param context [String] Error context
      # @return [void]
      # @raise [CudaRuntimeError] If status indicates an error
      def self.check_status!(status, context = "P2P operation")
        return if status.zero?

        CUDA::RuntimeAPI.ensure_loaded!
        error_name = CUDA::RuntimeAPI.cudaGetErrorName(status)
        error_string = CUDA::RuntimeAPI.cudaGetErrorString(status)
        raise CudaRuntimeError.new("#{context}: #{error_name} - #{error_string}", cuda_code: status)
      end
    end
  end
end