ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "nd_bindings"
|
|
4
|
+
require_relative "nd_adapter"
|
|
5
|
+
require_relative "../transport/base"
|
|
6
|
+
|
|
7
|
+
module Ignis
|
|
8
|
+
module Collective
|
|
9
|
+
module NetworkDirect
|
|
10
|
+
# RDMA Transport for multi-node GPU communication
|
|
11
|
+
#
|
|
12
|
+
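# Uses Windows NetworkDirect for kernel-bypass transfers. GPU buffers are currently staged through pinned host memory (see register_gpu_memory) rather than sent zero-copy.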
|
|
13
|
+
# Integrates with CUDA for GPU memory registration.
|
|
14
|
+
#
|
|
15
|
+
# Workflow:
|
|
16
|
+
# 1. Discover RDMA adapters
|
|
17
|
+
# 2. Create queue pairs and completion queues
|
|
18
|
+
# 3. Register GPU memory for RDMA
|
|
19
|
+
# 4. Connect to remote peers
|
|
20
|
+
# 5. Perform RDMA Read/Write operations
|
|
21
|
+
class RDMATransport < Transport::Base
|
|
22
|
+
# @return [Symbol] Transport type identifier
|
|
23
|
+
def self.transport_type
  # Symbolic tag that identifies this transport implementation.
  :rdma
end
|
|
26
|
+
|
|
27
|
+
# @return [Float] Estimated bandwidth (GB/s)
|
|
28
|
+
def estimated_bandwidth
  # The documented unit is GB/s. The estimate is based on a 100 Gbps link
  # (Mellanox ConnectX-6), which moves at most 100 / 8 = 12.5 GB/s; returning
  # the raw Gbps figure overstated the link by a factor of eight.
  # NOTE(review): confirm that sibling transports also report GB/s.
  100.0 / 8
end
|
|
31
|
+
|
|
32
|
+
# @return [Float] Estimated latency (microseconds)
|
|
33
|
+
def estimated_latency
  # Fixed estimate, in microseconds.
  1.5
end
|
|
36
|
+
|
|
37
|
+
# Initialize the transport
|
|
38
|
+
# @param local_address [String] Local IP for RDMA bind
|
|
39
|
+
# @param local_port [Integer] Local port
|
|
40
|
+
def initialize(local_address: nil, local_port: nil, **opts)
  super(**opts)

  # Bind target; the port falls back to 0 when none is given.
  @local_address = local_address
  @local_port = local_port || 0

  # Native handles are created later by initialize!.
  @adapter = @send_cq = @recv_cq = @qp = @connector = nil

  # Registered staging buffers, keyed by gpu_ptr.address.
  @memory_regions = {}
end
|
|
51
|
+
|
|
52
|
+
# Initialize RDMA resources
|
|
53
|
+
# @return [void]
|
|
54
|
+
def initialize!
  # Brings up the NetworkDirect stack: adapter, two completion queues, one
  # queue pair and a connector (bound when a local address was supplied).
  # Idempotent once it has succeeded. If any step after NdStartup fails,
  # everything created so far is torn down via destroy! and the original
  # error is re-raised, so a failed attempt leaves no native handles behind.
  return if @initialized

  Bindings.ensure_loaded!

  unless Bindings.available?
    raise RDMAError, "NetworkDirect not available: #{Bindings.load_error}"
  end

  # Start NetworkDirect (version 2)
  status = Bindings.NdStartup(2)
  Bindings.check_status!(status, "NdStartup")

  begin
    # Open first available adapter
    @adapter = open_adapter

    # Separate completion queues for the send and receive sides
    @send_cq = @adapter.create_completion_queue(depth: 256)
    @recv_cq = @adapter.create_completion_queue(depth: 256)

    @qp = @adapter.create_queue_pair(
      send_cq: @send_cq,
      recv_cq: @recv_cq,
      send_depth: 64,
      recv_depth: 64,
      sge_count: 4
    )

    @connector = @adapter.create_connector

    # Bind only when the caller asked for a specific local address
    @connector.bind(address: @local_address, port: @local_port) if @local_address
  rescue StandardError => e
    # Roll back the partial setup (destroy! also pairs NdCleanup with the
    # NdStartup above). A failure during rollback must not mask the cause.
    begin
      destroy!
    rescue StandardError
      nil
    end
    raise e
  end

  @initialized = true
end
|
|
93
|
+
|
|
94
|
+
# Connect to remote peer (client mode)
|
|
95
|
+
# @param remote_address [String] Remote IP address
|
|
96
|
+
# @param remote_port [Integer] Remote port
|
|
97
|
+
# @param private_data [String, nil] Connection private data
|
|
98
|
+
# @return [void]
|
|
99
|
+
def connect(remote_address:, remote_port:, private_data: nil)
  # Client side: hand the queue pair and the peer endpoint to the connector.
  ensure_initialized!

  endpoint = { remote_address: remote_address, remote_port: remote_port }
  @connector.connect(qp: @qp, **endpoint, private_data: private_data)
end
|
|
109
|
+
|
|
110
|
+
# Accept incoming connection (server mode)
|
|
111
|
+
# @param private_data [String, nil] Response private data
|
|
112
|
+
# @return [void]
|
|
113
|
+
def accept(private_data: nil)
  # Server side: accept on the connector using this transport's queue pair.
  ensure_initialized!

  handshake = { qp: @qp, private_data: private_data }
  @connector.accept(**handshake)
end
|
|
118
|
+
|
|
119
|
+
# Register GPU memory for RDMA
|
|
120
|
+
# @param gpu_ptr [FFI::Pointer] GPU device pointer
|
|
121
|
+
# @param size [Integer] Size in bytes
|
|
122
|
+
# @return [Hash] Remote access info {:address, :token, :size}
|
|
123
|
+
def register_gpu_memory(gpu_ptr, size)
  # Registers a pinned host staging buffer of +size+ bytes on behalf of
  # +gpu_ptr+ and records it under gpu_ptr.address. Returns the memory
  # region's remote access info.
  ensure_initialized!

  key = gpu_ptr.address

  # Registering the same pointer again used to overwrite the table entry and
  # orphan the previous memory region and host buffer. Release them first.
  deregister_gpu_memory(gpu_ptr) if @memory_regions.key?(key)

  # GPU memory is staged through pinned host memory (the portable option).
  host_ptr = allocate_pinned_host(size)

  begin
    mr = @adapter.register_memory(host_ptr, size)
  rescue StandardError
    # Nothing references the buffer yet, so give it back before re-raising.
    free_pinned_host(host_ptr)
    raise
  end

  @memory_regions[key] = {
    memory_region: mr,
    host_buffer: host_ptr,
    size: size,
    gpu_ptr: gpu_ptr
  }

  mr.remote_access_info
end
|
|
144
|
+
|
|
145
|
+
# Deregister GPU memory
|
|
146
|
+
# @param gpu_ptr [FFI::Pointer] GPU device pointer
|
|
147
|
+
# @return [void]
|
|
148
|
+
def deregister_gpu_memory(gpu_ptr)
  # Drops the table entry for gpu_ptr.address, deregisters its memory region
  # and then frees the host staging buffer. Unknown pointers are ignored.
  # The buffer is freed only after deregister! returns normally; if that call
  # raises, the buffer is intentionally left allocated.
  if (entry = @memory_regions.delete(gpu_ptr.address))
    entry[:memory_region].deregister!
    free_pinned_host(entry[:host_buffer])
  end
end
|
|
155
|
+
|
|
156
|
+
# RDMA Send (two-sided)
|
|
157
|
+
# @param buffer [FFI::Pointer] Source buffer (GPU)
|
|
158
|
+
# @param size [Integer] Size in bytes
|
|
159
|
+
# @return [void]
|
|
160
|
+
def send(buffer, size)
  # Two-sided send: copy +size+ bytes from the registered GPU buffer into its
  # pinned staging buffer, post the send, then wait on the send CQ.
  # Raises RDMAError if the buffer is unregistered or +size+ exceeds the
  # registered staging size.
  ensure_initialized!

  info = @memory_regions[buffer.address]
  raise RDMAError, "Buffer not registered" unless info

  # The staging buffer holds only info[:size] bytes; a larger request would
  # write past the end of the pinned allocation.
  if size > info[:size]
    raise RDMAError, "Send size #{size} exceeds registered size #{info[:size]}"
  end

  # Copy GPU -> Host (staging). A failed copy must not be sent as if valid.
  # NOTE(review): assumes cudaMemcpy returns the CUDA status code, as
  # cudaHostAlloc does in allocate_pinned_host; confirm against RuntimeAPI.
  status = CUDA::RuntimeAPI.cudaMemcpy(
    info[:host_buffer],
    buffer,
    size,
    CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST
  )
  CUDA::RuntimeAPI.check_status!(status, "cudaMemcpy device-to-host for RDMA send")

  @qp.post_send(
    sge_list: [{
      buffer: info[:host_buffer],
      length: size,
      token: info[:memory_region].token
    }]
  )

  # Wait for completion
  poll_completion(@send_cq)
end
|
|
186
|
+
|
|
187
|
+
# RDMA Receive (two-sided)
|
|
188
|
+
# @param buffer [FFI::Pointer] Destination buffer (GPU)
|
|
189
|
+
# @param size [Integer] Size in bytes
|
|
190
|
+
# @return [void]
|
|
191
|
+
def receive(buffer, size)
  # Two-sided receive: post a receive into the pinned staging buffer, wait on
  # the receive CQ, then copy +size+ bytes from staging into the GPU buffer.
  # Raises RDMAError if the buffer is unregistered or +size+ exceeds the
  # registered staging size.
  ensure_initialized!

  info = @memory_regions[buffer.address]
  raise RDMAError, "Buffer not registered" unless info

  # Never let the peer write more than the staging buffer can hold.
  if size > info[:size]
    raise RDMAError, "Receive size #{size} exceeds registered size #{info[:size]}"
  end

  @qp.post_receive(
    sge_list: [{
      buffer: info[:host_buffer],
      length: size,
      token: info[:memory_region].token
    }]
  )

  # Wait for completion
  poll_completion(@recv_cq)

  # Copy Host -> GPU, and surface a failed copy instead of ignoring it.
  # NOTE(review): assumes cudaMemcpy returns the CUDA status code, as
  # cudaHostAlloc does in allocate_pinned_host; confirm against RuntimeAPI.
  status = CUDA::RuntimeAPI.cudaMemcpy(
    buffer,
    info[:host_buffer],
    size,
    CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE
  )
  CUDA::RuntimeAPI.check_status!(status, "cudaMemcpy host-to-device after RDMA receive")

  # Keep the historical return value (the copy status) for existing callers.
  status
end
|
|
217
|
+
|
|
218
|
+
# RDMA Write (one-sided; local data is staged through the registered pinned host buffer)
|
|
219
|
+
# @param local_buffer [FFI::Pointer] Local GPU buffer
|
|
220
|
+
# @param remote_address [Integer] Remote buffer address
|
|
221
|
+
# @param remote_token [Integer] Remote memory token
|
|
222
|
+
# @param size [Integer] Size in bytes
|
|
223
|
+
# @return [void]
|
|
224
|
+
def rdma_write(local_buffer:, remote_address:, remote_token:, size:)
  # One-sided write: copy +size+ bytes from the registered GPU buffer into
  # its pinned staging buffer, issue the RDMA write to the remote region and
  # wait on the send CQ. Raises RDMAError if the buffer is unregistered or
  # +size+ exceeds the registered staging size.
  ensure_initialized!

  info = @memory_regions[local_buffer.address]
  raise RDMAError, "Buffer not registered" unless info

  # The staging buffer holds only info[:size] bytes.
  if size > info[:size]
    raise RDMAError, "Write size #{size} exceeds registered size #{info[:size]}"
  end

  # Stage to host. A failed copy must not be written to the peer.
  # NOTE(review): assumes cudaMemcpy returns the CUDA status code, as
  # cudaHostAlloc does in allocate_pinned_host; confirm against RuntimeAPI.
  status = CUDA::RuntimeAPI.cudaMemcpy(
    info[:host_buffer],
    local_buffer,
    size,
    CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST
  )
  CUDA::RuntimeAPI.check_status!(status, "cudaMemcpy device-to-host for RDMA write")

  @qp.rdma_write(
    remote_address: remote_address,
    remote_token: remote_token,
    sge_list: [{
      buffer: info[:host_buffer],
      length: size,
      token: info[:memory_region].token
    }]
  )

  poll_completion(@send_cq)
end
|
|
251
|
+
|
|
252
|
+
# RDMA Read (one-sided; data lands in the registered pinned host buffer and is then copied to the GPU)
|
|
253
|
+
# @param local_buffer [FFI::Pointer] Local GPU buffer
|
|
254
|
+
# @param remote_address [Integer] Remote buffer address
|
|
255
|
+
# @param remote_token [Integer] Remote memory token
|
|
256
|
+
# @param size [Integer] Size in bytes
|
|
257
|
+
# @return [void]
|
|
258
|
+
def rdma_read(local_buffer:, remote_address:, remote_token:, size:)
  # One-sided read: issue the RDMA read into the pinned staging buffer, wait
  # on the send CQ, then copy +size+ bytes from staging into the GPU buffer.
  # Raises RDMAError if the buffer is unregistered or +size+ exceeds the
  # registered staging size.
  ensure_initialized!

  info = @memory_regions[local_buffer.address]
  raise RDMAError, "Buffer not registered" unless info

  # Never pull more than the staging buffer can hold.
  if size > info[:size]
    raise RDMAError, "Read size #{size} exceeds registered size #{info[:size]}"
  end

  @qp.rdma_read(
    remote_address: remote_address,
    remote_token: remote_token,
    sge_list: [{
      buffer: info[:host_buffer],
      length: size,
      token: info[:memory_region].token
    }]
  )

  poll_completion(@send_cq)

  # Stage to GPU, and surface a failed copy instead of ignoring it.
  # NOTE(review): assumes cudaMemcpy returns the CUDA status code, as
  # cudaHostAlloc does in allocate_pinned_host; confirm against RuntimeAPI.
  status = CUDA::RuntimeAPI.cudaMemcpy(
    local_buffer,
    info[:host_buffer],
    size,
    CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE
  )
  CUDA::RuntimeAPI.check_status!(status, "cudaMemcpy host-to-device after RDMA read")

  # Keep the historical return value (the copy status) for existing callers.
  status
end
|
|
285
|
+
|
|
286
|
+
# Check if RDMA is available
|
|
287
|
+
# @return [Boolean] True if RDMA hardware present
|
|
288
|
+
def self.available?
  # initialize! always calls Bindings.ensure_loaded! before consulting
  # Bindings.available?. Do the same here so the probe does not depend on
  # some other code path having loaded the bindings first.
  # NOTE(review): assumes ensure_loaded! is safe to call repeatedly, as
  # initialize! already relies on.
  Bindings.ensure_loaded!
  Bindings.available?
rescue StandardError, LoadError
  # An availability probe answers "no" rather than raising.
  false
end
|
|
291
|
+
|
|
292
|
+
# Cleanup
|
|
293
|
+
# @return [void]
|
|
294
|
+
def destroy!
  # Releases every registered staging buffer, closes the connector, queue
  # pair, completion queues and adapter, runs NdCleanup and clears the
  # initialized flag. Teardown always runs to the end; the first error raised
  # by a close! is re-raised afterwards. Safe to call more than once.
  first_error = nil

  # Work from a snapshot because deregister_gpu_memory deletes from
  # @memory_regions. The stored :gpu_ptr already carries the address used as
  # the key, so no pointer needs to be rebuilt.
  @memory_regions.values.each do |info|
    begin
      deregister_gpu_memory(info[:gpu_ptr])
    rescue StandardError
      nil # best effort, as before
    end
  end

  # Same close order as before, but one failing close! no longer prevents
  # the remaining handles from being released.
  [@connector, @qp, @send_cq, @recv_cq, @adapter].each do |resource|
    begin
      resource&.close!
    rescue StandardError => e
      first_error ||= e
    end
  end

  # Drop the references so a repeated destroy! cannot close a handle twice.
  @connector = @qp = @send_cq = @recv_cq = @adapter = nil

  begin
    Bindings.NdCleanup
  rescue StandardError
    nil # best effort, as before
  end

  @initialized = false

  raise first_error if first_error
end
|
|
311
|
+
|
|
312
|
+
private
|
|
313
|
+
|
|
314
|
+
def open_adapter
  # Queries the NetworkDirect address list (size first, then contents) and
  # opens an adapter from it. Raises RDMAError when no address is reported.

  # Size slot for the address list.
  # NOTE(review): the NetworkDirect headers declare this in/out parameter as
  # SIZE_T*, so 8 zeroed bytes are reserved. That is safe whether the native
  # side writes 32 or 64 bits; confirm against nd_bindings.
  address_list_size = FFI::MemoryPointer.new(:uint64)
  address_list_size.write_uint64(0)

  # Size query. The status is deliberately not checked because a size probe
  # with a NULL buffer is expected to report that the buffer is too small.
  Bindings.NdQueryAddressList(0, FFI::Pointer::NULL, address_list_size)

  size = address_list_size.read_uint32
  if size.zero?
    # Previously a zero-length buffer was passed on to NdOpenAdapter.
    raise RDMAError, "No RDMA adapters found (NetworkDirect address list is empty)"
  end

  # Allocate and query
  address_list = FFI::MemoryPointer.new(:uint8, size)
  status = Bindings.NdQueryAddressList(0, address_list, address_list_size)
  Bindings.check_status!(status, "NdQueryAddressList")

  # Open first adapter
  adapter_ptr = FFI::MemoryPointer.new(:pointer)
  status = Bindings.NdOpenAdapter(address_list, size, adapter_ptr)
  Bindings.check_status!(status, "NdOpenAdapter")

  Adapter.new(adapter_ptr.read_pointer, "RDMA0")
end
|
|
335
|
+
|
|
336
|
+
def allocate_pinned_host(size)
  # Allocates +size+ bytes of pinned host memory and returns the pointer.
  # Flag value 1 is cudaHostAllocPortable in the CUDA runtime headers
  # (cudaHostAllocDefault is 0).
  portable_flag = 1

  out = FFI::MemoryPointer.new(:pointer)
  rc = CUDA::RuntimeAPI.cudaHostAlloc(out, size, portable_flag)
  CUDA::RuntimeAPI.check_status!(rc, "cudaHostAlloc for RDMA")
  out.read_pointer
end
|
|
342
|
+
|
|
343
|
+
def free_pinned_host(ptr)
  # Best-effort release of a pinned host buffer: any StandardError raised by
  # the call is swallowed and nil is returned instead.
  CUDA::RuntimeAPI.cudaFreeHost(ptr)
rescue StandardError
  nil
end
|
|
346
|
+
|
|
347
|
+
def poll_completion(cq, timeout_ms: 5000)
  # Polls +cq+ one result at a time until a completion arrives and returns
  # it. Raises RDMAError once more than +timeout_ms+ milliseconds have
  # elapsed on the monotonic clock.
  now = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }
  began = now.call

  loop do
    completions = cq.poll(max_results: 1)
    return completions.first if completions.any?

    raise RDMAError, "Completion poll timeout" if (now.call - began) * 1000 > timeout_ms

    # Short pause between polls so the loop does not spin at full speed.
    sleep(0.0001)
  end
end
|
|
363
|
+
end
|
|
364
|
+
end
|
|
365
|
+
end
|
|
366
|
+
end
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module Collective
|
|
5
|
+
# Adapter for seamless NvArray integration with collective operations
|
|
6
|
+
# Handles dtype detection, shape validation, and buffer extraction
|
|
7
|
+
#
|
|
8
|
+
# @example Direct tensor usage
|
|
9
|
+
# tensors = gpus.map { |gpu| NvArray.zeros([1000, 128], device: gpu) }
|
|
10
|
+
# comm.all_reduce(tensors) # NvArrayAdapter handles conversion
|
|
11
|
+
#
|
|
12
|
+
module NvArrayAdapter
|
|
13
|
+
# Supported dtypes and their byte sizes
|
|
14
|
+
DTYPE_SIZES = {
  # floating point
  float32: 4, float64: 8, float16: 2, bfloat16: 2,
  # signed integers
  int32: 4, int64: 8, int16: 2, int8: 1,
  # unsigned integers
  uint8: 1, uint32: 4, uint64: 8
}.freeze
|
|
27
|
+
|
|
28
|
+
# CUDA dtype codes for kernel dispatch
|
|
29
|
+
DTYPE_CUDA_CODES = %i[
  float32 float64 float16 bfloat16
  int32 int64 int16 int8
  uint8 uint32 uint64
].each_with_index.to_h.freeze # a dtype's code is its position in this list (0..10)
|
|
42
|
+
|
|
43
|
+
class << self
|
|
44
|
+
# Normalize input to array of NvArrays with validated properties
|
|
45
|
+
#
|
|
46
|
+
# @param inputs [Array<NvArray>, Array<FFI::Pointer>, NvArray] Input tensors
|
|
47
|
+
# @param expected_count [Integer, nil] Expected number of tensors (optional)
|
|
48
|
+
# @return [Array<NvArray>] Normalized tensor array
|
|
49
|
+
# @raise [ArgumentError] If inputs are invalid
|
|
50
|
+
def normalize(inputs, expected_count: nil)
  # Wraps the input into an array, optionally enforces the tensor count, and
  # validates the tensors before returning them.
  tensors = wrap_array(inputs)

  count_mismatch = expected_count && tensors.size != expected_count
  raise ArgumentError, "Expected #{expected_count} tensors, got #{tensors.size}" if count_mismatch

  validate_tensors!(tensors)
  tensors
end
|
|
61
|
+
|
|
62
|
+
# Extract device pointers from tensors for raw operations
|
|
63
|
+
#
|
|
64
|
+
# @param tensors [Array<NvArray>] Input tensors
|
|
65
|
+
# @return [Array<FFI::Pointer>] Device pointers
|
|
66
|
+
def extract_pointers(tensors)
  # Maps each tensor to its device pointer. Tries data_ptr, device_ptr and
  # pointer in that order, and passes raw Fiddle/FFI pointers through.
  # Raises ArgumentError for anything else.
  tensors.map do |t|
    next t.data_ptr if t.respond_to?(:data_ptr)
    next t.device_ptr if t.respond_to?(:device_ptr)
    next t.pointer if t.respond_to?(:pointer)

    # Both pointer libraries are optional. Referencing Fiddle::Pointer without
    # a defined? guard raised NameError whenever fiddle had not been required,
    # which masked the ArgumentError below and broke raw FFI pointer input.
    raw_pointer = (defined?(Fiddle::Pointer) && t.is_a?(Fiddle::Pointer)) ||
                  (defined?(FFI::Pointer) && t.is_a?(FFI::Pointer))
    raise ArgumentError, "Cannot extract pointer from #{t.class}" unless raw_pointer

    t
  end
end
|
|
83
|
+
|
|
84
|
+
# Get common dtype from tensor array
|
|
85
|
+
#
|
|
86
|
+
# @param tensors [Array<NvArray>] Input tensors
|
|
87
|
+
# @return [Symbol] Common dtype
|
|
88
|
+
# @raise [ArgumentError] If dtypes don't match
|
|
89
|
+
def common_dtype(tensors)
  # Returns the dtype shared by all tensors (:float32 for an empty list);
  # raises ArgumentError when more than one distinct dtype is present.
  return :float32 if tensors.empty?

  distinct = tensors.map { |tensor| extract_dtype(tensor) }.uniq
  return distinct.first unless distinct.size > 1

  raise ArgumentError,
        "All tensors must have same dtype, got: #{distinct.join(', ')}"
end
|
|
100
|
+
|
|
101
|
+
# Get element count from tensors (must match)
|
|
102
|
+
#
|
|
103
|
+
# @param tensors [Array<NvArray>] Input tensors
|
|
104
|
+
# @return [Integer] Element count
|
|
105
|
+
# @raise [ArgumentError] If element counts don't match
|
|
106
|
+
def common_element_count(tensors)
  # Returns the element count shared by all tensors (0 for an empty list);
  # raises ArgumentError when the counts differ.
  return 0 if tensors.empty?

  distinct = tensors.map { |tensor| extract_element_count(tensor) }.uniq
  return distinct.first unless distinct.size > 1

  raise ArgumentError,
        "All tensors must have same element count, got: #{distinct.join(', ')}"
end
|
|
117
|
+
|
|
118
|
+
# Get byte size for a dtype
|
|
119
|
+
#
|
|
120
|
+
# @param dtype [Symbol] Data type
|
|
121
|
+
# @return [Integer] Bytes per element
|
|
122
|
+
def dtype_size(dtype)
  # Bytes per element for +dtype+; unknown dtypes raise ArgumentError.
  DTYPE_SIZES.fetch(dtype) { raise ArgumentError, "Unknown dtype: #{dtype}" }
end
|
|
125
|
+
|
|
126
|
+
# Get CUDA type code for kernel dispatch
|
|
127
|
+
#
|
|
128
|
+
# @param dtype [Symbol] Data type
|
|
129
|
+
# @return [Integer] CUDA type code
|
|
130
|
+
def dtype_cuda_code(dtype)
  # Kernel dispatch code for +dtype+. Unknown dtypes fall back to 0, which
  # is the code assigned to float32.
  DTYPE_CUDA_CODES.fetch(dtype, 0)
end
|
|
133
|
+
|
|
134
|
+
# Calculate total byte size for tensors
|
|
135
|
+
#
|
|
136
|
+
# @param tensors [Array<NvArray>] Input tensors
|
|
137
|
+
# @return [Integer] Total bytes
|
|
138
|
+
def total_byte_size(tensors)
  # Sum over all tensors of element count times bytes per element.
  per_tensor_bytes = tensors.map do |tensor|
    extract_element_count(tensor) * dtype_size(extract_dtype(tensor))
  end
  per_tensor_bytes.sum
end
|
|
143
|
+
|
|
144
|
+
# Validate shape compatibility for broadcast operations
|
|
145
|
+
#
|
|
146
|
+
# @param src_tensor [NvArray] Source tensor
|
|
147
|
+
# @param dst_tensors [Array<NvArray>] Destination tensors
|
|
148
|
+
# @return [Boolean] True if compatible
|
|
149
|
+
def broadcast_compatible?(src_tensor, dst_tensors)
  # True when every destination has exactly the source's shape. Tensors that
  # expose no shape compare as nil.
  reference = extract_shape(src_tensor)
  dst_tensors.none? { |candidate| extract_shape(candidate) != reference }
end
|
|
153
|
+
|
|
154
|
+
# Create buffer info for collective ops
|
|
155
|
+
#
|
|
156
|
+
# @param tensors [Array<NvArray>] Input tensors
|
|
157
|
+
# @return [Hash] Buffer information
|
|
158
|
+
def buffer_info(tensors)
  # Builds the descriptor used by collective ops: device pointers, the shared
  # dtype and its dispatch code, the shared element count, the per-tensor
  # byte size and the number of tensors.
  #
  # The dtype and element count are each resolved once. Previously every hash
  # entry recomputed them, scanning the tensors five times instead of twice.
  # Evaluation order is kept so the same error surfaces first on invalid
  # input.
  pointers = extract_pointers(tensors)
  dtype = common_dtype(tensors)
  dtype_code = dtype_cuda_code(dtype)
  element_count = common_element_count(tensors)

  {
    pointers: pointers,
    dtype: dtype,
    dtype_code: dtype_code,
    element_count: element_count,
    byte_size: element_count * dtype_size(dtype),
    tensor_count: tensors.size
  }
end
|
|
168
|
+
|
|
169
|
+
private
|
|
170
|
+
|
|
171
|
+
def wrap_array(inputs)
  # Arrays pass through unchanged, a single NvArray is wrapped in a
  # one-element array, and anything else is rejected.
  return inputs if inputs.is_a?(Array)
  return [inputs] if inputs.is_a?(Ignis::NvArray)

  raise ArgumentError, "Expected Array or NvArray, got #{inputs.class}"
end
|
|
181
|
+
|
|
182
|
+
def validate_tensors!(tensors)
  # Checks that every tensor exposes a pointer (via data_ptr, device_ptr or
  # pointer, or by being a raw Fiddle/FFI pointer) and that all tensors agree
  # on dtype and element count. Raises ArgumentError otherwise.
  return if tensors.empty?

  tensors.each_with_index do |t, i|
    has_accessor = t.respond_to?(:data_ptr) || t.respond_to?(:device_ptr) || t.respond_to?(:pointer)

    # Both pointer libraries are optional. Referencing Fiddle::Pointer without
    # a defined? guard raised NameError when fiddle had not been required,
    # which hid the intended ArgumentError.
    raw_pointer = (defined?(Fiddle::Pointer) && t.is_a?(Fiddle::Pointer)) ||
                  (defined?(FFI::Pointer) && t.is_a?(FFI::Pointer))

    next if has_accessor || raw_pointer

    raise ArgumentError, "Tensor #{i} has no data_ptr/device_ptr method"
  end

  # Check dtype consistency
  common_dtype(tensors)

  # Check element count consistency
  common_element_count(tensors)
end
|
|
199
|
+
|
|
200
|
+
def extract_dtype(tensor)
  # Uses the tensor's own dtype when it has one; objects without a dtype
  # (such as raw pointers) are assumed to hold float32.
  tensor.respond_to?(:dtype) ? tensor.dtype : :float32
end
|
|
207
|
+
|
|
208
|
+
def extract_element_count(tensor)
  # Preference order: numel, then the product of shape, then size.
  return tensor.numel if tensor.respond_to?(:numel)
  return tensor.shape.inject(1) { |product, dim| product * dim } if tensor.respond_to?(:shape)
  return tensor.size if tensor.respond_to?(:size)

  raise ArgumentError, "Cannot determine element count for #{tensor.class}"
end
|
|
219
|
+
|
|
220
|
+
def extract_shape(tensor)
  # The tensor's shape, or nil when the object exposes none.
  tensor.shape if tensor.respond_to?(:shape)
end
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
end
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ffi"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module Collective
|
|
7
|
+
# P2P-specific CUDA bindings for multi-GPU communication
|
|
8
|
+
# Extends RuntimeAPI with peer-to-peer and IPC functions
|
|
9
|
+
module P2PBindings
|
|
10
|
+
extend FFI::Library
|
|
11
|
+
|
|
12
|
+
# P2P attribute constants (cudaDeviceP2PAttr)
|
|
13
|
+
P2P_ATTR_PERFORMANCE_RANK            = 1
P2P_ATTR_ACCESS_SUPPORTED            = 2
P2P_ATTR_NATIVE_ATOMIC_SUPPORTED     = 3
P2P_ATTR_CUDA_ARRAY_ACCESS_SUPPORTED = 4

# Flag passed when opening an IPC memory handle
IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 1

# Byte length of the opaque CUDA IPC handle payload
IPC_HANDLE_SIZE = 64
|
|
23
|
+
|
|
24
|
+
# cudaIpcMemHandle_t structure
|
|
25
|
+
class CudaIpcMemHandle < FFI::Struct
  # Opaque handle: a single byte array of IPC_HANDLE_SIZE bytes.
  layout(:reserved, [:uint8, IPC_HANDLE_SIZE])
end
|
|
28
|
+
|
|
29
|
+
class << self
|
|
30
|
+
# Ensure P2P bindings are loaded
|
|
31
|
+
# @return [void]
|
|
32
|
+
def ensure_loaded!
  # Loads the CUDA runtime library and attaches the P2P/IPC functions once;
  # later calls return immediately.
  return if @loaded

  CUDA::LibraryLoader.ensure_cuda_runtime!

  # Resolve cudart path per platform
  dll_path = if defined?(Ignis::Platform)
    Ignis::Platform.cudart_path
  elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
    # The configuration object is optional here; treat any failure to read
    # it as "no configured CUDA bin directory".
    cuda_bin = begin
      Ignis.configuration.cuda_bin_path
    rescue StandardError
      nil
    end
    newest_cudart_dll(cuda_bin)
  else
    'libcudart.so.13'
  end

  ffi_lib dll_path if dll_path

  attach_p2p_functions!
  @loaded = true
end

# Picks the cudart64_<version>.dll with the highest numeric version inside
# +cuda_bin+, falling back to the bare library name 'cudart64_130' when the
# directory is unknown or contains no match.
#
# Fixes three problems of the previous inline lookup:
# * plain String comparison ranks "cudart64_90.dll" above "cudart64_130.dll";
# * an empty directory produced nil, which silently skipped ffi_lib;
# * Dir.glob treats "\" as an escape character, so a Windows-style
#   directory never matched anything.
private def newest_cudart_dll(cuda_bin)
  fallback = 'cudart64_130'
  return fallback unless cuda_bin

  pattern = File.join(cuda_bin.to_s.tr('\\', '/'), 'cudart64_*.dll')
  newest = Dir.glob(pattern).max_by do |path|
    File.basename(path)[/\Acudart64_(\d+)/, 1].to_i
  end
  newest || fallback
end
|
|
52
|
+
|
|
53
|
+
private
|
|
54
|
+
|
|
55
|
+
def attach_p2p_functions!
  # Attaches every P2P / IPC entry point. Each one returns an :int status,
  # so only the parameter lists differ.
  status = :int

  signatures = {
    # Peer-to-peer access
    cudaDeviceCanAccessPeer: [:pointer, :int, :int],
    cudaDeviceEnablePeerAccess: [:int, :uint],
    cudaDeviceDisablePeerAccess: [:int],
    cudaDeviceGetP2PAttribute: [:pointer, :int, :int, :int],

    # Peer-to-peer memory copy
    cudaMemcpyPeer: [:pointer, :int, :pointer, :int, :size_t],
    cudaMemcpyPeerAsync: [:pointer, :int, :pointer, :int, :size_t, :pointer],

    # IPC memory handle functions
    cudaIpcGetMemHandle: [CudaIpcMemHandle.ptr, :pointer],
    cudaIpcOpenMemHandle: [:pointer, CudaIpcMemHandle.by_value, :uint],
    cudaIpcCloseMemHandle: [:pointer],

    # IPC event handle functions
    # NOTE(review): the CUDA runtime declares cudaIpcOpenEventHandle with the
    # handle passed by value, like cudaIpcOpenMemHandle above, but it is bound
    # here as a pointer. Confirm this is intended on every supported platform.
    cudaIpcGetEventHandle: [:pointer, :pointer],
    cudaIpcOpenEventHandle: [:pointer, :pointer]
  }

  signatures.each { |name, params| attach_function name, params, status }
end
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
# Check CUDA status and raise error if not success
|
|
107
|
+
# @param status [Integer] CUDA status code
|
|
108
|
+
# @param context [String] Error context
|
|
109
|
+
# @return [void]
|
|
110
|
+
# @raise [CudaRuntimeError] If status indicates an error
|
|
111
|
+
def self.check_status!(status, context = "P2P operation")
  # A zero status means success. Anything else is turned into a
  # CudaRuntimeError carrying the runtime's error name and description.
  return if status.zero?

  runtime = CUDA::RuntimeAPI
  runtime.ensure_loaded!
  name = runtime.cudaGetErrorName(status)
  detail = runtime.cudaGetErrorString(status)

  raise CudaRuntimeError.new("#{context}: #{name} - #{detail}", cuda_code: status)
end
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|