ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
|
@@ -0,0 +1,633 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "device_manager"
|
|
4
|
+
require_relative "transport_selector"
|
|
5
|
+
require_relative "topology"
|
|
6
|
+
require_relative "algorithms/reduction_ops"
|
|
7
|
+
require_relative "algorithms/ring"
|
|
8
|
+
require_relative "algorithms/tree"
|
|
9
|
+
|
|
10
|
+
module Ignis
|
|
11
|
+
module Collective
|
|
12
|
+
# Primary user-facing abstraction for collective operations
|
|
13
|
+
# Provides AllReduce, Broadcast, Reduce, and other collective primitives
|
|
14
|
+
class Communicator
|
|
15
|
+
# Reduction operations
|
|
16
|
+
REDUCTION_OPS = [:sum, :prod, :min, :max, :avg].freeze
|
|
17
|
+
|
|
18
|
+
# @return [Array<Integer>] GPU device IDs in this communicator
|
|
19
|
+
attr_reader :gpu_ids
|
|
20
|
+
|
|
21
|
+
# @return [DeviceManager] Device manager
|
|
22
|
+
attr_reader :device_manager
|
|
23
|
+
|
|
24
|
+
# @return [TransportSelector] Transport selector
|
|
25
|
+
attr_reader :transport_selector
|
|
26
|
+
|
|
27
|
+
# @return [Integer] Rank of this communicator (for multi-process)
|
|
28
|
+
attr_reader :rank
|
|
29
|
+
|
|
30
|
+
# @return [Integer] Total number of ranks
|
|
31
|
+
attr_reader :world_size
|
|
32
|
+
|
|
33
|
+
# Create a new communicator for the specified GPUs
|
|
34
|
+
# @param gpu_ids [Array<Integer>] GPU device IDs to include
|
|
35
|
+
# @param rank [Integer] Rank of this process (default 0 for single-process)
|
|
36
|
+
# @param world_size [Integer] Total ranks (default 1 for single-process)
|
|
37
|
+
def initialize(gpu_ids:, rank: 0, world_size: 1)
  # Keep a frozen private copy so later mutation of the caller's array
  # cannot change which GPUs this communicator refers to.
  @gpu_ids = gpu_ids.dup.freeze
  @world_size = world_size
  @rank = rank

  # Reject empty / out-of-range IDs before any helper object is built.
  validate_gpu_ids!

  @transport_selector_args = nil
  @device_manager = DeviceManager.new(device_ids: @gpu_ids)
  @transport_selector = TransportSelector.new(@gpu_ids)

  # Both are filled in by #initialize!; until then every collective call
  # fails in ensure_initialized!.
  @initialized = false
  @ring_order = nil
end
|
|
49
|
+
|
|
50
|
+
# Initialize the communicator (detect topology, enable P2P, etc.)
|
|
51
|
+
# @return [self]
|
|
52
|
+
def initialize!
  # Idempotent: once initialized, further calls only return the communicator.
  unless @initialized
    @device_manager.initialize!
    @device_manager.enable_all_p2p_access!
    @transport_selector.initialize!

    # The ring order is read from the selector only after it is initialized.
    @ring_order = @transport_selector.optimal_ring_order
    @initialized = true
  end

  self
end
|
|
63
|
+
|
|
64
|
+
# Perform AllReduce operation - reduce and distribute result to all GPUs
|
|
65
|
+
# @param tensors [Array<NvArray>] One tensor per GPU
|
|
66
|
+
# @param op [Symbol] Reduction operation (:sum, :prod, :min, :max, :avg)
|
|
67
|
+
# @param stream [CUDA::Stream, nil] Optional CUDA stream
|
|
68
|
+
# @return [Array<NvArray>] Reduced tensors (same references as input)
|
|
69
|
+
def all_reduce(tensors, op: :sum, stream: nil)
  validate_operation!(op)
  validate_tensors!(tensors)
  ensure_initialized!

  # With one GPU there is nothing to combine, so the input is handed back
  # untouched; otherwise the ring implementation does the work.
  @gpu_ids.size == 1 ? tensors : ring_all_reduce(tensors, op, stream)
end
|
|
80
|
+
|
|
81
|
+
# Async AllReduce - requires explicit synchronization
|
|
82
|
+
# @param tensors [Array<NvArray>] One tensor per GPU
|
|
83
|
+
# @param op [Symbol] Reduction operation
|
|
84
|
+
# @param stream [CUDA::Stream] CUDA stream for async execution
|
|
85
|
+
# @return [Array<NvArray>] Tensors (result available after sync)
|
|
86
|
+
def all_reduce_async(tensors, op: :sum, stream:)
  # `stream:` is a required keyword, so the error below only fires when the
  # caller passes an explicit nil/false.
  return all_reduce(tensors, op: op, stream: stream) if stream

  raise ArgumentError, "Stream required for async operation"
end
|
|
91
|
+
|
|
92
|
+
# Broadcast tensor from root GPU to all GPUs
|
|
93
|
+
# @param tensor [NvArray] Source tensor on root GPU
|
|
94
|
+
# @param root [Integer] Root GPU index (default 0)
|
|
95
|
+
# @param stream [CUDA::Stream, nil] Optional CUDA stream
|
|
96
|
+
# @return [Array<NvArray>] Tensors on all GPUs with broadcasted data
|
|
97
|
+
def broadcast(tensor, root: 0, stream: nil)
  ensure_initialized!
  validate_gpu_index!(root)

  if @gpu_ids.size == 1
    # Only one GPU: the source tensor is already the full result set.
    [tensor]
  else
    # Multi-GPU path; the return value is whatever simple_broadcast yields
    # (one entry per GPU).
    simple_broadcast(tensor, root, stream)
  end
end
|
|
107
|
+
|
|
108
|
+
# Reduce tensors to root GPU
|
|
109
|
+
# @param tensors [Array<NvArray>] One tensor per GPU
|
|
110
|
+
# @param root [Integer] Root GPU index
|
|
111
|
+
# @param op [Symbol] Reduction operation
|
|
112
|
+
# @param stream [CUDA::Stream, nil] Optional CUDA stream
|
|
113
|
+
# @return [NvArray] Reduced tensor on root GPU
|
|
114
|
+
def reduce(tensors, root: 0, op: :sum, stream: nil)
  validate_operation!(op)
  validate_tensors!(tensors)
  ensure_initialized!
  validate_gpu_index!(root)

  if @gpu_ids.size == 1
    # A single participant: its tensor already is the reduction result.
    tensors[0]
  else
    simple_reduce(tensors, root, op, stream)
  end
end
|
|
125
|
+
|
|
126
|
+
# AllGather - gather tensors from all GPUs to all GPUs
|
|
127
|
+
# @param tensors [Array<NvArray>] One tensor per GPU (each may be different size)
|
|
128
|
+
# @param stream [CUDA::Stream, nil] Optional CUDA stream
|
|
129
|
+
# @return [Array<Array<NvArray>>] Gathered tensors on each GPU
|
|
130
|
+
def all_gather(tensors, stream: nil)
  validate_tensors!(tensors)
  ensure_initialized!

  # One GPU: its gathered view is just the input list, wrapped so the return
  # shape is still "one list per GPU".
  @gpu_ids.size == 1 ? [tensors] : simple_all_gather(tensors, stream)
end
|
|
139
|
+
|
|
140
|
+
# ReduceScatter - reduce and scatter result
|
|
141
|
+
# @param tensors [Array<NvArray>] One tensor per GPU
|
|
142
|
+
# @param op [Symbol] Reduction operation
|
|
143
|
+
# @param stream [CUDA::Stream, nil] Optional CUDA stream
|
|
144
|
+
# @return [Array<FFI::Pointer>] Scattered reduced chunks (chunk size = total_size / N)
|
|
145
|
+
def reduce_scatter(tensors, op: :sum, stream: nil)
  validate_operation!(op)
  validate_tensors!(tensors)
  ensure_initialized!

  gpu_count = @gpu_ids.size
  # Single GPU: the input tensors are returned as-is (no chunk buffers).
  return tensors if gpu_count == 1

  ring = Algorithms::Ring.new(ring_order: @ring_order, transport_selector: @transport_selector)

  buffers = tensors.map { |tensor| device_buffer(tensor) }
  sizes = tensors.map { |tensor| byte_size_of(tensor) }

  # dtype is taken from the first tensor only; raw pointers default to :float32.
  first = tensors[0]
  dtype = first.respond_to?(:dtype) ? first.dtype : :float32

  # The chunk size is derived from the first tensor's byte size alone.
  # NOTE(review): presumably all tensors are expected to be the same size — confirm.
  chunk_size = ring.calculate_chunk_size(sizes[0])

  # One freshly allocated chunk-sized result buffer per GPU. These raw device
  # pointers are what the caller receives; nothing here frees them.
  result_buffers = @gpu_ids.map { |gpu_id| allocate_buffer_on_device(gpu_id, chunk_size) }

  # A caller-supplied stream is shared by every GPU; otherwise null streams.
  streams = stream ? Array.new(gpu_count, stream) : create_null_streams(gpu_count)

  ring.reduce_scatter(
    buffers: buffers,
    result_buffers: result_buffers,
    sizes: sizes,
    dtype: dtype,
    op: op,
    streams: streams
  )

  result_buffers
end
|
|
188
|
+
|
|
189
|
+
# AllToAll - full exchange between all GPUs
|
|
190
|
+
# Each GPU sends N chunks (one to each GPU) and receives N chunks (one from each GPU)
|
|
191
|
+
# @param send_buffers [Array<Array<FFI::Pointer>>] N×N array: send_buffers[src][dst]
|
|
192
|
+
# @param recv_buffers [Array<Array<FFI::Pointer>>] N×N array: recv_buffers[dst][src]
|
|
193
|
+
# @param chunk_size [Integer] Size of each chunk in bytes
|
|
194
|
+
# @param stream [CUDA::Stream, nil] Optional CUDA stream
|
|
195
|
+
# @return [void]
|
|
196
|
+
def all_to_all(send_buffers, recv_buffers, chunk_size:, stream: nil)
  ensure_initialized!

  n = @gpu_ids.size
  return if n == 1

  # Resolve every pairwise transport BEFORE enqueueing any copy. Pairs without
  # a P2P transport used to be skipped silently, which left the matching
  # recv_buffers slot unwritten; now the call fails up front instead of
  # returning a partially filled exchange.
  transports = Array.new(n) do |rank|
    Array.new(n) do |partner|
      next if partner == rank

      transport = @transport_selector.select_transport(@gpu_ids[rank], @gpu_ids[partner])
      unless transport.is_a?(Transport::P2PTransport)
        raise CommunicatorError,
              "all_to_all requires a P2P transport between GPU #{@gpu_ids[rank]} " \
              "and GPU #{@gpu_ids[partner]}, got #{transport.class}"
      end
      transport
    end
  end

  # A caller-supplied stream is shared by all ranks; otherwise null streams.
  streams = stream ? [stream] * n : create_null_streams(n)

  # Phase 1: each GPU copies its own chunk (send[i][i] -> recv[i][i]).
  n.times do |rank|
    CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[rank])
    CUDA::RuntimeAPI.cudaMemcpyAsync(
      recv_buffers[rank][rank],
      send_buffers[rank][rank],
      chunk_size,
      CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_DEVICE,
      get_stream_ptr(streams[rank])
    )
  end

  # Phase 2: N-1 rounds; in round r, rank i sends to (i + r + 1) % N, so every
  # ordered pair is covered exactly once.
  (n - 1).times do |round|
    n.times do |rank|
      partner = (rank + round + 1) % n

      # Issue the copy with the sender's device current, as send_recv does.
      CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[rank])
      transports[rank][partner].copy_async(
        recv_buffers[partner][rank], # partner receives from this rank
        send_buffers[rank][partner], # this rank sends to partner
        chunk_size,
        get_stream_ptr(streams[rank])
      )
    end

    # Finish the round before the next one is enqueued.
    synchronize_all_streams!(streams)
  end
end
|
|
247
|
+
|
|
248
|
+
# Point-to-point send from current rank to destination
|
|
249
|
+
# @param tensor [NvArray, FFI::Pointer] Data to send
|
|
250
|
+
# @param dest_rank [Integer] Destination rank (index in gpu_ids)
|
|
251
|
+
# @param size [Integer, nil] Size in bytes (inferred from tensor if nil)
|
|
252
|
+
# @param stream [CUDA::Stream, nil] Optional CUDA stream
|
|
253
|
+
# @return [void]
|
|
254
|
+
def send(tensor, dest_rank:, size: nil, stream: nil)
  ensure_initialized!
  validate_gpu_index!(dest_rank)

  # The sending side is fixed at index 0 of gpu_ids; @rank is not consulted.
  src_rank = 0
  return if dest_rank == src_rank

  # Resolved but not consumed below. byte_size_of still acts as a check: it
  # raises for objects without a byte-size reader when `size` is nil.
  _buffer = device_buffer(tensor)
  _byte_size = size || byte_size_of(tensor)

  transport = @transport_selector.select_transport(@gpu_ids[src_rank], @gpu_ids[dest_rank])
  _stream_ptr = stream ? get_stream_ptr(stream) : FFI::Pointer::NULL

  # NOTE(review): no branch of this method copies any data. Non-P2P
  # transports return nil without transferring; P2P transports always raise
  # because this signature carries no destination buffer (send_recv does).
  # NOTE(review): the method name shadows Object#send on Communicator
  # instances — confirm nothing relies on dynamic dispatch via #send.
  return unless transport.is_a?(Transport::P2PTransport)

  raise ArgumentError, "P2P send requires pre-allocated recv buffer on dest"
end
|
|
276
|
+
|
|
277
|
+
# Point-to-point send from specific source rank
|
|
278
|
+
# @param buffer [FFI::Pointer] Source buffer on src_rank GPU
|
|
279
|
+
# @param src_rank [Integer] Source rank
|
|
280
|
+
# @param dst_buffer [FFI::Pointer] Destination buffer on dst_rank GPU
|
|
281
|
+
# @param dst_rank [Integer] Destination rank
|
|
282
|
+
# @param size [Integer] Size in bytes
|
|
283
|
+
# @param stream [CUDA::Stream, nil] Optional CUDA stream
|
|
284
|
+
# @return [void]
|
|
285
|
+
def send_recv(buffer, src_rank:, dst_buffer:, dst_rank:, size:, stream: nil)
  ensure_initialized!
  validate_gpu_index!(src_rank)
  validate_gpu_index!(dst_rank)

  # Same rank on both ends: nothing to move.
  return if src_rank == dst_rank

  src_gpu = @gpu_ids[src_rank]
  dst_gpu = @gpu_ids[dst_rank]

  transport = @transport_selector.select_transport(src_gpu, dst_gpu)
  stream_ptr = stream ? get_stream_ptr(stream) : FFI::Pointer::NULL

  if transport.is_a?(Transport::P2PTransport)
    # Issue the peer copy with the source device current.
    CUDA::RuntimeAPI.cudaSetDevice(src_gpu)
    transport.copy_async(dst_buffer, buffer, size, stream_ptr)
  elsif transport.is_a?(Transport::IPCTransport)
    # Export on the source side, then map the handle with the destination
    # device current.
    handle = transport.export_handle(buffer)
    CUDA::RuntimeAPI.cudaSetDevice(dst_gpu)
    mapped = transport.import_handle(handle)

    begin
      CUDA::RuntimeAPI.cudaMemcpyAsync(
        dst_buffer,
        mapped,
        size,
        CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_DEVICE,
        stream_ptr
      )

      # The copy above is only enqueued. Wait for it before the mapping is
      # released, otherwise the transfer may read from an already closed
      # mapping.
      if stream_ptr.null?
        CUDA::RuntimeAPI.cudaDeviceSynchronize
      else
        CUDA::RuntimeAPI.cudaStreamSynchronize(stream_ptr)
      end
    ensure
      # Always release the imported mapping, even if the copy/sync raised.
      transport.close_imported_handle(mapped)
    end
  else
    # Previously any other transport fell through and returned nil without
    # moving data; report it instead of pretending the transfer happened.
    raise CommunicatorError,
          "send_recv does not support #{transport.class} between GPU #{src_gpu} and GPU #{dst_gpu}"
  end
end
|
|
320
|
+
|
|
321
|
+
# Point-to-point receive (no-op, actual receive happens in send_recv)
|
|
322
|
+
# @param buffer [FFI::Pointer] Buffer to receive into
|
|
323
|
+
# @param src_rank [Integer] Source rank
|
|
324
|
+
# @param size [Integer] Expected size in bytes
|
|
325
|
+
# @param stream [CUDA::Stream, nil] Optional CUDA stream
|
|
326
|
+
# @return [void]
|
|
327
|
+
def recv(buffer, src_rank:, size:, stream: nil)
  ensure_initialized!
  validate_gpu_index!(src_rank)

  # No data is moved here and `buffer`, `size` and `stream` are not read:
  # the transfer is performed by the sender through send_recv. This side
  # only waits on the communicator-wide barrier.
  barrier
end
|
|
334
|
+
|
|
335
|
+
# Barrier synchronization across all GPUs
|
|
336
|
+
# @return [void]
|
|
337
|
+
def barrier
  # Refuse to synchronize a communicator that was never initialized.
  ensure_initialized!

  # Delegates the wait to the device manager; its return value is passed on.
  @device_manager.synchronize_all!
end
|
|
341
|
+
|
|
342
|
+
# Check if communicator is ready
|
|
343
|
+
# @return [Boolean] True if initialized
|
|
344
|
+
def ready?
  # Short-circuits exactly like a chained `&&`: the first falsy value is
  # returned as-is, otherwise the transport selector's answer is the result.
  return @initialized unless @initialized

  devices_ready = @device_manager.ready?
  return devices_ready unless devices_ready

  @transport_selector.ready?
end
|
|
349
|
+
|
|
350
|
+
# Get the topology matrix
|
|
351
|
+
# @return [Topology::Matrix] Topology information
|
|
352
|
+
def topology
  # The device manager may not have a topology object; in that case nil is
  # returned instead of a matrix.
  detected = @device_manager.topology
  detected.nil? ? nil : detected.matrix
end
|
|
355
|
+
|
|
356
|
+
# Get performance summary
|
|
357
|
+
# @return [Hash] Performance statistics
|
|
358
|
+
def performance_summary
  # Pure delegation: the statistics are whatever the transport selector reports.
  selector = @transport_selector
  selector.performance_summary
end
|
|
361
|
+
|
|
362
|
+
# Clean up all resources
|
|
363
|
+
# @return [void]
|
|
364
|
+
def destroy!
  begin
    @transport_selector.destroy!
  ensure
    # Tear the device manager down even if the selector teardown raised, so
    # one failing component cannot skip the other's cleanup.
    @device_manager.destroy!
  end
  nil
ensure
  # Whatever happened above, the communicator is no longer usable: clear the
  # flag so ensure_initialized! rejects further collectives, and drop the
  # cached ring order so a later initialize! recomputes it.
  @initialized = false
  @ring_order = nil
end
|
|
369
|
+
|
|
370
|
+
# @return [String] Human-readable description
|
|
371
|
+
def to_s
  # Short form: GPU count plus whether initialize! has completed.
  state_label = if @initialized
                  "ready"
                else
                  "uninitialized"
                end
  gpu_count = @gpu_ids.size

  "Communicator[#{gpu_count} GPUs, #{state_label}]"
end
|
|
375
|
+
|
|
376
|
+
# @return [String] Detailed inspection
|
|
377
|
+
def inspect
  # Debug form: membership, rank/world size and initialization state.
  fields = [
    "gpu_ids=#{@gpu_ids}",
    "rank=#{@rank}/#{@world_size}",
    "initialized=#{@initialized}"
  ]

  "#<Ignis::Collective::Communicator #{fields.join(' ')}>"
end
|
|
383
|
+
|
|
384
|
+
private
|
|
385
|
+
|
|
386
|
+
# Checks @gpu_ids against the devices reported by CUDA::Device.count.
# @raise [ArgumentError] if the list is empty or contains anything that is
#   not an Integer in 0..(device count - 1)
# @return [nil]
def validate_gpu_ids!
  raise ArgumentError, "gpu_ids cannot be empty" if @gpu_ids.empty?

  max_id = CUDA::Device.count - 1

  # Entries must be Integers: nil used to crash with NoMethodError and a
  # Float such as 1.0 slipped through the range check. Both are now reported
  # as invalid IDs.
  invalid = @gpu_ids.reject { |id| id.is_a?(Integer) && id.between?(0, max_id) }
  return if invalid.empty?

  raise ArgumentError, "Invalid GPU IDs: #{invalid}. Valid range: 0-#{max_id}"
end
|
|
395
|
+
|
|
396
|
+
# Accepts only the symbols listed in REDUCTION_OPS.
# @raise [ArgumentError] for any other value
def validate_operation!(op)
  raise ArgumentError, "Invalid reduction op: #{op}. Valid: #{REDUCTION_OPS}" unless REDUCTION_OPS.include?(op)
end
|
|
401
|
+
|
|
402
|
+
# Requires exactly one tensor per GPU in the communicator.
# @raise [ArgumentError] when the counts differ
def validate_tensors!(tensors)
  actual = tensors.size
  expected = @gpu_ids.size
  return if actual == expected

  raise ArgumentError, "Expected #{expected} tensors, got #{actual}"
end
|
|
408
|
+
|
|
409
|
+
# Checks that `index` is a valid position in @gpu_ids (a rank/root index,
# not a CUDA device ID).
# @param index [Integer]
# @raise [ArgumentError] if index is not an Integer in 0...@gpu_ids.size
# @return [nil]
def validate_gpu_index!(index)
  last = @gpu_ids.size - 1

  # Require a real Integer: nil used to crash with NoMethodError, and a
  # fractional value such as 0.5 passed the range check and was then
  # truncated by Array#[].
  return if index.is_a?(Integer) && index.between?(0, last)

  raise ArgumentError, "Invalid GPU index: #{index.inspect}. Valid: 0-#{last}"
end
|
|
414
|
+
|
|
415
|
+
# Guard used by every collective entry point.
# @raise [CommunicatorError] unless initialize! has completed
def ensure_initialized!
  raise CommunicatorError, "Communicator not initialized. Call initialize! first." unless @initialized
end
|
|
420
|
+
|
|
421
|
+
# Ring AllReduce using the Ring algorithm
|
|
422
|
+
# Uses scatter-reduce + allgather pattern
|
|
423
|
+
def ring_all_reduce(tensors, op, stream)
  gpu_count = @gpu_ids.size
  return tensors if gpu_count == 1

  ring = Algorithms::Ring.new(ring_order: @ring_order, transport_selector: @transport_selector)

  # Device pointers and sizes in BYTES, one entry per tensor.
  buffers = tensors.map { |tensor| device_buffer(tensor) }
  sizes = tensors.map { |tensor| byte_size_of(tensor) }

  # dtype comes from the first tensor; raw pointers default to :float32.
  first = tensors[0]
  dtype = if first.respond_to?(:dtype)
            first.dtype
          else
            :float32
          end

  # A caller-supplied stream is reused for every GPU; otherwise null streams.
  streams = stream ? Array.new(gpu_count, stream) : create_null_streams(gpu_count)

  # `op` is forwarded unchanged, including :avg.
  ring.all_reduce(buffers: buffers, sizes: sizes, dtype: dtype, op: op, streams: streams)

  # For :avg the buffers are scaled by 1/gpu_count once, after the ring pass.
  apply_avg!(buffers, sizes, dtype, gpu_count) if op == :avg

  # The reduction is in place: the caller gets the same tensor objects back.
  tensors
end
|
|
457
|
+
|
|
458
|
+
# Create null stream pointers for each GPU
|
|
459
|
+
# @param n [Integer] number of entries wanted
# @return [Array<FFI::Pointer>] n references to FFI::Pointer::NULL
def create_null_streams(n)
  Array.new(n, FFI::Pointer::NULL)
end
|
|
462
|
+
|
|
463
|
+
# Extract an FFI device pointer from a tensor/NvArray (either class) or
|
|
464
|
+
# pass through a raw pointer. (Was `t.data_ptr` — which no NvArray defines.)
|
|
465
|
+
def device_buffer(t)
  # Readers are probed in priority order; the first one the object responds
  # to wins. Anything exposing none of them is returned unchanged and is
  # assumed to already be a raw pointer.
  reader = %i[device_ffi_ptr device_ptr data_ptr].find { |name| t.respond_to?(name) }
  reader ? t.public_send(reader) : t
end
|
|
472
|
+
|
|
473
|
+
# Size of a tensor in BYTES. (Was `t.byte_size || t.size`, and t.size is the
|
|
474
|
+
# element COUNT — not bytes — for NvArray, so reductions ran on 1/4 the data.)
|
|
475
|
+
def byte_size_of(t)
  # Probe the known byte-size readers in priority order. There is
  # deliberately no fallback to `size`, which is an element count for NvArray.
  reader = %i[nbytes size_bytes byte_size].find { |name| t.respond_to?(name) }
  return t.public_send(reader) if reader

  raise ArgumentError, "cannot determine byte size of #{t.class}"
end
|
|
482
|
+
|
|
483
|
+
# Element size in bytes for a dtype.
|
|
484
|
+
# @param dtype [Symbol] element type name
# @return [Integer] bytes per element; unrecognized dtypes are treated as
#   4-byte elements
def elem_size_of(dtype)
  case dtype
  when :float64, :int64, :uint64 then 8
  # 16-bit integers used to fall through to the 4-byte default, which halves
  # the element count that apply_avg! derives from the byte size.
  when :float16, :bfloat16, :int16, :uint16 then 2
  when :int8, :uint8 then 1
  else 4
  end
end
|
|
492
|
+
|
|
493
|
+
# Divide each buffer by the participant count in place (for op: :avg).
|
|
494
|
+
# Scales every buffer in place by 1/n (the final step of op: :avg).
# @param buffers [Array] device pointers, one per GPU (same order as @gpu_ids)
# @param sizes [Array<Integer>] byte size of each buffer
# @param dtype [Symbol] element type, used only to turn bytes into a count
# @param n [Integer] number of participants to divide by
def apply_avg!(buffers, sizes, dtype, n)
  scale = 1.0 / n
  kernel = Ignis::JIT::Kernels::Elementwise.scale_forward
  es = elem_size_of(dtype)
  # NOTE(review): the same scale kernel is launched for every dtype — confirm
  # it handles non-float32 buffers.

  buffers.each_with_index do |buf, i|
    count = sizes[i] / es

    # Nothing to scale, and (count + 255) / 256 would request a grid of zero
    # blocks, which is not a valid launch configuration.
    next if count.zero?

    CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[i])
    # Input and output are the same buffer: in-place scaling, 256 threads/block.
    kernel.launch(grid: [(count + 255) / 256], block: [256], args: [buf, buf, scale, count])
  end

  Ignis.synchronize
end
|
|
505
|
+
|
|
506
|
+
# Tree broadcast from root
|
|
507
|
+
def simple_broadcast(tensor, root, stream)
  gpu_count = @gpu_ids.size

  # One device pointer per GPU: the root slot reuses the source tensor's
  # buffer, every other slot gets a newly allocated buffer of the same byte
  # size. Nothing in this method frees those allocations.
  buffers = @gpu_ids.each_with_index.map do |gpu_id, position|
    next device_buffer(tensor) if position == root

    allocate_buffer_on_device(gpu_id, byte_size_of(tensor))
  end

  payload_bytes = byte_size_of(tensor)

  tree = Algorithms::Tree.new(gpu_ids: @gpu_ids, transport_selector: @transport_selector)

  # A caller-supplied stream is shared by every GPU; otherwise null streams.
  streams = stream ? Array.new(gpu_count, stream) : create_null_streams(gpu_count)

  tree.broadcast(
    buffer: buffers[root],
    buffers: buffers,
    size: payload_bytes,
    root: root,
    streams: streams
  )

  # Returns the raw per-GPU buffers (root's entry is the source buffer).
  buffers
end
|
|
540
|
+
|
|
541
|
+
# Tree reduce to root
|
|
542
|
+
def simple_reduce(tensors, root, op, stream)
  gpu_count = @gpu_ids.size

  buffers = tensors.map { |tensor| device_buffer(tensor) }
  sizes = tensors.map { |tensor| byte_size_of(tensor) }

  # dtype comes from the first tensor; raw pointers default to :float32.
  first = tensors[0]
  dtype = first.respond_to?(:dtype) ? first.dtype : :float32

  tree = Algorithms::Tree.new(gpu_ids: @gpu_ids, transport_selector: @transport_selector)

  # A caller-supplied stream is shared by every GPU; otherwise null streams.
  streams = stream ? Array.new(gpu_count, stream) : create_null_streams(gpu_count)

  tree.reduce(buffers: buffers, sizes: sizes, dtype: dtype, op: op, root: root, streams: streams)

  # The caller gets back the root's original tensor object.
  tensors[root]
end
|
|
570
|
+
|
|
571
|
+
# All-gather. In this single-process model every rank's tensor is already
|
|
572
|
+
# an accessible Ruby/GPU object, so the gathered set on each rank is simply
|
|
573
|
+
# all input tensors. We return an independent list per rank (so callers may
|
|
574
|
+
# mutate one rank's view without aliasing others).
|
|
575
|
+
#
|
|
576
|
+
# NOTE: true cross-PROCESS / cross-node gather (world_size > 1) requires the
|
|
577
|
+
# transport layer to physically move device buffers, which is not wired yet;
|
|
578
|
+
# ring AllGather machinery exists in Algorithms::Ring#all_gather_standalone
|
|
579
|
+
# for when P2P/host-staged transports are completed.
|
|
580
|
+
def simple_all_gather(tensors, _stream)
  # Wait on the communicator-wide barrier before handing the tensors out.
  barrier

  # One shallow copy of the input list per GPU: the lists are independent,
  # the tensor objects inside them are shared.
  @gpu_ids.map { tensors.dup }
end
|
|
584
|
+
|
|
585
|
+
# Helper: get tensor size
|
|
586
|
+
# Best-effort size lookup for a tensor-like object.
#
# Byte-size readers are tried first, in the same order as byte_size_of, so
# an object exposing both `nbytes` and `size` reports BYTES rather than its
# element count (the old `byte_size`-then-`size` order returned the count).
# `size` and the constant 4 remain as last-resort fallbacks for callers that
# relied on this helper never raising.
#
# @param tensor [Object]
# @return [Integer]
def tensor_size(tensor)
  %i[nbytes size_bytes byte_size size].each do |reader|
    return tensor.public_send(reader) if tensor.respond_to?(reader)
  end

  4 # Default to 4 bytes
end
|
|
595
|
+
|
|
596
|
+
# Helper: allocate buffer on specific device
|
|
597
|
+
def allocate_buffer_on_device(gpu_id, size)
  runtime = CUDA::RuntimeAPI
  runtime.ensure_loaded!

  # Select the target device before allocating.
  runtime.cudaSetDevice(gpu_id)

  # cudaMalloc writes the new device address into this pointer-sized slot.
  out_slot = FFI::MemoryPointer.new(:pointer)
  # The status label mentions "broadcast" even though reduce_scatter also
  # allocates through this helper.
  runtime.check_status!(runtime.cudaMalloc(out_slot, size), "Alloc broadcast buffer")

  out_slot.read_pointer
end
|
|
605
|
+
|
|
606
|
+
# Helper: get stream pointer for FFI
|
|
607
|
+
def get_stream_ptr(stream)
  # Raw pointers pass straight through.
  return stream if stream.is_a?(FFI::Pointer)

  # Stream wrappers expose their handle via #ptr; anything else (including
  # nil) maps to the NULL pointer.
  stream.is_a?(CUDA::Stream) ? stream.ptr : FFI::Pointer::NULL
end
|
|
617
|
+
|
|
618
|
+
# Helper: synchronize all streams
|
|
619
|
+
def synchronize_all_streams!(streams)
  # streams[i] is paired with @gpu_ids[i].
  streams.each_with_index do |stream, position|
    CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[position])

    handle = get_stream_ptr(stream)
    # A NULL handle cannot be waited on individually, so the whole device is
    # synchronized instead.
    next CUDA::RuntimeAPI.cudaDeviceSynchronize if handle.null?

    CUDA::RuntimeAPI.cudaStreamSynchronize(handle)
  end
end
|
|
631
|
+
end
|
|
632
|
+
end
|
|
633
|
+
end
|