ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 7343f5cf832fab47fa86cc79eab930cafb142b6f0967687d65d8b5896c8ad488
|
|
4
|
+
data.tar.gz: 87f2e47d253db19ede66f71d9fcfc24bc59fc1b7813b67230d6fe1bd1be87b67
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 791c617270822aad2c14bbca288da77baf1b3202c5ed46d0695936ec8953582d1c6e0677e513c58f21ebde11ef07eacb795f480cc126a5c337c59e52943f38fd
|
|
7
|
+
data.tar.gz: 78a9a949ca36e3b880c7df0ebe7ccdc3408ed082546957703936ef80a810f27d5c98e74ee5b047a76770bc31105bb44326f76519d57d395e8aadb2ea07563602
|
data/README.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# ignis-collective (experimental)
|
|
2
|
+
|
|
3
|
+
Multi-GPU collective communication for Ruby — NCCL-style ring/tree all-reduce with P2P / IPC (VMM) / host-staged / TCP transports, on the [`ignis`](https://rubygems.org/gems/ignis) foundation.
|
|
4
|
+
|
|
5
|
+
> **Experimental.** The transports require multiple GPUs / nodes and cannot be exercised on a single GPU, so this gem is shipped separately from the verified single-GPU stack. APIs may change.
|
|
6
|
+
|
|
7
|
+
MIT.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# ignis-collective — multi-GPU collective communication (NvCCL: ring/tree all-reduce,
|
|
4
|
+
# P2P/IPC/host-staged/TCP transports) on the Ignis foundation. EXPERIMENTAL: the
|
|
5
|
+
# transports cannot be exercised on a single GPU, so this gem is provided for
|
|
6
|
+
# multi-GPU/multi-node setups and is not part of the verified single-GPU path.
|
|
7
|
+
|
|
8
|
+
require "ignis"
|
|
9
|
+
require_relative "nvruby/collective"
|
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module Collective
|
|
5
|
+
module Algorithms
|
|
6
|
+
# Double Binary Tree Algorithm for heterogeneous GPU counts
#
# When GPU count is not a power of 2, a single binary tree has imbalanced load.
# The double binary tree uses two overlapping trees to balance communication.
#
# Key insight: GPU i participates in both trees but at different levels.
# - Tree 1: heap-ordered binary tree rooted at the collective's root rank
# - Tree 2: the same shape rooted N/2 ranks further along (modulo N)
#
# For non-power-of-2 GPU counts the payload is split in two and each half
# travels through one of the trees; power-of-2 counts use tree 1 only.
class DoubleBinaryTree
  # Node in the double tree structure. +parent+, +left_child+ and
  # +right_child+ hold ranks (indices into the tree array), not node objects.
  TreeNode = Struct.new(:rank, :gpu_id, :parent, :left_child, :right_child, :tree_id, keyword_init: true)

  # Raised when the transport selected for a hop is not a
  # Transport::P2PTransport. Such hops used to be skipped silently, which
  # left destination buffers untouched and produced wrong results.
  UnsupportedTransportError = Class.new(StandardError)

  # Bytes per element for the dtypes this class knows about.
  ELEMENT_SIZES = {
    int8: 1, uint8: 1,
    float16: 2, bfloat16: 2, int16: 2, uint16: 2,
    float32: 4, int32: 4, uint32: 4,
    float64: 8, int64: 8, uint64: 8
  }.freeze

  # Element size assumed for dtypes missing from ELEMENT_SIZES.
  DEFAULT_ELEMENT_SIZE = 4

  # @return [Array<Integer>] GPU IDs
  attr_reader :gpu_ids

  # @return [Integer] Number of GPUs
  attr_reader :n_gpus

  # @return [TransportSelector] Transport selector
  attr_reader :transport_selector

  # @param gpu_ids [Array<Integer>] GPU device IDs
  # @param transport_selector [TransportSelector] Transport selector
  def initialize(gpu_ids:, transport_selector:)
    @gpu_ids = gpu_ids.dup.freeze
    @n_gpus = @gpu_ids.size
    @transport_selector = transport_selector
    @trees = {}
    @tree1 = nil
    @tree2 = nil
  end

  # Build (or rebuild) the double binary tree structure for root rank 0 and
  # drop any trees cached for other roots.
  #
  # @return [void]
  def build!
    @trees = {}

    if @n_gpus.zero?
      @tree1 = []
      @tree2 = []
      return
    end

    @tree1, @tree2 = trees_for(0)
    nil
  end

  # Broadcast using double tree (for fault tolerance / load balancing)
  #
  # @param buffer [FFI::Pointer] Source buffer on root. Accepted for interface
  #   compatibility; the payload is read from +buffers[root]+.
  # @param buffers [Array<FFI::Pointer>] Destination buffers on all GPUs
  # @param size [Integer] Buffer size in bytes
  # @param root [Integer] Root rank
  # @param streams [Array<FFI::Pointer>] CUDA streams
  # @raise [ArgumentError] if +root+ is not a valid rank
  # @raise [UnsupportedTransportError] if a hop has no P2P transport
  # @return [void]
  def broadcast(buffer:, buffers:, size:, root:, streams:)
    return if @n_gpus <= 1 || size.zero?

    validate_root!(root)
    tree1, tree2 = trees_for(root)

    if power_of_2?(@n_gpus)
      broadcast_tree(tree1, buffers, size, root, streams)
      return
    end

    # Split the payload; the second half absorbs the odd byte so nothing is lost.
    first_bytes = size / 2
    second_bytes = size - first_bytes

    broadcast_tree_partial(tree1, buffers, 0, first_bytes, root, streams)

    # Tree 2 is rooted at a different rank, so its root must first receive the
    # second half from the real root before it can fan it out.
    relay = tree2_root_for(root)
    copy_between(root, relay,
                 ptr_offset(buffers[relay], first_bytes),
                 ptr_offset(buffers[root], first_bytes),
                 second_bytes, streams[root])
    synchronize_all_streams!(streams)

    broadcast_tree_partial(tree2, buffers, first_bytes, second_bytes, root, streams)
  end

  # Reduce using double tree
  #
  # Intermediate ranks' buffers are used as accumulators and are overwritten.
  #
  # @param buffers [Array<FFI::Pointer>] Source buffers
  # @param sizes [Array<Integer>] Buffer sizes (only +sizes[0]+ is read)
  # @param dtype [Symbol] Data type
  # @param op [Symbol] Reduction operation
  # @param root [Integer] Root rank
  # @param streams [Array<FFI::Pointer>] CUDA streams
  # @raise [ArgumentError] if +root+ is not a valid rank
  # @raise [UnsupportedTransportError] if a hop has no P2P transport
  # @return [void]
  def reduce(buffers:, sizes:, dtype:, op:, root:, streams:)
    return if @n_gpus <= 1

    validate_root!(root)
    tree1, tree2 = trees_for(root)
    total_bytes = sizes[0]

    if power_of_2?(@n_gpus)
      reduce_tree(tree1, buffers, total_bytes, dtype, op, root, streams)
      return
    end

    # Split on an element boundary so no element straddles the two halves.
    elem_size = dtype_elem_size(dtype)
    count = total_bytes / elem_size
    first_bytes = (count / 2) * elem_size
    second_bytes = (count * elem_size) - first_bytes

    # Tree 1: reduce first half straight to root
    reduce_tree_partial(tree1, buffers, 0, first_bytes, dtype, op, root, streams)

    # Tree 2: reduce second half to tree 2's root, then hand it to the real root
    relay = tree2_root_for(root)
    reduce_tree_partial(tree2, buffers, first_bytes, second_bytes, dtype, op, relay, streams)

    if second_bytes.positive?
      copy_between(relay, root,
                   ptr_offset(buffers[root], first_bytes),
                   ptr_offset(buffers[relay], first_bytes),
                   second_bytes, streams[root])
    end

    synchronize_all_streams!(streams)
  end

  private

  # Returns the [tree1, tree2] pair for +root+, building and caching it on
  # first use. Tree 1 is rooted at +root+, tree 2 at tree2_root_for(root).
  def trees_for(root)
    @trees[root] ||= [
      build_binary_tree(root_rank: root, tree_id: 1),
      build_binary_tree(root_rank: tree2_root_for(root), tree_id: 2)
    ]
  end

  # Rank at which tree 2 is rooted for a collective rooted at +root+.
  def tree2_root_for(root)
    (root + (@n_gpus / 2)) % @n_gpus
  end

  # @raise [ArgumentError] unless +root+ is an Integer in 0...n_gpus
  def validate_root!(root)
    return if root.is_a?(Integer) && root >= 0 && root < @n_gpus

    raise ArgumentError, "root must be a rank in 0...#{@n_gpus}, got #{root.inspect}"
  end

  # Builds one heap-ordered binary tree: ranks are rotated so +root_rank+ sits
  # at tree position 0, and position p gets children 2p+1 and 2p+2.
  #
  # @return [Array<TreeNode>] nodes indexed by rank
  def build_binary_tree(root_rank:, tree_id:)
    nodes = Array.new(@n_gpus) do |rank|
      TreeNode.new(rank: rank, gpu_id: @gpu_ids[rank], parent: nil,
                   left_child: nil, right_child: nil, tree_id: tree_id)
    end

    order = reorder_for_root(root_rank)

    order.each_with_index do |rank, position|
      left_rank = order[(2 * position) + 1]
      right_rank = order[(2 * position) + 2]

      if left_rank
        nodes[rank].left_child = left_rank
        nodes[left_rank].parent = rank
      end

      if right_rank
        nodes[rank].right_child = right_rank
        nodes[right_rank].parent = rank
      end
    end

    nodes
  end

  # Circular shift of 0...n_gpus so that +root_rank+ is at index 0.
  def reorder_for_root(root_rank)
    (0...@n_gpus).to_a.rotate(root_rank)
  end

  # Ranks of the children of +node+ (left first), without nils.
  def children_of(node)
    [node.left_child, node.right_child].compact
  end

  # Nodes that have at least one child, grouped by depth, shallowest first.
  #
  # @return [Array<Array<TreeNode>>]
  def parents_by_depth(tree)
    tree.reject { |node| children_of(node).empty? }
        .group_by { |node| node_depth(node, tree) }
        .sort_by(&:first)
        .map(&:last)
  end

  # Broadcasts the whole buffer down +tree+.
  def broadcast_tree(tree, buffers, size, root, streams)
    broadcast_tree_partial(tree, buffers, 0, size, root, streams)
  end

  # Broadcasts +size+ bytes starting at +offset+ down +tree+, one level at a
  # time. +root+ is the rank that already owns the payload; it is never
  # overwritten, but still forwards to its own children.
  def broadcast_tree_partial(tree, buffers, offset, size, root, streams)
    return if size.zero?

    parents_by_depth(tree).each do |parents|
      parents.each do |node|
        children_of(node).each do |child_rank|
          next if child_rank == root

          copy_between(node.rank, child_rank,
                       ptr_offset(buffers[child_rank], offset),
                       ptr_offset(buffers[node.rank], offset),
                       size, streams[node.rank])
        end
      end

      # A level must be complete before its children start forwarding.
      synchronize_all_streams!(streams)
    end
  end

  # Reduces +size+ bytes of every buffer towards the root of +tree+, deepest
  # level first. Each parent pulls a child's data into a scratch buffer and
  # folds it into its own buffer with ReductionOps.execute.
  #
  # The result lands on the tree's own root; +_root+ is kept for signature
  # compatibility only.
  def reduce_tree(tree, buffers, size, dtype, op, _root, streams)
    count = size / dtype_elem_size(dtype)
    return if count.zero?

    levels = parents_by_depth(tree).reverse
    scratch = {}

    begin
      # Only ranks that actually receive data need a scratch buffer.
      levels.flatten.each do |node|
        scratch[node.rank] = allocate_buffer(node.gpu_id, size)
      end

      levels.each do |parents|
        parents.each do |node|
          CUDA::RuntimeAPI.cudaSetDevice(node.gpu_id)
          stream = streams[node.rank]
          stream_ptr = get_stream_ptr(stream)
          recv = scratch[node.rank]

          children_of(node).each do |child_rank|
            copy_between(child_rank, node.rank, recv, buffers[child_rank], size, stream)
            synchronize_stream!(stream)

            ReductionOps.execute(op, buffers[node.rank], recv,
                                 buffers[node.rank], count, dtype, stream_ptr)
          end
        end

        synchronize_all_streams!(streams)
      end
    ensure
      scratch.each { |rank, buf| free_buffer(buf, @gpu_ids[rank]) }
    end
  end

  # Same as reduce_tree but restricted to +size+ bytes starting at +offset+.
  def reduce_tree_partial(tree, buffers, offset, size, dtype, op, root, streams)
    return if size.zero?

    shifted = buffers.map { |buf| ptr_offset(buf, offset) }
    reduce_tree(tree, shifted, size, dtype, op, root, streams)
  end

  # Issues one asynchronous copy from +src_rank+ to +dst_rank+ on +stream+.
  #
  # @raise [UnsupportedTransportError] if the selected transport is not P2P
  def copy_between(src_rank, dst_rank, dst_ptr, src_ptr, size, stream)
    src_gpu = @gpu_ids[src_rank]
    dst_gpu = @gpu_ids[dst_rank]
    transport = @transport_selector.select_transport(src_gpu, dst_gpu)

    unless transport.is_a?(Transport::P2PTransport)
      raise UnsupportedTransportError,
            "no P2P transport between GPU #{src_gpu} and GPU #{dst_gpu} (got #{transport.class})"
    end

    transport.copy_async(dst_ptr, src_ptr, size, get_stream_ptr(stream))
  end

  # Number of parent links between +node+ and the root of +tree+.
  def node_depth(node, tree)
    depth = 0
    current = node
    while current.parent
      current = tree[current.parent]
      depth += 1
    end
    depth
  end

  def power_of_2?(n)
    n > 0 && (n & (n - 1)).zero?
  end

  # Bytes per element of +dtype+; unknown dtypes fall back to DEFAULT_ELEMENT_SIZE.
  def dtype_elem_size(dtype)
    ELEMENT_SIZES.fetch(dtype, DEFAULT_ELEMENT_SIZE)
  end

  # Normalises a stream argument to a raw pointer; anything that is neither an
  # FFI::Pointer nor a CUDA::Stream maps to the NULL pointer.
  def get_stream_ptr(stream)
    case stream
    when FFI::Pointer then stream
    when CUDA::Stream then stream.ptr
    else FFI::Pointer::NULL
    end
  end

  # Pointer +offset+ bytes past +ptr+. A zero offset returns +ptr+ itself so
  # whole-buffer operations pass the caller's pointer through untouched.
  def ptr_offset(ptr, offset)
    return ptr if offset.zero?

    FFI::Pointer.new(:uint8, ptr.address + offset)
  end

  # Allocates +size+ bytes on +gpu_id+ with cudaMalloc.
  #
  # @raise [RuntimeError] if a non-empty allocation yields a NULL pointer
  def allocate_buffer(gpu_id, size)
    CUDA::RuntimeAPI.ensure_loaded!
    CUDA::RuntimeAPI.cudaSetDevice(gpu_id)
    slot = FFI::MemoryPointer.new(:pointer)
    CUDA::RuntimeAPI.cudaMalloc(slot, size)
    ptr = slot.read_pointer

    raise "cudaMalloc of #{size} bytes failed on GPU #{gpu_id}" if size.positive? && ptr.null?

    ptr
  end

  # Best-effort release of a buffer obtained from allocate_buffer.
  def free_buffer(buf, gpu_id)
    return unless buf && !buf.null?

    CUDA::RuntimeAPI.cudaSetDevice(gpu_id)
    CUDA::RuntimeAPI.cudaFree(buf)
  rescue StandardError
    # Ignore cleanup errors
  end

  # Waits for +stream+; a NULL stream falls back to a device-wide synchronise.
  def synchronize_stream!(stream)
    stream_ptr = get_stream_ptr(stream)
    if stream_ptr.null?
      CUDA::RuntimeAPI.cudaDeviceSynchronize
    else
      CUDA::RuntimeAPI.cudaStreamSynchronize(stream_ptr)
    end
  end

  # Synchronises every stream after selecting the GPU of the same index.
  def synchronize_all_streams!(streams)
    streams.each_with_index do |stream, i|
      CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[i])
      synchronize_stream!(stream)
    end
  end
end
|
|
362
|
+
end
|
|
363
|
+
end
|
|
364
|
+
end
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module Collective
|
|
5
|
+
module Algorithms
|
|
6
|
+
# Message Pipelining for Overlapping Communication and Computation
#
# Splits large messages into chunks that can be processed concurrently:
# - While GPU processes chunk N, transfer chunk N+1
# - Hides transfer latency behind compute
#
# Optimal chunk size balances:
# - Too small: overhead dominates
# - Too large: no overlap opportunity
class Pipeliner
  # Pipeline stage
  PipelineStage = Struct.new(:chunk_id, :offset, :size, :state, keyword_init: true)

  # Pipeline states
  STATE_PENDING = :pending
  STATE_TRANSFERRING = :transferring
  STATE_COMPUTING = :computing
  STATE_COMPLETE = :complete

  # Default chunk size: 512KB (good for PCIe 4.0)
  DEFAULT_CHUNK_SIZE = 512 * 1024

  # Bounds applied by optimal_chunk_size.
  MIN_CHUNK_SIZE = 64 * 1024
  MAX_CHUNK_SIZE = 4 * 1024 * 1024

  # Transfer time per stage that optimal_chunk_size aims for (1 ms).
  TARGET_STAGE_SECONDS = 0.001

  # @return [Integer] Chunk size in bytes
  attr_reader :chunk_size

  # @return [Integer] Number of pipeline stages (double-buffering = 2)
  attr_reader :num_stages

  # @param chunk_size [Integer] Size of each pipeline chunk in bytes (> 0)
  # @param num_stages [Integer] Number of concurrent stages (default 2, > 0)
  # @raise [ArgumentError] if either value is not a positive Integer; zero
  #   would otherwise divide by zero or stall the pipeline loop
  def initialize(chunk_size: DEFAULT_CHUNK_SIZE, num_stages: 2)
    unless chunk_size.is_a?(Integer) && chunk_size.positive?
      raise ArgumentError, "chunk_size must be a positive Integer, got #{chunk_size.inspect}"
    end
    unless num_stages.is_a?(Integer) && num_stages.positive?
      raise ArgumentError, "num_stages must be a positive Integer, got #{num_stages.inspect}"
    end

    @chunk_size = chunk_size
    @num_stages = num_stages
    @streams = {}
  end

  # Calculate a chunk size from the transfer bandwidth.
  #
  # The chunk is sized so one transfer takes about TARGET_STAGE_SECONDS, then
  # clamped to MIN_CHUNK_SIZE..MAX_CHUNK_SIZE and rounded down to a whole
  # number of elements so no element is split across chunks.
  #
  # @param bandwidth_gbps [Float] Transfer bandwidth in GB/s
  # @param compute_gflops [Float] Compute throughput in GFLOPS (accepted for
  #   interface compatibility; not used by the current heuristic)
  # @param flops_per_element [Float] FLOPs per element in reduction (not used
  #   by the current heuristic)
  # @param element_size [Integer] Bytes per element
  # @return [Integer] Chunk size in bytes
  def self.optimal_chunk_size(bandwidth_gbps:, compute_gflops:, flops_per_element: 1.0, element_size: 4)
    bandwidth_bps = bandwidth_gbps * 1e9
    chunk = (bandwidth_bps * TARGET_STAGE_SECONDS).to_i.clamp(MIN_CHUNK_SIZE, MAX_CHUNK_SIZE)

    return chunk unless element_size.is_a?(Integer) && element_size.positive?

    aligned = chunk - (chunk % element_size)
    aligned.zero? ? chunk : aligned
  end

  # Execute a pipelined reduction operation
  #
  # Chunks are processed in batches of +num_stages+: every transfer of a batch
  # is issued, then each chunk's stream is synchronised and its reduce issued,
  # then each stream is synchronised again before the next batch starts.
  # Streams created here (when +streams+ is nil) are destroyed before returning.
  #
  # @param buffers [Array<FFI::Pointer>] Device buffers (not read here)
  # @param total_size [Integer] Total buffer size
  # @param dtype [Symbol] Data type (not read here)
  # @param op [Symbol] Reduction operation (not read here)
  # @param transfer_fn [Proc] Transfer function (chunk_offset, chunk_size, stream) -> void
  # @param reduce_fn [Proc] Reduce function (chunk_offset, chunk_size, stream) -> void
  # @param streams [Array<CUDA::Stream>] CUDA streams (one per stage)
  # @return [void]
  def execute_pipelined(buffers:, total_size:, dtype:, op:, transfer_fn:, reduce_fn:, streams: nil)
    stages = plan_stages(total_size)
    return if stages.empty?

    owns_streams = !streams
    streams ||= create_streams(@num_stages)

    begin
      stages.each_slice(@num_stages) do |batch|
        batch.each do |stage|
          transfer_fn.call(stage.offset, stage.size, stream_for(streams, stage))
          stage.state = STATE_TRANSFERRING
        end

        batch.each do |stage|
          stream = stream_for(streams, stage)
          sync_stream(stream) # transfer must land before the reduce reads it
          reduce_fn.call(stage.offset, stage.size, stream)
          stage.state = STATE_COMPUTING
        end

        batch.each do |stage|
          sync_stream(stream_for(streams, stage))
          stage.state = STATE_COMPLETE
        end
      end
    ensure
      destroy_streams(streams) if owns_streams
    end

    nil
  end

  # Execute a pipelined AllReduce with overlap
  #
  # Runs +ring.all_reduce+ once per chunk, rotating through the per-GPU
  # streams by stage index. Streams created here (when +gpu_streams+ is nil)
  # are destroyed before returning.
  #
  # @param ring [Ring] Ring algorithm instance
  # @param buffers [Array<FFI::Pointer>] Device buffers
  # @param total_size [Integer] Total buffer size
  # @param dtype [Symbol] Data type
  # @param op [Symbol] Reduction operation
  # @param gpu_streams [Array<Array<CUDA::Stream>>] Streams per GPU per stage
  # @return [void]
  def execute_pipelined_all_reduce(ring:, buffers:, total_size:, dtype:, op:, gpu_streams: nil)
    n_gpus = ring.n_gpus
    owns_streams = !gpu_streams

    gpu_streams ||= Array.new(n_gpus) do |rank|
      CUDA::RuntimeAPI.cudaSetDevice(ring.ring_order[rank])
      create_streams_for_device(@num_stages)
    end

    begin
      plan_stages(total_size).each do |stage|
        stage_idx = stage.chunk_id % @num_stages
        stage_streams = gpu_streams.map { |per_gpu| per_gpu[stage_idx] }
        chunk_buffers = buffers.map { |buf| ptr_offset(buf, stage.offset) }

        ring.all_reduce(
          buffers: chunk_buffers,
          sizes: [stage.size] * n_gpus,
          dtype: dtype,
          op: op,
          streams: stage_streams
        )
      end

      # Final sync
      gpu_streams.flatten.each { |stream| sync_stream(stream) }
    ensure
      release_gpu_streams(ring, gpu_streams) if owns_streams
    end

    nil
  end

  private

  # Splits +total_size+ bytes into consecutive chunks of at most @chunk_size.
  #
  # @return [Array<PipelineStage>] stages in offset order, all STATE_PENDING
  def plan_stages(total_size)
    n_chunks = (total_size + @chunk_size - 1) / @chunk_size

    n_chunks.times.map do |i|
      offset = i * @chunk_size
      PipelineStage.new(chunk_id: i, offset: offset,
                        size: [total_size - offset, @chunk_size].min,
                        state: STATE_PENDING)
    end
  end

  # Stream assigned to +stage+ (round-robin over the stage slots).
  def stream_for(streams, stage)
    streams[stage.chunk_id % @num_stages]
  end

  # Creates +count+ streams on the current device. +_buffer+ is ignored and
  # kept only so existing two-argument calls keep working.
  def create_streams(count, _buffer = nil)
    create_streams_for_device(count)
  end

  # Creates +count+ streams on the current device.
  #
  # @return [Array<FFI::Pointer>] raw stream handles
  def create_streams_for_device(count)
    Array.new(count) do
      slot = FFI::MemoryPointer.new(:pointer)
      CUDA::RuntimeAPI.cudaStreamCreate(slot)
      slot.read_pointer
    end
  end

  # Best-effort destruction of streams this object created. Does nothing when
  # the runtime binding does not expose cudaStreamDestroy.
  def destroy_streams(streams)
    api = CUDA::RuntimeAPI
    return unless api.respond_to?(:cudaStreamDestroy)

    streams.each do |stream|
      next if stream.nil? || (stream.respond_to?(:null?) && stream.null?)

      api.cudaStreamDestroy(stream)
    end
  rescue StandardError
    # Ignore cleanup errors so they cannot mask the primary failure
  end

  # Destroys per-GPU stream sets after selecting each GPU; best effort.
  def release_gpu_streams(ring, gpu_streams)
    gpu_streams.each_with_index do |per_gpu, rank|
      CUDA::RuntimeAPI.cudaSetDevice(ring.ring_order[rank])
      destroy_streams(per_gpu)
    end
  rescue StandardError
    # Ignore cleanup errors
  end

  # Waits for +stream+; anything that is neither an FFI::Pointer nor a
  # CUDA::Stream is treated as the NULL stream and triggers a device-wide sync.
  def sync_stream(stream)
    stream_ptr = case stream
                 when FFI::Pointer then stream
                 when CUDA::Stream then stream.ptr
                 else FFI::Pointer::NULL
                 end

    if stream_ptr.null?
      CUDA::RuntimeAPI.cudaDeviceSynchronize
    else
      CUDA::RuntimeAPI.cudaStreamSynchronize(stream_ptr)
    end
  end

  # Pointer +offset+ bytes past +ptr+.
  def ptr_offset(ptr, offset)
    FFI::Pointer.new(:uint8, ptr.address + offset)
  end
end
|
|
220
|
+
end
|
|
221
|
+
end
|
|
222
|
+
end
|