ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module Collective
|
|
5
|
+
module Algorithms
|
|
6
|
+
# Topology-Aware Routing Optimizer
|
|
7
|
+
#
|
|
8
|
+
# Optimizes collective operation routing based on PCIe/NVLink topology.
|
|
9
|
+
# Groups GPUs by PCIe switch to minimize cross-switch traffic.
|
|
10
|
+
#
|
|
11
|
+
# Key optimizations:
|
|
12
|
+
# 1. Intra-switch communication first (lower latency)
|
|
13
|
+
# 2. Ring order follows physical topology
|
|
14
|
+
# 3. Tree structure respects switch boundaries
|
|
15
|
+
class TopologyRouter
|
|
16
|
+
# Switch group for routing
|
|
17
|
+
# Fields:
#   switch_id      - sequential index assigned while groups are detected (0, 1, ...)
#   gpu_ids        - GPU device IDs that were placed in this group
#   bandwidth_gbps - mean bandwidth over the group's GPU pairs that have a known
#                    path; 16.0 when no pair has one (e.g. a single-GPU group)
SwitchGroup = Struct.new(:switch_id, :gpu_ids, :bandwidth_gbps, keyword_init: true)
|
|
18
|
+
|
|
19
|
+
# @return [Topology::Detector] Topology detector
|
|
20
|
+
attr_reader :topology
|
|
21
|
+
|
|
22
|
+
# @return [Array<Integer>] GPU IDs
|
|
23
|
+
attr_reader :gpu_ids
|
|
24
|
+
|
|
25
|
+
# @return [Array<SwitchGroup>] GPU groups by switch
|
|
26
|
+
attr_reader :switch_groups
|
|
27
|
+
|
|
28
|
+
# Create a router for a fixed set of devices and partition them into
# switch groups right away.
#
# @param gpu_ids [Array<Integer>] GPU device IDs; a frozen copy is stored, and
#   a GPU's rank is its index in that copy
# @param topology [Topology::Detector, nil] topology source; a new
#   Topology::Detector is built when none is given
def initialize(gpu_ids:, topology: nil)
  @gpu_ids = gpu_ids.dup.freeze
  @topology = topology
  @topology ||= Topology::Detector.new
  @routing_table = {}
  @switch_groups = []
  detect_switch_groups!
end
|
|
37
|
+
|
|
38
|
+
# Ring order that keeps GPUs of the same switch group adjacent.
#
# Groups are visited from highest to lowest average bandwidth, and the GPUs
# inside each group are sequenced by order_within_group. With zero or one
# switch group the stored GPU list is returned as-is.
#
# @return [Array<Integer>] GPU IDs in ring order
def optimal_ring_order
  return @gpu_ids if @switch_groups.size <= 1

  @switch_groups
    .sort_by { |group| -group.bandwidth_gbps }
    .flat_map { |group| order_within_group(group.gpu_ids) }
end
|
|
60
|
+
|
|
61
|
+
# Build a tree that respects switch boundaries.
#
# Layout:
# - the root rank is the top of the tree;
# - each switch group other than the root's contributes one representative
#   (chosen by pick_best_connector) as a child of the root;
# - the other GPUs of the root's group are direct children of the root;
# - the remaining GPUs of every other group hang off their representative.
#
# @param root [Integer] Root rank (index into gpu_ids)
# @return [Hash{Integer => Hash}] rank => { parent: rank or nil, children: [ranks] }
# @raise [ArgumentError] if root is not a valid rank, or its GPU is in no switch group
def optimal_tree_structure(root: 0)
  n = @gpu_ids.size

  # Kept ahead of validation so the single-GPU answer is unchanged for existing callers.
  return { 0 => { parent: nil, children: [] } } if n == 1

  unless root.is_a?(Integer) && root >= 0 && root < n
    raise ArgumentError, "Invalid root #{root.inspect} for #{n} GPU(s); expected a rank in 0...#{n}"
  end

  root_gpu = @gpu_ids[root]
  root_group_idx = @switch_groups.find_index { |g| g.gpu_ids.include?(root_gpu) }
  raise ArgumentError, "GPU #{root_gpu} (rank #{root}) belongs to no switch group" if root_group_idx.nil?

  root_group = @switch_groups[root_group_idx]
  tree = { root => { parent: nil, children: [] } }

  # Registers child_rank under parent_rank and creates the child's own entry.
  link = lambda do |parent_rank, child_rank|
    tree[parent_rank][:children] << child_rank
    tree[child_rank] = { parent: parent_rank, children: [] }
  end

  # Representatives are linked first, so they precede same-switch peers in
  # the root's child list.
  group_reps = []
  @switch_groups.each_with_index do |group, idx|
    next if idx == root_group_idx

    rep_gpu = pick_best_connector(root_group, group)
    rep_rank = @gpu_ids.index(rep_gpu)
    link.call(root, rep_rank)
    group_reps << [rep_rank, rep_gpu, group]
  end

  # Same-switch peers attach straight to the root.
  root_group.gpu_ids.each do |gpu|
    rank = @gpu_ids.index(gpu)
    link.call(root, rank) unless rank == root
  end

  # Everything else attaches to its own group's representative.
  group_reps.each do |rep_rank, rep_gpu, group|
    group.gpu_ids.each do |gpu|
      link.call(rep_rank, @gpu_ids.index(gpu)) unless gpu == rep_gpu
    end
  end

  tree
end
|
|
122
|
+
|
|
123
|
+
# Suggest a collective algorithm from the message size and the topology.
#
# Decision order:
# 1. messages below small_message_bytes use :tree;
# 2. messages above large_message_bytes use :ring;
# 3. otherwise a GPU count that is not a power of two uses :double_tree;
# 4. otherwise :ring when the mean switch-group bandwidth exceeds
#    ring_bandwidth_gbps, else :tree.
#
# The keyword thresholds default to the previously hard-coded values, so
# existing calls behave exactly as before.
#
# @param message_size [Integer] Total message size in bytes
# @param small_message_bytes [Integer] sizes strictly below this choose :tree
# @param large_message_bytes [Integer] sizes strictly above this choose :ring
# @param ring_bandwidth_gbps [Numeric] mean group bandwidth that must be
#   exceeded for :ring to win on medium-sized messages
# @return [Symbol] :ring, :tree, or :double_tree
def suggest_algorithm(message_size, small_message_bytes: 1024, large_message_bytes: 1_048_576,
                      ring_bandwidth_gbps: 50)
  return :tree if message_size < small_message_bytes
  return :ring if message_size > large_message_bytes

  n = @gpu_ids.size
  power_of_two = n > 0 && (n & (n - 1)).zero?
  return :double_tree unless power_of_two

  avg_bandwidth = @switch_groups.sum { |g| g.bandwidth_gbps } / @switch_groups.size.to_f
  avg_bandwidth > ring_bandwidth_gbps ? :ring : :tree
end
|
|
147
|
+
|
|
148
|
+
# Routing path between two ranks.
#
# Ranks in the same switch group are connected directly. Across groups the
# rank returned by pick_best_connector_rank is inserted as a middle hop,
# unless that rank is already one of the two endpoints.
#
# @param src_rank [Integer] Source rank
# @param dst_rank [Integer] Destination rank
# @return [Array<Integer>] Ranks along the path, endpoints included
def get_path(src_rank, dst_rank)
  return [src_rank] if src_rank == dst_rank

  from_group = find_group(@gpu_ids[src_rank])
  to_group = find_group(@gpu_ids[dst_rank])
  return [src_rank, dst_rank] if from_group == to_group

  hop = pick_best_connector_rank(from_group, to_group)
  return [src_rank, dst_rank] if hop == src_rank || hop == dst_rank

  [src_rank, hop, dst_rank]
end
|
|
177
|
+
|
|
178
|
+
private
|
|
179
|
+
|
|
180
|
+
# Partition @gpu_ids into switch groups and store them in @switch_groups.
#
# GPUs are scanned in order. Each GPU not yet grouped seeds a new group and
# pulls in every still-ungrouped GPU whose path to AND from the seed is
# NVLink or PCIe P2P. Membership is decided against the seed only; it is not
# propagated transitively between members.
#
# A group's bandwidth is the mean bandwidth over its GPU pairs that have a
# known path, falling back to 16.0 when there is no such pair.
#
# The group list is rebuilt from scratch, so running this again does not
# duplicate groups.
#
# @return [Array<SwitchGroup>] the detected groups
def detect_switch_groups!
  matrix = @topology.matrix
  fast_link = ->(path) { path && (path.nvlink? || path.pcie_p2p?) }

  # Plain Hash used as a membership set so this file does not rely on Set
  # having been loaded by something else.
  grouped = {}
  groups = []

  @gpu_ids.each do |seed|
    next if grouped.key?(seed)

    grouped[seed] = true
    members = [seed]

    @gpu_ids.each do |other|
      next if grouped.key?(other)
      # The reverse path is only queried when the forward path qualifies.
      next unless fast_link.call(matrix.path(seed, other)) && fast_link.call(matrix.path(other, seed))

      members << other
      grouped[other] = true
    end

    links = members.combination(2).map { |a, b| matrix.path(a, b) }.select { |path| path }
    avg_bw =
      if links.empty?
        16.0 # Default PCIe
      else
        links.inject(0.0) { |total, path| total + path.bandwidth_gbps } / links.size
      end

    groups << SwitchGroup.new(switch_id: groups.size, gpu_ids: members, bandwidth_gbps: avg_bw)
  end

  @switch_groups = groups
end
|
|
229
|
+
|
|
230
|
+
# Sequence the GPUs of one group as a greedy nearest-neighbour chain.
#
# Starting from the first GPU, repeatedly append the remaining GPU whose path
# from the current chain tail has the highest performance_rank (a missing
# path counts as 0). Lists of two or fewer GPUs are returned untouched.
#
# @param gpus [Array<Integer>] GPU IDs of one switch group
# @return [Array<Integer>] the same GPU IDs in chain order
def order_within_group(gpus)
  return gpus if gpus.size <= 2

  matrix = @topology.matrix
  chain = [gpus.first]
  pending = gpus.drop(1)

  while pending.any?
    tail = chain.last
    nearest = pending.max_by do |candidate|
      link = matrix.path(tail, candidate)
      link ? link.performance_rank : 0
    end
    pending.delete(nearest)
    chain << nearest
  end

  chain
end
|
|
250
|
+
|
|
251
|
+
# Choose the GPU in group_b that is best connected to group_a.
#
# Each candidate in group_b is scored by summing the performance_rank of its
# path from every GPU in group_a (a missing path adds 0). The first candidate
# with the strictly highest score wins; when no candidate scores above -1 the
# first GPU of group_b is returned.
#
# @param group_a [SwitchGroup] group to connect from
# @param group_b [SwitchGroup] group to pick the connector from
# @return [Integer, nil] GPU ID from group_b (nil only when group_b has no GPUs)
def pick_best_connector(group_a, group_b)
  matrix = @topology.matrix

  scored = group_b.gpu_ids.map do |candidate|
    total = group_a.gpu_ids.sum do |peer|
      link = matrix.path(peer, candidate)
      link ? link.performance_rank : 0
    end
    [candidate, total]
  end

  # Strict comparison keeps the earliest candidate on ties.
  winner, = scored.inject([nil, -1]) { |best, pair| pair.last > best.last ? pair : best }
  winner || group_b.gpu_ids.first
end
|
|
272
|
+
|
|
273
|
+
# Same choice as pick_best_connector, expressed as a rank.
#
# @param group_a [SwitchGroup] group to connect from
# @param group_b [SwitchGroup] group to pick the connector from
# @return [Integer, nil] index of the chosen GPU in @gpu_ids, or nil if it is not listed
def pick_best_connector_rank(group_a, group_b)
  @gpu_ids.index(pick_best_connector(group_a, group_b))
end
|
|
277
|
+
|
|
278
|
+
# Look up the switch group that contains a GPU.
#
# @param gpu [Integer] GPU device ID
# @return [SwitchGroup, nil] first group listing the GPU, or nil when none does
def find_group(gpu)
  @switch_groups.each do |group|
    return group if group.gpu_ids.include?(gpu)
  end
  nil
end
|
|
281
|
+
end
|
|
282
|
+
end
|
|
283
|
+
end
|
|
284
|
+
end
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module Collective
|
|
5
|
+
module Algorithms
|
|
6
|
+
# Tree-based algorithms for broadcast and reduce operations
|
|
7
|
+
#
|
|
8
|
+
# Binary tree algorithms are optimal for small messages where latency dominates:
|
|
9
|
+
# - Latency complexity: O(log N) steps
|
|
10
|
+
# - Bandwidth complexity: O(data_size) per step (not optimal for large messages)
|
|
11
|
+
#
|
|
12
|
+
# Best for: Small messages (<1KB) where latency is critical
|
|
13
|
+
class Tree
|
|
14
|
+
# Tree node representing a GPU in the tree structure
|
|
15
|
+
# Fields:
#   gpu_id   - device ID taken from gpu_ids at this node's rank
#   rank     - index of the node in gpu_ids
#   parent   - rank of the parent node, nil for the root
#   children - ranks of the child nodes, in the order they were attached
#   depth    - distance from the root (the root has depth 0)
TreeNode = Struct.new(:gpu_id, :rank, :parent, :children, :depth, keyword_init: true)
|
|
16
|
+
|
|
17
|
+
# @return [Array<Integer>] GPU IDs
|
|
18
|
+
attr_reader :gpu_ids
|
|
19
|
+
|
|
20
|
+
# @return [Integer] Number of GPUs
|
|
21
|
+
attr_reader :n_gpus
|
|
22
|
+
|
|
23
|
+
# @return [TransportSelector] Transport selector
|
|
24
|
+
attr_reader :transport_selector
|
|
25
|
+
|
|
26
|
+
# @return [Array<TreeNode>] Tree structure
|
|
27
|
+
attr_reader :tree
|
|
28
|
+
|
|
29
|
+
# Store the device list and transport selector, then pre-build the default
# tree rooted at rank 0.
#
# @param gpu_ids [Array<Integer>] GPU device IDs; a frozen copy is kept
# @param transport_selector [TransportSelector] chooses the transport for each GPU pair
def initialize(gpu_ids:, transport_selector:)
  @gpu_ids = gpu_ids.dup.freeze
  @n_gpus = gpu_ids.size
  @transport_selector = transport_selector
  @tree = build_tree(root: 0)
end
|
|
37
|
+
|
|
38
|
+
# Broadcast data from the root rank to every other rank, one tree level at a time.
#
# A tree rooted at +root+ is built, then for each depth level every node that
# has children copies its own entry of +buffers+ to each child's entry, using
# the transport selected for that GPU pair and the sender's stream. All
# streams are synchronized after every level. Does nothing with a single GPU.
#
# NOTE(review): +buffer+ is never read here — the data sent at the top level
# is buffers[root]. Confirm callers place the source data there.
#
# @param buffer [FFI::Pointer] Source buffer on root GPU (currently unused, see note)
# @param buffers [Array<FFI::Pointer>] Per-rank buffers, indexed by rank
# @param size [Integer] Buffer size in bytes
# @param root [Integer] Root rank (index in gpu_ids)
# @param streams [Array<CUDA::Stream, FFI::Pointer>] CUDA streams, indexed by rank
# @return [void]
def broadcast(buffer:, buffers:, size:, root:, streams:)
  return if @n_gpus == 1

  validate_root!(root)

  rooted = build_tree(root: root)
  by_level = rooted.group_by(&:depth)

  tree_depth(rooted).times do |level|
    by_level.fetch(level, []).each do |sender|
      next if sender.children.empty?

      origin_gpu = sender.gpu_id
      payload = buffers[sender.rank]
      stream_handle = get_stream_ptr(streams[sender.rank])

      sender.children.each do |receiver_rank|
        link = @transport_selector.select_transport(origin_gpu, @gpu_ids[receiver_rank])
        move!(link, buffers[receiver_rank], payload, size, stream_handle)
      end
    end

    # A level must land completely before its receivers start sending.
    synchronize_all_streams!(streams)
  end
end
|
|
81
|
+
|
|
82
|
+
# Reduce the per-rank buffers up a tree into the root rank's buffer.
#
# Levels are processed deepest first. For every node with children, each
# child's buffer is copied into the node's temporary receive buffer and then
# combined into the node's own buffer before the next child is copied, so the
# receive buffer is never overwritten while it still holds unreduced data.
# The stream is synchronized after each copy and after each reduction, and
# all streams are synchronized after each level. Temporary buffers are always
# released. Does nothing with a single GPU.
#
# Only sizes[0] is used; every buffer is treated as having that size.
#
# @param buffers [Array<FFI::Pointer>] Per-rank buffers, indexed by rank
# @param sizes [Array<Integer>] Buffer sizes in bytes (first entry is used)
# @param dtype [Symbol] Data type
# @param op [Symbol] Reduction operation
# @param root [Integer] Root rank
# @param streams [Array<CUDA::Stream, FFI::Pointer>] CUDA streams, indexed by rank
# @return [void]
def reduce(buffers:, sizes:, dtype:, op:, root:, streams:)
  return if @n_gpus == 1

  validate_root!(root)

  byte_count = sizes[0]
  element_count = byte_count / dtype_elem_size(dtype)
  rooted = build_tree(root: root)
  scratch = allocate_recv_buffers(byte_count)

  begin
    by_level = rooted.group_by(&:depth)

    (tree_depth(rooted) - 1).downto(0) do |level|
      by_level.fetch(level, []).each do |parent|
        next if parent.children.empty?

        parent_rank = parent.rank
        parent_gpu = parent.gpu_id
        accumulator = buffers[parent_rank]
        stream_handle = get_stream_ptr(streams[parent_rank])

        # The reduction is issued on the parent's device.
        CUDA::RuntimeAPI.cudaSetDevice(parent_gpu)

        inbox = scratch[parent_rank]
        parent.children.each do |child_rank|
          link = @transport_selector.select_transport(@gpu_ids[child_rank], parent_gpu)
          move!(link, inbox, buffers[child_rank], byte_count, stream_handle)
          synchronize_stream!(streams[parent_rank])

          # presumably (op, lhs, rhs, out, ...) — accumulates inbox into the
          # parent's buffer in place; confirm against ReductionOps.execute
          ReductionOps.execute(op, accumulator, inbox, accumulator,
                               element_count, dtype, stream_handle)
          synchronize_stream!(streams[parent_rank])
        end
      end

      synchronize_all_streams!(streams)
    end
  ensure
    free_recv_buffers(scratch)
  end
end
|
|
150
|
+
|
|
151
|
+
private
|
|
152
|
+
|
|
153
|
+
# Copy +size+ bytes from +src+ to +dst+ through the given transport.
#
# Transports that do not implement copy_async are rejected with an error
# instead of being skipped, because a skipped copy would silently lose data.
#
# @param transport [Object] transport chosen for the GPU pair
# @param dst [FFI::Pointer] destination buffer
# @param src [FFI::Pointer] source buffer
# @param size [Integer] number of bytes to copy
# @param stream_ptr [FFI::Pointer] stream handle passed to the transport
# @return [Object] whatever the transport's copy_async returns
# @raise [NotImplementedError] if the transport has no copy_async
def move!(transport, dst, src, size, stream_ptr)
  unless transport.respond_to?(:copy_async)
    raise NotImplementedError,
          "Transport #{transport.class} has no copy_async; non-P2P tree " \
          "movement is not wired yet (refusing to silently drop data)"
  end

  transport.copy_async(dst, src, size, stream_ptr)
end
|
|
165
|
+
|
|
166
|
+
# Build a binary tree over all ranks with the given rank at the top.
#
# Ranks are laid out in heap order: the root first, then the remaining ranks
# in ascending order; the entry at position i has the entry at position
# (i - 1) / 2 as its parent. Because a parent always precedes its children in
# that order, each node's depth is set in the same pass as parent.depth + 1.
#
# With no GPUs an empty list is returned instead of failing on a missing root node.
#
# @param root [Integer] Root rank (default 0)
# @return [Array<TreeNode>] Tree nodes, indexed by rank
def build_tree(root: 0)
  nodes = Array.new(@n_gpus) do |rank|
    TreeNode.new(gpu_id: @gpu_ids[rank], rank: rank, parent: nil, children: [], depth: 0)
  end
  return nodes if nodes.empty?

  # Root first, every other rank after it in ascending order.
  order = [root] + (0...@n_gpus).reject { |rank| rank == root }

  order.each_with_index do |rank, position|
    next if position.zero? # Root has no parent

    parent_rank = order[(position - 1) / 2]
    parent = nodes[parent_rank]
    child = nodes[rank]

    child.parent = parent_rank
    child.depth = parent.depth + 1
    parent.children << rank
  end

  nodes
end
|
|
216
|
+
|
|
217
|
+
# Number of levels in a tree: the deepest node's depth plus one.
#
# @param tree [Array<TreeNode>] nodes as returned by build_tree
# @return [Integer] level count; 0 for an empty tree
def tree_depth(tree)
  deepest = tree.map(&:depth).max
  deepest.nil? ? 0 : deepest + 1
end
|
|
221
|
+
|
|
222
|
+
# Ensure +root+ is a usable rank.
#
# Values that cannot be compared with integers (nil, for example) are
# rejected with the same ArgumentError as out-of-range ranks.
#
# @param root [Integer] candidate root rank
# @return [nil]
# @raise [ArgumentError] unless 0 <= root < number of GPUs
def validate_root!(root)
  return if (0...@n_gpus).cover?(root)

  raise ArgumentError, "Invalid root #{root.inspect}. Valid: 0-#{@n_gpus - 1}"
end
|
|
227
|
+
|
|
228
|
+
# Allocate one temporary receive buffer of +size+ bytes on every GPU.
#
# If any step fails part-way through, the buffers allocated so far are
# released before the error is re-raised, so a failure on a later GPU does
# not leak memory on the earlier ones.
#
# @param size [Integer] bytes to allocate per GPU
# @return [Array<FFI::Pointer>] device pointers, indexed by rank
# @raise [StandardError] whatever the CUDA status check raises on failure
def allocate_recv_buffers(size)
  CUDA::RuntimeAPI.ensure_loaded!

  allocated = []
  @gpu_ids.each do |gpu_id|
    CUDA::RuntimeAPI.cudaSetDevice(gpu_id)
    slot = FFI::MemoryPointer.new(:pointer)
    status = CUDA::RuntimeAPI.cudaMalloc(slot, size)
    CUDA::RuntimeAPI.check_status!(status, "Alloc tree recv buffer")
    allocated << slot.read_pointer
  end
  allocated
rescue StandardError
  # allocated is still nil when the failure happened before the list was created.
  free_recv_buffers(allocated) if allocated
  raise
end
|
|
240
|
+
|
|
241
|
+
# Release temporary receive buffers, best effort.
#
# Missing and null entries are skipped. Each remaining buffer is freed after
# switching to the GPU at the same index. Errors raised while handling one
# entry are swallowed so the remaining entries are still processed.
#
# @param recv_buffers [Array<FFI::Pointer, nil>] buffers indexed by rank
# @return [Array] the list that was passed in
def free_recv_buffers(recv_buffers)
  recv_buffers.each_with_index do |pointer, rank|
    begin
      if pointer && !pointer.null?
        CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[rank])
        CUDA::RuntimeAPI.cudaFree(pointer)
      end
    rescue StandardError
      # Ignore cleanup errors
      nil
    end
  end
end
|
|
252
|
+
|
|
253
|
+
# Normalize a stream argument to a raw pointer.
#
# @param stream [FFI::Pointer, CUDA::Stream, Object] stream in any accepted form
# @return [FFI::Pointer] the pointer itself, a CUDA::Stream's ptr, or
#   FFI::Pointer::NULL for anything else (including nil)
def get_stream_ptr(stream)
  case stream
  when FFI::Pointer then stream
  when CUDA::Stream then stream.ptr
  else FFI::Pointer::NULL
  end
end
|
|
263
|
+
|
|
264
|
+
# Size in bytes of one element of the given data type.
#
# @param dtype [Symbol] data type name
# @return [Integer] 2, 4 or 8 for the listed types; 4 for any unlisted type
def dtype_elem_size(dtype)
  widths = {
    float32: 4, int32: 4,
    float64: 8, int64: 8,
    float16: 2, bfloat16: 2
  }
  # Unlisted types fall back to 4 bytes.
  widths.fetch(dtype, 4)
end
|
|
272
|
+
|
|
273
|
+
# Wait for a stream to finish.
#
# When the stream resolves to a null pointer the whole current device is
# synchronized instead.
#
# @param stream [FFI::Pointer, CUDA::Stream, Object] stream in any form get_stream_ptr accepts
# @return [Object] result of the CUDA synchronize call that was made
def synchronize_stream!(stream)
  handle = get_stream_ptr(stream)
  return CUDA::RuntimeAPI.cudaDeviceSynchronize if handle.null?

  CUDA::RuntimeAPI.cudaStreamSynchronize(handle)
end
|
|
281
|
+
|
|
282
|
+
# Synchronize every stream, switching to the GPU at the same index first.
#
# @param streams [Array<CUDA::Stream, FFI::Pointer>] streams indexed by rank
# @return [Array] the list that was passed in
def synchronize_all_streams!(streams)
  streams.each_with_index do |stream, rank|
    device = @gpu_ids[rank]
    CUDA::RuntimeAPI.cudaSetDevice(device)
    synchronize_stream!(stream)
  end
end
|
|
288
|
+
end
|
|
289
|
+
end
|
|
290
|
+
end
|
|
291
|
+
end
|