ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,284 @@
1
+ # frozen_string_literal: true
2
+
3
require "set"

module Ignis
  module Collective
    module Algorithms
      # Topology-Aware Routing Optimizer
      #
      # Optimizes collective operation routing based on PCIe/NVLink topology.
      # Groups GPUs by PCIe switch to minimize cross-switch traffic.
      #
      # Key optimizations:
      # 1. Intra-switch communication first (lower latency)
      # 2. Ring order follows physical topology
      # 3. Tree structure respects switch boundaries
      #
      # Terminology used throughout this class: a *rank* is an index into
      # +gpu_ids+; a *GPU id* is the value stored at that index.
      class TopologyRouter
        # Switch group for routing
        SwitchGroup = Struct.new(:switch_id, :gpu_ids, :bandwidth_gbps, keyword_init: true)

        # Bandwidth assigned to a group for which no pairwise path was found
        # (for example a single-GPU group). Default PCIe figure.
        DEFAULT_GROUP_BANDWIDTH_GBPS = 16.0
        # Messages strictly smaller than this many bytes (1KB) always use :tree.
        SMALL_MESSAGE_BYTES = 1024
        # Messages strictly larger than this many bytes (1MB) always use :ring.
        LARGE_MESSAGE_BYTES = 1_048_576
        # Average group bandwidth above which medium messages prefer :ring.
        FAST_INTERCONNECT_GBPS = 50
        private_constant :DEFAULT_GROUP_BANDWIDTH_GBPS, :SMALL_MESSAGE_BYTES,
                         :LARGE_MESSAGE_BYTES, :FAST_INTERCONNECT_GBPS

        # @return [Topology::Detector] Topology detector
        attr_reader :topology

        # @return [Array<Integer>] GPU IDs (frozen copy of the constructor argument)
        attr_reader :gpu_ids

        # @return [Array<SwitchGroup>] GPU groups by switch
        attr_reader :switch_groups

        # @param gpu_ids [Array<Integer>] GPU device IDs
        # @param topology [Topology::Detector, nil] Optional topology detector;
        #   a new Topology::Detector is created when omitted
        def initialize(gpu_ids:, topology: nil)
          @gpu_ids = gpu_ids.dup.freeze
          @topology = topology || Topology::Detector.new
          @switch_groups = []
          @routing_table = {}
          detect_switch_groups!
        end

        # Get optimized ring order based on topology.
        # Groups GPUs by PCIe switch, then chains switches.
        #
        # Groups are visited in descending average bandwidth; inside each group
        # GPUs are chained greedily by P2P performance. With zero or one group
        # the configured order is returned unchanged.
        #
        # @return [Array<Integer>] Optimal ring order (GPU ids). Always a fresh,
        #   mutable array, so callers may modify it without touching router state.
        def optimal_ring_order
          # Return a copy: handing out the frozen internal array here would make
          # mutability of the result depend on the detected topology.
          return @gpu_ids.dup if @switch_groups.size <= 1

          @switch_groups
            .sort_by { |group| -group.bandwidth_gbps }
            .flat_map { |group| order_within_group(group.gpu_ids) }
        end

        # Get optimized tree structure that respects switch boundaries.
        #
        # Shape of the tree:
        # - Level 0: root
        # - Level 1: one representative of every other switch group, plus the
        #   remaining GPUs of the root's own group
        # - Level 2: the remaining GPUs of each other group, under their
        #   representative
        #
        # @param root [Integer] Root rank
        # @return [Hash{Integer => Hash}] rank => { parent: Integer or nil,
        #   children: Array<Integer> }
        # @raise [ArgumentError] if more than one GPU is configured and +root+
        #   is not a valid rank
        def optimal_tree_structure(root: 0)
          return { 0 => { parent: nil, children: [] } } if @gpu_ids.size == 1

          # Without this check an out-of-range root failed with an opaque
          # TypeError, and a negative root silently produced a corrupt tree.
          validate_rank!(root, "root")

          root_gpu = @gpu_ids[root]
          root_group = find_group(root_gpu)
          tree = { root => { parent: nil, children: [] } }

          attach = lambda do |parent_rank, child_rank|
            tree[parent_rank][:children] << child_rank
            tree[child_rank] = { parent: parent_rank, children: [] }
          end

          # One representative per foreign group hangs directly off the root.
          # The representative is the GPU of that group with the best summed
          # connection to the root's group.
          representatives = @switch_groups.reject { |group| group.equal?(root_group) }.map do |group|
            rep_rank = @gpu_ids.index(pick_best_connector(root_group, group))
            attach.call(root, rep_rank)
            { rank: rep_rank, group: group }
          end

          # Other GPUs in the root's group connect directly to the root.
          root_group.gpu_ids.each do |gpu|
            rank = @gpu_ids.index(gpu)
            attach.call(root, rank) unless rank == root
          end

          # Remaining GPUs of each foreign group hang off their representative.
          representatives.each do |rep|
            rep_gpu = @gpu_ids[rep[:rank]]
            rep[:group].gpu_ids.each do |gpu|
              attach.call(rep[:rank], @gpu_ids.index(gpu)) unless gpu == rep_gpu
            end
          end

          tree
        end

        # Suggest optimal algorithm based on message size and topology.
        #
        # Ring is bandwidth-optimal for large messages, tree is latency-optimal
        # for small messages, and double tree is used for medium messages when
        # the GPU count is not a power of two. For medium messages on a
        # power-of-two GPU count the average group bandwidth decides.
        #
        # @param message_size [Integer] Total message size in bytes
        # @return [Symbol] :ring, :tree, or :double_tree
        def suggest_algorithm(message_size)
          return :tree if message_size < SMALL_MESSAGE_BYTES
          return :ring if message_size > LARGE_MESSAGE_BYTES

          n = @gpu_ids.size
          power_of_two = n > 0 && (n & (n - 1)).zero?
          return :double_tree unless power_of_two

          # Medium messages: prefer ring if good interconnect
          avg_bandwidth = @switch_groups.sum(&:bandwidth_gbps) / @switch_groups.size.to_f
          avg_bandwidth > FAST_INTERCONNECT_GBPS ? :ring : :tree
        end

        # Get routing path between two GPUs.
        #
        # Ranks in the same switch group are connected directly. Otherwise the
        # path goes through the GPU of the destination group that is best
        # connected to the source group, unless that GPU is one of the endpoints.
        #
        # @param src_rank [Integer] Source rank
        # @param dst_rank [Integer] Destination rank
        # @return [Array<Integer>] Path as list of rank hops
        # @raise [ArgumentError] if either rank is not a valid index into gpu_ids
        def get_path(src_rank, dst_rank)
          # Previously two invalid ranks produced a bogus "direct" path and a
          # single invalid rank failed with NoMethodError on nil.
          validate_rank!(src_rank, "src_rank")
          validate_rank!(dst_rank, "dst_rank")
          return [src_rank] if src_rank == dst_rank

          src_group = find_group(@gpu_ids[src_rank])
          dst_group = find_group(@gpu_ids[dst_rank])

          # Direct path within same switch
          return [src_rank, dst_rank] if src_group == dst_group

          # Need to cross switch boundary - use group representative
          intermediate = pick_best_connector_rank(src_group, dst_group)
          if intermediate == src_rank || intermediate == dst_rank
            [src_rank, dst_rank]
          else
            [src_rank, intermediate, dst_rank]
          end
        end

        private

        # Raise unless +rank+ is an Integer index into gpu_ids.
        def validate_rank!(rank, label)
          return if rank.is_a?(Integer) && rank >= 0 && rank < @gpu_ids.size

          raise ArgumentError,
                "Invalid #{label} #{rank.inspect}. Valid: 0-#{@gpu_ids.size - 1}"
        end

        # Partition gpu_ids into switch groups and store them in @switch_groups.
        #
        # Each not-yet-grouped GPU seeds a new group; every other ungrouped GPU
        # that has an NVLink or PCIe P2P path to the seed in BOTH directions
        # joins it. Membership is decided against the seed only, so grouping is
        # not transitive.
        def detect_switch_groups!
          matrix = @topology.matrix
          visited = Set.new
          groups = []

          @gpu_ids.each do |seed|
            next if visited.include?(seed)

            visited.add(seed)
            members = [seed]

            @gpu_ids.each do |other|
              next if visited.include?(other)
              next unless fast_link?(matrix, seed, other) && fast_link?(matrix, other, seed)

              members << other
              visited.add(other)
            end

            groups << SwitchGroup.new(
              switch_id: groups.size,
              gpu_ids: members,
              bandwidth_gbps: average_bandwidth(matrix, members)
            )
          end

          @switch_groups = groups
        end

        # True when the matrix reports an NVLink or PCIe P2P path from a to b.
        def fast_link?(matrix, from_gpu, to_gpu)
          path = matrix.path(from_gpu, to_gpu)
          path && (path.nvlink? || path.pcie_p2p?)
        end

        # Mean bandwidth over every GPU pair of the group that has a path;
        # falls back to the PCIe default when no pair has one.
        def average_bandwidth(matrix, gpus)
          paths = gpus.combination(2).map { |a, b| matrix.path(a, b) }.select { |path| path }
          return DEFAULT_GROUP_BANDWIDTH_GBPS if paths.empty?

          paths.reduce(0.0) { |total, path| total + path.bandwidth_gbps } / paths.size
        end

        # Order GPUs of one group by greedy nearest-neighbor: start from the
        # first GPU and repeatedly append the remaining GPU with the highest
        # performance_rank from the current tail (a missing path counts as 0).
        # Groups of at most two GPUs are returned as-is.
        def order_within_group(gpus)
          return gpus if gpus.size <= 2

          matrix = @topology.matrix
          remaining = gpus.dup
          ordered = [remaining.shift]

          until remaining.empty?
            tail = ordered.last
            nearest = remaining.max_by do |gpu|
              path = matrix.path(tail, gpu)
              path ? path.performance_rank : 0
            end
            ordered << nearest
            remaining.delete(nearest)
          end

          ordered
        end

        # Find the GPU in group_b with the best connection to group_a, scored as
        # the sum of performance_rank over all paths from group_a's GPUs
        # (missing paths count as 0). Ties keep the earliest GPU of group_b.
        #
        # @return [Integer, nil] GPU id, or nil when group_b has no GPUs
        def pick_best_connector(group_a, group_b)
          matrix = @topology.matrix
          best_gpu = nil
          best_score = -1

          group_b.gpu_ids.each do |candidate|
            score = group_a.gpu_ids.sum do |gpu_a|
              path = matrix.path(gpu_a, candidate)
              path ? path.performance_rank : 0
            end
            next unless score > best_score

            best_score = score
            best_gpu = candidate
          end

          best_gpu || group_b.gpu_ids.first
        end

        # Same as pick_best_connector, but returns the rank of the chosen GPU.
        def pick_best_connector_rank(group_a, group_b)
          @gpu_ids.index(pick_best_connector(group_a, group_b))
        end

        # @return [SwitchGroup, nil] the group containing the given GPU id
        def find_group(gpu)
          @switch_groups.find { |group| group.gpu_ids.include?(gpu) }
        end
      end
    end
  end
end
@@ -0,0 +1,291 @@
1
+ # frozen_string_literal: true
2
+
3
module Ignis
  module Collective
    module Algorithms
      # Tree-based algorithms for broadcast and reduce operations
      #
      # Binary tree algorithms are optimal for small messages where latency dominates:
      # - Latency complexity: O(log N) steps
      # - Bandwidth complexity: O(data_size) per step (not optimal for large messages)
      #
      # Best for: Small messages (<1KB) where latency is critical
      #
      # A *rank* is an index into +gpu_ids+; buffers and streams passed to
      # #broadcast and #reduce are indexed by rank.
      class Tree
        # Tree node representing a GPU in the tree structure.
        # +parent+ and +children+ hold ranks, not node objects.
        TreeNode = Struct.new(:gpu_id, :rank, :parent, :children, :depth, keyword_init: true)

        # Element size in bytes for each recognised dtype.
        DTYPE_SIZES = {
          float32: 4, int32: 4,
          float64: 8, int64: 8,
          float16: 2, bfloat16: 2
        }.freeze
        # Element size assumed for any dtype not listed in DTYPE_SIZES.
        DEFAULT_ELEM_SIZE = 4
        private_constant :DTYPE_SIZES, :DEFAULT_ELEM_SIZE

        # @return [Array<Integer>] GPU IDs
        attr_reader :gpu_ids

        # @return [Integer] Number of GPUs
        attr_reader :n_gpus

        # @return [TransportSelector] Transport selector
        attr_reader :transport_selector

        # @return [Array<TreeNode>] Tree structure rooted at rank 0
        attr_reader :tree

        # @param gpu_ids [Array<Integer>] GPU device IDs
        # @param transport_selector [TransportSelector] Transport selector
        def initialize(gpu_ids:, transport_selector:)
          @gpu_ids = gpu_ids.dup.freeze
          @n_gpus = @gpu_ids.size
          @transport_selector = transport_selector
          @tree = build_tree
        end

        # Broadcast data from root to all GPUs.
        #
        # Walks the tree level by level: every node that has children copies
        # its data into each child's buffer on the node's own stream, then all
        # streams are synchronized before the next level starts.
        #
        # @param buffer [FFI::Pointer, nil] Source buffer on root GPU. Used as
        #   the root's send source; when nil or a NULL pointer, buffers[root]
        #   is used instead.
        # @param buffers [Array<FFI::Pointer>] Destination buffers on all GPUs
        # @param size [Integer] Buffer size in bytes
        # @param root [Integer] Root rank (index in gpu_ids)
        # @param streams [Array<CUDA::Stream, FFI::Pointer>] CUDA streams
        # @return [void]
        # @raise [ArgumentError] if root is not a valid rank
        # @raise [NotImplementedError] if a selected transport lacks copy_async
        def broadcast(buffer:, buffers:, size:, root:, streams:)
          return if @n_gpus == 1

          validate_root!(root)

          # Build tree rooted at specified root
          tree = build_tree(root: root)
          nodes_by_depth = tree.group_by(&:depth)

          # The documented source argument was previously ignored, so a caller
          # passing a source distinct from buffers[root] broadcast stale data.
          # NOTE(review): when buffer is distinct from buffers[root], the root's
          # own destination buffer is not filled here — confirm whether callers
          # expect that copy.
          root_source = usable_pointer?(buffer) ? buffer : buffers[root]

          # Broadcast down the tree: one step per level
          tree_depth(tree).times do |level|
            nodes_by_depth.fetch(level, []).each do |node|
              next if node.children.empty?

              src_buffer = node.rank == root ? root_source : buffers[node.rank]
              stream_ptr = get_stream_ptr(streams[node.rank])

              node.children.each do |child_rank|
                transport = @transport_selector.select_transport(node.gpu_id, @gpu_ids[child_rank])
                move!(transport, buffers[child_rank], src_buffer, size, stream_ptr)
              end
            end

            # Synchronize after each level so children hold the data before
            # they forward it.
            synchronize_all_streams!(streams)
          end
        end

        # Reduce data from all GPUs to root.
        #
        # Works from the deepest level up. Each parent copies one child's
        # buffer into a scratch buffer and folds it into its own buffer before
        # handling the next child, so the single scratch buffer is never
        # overwritten while still unreduced.
        #
        # @param buffers [Array<FFI::Pointer>] Source buffers on all GPUs;
        #   buffers of ranks with children are updated in place, and the final
        #   result ends up in buffers[root]
        # @param sizes [Array<Integer>] Buffer sizes in bytes. Only sizes[0] is
        #   read; every buffer is treated as having that size.
        # @param dtype [Symbol] Data type
        # @param op [Symbol] Reduction operation
        # @param root [Integer] Root rank
        # @param streams [Array<CUDA::Stream, FFI::Pointer>] CUDA streams
        # @return [void]
        # @raise [ArgumentError] if root is not a valid rank
        # @raise [NotImplementedError] if a selected transport lacks copy_async
        def reduce(buffers:, sizes:, dtype:, op:, root:, streams:)
          return if @n_gpus == 1

          validate_root!(root)

          size = sizes[0]
          elem_count = size / dtype_elem_size(dtype)

          # Build tree rooted at root
          tree = build_tree(root: root)
          nodes_by_depth = tree.group_by(&:depth)

          # Only ranks that receive from children need scratch space; leaves
          # never read their recv buffer.
          receiving_ranks = tree.reject { |node| node.children.empty? }.map(&:rank)
          recv_buffers = allocate_recv_buffers(size, receiving_ranks)

          begin
            # Start from leaves, work up to root
            (tree_depth(tree) - 1).downto(0) do |level|
              nodes_by_depth.fetch(level, []).each do |node|
                next if node.children.empty?

                reduce_children_into!(node, buffers, recv_buffers[node.rank], size,
                                      elem_count, dtype, op, streams[node.rank])
              end

              synchronize_all_streams!(streams)
            end
          ensure
            free_recv_buffers(recv_buffers)
          end
        end

        private

        # Pull every child's buffer into +recv_buffer+ and reduce it into the
        # node's own buffer, one child at a time. The stream is synchronized
        # after the copy and after the reduction so the next child cannot
        # overwrite +recv_buffer+ early.
        def reduce_children_into!(node, buffers, recv_buffer, size, elem_count, dtype, op, stream)
          dst_gpu = node.gpu_id
          dst_buffer = buffers[node.rank]
          stream_ptr = get_stream_ptr(stream)

          # Set device for reduction
          CUDA::RuntimeAPI.cudaSetDevice(dst_gpu)

          node.children.each do |child_rank|
            transport = @transport_selector.select_transport(@gpu_ids[child_rank], dst_gpu)
            move!(transport, recv_buffer, buffers[child_rank], size, stream_ptr)
            synchronize_stream!(stream)

            ReductionOps.execute(op, dst_buffer, recv_buffer, dst_buffer,
                                 elem_count, dtype, stream_ptr)
            synchronize_stream!(stream)
          end
        end

        # Move bytes via the selected transport, failing LOUDLY rather than
        # silently skipping. A non-P2P transport without copy_async would
        # otherwise drop data and corrupt the reduction with no error.
        def move!(transport, dst, src, size, stream_ptr)
          unless transport.respond_to?(:copy_async)
            raise NotImplementedError,
                  "Transport #{transport.class} has no copy_async; non-P2P tree " \
                  "movement is not wired yet (refusing to silently drop data)"
          end

          transport.copy_async(dst, src, size, stream_ptr)
        end

        # True when +pointer+ is non-nil and does not report itself as NULL.
        def usable_pointer?(pointer)
          return false if pointer.nil?

          !(pointer.respond_to?(:null?) && pointer.null?)
        end

        # Build binary tree structure.
        #
        # Ranks are laid out heap-style: the root comes first, followed by the
        # other ranks in ascending order; the entry at position i has the entry
        # at position (i - 1) / 2 as its parent.
        #
        # @param root [Integer] Root rank (default 0)
        # @return [Array<TreeNode>] Tree nodes indexed by rank; empty when no
        #   GPUs are configured
        def build_tree(root: 0)
          nodes = Array.new(@n_gpus) do |rank|
            TreeNode.new(gpu_id: @gpu_ids[rank], rank: rank, parent: nil, children: [], depth: 0)
          end
          # An empty GPU list used to crash here on nil.depth=.
          return nodes if nodes.empty?

          layout = [root] + (0...@n_gpus).reject { |rank| rank == root }

          # A parent always precedes its children in the layout, so its depth
          # is final by the time a child reads it.
          layout.each_with_index do |rank, position|
            next if position.zero? # Root has no parent

            parent_rank = layout[(position - 1) / 2]
            nodes[rank].parent = parent_rank
            nodes[rank].depth = nodes[parent_rank].depth + 1
            nodes[parent_rank].children << rank
          end

          nodes
        end

        # Number of levels in the tree (deepest depth + 1); 0 for an empty tree.
        def tree_depth(tree)
          deepest = tree.map(&:depth).max
          deepest ? deepest + 1 : 0
        end

        # @raise [ArgumentError] unless root is an Integer in 0...n_gpus
        def validate_root!(root)
          return if root.is_a?(Integer) && root >= 0 && root < @n_gpus

          raise ArgumentError, "Invalid root #{root.inspect}. Valid: 0-#{@n_gpus - 1}"
        end

        # Allocate temp buffers of +size+ bytes with cudaMalloc.
        #
        # @param size [Integer] bytes per buffer
        # @param ranks [Array<Integer>, nil] ranks that need a buffer; all
        #   ranks when nil
        # @return [Array<FFI::Pointer, nil>] buffers indexed by rank, nil for
        #   ranks that were not requested
        def allocate_recv_buffers(size, ranks = nil)
          CUDA::RuntimeAPI.ensure_loaded!

          ranks ||= (0...@n_gpus).to_a
          allocated = Array.new(@n_gpus)

          begin
            ranks.each do |rank|
              CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[rank])
              ptr_ptr = FFI::MemoryPointer.new(:pointer)
              status = CUDA::RuntimeAPI.cudaMalloc(ptr_ptr, size)
              CUDA::RuntimeAPI.check_status!(status, "Alloc tree recv buffer")
              allocated[rank] = ptr_ptr.read_pointer
            end
          rescue StandardError
            # A failure part-way through must not leak the buffers that were
            # already allocated on earlier devices.
            free_recv_buffers(allocated)
            raise
          end

          allocated
        end

        # Free temp buffers. Best effort: nil and NULL entries are skipped and
        # an error on one buffer does not stop the remaining ones being freed.
        def free_recv_buffers(recv_buffers)
          recv_buffers.each_with_index do |buf, rank|
            next unless buf && !buf.null?

            begin
              CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[rank])
              CUDA::RuntimeAPI.cudaFree(buf)
            rescue StandardError
              # Ignore cleanup errors
            end
          end
        end

        # Normalise a stream argument to a raw pointer; anything that is
        # neither an FFI::Pointer nor a CUDA::Stream maps to the NULL pointer.
        def get_stream_ptr(stream)
          case stream
          when FFI::Pointer then stream
          when CUDA::Stream then stream.ptr
          else FFI::Pointer::NULL
          end
        end

        # Element size in bytes for +dtype+; unlisted dtypes fall back to 4.
        # NOTE(review): a 1-byte dtype would be under-counted by this fallback —
        # confirm which dtypes ReductionOps accepts.
        def dtype_elem_size(dtype)
          DTYPE_SIZES.fetch(dtype, DEFAULT_ELEM_SIZE)
        end

        # Wait for +stream+; a NULL stream pointer synchronizes the whole
        # current device instead.
        def synchronize_stream!(stream)
          stream_ptr = get_stream_ptr(stream)
          if stream_ptr.null?
            CUDA::RuntimeAPI.cudaDeviceSynchronize
          else
            CUDA::RuntimeAPI.cudaStreamSynchronize(stream_ptr)
          end
        end

        # Synchronize every stream, selecting the matching device first.
        def synchronize_all_streams!(streams)
          streams.each_with_index do |stream, rank|
            CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[rank])
            synchronize_stream!(stream)
          end
        end
      end
    end
  end
end