ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7343f5cf832fab47fa86cc79eab930cafb142b6f0967687d65d8b5896c8ad488
4
+ data.tar.gz: 87f2e47d253db19ede66f71d9fcfc24bc59fc1b7813b67230d6fe1bd1be87b67
5
+ SHA512:
6
+ metadata.gz: 791c617270822aad2c14bbca288da77baf1b3202c5ed46d0695936ec8953582d1c6e0677e513c58f21ebde11ef07eacb795f480cc126a5c337c59e52943f38fd
7
+ data.tar.gz: 78a9a949ca36e3b880c7df0ebe7ccdc3408ed082546957703936ef80a810f27d5c98e74ee5b047a76770bc31105bb44326f76519d57d395e8aadb2ea07563602
data/README.md ADDED
@@ -0,0 +1,7 @@
1
+ # ignis-collective (experimental)
2
+
3
+ Multi-GPU collective communication for Ruby — NCCL-style ring/tree all-reduce with P2P / IPC (VMM) / host-staged / TCP transports, on the [`ignis`](https://rubygems.org/gems/ignis) foundation.
4
+
5
+ > **Experimental.** The transports require multiple GPUs / nodes and cannot be exercised on a single GPU, so this gem is shipped separately from the verified single-GPU stack. APIs may change.
6
+
7
+ MIT.
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ # ignis-collective — multi-GPU collective communication (NvCCL: ring/tree all-reduce,
4
+ # P2P/IPC/host-staged/TCP transports) on the Ignis foundation. EXPERIMENTAL: the
5
+ # transports cannot be exercised on a single GPU, so this gem is provided for
6
+ # multi-GPU/multi-node setups and is not part of the verified single-GPU path.
7
+
8
+ require "ignis"
9
+ require_relative "nvruby/collective"
@@ -0,0 +1,364 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module Collective
5
+ module Algorithms
6
+ # Double Binary Tree Algorithm for heterogeneous GPU counts
7
+ #
8
+ # When GPU count is not a power of 2, a single binary tree has imbalanced load.
9
+ # The double binary tree uses two overlapping trees to balance communication.
10
+ #
11
+ # Key insight: GPU i participates in both trees but at different levels.
12
+ # - Tree 1: Standard binary tree rooted at 0
13
+ # - Tree 2: Binary tree rooted at N/2 (or closest power of 2)
14
+ #
15
+ # This achieves near-optimal latency even for non-power-of-2 GPU counts.
16
class DoubleBinaryTree
  # Node in the double tree structure. +parent+, +left_child+ and
  # +right_child+ hold ranks (indices into +gpu_ids+) or nil when absent.
  TreeNode = Struct.new(:rank, :gpu_id, :parent, :left_child, :right_child, :tree_id, keyword_init: true)

  # Raised when a GPU pair is routed over a transport that this algorithm
  # does not know how to drive. Previously such pairs were skipped silently,
  # which left destination buffers stale and produced wrong results.
  class UnsupportedTransportError < StandardError; end

  # @return [Array<Integer>] GPU IDs
  attr_reader :gpu_ids

  # @return [Integer] Number of GPUs
  attr_reader :n_gpus

  # @return [TransportSelector] Transport selector
  attr_reader :transport_selector

  # @param gpu_ids [Array<Integer>] GPU device IDs; the index is the rank
  # @param transport_selector [TransportSelector] Transport selector
  def initialize(gpu_ids:, transport_selector:)
    @gpu_ids = gpu_ids.dup.freeze
    @n_gpus = gpu_ids.size
    @transport_selector = transport_selector
    @tree1 = nil
    @tree2 = nil
    @rooted_trees = {}
  end

  # Build the default pair of trees: tree 1 rooted at rank 0 and tree 2
  # rooted at rank n_gpus / 2.
  # @return [void]
  def build!
    @tree1 = build_binary_tree(root_rank: 0, tree_id: 1)
    @tree2 = build_binary_tree(root_rank: @n_gpus / 2, tree_id: 2)
  end

  # Broadcast buffers[root] to every other rank.
  #
  # For a power-of-2 GPU count a single tree rooted at +root+ is used.
  # Otherwise the payload is split in two: the first part travels down a
  # tree rooted at +root+, the second part is first copied to the root of
  # the second tree and then travels down that tree.
  #
  # @param buffer [FFI::Pointer] Source buffer on root. NOTE(review): this
  #   argument is not read; the payload is taken from buffers[root].
  # @param buffers [Array<FFI::Pointer>] Per-rank buffers (source at root)
  # @param size [Integer] Buffer size in bytes
  # @param root [Integer] Root rank
  # @param streams [Array<FFI::Pointer>] CUDA streams, one per rank
  # @raise [ArgumentError] if root is not a valid rank
  # @raise [UnsupportedTransportError] if a hop is not a P2P transport
  # @return [void]
  def broadcast(buffer:, buffers:, size:, root:, streams:)
    return if @n_gpus <= 1

    validate_root!(root)
    first_tree, second_tree, second_root = trees_for(root)

    if power_of_2?(@n_gpus)
      broadcast_tree(first_tree, buffers, size, root, streams)
      return
    end

    # The second part takes the remainder so an odd size loses no byte.
    first_bytes = size / 2
    second_bytes = size - first_bytes

    # The second tree starts at a different rank, which must hold the data
    # before it can forward it.
    if second_bytes.positive? && second_root != root
      copy_between(root, second_root,
                   ptr_offset(buffers[root], first_bytes),
                   ptr_offset(buffers[second_root], first_bytes),
                   second_bytes, streams[root])
      synchronize_all_streams!(streams)
    end

    broadcast_tree_partial(first_tree, buffers, 0, first_bytes, root, streams)
    broadcast_tree_partial(second_tree, buffers, first_bytes, second_bytes, second_root, streams)
    nil
  end

  # Reduce all per-rank buffers into buffers[root]. Buffers of the other
  # ranks are used as scratch space and are modified.
  #
  # For a power-of-2 GPU count a single tree rooted at +root+ is used.
  # Otherwise the elements are split in two on an element boundary: the
  # first part is reduced over a tree rooted at +root+, the second part
  # over the second tree and then copied to +root+.
  #
  # @param buffers [Array<FFI::Pointer>] Source buffers, one per rank
  # @param sizes [Array<Integer>] Buffer sizes; only sizes[0] is used
  # @param dtype [Symbol] Data type
  # @param op [Symbol] Reduction operation
  # @param root [Integer] Root rank
  # @param streams [Array<FFI::Pointer>] CUDA streams, one per rank
  # @raise [ArgumentError] if root is not a valid rank
  # @raise [UnsupportedTransportError] if a hop is not a P2P transport
  # @return [void]
  def reduce(buffers:, sizes:, dtype:, op:, root:, streams:)
    return if @n_gpus <= 1

    validate_root!(root)
    total_bytes = sizes[0]
    first_tree, second_tree, second_root = trees_for(root)

    if power_of_2?(@n_gpus)
      reduce_tree(first_tree, buffers, total_bytes, dtype, op, root, streams)
      return
    end

    # Split on an element boundary so neither part cuts through an element.
    elem_size = dtype_elem_size(dtype)
    first_bytes = (total_bytes / elem_size / 2) * elem_size
    second_bytes = total_bytes - first_bytes

    reduce_tree_partial(first_tree, buffers, 0, first_bytes, dtype, op, root, streams)
    reduce_tree_partial(second_tree, buffers, first_bytes, second_bytes, dtype, op, second_root, streams)

    # The second part was accumulated on the second tree's root; move it.
    if second_bytes.positive? && second_root != root
      copy_between(second_root, root,
                   ptr_offset(buffers[second_root], first_bytes),
                   ptr_offset(buffers[root], first_bytes),
                   second_bytes, streams[root])
    end

    synchronize_all_streams!(streams)
    nil
  end

  private

  # Ensure +root+ names an existing rank.
  def validate_root!(root)
    return if root.is_a?(Integer) && root >= 0 && root < @n_gpus

    raise ArgumentError, "root must be a rank in 0...#{@n_gpus}, got #{root.inspect}"
  end

  # Return [tree1, tree2, tree2_root_rank] for a collective rooted at +root+.
  # Root 0 uses the trees from #build!; other roots get the same layout
  # rotated by +root+, built once and cached.
  def trees_for(root)
    build! if @tree1.nil?
    return [@tree1, @tree2, @n_gpus / 2] if root.zero?

    @rooted_trees[root] ||= begin
      second_root = (root + @n_gpus / 2) % @n_gpus
      [
        build_binary_tree(root_rank: root, tree_id: 1),
        build_binary_tree(root_rank: second_root, tree_id: 2),
        second_root
      ]
    end
  end

  # Build an array of TreeNode indexed by rank. Ranks are rotated so that
  # +root_rank+ sits at heap position 0; position p has children at
  # positions 2p + 1 and 2p + 2.
  def build_binary_tree(root_rank:, tree_id:)
    nodes = Array.new(@n_gpus) do |rank|
      TreeNode.new(rank: rank, gpu_id: @gpu_ids[rank], parent: nil,
                   left_child: nil, right_child: nil, tree_id: tree_id)
    end

    layout = reorder_for_root(root_rank)
    layout.each_with_index do |rank, position|
      left_position = 2 * position + 1
      right_position = left_position + 1

      if left_position < @n_gpus
        nodes[rank].left_child = layout[left_position]
        nodes[layout[left_position]].parent = rank
      end

      if right_position < @n_gpus
        nodes[rank].right_child = layout[right_position]
        nodes[layout[right_position]].parent = rank
      end
    end

    nodes
  end

  # Circular shift of the rank list so +root_rank+ is at index 0.
  def reorder_for_root(root_rank)
    (0...@n_gpus).to_a.rotate(root_rank)
  end

  # Nodes that have at least one child, grouped by depth, shallowest first.
  # Depths are computed once per call instead of once per node per level.
  def interior_levels(tree)
    tree.select { |node| node.left_child || node.right_child }
        .group_by { |node| node_depth(node, tree) }
        .sort_by(&:first)
        .map(&:last)
  end

  # Issue one asynchronous P2P copy between two ranks.
  def copy_between(src_rank, dst_rank, src_ptr, dst_ptr, size, stream)
    src_gpu = @gpu_ids[src_rank]
    dst_gpu = @gpu_ids[dst_rank]
    transport = @transport_selector.select_transport(src_gpu, dst_gpu)

    unless transport.is_a?(Transport::P2PTransport)
      raise UnsupportedTransportError,
            "DoubleBinaryTree only supports P2P transfers; GPU #{src_gpu} -> GPU #{dst_gpu} selected #{transport.class}"
    end

    transport.copy_async(dst_ptr, src_ptr, size, get_stream_ptr(stream))
  end

  # Broadcast whole buffers down +tree+.
  def broadcast_tree(tree, buffers, size, root, streams)
    broadcast_tree_partial(tree, buffers, 0, size, root, streams)
  end

  # Broadcast +size+ bytes starting at +offset+ down +tree+, one level at a
  # time with a full synchronisation between levels so a child only
  # forwards data it has completely received.
  def broadcast_tree_partial(tree, buffers, offset, size, _root, streams)
    return unless size.positive?

    interior_levels(tree).each do |nodes|
      nodes.each do |node|
        [node.left_child, node.right_child].compact.each do |child_rank|
          copy_between(node.rank, child_rank,
                       shifted(buffers[node.rank], offset),
                       shifted(buffers[child_rank], offset),
                       size, streams[node.rank])
        end
      end

      synchronize_all_streams!(streams)
    end
  end

  # Reduce +size+ bytes of every buffer towards the root of +tree+,
  # deepest level first. Each parent receives a child's data into a
  # temporary buffer and folds it into its own buffer in place.
  def reduce_tree(tree, buffers, size, dtype, op, _root, streams)
    return unless size.positive?

    count = size / dtype_elem_size(dtype)
    levels = interior_levels(tree)
    recv_buffers = {}

    begin
      # Only ranks that receive data need a staging buffer. Allocation is
      # inside the begin so a failure part-way still frees what was made.
      levels.flatten.each do |node|
        recv_buffers[node.rank] = allocate_buffer(node.gpu_id, size)
      end

      levels.reverse_each do |nodes|
        nodes.each do |node|
          CUDA::RuntimeAPI.cudaSetDevice(node.gpu_id)
          stream_ptr = get_stream_ptr(streams[node.rank])
          staging = recv_buffers[node.rank]

          [node.left_child, node.right_child].compact.each do |child_rank|
            copy_between(child_rank, node.rank, buffers[child_rank], staging, size, streams[node.rank])
            synchronize_stream!(streams[node.rank])

            ReductionOps.execute(op, buffers[node.rank], staging,
                                 buffers[node.rank], count, dtype, stream_ptr)
          end
        end

        synchronize_all_streams!(streams)
      end
    ensure
      recv_buffers.each { |rank, buf| free_buffer(buf, @gpu_ids[rank]) }
    end
  end

  # Same as #reduce_tree but restricted to +size+ bytes starting at +offset+.
  def reduce_tree_partial(tree, buffers, offset, size, dtype, op, root, streams)
    reduce_tree(tree, buffers.map { |b| ptr_offset(b, offset) }, size, dtype, op, root, streams)
  end

  # ceil(log2(n_gpus)) computed with integer arithmetic; 0 for n_gpus <= 1.
  def tree_depth
    return 0 if @n_gpus <= 1

    (@n_gpus - 1).bit_length
  end

  # Number of parent links between +node+ and the root of +tree+.
  def node_depth(node, tree)
    depth = 0
    current = node
    while current.parent
      current = tree[current.parent]
      depth += 1
    end
    depth
  end

  def power_of_2?(n)
    n > 0 && (n & (n - 1)).zero?
  end

  # Bytes per element for +dtype+; unknown types are treated as 4 bytes.
  def dtype_elem_size(dtype)
    case dtype
    when :float32, :int32 then 4
    when :float64, :int64 then 8
    when :float16, :bfloat16 then 2
    else 4
    end
  end

  # Raw stream handle for +stream+; anything unrecognised maps to NULL.
  def get_stream_ptr(stream)
    case stream
    when FFI::Pointer then stream
    when CUDA::Stream then stream.ptr
    else FFI::Pointer::NULL
    end
  end

  def ptr_offset(ptr, offset)
    FFI::Pointer.new(:uint8, ptr.address + offset)
  end

  # Like #ptr_offset, but hands back the original object for offset 0.
  def shifted(ptr, offset)
    offset.zero? ? ptr : ptr_offset(ptr, offset)
  end

  # Allocate +size+ bytes on +gpu_id+ and return the device pointer.
  def allocate_buffer(gpu_id, size)
    CUDA::RuntimeAPI.ensure_loaded!
    CUDA::RuntimeAPI.cudaSetDevice(gpu_id)
    ptr_ptr = FFI::MemoryPointer.new(:pointer)
    CUDA::RuntimeAPI.cudaMalloc(ptr_ptr, size)
    ptr_ptr.read_pointer
  end

  # Best-effort release of a device buffer; cleanup errors are ignored so
  # they cannot mask the error that triggered the cleanup.
  def free_buffer(buf, gpu_id)
    return unless buf && !buf.null?

    CUDA::RuntimeAPI.cudaSetDevice(gpu_id)
    CUDA::RuntimeAPI.cudaFree(buf)
  rescue StandardError
    nil
  end

  def free_recv_buffers(buffers)
    buffers.each_with_index do |buf, i|
      free_buffer(buf, @gpu_ids[i])
    end
  end

  # Wait for +stream+; a NULL handle waits for the whole current device.
  def synchronize_stream!(stream)
    stream_ptr = get_stream_ptr(stream)
    if stream_ptr.null?
      CUDA::RuntimeAPI.cudaDeviceSynchronize
    else
      CUDA::RuntimeAPI.cudaStreamSynchronize(stream_ptr)
    end
  end

  # Wait for every rank's stream, selecting that rank's device first.
  def synchronize_all_streams!(streams)
    streams.each_with_index do |stream, i|
      CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[i])
      synchronize_stream!(stream)
    end
  end
end
362
+ end
363
+ end
364
+ end
@@ -0,0 +1,222 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module Collective
5
+ module Algorithms
6
+ # Message Pipelining for Overlapping Communication and Computation
7
+ #
8
+ # Splits large messages into chunks that can be processed concurrently:
9
+ # - While GPU processes chunk N, transfer chunk N+1
10
+ # - Hides transfer latency behind compute
11
+ #
12
+ # Optimal chunk size balances:
13
+ # - Too small: overhead dominates
14
+ # - Too large: no overlap opportunity
15
class Pipeliner
  # Pipeline stage: one chunk of the message and its current state.
  PipelineStage = Struct.new(:chunk_id, :offset, :size, :state, keyword_init: true)

  # Pipeline states
  STATE_PENDING = :pending
  STATE_TRANSFERRING = :transferring
  STATE_COMPUTING = :computing
  STATE_COMPLETE = :complete

  # Default chunk size: 512KB (good for PCIe 4.0)
  DEFAULT_CHUNK_SIZE = 512 * 1024

  # Bounds applied by .optimal_chunk_size
  MIN_OPTIMAL_CHUNK_SIZE = 64 * 1024
  MAX_OPTIMAL_CHUNK_SIZE = 4 * 1024 * 1024

  # @return [Integer] Chunk size in bytes
  attr_reader :chunk_size

  # @return [Integer] Number of pipeline stages (double-buffering = 2)
  attr_reader :num_stages

  # @param chunk_size [Integer] Size of each pipeline chunk in bytes
  # @param num_stages [Integer] Number of concurrent stages (default 2)
  # @raise [ArgumentError] if either value is not a positive Integer; a
  #   zero chunk size would divide by zero and zero stages would make
  #   #execute_pipelined loop forever
  def initialize(chunk_size: DEFAULT_CHUNK_SIZE, num_stages: 2)
    unless chunk_size.is_a?(Integer) && chunk_size.positive?
      raise ArgumentError, "chunk_size must be a positive Integer, got #{chunk_size.inspect}"
    end
    unless num_stages.is_a?(Integer) && num_stages.positive?
      raise ArgumentError, "num_stages must be a positive Integer, got #{num_stages.inspect}"
    end

    @chunk_size = chunk_size
    @num_stages = num_stages
    @streams = {}
  end

  # Suggest a chunk size for a given link bandwidth.
  #
  # The current heuristic sizes a chunk so that one transfer takes about
  # 1 ms, clamps it to 64 KiB..4 MiB and rounds it down to a whole number
  # of elements. +compute_gflops+ and +flops_per_element+ are accepted for
  # interface compatibility but are not used by this heuristic.
  #
  # @param bandwidth_gbps [Float] Transfer bandwidth in GB/s
  # @param compute_gflops [Float] Compute throughput in GFLOPS (unused)
  # @param flops_per_element [Float] FLOPs per element in reduction (unused)
  # @param element_size [Integer] Bytes per element
  # @return [Integer] Suggested chunk size in bytes
  def self.optimal_chunk_size(bandwidth_gbps:, compute_gflops:, flops_per_element: 1.0, element_size: 4)
    target_latency = 0.001 # 1ms pipeline stage
    bytes_per_stage = (bandwidth_gbps * 1e9 * target_latency).to_i
    clamped = bytes_per_stage.clamp(MIN_OPTIMAL_CHUNK_SIZE, MAX_OPTIMAL_CHUNK_SIZE)
    return clamped unless element_size.is_a?(Integer) && element_size.positive?

    # Chunk boundaries must not fall inside an element.
    aligned = clamped - (clamped % element_size)
    aligned.positive? ? aligned : clamped
  end

  # Execute a pipelined reduction operation.
  #
  # Chunks are processed in batches of +num_stages+: all transfers of a
  # batch are started, then each is awaited and its reduction started,
  # then all reductions of the batch are awaited.
  #
  # @param buffers [Array<FFI::Pointer>] Device buffers (not read here)
  # @param total_size [Integer] Total buffer size
  # @param dtype [Symbol] Data type (not read here)
  # @param op [Symbol] Reduction operation (not read here)
  # @param transfer_fn [Proc] Transfer function (chunk_offset, chunk_size, stream) -> void
  # @param reduce_fn [Proc] Reduce function (chunk_offset, chunk_size, stream) -> void
  # @param streams [Array<CUDA::Stream>] CUDA streams (one per stage);
  #   created on the current device and destroyed afterwards when omitted
  # @return [void]
  def execute_pipelined(buffers:, total_size:, dtype:, op:, transfer_fn:, reduce_fn:, streams: nil)
    stages = build_stages(total_size)
    return if stages.empty?

    owns_streams = streams.nil?
    streams ||= create_streams(@num_stages)

    begin
      stages.each_slice(@num_stages) do |batch|
        batch.each do |stage|
          transfer_fn.call(stage.offset, stage.size, streams[stage.chunk_id % @num_stages])
          stage.state = STATE_TRANSFERRING
        end

        batch.each do |stage|
          stream = streams[stage.chunk_id % @num_stages]
          sync_stream(stream) # transfer must be complete before compute
          reduce_fn.call(stage.offset, stage.size, stream)
          stage.state = STATE_COMPUTING
        end

        batch.each do |stage|
          sync_stream(streams[stage.chunk_id % @num_stages])
          stage.state = STATE_COMPLETE
        end
      end
    ensure
      destroy_streams(streams) if owns_streams
    end

    nil
  end

  # Execute a chunked AllReduce, rotating chunks over per-GPU stage streams.
  #
  # The chunk step is +chunk_size+ rounded down to a whole number of
  # +dtype+ elements so that no chunk boundary splits an element.
  #
  # @param ring [Ring] Ring algorithm instance
  # @param buffers [Array<FFI::Pointer>] Device buffers
  # @param total_size [Integer] Total buffer size
  # @param dtype [Symbol] Data type
  # @param op [Symbol] Reduction operation
  # @param gpu_streams [Array<Array<CUDA::Stream>>] Streams per GPU per
  #   stage; created and destroyed internally when omitted
  # @return [void]
  def execute_pipelined_all_reduce(ring:, buffers:, total_size:, dtype:, op:, gpu_streams: nil)
    n_gpus = ring.n_gpus
    step = aligned_chunk_size(dtype)
    n_chunks = (total_size + step - 1) / step
    owns_streams = gpu_streams.nil?
    return if owns_streams && n_chunks <= 0

    gpu_streams ||= Array.new(n_gpus) do |rank|
      CUDA::RuntimeAPI.cudaSetDevice(ring.ring_order[rank])
      create_streams(@num_stages)
    end

    begin
      n_chunks.times do |chunk_id|
        chunk_offset = chunk_id * step
        chunk_bytes = [total_size - chunk_offset, step].min
        stage_idx = chunk_id % @num_stages

        ring.all_reduce(
          buffers: buffers.map { |buf| ptr_offset(buf, chunk_offset) },
          sizes: [chunk_bytes] * n_gpus,
          dtype: dtype,
          op: op,
          streams: gpu_streams.map { |per_gpu| per_gpu[stage_idx] }
        )
      end

      # Final sync
      gpu_streams.flatten.each { |s| sync_stream(s) }
    ensure
      if owns_streams
        gpu_streams.each_with_index do |per_gpu, rank|
          CUDA::RuntimeAPI.cudaSetDevice(ring.ring_order[rank])
          destroy_streams(per_gpu)
        end
      end
    end

    nil
  end

  private

  # Split +total_size+ bytes into pending stages of at most chunk_size bytes.
  def build_stages(total_size)
    n_chunks = (total_size + @chunk_size - 1) / @chunk_size
    return [] unless n_chunks.positive?

    Array.new(n_chunks) do |i|
      offset = i * @chunk_size
      PipelineStage.new(chunk_id: i, offset: offset,
                        size: [total_size - offset, @chunk_size].min,
                        state: STATE_PENDING)
    end
  end

  # chunk_size rounded down to a multiple of the dtype width (at least one
  # element). Unknown dtypes are treated as 4 bytes wide.
  def aligned_chunk_size(dtype)
    width = case dtype
            when :float32, :int32 then 4
            when :float64, :int64 then 8
            when :float16, :bfloat16 then 2
            else 4
            end
    aligned = @chunk_size - (@chunk_size % width)
    aligned.positive? ? aligned : width
  end

  # Create +count+ CUDA streams on the current device.
  def create_streams(count, _buffer = nil)
    Array.new(count) do
      stream_ptr = FFI::MemoryPointer.new(:pointer)
      CUDA::RuntimeAPI.cudaStreamCreate(stream_ptr)
      stream_ptr.read_pointer
    end
  end

  # Same as #create_streams; kept as a separate name for existing callers.
  def create_streams_for_device(count)
    create_streams(count)
  end

  # Best-effort destruction of streams created by this object. Skipped
  # when the runtime binding does not expose cudaStreamDestroy.
  def destroy_streams(streams)
    return unless CUDA::RuntimeAPI.respond_to?(:cudaStreamDestroy)

    streams.each do |stream|
      next if stream.nil? || (stream.respond_to?(:null?) && stream.null?)

      CUDA::RuntimeAPI.cudaStreamDestroy(stream)
    end
  rescue StandardError
    nil
  end

  # Wait for +stream+; anything that is not a stream handle waits for the
  # whole current device.
  def sync_stream(stream)
    stream_ptr = case stream
                 when FFI::Pointer then stream
                 when CUDA::Stream then stream.ptr
                 else FFI::Pointer::NULL
                 end

    if stream_ptr.null?
      CUDA::RuntimeAPI.cudaDeviceSynchronize
    else
      CUDA::RuntimeAPI.cudaStreamSynchronize(stream_ptr)
    end
  end

  def ptr_offset(ptr, offset)
    FFI::Pointer.new(:uint8, ptr.address + offset)
  end
end
220
+ end
221
+ end
222
+ end