ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,633 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "device_manager"
4
+ require_relative "transport_selector"
5
+ require_relative "topology"
6
+ require_relative "algorithms/reduction_ops"
7
+ require_relative "algorithms/ring"
8
+ require_relative "algorithms/tree"
9
+
10
+ module Ignis
11
+ module Collective
12
+ # Primary user-facing abstraction for collective operations
13
+ # Provides AllReduce, Broadcast, Reduce, and other collective primitives
14
+ class Communicator
15
+ # Reduction operations
16
+ REDUCTION_OPS = [:sum, :prod, :min, :max, :avg].freeze
17
+
18
+ # @return [Array<Integer>] GPU device IDs in this communicator
19
+ attr_reader :gpu_ids
20
+
21
+ # @return [DeviceManager] Device manager
22
+ attr_reader :device_manager
23
+
24
+ # @return [TransportSelector] Transport selector
25
+ attr_reader :transport_selector
26
+
27
+ # @return [Integer] Rank of this communicator (for multi-process)
28
+ attr_reader :rank
29
+
30
+ # @return [Integer] Total number of ranks
31
+ attr_reader :world_size
32
+
33
# Build a communicator over a fixed set of GPUs.
#
# The GPU list is copied and frozen, then checked by validate_gpu_ids!
# before the device manager and transport selector are constructed. The
# communicator starts out uninitialized with no ring order; call
# initialize! before running any collective.
#
# @param gpu_ids [Array<Integer>] GPU device IDs to include
# @param rank [Integer] Rank of this process (default 0 for single-process)
# @param world_size [Integer] Total ranks (default 1 for single-process)
# @raise [ArgumentError] if validate_gpu_ids! rejects the GPU list
def initialize(gpu_ids:, rank: 0, world_size: 1)
  @gpu_ids = gpu_ids.dup.freeze
  @rank, @world_size = rank, world_size

  validate_gpu_ids!

  @device_manager = DeviceManager.new(device_ids: @gpu_ids)
  @transport_selector = TransportSelector.new(@gpu_ids)
  @initialized = false
  @ring_order = nil
end
49
+
50
# Bring the communicator up: initialize the device manager, enable P2P
# access between all devices, initialize the transport selector and cache
# its optimal ring order. Once this has succeeded, further calls do nothing.
# @return [self]
def initialize!
  unless @initialized
    @device_manager.initialize!
    @device_manager.enable_all_p2p_access!
    @transport_selector.initialize!
    @ring_order = @transport_selector.optimal_ring_order
    @initialized = true
  end

  self
end
63
+
64
# AllReduce: combine one tensor per GPU with +op+ and make the result
# available on every GPU.
# @param tensors [Array<NvArray>] One tensor per GPU
# @param op [Symbol] Reduction operation (:sum, :prod, :min, :max, :avg)
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [Array<NvArray>] Reduced tensors (same references as input)
# @raise [ArgumentError] on an unknown op or a tensor-count mismatch
# @raise [CommunicatorError] if the communicator is not initialized
def all_reduce(tensors, op: :sum, stream: nil)
  validate_operation!(op)
  validate_tensors!(tensors)
  ensure_initialized!

  # A lone GPU has nobody to exchange with; otherwise run the ring algorithm.
  @gpu_ids.size == 1 ? tensors : ring_all_reduce(tensors, op, stream)
end
80
+
81
# AllReduce variant that insists on a caller-supplied stream; the caller is
# responsible for synchronizing before reading the result.
# @param tensors [Array<NvArray>] One tensor per GPU
# @param op [Symbol] Reduction operation
# @param stream [CUDA::Stream] CUDA stream for async execution
# @return [Array<NvArray>] Tensors (result available after sync)
# @raise [ArgumentError] if +stream+ is nil or false
def all_reduce_async(tensors, op: :sum, stream:)
  return all_reduce(tensors, op: op, stream: stream) if stream

  raise ArgumentError, "Stream required for async operation"
end
91
+
92
# Broadcast a tensor from the root GPU to every GPU in the communicator.
# @param tensor [NvArray] Source tensor on root GPU
# @param root [Integer] Root GPU index (default 0)
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [Array] +[tensor]+ for a single GPU; otherwise whatever
#   simple_broadcast returns (one entry per GPU)
def broadcast(tensor, root: 0, stream: nil)
  ensure_initialized!
  validate_gpu_index!(root)

  if @gpu_ids.size == 1
    [tensor]
  else
    simple_broadcast(tensor, root, stream)
  end
end
107
+
108
# Reduce one tensor per GPU down to the root GPU.
# @param tensors [Array<NvArray>] One tensor per GPU
# @param root [Integer] Root GPU index
# @param op [Symbol] Reduction operation
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [NvArray] Reduced tensor on root GPU
def reduce(tensors, root: 0, op: :sum, stream: nil)
  validate_operation!(op)
  validate_tensors!(tensors)
  ensure_initialized!
  validate_gpu_index!(root)

  # With one GPU the only tensor already is the result.
  if @gpu_ids.size == 1
    tensors[0]
  else
    simple_reduce(tensors, root, op, stream)
  end
end
125
+
126
# AllGather: give every GPU the tensors of all GPUs.
# @param tensors [Array<NvArray>] One tensor per GPU (each may be different size)
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [Array<Array<NvArray>>] Gathered tensors on each GPU
def all_gather(tensors, stream: nil)
  validate_tensors!(tensors)
  ensure_initialized!

  if @gpu_ids.size == 1
    [tensors]
  else
    simple_all_gather(tensors, stream)
  end
end
139
+
140
# ReduceScatter: reduce across GPUs and leave one reduced chunk per GPU.
#
# The chunk size is derived by the ring algorithm from the byte size of the
# first tensor, and one result buffer of that size is allocated on each GPU.
# With a single GPU the input array is returned unchanged.
#
# @param tensors [Array<NvArray>] One tensor per GPU
# @param op [Symbol] Reduction operation
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [Array<FFI::Pointer>] Scattered reduced chunks (chunk size = total_size / N)
def reduce_scatter(tensors, op: :sum, stream: nil)
  validate_operation!(op)
  validate_tensors!(tensors)
  ensure_initialized!

  gpu_count = @gpu_ids.size
  return tensors if gpu_count == 1

  ring = Algorithms::Ring.new(ring_order: @ring_order, transport_selector: @transport_selector)

  buffers = tensors.map { |t| device_buffer(t) }
  sizes = tensors.map { |t| byte_size_of(t) }

  # Fall back to float32 when the first tensor does not expose a dtype.
  head = tensors[0]
  dtype = head.respond_to?(:dtype) ? head.dtype : :float32

  chunk_size = ring.calculate_chunk_size(sizes[0])
  result_buffers = @gpu_ids.map { |gpu_id| allocate_buffer_on_device(gpu_id, chunk_size) }

  # A caller-supplied stream is shared by every GPU; otherwise use null streams.
  streams = stream ? [stream] * gpu_count : create_null_streams(gpu_count)

  ring.reduce_scatter(
    buffers: buffers,
    result_buffers: result_buffers,
    sizes: sizes,
    dtype: dtype,
    op: op,
    streams: streams
  )

  result_buffers
end
188
+
189
# AllToAll - full exchange between all GPUs.
# Each GPU sends N chunks (one to each GPU) and receives N chunks (one from
# each GPU).
#
# Every pairwise transport is resolved before any data moves. Only P2P
# transports are supported here; if any pair resolves to something else a
# CommunicatorError is raised up front instead of silently skipping that
# pair and leaving its receive buffer unfilled.
#
# @param send_buffers [Array<Array<FFI::Pointer>>] N×N array: send_buffers[src][dst]
# @param recv_buffers [Array<Array<FFI::Pointer>>] N×N array: recv_buffers[dst][src]
# @param chunk_size [Integer] Size of each chunk in bytes
# @param stream [CUDA::Stream, nil] Optional CUDA stream (shared by all GPUs)
# @return [void]
# @raise [CommunicatorError] if not initialized, or if a GPU pair has no P2P transport
def all_to_all(send_buffers, recv_buffers, chunk_size:, stream: nil)
  ensure_initialized!

  n = @gpu_ids.size
  return if n == 1

  streams = stream ? [stream] * n : create_null_streams(n)

  # Plan the N-1 exchange rounds (rotation pattern: in round k, rank r sends
  # to rank (r + k) % n) and fail fast on unsupported transports.
  rounds = (1...n).map do |offset|
    Array.new(n) do |rank|
      partner = (rank + offset) % n
      transport = @transport_selector.select_transport(@gpu_ids[rank], @gpu_ids[partner])
      unless transport.is_a?(Transport::P2PTransport)
        raise CommunicatorError,
              "all_to_all requires a P2P transport between GPU #{@gpu_ids[rank]} " \
              "and GPU #{@gpu_ids[partner]}, got #{transport.class}"
      end
      [rank, partner, transport]
    end
  end

  # Phase 1: Copy local data (GPU[i] → GPU[i])
  n.times do |rank|
    CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[rank])
    CUDA::RuntimeAPI.cudaMemcpyAsync(
      recv_buffers[rank][rank],
      send_buffers[rank][rank],
      chunk_size,
      CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_DEVICE,
      get_stream_ptr(streams[rank])
    )
  end

  # Phase 2: N-1 rounds of pairwise exchange
  rounds.each do |transfers|
    transfers.each do |rank, partner, transport|
      # Issue the copy from the sending device, as phase 1 and send_recv do.
      CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[rank])
      transport.copy_async(
        recv_buffers[partner][rank], # Partner receives from me
        send_buffers[rank][partner], # I send to partner
        chunk_size,
        get_stream_ptr(streams[rank])
      )
    end

    # Synchronize after each round
    synchronize_all_streams!(streams)
  end

  nil
end
247
+
248
# Point-to-point send to a destination rank.
#
# A copy needs somewhere to land, so the caller must pass a pre-allocated
# +dst_buffer+ on the destination GPU; the transfer itself is delegated to
# send_recv. Without +dst_buffer+ this raises ArgumentError for every
# transport (previously only P2P raised and other transports silently
# transferred nothing).
#
# NOTE: this method shadows Object#send on Communicator instances; use
# __send__ for dynamic dispatch on a communicator.
#
# @param tensor [NvArray, FFI::Pointer] Data to send
# @param dest_rank [Integer] Destination rank (index in gpu_ids)
# @param size [Integer, nil] Size in bytes (inferred from tensor if nil)
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @param src_rank [Integer] Sending rank (index in gpu_ids, default 0)
# @param dst_buffer [FFI::Pointer, nil] Pre-allocated receive buffer on the destination GPU
# @return [void]
# @raise [ArgumentError] if a rank is out of range or +dst_buffer+ is missing
def send(tensor, dest_rank:, size: nil, stream: nil, src_rank: 0, dst_buffer: nil)
  ensure_initialized!
  validate_gpu_index!(dest_rank)
  validate_gpu_index!(src_rank)

  # Sending to oneself is a no-op.
  return if src_rank == dest_rank

  if dst_buffer.nil?
    raise ArgumentError, "P2P send requires pre-allocated recv buffer on dest"
  end

  send_recv(
    device_buffer(tensor),
    src_rank: src_rank,
    dst_buffer: dst_buffer,
    dst_rank: dest_rank,
    size: size || byte_size_of(tensor),
    stream: stream
  )
  nil
end
276
+
277
# Point-to-point copy from a buffer on one rank to a buffer on another.
#
# P2P transports: the source device is selected and the copy is enqueued
# asynchronously. IPC transports: the source buffer's handle is exported,
# imported on the destination device, copied from, and the stream is
# synchronized before the mapping is closed so the enqueued copy never
# reads an unmapped region; the mapping is closed even if the copy raises.
# Any other transport raises instead of silently transferring nothing.
#
# @param buffer [FFI::Pointer] Source buffer on src_rank GPU
# @param src_rank [Integer] Source rank
# @param dst_buffer [FFI::Pointer] Destination buffer on dst_rank GPU
# @param dst_rank [Integer] Destination rank
# @param size [Integer] Size in bytes
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [void]
# @raise [ArgumentError] if a rank is out of range
# @raise [CommunicatorError] if not initialized or the transport is unsupported
def send_recv(buffer, src_rank:, dst_buffer:, dst_rank:, size:, stream: nil)
  ensure_initialized!
  validate_gpu_index!(src_rank)
  validate_gpu_index!(dst_rank)

  return if src_rank == dst_rank

  src_gpu = @gpu_ids[src_rank]
  dst_gpu = @gpu_ids[dst_rank]

  transport = @transport_selector.select_transport(src_gpu, dst_gpu)
  stream_ptr = stream ? get_stream_ptr(stream) : FFI::Pointer::NULL

  case transport
  when Transport::P2PTransport
    # Set source device context
    CUDA::RuntimeAPI.cudaSetDevice(src_gpu)
    transport.copy_async(dst_buffer, buffer, size, stream_ptr)
  when Transport::IPCTransport
    # For IPC, export/import handles
    handle = transport.export_handle(buffer)
    CUDA::RuntimeAPI.cudaSetDevice(dst_gpu)
    mapped = transport.import_handle(handle)

    begin
      # Copy from mapped to destination
      CUDA::RuntimeAPI.cudaMemcpyAsync(
        dst_buffer,
        mapped,
        size,
        CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_DEVICE,
        stream_ptr
      )

      # The copy is only enqueued; wait for it before unmapping the source.
      if stream_ptr.null?
        CUDA::RuntimeAPI.cudaDeviceSynchronize
      else
        CUDA::RuntimeAPI.cudaStreamSynchronize(stream_ptr)
      end
    ensure
      transport.close_imported_handle(mapped)
    end
  else
    raise CommunicatorError,
          "send_recv does not support #{transport.class} between GPU #{src_gpu} and GPU #{dst_gpu}"
  end
end
320
+
321
# Point-to-point receive. No data is moved here: the sender performs the
# transfer through send_recv, and this side only validates its arguments and
# then waits on a barrier. +buffer+, +size+ and +stream+ are accepted but
# not used.
# @param buffer [FFI::Pointer] Buffer to receive into
# @param src_rank [Integer] Source rank
# @param size [Integer] Expected size in bytes
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [void]
def recv(buffer, src_rank:, size:, stream: nil)
  ensure_initialized!
  validate_gpu_index!(src_rank)

  # Waiting for all devices stands in for "the receive buffer is ready".
  barrier
end
334
+
335
# Block until the device manager reports every GPU synchronized.
# @return [void]
# @raise [CommunicatorError] if the communicator is not initialized
def barrier
  ensure_initialized!

  @device_manager.synchronize_all!
end
341
+
342
# Whether the communicator can be used: it has been initialized and both the
# device manager and the transport selector report ready.
# @return [Boolean] True if initialized
def ready?
  return @initialized unless @initialized

  devices_ready = @device_manager.ready?
  return devices_ready unless devices_ready

  @transport_selector.ready?
end
349
+
350
# Topology matrix held by the device manager, if it has a topology.
# @return [Topology::Matrix, nil] Topology information, or nil when the
#   device manager has no topology
def topology
  detected = @device_manager.topology
  detected.nil? ? nil : detected.matrix
end
355
+
356
# Performance statistics, as reported by the transport selector.
# @return [Hash] Performance statistics
def performance_summary
  selector = @transport_selector
  selector.performance_summary
end
361
+
362
# Clean up all resources.
#
# The transport selector is torn down first, then the device manager. The
# device manager is destroyed and the communicator is marked uninitialized
# even if an earlier teardown step raises, so a failure cannot leave the
# communicator half-destroyed but still reporting itself as initialized.
# The error is re-raised after cleanup.
# @return [void]
def destroy!
  begin
    @transport_selector.destroy!
  ensure
    begin
      @device_manager.destroy!
    ensure
      @initialized = false
    end
  end

  nil
end
369
+
370
# Short description giving the GPU count and whether initialize! has run.
# @return [String] Human-readable description
def to_s
  state = if @initialized then "ready" else "uninitialized" end
  gpu_count = @gpu_ids.size
  "Communicator[#{gpu_count} GPUs, #{state}]"
end
375
+
376
# Debug representation listing the GPU IDs, rank/world size and
# initialization flag.
# @return [String] Detailed inspection
def inspect
  fields = [
    "gpu_ids=#{@gpu_ids}",
    "rank=#{@rank}/#{@world_size}",
    "initialized=#{@initialized}"
  ]
  "#<Ignis::Collective::Communicator #{fields.join(' ')}>"
end
383
+
384
+ private
385
+
386
# Check @gpu_ids against the devices reported by CUDA::Device.count.
#
# Every ID must be an Integer in 0...device_count. Non-Integer entries
# (nil, strings, floats) are reported as invalid IDs rather than surfacing
# as NoMethodError/comparison errors or slipping through.
# @return [void]
# @raise [ArgumentError] if the list is empty, no devices exist, or any ID is invalid
def validate_gpu_ids!
  raise ArgumentError, "gpu_ids cannot be empty" if @gpu_ids.empty?

  device_count = CUDA::Device.count
  # Without this guard the range in the message below would read "0--1".
  raise ArgumentError, "No CUDA devices available" if device_count < 1

  max_id = device_count - 1
  invalid = @gpu_ids.reject { |id| id.is_a?(Integer) && id.between?(0, max_id) }
  return if invalid.empty?

  raise ArgumentError, "Invalid GPU IDs: #{invalid}. Valid range: 0-#{max_id}"
end
395
+
396
# Reject any reduction op that is not listed in REDUCTION_OPS.
# @param op [Symbol] Requested reduction operation
# @return [void]
# @raise [ArgumentError] if +op+ is not a known reduction
def validate_operation!(op)
  unless REDUCTION_OPS.include?(op)
    raise ArgumentError, "Invalid reduction op: #{op}. Valid: #{REDUCTION_OPS}"
  end
end
401
+
402
# Require exactly one tensor per GPU in the communicator.
# @param tensors [Array] Tensors supplied by the caller
# @return [void]
# @raise [ArgumentError] if the tensor count differs from the GPU count
def validate_tensors!(tensors)
  return if tensors.size == @gpu_ids.size

  raise ArgumentError,
        "Expected #{@gpu_ids.size} tensors, got #{tensors.size}"
end
408
+
409
# Require +index+ to be an Integer position within @gpu_ids.
#
# Non-Integer values are rejected explicitly: nil used to raise
# NoMethodError, and a Float such as 0.5 passed the range check and would
# then be truncated by array indexing.
# @param index [Integer] Position in gpu_ids (a rank / GPU index)
# @return [void]
# @raise [ArgumentError] if +index+ is not an Integer in 0...gpu_ids.size
def validate_gpu_index!(index)
  return if index.is_a?(Integer) && index >= 0 && index < @gpu_ids.size

  raise ArgumentError, "Invalid GPU index: #{index.inspect}. Valid: 0-#{@gpu_ids.size - 1}"
end
414
+
415
# Guard used by the public operations: fail unless initialize! has run.
# @return [void]
# @raise [CommunicatorError] if the communicator is not initialized
def ensure_initialized!
  raise CommunicatorError, "Communicator not initialized. Call initialize! first." unless @initialized
end
420
+
421
# Ring AllReduce (scatter-reduce followed by all-gather) over the cached
# ring order.
#
# Device pointers and byte sizes are extracted from the tensors, the dtype
# is taken from the first tensor (float32 when it exposes none), and the
# work is handed to Algorithms::Ring. For :avg the op is passed to the ring
# unchanged and each buffer is then scaled by 1/N through apply_avg!.
#
# @param tensors [Array] One tensor per GPU
# @param op [Symbol] Reduction operation
# @param stream [CUDA::Stream, nil] Stream shared by all GPUs, or nil for null streams
# @return [Array] the +tensors+ array that was passed in
def ring_all_reduce(tensors, op, stream)
  gpu_count = @gpu_ids.size
  return tensors if gpu_count == 1

  ring = Algorithms::Ring.new(ring_order: @ring_order, transport_selector: @transport_selector)

  # Sizes are in BYTES, not element counts.
  buffers = tensors.map { |t| device_buffer(t) }
  sizes = tensors.map { |t| byte_size_of(t) }

  head = tensors[0]
  dtype = if head.respond_to?(:dtype)
            head.dtype
          else
            :float32
          end

  streams = if stream
              [stream] * gpu_count
            else
              create_null_streams(gpu_count)
            end

  ring.all_reduce(buffers: buffers, sizes: sizes, dtype: dtype, op: op, streams: streams)

  # Divide by the participant count once, after the reduction has finished.
  apply_avg!(buffers, sizes, dtype, gpu_count) if op == :avg

  tensors
end
457
+
458
# Build an array of +n+ null stream pointers, one slot per GPU.
# @param n [Integer] Number of entries
# @return [Array<FFI::Pointer>] +n+ references to FFI::Pointer::NULL
def create_null_streams(n)
  Array.new(n, FFI::Pointer::NULL)
end
462
+
463
# Resolve a tensor to its device pointer. The first accessor the object
# responds to wins, in this order: device_ffi_ptr, device_ptr, data_ptr.
# Anything exposing none of them is assumed to already be a raw
# FFI::Pointer and is returned untouched.
# @param t [Object] Tensor-like object or raw pointer
# @return [Object] The device pointer for +t+
def device_buffer(t)
  accessor = %i[device_ffi_ptr device_ptr data_ptr].find { |name| t.respond_to?(name) }
  accessor ? t.public_send(accessor) : t
end
472
+
473
# Size of a tensor in BYTES. Tries nbytes, then size_bytes, then byte_size.
# A plain +size+ is deliberately not consulted: for NvArray it is the
# element COUNT, not the byte count.
# @param t [Object] Tensor-like object
# @return [Integer] Byte size reported by the tensor
# @raise [ArgumentError] if the object exposes none of the byte-size accessors
def byte_size_of(t)
  accessor = %i[nbytes size_bytes byte_size].find { |name| t.respond_to?(name) }
  raise ArgumentError, "cannot determine byte size of #{t.class}" unless accessor

  t.public_send(accessor)
end
482
+
483
# Element size in bytes for a dtype.
#
# 16-bit integers and :uint64 are listed explicitly; previously they fell
# through to the 4-byte default, which gave a wrong element count for
# those dtypes. Unknown dtypes still default to 4 bytes.
# @param dtype [Symbol] Element type
# @return [Integer] Bytes per element
def elem_size_of(dtype)
  case dtype
  when :float64, :int64, :uint64 then 8
  when :float16, :bfloat16, :int16, :uint16 then 2
  when :int8, :uint8 then 1
  else 4
  end
end
492
+
493
# Scale every buffer by 1/n in place (used for op: :avg).
#
# For each buffer the element count is its byte size divided by the dtype's
# element size; the matching GPU is selected and the elementwise scale
# kernel is launched with 256-thread blocks, reading and writing the same
# buffer. Ignis.synchronize is called once after all launches.
# NOTE(review): the same kernel is launched for every dtype — confirm it
# handles non-float32 data.
# @param buffers [Array] Device pointers, one per GPU
# @param sizes [Array<Integer>] Byte sizes matching +buffers+
# @param dtype [Symbol] Element type
# @param n [Integer] Number of participants
# @return [void]
def apply_avg!(buffers, sizes, dtype, n)
  factor = 1.0 / n
  scale_kernel = Ignis::JIT::Kernels::Elementwise.scale_forward
  bytes_per_elem = elem_size_of(dtype)

  buffers.each_with_index do |buffer, idx|
    elem_count = sizes[idx] / bytes_per_elem
    blocks = (elem_count + 255) / 256 # ceil(elem_count / 256)
    CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[idx])
    scale_kernel.launch(grid: [blocks], block: [256], args: [buffer, buffer, factor, elem_count])
  end

  Ignis.synchronize
end
505
+
506
# Broadcast from root using Algorithms::Tree.
#
# The root slot holds the source tensor's device pointer; every other GPU
# gets a newly allocated buffer of the tensor's byte size. The buffers are
# then passed to the tree algorithm.
# @param tensor [Object] Source tensor on the root GPU
# @param root [Integer] Index of the root GPU in gpu_ids
# @param stream [CUDA::Stream, nil] Stream shared by all GPUs, or nil for null streams
# @return [Array] Device buffers, one per GPU (root entry is the source pointer)
def simple_broadcast(tensor, root, stream)
  nbytes = byte_size_of(tensor)
  gpu_count = @gpu_ids.size

  buffers = @gpu_ids.each_with_index.map do |gpu_id, position|
    position == root ? device_buffer(tensor) : allocate_buffer_on_device(gpu_id, nbytes)
  end

  tree = Algorithms::Tree.new(gpu_ids: @gpu_ids, transport_selector: @transport_selector)
  streams = stream ? [stream] * gpu_count : create_null_streams(gpu_count)

  tree.broadcast(buffer: buffers[root], buffers: buffers, size: nbytes, root: root, streams: streams)

  buffers
end
540
+
541
# Reduce to root using Algorithms::Tree.
#
# Device pointers and byte sizes are extracted from the tensors; the dtype
# comes from the first tensor (float32 when it exposes none).
# @param tensors [Array] One tensor per GPU
# @param root [Integer] Index of the root GPU in gpu_ids
# @param op [Symbol] Reduction operation
# @param stream [CUDA::Stream, nil] Stream shared by all GPUs, or nil for null streams
# @return [Object] The tensor at the root position
def simple_reduce(tensors, root, op, stream)
  gpu_count = @gpu_ids.size
  buffers = tensors.map { |t| device_buffer(t) }
  sizes = tensors.map { |t| byte_size_of(t) }

  head = tensors[0]
  dtype = head.respond_to?(:dtype) ? head.dtype : :float32

  tree = Algorithms::Tree.new(gpu_ids: @gpu_ids, transport_selector: @transport_selector)
  streams = stream ? [stream] * gpu_count : create_null_streams(gpu_count)

  tree.reduce(buffers: buffers, sizes: sizes, dtype: dtype, op: op, root: root, streams: streams)

  tensors[root]
end
570
+
571
# All-gather for the single-process model: every rank's tensor is already
# an accessible object, so each rank's gathered set is simply all input
# tensors. A separate shallow copy of the list is returned per rank so one
# rank's list can be modified without affecting the others (the tensors
# themselves are shared).
#
# NOTE: true cross-PROCESS / cross-node gather (world_size > 1) requires the
# transport layer to physically move device buffers, which is not wired yet;
# ring AllGather machinery exists in Algorithms::Ring#all_gather_standalone
# for when P2P/host-staged transports are completed.
# @param tensors [Array] One tensor per GPU
# @param _stream [Object] Ignored
# @return [Array<Array>] One copy of +tensors+ per GPU
def simple_all_gather(tensors, _stream)
  barrier
  @gpu_ids.map { tensors.dup }
end
584
+
585
# Helper: get tensor size in bytes.
#
# Byte-size accessors are tried first (nbytes, size_bytes, byte_size) so a
# tensor whose +size+ is an element count is not measured by that count.
# Objects exposing none of them fall back to +size+, and anything else
# defaults to 4 bytes, as before.
# @param tensor [Object] Tensor-like object
# @return [Integer] Size reported by the tensor, or 4 when it reports none
def tensor_size(tensor)
  %i[nbytes size_bytes byte_size].each do |accessor|
    return tensor.public_send(accessor) if tensor.respond_to?(accessor)
  end

  return tensor.size if tensor.respond_to?(:size)

  4 # Default to 4 bytes
end
595
+
596
# Helper: allocate +size+ bytes on a specific device.
#
# Makes sure the runtime API is loaded, selects the device, calls cudaMalloc
# and passes its status to check_status!.
# @param gpu_id [Integer] Device to allocate on
# @param size [Integer] Allocation size in bytes
# @return [FFI::Pointer] Pointer read back from the cudaMalloc out-parameter
def allocate_buffer_on_device(gpu_id, size)
  runtime = CUDA::RuntimeAPI
  runtime.ensure_loaded!
  runtime.cudaSetDevice(gpu_id)

  # cudaMalloc writes the device address into this out-parameter.
  out_ptr = FFI::MemoryPointer.new(:pointer)
  runtime.check_status!(runtime.cudaMalloc(out_ptr, size), "Alloc broadcast buffer")
  out_ptr.read_pointer
end
605
+
606
# Helper: normalize a stream argument to a pointer for FFI calls.
# An FFI::Pointer is returned as-is, a CUDA::Stream yields its +ptr+, and
# anything else (including nil) maps to the null pointer.
# @param stream [FFI::Pointer, CUDA::Stream, Object, nil]
# @return [FFI::Pointer]
def get_stream_ptr(stream)
  return stream if stream.is_a?(FFI::Pointer)
  return stream.ptr if stream.is_a?(CUDA::Stream)

  FFI::Pointer::NULL
end
617
+
618
# Helper: wait for every stream in +streams+, where entry i belongs to
# @gpu_ids[i]. A null stream pointer triggers a device-wide synchronize;
# any other pointer synchronizes just that stream.
# @param streams [Array] One stream (or pointer) per GPU
# @return [Array] the +streams+ array
def synchronize_all_streams!(streams)
  streams.each_with_index do |stream, position|
    CUDA::RuntimeAPI.cudaSetDevice(@gpu_ids[position])

    ptr = get_stream_ptr(stream)
    next CUDA::RuntimeAPI.cudaDeviceSynchronize if ptr.null?

    CUDA::RuntimeAPI.cudaStreamSynchronize(ptr)
  end
end
631
+ end
632
+ end
633
+ end