ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,240 @@
1
+ # frozen_string_literal: true
2
+
3
module Ignis
  module Collective
    # NvArray integration for collective operations.
    #
    # Tensor-aware wrappers around the communicator's collective primitives.
    # Every public method validates its NvArray arguments, delegates the data
    # movement to the supplied communicator and, where the communicator hands
    # back raw device pointers, wraps them in NvArray instances.
    module ArrayOps
      class << self
        # AllReduce on NvArray tensors.
        #
        # @param tensors [Array<NvArray>] One NvArray per GPU
        # @param op [Symbol] Reduction operation (:sum, :prod, :min, :max)
        # @param comm [Communicator] Communicator instance
        # @param stream [CUDA::Stream, nil] Optional CUDA stream
        # @return [Object] Whatever +comm.all_reduce+ returns (documented
        #   upstream as the modified tensors)
        # @raise [ArgumentError] if the tensors are invalid or placed on
        #   devices other than +comm.gpu_ids+
        def all_reduce(tensors, op: :sum, comm:, stream: nil)
          validate_tensors!(tensors)
          validate_devices!(tensors, comm)

          comm.all_reduce(tensors, op: op, stream: stream)
        end

        # Broadcast an NvArray from the root rank to all GPUs.
        #
        # @param tensor [NvArray] Tensor on root GPU
        # @param root [Integer] Root rank (index into +comm.gpu_ids+)
        # @param comm [Communicator] Communicator instance
        # @param stream [CUDA::Stream, nil] Optional CUDA stream
        # @return [Array<NvArray>] One NvArray per rank; the root entry is the
        #   original +tensor+ object itself
        # @raise [ArgumentError] if +tensor+ is invalid or +root+ is out of range
        def broadcast(tensor, root: 0, comm:, stream: nil)
          validate_single_tensor!(tensor)
          validate_root!(root, comm.gpu_ids.size)

          # comm.broadcast hands back one pointer per rank; wrap the non-root
          # ones in NvArrays that mirror the source tensor's shape and dtype.
          pointers = comm.broadcast(tensor, root: root, stream: stream)

          pointers.map.with_index do |ptr, rank|
            next tensor if rank == root

            create_nvarray_from_ptr(
              ptr: ptr,
              shape: tensor.shape,
              dtype: tensor.dtype,
              device: comm.gpu_ids[rank]
            )
          end
        end

        # Reduce to the root GPU.
        #
        # @param tensors [Array<NvArray>] One NvArray per GPU
        # @param root [Integer] Root rank (index into +tensors+)
        # @param op [Symbol] Reduction operation
        # @param comm [Communicator] Communicator instance
        # @param stream [CUDA::Stream, nil] Optional CUDA stream
        # @return [NvArray] The communicator's result when it is an NvArray,
        #   otherwise the root's input tensor
        # @raise [ArgumentError] if the tensors are invalid, misplaced, or
        #   +root+ is out of range
        def reduce(tensors, root: 0, op: :sum, comm:, stream: nil)
          validate_tensors!(tensors)
          validate_devices!(tensors, comm)
          validate_root!(root, tensors.size)

          result = comm.reduce(tensors, root: root, op: op, stream: stream)

          # The communicator may return either the root tensor or something
          # else (e.g. a pointer); in the latter case fall back to the root's
          # input tensor.
          result.is_a?(NvArray) ? result : tensors[root]
        end

        # AllGather - each GPU contributes a chunk, all receive every chunk.
        #
        # The gathered shape is the chunk shape with its first dimension
        # multiplied by the number of tensors (concatenation along axis 0).
        #
        # @param tensors [Array<NvArray>] One NvArray per GPU (local chunks)
        # @param comm [Communicator] Communicator instance
        # @param stream [CUDA::Stream, nil] Optional CUDA stream
        # @return [Array<NvArray>] One gathered NvArray per rank
        # @raise [ArgumentError] if the tensors are invalid, misplaced, or
        #   zero-dimensional
        def all_gather(tensors, comm:, stream: nil)
          validate_tensors!(tensors)
          validate_devices!(tensors, comm)

          first = tensors.first
          gathered_shape = first.shape.dup
          gathered_shape[0] = leading_dim!(first.shape) * tensors.size

          gathered = comm.all_gather(tensors, stream: stream)

          gathered.map.with_index do |entry, rank|
            case entry
            when NvArray
              # Already wrapped; nothing to do.
              entry
            when Array
              # Entry is itself an array of ready-made NvArrays; pick this
              # rank's element.
              # NOTE(review): confirm this layout against Communicator#all_gather.
              entry[rank]
            else
              create_nvarray_from_ptr(
                ptr: entry,
                shape: gathered_shape,
                dtype: first.dtype,
                device: comm.gpu_ids[rank]
              )
            end
          end
        end

        # ReduceScatter - reduce, then scatter the result.
        #
        # @param tensors [Array<NvArray>] One NvArray per GPU (full size)
        # @param op [Symbol] Reduction operation
        # @param comm [Communicator] Communicator instance
        # @param stream [CUDA::Stream, nil] Optional CUDA stream
        # @return [Array<NvArray>] Chunk NvArrays (first dimension is 1/N of
        #   the input's)
        # @raise [ArgumentError] if the tensors are invalid, misplaced, or the
        #   first dimension is not divisible by the number of tensors
        def reduce_scatter(tensors, op: :sum, comm:, stream: nil)
          validate_tensors!(tensors)
          validate_devices!(tensors, comm)

          first = tensors.first
          chunk_shape = split_leading_dim(first.shape, tensors.size)

          chunk_pointers = comm.reduce_scatter(tensors, op: op, stream: stream)

          chunk_pointers.map.with_index do |ptr, rank|
            create_nvarray_from_ptr(
              ptr: ptr,
              shape: chunk_shape,
              dtype: first.dtype,
              device: comm.gpu_ids[rank]
            )
          end
        end

        # Distribute a tensor across GPUs (scatter).
        #
        # The source buffer is cut into +comm.gpu_ids.size+ equally sized byte
        # ranges; rank +i+ receives the range starting at +i * chunk_size+.
        # The root's chunk is a non-owning view into +tensor+'s own buffer, so
        # the caller must keep +tensor+ alive while that chunk is in use. All
        # other chunks live in freshly allocated device memory that the
        # returned NvArrays take ownership of.
        #
        # @param tensor [NvArray] Source tensor on root GPU
        # @param root [Integer] Root rank (index into +comm.gpu_ids+)
        # @param comm [Communicator] Communicator instance
        # @param stream [CUDA::Stream, nil] Optional CUDA stream
        # @return [Array<NvArray>] Chunks on each GPU, indexed by rank
        # @raise [ArgumentError] if +tensor+ is invalid, +root+ is out of
        #   range, or the first dimension is not divisible by the GPU count
        def scatter(tensor, root: 0, comm:, stream: nil)
          validate_single_tensor!(tensor)

          n_gpus = comm.gpu_ids.size
          validate_root!(root, n_gpus)

          chunk_shape = split_leading_dim(tensor.shape, n_gpus)
          chunk_size = tensor.nbytes / n_gpus
          base_address = tensor.device_ptr.address

          Array.new(n_gpus) do |rank|
            # Every rank, root included, owns the rank-th slice of the source.
            src_ptr = FFI::Pointer.new(:uint8, base_address + (rank * chunk_size))

            if rank == root
              # View into the source buffer: must not take ownership, or the
              # memory would be released twice (and via an interior pointer).
              create_nvarray_from_ptr(
                ptr: src_ptr,
                shape: chunk_shape,
                dtype: tensor.dtype,
                device: comm.gpu_ids[rank],
                take_ownership: false
              )
            else
              # NOTE(review): if send_recv raises, chunk_ptr is not released here.
              chunk_ptr = allocate_on_device(comm.gpu_ids[rank], chunk_size)

              comm.send_recv(
                src_ptr,
                src_rank: root,
                dst_buffer: chunk_ptr,
                dst_rank: rank,
                size: chunk_size,
                stream: stream
              )

              create_nvarray_from_ptr(
                ptr: chunk_ptr,
                shape: chunk_shape,
                dtype: tensor.dtype,
                device: comm.gpu_ids[rank]
              )
            end
          end
        end

        private

        # Ensures +tensors+ is a non-empty Array of device-resident NvArrays
        # that all share the first element's shape and dtype.
        #
        # @raise [ArgumentError] on the first violation found
        def validate_tensors!(tensors)
          raise ArgumentError, "Expected array of tensors" unless tensors.is_a?(Array)
          raise ArgumentError, "Empty tensor array" if tensors.empty?

          first = tensors.first

          tensors.each do |t|
            raise ArgumentError, "Tensors must be NvArray" unless t.is_a?(NvArray)
            raise ArgumentError, "Tensors must be on device" unless t.on_device?
            raise ArgumentError, "All tensors must have same shape" unless t.shape == first.shape
            raise ArgumentError, "All tensors must have same dtype" unless t.dtype == first.dtype
          end
        end

        # Ensures +tensor+ is a device-resident NvArray.
        #
        # @raise [ArgumentError] otherwise
        def validate_single_tensor!(tensor)
          raise ArgumentError, "Expected NvArray" unless tensor.is_a?(NvArray)
          raise ArgumentError, "Tensor must be on device" unless tensor.on_device?
        end

        # Ensures there is exactly one tensor per communicator GPU and that
        # tensor +i+ lives on +comm.gpu_ids[i]+.
        #
        # @raise [ArgumentError] on a count or placement mismatch
        def validate_devices!(tensors, comm)
          gpu_ids = comm.gpu_ids

          unless tensors.size == gpu_ids.size
            raise ArgumentError, "Expected #{gpu_ids.size} tensors (one per GPU), got #{tensors.size}"
          end

          tensors.each_with_index do |t, i|
            expected_device = gpu_ids[i]
            actual_device = t.device

            unless actual_device == expected_device
              raise ArgumentError, "Tensor #{i} on device #{actual_device}, expected #{expected_device}"
            end
          end
        end

        # Ensures +root+ is an Integer rank in +0...size+.
        #
        # @raise [ArgumentError] otherwise
        def validate_root!(root, size)
          return if root.is_a?(Integer) && root >= 0 && root < size

          raise ArgumentError, "Root rank #{root.inspect} out of range (0...#{size})"
        end

        # Returns the first dimension of +shape+.
        #
        # @raise [ArgumentError] if +shape+ has no dimensions
        def leading_dim!(shape)
          raise ArgumentError, "Tensor must have at least one dimension" if shape.empty?

          shape[0]
        end

        # Returns a copy of +shape+ whose first dimension is divided by +parts+.
        #
        # @raise [ArgumentError] if the first dimension is not an exact
        #   multiple of +parts+ (integer division would silently drop rows)
        def split_leading_dim(shape, parts)
          lead = leading_dim!(shape)

          unless (lead % parts).zero?
            raise ArgumentError, "First dimension (#{lead}) must be divisible by number of GPUs (#{parts})"
          end

          chunk_shape = shape.dup
          chunk_shape[0] = lead / parts
          chunk_shape
        end

        # Wraps a raw device pointer in an NvArray via NvArray.from_device_ptr.
        #
        # @param take_ownership [Boolean] forwarded to from_device_ptr; pass
        #   false when +ptr+ aliases memory owned by another object
        #
        # NOTE(review): +device+ is accepted for call-site symmetry but is not
        # forwarded; whether from_device_ptr can take a device is not visible
        # from this file.
        def create_nvarray_from_ptr(ptr:, shape:, dtype:, device:, take_ownership: true)
          NvArray.from_device_ptr(
            ptr,
            shape: shape,
            dtype: dtype,
            take_ownership: take_ownership
          )
        end

        # Allocates +size+ bytes with cudaMalloc after selecting +device_id+
        # with cudaSetDevice, and returns the resulting device pointer.
        #
        # The selected device is left as the current device on return.
        def allocate_on_device(device_id, size)
          CUDA::RuntimeAPI.ensure_loaded!

          # Assumes the cudaSetDevice binding returns the CUDA status code,
          # like cudaMalloc below — TODO confirm against the RuntimeAPI binding.
          status = CUDA::RuntimeAPI.cudaSetDevice(device_id)
          CUDA::RuntimeAPI.check_status!(status, "Set device for scatter")

          ptr_ptr = FFI::MemoryPointer.new(:pointer)
          status = CUDA::RuntimeAPI.cudaMalloc(ptr_ptr, size)
          CUDA::RuntimeAPI.check_status!(status, "Alloc for scatter")
          ptr_ptr.read_pointer
        end
      end
    end
  end
end