ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module Collective
|
|
5
|
+
# NvArray Integration for Collective Operations
|
|
6
|
+
#
|
|
7
|
+
# Provides tensor-aware wrappers around collective primitives
|
|
8
|
+
# that work directly with NvArray tensors.
|
|
9
|
+
module ArrayOps
|
|
10
|
+
class << self
|
|
11
|
+
# AllReduce on NvArray tensors.
#
# Runs the tensor-list validation, then checks that tensor i sits on
# comm.gpu_ids[i], and finally delegates to the communicator. The return
# value is whatever comm.all_reduce returns.
#
# @param tensors [Array<NvArray>] One NvArray per GPU
# @param op [Symbol] Reduction operation (:sum, :prod, :min, :max)
# @param comm [Communicator] Communicator instance
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [Array<NvArray>] Modified tensors with reduced values
# @raise [ArgumentError] when either validation step rejects the input
def all_reduce(tensors, op: :sum, comm:, stream: nil)
  validate_tensors!(tensors)
  validate_devices!(tensors, comm)

  reduce_options = { op: op, stream: stream }
  comm.all_reduce(tensors, **reduce_options)
end
|
|
24
|
+
|
|
25
|
+
# Broadcast an NvArray from the root rank to every GPU.
#
# comm.broadcast yields one entry per rank. The entry at index +root+ is
# replaced by the caller's own tensor; every other entry is wrapped in a new
# NvArray that reuses the source tensor's shape and dtype.
#
# @param tensor [NvArray] Tensor on root GPU
# @param root [Integer] Root rank
# @param comm [Communicator] Communicator instance
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [Array<NvArray>] NvArrays on all GPUs with same data
# @raise [ArgumentError] when the tensor fails single-tensor validation
def broadcast(tensor, root: 0, comm:, stream: nil)
  validate_single_tensor!(tensor)

  # The communicator hands back raw device pointers, not NvArrays.
  device_buffers = comm.broadcast(tensor, root: root, stream: stream)

  device_buffers.each_with_index.map do |buffer, rank|
    next tensor if rank == root

    create_nvarray_from_ptr(
      ptr: buffer,
      shape: tensor.shape,
      dtype: tensor.dtype,
      device: comm.gpu_ids[rank]
    )
  end
end
|
|
51
|
+
|
|
52
|
+
# Reduce to root GPU.
#
# Validates the tensor list and the root rank, then delegates to
# comm.reduce. When the communicator returns an NvArray it is returned
# as-is; otherwise the caller's tensor at index +root+ is returned.
#
# @param tensors [Array<NvArray>] One NvArray per GPU
# @param root [Integer] Root rank; must index into +tensors+
# @param op [Symbol] Reduction operation
# @param comm [Communicator] Communicator instance
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [NvArray] Reduced tensor on root GPU
# @raise [ArgumentError] when the tensors are invalid or +root+ is not an
#   Integer in 0...tensors.size
def reduce(tensors, root: 0, op: :sum, comm:, stream: nil)
  validate_tensors!(tensors)

  # Without this guard an out-of-range root makes the tensors[root] fallback
  # below return nil, and a negative root silently picks a tensor counted
  # from the end of the list.
  unless root.is_a?(Integer) && root >= 0 && root < tensors.size
    raise ArgumentError, "Root rank #{root.inspect} out of range (0...#{tensors.size})"
  end

  result = comm.reduce(tensors, root: root, op: op, stream: stream)

  # The communicator may hand back something other than an NvArray (e.g. a
  # raw pointer); in that case the root's own tensor is returned instead.
  result.is_a?(NvArray) ? result : tensors[root]
end
|
|
68
|
+
|
|
69
|
+
# AllGather - every GPU contributes a chunk and every GPU receives all chunks.
#
# The gathered shape is the chunk shape with its first axis multiplied by the
# number of tensors (concatenation along axis 0). Each entry returned by
# comm.all_gather is handled by type: an Array entry is indexed with the
# rank, anything else is treated as a pointer and wrapped in an NvArray.
#
# @param tensors [Array<NvArray>] One NvArray per GPU (local chunks)
# @param comm [Communicator] Communicator instance
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [Array<NvArray>] NvArrays on all GPUs with gathered data
# @raise [ArgumentError] when the tensor list fails validation
def all_gather(tensors, comm:, stream: nil)
  validate_tensors!(tensors)

  # Work on a copy so the first tensor's own shape is left untouched.
  gathered_shape = tensors[0].shape.dup
  gathered_shape[0] *= tensors.size

  gathered = comm.all_gather(tensors, stream: stream)

  gathered.each_with_index.map do |entry, rank|
    # Array entries already hold NvArrays; pick the one for this rank.
    next entry[rank] if entry.is_a?(Array)

    create_nvarray_from_ptr(
      ptr: entry,
      shape: gathered_shape,
      dtype: tensors[0].dtype,
      device: comm.gpu_ids[rank]
    )
  end
end
|
|
101
|
+
|
|
102
|
+
# ReduceScatter - reduce across GPUs, then leave each GPU with one chunk.
#
# The chunk shape is the input shape with its first axis divided by the
# number of tensors. Each pointer returned by comm.reduce_scatter is wrapped
# in an NvArray with that chunk shape.
#
# @param tensors [Array<NvArray>] One NvArray per GPU (full size)
# @param op [Symbol] Reduction operation
# @param comm [Communicator] Communicator instance
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [Array<NvArray>] Chunk NvArrays (1/N size each)
# @raise [ArgumentError] when the tensors are invalid or the first dimension
#   is missing or not evenly divisible by the number of tensors
def reduce_scatter(tensors, op: :sum, comm:, stream: nil)
  validate_tensors!(tensors)

  n_gpus = tensors.size
  full_shape = tensors[0].shape
  leading_dim = full_shape[0]

  # Integer division would silently drop the remainder rows, so the chunk
  # wrappers would describe less data than was reduced. Reject up front,
  # before any collective work is started.
  unless leading_dim.is_a?(Integer) && (leading_dim % n_gpus).zero?
    raise ArgumentError,
          "First dimension (#{leading_dim.inspect}) must be divisible by the number of tensors (#{n_gpus})"
  end

  # Copy so the input tensor's own shape is not modified.
  chunk_shape = full_shape.dup
  chunk_shape[0] = leading_dim / n_gpus

  chunk_pointers = comm.reduce_scatter(tensors, op: op, stream: stream)

  chunk_pointers.each_with_index.map do |ptr, rank|
    create_nvarray_from_ptr(
      ptr: ptr,
      shape: chunk_shape,
      dtype: tensors[0].dtype,
      device: comm.gpu_ids[rank]
    )
  end
end
|
|
128
|
+
|
|
129
|
+
# Distribute a tensor across GPUs (scatter).
#
# The source tensor is split into comm.gpu_ids.size equal byte ranges along
# its first axis; rank r receives the r-th range. Non-root ranks get a newly
# allocated buffer filled through comm.send_recv. The root rank gets a view
# over its own range of the source tensor's memory.
#
# NOTE(review): the root chunk is created with take_ownership: false so the
# source allocation keeps a single owner. This assumes NvArray.from_device_ptr
# treats that flag as "do not free"; confirm against NvArray. The source
# tensor must then outlive the root chunk.
#
# NOTE(review): buffers allocated for earlier ranks are not released if a
# later allocation or send_recv raises.
#
# @param tensor [NvArray] Source tensor on root GPU
# @param root [Integer] Root rank; must index into comm.gpu_ids
# @param comm [Communicator] Communicator instance
# @param stream [CUDA::Stream, nil] Optional CUDA stream
# @return [Array<NvArray>] Chunks on each GPU, ordered by rank
# @raise [ArgumentError] when the tensor is invalid, +root+ is out of range,
#   or the first dimension is missing or not divisible by the GPU count
def scatter(tensor, root: 0, comm:, stream: nil)
  validate_single_tensor!(tensor)

  n_gpus = comm.gpu_ids.size
  unless root.is_a?(Integer) && root >= 0 && root < n_gpus
    raise ArgumentError, "Root rank #{root.inspect} out of range (0...#{n_gpus})"
  end

  full_shape = tensor.shape
  leading_dim = full_shape[0]
  # Truncating division would leave trailing rows undelivered.
  unless leading_dim.is_a?(Integer) && (leading_dim % n_gpus).zero?
    raise ArgumentError,
          "First dimension (#{leading_dim.inspect}) must be divisible by the number of GPUs (#{n_gpus})"
  end

  chunk_shape = full_shape.dup
  chunk_shape[0] = leading_dim / n_gpus
  chunk_size = tensor.nbytes / n_gpus
  base_address = tensor.device_ptr.address

  Array.new(n_gpus) do |rank|
    # Every rank, root included, reads from its own byte range. The original
    # code gave the root offset 0, which duplicated chunk 0 and dropped
    # chunk `root` whenever root was not rank 0.
    src_ptr = FFI::Pointer.new(:uint8, base_address + (rank * chunk_size))

    if rank == root
      # View into memory the source tensor already owns; must not be freed
      # a second time through the chunk wrapper.
      NvArray.from_device_ptr(
        src_ptr,
        shape: chunk_shape,
        dtype: tensor.dtype,
        take_ownership: false
      )
    else
      chunk_ptr = allocate_on_device(comm.gpu_ids[rank], chunk_size)

      comm.send_recv(
        src_ptr,
        src_rank: root,
        dst_buffer: chunk_ptr,
        dst_rank: rank,
        size: chunk_size,
        stream: stream
      )

      create_nvarray_from_ptr(
        ptr: chunk_ptr,
        shape: chunk_shape,
        dtype: tensor.dtype,
        device: comm.gpu_ids[rank]
      )
    end
  end
end
|
|
183
|
+
|
|
184
|
+
private
|
|
185
|
+
|
|
186
|
+
# Check that +tensors+ is a non-empty Array of device-resident NvArrays that
# all share the first element's shape and dtype.
#
# Every element is type- and residency-checked, not only the first, so a
# stray non-NvArray or host tensor later in the list is reported with the
# same ArgumentError instead of slipping through or raising NoMethodError.
#
# @param tensors [Array<NvArray>] Candidate tensor list
# @return [Array<NvArray>] the same +tensors+ array when it is valid
# @raise [ArgumentError] describing the first violated condition
def validate_tensors!(tensors)
  raise ArgumentError, "Expected array of tensors" unless tensors.is_a?(Array)
  raise ArgumentError, "Empty tensor array" if tensors.empty?

  first = tensors[0]

  tensors.each do |t|
    raise ArgumentError, "Tensors must be NvArray" unless t.is_a?(NvArray)
    raise ArgumentError, "Tensors must be on device" unless t.on_device?
    raise ArgumentError, "All tensors must have same shape" unless t.shape == first.shape
    raise ArgumentError, "All tensors must have same dtype" unless t.dtype == first.dtype
  end
end
|
|
203
|
+
|
|
204
|
+
# Check that +tensor+ is an NvArray that reports itself as device-resident.
#
# @param tensor [NvArray] Candidate tensor
# @return [nil]
# @raise [ArgumentError] when +tensor+ is not an NvArray or is not on device
def validate_single_tensor!(tensor)
  unless tensor.is_a?(NvArray)
    raise ArgumentError, "Expected NvArray"
  end

  return if tensor.on_device?

  raise ArgumentError, "Tensor must be on device"
end
|
|
208
|
+
|
|
209
|
+
# Check that there is exactly one tensor per communicator GPU and that
# tensor i reports the device listed at comm.gpu_ids[i].
#
# The count check comes first: without it, surplus tensors were compared
# against a nil device id (producing a blank "expected" in the message) and
# a short tensor list passed unnoticed.
#
# @param tensors [Array<NvArray>] Tensors ordered by rank
# @param comm [Communicator] Communicator providing +gpu_ids+
# @return [Array<NvArray>] the same +tensors+ array when every check passes
# @raise [ArgumentError] on a count mismatch or a tensor on the wrong device
def validate_devices!(tensors, comm)
  gpu_ids = comm.gpu_ids

  unless tensors.size == gpu_ids.size
    raise ArgumentError, "Expected #{gpu_ids.size} tensors (one per GPU), got #{tensors.size}"
  end

  tensors.each_with_index do |t, i|
    expected_device = gpu_ids[i]
    actual_device = t.device

    unless actual_device == expected_device
      raise ArgumentError, "Tensor #{i} on device #{actual_device}, expected #{expected_device}"
    end
  end
end
|
|
219
|
+
|
|
220
|
+
# Wrap a device pointer in an NvArray via NvArray.from_device_ptr, always
# requesting ownership of the pointer.
#
# @param ptr [FFI::Pointer] Device pointer to wrap
# @param shape [Array<Integer>] Shape to give the new NvArray
# @param dtype [Object] Element type forwarded unchanged
# @param device [Integer] Device id; accepted for the callers' benefit but
#   not forwarded to NvArray.from_device_ptr
# @return [NvArray] whatever NvArray.from_device_ptr returns
def create_nvarray_from_ptr(ptr:, shape:, dtype:, device:)
  wrap_options = { shape: shape, dtype: dtype, take_ownership: true }
  NvArray.from_device_ptr(ptr, **wrap_options)
end
|
|
228
|
+
|
|
229
|
+
# Allocate +size+ bytes with cudaMalloc after selecting +device_id+ as the
# current CUDA device.
#
# The cudaSetDevice status is checked as well as the cudaMalloc status; if
# the switch fails, the allocation would otherwise land on whichever device
# happened to be current.
#
# NOTE(review): the previously current device is not restored afterwards.
#
# @param device_id [Integer] CUDA device to allocate on
# @param size [Integer] Number of bytes to allocate
# @return [FFI::Pointer] pointer read back from the cudaMalloc out-parameter
# @raise whatever CUDA::RuntimeAPI.check_status! raises for a failing status
def allocate_on_device(device_id, size)
  CUDA::RuntimeAPI.ensure_loaded!

  set_status = CUDA::RuntimeAPI.cudaSetDevice(device_id)
  CUDA::RuntimeAPI.check_status!(set_status, "Set device for scatter")

  # cudaMalloc writes the device address into this out-parameter.
  ptr_ptr = FFI::MemoryPointer.new(:pointer)
  status = CUDA::RuntimeAPI.cudaMalloc(ptr_ptr, size)
  CUDA::RuntimeAPI.check_status!(status, "Alloc for scatter")

  ptr_ptr.read_pointer
end
|
|
237
|
+
end
|
|
238
|
+
end
|
|
239
|
+
end
|
|
240
|
+
end
|