ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base"
|
|
4
|
+
require_relative "../vmm_bindings"
|
|
5
|
+
|
|
6
|
+
module Ignis
|
|
7
|
+
module Collective
|
|
8
|
+
module Transport
|
|
9
|
+
# VMM IPC Transport - Modern CUDA Virtual Memory Management IPC
|
|
10
|
+
#
|
|
11
|
+
# Uses cuMemExportToShareableHandle with CU_MEM_HANDLE_TYPE_WIN32 for
|
|
12
|
+
# cross-process GPU memory sharing on Windows. Required for sharing
|
|
13
|
+
# memory allocated with cudaMallocAsync.
|
|
14
|
+
#
|
|
15
|
+
# Workflow:
|
|
16
|
+
# 1. Sender: cuMemCreate -> cuMemExportToShareableHandle -> send HANDLE
|
|
17
|
+
# 2. Receiver: cuMemImportFromShareableHandle -> cuMemAddressReserve -> cuMemMap -> cuMemSetAccess
|
|
18
|
+
class VMMIPCTransport < Base
|
|
19
|
+
class << self
  # Identifier under which this transport implementation is known.
  #
  # @return [Symbol] always +:vmm_ipc+
  def transport_type
    :vmm_ipc
  end
end
|
|
23
|
+
|
|
24
|
+
# Nominal throughput of this transport (a fixed estimate, not a measurement).
#
# @return [Float] Estimated bandwidth in GB/s
def estimated_bandwidth
  # PCIe-limited with IPC overhead
  25.0
end
|
|
28
|
+
|
|
29
|
+
# Nominal per-operation latency of this transport (a fixed estimate).
#
# @return [Float] Estimated latency in microseconds
def estimated_latency
  # IPC overhead higher than P2P
  10.0
end
|
|
33
|
+
|
|
34
|
+
# Load the VMM driver bindings and set up the handle bookkeeping tables.
# Calling it again after a successful run is a no-op.
#
# @return [void]
def initialize!
  unless @initialized
    VMMBindings.ensure_loaded!

    # va_ptr (Integer) -> {alloc_handle:, win32_handle:, size:}
    @exported_handles = {}
    # win32 handle address (Integer) -> {alloc_handle:, va_ptr:, size:}
    @imported_handles = {}

    @initialized = true
  end
end
|
|
44
|
+
|
|
45
|
+
# Create a shareable VMM allocation on the source device, map it into this
# process and export an OS handle for it.
#
# Sequence: cuMemGetAllocationGranularity -> cuMemCreate ->
# cuMemAddressReserve -> cuMemMap -> cuMemSetAccess ->
# cuMemExportToShareableHandle. If any step after cuMemCreate fails, the
# mapping, address range and allocation acquired so far are released before
# the error propagates, so a failed call no longer leaks GPU resources.
#
# @param size [Integer] Requested size in bytes (rounded up to the minimum
#   allocation granularity)
# @return [Hash] {:device_ptr, :alloc_handle, :win32_handle, :size}
# @raise [StandardError] whatever VMMBindings.check_status! raises on a
#   non-zero driver status
def create_shareable_allocation(size)
  ensure_initialized!

  # Get allocation granularity
  prop = VMMBindings.create_allocation_prop(device_id: @src_device, shareable: true)
  granularity_ptr = FFI::MemoryPointer.new(:size_t)
  status = VMMBindings.cuMemGetAllocationGranularity(
    granularity_ptr,
    prop,
    0 # CU_MEM_ALLOC_GRANULARITY_MINIMUM
  )
  VMMBindings.check_status!(status, "Get allocation granularity")

  # read(:size_t) resolves the typedef through FFI's type table; a dedicated
  # read_size_t helper is not part of FFI's documented accessor set.
  granularity = granularity_ptr.read(:size_t)
  aligned_size = ((size + granularity - 1) / granularity) * granularity

  alloc_handle = nil
  va_ptr = nil
  mapped = false
  completed = false

  begin
    # Create allocation
    handle_ptr = FFI::MemoryPointer.new(:uint64)
    status = VMMBindings.cuMemCreate(handle_ptr, aligned_size, prop, 0)
    VMMBindings.check_status!(status, "VMM cuMemCreate")
    alloc_handle = handle_ptr.read_uint64

    # Reserve virtual address
    va_ptr_ptr = FFI::MemoryPointer.new(:uint64)
    status = VMMBindings.cuMemAddressReserve(va_ptr_ptr, aligned_size, 0, 0, 0)
    VMMBindings.check_status!(status, "VMM cuMemAddressReserve")
    va_ptr = va_ptr_ptr.read_uint64

    # Map allocation to virtual address
    status = VMMBindings.cuMemMap(va_ptr, aligned_size, 0, alloc_handle, 0)
    VMMBindings.check_status!(status, "VMM cuMemMap")
    mapped = true

    # Set access for source device
    access_desc = VMMBindings.create_access_desc(device_id: @src_device, read_write: true)
    status = VMMBindings.cuMemSetAccess(va_ptr, aligned_size, access_desc, 1)
    VMMBindings.check_status!(status, "VMM cuMemSetAccess")

    # Export to Windows HANDLE
    win32_handle_ptr = FFI::MemoryPointer.new(:pointer)
    status = VMMBindings.cuMemExportToShareableHandle(
      win32_handle_ptr,
      alloc_handle,
      VMMBindings::CU_MEM_HANDLE_TYPE_WIN32,
      0
    )
    VMMBindings.check_status!(status, "VMM cuMemExportToShareableHandle")
    win32_handle = win32_handle_ptr.read_pointer

    # Cache for cleanup
    @exported_handles[va_ptr] = {
      alloc_handle: alloc_handle,
      win32_handle: win32_handle,
      size: aligned_size
    }
    completed = true
  ensure
    # Roll back in reverse order of acquisition when a step failed.
    unless completed
      VMMBindings.cuMemUnmap(va_ptr, aligned_size) if mapped
      VMMBindings.cuMemAddressFree(va_ptr, aligned_size) if va_ptr
      VMMBindings.cuMemRelease(alloc_handle) if alloc_handle
    end
  end

  {
    device_ptr: FFI::Pointer.new(:uint8, va_ptr),
    alloc_handle: alloc_handle,
    win32_handle: win32_handle,
    size: aligned_size
  }
end
|
|
114
|
+
|
|
115
|
+
# Import a shareable allocation and map it for the destination device.
#
# The requested size is rounded up to the allocation's minimum granularity
# before reserving and mapping. Previously the allocation properties were
# fetched "for alignment" but never used, so an unaligned +size+ was passed
# straight to the driver. If any step after the import fails, the mapping,
# address range and imported handle are released before the error propagates.
#
# @param win32_handle [FFI::Pointer] Windows HANDLE from sender
# @param size [Integer] Size in bytes (rounded up to granularity)
# @return [FFI::Pointer] Device pointer mapped on destination GPU
# @raise [StandardError] whatever VMMBindings.check_status! raises on a
#   non-zero driver status
def import_shareable_allocation(win32_handle, size)
  ensure_initialized!

  # Import the handle
  handle_ptr = FFI::MemoryPointer.new(:uint64)
  status = VMMBindings.cuMemImportFromShareableHandle(
    handle_ptr,
    win32_handle,
    VMMBindings::CU_MEM_HANDLE_TYPE_WIN32
  )
  VMMBindings.check_status!(status, "VMM cuMemImportFromShareableHandle")
  alloc_handle = handle_ptr.read_uint64

  va_ptr = nil
  mapped_size = nil
  mapped = false
  completed = false

  begin
    # Get granularity for alignment
    prop = VMMBindings::CUmemAllocationProp.new
    status = VMMBindings.cuMemGetAllocationPropertiesFromHandle(prop, alloc_handle)
    VMMBindings.check_status!(status, "Get allocation properties")

    granularity_ptr = FFI::MemoryPointer.new(:size_t)
    status = VMMBindings.cuMemGetAllocationGranularity(
      granularity_ptr,
      prop,
      0 # CU_MEM_ALLOC_GRANULARITY_MINIMUM
    )
    VMMBindings.check_status!(status, "Get allocation granularity")

    granularity = granularity_ptr.read(:size_t)
    mapped_size = ((size + granularity - 1) / granularity) * granularity

    # Reserve virtual address on destination device
    va_ptr_ptr = FFI::MemoryPointer.new(:uint64)
    status = VMMBindings.cuMemAddressReserve(va_ptr_ptr, mapped_size, 0, 0, 0)
    VMMBindings.check_status!(status, "VMM cuMemAddressReserve (import)")
    va_ptr = va_ptr_ptr.read_uint64

    # Map
    status = VMMBindings.cuMemMap(va_ptr, mapped_size, 0, alloc_handle, 0)
    VMMBindings.check_status!(status, "VMM cuMemMap (import)")
    mapped = true

    # Set access for destination device
    access_desc = VMMBindings.create_access_desc(device_id: @dst_device, read_write: true)
    status = VMMBindings.cuMemSetAccess(va_ptr, mapped_size, access_desc, 1)
    VMMBindings.check_status!(status, "VMM cuMemSetAccess (import)")

    # Cache for cleanup (size is the size actually mapped)
    @imported_handles[win32_handle.address] = {
      alloc_handle: alloc_handle,
      va_ptr: va_ptr,
      size: mapped_size
    }
    completed = true
  ensure
    # Roll back in reverse order of acquisition when a step failed.
    unless completed
      VMMBindings.cuMemUnmap(va_ptr, mapped_size) if mapped
      VMMBindings.cuMemAddressFree(va_ptr, mapped_size) if va_ptr
      VMMBindings.cuMemRelease(alloc_handle)
    end
  end

  FFI::Pointer.new(:uint8, va_ptr)
end
|
|
163
|
+
|
|
164
|
+
# Close an imported handle: unmap it, free its address range and release the
# imported allocation handle, then drop it from the bookkeeping table.
#
# Does nothing when the transport has not been initialized or when no
# imported mapping matches +device_ptr+. Driver status codes are ignored
# (best-effort teardown), as before.
#
# @param device_ptr [FFI::Pointer] The mapped device pointer
# @return [void]
def close_imported_handle(device_ptr)
  # initialize! may never have run; there is nothing to close in that case.
  return if @imported_handles.nil?

  va_ptr = device_ptr.address

  # Look the entry up first so the table is not modified while iterating it.
  handle_addr, info = @imported_handles.find { |_addr, entry| entry[:va_ptr] == va_ptr }
  return unless info

  VMMBindings.cuMemUnmap(va_ptr, info[:size])
  VMMBindings.cuMemAddressFree(va_ptr, info[:size])
  VMMBindings.cuMemRelease(info[:alloc_handle])

  @imported_handles.delete(handle_addr)
  nil
end
|
|
187
|
+
|
|
188
|
+
# Enqueue a device-to-device copy between two buffers that are both mapped
# in this process (i.e. after the allocation has been exported/imported).
#
# @param dst_buffer [FFI::Pointer] Destination (on dst_device)
# @param src_buffer [FFI::Pointer] Source (on src_device)
# @param size [Integer] Size in bytes
# @param stream [FFI::Pointer] CUDA stream
# @return [Object] result of CUDA::RuntimeAPI.check_status!
def copy_async(dst_buffer, src_buffer, size, stream)
  ensure_initialized!

  # Once both sides have mapped the same allocation, a plain
  # cudaMemcpyAsync is sufficient.
  runtime = CUDA::RuntimeAPI
  runtime.ensure_loaded!

  copy_status = runtime.cudaMemcpyAsync(
    dst_buffer, src_buffer, size, runtime::MEMCPY_DEVICE_TO_DEVICE, stream
  )
  runtime.check_status!(copy_status, "VMM IPC copy")
end
|
|
209
|
+
|
|
210
|
+
# Async send is not provided by this transport: callers are expected to
# create a shareable allocation and then use copy_async.
#
# @param buffer [FFI::Pointer] Source buffer (unused)
# @param size [Integer] Size in bytes (unused)
# @param stream [FFI::Pointer] CUDA stream (unused)
# @raise [NotImplementedError] always
def send_async(buffer, size, stream)
  hint = "Use create_shareable_allocation and copy_async"
  raise(NotImplementedError, hint)
end
|
|
220
|
+
|
|
221
|
+
# Async receive is not provided by this transport: callers are expected to
# import the shareable allocation and then use copy_async.
#
# @param buffer [FFI::Pointer] Destination buffer (unused)
# @param size [Integer] Size (unused)
# @param stream [FFI::Pointer] CUDA stream (unused)
# @raise [NotImplementedError] always
def recv_async(buffer, size, stream)
  hint = "Use import_shareable_allocation and copy_async"
  raise(NotImplementedError, hint)
end
|
|
229
|
+
|
|
230
|
+
# Probe whether the VMM driver bindings can be loaded.
#
# @return [Boolean] true when VMMBindings.ensure_loaded! succeeds, false
#   when it raises LoadError
def self.available?
  VMMBindings.ensure_loaded!
  true
rescue LoadError
  false
end
|
|
240
|
+
|
|
241
|
+
# Clean up all handles: unmap, free and release every exported and imported
# allocation that this transport is tracking, then mark it uninitialized.
#
# Teardown is best-effort: an error raised by one driver call is swallowed
# so the remaining calls still run. Safe to call when initialize! never ran
# (the bookkeeping tables are then nil and are simply skipped).
#
# @return [void]
def destroy!
  # Runs the three teardown calls for one mapping, ignoring StandardError
  # from each call individually.
  release = lambda do |va_ptr, size, alloc_handle|
    [
      -> { VMMBindings.cuMemUnmap(va_ptr, size) },
      -> { VMMBindings.cuMemAddressFree(va_ptr, size) },
      -> { VMMBindings.cuMemRelease(alloc_handle) }
    ].each do |step|
      begin
        step.call
      rescue StandardError
        nil
      end
    end
  end

  # Cleanup exported handles
  # Note: win32_handle should be closed by caller
  (@exported_handles || {}).each do |va_ptr, info|
    release.call(va_ptr, info[:size], info[:alloc_handle])
  end
  @exported_handles&.clear

  # Cleanup imported handles
  (@imported_handles || {}).each_value do |info|
    release.call(info[:va_ptr], info[:size], info[:alloc_handle])
  end
  @imported_handles&.clear

  @initialized = false
end
|
|
263
|
+
end
|
|
264
|
+
end
|
|
265
|
+
end
|
|
266
|
+
end
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "topology"
|
|
4
|
+
require_relative "transport/base"
|
|
5
|
+
require_relative "transport/p2p_transport"
|
|
6
|
+
require_relative "transport/ipc_transport"
|
|
7
|
+
|
|
8
|
+
module Ignis
|
|
9
|
+
module Collective
|
|
10
|
+
# Automatic transport selector - NCCL-style
|
|
11
|
+
# Detects topology at init time and selects optimal transport for each GPU pair
|
|
12
|
+
class TransportSelector
|
|
13
|
+
# Transport types ranked by performance (highest first):
#   :nvlink       - NVLink, 900 GB/s
#   :pcie_p2p     - PCIe P2P, 32 GB/s
#   :cuda_vmm_ipc - cuMem VMM IPC, 25 GB/s
#   :cuda_ipc     - Legacy CUDA IPC, 20 GB/s
#   :host_staged  - Host staging, 12 GB/s
#   :rio_network  - Windows RIO, 100 Gbps
#   :tcp          - TCP fallback, variable
TRANSPORT_PRIORITY = %i[
  nvlink
  pcie_p2p
  cuda_vmm_ipc
  cuda_ipc
  host_staged
  rio_network
  tcp
].freeze

# Map interconnect types to transport classes. A nil entry means no
# transport class is wired up for that interconnect in this table yet.
TRANSPORT_CLASSES = {
  nvlink: Transport::P2PTransport,
  pcie_p2p: Transport::P2PTransport,
  cuda_ipc: Transport::IPCTransport,
  cuda_vmm_ipc: Transport::IPCTransport,
  host_staged: nil, # TODO: Implement SHMTransport
  rio_network: nil, # TODO: Implement RIOTransport
  tcp: nil          # TODO: Implement TCPTransport
}.freeze

# @!attribute [r] device_ids
#   @return [Array<Integer>] GPU device IDs in this communicator
# @!attribute [r] topology
#   @return [Topology::Detector] Topology detector
# @!attribute [r] transport_matrix
#   @return [Hash<Array<Integer>, Transport::Base>] Transport matrix
attr_reader :device_ids, :topology, :transport_matrix
|
|
43
|
+
|
|
44
|
+
# Build a selector for the given GPUs. No transports are created here;
# that happens in initialize!.
#
# @param device_ids [Array<Integer>] GPU device IDs (copied and frozen)
def initialize(device_ids)
  @initialized = false
  @transport_matrix = {}

  @device_ids = device_ids.dup.freeze
  @topology = Topology::Detector.new(device_ids: @device_ids)
end
|
|
52
|
+
|
|
53
|
+
# Build the transport matrix and initialize every transport in it.
# A second call after a successful run does nothing.
#
# @return [void]
def initialize!
  unless @initialized
    build_transport_matrix!
    initialize_transports!
    @initialized = true
  end
end
|
|
62
|
+
|
|
63
|
+
# Look up the transport chosen for a (source, destination) GPU pair.
#
# @param src [Integer] Source GPU
# @param dst [Integer] Destination GPU
# @return [Transport::Base, nil] The transport for the pair, or nil when
#   src and dst are the same GPU or the pair is not in the matrix
def select_transport(src, dst)
  src == dst ? nil : @transport_matrix[[src, dst]]
end
|
|
72
|
+
|
|
73
|
+
# Report the transport type symbol of the transport selected for a pair.
#
# @param src [Integer] Source GPU
# @param dst [Integer] Destination GPU
# @return [Symbol, nil] The selected transport class's transport_type, or
#   nil when no transport is selected for the pair
def transport_type(src, dst)
  chosen = select_transport(src, dst)
  return nil if chosen.nil?

  chosen.class.transport_type
end
|
|
81
|
+
|
|
82
|
+
# Ring ordering for the ring algorithm, as computed by the topology
# detector (this method only delegates).
#
# @return [Array<Integer>] Ordered GPU IDs for ring algorithm
def optimal_ring_order
  detector = @topology
  detector.optimal_ring_order
end
|
|
87
|
+
|
|
88
|
+
# Whether the selector has been initialized and every transport in the
# matrix reports ready.
#
# @return [Boolean] True if initialized and all transports are ready
#   (falsy value of @initialized is returned as-is when not initialized)
def ready?
  @initialized && @transport_matrix.each_value.all?(&:ready?)
end
|
|
93
|
+
|
|
94
|
+
# Get performance summary for logging.
#
# Each transport in the matrix is classified as NVLink, P2P, IPC or
# "staged" (anything that is neither a P2PTransport nor an IPCTransport).
# The average bandwidth is computed in floating point, so integer
# estimated_bandwidth values are no longer truncated by integer division;
# an empty matrix yields 0.0.
#
# @return [Hash] {:total_paths, :nvlink_paths, :p2p_paths, :ipc_paths,
#   :staged_paths, :avg_bandwidth_gbps}
def performance_summary
  transports = @transport_matrix.values
  counts = Hash.new(0)

  transports.each do |transport|
    kind =
      case transport
      when Transport::P2PTransport
        transport.interconnect_type == :nvlink ? :nvlink : :p2p
      when Transport::IPCTransport
        :ipc
      else
        :staged
      end
    counts[kind] += 1
  end

  # Number of ordered (src, dst) pairs with src != dst.
  total = @device_ids.size * (@device_ids.size - 1)

  avg_bandwidth =
    if transports.empty?
      0.0
    else
      transports.sum(0.0, &:estimated_bandwidth) / transports.size
    end

  {
    total_paths: total,
    nvlink_paths: counts[:nvlink],
    p2p_paths: counts[:p2p],
    ipc_paths: counts[:ipc],
    staged_paths: counts[:staged],
    avg_bandwidth_gbps: avg_bandwidth.round(1)
  }
end
|
|
129
|
+
|
|
130
|
+
# Clean up all transports.
#
# Every transport gets its destroy! call even if an earlier one raises, and
# the matrix is always cleared and the selector marked uninitialized. The
# first StandardError encountered is re-raised afterwards so failures are
# still visible to the caller.
#
# @return [void]
# @raise [StandardError] the first error raised by a transport's destroy!
def destroy!
  first_error = nil

  @transport_matrix.each_value do |transport|
    begin
      transport.destroy!
    rescue StandardError => e
      first_error ||= e
    end
  end

  @transport_matrix.clear
  @initialized = false

  raise first_error if first_error
end
|
|
137
|
+
|
|
138
|
+
# @return [String] Human-readable summary
|
|
139
|
+
def to_s
|
|
140
|
+
stats = performance_summary
|
|
141
|
+
"TransportSelector[#{@device_ids.size} GPUs]: " \
|
|
142
|
+
"#{stats[:nvlink_paths]} NVLink, #{stats[:p2p_paths]} P2P, " \
|
|
143
|
+
"#{stats[:ipc_paths]} IPC (avg #{stats[:avg_bandwidth_gbps]} GB/s)"
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
private
|
|
147
|
+
|
|
148
|
+
# Fill @transport_matrix with one transport per ordered (src, dst) pair of
# distinct devices, keyed by [src, dst].
def build_transport_matrix!
  @device_ids.each do |src|
    peers = @device_ids.reject { |candidate| src == candidate }
    peers.each do |dst|
      @transport_matrix[[src, dst]] = create_transport(src, dst)
    end
  end
end
|
|
158
|
+
|
|
159
|
+
# Create appropriate transport for a GPU pair
|
|
160
|
+
# @param src [Integer] Source GPU
|
|
161
|
+
# @param dst [Integer] Destination GPU
|
|
162
|
+
# @return [Transport::Base] Transport instance
|
|
163
|
+
def create_transport(src, dst)
|
|
164
|
+
path = @topology.matrix.path(src, dst)
|
|
165
|
+
interconnect = path&.interconnect_type || :host_staged
|
|
166
|
+
|
|
167
|
+
case interconnect
|
|
168
|
+
when :nvlink
|
|
169
|
+
Transport::P2PTransport.new(
|
|
170
|
+
src_device: src,
|
|
171
|
+
dst_device: dst,
|
|
172
|
+
interconnect_type: :nvlink
|
|
173
|
+
)
|
|
174
|
+
when :pcie_p2p
|
|
175
|
+
Transport::P2PTransport.new(
|
|
176
|
+
src_device: src,
|
|
177
|
+
dst_device: dst,
|
|
178
|
+
interconnect_type: :pcie_p2p
|
|
179
|
+
)
|
|
180
|
+
when :cuda_ipc, :cuda_vmm_ipc
|
|
181
|
+
Transport::IPCTransport.new(
|
|
182
|
+
src_device: src,
|
|
183
|
+
dst_device: dst
|
|
184
|
+
)
|
|
185
|
+
else
|
|
186
|
+
# Fallback to IPC which always works
|
|
187
|
+
Transport::IPCTransport.new(
|
|
188
|
+
src_device: src,
|
|
189
|
+
dst_device: dst
|
|
190
|
+
)
|
|
191
|
+
end
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
# Call initialize! on every transport currently in the matrix.
def initialize_transports!
  @transport_matrix.each_value do |transport|
    transport.initialize!
  end
end
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
end
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ffi"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module Collective
|
|
7
|
+
# cuMem VMM (Virtual Memory Management) API bindings for modern IPC
|
|
8
|
+
# Required for sharing cudaMallocAsync allocations on Windows
|
|
9
|
+
# Uses cuMemExportToShareableHandle with CU_MEM_HANDLE_TYPE_WIN32
|
|
10
|
+
module VMMBindings
|
|
11
|
+
extend FFI::Library
|
|
12
|
+
|
|
13
|
+
# Handle types for shareable handles (CUmemAllocationHandleType)
CU_MEM_HANDLE_TYPE_NONE = 0
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 1
CU_MEM_HANDLE_TYPE_WIN32 = 2
CU_MEM_HANDLE_TYPE_WIN32_KMT = 4

# Memory allocation types (CUmemAllocationType)
CU_MEM_ALLOCATION_TYPE_INVALID = 0
CU_MEM_ALLOCATION_TYPE_PINNED = 1 # Pinned memory, can be shared
# Enum sentinel. The driver header uses 0x7FFFFFFF so the value fits the
# signed 32-bit enum; 0xFFFFFFFF does not fit the :int struct fields.
CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF

# Allocation location types (CUmemLocationType)
CU_MEM_LOCATION_TYPE_INVALID = 0
CU_MEM_LOCATION_TYPE_DEVICE = 1
CU_MEM_LOCATION_TYPE_HOST = 2
CU_MEM_LOCATION_TYPE_HOST_NUMA = 3
CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT = 4
# Enum sentinel; see the note on CU_MEM_ALLOCATION_TYPE_MAX.
CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF

# Memory access flags (CUmemAccess_flags)
CU_MEM_ACCESS_FLAGS_PROT_NONE = 0
CU_MEM_ACCESS_FLAGS_PROT_READ = 1
CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 3
|
|
36
|
+
|
|
37
|
+
# FFI mirror of the driver's CUmemLocation: a location kind plus an id.
class CUmemLocation < FFI::Struct
  layout(
    :type, :int, # one of CU_MEM_LOCATION_TYPE_*
    :id,   :int  # device ordinal when type is CU_MEM_LOCATION_TYPE_DEVICE
  )
end
|
|
42
|
+
|
|
43
|
+
# FFI mirror of CUmemAllocationProp, the property block passed to
# cuMemCreate and cuMemGetAllocationGranularity.
#
# NOTE(review): in cuda.h the trailing allocFlags member appears to be an
# 8-byte nested struct (compressionType, gpuDirectRDMACapable, usage,
# reserved[4]); here it is modelled as a uint16 followed by 14 padding
# bytes. Confirm the layout against the targeted driver header.
class CUmemAllocationProp < FFI::Struct
  layout(
    :type,                 :int,          # CU_MEM_ALLOCATION_TYPE_*
    :requestedHandleTypes, :int,          # OR of CU_MEM_HANDLE_TYPE_*
    :location,             CUmemLocation, # Memory location
    :win32HandleMetaData,  :pointer,      # LPSECURITY_ATTRIBUTES for Win32
    :allocFlags,           :uint16,       # Reserved flags
    :reserved,             [:uint8, 14]   # Padding
  )
end
|
|
52
|
+
|
|
53
|
+
# FFI mirror of CUmemAccessDesc, the descriptor passed to cuMemSetAccess:
# which location gets access, and with what protection.
class CUmemAccessDesc < FFI::Struct
  layout(
    :location, CUmemLocation, # Target device/host
    :flags,    :int           # CU_MEM_ACCESS_FLAGS_*
  )
end

# CUmemGenericAllocationHandle (unsigned long long)
typedef(:uint64, :CUmemGenericAllocationHandle)
|
|
61
|
+
|
|
62
|
+
# Load the CUDA driver library and attach the VMM functions, once.
#
# The preferred library name comes from Ignis::Platform when that constant
# is defined, otherwise from RUBY_PLATFORM; "nvcuda" and "libcuda" are
# always passed to ffi_lib as additional candidates.
#
# @return [void]
# @raise [LoadError] when FFI::NotFoundError is raised while loading
def self.ensure_loaded!
  return if @loaded

  driver_lib =
    if defined?(Ignis::Platform)
      platform = Ignis::Platform
      platform.find_cuda_lib(:cuda_driver) || (platform.windows? ? 'nvcuda' : 'libcuda')
    elsif RUBY_PLATFORM =~ /mswin|mingw|cygwin/
      'nvcuda'
    else
      'libcuda'
    end

  ffi_lib [driver_lib, "nvcuda", "libcuda"]
  attach_vmm_functions!
  @loaded = true
rescue FFI::NotFoundError => e
  raise LoadError, "Could not load CUDA driver library: #{e.message}"
end
|
|
79
|
+
|
|
80
|
+
# Attach the cuMem* driver entry points used for VMM-based IPC.
# Every function is declared as returning :int (the driver status code).
# Argument lists follow the C prototypes noted beside each entry.
def self.attach_vmm_functions!
  signatures = {
    # (CUmemGenericAllocationHandle* handle, size_t size,
    #  const CUmemAllocationProp* prop, flags - must be 0)
    cuMemCreate: %i[pointer size_t pointer uint64],
    # (size_t* granularity, const CUmemAllocationProp* prop,
    #  CUmemAllocationGranularity_flags option)
    cuMemGetAllocationGranularity: %i[pointer pointer int],
    # (CUmemGenericAllocationHandle handle)
    cuMemRelease: %i[uint64],
    # (void* shareableHandle - HANDLE* or int*, handle,
    #  CUmemAllocationHandleType handleType, flags - must be 0)
    cuMemExportToShareableHandle: %i[pointer uint64 int uint64],
    # (CUmemGenericAllocationHandle* handle, void* osHandle - HANDLE or int,
    #  CUmemAllocationHandleType handleType)
    cuMemImportFromShareableHandle: %i[pointer pointer int],
    # (CUdeviceptr* ptr, size_t size, size_t alignment - 0 = default,
    #  addr - 0 = any, flags - must be 0)
    cuMemAddressReserve: %i[pointer size_t size_t uint64 uint64],
    # (CUdeviceptr ptr, size_t size)
    cuMemAddressFree: %i[uint64 size_t],
    # (CUdeviceptr ptr, size_t size, size_t offset, handle, flags - must be 0)
    cuMemMap: %i[uint64 size_t size_t uint64 uint64],
    # (CUdeviceptr ptr, size_t size)
    cuMemUnmap: %i[uint64 size_t],
    # (CUdeviceptr ptr, size_t size, const CUmemAccessDesc* desc,
    #  size_t count - number of descriptors)
    cuMemSetAccess: %i[uint64 size_t pointer size_t],
    # (CUmemAllocationProp* prop, CUmemGenericAllocationHandle handle)
    cuMemGetAllocationPropertiesFromHandle: %i[pointer uint64]
  }

  # Attach in declaration order; the value of the final attach is returned.
  signatures.map { |name, arg_types| attach_function name, arg_types, :int }.last
end
|
|
160
|
+
|
|
161
|
+
# Raise when a driver call reported a non-zero status.
#
# The numeric driver code is reported as-is: cuGetErrorName is not attached
# in this module, so it cannot be used here to resolve a symbolic name.
#
# @param status [Integer] CUDA driver error code (0 is CUDA_SUCCESS)
# @param context [String] Operation description, prefixed to the message
# @return [nil] when status is zero
# @raise [CudaRuntimeError] when status is non-zero
def self.check_status!(status, context = "VMM operation")
  unless status.zero?
    raise CudaRuntimeError.new("#{context}: CUDA driver error #{status}", cuda_code: status)
  end
end
|
|
172
|
+
|
|
173
|
+
# Helper: build a CUmemAllocationProp describing pinned memory on a device.
#
# When +shareable+ is true the requested handle type is WIN32 on Windows
# and POSIX file descriptor elsewhere (platform taken from Ignis::Platform
# when defined, otherwise from RUBY_PLATFORM); otherwise it is 0.
#
# @param device_id [Integer] Target GPU device
# @param shareable [Boolean] Whether to allow IPC sharing
# @return [CUmemAllocationProp] Allocation properties
def self.create_allocation_prop(device_id:, shareable: true)
  handle_types = 0
  if shareable
    on_windows =
      if defined?(Ignis::Platform)
        Ignis::Platform.windows?
      else
        RUBY_PLATFORM =~ /mswin|mingw|cygwin/
      end
    handle_types = on_windows ? CU_MEM_HANDLE_TYPE_WIN32 : CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
  end

  CUmemAllocationProp.new.tap do |prop|
    prop[:type] = CU_MEM_ALLOCATION_TYPE_PINNED
    prop[:requestedHandleTypes] = handle_types
    prop[:location][:type] = CU_MEM_LOCATION_TYPE_DEVICE
    prop[:location][:id] = device_id
    prop[:win32HandleMetaData] = FFI::Pointer::NULL
    prop[:allocFlags] = 0
  end
end
|
|
198
|
+
|
|
199
|
+
# Helper: build a CUmemAccessDesc granting a device access to a mapping.
#
# @param device_id [Integer] Target device
# @param read_write [Boolean] true for read-write access, false for
#   read-only access
# @return [CUmemAccessDesc] Access descriptor
def self.create_access_desc(device_id:, read_write: true)
  protection =
    if read_write
      CU_MEM_ACCESS_FLAGS_PROT_READWRITE
    else
      CU_MEM_ACCESS_FLAGS_PROT_READ
    end

  CUmemAccessDesc.new.tap do |desc|
    desc[:location][:type] = CU_MEM_LOCATION_TYPE_DEVICE
    desc[:location][:id] = device_id
    desc[:flags] = protection
  end
end
|
|
210
|
+
end
|
|
211
|
+
end
|
|
212
|
+
end
|