ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base"
|
|
4
|
+
require "socket"
|
|
5
|
+
|
|
6
|
+
module Ignis
|
|
7
|
+
module Collective
|
|
8
|
+
module Transport
|
|
9
|
+
# TCP Fallback Transport
|
|
10
|
+
# Generic TCP transport for multi-node communication
|
|
11
|
+
# Used when higher-performance transports are unavailable
|
|
12
|
+
#
|
|
13
|
+
# @note This is the lowest-performance multi-node option
|
|
14
|
+
# but works on any network configuration
|
|
15
|
+
#
|
|
16
|
+
# @example Create TCP transport
|
|
17
|
+
# transport = TCPTransport.new(
|
|
18
|
+
# local_addr: "192.168.1.100",
|
|
19
|
+
# local_port: 50000,
|
|
20
|
+
# remote_addr: "192.168.1.101",
|
|
21
|
+
# remote_port: 50000,
|
|
22
|
+
# mode: :client # or :server
|
|
23
|
+
# )
|
|
24
|
+
#
|
|
25
|
+
class TCPTransport < Base
|
|
26
|
+
# Default buffer size for staging
|
|
27
|
+
DEFAULT_BUFFER_SIZE = 16 * 1024 * 1024 # 16 MB
|
|
28
|
+
|
|
29
|
+
# Connection timeout in seconds
|
|
30
|
+
CONNECT_TIMEOUT = 30
|
|
31
|
+
|
|
32
|
+
# @return [String] Local address
|
|
33
|
+
attr_reader :local_addr
|
|
34
|
+
|
|
35
|
+
# @return [Integer] Local port
|
|
36
|
+
attr_reader :local_port
|
|
37
|
+
|
|
38
|
+
# @return [String] Remote address
|
|
39
|
+
attr_reader :remote_addr
|
|
40
|
+
|
|
41
|
+
# @return [Integer] Remote port
|
|
42
|
+
attr_reader :remote_port
|
|
43
|
+
|
|
44
|
+
# @return [Symbol] Mode (:client or :server)
|
|
45
|
+
attr_reader :mode
|
|
46
|
+
|
|
47
|
+
# Identifier of this transport implementation.
#
# @return [Symbol] always +:tcp+
def self.transport_type
  :tcp
end
|
|
51
|
+
|
|
52
|
+
# Records the endpoint configuration and resets all runtime state.
# Sockets and staging buffers start out as nil here.
#
# @param local_addr [String] Local IP address
# @param local_port [Integer] Local port
# @param remote_addr [String] Remote IP address
# @param remote_port [Integer] Remote port
# @param mode [Symbol] :client or :server
# @param buffer_size [Integer] Staging buffer size
def initialize(local_addr:, local_port:, remote_addr:, remote_port:,
               mode: :client, buffer_size: DEFAULT_BUFFER_SIZE)
  super(src_device: 0, dst_device: 0)

  # Endpoint configuration
  @local_addr, @local_port = local_addr, local_port
  @remote_addr, @remote_port = remote_addr, remote_port
  @mode = mode
  @buffer_size = buffer_size

  # Runtime state
  @socket = @server_socket = nil
  @send_buffer = @recv_buffer = nil
  @initialized = false
  @mutex = Mutex.new
end
|
|
74
|
+
|
|
75
|
+
# Initialize TCP transport: allocate the staging buffers, then either
# listen for a peer (:server mode) or connect to one (any other mode).
#
# If establishing the connection fails, everything acquired so far
# (sockets, registered buffers) is released before the error is re-raised.
# Without that rollback the resources would leak, because #destroy! does
# nothing while @initialized is still false.
#
# @return [void]
# @raise [StandardError] whatever the connection step raised
def initialize!
  return if @initialized

  allocate_buffers!

  begin
    if @mode == :server
      start_server!
    else
      connect_to_server!
    end
  rescue StandardError
    # Roll back the partial setup so a later retry starts from a clean slate.
    @socket&.close
    @server_socket&.close
    free_buffers!
    @socket = @server_socket = nil
    @send_buffer = @recv_buffer = nil
    raise
  end

  @initialized = true
end
|
|
90
|
+
|
|
91
|
+
# Whether the transport can currently move data.
#
# @return [Boolean, nil] true only when the transport is initialized and
#   holds a socket that is not closed; otherwise a falsy value
def ready?
  return @initialized unless @initialized
  return @socket unless @socket

  !@socket.closed?
end
|
|
96
|
+
|
|
97
|
+
# Send data to remote.
#
# The payload is copied from the GPU into the host staging buffer, then
# written to the socket as an 8-byte size header followed by the bytes.
#
# @param src_ptr [FFI::Pointer] Source GPU buffer pointer
# @param size [Integer] Size in bytes
# @param stream [FFI::Pointer, nil] CUDA stream
# @return [Boolean] Success
# @raise [TransportError] if the transport is not initialized, the payload
#   does not fit the staging buffer, or the socket accepts no bytes
def send(src_ptr, size, stream: nil)
  ensure_initialized!

  # stage_to_host copies `size` bytes into the @buffer_size staging buffer,
  # so anything larger would overflow it. Chunked staging is not implemented,
  # hence the hard failure.
  if size > @buffer_size
    raise TransportError,
          "TCP transfer of #{size} bytes exceeds the #{@buffer_size}-byte host staging " \
          "buffer (chunked staging not implemented); increase buffer_size or chunk the transfer"
  end

  @mutex.synchronize do
    stage_to_host(src_ptr, size, stream)
    send_header(size)

    # The socket may accept fewer bytes than offered; keep going from the
    # first unsent byte until the whole payload is out.
    offset = 0
    until offset >= size
      remaining = size - offset
      length = remaining < @buffer_size ? remaining : @buffer_size
      written = @socket.send(@send_buffer.get_bytes(offset, length), 0)
      raise TransportError, "TCP send failed" if written <= 0

      offset += written
    end

    true
  end
end
|
|
134
|
+
|
|
135
|
+
# Receive data from remote.
#
# Reads the 8-byte size header, collects that many bytes into the host
# staging buffer and copies them to the GPU destination.
#
# @param dst_ptr [FFI::Pointer] Destination GPU buffer pointer
# @param size [Integer] Expected size in bytes; payloads larger than this
#   are rejected
# @param stream [FFI::Pointer, nil] CUDA stream
# @return [Integer] Bytes received
# @raise [TransportError] if the transport is not initialized, the announced
#   size exceeds the staging buffer or +size+, or the socket yields no data
def recv(dst_ptr, size, stream: nil)
  ensure_initialized!
  @mutex.synchronize do
    actual_size = recv_header

    # The header size comes from the peer; a malformed/hostile peer could
    # announce a size larger than the staging buffer, and put_bytes below
    # would then write past it (heap overflow). Bound it.
    if actual_size > @buffer_size
      raise TransportError,
            "TCP recv header announces #{actual_size} bytes, exceeding the " \
            "#{@buffer_size}-byte staging buffer (refusing to overflow)"
    end

    # Same reasoning for the destination: unstage_from_host copies
    # actual_size bytes to dst_ptr, so it must not exceed what the caller
    # said it expects.
    if actual_size > size
      raise TransportError,
            "TCP recv header announces #{actual_size} bytes, exceeding the " \
            "#{size}-byte destination buffer (refusing to overflow)"
    end

    # Receive data in chunks; recv may return fewer bytes than requested.
    bytes_received = 0
    while bytes_received < actual_size
      chunk_size = [actual_size - bytes_received, @buffer_size].min
      data = @socket.recv(chunk_size)
      raise TransportError, "TCP recv failed" if data.nil? || data.empty?

      @recv_buffer.put_bytes(bytes_received, data)
      bytes_received += data.bytesize
    end

    # Copy from host buffer to GPU
    unstage_from_host(dst_ptr, actual_size, stream)

    actual_size
  end
end
|
|
173
|
+
|
|
174
|
+
# Nominal bandwidth figure for this transport, in GB/s.
#
# @return [Float] 1.25, i.e. a ~10 Gbps link expressed in gigabytes per second
def estimated_bandwidth
  10.0 / 8 # ~10 Gbps typical for TCP
end
|
|
179
|
+
|
|
180
|
+
# Release the sockets and staging buffers of an initialized transport.
# Does nothing when the transport is not initialized.
#
# @return [void]
def destroy!
  if @initialized
    # Connection first, then the listener (either may be nil).
    [@socket, @server_socket].each { |io| io&.close }
    free_buffers!

    @socket = @server_socket = nil
    @initialized = false
  end
end
|
|
193
|
+
|
|
194
|
+
# Human-readable summary: both endpoints plus the connection state
# reported by #ready?.
#
# @return [String]
def to_s
  state = ready? ? "connected" : "disconnected"
  endpoints = "#{@local_addr}:#{@local_port} <-> #{@remote_addr}:#{@remote_port}"

  "TCPTransport[#{endpoints}, #{state}]"
end
|
|
199
|
+
|
|
200
|
+
private
|
|
201
|
+
|
|
202
|
+
# Guard used by the data-path methods.
#
# @return [nil] when the transport is initialized
# @raise [TransportError] when it is not
def ensure_initialized!
  return if @initialized

  raise TransportError, "TCP transport not initialized"
end
|
|
205
|
+
|
|
206
|
+
# Allocate the host staging buffers and try to pin them for GPU transfers.
#
# Allocation errors propagate to the caller: without buffers the transport
# cannot work, and swallowing them would leave @send_buffer/@recv_buffer nil
# behind an apparently initialized transport. Pinning is best-effort only;
# a failure there is logged and the unpinned buffers are used as-is.
#
# @return [void]
def allocate_buffers!
  @send_buffer = FFI::MemoryPointer.new(:char, @buffer_size)
  @recv_buffer = FFI::MemoryPointer.new(:char, @buffer_size)

  begin
    # Pin for faster GPU transfers
    CUDA::RuntimeAPI.cudaHostRegister(@send_buffer, @buffer_size, 0)
    CUDA::RuntimeAPI.cudaHostRegister(@recv_buffer, @buffer_size, 0)
  rescue StandardError => e
    Ignis.logger.warn { "Failed to pin TCP buffers: #{e.message}" }
  end
end
|
|
217
|
+
|
|
218
|
+
# Unregister whichever staging buffers exist. Each buffer is handled on its
# own so that an error for one does not skip the other; errors are ignored
# because this runs during cleanup.
#
# @return [nil]
def free_buffers!
  [@send_buffer, @recv_buffer].each do |buffer|
    next unless buffer

    begin
      CUDA::RuntimeAPI.cudaHostUnregister(buffer)
    rescue StandardError
      # Ignore unregister errors during cleanup
    end
  end

  nil
end
|
|
224
|
+
|
|
225
|
+
# Listen on the local endpoint and accept a single peer connection,
# waiting at most CONNECT_TIMEOUT seconds.
#
# If no peer arrives in time, or accepting/configuring the connection
# fails, both sockets are closed before the error is re-raised so the
# listening port is not left bound.
#
# @return [void]
# @raise [TransportError] when no connection arrives within the timeout
def start_server!
  @server_socket = TCPServer.new(@local_addr, @local_port)

  begin
    @server_socket.setsockopt(Socket::SOL_SOCKET, Socket::SO_REUSEADDR, true)
    @server_socket.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true)

    Ignis.logger.info { "TCP transport listening on #{@local_addr}:#{@local_port}" }

    # Accept connection with timeout
    ready = IO.select([@server_socket], nil, nil, CONNECT_TIMEOUT)
    raise TransportError, "Timeout waiting for connection" unless ready

    @socket = @server_socket.accept
    configure_socket!
  rescue StandardError
    @socket&.close
    @server_socket.close
    @socket = @server_socket = nil
    raise
  end

  Ignis.logger.info { "TCP transport accepted connection from #{@socket.peeraddr[2]}" }
end
|
|
241
|
+
|
|
242
|
+
# Connect to the remote endpoint, giving up after CONNECT_TIMEOUT seconds
# instead of waiting for the operating system's default connect timeout.
#
# @return [void]
# @raise [TransportError] when the connection is refused or times out
def connect_to_server!
  @socket = Socket.tcp(@remote_addr, @remote_port, connect_timeout: CONNECT_TIMEOUT)
  configure_socket!

  Ignis.logger.info { "TCP transport connected to #{@remote_addr}:#{@remote_port}" }
rescue Errno::ECONNREFUSED => e
  raise TransportError, "Connection refused: #{e.message}"
rescue Errno::ETIMEDOUT => e
  raise TransportError, "Connection timeout: #{e.message}"
rescue StandardError => e
  # Some Ruby versions report a user-specified connect timeout as
  # IO::TimeoutError; the constant does not exist on older versions, hence
  # the defined? check. Anything else is re-raised untouched.
  raise unless defined?(IO::TimeoutError) && e.is_a?(IO::TimeoutError)

  raise TransportError, "Connection timeout: #{e.message}"
end
|
|
252
|
+
|
|
253
|
+
# Apply the per-connection socket options: disable Nagle's algorithm and
# request kernel send/receive buffers of @buffer_size bytes.
#
# @return [Object] result of the last setsockopt call
def configure_socket!
  options = [
    [Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true],
    [Socket::SOL_SOCKET, Socket::SO_SNDBUF, @buffer_size],
    [Socket::SOL_SOCKET, Socket::SO_RCVBUF, @buffer_size]
  ]

  options.map { |level, name, value| @socket.setsockopt(level, name, value) }.last
end
|
|
258
|
+
|
|
259
|
+
# Copy +size+ bytes from the GPU source into the host send buffer.
# With a stream the copy is issued asynchronously and the stream is then
# synchronized; without one a plain cudaMemcpy is used.
#
# @param src_ptr [FFI::Pointer] Source GPU buffer pointer
# @param size [Integer] Number of bytes to copy
# @param stream [FFI::Pointer, nil] CUDA stream
# @return [Object] result of the last CUDA runtime call
def stage_to_host(src_ptr, size, stream)
  runtime = CUDA::RuntimeAPI
  return runtime.cudaMemcpy(@send_buffer, src_ptr, size, :device_to_host) unless stream

  runtime.cudaMemcpyAsync(@send_buffer, src_ptr, size, :device_to_host, stream)
  runtime.cudaStreamSynchronize(stream)
end
|
|
267
|
+
|
|
268
|
+
# Copy +size+ bytes from the host receive buffer to the GPU destination.
#
# With a stream the copy is issued asynchronously and the stream is then
# synchronized, mirroring stage_to_host. The synchronization is required
# because @recv_buffer is reused by the next receive; returning while the
# copy is still in flight would let new socket data overwrite bytes the
# copy has not read yet.
#
# @param dst_ptr [FFI::Pointer] Destination GPU buffer pointer
# @param size [Integer] Number of bytes to copy
# @param stream [FFI::Pointer, nil] CUDA stream
# @return [Object] result of the last CUDA runtime call
def unstage_from_host(dst_ptr, size, stream)
  if stream
    CUDA::RuntimeAPI.cudaMemcpyAsync(dst_ptr, @recv_buffer, size, :host_to_device, stream)
    CUDA::RuntimeAPI.cudaStreamSynchronize(stream)
  else
    CUDA::RuntimeAPI.cudaMemcpy(dst_ptr, @recv_buffer, size, :host_to_device)
  end
end
|
|
275
|
+
|
|
276
|
+
# Write the payload size as an 8-byte little-endian unsigned integer.
# Like the payload loop in #send, this keeps sending until every header
# byte has been accepted, since a socket send may be partial.
#
# @param size [Integer] Payload size in bytes
# @return [Integer] number of header bytes written (always 8)
# @raise [TransportError] if the socket accepts no bytes
def send_header(size)
  header = [size].pack("Q<") # 64-bit little-endian
  sent = 0

  while sent < header.bytesize
    written = @socket.send(header.byteslice(sent, header.bytesize - sent), 0)
    raise TransportError, "Failed to send header" if written <= 0

    sent += written
  end

  sent
end
|
|
280
|
+
|
|
281
|
+
# Read the 8-byte little-endian size header.
#
# TCP is a byte stream, so a single recv may legitimately return fewer than
# 8 bytes; keep reading until the header is complete and fail only when the
# socket yields nothing.
#
# @return [Integer] payload size announced by the peer
# @raise [TransportError] if the connection ends before 8 bytes arrive
def recv_header
  header_size = 8
  header = String.new # binary accumulator

  while header.bytesize < header_size
    chunk = @socket.recv(header_size - header.bytesize)
    raise TransportError, "Failed to receive header" if chunk.nil? || chunk.empty?

    header << chunk
  end

  header.unpack1("Q<")
end
|
|
287
|
+
end
|
|
288
|
+
end
|
|
289
|
+
end
|
|
290
|
+
end
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# VMM IPC Structs — FFI::Struct definitions for CUDA Virtual Memory Management
|
|
4
|
+
#
|
|
5
|
+
# Rule 4: FFI structs live in their own file, never mixed with Fiddle hot-path calls.
|
|
6
|
+
# These structs are used by the VMM IPC transport for multi-GPU memory sharing.
|
|
7
|
+
|
|
8
|
+
require "ignis"
|
|
9
|
+
Ignis::Shared::FFILoader.load!
|
|
10
|
+
|
|
11
|
+
module Ignis
|
|
12
|
+
module Collective
|
|
13
|
+
module Transport
|
|
14
|
+
# FFI struct definitions for CUDA VMM (Virtual Memory Management) IPC.
|
|
15
|
+
module VmmIpcStructs
|
|
16
|
+
extend FFI::Library
|
|
17
|
+
|
|
18
|
+
# Resolve the CUDA driver library per platform.
#
# When Ignis::Platform is loaded it is asked first; otherwise (or when it
# finds nothing) the library is referenced by bare name so the system
# loader resolves it. The Windows fallback deliberately uses the bare name
# as well: nvcuda.dll is installed by the display driver, not into a CUDA
# toolkit `bin` directory, so a version-specific toolkit path cannot be
# relied on.
CUDA_DRIVER_LIB = if defined?(Ignis::Platform)
                    Ignis::Platform.find_cuda_lib(:cuda_driver) ||
                      (Ignis::Platform.windows? ? 'nvcuda.dll' : 'libcuda.so.1')
                  elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
                    'nvcuda.dll'
                  else
                    'libcuda.so.1'
                  end
|
|
27
|
+
|
|
28
|
+
# Load the driver library. A library that cannot be loaded is reported on
# stderr rather than aborting the require.
begin
  ffi_lib(CUDA_DRIVER_LIB)
rescue LoadError => load_error
  $stderr.puts "[NvCCL] WARNING: Cannot load #{CUDA_DRIVER_LIB}: #{load_error.message}"
end
|
|
33
|
+
|
|
34
|
+
# CUmemLocationType — where a piece of memory lives
CU_MEM_LOCATION_TYPE_INVALID = 0
CU_MEM_LOCATION_TYPE_DEVICE  = 1
CU_MEM_LOCATION_TYPE_HOST    = 2

# CUmemAllocationType
CU_MEM_ALLOCATION_TYPE_INVALID = 0
CU_MEM_ALLOCATION_TYPE_PINNED  = 1

# CUmemAccess_flags — protection bits
CU_MEM_ACCESS_FLAGS_PROT_NONE      = 0x0
CU_MEM_ACCESS_FLAGS_PROT_READ      = 0x1
CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3

# CUmemAllocationHandleType — bit flags
CU_MEM_HANDLE_TYPE_NONE            = 0x0
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESC = 0x1
CU_MEM_HANDLE_TYPE_WIN32           = 0x2
CU_MEM_HANDLE_TYPE_WIN32_KMT       = 0x4
CU_MEM_HANDLE_TYPE_FABRIC          = 0x8
|
|
54
|
+
|
|
55
|
+
# CUmemLocation — memory location descriptor.
#
# Fields:
#   type - CUmemLocationType
#   id   - device ordinal (for DEVICE type)
class CUmemLocation < FFI::Struct
  layout(
    :type, :int,
    :id,   :int
  )
end
|
|
61
|
+
|
|
62
|
+
# CUmemAccessDesc — memory access descriptor.
#
# Fields:
#   location - embedded CUmemLocation the access applies to
#   flags    - CUmemAccess_flags
class CUmemAccessDesc < FFI::Struct
  layout(
    :location, CUmemLocation,
    :flags,    :uint
  )
end
|
|
68
|
+
|
|
69
|
+
# CUmemAllocationProp — memory allocation properties.
#
# Fields:
#   type                 - CUmemAllocationType
#   requestedHandleTypes - CUmemAllocationHandleType bitmask
#   location             - embedded CUmemLocation
#   win32HandleMetaData  - Windows security descriptor (NULL for default)
#   allocFlags_*         - allocation flag members, declared as flat fields
#   _reserved            - 4 reserved bytes
class CUmemAllocationProp < FFI::Struct
  layout(
    :type,                            :int,
    :requestedHandleTypes,            :int,
    :location,                        CUmemLocation,
    :win32HandleMetaData,             :pointer,
    :allocFlags_compressionType,      :uchar,
    :allocFlags_gpuDirectRDMACapable, :uchar,
    :allocFlags_usage,                :ushort,
    :_reserved,                       [:uchar, 4]
  )
end
|
|
81
|
+
|
|
82
|
+
# CUmemGenericAllocationHandle — opaque handle
|
|
83
|
+
# This is a uint64 (CUmemGenericAllocationHandle) on all platforms.
|
|
84
|
+
# We use :uint64 since FFI doesn't have an opaque handle type.
|
|
85
|
+
|
|
86
|
+
# IPC memory handle for sharing across processes: 64 opaque bytes.
class CUipcMemHandle < FFI::Struct
  layout(:reserved, [:char, 64])
end
|
|
90
|
+
|
|
91
|
+
# IPC event handle: 64 opaque bytes.
class CUipcEventHandle < FFI::Struct
  layout(:reserved, [:char, 64])
end
|
|
95
|
+
|
|
96
|
+
# Helper: build a CUmemLocation of DEVICE type for the given GPU.
#
# @param device_id [Integer] device ordinal stored in the +id+ field
# @return [CUmemLocation]
def self.device_location(device_id)
  CUmemLocation.new.tap do |location|
    location[:type] = CU_MEM_LOCATION_TYPE_DEVICE
    location[:id] = device_id
  end
end
|
|
105
|
+
|
|
106
|
+
# Helper: build a CUmemAccessDesc that grants read-write access to the
# given GPU.
#
# @param device_id [Integer] device ordinal stored in the embedded location
# @return [CUmemAccessDesc]
def self.device_rw_access(device_id)
  access = CUmemAccessDesc.new

  target = access[:location]
  target[:type] = CU_MEM_LOCATION_TYPE_DEVICE
  target[:id] = device_id

  access[:flags] = CU_MEM_ACCESS_FLAGS_PROT_READWRITE
  access
end
|
|
116
|
+
|
|
117
|
+
# Helper: build a CUmemAllocationProp for pinned memory located on the
# given GPU. The requested shareable handle type follows the host platform:
# a Win32 handle on Windows Ruby builds, a POSIX file descriptor elsewhere.
#
# @param device_id [Integer]
# @param rdma_capable [Boolean] whether to request GPU Direct RDMA
# @return [CUmemAllocationProp]
def self.pinned_device_prop(device_id, rdma_capable: false)
  CUmemAllocationProp.new.tap do |prop|
    on_windows = RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)

    prop[:type] = CU_MEM_ALLOCATION_TYPE_PINNED
    prop[:requestedHandleTypes] =
      on_windows ? CU_MEM_HANDLE_TYPE_WIN32 : CU_MEM_HANDLE_TYPE_POSIX_FILE_DESC

    location = prop[:location]
    location[:type] = CU_MEM_LOCATION_TYPE_DEVICE
    location[:id] = device_id

    prop[:win32HandleMetaData] = FFI::Pointer::NULL
    prop[:allocFlags_compressionType] = 0
    prop[:allocFlags_gpuDirectRDMACapable] = rdma_capable ? 1 : 0
    prop[:allocFlags_usage] = 0
  end
end
|
|
139
|
+
|
|
140
|
+
# Bind the VMM and IPC driver functions and record the outcome in
# VMM_AVAILABLE.
#
# Besides FFI::NotFoundError (a symbol missing from the library), LoadError
# is handled too: when the driver library itself could not be loaded,
# attach_function fails with a LoadError, which is not a StandardError and
# would otherwise abort the require of this file.
begin
  # cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags)
  attach_function :cuMemAddressReserve, [:pointer, :size_t, :size_t, :uint64, :uint64], :int

  # cuMemAddressFree(CUdeviceptr ptr, size_t size)
  attach_function :cuMemAddressFree, [:uint64, :size_t], :int

  # cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags)
  attach_function :cuMemCreate, [:pointer, :size_t, :pointer, :uint64], :int

  # cuMemRelease(CUmemGenericAllocationHandle handle)
  attach_function :cuMemRelease, [:uint64], :int

  # cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags)
  attach_function :cuMemMap, [:uint64, :size_t, :size_t, :uint64, :uint64], :int

  # cuMemUnmap(CUdeviceptr ptr, size_t size)
  attach_function :cuMemUnmap, [:uint64, :size_t], :int

  # cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count)
  attach_function :cuMemSetAccess, [:uint64, :size_t, :pointer, :size_t], :int

  # cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option)
  attach_function :cuMemGetAllocationGranularity, [:pointer, :pointer, :int], :int

  # IPC functions
  attach_function :cuIpcGetMemHandle, [:pointer, :uint64], :int
  attach_function :cuIpcOpenMemHandle, [:pointer, CUipcMemHandle.by_value, :uint], :int
  attach_function :cuIpcCloseMemHandle, [:uint64], :int

  attach_function :cuIpcGetEventHandle, [:pointer, :uint64], :int
  attach_function :cuIpcOpenEventHandle, [:pointer, CUipcEventHandle.by_value], :int

  VMM_AVAILABLE = true
rescue FFI::NotFoundError, LoadError => e
  $stderr.puts "[NvCCL] VMM functions not available: #{e.message}"
  VMM_AVAILABLE = false
rescue StandardError
  VMM_AVAILABLE = false
end
|
|
181
|
+
|
|
182
|
+
# Whether the VMM functions were bound successfully.
#
# @return [Boolean, nil] the value of VMM_AVAILABLE, or nil when that
#   constant has not been defined
def self.available?
  return unless defined?(VMM_AVAILABLE)

  VMM_AVAILABLE
end
|
|
186
|
+
end
|
|
187
|
+
end
|
|
188
|
+
end
|
|
189
|
+
end
|