ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module Collective
|
|
5
|
+
module Transport
|
|
6
|
+
# Abstract base class for all transport implementations
|
|
7
|
+
# Each transport handles GPU-to-GPU or GPU-to-network data movement
|
|
8
|
+
class Base
  # Transport type identifier. The base implementation only raises.
  # @return [Symbol] Transport type
  # @raise [NotImplementedError] unless the subclass overrides it
  def self.transport_type
    raise NotImplementedError, "Subclass must define transport_type"
  end

  # Check if this transport is available for the given GPU pair
  # @param src [Integer] Source GPU
  # @param dst [Integer] Destination GPU
  # @return [Boolean] True if available
  # @raise [NotImplementedError] unless the subclass overrides it
  def self.available?(src, dst)
    raise NotImplementedError
  end

  # @return [Integer] Source device ID
  attr_reader :src_device

  # @return [Integer] Destination device ID
  attr_reader :dst_device

  # Records the device pair; the transport starts out not ready.
  # @param src_device [Integer] Source GPU device ID
  # @param dst_device [Integer] Destination GPU device ID
  def initialize(src_device:, dst_device:)
    @src_device = src_device
    @dst_device = dst_device
    @initialized = false
  end

  # Initialize the transport (called once per communicator)
  #
  # NOTE: NotImplementedError descends from ScriptError, not StandardError,
  # so a bare `rescue` will not catch the abstract-method errors raised here.
  # @return [void]
  # @raise [NotImplementedError] unless the subclass overrides it
  def initialize!
    raise NotImplementedError
  end

  # Check if transport is initialized and ready
  # @return [Boolean] True if ready for use
  def ready?
    @initialized
  end

  # Send data asynchronously
  # @param buffer [FFI::Pointer] Device pointer to send
  # @param size [Integer] Bytes to send
  # @param stream [CUDA::Stream, FFI::Pointer] CUDA stream for async execution
  # @return [void]
  # @raise [NotImplementedError] unless the subclass overrides it
  def send_async(buffer, size, stream)
    raise NotImplementedError
  end

  # Receive data asynchronously
  # @param buffer [FFI::Pointer] Device pointer to receive into
  # @param size [Integer] Bytes to receive
  # @param stream [CUDA::Stream, FFI::Pointer] CUDA stream for async execution
  # @return [void]
  # @raise [NotImplementedError] unless the subclass overrides it
  def recv_async(buffer, size, stream)
    raise NotImplementedError
  end

  # Synchronous send: issues send_async on the NULL stream, then waits.
  # @param buffer [FFI::Pointer] Device pointer to send
  # @param size [Integer] Bytes to send
  # @return [void]
  def send_sync(buffer, size)
    send_async(buffer, size, FFI::Pointer::NULL)
    synchronize!
  end

  # Synchronous receive: issues recv_async on the NULL stream, then waits.
  # @param buffer [FFI::Pointer] Device pointer to receive into
  # @param size [Integer] Bytes to receive
  # @return [void]
  def recv_sync(buffer, size)
    recv_async(buffer, size, FFI::Pointer::NULL)
    synchronize!
  end

  # Wait for all pending operations to complete via cudaDeviceSynchronize.
  # @return [void]
  def synchronize!
    CUDA::RuntimeAPI.ensure_loaded!
    status = CUDA::RuntimeAPI.cudaDeviceSynchronize
    CUDA::RuntimeAPI.check_status!(status, "Transport synchronize")
  end

  # Estimated bandwidth in GB/s
  # @return [Float] Bandwidth estimate
  # @raise [NotImplementedError] unless the subclass overrides it
  def estimated_bandwidth
    raise NotImplementedError
  end

  # Estimated latency in microseconds
  # @return [Float] Latency estimate
  # @raise [NotImplementedError] unless the subclass overrides it
  def estimated_latency
    raise NotImplementedError
  end

  # Clean up resources; the base implementation only clears the ready flag.
  # @return [void]
  def destroy!
    @initialized = false
  end

  # @return [String] Human-readable description
  def to_s
    "#{self.class.transport_type}[#{@src_device}→#{@dst_device}]"
  end

  protected

  # Shared guard for operations that require initialize! to have run.
  # Subclasses may override it to raise a more specific message.
  # @return [void]
  # @raise [TransportError] when the transport has not been initialized
  def ensure_initialized!
    raise TransportError, "Transport not initialized" unless @initialized
  end

  # Get stream pointer for FFI calls
  # @param stream [CUDA::Stream, FFI::Pointer, nil] Stream
  # @return [FFI::Pointer] Stream pointer (NULL for nil)
  # @raise [ArgumentError] for any other argument type
  def stream_ptr(stream)
    case stream
    when FFI::Pointer then stream
    when CUDA::Stream then stream.ptr
    when nil then FFI::Pointer::NULL
    else raise ArgumentError, "Invalid stream type: #{stream.class}"
    end
  end
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
end
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module Collective
|
|
7
|
+
module Transport
|
|
8
|
+
# Host-Staged Transport - Fallback when P2P is not available
|
|
9
|
+
#
|
|
10
|
+
# Uses host (pinned) memory as an intermediate staging buffer.
|
|
11
|
+
# Data path: GPU_A -> Host -> GPU_B
|
|
12
|
+
#
|
|
13
|
+
# Slower than P2P but always works regardless of topology.
|
|
14
|
+
# Bandwidth limited by PCIe x2 (upload + download).
|
|
15
|
+
class HostStagedTransport < Base
  # Flag passed to cudaHostAlloc for staging buffers. The numeric value 1 is
  # cudaHostAllocPortable in the CUDA runtime (cudaHostAllocDefault is 0).
  HOST_ALLOC_PORTABLE = 1

  # @return [Symbol] Transport type identifier
  def self.transport_type
    :host_staged
  end

  # Check if host-staged is available (always true).
  # The device arguments are accepted (and ignored) so the method can be
  # called with the same (src, dst) signature as Base.available?.
  # @param _src [Integer, nil] Source GPU (unused)
  # @param _dst [Integer, nil] Destination GPU (unused)
  # @return [Boolean] Always true
  def self.available?(_src = nil, _dst = nil)
    true
  end

  # @return [Float] Estimated bandwidth (GB/s)
  def estimated_bandwidth
    12.0 # PCIe 4.0 x16 / 2 (round trip)
  end

  # @return [Float] Estimated latency (microseconds)
  def estimated_latency
    25.0 # Higher due to double copy
  end

  # Initialize the transport; a second call is a no-op.
  # @return [void]
  def initialize!
    return if @initialized

    CUDA::RuntimeAPI.ensure_loaded!
    @staging_buffers = {} # rounded size -> pinned host buffer
    @initialized = true
  end

  # Copy data via host staging: device-to-host, stream sync, host-to-device.
  #
  # NOTE(review): the staging buffer is cached per rounded size and the final
  # host-to-device copy is only enqueued, so a later call of the same size
  # may reuse the buffer before that copy finishes - confirm callers
  # synchronize between calls.
  # @param dst_buffer [FFI::Pointer] Destination GPU buffer
  # @param src_buffer [FFI::Pointer] Source GPU buffer
  # @param size [Integer] Size in bytes
  # @param stream [CUDA::Stream, FFI::Pointer, nil] CUDA stream (for async ops)
  # @return [void]
  # @raise [TransportError] when the transport has not been initialized
  def copy_async(dst_buffer, src_buffer, size, stream)
    ensure_initialized!

    staging = get_staging_buffer(size)
    raw_stream = stream_ptr(stream)

    # Step 1: GPU_src -> Host (async)
    select_device!(@src_device)
    status = CUDA::RuntimeAPI.cudaMemcpyAsync(
      staging, src_buffer, size,
      CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST, raw_stream
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged D2H copy")

    # The data must be in host memory before the second hop is enqueued
    sync_stream(raw_stream)

    # Step 2: Host -> GPU_dst (async)
    select_device!(@dst_device)
    status = CUDA::RuntimeAPI.cudaMemcpyAsync(
      dst_buffer, staging, size,
      CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE, raw_stream
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged H2D copy")
  end

  # Synchronous copy using blocking cudaMemcpy for both hops.
  # @param dst_buffer [FFI::Pointer] Destination
  # @param src_buffer [FFI::Pointer] Source
  # @param size [Integer] Size in bytes
  # @return [void]
  # @raise [TransportError] when the transport has not been initialized
  def copy_sync(dst_buffer, src_buffer, size)
    ensure_initialized!

    staging = get_staging_buffer(size)

    # GPU_src -> Host
    select_device!(@src_device)
    status = CUDA::RuntimeAPI.cudaMemcpy(
      staging, src_buffer, size, CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged D2H sync")

    # Host -> GPU_dst
    select_device!(@dst_device)
    status = CUDA::RuntimeAPI.cudaMemcpy(
      dst_buffer, staging, size, CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged H2D sync")
  end

  # Async send (GPU to host staging). The copy is only enqueued; the
  # returned buffer holds the data once the stream has been synchronized.
  # @param buffer [FFI::Pointer] Source GPU buffer
  # @param size [Integer] Size in bytes
  # @param stream [CUDA::Stream, FFI::Pointer, nil] CUDA stream
  # @return [FFI::Pointer] Staging buffer the data is copied into
  # @raise [TransportError] when the transport has not been initialized
  def send_async(buffer, size, stream)
    ensure_initialized!

    staging = get_staging_buffer(size)

    select_device!(@src_device)
    status = CUDA::RuntimeAPI.cudaMemcpyAsync(
      staging, buffer, size,
      CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST, stream_ptr(stream)
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged send")

    staging
  end

  # Async receive (host staging to GPU).
  #
  # NOTE: this takes an extra +staging+ argument compared with
  # Base#recv_async, so Base#recv_sync (which passes three arguments)
  # cannot drive this transport.
  # @param buffer [FFI::Pointer] Destination GPU buffer
  # @param staging [FFI::Pointer] Host staging buffer with data
  # @param size [Integer] Size in bytes
  # @param stream [CUDA::Stream, FFI::Pointer, nil] CUDA stream
  # @return [void]
  # @raise [TransportError] when the transport has not been initialized
  def recv_async(buffer, staging, size, stream)
    ensure_initialized!

    select_device!(@dst_device)
    status = CUDA::RuntimeAPI.cudaMemcpyAsync(
      buffer, staging, size,
      CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE, stream_ptr(stream)
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged recv")
  end

  # Clean up staging buffers. Safe to call on a transport that was never
  # initialized; failures while freeing are ignored (best-effort cleanup).
  # @return [void]
  def destroy!
    @staging_buffers&.each_value do |buf|
      CUDA::RuntimeAPI.cudaFreeHost(buf)
    rescue StandardError
      # Ignore cleanup errors
    end
    @staging_buffers&.clear
    super
  end

  private

  # Guard used by every data-path method.
  # @return [void]
  # @raise [TransportError] when initialize! has not been called
  def ensure_initialized!
    raise TransportError, "Host-staged transport not initialized" unless @initialized
  end

  # Make +device+ the current CUDA device, raising on failure.
  # @param device [Integer] Device ID
  # @return [void]
  def select_device!(device)
    status = CUDA::RuntimeAPI.cudaSetDevice(device)
    CUDA::RuntimeAPI.check_status!(status, "Host-staged set device #{device}")
  end

  # Get or allocate a pinned host staging buffer
  # @param size [Integer] Required size
  # @return [FFI::Pointer] Pinned host buffer of at least +size+ bytes
  def get_staging_buffer(size)
    # Round up to a power of 2 so buffers are reused across similar sizes
    rounded_size = next_power_of_2(size)

    cached = @staging_buffers[rounded_size]
    return cached if cached

    ptr_ptr = FFI::MemoryPointer.new(:pointer)
    status = CUDA::RuntimeAPI.cudaHostAlloc(ptr_ptr, rounded_size, HOST_ALLOC_PORTABLE)
    CUDA::RuntimeAPI.check_status!(status, "Alloc pinned staging buffer")

    @staging_buffers[rounded_size] = ptr_ptr.read_pointer
  end

  # Round up to next power of 2 (1 for zero or negative input)
  # @param n [Integer] Value to round
  # @return [Integer] Smallest power of 2 that is >= n
  def next_power_of_2(n)
    return 1 if n <= 0

    1 << (n - 1).bit_length
  end

  # Block until work queued on +stream+ has finished; a NULL stream falls
  # back to a device-wide synchronize.
  # @param stream [FFI::Pointer] Raw stream pointer
  # @return [void]
  def sync_stream(stream)
    status =
      if stream.null?
        CUDA::RuntimeAPI.cudaDeviceSynchronize
      else
        CUDA::RuntimeAPI.cudaStreamSynchronize(stream)
      end
    CUDA::RuntimeAPI.check_status!(status, "Host-staged stream sync")
  end
end
|
|
215
|
+
end
|
|
216
|
+
end
|
|
217
|
+
end
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base"
|
|
4
|
+
require_relative "../p2p_bindings"
|
|
5
|
+
|
|
6
|
+
module Ignis
|
|
7
|
+
module Collective
|
|
8
|
+
module Transport
|
|
9
|
+
# CUDA IPC transport for inter-process GPU memory sharing
|
|
10
|
+
# Uses cudaIpcGetMemHandle/cudaIpcOpenMemHandle for zero-copy cross-process access
|
|
11
|
+
# Supports both legacy IPC and cuMem VMM API
|
|
12
|
+
class IPCTransport < Base
  # @return [Symbol] Transport type identifier
  def self.transport_type
    :cuda_ipc
  end

  # Estimated bandwidth (host memory throughput for handle exchange)
  BANDWIDTH_GBS = 25.0

  # Estimated latency (IPC overhead)
  LATENCY_US = 10.0

  # Handle exchange methods
  EXCHANGE_METHODS = [:named_pipe, :shared_memory, :socket].freeze

  # Size in bytes of a serialized IPC memory handle
  HANDLE_SIZE = 64

  # @return [P2PBindings::CudaIpcMemHandle, nil] Most recently exported
  #   handle, or nil when nothing has been exported yet
  attr_reader :exported_handle

  # @return [Symbol] Handle exchange method
  attr_reader :exchange_method

  # @param src_device [Integer] Source GPU
  # @param dst_device [Integer] Destination GPU
  # @param exchange_method [Symbol] Method for handle exchange
  def initialize(src_device:, dst_device:, exchange_method: :named_pipe)
    super(src_device: src_device, dst_device: dst_device)
    @exchange_method = exchange_method
    @exported_handle = nil
    @exported_handles = {} # device_ptr address -> handle bytes
    @imported_handles = {} # handle bytes -> mapped_ptr
  end

  # Initialize IPC transport; a second call is a no-op.
  # @return [void]
  def initialize!
    return if @initialized

    P2PBindings.ensure_loaded!
    CUDA::RuntimeAPI.ensure_loaded!

    @initialized = true
  end

  # Export GPU memory for sharing with another process
  # @param device_ptr [FFI::Pointer] GPU memory pointer to export
  # @return [String] Binary handle (64 bytes) for transfer to other process
  # @raise [TransportError] when the transport has not been initialized
  def export_handle(device_ptr)
    ensure_initialized!

    # The export is issued with the source device current
    status = CUDA::RuntimeAPI.cudaSetDevice(@src_device)
    CUDA::RuntimeAPI.check_status!(status, "Set device for IPC export")

    handle = P2PBindings::CudaIpcMemHandle.new
    status = P2PBindings.cudaIpcGetMemHandle(handle, device_ptr)
    P2PBindings.check_status!(status, "Get IPC memory handle")

    # Serialize the opaque handle and remember it, keyed by device address
    handle_bytes = handle[:reserved].to_a.pack("C*")
    @exported_handle = handle
    @exported_handles[device_ptr.address] = handle_bytes

    handle_bytes
  end

  # Import GPU memory handle from another process
  # @param handle_bytes [String] 64-byte binary handle
  # @param flags [Integer] IPC memory flags
  # @return [FFI::Pointer] Mapped device pointer usable in this process
  # @raise [TransportError] when the transport has not been initialized
  # @raise [ArgumentError] when handle_bytes is not exactly HANDLE_SIZE bytes
  def import_handle(handle_bytes, flags: P2PBindings::IPC_MEM_LAZY_ENABLE_PEER_ACCESS)
    ensure_initialized!

    # The bytes come from another process; reject anything that would not
    # exactly fill the fixed-size handle array before writing into it.
    unless handle_bytes.bytesize == HANDLE_SIZE
      raise ArgumentError,
            "IPC handle must be #{HANDLE_SIZE} bytes, got #{handle_bytes.bytesize}"
    end

    # The import is issued with the destination device current
    status = CUDA::RuntimeAPI.cudaSetDevice(@dst_device)
    CUDA::RuntimeAPI.check_status!(status, "Set device for IPC import")

    # Reconstruct handle from bytes
    handle = P2PBindings::CudaIpcMemHandle.new
    handle_bytes.each_byte.with_index do |byte, i|
      handle[:reserved][i] = byte
    end

    # Open the handle
    mapped_ptr_ptr = FFI::MemoryPointer.new(:pointer)
    status = P2PBindings.cudaIpcOpenMemHandle(mapped_ptr_ptr, handle, flags)
    P2PBindings.check_status!(status, "Open IPC memory handle")

    mapped_ptr = mapped_ptr_ptr.read_pointer
    @imported_handles[handle_bytes] = mapped_ptr

    mapped_ptr
  end

  # Close an imported handle and forget its mapping
  # @param mapped_ptr [FFI::Pointer] Previously imported pointer
  # @return [void]
  # @raise [TransportError] when the transport has not been initialized
  def close_imported_handle(mapped_ptr)
    ensure_initialized!

    status = P2PBindings.cudaIpcCloseMemHandle(mapped_ptr)
    P2PBindings.check_status!(status, "Close IPC memory handle")

    @imported_handles.delete_if { |_, ptr| ptr == mapped_ptr }
  end

  # Send data using IPC (export phase)
  # In IPC, send means making buffer available to receiver; no bytes are
  # copied here, and +size+ and +stream+ are not used.
  # @param buffer [FFI::Pointer] Buffer to share
  # @param size [Integer] Buffer size (unused)
  # @param stream [Object] CUDA stream (unused)
  # @return [String] Handle bytes to transmit to receiver
  def send_async(buffer, size, stream)
    export_handle(buffer)
  end

  # Receive data using IPC (import phase). Always raises: the receiver needs
  # the sender's handle, which this signature cannot carry.
  # @param buffer [FFI::Pointer] NOT USED
  # @param size [Integer] Expected size (unused)
  # @param stream [Object] CUDA stream (unused)
  # @raise [TransportError] always; use recv_with_handle instead
  def recv_async(buffer, size, stream)
    raise TransportError, "IPC recv requires handle - use recv_with_handle"
  end

  # Receive using provided handle
  # @param handle_bytes [String] Handle from sender
  # @return [FFI::Pointer] Mapped device pointer
  def recv_with_handle(handle_bytes)
    import_handle(handle_bytes)
  end

  # @return [Float] Bandwidth in GB/s
  def estimated_bandwidth
    BANDWIDTH_GBS
  end

  # @return [Float] Latency in microseconds
  def estimated_latency
    LATENCY_US
  end

  # IPC is always available on Windows for same-node communication
  # @param src [Integer] Source GPU
  # @param dst [Integer] Destination GPU
  # @return [Boolean] True (always available)
  def self.available?(src, dst)
    true
  end

  # Clean up all handles. Errors while closing imported mappings are
  # ignored (best-effort cleanup).
  # @return [void]
  def destroy!
    @imported_handles.each_value do |ptr|
      P2PBindings.cudaIpcCloseMemHandle(ptr)
    rescue StandardError
      # Ignore cleanup errors
    end
    @imported_handles.clear
    @exported_handles.clear
    @exported_handle = nil
    super
  end

  # @return [String] Transport description
  def to_s
    "IPC[#{@src_device}→#{@dst_device}, #{@exchange_method}]"
  end

  private

  # @return [void]
  # @raise [TransportError] when initialize! has not been called
  def ensure_initialized!
    raise TransportError, "IPC Transport not initialized" unless @initialized
  end
end
|
|
185
|
+
end
|
|
186
|
+
end
|
|
187
|
+
end
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base"
|
|
4
|
+
require_relative "../p2p_bindings"
|
|
5
|
+
|
|
6
|
+
module Ignis
|
|
7
|
+
module Collective
|
|
8
|
+
module Transport
|
|
9
|
+
# PCIe/NVLink peer-to-peer transport for direct GPU-to-GPU transfers
|
|
10
|
+
# Uses cudaMemcpyPeerAsync for high-bandwidth same-process communication
|
|
11
|
+
class P2PTransport < Base
  # @return [Symbol] Transport type identifier
  def self.transport_type
    :pcie_p2p
  end

  # Estimated bandwidth based on interconnect type
  BANDWIDTH_ESTIMATES = {
    nvlink: 900.0,  # GB/s - NVLink 4.0
    pcie_p2p: 32.0, # GB/s - PCIe Gen4 x16
  }.freeze

  # Estimated latency
  LATENCY_ESTIMATES = {
    nvlink: 1.0,   # microseconds
    pcie_p2p: 5.0, # microseconds
  }.freeze

  # Status returned by cudaDeviceEnablePeerAccess when access was already
  # enabled (cudaErrorPeerAccessAlreadyEnabled)
  PEER_ACCESS_ALREADY_ENABLED = 704

  # @return [Symbol] Actual interconnect type (:nvlink or :pcie_p2p)
  attr_reader :interconnect_type

  # @param src_device [Integer] Source GPU
  # @param dst_device [Integer] Destination GPU
  # @param interconnect_type [Symbol] Detected interconnect type
  def initialize(src_device:, dst_device:, interconnect_type: :pcie_p2p)
    super(src_device: src_device, dst_device: dst_device)
    @interconnect_type = interconnect_type
    @peer_access_enabled = false
  end

  # Initialize P2P transport by enabling peer access from the source device
  # to the destination device; a second call is a no-op.
  # @return [void]
  def initialize!
    return if @initialized

    P2PBindings.ensure_loaded!
    CUDA::RuntimeAPI.ensure_loaded!

    # Peer access is enabled from the perspective of the current device
    status = CUDA::RuntimeAPI.cudaSetDevice(@src_device)
    CUDA::RuntimeAPI.check_status!(status, "Set device #{@src_device}")

    status = P2PBindings.cudaDeviceEnablePeerAccess(@dst_device, 0)

    # "Already enabled" is not a failure; anything else non-zero is
    unless status.zero? || status == PEER_ACCESS_ALREADY_ENABLED
      P2PBindings.check_status!(status, "Enable peer access #{@src_device}→#{@dst_device}")
    end

    # Only claim ownership when this call actually enabled access, so that
    # destroy! does not disable access that was already enabled beforehand.
    @peer_access_enabled = status.zero?
    @initialized = true
  end

  # No-op for P2P: the transfer itself is issued through copy_async, which
  # needs both the source and destination buffers.
  # @param buffer [FFI::Pointer] Source buffer on src_device (unused)
  # @param size [Integer] Bytes to send (unused)
  # @param stream [CUDA::Stream, FFI::Pointer] CUDA stream (unused)
  # @return [nil]
  # @raise [TransportError] when the transport has not been initialized
  def send_async(buffer, size, stream)
    ensure_initialized!

    # This is for the half-duplex ring pattern
    nil
  end

  # Copy from the source GPU to the destination GPU asynchronously using
  # cudaMemcpyPeerAsync.
  # @param dst_buffer [FFI::Pointer] Destination buffer on dst_device
  # @param src_buffer [FFI::Pointer] Source buffer on src_device
  # @param size [Integer] Bytes to copy
  # @param stream [CUDA::Stream, FFI::Pointer, nil] CUDA stream
  # @return [void]
  # @raise [TransportError] when the transport has not been initialized
  def copy_async(dst_buffer, src_buffer, size, stream)
    ensure_initialized!

    status = P2PBindings.cudaMemcpyPeerAsync(
      dst_buffer, @dst_device,
      src_buffer, @src_device,
      size, stream_ptr(stream)
    )
    P2PBindings.check_status!(status, "P2P copy #{@src_device}→#{@dst_device}")
  end

  # No-op placeholder: this signature has no source buffer, so no transfer
  # is issued here. Use copy_async to move data.
  # @param buffer [FFI::Pointer] Destination buffer (unused)
  # @param size [Integer] Bytes to receive (unused)
  # @param stream [CUDA::Stream, FFI::Pointer] CUDA stream (unused)
  # @return [nil]
  def recv_async(buffer, size, stream)
    nil
  end

  # @return [Float] Bandwidth in GB/s (12.0 for an unknown interconnect)
  def estimated_bandwidth
    BANDWIDTH_ESTIMATES[@interconnect_type] || 12.0
  end

  # @return [Float] Latency in microseconds (20.0 for an unknown interconnect)
  def estimated_latency
    LATENCY_ESTIMATES[@interconnect_type] || 20.0
  end

  # Check if P2P is available between two GPUs
  # @param src [Integer] Source GPU
  # @param dst [Integer] Destination GPU
  # @return [Boolean] True if the query succeeded and reported access
  def self.available?(src, dst)
    P2PBindings.ensure_loaded!

    can_access_ptr = FFI::MemoryPointer.new(:int)
    status = P2PBindings.cudaDeviceCanAccessPeer(can_access_ptr, src, dst)
    status.zero? && can_access_ptr.read_int == 1
  end

  # Clean up by disabling peer access, but only if this transport enabled
  # it. The statuses of the cleanup calls are intentionally not checked.
  # @return [void]
  def destroy!
    if @peer_access_enabled
      CUDA::RuntimeAPI.cudaSetDevice(@src_device)
      P2PBindings.cudaDeviceDisablePeerAccess(@dst_device)
      @peer_access_enabled = false
    end
    super
  end

  # @return [String] Transport description
  def to_s
    bw = estimated_bandwidth
    "P2P[#{@src_device}→#{@dst_device}, #{@interconnect_type}, #{bw} GB/s]"
  end

  private

  # @return [void]
  # @raise [TransportError] when initialize! has not been called
  def ensure_initialized!
    raise TransportError, "Transport not initialized" unless @initialized
  end
end
|
|
155
|
+
end
|
|
156
|
+
end
|
|
157
|
+
end
|