ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "base"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module Collective
|
|
7
|
+
module Transport
|
|
8
|
+
# InfiniBand Transport Interface
|
|
9
|
+
# High-speed network transport for HPC clusters
|
|
10
|
+
#
|
|
11
|
+
# @note Requires InfiniBand HCA (Host Channel Adapter) hardware
|
|
12
|
+
# @note Production implementation requires ibverbs library
|
|
13
|
+
#
|
|
14
|
+
# This is an interface definition for InfiniBand transport.
|
|
15
|
+
# The actual implementation requires specialized hardware
|
|
16
|
+
# and the libibverbs library.
|
|
17
|
+
#
|
|
18
|
+
# When hardware is available, this transport provides:
|
|
19
|
+
# - 100-400 Gbps bandwidth
|
|
20
|
+
# - RDMA (Remote Direct Memory Access)
|
|
21
|
+
# - Kernel bypass
|
|
22
|
+
# - GPUDirect RDMA (Linux only)
|
|
23
|
+
#
|
|
24
|
+
class InfiniBandTransport < Base
  # @return [Symbol] Transport type
  def self.transport_type
    :infiniband
  end

  # Check if InfiniBand is available.
  #
  # Delegates to the hardware probe, which is currently a placeholder
  # that always answers +false+.
  #
  # @return [Boolean]
  def self.available?
    check_ib_hardware
  end

  # Builds the transport description and refuses to construct it when no
  # InfiniBand hardware is reported.
  #
  # @param local_lid [Integer] Local LID (Local Identifier)
  # @param remote_lid [Integer] Remote LID
  # @param local_qpn [Integer] Local Queue Pair Number
  # @param remote_qpn [Integer] Remote Queue Pair Number
  # @raise [TransportError] when {.available?} is false
  def initialize(local_lid:, remote_lid:, local_qpn:, remote_qpn:)
    super(src_device: 0, dst_device: 0)
    @local_lid = local_lid
    @remote_lid = remote_lid
    @local_qpn = local_qpn
    @remote_qpn = remote_qpn
    @initialized = false
    raise TransportError, "InfiniBand hardware not available" unless self.class.available?
  end

  # Initialize InfiniBand transport.
  #
  # No-op when already initialized; otherwise always raises because the
  # verbs setup below is not implemented.
  #
  # NOTE: NotImplementedError is a ScriptError, so a bare `rescue` (which
  # only catches StandardError) will not intercept it.
  #
  # @return [void]
  # @raise [NotImplementedError] always, unless already initialized
  def initialize!
    return if @initialized

    # Would initialize:
    # 1. Open IB device (ibv_open_device)
    # 2. Allocate protection domain (ibv_alloc_pd)
    # 3. Create completion queue (ibv_create_cq)
    # 4. Create queue pair (ibv_create_qp)
    # 5. Transition QP to RTS state
    # 6. Exchange QP info with remote

    raise NotImplementedError, "InfiniBand transport requires specialized hardware"
  end

  # Check if ready
  # @return [Boolean]
  def ready?
    @initialized
  end

  # Send data via RDMA.
  #
  # NOTE: this shadows Object#send on instances; use `__send__` or
  # `public_send` for dynamic dispatch on a transport object.
  #
  # @param src_ptr [FFI::Pointer] Source buffer
  # @param size [Integer] Size in bytes
  # @param stream [FFI::Pointer, nil] CUDA stream
  # @return [Boolean]
  # @raise [TransportError] when the transport has not been initialized
  # @raise [NotImplementedError] otherwise (RDMA send is not implemented)
  def send(src_ptr, size, stream: nil)
    ensure_initialized!
    # Would use ibv_post_send with IBV_WR_RDMA_WRITE
    raise NotImplementedError, "InfiniBand RDMA send not implemented"
  end

  # Receive data via RDMA.
  #
  # @param dst_ptr [FFI::Pointer] Destination buffer
  # @param size [Integer] Size in bytes
  # @param stream [FFI::Pointer, nil] CUDA stream
  # @return [Integer]
  # @raise [TransportError] when the transport has not been initialized
  # @raise [NotImplementedError] otherwise (RDMA recv is not implemented)
  def recv(dst_ptr, size, stream: nil)
    ensure_initialized!
    # Would use ibv_post_recv
    raise NotImplementedError, "InfiniBand RDMA recv not implemented"
  end

  # Estimated bandwidth in GB/s
  # @return [Float]
  def estimated_bandwidth
    50.0 # 400 Gbps (NDR InfiniBand) / 8 bits per byte
  end

  # Clean up resources.
  #
  # Only resets the initialized flag; there are no native handles to free yet.
  #
  # @return [void]
  def destroy!
    # Would destroy QP, CQ, PD, close device
    @initialized = false
  end

  # @return [String]
  def to_s
    "InfiniBandTransport[LID #{@local_lid} <-> #{@remote_lid}]"
  end

  # Hardware probe backing {.available?}.
  #
  # A real probe would check for InfiniBand devices:
  # - On Linux: ls /sys/class/infiniband/
  # - On Windows: check for the Mellanox WinOF driver
  #
  # @return [Boolean] always +false+ until a probe is implemented
  def self.check_ib_hardware
    false # InfiniBand not available by default
  end
  # A bare `private` does not affect `def self.` methods, so hide it explicitly.
  private_class_method :check_ib_hardware

  private

  # Guard used by the data-path methods.
  # @raise [TransportError] unless {#initialize!} has completed
  def ensure_initialized!
    raise TransportError, "InfiniBand transport not initialized" unless @initialized
  end
end
|
|
126
|
+
|
|
127
|
+
# NetworkDirect RDMA Transport Interface
|
|
128
|
+
# Microsoft's RDMA abstraction for Windows
|
|
129
|
+
#
|
|
130
|
+
# @note Requires NetworkDirect-capable NIC (RDMA NICs from Mellanox, Chelsio, etc.)
|
|
131
|
+
# @note Unavailable in most environments - requires specialized hardware
|
|
132
|
+
#
|
|
133
|
+
# NetworkDirect provides:
|
|
134
|
+
# - Kernel bypass
|
|
135
|
+
# - Zero-copy transfers
|
|
136
|
+
# - Low latency
|
|
137
|
+
#
|
|
138
|
+
class NetworkDirectTransport < Base
  # @return [Symbol] Transport type
  def self.transport_type
    :network_direct
  end

  # Check if NetworkDirect is available.
  #
  # Delegates to the provider probe, which is currently a placeholder
  # that always answers +false+. A real provider requires a RoCE or
  # iWARP capable NIC.
  #
  # @return [Boolean]
  def self.available?
    check_nd_provider
  end

  # @param local_addr [String] Local address
  # @param remote_addr [String] Remote address
  # @raise [TransportError] when {.available?} is false
  def initialize(local_addr:, remote_addr:)
    super(src_device: 0, dst_device: 0)
    @local_addr = local_addr
    @remote_addr = remote_addr
    @initialized = false
    raise TransportError, "NetworkDirect RDMA not available" unless self.class.available?
  end

  # Initialize NetworkDirect transport.
  #
  # No-op when already initialized (same contract as the InfiniBand
  # transport); otherwise always raises because nothing is implemented.
  #
  # NOTE: NotImplementedError is a ScriptError, so a bare `rescue` will
  # not intercept it.
  #
  # @return [void]
  # @raise [NotImplementedError] always, unless already initialized
  def initialize!
    return if @initialized

    raise NotImplementedError, "NetworkDirect requires specialized RDMA hardware"
  end

  # Check if ready
  # @return [Boolean]
  def ready?
    @initialized
  end

  # Send data (not implemented).
  #
  # NOTE: this shadows Object#send on instances; use `__send__` or
  # `public_send` for dynamic dispatch on a transport object.
  #
  # @param src_ptr [Object] Source buffer (currently unused)
  # @param size [Integer] Size in bytes (currently unused)
  # @param stream [Object, nil] Stream handle (currently unused)
  # @return [Boolean]
  # @raise [NotImplementedError] always
  def send(src_ptr, size, stream: nil)
    raise NotImplementedError, "NetworkDirect not implemented"
  end

  # Receive data (not implemented).
  #
  # @param dst_ptr [Object] Destination buffer (currently unused)
  # @param size [Integer] Size in bytes (currently unused)
  # @param stream [Object, nil] Stream handle (currently unused)
  # @return [Integer]
  # @raise [NotImplementedError] always
  def recv(dst_ptr, size, stream: nil)
    raise NotImplementedError, "NetworkDirect not implemented"
  end

  # Estimated bandwidth in GB/s
  # @return [Float]
  def estimated_bandwidth
    12.5 # 100 Gbps typical for RDMA NIC
  end

  # Clean up; only resets the initialized flag.
  # @return [void]
  def destroy!
    @initialized = false
  end

  # @return [String]
  def to_s
    "NetworkDirectTransport[#{@local_addr} <-> #{@remote_addr}]"
  end

  # Provider probe backing {.available?}.
  #
  # Would check the Windows registry for NetworkDirect providers
  # (HKLM\SYSTEM\CurrentControlSet\Services\NDKPI).
  #
  # @return [Boolean] always +false+ until a probe is implemented
  def self.check_nd_provider
    false # Not available by default
  end
  # A bare `private` does not affect `def self.` methods, so hide it explicitly.
  private_class_method :check_nd_provider
end
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
end
|
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "securerandom"

require_relative "base"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module Collective
|
|
7
|
+
module Transport
|
|
8
|
+
# Windows Registered I/O (RIO) Transport
|
|
9
|
+
# Zero-copy networking for multi-node GPU communication
|
|
10
|
+
#
|
|
11
|
+
# RIO provides high-performance, low-latency networking on Windows
|
|
12
|
+
# by registering buffers once and avoiding kernel transitions.
|
|
13
|
+
#
|
|
14
|
+
# @note Requires Windows 8+ and Winsock 2.2
|
|
15
|
+
#
|
|
16
|
+
# @example Create RIO transport
|
|
17
|
+
# transport = RIOTransport.new(
|
|
18
|
+
# local_addr: "192.168.1.100",
|
|
19
|
+
# local_port: 50000,
|
|
20
|
+
# remote_addr: "192.168.1.101",
|
|
21
|
+
# remote_port: 50000
|
|
22
|
+
# )
|
|
23
|
+
# transport.initialize!
|
|
24
|
+
# transport.send(gpu_buffer, size)
|
|
25
|
+
#
|
|
26
|
+
class RIOTransport < Base
  # RIO constants from mswsock.h
  RIO_MSG_DONT_NOTIFY = 0x01
  RIO_MSG_DEFER = 0x02
  RIO_MSG_WAITALL = 0x04
  RIO_MSG_COMMIT_ONLY = 0x08

  # Default buffer sizes
  DEFAULT_BUFFER_SIZE = 64 * 1024 * 1024 # 64 MB
  DEFAULT_CQ_SIZE = 4096
  DEFAULT_RQ_SIZE = 1024

  # @return [String] Local address
  attr_reader :local_addr

  # @return [Integer] Local port
  attr_reader :local_port

  # @return [String] Remote address
  attr_reader :remote_addr

  # @return [Integer] Remote port
  attr_reader :remote_port

  # @return [Symbol] Transport type
  def self.transport_type
    :rio_network
  end

  # Records the endpoints and buffer size; no OS or CUDA resources are
  # acquired until {#initialize!} is called.
  #
  # @param local_addr [String] Local IP address
  # @param local_port [Integer] Local port
  # @param remote_addr [String] Remote IP address
  # @param remote_port [Integer] Remote port
  # @param buffer_size [Integer] Registered buffer size
  def initialize(local_addr:, local_port:, remote_addr:, remote_port:,
                 buffer_size: DEFAULT_BUFFER_SIZE)
    super(src_device: 0, dst_device: 0)
    @local_addr = local_addr
    @local_port = local_port
    @remote_addr = remote_addr
    @remote_port = remote_port
    @buffer_size = buffer_size
    @socket = nil
    @rio_extension = nil
    @rio_function_table = nil
    @send_cq = nil
    @recv_cq = nil
    @request_queue = nil
    @registered_buffers = {}
    @send_buffer = nil
    @recv_buffer = nil
    @send_buffer_id = nil
    @recv_buffer_id = nil
    @initialized = false
  end

  # Initialize RIO transport.
  #
  # Loads the RIO function table, creates and connects the socket, sets up
  # the completion/request queues and registers the staging buffers.
  # No-op when already initialized. If any step fails, whatever was
  # already acquired is released before the error is re-raised, so a
  # half-built transport does not leak its socket or pinned buffers.
  #
  # @return [void]
  # @raise [TransportError] when the RIO extension cannot be loaded
  def initialize!
    return if @initialized

    begin
      load_rio_extension!
      create_socket!
      setup_rio_queues!
      register_buffers!
    rescue StandardError
      release_resources!
      raise
    end

    @initialized = true
  end

  # Check if ready
  # @return [Boolean] true only when initialized and the socket is open
  def ready?
    @initialized && !@socket.nil? && !@socket.closed?
  end

  # Send data to remote.
  #
  # The payload is first copied into the registered host send buffer and
  # then submitted from there, so it must fit in that buffer.
  #
  # NOTE: this shadows Object#send on instances; use `__send__` or
  # `public_send` for dynamic dispatch on a transport object.
  #
  # @param src_ptr [FFI::Pointer] Source buffer pointer
  # @param size [Integer] Size in bytes
  # @param stream [FFI::Pointer, nil] CUDA stream (for staging)
  # @return [Object] the completion entry dequeued for the send (truthy)
  # @raise [TransportError] when not initialized, when +size+ does not fit
  #   the registered buffer, or when no completion is reported
  def send(src_ptr, size, stream: nil)
    ensure_initialized!
    ensure_fits_buffer!(size)

    # Stage GPU data to registered host buffer if needed
    host_buffer = stage_to_host(src_ptr, size, stream)

    # Submit RIO send
    submit_send(host_buffer, size)
  end

  # Receive data from remote.
  #
  # Data lands in the registered host receive buffer and is then copied
  # to +dst_ptr+, so the requested size must fit in that buffer.
  #
  # @param dst_ptr [FFI::Pointer] Destination buffer pointer
  # @param size [Integer] Size in bytes
  # @param stream [FFI::Pointer, nil] CUDA stream (for unstaging)
  # @return [Integer] Bytes received
  # @raise [TransportError] when not initialized, when +size+ does not fit
  #   the registered buffer, or when no completion is reported
  def recv(dst_ptr, size, stream: nil)
    ensure_initialized!
    ensure_fits_buffer!(size)

    # Submit RIO receive
    bytes_received = submit_recv(size)

    # Copy from host buffer to GPU
    unstage_from_host(dst_ptr, bytes_received, stream)

    bytes_received
  end

  # Estimated bandwidth in GB/s
  # @return [Float]
  def estimated_bandwidth
    12.5 # 100 Gbps network
  end

  # Clean up resources. No-op unless the transport is initialized.
  # @return [void]
  def destroy!
    return unless @initialized

    release_resources!

    @initialized = false
  end

  # @return [String]
  def to_s
    "RIOTransport[#{@local_addr}:#{@local_port} <-> #{@remote_addr}:#{@remote_port}]"
  end

  private

  # @raise [TransportError] unless {#initialize!} has completed
  def ensure_initialized!
    raise TransportError, "RIO transport not initialized" unless @initialized
  end

  # Rejects transfers that would read or write past the registered host
  # buffers, which are each exactly @buffer_size bytes long.
  #
  # @param size [Integer] requested transfer size in bytes
  # @raise [TransportError] when size is negative or exceeds @buffer_size
  def ensure_fits_buffer!(size)
    return if size.between?(0, @buffer_size)

    raise TransportError,
          "Transfer size #{size} does not fit the registered buffer (#{@buffer_size} bytes)"
  end

  # Loads the RIO function table, wrapping any failure in TransportError.
  def load_rio_extension!
    # Load RIO function table from Winsock
    @rio_extension = RIOExtension.new
    @rio_function_table = @rio_extension.load_function_table
  rescue StandardError => e
    raise TransportError, "Failed to load RIO extension: #{e.message}"
  end

  # Creates the datagram socket, binds it locally and connects it to the peer.
  def create_socket!
    @socket = RIOSocket.new(
      family: :inet,
      type: :dgram, # UDP for low latency
      protocol: :udp
    )
    @socket.bind(@local_addr, @local_port)
    @socket.connect(@remote_addr, @remote_port)
  end

  # Creates one completion queue per direction and the request queue that
  # ties them to the socket.
  def setup_rio_queues!
    # Create completion queues
    @send_cq = @rio_function_table.create_completion_queue(DEFAULT_CQ_SIZE)
    @recv_cq = @rio_function_table.create_completion_queue(DEFAULT_CQ_SIZE)

    # Create request queue
    @request_queue = @rio_function_table.create_request_queue(
      socket: @socket.handle,
      max_outstanding_receive: DEFAULT_RQ_SIZE,
      max_receive_data_buffers: 1,
      max_outstanding_send: DEFAULT_RQ_SIZE,
      max_send_data_buffers: 1,
      recv_cq: @recv_cq,
      send_cq: @send_cq
    )
  end

  # Allocates the send/receive staging buffers and registers both with RIO.
  def register_buffers!
    # Register pinned host memory for zero-copy
    @send_buffer = allocate_pinned_buffer(@buffer_size)
    @recv_buffer = allocate_pinned_buffer(@buffer_size)

    @send_buffer_id = @rio_function_table.register_buffer(@send_buffer, @buffer_size)
    @recv_buffer_id = @rio_function_table.register_buffer(@recv_buffer, @buffer_size)
  end

  # Allocates +size+ bytes of host memory and page-locks it via CUDA.
  #
  # NOTE(review): the return value of cudaHostRegister is not inspected
  # here - confirm whether the binding raises on failure or returns a code.
  #
  # @param size [Integer] buffer size in bytes
  # @return [FFI::MemoryPointer]
  def allocate_pinned_buffer(size)
    ptr = FFI::MemoryPointer.new(:char, size)
    # Pin memory for DMA
    CUDA::RuntimeAPI.cudaHostRegister(ptr, size, 0)
    ptr
  end

  # Copies +size+ bytes from +src_ptr+ into the host send buffer. When a
  # stream is given, it is synchronized so the data is complete before
  # the buffer is handed to RIO.
  #
  # @return [FFI::MemoryPointer] the host send buffer
  def stage_to_host(src_ptr, size, stream)
    # Copy GPU buffer to registered host buffer
    if stream
      CUDA::RuntimeAPI.cudaMemcpyAsync(@send_buffer, src_ptr, size, :device_to_host, stream)
      CUDA::RuntimeAPI.cudaStreamSynchronize(stream)
    else
      CUDA::RuntimeAPI.cudaMemcpy(@send_buffer, src_ptr, size, :device_to_host)
    end
    @send_buffer
  end

  # Copies +size+ bytes from the host receive buffer to +dst_ptr+.
  def unstage_from_host(dst_ptr, size, stream)
    # Copy received data from host buffer to GPU
    if stream
      CUDA::RuntimeAPI.cudaMemcpyAsync(dst_ptr, @recv_buffer, size, :host_to_device, stream)
      # The receive buffer is reused from offset 0 by the next recv, so the
      # copy out of it must finish before this method returns.
      CUDA::RuntimeAPI.cudaStreamSynchronize(stream)
    else
      CUDA::RuntimeAPI.cudaMemcpy(dst_ptr, @recv_buffer, size, :host_to_device)
    end
  end

  # Posts a send of +size+ bytes from the start of the registered send
  # buffer, then notifies the send queue and collects one completion.
  #
  # @param buffer [FFI::MemoryPointer] staged host buffer (unused; the
  #   registered buffer id is what gets submitted)
  # @param size [Integer] number of bytes to send
  def submit_send(buffer, size)
    @rio_function_table.send(
      request_queue: @request_queue,
      buffer_id: @send_buffer_id,
      offset: 0,
      length: size,
      flags: RIO_MSG_DONT_NOTIFY
    )

    # Notify and wait for completion
    @rio_function_table.notify(@send_cq)
    wait_for_completion(@send_cq, 1)
  end

  # Posts a receive of up to +size+ bytes into the start of the registered
  # receive buffer and collects one completion.
  #
  # @return [Integer] the completion's :bytes_transferred value
  def submit_recv(size)
    @rio_function_table.receive(
      request_queue: @request_queue,
      buffer_id: @recv_buffer_id,
      offset: 0,
      length: size,
      flags: 0
    )

    # Notify and wait for completion
    @rio_function_table.notify(@recv_cq)
    result = wait_for_completion(@recv_cq, 1)
    result[:bytes_transferred]
  end

  # Dequeues completions from +cq+ once and returns the first entry.
  #
  # @raise [TransportError] when nothing was dequeued
  def wait_for_completion(cq, count)
    results = @rio_function_table.dequeue_completion(cq, count)
    raise TransportError, "RIO operation failed" if results.nil? || results.empty?

    results.first
  end

  # Releases everything {#initialize!} may have acquired. Safe to call on
  # a partially initialized transport because every step checks for nil.
  def release_resources!
    cleanup_buffers!
    cleanup_rio!
    close_socket!
  end

  # Deregisters and unpins the staging buffers, then drops the handles so
  # a repeated cleanup cannot release them twice.
  def cleanup_buffers!
    @rio_function_table&.deregister_buffer(@send_buffer_id) if @send_buffer_id
    @rio_function_table&.deregister_buffer(@recv_buffer_id) if @recv_buffer_id
    CUDA::RuntimeAPI.cudaHostUnregister(@send_buffer) if @send_buffer
    CUDA::RuntimeAPI.cudaHostUnregister(@recv_buffer) if @recv_buffer

    @send_buffer_id = nil
    @recv_buffer_id = nil
    @send_buffer = nil
    @recv_buffer = nil
  end

  # Closes both completion queues and forgets the queue handles.
  def cleanup_rio!
    @rio_function_table&.close_completion_queue(@send_cq) if @send_cq
    @rio_function_table&.close_completion_queue(@recv_cq) if @recv_cq
    @send_cq = nil
    @recv_cq = nil
    @request_queue = nil
  end

  # Closes the socket, if any, and forgets it.
  def close_socket!
    @socket&.close
    @socket = nil
  end
end
|
|
291
|
+
|
|
292
|
+
# RIO Extension loader
|
|
293
|
+
# Loads RIO function table from Winsock
|
|
294
|
+
class RIOExtension
  extend FFI::Library

  # ws2_32 exists only on Windows. Binding it unconditionally would make
  # merely requiring this file raise LoadError on every other platform.
  ffi_lib "ws2_32" if Gem.win_platform?

  # Simplified RIO function table structure
  RIO_FUNCTION_TABLE = Struct.new(
    :cbSize,
    :RIOReceive,
    :RIOReceiveEx,
    :RIOSend,
    :RIOSendEx,
    :RIOCloseCompletionQueue,
    :RIOCreateCompletionQueue,
    :RIOCreateRequestQueue,
    :RIODequeueCompletion,
    :RIODeregisterBuffer,
    :RIONotify,
    :RIORegisterBuffer,
    :RIOResizeCompletionQueue,
    :RIOResizeRequestQueue,
    keyword_init: true
  )

  # Returns the object through which RIO calls are made.
  #
  # Currently this is a RIOFunctionTableWrapper; no Winsock call is made.
  # A production implementation would use WSAIoctl with
  # SIO_GET_MULTIPLE_EXTENSION_FUNCTION_POINTER to get the RIO function
  # table, which requires FFI bindings to:
  # - WSAStartup
  # - WSASocket
  # - WSAIoctl with SIO_GET_MULTIPLE_EXTENSION_FUNCTION_POINTER
  #
  # @return [RIOFunctionTableWrapper]
  # @raise [RuntimeError] on non-Windows platforms, where ws2_32 is not
  #   bound (a StandardError, so callers that rescue StandardError can
  #   translate it)
  def load_function_table
    raise "Registered I/O is only available on Windows" unless Gem.win_platform?

    RIOFunctionTableWrapper.new
  end
end
|
|
329
|
+
|
|
330
|
+
# RIO Function Table Wrapper
|
|
331
|
+
# Wraps RIO function calls
|
|
332
|
+
class RIOFunctionTableWrapper
  # Creates a completion queue record.
  #
  # @param size [Integer] requested queue capacity
  # @return [Hash] +:handle+ (random UUID string) and +:size+
  def create_completion_queue(size)
    # Returns a completion queue handle
    { handle: SecureRandom.uuid, size: size }
  end

  # Creates a request queue record. All arguments are accepted for
  # interface compatibility and are not used.
  #
  # @return [Hash] +:handle+ (random UUID string)
  def create_request_queue(socket:, max_outstanding_receive:, max_receive_data_buffers:,
                           max_outstanding_send:, max_send_data_buffers:,
                           recv_cq:, send_cq:)
    { handle: SecureRandom.uuid }
  end

  # Produces an identifier for a registered buffer; the buffer itself is
  # not touched.
  #
  # @return [String] random UUID
  def register_buffer(buffer, size)
    SecureRandom.uuid
  end

  # @return [true]
  def deregister_buffer(buffer_id)
    true
  end

  # Accepts a send request without doing any I/O.
  #
  # NOTE: this shadows Object#send on instances; use `__send__` or
  # `public_send` for dynamic dispatch on a wrapper object.
  #
  # @return [true]
  def send(request_queue:, buffer_id:, offset:, length:, flags:)
    true
  end

  # Accepts a receive request without doing any I/O.
  #
  # @return [true]
  def receive(request_queue:, buffer_id:, offset:, length:, flags:)
    true
  end

  # @return [true]
  def notify(cq)
    true
  end

  # Reports a single successful zero-byte completion, but never more
  # entries than the caller asked for.
  #
  # @param cq [Hash] completion queue record (unused)
  # @param count [Integer] maximum number of entries to return
  # @return [Array<Hash>] at most one +{ status:, bytes_transferred: }+ entry
  def dequeue_completion(cq, count)
    return [] unless count.positive?

    [{ status: :ok, bytes_transferred: 0 }]
  end

  # @return [true]
  def close_completion_queue(cq)
    true
  end
end
|
|
372
|
+
|
|
373
|
+
# RIO Socket wrapper
|
|
374
|
+
class RIOSocket
  # Records the requested socket parameters. The socket starts open and
  # without a native handle.
  #
  # @param family [Symbol] address family
  # @param type [Symbol] socket type
  # @param protocol [Symbol] protocol
  def initialize(family:, type:, protocol:)
    @family, @type, @protocol = family, type, protocol
    @handle = nil
    @closed = false
  end

  # @return [Object, nil] native handle; nothing in this class assigns one
  def handle
    @handle
  end

  # Remembers the local endpoint.
  #
  # @param addr [String] local address
  # @param port [Integer] local port
  # @return [Integer] the port that was passed in
  def bind(addr, port)
    @local_addr, @local_port = addr, port
    port
  end

  # Remembers the remote endpoint.
  #
  # @param addr [String] remote address
  # @param port [Integer] remote port
  # @return [Integer] the port that was passed in
  def connect(addr, port)
    @remote_addr, @remote_port = addr, port
    port
  end

  # Marks the socket as closed.
  # @return [true]
  def close
    @closed = true
  end

  # @return [Boolean] whether {#close} has been called
  def closed?
    @closed
  end
end
|
|
403
|
+
end
|
|
404
|
+
end
|
|
405
|
+
end
|