ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,290 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+ require "socket"
5
+
6
+ module Ignis
7
+ module Collective
8
+ module Transport
9
+ # TCP Fallback Transport
10
+ # Generic TCP transport for multi-node communication
11
+ # Used when higher-performance transports are unavailable
12
+ #
13
+ # @note This is the lowest-performance multi-node option
14
+ # but works on any network configuration
15
+ #
16
+ # @example Create TCP transport
17
+ # transport = TCPTransport.new(
18
+ # local_addr: "192.168.1.100",
19
+ # local_port: 50000,
20
+ # remote_addr: "192.168.1.101",
21
+ # remote_port: 50000,
22
+ # mode: :client # or :server
23
+ # )
24
+ #
25
+ class TCPTransport < Base
26
+ # Default buffer size for staging
27
+ DEFAULT_BUFFER_SIZE = 16 * 1024 * 1024 # 16 MB
28
+
29
+ # Connection timeout in seconds
30
+ CONNECT_TIMEOUT = 30
31
+
32
+ # @return [String] Local address
33
+ attr_reader :local_addr
34
+
35
+ # @return [Integer] Local port
36
+ attr_reader :local_port
37
+
38
+ # @return [String] Remote address
39
+ attr_reader :remote_addr
40
+
41
+ # @return [Integer] Remote port
42
+ attr_reader :remote_port
43
+
44
+ # @return [Symbol] Mode (:client or :server)
45
+ attr_reader :mode
46
+
47
# Identifier for this transport implementation.
#
# @return [Symbol] always +:tcp+
def self.transport_type
  :tcp
end
51
+
52
# Records the endpoint configuration and resets all runtime state.
# No socket or buffer is created here; the instance starts uninitialized.
#
# @param local_addr [String] Local IP address
# @param local_port [Integer] Local port
# @param remote_addr [String] Remote IP address
# @param remote_port [Integer] Remote port
# @param mode [Symbol] :client or :server
# @param buffer_size [Integer] Staging buffer size in bytes
def initialize(local_addr:, local_port:, remote_addr:, remote_port:,
               mode: :client, buffer_size: DEFAULT_BUFFER_SIZE)
  super(src_device: 0, dst_device: 0)

  # Endpoint configuration
  @local_addr, @local_port = local_addr, local_port
  @remote_addr, @remote_port = remote_addr, remote_port
  @mode = mode
  @buffer_size = buffer_size

  # Runtime state, all empty until the transport is brought up
  @socket = @server_socket = nil
  @send_buffer = @recv_buffer = nil
  @initialized = false
  @mutex = Mutex.new
end
74
+
75
# Bring the transport up: allocate the staging buffers, then either wait
# for a peer (:server mode) or connect to one (any other mode).
#
# Calling this on an already-initialized transport is a no-op. If
# establishing the connection fails, everything acquired so far (sockets,
# staging buffers) is released again before the error is re-raised, so a
# failed attempt does not leak resources while @initialized is still false.
#
# @return [void]
# @raise [TransportError] when the connection cannot be established
def initialize!
  return if @initialized

  allocate_buffers!

  begin
    if @mode == :server
      start_server!
    else
      connect_to_server!
    end
  rescue StandardError
    # Roll back the partial setup; the caller sees the original error.
    @socket&.close
    @server_socket&.close
    @socket = nil
    @server_socket = nil
    free_buffers!
    @send_buffer = nil
    @recv_buffer = nil
    raise
  end

  @initialized = true
end
90
+
91
# Whether the transport is initialized and holds an open socket.
#
# Always returns a strict Boolean (never nil), matching the documented
# return type.
#
# @return [Boolean]
def ready?
  return false unless @initialized && @socket

  !@socket.closed?
end
96
+
97
# Send data to remote.
#
# The payload is first copied from the GPU into the host staging buffer,
# then an 8-byte size header is sent, followed by the payload itself.
# The whole exchange runs under the instance mutex.
#
# @param src_ptr [FFI::Pointer] Source GPU buffer pointer
# @param size [Integer] Size in bytes (0..buffer_size)
# @param stream [FFI::Pointer, nil] CUDA stream
# @return [Boolean] true on success
# @raise [TransportError] if the transport is not initialized, +size+ is
#   negative or larger than the staging buffer, or the socket stops
#   accepting data
def send(src_ptr, size, stream: nil)
  ensure_initialized!

  # A negative size would otherwise slip past the overflow guard below and
  # reach both the device copy and the wire header.
  if size.negative?
    raise TransportError, "TCP transfer size must be non-negative (got #{size})"
  end

  # Guard against overflowing the fixed-size host staging buffer:
  # stage_to_host copies `size` bytes into the @buffer_size buffer. Chunked
  # staging for larger transfers isn't implemented, so fail loudly.
  if size > @buffer_size
    raise TransportError,
          "TCP transfer of #{size} bytes exceeds the #{@buffer_size}-byte host staging " \
          "buffer (chunked staging not implemented); increase buffer_size or chunk the transfer"
  end

  @mutex.synchronize do
    # Stage GPU data to host buffer
    stage_to_host(src_ptr, size, stream)

    # Send size header
    send_header(size)

    # The socket may accept fewer bytes than offered, so keep sending the
    # remainder until everything is on the wire. (size <= @buffer_size is
    # guaranteed above, so the remainder always fits the staging buffer.)
    offset = 0
    while offset < size
      written = @socket.send(@send_buffer.get_bytes(offset, size - offset), 0)
      raise TransportError, "TCP send failed" if written <= 0

      offset += written
    end

    true
  end
end
134
+
135
# Receive data from remote.
#
# Reads the 8-byte size header, receives that many payload bytes into the
# host staging buffer, then copies them to the GPU destination. The whole
# exchange runs under the instance mutex.
#
# @param dst_ptr [FFI::Pointer] Destination GPU buffer pointer
# @param size [Integer] Capacity of the destination in bytes; a message
#   shorter than this is accepted, a longer one is rejected
# @param stream [FFI::Pointer, nil] CUDA stream
# @return [Integer] Bytes received (the size announced by the peer)
# @raise [TransportError] if the transport is not initialized, the
#   announced size exceeds the staging buffer or +size+, or the connection
#   ends before the payload is complete
def recv(dst_ptr, size, stream: nil)
  ensure_initialized!
  @mutex.synchronize do
    # Receive size header
    actual_size = recv_header

    # The header size comes from the peer; a malformed/hostile peer could
    # announce a size larger than the staging buffer, and put_bytes below
    # would then write past it. Bound it.
    if actual_size > @buffer_size
      raise TransportError,
            "TCP recv header announces #{actual_size} bytes, exceeding the " \
            "#{@buffer_size}-byte staging buffer (refusing to overflow)"
    end

    # Likewise the caller only promised `size` bytes at dst_ptr; copying
    # more than that to the device would write past the destination.
    if actual_size > size
      raise TransportError,
            "TCP recv header announces #{actual_size} bytes, exceeding the " \
            "#{size}-byte destination buffer (refusing to overflow)"
    end

    # A single recv may return less than requested; accumulate until the
    # announced length has arrived.
    bytes_received = 0
    while bytes_received < actual_size
      data = @socket.recv(actual_size - bytes_received)
      raise TransportError, "TCP recv failed" if data.nil? || data.empty?

      @recv_buffer.put_bytes(bytes_received, data)
      bytes_received += data.bytesize
    end

    # Copy from host buffer to GPU
    unstage_from_host(dst_ptr, actual_size, stream)

    actual_size
  end
end
173
+
174
# Nominal throughput estimate for this transport, in GB/s.
# The figure is a fixed constant, not a measurement.
#
# @return [Float]
def estimated_bandwidth
  link_gbps = 10.0 # ~10 Gbps typical for TCP
  link_gbps / 8    # bits -> bytes
end
179
+
180
# Clean up resources: close the sockets, release the staging buffers and
# mark the transport uninitialized.
#
# Cleanup also runs when initialization only got part-way (for example the
# listener was opened but no peer connected), so those resources are not
# leaked. Calling it again once everything is released is a no-op.
#
# @return [void]
def destroy!
  return unless @initialized || @socket || @server_socket || @send_buffer || @recv_buffer

  begin
    @socket&.close
    @server_socket&.close
  ensure
    # Release buffers and reset state even if closing a socket raised.
    free_buffers!
    @socket = nil
    @server_socket = nil
    @send_buffer = nil
    @recv_buffer = nil
    @initialized = false
  end
end
193
+
194
+ # @return [String]
195
+ def to_s
196
+ status = ready? ? "connected" : "disconnected"
197
+ "TCPTransport[#{@local_addr}:#{@local_port} <-> #{@remote_addr}:#{@remote_port}, #{status}]"
198
+ end
199
+
200
+ private
201
+
202
# Guard used by the data-path methods: refuses to proceed on a transport
# that has not been initialized.
#
# @raise [TransportError] when @initialized is falsy
def ensure_initialized!
  return if @initialized

  raise TransportError, "TCP transport not initialized"
end
205
+
206
# Allocate the host send/receive staging buffers and try to page-lock them.
#
# Allocation failures propagate to the caller: without buffers the
# transport cannot work, and reporting them as a pinning problem would
# leave nil buffers behind. Pinning is best-effort; if it fails a warning
# is logged and the unpinned buffers are used.
#
# @return [void]
def allocate_buffers!
  @send_buffer = FFI::MemoryPointer.new(:char, @buffer_size)
  @recv_buffer = FFI::MemoryPointer.new(:char, @buffer_size)

  begin
    # Pin for faster GPU transfers
    CUDA::RuntimeAPI.cudaHostRegister(@send_buffer, @buffer_size, 0)
    CUDA::RuntimeAPI.cudaHostRegister(@recv_buffer, @buffer_size, 0)
  rescue StandardError => e
    Ignis.logger.warn { "Failed to pin TCP buffers: #{e.message}" }
  end
end
217
+
218
# Unregister the staging buffers from CUDA, best-effort.
#
# Each buffer is handled independently, so an error while unregistering
# one (for example because pinning it never succeeded) does not prevent
# the other from being unregistered. Errors are deliberately ignored
# during cleanup.
#
# @return [void]
def free_buffers!
  [@send_buffer, @recv_buffer].each do |buffer|
    next unless buffer

    begin
      CUDA::RuntimeAPI.cudaHostUnregister(buffer)
    rescue StandardError
      # Ignore unregister errors during cleanup
    end
  end
end
224
+
225
# Listen on the local endpoint and accept exactly one peer connection,
# waiting at most CONNECT_TIMEOUT seconds.
#
# On any failure after the listener was opened (timeout, accept error,
# socket configuration error) the listener and any accepted socket are
# closed again before the error is re-raised, so the port is not left
# bound by a half-initialized transport.
#
# @return [void]
# @raise [TransportError] when no peer connects within CONNECT_TIMEOUT
def start_server!
  @server_socket = TCPServer.new(@local_addr, @local_port)
  @server_socket.setsockopt(Socket::SOL_SOCKET, Socket::SO_REUSEADDR, true)
  @server_socket.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true)

  Ignis.logger.info { "TCP transport listening on #{@local_addr}:#{@local_port}" }

  # Accept connection with timeout
  unless IO.select([@server_socket], nil, nil, CONNECT_TIMEOUT)
    raise TransportError, "Timeout waiting for connection"
  end

  @socket = @server_socket.accept
  configure_socket!

  Ignis.logger.info { "TCP transport accepted connection from #{@socket.peeraddr[2]}" }
rescue StandardError
  @socket&.close
  @server_socket&.close
  @socket = nil
  @server_socket = nil
  raise
end
241
+
242
# Connect to the remote endpoint as a client.
#
# The connection attempt is bounded by CONNECT_TIMEOUT seconds (the same
# limit the server side applies while waiting for a peer) instead of
# relying on the operating system's default connect timeout.
#
# @return [void]
# @raise [TransportError] when the peer refuses the connection or the
#   attempt times out
def connect_to_server!
  @socket = Socket.tcp(@remote_addr, @remote_port, connect_timeout: CONNECT_TIMEOUT)
  configure_socket!

  Ignis.logger.info { "TCP transport connected to #{@remote_addr}:#{@remote_port}" }
rescue Errno::ECONNREFUSED => e
  raise TransportError, "Connection refused: #{e.message}"
rescue Errno::ETIMEDOUT => e
  raise TransportError, "Connection timeout: #{e.message}"
end
252
+
253
# Apply the per-connection socket options to @socket: disable Nagle's
# algorithm and request send/receive buffers of @buffer_size bytes.
#
# @return [void]
def configure_socket!
  socket_options = [
    [Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true],
    [Socket::SOL_SOCKET, Socket::SO_SNDBUF, @buffer_size],
    [Socket::SOL_SOCKET, Socket::SO_RCVBUF, @buffer_size]
  ]

  socket_options.each do |level, option, value|
    @socket.setsockopt(level, option, value)
  end
end
258
+
259
# Copy +size+ bytes from the device pointer into the host send buffer.
# With a stream the copy is issued asynchronously and then awaited, so the
# buffer is complete when this method returns in either case.
#
# @param src_ptr [FFI::Pointer] source device pointer
# @param size [Integer] number of bytes to copy
# @param stream [FFI::Pointer, nil] CUDA stream, or nil for a blocking copy
def stage_to_host(src_ptr, size, stream)
  runtime = CUDA::RuntimeAPI
  return runtime.cudaMemcpy(@send_buffer, src_ptr, size, :device_to_host) unless stream

  runtime.cudaMemcpyAsync(@send_buffer, src_ptr, size, :device_to_host, stream)
  runtime.cudaStreamSynchronize(stream)
end
267
+
268
# Copy +size+ bytes from the host receive buffer to the device pointer.
#
# With a stream the copy is issued asynchronously and then awaited, the
# same way stage_to_host does: @recv_buffer is a single staging buffer
# that the next receive writes into again, so returning while the copy is
# still in flight would let later data overwrite bytes that have not yet
# reached the device.
#
# @param dst_ptr [FFI::Pointer] destination device pointer
# @param size [Integer] number of bytes to copy
# @param stream [FFI::Pointer, nil] CUDA stream, or nil for a blocking copy
def unstage_from_host(dst_ptr, size, stream)
  if stream
    CUDA::RuntimeAPI.cudaMemcpyAsync(dst_ptr, @recv_buffer, size, :host_to_device, stream)
    CUDA::RuntimeAPI.cudaStreamSynchronize(stream)
  else
    CUDA::RuntimeAPI.cudaMemcpy(dst_ptr, @recv_buffer, size, :host_to_device)
  end
end
275
+
276
# Send the 8-byte message header announcing the payload length.
#
# Uses IO#write, which keeps writing until the whole header is out; a
# single send may accept only part of it, which would desynchronize the
# stream for the peer.
#
# @param size [Integer] payload length in bytes
# @return [Integer] number of header bytes written (8)
def send_header(size)
  header = [size].pack("Q<") # 64-bit little-endian
  @socket.write(header)
end
280
+
281
# Read the 8-byte message header and return the announced payload length.
#
# A stream socket may deliver the header in several pieces, so this keeps
# receiving until all 8 bytes have arrived; only a connection that ends
# before the header is complete is treated as an error.
#
# @return [Integer] payload length in bytes (unsigned 64-bit little-endian)
# @raise [TransportError] when the connection ends before 8 bytes arrive
def recv_header
  header = String.new
  while header.bytesize < 8
    chunk = @socket.recv(8 - header.bytesize)
    raise TransportError, "Failed to receive header" if chunk.nil? || chunk.empty?

    header << chunk
  end

  header.unpack1("Q<")
end
287
+ end
288
+ end
289
+ end
290
+ end
@@ -0,0 +1,189 @@
1
+ # frozen_string_literal: true
2
+
3
+ # VMM IPC Structs — FFI::Struct definitions for CUDA Virtual Memory Management
4
+ #
5
+ # Rule 4: FFI structs live in their own file, never mixed with Fiddle hot-path calls.
6
+ # These structs are used by the VMM IPC transport for multi-GPU memory sharing.
7
+
8
+ require "ignis"
9
+ Ignis::Shared::FFILoader.load!
10
+
11
+ module Ignis
12
+ module Collective
13
+ module Transport
14
+ # FFI struct definitions for CUDA VMM (Virtual Memory Management) IPC.
15
+ module VmmIpcStructs
16
+ extend FFI::Library
17
+
18
# Resolve the CUDA driver library name per platform.
#
# When Ignis::Platform is loaded it is asked for the library first. In
# every fallback case the bare library name is used so the system loader
# resolves it through its normal search path: the driver library is
# installed with the NVIDIA display driver, not inside a particular CUDA
# toolkit version's bin directory.
CUDA_DRIVER_LIB = if defined?(Ignis::Platform)
                    Ignis::Platform.find_cuda_lib(:cuda_driver) ||
                      (Ignis::Platform.windows? ? 'nvcuda.dll' : 'libcuda.so.1')
                  elsif RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
                    'nvcuda.dll'
                  else
                    'libcuda.so.1'
                  end
27
+
28
+ begin
29
+ ffi_lib CUDA_DRIVER_LIB
30
+ rescue LoadError => e
31
+ $stderr.puts "[NvCCL] WARNING: Cannot load #{CUDA_DRIVER_LIB}: #{e.message}"
32
+ end
33
+
34
+ # CUmemLocationType — specifies where memory is located
35
+ CU_MEM_LOCATION_TYPE_INVALID = 0
36
+ CU_MEM_LOCATION_TYPE_DEVICE = 1
37
+ CU_MEM_LOCATION_TYPE_HOST = 2
38
+
39
+ # CUmemAllocationType
40
+ CU_MEM_ALLOCATION_TYPE_INVALID = 0
41
+ CU_MEM_ALLOCATION_TYPE_PINNED = 1
42
+
43
+ # CUmemAccess_flags
44
+ CU_MEM_ACCESS_FLAGS_PROT_NONE = 0
45
+ CU_MEM_ACCESS_FLAGS_PROT_READ = 1
46
+ CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 3
47
+
48
+ # CUmemAllocationHandleType
49
+ CU_MEM_HANDLE_TYPE_NONE = 0
50
+ CU_MEM_HANDLE_TYPE_POSIX_FILE_DESC = 1
51
+ CU_MEM_HANDLE_TYPE_WIN32 = 2
52
+ CU_MEM_HANDLE_TYPE_WIN32_KMT = 4
53
+ CU_MEM_HANDLE_TYPE_FABRIC = 8
54
+
55
+ # CUmemLocation — memory location descriptor
56
+ class CUmemLocation < FFI::Struct
57
+ layout \
58
+ :type, :int, # CUmemLocationType
59
+ :id, :int # Device ordinal (for DEVICE type)
60
+ end
61
+
62
+ # CUmemAccessDesc — memory access descriptor
63
+ class CUmemAccessDesc < FFI::Struct
64
+ layout \
65
+ :location, CUmemLocation,
66
+ :flags, :uint # CUmemAccess_flags
67
+ end
68
+
69
+ # CUmemAllocationProp — memory allocation properties
70
+ class CUmemAllocationProp < FFI::Struct
71
+ layout \
72
+ :type, :int, # CUmemAllocationType
73
+ :requestedHandleTypes, :int, # CUmemAllocationHandleType bitmask
74
+ :location, CUmemLocation,
75
+ :win32HandleMetaData, :pointer, # Windows security descriptor (NULL for default)
76
+ :allocFlags_compressionType, :uchar,
77
+ :allocFlags_gpuDirectRDMACapable, :uchar,
78
+ :allocFlags_usage, :ushort,
79
+ :_reserved, [:uchar, 4]
80
+ end
81
+
82
+ # CUmemGenericAllocationHandle — opaque handle
83
+ # This is a uint64 (CUmemGenericAllocationHandle) on all platforms.
84
+ # We use :uint64 since FFI doesn't have an opaque handle type.
85
+
86
+ # IPC memory handle for sharing across processes
87
+ class CUipcMemHandle < FFI::Struct
88
+ layout :reserved, [:char, 64]
89
+ end
90
+
91
+ # IPC event handle
92
+ class CUipcEventHandle < FFI::Struct
93
+ layout :reserved, [:char, 64]
94
+ end
95
+
96
# Helper: build a CUmemLocation that names a GPU device.
#
# @param device_id [Integer] device ordinal stored in the :id field
# @return [CUmemLocation] with :type set to CU_MEM_LOCATION_TYPE_DEVICE
def self.device_location(device_id)
  CUmemLocation.new.tap do |location|
    location[:type] = CU_MEM_LOCATION_TYPE_DEVICE
    location[:id] = device_id
  end
end
105
+
106
# Helper: build a CUmemAccessDesc granting read-write access to a device.
#
# @param device_id [Integer] device ordinal stored in the nested location
# @return [CUmemAccessDesc] with :flags set to
#   CU_MEM_ACCESS_FLAGS_PROT_READWRITE
def self.device_rw_access(device_id)
  CUmemAccessDesc.new.tap do |access|
    target = access[:location]
    target[:type] = CU_MEM_LOCATION_TYPE_DEVICE
    target[:id] = device_id
    access[:flags] = CU_MEM_ACCESS_FLAGS_PROT_READWRITE
  end
end
116
+
117
# Helper: build a CUmemAllocationProp for pinned memory located on a
# device, requesting an exportable handle of the platform's kind (Win32
# handle when RUBY_PLATFORM looks like Windows, POSIX file descriptor
# otherwise).
#
# NOTE(review): :win32HandleMetaData is always set to NULL, including when
# a Win32 handle is requested — confirm against the CUDA driver API
# whether the Win32 handle type needs object attributes here.
#
# @param device_id [Integer] device ordinal stored in the nested location
# @param rdma_capable [Boolean] whether to request GPU Direct RDMA
# @return [CUmemAllocationProp]
def self.pinned_device_prop(device_id, rdma_capable: false)
  on_windows = RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
  handle_type = on_windows ? CU_MEM_HANDLE_TYPE_WIN32 : CU_MEM_HANDLE_TYPE_POSIX_FILE_DESC

  CUmemAllocationProp.new.tap do |prop|
    prop[:type] = CU_MEM_ALLOCATION_TYPE_PINNED
    prop[:requestedHandleTypes] = handle_type

    location = prop[:location]
    location[:type] = CU_MEM_LOCATION_TYPE_DEVICE
    location[:id] = device_id

    prop[:win32HandleMetaData] = FFI::Pointer::NULL
    prop[:allocFlags_compressionType] = 0
    prop[:allocFlags_gpuDirectRDMACapable] = rdma_capable ? 1 : 0
    prop[:allocFlags_usage] = 0
  end
end
139
+
140
# Bind the VMM and IPC driver entry points if the driver library is
# available, and record the outcome in VMM_AVAILABLE.
#
# LoadError is rescued rather than only FFI::NotFoundError: NotFoundError
# (a symbol missing from the library) is a LoadError subclass, and a plain
# LoadError is raised by attach_function when no library was loaded at all.
# LoadError is not a StandardError, so without this branch a missing
# driver would abort loading of this file instead of marking VMM as
# unavailable.
begin
  # cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags)
  attach_function :cuMemAddressReserve, [:pointer, :size_t, :size_t, :uint64, :uint64], :int

  # cuMemAddressFree(CUdeviceptr ptr, size_t size)
  attach_function :cuMemAddressFree, [:uint64, :size_t], :int

  # cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags)
  attach_function :cuMemCreate, [:pointer, :size_t, :pointer, :uint64], :int

  # cuMemRelease(CUmemGenericAllocationHandle handle)
  attach_function :cuMemRelease, [:uint64], :int

  # cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags)
  attach_function :cuMemMap, [:uint64, :size_t, :size_t, :uint64, :uint64], :int

  # cuMemUnmap(CUdeviceptr ptr, size_t size)
  attach_function :cuMemUnmap, [:uint64, :size_t], :int

  # cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count)
  attach_function :cuMemSetAccess, [:uint64, :size_t, :pointer, :size_t], :int

  # cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option)
  attach_function :cuMemGetAllocationGranularity, [:pointer, :pointer, :int], :int

  # IPC functions
  attach_function :cuIpcGetMemHandle, [:pointer, :uint64], :int
  attach_function :cuIpcOpenMemHandle, [:pointer, CUipcMemHandle.by_value, :uint], :int
  attach_function :cuIpcCloseMemHandle, [:uint64], :int

  attach_function :cuIpcGetEventHandle, [:pointer, :uint64], :int
  attach_function :cuIpcOpenEventHandle, [:pointer, CUipcEventHandle.by_value], :int

  VMM_AVAILABLE = true
rescue LoadError => e
  $stderr.puts "[NvCCL] VMM functions not available: #{e.message}"
  VMM_AVAILABLE = false
rescue StandardError
  VMM_AVAILABLE = false
end
181
+
182
# Whether the VMM driver functions were bound when this file was loaded.
#
# @return [Boolean, nil] the value of VMM_AVAILABLE, or nil if that
#   constant was never defined
def self.available?
  return unless defined?(VMM_AVAILABLE)

  VMM_AVAILABLE
end
186
+ end
187
+ end
188
+ end
189
+ end