ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module Collective
5
+ module Transport
6
# Abstract base class for all transport implementations.
# Each transport handles GPU-to-GPU or GPU-to-network data movement.
#
# Subclasses are expected to override .transport_type, .available?,
# #initialize!, #send_async, #recv_async, #estimated_bandwidth and
# #estimated_latency; the defaults here raise NotImplementedError.
class Base
  # Transport type identifier
  # @return [Symbol] Transport type
  # @raise [NotImplementedError] when the subclass does not override it
  def self.transport_type
    raise NotImplementedError, "Subclass must define transport_type"
  end

  # Check if this transport is available for the given GPU pair
  # @param src [Integer] Source GPU
  # @param dst [Integer] Destination GPU
  # @return [Boolean] True if available
  # @raise [NotImplementedError] when the subclass does not override it
  def self.available?(src, dst)
    raise NotImplementedError, "Subclass must define available?"
  end

  # @return [Integer] Source device ID
  attr_reader :src_device

  # @return [Integer] Destination device ID
  attr_reader :dst_device

  # @param src_device [Integer] Source GPU device ID
  # @param dst_device [Integer] Destination GPU device ID
  def initialize(src_device:, dst_device:)
    @src_device = src_device
    @dst_device = dst_device
    @initialized = false
  end

  # Initialize the transport (called once per communicator)
  # @return [void]
  # @raise [NotImplementedError] when the subclass does not override it
  def initialize!
    raise NotImplementedError, "Subclass must define initialize!"
  end

  # Check if transport is initialized and ready
  # @return [Boolean] True if ready for use
  def ready?
    @initialized
  end

  # Send data asynchronously
  # @param buffer [FFI::Pointer] Device pointer to send
  # @param size [Integer] Bytes to send
  # @param stream [CUDA::Stream, FFI::Pointer] CUDA stream for async execution
  # @return [void]
  # @raise [NotImplementedError] when the subclass does not override it
  def send_async(buffer, size, stream)
    raise NotImplementedError, "Subclass must define send_async"
  end

  # Receive data asynchronously
  # @param buffer [FFI::Pointer] Device pointer to receive into
  # @param size [Integer] Bytes to receive
  # @param stream [CUDA::Stream, FFI::Pointer] CUDA stream for async execution
  # @return [void]
  # @raise [NotImplementedError] when the subclass does not override it
  def recv_async(buffer, size, stream)
    raise NotImplementedError, "Subclass must define recv_async"
  end

  # Synchronous send: issues #send_async on the NULL stream, then
  # waits via #synchronize!.
  # @param buffer [FFI::Pointer] Device pointer to send
  # @param size [Integer] Bytes to send
  # @return [void]
  def send_sync(buffer, size)
    send_async(buffer, size, FFI::Pointer::NULL)
    synchronize!
  end

  # Synchronous receive: issues #recv_async on the NULL stream, then
  # waits via #synchronize!.
  # @param buffer [FFI::Pointer] Device pointer to receive into
  # @param size [Integer] Bytes to receive
  # @return [void]
  def recv_sync(buffer, size)
    recv_async(buffer, size, FFI::Pointer::NULL)
    synchronize!
  end

  # Wait for all pending operations to complete.
  # Calls cudaDeviceSynchronize, so it applies to whichever device is
  # current at the time of the call.
  # @return [void]
  def synchronize!
    CUDA::RuntimeAPI.ensure_loaded!
    status = CUDA::RuntimeAPI.cudaDeviceSynchronize
    CUDA::RuntimeAPI.check_status!(status, "Transport synchronize")
  end

  # Estimated bandwidth in GB/s
  # @return [Float] Bandwidth estimate
  # @raise [NotImplementedError] when the subclass does not override it
  def estimated_bandwidth
    raise NotImplementedError, "Subclass must define estimated_bandwidth"
  end

  # Estimated latency in microseconds
  # @return [Float] Latency estimate
  # @raise [NotImplementedError] when the subclass does not override it
  def estimated_latency
    raise NotImplementedError, "Subclass must define estimated_latency"
  end

  # Clean up resources. The base implementation only marks the
  # transport as no longer initialized.
  # @return [void]
  def destroy!
    @initialized = false
  end

  # @return [String] Human-readable description
  def to_s
    "#{self.class.transport_type}[#{@src_device}→#{@dst_device}]"
  end

  protected

  # Guard used by operations that require #initialize! to have run.
  # Defined here so every subclass has it available; subclasses may
  # override it with a more specific message.
  # @return [void]
  # @raise [TransportError] when the transport is not initialized
  def ensure_initialized!
    raise TransportError, "Transport not initialized" unless @initialized
  end

  # Get stream pointer for FFI calls
  # @param stream [CUDA::Stream, FFI::Pointer, nil] Stream
  # @return [FFI::Pointer] Stream pointer (NULL pointer for nil)
  # @raise [ArgumentError] for any other stream type
  def stream_ptr(stream)
    case stream
    when FFI::Pointer then stream
    when CUDA::Stream then stream.ptr
    when nil then FFI::Pointer::NULL
    else
      raise ArgumentError, "Invalid stream type: #{stream.class}"
    end
  end
end
136
+ end
137
+ end
138
+ end
@@ -0,0 +1,217 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+
5
+ module Ignis
6
+ module Collective
7
+ module Transport
8
# Host-Staged Transport - Fallback when P2P is not available
#
# Uses host (pinned) memory as an intermediate staging buffer.
# Data path: GPU_A -> Host -> GPU_B
#
# Slower than P2P but always works regardless of topology.
# Bandwidth limited by PCIe x2 (upload + download).
#
# Staging buffers are cached per power-of-two size class and reused, so
# two transfers of the same size class go through the same host buffer.
# NOTE(review): callers presumably serialize such transfers - confirm.
class HostStagedTransport < Base
  # Flag value handed to cudaHostAlloc for staging buffers. In the CUDA
  # runtime 0x01 is cudaHostAllocPortable (cudaHostAllocDefault is 0x00).
  HOST_ALLOC_PORTABLE = 0x01

  # @return [Symbol] Transport type identifier
  def self.transport_type
    :host_staged
  end

  # Check if host-staged is available (always true).
  # The GPU pair is accepted only for signature parity with
  # Base.available?(src, dst); it is ignored.
  # @param _src [Integer, nil] Source GPU (ignored)
  # @param _dst [Integer, nil] Destination GPU (ignored)
  # @return [Boolean] Always true
  def self.available?(_src = nil, _dst = nil)
    true
  end

  # @return [Float] Estimated bandwidth (GB/s)
  def estimated_bandwidth
    12.0 # PCIe 4.0 x16 / 2 (round trip)
  end

  # @return [Float] Estimated latency (microseconds)
  def estimated_latency
    25.0 # Higher due to double copy
  end

  # Initialize the transport. Idempotent while initialized.
  # @return [void]
  def initialize!
    return if @initialized

    CUDA::RuntimeAPI.ensure_loaded!
    @staging_buffers = {} # rounded size -> pinned host buffer
    @initialized = true
  end

  # Copy data via host staging.
  # The D2H copy is enqueued on the stream and then waited for before
  # the H2D copy is enqueued, so the first half of the transfer has
  # completed when this method returns.
  # @param dst_buffer [FFI::Pointer] Destination GPU buffer
  # @param src_buffer [FFI::Pointer] Source GPU buffer
  # @param size [Integer] Size in bytes
  # @param stream [CUDA::Stream, FFI::Pointer, nil] CUDA stream (for async ops)
  # @return [void]
  # @raise [TransportError] when the transport is not initialized
  def copy_async(dst_buffer, src_buffer, size, stream)
    ensure_initialized!

    raw_stream = stream_ptr(stream)
    staging = get_staging_buffer(size)

    # Step 1: GPU_src -> Host (async)
    activate_device(@src_device, "Host-staged D2H set device")
    status = CUDA::RuntimeAPI.cudaMemcpyAsync(
      staging,
      src_buffer,
      size,
      CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST,
      raw_stream
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged D2H copy")

    # The host buffer must hold the data before it is uploaded again.
    sync_stream(raw_stream)

    # Step 2: Host -> GPU_dst (async)
    activate_device(@dst_device, "Host-staged H2D set device")
    status = CUDA::RuntimeAPI.cudaMemcpyAsync(
      dst_buffer,
      staging,
      size,
      CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE,
      raw_stream
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged H2D copy")
  end

  # Synchronous copy (blocking cudaMemcpy in both directions)
  # @param dst_buffer [FFI::Pointer] Destination
  # @param src_buffer [FFI::Pointer] Source
  # @param size [Integer] Size in bytes
  # @return [void]
  # @raise [TransportError] when the transport is not initialized
  def copy_sync(dst_buffer, src_buffer, size)
    ensure_initialized!

    staging = get_staging_buffer(size)

    # GPU_src -> Host
    activate_device(@src_device, "Host-staged D2H set device")
    status = CUDA::RuntimeAPI.cudaMemcpy(
      staging,
      src_buffer,
      size,
      CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged D2H sync")

    # Host -> GPU_dst
    activate_device(@dst_device, "Host-staged H2D set device")
    status = CUDA::RuntimeAPI.cudaMemcpy(
      dst_buffer,
      staging,
      size,
      CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged H2D sync")
  end

  # Async send (GPU to host staging)
  # @param buffer [FFI::Pointer] Source GPU buffer
  # @param size [Integer] Size in bytes
  # @param stream [CUDA::Stream, FFI::Pointer, nil] CUDA stream
  # @return [FFI::Pointer] Staging buffer the data is being copied into
  # @raise [TransportError] when the transport is not initialized
  def send_async(buffer, size, stream)
    ensure_initialized!

    raw_stream = stream_ptr(stream)
    staging = get_staging_buffer(size)

    activate_device(@src_device, "Host-staged send set device")
    status = CUDA::RuntimeAPI.cudaMemcpyAsync(
      staging,
      buffer,
      size,
      CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_HOST,
      raw_stream
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged send")

    staging
  end

  # Async receive (host staging to GPU).
  #
  # NOTE: this takes an extra +staging+ argument compared with
  # Base#recv_async, so the inherited Base#recv_sync (which passes three
  # arguments) cannot be used with this transport.
  # @param buffer [FFI::Pointer] Destination GPU buffer
  # @param staging [FFI::Pointer] Host staging buffer with data
  # @param size [Integer] Size in bytes
  # @param stream [CUDA::Stream, FFI::Pointer, nil] CUDA stream
  # @return [void]
  # @raise [TransportError] when the transport is not initialized
  def recv_async(buffer, staging, size, stream)
    ensure_initialized!

    raw_stream = stream_ptr(stream)

    activate_device(@dst_device, "Host-staged recv set device")
    status = CUDA::RuntimeAPI.cudaMemcpyAsync(
      buffer,
      staging,
      size,
      CUDA::RuntimeAPI::MEMCPY_HOST_TO_DEVICE,
      raw_stream
    )
    CUDA::RuntimeAPI.check_status!(status, "Host-staged recv")
  end

  # Clean up staging buffers. Safe to call before #initialize!.
  # @return [void]
  def destroy!
    buffers = @staging_buffers || {}
    buffers.each_value { |buf| release_staging_buffer(buf) }
    buffers.clear
    super
  end

  private

  # @return [void]
  # @raise [TransportError] when #initialize! has not been called
  def ensure_initialized!
    raise TransportError, "Host-staged transport not initialized" unless @initialized
  end

  # Make +device+ current and fail loudly if that is not possible, so a
  # copy is never issued against the wrong device.
  # @param device [Integer] Device ID
  # @param context [String] Label used in the error message
  # @return [void]
  def activate_device(device, context)
    status = CUDA::RuntimeAPI.cudaSetDevice(device)
    CUDA::RuntimeAPI.check_status!(status, context)
  end

  # Get or allocate a pinned host staging buffer
  # @param size [Integer] Required size
  # @return [FFI::Pointer] Pinned host buffer
  def get_staging_buffer(size)
    # Round up to nearest power of 2 for reuse efficiency
    rounded_size = next_power_of_2(size)
    @staging_buffers[rounded_size] ||= allocate_pinned(rounded_size)
  end

  # Allocate pinned host memory of exactly +bytes+ bytes.
  # @param bytes [Integer] Allocation size
  # @return [FFI::Pointer] Pinned host buffer
  def allocate_pinned(bytes)
    ptr_ptr = FFI::MemoryPointer.new(:pointer)
    status = CUDA::RuntimeAPI.cudaHostAlloc(ptr_ptr, bytes, HOST_ALLOC_PORTABLE)
    CUDA::RuntimeAPI.check_status!(status, "Alloc pinned staging buffer")
    ptr_ptr.read_pointer
  end

  # Best-effort release of one pinned buffer; cleanup errors are ignored
  # so the remaining buffers still get freed.
  # @param buf [FFI::Pointer] Buffer returned by cudaHostAlloc
  # @return [void]
  def release_staging_buffer(buf)
    CUDA::RuntimeAPI.cudaFreeHost(buf)
  rescue StandardError
    nil
  end

  # Round up to next power of 2 (1 for non-positive input)
  # @param n [Integer] Value to round
  # @return [Integer] Smallest power of two >= n
  def next_power_of_2(n)
    return 1 if n <= 0

    1 << (n - 1).bit_length
  end

  # Wait for +stream+ (or the whole current device for the NULL stream).
  # @param stream [FFI::Pointer] Raw stream pointer
  # @return [void]
  def sync_stream(stream)
    status =
      if stream.null?
        CUDA::RuntimeAPI.cudaDeviceSynchronize
      else
        CUDA::RuntimeAPI.cudaStreamSynchronize(stream)
      end
    CUDA::RuntimeAPI.check_status!(status, "Host-staged stream sync")
  end
end
215
+ end
216
+ end
217
+ end
@@ -0,0 +1,187 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+ require_relative "../p2p_bindings"
5
+
6
+ module Ignis
7
+ module Collective
8
+ module Transport
9
# CUDA IPC transport for inter-process GPU memory sharing
# Uses cudaIpcGetMemHandle/cudaIpcOpenMemHandle for zero-copy cross-process access
# Supports both legacy IPC and cuMem VMM API
#
# "Sending" here means exporting a handle for a buffer; the handle bytes
# must be delivered to the peer by the caller, who then maps the memory
# with #recv_with_handle / #import_handle.
class IPCTransport < Base
  # @return [Symbol] Transport type identifier
  def self.transport_type
    :cuda_ipc
  end

  # IPC is always available on Windows for same-node communication
  # @param src [Integer] Source GPU
  # @param dst [Integer] Destination GPU
  # @return [Boolean] True (always available)
  def self.available?(src, dst)
    true
  end

  # Estimated bandwidth (host memory throughput for handle exchange)
  BANDWIDTH_GBS = 25.0

  # Estimated latency (IPC overhead)
  LATENCY_US = 10.0

  # Handle exchange methods
  EXCHANGE_METHODS = %i[named_pipe shared_memory socket].freeze

  # @return [P2PBindings::CudaIpcMemHandle, nil] Most recently exported
  #   handle, or nil when nothing has been exported yet
  attr_reader :exported_handle

  # @return [Symbol] Handle exchange method
  attr_reader :exchange_method

  # @param src_device [Integer] Source GPU
  # @param dst_device [Integer] Destination GPU
  # @param exchange_method [Symbol] Method for handle exchange
  def initialize(src_device:, dst_device:, exchange_method: :named_pipe)
    super(src_device: src_device, dst_device: dst_device)
    @exchange_method = exchange_method
    @exported_handle = nil
    @exported_handles = {} # device pointer address -> handle bytes
    @imported_handles = {} # handle bytes -> mapped pointer
  end

  # Initialize IPC transport. Idempotent while initialized.
  # @return [void]
  def initialize!
    return if @initialized

    P2PBindings.ensure_loaded!
    CUDA::RuntimeAPI.ensure_loaded!

    @initialized = true
  end

  # Export GPU memory for sharing with another process
  # @param device_ptr [FFI::Pointer] GPU memory pointer to export
  # @return [String] Binary handle (64 bytes) for transfer to other process
  # @raise [TransportError] when the transport is not initialized
  def export_handle(device_ptr)
    ensure_initialized!

    # Set source device context
    status = CUDA::RuntimeAPI.cudaSetDevice(@src_device)
    CUDA::RuntimeAPI.check_status!(status, "Set device for IPC export")

    handle = P2PBindings::CudaIpcMemHandle.new
    status = P2PBindings.cudaIpcGetMemHandle(handle, device_ptr)
    P2PBindings.check_status!(status, "Get IPC memory handle")

    # Remember what was exported: the raw struct backs #exported_handle,
    # the serialized bytes are cached per device pointer.
    handle_bytes = handle[:reserved].to_a.pack("C*")
    @exported_handle = handle
    @exported_handles[device_ptr.address] = handle_bytes

    handle_bytes
  end

  # Import GPU memory handle from another process
  # @param handle_bytes [String] 64-byte binary handle
  # @param flags [Integer] IPC memory flags
  # @return [FFI::Pointer] Mapped device pointer usable in this process
  # @raise [TransportError] when the transport is not initialized
  # @raise [ArgumentError] when handle_bytes has the wrong length
  def import_handle(handle_bytes, flags: P2PBindings::IPC_MEM_LAZY_ENABLE_PEER_ACCESS)
    ensure_initialized!

    handle = P2PBindings::CudaIpcMemHandle.new

    # A truncated handle would otherwise be zero-padded silently and an
    # oversized one would overrun the struct field, so reject both early.
    expected = handle[:reserved].to_a.length
    unless handle_bytes.bytesize == expected
      raise ArgumentError,
            "IPC handle must be #{expected} bytes, got #{handle_bytes.bytesize}"
    end

    # Set destination device context
    status = CUDA::RuntimeAPI.cudaSetDevice(@dst_device)
    CUDA::RuntimeAPI.check_status!(status, "Set device for IPC import")

    # Reconstruct handle from bytes
    handle_bytes.each_byte.with_index do |byte, i|
      handle[:reserved][i] = byte
    end

    # Open the handle
    mapped_ptr_ptr = FFI::MemoryPointer.new(:pointer)
    status = P2PBindings.cudaIpcOpenMemHandle(mapped_ptr_ptr, handle, flags)
    P2PBindings.check_status!(status, "Open IPC memory handle")

    mapped_ptr = mapped_ptr_ptr.read_pointer
    @imported_handles[handle_bytes] = mapped_ptr

    mapped_ptr
  end

  # Close an imported handle and forget its mapping
  # @param mapped_ptr [FFI::Pointer] Previously imported pointer
  # @return [void]
  # @raise [TransportError] when the transport is not initialized
  def close_imported_handle(mapped_ptr)
    ensure_initialized!

    status = P2PBindings.cudaIpcCloseMemHandle(mapped_ptr)
    P2PBindings.check_status!(status, "Close IPC memory handle")

    @imported_handles.delete_if { |_, ptr| ptr == mapped_ptr }
  end

  # Send data using IPC (export phase)
  # In IPC, send means making buffer available to receiver
  # @param buffer [FFI::Pointer] Buffer to share
  # @param size [Integer] Buffer size (currently unused)
  # @param stream [Object] CUDA stream (currently unused)
  # @return [String] Handle bytes to transmit to receiver
  def send_async(buffer, size, stream)
    export_handle(buffer)
  end

  # Receive data using IPC (import phase).
  # Always raises: importing needs the sender's handle, which this
  # signature cannot carry. Use #recv_with_handle instead.
  # @param buffer [FFI::Pointer] NOT USED
  # @param size [Integer] Expected size (unused)
  # @param stream [Object] CUDA stream (unused)
  # @raise [TransportError] always
  def recv_async(buffer, size, stream)
    raise TransportError, "IPC recv requires handle - use recv_with_handle"
  end

  # Receive using provided handle
  # @param handle_bytes [String] Handle from sender
  # @return [FFI::Pointer] Mapped device pointer
  def recv_with_handle(handle_bytes)
    import_handle(handle_bytes)
  end

  # @return [Float] Bandwidth in GB/s
  def estimated_bandwidth
    BANDWIDTH_GBS
  end

  # @return [Float] Latency in microseconds
  def estimated_latency
    LATENCY_US
  end

  # Clean up all handles. Imported mappings are closed best-effort;
  # exported handles are only forgotten.
  # @return [void]
  def destroy!
    @imported_handles.each_value do |ptr|
      P2PBindings.cudaIpcCloseMemHandle(ptr)
    rescue StandardError
      # Ignore cleanup errors so the remaining handles still get closed
    end
    @imported_handles.clear
    @exported_handles.clear
    @exported_handle = nil
    super
  end

  # @return [String] Transport description
  def to_s
    "IPC[#{@src_device}→#{@dst_device}, #{@exchange_method}]"
  end

  private

  # @return [void]
  # @raise [TransportError] when #initialize! has not been called
  def ensure_initialized!
    raise TransportError, "IPC Transport not initialized" unless @initialized
  end
end
185
+ end
186
+ end
187
+ end
@@ -0,0 +1,157 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+ require_relative "../p2p_bindings"
5
+
6
+ module Ignis
7
+ module Collective
8
+ module Transport
9
# PCIe/NVLink peer-to-peer transport for direct GPU-to-GPU transfers
# Uses cudaMemcpyPeerAsync for high-bandwidth same-process communication
#
# The actual data movement is done by #copy_async, which needs both the
# source and the destination buffer; #send_async and #recv_async are
# no-ops kept for interface compatibility with Base.
class P2PTransport < Base
  # @return [Symbol] Transport type identifier
  def self.transport_type
    :pcie_p2p
  end

  # Check if P2P is available between two GPUs
  # @param src [Integer] Source GPU
  # @param dst [Integer] Destination GPU
  # @return [Boolean] True if P2P available
  def self.available?(src, dst)
    P2PBindings.ensure_loaded!

    can_access_ptr = FFI::MemoryPointer.new(:int)
    status = P2PBindings.cudaDeviceCanAccessPeer(can_access_ptr, src, dst)
    status.zero? && can_access_ptr.read_int == 1
  end

  # Estimated bandwidth based on interconnect type
  BANDWIDTH_ESTIMATES = {
    nvlink: 900.0,  # GB/s - NVLink 4.0
    pcie_p2p: 32.0  # GB/s - PCIe Gen4 x16
  }.freeze

  # Estimated latency
  LATENCY_ESTIMATES = {
    nvlink: 1.0,   # microseconds
    pcie_p2p: 5.0  # microseconds
  }.freeze

  # cudaErrorPeerAccessAlreadyEnabled
  PEER_ACCESS_ALREADY_ENABLED = 704

  # @return [Symbol] Actual interconnect type (:nvlink or :pcie_p2p)
  attr_reader :interconnect_type

  # @param src_device [Integer] Source GPU
  # @param dst_device [Integer] Destination GPU
  # @param interconnect_type [Symbol] Detected interconnect type
  def initialize(src_device:, dst_device:, interconnect_type: :pcie_p2p)
    super(src_device: src_device, dst_device: dst_device)
    @interconnect_type = interconnect_type
    @peer_access_enabled = false
  end

  # Initialize P2P transport by enabling peer access from the source
  # device to the destination device. Idempotent while initialized.
  # @return [void]
  def initialize!
    return if @initialized

    P2PBindings.ensure_loaded!
    CUDA::RuntimeAPI.ensure_loaded!

    # Set source device context
    status = CUDA::RuntimeAPI.cudaSetDevice(@src_device)
    CUDA::RuntimeAPI.check_status!(status, "Set device #{@src_device}")

    # Enable peer access to destination
    status = P2PBindings.cudaDeviceEnablePeerAccess(@dst_device, 0)

    # "Already enabled" is not a failure; anything else non-zero is.
    unless status.zero? || status == PEER_ACCESS_ALREADY_ENABLED
      P2PBindings.check_status!(status, "Enable peer access #{@src_device}→#{@dst_device}")
    end

    # Only claim ownership when this call actually enabled peer access.
    # If it was already on, something else enabled it and #destroy! must
    # not switch it off underneath that owner.
    @peer_access_enabled = status.zero?
    @initialized = true
  end

  # Send data to destination GPU asynchronously.
  # No-op for P2P: the transfer is performed by #copy_async, which has
  # access to both buffers (half-duplex ring pattern).
  # @param buffer [FFI::Pointer] Source buffer on src_device
  # @param size [Integer] Bytes to send
  # @param stream [CUDA::Stream, FFI::Pointer] CUDA stream
  # @return [nil]
  # @raise [TransportError] when the transport is not initialized
  def send_async(buffer, size, stream)
    ensure_initialized!
    nil
  end

  # Copy from the source GPU to the destination GPU asynchronously
  # @param dst_buffer [FFI::Pointer] Destination buffer on dst_device
  # @param src_buffer [FFI::Pointer] Source buffer on src_device
  # @param size [Integer] Bytes to copy
  # @param stream [CUDA::Stream, FFI::Pointer, nil] CUDA stream
  # @return [void]
  # @raise [TransportError] when the transport is not initialized
  def copy_async(dst_buffer, src_buffer, size, stream)
    ensure_initialized!

    status = P2PBindings.cudaMemcpyPeerAsync(
      dst_buffer,
      @dst_device,
      src_buffer,
      @src_device,
      size,
      stream_ptr(stream)
    )
    P2PBindings.check_status!(status, "P2P copy #{@src_device}→#{@dst_device}")
  end

  # Receive placeholder: does nothing and returns nil.
  # The source buffer is not known here, so no transfer can be issued;
  # callers move data with #copy_async instead.
  # @param buffer [FFI::Pointer] Destination buffer (unused)
  # @param size [Integer] Bytes to receive (unused)
  # @param stream [CUDA::Stream, FFI::Pointer] CUDA stream (unused)
  # @return [nil]
  def recv_async(buffer, size, stream)
    nil
  end

  # @return [Float] Bandwidth in GB/s (12.0 for unknown interconnects)
  def estimated_bandwidth
    BANDWIDTH_ESTIMATES.fetch(@interconnect_type, 12.0)
  end

  # @return [Float] Latency in microseconds (20.0 for unknown interconnects)
  def estimated_latency
    LATENCY_ESTIMATES.fetch(@interconnect_type, 20.0)
  end

  # Clean up by disabling peer access, if this transport enabled it.
  # Return codes of the CUDA calls are ignored (best-effort cleanup).
  # @return [void]
  def destroy!
    if @peer_access_enabled
      CUDA::RuntimeAPI.cudaSetDevice(@src_device)
      P2PBindings.cudaDeviceDisablePeerAccess(@dst_device)
      @peer_access_enabled = false
    end
    super
  end

  # @return [String] Transport description
  def to_s
    "P2P[#{@src_device}→#{@dst_device}, #{@interconnect_type}, #{estimated_bandwidth} GB/s]"
  end

  private

  # @return [void]
  # @raise [TransportError] when #initialize! has not been called
  def ensure_initialized!
    raise TransportError, "Transport not initialized" unless @initialized
  end
end
155
+ end
156
+ end
157
+ end