ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,296 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "transport/base"
4
+ require_relative "transport/p2p_transport"
5
+ require_relative "transport/ipc_transport"
6
+ require_relative "transport/host_staged_transport"
7
+
8
+ module Ignis
9
+ module Collective
# Resilient transport wrapper with retry, fallback, and circuit breaker
# Inspired by RapidsMPF's three-phase protocol and error handling patterns
#
# A transfer is attempted up to MAX_RETRIES times on the active transport.
# When every attempt fails, the wrapper moves down FALLBACK_CHAIN to the next
# transport whose circuit breaker is closed and tries again from scratch.
#
# @example Usage
#   transport = ResilientTransport.new(
#     src_device: 0, dst_device: 1,
#     topology: topology_detector
#   )
#   transport.send_async(src_ptr, dst_ptr, size, stream)
#
class ResilientTransport
  # Maximum retry attempts before fallback
  MAX_RETRIES = 3

  # Delay (seconds) slept after the 1st, 2nd, ... failed attempt; the last
  # entry is reused if there are more retries than entries.
  RETRY_DELAYS = [0.1, 0.5, 1.0].freeze

  # Transport fallback chain (highest → lowest performance)
  FALLBACK_CHAIN = [:p2p, :ipc, :host_staged].freeze

  # Circuit breaker threshold (failures before marking unhealthy)
  CIRCUIT_BREAKER_THRESHOLD = 3

  # Circuit breaker reset time (seconds)
  CIRCUIT_BREAKER_RESET = 60.0

  # CUDA error codes that indicate transport failure
  RECOVERABLE_ERRORS = [
    702, # CUDA_ERROR_LAUNCH_TIMEOUT
    705, # CUDA_ERROR_PEER_ACCESS_NOT_ENABLED (716 is the misaligned-address error)
    999, # CUDA_ERROR_UNKNOWN
  ].freeze

  # @return [Integer] Source GPU device ID
  attr_reader :src_device

  # @return [Integer] Destination GPU device ID
  attr_reader :dst_device

  # @return [Symbol, nil] Type of the active transport; nil until one is active
  attr_reader :current_transport_type

  # @return [Transport::Base, nil] Active transport; nil until one is active
  attr_reader :active_transport

  # @return [Hash] Per-transport-type failure bookkeeping
  #   (:failures, :last_failure and, after a failure, :last_error)
  attr_reader :health_status

  # Create resilient transport wrapper. No transport is created here; that
  # happens on #initialize! or lazily on the first #send_async.
  #
  # @param src_device [Integer] Source GPU
  # @param dst_device [Integer] Destination GPU
  # @param topology [Topology::Detector] Topology for path detection
  # @param preferred_transport [Symbol, nil] Force specific transport
  def initialize(src_device:, dst_device:, topology:, preferred_transport: nil)
    @src_device = src_device
    @dst_device = dst_device
    @topology = topology
    @preferred_transport = preferred_transport

    @transports = {}
    @health_status = Hash.new { |h, k| h[k] = { failures: 0, last_failure: nil } }
    @current_transport_type = nil
    @active_transport = nil
    @initialized = false
  end

  # Select and initialize the first transport. Does nothing when already
  # initialized.
  # @return [void]
  def initialize!
    return if @initialized

    select_initial_transport!
    @initialized = true
  end

  # Send data with retry and fallback
  #
  # @param src_ptr [FFI::Pointer] Source buffer
  # @param dst_ptr [FFI::Pointer] Destination buffer
  # @param size [Integer] Bytes to transfer
  # @param stream [FFI::Pointer, nil] CUDA stream
  # @return [Object] Whatever the underlying transport call returns
  # @raise [TransportError] When the active transport and every remaining
  #   fallback have failed
  def send_async(src_ptr, dst_ptr, size, stream = nil)
    ensure_initialized!

    last_error = nil

    MAX_RETRIES.times do |attempt|
      begin
        result = dispatch_transfer(src_ptr, dst_ptr, size, stream)
        reset_circuit_breaker!(@current_transport_type)
        return result
      rescue StandardError => e
        last_error = e
        record_failure!(@current_transport_type, e)

        # Back off before the next attempt, but not after the final one.
        if attempt + 1 < MAX_RETRIES
          sleep(RETRY_DELAYS[[attempt, RETRY_DELAYS.size - 1].min])
        end
      end
    end

    # All retries failed, try fallback. The recursion is bounded because each
    # successful fallback moves strictly further down FALLBACK_CHAIN.
    if try_fallback!
      send_async(src_ptr, dst_ptr, size, stream)
    else
      raise TransportError, "All transports failed: #{last_error&.message}"
    end
  end

  # Synchronize transfer completion
  #
  # @param _stream [FFI::Pointer, nil] Accepted for interface compatibility; unused
  # @return [void]
  def synchronize(_stream = nil)
    # Transports define synchronize! (no args), not synchronize(stream).
    @active_transport&.synchronize!
  end

  # Check if transport is healthy
  # @return [Boolean] True if the active transport is ready and its circuit
  #   breaker is closed
  def healthy?
    return false unless @active_transport&.ready?

    !circuit_open?(@current_transport_type)
  end

  # Check if any transport is available
  # @return [Boolean] True if initialized and the active transport is ready
  def ready?
    return false unless @initialized

    @active_transport&.ready? ? true : false
  end

  # Get estimated bandwidth
  # @return [Float] GB/s as reported by the active transport, 0.0 if none
  def estimated_bandwidth
    @active_transport&.estimated_bandwidth || 0.0
  end

  # Force fallback to next transport in chain
  # @return [Boolean] True if fallback succeeded
  def force_fallback!
    try_fallback!
  end

  # Reset all circuit breakers
  # @return [void]
  def reset_health!
    @health_status.clear
  end

  # Clean up resources. The wrapper returns to its uninitialized state and
  # can be initialized again afterwards.
  # @return [void]
  def destroy!
    @transports.each_value(&:destroy!)
    @transports.clear
    @active_transport = nil
    @current_transport_type = nil
    @initialized = false
  end

  # @return [String] Human-readable description
  def to_s
    status = healthy? ? "healthy" : "degraded"
    "ResilientTransport[#{@src_device}→#{@dst_device}]: " \
      "#{@current_transport_type} (#{status})"
  end

  private

  def ensure_initialized!
    initialize! unless @initialized
  end

  # Issue one transfer on the active transport.
  # Transports expose copy_async(dst, src, size, stream) (P2P) or
  # send_async(buffer, size, stream) (base) — NOT a 4-arg send_async,
  # which raised ArgumentError on every attempt before.
  def dispatch_transfer(src_ptr, dst_ptr, size, stream)
    if @active_transport.respond_to?(:copy_async)
      @active_transport.copy_async(dst_ptr, src_ptr, size, stream)
    else
      @active_transport.send_async(src_ptr, size, stream)
    end
  end

  # Pick the first transport: the preferred one when given and its circuit
  # is closed, otherwise whatever the topology suggests.
  def select_initial_transport!
    type = if @preferred_transport && !circuit_open?(@preferred_transport)
             @preferred_transport
           else
             topology_transport_type
           end

    activate_transport!(type)
  end

  # Map the detected path between the two devices to a transport type.
  def topology_transport_type
    path = @topology.matrix.path(@src_device, @dst_device)

    if path&.nvlink? || path&.pcie_p2p?
      :p2p
    elsif path&.p2p_supported
      :ipc
    else
      :host_staged
    end
  end

  # Create (or reuse) and initialize the transport for +type+, then make it
  # the active one. State is only committed after initialize! returns, so a
  # failure leaves the previously active transport and type untouched.
  def activate_transport!(type)
    transport = create_transport(type)
    transport.initialize!

    @current_transport_type = type
    @active_transport = transport
  end

  # Build the transport for +type+, memoized per type in @transports.
  def create_transport(type)
    @transports[type] ||= case type
                          when :p2p
                            Transport::P2PTransport.new(
                              src_device: @src_device,
                              dst_device: @dst_device,
                              interconnect_type: detect_interconnect_type
                            )
                          when :ipc
                            Transport::IPCTransport.new(
                              src_device: @src_device,
                              dst_device: @dst_device
                            )
                          when :host_staged
                            Transport::HostStagedTransport.new(
                              src_device: @src_device,
                              dst_device: @dst_device
                            )
                          else
                            raise ArgumentError, "Unknown transport: #{type}"
                          end
  end

  def detect_interconnect_type
    path = @topology.matrix.path(@src_device, @dst_device)
    path&.interconnect_type || :pcie_p2p
  end

  # Walk the remainder of FALLBACK_CHAIN after the current transport type and
  # activate the first candidate that initializes successfully. A current
  # type that is not in the chain starts the walk from the beginning.
  #
  # @return [Boolean] True if a fallback transport is now active
  def try_fallback!
    current_idx = FALLBACK_CHAIN.index(@current_transport_type) || -1

    FALLBACK_CHAIN[(current_idx + 1)..].each do |transport_type|
      next if circuit_open?(transport_type)

      begin
        activate_transport!(transport_type)
        return true
      rescue StandardError => e
        record_failure!(transport_type, e)
      end
    end

    false
  end

  def record_failure!(transport_type, error)
    status = @health_status[transport_type]
    status[:failures] += 1
    status[:last_failure] = Time.now
    status[:last_error] = error.message
  end

  def reset_circuit_breaker!(transport_type)
    @health_status[transport_type] = { failures: 0, last_failure: nil }
  end

  # A circuit is open once the failure count has reached the threshold and
  # the most recent failure is younger than CIRCUIT_BREAKER_RESET seconds.
  # Once that time has elapsed the breaker is reset as a side effect.
  def circuit_open?(transport_type)
    status = @health_status[transport_type]
    return false if status[:failures] < CIRCUIT_BREAKER_THRESHOLD
    return false if status[:last_failure].nil?

    # Check if reset time has passed
    elapsed = Time.now - status[:last_failure]
    if elapsed >= CIRCUIT_BREAKER_RESET
      reset_circuit_breaker!(transport_type)
      return false
    end

    true
  end
end
290
+
291
+ # NOTE: TransportError is defined once in collective.rb as `< Error`.
292
+ # It used to be redefined here as `< StandardError`, which caused a
293
+ # "superclass mismatch" TypeError when collective.rb was required (it loads
294
+ # this file before reopening the class), blocking the entire NvCCL layer.
295
+ end
296
+ end
@@ -0,0 +1,347 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "p2p_bindings"
4
+
5
+ module Ignis
6
+ module Collective
7
+ # GPU topology detection and interconnect analysis
8
+ # Detects NVLink, PCIe P2P, and shared memory paths between GPUs
9
+ module Topology
10
+ # Interconnect types ranked by performance
11
+ INTERCONNECT_TYPES = {
12
+ nvlink: { bandwidth_gbps: 900, latency_us: 1 },
13
+ pcie_p2p: { bandwidth_gbps: 32, latency_us: 5 },
14
+ host_staged: { bandwidth_gbps: 12, latency_us: 20 },
15
+ none: { bandwidth_gbps: 0, latency_us: Float::INFINITY }
16
+ }.freeze
17
+
18
+ # Represents a connection path between two GPUs
19
+ class Path
20
+ # @return [Integer] Source GPU device ID
21
+ attr_reader :src_device
22
+
23
+ # @return [Integer] Destination GPU device ID
24
+ attr_reader :dst_device
25
+
26
+ # @return [Symbol] Interconnect type (:nvlink, :pcie_p2p, :host_staged, :none)
27
+ attr_reader :interconnect_type
28
+
29
+ # @return [Integer] Performance rank (0 = best)
30
+ attr_reader :performance_rank
31
+
32
+ # @return [Boolean] Whether P2P access is supported
33
+ attr_reader :p2p_supported
34
+
35
+ # @return [Boolean] Whether native atomics are supported
36
+ attr_reader :native_atomics
37
+
38
+ # @param src_device [Integer] Source GPU device ID
39
+ # @param dst_device [Integer] Destination GPU device ID
40
+ # @param interconnect_type [Symbol] Detected interconnect type
41
+ # @param performance_rank [Integer] Performance rank from CUDA
42
+ # @param p2p_supported [Boolean] P2P support status
43
+ # @param native_atomics [Boolean] Native atomic support
44
+ def initialize(src_device:, dst_device:, interconnect_type:,
45
+ performance_rank:, p2p_supported:, native_atomics: false)
46
+ @src_device = src_device
47
+ @dst_device = dst_device
48
+ @interconnect_type = interconnect_type
49
+ @performance_rank = performance_rank
50
+ @p2p_supported = p2p_supported
51
+ @native_atomics = native_atomics
52
+ end
53
+
54
+ # Estimated bandwidth in GB/s
55
+ # @return [Float] Bandwidth estimate
56
+ def estimated_bandwidth
57
+ INTERCONNECT_TYPES.dig(@interconnect_type, :bandwidth_gbps) || 0
58
+ end
59
+
60
+ # Estimated latency in microseconds
61
+ # @return [Float] Latency estimate
62
+ def estimated_latency
63
+ INTERCONNECT_TYPES.dig(@interconnect_type, :latency_us) || Float::INFINITY
64
+ end
65
+
66
+ # @return [Boolean] Whether direct P2P is possible
67
+ def direct_access?
68
+ @p2p_supported && [:nvlink, :pcie_p2p].include?(@interconnect_type)
69
+ end
70
+
71
+ # @return [Boolean] Whether this path uses NVLink
72
+ def nvlink?
73
+ @interconnect_type == :nvlink
74
+ end
75
+
76
+ # @return [Boolean] Whether this path uses PCIe P2P
77
+ def pcie_p2p?
78
+ @interconnect_type == :pcie_p2p && @p2p_supported
79
+ end
80
+
81
+ # Alias for estimated_bandwidth for test compatibility
82
+ # @return [Float] Bandwidth in GB/s
83
+ def bandwidth_gbps
84
+ estimated_bandwidth
85
+ end
86
+
87
+ # @return [String] Human-readable description
88
+ def to_s
89
+ "Path[#{@src_device}→#{@dst_device}]: #{@interconnect_type} " \
90
+ "(rank=#{@performance_rank}, p2p=#{@p2p_supported})"
91
+ end
92
+ end
93
+
94
+ # Topology matrix for a set of GPUs
95
+ class Matrix
96
+ # @return [Array<Integer>] List of GPU device IDs
97
+ attr_reader :device_ids
98
+
99
+ # @return [Hash<Array<Integer>, Path>] Map of [src, dst] to Path
100
+ attr_reader :paths
101
+
102
+ # @param device_ids [Array<Integer>] GPU device IDs to analyze
103
+ def initialize(device_ids)
104
+ @device_ids = device_ids.dup.freeze
105
+ @paths = {}
106
+ build_matrix!
107
+ end
108
+
109
+ # Get path between two GPUs
110
+ # @param src [Integer] Source GPU
111
+ # @param dst [Integer] Destination GPU
112
+ # @return [Path, nil] Path object or nil if same device
113
+ def path(src, dst)
114
+ return nil if src == dst
115
+
116
+ @paths[[src, dst]]
117
+ end
118
+
119
+ # Get optimal ring order based on topology
120
+ # Minimizes total latency by placing NVLink-connected GPUs adjacent
121
+ # @return [Array<Integer>] Ordered device IDs for ring algorithm
122
+ def optimal_ring_order
123
+ return @device_ids.dup if @device_ids.size <= 2
124
+
125
+ # Greedy nearest-neighbor heuristic
126
+ remaining = @device_ids.dup
127
+ order = [remaining.shift]
128
+
129
+ until remaining.empty?
130
+ current = order.last
131
+ # Find GPU with best connection to current
132
+ best_next = remaining.min_by do |gpu|
133
+ path_obj = path(current, gpu)
134
+ path_obj ? path_obj.performance_rank : Float::INFINITY
135
+ end
136
+ order << best_next
137
+ remaining.delete(best_next)
138
+ end
139
+
140
+ order
141
+ end
142
+
143
+ # Get all paths with NVLink connectivity
144
+ # @return [Array<Path>] Paths with NVLink
145
+ def nvlink_paths
146
+ @paths.values.select { |p| p.interconnect_type == :nvlink }
147
+ end
148
+
149
+ # Get all paths with P2P support
150
+ # @return [Array<Path>] Paths with P2P
151
+ def p2p_paths
152
+ @paths.values.select(&:p2p_supported)
153
+ end
154
+
155
+ # Check if all GPUs have full P2P mesh
156
+ # @return [Boolean] True if all pairs have P2P
157
+ def full_p2p_mesh?
158
+ @paths.values.all?(&:p2p_supported)
159
+ end
160
+
161
+ # @return [String] Human-readable matrix representation
162
+ def to_s
163
+ header = "Topology Matrix (#{@device_ids.size} GPUs)\n"
164
+ rows = @device_ids.map do |src|
165
+ cols = @device_ids.map do |dst|
166
+ if src == dst
167
+ " - "
168
+ else
169
+ path_obj = path(src, dst)
170
+ type_abbr = path_obj.interconnect_type.to_s[0..3].upcase
171
+ "#{type_abbr.ljust(5)}"
172
+ end
173
+ end
174
+ "GPU#{src}: #{cols.join(' | ')}"
175
+ end
176
+ header + rows.join("\n")
177
+ end
178
+
179
+ private
180
+
181
+ def build_matrix!
182
+ P2PBindings.ensure_loaded!
183
+
184
+ @device_ids.each do |src|
185
+ @device_ids.each do |dst|
186
+ next if src == dst
187
+
188
+ @paths[[src, dst]] = detect_path(src, dst)
189
+ end
190
+ end
191
+ end
192
+
193
+ # Detect interconnect between two GPUs
194
+ # @param src [Integer] Source GPU
195
+ # @param dst [Integer] Destination GPU
196
+ # @return [Path] Detected path
197
+ def detect_path(src, dst)
198
+ # Check P2P accessibility
199
+ can_access_ptr = FFI::MemoryPointer.new(:int)
200
+ status = P2PBindings.cudaDeviceCanAccessPeer(can_access_ptr, src, dst)
201
+ P2PBindings.check_status!(status, "Check P2P access #{src}→#{dst}")
202
+ p2p_supported = can_access_ptr.read_int == 1
203
+
204
+ # Get performance rank (0 = NVLink, higher = PCIe)
205
+ performance_rank = 99
206
+ interconnect_type = :host_staged
207
+
208
+ if p2p_supported
209
+ perf_ptr = FFI::MemoryPointer.new(:int)
210
+ status = P2PBindings.cudaDeviceGetP2PAttribute(
211
+ perf_ptr,
212
+ P2PBindings::P2P_ATTR_PERFORMANCE_RANK,
213
+ src,
214
+ dst
215
+ )
216
+
217
+ if status.zero?
218
+ performance_rank = perf_ptr.read_int
219
+ # Performance rank 0 indicates NVLink (highest performance)
220
+ interconnect_type = if performance_rank.zero?
221
+ :nvlink
222
+ else
223
+ :pcie_p2p
224
+ end
225
+ else
226
+ # P2P supported but can't get rank - assume PCIe
227
+ interconnect_type = :pcie_p2p
228
+ performance_rank = 1
229
+ end
230
+ end
231
+
232
+ # Check native atomic support
233
+ native_atomics = false
234
+ if p2p_supported
235
+ atomic_ptr = FFI::MemoryPointer.new(:int)
236
+ status = P2PBindings.cudaDeviceGetP2PAttribute(
237
+ atomic_ptr,
238
+ P2PBindings::P2P_ATTR_NATIVE_ATOMIC_SUPPORTED,
239
+ src,
240
+ dst
241
+ )
242
+ native_atomics = status.zero? && atomic_ptr.read_int == 1
243
+ end
244
+
245
+ Path.new(
246
+ src_device: src,
247
+ dst_device: dst,
248
+ interconnect_type: interconnect_type,
249
+ performance_rank: performance_rank,
250
+ p2p_supported: p2p_supported,
251
+ native_atomics: native_atomics
252
+ )
253
+ end
254
+ end
255
+
256
+ # GPU topology detector - main entry point
257
+ class Detector
258
+ # @return [Matrix] Current topology matrix
259
+ attr_reader :matrix
260
+
261
+ # Detect topology for specified GPUs
262
+ # @param device_ids [Array<Integer>, nil] GPU IDs or nil for all GPUs
263
+ def initialize(device_ids: nil)
264
+ @device_ids = device_ids || all_device_ids
265
+ @matrix = Matrix.new(@device_ids)
266
+ end
267
+
268
+ # @return [Array<Integer>] All visible GPU device IDs
269
+ def all_device_ids
270
+ CUDA::Device.list.map(&:index)
271
+ end
272
+
273
+ # @return [Integer] Number of GPUs in this topology
274
+ def gpu_count
275
+ @device_ids.size
276
+ end
277
+
278
+ # Get interconnect type between two GPUs
279
+ # @param device_a [Integer] First GPU
280
+ # @param device_b [Integer] Second GPU
281
+ # @return [Symbol] Interconnect type
282
+ def interconnect_type(device_a, device_b)
283
+ path = @matrix.path(device_a, device_b)
284
+ path&.interconnect_type || :none
285
+ end
286
+
287
+ # Get optimal ring order for collective operations
288
+ # @return [Array<Integer>] Ordered GPU IDs
289
+ def optimal_ring_order
290
+ @matrix.optimal_ring_order
291
+ end
292
+
293
+ # Check if specific GPU pair has NVLink
294
+ # @param device_a [Integer] First GPU
295
+ # @param device_b [Integer] Second GPU
296
+ # @return [Boolean] True if NVLink connected
297
+ def nvlink_connected?(device_a, device_b)
298
+ path = @matrix.path(device_a, device_b)
299
+ path&.interconnect_type == :nvlink
300
+ end
301
+
302
+ # Check if P2P is available between GPUs
303
+ # @param device_a [Integer] First GPU
304
+ # @param device_b [Integer] Second GPU
305
+ # @return [Boolean] True if P2P available
306
+ def p2p_available?(device_a, device_b)
307
+ path = @matrix.path(device_a, device_b)
308
+ path&.p2p_supported || false
309
+ end
310
+
311
+ # Enable P2P access between all GPUs in the topology
312
+ # @return [Hash<Array<Integer>, Boolean>] Map of [src, dst] to success
313
+ def enable_all_p2p!
314
+ results = {}
315
+
316
+ @matrix.p2p_paths.each do |path|
317
+ src = path.src_device
318
+ dst = path.dst_device
319
+
320
+ # Set source device context
321
+ status = CUDA::RuntimeAPI.cudaSetDevice(src)
322
+ CUDA::RuntimeAPI.check_status!(status, "Set device #{src}")
323
+
324
+ # Enable peer access
325
+ status = P2PBindings.cudaDeviceEnablePeerAccess(dst, 0)
326
+
327
+ # Status 0 = success, 704 = already enabled
328
+ results[[src, dst]] = status.zero? || status == 704
329
+ end
330
+
331
+ results
332
+ end
333
+
334
+ # @return [String] Summary of detected topology
335
+ def to_s
336
+ nvlink_count = @matrix.nvlink_paths.size
337
+ p2p_count = @matrix.p2p_paths.size
338
+ total_pairs = @device_ids.size * (@device_ids.size - 1)
339
+
340
+ "Topology: #{@device_ids.size} GPUs, " \
341
+ "#{nvlink_count}/#{total_pairs} NVLink, " \
342
+ "#{p2p_count}/#{total_pairs} P2P"
343
+ end
344
+ end
345
+ end
346
+ end
347
+ end