ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35):
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,276 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "health_monitor"
4
+ require_relative "topology"
5
+ require_relative "transport_selector"
6
+
7
+ module Ignis
8
+ module Collective
9
# Communicator healing for dynamic reconfiguration on GPU failure.
# Inspired by Universal Checkpointing (USENIX ATC 2025) patterns.
#
# Enables recovery without full restart. A heal performs, in order:
# 1. Record the failed GPUs and exclude them from the active set
# 2. Check that enough GPUs remain
# 3. Invalidate registered CUDA Graphs
# 4. Rebuild topology and transports for the survivors
# 5. Push the new state into the communicator
#
# The communicator is duck-typed: only +gpu_ids+ is required. The optional
# hooks +transport_selector+, +update_topology!+, +update_transport_selector!+,
# +update_device_ids!+ and +update_ring_order!+ are used only when the
# communicator responds to them.
#
# @example Usage with communicator
#   healer = CommunicatorHealer.new(communicator)
#   monitor.on_failure { |gpu| healer.heal!([gpu]) }
#
class CommunicatorHealer
  # Maximum number of entries retained in {#heal_history}.
  MAX_HISTORY_SIZE = 100

  # @return [Communicator] Parent communicator
  attr_reader :communicator

  # @return [Array<Integer>] Currently active GPU IDs
  attr_reader :active_devices

  # @return [Array<Integer>] Failed GPU IDs
  attr_reader :failed_devices

  # @return [Integer] Number of recorded heal/recovery attempts
  #   (successful heals, failed heals and recoveries all count)
  attr_reader :heal_count

  # @return [Array<Hash>] Heal history, oldest first, bounded to
  #   MAX_HISTORY_SIZE entries
  attr_reader :heal_history

  # Create healer for a communicator
  #
  # @param communicator [Communicator] Parent communicator; must respond
  #   to +gpu_ids+
  def initialize(communicator)
    @communicator = communicator
    @active_devices = communicator.gpu_ids.dup
    @failed_devices = []
    @heal_count = 0
    @heal_history = []
    @callbacks = { pre_heal: [], post_heal: [] }
    @cuda_graph_cache = []
  end

  # Perform healing operation - exclude failed GPUs and rebuild
  #
  # Pre-heal callbacks run first. On any error the attempt is recorded as
  # +:failed+, post-heal callbacks are notified with +:failed+, and the
  # error is re-raised. The failed GPUs stay marked as failed in that case.
  #
  # @param failed_gpu_ids [Array<Integer>, Integer, nil] GPUs to exclude
  # @return [Boolean] True if healing succeeded (or nothing to do)
  # @raise [HealingError] If fewer than the minimum number of GPUs remain
  # @raise [StandardError] Any error raised while rebuilding is re-raised
  def heal!(failed_gpu_ids)
    failed_ids = Array(failed_gpu_ids)
    return true if failed_ids.empty?

    notify_pre_heal(failed_ids)

    begin
      # 1. Record failed devices
      @failed_devices |= failed_ids
      @active_devices -= failed_ids

      # 2. Validate we have enough GPUs left
      if @active_devices.size < minimum_gpu_count
        raise HealingError, "Too few GPUs remaining: #{@active_devices.size}"
      end

      # 3. Invalidate CUDA Graphs (they reference old topology)
      invalidate_cuda_graphs!

      # 4. Rebuild topology and transports for survivors
      rebuild_topology!
      rebuild_transports!

      # 5. Update communicator state
      update_communicator_state!

      # 6. Record success
      record_heal(failed_ids, :success)
    rescue StandardError => e
      record_heal(failed_ids, :failed, e.message)
      notify_post_heal(failed_ids, :failed)
      raise
    end

    # Success callbacks run outside the rescue above so that a raising
    # callback cannot re-record an already successful heal as :failed.
    notify_post_heal(failed_ids, :success)
    true
  end

  # Attempt to recover a failed GPU
  #
  # If rebuilding with the recovered GPU raises, the active/failed sets are
  # restored to their previous contents and false is returned.
  #
  # @param gpu_id [Integer] GPU to recover
  # @return [Boolean] True if recovery succeeded
  def recover!(gpu_id)
    return false unless @failed_devices.include?(gpu_id)

    # Test if GPU is responsive
    return false unless test_gpu_health(gpu_id)

    previous_active = @active_devices
    previous_failed = @failed_devices

    begin
      # Reintegrate (built as new arrays so the snapshots stay untouched)
      @failed_devices = previous_failed - [gpu_id]
      @active_devices = (previous_active | [gpu_id]).sort

      # Rebuild topology with recovered GPU
      rebuild_topology!
      rebuild_transports!
      update_communicator_state!
    rescue StandardError
      # Roll back so the healer never reports a GPU as active after a
      # failed recovery.
      # NOTE(review): transports destroyed by rebuild_transports! and any
      # communicator hooks that already ran are not undone here.
      @active_devices = previous_active
      @failed_devices = previous_failed
      return false
    end

    record_heal([gpu_id], :recovered)
    true
  end

  # Register CUDA Graph for invalidation on heal
  #
  # @param graph [CUDA::Graph, FFI::Pointer] Graph to track
  # @return [void]
  def register_cuda_graph(graph)
    @cuda_graph_cache << graph unless @cuda_graph_cache.include?(graph)
  end

  # Unregister CUDA Graph
  #
  # @param graph [CUDA::Graph, FFI::Pointer] Graph to untrack
  # @return [void]
  def unregister_cuda_graph(graph)
    @cuda_graph_cache.delete(graph)
  end

  # Get current world size (active GPUs)
  # @return [Integer] Number of active GPUs
  def world_size
    @active_devices.size
  end

  # Check if any GPUs have failed
  # @return [Boolean] True if degraded
  def degraded?
    @failed_devices.any?
  end

  # Get health summary
  # @return [Hash] Health statistics (device lists are copies)
  def health_summary
    {
      active_count: @active_devices.size,
      failed_count: @failed_devices.size,
      active_devices: @active_devices.dup,
      failed_devices: @failed_devices.dup,
      heal_count: @heal_count,
      degraded: degraded?
    }
  end

  # Register pre-heal callback
  # @yield [failed_gpu_ids] Called before healing
  # @raise [ArgumentError] If no block is given
  def on_pre_heal(&block)
    raise ArgumentError, "on_pre_heal requires a block" unless block

    @callbacks[:pre_heal] << block
  end

  # Register post-heal callback
  # @yield [failed_gpu_ids, status] Called after healing with +:success+
  #   or +:failed+
  # @raise [ArgumentError] If no block is given
  def on_post_heal(&block)
    raise ArgumentError, "on_post_heal requires a block" unless block

    @callbacks[:post_heal] << block
  end

  # @return [String] Human-readable status
  def to_s
    status = degraded? ? "degraded" : "healthy"
    "CommunicatorHealer[#{@active_devices.size}/#{@communicator.gpu_ids.size} active, #{status}]"
  end

  private

  # Smallest active set a heal may leave behind.
  # Need at least 1 GPU for any operation; could be made configurable.
  def minimum_gpu_count
    1
  end

  # Invalidate (or destroy) every registered graph, then forget them all.
  def invalidate_cuda_graphs!
    @cuda_graph_cache.each do |graph|
      begin
        if graph.respond_to?(:invalidate!)
          graph.invalidate!
        elsif graph.respond_to?(:destroy!)
          graph.destroy!
        end
      rescue StandardError
        # Best effort - graph may already be invalid
      end
    end
    @cuda_graph_cache.clear
  end

  # Create new topology for active devices only
  def rebuild_topology!
    @new_topology = Topology::Detector.new(device_ids: @active_devices)
  end

  # Destroy the communicator's current transports (when it exposes them)
  # and build a fresh selector for the active devices.
  def rebuild_transports!
    if @communicator.respond_to?(:transport_selector)
      @communicator.transport_selector&.destroy!
    end

    @new_transport_selector = TransportSelector.new(@active_devices)
    @new_transport_selector.initialize!
  end

  # Push rebuilt state into the communicator through whichever update
  # hooks it exposes; hooks it does not respond to are skipped.
  def update_communicator_state!
    if @communicator.respond_to?(:update_topology!)
      @communicator.update_topology!(@new_topology)
    end

    if @communicator.respond_to?(:update_transport_selector!)
      @communicator.update_transport_selector!(@new_transport_selector)
    end

    if @communicator.respond_to?(:update_device_ids!)
      @communicator.update_device_ids!(@active_devices)
    end

    # Update ring order for collective algorithms
    if @communicator.respond_to?(:update_ring_order!)
      @communicator.update_ring_order!(@new_topology.optimal_ring_order)
    end
  end

  # Probe a GPU by selecting it and synchronizing; any error means unhealthy.
  # NOTE(review): the previously current device is not restored afterwards.
  def test_gpu_health(gpu_id)
    CUDA::RuntimeAPI.ensure_loaded!
    CUDA::RuntimeAPI.set_device(gpu_id)
    CUDA::RuntimeAPI.device_synchronize
    true
  rescue StandardError
    false
  end

  # Append an entry to the bounded history and bump the attempt counter.
  def record_heal(gpu_ids, status, error = nil)
    @heal_count += 1
    @heal_history << {
      timestamp: Time.now,
      gpu_ids: gpu_ids.dup,
      status: status,
      error: error,
      active_after: @active_devices.size
    }

    # Keep history bounded
    @heal_history.shift if @heal_history.size > MAX_HISTORY_SIZE
  end

  def notify_pre_heal(gpu_ids)
    @callbacks[:pre_heal].each { |cb| cb.call(gpu_ids) }
  end

  def notify_post_heal(gpu_ids, status)
    @callbacks[:post_heal].each { |cb| cb.call(gpu_ids, status) }
  end
end

# Error during healing operation
class HealingError < StandardError; end
275
+ end
276
+ end
@@ -0,0 +1,216 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "p2p_bindings"
4
+ require_relative "topology"
5
+
6
+ module Ignis
7
+ module Collective
8
# Multi-GPU device manager
# Handles device enumeration, context management, and peer access configuration
class DeviceManager
  # Status code treated as success by the peer-access calls.
  CUDA_SUCCESS = 0

  # Status code meaning peer access was already enabled for the pair;
  # treated as success when enabling.
  CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704

  # @return [Array<Integer>] Managed GPU device IDs (frozen)
  attr_reader :device_ids

  # @return [Hash<Integer, CUDA::Device>] Device objects by ID
  attr_reader :devices

  # @return [Topology::Detector, nil] Topology detector (nil until detected)
  attr_reader :topology

  # @return [Hash<Array<Integer>, Boolean>] P2P access status keyed by
  #   [src, dst]
  attr_reader :p2p_access_enabled

  # Create device manager for specified GPUs
  # @param device_ids [Array<Integer>, nil] GPUs to manage (nil = all)
  # @raise [ArgumentError] If a requested device ID is not available
  def initialize(device_ids: nil)
    # Enumerate once and reuse for both the default set and validation.
    available_ids = all_device_ids

    @device_ids = (device_ids || available_ids).dup.freeze
    @devices = {}
    @topology = nil
    @p2p_access_enabled = {}
    @initialized = false

    validate_devices!(available_ids)
    create_device_objects!
  end

  # Initialize device manager and detect topology (no-op when already done)
  # @return [void]
  def initialize!
    return if @initialized

    detect_topology!
    @initialized = true
  end

  # Detect GPU topology
  # @return [Topology::Detector] Topology detector
  def detect_topology!
    @topology = Topology::Detector.new(device_ids: @device_ids)
  end

  # Enable P2P access between all GPU pairs where available
  #
  # Results are cached: once any pair has been recorded, later calls return
  # the recorded map without touching the devices again.
  #
  # @return [Hash<Array<Integer>, Boolean>] Map of (src, dst) to success status
  def enable_all_p2p_access!
    return @p2p_access_enabled unless @p2p_access_enabled.empty?

    detect_topology! unless @topology

    P2PBindings.ensure_loaded!
    CUDA::RuntimeAPI.ensure_loaded!

    @device_ids.each do |src|
      @device_ids.each do |dst|
        next if src == dst

        # Recorded pair by pair so pairs enabled before an error are still
        # known to disable_all_p2p_access!.
        @p2p_access_enabled[[src, dst]] = enable_peer_access(src, dst)
      end
    end

    @p2p_access_enabled
  end

  # Disable P2P access for every pair this manager recorded as enabled
  #
  # Pairs recorded as false were never enabled here and are skipped.
  #
  # @return [void]
  def disable_all_p2p_access!
    @p2p_access_enabled.each do |(src, dst), enabled|
      next unless enabled

      begin
        CUDA::RuntimeAPI.cudaSetDevice(src)
        P2PBindings.cudaDeviceDisablePeerAccess(dst)
      rescue StandardError
        # Ignore errors during cleanup
      end
    end
    @p2p_access_enabled.clear
  end

  # Get optimal ring order for collective operations
  # @return [Array<Integer>] Ordered device IDs
  def optimal_ring_order
    detect_topology! unless @topology
    @topology.optimal_ring_order
  end

  # Get device by ID
  # @param device_id [Integer] GPU device ID
  # @return [CUDA::Device, nil] Device object
  def device(device_id)
    @devices[device_id]
  end

  # Set current CUDA device
  # @param device_id [Integer] GPU to activate
  # @return [void]
  # @raise [ArgumentError] If the device is not managed
  def set_device!(device_id)
    validate_device_id!(device_id)
    @devices[device_id].set_current!
  end

  # Synchronize a device
  # @param device_id [Integer] GPU to synchronize
  # @return [void]
  # @raise [ArgumentError] If the device is not managed
  def synchronize!(device_id)
    validate_device_id!(device_id)
    @devices[device_id].synchronize
  end

  # Synchronize all managed devices
  # @return [void]
  def synchronize_all!
    @device_ids.each { |id| synchronize!(id) }
  end

  # Get number of managed GPUs
  # @return [Integer] GPU count
  def size
    @device_ids.size
  end

  # Check if fully initialized
  # @return [Boolean] True if initialize! has run and a topology is present
  def ready?
    @initialized && !@topology.nil?
  end

  # Get P2P capability summary
  # @return [Hash] P2P statistics (empty when no topology was detected)
  def p2p_summary
    return {} unless @topology

    matrix = @topology.matrix
    {
      gpu_count: @device_ids.size,
      total_paths: @device_ids.size * (@device_ids.size - 1),
      p2p_enabled: @p2p_access_enabled.count { |_, v| v },
      nvlink_paths: matrix.nvlink_paths.size,
      full_mesh: matrix.full_p2p_mesh?
    }
  end

  # Clean up resources
  # @return [void]
  def destroy!
    disable_all_p2p_access!
    @devices.clear
    @topology = nil
    @initialized = false
  end

  # @return [String] Human-readable summary (names cut to 16 characters)
  def to_s
    names = @devices.values.map { |d| "#{d.index}:#{d.name[0..15]}" }
    "DeviceManager[#{names.join(', ')}]"
  end

  private

  # Get all available GPU IDs
  # @return [Array<Integer>] All device IDs
  def all_device_ids
    CUDA::Device.list.map(&:index)
  end

  # Validate requested device IDs exist
  # @param available_ids [Array<Integer>] IDs reported by the runtime
  # @return [void]
  # @raise [ArgumentError] If invalid device ID
  def validate_devices!(available_ids = all_device_ids)
    invalid_id = @device_ids.find { |id| !available_ids.include?(id) }
    return if invalid_id.nil?

    raise ArgumentError, "Invalid device ID #{invalid_id}. Available: #{available_ids}"
  end

  # Validate single device ID
  # @param device_id [Integer] GPU ID
  # @return [void]
  # @raise [ArgumentError] If invalid
  def validate_device_id!(device_id)
    return if @device_ids.include?(device_id)

    raise ArgumentError, "Device #{device_id} not managed by this DeviceManager"
  end

  # Create CUDA::Device objects for all managed GPUs
  # @return [void]
  def create_device_objects!
    @device_ids.each do |id|
      @devices[id] = CUDA::Device.new(id)
    end
  end

  # Try to enable peer access from src to dst.
  # @return [Boolean] False when the topology reports no P2P path or the
  #   enable call returns an unexpected status
  def enable_peer_access(src, dst)
    # Check if P2P is possible
    return false unless @topology.p2p_available?(src, dst)

    # Set source device context
    status = CUDA::RuntimeAPI.cudaSetDevice(src)
    CUDA::RuntimeAPI.check_status!(status, "Set device #{src}")

    # Enable peer access
    status = P2PBindings.cudaDeviceEnablePeerAccess(dst, 0)
    status == CUDA_SUCCESS || status == CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
  end
end
215
+ end
216
+ end