ignis-collective 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +7 -0
- data/lib/ignis-collective.rb +9 -0
- data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
- data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
- data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
- data/lib/nvruby/collective/algorithms/ring.rb +421 -0
- data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
- data/lib/nvruby/collective/algorithms/tree.rb +291 -0
- data/lib/nvruby/collective/array_ops.rb +240 -0
- data/lib/nvruby/collective/communicator.rb +633 -0
- data/lib/nvruby/collective/communicator_healer.rb +276 -0
- data/lib/nvruby/collective/device_manager.rb +216 -0
- data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
- data/lib/nvruby/collective/health_monitor.rb +333 -0
- data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
- data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
- data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
- data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
- data/lib/nvruby/collective/p2p_bindings.rb +121 -0
- data/lib/nvruby/collective/resilient_transport.rb +296 -0
- data/lib/nvruby/collective/topology.rb +347 -0
- data/lib/nvruby/collective/transport/base.rb +138 -0
- data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
- data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
- data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
- data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
- data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
- data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
- data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
- data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
- data/lib/nvruby/collective/transport_selector.rb +200 -0
- data/lib/nvruby/collective/vmm_bindings.rb +212 -0
- data/lib/nvruby/collective.rb +156 -0
- metadata +92 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "health_monitor"
|
|
4
|
+
require_relative "topology"
|
|
5
|
+
|
|
6
|
+
module Ignis
|
|
7
|
+
module Collective
|
|
8
|
+
# Dynamic GPU Optimizer
|
|
9
|
+
# Consolidates all dynamic optimization strategies for NvCCL
|
|
10
|
+
#
|
|
11
|
+
# Features:
|
|
12
|
+
# - Thermal (heatmap) optimization
|
|
13
|
+
# - Load balancing
|
|
14
|
+
# - Power management
|
|
15
|
+
# - Memory management
|
|
16
|
+
# - GPU availability/selection
|
|
17
|
+
# - Scheduling optimization
|
|
18
|
+
# - Synchronization optimization
|
|
19
|
+
#
|
|
20
|
+
# @example Create optimizer and get optimal ring order
|
|
21
|
+
# optimizer = DynamicOptimizer.new(device_ids: [0, 1, 2, 3])
|
|
22
|
+
# ring_order = optimizer.optimal_ring_order(strategy: :thermal)
|
|
23
|
+
#
|
|
24
|
+
class DynamicOptimizer
  # Strategies understood by {#optimal_ring_order}.
  STRATEGIES = %i[
    thermal
    load
    power
    memory
    availability
    scheduling
    synchronization
    balanced
  ].freeze

  # A device reporting a temperature above this value (Celsius) is excluded.
  EXCLUDE_TEMPERATURE = 90

  # A device using more than this percentage of its memory is excluded.
  EXCLUDE_MEMORY_USED_PERCENT = 95

  # Power draw above this fraction of the power limit counts as "high power".
  HIGH_POWER_RATIO = 0.9

  # A suggested chunk uses at most this fraction of the smallest free-memory reading.
  CHUNK_MEMORY_FRACTION = 0.25

  # Suggested chunk sizes are rounded down to a multiple of this many bytes.
  CHUNK_ALIGNMENT = 256

  # @return [Array<Integer>] Device IDs (frozen copy of the constructor argument)
  attr_reader :device_ids

  # @return [HealthMonitor] Health monitor instance
  attr_reader :health_monitor

  # @return [Topology::Matrix] Topology matrix
  attr_reader :topology

  # @param device_ids [Array<Integer>] GPU device IDs
  # @param health_monitor [HealthMonitor, nil] Optional health monitor; a new
  #   one is created for +device_ids+ when omitted
  def initialize(device_ids:, health_monitor: nil)
    @device_ids = device_ids.dup.freeze
    # HealthMonitor#initialize takes the ID list positionally. Passing it as a
    # keyword would hand the monitor a Hash instead of the list of IDs.
    @health_monitor = health_monitor || HealthMonitor.new(device_ids)
    @topology = Topology::Matrix.new(device_ids)
    @metrics_cache = {}
    @cache_ttl_seconds = 1.0
    @last_refresh_at = nil # monotonic timestamp of the last refresh; nil = never
  end

  # Get optimal ring order based on strategy.
  #
  # @param strategy [Symbol] One of {STRATEGIES}; any other value falls back
  #   to the configured device order
  # @return [Array<Integer>] Ordered device IDs
  def optimal_ring_order(strategy: :balanced)
    refresh_metrics_if_needed!

    case strategy
    when :thermal         then thermal_optimized_order
    when :load            then load_balanced_order
    when :power           then power_optimized_order
    when :memory          then memory_optimized_order
    when :availability    then availability_optimized_order
    when :scheduling      then scheduling_optimized_order
    when :synchronization then synchronization_optimized_order
    when :balanced        then balanced_optimized_order
    else @device_ids.dup
    end
  end

  # Get current GPU metrics for all devices.
  #
  # @return [Hash<Integer, Hash>] Metrics per device. Both the outer hash and
  #   each per-device hash are copies, so callers cannot alter the cache.
  def current_metrics
    refresh_metrics_if_needed!
    @metrics_cache.transform_values(&:dup)
  end

  # Check if a GPU should be excluded from collective.
  #
  # Metrics are refreshed first (subject to the cache TTL) so that a query on
  # a freshly built optimizer is not answered from an empty cache.
  #
  # @param device_id [Integer] Device ID
  # @return [Boolean] True if device should be excluded (also true for a
  #   device this optimizer has no metrics for)
  def should_exclude?(device_id)
    refresh_metrics_if_needed!

    metrics = @metrics_cache[device_id]
    return true unless metrics

    temperature = metrics[:temperature]
    memory_used = metrics[:memory_used_percent]

    return true if temperature && temperature > EXCLUDE_TEMPERATURE
    return true if memory_used && memory_used > EXCLUDE_MEMORY_USED_PERCENT

    # Only an explicit +false+ excludes; a missing health flag does not.
    metrics[:healthy] == false
  end

  # Get list of available GPUs.
  #
  # @return [Array<Integer>] Available device IDs, in configured order
  def available_devices
    refresh_metrics_if_needed!
    @device_ids.reject { |id| should_exclude?(id) }
  end

  # Suggest optimal chunk size based on memory.
  #
  # The chunk is capped at {CHUNK_MEMORY_FRACTION} of the smallest known
  # free-memory reading and rounded down to {CHUNK_ALIGNMENT} bytes. Devices
  # without a reading are ignored; when no device has one, only +total_size+
  # bounds the result. If rounding down would give zero, the unaligned size
  # is returned instead so the caller never receives a zero-byte chunk for
  # non-empty data.
  #
  # @param total_size [Integer] Total data size in bytes
  # @return [Integer] Recommended chunk size
  def suggest_chunk_size(total_size)
    refresh_metrics_if_needed!

    free_readings = @metrics_cache.each_value.map { |m| m[:memory_free] }.compact
    ceiling = if free_readings.empty?
                total_size
              else
                (free_readings.min * CHUNK_MEMORY_FRACTION).to_i
              end

    chunk = [ceiling, total_size].min
    aligned = (chunk / CHUNK_ALIGNMENT) * CHUNK_ALIGNMENT
    aligned.zero? ? chunk : aligned
  end

  # Get power-aware throttling recommendation.
  #
  # @return [Hash] +:throttle_devices+ (IDs drawing more than
  #   {HIGH_POWER_RATIO} of their limit), +:should_throttle+ and
  #   +:recommendation+ (+:reduce_batch_size+ or +:continue+)
  def power_recommendation
    refresh_metrics_if_needed!

    constrained = @metrics_cache.select do |_id, metrics|
      ratio = power_ratio(metrics)
      ratio && ratio > HIGH_POWER_RATIO
    end.keys
    throttle = constrained.any?

    {
      throttle_devices: constrained,
      should_throttle: throttle,
      recommendation: throttle ? :reduce_batch_size : :continue
    }
  end

  # @return [String]
  def to_s
    "DynamicOptimizer[#{@device_ids.size} GPUs, strategy=balanced]"
  end

  private

  # Re-collect metrics for every device unless the cache is younger than the
  # TTL. Uses the monotonic clock so wall-clock adjustments cannot stall or
  # force refreshes.
  def refresh_metrics_if_needed!
    now = monotonic_now
    return if @last_refresh_at && (now - @last_refresh_at) < @cache_ttl_seconds

    @device_ids.each do |device_id|
      @metrics_cache[device_id] = collect_device_metrics(device_id)
    end
    @last_refresh_at = monotonic_now
  end

  # @return [Float] Seconds on the monotonic clock
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # Gather memory, thermal, power and health data for one device.
  #
  # NOTE(review): this selects +device_id+ via cudaSetDevice and does not
  # restore the previously selected device afterwards - confirm callers do
  # not rely on the current device being unchanged.
  #
  # @param device_id [Integer] Device ID
  # @return [Hash] Metrics hash; on any error a stub with +healthy: false+
  #   and the error message
  def collect_device_metrics(device_id)
    CUDA::RuntimeAPI.cudaSetDevice(device_id)

    free_ptr = FFI::MemoryPointer.new(:size_t)
    total_ptr = FFI::MemoryPointer.new(:size_t)
    CUDA::RuntimeAPI.cudaMemGetInfo(free_ptr, total_ptr)

    free = free_ptr.read(:size_t)
    total = total_ptr.read(:size_t)
    used = total - free

    # A zero total would make the percentage NaN, and NaN keys make sort_by
    # raise; report the percentage as unknown instead.
    used_percent = total.positive? ? (used.to_f / total * 100).round(1) : nil

    {
      device_id: device_id,
      memory_free: free,
      memory_total: total,
      memory_used: used,
      memory_used_percent: used_percent,
      temperature: query_temperature(device_id),
      power_usage: query_power(device_id),
      power_limit: query_power_limit(device_id),
      healthy: @health_monitor.healthy?(device_id),
      timestamp: Time.now
    }
  rescue StandardError => e
    Ignis.logger.warn { "Failed to collect metrics for device #{device_id}: #{e.message}" }
    { device_id: device_id, healthy: false, error: e.message }
  end

  # @return [Object, nil] NVML temperature reading, or nil when unavailable
  def query_temperature(device_id)
    query_nvml(:get_temperature, device_id)
  end

  # @return [Object, nil] NVML power usage reading, or nil when unavailable
  def query_power(device_id)
    query_nvml(:get_power_usage, device_id)
  end

  # @return [Object, nil] NVML power limit reading, or nil when unavailable
  def query_power_limit(device_id)
    query_nvml(:get_power_limit, device_id)
  end

  # Call +reader+ on the NVML binding when it is loaded and supports it.
  # A missing binding or a raising call yields nil.
  #
  # @param reader [Symbol] NVML method name
  # @param device_id [Integer] Device ID
  # @return [Object, nil]
  def query_nvml(reader, device_id)
    return nil unless defined?(NVML) && NVML.respond_to?(reader)

    NVML.public_send(reader, device_id)
  rescue StandardError
    nil
  end

  # Ratio of power draw to power limit.
  #
  # @param metrics [Hash] Per-device metrics
  # @return [Float, nil] nil when either value is missing or the limit is
  #   not positive
  def power_ratio(metrics)
    usage = metrics[:power_usage]
    limit = metrics[:power_limit]
    return nil unless usage && limit && limit.positive?

    usage.to_f / limit
  end

  # Order devices by the key the block computes from each device's metrics.
  # The original position is the tie-breaker: sort_by alone is not stable,
  # so equal keys would otherwise come back in an unspecified order.
  #
  # @yieldparam metrics [Hash] Cached metrics for one device
  # @yieldreturn [Numeric] Sort key (lower sorts first)
  # @return [Array<Integer>]
  def order_devices_by
    @device_ids.each_with_index.sort_by do |id, position|
      [yield(@metrics_cache.fetch(id, {})), position]
    end.map(&:first)
  end

  # Thermal optimization: order by temperature (coolest first)
  def thermal_optimized_order
    order_devices_by { |metrics| metrics[:temperature] || 0 }
  end

  # Load balancing: order by memory usage (least used first)
  def load_balanced_order
    order_devices_by { |metrics| metrics[:memory_used_percent] || 0 }
  end

  # Power optimization: order by power usage (lowest first)
  def power_optimized_order
    order_devices_by { |metrics| metrics[:power_usage] || 0 }
  end

  # Memory optimization: order by free memory (most free first)
  def memory_optimized_order
    order_devices_by { |metrics| -(metrics[:memory_free] || 0) }
  end

  # Availability: filter out excluded devices, keep order
  def availability_optimized_order
    @device_ids.reject { |id| should_exclude?(id) }
  end

  # Scheduling: healthy devices first, then the rest, each in configured order
  def scheduling_optimized_order
    healthy, unhealthy = @device_ids.partition do |id|
      @metrics_cache.fetch(id, {})[:healthy]
    end
    healthy + unhealthy
  end

  # Synchronization: delegate to the topology's ring order
  def synchronization_optimized_order
    @topology.optimal_ring_order
  end

  # Balanced: equally weighted combination of all factors (lowest score first)
  def balanced_optimized_order
    order_devices_by { |metrics| balanced_score(metrics) }
  end

  # Combined score for the balanced strategy; lower is better. Unknown
  # temperature, memory usage and power ratio each count as a neutral 0.5.
  #
  # @param metrics [Hash] Per-device metrics
  # @return [Float]
  def balanced_score(metrics)
    temp_score = (metrics[:temperature] || 50) / 100.0
    mem_score = (metrics[:memory_used_percent] || 50) / 100.0
    power_score = power_ratio(metrics) || 0.5
    health_score = metrics[:healthy] ? 0.0 : 1.0

    (temp_score * 0.25) +
      (mem_score * 0.25) +
      (power_score * 0.25) +
      (health_score * 0.25)
  end
end
|
|
307
|
+
end
|
|
308
|
+
end
|
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module Collective
|
|
5
|
+
# GPU health monitoring with proactive failure detection
|
|
6
|
+
# Monitors heartbeat, memory, and thermal status for each GPU
|
|
7
|
+
#
|
|
8
|
+
# @example Usage
|
|
9
|
+
# monitor = HealthMonitor.new([0, 1, 2, 3])
|
|
10
|
+
# monitor.start!
|
|
11
|
+
#
|
|
12
|
+
# if monitor.unhealthy_devices.any?
|
|
13
|
+
# healer.heal!(monitor.unhealthy_devices)
|
|
14
|
+
# end
|
|
15
|
+
#
|
|
16
|
+
class HealthMonitor
  # Health check interval (seconds)
  HEARTBEAT_INTERVAL = 5.0

  # Minimum free memory, as a fraction of total memory
  MEMORY_THRESHOLD = 0.05

  # Critical temperature (Celsius)
  TEMP_CRITICAL = 90

  # Warning temperature (Celsius)
  TEMP_WARNING = 80

  # Consecutive failures before marking a device failed
  FAILURE_THRESHOLD = 3

  # Health status values
  STATUS_HEALTHY = :healthy
  STATUS_WARNING = :warning
  STATUS_CRITICAL = :critical
  STATUS_FAILED = :failed

  # How long {#stop!} waits for the monitor thread to finish (seconds)
  STOP_JOIN_TIMEOUT = 2.0

  # @return [Array<Integer>] Monitored device IDs
  attr_reader :device_ids

  # @return [Hash<Integer, Hash>] Device health status
  attr_reader :device_status

  # @return [Boolean] Whether monitoring is active
  attr_reader :monitoring

  # Create health monitor for specified GPUs.
  #
  # @param device_ids [Array<Integer>] GPU device IDs to monitor
  def initialize(device_ids)
    @device_ids = device_ids.dup.freeze
    @device_status = {}
    @monitoring = false
    @monitor_thread = nil
    @monitor_token = nil # identifies the monitor thread currently allowed to run
    @sleep_lock = Mutex.new
    @sleep_signal = ConditionVariable.new
    @callbacks = { on_failure: [], on_recovery: [], on_warning: [] }

    initialize_status!
  end

  # Start background monitoring. Does nothing when already monitoring.
  #
  # @return [void]
  def start!
    return if @monitoring

    # Each run gets its own token so that a thread left over from an earlier
    # run can tell it has been superseded and exit.
    token = Object.new
    @monitor_token = token
    @monitoring = true
    @monitor_thread = Thread.new { monitor_loop(token) }
  end

  # Stop background monitoring.
  #
  # The monitor thread is woken from its inter-check wait so it can exit
  # without sitting out the rest of {HEARTBEAT_INTERVAL}. May be called from
  # a callback running on the monitor thread; in that case the thread is not
  # joined (a thread cannot join itself).
  #
  # @return [void]
  def stop!
    @sleep_lock.synchronize do
      @monitoring = false
      @monitor_token = nil
      @sleep_signal.broadcast
    end

    worker = @monitor_thread
    @monitor_thread = nil
    worker.join(STOP_JOIN_TIMEOUT) if worker && !worker.equal?(Thread.current)
    nil
  end

  # Run single health check (synchronous).
  #
  # @return [Hash<Integer, Symbol>] Device status map
  def check_now!
    @device_ids.each { |id| check_device!(id) }
    status_summary
  end

  # Check if specific device is healthy.
  #
  # @param device_id [Integer] GPU device ID
  # @return [Boolean] True when the status is healthy or warning; false for
  #   critical, failed or unknown devices
  def healthy?(device_id)
    entry = @device_status[device_id]
    return false unless entry

    [STATUS_HEALTHY, STATUS_WARNING].include?(entry[:status])
  end

  # Get all healthy device IDs.
  #
  # @return [Array<Integer>] Healthy devices
  def healthy_devices
    @device_ids.select { |id| healthy?(id) }
  end

  # Get all unhealthy device IDs.
  #
  # @return [Array<Integer>] Unhealthy devices
  def unhealthy_devices
    @device_ids.reject { |id| healthy?(id) }
  end

  # Get status summary.
  #
  # @return [Hash<Integer, Symbol>] Device -> status
  def status_summary
    @device_status.transform_values { |entry| entry[:status] }
  end

  # Register callback for device failure.
  #
  # @yield [device_id, reason] Called when device fails
  # @return [void]
  def on_failure(&block)
    @callbacks[:on_failure] << block
  end

  # Register callback for device recovery.
  #
  # @yield [device_id] Called when device recovers
  # @return [void]
  def on_recovery(&block)
    @callbacks[:on_recovery] << block
  end

  # Register callback for warnings.
  #
  # @yield [device_id, warning_type, value] Called on warning
  # @return [void]
  def on_warning(&block)
    @callbacks[:on_warning] << block
  end

  # Get detailed status for a device.
  #
  # @param device_id [Integer] GPU device ID
  # @return [Hash, nil] Copy of the status details, nil for unknown devices
  def device_details(device_id)
    @device_status[device_id]&.dup
  end

  # Force device status (for testing or manual override). A forced device
  # is skipped by all later health checks. Unknown devices are ignored.
  #
  # @param device_id [Integer] GPU device ID
  # @param status [Symbol] New status
  # @return [void]
  def force_status!(device_id, status)
    entry = @device_status[device_id]
    return unless entry

    entry[:status] = status
    entry[:forced] = true
  end

  # Clean up resources: stops monitoring, then drops all status and callbacks.
  #
  # @return [void]
  def destroy!
    stop!
    @device_status.clear
    @callbacks.each_value(&:clear)
  end

  # @return [String] Human-readable summary
  def to_s
    "HealthMonitor[#{healthy_devices.size}/#{@device_ids.size} healthy]"
  end

  private

  # Seed every monitored device with a healthy, never-checked status entry.
  def initialize_status!
    @device_ids.each do |id|
      @device_status[id] = {
        status: STATUS_HEALTHY,
        consecutive_failures: 0,
        last_check: nil,
        last_healthy: Time.now,
        memory_free_pct: 1.0,
        temperature: 0,
        forced: false
      }
    end
  end

  # Body of the monitor thread: check every device, then wait up to
  # HEARTBEAT_INTERVAL. The wait is on a condition variable so stop! can cut
  # it short. A spurious wake-up only makes the next check run early.
  #
  # @param token [Object] Token handed out by the start! call that owns this run
  def monitor_loop(token = @monitor_token)
    while monitor_active?(token)
      @device_ids.each { |id| check_device!(id) }

      @sleep_lock.synchronize do
        @sleep_signal.wait(@sleep_lock, HEARTBEAT_INTERVAL) if monitor_active?(token)
      end
    end
  end

  # @return [Boolean] True while monitoring is on and +token+ is the current run
  def monitor_active?(token)
    @monitoring && @monitor_token.equal?(token)
  end

  # Probe one device and update its status entry. Devices without an entry
  # (e.g. after destroy!) and forced devices are skipped.
  def check_device!(device_id)
    entry = @device_status[device_id]
    return if entry.nil? || entry[:forced]

    begin
      # 1. Heartbeat - synchronize device
      heartbeat_ok = check_heartbeat(device_id)

      # 2. Memory check
      memory_pct = check_memory(device_id)
      memory_ok = memory_pct >= MEMORY_THRESHOLD

      # 3. Temperature check (if NVML available)
      temp = check_temperature(device_id)
      temp_ok = temp < TEMP_CRITICAL
    rescue StandardError => e
      # The check itself failed; treat it like any other failed check.
      record_failure(device_id, entry) { "Health check error: #{e.message}" }
      return
    end

    entry[:last_check] = Time.now
    entry[:memory_free_pct] = memory_pct
    entry[:temperature] = temp

    if heartbeat_ok && memory_ok && temp_ok
      record_success(device_id, entry, memory_pct, temp)
    else
      record_failure(device_id, entry) do
        determine_failure_reason(heartbeat_ok, memory_ok, temp_ok, memory_pct, temp)
      end
    end
  end

  # Apply a passing check: announce recovery if the device was failed, set
  # healthy/warning status, and reset the failure counter.
  def record_success(device_id, entry, memory_pct, temp)
    notify_recovery(device_id) if entry[:status] == STATUS_FAILED

    if temp > TEMP_WARNING
      entry[:status] = STATUS_WARNING
      notify_warning(device_id, :temperature, temp)
    elsif memory_pct < MEMORY_THRESHOLD * 2
      entry[:status] = STATUS_WARNING
      notify_warning(device_id, :memory, memory_pct)
    else
      entry[:status] = STATUS_HEALTHY
    end

    entry[:consecutive_failures] = 0
    entry[:last_healthy] = Time.now
  end

  # Apply a failing check. Below FAILURE_THRESHOLD the device becomes
  # critical; at the threshold it becomes failed. The failure notification
  # is sent only on the transition into failed, with the reason taken from
  # the block.
  #
  # @yieldreturn [String] Failure reason (evaluated only when notifying)
  def record_failure(device_id, entry)
    entry[:consecutive_failures] += 1

    if entry[:consecutive_failures] < FAILURE_THRESHOLD
      entry[:status] = STATUS_CRITICAL
      return
    end

    already_failed = entry[:status] == STATUS_FAILED
    entry[:status] = STATUS_FAILED
    notify_failure(device_id, yield) unless already_failed
  end

  # @return [Boolean] True when the device could be selected and synchronized
  def check_heartbeat(device_id)
    CUDA::RuntimeAPI.ensure_loaded!

    # Use new Fiddle-based RuntimeAPI methods
    CUDA::RuntimeAPI.set_device(device_id)
    CUDA::RuntimeAPI.device_synchronize
    true
  rescue StandardError
    false
  end

  # @return [Float] Free memory as a fraction of total; 0.0 on any error or
  #   when the reported total is not positive
  def check_memory(device_id)
    CUDA::RuntimeAPI.set_device(device_id)

    info = CUDA::RuntimeAPI.mem_get_info
    free = info[:free_bytes]
    total = info[:total_bytes]

    total.positive? ? free.to_f / total : 0.0
  rescue StandardError
    0.0
  end

  # @return [Numeric] Device temperature; 0 (treated as OK) when NVML is not
  #   loaded, returns no reading, or raises
  def check_temperature(device_id)
    return 0 unless defined?(CUDA::NVML)

    CUDA::NVML.device_temperature(device_id) || 0
  rescue StandardError
    0 # Assume OK if NVML not available
  end

  # Build a comma-separated description of which checks failed.
  #
  # @return [String]
  def determine_failure_reason(heartbeat_ok, memory_ok, temp_ok, memory_pct, temp)
    reasons = []
    reasons << "heartbeat timeout" unless heartbeat_ok
    reasons << "out of memory (#{(memory_pct * 100).round(1)}% free)" unless memory_ok
    reasons << "overheating (#{temp}°C)" unless temp_ok
    reasons.join(", ")
  end

  # Tell failure callbacks and the EventBus (if loaded) that a device failed.
  def notify_failure(device_id, reason)
    run_callbacks(:on_failure, device_id, reason)
    publish_event(:gpu_failed, gpu_id: device_id, reason: reason)
  end

  # Tell recovery callbacks and the EventBus (if loaded) that a device recovered.
  def notify_recovery(device_id)
    run_callbacks(:on_recovery, device_id)
    publish_event(:gpu_recovered, gpu_id: device_id)
  end

  # Tell warning callbacks and the EventBus (if loaded) about a warning.
  def notify_warning(device_id, warning_type, value)
    run_callbacks(:on_warning, device_id, warning_type, value)
    publish_event(:health_alert, gpu_id: device_id, metric: warning_type, value: value)
  end

  # Invoke every callback registered for +event+. A raising callback is
  # reported on stderr and skipped, so it can neither be mistaken for a
  # device failure nor terminate the monitor thread.
  def run_callbacks(event, *args)
    @callbacks[event].each do |callback|
      begin
        callback.call(*args)
      rescue StandardError => e
        warn "HealthMonitor #{event} callback raised #{e.class}: #{e.message}"
      end
    end
  end

  # Publish to EventBus for system-wide notification, when it is loaded.
  # Publish errors are reported on stderr rather than propagated.
  def publish_event(name, payload)
    return unless defined?(Ignis::Shared::EventBus)

    Ignis::Shared::EventBus.publish(name, payload: payload)
  rescue StandardError => e
    warn "HealthMonitor could not publish #{name}: #{e.class}: #{e.message}"
  end
end
|
|
332
|
+
end
|
|
333
|
+
end
|