ignis-collective 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +7 -0
  3. data/lib/ignis-collective.rb +9 -0
  4. data/lib/nvruby/collective/algorithms/double_binary_tree.rb +364 -0
  5. data/lib/nvruby/collective/algorithms/pipeliner.rb +222 -0
  6. data/lib/nvruby/collective/algorithms/reduction_ops.rb +168 -0
  7. data/lib/nvruby/collective/algorithms/ring.rb +421 -0
  8. data/lib/nvruby/collective/algorithms/topology_router.rb +284 -0
  9. data/lib/nvruby/collective/algorithms/tree.rb +291 -0
  10. data/lib/nvruby/collective/array_ops.rb +240 -0
  11. data/lib/nvruby/collective/communicator.rb +633 -0
  12. data/lib/nvruby/collective/communicator_healer.rb +276 -0
  13. data/lib/nvruby/collective/device_manager.rb +216 -0
  14. data/lib/nvruby/collective/dynamic_optimizer.rb +308 -0
  15. data/lib/nvruby/collective/health_monitor.rb +333 -0
  16. data/lib/nvruby/collective/net/nd_adapter.rb +450 -0
  17. data/lib/nvruby/collective/net/nd_bindings.rb +166 -0
  18. data/lib/nvruby/collective/net/rdma_transport.rb +366 -0
  19. data/lib/nvruby/collective/nvarray_adapter.rb +230 -0
  20. data/lib/nvruby/collective/p2p_bindings.rb +121 -0
  21. data/lib/nvruby/collective/resilient_transport.rb +296 -0
  22. data/lib/nvruby/collective/topology.rb +347 -0
  23. data/lib/nvruby/collective/transport/base.rb +138 -0
  24. data/lib/nvruby/collective/transport/host_staged_transport.rb +217 -0
  25. data/lib/nvruby/collective/transport/ipc_transport.rb +187 -0
  26. data/lib/nvruby/collective/transport/p2p_transport.rb +157 -0
  27. data/lib/nvruby/collective/transport/rdma_transports.rb +213 -0
  28. data/lib/nvruby/collective/transport/rio_transport.rb +405 -0
  29. data/lib/nvruby/collective/transport/tcp_transport.rb +290 -0
  30. data/lib/nvruby/collective/transport/vmm_ipc_structs.rb +189 -0
  31. data/lib/nvruby/collective/transport/vmm_ipc_transport.rb +266 -0
  32. data/lib/nvruby/collective/transport_selector.rb +200 -0
  33. data/lib/nvruby/collective/vmm_bindings.rb +212 -0
  34. data/lib/nvruby/collective.rb +156 -0
  35. metadata +92 -0
@@ -0,0 +1,308 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "health_monitor"
4
+ require_relative "topology"
5
+
6
+ module Ignis
7
+ module Collective
8
# Dynamic GPU Optimizer
# Consolidates all dynamic optimization strategies for NvCCL
#
# Features:
# - Thermal (heatmap) optimization
# - Load balancing
# - Power management
# - Memory management
# - GPU availability/selection
# - Scheduling optimization
# - Synchronization optimization
#
# Per-device metrics are collected lazily and cached for a short TTL
# (one second), so consecutive queries do not hit the devices each time.
#
# @example Create optimizer and get optimal ring order
#   optimizer = DynamicOptimizer.new(device_ids: [0, 1, 2, 3])
#   ring_order = optimizer.optimal_ring_order(strategy: :thermal)
#
class DynamicOptimizer
  # Optimization strategies
  STRATEGIES = %i[
    thermal
    load
    power
    memory
    availability
    scheduling
    synchronization
    balanced
  ].freeze

  # @return [Array<Integer>] Device IDs
  attr_reader :device_ids

  # @return [HealthMonitor] Health monitor instance
  attr_reader :health_monitor

  # @return [Topology::Matrix] Topology matrix
  attr_reader :topology

  # @param device_ids [Array<Integer>] GPU device IDs
  # @param health_monitor [HealthMonitor, nil] Optional health monitor
  def initialize(device_ids:, health_monitor: nil)
    @device_ids = device_ids.dup.freeze
    # HealthMonitor#initialize takes the ID list positionally; passing it as a
    # keyword would hand the monitor a Hash instead of the device list.
    @health_monitor = health_monitor || HealthMonitor.new(device_ids)
    @topology = Topology::Matrix.new(device_ids)
    @metrics_cache = {}
    @cache_ttl_seconds = 1.0
    @last_refresh_at = nil # nil means "never refreshed"
  end

  # Get optimal ring order based on strategy
  #
  # Unknown strategies fall back to the configured device order.
  #
  # @param strategy [Symbol] Optimization strategy (see STRATEGIES)
  # @return [Array<Integer>] Ordered device IDs
  def optimal_ring_order(strategy: :balanced)
    refresh_metrics_if_needed!

    case strategy
    when :thermal         then thermal_optimized_order
    when :load            then load_balanced_order
    when :power           then power_optimized_order
    when :memory          then memory_optimized_order
    when :availability    then availability_optimized_order
    when :scheduling      then scheduling_optimized_order
    when :synchronization then synchronization_optimized_order
    when :balanced        then balanced_optimized_order
    else @device_ids.dup
    end
  end

  # Get current GPU metrics for all devices
  #
  # @return [Hash<Integer, Hash>] Metrics per device (shallow copy of the cache)
  def current_metrics
    refresh_metrics_if_needed!
    @metrics_cache.dup
  end

  # Check if a GPU should be excluded from collective
  #
  # Refreshes the metrics cache first, so the answer is meaningful even when
  # this is the first call made on the optimizer.
  #
  # @param device_id [Integer] Device ID
  # @return [Boolean] True if device should be excluded
  def should_exclude?(device_id)
    refresh_metrics_if_needed!
    exclusion_indicated?(device_id)
  end

  # Get list of available GPUs
  #
  # @return [Array<Integer>] Available device IDs
  def available_devices
    refresh_metrics_if_needed!
    @device_ids.reject { |id| exclusion_indicated?(id) }
  end

  # Suggest optimal chunk size based on memory
  #
  # Uses at most 25% of the smallest known free-memory figure, capped at
  # +total_size+ and rounded down to a multiple of 256 bytes. When no device
  # reports free memory the only cap is +total_size+.
  # NOTE(review): the result is 0 for payloads smaller than 256 bytes;
  # callers should confirm they handle that.
  #
  # @param total_size [Integer] Total data size in bytes
  # @return [Integer] Recommended chunk size
  def suggest_chunk_size(total_size)
    refresh_metrics_if_needed!

    known_free = @metrics_cache.values.map { |m| m[:memory_free] }.compact
    budget = known_free.empty? ? total_size : (known_free.min * 0.25).to_i

    chunk = [budget, total_size].min
    (chunk / 256) * 256
  end

  # Get power-aware throttling recommendation
  #
  # A device is flagged when it draws more than 90% of its power limit.
  # Devices without usable power readings are never flagged.
  #
  # @return [Hash] :throttle_devices, :should_throttle and :recommendation
  def power_recommendation
    refresh_metrics_if_needed!

    high_power_devices = @metrics_cache.select do |_id, m|
      ratio = power_ratio(m)
      ratio && ratio > 0.9
    end.keys

    {
      throttle_devices: high_power_devices,
      should_throttle: high_power_devices.any?,
      recommendation: high_power_devices.any? ? :reduce_batch_size : :continue
    }
  end

  # @return [String]
  def to_s
    "DynamicOptimizer[#{@device_ids.size} GPUs, strategy=balanced]"
  end

  private

  # Monotonic timestamp used for cache expiry, so wall-clock adjustments
  # cannot freeze or prematurely expire the cache.
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # Re-collect metrics for every device once the cache is older than the TTL.
  def refresh_metrics_if_needed!
    return if @last_refresh_at && monotonic_now - @last_refresh_at < @cache_ttl_seconds

    @device_ids.each do |device_id|
      @metrics_cache[device_id] = collect_device_metrics(device_id)
    end
    @last_refresh_at = monotonic_now
  end

  # Exclusion test against the already-cached metrics (no refresh).
  def exclusion_indicated?(device_id)
    metrics = @metrics_cache[device_id]
    return true unless metrics

    # Exclude if thermal throttling
    return true if metrics[:temperature] && metrics[:temperature] > 90

    # Exclude if out of memory
    return true if metrics[:memory_used_percent] && metrics[:memory_used_percent] > 95

    # Exclude if marked unhealthy
    return true if metrics[:healthy] == false

    false
  end

  # Fraction of the power limit currently drawn, or nil when either reading
  # is missing or the limit is not positive.
  def power_ratio(metrics)
    usage = metrics[:power_usage]
    limit = metrics[:power_limit]
    return nil unless usage && limit && limit > 0

    usage.to_f / limit
  end

  # Collect memory, temperature, power and health readings for one device.
  # Any error is logged and reported as an unhealthy device entry.
  # NOTE(review): the return codes of the CUDA calls are not inspected here;
  # confirm whether the bindings raise on failure.
  def collect_device_metrics(device_id)
    CUDA::RuntimeAPI.cudaSetDevice(device_id)

    # Memory info
    free_ptr = FFI::MemoryPointer.new(:size_t)
    total_ptr = FFI::MemoryPointer.new(:size_t)
    CUDA::RuntimeAPI.cudaMemGetInfo(free_ptr, total_ptr)

    free = free_ptr.read(:size_t)
    total = total_ptr.read(:size_t)
    used = total - free

    {
      device_id: device_id,
      memory_free: free,
      memory_total: total,
      memory_used: used,
      # A zero total would produce NaN; report "unknown" instead.
      memory_used_percent: total.positive? ? (used.to_f / total * 100).round(1) : nil,
      # Temperature and power come from NVML when it is available
      temperature: query_temperature(device_id),
      power_usage: query_power(device_id),
      power_limit: query_power_limit(device_id),
      healthy: @health_monitor.healthy?(device_id),
      timestamp: Time.now
    }
  rescue StandardError => e
    Ignis.logger.warn { "Failed to collect metrics for device #{device_id}: #{e.message}" }
    { device_id: device_id, healthy: false, error: e.message }
  end

  # @return [Object, nil] NVML temperature reading, nil when unavailable
  def query_temperature(device_id)
    return nil unless defined?(NVML) && NVML.respond_to?(:get_temperature)

    NVML.get_temperature(device_id)
  rescue StandardError
    nil
  end

  # @return [Object, nil] NVML power usage reading, nil when unavailable
  def query_power(device_id)
    return nil unless defined?(NVML) && NVML.respond_to?(:get_power_usage)

    NVML.get_power_usage(device_id)
  rescue StandardError
    nil
  end

  # @return [Object, nil] NVML power limit reading, nil when unavailable
  def query_power_limit(device_id)
    return nil unless defined?(NVML) && NVML.respond_to?(:get_power_limit)

    NVML.get_power_limit(device_id)
  rescue StandardError
    nil
  end

  # Order devices by one cached metric. Missing readings count as 0, and ties
  # keep the configured device order so the result is deterministic.
  def order_by_metric(key, descending: false)
    ranked = @device_ids.each_with_index.sort_by do |id, position|
      value = @metrics_cache[id][key] || 0
      [descending ? -value : value, position]
    end
    ranked.map(&:first)
  end

  # Thermal optimization: order by temperature (coolest first)
  def thermal_optimized_order
    order_by_metric(:temperature)
  end

  # Load balancing: order by memory usage (least used first)
  def load_balanced_order
    order_by_metric(:memory_used_percent)
  end

  # Power optimization: order by power usage (lowest first)
  def power_optimized_order
    order_by_metric(:power_usage)
  end

  # Memory optimization: order by free memory (most free first)
  def memory_optimized_order
    order_by_metric(:memory_free, descending: true)
  end

  # Availability: filter out excluded devices, keep order
  def availability_optimized_order
    @device_ids.reject { |id| exclusion_indicated?(id) }
  end

  # Scheduling: healthy devices first, then the rest, each in configured order
  def scheduling_optimized_order
    healthy = @device_ids.select { |id| @metrics_cache[id][:healthy] }
    healthy + (@device_ids - healthy)
  end

  # Synchronization: delegate to the topology matrix
  def synchronization_optimized_order
    @topology.optimal_ring_order
  end

  # Balanced: equally weighted combination of all factors (lowest score first)
  def balanced_optimized_order
    ranked = @device_ids.each_with_index.sort_by do |id, position|
      [balanced_score(@metrics_cache[id]), position]
    end
    ranked.map(&:first)
  end

  # Score for the balanced strategy; lower is better for each component.
  # Missing readings use a neutral 0.5.
  def balanced_score(metrics)
    temp_score = (metrics[:temperature] || 50) / 100.0
    mem_score = (metrics[:memory_used_percent] || 50) / 100.0
    power_score = power_ratio(metrics) || 0.5
    health_score = metrics[:healthy] ? 0.0 : 1.0

    (temp_score * 0.25) +
      (mem_score * 0.25) +
      (power_score * 0.25) +
      (health_score * 0.25)
  end
end
307
+ end
308
+ end
@@ -0,0 +1,333 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module Collective
5
# GPU health monitoring with proactive failure detection
# Monitors heartbeat, memory, and thermal status for each GPU
#
# Checks run either on demand (#check_now!) or on a background thread
# (#start! / #stop!). Errors raised by registered callbacks or by the
# EventBus are reported and never counted against a device.
#
# @example Usage
#   monitor = HealthMonitor.new([0, 1, 2, 3])
#   monitor.start!
#
#   if monitor.unhealthy_devices.any?
#     healer.heal!(monitor.unhealthy_devices)
#   end
#
class HealthMonitor
  # Health check interval (seconds)
  HEARTBEAT_INTERVAL = 5.0

  # Minimum free memory, as a fraction of total memory
  MEMORY_THRESHOLD = 0.05

  # Critical temperature (Celsius)
  TEMP_CRITICAL = 90

  # Warning temperature (Celsius)
  TEMP_WARNING = 80

  # Consecutive failures before marking unhealthy
  FAILURE_THRESHOLD = 3

  # Health status values
  STATUS_HEALTHY = :healthy
  STATUS_WARNING = :warning
  STATUS_CRITICAL = :critical
  STATUS_FAILED = :failed

  # @return [Array<Integer>] Monitored device IDs
  attr_reader :device_ids

  # @return [Hash<Integer, Hash>] Device health status
  attr_reader :device_status

  # @return [Boolean] Whether monitoring is active
  attr_reader :monitoring

  # Create health monitor for specified GPUs
  #
  # @param device_ids [Array<Integer>] GPU device IDs to monitor
  def initialize(device_ids)
    @device_ids = device_ids.dup.freeze
    @device_status = {}
    @monitoring = false
    @monitor_thread = nil
    @callbacks = { on_failure: [], on_recovery: [], on_warning: [] }

    # Lifecycle coordination: lets #stop! interrupt the wait between
    # heartbeats, and lets a superseded loop notice it must exit.
    @lifecycle_lock = Mutex.new
    @wakeup = ConditionVariable.new
    @generation = 0

    initialize_status!
  end

  # Start background monitoring (no-op when already running)
  # @return [void]
  def start!
    @lifecycle_lock.synchronize do
      return if @monitoring

      @monitoring = true
      generation = (@generation += 1)
      @monitor_thread = Thread.new { monitor_loop(generation) }
    end
  end

  # Stop background monitoring
  #
  # Wakes the monitor thread out of its between-heartbeat wait and gives it
  # up to two seconds to finish. A thread that is still busy after that exits
  # on its own after its current pass, because its generation is superseded.
  #
  # @return [void]
  def stop!
    worker = @lifecycle_lock.synchronize do
      @monitoring = false
      @wakeup.broadcast
      thread = @monitor_thread
      @monitor_thread = nil
      thread
    end

    # A callback may call stop! from the monitor thread; a thread cannot join itself.
    worker.join(2.0) if worker && !worker.equal?(Thread.current)
    nil
  end

  # Run single health check (synchronous)
  # @return [Hash<Integer, Symbol>] Device status map
  def check_now!
    @device_ids.each { |id| check_device!(id) }
    status_summary
  end

  # Check if specific device is healthy
  #
  # Devices in warning state still count as healthy.
  #
  # @param device_id [Integer] GPU device ID
  # @return [Boolean] True if healthy
  def healthy?(device_id)
    status = @device_status[device_id]
    return false unless status

    status[:status] == STATUS_HEALTHY || status[:status] == STATUS_WARNING
  end

  # Get all healthy device IDs
  # @return [Array<Integer>] Healthy devices
  def healthy_devices
    @device_ids.select { |id| healthy?(id) }
  end

  # Get all unhealthy device IDs
  # @return [Array<Integer>] Unhealthy devices
  def unhealthy_devices
    @device_ids.reject { |id| healthy?(id) }
  end

  # Get status summary
  # @return [Hash<Integer, Symbol>] Device → status
  def status_summary
    @device_status.transform_values { |v| v[:status] }
  end

  # Register callback for device failure
  #
  # @yield [device_id, reason] Called when device fails
  # @return [void]
  def on_failure(&block)
    @callbacks[:on_failure] << block
  end

  # Register callback for device recovery
  #
  # @yield [device_id] Called when device recovers
  # @return [void]
  def on_recovery(&block)
    @callbacks[:on_recovery] << block
  end

  # Register callback for warnings
  #
  # @yield [device_id, warning_type, value] Called on warning
  # @return [void]
  def on_warning(&block)
    @callbacks[:on_warning] << block
  end

  # Get detailed status for a device
  #
  # @param device_id [Integer] GPU device ID
  # @return [Hash, nil] Copy of the status details, nil for unknown devices
  def device_details(device_id)
    @device_status[device_id]&.dup
  end

  # Force device status (for testing or manual override)
  #
  # A forced device is skipped by all later health checks.
  #
  # @param device_id [Integer] GPU device ID
  # @param status [Symbol] New status
  # @return [void]
  def force_status!(device_id, status)
    return unless @device_status.key?(device_id)

    @device_status[device_id][:status] = status
    @device_status[device_id][:forced] = true
  end

  # Clean up resources
  # @return [void]
  def destroy!
    stop!
    @device_status.clear
    @callbacks.each_value(&:clear)
  end

  # @return [String] Human-readable summary
  def to_s
    healthy = healthy_devices.size
    total = @device_ids.size
    "HealthMonitor[#{healthy}/#{total} healthy]"
  end

  private

  # Seed every monitored device with an optimistic "healthy" record.
  def initialize_status!
    @device_ids.each do |id|
      @device_status[id] = {
        status: STATUS_HEALTHY,
        consecutive_failures: 0,
        last_check: nil,
        last_healthy: Time.now,
        memory_free_pct: 1.0,
        temperature: 0,
        forced: false
      }
    end
  end

  # Body of the background thread: check all devices, then wait one
  # heartbeat interval, until monitoring stops or this loop is superseded.
  def monitor_loop(generation)
    while current_generation?(generation)
      @device_ids.each { |id| check_device!(id) }
      wait_for_next_heartbeat(generation)
    end
  end

  # True while monitoring is on and no newer loop has been started.
  def current_generation?(generation)
    @monitoring && @generation == generation
  end

  # Interruptible replacement for sleep(HEARTBEAT_INTERVAL): returns early
  # when #stop! signals the condition variable.
  def wait_for_next_heartbeat(generation)
    @lifecycle_lock.synchronize do
      @wakeup.wait(@lifecycle_lock, HEARTBEAT_INTERVAL) if current_generation?(generation)
    end
  end

  # Run heartbeat, memory and temperature checks for one device and update
  # its status record. Forced or unknown devices are skipped.
  def check_device!(device_id)
    status = @device_status[device_id]
    return if status.nil? || status[:forced]

    # 1. Heartbeat - synchronize device
    heartbeat_ok = check_heartbeat(device_id)

    # 2. Memory check
    memory_pct = check_memory(device_id)
    memory_ok = memory_pct >= MEMORY_THRESHOLD

    # 3. Temperature check (if NVML available)
    temp = check_temperature(device_id)
    temp_ok = temp < TEMP_CRITICAL

    status[:last_check] = Time.now
    status[:memory_free_pct] = memory_pct
    status[:temperature] = temp

    if heartbeat_ok && memory_ok && temp_ok
      record_success!(device_id, status, memory_pct, temp)
    else
      record_failure!(device_id, status) do
        determine_failure_reason(heartbeat_ok, memory_ok, temp_ok, memory_pct, temp)
      end
    end
  rescue StandardError => e
    # Check itself failed; treat it like any other failed check.
    record_failure!(device_id, status) { "Health check error: #{e.message}" } if status
  end

  # Bookkeeping for a passing check: announce recovery from FAILED, downgrade
  # to WARNING near the thresholds, and reset the failure counter.
  def record_success!(device_id, status, memory_pct, temp)
    notify_recovery(device_id) if status[:status] == STATUS_FAILED

    if temp > TEMP_WARNING
      status[:status] = STATUS_WARNING
      notify_warning(device_id, :temperature, temp)
    elsif memory_pct < MEMORY_THRESHOLD * 2
      status[:status] = STATUS_WARNING
      notify_warning(device_id, :memory, memory_pct)
    else
      status[:status] = STATUS_HEALTHY
    end

    status[:consecutive_failures] = 0
    status[:last_healthy] = Time.now
  end

  # Bookkeeping for a failing check: CRITICAL below the threshold, FAILED at
  # or above it. The failure notification fires only on the transition into
  # FAILED; the block supplies the reason text.
  def record_failure!(device_id, status)
    status[:consecutive_failures] += 1

    if status[:consecutive_failures] < FAILURE_THRESHOLD
      status[:status] = STATUS_CRITICAL
      return
    end

    already_failed = status[:status] == STATUS_FAILED
    status[:status] = STATUS_FAILED
    notify_failure(device_id, yield) unless already_failed
  end

  # @return [Boolean] true when the device can be selected and synchronized
  def check_heartbeat(device_id)
    CUDA::RuntimeAPI.ensure_loaded!

    # Use new Fiddle-based RuntimeAPI methods
    CUDA::RuntimeAPI.set_device(device_id)
    CUDA::RuntimeAPI.device_synchronize
    true
  rescue StandardError
    false
  end

  # @return [Float] free memory as a fraction of total; 0.0 on any error
  def check_memory(device_id)
    CUDA::RuntimeAPI.set_device(device_id)

    info = CUDA::RuntimeAPI.mem_get_info
    free = info[:free_bytes]
    total = info[:total_bytes]

    total.positive? ? free.to_f / total : 0.0
  rescue StandardError
    0.0
  end

  # @return [Numeric] device temperature; 0 when NVML is missing, fails, or
  #   reports nothing
  def check_temperature(device_id)
    # Try NVML if available
    return 0 unless defined?(CUDA::NVML)

    CUDA::NVML.device_temperature(device_id) || 0
  rescue StandardError
    0 # Assume OK if NVML not available
  end

  # Build a comma-separated description of every failed check.
  def determine_failure_reason(heartbeat_ok, memory_ok, temp_ok, memory_pct, temp)
    reasons = []
    reasons << "heartbeat timeout" unless heartbeat_ok
    reasons << "out of memory (#{(memory_pct * 100).round(1)}% free)" unless memory_ok
    reasons << "overheating (#{temp}°C)" unless temp_ok
    reasons.join(", ")
  end

  def notify_failure(device_id, reason)
    run_callbacks(:on_failure, device_id, reason)
    publish_event(:gpu_failed, { gpu_id: device_id, reason: reason })
  end

  def notify_recovery(device_id)
    run_callbacks(:on_recovery, device_id)
    publish_event(:gpu_recovered, { gpu_id: device_id })
  end

  def notify_warning(device_id, warning_type, value)
    run_callbacks(:on_warning, device_id, warning_type, value)
    publish_event(:health_alert, { gpu_id: device_id, metric: warning_type, value: value })
  end

  # Invoke every callback of one kind. Each call is isolated so a faulty
  # callback cannot skip the remaining ones or be mistaken for a GPU fault.
  def run_callbacks(kind, *args)
    @callbacks[kind].each do |callback|
      begin
        callback.call(*args)
      rescue StandardError => e
        report_notification_error("#{kind} callback", e)
      end
    end
  end

  # Publish to EventBus for system-wide notification, when it is loaded.
  def publish_event(event, payload)
    return unless defined?(Ignis::Shared::EventBus)

    Ignis::Shared::EventBus.publish(event, payload: payload)
  rescue StandardError => e
    report_notification_error("EventBus publish of #{event}", e)
  end

  # Report a notification error through Ignis.logger when present, else stderr.
  def report_notification_error(source, error)
    message = "HealthMonitor #{source} raised #{error.class}: #{error.message}"
    if defined?(Ignis) && Ignis.respond_to?(:logger) && Ignis.logger
      Ignis.logger.warn { message }
    else
      warn(message)
    end
  end
end
332
+ end
333
+ end