ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +15 -0
  3. data/lib/ignis.rb +94 -0
  4. data/lib/nnw/platform.rb +304 -0
  5. data/lib/nnw/shared/event_bus.rb +240 -0
  6. data/lib/nnw/shared/ffi_loader.rb +63 -0
  7. data/lib/nnw/shared/memory_contract.rb +204 -0
  8. data/lib/nnw/shared/nv_array.rb +710 -0
  9. data/lib/nnw/shared/recovery_protocol.rb +307 -0
  10. data/lib/nvruby/configuration.rb +217 -0
  11. data/lib/nvruby/cuda/device.rb +275 -0
  12. data/lib/nvruby/cuda/device_props.rb +202 -0
  13. data/lib/nvruby/cuda/graph.rb +265 -0
  14. data/lib/nvruby/cuda/graph_bindings.rb +119 -0
  15. data/lib/nvruby/cuda/library_loader.rb +285 -0
  16. data/lib/nvruby/cuda/memory.rb +410 -0
  17. data/lib/nvruby/cuda/runtime_api.rb +804 -0
  18. data/lib/nvruby/cuda/stream.rb +234 -0
  19. data/lib/nvruby/dtype.rb +139 -0
  20. data/lib/nvruby/epilogues.rb +438 -0
  21. data/lib/nvruby/errors.rb +303 -0
  22. data/lib/nvruby/half.rb +97 -0
  23. data/lib/nvruby/jit/compiled_kernel.rb +80 -0
  24. data/lib/nvruby/jit/compiler.rb +231 -0
  25. data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
  26. data/lib/nvruby/jit/kernel.rb +240 -0
  27. data/lib/nvruby/jit/kernel_module.rb +133 -0
  28. data/lib/nvruby/jit/kernels/activations.rb +179 -0
  29. data/lib/nvruby/jit/kernels/attention.rb +504 -0
  30. data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
  31. data/lib/nvruby/jit/kernels/loss.rb +213 -0
  32. data/lib/nvruby/jit/kernels/normalization.rb +200 -0
  33. data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
  34. data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
  35. data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
  36. data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
  37. data/lib/nvruby/linalg/epilog.rb +67 -0
  38. data/lib/nvruby/linalg/matmul.rb +247 -0
  39. data/lib/nvruby/linalg/matmul_plan.rb +229 -0
  40. data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
  41. data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
  42. data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
  43. data/lib/nvruby/memory/device_memory_resource.rb +106 -0
  44. data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
  45. data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
  46. data/lib/nvruby/memory/stats.rb +107 -0
  47. data/lib/nvruby/memory.rb +124 -0
  48. data/lib/nvruby/version.rb +5 -0
  49. metadata +108 -0
@@ -0,0 +1,307 @@
1
+ # frozen_string_literal: true
2
+
3
module Ignis
  module Shared
    # RecoveryProtocol — Unified failure state machine shared by all three layers.
    #
    # This is the spine of the recovery flow. The previous build had each layer
    # implement its own recovery logic — they got out of sync. This singleton
    # ensures all layers see the same state and transition atomically.
    #
    # States and permitted transitions (see TRANSITIONS):
    #   HEALTHY    → DEGRADED
    #   DEGRADED   → RECOVERING
    #   DEGRADED   → FAILED      (begin! found the attempt budget exhausted)
    #   RECOVERING → HEALTHY     (complete!)
    #   RECOVERING → FAILED      (permitted by the table; no method in this class performs it)
    #   FAILED     → RECOVERING  (manual_reset!)
    #
    # Thread-safe: instance state is protected by a Monitor (re-entrant, so a
    # callback may call back into the protocol on the same thread); creation of
    # the singleton is protected by a separate class-level Mutex.
    class RecoveryProtocol
      # Valid states for the recovery state machine.
      STATES = %i[healthy degraded recovering failed].freeze

      # Valid transition map: current_state => [allowed_next_states]
      #
      # :degraded must allow :failed because begin! moves to :degraded first and
      # only then checks the attempt budget.
      TRANSITIONS = {
        healthy: %i[degraded].freeze,
        degraded: %i[recovering failed].freeze,
        recovering: %i[healthy failed].freeze,
        failed: %i[recovering].freeze
      }.freeze

      # Callback event names per state.
      CALLBACK_EVENTS = %i[on_degraded on_recovering on_healthy on_failed].freeze

      # Default maximum recovery attempts before transitioning to FAILED.
      DEFAULT_MAX_ATTEMPTS = 3

      # Default recovery timeout in milliseconds.
      DEFAULT_RECOVERY_TIMEOUT_MS = 30_000

      # Maximum history entries.
      MAX_HISTORY = 20

      # Guards lazy creation / replacement of the singleton.
      INSTANCE_LOCK = Mutex.new
      private_constant :INSTANCE_LOCK

      # @return [RecoveryProtocol] singleton instance
      def self.instance
        INSTANCE_LOCK.synchronize { @instance ||= new }
      end

      # Replace the singleton with a fresh instance (for testing only).
      #
      # This used to be a first definition of `reset!`, which was silently
      # overridden by the later `reset!` (manual reset) and therefore could
      # never be called. It is exposed under its own name instead.
      #
      # @return [RecoveryProtocol] the new singleton instance
      def self.reset_instance!
        INSTANCE_LOCK.synchronize { @instance = new }
      end

      # Register callbacks for a layer.
      #
      # @param layer [Symbol] identifying the layer (:nvruby, :nvccl, :wnais)
      # @param on_degraded [Proc, nil] called when state transitions to DEGRADED
      # @param on_recovering [Proc, nil] called when state transitions to RECOVERING
      # @param on_healthy [Proc, nil] called when state transitions to HEALTHY
      # @param on_failed [Proc, nil] called when state transitions to FAILED
      # @return [void]
      def self.register(layer:, on_degraded: nil, on_recovering: nil, on_healthy: nil, on_failed: nil)
        instance.register(
          layer: layer,
          on_degraded: on_degraded,
          on_recovering: on_recovering,
          on_healthy: on_healthy,
          on_failed: on_failed
        )
      end

      # Get the current state.
      # @return [Symbol] one of :healthy, :degraded, :recovering, :failed
      def self.state
        instance.state
      end

      # Begin recovery for a failed GPU.
      #
      # Transitions: HEALTHY → DEGRADED (fires on_degraded callbacks),
      # then DEGRADED → RECOVERING (fires on_recovering callbacks), or
      # DEGRADED → FAILED (fires on_failed callbacks) when the attempt count
      # exceeds max_attempts.
      #
      # @param gpu_id [Integer] the failed GPU device ID
      # @param reason [Symbol] reason for failure (:heartbeat_timeout, :memory_error, etc.)
      # @return [Symbol] the new state
      # @raise [RuntimeError] if current state doesn't allow this transition
      def self.begin!(gpu_id:, reason:)
        instance.begin!(gpu_id: gpu_id, reason: reason)
      end

      # Complete recovery successfully.
      #
      # Transitions: RECOVERING → HEALTHY (fires on_healthy callbacks).
      # Resets attempt counter.
      #
      # @param recovered_gpus [Array<Integer>] list of active GPU IDs after recovery
      # @return [Symbol] the new state (:healthy)
      # @raise [RuntimeError] if current state is not :recovering
      def self.complete!(recovered_gpus:)
        instance.complete!(recovered_gpus: recovered_gpus)
      end

      # Manually reset from FAILED state to start recovery again.
      #
      # Transitions: FAILED → RECOVERING (fires on_recovering callbacks).
      # Resets attempt counter.
      #
      # This does NOT replace the singleton; use reset_instance! for that.
      #
      # @return [Symbol] the new state (:recovering)
      # @raise [RuntimeError] if current state is not :failed
      def self.reset!
        instance.manual_reset!
      end

      # Get transition history.
      #
      # @return [Array<Hash>] most recent transitions (at most MAX_HISTORY) with
      #   {from:, to:, timestamp:, context:}
      def self.history
        instance.history
      end

      # Get current attempt count.
      # @return [Integer]
      def self.attempt_count
        instance.attempt_count
      end

      # Get the failed GPU ID recorded by the last begin! (cleared by complete!).
      # @return [Integer, nil]
      def self.failed_gpu_id
        instance.failed_gpu_id
      end

      # Configuration accessors.
      # @return [Integer]
      def self.max_attempts
        instance.max_attempts
      end

      # @param value [Integer]
      def self.max_attempts=(value)
        instance.max_attempts = value
      end

      # @return [Integer]
      def self.recovery_timeout_ms
        instance.recovery_timeout_ms
      end

      # @param value [Integer]
      def self.recovery_timeout_ms=(value)
        instance.recovery_timeout_ms = value
      end

      # Instance methods

      # max_attempts is consulted by begin!; recovery_timeout_ms is only stored
      # here (nothing in this class reads it).
      attr_accessor :max_attempts, :recovery_timeout_ms

      def initialize
        @monitor = Monitor.new
        @state = :healthy
        @callbacks = {} # layer => {on_degraded: proc, ...}
        @history = []
        @attempt_count = 0
        @failed_gpu_id = nil
        @failed_reason = nil
        @max_attempts = DEFAULT_MAX_ATTEMPTS
        @recovery_timeout_ms = DEFAULT_RECOVERY_TIMEOUT_MS
      end

      # Register (or replace) the callback set for a layer.
      #
      # @param layer [Symbol] key under which the callbacks are stored
      # @param on_degraded [Proc, nil]
      # @param on_recovering [Proc, nil]
      # @param on_healthy [Proc, nil]
      # @param on_failed [Proc, nil]
      # @return [Hash] the stored callback set
      def register(layer:, on_degraded: nil, on_recovering: nil, on_healthy: nil, on_failed: nil)
        @monitor.synchronize do
          @callbacks[layer] = {
            on_degraded: on_degraded,
            on_recovering: on_recovering,
            on_healthy: on_healthy,
            on_failed: on_failed
          }
        end
      end

      # @return [Symbol]
      def state
        @monitor.synchronize { @state }
      end

      # @return [Integer]
      def attempt_count
        @monitor.synchronize { @attempt_count }
      end

      # @return [Integer, nil]
      def failed_gpu_id
        @monitor.synchronize { @failed_gpu_id }
      end

      # Record a GPU failure and move the machine out of :healthy.
      #
      # @param gpu_id [Integer] the failed GPU device ID
      # @param reason [Symbol] reason for failure
      # @return [Symbol] :recovering, or :failed when the attempt budget is exceeded
      # @raise [RuntimeError] if the current state is not :healthy
      def begin!(gpu_id:, reason:)
        @monitor.synchronize do
          unless @state == :healthy
            raise "Cannot begin recovery: current state is #{@state.inspect}, expected :healthy"
          end

          @failed_gpu_id = gpu_id
          @failed_reason = reason
          @attempt_count += 1

          transition_to!(:degraded, context: { gpu_id: gpu_id, reason: reason, attempt: @attempt_count })
          fire_callbacks(:on_degraded, gpu_id: gpu_id, reason: reason)

          # complete! and manual_reset! both zero the counter and begin! is only
          # legal from :healthy, so the counter is 1 here; the :failed branch is
          # therefore only reachable when max_attempts is below 1.
          if @attempt_count > @max_attempts
            transition_to!(:failed, context: { gpu_id: gpu_id, reason: :max_attempts_exceeded, attempt: @attempt_count })
            fire_callbacks(:on_failed, gpu_id: gpu_id, reason: :max_attempts_exceeded)
          else
            transition_to!(:recovering, context: { gpu_id: gpu_id, reason: reason, attempt: @attempt_count })
            fire_callbacks(:on_recovering, gpu_id: gpu_id, reason: reason)
          end

          @state
        end
      end

      # Finish a recovery: RECOVERING → HEALTHY.
      #
      # @param recovered_gpus [Array<Integer>] list of active GPU IDs after recovery
      # @return [Symbol] :healthy
      # @raise [RuntimeError] if the current state is not :recovering
      def complete!(recovered_gpus:)
        @monitor.synchronize do
          unless @state == :recovering
            raise "Cannot complete recovery: current state is #{@state.inspect}, expected :recovering"
          end

          @attempt_count = 0
          transition_to!(:healthy, context: { recovered_gpus: recovered_gpus })

          # Callbacks run before the failure details are cleared, so they can
          # still read failed_gpu_id.
          fire_callbacks(:on_healthy, recovered_gpus: recovered_gpus)

          @failed_gpu_id = nil
          @failed_reason = nil

          @state
        end
      end

      # Leave :failed and start recovering again: FAILED → RECOVERING.
      #
      # @return [Symbol] :recovering
      # @raise [RuntimeError] if the current state is not :failed
      def manual_reset!
        @monitor.synchronize do
          unless @state == :failed
            raise "Cannot manual reset: current state is #{@state.inspect}, expected :failed"
          end

          @attempt_count = 0
          transition_to!(:recovering, context: { manual_reset: true })

          fire_callbacks(:on_recovering, manual_reset: true)

          @state
        end
      end

      # @return [Array<Hash>] shallow copy of the recorded transitions, oldest first
      def history
        @monitor.synchronize { @history.dup }
      end

      private

      # Perform a state transition and record it in history.
      #
      # Callers must already hold @monitor.
      #
      # @param new_state [Symbol]
      # @param context [Hash] additional context for the transition
      # @return [Symbol] the new state
      # @raise [RuntimeError] if TRANSITIONS does not allow the move
      def transition_to!(new_state, context: {})
        from = @state
        unless TRANSITIONS[from]&.include?(new_state)
          raise "Invalid transition: #{from.inspect} → #{new_state.inspect}"
        end

        @history << { from: from, to: new_state, timestamp: Time.now, context: context }
        # Keep only the newest MAX_HISTORY entries.
        @history.shift while @history.size > MAX_HISTORY

        @state = new_state
      end

      # Fire all registered callbacks for a given event.
      #
      # Catches and logs callback exceptions (never raises).
      #
      # @param event [Symbol] callback event name (e.g., :on_degraded)
      # @param kwargs [Hash] keyword arguments passed to the callback
      # @return [void]
      def fire_callbacks(event, **kwargs)
        # Iterate over a snapshot so a callback may call register without
        # mutating the hash that is being walked.
        @callbacks.dup.each do |layer, cbs|
          cb = cbs[event]
          next unless cb

          begin
            cb.call(**kwargs)
          rescue => e
            $stderr.puts "[RecoveryProtocol] Callback #{event} for layer #{layer} raised: #{e.class}: #{e.message}"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,217 @@
1
+ # frozen_string_literal: true
2
+
3
require "logger"
require "monitor"
4
+
5
module Ignis
  # Global configuration for Ignis.
  #
  # Methods that take the internal lock (log_level=, resolved_cuda_path,
  # reset!, with_lock) use a Monitor rather than a Mutex: Monitor is re-entrant,
  # so those methods can be called from inside a with_lock block without
  # raising ThreadError. Plain attr_accessor reads/writes are not locked.
  class Configuration
    # Matches RUBY_PLATFORM values treated as Windows.
    WINDOWS_PLATFORM = /mswin|mingw|cygwin/i
    private_constant :WINDOWS_PLATFORM

    # Default CUDA installation paths — Windows and Linux
    DEFAULT_CUDA_PATHS = if RUBY_PLATFORM.match?(WINDOWS_PLATFORM)
                           [
                             'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1',
                             'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0',
                             'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6',
                             'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5',
                             'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4',
                             'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.3',
                             'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2',
                             'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1',
                             'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0',
                             'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8'
                           ].freeze
                         else
                           [
                             '/usr/local/cuda-13.1',
                             '/usr/local/cuda-13.0',
                             '/usr/local/cuda-12.6',
                             '/usr/local/cuda-12.5',
                             '/usr/local/cuda',
                             '/opt/cuda'
                           ].freeze
                         end

    # Symbolic log level => Logger severity constant.
    LOG_LEVELS = {
      debug: Logger::DEBUG,
      info: Logger::INFO,
      warn: Logger::WARN,
      error: Logger::ERROR,
      fatal: Logger::FATAL
    }.freeze
    private_constant :LOG_LEVELS

    # @return [String, nil] Custom CUDA installation path
    attr_accessor :cuda_path

    # @return [Integer] Default device index to use
    attr_accessor :default_device

    # @return [Boolean] Whether to enable autotuning by default
    attr_accessor :autotuning_enabled

    # @return [Integer] Default number of autotuning iterations
    attr_accessor :autotuning_iterations

    # @return [Logger] Logger instance for Ignis operations
    attr_accessor :logger

    # @return [Symbol] Log level (:debug, :info, :warn, :error, :fatal)
    attr_reader :log_level

    # @return [Boolean] Whether to use synchronous execution by default
    attr_accessor :synchronous

    # @return [Integer] Default workspace size in bytes for cuBLAS operations
    attr_accessor :default_workspace_size

    # @return [Boolean] Whether to enable memory pooling
    attr_accessor :memory_pooling

    # @return [Integer] Maximum memory pool size in bytes (0 for unlimited)
    attr_accessor :max_pool_size

    # Alias for memory.rb compatibility
    alias use_memory_pool memory_pooling
    alias use_memory_pool= memory_pooling=

    def initialize
      @lock = Monitor.new
      apply_defaults
    end

    # Set log level and apply it to the current logger.
    #
    # Unrecognised levels are stored as given and applied as Logger::INFO.
    # The logger is left untouched when it is nil or has no `level=`.
    #
    # @param level [Symbol] Log level (:debug, :info, :warn, :error, :fatal)
    # @return [void]
    def log_level=(level)
      @lock.synchronize do
        @log_level = level
        @logger.level = log_level_to_constant(level) if @logger.respond_to?(:level=)
      end
    end

    # Get the resolved CUDA path.
    #
    # Candidates are tried in order: the configured cuda_path, the CUDA_PATH
    # environment variable, then DEFAULT_CUDA_PATHS. The first one that is an
    # existing directory wins.
    #
    # @return [String, nil] The CUDA installation path or nil if not found
    def resolved_cuda_path
      @lock.synchronize do
        [@cuda_path, ENV["CUDA_PATH"], *DEFAULT_CUDA_PATHS].find do |candidate|
          candidate && File.directory?(candidate)
        end
      end
    end

    # Get the directory holding the CUDA shared libraries.
    #
    # Windows: bin/x64 when it exists and contains at least one .dll, else bin.
    # Elsewhere: lib64 when it exists, else lib.
    #
    # @return [String, nil] Path to CUDA bin/lib directory, nil if CUDA is not found
    def cuda_bin_path
      root = normalized_cuda_root
      return nil unless root

      if windows?
        x64_dir = File.join(root, "bin", "x64")
        return x64_dir if File.directory?(x64_dir) && Dir.glob(File.join(x64_dir, "*.dll")).any?

        File.join(root, "bin")
      else
        existing_or_fallback(File.join(root, "lib64"), File.join(root, "lib"))
      end
    end

    # Get the CUDA lib directory.
    #
    # Windows: lib/x64 when it exists, else lib.
    # Elsewhere: lib64 when it exists, else lib.
    #
    # @return [String, nil] Path to CUDA lib directory, nil if CUDA is not found
    def cuda_lib_path
      root = normalized_cuda_root
      return nil unless root

      preferred = windows? ? File.join(root, "lib", "x64") : File.join(root, "lib64")
      existing_or_fallback(preferred, File.join(root, "lib"))
    end

    # Reset configuration to defaults (this also replaces the logger with a
    # fresh default logger).
    # @return [void]
    def reset!
      @lock.synchronize { apply_defaults }
    end

    # Run a block while holding the configuration lock.
    #
    # The lock is re-entrant: the block may call log_level=, reset!,
    # resolved_cuda_path, or with_lock again.
    #
    # @yield Block with access to configuration
    # @return [Object] Return value of the block
    def with_lock(&block)
      @lock.synchronize(&block)
    end

    private

    # Assign every setting its default value. Shared by initialize and reset!.
    # @return [void]
    def apply_defaults
      @cuda_path = nil
      @default_device = 0
      @autotuning_enabled = true
      @autotuning_iterations = 10
      @logger = create_default_logger
      @log_level = :info
      @synchronous = false
      @default_workspace_size = 32 * 1024 * 1024 # 32 MB
      @memory_pooling = true
      @max_pool_size = 0
    end

    # @return [Boolean] true when RUBY_PLATFORM looks like Windows
    def windows?
      RUBY_PLATFORM.match?(WINDOWS_PLATFORM)
    end

    # Resolved CUDA root with forward slashes on Windows. The path is left
    # untouched elsewhere, where a backslash is a legal filename character.
    #
    # @return [String, nil]
    def normalized_cuda_root
      root = resolved_cuda_path
      return nil unless root

      windows? ? root.to_s.tr("\\", "/") : root
    end

    # @param preferred [String] directory to use when it exists
    # @param fallback [String] returned (unchecked) otherwise
    # @return [String]
    def existing_or_fallback(preferred, fallback)
      File.directory?(preferred) ? preferred : fallback
    end

    # @return [Logger] Configured logger instance
    def create_default_logger
      Logger.new($stdout).tap do |log|
        log.level = Logger::INFO
        log.formatter = proc do |severity, datetime, _progname, msg|
          "[Ignis #{datetime.strftime('%Y-%m-%d %H:%M:%S')}] #{severity}: #{msg}\n"
        end
      end
    end

    # @param level [Symbol] Symbolic log level
    # @return [Integer] Logger constant (Logger::INFO for unknown levels)
    def log_level_to_constant(level)
      LOG_LEVELS.fetch(level, Logger::INFO)
    end
  end

  class << self
    # @return [Configuration] Global configuration instance
    def configuration
      @configuration ||= Configuration.new
    end

    # Configure Ignis
    # @yield [Configuration] Configuration instance
    # @return [void]
    def configure
      yield(configuration) if block_given?
    end

    # @return [Logger] Logger instance
    def logger
      configuration.logger
    end
  end
end