RubyGems - ignis - Versions diffs - 0.0.1 - Mend

ignis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

checksums.yaml +7 -0
data/README.md +15 -0
data/lib/ignis.rb +94 -0
data/lib/nnw/platform.rb +304 -0
data/lib/nnw/shared/event_bus.rb +240 -0
data/lib/nnw/shared/ffi_loader.rb +63 -0
data/lib/nnw/shared/memory_contract.rb +204 -0
data/lib/nnw/shared/nv_array.rb +710 -0
data/lib/nnw/shared/recovery_protocol.rb +307 -0
data/lib/nvruby/configuration.rb +217 -0
data/lib/nvruby/cuda/device.rb +275 -0
data/lib/nvruby/cuda/device_props.rb +202 -0
data/lib/nvruby/cuda/graph.rb +265 -0
data/lib/nvruby/cuda/graph_bindings.rb +119 -0
data/lib/nvruby/cuda/library_loader.rb +285 -0
data/lib/nvruby/cuda/memory.rb +410 -0
data/lib/nvruby/cuda/runtime_api.rb +804 -0
data/lib/nvruby/cuda/stream.rb +234 -0
data/lib/nvruby/dtype.rb +139 -0
data/lib/nvruby/epilogues.rb +438 -0
data/lib/nvruby/errors.rb +303 -0
data/lib/nvruby/half.rb +97 -0
data/lib/nvruby/jit/compiled_kernel.rb +80 -0
data/lib/nvruby/jit/compiler.rb +231 -0
data/lib/nvruby/jit/driver_api_bindings.rb +363 -0
data/lib/nvruby/jit/kernel.rb +240 -0
data/lib/nvruby/jit/kernel_module.rb +133 -0
data/lib/nvruby/jit/kernels/activations.rb +179 -0
data/lib/nvruby/jit/kernels/attention.rb +504 -0
data/lib/nvruby/jit/kernels/elementwise.rb +488 -0
data/lib/nvruby/jit/kernels/loss.rb +213 -0
data/lib/nvruby/jit/kernels/normalization.rb +200 -0
data/lib/nvruby/jit/kernels/optimizer.rb +193 -0
data/lib/nvruby/jit/nvrtc_bindings.rb +282 -0
data/lib/nvruby/linalg/cublas_bindings.rb +295 -0
data/lib/nvruby/linalg/cublaslt_bindings.rb +342 -0
data/lib/nvruby/linalg/epilog.rb +67 -0
data/lib/nvruby/linalg/matmul.rb +247 -0
data/lib/nvruby/linalg/matmul_plan.rb +229 -0
data/lib/nvruby/linalg/optimized_matmul.rb +412 -0
data/lib/nvruby/memory/cuda_async_memory_resource.rb +123 -0
data/lib/nvruby/memory/cuda_memory_resource.rb +68 -0
data/lib/nvruby/memory/device_memory_resource.rb +106 -0
data/lib/nvruby/memory/pinned_host_memory_resource.rb +112 -0
data/lib/nvruby/memory/pool_memory_resource.rb +242 -0
data/lib/nvruby/memory/stats.rb +107 -0
data/lib/nvruby/memory.rb +124 -0
data/lib/nvruby/version.rb +5 -0
metadata +108 -0

data/lib/nnw/shared/recovery_protocol.rb ADDED Viewed

@@ -0,0 +1,307 @@
+# frozen_string_literal: true
+module Ignis
+  module Shared
+    # RecoveryProtocol — Unified failure state machine shared by all three layers.
+    #
+    # This is the spine of the recovery flow. The previous build had each layer
+    # implement its own recovery logic — they got out of sync. This singleton
+    # ensures all layers see the same state and transition atomically.
+    #
+    # States:
+    #   HEALTHY → DEGRADED → RECOVERING → HEALTHY
+    #                     ↘ FAILED (if recovery exceeds max_attempts)
+    #
+    # Thread-safe: all operations protected by Monitor.
+    class RecoveryProtocol
+      # Valid states for the recovery state machine.
+      STATES = %i[healthy degraded recovering failed].freeze
+      # Valid transition map: current_state => [allowed_next_states]
+      TRANSITIONS = {
+        healthy:    [:degraded],
+        degraded:   [:recovering],
+        recovering: [:healthy, :failed],
+        failed:     [:recovering]
+      }.freeze
+      # Callback event names per state.
+      CALLBACK_EVENTS = %i[on_degraded on_recovering on_healthy on_failed].freeze
+      # Default maximum recovery attempts before transitioning to FAILED.
+      DEFAULT_MAX_ATTEMPTS = 3
+      # Default recovery timeout in milliseconds.
+      DEFAULT_RECOVERY_TIMEOUT_MS = 30_000
+      # Maximum history entries.
+      MAX_HISTORY = 20
+      # @return [RecoveryProtocol] singleton instance
+      def self.instance
+        @instance ||= new
+      end
+      # Reset the singleton instance (for testing only).
+      # @return [void]
+      def self.reset!
+        @instance = new
+      end
+      # Register callbacks for a layer.
+      #
+      # @param layer [Symbol] identifying the layer (:nvruby, :nvccl, :wnais)
+      # @param on_degraded [Proc, nil] called when state transitions to DEGRADED
+      # @param on_recovering [Proc, nil] called when state transitions to RECOVERING
+      # @param on_healthy [Proc, nil] called when state transitions to HEALTHY
+      # @param on_failed [Proc, nil] called when state transitions to FAILED
+      # @return [void]
+      def self.register(layer:, on_degraded: nil, on_recovering: nil, on_healthy: nil, on_failed: nil)
+        instance.register(
+          layer: layer,
+          on_degraded: on_degraded,
+          on_recovering: on_recovering,
+          on_healthy: on_healthy,
+          on_failed: on_failed
+        )
+      end
+      # Get the current state.
+      # @return [Symbol] one of :healthy, :degraded, :recovering, :failed
+      def self.state
+        instance.state
+      end
+      # Begin recovery for a failed GPU.
+      #
+      # Transitions: HEALTHY → DEGRADED (fires on_degraded callbacks),
+      #              then DEGRADED → RECOVERING (fires on_recovering callbacks).
+      #
+      # @param gpu_id [Integer] the failed GPU device ID
+      # @param reason [Symbol] reason for failure (:heartbeat_timeout, :memory_error, etc.)
+      # @return [Symbol] the new state
+      # @raise [RuntimeError] if current state doesn't allow this transition
+      def self.begin!(gpu_id:, reason:)
+        instance.begin!(gpu_id: gpu_id, reason: reason)
+      end
+      # Complete recovery successfully.
+      #
+      # Transitions: RECOVERING → HEALTHY (fires on_healthy callbacks).
+      # Resets attempt counter.
+      #
+      # @param recovered_gpus [Array<Integer>] list of active GPU IDs after recovery
+      # @return [Symbol] the new state (:healthy)
+      def self.complete!(recovered_gpus:)
+        instance.complete!(recovered_gpus: recovered_gpus)
+      end
+      # Manually reset from FAILED state to start recovery again.
+      #
+      # Transitions: FAILED → RECOVERING (fires on_recovering callbacks).
+      # Resets attempt counter.
+      #
+      # @return [Symbol] the new state (:recovering)
+      # @raise [RuntimeError] if current state is not :failed
+      def self.reset!
+        # Note: this shadows the singleton reset! above.
+        # In production, use instance.manual_reset! instead.
+        instance.manual_reset!
+      end
+      # Get transition history.
+      #
+      # @return [Array<Hash>] last 20 transitions with {from:, to:, timestamp:, context:}
+      def self.history
+        instance.history
+      end
+      # Get current attempt count.
+      # @return [Integer]
+      def self.attempt_count
+        instance.attempt_count
+      end
+      # Get the failed GPU ID (if in DEGRADED or RECOVERING state).
+      # @return [Integer, nil]
+      def self.failed_gpu_id
+        instance.failed_gpu_id
+      end
+      # Configuration accessors.
+      # @return [Integer]
+      def self.max_attempts
+        instance.max_attempts
+      end
+      # @param value [Integer]
+      def self.max_attempts=(value)
+        instance.max_attempts = value
+      end
+      # @return [Integer]
+      def self.recovery_timeout_ms
+        instance.recovery_timeout_ms
+      end
+      # @param value [Integer]
+      def self.recovery_timeout_ms=(value)
+        instance.recovery_timeout_ms = value
+      end
+      # Instance methods
+      attr_accessor :max_attempts, :recovery_timeout_ms
+      def initialize
+        @monitor = Monitor.new
+        @state = :healthy
+        @callbacks = {} # layer => {on_degraded: proc, ...}
+        @history = []
+        @attempt_count = 0
+        @failed_gpu_id = nil
+        @failed_reason = nil
+        @max_attempts = DEFAULT_MAX_ATTEMPTS
+        @recovery_timeout_ms = DEFAULT_RECOVERY_TIMEOUT_MS
+      end
+      def register(layer:, on_degraded: nil, on_recovering: nil, on_healthy: nil, on_failed: nil)
+        @monitor.synchronize do
+          @callbacks[layer] = {
+            on_degraded: on_degraded,
+            on_recovering: on_recovering,
+            on_healthy: on_healthy,
+            on_failed: on_failed
+          }
+        end
+      end
+      # @return [Symbol]
+      def state
+        @monitor.synchronize { @state }
+      end
+      # @return [Integer]
+      def attempt_count
+        @monitor.synchronize { @attempt_count }
+      end
+      # @return [Integer, nil]
+      def failed_gpu_id
+        @monitor.synchronize { @failed_gpu_id }
+      end
+      def begin!(gpu_id:, reason:)
+        @monitor.synchronize do
+          # Transition HEALTHY → DEGRADED
+          unless @state == :healthy
+            raise "Cannot begin recovery: current state is #{@state.inspect}, expected :healthy"
+          end
+          @failed_gpu_id = gpu_id
+          @failed_reason = reason
+          @attempt_count += 1
+          transition_to!(:degraded, context: { gpu_id: gpu_id, reason: reason, attempt: @attempt_count })
+          # Fire on_degraded callbacks
+          fire_callbacks(:on_degraded, gpu_id: gpu_id, reason: reason)
+          # Immediately transition DEGRADED → RECOVERING
+          if @attempt_count > @max_attempts
+            transition_to!(:failed, context: { gpu_id: gpu_id, reason: :max_attempts_exceeded, attempt: @attempt_count })
+            fire_callbacks(:on_failed, gpu_id: gpu_id, reason: :max_attempts_exceeded)
+          else
+            transition_to!(:recovering, context: { gpu_id: gpu_id, reason: reason, attempt: @attempt_count })
+            fire_callbacks(:on_recovering, gpu_id: gpu_id, reason: reason)
+          end
+          @state
+        end
+      end
+      def complete!(recovered_gpus:)
+        @monitor.synchronize do
+          unless @state == :recovering
+            raise "Cannot complete recovery: current state is #{@state.inspect}, expected :recovering"
+          end
+          @attempt_count = 0
+          transition_to!(:healthy, context: { recovered_gpus: recovered_gpus })
+          fire_callbacks(:on_healthy, recovered_gpus: recovered_gpus)
+          @failed_gpu_id = nil
+          @failed_reason = nil
+          @state
+        end
+      end
+      def manual_reset!
+        @monitor.synchronize do
+          unless @state == :failed
+            raise "Cannot manual reset: current state is #{@state.inspect}, expected :failed"
+          end
+          @attempt_count = 0
+          transition_to!(:recovering, context: { manual_reset: true })
+          fire_callbacks(:on_recovering, manual_reset: true)
+          @state
+        end
+      end
+      def history
+        @monitor.synchronize do
+          @history.dup
+        end
+      end
+      private
+      # Perform a state transition and record it in history.
+      #
+      # @param new_state [Symbol]
+      # @param context [Hash] additional context for the transition
+      def transition_to!(new_state, context: {})
+        from = @state
+        unless TRANSITIONS[from]&.include?(new_state)
+          raise "Invalid transition: #{from.inspect} → #{new_state.inspect}"
+        end
+        entry = {
+          from: from,
+          to: new_state,
+          timestamp: Time.now,
+          context: context
+        }
+        @history << entry
+        @history.shift while @history.size > MAX_HISTORY
+        @state = new_state
+      end
+      # Fire all registered callbacks for a given event.
+      #
+      # Catches and logs callback exceptions (never raises).
+      #
+      # @param event [Symbol] callback event name (e.g., :on_degraded)
+      # @param kwargs [Hash] keyword arguments passed to the callback
+      def fire_callbacks(event, **kwargs)
+        @callbacks.each do |layer, cbs|
+          cb = cbs[event]
+          next unless cb
+          begin
+            cb.call(**kwargs)
+          rescue => e
+            $stderr.puts "[RecoveryProtocol] Callback #{event} for layer #{layer} raised: #{e.class}: #{e.message}"
+          end
+        end
+      end
+    end
+  end
+end

data/lib/nvruby/configuration.rb ADDED Viewed

@@ -0,0 +1,217 @@
+# frozen_string_literal: true
+require "logger"
+module Ignis
+  # Global configuration for Ignis
+  # Thread-safe configuration access using Mutex
+  class Configuration
+    # Default CUDA installation paths — Windows and Linux
+    DEFAULT_CUDA_PATHS = if RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
+      [
+        'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1',
+        'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0',
+        'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6',
+        'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5',
+        'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4',
+        'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.3',
+        'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2',
+        'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1',
+        'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0',
+        'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8'
+      ].freeze
+    else
+      [
+        '/usr/local/cuda-13.1',
+        '/usr/local/cuda-13.0',
+        '/usr/local/cuda-12.6',
+        '/usr/local/cuda-12.5',
+        '/usr/local/cuda',
+        '/opt/cuda'
+      ].freeze
+    end
+    # @return [String, nil] Custom CUDA installation path
+    attr_accessor :cuda_path
+    # @return [Integer] Default device index to use
+    attr_accessor :default_device
+    # @return [Boolean] Whether to enable autotuning by default
+    attr_accessor :autotuning_enabled
+    # @return [Integer] Default number of autotuning iterations
+    attr_accessor :autotuning_iterations
+    # @return [Logger] Logger instance for Ignis operations
+    attr_accessor :logger
+    # @return [Symbol] Log level (:debug, :info, :warn, :error, :fatal)
+    attr_reader :log_level
+    # @return [Boolean] Whether to use synchronous execution by default
+    attr_accessor :synchronous
+    # @return [Integer] Default workspace size in bytes for cuBLAS operations
+    attr_accessor :default_workspace_size
+    # @return [Boolean] Whether to enable memory pooling
+    attr_accessor :memory_pooling
+    # @return [Integer] Maximum memory pool size in bytes (0 for unlimited)
+    attr_accessor :max_pool_size
+    # Alias for memory.rb compatibility
+    alias use_memory_pool memory_pooling
+    alias use_memory_pool= memory_pooling=
+    def initialize
+      @mutex = Mutex.new
+      @cuda_path = nil
+      @default_device = 0
+      @autotuning_enabled = true
+      @autotuning_iterations = 10
+      @logger = create_default_logger
+      @log_level = :info
+      @synchronous = false
+      @default_workspace_size = 32 * 1024 * 1024 # 32 MB
+      @memory_pooling = true
+      @max_pool_size = 0
+    end
+    # Set log level
+    # @param level [Symbol] Log level (:debug, :info, :warn, :error, :fatal)
+    # @return [void]
+    def log_level=(level)
+      @mutex.synchronize do
+        @log_level = level
+        @logger.level = log_level_to_constant(level)
+      end
+    end
+    # Get the resolved CUDA path
+    # @return [String, nil] The CUDA installation path or nil if not found
+    def resolved_cuda_path
+      @mutex.synchronize do
+        return @cuda_path if @cuda_path && File.directory?(@cuda_path)
+        # Check environment variable
+        env_path = ENV["CUDA_PATH"]
+        return env_path if env_path && File.directory?(env_path)
+        # Search default paths
+        DEFAULT_CUDA_PATHS.find { |path| File.directory?(path) }
+      end
+    end
+    # Get the CUDA bin directory
+    # @return [String, nil] Path to CUDA bin/lib directory
+    def cuda_bin_path
+      base = resolved_cuda_path
+      return nil unless base
+      # Use forward slashes for Ruby compatibility
+      base_normalized = base.tr("\\", "/")
+      if RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
+        # Windows: check bin/x64 first, then bin
+        x64_path = File.join(base_normalized, "bin", "x64")
+        if File.directory?(x64_path)
+          dll_check = Dir.glob(File.join(x64_path, "*.dll"))
+          return x64_path if dll_check.any?
+        end
+        File.join(base_normalized, "bin")
+      else
+        # Linux: use lib64
+        lib64_path = File.join(base_normalized, "lib64")
+        return lib64_path if File.directory?(lib64_path)
+        File.join(base_normalized, "lib")
+      end
+    end
+    # Get the CUDA lib directory
+    # @return [String, nil] Path to CUDA lib directory
+    def cuda_lib_path
+      base = resolved_cuda_path
+      return nil unless base
+      if RUBY_PLATFORM.match?(/mswin|mingw|cygwin/i)
+        lib_path = File.join(base, "lib", "x64")
+        return lib_path if File.directory?(lib_path)
+        File.join(base, "lib")
+      else
+        lib_path = File.join(base, "lib64")
+        return lib_path if File.directory?(lib_path)
+        File.join(base, "lib")
+      end
+    end
+    # Reset configuration to defaults
+    # @return [void]
+    def reset!
+      @mutex.synchronize do
+        @cuda_path = nil
+        @default_device = 0
+        @autotuning_enabled = true
+        @autotuning_iterations = 10
+        @logger = create_default_logger
+        @log_level = :info
+        @synchronous = false
+        @default_workspace_size = 32 * 1024 * 1024
+        @memory_pooling = true
+        @max_pool_size = 0
+      end
+    end
+    # Thread-safe read of configuration values
+    # @yield Block with access to configuration
+    # @return [Object] Return value of the block
+    def with_lock(&block)
+      @mutex.synchronize(&block)
+    end
+    private
+    # @return [Logger] Configured logger instance
+    def create_default_logger
+      logger = Logger.new($stdout)
+      logger.level = Logger::INFO
+      logger.formatter = proc do |severity, datetime, _progname, msg|
+        "[Ignis #{datetime.strftime('%Y-%m-%d %H:%M:%S')}] #{severity}: #{msg}\n"
+      end
+      logger
+    end
+    # @param level [Symbol] Symbolic log level
+    # @return [Integer] Logger constant
+    def log_level_to_constant(level)
+      case level
+      when :debug then Logger::DEBUG
+      when :info  then Logger::INFO
+      when :warn  then Logger::WARN
+      when :error then Logger::ERROR
+      when :fatal then Logger::FATAL
+      else Logger::INFO
+      end
+    end
+  end
+  class << self
+    # @return [Configuration] Global configuration instance
+    def configuration
+      @configuration ||= Configuration.new
+    end
+    # Configure Ignis
+    # @yield [Configuration] Configuration instance
+    # @return [void]
+    def configure
+      yield(configuration) if block_given?
+    end
+    # @return [Logger] Logger instance
+    def logger
+      configuration.logger
+    end
+  end
+end