RubyGems - ignis-autograd - Versions diffs - 0.0.1 - Mend

ignis-autograd 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: fcf74e67da11fb5e8dc37233c01fe0f47e4e60cbe038911e4fff43a582376084
+  data.tar.gz: 7a57cf46813c15e2a067195be05ffcc7bdeafdc63e7b66a2544c42e8aa2dc539
+SHA512:
+  metadata.gz: 7fc7b3badc27f7a94d3a7ff6416db11084aa04ed10b9c88d40ed99c3e1889a8868275168907def8ad8ab8f14ba9604d4b43dcc20c3f97a99d2c51c68304af0a5
+  data.tar.gz: 918f846d561833195db3fce0cb41d64484fd18d907223f1f032621a4b01e6979e9acca56b7886ac70c157f6ad19a60aa36521340ce4c34cc929f416cfefdcaf8

data/README.md ADDED Viewed

@@ -0,0 +1,14 @@
+# ignis-autograd
+Reverse-mode automatic differentiation over GPU arrays, on the [`ignis`](https://rubygems.org/gems/ignis) foundation.
+Adds `Ignis::AI::Tensor` (a differentiable tensor) and an autograd tape. Build computation graphs on the GPU and get exact gradients (verified against finite differences).
+```ruby
+require "ignis-autograd"
+x = Ignis::AI::Tensor.from_host([1.0, 2.0, 3.0], shape: [3], requires_grad: true)
+(x * x).sum.backward!
+x.grad.to_host   # => [2.0, 4.0, 6.0]
+```
+MIT.

data/lib/ignis-autograd.rb ADDED Viewed

@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+# ignis-autograd — reverse-mode automatic differentiation over GPU arrays.
+# Adds Ignis::AI::Tensor (a differentiable tensor) and the autograd tape on top
+# of the Ignis GPU foundation.
+require "ignis"
+require_relative "nnw/ai/tape"
+require_relative "nnw/ai/tensor"
+require_relative "nnw/ai/device"

data/lib/nnw/ai/device.rb ADDED Viewed

@@ -0,0 +1,257 @@
+# frozen_string_literal: true
+module Ignis
+  module AI
+    # Device — dynamic GPU capability detection and configuration.
+    #
+    # Queries GPU properties at runtime: VRAM, compute capability, SM count, etc.
+    # All model configurations adapt based on the actual hardware present.
+    # No hardcoded GPU assumptions.
+    class Device
+      # GPU properties struct: one record per device, built by
+      # query_device_properties and constructed with keyword arguments
+      # (keyword_init: true).
+      #
+      # total_memory_mb and total_memory_gb are derived from total_memory_bytes
+      # using binary units (1024**2 and 1024**3); the GB figure is rounded to two
+      # decimals. clock_rate_mhz and memory_clock_mhz are the values reported by
+      # the driver wrapper divided by 1000 (presumably kHz -> MHz; confirm against
+      # Ignis::CUDA::Device.properties). compute_capability is "0.0" when unknown.
+      DeviceProperties = Struct.new(
+        :id, :name, :compute_capability,
+        :total_memory_bytes, :total_memory_mb, :total_memory_gb,
+        :sm_count, :max_threads_per_block, :max_threads_per_sm,
+        :warp_size, :clock_rate_mhz, :memory_clock_mhz,
+        :l2_cache_size, :shared_mem_per_block,
+        keyword_init: true
+      )
+      class << self
+        # Query all GPU devices and cache properties.
+        # @return [Array<DeviceProperties>]
+        def all_devices
+          @all_devices ||= enumerate_devices
+        end
+        # Get properties for a specific device.
+        # @param device_id [Integer]
+        # @return [DeviceProperties]
+        def properties(device_id = 0)
+          all_devices[device_id] || raise("No GPU device #{device_id} found")
+        end
+        # Total VRAM on device in bytes.
+        # @param device_id [Integer]
+        # @return [Integer]
+        def total_memory(device_id = 0)
+          properties(device_id).total_memory_bytes
+        end
+        # Estimate free VRAM (queries cudaMemGetInfo).
+        # @param device_id [Integer]
+        # @return [Integer] free bytes
+        def free_memory(device_id = 0)
+          query_free_memory(device_id)
+        end
+        # Number of available GPUs.
+        # @return [Integer]
+        def count
+          all_devices.length
+        end
+        # Summary string for logging.
+        # @return [String]
+        def summary
+          lines = ["GPU Devices (#{count}):"]
+          all_devices.each do |dev|
+            lines << "  [#{dev.id}] #{dev.name} | #{dev.total_memory_gb}GB VRAM | " \
+                     "CC #{dev.compute_capability} | #{dev.sm_count} SMs"
+          end
+          lines.join("\n")
+        end
+        # Recommend optimal batch size and sequence length for a model.
+        # @param model_params [Integer] total parameters
+        # @param dtype_bytes [Integer] bytes per parameter (2 for FP16, 4 for FP32)
+        # @param device_id [Integer]
+        # @param target_utilization [Float] fraction of VRAM to use (0.0-1.0)
+        # @return [Hash] :batch_size, :seq_len, :use_flash_attention, :use_gradient_checkpointing
+        def recommend_config(model_params, dtype_bytes: 4, device_id: 0, target_utilization: 0.85)
+          dev = properties(device_id)
+          available_bytes = (dev.total_memory_bytes * target_utilization).to_i
+          # Weight memory
+          weight_bytes = model_params * dtype_bytes
+          # Optimizer state (Adam: 2x params for m, v)
+          optimizer_bytes = model_params * 4 * 2
+          # Gradient storage
+          gradient_bytes = model_params * dtype_bytes
+          # Fixed overhead
+          fixed_bytes = weight_bytes + optimizer_bytes + gradient_bytes
+          # Remaining for activations
+          activation_budget = available_bytes - fixed_bytes
+          if activation_budget <= 0
+            return {
+              batch_size: 1,
+              seq_len: 128,
+              use_flash_attention: true,
+              use_gradient_checkpointing: true,
+              warning: "Model too large for this GPU. Consider model parallelism or FP16."
+            }
+          end
+          # Estimate activation memory per token per layer
+          # Rough estimate: 4 * hidden_dim * dtype_bytes per token per layer
+          # Hidden dim ~ sqrt(model_params / 12) for typical transformers
+          estimated_hidden = Math.sqrt(model_params / 12.0).to_i
+          estimated_layers = [model_params / (estimated_hidden * estimated_hidden * 12), 1].max
+          activation_per_token = 4 * estimated_hidden * dtype_bytes * estimated_layers
+          # Target: batch_size * seq_len * activation_per_token <= activation_budget
+          total_tokens = activation_budget / [activation_per_token, 1].max
+          # Prefer seq_len of 1024, adjust batch_size
+          seq_len = [1024, total_tokens].min
+          batch_size = [total_tokens / seq_len, 1].max
+          # Flash attention saves O(N²) memory — worth it for seq_len > 512
+          use_flash = seq_len > 512
+          # Gradient checkpointing if we're tight on memory
+          use_checkpointing = activation_budget < weight_bytes * 2
+          {
+            batch_size: batch_size.to_i,
+            seq_len: seq_len.to_i,
+            use_flash_attention: use_flash,
+            use_gradient_checkpointing: use_checkpointing,
+            estimated_vram_usage_gb: (fixed_bytes / (1024.0**3)).round(2),
+            available_vram_gb: (available_bytes / (1024.0**3)).round(2),
+            activation_budget_gb: (activation_budget / (1024.0**3)).round(2)
+          }
+        end
+        # Check if multi-GPU is available and worth using.
+        # @return [Hash] :multi_gpu, :device_ids, :strategy
+        def multi_gpu_config
+          devs = all_devices
+          if devs.length <= 1
+            return { multi_gpu: false, device_ids: [0], strategy: :single }
+          end
+          # Check if devices are compatible (same compute capability)
+          ccs = devs.map(&:compute_capability).uniq
+          if ccs.length == 1
+            { multi_gpu: true, device_ids: devs.map(&:id), strategy: :data_parallel }
+          else
+            # Heterogeneous GPUs — only use matching ones
+            dominant_cc = devs.group_by(&:compute_capability).max_by { |_, v| v.length }[0]
+            matching = devs.select { |d| d.compute_capability == dominant_cc }
+            {
+              multi_gpu: matching.length > 1,
+              device_ids: matching.map(&:id),
+              strategy: :data_parallel,
+              warning: "Heterogeneous GPUs detected. Using #{matching.length} " \
+                       "devices with CC #{dominant_cc}."
+            }
+          end
+        end
+        # Clear cached device info (call after hardware changes).
+        # Drops the memoised list so the next all_devices call enumerates again.
+        # @return [void]
+        def reset!
+          @all_devices = nil
+        end
+        private
+        # Enumerate all CUDA devices.
+        # @return [Array<DeviceProperties>]
+        def enumerate_devices
+          devices = []
+          begin
+            device_count = Ignis::CUDA::Device.count
+          rescue
+            return devices
+          end
+          device_count.times do |id|
+            begin
+              props = query_device_properties(id)
+              devices << props
+            rescue => e
+              Ignis.logger.warn("Failed to query GPU #{id}: #{e.message}")
+            end
+          end
+          devices
+        end
+        # Query device properties via CUDA Runtime API.
+        # @param device_id [Integer]
+        # @return [DeviceProperties]
+        def query_device_properties(device_id)
+          # Use cudaGetDeviceProperties or Ignis wrappers
+          if defined?(Ignis::CUDA::Device) && Ignis::CUDA::Device.respond_to?(:properties)
+            props = Ignis::CUDA::Device.properties(device_id)
+            DeviceProperties.new(
+              id: device_id,
+              name: props[:name] || "GPU #{device_id}",
+              compute_capability: props[:compute_capability] || "0.0",
+              total_memory_bytes: props[:total_global_mem] || 0,
+              total_memory_mb: (props[:total_global_mem] || 0) / (1024 * 1024),
+              total_memory_gb: ((props[:total_global_mem] || 0) / (1024.0**3)).round(2),
+              sm_count: props[:multi_processor_count] || 0,
+              max_threads_per_block: props[:max_threads_per_block] || 1024,
+              max_threads_per_sm: props[:max_threads_per_multi_processor] || 2048,
+              warp_size: props[:warp_size] || 32,
+              clock_rate_mhz: (props[:clock_rate] || 0) / 1000,
+              memory_clock_mhz: (props[:memory_clock_rate] || 0) / 1000,
+              l2_cache_size: props[:l2_cache_size] || 0,
+              shared_mem_per_block: props[:shared_mem_per_block] || 49152
+            )
+          else
+            # Fallback: use cudaMemGetInfo for at least memory info
+            free, total = query_memory_info(device_id)
+            DeviceProperties.new(
+              id: device_id,
+              name: "CUDA Device #{device_id}",
+              compute_capability: "0.0",
+              total_memory_bytes: total,
+              total_memory_mb: total / (1024 * 1024),
+              total_memory_gb: (total / (1024.0**3)).round(2),
+              sm_count: 0,
+              max_threads_per_block: 1024,
+              max_threads_per_sm: 2048,
+              warp_size: 32,
+              clock_rate_mhz: 0,
+              memory_clock_mhz: 0,
+              l2_cache_size: 0,
+              shared_mem_per_block: 49152
+            )
+          end
+        end
+        # Query free and total memory via cudaMemGetInfo.
+        # @param device_id [Integer]
+        # @return [Array(Integer, Integer)] [free_bytes, total_bytes]
+        def query_memory_info(device_id)
+          if defined?(Ignis::CUDA::RuntimeAPI)
+            free_ptr = Fiddle::Pointer.malloc(8, Fiddle::RUBY_FREE)
+            total_ptr = Fiddle::Pointer.malloc(8, Fiddle::RUBY_FREE)
+            Ignis::CUDA::RuntimeAPI.cudaMemGetInfo(free_ptr, total_ptr)
+            [free_ptr[0, 8].unpack1("Q"), total_ptr[0, 8].unpack1("Q")]
+          else
+            [0, 0]
+          end
+        end
+        # Query current free VRAM.
+        # @param device_id [Integer]
+        # @return [Integer]
+        def query_free_memory(device_id)
+          query_memory_info(device_id)[0]
+        end
+      end
+    end
+  end
+end

data/lib/nnw/ai/tape.rb ADDED Viewed

@@ -0,0 +1,200 @@
+# frozen_string_literal: true
+module Ignis
+  module AI
+    # Tape — fiber-local reverse-mode automatic differentiation.
+    #
+    # Each Ruby fiber/thread gets its own tape. Operations record
+    # backward functions during forward pass. backward! does topological
+    # sort and reverse walk to compute gradients.
+    #
+    # @example
+    #   a = Tensor.from_host([2.0], shape: [1], requires_grad: true)
+    #   b = a * a
+    #   b.backward!
+    #   a.grad.to_host  # => [4.0]
+    class Tape
+      # Thread-local tape key
+      TAPE_KEY = :nnw_ai_tape
+      NO_GRAD_KEY = :nnw_ai_no_grad
+      # An entry on the tape representing one operation.
+      Entry = Struct.new(:output, :inputs, :backward_fn, keyword_init: true)
+      class << self
+        # Record an operation on the tape.
+        # @param output [Tensor] the result tensor
+        # @param inputs [Array<Tensor>] input tensors
+        # @yield [Ignis::Shared::NvArray] receives gradient, must return Array of NvArrays
+        # @return [void]
+        def record(output, inputs:, &backward_fn)
+          return if no_grad_active?
+          return unless output.requires_grad
+          tape = current_tape
+          entry = Entry.new(output: output, inputs: inputs, backward_fn: backward_fn)
+          output._tape_id = tape.length
+          tape << entry
+        end
+        # Run reverse-mode AD from a tensor.
+        #
+        # Walks the current tape from newest to oldest entry. For every entry whose
+        # output already has a gradient, its backward_fn is called with that
+        # gradient and the returned per-input gradients are stored (first
+        # occurrence) or added in place (later occurrences). Once the walk is done,
+        # each leaf tensor that received a gradient gets it assigned to .grad, or
+        # accumulated into an existing .grad, and the tape is cleared.
+        #
+        # Returns immediately, leaving every .grad untouched, when the tape is
+        # empty. NOTE(review): clear! is not inside an ensure, so if a backward_fn
+        # raises the tape is left populated.
+        # @param tensor [Tensor] the output tensor to differentiate
+        # @param grad_output [Ignis::Shared::NvArray] initial gradient
+        # @return [void]
+        def backward!(tensor, grad_output)
+          tape = current_tape
+          return if tape.empty?
+          # Build a map of tensor object_id → accumulated gradient (NvArray).
+          # This is the single source of truth during the reverse walk; leaf
+          # .grad is written ONCE afterwards. Writing both during the walk caused
+          # double-counting when a leaf was reused (e.g. x in x*x): grad_map[x]
+          # and x.grad aliased the same buffer, so the second occurrence
+          # accumulated into it twice.
+          grad_map = {}
+          grad_map[tensor.object_id] = grad_output
+          leaves = {} # object_id => leaf Tensor that received gradient
+          # Buffers grad_map EXCLUSIVELY OWNS, tracked by Ruby object identity.
+          # accumulate_grads! mutates its dst in place, so the tape must never
+          # store or accumulate a buffer that another grad_map entry also
+          # references — an in-place add would silently corrupt the aliased entry.
+          # Backward closures are free to return aliased buffers (e.g. `+` returns
+          # [grad, grad]; `-` returns [grad, neg_grad] reusing the upstream grad).
+          # We clone on the way in to restore exclusive ownership. Clones happen
+          # ONLY on these aliasing paths; the common case (fresh buffer per input)
+          # never clones.
+          owned = {}.compare_by_identity
+          owned[grad_output] = true
+          # Walk tape in reverse order (topological by construction)
+          tape.reverse_each do |entry|
+            output = entry.output
+            output_grad = grad_map[output.object_id]
+            # No gradient reached this output, so it does not contribute.
+            next unless output_grad
+            # Call backward function to get input gradients
+            input_grads = entry.backward_fn.call(output_grad)
+            # Accumulate gradients for each input
+            entry.inputs.each_with_index do |input_tensor, i|
+              next unless input_tensor.requires_grad
+              input_grad = input_grads[i]
+              # A nil slot means the backward function has no gradient for it.
+              next unless input_grad
+              if grad_map.key?(input_tensor.object_id)
+                dst = grad_map[input_tensor.object_id]
+                # Never accumulate a buffer into itself (would compute 2*dst):
+                # clone so we add a snapshot of src's current value.
+                src = input_grad.equal?(dst) ? input_grad.clone : input_grad
+                accumulate_grads!(dst, src)
+              else
+                # Take exclusive ownership. If this exact buffer is already owned
+                # by another entry (the aliasing case), clone before storing.
+                input_grad = input_grad.clone if owned[input_grad]
+                grad_map[input_tensor.object_id] = input_grad
+                owned[input_grad] = true
+              end
+              leaves[input_tensor.object_id] = input_tensor if input_tensor.is_leaf
+            end
+          end
+          # Assign accumulated gradients to leaf tensors. Accumulate into any
+          # pre-existing .grad so gradient accumulation across multiple
+          # backward! calls (e.g. micro-batching) still works.
+          leaves.each do |oid, leaf|
+            g = grad_map[oid]
+            next unless g
+            if leaf.grad && !leaf.grad.equal?(g)
+              accumulate_grads!(leaf.grad, g)
+            else
+              leaf.grad = g
+            end
+          end
+          # Clear tape after backward (each backward is a fresh computation)
+          clear!
+        end
+        # Disable gradient recording inside block.
+        # @yield block where no gradients are recorded
+        # @return [Object] block return value
+        def no_grad(&block)
+          prev = Thread.current[NO_GRAD_KEY]
+          Thread.current[NO_GRAD_KEY] = true
+          begin
+            block.call
+          ensure
+            Thread.current[NO_GRAD_KEY] = prev
+          end
+        end
+        # Check if no_grad is currently active.
+        # @return [Boolean]
+        def no_grad_active?
+          Thread.current[NO_GRAD_KEY] == true
+        end
+        # Gradient checkpointing: recompute activations during backward.
+        # Stores only inputs + output. Reruns forward in backward pass.
+        # Critical for large models on 12GB VRAM.
+        # @param inputs [Array<Tensor>] input tensors to save
+        # @yield block that computes the forward pass
+        # @return [Tensor] the output tensor
+        def gradient_checkpoint(inputs, &forward_fn)
+          # Run forward with no_grad to avoid double recording
+          output = no_grad { forward_fn.call }
+          # Record a special tape entry that recomputes forward in backward
+          if output.requires_grad
+            saved_inputs = inputs.map { |t| t.data }
+            record(output, inputs: inputs) do |grad|
+              # Recompute forward pass to get intermediate values
+              recomputed = forward_fn.call
+              # Now the tape has entries for this recomputation
+              # Run backward on the recomputed output
+              Tape.backward!(recomputed, grad)
+              # Collect input gradients
+              inputs.map { |t| t.grad }
+            end
+          end
+          output
+        end
+        # Get current thread's tape.
+        # @return [Array<Entry>]
+        def current_tape
+          Thread.current[TAPE_KEY] ||= []
+        end
+        # Clear current thread's tape.
+        # Installs a fresh empty array instead of emptying the existing one, so
+        # code still holding the previous array keeps seeing its entries.
+        # @return [void]
+        def clear!
+          Thread.current[TAPE_KEY] = []
+        end
+        # Get tape size (for debugging).
+        # @return [Integer]
+        def size
+          current_tape.length
+        end
+        private
+        # Accumulate gradients: dst += src using GPU kernel
+        # @param dst [Ignis::Shared::NvArray]
+        # @param src [Ignis::Shared::NvArray]
+        # @return [void]
+        def accumulate_grads!(dst, src)
+          n = dst.numel
+          kernel = Ignis::JIT::Kernels::Elementwise.accumulate
+          kernel.launch(grid: [(n + 255) / 256], block: [256], args: [dst, src, n])
+          Ignis.synchronize
+        end
+      end
+    end
+  end
+end