RubyGems - ignis-autograd - Versions diffs - 0.0.1 - Mend

ignis-autograd 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/lib/nnw/ai/tensor.rb ADDED Viewed

@@ -0,0 +1,931 @@
+# frozen_string_literal: true
+require_relative "tape"
+module Ignis
+  module AI
+    # Tensor — the user-facing GPU tensor type for AI operations.
+    #
+    # Wraps Ignis::Shared::NvArray and adds gradient tracking for autograd.
+    # All compute ops record backward functions on the tape when requires_grad is true.
+    #
+    # @example Forward + backward
+    #   a = Ignis::AI::Tensor.from_host([1.0, 2.0, 3.0], shape: [3], requires_grad: true)
+    #   b = a * a   # b = a^2
+    #   b.sum.backward!
+    #   a.grad  # => [2.0, 4.0, 6.0]
+    class Tensor
+      # @return [Ignis::Shared::NvArray] underlying GPU data
+      attr_reader :data
+      # @return [Ignis::Shared::NvArray, nil] gradient (same shape as data)
+      attr_accessor :grad
+      # @return [Boolean] whether this tensor participates in autograd
+      attr_reader :requires_grad
+      # @return [Proc, nil] backward function recorded by the tape
+      attr_accessor :grad_fn
+      # @return [Boolean] true if created by user (not computed)
+      attr_reader :is_leaf
+      # @return [Integer, nil] position in current tape
+      attr_accessor :_tape_id
+      # @param data [Ignis::Shared::NvArray]
+      # @param requires_grad [Boolean]
+      # @param grad_fn [Proc, nil]
+      # @param is_leaf [Boolean]
+      def initialize(data:, requires_grad: false, grad_fn: nil, is_leaf: true)
+        @data = data
+        @requires_grad = requires_grad
+        @grad = nil
+        @grad_fn = grad_fn
+        @is_leaf = is_leaf
+        @_tape_id = nil
+      end
+      # -------------------------------------------------------------------
+      # Constructors
+      # -------------------------------------------------------------------
+      # Wrap an existing NvArray.
+      # @param nv_array [Ignis::Shared::NvArray]
+      # @param requires_grad [Boolean]
+      # @return [Tensor]
+      def self.from_nv_array(nv_array, requires_grad: false)
+        new(data: nv_array, requires_grad: requires_grad)
+      end
+      # Create a zero-filled tensor.
+      # @param shape [Array<Integer>]
+      # @param dtype [Symbol]
+      # @param device_id [Integer]
+      # @param requires_grad [Boolean]
+      # @return [Tensor]
+      def self.zeros(shape, dtype: :float32, device_id: 0, requires_grad: false)
+        nv = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
+        nv.from_host(Array.new(nv.numel, 0.0))
+        new(data: nv, requires_grad: requires_grad)
+      end
+      # Create a ones-filled tensor.
+      # @param shape [Array<Integer>]
+      # @param dtype [Symbol]
+      # @param device_id [Integer]
+      # @param requires_grad [Boolean]
+      # @return [Tensor]
+      def self.ones(shape, dtype: :float32, device_id: 0, requires_grad: false)
+        nv = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
+        nv.from_host(Array.new(nv.numel, 1.0))
+        new(data: nv, requires_grad: requires_grad)
+      end
+      # Create a tensor with random uniform values in [0, 1).
+      # @param shape [Array<Integer>]
+      # @param dtype [Symbol]
+      # @param device_id [Integer]
+      # @param requires_grad [Boolean]
+      # @return [Tensor]
+      def self.rand(shape, dtype: :float32, device_id: 0, requires_grad: false)
+        nv = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
+        nv.from_host(Array.new(nv.numel) { Kernel.rand })
+        new(data: nv, requires_grad: requires_grad)
+      end
+      # Create a tensor from a Ruby array.
+      # @param ruby_array [Array<Numeric>]
+      # @param shape [Array<Integer>]
+      # @param dtype [Symbol]
+      # @param device_id [Integer]
+      # @param requires_grad [Boolean]
+      # @return [Tensor]
+      def self.from_host(ruby_array, shape:, dtype: :float32, device_id: 0, requires_grad: false)
+        nv = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
+        nv.from_host(ruby_array)
+        new(data: nv, requires_grad: requires_grad)
+      end
+      # -------------------------------------------------------------------
+      # Shape / dtype delegation
+      # -------------------------------------------------------------------
+      # @return [Array<Integer>]
+      def shape
+        @data.shape
+      end
+      # @return [Symbol]
+      def dtype
+        @data.dtype
+      end
+      # @return [Integer]
+      def numel
+        @data.numel
+      end
+      # @return [Integer]
+      def device_id
+        @data.device_id
+      end
+      # -------------------------------------------------------------------
+      # Compute ops — each records grad_fn when requires_grad is true
+      # -------------------------------------------------------------------
+      # Matrix multiplication: self @ other
+      # @param other [Tensor]
+      # @param transpose_b [Boolean] compute self @ other^T (cuBLAS transposes in
+      #   the GEMM — avoids materializing other^T, which for the LM head was a
+      #   765ms/forward transpose of a 38M-element weight). Used by Linear.
+      # @return [Tensor]
+      def matmul(other, transpose_b: false)
+        result_data = Ignis::LinAlg::Matmul.call(@data, other.data, transpose_b: transpose_b)
+        result = Tensor.new(data: result_data, requires_grad: should_track?(other), is_leaf: false)
+        if result.requires_grad
+          saved_self = @data
+          saved_other = other.data
+          Tape.record(result, inputs: [self, other]) do |grad|
+            if transpose_b
+              # y = A @ Bᵀ  ⇒  dA = grad @ B,  dB = gradᵀ @ A
+              grad_a = Ignis::LinAlg::Matmul.call(grad, saved_other)
+              grad_b = Ignis::LinAlg::Matmul.call(grad, saved_self, transpose_a: true)
+            else
+              # dA = grad @ Bᵀ,  dB = Aᵀ @ grad
+              grad_a = Ignis::LinAlg::Matmul.call(grad, saved_other, transpose_b: true)
+              grad_b = Ignis::LinAlg::Matmul.call(saved_self, grad, transpose_a: true)
+            end
+            [grad_a, grad_b]
+          end
+        end
+        result
+      end
+      # Row-broadcast bias add: self [rows, cols] + bias [cols] -> [rows, cols].
+      # (Linear layer bias; plain Tensor#+ requires equal element counts.)
+      # @param bias [Tensor] bias vector of length shape[-1]
+      # @return [Tensor]
+      def add_bias(bias)
+        cols = shape[-1]
+        rows = numel / cols
+        result_nv = alloc_like(@data)
+        kernel = Ignis::JIT::Kernels::Elementwise.add_bias_rows
+        kernel.launch(grid: [(numel + 255) / 256], block: [256],
+                      args: [@data, bias.data, result_nv, rows, cols])
+        result = Tensor.new(data: result_nv, requires_grad: should_track?(bias), is_leaf: false)
+        if result.requires_grad
+          Tape.record(result, inputs: [self, bias]) do |grad|
+            # d/d(input) = grad (passthrough); d/d(bias) = sum over rows
+            grad_bias = zeros_nv([cols])
+            bk = Ignis::JIT::Kernels::Elementwise.add_backward_broadcast
+            bk.launch(grid: [(cols + 255) / 256], block: [256], args: [grad, grad_bias, rows, cols])
+            [grad, grad_bias]
+          end
+        end
+        result
+      end
+      # Multi-head / grouped-query scaled dot-product attention (causal optional),
+      # batch = 1. self = Q [seq, num_heads*head_dim]; +k+, +v+ = [seq,
+      # num_kv_heads*head_dim]. Returns context [seq, num_heads*head_dim].
+      #
+      # With num_kv_heads == num_heads this is standard multi-head attention. With
+      # num_kv_heads < num_heads it is Grouped-Query Attention (Llama-2-70B, Llama-3,
+      # Qwen2/3, SmolLM3): each KV head is shared by group_size = num_heads/num_kv_heads
+      # query heads. Each query head runs the Flash-Attention-2 kernel against its
+      # group's KV head. In the backward, the group_size query heads that share a KV
+      # head ACCUMULATE into that head's dK/dV (scatter-add); dQ heads are disjoint.
+      # @param k [Tensor]
+      # @param v [Tensor]
+      # @param num_heads [Integer] number of query heads
+      # @param num_kv_heads [Integer, nil] number of K/V heads (nil ⇒ num_heads = MHA)
+      # @param causal [Boolean]
+      # @return [Tensor] context [seq, num_heads*head_dim]
+      def sdpa(k, v, num_heads:, num_kv_heads: nil, causal: true)
+        # nil num_kv_heads means one KV head per query head (plain multi-head attention).
+        num_kv_heads ||= num_heads
+        raise ArgumentError, "num_heads (#{num_heads}) must be a multiple of num_kv_heads (#{num_kv_heads})" \
+          unless (num_heads % num_kv_heads).zero?
+        seq, embed = shape                 # embed = num_heads * head_dim
+        # head_dim is an integer division: a remainder would silently shrink head_dim,
+        # leave the trailing embed columns out of every per-head slice and skew the
+        # 1/sqrt(head_dim) scale. Fail loud rather than miscompute.
+        raise ArgumentError, "embed (#{embed}) must be a multiple of num_heads (#{num_heads})" \
+          unless (embed % num_heads).zero?
+        head_dim = embed / num_heads
+        # The flash-attention kernels store per-head rows in fixed [HEAD_DIM_MAX=128]
+        # register arrays and clamp every dim loop to d < 128. For head_dim > 128
+        # they would silently drop dims 128.. from scores/output/gradients with no
+        # error. Targets (Qwen3/Llama/SmolLM/Phi) use head_dim ≤ 128; fail loud above
+        # that rather than miscompute. (decode_sdpa uses cuBLAS+softmax and has no cap.)
+        raise ArgumentError,
+              "head_dim #{head_dim} exceeds flash-attention HEAD_DIM_MAX (128); " \
+              "larger heads are not yet supported by the flash kernels" if head_dim > 128
+        embed_kv = num_kv_heads * head_dim
+        group_size = num_heads / num_kv_heads
+        scale = 1.0 / Math.sqrt(head_dim)
+        cmask = causal ? 1 : 0
+        context_nv = zeros_nv([seq, embed])
+        fwd = Ignis::JIT::Kernels::Attention.flash_attention_forward
+        q_tiles = (seq + 63) / 64
+        # Forward: one kernel launch per query head, on that head's column slice.
+        num_heads.times do |h|
+          qoff = h * head_dim
+          koff = (h / group_size) * head_dim  # the KV head this query head attends to
+          qh = slice_cols_nv(@data, qoff, head_dim, seq, embed)
+          kh = slice_cols_nv(k.data, koff, head_dim, seq, embed_kv)
+          vh = slice_cols_nv(v.data, koff, head_dim, seq, embed_kv)
+          oh = zeros_nv([seq, head_dim])
+          fwd.launch(grid: [q_tiles], block: [64],
+                     args: [qh, kh, vh, oh, seq, head_dim, scale, cmask])
+          # Write this head's output back into its columns of the full context.
+          scatter_cols_nv!(oh, context_nv, qoff, head_dim, seq, embed)
+        end
+        result = Tensor.new(data: context_nv,
+                            requires_grad: @requires_grad || should_track?(k) || should_track?(v),
+                            is_leaf: false)
+        if result.requires_grad
+          # Buffers captured by the backward closure: Q, K, V and the forward output.
+          sq = @data
+          sk = k.data
+          sv = v.data
+          so = context_nv
+          Tape.record(result, inputs: [self, k, v]) do |grad|
+            d_q = zeros_nv([seq, embed])
+            d_k = zeros_nv([seq, embed_kv])
+            d_v = zeros_nv([seq, embed_kv])
+            bwd = Ignis::JIT::Kernels::Attention.flash_attention_backward
+            blk = (seq + 255) / 256
+            num_heads.times do |h|
+              qoff = h * head_dim
+              koff = (h / group_size) * head_dim
+              qh = slice_cols_nv(sq, qoff, head_dim, seq, embed)
+              kh = slice_cols_nv(sk, koff, head_dim, seq, embed_kv)
+              vh = slice_cols_nv(sv, koff, head_dim, seq, embed_kv)
+              oh = slice_cols_nv(so, qoff, head_dim, seq, embed)
+              doh = slice_cols_nv(grad, qoff, head_dim, seq, embed)
+              dqh = zeros_nv([seq, head_dim])
+              dkh = zeros_nv([seq, head_dim])
+              dvh = zeros_nv([seq, head_dim])
+              bwd.launch(grid: [blk], block: [256],
+                         args: [qh, kh, vh, oh, doh, dqh, dkh, dvh, seq, head_dim, scale, cmask])
+              # dQ heads are disjoint → overwrite. dK/dV heads are SHARED across the
+              # group → accumulate (add into a zero-initialized buffer). For MHA
+              # (group_size==1) the KV columns are disjoint too, so add-into-zero
+              # gives the same values as an overwrite would.
+              scatter_cols_nv!(dqh, d_q, qoff, head_dim, seq, embed)
+              scatter_cols_add_nv!(dkh, d_k, koff, head_dim, seq, embed_kv)
+              scatter_cols_add_nv!(dvh, d_v, koff, head_dim, seq, embed_kv)
+            end
+            # One gradient per recorded input, in the order [self, k, v].
+            [d_q, d_k, d_v]
+          end
+        end
+        result
+      end
+      # Single-query attention for autoregressive decode with a KV cache.
+      #
+      # self = q [1, embed] (the new token's query); +k+, +v+ = cached keys/values
+      # [past+1, embed] (every position up to and including the current one). The
+      # new token is the LAST position, so it attends to ALL cached positions — no
+      # causal mask is needed. Returns context [1, embed]. No autograd (decode runs
+      # under no_grad). Built from the verified column-major GEMM (the 1/sqrt(d)
+      # scale folded into alpha) + the numerically-stable softmax_forward kernel,
+      # mirroring sdpa's per-head column layout so head splitting is identical.
+      #
+      # @param k [Tensor] cached keys [past+1, embed]
+      # @param v [Tensor] cached values [past+1, embed]
+      # @param num_heads [Integer]
+      # @return [Tensor] context [1, embed]
+      def decode_sdpa(k, v, num_heads:)
+        _, embed = shape
+        tk = k.shape[0]
+        head_dim = embed / num_heads
+        scale = (1.0 / Math.sqrt(head_dim)).to_f
+        context_nv = zeros_nv([1, embed])
+        sm = Ignis::JIT::Kernels::Attention.softmax_forward
+        num_heads.times do |h|
+          off = h * head_dim
+          qh = slice_cols_nv(@data, off, head_dim, 1, embed)   # [1, hd]
+          kh = slice_cols_nv(k.data, off, head_dim, tk, embed) # [tk, hd]
+          vh = slice_cols_nv(v.data, off, head_dim, tk, embed) # [tk, hd]
+          # scores = scale * (qh @ khᵀ) → [1, tk]  (alpha folds in the scale)
+          scores = Ignis::LinAlg::Matmul.call(qh, kh, transpose_b: true, alpha: scale)
+          # probs = softmax(scores) along the tk axis → [1, tk]
+          probs = Ignis::Shared::NvArray.new(shape: [1, tk], dtype: :float32, device_id: device_id).to_device
+          sm.launch(grid: [1], block: [1], args: [scores, probs, 1, tk])
+          # ctx_h = probs @ vh → [1, hd]
+          ctx_h = Ignis::LinAlg::Matmul.call(probs, vh)
+          scatter_cols_nv!(ctx_h, context_nv, off, head_dim, 1, embed)
+        end
+        Tensor.new(data: context_nv, requires_grad: false, is_leaf: false)
+      end
+      # Elementwise addition: self + other
+      # @param other [Tensor, Numeric]
+      # @return [Tensor]
+      def +(other)
+        other = ensure_tensor(other)
+        result_nv = alloc_like(@data)
+        kernel = Ignis::JIT::Kernels::Elementwise.add_forward
+        n = numel
+        grid = [(n + 255) / 256]
+        kernel.launch(grid: grid, block: [256], args: [@data, other.data, result_nv, n])
+        result = Tensor.new(data: result_nv, requires_grad: should_track?(other), is_leaf: false)
+        if result.requires_grad
+          Tape.record(result, inputs: [self, other]) do |grad|
+            [grad, grad]  # d(a+b)/da = 1, d(a+b)/db = 1
+          end
+        end
+        result
+      end
+      # Elementwise subtraction: self - other
+      # @param other [Tensor, Numeric]
+      # @return [Tensor]
+      def -(other)
+        other = ensure_tensor(other)
+        result_nv = alloc_like(@data)
+        kernel = Ignis::JIT::Kernels::Elementwise.sub_forward
+        n = numel
+        grid = [(n + 255) / 256]
+        kernel.launch(grid: grid, block: [256], args: [@data, other.data, result_nv, n])
+        result = Tensor.new(data: result_nv, requires_grad: should_track?(other), is_leaf: false)
+        if result.requires_grad
+          Tape.record(result, inputs: [self, other]) do |grad|
+            neg_grad = alloc_like(grad)
+            scale_k = Ignis::JIT::Kernels::Elementwise.scale_forward
+            gn = grad.numel
+            scale_k.launch(grid: [(gn + 255) / 256], block: [256], args: [grad, neg_grad, -1.0, gn])
+            [grad, neg_grad]
+          end
+        end
+        result
+      end
+      # Elementwise multiplication (Hadamard): self * other
+      # @param other [Tensor, Numeric]
+      # @return [Tensor]
+      def *(other)
+        if other.is_a?(Numeric)
+          return scalar_mul(other)
+        end
+        other = ensure_tensor(other)
+        result_nv = alloc_like(@data)
+        kernel = Ignis::JIT::Kernels::Elementwise.mul_forward
+        n = numel
+        grid = [(n + 255) / 256]
+        kernel.launch(grid: grid, block: [256], args: [@data, other.data, result_nv, n])
+        result = Tensor.new(data: result_nv, requires_grad: should_track?(other), is_leaf: false)
+        if result.requires_grad
+          saved_self = @data
+          saved_other = other.data
+          Tape.record(result, inputs: [self, other]) do |grad|
+            grad_a = alloc_like(grad)
+            grad_b = alloc_like(grad)
+            mk = Ignis::JIT::Kernels::Elementwise.mul_backward
+            gn = grad.numel
+            g = [(gn + 255) / 256]
+            mk.launch(grid: g, block: [256], args: [grad, saved_other, grad_a, gn])
+            mk.launch(grid: g, block: [256], args: [grad, saved_self, grad_b, gn])
+            [grad_a, grad_b]
+          end
+        end
+        result
+      end
+      # ReLU activation
+      # @return [Tensor]
+      def relu
+        result_nv = alloc_like(@data)
+        kernel = Ignis::JIT::Kernels::Activations.relu_forward(numel)
+        n = numel
+        kernel.launch(grid: [(n + 255) / 256], block: [256], args: [@data, result_nv, n])
+        result = Tensor.new(data: result_nv, requires_grad: @requires_grad, is_leaf: false)
+        if @requires_grad
+          saved_input = @data
+          Tape.record(result, inputs: [self]) do |grad|
+            grad_in = alloc_like(grad)
+            bk = Ignis::JIT::Kernels::Activations.relu_backward
+            gn = grad.numel
+            bk.launch(grid: [(gn + 255) / 256], block: [256], args: [grad, saved_input, grad_in, gn])
+            [grad_in]
+          end
+        end
+        result
+      end
+      # GELU activation (tanh approximation)
+      # @return [Tensor]
+      def gelu
+        result_nv = alloc_like(@data)
+        kernel = Ignis::JIT::Kernels::Activations.gelu_forward
+        n = numel
+        kernel.launch(grid: [(n + 255) / 256], block: [256], args: [@data, result_nv, n])
+        result = Tensor.new(data: result_nv, requires_grad: @requires_grad, is_leaf: false)
+        if @requires_grad
+          saved_input = @data
+          Tape.record(result, inputs: [self]) do |grad|
+            grad_in = alloc_like(grad)
+            bk = Ignis::JIT::Kernels::Activations.gelu_backward
+            gn = grad.numel
+            bk.launch(grid: [(gn + 255) / 256], block: [256], args: [grad, saved_input, grad_in, gn])
+            [grad_in]
+          end
+        end
+        result
+      end
+      # SiLU activation: x * sigmoid(x)
+      # @return [Tensor]
+      def silu
+        result_nv = alloc_like(@data)
+        kernel = Ignis::JIT::Kernels::Activations.silu_forward
+        n = numel
+        kernel.launch(grid: [(n + 255) / 256], block: [256], args: [@data, result_nv, n])
+        result = Tensor.new(data: result_nv, requires_grad: @requires_grad, is_leaf: false)
+        if @requires_grad
+          saved_input = @data
+          Tape.record(result, inputs: [self]) do |grad|
+            grad_in = alloc_like(grad)
+            bk = Ignis::JIT::Kernels::Activations.silu_backward
+            gn = grad.numel
+            bk.launch(grid: [(gn + 255) / 256], block: [256], args: [grad, saved_input, grad_in, gn])
+            [grad_in]
+          end
+        end
+        result
+      end
+      # Softmax along last dimension
+      # @return [Tensor]
+      def softmax
+        last_dim = shape[-1]
+        outer_size = numel / last_dim
+        result_nv = alloc_like(@data)
+        kernel = Ignis::JIT::Kernels::Attention.softmax_forward
+        kernel.launch(grid: [(outer_size + 255) / 256], block: [256],
+                      args: [@data, result_nv, outer_size, last_dim])
+        result = Tensor.new(data: result_nv, requires_grad: @requires_grad, is_leaf: false)
+        if @requires_grad
+          saved_output = result_nv
+          Tape.record(result, inputs: [self]) do |grad|
+            grad_in = alloc_like(grad)
+            bk = Ignis::JIT::Kernels::Attention.softmax_backward
+            bk.launch(grid: [(outer_size + 255) / 256], block: [256],
+                      args: [grad, saved_output, grad_in, outer_size, last_dim])
+            [grad_in]
+          end
+        end
+        result
+      end
+      # Layer normalization
+      # @param weight [Tensor] gamma parameter
+      # @param bias [Tensor] beta parameter
+      # @param eps [Float] epsilon for numerical stability
+      # @return [Tensor]
+      def layer_norm(weight, bias, eps: 1e-5)
+        norm_size = shape[-1]
+        outer_size = numel / norm_size
+        result_nv = alloc_like(@data)
+        # Allocate mean and rstd storage for backward pass
+        mean_nv = Ignis::Shared::NvArray.new(shape: [outer_size], dtype: dtype, device_id: device_id)
+        mean_nv.from_host(Array.new(outer_size, 0.0))
+        rstd_nv = Ignis::Shared::NvArray.new(shape: [outer_size], dtype: dtype, device_id: device_id)
+        rstd_nv.from_host(Array.new(outer_size, 0.0))
+        kernel = Ignis::JIT::Kernels::Normalization.layer_norm_forward
+        kernel.launch(grid: [(outer_size + 255) / 256], block: [256],
+                      args: [@data, weight.data, bias.data, result_nv, mean_nv, rstd_nv,
+                             outer_size, norm_size, eps])
+        result = Tensor.new(data: result_nv,
+                            requires_grad: @requires_grad || weight.requires_grad || bias.requires_grad,
+                            is_leaf: false)
+        if result.requires_grad
+          saved_input = @data
+          saved_gamma = weight.data
+          Tape.record(result, inputs: [self, weight, bias]) do |grad|
+            grad_input = alloc_like(grad)
+            grad_gamma = Ignis::Shared::NvArray.new(shape: [norm_size], dtype: dtype, device_id: device_id)
+            grad_gamma.from_host(Array.new(norm_size, 0.0))
+            grad_beta = Ignis::Shared::NvArray.new(shape: [norm_size], dtype: dtype, device_id: device_id)
+            grad_beta.from_host(Array.new(norm_size, 0.0))
+            bk = Ignis::JIT::Kernels::Normalization.layer_norm_backward
+            bk.launch(grid: [(outer_size + 255) / 256], block: [256],
+                      args: [grad, saved_input, saved_gamma, mean_nv, rstd_nv,
+                             grad_input, grad_gamma, grad_beta, outer_size, norm_size])
+            [grad_input, grad_gamma, grad_beta]
+          end
+        end
+        result
+      end
+      # RMSNorm: y = gamma * x / sqrt(mean(x^2) + eps)  (Llama/Qwen/Mistral style).
+      # No mean-subtraction and no bias (vs LayerNorm). Normalizes the last dim.
+      # @param weight [Tensor] gamma scale [norm_size]
+      # @param eps [Float]
+      # @return [Tensor]
+      def rms_norm(weight, eps: 1e-5)
+        norm_size = shape[-1]
+        outer_size = numel / norm_size
+        result_nv = alloc_like(@data)
+        # rstd per row, saved for backward
+        rstd_nv = Ignis::Shared::NvArray.new(shape: [outer_size], dtype: dtype, device_id: device_id)
+        rstd_nv.zero!
+        fwd = Ignis::JIT::Kernels::Normalization.rms_norm_forward
+        fwd.launch(grid: [(outer_size + 255) / 256], block: [256],
+                   args: [@data, weight.data, result_nv, rstd_nv, outer_size, norm_size, eps.to_f])
+        result = Tensor.new(data: result_nv,
+                            requires_grad: @requires_grad || weight.requires_grad,
+                            is_leaf: false)
+        if result.requires_grad
+          saved_input = @data
+          saved_gamma = weight.data
+          Tape.record(result, inputs: [self, weight]) do |grad|
+            grad_input = alloc_like(grad)
+            grad_gamma = zeros_nv([norm_size])
+            bk = Ignis::JIT::Kernels::Normalization.rms_norm_backward
+            bk.launch(grid: [(outer_size + 255) / 256], block: [256],
+                      args: [grad, saved_input, saved_gamma, rstd_nv,
+                             grad_input, grad_gamma, outer_size, norm_size])
+            [grad_input, grad_gamma]
+          end
+        end
+        result
+      end
+      # Rotary Position Embedding (RoPE), HF/Llama/Qwen "rotate_half" convention.
+      # self is [seq, num_heads*head_dim]; rotates each head's dims by its absolute
+      # position. No learned parameters — the backward is the same rotation with the
+      # sin sign flipped (orthogonal rotation ⇒ R^T = R(-θ)). Applied to Q and K.
+      # @param num_heads [Integer]
+      # @param base [Float] rotary base θ, used only when +inv_freq+ is nil
+      #   (Llama/Qwen use 10000; long-context models larger)
+      # @param pos_offset [Integer] absolute position of row 0 (for KV-cache decode)
+      # @param inv_freq [Ignis::Shared::NvArray, Array<Float>, nil] precomputed [head_dim/2]
+      #   inverse frequencies. nil ⇒ standard base^(-2i/head_dim). Pass a remapped table
+      #   for RoPE scaling (llama3/NTK/YaRN).
+      # @return [Tensor]
+      def rope(num_heads:, base: 10000.0, pos_offset: 0, inv_freq: nil)
+        seq, embed = shape
+        head_dim = embed / num_heads
+        # rotate_half RoPE pairs dim i with i+head_dim/2, so it is only well-defined
+        # for EVEN head_dim. With an odd head_dim the pairing collides (one dim is
+        # used twice, another never), giving a non-orthogonal map whose forward AND
+        # gradient are silently wrong. No real architecture uses odd head_dim — fail
+        # loud rather than miscompute.
+        raise ArgumentError,
+              "RoPE requires an even head_dim (got #{head_dim} = #{embed}/#{num_heads}); " \
+              "rotate_half is only defined for paired dimensions" unless head_dim.even?
+        half = head_dim / 2
+        invf_nv = case inv_freq
+                  when Ignis::Shared::NvArray then inv_freq
+                  when Array then nv_from_floats(inv_freq)
+                  else nv_from_floats((0...half).map { |i| base.to_f**(-2.0 * i / head_dim) })
+                  end
+        out_nv = alloc_like(@data)
+        total = seq * embed
+        k = Ignis::JIT::Kernels::Attention.rope_apply
+        k.launch(grid: [(total + 255) / 256], block: [256],
+                 args: [@data, out_nv, seq, num_heads, head_dim, pos_offset, invf_nv, 1.0])
+        result = Tensor.new(data: out_nv, requires_grad: @requires_grad, is_leaf: false)
+        if result.requires_grad
+          Tape.record(result, inputs: [self]) do |grad|
+            gin = alloc_like(grad)
+            # backward = forward rotation with negated sin (transpose of an orthogonal rotation)
+            k.launch(grid: [(total + 255) / 256], block: [256],
+                     args: [grad, gin, seq, num_heads, head_dim, pos_offset, invf_nv, -1.0])
+            [gin]
+          end
+        end
+        result
+      end
+      # Transpose two dimensions (for 2D tensors)
+      # @param dim0 [Integer]
+      # @param dim1 [Integer]
+      # @return [Tensor]
+      def transpose(dim0 = 0, dim1 = 1)
+        raise ArgumentError, "transpose requires 2D tensor" unless shape.length == 2
+        rows = shape[0]
+        cols = shape[1]
+        result_nv = Ignis::Shared::NvArray.new(shape: [cols, rows], dtype: dtype, device_id: device_id)
+        result_nv.to_device # transpose_2d writes every element — alloc only, no host zeroing
+        kernel = Ignis::JIT::Kernels::Elementwise.transpose_2d
+        grid_x = (cols + 31) / 32
+        grid_y = (rows + 31) / 32
+        kernel.launch(grid: [grid_x, grid_y], block: [32, 8], args: [@data, result_nv, rows, cols])
+        result = Tensor.new(data: result_nv, requires_grad: @requires_grad, is_leaf: false)
+        if @requires_grad
+          Tape.record(result, inputs: [self]) do |grad|
+            # Backward of transpose is transpose
+            grad_t = alloc_like(@data)
+            kernel_t = Ignis::JIT::Kernels::Elementwise.transpose_2d
+            kernel_t.launch(grid: [grid_y, grid_x], block: [32, 8], args: [grad, grad_t, cols, rows])
+            [grad_t]
+          end
+        end
+        result
+      end
+      # Reshape (zero-copy if contiguous)
+      # @param new_shape [Array<Integer>]
+      # @return [Tensor]
+      def reshape(new_shape)
+        new_numel = new_shape.reduce(1, :*)
+        raise ArgumentError, "Cannot reshape #{shape} to #{new_shape}" unless new_numel == numel
+        # View over @data's buffer: non-owning, retains parent so it isn't freed
+        # while the view is alive (and never double-frees the shared allocation).
+        result_nv = Ignis::Shared::NvArray.new(shape: new_shape, dtype: dtype, device_id: device_id,
+                                             ptr: @data.ptr, parent: @data)
+        result = Tensor.new(data: result_nv, requires_grad: @requires_grad, is_leaf: false)
+        if @requires_grad
+          original_shape = shape
+          Tape.record(result, inputs: [self]) do |grad|
+            # Backward: reshape grad back to original shape (view over grad)
+            grad_reshaped = Ignis::Shared::NvArray.new(shape: original_shape, dtype: dtype,
+                                                       device_id: device_id, ptr: grad.ptr, parent: grad)
+            [grad_reshaped]
+          end
+        end
+        result
+      end
+      # Sum reduction (all elements → scalar)
+      # @return [Tensor]
+      def sum
+        n = numel
+        result_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: dtype, device_id: device_id)
+        result_nv.from_host([0.0])
+        kernel = Ignis::JIT::Kernels::Elementwise.sum_reduce
+        kernel.launch(grid: [1], block: [1], args: [@data, result_nv, 1, n])
+        result = Tensor.new(data: result_nv, requires_grad: @requires_grad, is_leaf: false)
+        if @requires_grad
+          orig_shape = shape
+          Tape.record(result, inputs: [self]) do |grad|
+            # Gradient of sum is broadcast of 1.0 to original shape
+            grad_input = Ignis::Shared::NvArray.new(shape: orig_shape, dtype: dtype, device_id: device_id)
+            grad_input.from_host(Array.new(n, 0.0))
+            bk = Ignis::JIT::Kernels::Elementwise.broadcast_grad
+            bk.launch(grid: [(n + 255) / 256], block: [256], args: [grad, grad_input, 1.0, n])
+            [grad_input]
+          end
+        end
+        result
+      end
+      # Mean reduction (all elements → scalar)
+      # @return [Tensor]
+      def mean
+        n = numel
+        sum_result = self.sum
+        # Scale by 1/n
+        result_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: dtype, device_id: device_id)
+        result_nv.from_host([0.0])
+        kernel = Ignis::JIT::Kernels::Elementwise.scale_forward
+        kernel.launch(grid: [1], block: [1], args: [sum_result.data, result_nv, 1.0 / n, 1])
+        result = Tensor.new(data: result_nv, requires_grad: @requires_grad, is_leaf: false)
+        if @requires_grad
+          orig_shape = shape
+          Tape.record(result, inputs: [self]) do |grad|
+            grad_input = Ignis::Shared::NvArray.new(shape: orig_shape, dtype: dtype, device_id: device_id)
+            grad_input.from_host(Array.new(n, 0.0))
+            bk = Ignis::JIT::Kernels::Elementwise.broadcast_grad
+            bk.launch(grid: [(n + 255) / 256], block: [256], args: [grad, grad_input, 1.0 / n, n])
+            [grad_input]
+          end
+        end
+        result
+      end
+      # -------------------------------------------------------------------
+      # Autograd
+      # -------------------------------------------------------------------
+      # Trigger reverse-mode automatic differentiation from this tensor.
+      # @param grad_output [Ignis::Shared::NvArray, nil] initial gradient
+      # @return [void]
+      def backward!(grad_output = nil)
+        if grad_output.nil? && numel == 1
+          # Scalar loss: start with 1.0
+          grad_output = Ignis::Shared::NvArray.new(shape: [1], dtype: dtype, device_id: device_id)
+          grad_output.from_host([1.0])
+        end
+        raise ArgumentError, "backward! requires grad_output for non-scalar tensors" if grad_output.nil?
+        Tape.backward!(self, grad_output)
+      end
+      # Zero out gradients (sets to zeros, not nil — avoids alloc in training loop)
+      # @return [void]
+      def zero_grad!
+        if @grad
+          n = @grad.numel
+          fill_k = Ignis::JIT::Kernels::Elementwise.fill
+          fill_k.launch(grid: [(n + 255) / 256], block: [256], args: [@grad, 0.0, n])
+        else
+          @grad = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
+          @grad.from_host(Array.new(numel, 0.0))
+        end
+      end
+      # Create a detached tensor that shares this tensor's GPU data (no copy, no grad tracking)
+      # @return [Tensor]
+      def detach
+        Tensor.new(data: @data, requires_grad: false, is_leaf: true)
+      end
+      # Copy GPU data to host as Ruby Array.
+      # @return [Array<Numeric>]
+      def to_host
+        @data.to_host
+      end
+      # Get scalar value (for single-element tensors).
+      # @return [Float, Integer]
+      def item
+        raise "item() requires a single-element tensor, got shape #{shape}" unless numel == 1
+        to_host[0]
+      end
+      # -------------------------------------------------------------------
+      # Internal helpers
+      # -------------------------------------------------------------------
+      private
+      # Scalar multiplication
+      # @param scalar [Numeric]
+      # @return [Tensor]
+      def scalar_mul(scalar)
+        result_nv = alloc_like(@data)
+        kernel = Ignis::JIT::Kernels::Elementwise.scale_forward
+        n = numel
+        kernel.launch(grid: [(n + 255) / 256], block: [256], args: [@data, result_nv, scalar.to_f, n])
+        result = Tensor.new(data: result_nv, requires_grad: @requires_grad, is_leaf: false)
+        if @requires_grad
+          Tape.record(result, inputs: [self]) do |grad|
+            grad_scaled = alloc_like(grad)
+            sk = Ignis::JIT::Kernels::Elementwise.scale_forward
+            gn = grad.numel
+            sk.launch(grid: [(gn + 255) / 256], block: [256], args: [grad, grad_scaled, scalar.to_f, gn])
+            [grad_scaled]
+          end
+        end
+        result
+      end
+      # Check if gradient tracking should be enabled for a binary op.
+      # @param other [Tensor]
+      # @return [Boolean]
+      def should_track?(other)
+        return false if Tape.no_grad_active?
+        @requires_grad || (other.is_a?(Tensor) && other.requires_grad)
+      end
+      # Ensure argument is a Tensor.
+      # @param other [Tensor, Numeric]
+      # @return [Tensor]
+      def ensure_tensor(other)
+        return other if other.is_a?(Tensor)
+        nv = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
+        nv.from_host(Array.new(numel, other.to_f))
+        Tensor.new(data: nv, requires_grad: false)
+      end
+      # Allocate a new NvArray with same shape/dtype/device as source.
+      # @param source [Ignis::Shared::NvArray]
+      # @return [Ignis::Shared::NvArray]
+      def alloc_like(source)
+        nv = Ignis::Shared::NvArray.new(shape: source.shape, dtype: source.dtype, device_id: source.device_id)
+        nv.to_device # cudaMalloc
+        nv.zero!     # device memset (was a host Array.new + H2D copy: ~0.5ms/op)
+        nv
+      end
+      # Build a small fp32 NvArray from a Ruby Float array (e.g. a RoPE inv_freq table).
+      # @param floats [Array<Float>]
+      # @return [Ignis::Shared::NvArray]
+      def nv_from_floats(floats)
+        nv = Ignis::Shared::NvArray.new(shape: [floats.length], dtype: :float32, device_id: device_id)
+        nv.from_host(floats.map(&:to_f))
+        nv
+      end
+      # Allocate a zeroed NvArray of the given shape (this tensor's dtype/device).
+      # @param shp [Array<Integer>]
+      # @return [Ignis::Shared::NvArray]
+      def zeros_nv(shp)
+        nv = Ignis::Shared::NvArray.new(shape: shp, dtype: dtype, device_id: device_id)
+        nv.to_device # cudaMalloc
+        nv.zero!     # device memset (fast)
+        nv
+      end
+      # Copy columns [col_off, col_off+len) of every row into a fresh [rows, len] array.
+      # @return [Ignis::Shared::NvArray]
+      def slice_cols_nv(src, col_off, len, rows, total_cols)
+        out = Ignis::Shared::NvArray.new(shape: [rows, len], dtype: src.dtype, device_id: src.device_id)
+        out.to_device
+        kernel = Ignis::JIT::Kernels::Elementwise.slice_cols
+        total = rows * len
+        kernel.launch(grid: [(total + 255) / 256], block: [256],
+                      args: [src, out, rows, total_cols, col_off, len])
+        out
+      end
+      # Write a [rows, len] array into columns [col_off, col_off+len) of dst [rows, total_cols].
+      # @return [Ignis::Shared::NvArray] dst
+      def scatter_cols_nv!(src, dst, col_off, len, rows, total_cols)
+        kernel = Ignis::JIT::Kernels::Elementwise.scatter_cols
+        total = rows * len
+        kernel.launch(grid: [(total + 255) / 256], block: [256],
+                      args: [src, dst, rows, total_cols, col_off, len])
+        dst
+      end
+      # Accumulating scatter: dst[:, col_off...] += src. For GQA backward, where
+      # several query heads share one KV head and their dK/dV must SUM.
+      # @return [Ignis::Shared::NvArray] dst
+      def scatter_cols_add_nv!(src, dst, col_off, len, rows, total_cols)
+        kernel = Ignis::JIT::Kernels::Elementwise.scatter_cols_add
+        total = rows * len
+        kernel.launch(grid: [(total + 255) / 256], block: [256],
+                      args: [src, dst, rows, total_cols, col_off, len])
+        dst
+      end
+    end
+  end
+end