RubyGems - toy - Versions diffs - 0.8.0 → 0.9.0 - Mend

toy 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +31 -0
data/Makefile +211 -5
data/README.md +1 -1
data/lib/toy/compute.rb +9 -0
data/lib/toy/compute_cuda.rb +8 -0
data/lib/toy/compute_metal.rb +17 -0
data/lib/toy/core/cli/new.rb +8 -0
data/lib/toy/ffi/tinynn.rb +19 -0
data/lib/toy/ffi/tinynn_cuda.rb +7 -0
data/lib/toy/ffi/tinynn_metal.rb +5 -0
data/lib/toy/llm/archs/layer_spec.rb +39 -0
data/lib/toy/llm/archs/llama_arch.rb +62 -1
data/lib/toy/llm/archs/llama_arch_cuda.rb +62 -1
data/lib/toy/llm/archs/llama_arch_metal.rb +62 -1
data/lib/toy/llm/blocks/gdn_block.rb +176 -0
data/lib/toy/llm/engine/gpt2_kv_engine.rb +11 -0
data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +11 -0
data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +11 -0
data/lib/toy/llm/engine/llama_kv_engine.rb +10 -2
data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +10 -2
data/lib/toy/llm/engine/llama_kv_engine_metal.rb +10 -2
data/lib/toy/llm/engine/llama_seq_engine.rb +16 -1
data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +16 -1
data/lib/toy/llm/engine/llama_seq_engine_metal.rb +16 -1
data/lib/toy/llm/primitives/depth_scale.rb +33 -0
data/lib/toy/llm/primitives/diff_attention.rb +71 -0
data/lib/toy/llm/primitives/gdn.rb +188 -0
data/lib/toy/llm/primitives/scalable_softmax.rb +37 -0
data/lib/toy/run/eval_metal.rb +12 -0
data/lib/toy/run/infer_metal.rb +19 -0
data/lib/toy/run/train_gpt2_metal.rb +7 -0
data/lib/toy/run/train_hybrid.rb +232 -0
data/lib/toy/run/train_metal.rb +10 -0
data/lib/toy/version.rb +4 -3
data/tinynn/tinynn_backend_cuda.c +22 -0
data/tinynn/tinynn_ggml.c +231 -0
metadata +9 -2

data/lib/toy/llm/archs/llama_arch_cuda.rb CHANGED Viewed

@@ -71,6 +71,21 @@ module Toy; module LLM; module Archs
   class LlamaArch
     attr_accessor :t_seq_token_embed, :t_seq_final_norm_gamma, :t_seq_output,
                   :t_seq_w_proj, :seq_blocks_ffi,
+                  # Phase 3 — per-layer descriptor array, parallel to
+                  # seq_blocks_ffi (same length == n_layers).
+                  :seq_layer_specs,
+                  # Phase 5 — the dispatch key is a plain INT array (one kind per
+                  # layer), NOT LayerSpec.kind reads: constructing/mutating
+                  # LayerSpec objects on a realize path trips a Spinel codegen
+                  # miscompile (corrupts the token-id finalize). Mutating a plain
+                  # int array element is proven-safe. build_forward dispatches on
+                  # this; LayerSpec stays the descriptor type/constants home.
+                  :seq_layer_kinds,
+                  # Phase 5 — parallel GDN-block array (same length; EVERY entry
+                  # is a GDNBlock — a never-invoked placeholder at non-KIND_GDN
+                  # positions, so the array has no null slots). The KIND_GDN
+                  # dispatch arm calls into THIS array — a concrete typed call,
+                  # so the seam stays monomorphic per call site.
+                  :seq_gdn_blocks_ffi,
                   # Orchestration-gating carriers — bare cache ivars with
                   # no accessor before P2.5. The lens-branch guard reads
                   # seq_donor_d_in; the shared ctx reads seq_rope_cfg.
@@ -85,6 +100,15 @@ module Toy; module LLM; module Archs
       @t_seq_w_proj           = TinyNNCuda.tnn_null_ptr
       # Seed with one block — matches the former cache init (L112).
       @seq_blocks_ffi         = [Toy::LLM::Blocks::TransformerBlock.new]
+      # Phase 3 — parallel seed: one attention spec for the seed block.
+      @seq_layer_specs        = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
+      # Phase 5 — parallel int dispatch keys (KIND_ATTENTION for the seed).
+      @seq_layer_kinds        = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
+      # Phase 5 — parallel GDN-block slots. Seeded with GDNBlock placeholders so
+      # the array is MONOMORPHIC (all GDNBlock) — the seam's KIND_GDN call site
+      # never sees a mixed null/object array (Spinel poly-array landmine). At
+      # KIND_ATTENTION layers the placeholder is simply never invoked.
+      @seq_gdn_blocks_ffi     = [Toy::LLM::Blocks::GDNBlock.new]
       @seq_donor_d_in         = 0
       # The cache overwrites seq_rope_cfg with the real RoPE::Cfg before
       # build_forward runs (each realize prologue rebuilds it).
@@ -101,11 +125,33 @@ module Toy; module LLM; module Archs
     # already constructs TransformerBlock.new there, so no new class /
     # Struct / FFI :str at class load. Each realize path now calls this
     # via the cache's seq_blocks_ffi delegator chain (self.seq_arch).
+    # Phase 5 hybrid — a layer is switched to GDN by writing the plain-int
+    # @seq_layer_kinds dispatch array (set_gdn_layer! below), never through the
+    # LayerSpec .kind= setter: mutating LayerSpec.kind elsewhere while
+    # build_forward reads it trips a Spinel codegen miscompile that corrupts
+    # the token-id finalize. @seq_layer_specs is left untouched by this call.
+    # Call after seed_blocks! (which resets every kind to KIND_ATTENTION),
+    # before alloc.
+    # Mark ONE layer as GDN. Takes an INT index (never an array param — a
+    # function-parameter array trips the Spinel #688 type-lock landmine, which
+    # here manifests as a token-id-finalize codegen miscompile). Mutates the
+    # plain int dispatch array element (proven-safe).
+    def set_gdn_layer!(idx)
+      @seq_layer_kinds[idx] = Toy::LLM::Archs::LayerSpec::KIND_GDN
+    end
     def seed_blocks!(n_layers)
       @seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
+      # Phase 3 — seed the parallel spec array in lockstep. Every layer starts
+      # as KIND_ATTENTION (the homogeneous-Llama refactor gate); Phase 5 marks
+      # GDN layers by overwriting entries of the parallel @seq_layer_kinds int
+      # array via set_gdn_layer! (Dragon's pattern), not entries of this array.
+      @seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
+      @seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
+      @seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
       li_init = 1
       while li_init < n_layers
         @seq_blocks_ffi.push(Toy::LLM::Blocks::TransformerBlock.new)
+        @seq_layer_specs.push(Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION))
+        @seq_gdn_blocks_ffi.push(Toy::LLM::Blocks::GDNBlock.new)
+        @seq_layer_kinds.push(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)
         li_init = li_init + 1
       end
     end
@@ -217,7 +263,22 @@ module Toy; module LLM; module Archs
       end
       li_g = 0
       while li_g < seq_n_layers
-        t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
+        # Phase 3/5 — per-layer dispatch. The branch compares a FLAT INT
+        # (seq_layer_kinds[li_g]) and each arm calls a CONCRETE typed block
+        # method, so every .build_forward call site stays monomorphic (one
+        # receiver class). KIND_ATTENTION calls into seq_blocks_ffi; KIND_GDN
+        # calls into its own typed block array, seq_gdn_blocks_ffi. Unknown kinds
+        # fail loud rather than silently building the wrong graph (never-mask rule).
+        spec_kind = self.seq_layer_kinds[li_g]
+        if spec_kind == Toy::LLM::Archs::LayerSpec::KIND_ATTENTION
+          t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
+        elsif spec_kind == Toy::LLM::Archs::LayerSpec::KIND_GDN
+          # Concrete typed call into the parallel GDN array — the GDN block reads
+          # its own dims (set at alloc); seq_t/eps come from the shared ctx.
+          t_cur = self.seq_gdn_blocks_ffi[li_g].build_forward(sess, t_cur, seq_t, eps)
+        else
+          raise "LlamaArch#build_forward: unsupported layer kind #{spec_kind} at layer #{li_g}"
+        end
         li_g = li_g + 1
       end

data/lib/toy/llm/archs/llama_arch_metal.rb CHANGED Viewed

@@ -71,6 +71,21 @@ module Toy; module LLM; module Archs
   class LlamaArch
     attr_accessor :t_seq_token_embed, :t_seq_final_norm_gamma, :t_seq_output,
                   :t_seq_w_proj, :seq_blocks_ffi,
+                  # Phase 3 — per-layer descriptor array, parallel to
+                  # seq_blocks_ffi (same length == n_layers).
+                  :seq_layer_specs,
+                  # Phase 5 — the dispatch key is a plain INT array (one kind per
+                  # layer), NOT LayerSpec.kind reads: constructing/mutating
+                  # LayerSpec objects on a realize path trips a Spinel codegen
+                  # miscompile (corrupts the token-id finalize). Mutating a plain
+                  # int array element is proven-safe. build_forward dispatches on
+                  # this; LayerSpec stays the descriptor type/constants home.
+                  :seq_layer_kinds,
+                  # Phase 5 — parallel GDN-block array (same length; EVERY entry
+                  # is a GDNBlock — a never-invoked placeholder at non-KIND_GDN
+                  # positions, so the array has no null slots). The KIND_GDN
+                  # dispatch arm calls into THIS array — a concrete typed call,
+                  # so the seam stays monomorphic per call site.
+                  :seq_gdn_blocks_ffi,
                   # Orchestration-gating carriers — bare cache ivars with
                   # no accessor before P2.5. The lens-branch guard reads
                   # seq_donor_d_in; the shared ctx reads seq_rope_cfg.
@@ -85,6 +100,15 @@ module Toy; module LLM; module Archs
       @t_seq_w_proj           = TinyNNMetal.tnn_null_ptr
       # Seed with one block — matches the former cache init (L112).
       @seq_blocks_ffi         = [Toy::LLM::Blocks::TransformerBlock.new]
+      # Phase 3 — parallel seed: one attention spec for the seed block.
+      @seq_layer_specs        = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
+      # Phase 5 — parallel int dispatch keys (KIND_ATTENTION for the seed).
+      @seq_layer_kinds        = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
+      # Phase 5 — parallel GDN-block slots. Seeded with GDNBlock placeholders so
+      # the array is MONOMORPHIC (all GDNBlock) — the seam's KIND_GDN call site
+      # never sees a mixed null/object array (Spinel poly-array landmine). At
+      # KIND_ATTENTION layers the placeholder is simply never invoked.
+      @seq_gdn_blocks_ffi     = [Toy::LLM::Blocks::GDNBlock.new]
       @seq_donor_d_in         = 0
       # The cache overwrites seq_rope_cfg with the real RoPE::Cfg before
       # build_forward runs (each realize prologue rebuilds it).
@@ -101,11 +125,33 @@ module Toy; module LLM; module Archs
     # already constructs TransformerBlock.new there, so no new class /
     # Struct / FFI :str at class load. Each realize path now calls this
     # via the cache's seq_blocks_ffi delegator chain (self.seq_arch).
+    # Phase 5 hybrid — a layer is switched to GDN by writing the plain-int
+    # @seq_layer_kinds dispatch array (set_gdn_layer! below), never through the
+    # LayerSpec .kind= setter: mutating LayerSpec.kind elsewhere while
+    # build_forward reads it trips a Spinel codegen miscompile that corrupts
+    # the token-id finalize. @seq_layer_specs is left untouched by this call.
+    # Call after seed_blocks! (which resets every kind to KIND_ATTENTION),
+    # before alloc.
+    # Mark ONE layer as GDN. Takes an INT index (never an array param — a
+    # function-parameter array trips the Spinel #688 type-lock landmine, which
+    # here manifests as a token-id-finalize codegen miscompile). Mutates the
+    # plain int dispatch array element (proven-safe).
+    def set_gdn_layer!(idx)
+      @seq_layer_kinds[idx] = Toy::LLM::Archs::LayerSpec::KIND_GDN
+    end
     def seed_blocks!(n_layers)
       @seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
+      # Phase 3 — seed the parallel spec array in lockstep. Every layer starts
+      # as KIND_ATTENTION (the homogeneous-Llama refactor gate); Phase 5 marks
+      # GDN layers by overwriting entries of the parallel @seq_layer_kinds int
+      # array via set_gdn_layer! (Dragon's pattern), not entries of this array.
+      @seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
+      @seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
+      @seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
       li_init = 1
       while li_init < n_layers
         @seq_blocks_ffi.push(Toy::LLM::Blocks::TransformerBlock.new)
+        @seq_layer_specs.push(Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION))
+        @seq_gdn_blocks_ffi.push(Toy::LLM::Blocks::GDNBlock.new)
+        @seq_layer_kinds.push(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)
         li_init = li_init + 1
       end
     end
@@ -217,7 +263,22 @@ module Toy; module LLM; module Archs
       end
       li_g = 0
       while li_g < seq_n_layers
-        t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
+        # Phase 3/5 — per-layer dispatch. The branch compares a FLAT INT
+        # (seq_layer_kinds[li_g]) and each arm calls a CONCRETE typed block
+        # method, so every .build_forward call site stays monomorphic (one
+        # receiver class). KIND_ATTENTION calls into seq_blocks_ffi; KIND_GDN
+        # calls into its own typed block array, seq_gdn_blocks_ffi. Unknown kinds
+        # fail loud rather than silently building the wrong graph (never-mask rule).
+        spec_kind = self.seq_layer_kinds[li_g]
+        if spec_kind == Toy::LLM::Archs::LayerSpec::KIND_ATTENTION
+          t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
+        elsif spec_kind == Toy::LLM::Archs::LayerSpec::KIND_GDN
+          # Concrete typed call into the parallel GDN array — the GDN block reads
+          # its own dims (set at alloc); seq_t/eps come from the shared ctx.
+          t_cur = self.seq_gdn_blocks_ffi[li_g].build_forward(sess, t_cur, seq_t, eps)
+        else
+          raise "LlamaArch#build_forward: unsupported layer kind #{spec_kind} at layer #{li_g}"
+        end
         li_g = li_g + 1
       end

data/lib/toy/llm/blocks/gdn_block.rb ADDED Viewed

@@ -0,0 +1,176 @@
+# lib/toy/llm/blocks/gdn_block.rb — L2 block: a TRAINABLE Gated-DeltaNet layer
+# (Dragon / Qwen3-Next linear-attention mixer), the KIND_GDN counterpart of the
+# attention TransformerBlock. Composes the L1 GDN primitives around its own
+# projection weights and the autograd-differentiable recurrence
+# (Toy::LLM::Primitives::GDN.recur_unrolled, Phase 4 / Path B) — so the whole
+# layer trains with NO hand-written kernel backward.
+#
+# DEFERRED (Phase 5 minimal-trainable scope; revisit for Dragon bit-match): the
+# short causal conv on q/k/v (ggml_conv_1d is FFI-wired from Phase 1) and any
+# Dragon-exact stream layout. This block proves a GDN layer is a correct,
+# trainable residual unit; it is not yet a bit-faithful Dragon block.
+#
+# Shapes (single seq, B=1):
+#   x            [d_model, T]
+#   h = rmsnorm  [d_model, T]
+#   q/k/v/z      = W·h -> [S_v*H, T]   (W : [d_model, S_v*H])
+#   a/b          = W·h -> [H, T]       (W : [d_model, H]) ; per-head scalars
+#   per head h:  recur_unrolled(qn,kn,v,g,beta, state0_h) -> [S_v, T]
+#   o            concat heads -> [S_v*H, T]
+#   gated        = GatedRMSNorm(o, z) -> [S_v*H, T]
+#   out = W_o·gated -> [d_model, T] ; residual = x + out
+#
+# Spinel hygiene: hand-written positional class, NEVER Struct.new (landmine #16);
+# no Cfg ctor / default args (landmine #4); no Card/step_bind/FFI :str at class
+# load. This file does NOT require_relative "tinynn" (the loader picks the
+# backend before requiring this block, as for the L1 primitives + L2 attention
+# block).
+module Toy; module LLM; module Blocks
+  class GDNBlock
+    attr_accessor :t_rn_gamma,
+                  :t_w_q, :t_w_k, :t_w_v, :t_w_z, :t_w_a, :t_w_b,
+                  :t_a_log, :t_dt_bias, :t_go_gamma, :t_w_o,
+                  :t_state0,
+                  # Block dims (set at alloc) so build_forward needs only
+                  # (sess, t_x, seq_t, eps) — no per-call dims — and the seam's
+                  # KIND_GDN dispatch arm stays a single concrete typed call.
+                  :gdn_d_model, :gdn_s_v, :gdn_n_heads,
+                  # F3 full-finetune parallel arrays (weight, m, v) — same
+                  # convention as TransformerBlock so the engine's opt_step
+                  # walker reaches them by name.
+                  :ft_weights, :ft_m, :ft_v
+    def initialize
+      @gdn_d_model = 0; @gdn_s_v = 0; @gdn_n_heads = 0
+      @t_rn_gamma = TinyNN.tnn_null_ptr
+      @t_w_q = TinyNN.tnn_null_ptr; @t_w_k = TinyNN.tnn_null_ptr; @t_w_v = TinyNN.tnn_null_ptr
+      @t_w_z = TinyNN.tnn_null_ptr; @t_w_a = TinyNN.tnn_null_ptr; @t_w_b = TinyNN.tnn_null_ptr
+      @t_a_log = TinyNN.tnn_null_ptr; @t_dt_bias = TinyNN.tnn_null_ptr
+      @t_go_gamma = TinyNN.tnn_null_ptr; @t_w_o = TinyNN.tnn_null_ptr
+      @t_state0 = TinyNN.tnn_null_ptr
+      @ft_weights = [TinyNN.tnn_null_ptr]; @ft_weights.pop
+      @ft_m       = [TinyNN.tnn_null_ptr]; @ft_m.pop
+      @ft_v       = [TinyNN.tnn_null_ptr]; @ft_v.pop
+    end
+    # Allocate the block's trainable persistent F32 weights + their Adam moments
+    # (parallel ft_weights/ft_m/ft_v arrays, populated in lockstep so the engine
+    # / a train loop can opt_step generically). d_model is the residual width;
+    # n_heads × s_v = the GDN inner width. state0 is a [s_v, s_v*n_heads]
+    # constant carry (one [s_v,s_v] block per head), NOT a param; it is only
+    # allocated here — zero_state! zeroes it after finalize_weights. Each
+    # weight's m/v match its shape (opt_step_adamw asserts same-shape).
+    # Call once per block: every call appends another full set of triples to
+    # ft_weights/ft_m/ft_v.
+    def alloc_trainable_f32_weights!(sess, d_model, s_v, n_heads)
+      @gdn_d_model = d_model; @gdn_s_v = s_v; @gdn_n_heads = n_heads
+      inner = s_v * n_heads
+      # W : [d_model, out]  (matmul(W, h) contracts ne0=d_model -> [out, T]).
+      # input_2d_f32_persistent(rows, cols) -> ne0=cols, ne1=rows, so pass
+      # (out, d_model) to get ne0=d_model, ne1=out.
+      @t_rn_gamma = reg1(sess, d_model)
+      @t_w_q = reg2(sess, inner,   d_model)
+      @t_w_k = reg2(sess, inner,   d_model)
+      @t_w_v = reg2(sess, inner,   d_model)
+      @t_w_z = reg2(sess, inner,   d_model)
+      @t_w_a = reg2(sess, n_heads, d_model)
+      @t_w_b = reg2(sess, n_heads, d_model)
+      @t_a_log    = reg4(sess, 1, n_heads, 1, 1)
+      @t_dt_bias  = reg4(sess, 1, n_heads, 1, 1)
+      @t_go_gamma = reg1(sess, inner)
+      @t_w_o = reg2(sess, d_model, inner)
+      # Constant zero initial state (NOT registered as a trainable param).
+      @t_state0 = TinyNN.tnn_input_2d_f32_persistent(sess, s_v, s_v * n_heads)
+    end
+    # reg{1,2,4}: alloc a weight of the given rank + matching m/v, push the
+    # triple into ft_weights/ft_m/ft_v, return the weight handle.
+    def reg1(sess, n)
+      w = TinyNN.tnn_input_1d_f32_persistent(sess, n)
+      @ft_weights.push(w)
+      @ft_m.push(TinyNN.tnn_input_1d_f32_persistent(sess, n))
+      @ft_v.push(TinyNN.tnn_input_1d_f32_persistent(sess, n))
+      w
+    end
+    def reg2(sess, rows, cols)
+      w = TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols)
+      @ft_weights.push(w)
+      @ft_m.push(TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols))
+      @ft_v.push(TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols))
+      w
+    end
+    def reg4(sess, a, b, c, d)
+      w = TinyNN.tnn_input_4d_f32_persistent(sess, a, b, c, d)
+      @ft_weights.push(w)
+      @ft_m.push(TinyNN.tnn_input_4d_f32_persistent(sess, a, b, c, d))
+      @ft_v.push(TinyNN.tnn_input_4d_f32_persistent(sess, a, b, c, d))
+      w
+    end
+    # Mark every weight in ft_weights a trainable param — the two norm gammas
+    # and the projections as well as a_log + dt_bias (per-head decay shape).
+    # Call BEFORE finalize_weights (load-bearing order, gpt2_seq_engine.rb:128).
+    # state0 is NOT trained (it is not in ft_weights).
+    def set_params!
+      wi = 0
+      while wi < @ft_weights.length
+        TinyNN.tnn_set_param(@ft_weights[wi])
+        wi = wi + 1
+      end
+    end
+    # Zero the constant initial state (after finalize_weights).
+    def zero_state!(sess)
+      TinyNN.tnn_zero_tensor(sess, @t_state0)
+    end
+    # Forward: residual update for x [d_model, T] (B=1). Returns [d_model, T].
+    # Dims (d_model/s_v/n_heads) come from self (set at alloc) so this matches the
+    # seam's per-layer call shape; seq_t/eps arrive from the forward ctx.
+    def build_forward(sess, t_x, seq_t, eps)
+      d_model = @gdn_d_model
+      s_v     = @gdn_s_v
+      n_heads = @gdn_n_heads
+      fbytes  = 4
+      h = Toy::LLM::Primitives::RMSNorm.build(sess, t_x, @t_rn_gamma, eps)
+      q2 = TinyNN.tnn_matmul(sess, @t_w_q, h)   # [S_v*H, T]
+      k2 = TinyNN.tnn_matmul(sess, @t_w_k, h)
+      v2 = TinyNN.tnn_matmul(sess, @t_w_v, h)
+      z2 = TinyNN.tnn_matmul(sess, @t_w_z, h)   # [S_v*H, T] output gate
+      a2 = TinyNN.tnn_matmul(sess, @t_w_a, h)   # [H, T] decay stream
+      b2 = TinyNN.tnn_matmul(sess, @t_w_b, h)   # [H, T] update stream
+      # Reshape projections into the recurrence's packed [S_v, H, T] / [1, H, T].
+      q3 = TinyNN.tnn_reshape_3d(sess, q2, s_v, n_heads, seq_t)
+      k3 = TinyNN.tnn_reshape_3d(sess, k2, s_v, n_heads, seq_t)
+      v3 = TinyNN.tnn_reshape_3d(sess, v2, s_v, n_heads, seq_t)
+      a3 = TinyNN.tnn_reshape_3d(sess, a2, 1,   n_heads, seq_t)
+      b3 = TinyNN.tnn_reshape_3d(sess, b2, 1,   n_heads, seq_t)
+      qn = Toy::LLM::Primitives::GDN.l2_train(sess, q3, eps)
+      kn = Toy::LLM::Primitives::GDN.l2_train(sess, k3, eps)
+      g  = Toy::LLM::Primitives::GDN.decay_gate(sess, a3, @t_dt_bias, @t_a_log)
+      bt = Toy::LLM::Primitives::GDN.update_gate_train(sess, b3)
+      # Per-head recurrence; concat head outputs along ne0 -> [S_v*H, T].
+      o = TinyNN.tnn_null_ptr
+      hh = 0
+      while hh < n_heads
+        st_h = TinyNN.tnn_view_2d(sess, @t_state0, s_v, s_v,
+                                  s_v * fbytes, hh * s_v * s_v * fbytes)
+        o_h = Toy::LLM::Primitives::GDN.recur_unrolled(sess, qn, kn, v3, g, bt,
+                                                       st_h, s_v, n_heads, hh, seq_t)
+        if hh == 0
+          o = o_h
+        else
+          o = TinyNN.tnn_concat(sess, o, o_h, 0)
+        end
+        hh = hh + 1
+      end
+      gated = Toy::LLM::Primitives::GDN.gated_out(sess, o, z2, @t_go_gamma, eps)
+      out   = TinyNN.tnn_matmul(sess, @t_w_o, gated)   # [d_model, T]
+      TinyNN.tnn_add(sess, t_x, out)
+    end
+  end
+end; end; end

data/lib/toy/llm/engine/gpt2_kv_engine.rb CHANGED Viewed

@@ -153,6 +153,17 @@ class GPT2KVFFICache
   # Build the compute graph for one decode position. Returns the logits
   # tensor handle. Caller calls tnn_compute then download_row_major.
   def build_decode_step(pos)
+    # The per-head K/V buffers are sized for @max_T positions. Writing /
+    # reading slot `pos` requires pos < @max_T; at pos == @max_T the
+    # cpy-into-view and history views overrun the cache allocation and
+    # ggml aborts deep inside ggml_view_2d. Fail loud here with a
+    # toy-level message instead (see toy#99).
+    if pos >= @max_T
+      raise "GPT2KVFFICache: decode pos=" + pos.to_s +
+            " exceeds KV cache capacity max_T=" + @max_T.to_s +
+            " (size the cache >= prompt_len + n_generate via realize_for)"
+    end
     eps   = 1.0e-5
     scale = 1.0 / Math.sqrt(@d_head.to_f)
     d_model = @d_model

data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb CHANGED Viewed

@@ -155,6 +155,17 @@ class GPT2KVFFICacheCuda
   # Build the compute graph for one decode position. Returns the logits
   # tensor handle. Caller calls tnn_compute then download_row_major.
   def build_decode_step(pos)
+    # The per-head K/V buffers are sized for @max_T positions. Writing /
+    # reading slot `pos` requires pos < @max_T; at pos == @max_T the
+    # cpy-into-view and history views overrun the cache allocation and
+    # ggml aborts deep inside ggml_view_2d. Fail loud here with a
+    # toy-level message instead (see toy#99).
+    if pos >= @max_T
+      raise "GPT2KVFFICacheCuda: decode pos=" + pos.to_s +
+            " exceeds KV cache capacity max_T=" + @max_T.to_s +
+            " (size the cache >= prompt_len + n_generate via realize_for)"
+    end
     eps   = 1.0e-5
     scale = 1.0 / Math.sqrt(@d_head.to_f)
     d_model = @d_model

data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb CHANGED Viewed

@@ -155,6 +155,17 @@ class GPT2KVFFICacheMetal
   # Build the compute graph for one decode position. Returns the logits
   # tensor handle. Caller calls tnn_compute then download_row_major.
   def build_decode_step(pos)
+    # The per-head K/V buffers are sized for @max_T positions. Writing /
+    # reading slot `pos` requires pos < @max_T; at pos == @max_T the
+    # cpy-into-view and history views overrun the cache allocation and
+    # ggml aborts deep inside ggml_view_2d. Fail loud here with a
+    # toy-level message instead (see toy#99).
+    if pos >= @max_T
+      raise "GPT2KVFFICacheMetal: decode pos=" + pos.to_s +
+            " exceeds KV cache capacity max_T=" + @max_T.to_s +
+            " (size the cache >= prompt_len + n_generate via realize_for)"
+    end
     eps   = 1.0e-5
     scale = 1.0 / Math.sqrt(@d_head.to_f)
     d_model = @d_model

data/lib/toy/llm/engine/llama_kv_engine.rb CHANGED Viewed

@@ -874,7 +874,7 @@ class SmolLM2KVFFICache
     if is_native
       wtype = GGUFLoad.detect_weight_type(gguf_path)
       set_weight_type(wtype)
-      realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias)
+      realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias, flags.qk_norm)
       puts "  BYO-pointer mmap (weight_type=" + wtype.to_s + ")"
       gguf
     else
@@ -1584,7 +1584,15 @@ module SmolLM2KV
     TinyNN.tnn_reset_for_rebuild(kv_cache.sess)
     step = kv_cache.build_decode_step(pos)
     TinyNN.tnn_realize(kv_cache.sess, step.kv_step_logits)
-    TinyNN.upload_int_array(kv_cache.sess, step.t_token_id, [token_id])
+    # Spinel landmine: in whole-program inference contexts where `token_id`
+    # poly-collapses to sp_RbVal (e.g. the eval runner, where generate's
+    # sampler-fed `last_id` unifies decode_step's param to RbVal), the literal
+    # `[token_id]` compiles to a PolyArray. upload_int_array takes :int_array
+    # (sp_IntArray), so the PolyArray is then mis-read as an IntArray → garbage
+    # length → ggml "tensor write out of bounds" abort. Narrowing to a clean
+    # mrb_int via `.to_i` forces the IntArray codegen (as `[pos]` already gets).
+    tid = token_id.to_i
+    TinyNN.upload_int_array(kv_cache.sess, step.t_token_id, [tid])
     TinyNN.upload_int_array(kv_cache.sess, step.t_pos,      [pos])
     TinyNN.tnn_compute(kv_cache.sess)
     kv_cache.dump_trace

data/lib/toy/llm/engine/llama_kv_engine_cuda.rb CHANGED Viewed

@@ -862,7 +862,7 @@ class SmolLM2KVFFICacheCuda
     if is_native
       wtype = GGUFLoad.detect_weight_type(gguf_path)
       set_weight_type(wtype)
-      realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias)
+      realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias, flags.qk_norm)
       puts "  BYO-pointer mmap (weight_type=" + wtype.to_s + ")"
       gguf
     else
@@ -1517,7 +1517,15 @@ module SmolLM2KVCuda
     TinyNNCuda.tnn_reset_for_rebuild(kv_cache.sess)
     step = kv_cache.build_decode_step(pos)
     TinyNNCuda.tnn_realize(kv_cache.sess, step.kv_step_logits)
-    TinyNNCuda.upload_int_array(kv_cache.sess, step.t_token_id, [token_id])
+    # Spinel landmine: in whole-program inference contexts where `token_id`
+    # poly-collapses to sp_RbVal (e.g. the eval runner, where generate's
+    # sampler-fed `last_id` unifies decode_step's param to RbVal), the literal
+    # `[token_id]` compiles to a PolyArray. upload_int_array takes :int_array
+    # (sp_IntArray), so the PolyArray is then mis-read as an IntArray → garbage
+    # length → ggml "tensor write out of bounds" abort. Narrowing to a clean
+    # mrb_int via `.to_i` forces the IntArray codegen (as `[pos]` already gets).
+    tid = token_id.to_i
+    TinyNNCuda.upload_int_array(kv_cache.sess, step.t_token_id, [tid])
     TinyNNCuda.upload_int_array(kv_cache.sess, step.t_pos,      [pos])
     TinyNNCuda.tnn_compute(kv_cache.sess)
     kv_cache.dump_trace

data/lib/toy/llm/engine/llama_kv_engine_metal.rb CHANGED Viewed

@@ -862,7 +862,7 @@ class SmolLM2KVFFICacheMetal
     if is_native
       wtype = GGUFLoad.detect_weight_type(gguf_path)
       set_weight_type(wtype)
-      realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias)
+      realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias, flags.qk_norm)
       puts "  BYO-pointer mmap (weight_type=" + wtype.to_s + ")"
       gguf
     else
@@ -1517,7 +1517,15 @@ module SmolLM2KVMetal
     TinyNNMetal.tnn_reset_for_rebuild(kv_cache.sess)
     step = kv_cache.build_decode_step(pos)
     TinyNNMetal.tnn_realize(kv_cache.sess, step.kv_step_logits)
-    TinyNNMetal.upload_int_array(kv_cache.sess, step.t_token_id, [token_id])
+    # Spinel landmine: in whole-program inference contexts where `token_id`
+    # poly-collapses to sp_RbVal (e.g. the eval runner, where generate's
+    # sampler-fed `last_id` unifies decode_step's param to RbVal), the literal
+    # `[token_id]` compiles to a PolyArray. upload_int_array takes :int_array
+    # (sp_IntArray), so the PolyArray is then mis-read as an IntArray → garbage
+    # length → ggml "tensor write out of bounds" abort. Narrowing to a clean
+    # mrb_int via `.to_i` forces the IntArray codegen (as `[pos]` already gets).
+    tid = token_id.to_i
+    TinyNNMetal.upload_int_array(kv_cache.sess, step.t_token_id, [tid])
     TinyNNMetal.upload_int_array(kv_cache.sess, step.t_pos,      [pos])
     TinyNNMetal.tnn_compute(kv_cache.sess)
     kv_cache.dump_trace

data/lib/toy/llm/engine/llama_seq_engine.rb CHANGED Viewed

@@ -25,6 +25,9 @@ require_relative "../primitives/rope"
 require_relative "../primitives/swiglu"
 require_relative "../primitives/gqa"
 require_relative "../blocks/transformer_block"
+require_relative "../primitives/gdn"
+require_relative "../blocks/gdn_block"
+require_relative "../archs/layer_spec"
 require_relative "../archs/llama_arch"
 module Toy; module LLM; module Engine
@@ -1007,6 +1010,14 @@ class LlamaSeqEngine
       @t_seq_attn_mask = TinyNN.tnn_input_2d_f32_persistent(@sess, tb_alloc, tb_alloc)
     end
+    # #1449: pre-create the token-id index leaf in ctx_w (galloc-external) before
+    # finalize, so galloc can't free its compute-arena slot and reuse it for the
+    # loss output (-> backward get_rows reads loss bits -> wild index -> OOB
+    # abort, layout-flaky on the C backend). build_forward reuses this handle;
+    # it's re-uploaded each step. Positions stay in the compute ctx (the loss
+    # aliases off=0 = token_ids, not positions). See tnn_input_1d_i32_persistent.
+    @t_seq_token_ids = TinyNN.tnn_input_1d_i32_persistent(@sess, @seq_t * @seq_b)
     TinyNN.tnn_finalize_weights(@sess)
     # Upload llama3-style RoPE freq_factors once the backend buffer
@@ -1056,7 +1067,11 @@ class LlamaSeqEngine
     # that ordering); RoPE applies per-batch positional encoding
     # because rope_ext reads positions[k] for each ne[2] slot.
     tb = @seq_t * @seq_b
-    @t_seq_token_ids = TinyNN.tnn_input_1d_i32(@sess, tb)
+    # #1449: token_ids is pre-created persistent (ctx_w) in finalize and survives
+    # reset_for_rebuild — reuse it. Only allocate here as a fallback for any path
+    # that builds the forward without the finalize pre-creation (then it lands in
+    # the compute ctx, the legacy behaviour).
+    @t_seq_token_ids = TinyNN.tnn_input_1d_i32(@sess, tb) if @t_seq_token_ids == TinyNN.tnn_null_ptr
     @t_seq_positions = TinyNN.tnn_input_1d_i32_ctx(@sess, tb)
     # The arch reads seq_rope_cfg / seq_donor_d_in off itself; the cache

data/lib/toy/llm/engine/llama_seq_engine_cuda.rb CHANGED Viewed

@@ -30,6 +30,9 @@ require_relative "../primitives/rope_cuda"
 require_relative "../primitives/swiglu_cuda"
 require_relative "../primitives/gqa_cuda"
 require_relative "../blocks/transformer_block_cuda"
+require_relative "../primitives/gdn"
+require_relative "../blocks/gdn_block"
+require_relative "../archs/layer_spec"
 require_relative "../archs/llama_arch_cuda"
 module Toy; module LLM; module Engine
@@ -1012,6 +1015,14 @@ class LlamaSeqEngineCuda
       @t_seq_attn_mask = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, tb_alloc, tb_alloc)
     end
+    # #1449: pre-create the token-id index leaf in ctx_w (galloc-external) before
+    # finalize, so galloc can't free its compute-arena slot and reuse it for the
+    # loss output (-> backward get_rows reads loss bits -> wild index -> OOB
+    # abort, layout-flaky on the C backend). build_forward reuses this handle;
+    # it's re-uploaded each step. Positions stay in the compute ctx (the loss
+    # aliases off=0 = token_ids, not positions). See tnn_input_1d_i32_persistent.
+    @t_seq_token_ids = TinyNNCuda.tnn_input_1d_i32_persistent(@sess, @seq_t * @seq_b)
     TinyNNCuda.tnn_finalize_weights(@sess)
     # Upload llama3-style RoPE freq_factors once the backend buffer
@@ -1061,7 +1072,11 @@ class LlamaSeqEngineCuda
     # that ordering); RoPE applies per-batch positional encoding
     # because rope_ext reads positions[k] for each ne[2] slot.
     tb = @seq_t * @seq_b
-    @t_seq_token_ids = TinyNNCuda.tnn_input_1d_i32(@sess, tb)
+    # #1449: token_ids is pre-created persistent (ctx_w) in finalize and survives
+    # reset_for_rebuild — reuse it. Only allocate here as a fallback for any path
+    # that builds the forward without the finalize pre-creation (then it lands in
+    # the compute ctx, the legacy behaviour).
+    @t_seq_token_ids = TinyNNCuda.tnn_input_1d_i32(@sess, tb) if @t_seq_token_ids == TinyNNCuda.tnn_null_ptr
     @t_seq_positions = TinyNNCuda.tnn_input_1d_i32_ctx(@sess, tb)
     # The arch reads seq_rope_cfg / seq_donor_d_in off itself; the cache

data/lib/toy/llm/engine/llama_seq_engine_metal.rb CHANGED Viewed

@@ -30,6 +30,9 @@ require_relative "../primitives/rope_metal"
 require_relative "../primitives/swiglu_metal"
 require_relative "../primitives/gqa_metal"
 require_relative "../blocks/transformer_block_metal"
+require_relative "../primitives/gdn"
+require_relative "../blocks/gdn_block"
+require_relative "../archs/layer_spec"
 require_relative "../archs/llama_arch_metal"
 module Toy; module LLM; module Engine
@@ -1012,6 +1015,14 @@ class LlamaSeqEngineMetal
       @t_seq_attn_mask = TinyNNMetal.tnn_input_2d_f32_persistent(@sess, tb_alloc, tb_alloc)
     end
+    # #1449: pre-create the token-id index leaf in ctx_w (galloc-external) before
+    # finalize, so galloc can't free its compute-arena slot and reuse it for the
+    # loss output (-> backward get_rows reads loss bits -> wild index -> OOB
+    # abort, layout-flaky on the C backend). build_forward reuses this handle;
+    # it's re-uploaded each step. Positions stay in the compute ctx (the loss
+    # aliases off=0 = token_ids, not positions). See tnn_input_1d_i32_persistent.
+    @t_seq_token_ids = TinyNNMetal.tnn_input_1d_i32_persistent(@sess, @seq_t * @seq_b)
     TinyNNMetal.tnn_finalize_weights(@sess)
     # Upload llama3-style RoPE freq_factors once the backend buffer
@@ -1061,7 +1072,11 @@ class LlamaSeqEngineMetal
     # that ordering); RoPE applies per-batch positional encoding
     # because rope_ext reads positions[k] for each ne[2] slot.
     tb = @seq_t * @seq_b
-    @t_seq_token_ids = TinyNNMetal.tnn_input_1d_i32(@sess, tb)
+    # #1449: token_ids is pre-created persistent (ctx_w) in finalize and survives
+    # reset_for_rebuild — reuse it. Only allocate here as a fallback for any path
+    # that builds the forward without the finalize pre-creation (then it lands in
+    # the compute ctx, the legacy behaviour).
+    @t_seq_token_ids = TinyNNMetal.tnn_input_1d_i32(@sess, tb) if @t_seq_token_ids == TinyNNMetal.tnn_null_ptr
     @t_seq_positions = TinyNNMetal.tnn_input_1d_i32_ctx(@sess, tb)
     # The arch reads seq_rope_cfg / seq_donor_d_in off itself; the cache

data/lib/toy/llm/primitives/depth_scale.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# lib/toy/llm/primitives/depth_scale.rb — L1 primitive: depth-dependent
+# LayerNorm scaling (Sun et al., "The Curse of Depth", arXiv 2502.05795).
+#
+# Pure module: `self.` methods only. The BLOCK (L2) owns the norm and computes
+# the per-layer constant; this primitive applies the parameter-free 1/sqrt(ell)
+# scaling to a normalised sublayer INPUT. See README.md.
+#
+# Formula: h(ell) = LayerNorm(h_ell) * (1/sqrt(ell)), applied to BOTH the
+# attention and FFN pre-norm outputs (the input fed to the sublayer), 1-indexed
+# ell. Caps Pre-LN variance growth with depth so deep layers stay effective.
+# Hyperparameter-free, no learned weights.
+#
+# Spinel hygiene: no Cfg / no default args. One FFI passthrough. inv_sqrt_depth
+# is the block's precomputed Float (1/sqrt(layer_index), computed in the CRuby
+# layer — no libm in the Spinel runner). Call via the full module path.
+module Toy
+  module LLM
+    module Primitives
+      # Depth-scaling primitive. Holds no state: the only entry point is the
+      # module-level method `apply`, which forwards to a single TinyNN call.
+      module DepthScale
+        # Symbol name of this primitive.
+        NAME = :depth_scale
+        # Scale a normalised tensor by the depth constant 1/sqrt(ell).
+        # x is the block's RMSNorm output (the sublayer input); the block
+        # passes inv_sqrt_depth = 1.0/Math.sqrt(layer) as a Float constant.
+        # Returns the depth-scaled handle.
+        #
+        # Arguments (all forwarded to TinyNN.tnn_scale in this order, unchanged):
+        #   sess           - the session the caller is building into.
+        #   x              - handle of the tensor to scale.
+        #   inv_sqrt_depth - the multiplier; computed by the caller, never here.
+        # The return value is whatever TinyNN.tnn_scale returns.
+        #
+        # Nothing is validated in this method: a zero, negative or non-finite
+        # inv_sqrt_depth is passed straight through.
+        # NOTE(review): this always calls TinyNN; the CUDA/Metal engines in
+        # this release call TinyNNCuda / TinyNNMetal for their own ops --
+        # confirm this primitive is only reached from the default backend.
+        def self.apply(sess, x, inv_sqrt_depth)
+          TinyNN.tnn_scale(sess, x, inv_sqrt_depth)
+        end
+      end
+    end
+  end
+end