RubyGems - toy - Versions diffs - 0.8.0 → 0.9.0 - Mend

toy 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +31 -0
data/Makefile +211 -5
data/README.md +1 -1
data/lib/toy/compute.rb +9 -0
data/lib/toy/compute_cuda.rb +8 -0
data/lib/toy/compute_metal.rb +17 -0
data/lib/toy/core/cli/new.rb +8 -0
data/lib/toy/ffi/tinynn.rb +19 -0
data/lib/toy/ffi/tinynn_cuda.rb +7 -0
data/lib/toy/ffi/tinynn_metal.rb +5 -0
data/lib/toy/llm/archs/layer_spec.rb +39 -0
data/lib/toy/llm/archs/llama_arch.rb +62 -1
data/lib/toy/llm/archs/llama_arch_cuda.rb +62 -1
data/lib/toy/llm/archs/llama_arch_metal.rb +62 -1
data/lib/toy/llm/blocks/gdn_block.rb +176 -0
data/lib/toy/llm/engine/gpt2_kv_engine.rb +11 -0
data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +11 -0
data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +11 -0
data/lib/toy/llm/engine/llama_kv_engine.rb +10 -2
data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +10 -2
data/lib/toy/llm/engine/llama_kv_engine_metal.rb +10 -2
data/lib/toy/llm/engine/llama_seq_engine.rb +16 -1
data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +16 -1
data/lib/toy/llm/engine/llama_seq_engine_metal.rb +16 -1
data/lib/toy/llm/primitives/depth_scale.rb +33 -0
data/lib/toy/llm/primitives/diff_attention.rb +71 -0
data/lib/toy/llm/primitives/gdn.rb +188 -0
data/lib/toy/llm/primitives/scalable_softmax.rb +37 -0
data/lib/toy/run/eval_metal.rb +12 -0
data/lib/toy/run/infer_metal.rb +19 -0
data/lib/toy/run/train_gpt2_metal.rb +7 -0
data/lib/toy/run/train_hybrid.rb +232 -0
data/lib/toy/run/train_metal.rb +10 -0
data/lib/toy/version.rb +4 -3
data/tinynn/tinynn_backend_cuda.c +22 -0
data/tinynn/tinynn_ggml.c +231 -0
metadata +9 -2

data/lib/toy/llm/primitives/diff_attention.rb ADDED Viewed

@@ -0,0 +1,71 @@
+# lib/toy/llm/primitives/diff_attention.rb — L1 primitive: Differential
+# Attention (DIFF Transformer, Ye et al.) — the diff-specific composition.
+#
+# Pure module: `self.` methods only, no module ivars, no state, no config
+# object. The BLOCK (L2) owns the Q1/Q2/K1/K2/V projections, runs the two
+# softmax attention maps (reusing the GQA primitive), and owns the learned
+# lambda vectors / per-head subln gamma; this primitive composes only the
+# DIFFERENTIAL pieces: the lambda scalar, the A1 - lambda*A2 combine, and the
+# (1 - lambda_init)-scaled per-head sub-norm. See README.md and
+# docs/roadmap/dragon-gdn-arch-2026-06-20.md.
+#
+# Formula (microsoft/unilm Diff-Transformer): each logical head owns two
+# q/k subheads. lambda = exp(lq1·lk1) - exp(lq2·lk2) + lambda_init, where
+# lambda_init = 0.8 - 0.6*exp(-0.3*depth) is a depth-constant the block passes
+# in. A = A1 - lambda*A2 ; O = A@V ; O = rms_norm(O)*gamma * (1 - lambda_init).
+#
+# Spinel hygiene: no Cfg ctor / no default args, no Card/step_bind, no FFI
+# :str. Fixed-arity FFI passthroughs. NOTE: call via the full module path
+# (Spinel can't dispatch a module method through a constant alias).
+#
+# This file does NOT require_relative "tinynn": the loader loads the backend's
+# TinyNN first (mirror generator handles the TinyNN.->TinyNN<Backend>. rename).
+module Toy
+  module LLM
+    module Primitives
+      module DiffAttention
+        NAME = :diff_attention
+        # Builds the per-head differential lambda SCALAR as a graph node:
+        #   lambda = exp(sum(lq1*lk1)) - exp(sum(lq2*lk2)) + lambda_init
+        # sess is the TinyNN session handle every op is issued against.
+        # lq1/lk1/lq2/lk2 are the learned [head_dim] vectors (block-owned);
+        # lambda_init is the depth-constant Float. Each dot product is an
+        # elementwise tnn_mul reduced to a [1] tensor via tnn_sum; the result
+        # lambda is a [1] tensor that broadcast-multiplies A2 in `combine`.
+        # (scale_bias folds the + lambda_init onto the first exp term, so the
+        # math is (exp1 + lambda_init) - exp2 = exp1 - exp2 + lambda_init.)
+        def self.lambda_scalar(sess, lq1, lk1, lq2, lk2, lambda_init)
+          d1  = TinyNN.tnn_mul(sess, lq1, lk1)                     # elementwise lq1*lk1
+          s1  = TinyNN.tnn_sum(sess, d1)                           # lq1·lk1
+          e1  = TinyNN.tnn_exp(sess, s1)                           # exp1
+          d2  = TinyNN.tnn_mul(sess, lq2, lk2)                     # elementwise lq2*lk2
+          s2  = TinyNN.tnn_sum(sess, d2)                           # lq2·lk2
+          e2  = TinyNN.tnn_exp(sess, s2)                           # exp2
+          e1b = TinyNN.tnn_scale_bias(sess, e1, 1.0, lambda_init)  # exp1 + lambda_init
+          TinyNN.tnn_sub(sess, e1b, e2)                            # (exp1+λ_init) - exp2
+        end
+        # Combine the two attention maps: A = A1 - lambda*A2. a1/a2 are the
+        # block's two softmax score maps (same shape); lambda_t is the [1]
+        # scalar tensor from `lambda_scalar` (broadcasts). a1 drives the shape
+        # under ggml broadcast; the lambda*a2 term is subtracted. Returns the
+        # handle of the tnn_sub node.
+        def self.combine(sess, a1, a2, lambda_t)
+          la2 = TinyNN.tnn_mul(sess, a2, lambda_t)   # lambda * A2 (a2 is the first operand)
+          TinyNN.tnn_sub(sess, a1, la2)              # A1 - lambda*A2
+        end
+        # Per-head output sub-norm + the fixed (1 - lambda_init) scaling:
+        #   O = rms_norm(O, gamma) * (1 - lambda_init).
+        # o is the per-head attention output (block-sliced); gamma the subln
+        # weight; eps the Float epsilon; one_minus_lambda_init the compile-time
+        # Float (1 - lambda_init) — the caller passes the already-subtracted
+        # value, this method does not compute it. tnn_rms_norm folds gamma;
+        # scale applies the depth constant. Returns the normed/scaled head
+        # output.
+        def self.subln(sess, o, gamma, eps, one_minus_lambda_init)
+          n = TinyNN.tnn_rms_norm(sess, o, gamma, eps)        # rms_norm(O) * gamma
+          TinyNN.tnn_scale(sess, n, one_minus_lambda_init)    # * (1 - lambda_init)
+        end
+      end
+    end
+  end
+end

data/lib/toy/llm/primitives/gdn.rb ADDED Viewed

@@ -0,0 +1,188 @@
+# lib/toy/llm/primitives/gdn.rb — L1 primitive: Gated DeltaNet (GDN)
+# composition (Dragon / Qwen3-Next linear-attention mixer).
+#
+# Pure module: `self.` methods only, no module ivars, no state, no
+# config object. The GDN BLOCK (L2) owns the q/k/v/z/b/a projections,
+# the short causal conv, and the A_log / dt_bias / gamma weights; this
+# primitive composes only the PARAMETER-FREE activation + recurrence
+# steps that wrap them. See lib/toy/llm/primitives/README.md.
+#
+# The recurrence core is the in-tree ggml op tnn_gated_delta_net. Its
+# CONTRACT (verified against ggml-cpu/ops.cpp:10634): the kernel applies
+# exp(g) internally (g is the LOG-decay, passed raw), uses beta DIRECTLY
+# (so it must be pre-sigmoid'd), uses q/k DIRECTLY (so they must be
+# pre-L2-normed), and scales the attn output by 1/sqrt(S_v) internally.
+# Output packs [token_outputs | state_snapshots]; the block slices the
+# first T*B token columns before gated_out.
+#
+# Spinel hygiene: no Cfg ctor / no default args (landmine #4), no
+# Card/step_bind, no FFI :str args. Fixed-arity FFI passthroughs only.
+#
+# This file does NOT require_relative "tinynn": the loading module loads
+# the correct backend's TinyNN before requiring this primitive (mirror
+# generator handles the TinyNN. -> TinyNN<Backend>. rename).
+module Toy
+  module LLM
+    module Primitives
+      module GDN
+        NAME = :gdn
+        # L2-normalise a projected q or k along its head dim (the delta
+        # rule replaces softmax normalisation with L2-norm). x is the
+        # block's already-projected (and conv'd) q or k tensor; eps the
+        # Float epsilon. Returns the normalised handle. Called twice by
+        # the block (once for q, once for k). This is a single passthrough
+        # to the fused tnn_l2_norm op.
+        def self.l2(sess, x, eps)
+          TinyNN.tnn_l2_norm(sess, x, eps)
+        end
+        # TRAINABLE L2 norm over ne0 — composed from ops that each have a ggml
+        # backward (mul / sum_rows / scale_bias / sqrt / div), because the fused
+        # `tnn_l2_norm` (GGML_OP_L2_NORM) has NO backward. Used by the trainable
+        # GDN block; the fused `l2` above stays the inference path.
+        #   y = x / sqrt(sum_ne0(x^2) + eps)
+        # Note eps is added INSIDE the square root here (scale_bias runs before
+        # sqrt). NOTE(review): whether the fused tnn_l2_norm applies eps the
+        # same way is not visible from this file — confirm before relying on
+        # bit-level parity between `l2` and `l2_train`.
+        def self.l2_train(sess, x, eps)
+          sq     = TinyNN.tnn_mul(sess, x, x)            # x^2
+          ss     = TinyNN.tnn_sum_rows(sess, sq)         # sum over ne0 -> [1,...]
+          ss_eps = TinyNN.tnn_scale_bias(sess, ss, 1.0, eps)  # + eps
+          denom  = TinyNN.tnn_sqrt(sess, ss_eps)         # [1,...]
+          # DIV backward does NOT reduce a broadcast src1, so materialise denom to
+          # x's full shape first (REPEAT backward sums the grad back correctly);
+          # the div is then same-shape.
+          denom_full = TinyNN.tnn_repeat(sess, denom, x)
+          TinyNN.tnn_div(sess, x, denom_full)
+        end
+        # Log-decay gate: g = -exp(A_log) * softplus(a + dt_bias). a is
+        # the projected decay stream [1,H,T,B]; dt_bias and A_log are the
+        # block's per-v-head weights ([1,H,1,1], broadcast). Returned g is
+        # the raw LOG-decay the recurrence kernel exps internally. Op
+        # order is fixed for ggml broadcast (the [1,H,T,B] softplus term
+        # drives the shape; the [1,H,1,1] -exp(A_log) broadcasts onto it),
+        # which is why the final tnn_mul takes sp first and ea_neg second.
+        def self.decay_gate(sess, a, dt_bias, a_log)
+          a_db   = TinyNN.tnn_add(sess, a, dt_bias)   # a + dt_bias
+          sp     = TinyNN.tnn_softplus(sess, a_db)    # softplus(a + dt_bias)
+          ea     = TinyNN.tnn_exp(sess, a_log)        # exp(A_log)
+          ea_neg = TinyNN.tnn_neg(sess, ea)           # -exp(A_log)
+          TinyNN.tnn_mul(sess, sp, ea_neg)            # g
+        end
+        # Update rate: beta = sigmoid(b). b is the projected update stream
+        # [1,H,T,B]. The kernel uses beta directly, so the sigmoid lives
+        # here. Returns beta (the handle of the fused tnn_sigmoid node).
+        def self.update_gate(sess, b)
+          TinyNN.tnn_sigmoid(sess, b)
+        end
+        # TRAINABLE update gate — sigmoid(b) composed as exp(b)/(1+exp(b)) from
+        # ops that each have a ggml backward, because GGML_UNARY_OP_SIGMOID has
+        # none. Same-shape throughout (no broadcast). The fused `update_gate`
+        # above (tnn_sigmoid) stays the inference path.
+        # NOTE(review): this form evaluates exp(b) directly, so a large positive
+        # b would overflow to inf and give inf/inf — presumably b stays in a
+        # safe range for the training configs that use this; confirm.
+        def self.update_gate_train(sess, b)
+          e = TinyNN.tnn_exp(sess, b)                    # exp(b)
+          d = TinyNN.tnn_scale_bias(sess, e, 1.0, 1.0)   # 1 + exp(b)
+          TinyNN.tnn_div(sess, e, d)
+        end
+        # The recurrence core: a fixed-arity passthrough to the fused
+        # tnn_gated_delta_net op. q,k must be L2-normed; beta sigmoid'd; g the
+        # raw log-decay; state the [S_v*S_v*H,K,B,1] carry. Returns the
+        # packed [S_v*H, T*B + K*S_v*B] output (token outputs then state
+        # snapshots). The block slices the leading T*B token columns.
+        def self.recur(sess, q, k, v, g, beta, state)
+          TinyNN.tnn_gated_delta_net(sess, q, k, v, g, beta, state)
+        end
+        # Path-B TRAINABLE recurrence: the gated delta rule expressed as an
+        # UNROLLED graph of ops that EACH have a ggml backward (mul / mul_mat /
+        # sub / scale / exp / add / reshape) — so training backward comes free
+        # and NO fused-kernel backward is needed (ggml has none for
+        # GATED_DELTA_NET). The fused `recur` above stays the fast INFERENCE
+        # path; this is its train-time twin, gated for numeric parity.
+        #
+        # Reproduces the fused kernel's token outputs for the SCALAR-decay path
+        # (g->ne0 == 1, the Dragon/Qwen3-Next per-head gate). Single seq (B=1),
+        # single head per call — the block loops heads/seqs around it in Phase 5.
+        # In the single-head case the inputs are the packed projection tensors
+        # (q,k,v = [S_v,1,T,1]; g,beta = [1,1,T,1]; state0 = [S_v,S_v]);
+        # per-token vectors are sliced via views internally (no ptr-array
+        # params → no Spinel IntArray-lock landmine).
+        # q/k must be pre-L2-normed and beta pre-sigmoid'd by the caller (the
+        # kernel contract). Returns [S_v, T] — token outputs concat'd along ne1.
+        #
+        #   per token t (matching ops.cpp:10731 exactly):
+        #     S = S * exp(g_t)                  decay  (scalar [1,1] broadcast)
+        #     u = matmul(S, k_t)                u[j] = sum_i S[i,j] k[i]
+        #     d = (v_t - u) * beta_t            delta
+        #     S = S + matmul(k_row, d_row)      outer  (k⊗d)[i,j] = k[i] d[j]
+        #     o_t = matmul(S, q_t)              o[j] = sum_i S[i,j] q[i]
+        #
+        # The kernel's 1/√S_v output scale is folded into a SINGLE pre-scale of q
+        # (q enters only the output read, never the state, so o[j] = sum_i S[i,j]
+        # (scale·q[i]) is exact). Done once on the contiguous q — NOT per-token on
+        # o — because a per-token ggml_scale's BACKWARD receives a view-shaped grad
+        # from the concat and asserts ggml_is_padded_1d (ggml.c:3392). One scale on
+        # the whole tensor keeps the backward grad contiguous.
+        #
+        # ONE head of the recurrence. In the general case q,k,v are the packed
+        # [S_v, n_heads, T, 1] projections; g,beta the packed [1, n_heads, T, 1]
+        # gates; state0 this head's [S_v,S_v] carry. `head` selects the head;
+        # per-token vectors are strided views into the packed tensors (token
+        # stride = S_v·n_heads, head base = S_v·head — the ggml [S_v,H,T,B]
+        # layout). Returns [S_v, T] for this head; the block concats heads along
+        # ne0. n_heads=1/head=0 is the plain single-head case (contiguous
+        # per-token, the Phase-4 gate shape).
+        #
+        # s_v, n_heads, head and n_tokens are plain Integers used for the byte
+        # offset arithmetic and the loop bound. If n_tokens is 0 the loop never
+        # runs and the initial TinyNN.tnn_null_ptr is returned. The updated
+        # state (s_mat) is NOT returned — only the token outputs are.
+        def self.recur_unrolled(sess, q, k, v, g, beta, state0, s_v, n_heads, head, n_tokens)
+          scale = 1.0 / Math.sqrt(s_v.to_f)
+          fbytes = 4                          # sizeof(f32)
+          tok_stride  = s_v * n_heads * fbytes # bytes between this head's tokens
+          head_base   = s_v * head * fbytes    # byte offset to this head's col 0
+          gtok_stride = n_heads * fbytes       # g/beta [1,H,T,1]: token stride
+          ghead_base  = head * fbytes          # g/beta byte offset to this head
+          q_s = TinyNN.tnn_scale(sess, q, scale)   # pre-scaled q (contiguous)
+          s_mat = state0                           # running state S, rebound each token
+          t_out = TinyNN.tnn_null_ptr              # placeholder until token 0 is emitted
+          t = 0
+          while t < n_tokens
+            # Per-token slices: [S_v,1] vectors (S_v contiguous), [1,1] scalars.
+            # q_t reads the pre-scaled q_s; k_t / v_t read the unscaled k / v.
+            q_t = TinyNN.tnn_view_2d(sess, q_s, s_v, 1, tok_stride, head_base + t * tok_stride)
+            k_t = TinyNN.tnn_view_2d(sess, k,   s_v, 1, tok_stride, head_base + t * tok_stride)
+            v_t = TinyNN.tnn_view_2d(sess, v,   s_v, 1, tok_stride, head_base + t * tok_stride)
+            g_t = TinyNN.tnn_view_2d(sess, g,    1, 1, gtok_stride, ghead_base + t * gtok_stride)
+            b_t = TinyNN.tnn_view_2d(sess, beta, 1, 1, gtok_stride, ghead_base + t * gtok_stride)
+            eg    = TinyNN.tnn_exp(sess, g_t)              # [1,1]
+            s_dec = TinyNN.tnn_mul(sess, s_mat, eg)        # [S_v,S_v] * [1,1] bcast
+            u     = TinyNN.tnn_matmul(sess, s_dec, k_t)    # [S_v,1]  u[j]
+            diff  = TinyNN.tnn_sub(sess, v_t, u)           # [S_v,1]
+            d     = TinyNN.tnn_mul(sess, diff, b_t)        # [S_v,1] * [1,1] bcast
+            k_row = TinyNN.tnn_reshape_2d(sess, k_t, 1, s_v)  # [1,S_v]
+            d_row = TinyNN.tnn_reshape_2d(sess, d, 1, s_v)    # [1,S_v]
+            outer = TinyNN.tnn_matmul(sess, k_row, d_row)  # [S_v,S_v] [i,j]=k[i]d[j]
+            s_mat = TinyNN.tnn_add(sess, s_dec, outer)     # state update
+            o_t   = TinyNN.tnn_matmul(sess, s_mat, q_t)    # [S_v,1]  o[j] (already scaled)
+            if t == 0
+              t_out = o_t                                  # first token seeds the output
+            else
+              t_out = TinyNN.tnn_concat(sess, t_out, o_t, 1)  # stack along ne1
+            end
+            t = t + 1
+          end
+          t_out
+        end
+        # Gated output norm: GatedRMSNorm(o, z) = rms_norm(o) * gamma *
+        # silu(z). o is the per-head token output (block-sliced from
+        # recur); z the output-gate stream; gamma the block's norm weight;
+        # eps the Float epsilon. tnn_rms_norm already folds the gamma
+        # scale, so this is rms_norm(o,gamma) * silu(z). The normed term
+        # drives the shape (it is the first tnn_mul operand); silu(z)
+        # broadcasts/multiplies. Returns the gated output (input to the
+        # block's out projection).
+        def self.gated_out(sess, o, z, gamma, eps)
+          n  = TinyNN.tnn_rms_norm(sess, o, gamma, eps)   # rms_norm(o) * gamma
+          sz = TinyNN.tnn_silu(sess, z)                   # silu(z)
+          TinyNN.tnn_mul(sess, n, sz)
+        end
+      end
+    end
+  end
+end

data/lib/toy/llm/primitives/scalable_softmax.rb ADDED Viewed

@@ -0,0 +1,37 @@
+# lib/toy/llm/primitives/scalable_softmax.rb — L1 primitive: Scalable-Softmax
+# (SSMax, Nakanishi, arXiv 2501.19399) — anti-attention-fading softmax.
+#
+# Pure module: `self.` methods only. The BLOCK (L2) owns the learned per-head
+# scalar s and computes the SSMax scale; this primitive is the scaled softmax
+# itself. See README.md.
+#
+# Formula (no-bias form, Eq. 11): a = softmax((s*log n) * (q·kᵀ / sqrt(d))),
+# i.e. the usual scaled logits are multiplied by the scalar s*log(n), where n
+# is the number of keys in the causal prefix and s is learnable (init ~0.168).
+# This is exactly the existing scaled-softmax op with a MODIFIED scale:
+#   ssmax_scale = (1/sqrt(d)) * s * log(n).
+# The block precomputes ssmax_scale (log(n) for a fixed context length is a
+# CRuby-layer Float constant — no libm in the Spinel runner) and passes it here.
+#
+# Spinel hygiene: no Cfg / no default args. One FFI passthrough to
+# tnn_soft_max_ext. Call via the full module path.
+module Toy
+  module LLM
+    module Primitives
+      module ScalableSoftmax
+        NAME = :scalable_softmax
+        # SSMax-scaled softmax over attention scores: a single passthrough to
+        # tnn_soft_max_ext. scores is the raw q·kᵀ map; mask the additive
+        # attention mask handle (or null); ssmax_scale the block's precomputed
+        # (1/sqrt(d))*s*log(n) Float; max_bias the ggml soft_max_ext ALiBi
+        # slope (0.0 when unused). Returns the attention-weight map. (Plain
+        # softmax falls out when ssmax_scale is the ordinary 1/sqrt(d) — so
+        # this also covers vanilla attention.) No scaling is computed here:
+        # the caller must pass the final scale.
+        def self.attend(sess, scores, mask, ssmax_scale, max_bias)
+          TinyNN.tnn_soft_max_ext(sess, scores, mask, ssmax_scale, max_bias)
+        end
+      end
+    end
+  end
+end

data/lib/toy/run/eval_metal.rb CHANGED Viewed

@@ -29,6 +29,16 @@
 require_relative "../models/arch"
 require_relative "../models/transformer_lm_metal"
 require_relative "../dev/toy_logprobs"
+require_relative "../ffi/tinynn_metal"
+# toy#90 — Metal teardown drain. See lib/toy/run/infer_metal.rb for the
+# full rationale: ggml-metal asserts at exit (ggml-metal-device.m:618) if
+# a Metal buffer outlives its device, and Spinel has no at_exit, so we
+# call tnn_shutdown_engines explicitly before exit. This is a Metal-only
+# concern; on other backends the call is a no-op.
+# RUNTIME-UNVERIFIED on gx10 (Linux) — Mac gate proves exit-0.
+def toy_metal_teardown
+  TinyNNMetal.tnn_shutdown_engines
+end
 GGUF  = ENV["GGUF"] || "data/smollm2-135m-f32.gguf"
 TOP_K = (ENV["TOP_K"] || "5").to_i
@@ -65,3 +75,5 @@ while k < top_ids.length
   puts "logprob: " + top_ids[k].to_s + " " + top_vals[k].to_s
   k = k + 1
 end
+toy_metal_teardown     # toy#90: drain Metal residency sets before exit 0

data/lib/toy/run/infer_metal.rb CHANGED Viewed

@@ -29,6 +29,22 @@
 require_relative "../models/arch"
 require_relative "../models/transformer_lm_metal"
 require_relative "../io/tokenizer"
+require_relative "../ffi/tinynn_metal"
+# toy#90 — Metal teardown. ggml-metal asserts at process exit
+# (ggml-metal-device.m:618, [rsets->data count]==0) if any Metal buffer is
+# still alive when its singleton device is freed by the C++ static
+# destructor. This runner never frees its session (it relies on process
+# exit), so without an explicit drain it exits 134 AFTER printing correct
+# output. tnn_shutdown_engines frees every live Metal session's
+# weights_buf (removing it from the residency set), satisfying the assert.
+# Spinel has no at_exit (lib/toy/run/serve.rb:123), so this MUST be called
+# explicitly before every exit that follows lm.load. METAL-ONLY: on
+# CPU/CUDA the registry is empty, so the call is effectively a no-op.
+# RUNTIME-UNVERIFIED on gx10 (Linux) — Mac gate proves the exit-0.
+# Takes no arguments; the only work is the single FFI call below.
+def toy_metal_teardown
+  TinyNNMetal.tnn_shutdown_engines
+end
 GGUF  = ENV["GGUF"] || "data/smollm2-135m-f32.gguf"
 PROMPT = ENV["PROMPT"] || "Once upon a time"
@@ -115,5 +131,8 @@ else
   puts "toy-infer: model has no embedded tokenizer; a string prompt cannot " +
        "be tokenized. Pass numeric token IDs via --prompt-ids (PROMPT_IDS=...) " +
        "or re-convert with --with-tokenizer."
+  toy_metal_teardown   # toy#90: lm.load already allocated Metal buffers
   exit 1
 end
+toy_metal_teardown     # toy#90: drain Metal residency sets before exit 0

data/lib/toy/run/train_gpt2_metal.rb CHANGED Viewed

@@ -83,3 +83,10 @@ while step < STEPS
   puts "step " + (step + 1).to_s + ": loss=" + loss.to_s
   step = step + 1
 end
+# toy#90 — Metal teardown drain. The GPT-2 Metal training session is never
+# explicitly freed; without this the ggml-metal device-free residency assert
+# (ggml-metal-device.m:618) aborts the process (exit 134) AFTER a correct
+# run. Spinel has no at_exit (lib/toy/run/serve.rb:123) so drain explicitly.
+# Metal-only: a no-op on non-Metal backends. RUNTIME-UNVERIFIED on gx10 — Mac proves it.
+TinyNNMetal.tnn_shutdown_engines

data/lib/toy/run/train_hybrid.rb ADDED Viewed

@@ -0,0 +1,232 @@
+# lib/toy/run/train_hybrid.rb — Phase 5 capstone: a SELF-CONTAINED from-scratch
+# HYBRID trainer (one attention layer + one Gated-DeltaNet layer), in its OWN
+# Spinel compilation unit.
+#
+# Why its own runner (not the llama engine): pulling GDNBlock alloc/train code
+# into Toy::LLM::Engine::LlamaSeqEngine's unit miscompiles the proven byte-exact
+# attention path on the union pin (landmine #16 family — the same reason
+# toy-train-lora / toy-train-gpt2 are separate binaries). A dedicated unit can't
+# corrupt that path. See docs/roadmap/dragon-gdn-arch-2026-06-20.md (Phase 5).
+#
+# The forward dispatches per layer on a flat INT kind (the LayerSpec seam
+# pattern, monomorphic per call site): KIND_ATTENTION → inline causal
+# self-attention; KIND_GDN → Toy::LLM::Blocks::GDNBlock (Path-B autograd
+# recurrence). All params are flattened into one uniform ptr array so the
+# AdamW opt_step loop never touches two block types in one method.
+#
+#   x   = get_rows(embed, ids)                 [d_model, T]
+#   x   = attention_layer(x)                   [d_model, T]  (KIND_ATTENTION)
+#   x   = GDNBlock.build_forward(x)            [d_model, T]  (KIND_GDN)
+#   xf  = rmsnorm(x, final_gamma)              [d_model, T]
+#   lgt = matmul(embed, xf)                    [vocab, T]    (tied)
+#   loss= cross_entropy(lgt, labels)           overfit one fixed batch
+#
+# Asserts the CE loss DECREASES — the heterogeneous trainable stack works.
+require_relative "../../toy"
+require_relative "../ffi/tinynn"
+require_relative "../llm/primitives/rms_norm"
+require_relative "../llm/primitives/gdn"
+require_relative "../llm/blocks/gdn_block"
+require_relative "../llm/archs/layer_spec"
+module Toy
+  module LLM
+    module Run
+      module TrainHybrid
+        VOCAB = 16
+        DM    = 8
+        H     = 2
+        S_V   = 4     # H*S_V == DM
+        T     = 4
+        STEPS = 16
+        EPS   = 1.0e-5
+        # Deterministic pseudo-random weight initialiser. Returns an n-element
+        # Float array whose element i is
+        #   ((((i + seed) * 1103515245 + 12345) % 1000) - 500) * 0.001
+        # i.e. small values of magnitude at most 0.5. Same (n, seed) always
+        # yields the same array, so runs are reproducible.
+        def self.fillv(n, seed)
+          # Seed with a Float then pop it to get an empty array — presumably so
+          # Spinel infers a Float element type (TODO confirm).
+          a = [0.0]; a.pop
+          i = 0
+          while i < n
+            a.push(((((i + seed) * 1103515245 + 12345) % 1000) - 500).to_f * 0.001)
+            i = i + 1
+          end
+          a
+        end
+        # Returns a fresh n-element array of 0.0 (empty when n <= 0).
+        def self.zeros(n)
+          # Same Float-seeded empty-array idiom as fillv — presumably for
+          # Spinel's element-type inference (TODO confirm).
+          a = [0.0]; a.pop
+          i = 0
+          while i < n
+            a.push(0.0)
+            i = i + 1
+          end
+          a
+        end
+        # Inline single-head causal self-attention (no RoPE/GQA — minimal,
+        # trainable). Weights arrive as explicit handles. Returns x + Wo·ctx.
+        # t_x is the layer input; rn the pre-norm weight; wq/wk/wv/wo the
+        # projection weights; eps the RMSNorm epsilon. The 1/sqrt(d) score
+        # scale uses the module constant DM (not an argument), so this layer
+        # is tied to the module's model width. Causality comes from
+        # tnn_diag_mask_inf applied before the softmax.
+        def self.attention_layer(sess, t_x, rn, wq, wk, wv, wo, eps)
+          h = Toy::LLM::Primitives::RMSNorm.build(sess, t_x, rn, eps)   # pre-norm
+          q = TinyNN.tnn_matmul(sess, wq, h)            # [DM, T]
+          k = TinyNN.tnn_matmul(sess, wk, h)            # [DM, T]
+          v = TinyNN.tnn_matmul(sess, wv, h)            # [DM, T]
+          scores = TinyNN.tnn_matmul(sess, k, q)        # [T_k, T_q]
+          scaled = TinyNN.tnn_scale(sess, scores, 1.0 / Math.sqrt(DM.to_f))
+          masked = TinyNN.tnn_diag_mask_inf(sess, scaled, 0)
+          attn   = TinyNN.tnn_softmax(sess, masked)     # [T_k, T_q]
+          v_t    = TinyNN.tnn_transpose(sess, v)        # [T, DM]
+          ctx    = TinyNN.tnn_matmul(sess, v_t, attn)   # [DM, T_q]
+          out    = TinyNN.tnn_matmul(sess, wo, ctx)     # [DM, T]
+          TinyNN.tnn_add(sess, t_x, out)                # residual: x + Wo·ctx
+        end
+        def self.run
+          sess = TinyNN.tnn_session_new(0)
+          TinyNN.tnn_session_set_graph_capacity(sess, 262144)
+          # Flat param arrays (uniform ptr) so opt_step never sees two block types.
+          pp = [TinyNN.tnn_null_ptr]; pp.pop
+          pm = [TinyNN.tnn_null_ptr]; pm.pop
+          pv = [TinyNN.tnn_null_ptr]; pv.pop
+          # reg2/reg1: alloc a weight + matching m/v, register, return the weight.
+          embed  = reg2(sess, pp, pm, pv, VOCAB, DM)   # ne0=DM, ne1=VOCAB
+          fnorm  = reg1(sess, pp, pm, pv, DM)
+          # Attention layer weights.
+          a_rn   = reg1(sess, pp, pm, pv, DM)
+          a_wq   = reg2(sess, pp, pm, pv, DM, DM)
+          a_wk   = reg2(sess, pp, pm, pv, DM, DM)
+          a_wv   = reg2(sess, pp, pm, pv, DM, DM)
+          a_wo   = reg2(sess, pp, pm, pv, DM, DM)
+          # GDN layer (its own weights live in ft_weights/ft_m/ft_v; flatten in).
+          gblk = Toy::LLM::Blocks::GDNBlock.new
+          gblk.alloc_trainable_f32_weights!(sess, DM, S_V, H)
+          bi = 0
+          while bi < gblk.ft_weights.length
+            pp.push(gblk.ft_weights[bi]); pm.push(gblk.ft_m[bi]); pv.push(gblk.ft_v[bi])
+            bi = bi + 1
+          end
+          # set_param BEFORE finalize (load-bearing order).
+          gi = 0
+          while gi < pp.length
+            TinyNN.tnn_set_param(pp[gi])
+            gi = gi + 1
+          end
+          TinyNN.tnn_finalize_weights(sess)
+          gblk.zero_state!(sess)
+          # Init weights + zero moments.
+          gi = 0
+          while gi < pp.length
+            n = TinyNN.tnn_tensor_nelements(pp[gi])
+            TinyNN.tnn_upload_from_float_array(sess, pp[gi], fillv(n, gi * 7 + 1), n)
+            TinyNN.tnn_zero_tensor(sess, pm[gi])
+            TinyNN.tnn_zero_tensor(sess, pv[gi])
+            gi = gi + 1
+          end
+          # Forward — per-layer INT-kind dispatch (the seam pattern).
+          t_tok = TinyNN.tnn_input_1d_i32(sess, T)
+          x = TinyNN.tnn_get_rows(sess, embed, t_tok)
+          kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION,
+                   Toy::LLM::Archs::LayerSpec::KIND_GDN]
+          li = 0
+          while li < kinds.length
+            if kinds[li] == Toy::LLM::Archs::LayerSpec::KIND_ATTENTION
+              x = attention_layer(sess, x, a_rn, a_wq, a_wk, a_wv, a_wo, EPS)
+            else
+              x = gblk.build_forward(sess, x, T, EPS)
+            end
+            li = li + 1
+          end
+          xf  = Toy::LLM::Primitives::RMSNorm.build(sess, x, fnorm, EPS)
+          lgt = TinyNN.tnn_matmul(sess, embed, xf)         # [VOCAB, T] tied
+          t_labels = TinyNN.tnn_input_2d_f32(sess, T, VOCAB)
+          t_hp     = TinyNN.tnn_input_1d_f32(sess, 7)
+          t_loss   = TinyNN.tnn_cross_entropy_loss(sess, lgt, t_labels)
+          TinyNN.tnn_set_output(t_loss)
+          TinyNN.tnn_set_loss(t_loss)
+          TinyNN.tnn_build_forward_only(sess, t_loss)
+          TinyNN.tnn_build_backward(sess)
+          gj = 0
+          while gj < pp.length
+            tg = TinyNN.tnn_tensor_grad(sess, pp[gj])
+            to = TinyNN.tnn_opt_step_adamw(sess, pp[gj], tg, pm[gj], pv[gj], t_hp)
+            TinyNN.tnn_extend_backward_graph(sess, to)
+            gj = gj + 1
+          end
+          TinyNN.tnn_pin_all_graph_b_nodes(sess)
+          TinyNN.tnn_realize_backward(sess)
+          ids = [1, 2, 3, 4]
+          labels = zeros(VOCAB * T)
+          tt = 0
+          while tt < T
+            tgt = (ids[tt] + 1) % VOCAB
+            labels[tgt + VOCAB * tt] = 1.0
+            tt = tt + 1
+          end
+          hp = [0.02, 0.9, 0.95, 1.0e-8, 0.0, 0.9, 0.95]
+          first_loss = 0.0
+          last_loss  = 0.0
+          s = 0
+          while s < STEPS
+            if s == 0
+              TinyNN.tnn_graph_reset(sess)
+            else
+              TinyNN.tnn_graph_reset_grads_only(sess)
+            end
+            TinyNN.upload_int_array(sess, t_tok, ids)
+            TinyNN.tnn_upload_from_float_array(sess, t_labels, labels, VOCAB * T)
+            TinyNN.tnn_upload_from_float_array(sess, t_hp, hp, 7)
+            TinyNN.tnn_compute_backward(sess)
+            TinyNN.tnn_download(sess, t_loss)
+            lv = TinyNN.tnn_scratch_get(sess, 0)
+            if s == 0
+              first_loss = lv
+            end
+            last_loss = lv
+            puts "step " + s.to_s + ": loss=" + lv.to_s
+            s = s + 1
+          end
+          ok = true
+          if first_loss != first_loss || last_loss != last_loss
+            puts "FAIL: loss is NaN"
+            ok = false
+          end
+          if last_loss >= first_loss - 0.05
+            puts "FAIL: loss did not decrease (first=" + first_loss.to_s + " last=" + last_loss.to_s + ")"
+            ok = false
+          end
+          if ok
+            puts "HYBRID train smoke PASS: attention+GDN from-scratch hybrid trains — CE loss " +
+                 first_loss.to_s + " -> " + last_loss.to_s + " over " + STEPS.to_s + " steps"
+          else
+            puts "HYBRID train smoke FAIL"
+          end
+        end
+        # Allocate a persistent 1-D f32 weight of length n plus two same-sized
+        # persistent tensors for its optimizer moments, and register all
+        # three: the weight is pushed onto pp, the moments onto pm and pv (at
+        # the same index). Returns the weight handle.
+        def self.reg1(sess, pp, pm, pv, n)
+          w = TinyNN.tnn_input_1d_f32_persistent(sess, n)
+          pp.push(w); pm.push(TinyNN.tnn_input_1d_f32_persistent(sess, n))
+          pv.push(TinyNN.tnn_input_1d_f32_persistent(sess, n))
+          w
+        end
+        # 2-D counterpart of reg1: allocate a persistent (rows, cols) f32
+        # weight plus two same-shaped persistent tensors for its optimizer
+        # moments, push them onto pp / pm / pv at the same index, and return
+        # the weight handle. rows and cols are forwarded unchanged to
+        # tnn_input_2d_f32_persistent.
+        def self.reg2(sess, pp, pm, pv, rows, cols)
+          w = TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols)
+          pp.push(w); pm.push(TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols))
+          pv.push(TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols))
+          w
+        end
+      end
+    end
+  end
+end
+Toy::LLM::Run::TrainHybrid.run

data/lib/toy/run/train_metal.rb CHANGED Viewed

@@ -225,3 +225,13 @@ if EVENTS.length > 0 && TinyNNMetal.tnn_events_active == 1
   TinyNNMetal.tnn_events_emit(re.dump)
   TinyNNMetal.tnn_events_close
 end
+# toy#90 — Metal teardown drain. The training session (recipe.fs_cache.sess)
+# is never explicitly freed (the runner relies on process exit), so without
+# this the ggml-metal device-free assert (ggml-metal-device.m:618,
+# [rsets->data count]==0) fires AFTER a correct run, exiting 134. Spinel has
+# no at_exit (lib/toy/run/serve.rb:123) so we drain explicitly here.
+# tnn_shutdown_engines frees every live Metal session's weights_buf
+# (removing it from the residency set) and the CPU write session too.
+# RUNTIME-UNVERIFIED on gx10 (Linux) — Mac gate proves the exit-0.
+TinyNNMetal.tnn_shutdown_engines

data/lib/toy/version.rb CHANGED Viewed

@@ -4,8 +4,9 @@
 # under CRuby).
 module Toy
   # Single source of truth: gemspec + `toy --version` + `toy --manifest`
-  # all read this; README/CHANGELOG/git tag display it as v0.8.0.
-  # v0.8.0 (2026-06-12) is the first PUBLISHED version (RubyGems).
+  # all read this; README/CHANGELOG/git tag display it as v0.9.0.
+  # v0.8.0 (2026-06-12) was the first PUBLISHED version (RubyGems);
+  # v0.9.0 adds the Dragon / Gated-DeltaNet trainable hybrid arc.
   # Pre-1.0: not API-stable.
-  VERSION = "0.8.0".freeze
+  VERSION = "0.9.0".freeze
 end

data/tinynn/tinynn_backend_cuda.c CHANGED Viewed

@@ -16,6 +16,28 @@
 #include <stdio.h>
 #include <stdlib.h>
+/* toy#94 — DURABLE GUARD against an int-truncated BYO-pointer.
+ *
+ * ggml_backend_cuda_buffer_from_ptr is the vendored BYO-pointer entry
+ * (vendor-patches/0001-cuda-buffer_from_ptr.patch, which patches BOTH
+ * src/ggml-cuda/ggml-cuda.cu AND include/ggml-cuda.h). The SYMBOL is
+ * defined in libggml-cuda.a, but `make gem-prep` resets vendor/ggml to
+ * GGML_REV (Makefile $(GGML_DIR)/.patched / commit 312fae9) — which
+ * silently drops the *header* declaration while a previously-built
+ * archive still carries the symbol. With no prototype in scope, C
+ * treats the call as an implicit declaration returning `int`: on
+ * aarch64 (GB10) the 64-bit ggml_backend_buffer_t is TRUNCATED to 32
+ * bits. The truncated pointer is non-NULL, so the !buf check passes,
+ * and the next ggml_backend_buffer_get_base() dereferences garbage →
+ * SIGSEGV in the Phase-2 mmap weight-attach (the toy#94 stack:
+ * ggml_backend_buffer_get_base <- tnn_session_attach_weight_mmap <-
+ * realize_for_mmap). Declaring it here keeps a correct 64-bit prototype
+ * in scope REGARDLESS of the vendored header's post-reset state, so the
+ * pointer can never be truncated. (Identical to the header decl when
+ * the patch is applied; harmless redundancy.) */
+GGML_BACKEND_API ggml_backend_buffer_t
+ggml_backend_cuda_buffer_from_ptr(void *host_ptr, size_t size, int device);
 ggml_backend_t tnn_backend_cuda_init_internal(void)
 {
     return ggml_backend_cuda_init(0);