toy 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +31 -0
  3. data/Makefile +211 -5
  4. data/README.md +1 -1
  5. data/lib/toy/compute.rb +9 -0
  6. data/lib/toy/compute_cuda.rb +8 -0
  7. data/lib/toy/compute_metal.rb +17 -0
  8. data/lib/toy/core/cli/new.rb +8 -0
  9. data/lib/toy/ffi/tinynn.rb +19 -0
  10. data/lib/toy/ffi/tinynn_cuda.rb +7 -0
  11. data/lib/toy/ffi/tinynn_metal.rb +5 -0
  12. data/lib/toy/llm/archs/layer_spec.rb +39 -0
  13. data/lib/toy/llm/archs/llama_arch.rb +62 -1
  14. data/lib/toy/llm/archs/llama_arch_cuda.rb +62 -1
  15. data/lib/toy/llm/archs/llama_arch_metal.rb +62 -1
  16. data/lib/toy/llm/blocks/gdn_block.rb +176 -0
  17. data/lib/toy/llm/engine/gpt2_kv_engine.rb +11 -0
  18. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +11 -0
  19. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +11 -0
  20. data/lib/toy/llm/engine/llama_kv_engine.rb +10 -2
  21. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +10 -2
  22. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +10 -2
  23. data/lib/toy/llm/engine/llama_seq_engine.rb +16 -1
  24. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +16 -1
  25. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +16 -1
  26. data/lib/toy/llm/primitives/depth_scale.rb +33 -0
  27. data/lib/toy/llm/primitives/diff_attention.rb +71 -0
  28. data/lib/toy/llm/primitives/gdn.rb +188 -0
  29. data/lib/toy/llm/primitives/scalable_softmax.rb +37 -0
  30. data/lib/toy/run/eval_metal.rb +12 -0
  31. data/lib/toy/run/infer_metal.rb +19 -0
  32. data/lib/toy/run/train_gpt2_metal.rb +7 -0
  33. data/lib/toy/run/train_hybrid.rb +232 -0
  34. data/lib/toy/run/train_metal.rb +10 -0
  35. data/lib/toy/version.rb +4 -3
  36. data/tinynn/tinynn_backend_cuda.c +22 -0
  37. data/tinynn/tinynn_ggml.c +231 -0
  38. metadata +9 -2
@@ -0,0 +1,71 @@
1
+ # lib/toy/llm/primitives/diff_attention.rb — L1 primitive: Differential
2
+ # Attention (DIFF Transformer, Ye et al.) — the diff-specific composition.
3
+ #
4
+ # Pure module: `self.` methods only, no module ivars, no state, no config
5
+ # object. The BLOCK (L2) owns the Q1/Q2/K1/K2/V projections, runs the two
6
+ # softmax attention maps (reusing the GQA primitive), and owns the learned
7
+ # lambda vectors / per-head subln gamma; this primitive composes only the
8
+ # DIFFERENTIAL pieces: the lambda scalar, the A1 - lambda*A2 combine, and the
9
+ # (1 - lambda_init)-scaled per-head sub-norm. See README.md and
10
+ # docs/roadmap/dragon-gdn-arch-2026-06-20.md.
11
+ #
12
+ # Formula (microsoft/unilm Diff-Transformer): each logical head owns two
13
+ # q/k subheads. lambda = exp(lq1·lk1) - exp(lq2·lk2) + lambda_init, where
14
+ # lambda_init = 0.8 - 0.6*exp(-0.3*depth) is a depth-constant the block passes
15
+ # in. A = A1 - lambda*A2 ; O = A@V ; O = rms_norm(O)*gamma * (1 - lambda_init).
16
+ #
17
+ # Spinel hygiene: no Cfg ctor / no default args, no Card/step_bind, no FFI
18
+ # :str. Fixed-arity FFI passthroughs. NOTE: call via the full module path
19
+ # (Spinel can't dispatch a module method through a constant alias).
20
+ #
21
+ # This file does NOT require_relative "tinynn": the loader loads the backend's
22
+ # TinyNN first (mirror generator handles the TinyNN.->TinyNN<Backend>. rename).
23
+
24
+ module Toy
25
+ module LLM
26
+ module Primitives
27
+ module DiffAttention
28
+ NAME = :diff_attention
29
+
30
+ # The per-head differential lambda SCALAR:
31
+ # lambda = exp(sum(lq1*lk1)) - exp(sum(lq2*lk2)) + lambda_init
32
+ # lq1/lk1/lq2/lk2 are the learned [head_dim] vectors (block-owned);
33
+ # lambda_init is the depth-constant Float. The dot products reduce to
34
+ # a [1] tensor via tnn_sum; the result lambda is a [1] tensor that
35
+ # broadcast-multiplies A2 in `combine`. (scale_bias folds the
36
+ # + lambda_init onto the first exp term, so the math is
37
+ # (exp1 + lambda_init) - exp2 = exp1 - exp2 + lambda_init.)
38
+ def self.lambda_scalar(sess, lq1, lk1, lq2, lk2, lambda_init)
39
+ d1 = TinyNN.tnn_mul(sess, lq1, lk1)
40
+ s1 = TinyNN.tnn_sum(sess, d1)
41
+ e1 = TinyNN.tnn_exp(sess, s1)
42
+ d2 = TinyNN.tnn_mul(sess, lq2, lk2)
43
+ s2 = TinyNN.tnn_sum(sess, d2)
44
+ e2 = TinyNN.tnn_exp(sess, s2)
45
+ e1b = TinyNN.tnn_scale_bias(sess, e1, 1.0, lambda_init) # exp1 + lambda_init
46
+ TinyNN.tnn_sub(sess, e1b, e2) # (exp1+λ_init) - exp2
47
+ end
48
+
49
+ # Combine the two attention maps: A = A1 - lambda*A2. a1/a2 are the
50
+ # block's two softmax score maps (same shape); lambda the [1] scalar
51
+ # from `lambda_scalar` (broadcasts). a1 drives the shape under ggml
52
+ # broadcast; the lambda*a2 term is subtracted.
53
+ def self.combine(sess, a1, a2, lambda_t)
54
+ la2 = TinyNN.tnn_mul(sess, a2, lambda_t)
55
+ TinyNN.tnn_sub(sess, a1, la2)
56
+ end
57
+
58
+ # Per-head output sub-norm + the fixed (1 - lambda_init) scaling:
59
+ # O = rms_norm(O, gamma) * (1 - lambda_init).
60
+ # o is the per-head attention output (block-sliced); gamma the subln
61
+ # weight; eps the Float epsilon; one_minus_lambda_init the compile-time
62
+ # Float (1 - lambda_init). tnn_rms_norm folds gamma; scale applies the
63
+ # depth constant. Returns the normed/scaled head output.
64
+ def self.subln(sess, o, gamma, eps, one_minus_lambda_init)
65
+ n = TinyNN.tnn_rms_norm(sess, o, gamma, eps)
66
+ TinyNN.tnn_scale(sess, n, one_minus_lambda_init)
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,188 @@
1
+ # lib/toy/llm/primitives/gdn.rb — L1 primitive: Gated DeltaNet (GDN)
2
+ # composition (Dragon / Qwen3-Next linear-attention mixer).
3
+ #
4
+ # Pure module: `self.` methods only, no module ivars, no state, no
5
+ # config object. The GDN BLOCK (L2) owns the q/k/v/z/b/a projections,
6
+ # the short causal conv, and the A_log / dt_bias / gamma weights; this
7
+ # primitive composes only the PARAMETER-FREE activation + recurrence
8
+ # steps that wrap them. See lib/toy/llm/primitives/README.md.
9
+ #
10
+ # The recurrence core is the in-tree ggml op tnn_gated_delta_net. Its
11
+ # CONTRACT (verified against ggml-cpu/ops.cpp:10634): the kernel applies
12
+ # exp(g) internally (g is the LOG-decay, passed raw), uses beta DIRECTLY
13
+ # (so it must be pre-sigmoid'd), uses q/k DIRECTLY (so they must be
14
+ # pre-L2-normed), and scales the attn output by 1/sqrt(S_v) internally.
15
+ # Output packs [token_outputs | state_snapshots]; the block slices the
16
+ # first T*B token columns before gated_out.
17
+ #
18
+ # Spinel hygiene: no Cfg ctor / no default args (landmine #4), no
19
+ # Card/step_bind, no FFI :str args. Fixed-arity FFI passthroughs only.
20
+ #
21
+ # This file does NOT require_relative "tinynn": the loading module loads
22
+ # the correct backend's TinyNN before requiring this primitive (mirror
23
+ # generator handles the TinyNN. -> TinyNN<Backend>. rename).
24
+
25
+ module Toy
26
+ module LLM
27
+ module Primitives
28
+ module GDN
29
+ NAME = :gdn
30
+
31
+ # L2-normalise a projected q or k along its head dim (the delta
32
+ # rule replaces softmax normalisation with L2-norm). x is the
33
+ # block's already-projected (and conv'd) q or k tensor; eps the
34
+ # Float epsilon. Returns the normalised handle. Called twice by
35
+ # the block (once for q, once for k).
36
+ def self.l2(sess, x, eps)
37
+ TinyNN.tnn_l2_norm(sess, x, eps)
38
+ end
39
+
40
+ # TRAINABLE L2 norm over ne0 — composed from ops that each have a ggml
41
+ # backward (mul / sum_rows / scale_bias / sqrt / repeat / div), because the fused
42
+ # `tnn_l2_norm` (GGML_OP_L2_NORM) has NO backward. Used by the trainable
43
+ # GDN block; the fused `l2` above stays the inference path.
44
+ # y = x / sqrt(sum_ne0(x^2) + eps)
45
+ def self.l2_train(sess, x, eps)
46
+ sq = TinyNN.tnn_mul(sess, x, x) # x^2
47
+ ss = TinyNN.tnn_sum_rows(sess, sq) # sum over ne0 -> [1,...]
48
+ ss_eps = TinyNN.tnn_scale_bias(sess, ss, 1.0, eps) # + eps
49
+ denom = TinyNN.tnn_sqrt(sess, ss_eps) # [1,...]
50
+ # DIV backward does NOT reduce a broadcast src1, so materialise denom to
51
+ # x's full shape first (REPEAT backward sums the grad back correctly);
52
+ # the div is then same-shape.
53
+ denom_full = TinyNN.tnn_repeat(sess, denom, x)
54
+ TinyNN.tnn_div(sess, x, denom_full)
55
+ end
56
+
57
+ # Log-decay gate: g = -exp(A_log) * softplus(a + dt_bias). a is
58
+ # the projected decay stream [1,H,T,B]; dt_bias and A_log are the
59
+ # block's per-v-head weights ([1,H,1,1], broadcast). Returned g is
60
+ # the raw LOG-decay the recurrence kernel exps internally. Op
61
+ # order is fixed for ggml broadcast (the [1,H,T,B] softplus term
62
+ # drives the shape; the [1,H,1,1] -exp(A_log) broadcasts onto it).
63
+ def self.decay_gate(sess, a, dt_bias, a_log)
64
+ a_db = TinyNN.tnn_add(sess, a, dt_bias)
65
+ sp = TinyNN.tnn_softplus(sess, a_db)
66
+ ea = TinyNN.tnn_exp(sess, a_log)
67
+ ea_neg = TinyNN.tnn_neg(sess, ea)
68
+ TinyNN.tnn_mul(sess, sp, ea_neg)
69
+ end
70
+
71
+ # Update rate: beta = sigmoid(b). b is the projected update stream
72
+ # [1,H,T,B]. The kernel uses beta directly, so the sigmoid lives
73
+ # here. Returns beta.
74
+ def self.update_gate(sess, b)
75
+ TinyNN.tnn_sigmoid(sess, b)
76
+ end
77
+
78
+ # TRAINABLE update gate — sigmoid(b) composed as exp(b)/(1+exp(b)) from
79
+ # ops that each have a ggml backward, because GGML_UNARY_OP_SIGMOID has
80
+ # none. Same-shape throughout (no broadcast). The fused `update_gate`
81
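+ # above (tnn_sigmoid) stays the inference path. NOTE(review): exp(b) can overflow to +inf for large positive b (then inf/inf = NaN) — confirm b stays bounded during training.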
82
+ def self.update_gate_train(sess, b)
83
+ e = TinyNN.tnn_exp(sess, b) # exp(b)
84
+ d = TinyNN.tnn_scale_bias(sess, e, 1.0, 1.0) # 1 + exp(b)
85
+ TinyNN.tnn_div(sess, e, d)
86
+ end
87
+
88
+ # The recurrence core. q,k must be L2-normed; beta sigmoid'd; g the
89
+ # raw log-decay; state the [S_v*S_v*H,K,B,1] carry. Returns the
90
+ # packed [S_v*H, T*B + K*S_v*B] output (token outputs then state
91
+ # snapshots). The block slices the leading T*B token columns.
92
+ def self.recur(sess, q, k, v, g, beta, state)
93
+ TinyNN.tnn_gated_delta_net(sess, q, k, v, g, beta, state)
94
+ end
95
+
96
+ # Path-B TRAINABLE recurrence: the gated delta rule expressed as an
97
+ # UNROLLED graph of ops that EACH have a ggml backward (mul / mul_mat /
98
+ # sub / scale / exp / add / reshape) — so training backward comes free
99
+ # and NO fused-kernel backward is needed (ggml has none for
100
+ # GATED_DELTA_NET). The fused `recur` above stays the fast INFERENCE
101
+ # path; this is its train-time twin, gated for numeric parity.
102
+ #
103
+ # Reproduces the fused kernel's token outputs for the SCALAR-decay path
104
+ # (g->ne0 == 1, the Dragon/Qwen3-Next per-head gate). Single seq (B=1),
105
+ # single head per call — the block loops heads/seqs around it in Phase 5.
106
+ # Inputs are the packed projection tensors (q,k,v = [S_v,n_heads,T,1]; g,beta =
107
+ # [1,n_heads,T,1]; state0 = this head's [S_v,S_v]); per-token vectors are sliced via views
108
+ # internally (no ptr-array params → no Spinel IntArray-lock landmine).
109
+ # q/k must be pre-L2-normed and beta pre-sigmoid'd by the caller (the
110
+ # kernel contract). Returns [S_v, T] — token outputs concat'd along ne1.
111
+ #
112
+ # per token t (matching ops.cpp:10731 exactly):
113
+ # S = S * exp(g_t) decay (scalar [1,1] broadcast)
114
+ # u = matmul(S, k_t) u[j] = sum_i S[i,j] k[i]
115
+ # d = (v_t - u) * beta_t delta
116
+ # S = S + matmul(k_row, d_row) outer (k⊗d)[i,j] = k[i] d[j]
117
+ # o_t = matmul(S, q_t) o[j] = sum_i S[i,j] q[i]
118
+ #
119
+ # The kernel's 1/√S_v output scale is folded into a SINGLE pre-scale of q
120
+ # (q enters only the output read, never the state, so o[j] = sum_i S[i,j]
121
+ # (scale·q[i]) is exact). Done once on the contiguous q — NOT per-token on
122
+ # o — because a per-token ggml_scale's BACKWARD receives a view-shaped grad
123
+ # from the concat and asserts ggml_is_padded_1d (ggml.c:3392). One scale on
124
+ # the whole tensor keeps the backward grad contiguous.
125
+ #
126
+ # ONE head of the recurrence. q,k,v are the packed [S_v, n_heads, T, 1]
127
+ # projections; g,beta the packed [1, n_heads, T, 1] gates; state0 this
128
+ # head's [S_v,S_v] carry. `head` selects the head; per-token vectors are
129
+ # strided views into the packed tensors (token stride = S_v·n_heads, head
130
+ # base = S_v·head — the ggml [S_v,H,T,B] layout). Returns [S_v, T] for this
131
+ # head; the block concats heads along ne0. n_heads=1/head=0 is the plain
132
+ # single-head case (contiguous per-token, the Phase-4 gate shape).
133
+ def self.recur_unrolled(sess, q, k, v, g, beta, state0, s_v, n_heads, head, n_tokens)
134
+ scale = 1.0 / Math.sqrt(s_v.to_f)
135
+ fbytes = 4 # sizeof(f32)
136
+ tok_stride = s_v * n_heads * fbytes # bytes between this head's tokens
137
+ head_base = s_v * head * fbytes # byte offset to this head's col 0
138
+ gtok_stride = n_heads * fbytes # g/beta [1,H,T,1]: token stride
139
+ ghead_base = head * fbytes
140
+ q_s = TinyNN.tnn_scale(sess, q, scale) # pre-scaled q (contiguous)
141
+ s_mat = state0
142
+ t_out = TinyNN.tnn_null_ptr
143
+ t = 0
144
+ while t < n_tokens
145
+ # Per-token slices: [S_v,1] vectors (S_v contiguous), [1,1] scalars.
146
+ q_t = TinyNN.tnn_view_2d(sess, q_s, s_v, 1, tok_stride, head_base + t * tok_stride)
147
+ k_t = TinyNN.tnn_view_2d(sess, k, s_v, 1, tok_stride, head_base + t * tok_stride)
148
+ v_t = TinyNN.tnn_view_2d(sess, v, s_v, 1, tok_stride, head_base + t * tok_stride)
149
+ g_t = TinyNN.tnn_view_2d(sess, g, 1, 1, gtok_stride, ghead_base + t * gtok_stride)
150
+ b_t = TinyNN.tnn_view_2d(sess, beta, 1, 1, gtok_stride, ghead_base + t * gtok_stride)
151
+
152
+ eg = TinyNN.tnn_exp(sess, g_t) # [1,1]
153
+ s_dec = TinyNN.tnn_mul(sess, s_mat, eg) # [S_v,S_v] * [1,1] bcast
154
+ u = TinyNN.tnn_matmul(sess, s_dec, k_t) # [S_v,1] u[j]
155
+ diff = TinyNN.tnn_sub(sess, v_t, u) # [S_v,1]
156
+ d = TinyNN.tnn_mul(sess, diff, b_t) # [S_v,1] * [1,1] bcast
157
+ k_row = TinyNN.tnn_reshape_2d(sess, k_t, 1, s_v) # [1,S_v]
158
+ d_row = TinyNN.tnn_reshape_2d(sess, d, 1, s_v) # [1,S_v]
159
+ outer = TinyNN.tnn_matmul(sess, k_row, d_row) # [S_v,S_v] [i,j]=k[i]d[j]
160
+ s_mat = TinyNN.tnn_add(sess, s_dec, outer) # state update
161
+ o_t = TinyNN.tnn_matmul(sess, s_mat, q_t) # [S_v,1] o[j] (already scaled)
162
+
163
+ if t == 0
164
+ t_out = o_t
165
+ else
166
+ t_out = TinyNN.tnn_concat(sess, t_out, o_t, 1) # stack along ne1
167
+ end
168
+ t = t + 1
169
+ end
170
+ t_out
171
+ end
172
+
173
+ # Gated output norm: GatedRMSNorm(o, z) = rms_norm(o) * gamma *
174
+ # silu(z). o is the per-head token output (block-sliced from
175
+ # recur); z the output-gate stream; gamma the block's norm weight;
176
+ # eps the Float epsilon. tnn_rms_norm already folds the gamma
177
+ # scale, so this is rms_norm(o,gamma) * silu(z). The normed term
178
+ # drives the shape; silu(z) broadcasts/multiplies. Returns the
179
+ # gated output (input to the block's out projection).
180
+ def self.gated_out(sess, o, z, gamma, eps)
181
+ n = TinyNN.tnn_rms_norm(sess, o, gamma, eps)
182
+ sz = TinyNN.tnn_silu(sess, z)
183
+ TinyNN.tnn_mul(sess, n, sz)
184
+ end
185
+ end
186
+ end
187
+ end
188
+ end
@@ -0,0 +1,37 @@
1
+ # lib/toy/llm/primitives/scalable_softmax.rb — L1 primitive: Scalable-Softmax
2
+ # (SSMax, Nakanishi, arXiv 2501.19399) — anti-attention-fading softmax.
3
+ #
4
+ # Pure module: `self.` methods only. The BLOCK (L2) owns the learned per-head
5
+ # scalar s and computes the SSMax scale; this primitive is the scaled softmax
6
+ # itself. See README.md.
7
+ #
8
+ # Formula (no-bias form, Eq. 11): a = softmax((s*log n) * (q·kᵀ / sqrt(d))),
9
+ # i.e. the usual scaled logits are multiplied by the scalar s*log(n), where n
10
+ # is the number of keys in the causal prefix and s is learnable (init ~0.168).
11
+ # This is exactly the existing scaled-softmax op with a MODIFIED scale:
12
+ # ssmax_scale = (1/sqrt(d)) * s * log(n).
13
+ # The block precomputes ssmax_scale (log(n) for a fixed context length is a
14
+ # CRuby-layer Float constant — no libm in the Spinel runner) and passes it here.
15
+ #
16
+ # Spinel hygiene: no Cfg / no default args. One FFI passthrough to
17
+ # tnn_soft_max_ext. Call via the full module path.
18
+
19
+ module Toy
20
+ module LLM
21
+ module Primitives
22
+ module ScalableSoftmax
23
+ NAME = :scalable_softmax
24
+
25
+ # SSMax-scaled softmax over attention scores. scores is the raw q·kᵀ
26
+ # map; mask the additive attention mask handle (or null); ssmax_scale
27
+ # the block's precomputed (1/sqrt(d))*s*log(n) Float; max_bias the
28
+ # ggml soft_max_ext ALiBi slope (0.0 when unused). Returns the
29
+ # attention-weight map. (Plain softmax falls out when ssmax_scale is
30
+ # the ordinary 1/sqrt(d) — so this also covers vanilla attention.)
31
+ def self.attend(sess, scores, mask, ssmax_scale, max_bias)
32
+ TinyNN.tnn_soft_max_ext(sess, scores, mask, ssmax_scale, max_bias)
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
@@ -29,6 +29,16 @@
29
29
  require_relative "../models/arch"
30
30
  require_relative "../models/transformer_lm_metal"
31
31
  require_relative "../dev/toy_logprobs"
32
+ require_relative "../ffi/tinynn_metal"
33
+
34
+ # toy#90 — Metal teardown drain. See lib/toy/run/infer_metal.rb for the
35
+ # full rationale: ggml-metal asserts at exit (ggml-metal-device.m:618) if
36
+ # a Metal buffer outlives its device, and Spinel has no at_exit, so we
37
+ # call tnn_shutdown_engines explicitly before exit. METAL-ONLY: a no-op
38
+ # elsewhere. RUNTIME-UNVERIFIED on gx10 (Linux) — the Mac gate proves the exit-0.
39
+ def toy_metal_teardown
40
+ TinyNNMetal.tnn_shutdown_engines
41
+ end
32
42
 
33
43
  GGUF = ENV["GGUF"] || "data/smollm2-135m-f32.gguf"
34
44
  TOP_K = (ENV["TOP_K"] || "5").to_i
@@ -65,3 +75,5 @@ while k < top_ids.length
65
75
  puts "logprob: " + top_ids[k].to_s + " " + top_vals[k].to_s
66
76
  k = k + 1
67
77
  end
78
+
79
+ toy_metal_teardown # toy#90: drain Metal residency sets before exit 0
@@ -29,6 +29,22 @@
29
29
  require_relative "../models/arch"
30
30
  require_relative "../models/transformer_lm_metal"
31
31
  require_relative "../io/tokenizer"
32
+ require_relative "../ffi/tinynn_metal"
33
+
34
+ # toy#90 — Metal teardown. ggml-metal asserts at process exit
35
+ # (ggml-metal-device.m:618, [rsets->data count]==0) if any Metal buffer is
36
+ # still alive when its singleton device is freed by the C++ static
37
+ # destructor. This runner never frees its session (it relies on process
38
+ # exit), so without an explicit drain it exits 134 AFTER printing correct
39
+ # output. tnn_shutdown_engines frees every live Metal session's
40
+ # weights_buf (removing it from the residency set), satisfying the assert.
41
+ # Spinel has no at_exit (lib/toy/run/serve.rb:123), so this MUST be called
42
+ # explicitly before every exit that follows lm.load. METAL-ONLY: on
43
+ # CPU/CUDA the registry is empty and this is a no-op-equivalent.
44
+ # RUNTIME-UNVERIFIED on gx10 (Linux) — Mac gate proves the exit-0.
45
+ def toy_metal_teardown
46
+ TinyNNMetal.tnn_shutdown_engines
47
+ end
32
48
 
33
49
  GGUF = ENV["GGUF"] || "data/smollm2-135m-f32.gguf"
34
50
  PROMPT = ENV["PROMPT"] || "Once upon a time"
@@ -115,5 +131,8 @@ else
115
131
  puts "toy-infer: model has no embedded tokenizer; a string prompt cannot " +
116
132
  "be tokenized. Pass numeric token IDs via --prompt-ids (PROMPT_IDS=...) " +
117
133
  "or re-convert with --with-tokenizer."
134
+ toy_metal_teardown # toy#90: lm.load already allocated Metal buffers
118
135
  exit 1
119
136
  end
137
+
138
+ toy_metal_teardown # toy#90: drain Metal residency sets before exit 0
@@ -83,3 +83,10 @@ while step < STEPS
83
83
  puts "step " + (step + 1).to_s + ": loss=" + loss.to_s
84
84
  step = step + 1
85
85
  end
86
+
87
+ # toy#90 — Metal teardown drain. The GPT-2 Metal training session is never
88
+ # explicitly freed; without this the ggml-metal device-free residency assert
89
+ # (ggml-metal-device.m:618) aborts the process (exit 134) AFTER a correct
90
+ # run. Spinel has no at_exit (lib/toy/run/serve.rb:123) so drain explicitly.
91
+ # METAL-ONLY: a no-op on non-Metal backends. RUNTIME-UNVERIFIED on gx10 (Linux) — the Mac gate proves the exit-0.
92
+ TinyNNMetal.tnn_shutdown_engines
@@ -0,0 +1,232 @@
1
+ # lib/toy/run/train_hybrid.rb — Phase 5 capstone: a SELF-CONTAINED from-scratch
2
+ # HYBRID trainer (one attention layer + one Gated-DeltaNet layer), in its OWN
3
+ # Spinel compilation unit.
4
+ #
5
+ # Why its own runner (not the llama engine): pulling GDNBlock alloc/train code
6
+ # into Toy::LLM::Engine::LlamaSeqEngine's unit miscompiles the proven byte-exact
7
+ # attention path on the union pin (landmine #16 family — the same reason
8
+ # toy-train-lora / toy-train-gpt2 are separate binaries). A dedicated unit can't
9
+ # corrupt that path. See docs/roadmap/dragon-gdn-arch-2026-06-20.md (Phase 5).
10
+ #
11
+ # The forward dispatches per layer on a flat INT kind (the LayerSpec seam
12
+ # pattern, monomorphic per call site): KIND_ATTENTION → inline causal
13
+ # self-attention; KIND_GDN → Toy::LLM::Blocks::GDNBlock (Path-B autograd
14
+ # recurrence). All params are flattened into one uniform ptr array so the
15
+ # AdamW opt_step loop never touches two block types in one method.
16
+ #
17
+ # x = get_rows(embed, ids) [d_model, T]
18
+ # x = attention_layer(x) [d_model, T] (KIND_ATTENTION)
19
+ # x = GDNBlock.build_forward(x) [d_model, T] (KIND_GDN)
20
+ # xf = rmsnorm(x, final_gamma) [d_model, T]
21
+ # lgt = matmul(embed, xf) [vocab, T] (tied)
22
+ # loss= cross_entropy(lgt, labels) overfit one fixed batch
23
+ #
24
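+ # Checks that the CE loss DECREASES by more than 0.05 over STEPS and prints a PASS/FAIL line (the exit status is not changed on FAIL) — the heterogeneous trainable stack works.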
25
+
26
+ require_relative "../../toy"
27
+ require_relative "../ffi/tinynn"
28
+ require_relative "../llm/primitives/rms_norm"
29
+ require_relative "../llm/primitives/gdn"
30
+ require_relative "../llm/blocks/gdn_block"
31
+ require_relative "../llm/archs/layer_spec"
32
+
33
+ module Toy
34
+ module LLM
35
+ module Run
36
+ module TrainHybrid
37
+ VOCAB = 16
38
+ DM = 8
39
+ H = 2
40
+ S_V = 4 # H*S_V == DM
41
+ T = 4
42
+ STEPS = 16
43
+ EPS = 1.0e-5
44
+
45
+ def self.fillv(n, seed)
46
+ a = [0.0]; a.pop
47
+ i = 0
48
+ while i < n
49
+ a.push(((((i + seed) * 1103515245 + 12345) % 1000) - 500).to_f * 0.001)
50
+ i = i + 1
51
+ end
52
+ a
53
+ end
54
+
55
+ def self.zeros(n)
56
+ a = [0.0]; a.pop
57
+ i = 0
58
+ while i < n
59
+ a.push(0.0)
60
+ i = i + 1
61
+ end
62
+ a
63
+ end
64
+
65
+ # Inline single-head causal self-attention (no RoPE/GQA — minimal,
66
+ # trainable). Weights arrive as explicit handles. Returns x + Wo·ctx.
67
+ def self.attention_layer(sess, t_x, rn, wq, wk, wv, wo, eps)
68
+ h = Toy::LLM::Primitives::RMSNorm.build(sess, t_x, rn, eps)
69
+ q = TinyNN.tnn_matmul(sess, wq, h) # [DM, T]
70
+ k = TinyNN.tnn_matmul(sess, wk, h) # [DM, T]
71
+ v = TinyNN.tnn_matmul(sess, wv, h) # [DM, T]
72
+ scores = TinyNN.tnn_matmul(sess, k, q) # [T_k, T_q]
73
+ scaled = TinyNN.tnn_scale(sess, scores, 1.0 / Math.sqrt(DM.to_f))
74
+ masked = TinyNN.tnn_diag_mask_inf(sess, scaled, 0)
75
+ attn = TinyNN.tnn_softmax(sess, masked) # [T_k, T_q]
76
+ v_t = TinyNN.tnn_transpose(sess, v) # [T, DM]
77
+ ctx = TinyNN.tnn_matmul(sess, v_t, attn) # [DM, T_q]
78
+ out = TinyNN.tnn_matmul(sess, wo, ctx) # [DM, T]
79
+ TinyNN.tnn_add(sess, t_x, out)
80
+ end
81
+
82
+ def self.run
83
+ sess = TinyNN.tnn_session_new(0)
84
+ TinyNN.tnn_session_set_graph_capacity(sess, 262144)
85
+
86
+ # Flat param arrays (uniform ptr) so opt_step never sees two block types.
87
+ pp = [TinyNN.tnn_null_ptr]; pp.pop
88
+ pm = [TinyNN.tnn_null_ptr]; pm.pop
89
+ pv = [TinyNN.tnn_null_ptr]; pv.pop
90
+
91
+ # reg2/reg1: alloc a weight + matching m/v, register, return the weight.
92
+ embed = reg2(sess, pp, pm, pv, VOCAB, DM) # ne0=DM, ne1=VOCAB
93
+ fnorm = reg1(sess, pp, pm, pv, DM)
94
+ # Attention layer weights.
95
+ a_rn = reg1(sess, pp, pm, pv, DM)
96
+ a_wq = reg2(sess, pp, pm, pv, DM, DM)
97
+ a_wk = reg2(sess, pp, pm, pv, DM, DM)
98
+ a_wv = reg2(sess, pp, pm, pv, DM, DM)
99
+ a_wo = reg2(sess, pp, pm, pv, DM, DM)
100
+
101
+ # GDN layer (its own weights live in ft_weights/ft_m/ft_v; flatten in).
102
+ gblk = Toy::LLM::Blocks::GDNBlock.new
103
+ gblk.alloc_trainable_f32_weights!(sess, DM, S_V, H)
104
+ bi = 0
105
+ while bi < gblk.ft_weights.length
106
+ pp.push(gblk.ft_weights[bi]); pm.push(gblk.ft_m[bi]); pv.push(gblk.ft_v[bi])
107
+ bi = bi + 1
108
+ end
109
+
110
+ # set_param BEFORE finalize (load-bearing order).
111
+ gi = 0
112
+ while gi < pp.length
113
+ TinyNN.tnn_set_param(pp[gi])
114
+ gi = gi + 1
115
+ end
116
+ TinyNN.tnn_finalize_weights(sess)
117
+ gblk.zero_state!(sess)
118
+
119
+ # Init weights + zero moments.
120
+ gi = 0
121
+ while gi < pp.length
122
+ n = TinyNN.tnn_tensor_nelements(pp[gi])
123
+ TinyNN.tnn_upload_from_float_array(sess, pp[gi], fillv(n, gi * 7 + 1), n)
124
+ TinyNN.tnn_zero_tensor(sess, pm[gi])
125
+ TinyNN.tnn_zero_tensor(sess, pv[gi])
126
+ gi = gi + 1
127
+ end
128
+
129
+ # Forward — per-layer INT-kind dispatch (the seam pattern).
130
+ t_tok = TinyNN.tnn_input_1d_i32(sess, T)
131
+ x = TinyNN.tnn_get_rows(sess, embed, t_tok)
132
+ kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION,
133
+ Toy::LLM::Archs::LayerSpec::KIND_GDN]
134
+ li = 0
135
+ while li < kinds.length
136
+ if kinds[li] == Toy::LLM::Archs::LayerSpec::KIND_ATTENTION
137
+ x = attention_layer(sess, x, a_rn, a_wq, a_wk, a_wv, a_wo, EPS)
138
+ else
139
+ x = gblk.build_forward(sess, x, T, EPS)
140
+ end
141
+ li = li + 1
142
+ end
143
+ xf = Toy::LLM::Primitives::RMSNorm.build(sess, x, fnorm, EPS)
144
+ lgt = TinyNN.tnn_matmul(sess, embed, xf) # [VOCAB, T] tied
145
+
146
+ t_labels = TinyNN.tnn_input_2d_f32(sess, T, VOCAB)
147
+ t_hp = TinyNN.tnn_input_1d_f32(sess, 7)
148
+ t_loss = TinyNN.tnn_cross_entropy_loss(sess, lgt, t_labels)
149
+ TinyNN.tnn_set_output(t_loss)
150
+ TinyNN.tnn_set_loss(t_loss)
151
+
152
+ TinyNN.tnn_build_forward_only(sess, t_loss)
153
+ TinyNN.tnn_build_backward(sess)
154
+ gj = 0
155
+ while gj < pp.length
156
+ tg = TinyNN.tnn_tensor_grad(sess, pp[gj])
157
+ to = TinyNN.tnn_opt_step_adamw(sess, pp[gj], tg, pm[gj], pv[gj], t_hp)
158
+ TinyNN.tnn_extend_backward_graph(sess, to)
159
+ gj = gj + 1
160
+ end
161
+ TinyNN.tnn_pin_all_graph_b_nodes(sess)
162
+ TinyNN.tnn_realize_backward(sess)
163
+
164
+ ids = [1, 2, 3, 4]
165
+ labels = zeros(VOCAB * T)
166
+ tt = 0
167
+ while tt < T
168
+ tgt = (ids[tt] + 1) % VOCAB
169
+ labels[tgt + VOCAB * tt] = 1.0
170
+ tt = tt + 1
171
+ end
172
+ hp = [0.02, 0.9, 0.95, 1.0e-8, 0.0, 0.9, 0.95]
173
+
174
+ first_loss = 0.0
175
+ last_loss = 0.0
176
+ s = 0
177
+ while s < STEPS
178
+ if s == 0
179
+ TinyNN.tnn_graph_reset(sess)
180
+ else
181
+ TinyNN.tnn_graph_reset_grads_only(sess)
182
+ end
183
+ TinyNN.upload_int_array(sess, t_tok, ids)
184
+ TinyNN.tnn_upload_from_float_array(sess, t_labels, labels, VOCAB * T)
185
+ TinyNN.tnn_upload_from_float_array(sess, t_hp, hp, 7)
186
+ TinyNN.tnn_compute_backward(sess)
187
+ TinyNN.tnn_download(sess, t_loss)
188
+ lv = TinyNN.tnn_scratch_get(sess, 0)
189
+ if s == 0
190
+ first_loss = lv
191
+ end
192
+ last_loss = lv
193
+ puts "step " + s.to_s + ": loss=" + lv.to_s
194
+ s = s + 1
195
+ end
196
+
197
+ ok = true
198
+ if first_loss != first_loss || last_loss != last_loss
199
+ puts "FAIL: loss is NaN"
200
+ ok = false
201
+ end
202
+ if last_loss >= first_loss - 0.05
203
+ puts "FAIL: loss did not decrease (first=" + first_loss.to_s + " last=" + last_loss.to_s + ")"
204
+ ok = false
205
+ end
206
+ if ok
207
+ puts "HYBRID train smoke PASS: attention+GDN from-scratch hybrid trains — CE loss " +
208
+ first_loss.to_s + " -> " + last_loss.to_s + " over " + STEPS.to_s + " steps"
209
+ else
210
+ puts "HYBRID train smoke FAIL"
211
+ end
212
+ end
213
+
214
+ def self.reg1(sess, pp, pm, pv, n)
215
+ w = TinyNN.tnn_input_1d_f32_persistent(sess, n)
216
+ pp.push(w); pm.push(TinyNN.tnn_input_1d_f32_persistent(sess, n))
217
+ pv.push(TinyNN.tnn_input_1d_f32_persistent(sess, n))
218
+ w
219
+ end
220
+
221
+ def self.reg2(sess, pp, pm, pv, rows, cols)
222
+ w = TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols)
223
+ pp.push(w); pm.push(TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols))
224
+ pv.push(TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols))
225
+ w
226
+ end
227
+ end
228
+ end
229
+ end
230
+ end
231
+
232
+ Toy::LLM::Run::TrainHybrid.run
@@ -225,3 +225,13 @@ if EVENTS.length > 0 && TinyNNMetal.tnn_events_active == 1
225
225
  TinyNNMetal.tnn_events_emit(re.dump)
226
226
  TinyNNMetal.tnn_events_close
227
227
  end
228
+
229
+ # toy#90 — Metal teardown drain. The training session (recipe.fs_cache.sess)
230
+ # is never explicitly freed (the runner relies on process exit), so without
231
+ # this the ggml-metal device-free assert (ggml-metal-device.m:618,
232
+ # [rsets->data count]==0) fires AFTER a correct run, exiting 134. Spinel has
233
+ # no at_exit (lib/toy/run/serve.rb:123) so we drain explicitly here.
234
+ # tnn_shutdown_engines frees every live Metal session's weights_buf
235
+ # (removing it from the residency set) and the CPU write session too.
236
+ # RUNTIME-UNVERIFIED on gx10 (Linux) — Mac gate proves the exit-0.
237
+ TinyNNMetal.tnn_shutdown_engines
data/lib/toy/version.rb CHANGED
@@ -4,8 +4,9 @@
4
4
  # under CRuby).
5
5
  module Toy
6
6
  # Single source of truth: gemspec + `toy --version` + `toy --manifest`
7
- # all read this; README/CHANGELOG/git tag display it as v0.8.0.
8
- # v0.8.0 (2026-06-12) is the first PUBLISHED version (RubyGems).
7
+ # all read this; README/CHANGELOG/git tag display it as v0.9.0.
8
+ # v0.8.0 (2026-06-12) was the first PUBLISHED version (RubyGems);
9
+ # v0.9.0 adds the Dragon / Gated-DeltaNet trainable hybrid arc.
9
10
  # Pre-1.0: not API-stable.
10
- VERSION = "0.8.0".freeze
11
+ VERSION = "0.9.0".freeze
11
12
  end
@@ -16,6 +16,28 @@
16
16
  #include <stdio.h>
17
17
  #include <stdlib.h>
18
18
 
19
+ /* toy#94 — DURABLE GUARD against an int-truncated BYO-pointer.
20
+ *
21
+ * ggml_backend_cuda_buffer_from_ptr is the vendored BYO-pointer entry
22
+ * (vendor-patches/0001-cuda-buffer_from_ptr.patch, which patches BOTH
23
+ * src/ggml-cuda/ggml-cuda.cu AND include/ggml-cuda.h). The SYMBOL is
24
+ * defined in libggml-cuda.a, but `make gem-prep` resets vendor/ggml to
25
+ * GGML_REV (Makefile $(GGML_DIR)/.patched / commit 312fae9) — which
26
+ * silently drops the *header* declaration while a previously-built
27
+ * archive still carries the symbol. With no prototype in scope, C
28
+ * treats the call as an implicit declaration returning `int`: on
29
+ * aarch64 (GB10) the 64-bit ggml_backend_buffer_t is TRUNCATED to 32
30
+ * bits. The truncated pointer is non-NULL, so the !buf check passes,
31
+ * and the next ggml_backend_buffer_get_base() dereferences garbage →
32
+ * SIGSEGV in the Phase-2 mmap weight-attach (the toy#94 stack:
33
+ * ggml_backend_buffer_get_base <- tnn_session_attach_weight_mmap <-
34
+ * realize_for_mmap). Declaring it here keeps a correct 64-bit prototype
35
+ * in scope REGARDLESS of the vendored header's post-reset state, so the
36
+ * pointer can never be truncated. (Identical to the header decl when
37
+ * the patch is applied; harmless redundancy.) */
38
+ GGML_BACKEND_API ggml_backend_buffer_t
39
+ ggml_backend_cuda_buffer_from_ptr(void *host_ptr, size_t size, int device);
40
+
19
41
  ggml_backend_t tnn_backend_cuda_init_internal(void)
20
42
  {
21
43
  return ggml_backend_cuda_init(0);