toy 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +31 -0
  3. data/Makefile +211 -5
  4. data/README.md +1 -1
  5. data/lib/toy/compute.rb +9 -0
  6. data/lib/toy/compute_cuda.rb +8 -0
  7. data/lib/toy/compute_metal.rb +17 -0
  8. data/lib/toy/core/cli/new.rb +8 -0
  9. data/lib/toy/ffi/tinynn.rb +19 -0
  10. data/lib/toy/ffi/tinynn_cuda.rb +7 -0
  11. data/lib/toy/ffi/tinynn_metal.rb +5 -0
  12. data/lib/toy/llm/archs/layer_spec.rb +39 -0
  13. data/lib/toy/llm/archs/llama_arch.rb +62 -1
  14. data/lib/toy/llm/archs/llama_arch_cuda.rb +62 -1
  15. data/lib/toy/llm/archs/llama_arch_metal.rb +62 -1
  16. data/lib/toy/llm/blocks/gdn_block.rb +176 -0
  17. data/lib/toy/llm/engine/gpt2_kv_engine.rb +11 -0
  18. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +11 -0
  19. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +11 -0
  20. data/lib/toy/llm/engine/llama_kv_engine.rb +10 -2
  21. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +10 -2
  22. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +10 -2
  23. data/lib/toy/llm/engine/llama_seq_engine.rb +16 -1
  24. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +16 -1
  25. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +16 -1
  26. data/lib/toy/llm/primitives/depth_scale.rb +33 -0
  27. data/lib/toy/llm/primitives/diff_attention.rb +71 -0
  28. data/lib/toy/llm/primitives/gdn.rb +188 -0
  29. data/lib/toy/llm/primitives/scalable_softmax.rb +37 -0
  30. data/lib/toy/run/eval_metal.rb +12 -0
  31. data/lib/toy/run/infer_metal.rb +19 -0
  32. data/lib/toy/run/train_gpt2_metal.rb +7 -0
  33. data/lib/toy/run/train_hybrid.rb +232 -0
  34. data/lib/toy/run/train_metal.rb +10 -0
  35. data/lib/toy/version.rb +4 -3
  36. data/tinynn/tinynn_backend_cuda.c +22 -0
  37. data/tinynn/tinynn_ggml.c +231 -0
  38. metadata +9 -2
@@ -71,6 +71,21 @@ module Toy; module LLM; module Archs
71
71
  class LlamaArch
72
72
  attr_accessor :t_seq_token_embed, :t_seq_final_norm_gamma, :t_seq_output,
73
73
  :t_seq_w_proj, :seq_blocks_ffi,
74
+ # Phase 3 — per-layer descriptor array, parallel to
75
+ # seq_blocks_ffi (same length == n_layers).
76
+ :seq_layer_specs,
77
+ # Phase 5 — the dispatch key is a plain INT array (one kind per
78
+ # layer), NOT LayerSpec.kind reads: constructing/mutating
79
+ # LayerSpec objects on a realize path trips a Spinel codegen
80
+ # miscompile (corrupts the token-id finalize). Mutating a plain
81
+ # int array element is proven-safe. build_forward dispatches on
82
+ # this; LayerSpec stays the descriptor type/constants home.
83
+ :seq_layer_kinds,
84
+ # Phase 5 — parallel GDN-block array (same length; entry is a
85
+ # GDNBlock at every position: used at KIND_GDN layers, a never-invoked placeholder elsewhere). The KIND_GDN
86
+ # dispatch arm calls into THIS array — a concrete typed call,
87
+ # so the seam stays monomorphic per call site.
88
+ :seq_gdn_blocks_ffi,
74
89
  # Orchestration-gating carriers — bare cache ivars with
75
90
  # no accessor before P2.5. The lens-branch guard reads
76
91
  # seq_donor_d_in; the shared ctx reads seq_rope_cfg.
@@ -85,6 +100,15 @@ module Toy; module LLM; module Archs
85
100
  @t_seq_w_proj = TinyNNCuda.tnn_null_ptr
86
101
  # Seed with one block — matches the former cache init (L112).
87
102
  @seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
103
+ # Phase 3 — parallel seed: one attention spec for the seed block.
104
+ @seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
105
+ # Phase 5 — parallel int dispatch keys (KIND_ATTENTION for the seed).
106
+ @seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
107
+ # Phase 5 — parallel GDN-block slots. Seeded with GDNBlock placeholders so
108
+ # the array is MONOMORPHIC (all GDNBlock) — the seam's KIND_GDN call site
109
+ # never sees a mixed null/object array (Spinel poly-array landmine). At
110
+ # KIND_ATTENTION layers the placeholder is simply never invoked.
111
+ @seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
88
112
  @seq_donor_d_in = 0
89
113
  # The cache overwrites seq_rope_cfg with the real RoPE::Cfg before
90
114
  # build_forward runs (each realize prologue rebuilds it).
@@ -101,11 +125,33 @@ module Toy; module LLM; module Archs
101
125
  # already constructs TransformerBlock.new there, so no new class /
102
126
  # Struct / FFI :str at class load. Each realize path now calls this
103
127
  # via the cache's seq_blocks_ffi delegator chain (self.seq_arch).
128
+ # Phase 5 hybrid — rebuild the per-layer spec array from a per-layer GDN
129
+ # bool flag, using the LayerSpec CTOR (never the .kind= setter: mutating
130
+ # LayerSpec.kind elsewhere while build_forward reads it trips a Spinel
131
+ # codegen miscompile that corrupts the token-id finalize). Called after
132
+ # seed_blocks!, before alloc.
133
+ # Mark ONE layer as GDN. Takes an INT index (never an array param — a
134
+ # function-parameter array trips the Spinel #688 type-lock landmine, which
135
+ # here manifests as a token-id-finalize codegen miscompile). Mutates the
136
+ # plain int dispatch array element (proven-safe).
137
+ def set_gdn_layer!(idx)
138
+ @seq_layer_kinds[idx] = Toy::LLM::Archs::LayerSpec::KIND_GDN
139
+ end
140
+
104
141
  def seed_blocks!(n_layers)
105
142
  @seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
143
+ # Phase 3 — seed the parallel spec array in lockstep. Every layer is
144
+ # KIND_ATTENTION for now (the homogeneous-Llama refactor gate); Phase 5
145
+ # overwrites individual entries with KIND_GDN for Dragon's pattern.
146
+ @seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
147
+ @seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
148
+ @seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
106
149
  li_init = 1
107
150
  while li_init < n_layers
108
151
  @seq_blocks_ffi.push(Toy::LLM::Blocks::TransformerBlock.new)
152
+ @seq_layer_specs.push(Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION))
153
+ @seq_gdn_blocks_ffi.push(Toy::LLM::Blocks::GDNBlock.new)
154
+ @seq_layer_kinds.push(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)
109
155
  li_init = li_init + 1
110
156
  end
111
157
  end
@@ -217,7 +263,22 @@ module Toy; module LLM; module Archs
217
263
  end
218
264
  li_g = 0
219
265
  while li_g < seq_n_layers
220
- t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
266
+ # Phase 3 — per-layer descriptor dispatch. The branch compares a FLAT
267
+ # INT (seq_layer_kinds[li_g]) and each arm calls a CONCRETE typed block
268
+ # method, so every .build_forward call site stays monomorphic (one
269
+ # receiver class). KIND_ATTENTION calls into seq_blocks_ffi; KIND_GDN
270
+ # (Phase 5) calls into its own typed array, seq_gdn_blocks_ffi. Unknown kinds fail
271
+ # loud rather than silently building the wrong graph (never-mask rule).
272
+ spec_kind = self.seq_layer_kinds[li_g]
273
+ if spec_kind == Toy::LLM::Archs::LayerSpec::KIND_ATTENTION
274
+ t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
275
+ elsif spec_kind == Toy::LLM::Archs::LayerSpec::KIND_GDN
276
+ # Concrete typed call into the parallel GDN array — the GDN block reads
277
+ # its own dims (set at alloc); seq_t/eps come from the shared ctx.
278
+ t_cur = self.seq_gdn_blocks_ffi[li_g].build_forward(sess, t_cur, seq_t, eps)
279
+ else
280
+ raise "LlamaArch#build_forward: unsupported layer kind #{spec_kind} at layer #{li_g}"
281
+ end
221
282
  li_g = li_g + 1
222
283
  end
223
284
 
@@ -71,6 +71,21 @@ module Toy; module LLM; module Archs
71
71
  class LlamaArch
72
72
  attr_accessor :t_seq_token_embed, :t_seq_final_norm_gamma, :t_seq_output,
73
73
  :t_seq_w_proj, :seq_blocks_ffi,
74
+ # Phase 3 — per-layer descriptor array, parallel to
75
+ # seq_blocks_ffi (same length == n_layers).
76
+ :seq_layer_specs,
77
+ # Phase 5 — the dispatch key is a plain INT array (one kind per
78
+ # layer), NOT LayerSpec.kind reads: constructing/mutating
79
+ # LayerSpec objects on a realize path trips a Spinel codegen
80
+ # miscompile (corrupts the token-id finalize). Mutating a plain
81
+ # int array element is proven-safe. build_forward dispatches on
82
+ # this; LayerSpec stays the descriptor type/constants home.
83
+ :seq_layer_kinds,
84
+ # Phase 5 — parallel GDN-block array (same length; entry is a
85
+ # GDNBlock at every position: used at KIND_GDN layers, a never-invoked placeholder elsewhere). The KIND_GDN
86
+ # dispatch arm calls into THIS array — a concrete typed call,
87
+ # so the seam stays monomorphic per call site.
88
+ :seq_gdn_blocks_ffi,
74
89
  # Orchestration-gating carriers — bare cache ivars with
75
90
  # no accessor before P2.5. The lens-branch guard reads
76
91
  # seq_donor_d_in; the shared ctx reads seq_rope_cfg.
@@ -85,6 +100,15 @@ module Toy; module LLM; module Archs
85
100
  @t_seq_w_proj = TinyNNMetal.tnn_null_ptr
86
101
  # Seed with one block — matches the former cache init (L112).
87
102
  @seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
103
+ # Phase 3 — parallel seed: one attention spec for the seed block.
104
+ @seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
105
+ # Phase 5 — parallel int dispatch keys (KIND_ATTENTION for the seed).
106
+ @seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
107
+ # Phase 5 — parallel GDN-block slots. Seeded with GDNBlock placeholders so
108
+ # the array is MONOMORPHIC (all GDNBlock) — the seam's KIND_GDN call site
109
+ # never sees a mixed null/object array (Spinel poly-array landmine). At
110
+ # KIND_ATTENTION layers the placeholder is simply never invoked.
111
+ @seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
88
112
  @seq_donor_d_in = 0
89
113
  # The cache overwrites seq_rope_cfg with the real RoPE::Cfg before
90
114
  # build_forward runs (each realize prologue rebuilds it).
@@ -101,11 +125,33 @@ module Toy; module LLM; module Archs
101
125
  # already constructs TransformerBlock.new there, so no new class /
102
126
  # Struct / FFI :str at class load. Each realize path now calls this
103
127
  # via the cache's seq_blocks_ffi delegator chain (self.seq_arch).
128
+ # Phase 5 hybrid — rebuild the per-layer spec array from a per-layer GDN
129
+ # bool flag, using the LayerSpec CTOR (never the .kind= setter: mutating
130
+ # LayerSpec.kind elsewhere while build_forward reads it trips a Spinel
131
+ # codegen miscompile that corrupts the token-id finalize). Called after
132
+ # seed_blocks!, before alloc.
133
+ # Mark ONE layer as GDN. Takes an INT index (never an array param — a
134
+ # function-parameter array trips the Spinel #688 type-lock landmine, which
135
+ # here manifests as a token-id-finalize codegen miscompile). Mutates the
136
+ # plain int dispatch array element (proven-safe).
137
+ def set_gdn_layer!(idx)
138
+ @seq_layer_kinds[idx] = Toy::LLM::Archs::LayerSpec::KIND_GDN
139
+ end
140
+
104
141
  def seed_blocks!(n_layers)
105
142
  @seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
143
+ # Phase 3 — seed the parallel spec array in lockstep. Every layer is
144
+ # KIND_ATTENTION for now (the homogeneous-Llama refactor gate); Phase 5
145
+ # overwrites individual entries with KIND_GDN for Dragon's pattern.
146
+ @seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
147
+ @seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
148
+ @seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
106
149
  li_init = 1
107
150
  while li_init < n_layers
108
151
  @seq_blocks_ffi.push(Toy::LLM::Blocks::TransformerBlock.new)
152
+ @seq_layer_specs.push(Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION))
153
+ @seq_gdn_blocks_ffi.push(Toy::LLM::Blocks::GDNBlock.new)
154
+ @seq_layer_kinds.push(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)
109
155
  li_init = li_init + 1
110
156
  end
111
157
  end
@@ -217,7 +263,22 @@ module Toy; module LLM; module Archs
217
263
  end
218
264
  li_g = 0
219
265
  while li_g < seq_n_layers
220
- t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
266
+ # Phase 3 — per-layer descriptor dispatch. The branch compares a FLAT
267
+ # INT (seq_layer_kinds[li_g]) and each arm calls a CONCRETE typed block
268
+ # method, so every .build_forward call site stays monomorphic (one
269
+ # receiver class). KIND_ATTENTION calls into seq_blocks_ffi; KIND_GDN
270
+ # (Phase 5) calls into its own typed array, seq_gdn_blocks_ffi. Unknown kinds fail
271
+ # loud rather than silently building the wrong graph (never-mask rule).
272
+ spec_kind = self.seq_layer_kinds[li_g]
273
+ if spec_kind == Toy::LLM::Archs::LayerSpec::KIND_ATTENTION
274
+ t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
275
+ elsif spec_kind == Toy::LLM::Archs::LayerSpec::KIND_GDN
276
+ # Concrete typed call into the parallel GDN array — the GDN block reads
277
+ # its own dims (set at alloc); seq_t/eps come from the shared ctx.
278
+ t_cur = self.seq_gdn_blocks_ffi[li_g].build_forward(sess, t_cur, seq_t, eps)
279
+ else
280
+ raise "LlamaArch#build_forward: unsupported layer kind #{spec_kind} at layer #{li_g}"
281
+ end
221
282
  li_g = li_g + 1
222
283
  end
223
284
 
@@ -0,0 +1,176 @@
1
+ # lib/toy/llm/blocks/gdn_block.rb — L2 block: a TRAINABLE Gated-DeltaNet layer
2
+ # (Dragon / Qwen3-Next linear-attention mixer), the KIND_GDN counterpart of the
3
+ # attention TransformerBlock. Composes the L1 GDN primitives around its own
4
+ # projection weights and the autograd-differentiable recurrence
5
+ # (Toy::LLM::Primitives::GDN.recur_unrolled, Phase 4 / Path B) — so the whole
6
+ # layer trains with NO hand-written kernel backward.
7
+ #
8
+ # DEFERRED (Phase 5 minimal-trainable scope; revisit for Dragon bit-match): the
9
+ # short causal conv on q/k/v (ggml_conv_1d is FFI-wired from Phase 1) and any
10
+ # Dragon-exact stream layout. This block proves a GDN layer is a correct,
11
+ # trainable residual unit; it is not yet a bit-faithful Dragon block.
12
+ #
13
+ # Shapes (single seq, B=1):
14
+ # x [d_model, T]
15
+ # h = rmsnorm [d_model, T]
16
+ # q/k/v/z = W·h -> [S_v*H, T] (W : [d_model, S_v*H])
17
+ # a/b = W·h -> [H, T] (W : [d_model, H]) ; per-head scalars
18
+ # per head h: recur_unrolled(qn,kn,v,g,beta, state0_h) -> [S_v, T]
19
+ # o concat heads -> [S_v*H, T]
20
+ # gated = GatedRMSNorm(o, z) -> [S_v*H, T]
21
+ # out = W_o·gated -> [d_model, T] ; residual = x + out
22
+ #
23
+ # Spinel hygiene: hand-written positional class, NEVER Struct.new (landmine #16);
24
+ # no Cfg ctor / default args (landmine #4); no Card/step_bind/FFI :str at class
25
+ # load. This file does NOT require_relative "tinynn" (the loader picks the
26
+ # backend before requiring this block, as for the L1 primitives + L2 attention
27
+ # block).
28
+
29
+ module Toy; module LLM; module Blocks
30
+ class GDNBlock
31
+ attr_accessor :t_rn_gamma,
32
+ :t_w_q, :t_w_k, :t_w_v, :t_w_z, :t_w_a, :t_w_b,
33
+ :t_a_log, :t_dt_bias, :t_go_gamma, :t_w_o,
34
+ :t_state0,
35
+ # Block dims (set at alloc) so build_forward needs only
36
+ # (sess, t_x, seq_t, eps), not per-call dims — the seam's
37
+ # KIND_GDN dispatch arm stays a single concrete typed call.
38
+ :gdn_d_model, :gdn_s_v, :gdn_n_heads,
39
+ # F3 full-finetune parallel arrays (weight, m, v) — same
40
+ # convention as TransformerBlock so the engine's opt_step
41
+ # walker reaches them by name.
42
+ :ft_weights, :ft_m, :ft_v
43
+
44
+ def initialize
45
+ @gdn_d_model = 0; @gdn_s_v = 0; @gdn_n_heads = 0
46
+ @t_rn_gamma = TinyNN.tnn_null_ptr
47
+ @t_w_q = TinyNN.tnn_null_ptr; @t_w_k = TinyNN.tnn_null_ptr; @t_w_v = TinyNN.tnn_null_ptr
48
+ @t_w_z = TinyNN.tnn_null_ptr; @t_w_a = TinyNN.tnn_null_ptr; @t_w_b = TinyNN.tnn_null_ptr
49
+ @t_a_log = TinyNN.tnn_null_ptr; @t_dt_bias = TinyNN.tnn_null_ptr
50
+ @t_go_gamma = TinyNN.tnn_null_ptr; @t_w_o = TinyNN.tnn_null_ptr
51
+ @t_state0 = TinyNN.tnn_null_ptr
52
+ @ft_weights = [TinyNN.tnn_null_ptr]; @ft_weights.pop
53
+ @ft_m = [TinyNN.tnn_null_ptr]; @ft_m.pop
54
+ @ft_v = [TinyNN.tnn_null_ptr]; @ft_v.pop
55
+ end
56
+
57
+ # Allocate the block's trainable persistent F32 weights + their Adam moments
58
+ # (parallel ft_weights/ft_m/ft_v arrays, populated in lockstep so the engine
59
+ # / a train loop can opt_step generically). d_model is the residual width;
60
+ # n_heads × s_v = the GDN inner width. state0 is a zeroed [s_v, s_v*n_heads]
61
+ # constant carry (one [s_v,s_v] block per head), NOT a param. Each weight's
62
+ # m/v match its shape (opt_step_adamw asserts same-shape).
63
+ def alloc_trainable_f32_weights!(sess, d_model, s_v, n_heads)
64
+ @gdn_d_model = d_model; @gdn_s_v = s_v; @gdn_n_heads = n_heads
65
+ inner = s_v * n_heads
66
+ # W : [d_model, out] (matmul(W, h) contracts ne0=d_model -> [out, T]).
67
+ # input_2d_f32_persistent(rows, cols) -> ne0=cols, ne1=rows, so pass
68
+ # (out, d_model) to get ne0=d_model, ne1=out.
69
+ @t_rn_gamma = reg1(sess, d_model)
70
+ @t_w_q = reg2(sess, inner, d_model)
71
+ @t_w_k = reg2(sess, inner, d_model)
72
+ @t_w_v = reg2(sess, inner, d_model)
73
+ @t_w_z = reg2(sess, inner, d_model)
74
+ @t_w_a = reg2(sess, n_heads, d_model)
75
+ @t_w_b = reg2(sess, n_heads, d_model)
76
+ @t_a_log = reg4(sess, 1, n_heads, 1, 1)
77
+ @t_dt_bias = reg4(sess, 1, n_heads, 1, 1)
78
+ @t_go_gamma = reg1(sess, inner)
79
+ @t_w_o = reg2(sess, d_model, inner)
80
+ # Constant zero initial state (NOT registered as a trainable param).
81
+ @t_state0 = TinyNN.tnn_input_2d_f32_persistent(sess, s_v, s_v * n_heads)
82
+ end
83
+
84
+ # reg{1,2,4}: alloc a weight of the given rank + matching m/v, push the
85
+ # triple into ft_weights/ft_m/ft_v, return the weight handle.
86
+ def reg1(sess, n)
87
+ w = TinyNN.tnn_input_1d_f32_persistent(sess, n)
88
+ @ft_weights.push(w)
89
+ @ft_m.push(TinyNN.tnn_input_1d_f32_persistent(sess, n))
90
+ @ft_v.push(TinyNN.tnn_input_1d_f32_persistent(sess, n))
91
+ w
92
+ end
93
+
94
+ def reg2(sess, rows, cols)
95
+ w = TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols)
96
+ @ft_weights.push(w)
97
+ @ft_m.push(TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols))
98
+ @ft_v.push(TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols))
99
+ w
100
+ end
101
+
102
+ def reg4(sess, a, b, c, d)
103
+ w = TinyNN.tnn_input_4d_f32_persistent(sess, a, b, c, d)
104
+ @ft_weights.push(w)
105
+ @ft_m.push(TinyNN.tnn_input_4d_f32_persistent(sess, a, b, c, d))
106
+ @ft_v.push(TinyNN.tnn_input_4d_f32_persistent(sess, a, b, c, d))
107
+ w
108
+ end
109
+
110
+ # Mark every weight in ft_weights a trainable param. Call BEFORE finalize_weights
111
+ # (load-bearing order, gpt2_seq_engine.rb:128). a_log + dt_bias ARE trained
112
+ # (per-head decay shape); state0 is NOT (it is not in ft_weights).
113
+ def set_params!
114
+ wi = 0
115
+ while wi < @ft_weights.length
116
+ TinyNN.tnn_set_param(@ft_weights[wi])
117
+ wi = wi + 1
118
+ end
119
+ end
120
+
121
+ # Zero the constant initial state (after finalize_weights).
122
+ def zero_state!(sess)
123
+ TinyNN.tnn_zero_tensor(sess, @t_state0)
124
+ end
125
+
126
+ # Forward: residual update for x [d_model, T] (B=1). Returns [d_model, T].
127
+ # Dims (d_model/s_v/n_heads) come from self (set at alloc) so this matches the
128
+ # seam's per-layer call shape; seq_t/eps arrive from the forward ctx.
129
+ def build_forward(sess, t_x, seq_t, eps)
130
+ d_model = @gdn_d_model
131
+ s_v = @gdn_s_v
132
+ n_heads = @gdn_n_heads
133
+ fbytes = 4
134
+ h = Toy::LLM::Primitives::RMSNorm.build(sess, t_x, @t_rn_gamma, eps)
135
+
136
+ q2 = TinyNN.tnn_matmul(sess, @t_w_q, h) # [S_v*H, T]
137
+ k2 = TinyNN.tnn_matmul(sess, @t_w_k, h)
138
+ v2 = TinyNN.tnn_matmul(sess, @t_w_v, h)
139
+ z2 = TinyNN.tnn_matmul(sess, @t_w_z, h) # [S_v*H, T] output gate
140
+ a2 = TinyNN.tnn_matmul(sess, @t_w_a, h) # [H, T] decay stream
141
+ b2 = TinyNN.tnn_matmul(sess, @t_w_b, h) # [H, T] update stream
142
+
143
+ # Reshape projections into the recurrence's packed [S_v, H, T] / [1, H, T].
144
+ q3 = TinyNN.tnn_reshape_3d(sess, q2, s_v, n_heads, seq_t)
145
+ k3 = TinyNN.tnn_reshape_3d(sess, k2, s_v, n_heads, seq_t)
146
+ v3 = TinyNN.tnn_reshape_3d(sess, v2, s_v, n_heads, seq_t)
147
+ a3 = TinyNN.tnn_reshape_3d(sess, a2, 1, n_heads, seq_t)
148
+ b3 = TinyNN.tnn_reshape_3d(sess, b2, 1, n_heads, seq_t)
149
+
150
+ qn = Toy::LLM::Primitives::GDN.l2_train(sess, q3, eps)
151
+ kn = Toy::LLM::Primitives::GDN.l2_train(sess, k3, eps)
152
+ g = Toy::LLM::Primitives::GDN.decay_gate(sess, a3, @t_dt_bias, @t_a_log)
153
+ bt = Toy::LLM::Primitives::GDN.update_gate_train(sess, b3)
154
+
155
+ # Per-head recurrence; concat head outputs along ne0 -> [S_v*H, T].
156
+ o = TinyNN.tnn_null_ptr
157
+ hh = 0
158
+ while hh < n_heads
159
+ st_h = TinyNN.tnn_view_2d(sess, @t_state0, s_v, s_v,
160
+ s_v * fbytes, hh * s_v * s_v * fbytes)
161
+ o_h = Toy::LLM::Primitives::GDN.recur_unrolled(sess, qn, kn, v3, g, bt,
162
+ st_h, s_v, n_heads, hh, seq_t)
163
+ if hh == 0
164
+ o = o_h
165
+ else
166
+ o = TinyNN.tnn_concat(sess, o, o_h, 0)
167
+ end
168
+ hh = hh + 1
169
+ end
170
+
171
+ gated = Toy::LLM::Primitives::GDN.gated_out(sess, o, z2, @t_go_gamma, eps)
172
+ out = TinyNN.tnn_matmul(sess, @t_w_o, gated) # [d_model, T]
173
+ TinyNN.tnn_add(sess, t_x, out)
174
+ end
175
+ end
176
+ end; end; end
@@ -153,6 +153,17 @@ class GPT2KVFFICache
153
153
  # Build the compute graph for one decode position. Returns the logits
154
154
  # tensor handle. Caller calls tnn_compute then download_row_major.
155
155
  def build_decode_step(pos)
156
+ # The per-head K/V buffers are sized for @max_T positions. Writing /
157
+ # reading slot `pos` requires pos < @max_T; at pos == @max_T the
158
+ # cpy-into-view and history views overrun the cache allocation and
159
+ # ggml aborts deep inside ggml_view_2d. Fail loud here with a
160
+ # toy-level message instead (see toy#99).
161
+ if pos >= @max_T
162
+ raise "GPT2KVFFICache: decode pos=" + pos.to_s +
163
+ " exceeds KV cache capacity max_T=" + @max_T.to_s +
164
+ " (size the cache >= prompt_len + n_generate via realize_for)"
165
+ end
166
+
156
167
  eps = 1.0e-5
157
168
  scale = 1.0 / Math.sqrt(@d_head.to_f)
158
169
  d_model = @d_model
@@ -155,6 +155,17 @@ class GPT2KVFFICacheCuda
155
155
  # Build the compute graph for one decode position. Returns the logits
156
156
  # tensor handle. Caller calls tnn_compute then download_row_major.
157
157
  def build_decode_step(pos)
158
+ # The per-head K/V buffers are sized for @max_T positions. Writing /
159
+ # reading slot `pos` requires pos < @max_T; at pos == @max_T the
160
+ # cpy-into-view and history views overrun the cache allocation and
161
+ # ggml aborts deep inside ggml_view_2d. Fail loud here with a
162
+ # toy-level message instead (see toy#99).
163
+ if pos >= @max_T
164
+ raise "GPT2KVFFICacheCuda: decode pos=" + pos.to_s +
165
+ " exceeds KV cache capacity max_T=" + @max_T.to_s +
166
+ " (size the cache >= prompt_len + n_generate via realize_for)"
167
+ end
168
+
158
169
  eps = 1.0e-5
159
170
  scale = 1.0 / Math.sqrt(@d_head.to_f)
160
171
  d_model = @d_model
@@ -155,6 +155,17 @@ class GPT2KVFFICacheMetal
155
155
  # Build the compute graph for one decode position. Returns the logits
156
156
  # tensor handle. Caller calls tnn_compute then download_row_major.
157
157
  def build_decode_step(pos)
158
+ # The per-head K/V buffers are sized for @max_T positions. Writing /
159
+ # reading slot `pos` requires pos < @max_T; at pos == @max_T the
160
+ # cpy-into-view and history views overrun the cache allocation and
161
+ # ggml aborts deep inside ggml_view_2d. Fail loud here with a
162
+ # toy-level message instead (see toy#99).
163
+ if pos >= @max_T
164
+ raise "GPT2KVFFICacheMetal: decode pos=" + pos.to_s +
165
+ " exceeds KV cache capacity max_T=" + @max_T.to_s +
166
+ " (size the cache >= prompt_len + n_generate via realize_for)"
167
+ end
168
+
158
169
  eps = 1.0e-5
159
170
  scale = 1.0 / Math.sqrt(@d_head.to_f)
160
171
  d_model = @d_model
@@ -874,7 +874,7 @@ class SmolLM2KVFFICache
874
874
  if is_native
875
875
  wtype = GGUFLoad.detect_weight_type(gguf_path)
876
876
  set_weight_type(wtype)
877
- realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias)
877
+ realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias, flags.qk_norm)
878
878
  puts " BYO-pointer mmap (weight_type=" + wtype.to_s + ")"
879
879
  gguf
880
880
  else
@@ -1584,7 +1584,15 @@ module SmolLM2KV
1584
1584
  TinyNN.tnn_reset_for_rebuild(kv_cache.sess)
1585
1585
  step = kv_cache.build_decode_step(pos)
1586
1586
  TinyNN.tnn_realize(kv_cache.sess, step.kv_step_logits)
1587
- TinyNN.upload_int_array(kv_cache.sess, step.t_token_id, [token_id])
1587
+ # Spinel landmine: in whole-program inference contexts where `token_id`
1588
+ # poly-collapses to sp_RbVal (e.g. the eval runner, where generate's
1589
+ # sampler-fed `last_id` unifies decode_step's param to RbVal), the literal
1590
+ # `[token_id]` compiles to a PolyArray. upload_int_array takes :int_array
1591
+ # (sp_IntArray), so the PolyArray is then mis-read as an IntArray → garbage
1592
+ # length → ggml "tensor write out of bounds" abort. Narrowing to a clean
1593
+ # mrb_int via `.to_i` forces the IntArray codegen (as `[pos]` already gets).
1594
+ tid = token_id.to_i
1595
+ TinyNN.upload_int_array(kv_cache.sess, step.t_token_id, [tid])
1588
1596
  TinyNN.upload_int_array(kv_cache.sess, step.t_pos, [pos])
1589
1597
  TinyNN.tnn_compute(kv_cache.sess)
1590
1598
  kv_cache.dump_trace
@@ -862,7 +862,7 @@ class SmolLM2KVFFICacheCuda
862
862
  if is_native
863
863
  wtype = GGUFLoad.detect_weight_type(gguf_path)
864
864
  set_weight_type(wtype)
865
- realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias)
865
+ realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias, flags.qk_norm)
866
866
  puts " BYO-pointer mmap (weight_type=" + wtype.to_s + ")"
867
867
  gguf
868
868
  else
@@ -1517,7 +1517,15 @@ module SmolLM2KVCuda
1517
1517
  TinyNNCuda.tnn_reset_for_rebuild(kv_cache.sess)
1518
1518
  step = kv_cache.build_decode_step(pos)
1519
1519
  TinyNNCuda.tnn_realize(kv_cache.sess, step.kv_step_logits)
1520
- TinyNNCuda.upload_int_array(kv_cache.sess, step.t_token_id, [token_id])
1520
+ # Spinel landmine: in whole-program inference contexts where `token_id`
1521
+ # poly-collapses to sp_RbVal (e.g. the eval runner, where generate's
1522
+ # sampler-fed `last_id` unifies decode_step's param to RbVal), the literal
1523
+ # `[token_id]` compiles to a PolyArray. upload_int_array takes :int_array
1524
+ # (sp_IntArray), so the PolyArray is then mis-read as an IntArray → garbage
1525
+ # length → ggml "tensor write out of bounds" abort. Narrowing to a clean
1526
+ # mrb_int via `.to_i` forces the IntArray codegen (as `[pos]` already gets).
1527
+ tid = token_id.to_i
1528
+ TinyNNCuda.upload_int_array(kv_cache.sess, step.t_token_id, [tid])
1521
1529
  TinyNNCuda.upload_int_array(kv_cache.sess, step.t_pos, [pos])
1522
1530
  TinyNNCuda.tnn_compute(kv_cache.sess)
1523
1531
  kv_cache.dump_trace
@@ -862,7 +862,7 @@ class SmolLM2KVFFICacheMetal
862
862
  if is_native
863
863
  wtype = GGUFLoad.detect_weight_type(gguf_path)
864
864
  set_weight_type(wtype)
865
- realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias)
865
+ realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias, flags.qk_norm)
866
866
  puts " BYO-pointer mmap (weight_type=" + wtype.to_s + ")"
867
867
  gguf
868
868
  else
@@ -1517,7 +1517,15 @@ module SmolLM2KVMetal
1517
1517
  TinyNNMetal.tnn_reset_for_rebuild(kv_cache.sess)
1518
1518
  step = kv_cache.build_decode_step(pos)
1519
1519
  TinyNNMetal.tnn_realize(kv_cache.sess, step.kv_step_logits)
1520
- TinyNNMetal.upload_int_array(kv_cache.sess, step.t_token_id, [token_id])
1520
+ # Spinel landmine: in whole-program inference contexts where `token_id`
1521
+ # poly-collapses to sp_RbVal (e.g. the eval runner, where generate's
1522
+ # sampler-fed `last_id` unifies decode_step's param to RbVal), the literal
1523
+ # `[token_id]` compiles to a PolyArray. upload_int_array takes :int_array
1524
+ # (sp_IntArray), so the PolyArray is then mis-read as an IntArray → garbage
1525
+ # length → ggml "tensor write out of bounds" abort. Narrowing to a clean
1526
+ # mrb_int via `.to_i` forces the IntArray codegen (as `[pos]` already gets).
1527
+ tid = token_id.to_i
1528
+ TinyNNMetal.upload_int_array(kv_cache.sess, step.t_token_id, [tid])
1521
1529
  TinyNNMetal.upload_int_array(kv_cache.sess, step.t_pos, [pos])
1522
1530
  TinyNNMetal.tnn_compute(kv_cache.sess)
1523
1531
  kv_cache.dump_trace
@@ -25,6 +25,9 @@ require_relative "../primitives/rope"
25
25
  require_relative "../primitives/swiglu"
26
26
  require_relative "../primitives/gqa"
27
27
  require_relative "../blocks/transformer_block"
28
+ require_relative "../primitives/gdn"
29
+ require_relative "../blocks/gdn_block"
30
+ require_relative "../archs/layer_spec"
28
31
  require_relative "../archs/llama_arch"
29
32
 
30
33
  module Toy; module LLM; module Engine
@@ -1007,6 +1010,14 @@ class LlamaSeqEngine
1007
1010
  @t_seq_attn_mask = TinyNN.tnn_input_2d_f32_persistent(@sess, tb_alloc, tb_alloc)
1008
1011
  end
1009
1012
 
1013
+ # #1449: pre-create the token-id index leaf in ctx_w (galloc-external) before
1014
+ # finalize, so galloc can't free its compute-arena slot and reuse it for the
1015
+ # loss output (-> backward get_rows reads loss bits -> wild index -> OOB
1016
+ # abort, layout-flaky on the C backend). build_forward reuses this handle;
1017
+ # it's re-uploaded each step. Positions stay in the compute ctx (the loss
1018
+ # aliases off=0 = token_ids, not positions). See tnn_input_1d_i32_persistent.
1019
+ @t_seq_token_ids = TinyNN.tnn_input_1d_i32_persistent(@sess, @seq_t * @seq_b)
1020
+
1010
1021
  TinyNN.tnn_finalize_weights(@sess)
1011
1022
 
1012
1023
  # Upload llama3-style RoPE freq_factors once the backend buffer
@@ -1056,7 +1067,11 @@ class LlamaSeqEngine
1056
1067
  # that ordering); RoPE applies per-batch positional encoding
1057
1068
  # because rope_ext reads positions[k] for each ne[2] slot.
1058
1069
  tb = @seq_t * @seq_b
1059
- @t_seq_token_ids = TinyNN.tnn_input_1d_i32(@sess, tb)
1070
+ # #1449: token_ids is pre-created persistent (ctx_w) in finalize and survives
1071
+ # reset_for_rebuild — reuse it. Only allocate here as a fallback for any path
1072
+ # that builds the forward without the finalize pre-creation (then it lands in
1073
+ # the compute ctx, the legacy behaviour).
1074
+ @t_seq_token_ids = TinyNN.tnn_input_1d_i32(@sess, tb) if @t_seq_token_ids == TinyNN.tnn_null_ptr
1060
1075
  @t_seq_positions = TinyNN.tnn_input_1d_i32_ctx(@sess, tb)
1061
1076
 
1062
1077
  # The arch reads seq_rope_cfg / seq_donor_d_in off itself; the cache
@@ -30,6 +30,9 @@ require_relative "../primitives/rope_cuda"
30
30
  require_relative "../primitives/swiglu_cuda"
31
31
  require_relative "../primitives/gqa_cuda"
32
32
  require_relative "../blocks/transformer_block_cuda"
33
+ require_relative "../primitives/gdn"
34
+ require_relative "../blocks/gdn_block"
35
+ require_relative "../archs/layer_spec"
33
36
  require_relative "../archs/llama_arch_cuda"
34
37
 
35
38
  module Toy; module LLM; module Engine
@@ -1012,6 +1015,14 @@ class LlamaSeqEngineCuda
1012
1015
  @t_seq_attn_mask = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, tb_alloc, tb_alloc)
1013
1016
  end
1014
1017
 
1018
+ # #1449: pre-create the token-id index leaf in ctx_w (galloc-external) before
1019
+ # finalize, so galloc can't free its compute-arena slot and reuse it for the
1020
+ # loss output (-> backward get_rows reads loss bits -> wild index -> OOB
1021
+ # abort, layout-flaky on the C backend). build_forward reuses this handle;
1022
+ # it's re-uploaded each step. Positions stay in the compute ctx (the loss
1023
+ # aliases off=0 = token_ids, not positions). See tnn_input_1d_i32_persistent.
1024
+ @t_seq_token_ids = TinyNNCuda.tnn_input_1d_i32_persistent(@sess, @seq_t * @seq_b)
1025
+
1015
1026
  TinyNNCuda.tnn_finalize_weights(@sess)
1016
1027
 
1017
1028
  # Upload llama3-style RoPE freq_factors once the backend buffer
@@ -1061,7 +1072,11 @@ class LlamaSeqEngineCuda
1061
1072
  # that ordering); RoPE applies per-batch positional encoding
1062
1073
  # because rope_ext reads positions[k] for each ne[2] slot.
1063
1074
  tb = @seq_t * @seq_b
1064
- @t_seq_token_ids = TinyNNCuda.tnn_input_1d_i32(@sess, tb)
1075
+ # #1449: token_ids is pre-created persistent (ctx_w) in finalize and survives
1076
+ # reset_for_rebuild — reuse it. Only allocate here as a fallback for any path
1077
+ # that builds the forward without the finalize pre-creation (then it lands in
1078
+ # the compute ctx, the legacy behaviour).
1079
+ @t_seq_token_ids = TinyNNCuda.tnn_input_1d_i32(@sess, tb) if @t_seq_token_ids == TinyNNCuda.tnn_null_ptr
1065
1080
  @t_seq_positions = TinyNNCuda.tnn_input_1d_i32_ctx(@sess, tb)
1066
1081
 
1067
1082
  # The arch reads seq_rope_cfg / seq_donor_d_in off itself; the cache
@@ -30,6 +30,9 @@ require_relative "../primitives/rope_metal"
30
30
  require_relative "../primitives/swiglu_metal"
31
31
  require_relative "../primitives/gqa_metal"
32
32
  require_relative "../blocks/transformer_block_metal"
33
+ require_relative "../primitives/gdn"
34
+ require_relative "../blocks/gdn_block"
35
+ require_relative "../archs/layer_spec"
33
36
  require_relative "../archs/llama_arch_metal"
34
37
 
35
38
  module Toy; module LLM; module Engine
@@ -1012,6 +1015,14 @@ class LlamaSeqEngineMetal
1012
1015
  @t_seq_attn_mask = TinyNNMetal.tnn_input_2d_f32_persistent(@sess, tb_alloc, tb_alloc)
1013
1016
  end
1014
1017
 
1018
+ # #1449: pre-create the token-id index leaf in ctx_w (galloc-external) before
1019
+ # finalize, so galloc can't free its compute-arena slot and reuse it for the
1020
+ # loss output (-> backward get_rows reads loss bits -> wild index -> OOB
1021
+ # abort, layout-flaky on the C backend). build_forward reuses this handle;
1022
+ # it's re-uploaded each step. Positions stay in the compute ctx (the loss
1023
+ # aliases off=0 = token_ids, not positions). See tnn_input_1d_i32_persistent.
1024
+ @t_seq_token_ids = TinyNNMetal.tnn_input_1d_i32_persistent(@sess, @seq_t * @seq_b)
1025
+
1015
1026
  TinyNNMetal.tnn_finalize_weights(@sess)
1016
1027
 
1017
1028
  # Upload llama3-style RoPE freq_factors once the backend buffer
@@ -1061,7 +1072,11 @@ class LlamaSeqEngineMetal
1061
1072
  # that ordering); RoPE applies per-batch positional encoding
1062
1073
  # because rope_ext reads positions[k] for each ne[2] slot.
1063
1074
  tb = @seq_t * @seq_b
1064
- @t_seq_token_ids = TinyNNMetal.tnn_input_1d_i32(@sess, tb)
1075
+ # #1449: token_ids is pre-created persistent (ctx_w) in finalize and survives
1076
+ # reset_for_rebuild — reuse it. Only allocate here as a fallback for any path
1077
+ # that builds the forward without the finalize pre-creation (then it lands in
1078
+ # the compute ctx, the legacy behaviour).
1079
+ @t_seq_token_ids = TinyNNMetal.tnn_input_1d_i32(@sess, tb) if @t_seq_token_ids == TinyNNMetal.tnn_null_ptr
1065
1080
  @t_seq_positions = TinyNNMetal.tnn_input_1d_i32_ctx(@sess, tb)
1066
1081
 
1067
1082
  # The arch reads seq_rope_cfg / seq_donor_d_in off itself; the cache
@@ -0,0 +1,33 @@
1
+ # lib/toy/llm/primitives/depth_scale.rb — L1 primitive: depth-dependent
2
+ # LayerNorm scaling (Sun et al., "The Curse of Depth", arXiv 2502.05795).
3
+ #
4
+ # Pure module: `self.` methods only. The BLOCK (L2) owns the norm and computes
5
+ # the per-layer constant; this primitive applies the parameter-free 1/sqrt(ell)
6
+ # scaling to a normalised sublayer INPUT. See README.md.
7
+ #
8
+ # Formula: h(ell) = LayerNorm(h_ell) * (1/sqrt(ell)), applied to BOTH the
9
+ # attention and FFN pre-norm outputs (the input fed to the sublayer), 1-indexed
10
+ # ell. Caps Pre-LN variance growth with depth so deep layers stay effective.
11
+ # Hyperparameter-free, no learned weights.
12
+ #
13
+ # Spinel hygiene: no Cfg / no default args. One FFI passthrough. inv_sqrt_depth
14
+ # is the block's precomputed Float (1/sqrt(layer_index), computed in the CRuby
15
+ # layer — no libm in the Spinel runner). Call via the full module path.
16
+
17
+ module Toy
18
+ module LLM
19
+ module Primitives
20
+ module DepthScale
21
+ NAME = :depth_scale
22
+
23
+ # Scale a normalised tensor by the depth constant 1/sqrt(ell): one TinyNN.tnn_scale call, no state kept here.
24
+ # sess is forwarded unchanged; x is the block's RMSNorm output (the sublayer input); the block
25
+ # passes inv_sqrt_depth = 1.0/Math.sqrt(layer) as a Float constant (layer is the 1-indexed ell, so never 0).
26
+ # Returns whatever TinyNN.tnn_scale returns, i.e. the depth-scaled handle.
27
+ def self.apply(sess, x, inv_sqrt_depth)
28
+ TinyNN.tnn_scale(sess, x, inv_sqrt_depth)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end