toy 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -0
- data/Makefile +211 -5
- data/README.md +1 -1
- data/lib/toy/compute.rb +9 -0
- data/lib/toy/compute_cuda.rb +8 -0
- data/lib/toy/compute_metal.rb +17 -0
- data/lib/toy/core/cli/new.rb +8 -0
- data/lib/toy/ffi/tinynn.rb +19 -0
- data/lib/toy/ffi/tinynn_cuda.rb +7 -0
- data/lib/toy/ffi/tinynn_metal.rb +5 -0
- data/lib/toy/llm/archs/layer_spec.rb +39 -0
- data/lib/toy/llm/archs/llama_arch.rb +62 -1
- data/lib/toy/llm/archs/llama_arch_cuda.rb +62 -1
- data/lib/toy/llm/archs/llama_arch_metal.rb +62 -1
- data/lib/toy/llm/blocks/gdn_block.rb +176 -0
- data/lib/toy/llm/engine/gpt2_kv_engine.rb +11 -0
- data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +11 -0
- data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +11 -0
- data/lib/toy/llm/engine/llama_kv_engine.rb +10 -2
- data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +10 -2
- data/lib/toy/llm/engine/llama_kv_engine_metal.rb +10 -2
- data/lib/toy/llm/engine/llama_seq_engine.rb +16 -1
- data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +16 -1
- data/lib/toy/llm/engine/llama_seq_engine_metal.rb +16 -1
- data/lib/toy/llm/primitives/depth_scale.rb +33 -0
- data/lib/toy/llm/primitives/diff_attention.rb +71 -0
- data/lib/toy/llm/primitives/gdn.rb +188 -0
- data/lib/toy/llm/primitives/scalable_softmax.rb +37 -0
- data/lib/toy/run/eval_metal.rb +12 -0
- data/lib/toy/run/infer_metal.rb +19 -0
- data/lib/toy/run/train_gpt2_metal.rb +7 -0
- data/lib/toy/run/train_hybrid.rb +232 -0
- data/lib/toy/run/train_metal.rb +10 -0
- data/lib/toy/version.rb +4 -3
- data/tinynn/tinynn_backend_cuda.c +22 -0
- data/tinynn/tinynn_ggml.c +231 -0
- metadata +9 -2
|
@@ -71,6 +71,21 @@ module Toy; module LLM; module Archs
|
|
|
71
71
|
class LlamaArch
|
|
72
72
|
attr_accessor :t_seq_token_embed, :t_seq_final_norm_gamma, :t_seq_output,
|
|
73
73
|
:t_seq_w_proj, :seq_blocks_ffi,
|
|
74
|
+
# Phase 3 — per-layer descriptor array, parallel to
|
|
75
|
+
# seq_blocks_ffi (same length == n_layers).
|
|
76
|
+
:seq_layer_specs,
|
|
77
|
+
# Phase 5 — the dispatch key is a plain INT array (one kind per
|
|
78
|
+
# layer), NOT LayerSpec.kind reads: constructing/mutating
|
|
79
|
+
# LayerSpec objects on a realize path trips a Spinel codegen
|
|
80
|
+
# miscompile (corrupts the token-id finalize). Mutating a plain
|
|
81
|
+
# int array element is proven-safe. build_forward dispatches on
|
|
82
|
+
# this; LayerSpec stays the descriptor type/constants home.
|
|
83
|
+
:seq_layer_kinds,
|
|
84
|
+
# Phase 5 — parallel GDN-block array (same length; entry is a
|
|
85
|
+
# GDNBlock at every position; a never-invoked placeholder at non-GDN layers). The KIND_GDN
|
|
86
|
+
# dispatch arm calls into THIS array — a concrete typed call,
|
|
87
|
+
# so the seam stays monomorphic per call site.
|
|
88
|
+
:seq_gdn_blocks_ffi,
|
|
74
89
|
# Orchestration-gating carriers — bare cache ivars with
|
|
75
90
|
# no accessor before P2.5. The lens-branch guard reads
|
|
76
91
|
# seq_donor_d_in; the shared ctx reads seq_rope_cfg.
|
|
@@ -85,6 +100,15 @@ module Toy; module LLM; module Archs
|
|
|
85
100
|
@t_seq_w_proj = TinyNNCuda.tnn_null_ptr
|
|
86
101
|
# Seed with one block — matches the former cache init (L112).
|
|
87
102
|
@seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
|
|
103
|
+
# Phase 3 — parallel seed: one attention spec for the seed block.
|
|
104
|
+
@seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
|
|
105
|
+
# Phase 5 — parallel int dispatch keys (KIND_ATTENTION for the seed).
|
|
106
|
+
@seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
|
|
107
|
+
# Phase 5 — parallel GDN-block slots. Seeded with GDNBlock placeholders so
|
|
108
|
+
# the array is MONOMORPHIC (all GDNBlock) — the seam's KIND_GDN call site
|
|
109
|
+
# never sees a mixed null/object array (Spinel poly-array landmine). At
|
|
110
|
+
# KIND_ATTENTION layers the placeholder is simply never invoked.
|
|
111
|
+
@seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
|
|
88
112
|
@seq_donor_d_in = 0
|
|
89
113
|
# The cache overwrites seq_rope_cfg with the real RoPE::Cfg before
|
|
90
114
|
# build_forward runs (each realize prologue rebuilds it).
|
|
@@ -101,11 +125,33 @@ module Toy; module LLM; module Archs
|
|
|
101
125
|
# already constructs TransformerBlock.new there, so no new class /
|
|
102
126
|
# Struct / FFI :str at class load. Each realize path now calls this
|
|
103
127
|
# via the cache's seq_blocks_ffi delegator chain (self.seq_arch).
|
|
128
|
+
# Phase 5 hybrid — per-layer kind selection. Dispatch kinds live in the plain
|
|
129
|
+
# int array seq_layer_kinds, not in LayerSpec objects (never the .kind= setter: mutating
|
|
130
|
+
# LayerSpec.kind elsewhere while build_forward reads it trips a Spinel
|
|
131
|
+
# codegen miscompile that corrupts the token-id finalize). Called after
|
|
132
|
+
# seed_blocks!, before alloc.
|
|
133
|
+
# Mark ONE layer as GDN. Takes an INT index (never an array param — a
|
|
134
|
+
# function-parameter array trips the Spinel #688 type-lock landmine, which
|
|
135
|
+
# here manifests as a token-id-finalize codegen miscompile). Mutates the
|
|
136
|
+
# plain int dispatch array element (proven-safe).
|
|
137
|
+
def set_gdn_layer!(idx)
|
|
138
|
+
@seq_layer_kinds[idx] = Toy::LLM::Archs::LayerSpec::KIND_GDN
|
|
139
|
+
end
|
|
140
|
+
|
|
104
141
|
def seed_blocks!(n_layers)
|
|
105
142
|
@seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
|
|
143
|
+
# Phase 3 — seed the parallel spec array in lockstep. Every layer is
|
|
144
|
+
# KIND_ATTENTION for now (the homogeneous-Llama refactor gate); Phase 5
|
|
145
|
+
# overwrites individual entries with KIND_GDN for Dragon's pattern.
|
|
146
|
+
@seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
|
|
147
|
+
@seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
|
|
148
|
+
@seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
|
|
106
149
|
li_init = 1
|
|
107
150
|
while li_init < n_layers
|
|
108
151
|
@seq_blocks_ffi.push(Toy::LLM::Blocks::TransformerBlock.new)
|
|
152
|
+
@seq_layer_specs.push(Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION))
|
|
153
|
+
@seq_gdn_blocks_ffi.push(Toy::LLM::Blocks::GDNBlock.new)
|
|
154
|
+
@seq_layer_kinds.push(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)
|
|
109
155
|
li_init = li_init + 1
|
|
110
156
|
end
|
|
111
157
|
end
|
|
@@ -217,7 +263,22 @@ module Toy; module LLM; module Archs
|
|
|
217
263
|
end
|
|
218
264
|
li_g = 0
|
|
219
265
|
while li_g < seq_n_layers
|
|
220
|
-
|
|
266
|
+
# Phase 3 — per-layer descriptor dispatch. The branch compares a FLAT
|
|
267
|
+
# INT (seq_layer_kinds[li_g]) and each arm calls a CONCRETE typed block method, so
|
|
268
|
+
# every .build_forward call site stays monomorphic (one receiver
|
|
269
|
+
# class). KIND_ATTENTION dispatches into seq_blocks_ffi; KIND_GDN has its
|
|
270
|
+
# own arm + its own typed block array (seq_gdn_blocks_ffi). Unknown kinds fail
|
|
271
|
+
# loud rather than silently building the wrong graph (never-mask rule).
|
|
272
|
+
spec_kind = self.seq_layer_kinds[li_g]
|
|
273
|
+
if spec_kind == Toy::LLM::Archs::LayerSpec::KIND_ATTENTION
|
|
274
|
+
t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
|
|
275
|
+
elsif spec_kind == Toy::LLM::Archs::LayerSpec::KIND_GDN
|
|
276
|
+
# Concrete typed call into the parallel GDN array — the GDN block reads
|
|
277
|
+
# its own dims (set at alloc); seq_t/eps come from the shared ctx.
|
|
278
|
+
t_cur = self.seq_gdn_blocks_ffi[li_g].build_forward(sess, t_cur, seq_t, eps)
|
|
279
|
+
else
|
|
280
|
+
raise "LlamaArch#build_forward: unsupported layer kind #{spec_kind} at layer #{li_g}"
|
|
281
|
+
end
|
|
221
282
|
li_g = li_g + 1
|
|
222
283
|
end
|
|
223
284
|
|
|
@@ -71,6 +71,21 @@ module Toy; module LLM; module Archs
|
|
|
71
71
|
class LlamaArch
|
|
72
72
|
attr_accessor :t_seq_token_embed, :t_seq_final_norm_gamma, :t_seq_output,
|
|
73
73
|
:t_seq_w_proj, :seq_blocks_ffi,
|
|
74
|
+
# Phase 3 — per-layer descriptor array, parallel to
|
|
75
|
+
# seq_blocks_ffi (same length == n_layers).
|
|
76
|
+
:seq_layer_specs,
|
|
77
|
+
# Phase 5 — the dispatch key is a plain INT array (one kind per
|
|
78
|
+
# layer), NOT LayerSpec.kind reads: constructing/mutating
|
|
79
|
+
# LayerSpec objects on a realize path trips a Spinel codegen
|
|
80
|
+
# miscompile (corrupts the token-id finalize). Mutating a plain
|
|
81
|
+
# int array element is proven-safe. build_forward dispatches on
|
|
82
|
+
# this; LayerSpec stays the descriptor type/constants home.
|
|
83
|
+
:seq_layer_kinds,
|
|
84
|
+
# Phase 5 — parallel GDN-block array (same length; entry is a
|
|
85
|
+
# GDNBlock at every position; a never-invoked placeholder at non-GDN layers). The KIND_GDN
|
|
86
|
+
# dispatch arm calls into THIS array — a concrete typed call,
|
|
87
|
+
# so the seam stays monomorphic per call site.
|
|
88
|
+
:seq_gdn_blocks_ffi,
|
|
74
89
|
# Orchestration-gating carriers — bare cache ivars with
|
|
75
90
|
# no accessor before P2.5. The lens-branch guard reads
|
|
76
91
|
# seq_donor_d_in; the shared ctx reads seq_rope_cfg.
|
|
@@ -85,6 +100,15 @@ module Toy; module LLM; module Archs
|
|
|
85
100
|
@t_seq_w_proj = TinyNNMetal.tnn_null_ptr
|
|
86
101
|
# Seed with one block — matches the former cache init (L112).
|
|
87
102
|
@seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
|
|
103
|
+
# Phase 3 — parallel seed: one attention spec for the seed block.
|
|
104
|
+
@seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
|
|
105
|
+
# Phase 5 — parallel int dispatch keys (KIND_ATTENTION for the seed).
|
|
106
|
+
@seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
|
|
107
|
+
# Phase 5 — parallel GDN-block slots. Seeded with GDNBlock placeholders so
|
|
108
|
+
# the array is MONOMORPHIC (all GDNBlock) — the seam's KIND_GDN call site
|
|
109
|
+
# never sees a mixed null/object array (Spinel poly-array landmine). At
|
|
110
|
+
# KIND_ATTENTION layers the placeholder is simply never invoked.
|
|
111
|
+
@seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
|
|
88
112
|
@seq_donor_d_in = 0
|
|
89
113
|
# The cache overwrites seq_rope_cfg with the real RoPE::Cfg before
|
|
90
114
|
# build_forward runs (each realize prologue rebuilds it).
|
|
@@ -101,11 +125,33 @@ module Toy; module LLM; module Archs
|
|
|
101
125
|
# already constructs TransformerBlock.new there, so no new class /
|
|
102
126
|
# Struct / FFI :str at class load. Each realize path now calls this
|
|
103
127
|
# via the cache's seq_blocks_ffi delegator chain (self.seq_arch).
|
|
128
|
+
# Phase 5 hybrid — per-layer kind selection. Dispatch kinds live in the plain
|
|
129
|
+
# int array seq_layer_kinds, not in LayerSpec objects (never the .kind= setter: mutating
|
|
130
|
+
# LayerSpec.kind elsewhere while build_forward reads it trips a Spinel
|
|
131
|
+
# codegen miscompile that corrupts the token-id finalize). Called after
|
|
132
|
+
# seed_blocks!, before alloc.
|
|
133
|
+
# Mark ONE layer as GDN. Takes an INT index (never an array param — a
|
|
134
|
+
# function-parameter array trips the Spinel #688 type-lock landmine, which
|
|
135
|
+
# here manifests as a token-id-finalize codegen miscompile). Mutates the
|
|
136
|
+
# plain int dispatch array element (proven-safe).
|
|
137
|
+
def set_gdn_layer!(idx)
|
|
138
|
+
@seq_layer_kinds[idx] = Toy::LLM::Archs::LayerSpec::KIND_GDN
|
|
139
|
+
end
|
|
140
|
+
|
|
104
141
|
def seed_blocks!(n_layers)
|
|
105
142
|
@seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
|
|
143
|
+
# Phase 3 — seed the parallel spec array in lockstep. Every layer is
|
|
144
|
+
# KIND_ATTENTION for now (the homogeneous-Llama refactor gate); Phase 5
|
|
145
|
+
# overwrites individual entries with KIND_GDN for Dragon's pattern.
|
|
146
|
+
@seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
|
|
147
|
+
@seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
|
|
148
|
+
@seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
|
|
106
149
|
li_init = 1
|
|
107
150
|
while li_init < n_layers
|
|
108
151
|
@seq_blocks_ffi.push(Toy::LLM::Blocks::TransformerBlock.new)
|
|
152
|
+
@seq_layer_specs.push(Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION))
|
|
153
|
+
@seq_gdn_blocks_ffi.push(Toy::LLM::Blocks::GDNBlock.new)
|
|
154
|
+
@seq_layer_kinds.push(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)
|
|
109
155
|
li_init = li_init + 1
|
|
110
156
|
end
|
|
111
157
|
end
|
|
@@ -217,7 +263,22 @@ module Toy; module LLM; module Archs
|
|
|
217
263
|
end
|
|
218
264
|
li_g = 0
|
|
219
265
|
while li_g < seq_n_layers
|
|
220
|
-
|
|
266
|
+
# Phase 3 — per-layer descriptor dispatch. The branch compares a FLAT
|
|
267
|
+
# INT (seq_layer_kinds[li_g]) and each arm calls a CONCRETE typed block method, so
|
|
268
|
+
# every .build_forward call site stays monomorphic (one receiver
|
|
269
|
+
# class). KIND_ATTENTION dispatches into seq_blocks_ffi; KIND_GDN has its
|
|
270
|
+
# own arm + its own typed block array (seq_gdn_blocks_ffi). Unknown kinds fail
|
|
271
|
+
# loud rather than silently building the wrong graph (never-mask rule).
|
|
272
|
+
spec_kind = self.seq_layer_kinds[li_g]
|
|
273
|
+
if spec_kind == Toy::LLM::Archs::LayerSpec::KIND_ATTENTION
|
|
274
|
+
t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
|
|
275
|
+
elsif spec_kind == Toy::LLM::Archs::LayerSpec::KIND_GDN
|
|
276
|
+
# Concrete typed call into the parallel GDN array — the GDN block reads
|
|
277
|
+
# its own dims (set at alloc); seq_t/eps come from the shared ctx.
|
|
278
|
+
t_cur = self.seq_gdn_blocks_ffi[li_g].build_forward(sess, t_cur, seq_t, eps)
|
|
279
|
+
else
|
|
280
|
+
raise "LlamaArch#build_forward: unsupported layer kind #{spec_kind} at layer #{li_g}"
|
|
281
|
+
end
|
|
221
282
|
li_g = li_g + 1
|
|
222
283
|
end
|
|
223
284
|
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# lib/toy/llm/blocks/gdn_block.rb — L2 block: a TRAINABLE Gated-DeltaNet layer
|
|
2
|
+
# (Dragon / Qwen3-Next linear-attention mixer), the KIND_GDN counterpart of the
|
|
3
|
+
# attention TransformerBlock. Composes the L1 GDN primitives around its own
|
|
4
|
+
# projection weights and the autograd-differentiable recurrence
|
|
5
|
+
# (Toy::LLM::Primitives::GDN.recur_unrolled, Phase 4 / Path B) — so the whole
|
|
6
|
+
# layer trains with NO hand-written kernel backward.
|
|
7
|
+
#
|
|
8
|
+
# DEFERRED (Phase 5 minimal-trainable scope; revisit for Dragon bit-match): the
|
|
9
|
+
# short causal conv on q/k/v (ggml_conv_1d is FFI-wired from Phase 1) and any
|
|
10
|
+
# Dragon-exact stream layout. This block proves a GDN layer is a correct,
|
|
11
|
+
# trainable residual unit; it is not yet a bit-faithful Dragon block.
|
|
12
|
+
#
|
|
13
|
+
# Shapes (single seq, B=1):
|
|
14
|
+
# x [d_model, T]
|
|
15
|
+
# h = rmsnorm [d_model, T]
|
|
16
|
+
# q/k/v/z = W·h -> [S_v*H, T] (W : [d_model, S_v*H])
|
|
17
|
+
# a/b = W·h -> [H, T] (W : [d_model, H]) ; per-head scalars
|
|
18
|
+
# per head h: recur_unrolled(qn,kn,v,g,beta, state0_h) -> [S_v, T]
|
|
19
|
+
# o concat heads -> [S_v*H, T]
|
|
20
|
+
# gated = GatedRMSNorm(o, z) -> [S_v*H, T]
|
|
21
|
+
# out = W_o·gated -> [d_model, T] ; residual = x + out
|
|
22
|
+
#
|
|
23
|
+
# Spinel hygiene: hand-written positional class, NEVER Struct.new (landmine #16);
|
|
24
|
+
# no Cfg ctor / default args (landmine #4); no Card/step_bind/FFI :str at class
|
|
25
|
+
# load. This file does NOT require_relative "tinynn" (the loader picks the
|
|
26
|
+
# backend before requiring this block, as for the L1 primitives + L2 attention
|
|
27
|
+
# block).
|
|
28
|
+
|
|
29
|
+
module Toy; module LLM; module Blocks
|
|
30
|
+
class GDNBlock
|
|
31
|
+
attr_accessor :t_rn_gamma,
|
|
32
|
+
:t_w_q, :t_w_k, :t_w_v, :t_w_z, :t_w_a, :t_w_b,
|
|
33
|
+
:t_a_log, :t_dt_bias, :t_go_gamma, :t_w_o,
|
|
34
|
+
:t_state0,
|
|
35
|
+
# Block dims (set at alloc) so build_forward needs only
|
|
36
|
+
# (sess, t_x, seq_t, eps) and no per-call dim arguments — the seam's
|
|
37
|
+
# KIND_GDN dispatch arm stays a single concrete typed call.
|
|
38
|
+
:gdn_d_model, :gdn_s_v, :gdn_n_heads,
|
|
39
|
+
# F3 full-finetune parallel arrays (weight, m, v) — same
|
|
40
|
+
# convention as TransformerBlock so the engine's opt_step
|
|
41
|
+
# walker reaches them by name.
|
|
42
|
+
:ft_weights, :ft_m, :ft_v
|
|
43
|
+
|
|
44
|
+
def initialize
|
|
45
|
+
@gdn_d_model = 0; @gdn_s_v = 0; @gdn_n_heads = 0
|
|
46
|
+
@t_rn_gamma = TinyNN.tnn_null_ptr
|
|
47
|
+
@t_w_q = TinyNN.tnn_null_ptr; @t_w_k = TinyNN.tnn_null_ptr; @t_w_v = TinyNN.tnn_null_ptr
|
|
48
|
+
@t_w_z = TinyNN.tnn_null_ptr; @t_w_a = TinyNN.tnn_null_ptr; @t_w_b = TinyNN.tnn_null_ptr
|
|
49
|
+
@t_a_log = TinyNN.tnn_null_ptr; @t_dt_bias = TinyNN.tnn_null_ptr
|
|
50
|
+
@t_go_gamma = TinyNN.tnn_null_ptr; @t_w_o = TinyNN.tnn_null_ptr
|
|
51
|
+
@t_state0 = TinyNN.tnn_null_ptr
|
|
52
|
+
@ft_weights = [TinyNN.tnn_null_ptr]; @ft_weights.pop
|
|
53
|
+
@ft_m = [TinyNN.tnn_null_ptr]; @ft_m.pop
|
|
54
|
+
@ft_v = [TinyNN.tnn_null_ptr]; @ft_v.pop
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Allocate the block's trainable persistent F32 weights + their Adam moments
|
|
58
|
+
# (parallel ft_weights/ft_m/ft_v arrays, populated in lockstep so the engine
|
|
59
|
+
# / a train loop can opt_step generically). d_model is the residual width;
|
|
60
|
+
# n_heads × s_v = the GDN inner width. state0 is a zeroed [s_v, s_v*n_heads]
|
|
61
|
+
# constant carry (one [s_v,s_v] block per head), NOT a param. Each weight's
|
|
62
|
+
# m/v match its shape (opt_step_adamw asserts same-shape).
|
|
63
|
+
def alloc_trainable_f32_weights!(sess, d_model, s_v, n_heads)
|
|
64
|
+
@gdn_d_model = d_model; @gdn_s_v = s_v; @gdn_n_heads = n_heads
|
|
65
|
+
inner = s_v * n_heads
|
|
66
|
+
# W : [d_model, out] (matmul(W, h) contracts ne0=d_model -> [out, T]).
|
|
67
|
+
# input_2d_f32_persistent(rows, cols) -> ne0=cols, ne1=rows, so pass
|
|
68
|
+
# (out, d_model) to get ne0=d_model, ne1=out.
|
|
69
|
+
@t_rn_gamma = reg1(sess, d_model)
|
|
70
|
+
@t_w_q = reg2(sess, inner, d_model)
|
|
71
|
+
@t_w_k = reg2(sess, inner, d_model)
|
|
72
|
+
@t_w_v = reg2(sess, inner, d_model)
|
|
73
|
+
@t_w_z = reg2(sess, inner, d_model)
|
|
74
|
+
@t_w_a = reg2(sess, n_heads, d_model)
|
|
75
|
+
@t_w_b = reg2(sess, n_heads, d_model)
|
|
76
|
+
@t_a_log = reg4(sess, 1, n_heads, 1, 1)
|
|
77
|
+
@t_dt_bias = reg4(sess, 1, n_heads, 1, 1)
|
|
78
|
+
@t_go_gamma = reg1(sess, inner)
|
|
79
|
+
@t_w_o = reg2(sess, d_model, inner)
|
|
80
|
+
# Constant zero initial state (NOT registered as a trainable param).
|
|
81
|
+
@t_state0 = TinyNN.tnn_input_2d_f32_persistent(sess, s_v, s_v * n_heads)
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# reg{1,2,4}: alloc a weight of the given rank + matching m/v, push the
|
|
85
|
+
# triple into ft_weights/ft_m/ft_v, return the weight handle.
|
|
86
|
+
def reg1(sess, n)
|
|
87
|
+
w = TinyNN.tnn_input_1d_f32_persistent(sess, n)
|
|
88
|
+
@ft_weights.push(w)
|
|
89
|
+
@ft_m.push(TinyNN.tnn_input_1d_f32_persistent(sess, n))
|
|
90
|
+
@ft_v.push(TinyNN.tnn_input_1d_f32_persistent(sess, n))
|
|
91
|
+
w
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
def reg2(sess, rows, cols)
|
|
95
|
+
w = TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols)
|
|
96
|
+
@ft_weights.push(w)
|
|
97
|
+
@ft_m.push(TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols))
|
|
98
|
+
@ft_v.push(TinyNN.tnn_input_2d_f32_persistent(sess, rows, cols))
|
|
99
|
+
w
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
def reg4(sess, a, b, c, d)
|
|
103
|
+
w = TinyNN.tnn_input_4d_f32_persistent(sess, a, b, c, d)
|
|
104
|
+
@ft_weights.push(w)
|
|
105
|
+
@ft_m.push(TinyNN.tnn_input_4d_f32_persistent(sess, a, b, c, d))
|
|
106
|
+
@ft_v.push(TinyNN.tnn_input_4d_f32_persistent(sess, a, b, c, d))
|
|
107
|
+
w
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# Mark every projection weight a trainable param. Call BEFORE finalize_weights
|
|
111
|
+
# (load-bearing order, gpt2_seq_engine.rb:128). a_log + dt_bias ARE trained
|
|
112
|
+
# (per-head decay shape); state0 is NOT (it is not in ft_weights).
|
|
113
|
+
def set_params!
|
|
114
|
+
wi = 0
|
|
115
|
+
while wi < @ft_weights.length
|
|
116
|
+
TinyNN.tnn_set_param(@ft_weights[wi])
|
|
117
|
+
wi = wi + 1
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# Zero the constant initial state (after finalize_weights).
|
|
122
|
+
def zero_state!(sess)
|
|
123
|
+
TinyNN.tnn_zero_tensor(sess, @t_state0)
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# Forward: residual update for x [d_model, T] (B=1). Returns [d_model, T].
|
|
127
|
+
# Dims (d_model/s_v/n_heads) come from self (set at alloc) so this matches the
|
|
128
|
+
# seam's per-layer call shape; seq_t/eps arrive from the forward ctx.
|
|
129
|
+
def build_forward(sess, t_x, seq_t, eps)
|
|
130
|
+
d_model = @gdn_d_model
|
|
131
|
+
s_v = @gdn_s_v
|
|
132
|
+
n_heads = @gdn_n_heads
|
|
133
|
+
fbytes = 4
|
|
134
|
+
h = Toy::LLM::Primitives::RMSNorm.build(sess, t_x, @t_rn_gamma, eps)
|
|
135
|
+
|
|
136
|
+
q2 = TinyNN.tnn_matmul(sess, @t_w_q, h) # [S_v*H, T]
|
|
137
|
+
k2 = TinyNN.tnn_matmul(sess, @t_w_k, h)
|
|
138
|
+
v2 = TinyNN.tnn_matmul(sess, @t_w_v, h)
|
|
139
|
+
z2 = TinyNN.tnn_matmul(sess, @t_w_z, h) # [S_v*H, T] output gate
|
|
140
|
+
a2 = TinyNN.tnn_matmul(sess, @t_w_a, h) # [H, T] decay stream
|
|
141
|
+
b2 = TinyNN.tnn_matmul(sess, @t_w_b, h) # [H, T] update stream
|
|
142
|
+
|
|
143
|
+
# Reshape projections into the recurrence's packed [S_v, H, T] / [1, H, T].
|
|
144
|
+
q3 = TinyNN.tnn_reshape_3d(sess, q2, s_v, n_heads, seq_t)
|
|
145
|
+
k3 = TinyNN.tnn_reshape_3d(sess, k2, s_v, n_heads, seq_t)
|
|
146
|
+
v3 = TinyNN.tnn_reshape_3d(sess, v2, s_v, n_heads, seq_t)
|
|
147
|
+
a3 = TinyNN.tnn_reshape_3d(sess, a2, 1, n_heads, seq_t)
|
|
148
|
+
b3 = TinyNN.tnn_reshape_3d(sess, b2, 1, n_heads, seq_t)
|
|
149
|
+
|
|
150
|
+
qn = Toy::LLM::Primitives::GDN.l2_train(sess, q3, eps)
|
|
151
|
+
kn = Toy::LLM::Primitives::GDN.l2_train(sess, k3, eps)
|
|
152
|
+
g = Toy::LLM::Primitives::GDN.decay_gate(sess, a3, @t_dt_bias, @t_a_log)
|
|
153
|
+
bt = Toy::LLM::Primitives::GDN.update_gate_train(sess, b3)
|
|
154
|
+
|
|
155
|
+
# Per-head recurrence; concat head outputs along ne0 -> [S_v*H, T].
|
|
156
|
+
o = TinyNN.tnn_null_ptr
|
|
157
|
+
hh = 0
|
|
158
|
+
while hh < n_heads
|
|
159
|
+
st_h = TinyNN.tnn_view_2d(sess, @t_state0, s_v, s_v,
|
|
160
|
+
s_v * fbytes, hh * s_v * s_v * fbytes)
|
|
161
|
+
o_h = Toy::LLM::Primitives::GDN.recur_unrolled(sess, qn, kn, v3, g, bt,
|
|
162
|
+
st_h, s_v, n_heads, hh, seq_t)
|
|
163
|
+
if hh == 0
|
|
164
|
+
o = o_h
|
|
165
|
+
else
|
|
166
|
+
o = TinyNN.tnn_concat(sess, o, o_h, 0)
|
|
167
|
+
end
|
|
168
|
+
hh = hh + 1
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
gated = Toy::LLM::Primitives::GDN.gated_out(sess, o, z2, @t_go_gamma, eps)
|
|
172
|
+
out = TinyNN.tnn_matmul(sess, @t_w_o, gated) # [d_model, T]
|
|
173
|
+
TinyNN.tnn_add(sess, t_x, out)
|
|
174
|
+
end
|
|
175
|
+
end
|
|
176
|
+
end; end; end
|
|
@@ -153,6 +153,17 @@ class GPT2KVFFICache
|
|
|
153
153
|
# Build the compute graph for one decode position. Returns the logits
|
|
154
154
|
# tensor handle. Caller calls tnn_compute then download_row_major.
|
|
155
155
|
def build_decode_step(pos)
|
|
156
|
+
# The per-head K/V buffers are sized for @max_T positions. Writing /
|
|
157
|
+
# reading slot `pos` requires pos < @max_T; at pos == @max_T the
|
|
158
|
+
# cpy-into-view and history views overrun the cache allocation and
|
|
159
|
+
# ggml aborts deep inside ggml_view_2d. Fail loud here with a
|
|
160
|
+
# toy-level message instead (see toy#99).
|
|
161
|
+
if pos >= @max_T
|
|
162
|
+
raise "GPT2KVFFICache: decode pos=" + pos.to_s +
|
|
163
|
+
" exceeds KV cache capacity max_T=" + @max_T.to_s +
|
|
164
|
+
" (size the cache >= prompt_len + n_generate via realize_for)"
|
|
165
|
+
end
|
|
166
|
+
|
|
156
167
|
eps = 1.0e-5
|
|
157
168
|
scale = 1.0 / Math.sqrt(@d_head.to_f)
|
|
158
169
|
d_model = @d_model
|
|
@@ -155,6 +155,17 @@ class GPT2KVFFICacheCuda
|
|
|
155
155
|
# Build the compute graph for one decode position. Returns the logits
|
|
156
156
|
# tensor handle. Caller calls tnn_compute then download_row_major.
|
|
157
157
|
def build_decode_step(pos)
|
|
158
|
+
# The per-head K/V buffers are sized for @max_T positions. Writing /
|
|
159
|
+
# reading slot `pos` requires pos < @max_T; at pos == @max_T the
|
|
160
|
+
# cpy-into-view and history views overrun the cache allocation and
|
|
161
|
+
# ggml aborts deep inside ggml_view_2d. Fail loud here with a
|
|
162
|
+
# toy-level message instead (see toy#99).
|
|
163
|
+
if pos >= @max_T
|
|
164
|
+
raise "GPT2KVFFICacheCuda: decode pos=" + pos.to_s +
|
|
165
|
+
" exceeds KV cache capacity max_T=" + @max_T.to_s +
|
|
166
|
+
" (size the cache >= prompt_len + n_generate via realize_for)"
|
|
167
|
+
end
|
|
168
|
+
|
|
158
169
|
eps = 1.0e-5
|
|
159
170
|
scale = 1.0 / Math.sqrt(@d_head.to_f)
|
|
160
171
|
d_model = @d_model
|
|
@@ -155,6 +155,17 @@ class GPT2KVFFICacheMetal
|
|
|
155
155
|
# Build the compute graph for one decode position. Returns the logits
|
|
156
156
|
# tensor handle. Caller calls tnn_compute then download_row_major.
|
|
157
157
|
def build_decode_step(pos)
|
|
158
|
+
# The per-head K/V buffers are sized for @max_T positions. Writing /
|
|
159
|
+
# reading slot `pos` requires pos < @max_T; at pos == @max_T the
|
|
160
|
+
# cpy-into-view and history views overrun the cache allocation and
|
|
161
|
+
# ggml aborts deep inside ggml_view_2d. Fail loud here with a
|
|
162
|
+
# toy-level message instead (see toy#99).
|
|
163
|
+
if pos >= @max_T
|
|
164
|
+
raise "GPT2KVFFICacheMetal: decode pos=" + pos.to_s +
|
|
165
|
+
" exceeds KV cache capacity max_T=" + @max_T.to_s +
|
|
166
|
+
" (size the cache >= prompt_len + n_generate via realize_for)"
|
|
167
|
+
end
|
|
168
|
+
|
|
158
169
|
eps = 1.0e-5
|
|
159
170
|
scale = 1.0 / Math.sqrt(@d_head.to_f)
|
|
160
171
|
d_model = @d_model
|
|
@@ -874,7 +874,7 @@ class SmolLM2KVFFICache
|
|
|
874
874
|
if is_native
|
|
875
875
|
wtype = GGUFLoad.detect_weight_type(gguf_path)
|
|
876
876
|
set_weight_type(wtype)
|
|
877
|
-
realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias)
|
|
877
|
+
realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias, flags.qk_norm)
|
|
878
878
|
puts " BYO-pointer mmap (weight_type=" + wtype.to_s + ")"
|
|
879
879
|
gguf
|
|
880
880
|
else
|
|
@@ -1584,7 +1584,15 @@ module SmolLM2KV
|
|
|
1584
1584
|
TinyNN.tnn_reset_for_rebuild(kv_cache.sess)
|
|
1585
1585
|
step = kv_cache.build_decode_step(pos)
|
|
1586
1586
|
TinyNN.tnn_realize(kv_cache.sess, step.kv_step_logits)
|
|
1587
|
-
|
|
1587
|
+
# Spinel landmine: in whole-program inference contexts where `token_id`
|
|
1588
|
+
# poly-collapses to sp_RbVal (e.g. the eval runner, where generate's
|
|
1589
|
+
# sampler-fed `last_id` unifies decode_step's param to RbVal), the literal
|
|
1590
|
+
# `[token_id]` compiles to a PolyArray. upload_int_array takes :int_array
|
|
1591
|
+
# (sp_IntArray), so the PolyArray is then mis-read as an IntArray → garbage
|
|
1592
|
+
# length → ggml "tensor write out of bounds" abort. Narrowing to a clean
|
|
1593
|
+
# mrb_int via `.to_i` forces the IntArray codegen (as `[pos]` already gets).
|
|
1594
|
+
tid = token_id.to_i
|
|
1595
|
+
TinyNN.upload_int_array(kv_cache.sess, step.t_token_id, [tid])
|
|
1588
1596
|
TinyNN.upload_int_array(kv_cache.sess, step.t_pos, [pos])
|
|
1589
1597
|
TinyNN.tnn_compute(kv_cache.sess)
|
|
1590
1598
|
kv_cache.dump_trace
|
|
@@ -862,7 +862,7 @@ class SmolLM2KVFFICacheCuda
|
|
|
862
862
|
if is_native
|
|
863
863
|
wtype = GGUFLoad.detect_weight_type(gguf_path)
|
|
864
864
|
set_weight_type(wtype)
|
|
865
|
-
realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias)
|
|
865
|
+
realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias, flags.qk_norm)
|
|
866
866
|
puts " BYO-pointer mmap (weight_type=" + wtype.to_s + ")"
|
|
867
867
|
gguf
|
|
868
868
|
else
|
|
@@ -1517,7 +1517,15 @@ module SmolLM2KVCuda
|
|
|
1517
1517
|
TinyNNCuda.tnn_reset_for_rebuild(kv_cache.sess)
|
|
1518
1518
|
step = kv_cache.build_decode_step(pos)
|
|
1519
1519
|
TinyNNCuda.tnn_realize(kv_cache.sess, step.kv_step_logits)
|
|
1520
|
-
|
|
1520
|
+
# Spinel landmine: in whole-program inference contexts where `token_id`
|
|
1521
|
+
# poly-collapses to sp_RbVal (e.g. the eval runner, where generate's
|
|
1522
|
+
# sampler-fed `last_id` unifies decode_step's param to RbVal), the literal
|
|
1523
|
+
# `[token_id]` compiles to a PolyArray. upload_int_array takes :int_array
|
|
1524
|
+
# (sp_IntArray), so the PolyArray is then mis-read as an IntArray → garbage
|
|
1525
|
+
# length → ggml "tensor write out of bounds" abort. Narrowing to a clean
|
|
1526
|
+
# mrb_int via `.to_i` forces the IntArray codegen (as `[pos]` already gets).
|
|
1527
|
+
tid = token_id.to_i
|
|
1528
|
+
TinyNNCuda.upload_int_array(kv_cache.sess, step.t_token_id, [tid])
|
|
1521
1529
|
TinyNNCuda.upload_int_array(kv_cache.sess, step.t_pos, [pos])
|
|
1522
1530
|
TinyNNCuda.tnn_compute(kv_cache.sess)
|
|
1523
1531
|
kv_cache.dump_trace
|
|
@@ -862,7 +862,7 @@ class SmolLM2KVFFICacheMetal
|
|
|
862
862
|
if is_native
|
|
863
863
|
wtype = GGUFLoad.detect_weight_type(gguf_path)
|
|
864
864
|
set_weight_type(wtype)
|
|
865
|
-
realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias)
|
|
865
|
+
realize_for_mmap(gguf, cfg, max_T, flags.untied, flags.qkv_bias, flags.qk_norm)
|
|
866
866
|
puts " BYO-pointer mmap (weight_type=" + wtype.to_s + ")"
|
|
867
867
|
gguf
|
|
868
868
|
else
|
|
@@ -1517,7 +1517,15 @@ module SmolLM2KVMetal
|
|
|
1517
1517
|
TinyNNMetal.tnn_reset_for_rebuild(kv_cache.sess)
|
|
1518
1518
|
step = kv_cache.build_decode_step(pos)
|
|
1519
1519
|
TinyNNMetal.tnn_realize(kv_cache.sess, step.kv_step_logits)
|
|
1520
|
-
|
|
1520
|
+
# Spinel landmine: in whole-program inference contexts where `token_id`
|
|
1521
|
+
# poly-collapses to sp_RbVal (e.g. the eval runner, where generate's
|
|
1522
|
+
# sampler-fed `last_id` unifies decode_step's param to RbVal), the literal
|
|
1523
|
+
# `[token_id]` compiles to a PolyArray. upload_int_array takes :int_array
|
|
1524
|
+
# (sp_IntArray), so the PolyArray is then mis-read as an IntArray → garbage
|
|
1525
|
+
# length → ggml "tensor write out of bounds" abort. Narrowing to a clean
|
|
1526
|
+
# mrb_int via `.to_i` forces the IntArray codegen (as `[pos]` already gets).
|
|
1527
|
+
tid = token_id.to_i
|
|
1528
|
+
TinyNNMetal.upload_int_array(kv_cache.sess, step.t_token_id, [tid])
|
|
1521
1529
|
TinyNNMetal.upload_int_array(kv_cache.sess, step.t_pos, [pos])
|
|
1522
1530
|
TinyNNMetal.tnn_compute(kv_cache.sess)
|
|
1523
1531
|
kv_cache.dump_trace
|
|
@@ -25,6 +25,9 @@ require_relative "../primitives/rope"
|
|
|
25
25
|
require_relative "../primitives/swiglu"
|
|
26
26
|
require_relative "../primitives/gqa"
|
|
27
27
|
require_relative "../blocks/transformer_block"
|
|
28
|
+
require_relative "../primitives/gdn"
|
|
29
|
+
require_relative "../blocks/gdn_block"
|
|
30
|
+
require_relative "../archs/layer_spec"
|
|
28
31
|
require_relative "../archs/llama_arch"
|
|
29
32
|
|
|
30
33
|
module Toy; module LLM; module Engine
|
|
@@ -1007,6 +1010,14 @@ class LlamaSeqEngine
|
|
|
1007
1010
|
@t_seq_attn_mask = TinyNN.tnn_input_2d_f32_persistent(@sess, tb_alloc, tb_alloc)
|
|
1008
1011
|
end
|
|
1009
1012
|
|
|
1013
|
+
# #1449: pre-create the token-id index leaf in ctx_w (galloc-external) before
|
|
1014
|
+
# finalize, so galloc can't free its compute-arena slot and reuse it for the
|
|
1015
|
+
# loss output (-> backward get_rows reads loss bits -> wild index -> OOB
|
|
1016
|
+
# abort, layout-flaky on the C backend). build_forward reuses this handle;
|
|
1017
|
+
# it's re-uploaded each step. Positions stay in the compute ctx (the loss
|
|
1018
|
+
# aliases off=0 = token_ids, not positions). See tnn_input_1d_i32_persistent.
|
|
1019
|
+
@t_seq_token_ids = TinyNN.tnn_input_1d_i32_persistent(@sess, @seq_t * @seq_b)
|
|
1020
|
+
|
|
1010
1021
|
TinyNN.tnn_finalize_weights(@sess)
|
|
1011
1022
|
|
|
1012
1023
|
# Upload llama3-style RoPE freq_factors once the backend buffer
|
|
@@ -1056,7 +1067,11 @@ class LlamaSeqEngine
|
|
|
1056
1067
|
# that ordering); RoPE applies per-batch positional encoding
|
|
1057
1068
|
# because rope_ext reads positions[k] for each ne[2] slot.
|
|
1058
1069
|
tb = @seq_t * @seq_b
|
|
1059
|
-
|
|
1070
|
+
# #1449: token_ids is pre-created persistent (ctx_w) in finalize and survives
|
|
1071
|
+
# reset_for_rebuild — reuse it. Only allocate here as a fallback for any path
|
|
1072
|
+
# that builds the forward without the finalize pre-creation (then it lands in
|
|
1073
|
+
# the compute ctx, the legacy behaviour).
|
|
1074
|
+
@t_seq_token_ids = TinyNN.tnn_input_1d_i32(@sess, tb) if @t_seq_token_ids == TinyNN.tnn_null_ptr
|
|
1060
1075
|
@t_seq_positions = TinyNN.tnn_input_1d_i32_ctx(@sess, tb)
|
|
1061
1076
|
|
|
1062
1077
|
# The arch reads seq_rope_cfg / seq_donor_d_in off itself; the cache
|
|
@@ -30,6 +30,9 @@ require_relative "../primitives/rope_cuda"
|
|
|
30
30
|
require_relative "../primitives/swiglu_cuda"
|
|
31
31
|
require_relative "../primitives/gqa_cuda"
|
|
32
32
|
require_relative "../blocks/transformer_block_cuda"
|
|
33
|
+
require_relative "../primitives/gdn"
|
|
34
|
+
require_relative "../blocks/gdn_block"
|
|
35
|
+
require_relative "../archs/layer_spec"
|
|
33
36
|
require_relative "../archs/llama_arch_cuda"
|
|
34
37
|
|
|
35
38
|
module Toy; module LLM; module Engine
|
|
@@ -1012,6 +1015,14 @@ class LlamaSeqEngineCuda
|
|
|
1012
1015
|
@t_seq_attn_mask = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, tb_alloc, tb_alloc)
|
|
1013
1016
|
end
|
|
1014
1017
|
|
|
1018
|
+
# #1449: pre-create the token-id index leaf in ctx_w (galloc-external) before
|
|
1019
|
+
# finalize, so galloc can't free its compute-arena slot and reuse it for the
|
|
1020
|
+
# loss output (-> backward get_rows reads loss bits -> wild index -> OOB
|
|
1021
|
+
# abort, layout-flaky on the C backend). build_forward reuses this handle;
|
|
1022
|
+
# it's re-uploaded each step. Positions stay in the compute ctx (the loss
|
|
1023
|
+
# aliases offset 0, i.e. token_ids, not positions). See tnn_input_1d_i32_persistent.
|
|
1024
|
+
@t_seq_token_ids = TinyNNCuda.tnn_input_1d_i32_persistent(@sess, @seq_t * @seq_b)
|
|
1025
|
+
|
|
1015
1026
|
TinyNNCuda.tnn_finalize_weights(@sess)
|
|
1016
1027
|
|
|
1017
1028
|
# Upload llama3-style RoPE freq_factors once the backend buffer
|
|
@@ -1061,7 +1072,11 @@ class LlamaSeqEngineCuda
|
|
|
1061
1072
|
# that ordering); RoPE applies per-batch positional encoding
|
|
1062
1073
|
# because rope_ext reads positions[k] for each ne[2] slot.
|
|
1063
1074
|
tb = @seq_t * @seq_b
|
|
1064
|
-
|
|
1075
|
+
# #1449: token_ids is pre-created persistent (ctx_w) in finalize and survives
|
|
1076
|
+
# reset_for_rebuild — reuse it. Only allocate here as a fallback for any path
|
|
1077
|
+
# that builds the forward without the finalize pre-creation (then it lands in
|
|
1078
|
+
# the compute ctx, the legacy behaviour).
|
|
1079
|
+
@t_seq_token_ids = TinyNNCuda.tnn_input_1d_i32(@sess, tb) if @t_seq_token_ids == TinyNNCuda.tnn_null_ptr
|
|
1065
1080
|
@t_seq_positions = TinyNNCuda.tnn_input_1d_i32_ctx(@sess, tb)
|
|
1066
1081
|
|
|
1067
1082
|
# The arch reads seq_rope_cfg / seq_donor_d_in off itself; the cache
|
|
@@ -30,6 +30,9 @@ require_relative "../primitives/rope_metal"
|
|
|
30
30
|
require_relative "../primitives/swiglu_metal"
|
|
31
31
|
require_relative "../primitives/gqa_metal"
|
|
32
32
|
require_relative "../blocks/transformer_block_metal"
|
|
33
|
+
require_relative "../primitives/gdn"
|
|
34
|
+
require_relative "../blocks/gdn_block"
|
|
35
|
+
require_relative "../archs/layer_spec"
|
|
33
36
|
require_relative "../archs/llama_arch_metal"
|
|
34
37
|
|
|
35
38
|
module Toy; module LLM; module Engine
|
|
@@ -1012,6 +1015,14 @@ class LlamaSeqEngineMetal
|
|
|
1012
1015
|
@t_seq_attn_mask = TinyNNMetal.tnn_input_2d_f32_persistent(@sess, tb_alloc, tb_alloc)
|
|
1013
1016
|
end
|
|
1014
1017
|
|
|
1018
|
+
# #1449: pre-create the token-id index leaf in ctx_w (galloc-external) before
|
|
1019
|
+
# finalize, so galloc can't free its compute-arena slot and reuse it for the
|
|
1020
|
+
# loss output (-> backward get_rows reads loss bits -> wild index -> OOB
|
|
1021
|
+
# abort, layout-flaky on the C backend). build_forward reuses this handle;
|
|
1022
|
+
# it's re-uploaded each step. Positions stay in the compute ctx (the loss
|
|
1023
|
+
# aliases offset 0, i.e. token_ids, not positions). See tnn_input_1d_i32_persistent.
|
|
1024
|
+
@t_seq_token_ids = TinyNNMetal.tnn_input_1d_i32_persistent(@sess, @seq_t * @seq_b)
|
|
1025
|
+
|
|
1015
1026
|
TinyNNMetal.tnn_finalize_weights(@sess)
|
|
1016
1027
|
|
|
1017
1028
|
# Upload llama3-style RoPE freq_factors once the backend buffer
|
|
@@ -1061,7 +1072,11 @@ class LlamaSeqEngineMetal
|
|
|
1061
1072
|
# that ordering); RoPE applies per-batch positional encoding
|
|
1062
1073
|
# because rope_ext reads positions[k] for each ne[2] slot.
|
|
1063
1074
|
tb = @seq_t * @seq_b
|
|
1064
|
-
|
|
1075
|
+
# #1449: token_ids is pre-created persistent (ctx_w) in finalize and survives
|
|
1076
|
+
# reset_for_rebuild — reuse it. Only allocate here as a fallback for any path
|
|
1077
|
+
# that builds the forward without the finalize pre-creation (then it lands in
|
|
1078
|
+
# the compute ctx, the legacy behaviour).
|
|
1079
|
+
@t_seq_token_ids = TinyNNMetal.tnn_input_1d_i32(@sess, tb) if @t_seq_token_ids == TinyNNMetal.tnn_null_ptr
|
|
1065
1080
|
@t_seq_positions = TinyNNMetal.tnn_input_1d_i32_ctx(@sess, tb)
|
|
1066
1081
|
|
|
1067
1082
|
# The arch reads seq_rope_cfg / seq_donor_d_in off itself; the cache
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# lib/toy/llm/primitives/depth_scale.rb — L1 primitive: depth-dependent
|
|
2
|
+
# LayerNorm scaling (Sun et al., "The Curse of Depth", arXiv 2502.05795).
|
|
3
|
+
#
|
|
4
|
+
# Pure module: `self.` methods only. The BLOCK (L2) owns the norm and computes
|
|
5
|
+
# the per-layer constant; this primitive applies the parameter-free 1/sqrt(ell)
|
|
6
|
+
# scaling to a normalised sublayer INPUT. See README.md.
|
|
7
|
+
#
|
|
8
|
+
# Formula: h(ell) = LayerNorm(h_ell) * (1/sqrt(ell)), applied to BOTH the
|
|
9
|
+
# attention and FFN pre-norm outputs (the input fed to the sublayer), 1-indexed
|
|
10
|
+
# ell. Caps Pre-LN variance growth with depth so deep layers stay effective.
|
|
11
|
+
# Hyperparameter-free, no learned weights.
|
|
12
|
+
#
|
|
13
|
+
# Spinel hygiene: no Cfg / no default args. One FFI passthrough. inv_sqrt_depth
|
|
14
|
+
# is the block's precomputed Float (1/sqrt(layer_index), computed in the CRuby
|
|
15
|
+
# layer — no libm in the Spinel runner). Call via the full module path.
|
|
16
|
+
|
|
17
|
+
module Toy
|
|
18
|
+
module LLM
|
|
19
|
+
module Primitives
|
|
20
|
+
module DepthScale
|
|
21
|
+
NAME = :depth_scale
|
|
22
|
+
|
|
23
|
+
# Scale a normalised tensor by the depth constant 1/sqrt(ell).
|
|
24
|
+
# x is the block's RMSNorm output (the sublayer input); the block
|
|
25
|
+
# passes inv_sqrt_depth = 1.0/Math.sqrt(layer) (layer is 1-indexed) as a Float constant.
|
|
26
|
+
# Returns the depth-scaled handle.
|
|
27
|
+
def self.apply(sess, x, inv_sqrt_depth)
|
|
28
|
+
TinyNN.tnn_scale(sess, x, inv_sqrt_depth)
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|