toy 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -0
- data/Makefile +211 -5
- data/README.md +1 -1
- data/lib/toy/compute.rb +9 -0
- data/lib/toy/compute_cuda.rb +8 -0
- data/lib/toy/compute_metal.rb +17 -0
- data/lib/toy/core/cli/new.rb +8 -0
- data/lib/toy/ffi/tinynn.rb +19 -0
- data/lib/toy/ffi/tinynn_cuda.rb +7 -0
- data/lib/toy/ffi/tinynn_metal.rb +5 -0
- data/lib/toy/llm/archs/layer_spec.rb +39 -0
- data/lib/toy/llm/archs/llama_arch.rb +62 -1
- data/lib/toy/llm/archs/llama_arch_cuda.rb +62 -1
- data/lib/toy/llm/archs/llama_arch_metal.rb +62 -1
- data/lib/toy/llm/blocks/gdn_block.rb +176 -0
- data/lib/toy/llm/engine/gpt2_kv_engine.rb +11 -0
- data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +11 -0
- data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +11 -0
- data/lib/toy/llm/engine/llama_kv_engine.rb +10 -2
- data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +10 -2
- data/lib/toy/llm/engine/llama_kv_engine_metal.rb +10 -2
- data/lib/toy/llm/engine/llama_seq_engine.rb +16 -1
- data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +16 -1
- data/lib/toy/llm/engine/llama_seq_engine_metal.rb +16 -1
- data/lib/toy/llm/primitives/depth_scale.rb +33 -0
- data/lib/toy/llm/primitives/diff_attention.rb +71 -0
- data/lib/toy/llm/primitives/gdn.rb +188 -0
- data/lib/toy/llm/primitives/scalable_softmax.rb +37 -0
- data/lib/toy/run/eval_metal.rb +12 -0
- data/lib/toy/run/infer_metal.rb +19 -0
- data/lib/toy/run/train_gpt2_metal.rb +7 -0
- data/lib/toy/run/train_hybrid.rb +232 -0
- data/lib/toy/run/train_metal.rb +10 -0
- data/lib/toy/version.rb +4 -3
- data/tinynn/tinynn_backend_cuda.c +22 -0
- data/tinynn/tinynn_ggml.c +231 -0
- metadata +9 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e6344fb33638dcdc959b0e713aa08081d9082eecec81c095c55b633828d5f3e8
|
|
4
|
+
data.tar.gz: 0bfac0f0a5f6025cae9146f877b3d565e94a4ab0f8b69ecb60144ed4f9dab8e1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4dc0eb7b7a049022bd5c86a4853fcc52ff1e803bd0d574cddd8ceb01742eff18d02d8703c8cdcff9a044c3d81c2bf4d1a60d42ef70680a944e86a908b853e91f
|
|
7
|
+
data.tar.gz: facf2141aebb4c5384eb25d1ea3c4fcdb00725184fdc0b176e23799a871f7f485a8564ebfa38bd2ba47f3832723476e568fd40e0ffa5057c6bda242dcbc09483
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,36 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## v0.9.0 — 2026-06-22
|
|
4
|
+
|
|
5
|
+
**The Dragon / Gated-DeltaNet trainable hybrid arc.** toy grows a second block
|
|
6
|
+
type and the seam to stack it heterogeneously with attention — built phase by
|
|
7
|
+
phase, each independently gated.
|
|
8
|
+
|
|
9
|
+
- **Trainable GDN (Path B)**: the gated delta rule expressed as an *unrolled
|
|
10
|
+
autograd composition* (`GDN.recur_unrolled`) of ops that each have a ggml
|
|
11
|
+
backward — so a Gated-DeltaNet layer trains with **no hand-written kernel
|
|
12
|
+
backward** (ggml has none for `GATED_DELTA_NET`); the fused kernel is kept for
|
|
13
|
+
inference. Gated by forward-parity (`recur_unrolled` == fused kernel to 1e-6,
|
|
14
|
+
incl. multi-head) + a differentiability proof.
|
|
15
|
+
- **L1 Dragon primitives**: `gdn` (l2/decay-gate/update-gate/recur/gated-out),
|
|
16
|
+
`diff_attention`, `scalable_softmax`, `depth_scale`; 8 elementwise ggml ops +
|
|
17
|
+
`tnn_gated_delta_net`/`tnn_conv_1d` wired (CPU-only this arc).
|
|
18
|
+
- **Per-layer `LayerSpec` seam**: a flat-int `seq_layer_kinds` dispatch (one arch
|
|
19
|
+
loop, monomorphic per-kind block call) — byte-exact on homogeneous Llama
|
|
20
|
+
(from-scratch / warm-start / lora unchanged).
|
|
21
|
+
- **`GDNBlock`** (L2) + **`libexec/toy-train-hybrid`**: a self-contained
|
|
22
|
+
from-scratch **attention+GDN hybrid** trains (CE loss decreases). Folding it
|
|
23
|
+
into the shared `toy train` engine is deferred behind a union-pin Spinel
|
|
24
|
+
codegen block — re-apply protocol in `docs/roadmap/gdn-hybrid-engine-reintegration.md`.
|
|
25
|
+
- **Fixes**: `#1449` whole-program training abort (backward `get_rows` index OOB)
|
|
26
|
+
root-caused as a latent ggml-alloc liveness bug and fixed toy-side
|
|
27
|
+
(`tnn_input_1d_i32_persistent`, a galloc-external token-id index) — *not* a
|
|
28
|
+
spinel codegen bug (matz closed it resolved); CUDA/Metal training restored by
|
|
29
|
+
mirroring that FFI decl into the CUDA/Metal siblings. New backward-friendly
|
|
30
|
+
shims `tnn_sqrt`/`tnn_div`/`tnn_repeat`.
|
|
31
|
+
- **Performance**: CPU inference ~+27% tok/s and LoRA steady-state ~−24% vs the
|
|
32
|
+
v0.8.0-era baselines (heavy CUDA bench stable).
|
|
33
|
+
|
|
3
34
|
## v0.8.0 — 2026-06-12
|
|
4
35
|
|
|
5
36
|
**The first published version** (RubyGems, gem name graciously transferred
|
data/Makefile
CHANGED
|
@@ -59,7 +59,12 @@ endif
|
|
|
59
59
|
# .a in tinynn/ combined with newer Spinel C codegen can produce
|
|
60
60
|
# misaligned binaries that segfault at init (Tao hit this 2026-05-26
|
|
61
61
|
# after pulling Spinel 2183a92 — the lib archives weren't rebuilt).
|
|
62
|
-
|
|
62
|
+
# Track the compiler BINARY: post the Ruby→C rewrite there is no
|
|
63
|
+
# spinel_analyze/spinel_codegen at the checkout root (the Ruby backend
|
|
64
|
+
# moved to legacy/, oracle-only), just the single `spinel` binary —
|
|
65
|
+
# the right rebuild trigger, present on both the legacy and C layouts
|
|
66
|
+
# (verified byte-exact green on the union pin; toy#101 Part 1).
|
|
67
|
+
SPINEL_DEPS := $(SPINEL_BIN)
|
|
63
68
|
|
|
64
69
|
CC ?= cc
|
|
65
70
|
CFLAGS ?= -O2 -fPIC -Wall -Wextra
|
|
@@ -355,6 +360,16 @@ endif
|
|
|
355
360
|
$(SPINEL) --cc='cc -Wl,-u,_tnn_metal_force_link -framework Foundation -framework Metal -framework MetalKit' $< -o $@
|
|
356
361
|
toy-eval-metal: libexec/toy-eval-metal
|
|
357
362
|
|
|
363
|
+
# Convenience: run both functional gates on the pure CPU path (no parity arm).
|
|
364
|
+
# These are the byte-exact infer/eval baselines. Until this target existed the
|
|
365
|
+
# CPU eval gate only ran behind gate-cuda's TOY_GATE_CUDA=1, so a CPU-only eval
|
|
366
|
+
# regression could reach main unnoticed — and did once (the decode_step
|
|
367
|
+
# PolyArray OOB, #104/#105). Self-builds the runners via bin/toy.
|
|
368
|
+
.PHONY: gate-cpu
|
|
369
|
+
gate-cpu:
|
|
370
|
+
ruby prep/infer_gate.rb
|
|
371
|
+
ruby prep/eval_gate.rb
|
|
372
|
+
|
|
358
373
|
# Convenience: run both functional gates with the CUDA parity arm enabled.
|
|
359
374
|
.PHONY: gate-cuda
|
|
360
375
|
gate-cuda:
|
|
@@ -449,8 +464,12 @@ gate-run-log:
|
|
|
449
464
|
# turns the skip into a failure): MRI+Fiddle reproduces the recorded
|
|
450
465
|
# Spinel from-scratch gate curve BIT-EXACT (train_baseline.txt) and the
|
|
451
466
|
# smollm2-135m greedy decode ids byte-equal infer_baseline.txt.
|
|
467
|
+
# Prereq on the shared .so so a NEW FFI symbol (e.g. the #1449
|
|
468
|
+
# tnn_input_1d_i32_persistent) can't leave a STALE .so behind that
|
|
469
|
+
# fails the native leg with a missing-symbol NativeCallError — make
|
|
470
|
+
# rebuilds it from the .o's automatically.
|
|
452
471
|
.PHONY: gate-mri
|
|
453
|
-
gate-mri:
|
|
472
|
+
gate-mri: tinynn/libtinynn_ggml_shared.so
|
|
454
473
|
ruby prep/mri_gate.rb
|
|
455
474
|
|
|
456
475
|
# toy#60 item 4 — the COLD-START consumer gate: `toy new` scaffold →
|
|
@@ -486,6 +505,34 @@ gate-compute-surface-cuda: prep/smokes/smoke_compute_surface_cuda
|
|
|
486
505
|
&& echo "GATE PASS [compute-surface-cuda]: lib/toy/compute_cuda.rb device entry is live" \
|
|
487
506
|
|| { echo "GATE FAIL [compute-surface-cuda]"; exit 1; }
|
|
488
507
|
|
|
508
|
+
# Projection-lens gate: train through W_proj only (token_embd frozen) and
|
|
509
|
+
# assert the loss drops (the smoke's own "is learning" verdict). The CPU
|
|
510
|
+
# smoke was an ungated diagnostic; this wires it into the gate surface.
|
|
511
|
+
.PHONY: gate-projection-lens
|
|
512
|
+
gate-projection-lens: prep/smokes/smoke_projection_lens
|
|
513
|
+
@out="$$(STEPS=20 ./prep/smokes/smoke_projection_lens 2>&1)"; \
|
|
514
|
+
echo "$$out" | tail -2; \
|
|
515
|
+
echo "$$out" | grep -q "projection-lens training is learning" \
|
|
516
|
+
&& echo "GATE PASS [projection-lens]: W_proj-only training learns (token_embd frozen)" \
|
|
517
|
+
|| { echo "GATE FAIL [projection-lens]"; exit 1; }
|
|
518
|
+
|
|
519
|
+
# Metal twin of the projection-lens gate. The _metal smoke is an auto-
|
|
520
|
+
# generated mirror (MIRROR_METAL) that previously built but was reachable
|
|
521
|
+
# from no gate; this de-orphans it. macOS-only, skips green off Darwin
|
|
522
|
+
# exactly like gate-metal.
|
|
523
|
+
.PHONY: gate-projection-lens-metal
|
|
524
|
+
gate-projection-lens-metal:
|
|
525
|
+
ifneq ($(UNAME_S),Darwin)
|
|
526
|
+
@echo "gate-projection-lens-metal: Metal is macOS-only (uname -s = $(UNAME_S)) — skipping"; exit 0
|
|
527
|
+
else
|
|
528
|
+
$(MAKE) prep/smokes/smoke_projection_lens_metal
|
|
529
|
+
@out="$$(STEPS=20 ./prep/smokes/smoke_projection_lens_metal 2>&1)"; \
|
|
530
|
+
echo "$$out" | tail -2; \
|
|
531
|
+
echo "$$out" | grep -q "projection-lens training is learning" \
|
|
532
|
+
&& echo "GATE PASS [projection-lens-metal]: W_proj-only training learns on Metal" \
|
|
533
|
+
|| { echo "GATE FAIL [projection-lens-metal]"; exit 1; }
|
|
534
|
+
endif
|
|
535
|
+
|
|
489
536
|
# K-quant MoE attention regression gate (the bug long misfiled as ggml#1506):
|
|
490
537
|
# head_nbytes returned 0 for K-quant attention weights → per-head mmap stride
|
|
491
538
|
# collapsed every head onto head 0 → degenerate repeating decode on OLMoE
|
|
@@ -564,7 +611,8 @@ libexec/toy-train: lib/toy/run/train.rb lib/toy/dev/toy_describe_flow.rb lib/toy
|
|
|
564
611
|
lib/toy/train/toy_gguf_writer.rb lib/toy/train/toy_drift_grad.rb lib/toy/models/transformer.rb \
|
|
565
612
|
lib/toy/llm/primitives/rms_norm.rb lib/toy/llm/primitives/rope.rb \
|
|
566
613
|
lib/toy/llm/primitives/swiglu.rb lib/toy/llm/primitives/gqa.rb \
|
|
567
|
-
lib/toy/llm/blocks/transformer_block.rb lib/toy/llm/
|
|
614
|
+
lib/toy/llm/blocks/transformer_block.rb lib/toy/llm/primitives/gdn.rb lib/toy/llm/blocks/gdn_block.rb \
|
|
615
|
+
lib/toy/llm/archs/layer_spec.rb lib/toy/llm/archs/llama_arch.rb \
|
|
568
616
|
lib/toy/ffi/tinynn.rb tinynn/libtinynn_ggml.a | libexec
|
|
569
617
|
$(SPINEL) $< -o $@
|
|
570
618
|
toy-train: libexec/toy-train
|
|
@@ -579,7 +627,8 @@ libexec/toy-train-lora: lib/toy/run/train_lora.rb lib/toy/dev/toy_describe_flow.
|
|
|
579
627
|
lib/toy/train/toy_gguf_writer.rb lib/toy/train/toy_drift_grad.rb lib/toy/models/transformer.rb \
|
|
580
628
|
lib/toy/llm/primitives/rms_norm.rb lib/toy/llm/primitives/rope.rb \
|
|
581
629
|
lib/toy/llm/primitives/swiglu.rb lib/toy/llm/primitives/gqa.rb \
|
|
582
|
-
lib/toy/llm/blocks/transformer_block.rb lib/toy/llm/
|
|
630
|
+
lib/toy/llm/blocks/transformer_block.rb lib/toy/llm/primitives/gdn.rb lib/toy/llm/blocks/gdn_block.rb \
|
|
631
|
+
lib/toy/llm/archs/layer_spec.rb lib/toy/llm/archs/llama_arch.rb \
|
|
583
632
|
lib/toy/ffi/tinynn.rb tinynn/libtinynn_ggml.a | libexec
|
|
584
633
|
$(SPINEL) $< -o $@
|
|
585
634
|
toy-train-lora: libexec/toy-train-lora
|
|
@@ -756,6 +805,127 @@ prep/smokes/smoke_projection_lens: prep/smokes/smoke_projection_lens.rb lib/toy/
|
|
|
756
805
|
prep/smokes/smoke_compute_surface: prep/smokes/smoke_compute_surface.rb lib/toy/compute.rb lib/toy/llm/training_batch.rb lib/toy/llm/recipe_options.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
|
|
757
806
|
$(SPINEL) $< -o $@
|
|
758
807
|
|
|
808
|
+
# Dragon/GDN Phase 1 (docs/roadmap/dragon-gdn-arch-2026-06-20.md): prove the
|
|
809
|
+
# newly-wired tnn_gated_delta_net + tnn_conv_1d FFI ops compute through toy's
|
|
810
|
+
# stack on the in-tree ggml. Forward-only shape gate (the recurrence runs and
|
|
811
|
+
# emits the documented output shape).
|
|
812
|
+
.PHONY: gate-gdn-forward
|
|
813
|
+
gate-gdn-forward: prep/smokes/smoke_gdn_forward
|
|
814
|
+
@out="$$(./prep/smokes/smoke_gdn_forward 2>&1)"; \
|
|
815
|
+
echo "$$out" | tail -2; \
|
|
816
|
+
echo "$$out" | grep -q "GDN smoke PASS" \
|
|
817
|
+
&& echo "GATE PASS [gdn-forward]: tnn_gated_delta_net computes through the FFI" \
|
|
818
|
+
|| { echo "GATE FAIL [gdn-forward]"; exit 1; }
|
|
819
|
+
|
|
820
|
+
prep/smokes/smoke_gdn_forward: prep/smokes/smoke_gdn_forward.rb lib/toy.rb lib/toy/ffi/tinynn.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
|
|
821
|
+
$(SPINEL) $< -o $@
|
|
822
|
+
|
|
823
|
+
# Dragon/GDN Phase 2: the Toy::LLM::Primitives::GDN L1 composition (l2-norm,
|
|
824
|
+
# log-decay + sigmoid gates, recurrence, gated output norm). The gate+l2+recur
|
|
825
|
+
# chain is computed end-to-end; gated_out is shape-checked.
|
|
826
|
+
.PHONY: gate-gdn-primitive
|
|
827
|
+
gate-gdn-primitive: prep/smokes/smoke_gdn_primitive
|
|
828
|
+
@out="$$(./prep/smokes/smoke_gdn_primitive 2>&1)"; \
|
|
829
|
+
echo "$$out" | tail -2; \
|
|
830
|
+
echo "$$out" | grep -q "GDN primitive smoke PASS" \
|
|
831
|
+
&& echo "GATE PASS [gdn-primitive]: Toy::LLM::Primitives::GDN composes + computes" \
|
|
832
|
+
|| { echo "GATE FAIL [gdn-primitive]"; exit 1; }
|
|
833
|
+
|
|
834
|
+
prep/smokes/smoke_gdn_primitive: prep/smokes/smoke_gdn_primitive.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/gdn.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
|
|
835
|
+
$(SPINEL) $< -o $@
|
|
836
|
+
|
|
837
|
+
# Dragon/GDN Phase 2: the Dragon attention-side L1 primitives (DiffAttention,
|
|
838
|
+
# ScalableSoftmax, DepthScale).
|
|
839
|
+
.PHONY: gate-dragon-attn-prims
|
|
840
|
+
gate-dragon-attn-prims: prep/smokes/smoke_dragon_attn_prims
|
|
841
|
+
@out="$$(./prep/smokes/smoke_dragon_attn_prims 2>&1)"; \
|
|
842
|
+
echo "$$out" | tail -2; \
|
|
843
|
+
echo "$$out" | grep -q "Dragon attn prims smoke PASS" \
|
|
844
|
+
&& echo "GATE PASS [dragon-attn-prims]: diff-attn / ssmax / depth-scale compose" \
|
|
845
|
+
|| { echo "GATE FAIL [dragon-attn-prims]"; exit 1; }
|
|
846
|
+
|
|
847
|
+
prep/smokes/smoke_dragon_attn_prims: prep/smokes/smoke_dragon_attn_prims.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/diff_attention.rb lib/toy/llm/primitives/scalable_softmax.rb lib/toy/llm/primitives/depth_scale.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
|
|
848
|
+
$(SPINEL) $< -o $@
|
|
849
|
+
|
|
850
|
+
# Dragon/GDN Phase 4 (Path B): numeric-parity gate — the UNROLLED,
|
|
851
|
+
# autograd-differentiable recurrence (GDN.recur_unrolled) reproduces the FUSED
|
|
852
|
+
# tnn_gated_delta_net token outputs within eps. This is what lets training use
|
|
853
|
+
# the composition (every op has a ggml backward) while inference keeps the fused
|
|
854
|
+
# kernel. See docs/roadmap/dragon-gdn-arch-2026-06-20.md (Phase 4).
|
|
855
|
+
.PHONY: gate-gdn-unrolled-parity
|
|
856
|
+
gate-gdn-unrolled-parity: prep/smokes/smoke_gdn_unrolled_parity
|
|
857
|
+
@out="$$(./prep/smokes/smoke_gdn_unrolled_parity 2>&1)"; \
|
|
858
|
+
echo "$$out" | tail -2; \
|
|
859
|
+
echo "$$out" | grep -q "GDN unrolled-parity smoke PASS" \
|
|
860
|
+
&& echo "GATE PASS [gdn-unrolled-parity]: recur_unrolled == fused kernel (eps)" \
|
|
861
|
+
|| { echo "GATE FAIL [gdn-unrolled-parity]"; exit 1; }
|
|
862
|
+
|
|
863
|
+
prep/smokes/smoke_gdn_unrolled_parity: prep/smokes/smoke_gdn_unrolled_parity.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/gdn.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
|
|
864
|
+
$(SPINEL) $< -o $@
|
|
865
|
+
|
|
866
|
+
# Dragon/GDN Phase 4 (Path B): the differentiability proof — ggml builds + runs
|
|
867
|
+
# a backward graph through recur_unrolled and yields finite non-zero dL/dq,k,v
|
|
868
|
+
# with NO hand-written fused-kernel backward. This is what makes GDN trainable.
|
|
869
|
+
.PHONY: gate-gdn-unrolled-backward
|
|
870
|
+
gate-gdn-unrolled-backward: prep/smokes/smoke_gdn_unrolled_backward
|
|
871
|
+
@out="$$(./prep/smokes/smoke_gdn_unrolled_backward 2>&1)"; \
|
|
872
|
+
echo "$$out" | tail -2; \
|
|
873
|
+
echo "$$out" | grep -q "GDN unrolled-backward smoke PASS" \
|
|
874
|
+
&& echo "GATE PASS [gdn-unrolled-backward]: recur_unrolled is differentiable" \
|
|
875
|
+
|| { echo "GATE FAIL [gdn-unrolled-backward]"; exit 1; }
|
|
876
|
+
|
|
877
|
+
prep/smokes/smoke_gdn_unrolled_backward: prep/smokes/smoke_gdn_unrolled_backward.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/gdn.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
|
|
878
|
+
$(SPINEL) $< -o $@
|
|
879
|
+
|
|
880
|
+
# Dragon/GDN Phase 5: multi-head parity — the per-head recur_unrolled looped over
|
|
881
|
+
# H heads + concat'd matches the fused kernel's head packing (strided slicing).
|
|
882
|
+
.PHONY: gate-gdn-unrolled-parity-mh
|
|
883
|
+
gate-gdn-unrolled-parity-mh: prep/smokes/smoke_gdn_unrolled_parity_mh
|
|
884
|
+
@out="$$(./prep/smokes/smoke_gdn_unrolled_parity_mh 2>&1)"; \
|
|
885
|
+
echo "$$out" | tail -2; \
|
|
886
|
+
echo "$$out" | grep -q "GDN unrolled-parity-mh smoke PASS" \
|
|
887
|
+
&& echo "GATE PASS [gdn-unrolled-parity-mh]: H-head recur_unrolled == fused kernel" \
|
|
888
|
+
|| { echo "GATE FAIL [gdn-unrolled-parity-mh]"; exit 1; }
|
|
889
|
+
|
|
890
|
+
prep/smokes/smoke_gdn_unrolled_parity_mh: prep/smokes/smoke_gdn_unrolled_parity_mh.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/gdn.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
|
|
891
|
+
$(SPINEL) $< -o $@
|
|
892
|
+
|
|
893
|
+
# Dragon/GDN Phase 5 capstone: a SELF-CONTAINED from-scratch HYBRID runner (one
|
|
894
|
+
# attention layer + one GDN layer, dispatched by the int-kind seam pattern) in
|
|
895
|
+
# its OWN compilation unit — CE loss decreases. Proves a heterogeneous
|
|
896
|
+
# attention+GDN stack trains from scratch. Separate unit so it can't corrupt the
|
|
897
|
+
# byte-exact llama engine (landmine #16). Reintegration into `toy train` waits on
|
|
898
|
+
# the union-pin Spinel codegen fix (master/spinelc).
|
|
899
|
+
.PHONY: gate-gdn-hybrid
|
|
900
|
+
gate-gdn-hybrid: libexec/toy-train-hybrid
|
|
901
|
+
@out="$$(./libexec/toy-train-hybrid 2>&1)"; \
|
|
902
|
+
echo "$$out" | tail -3; \
|
|
903
|
+
echo "$$out" | grep -q "HYBRID train smoke PASS" \
|
|
904
|
+
&& echo "GATE PASS [gdn-hybrid]: attention+GDN from-scratch hybrid trains" \
|
|
905
|
+
|| { echo "GATE FAIL [gdn-hybrid]"; exit 1; }
|
|
906
|
+
|
|
907
|
+
libexec/toy-train-hybrid: lib/toy/run/train_hybrid.rb lib/toy.rb lib/toy/ffi/tinynn.rb \
|
|
908
|
+
lib/toy/llm/primitives/rms_norm.rb lib/toy/llm/primitives/gdn.rb \
|
|
909
|
+
lib/toy/llm/blocks/gdn_block.rb lib/toy/llm/archs/layer_spec.rb \
|
|
910
|
+
tinynn/libtinynn_ggml.a $(SPINEL_DEPS) | libexec
|
|
911
|
+
$(SPINEL) $< -o $@
|
|
912
|
+
.PHONY: toy-train-hybrid
|
|
913
|
+
toy-train-hybrid: libexec/toy-train-hybrid
|
|
914
|
+
|
|
915
|
+
# Dragon/GDN Phase 5 (end-of-flow): a from-scratch model whose mixer is a
|
|
916
|
+
# trainable GDNBlock trains — CE loss decreases. Proves the GDN layer is an
|
|
917
|
+
# end-to-end trainable residual unit (no hand-written kernel backward).
|
|
918
|
+
.PHONY: gate-gdn-train
|
|
919
|
+
gate-gdn-train: prep/smokes/smoke_gdn_train
|
|
920
|
+
@out="$$(./prep/smokes/smoke_gdn_train 2>&1)"; \
|
|
921
|
+
echo "$$out" | tail -3; \
|
|
922
|
+
echo "$$out" | grep -q "GDN train smoke PASS" \
|
|
923
|
+
&& echo "GATE PASS [gdn-train]: from-scratch GDN-layer model trains (loss decreases)" \
|
|
924
|
+
|| { echo "GATE FAIL [gdn-train]"; exit 1; }
|
|
925
|
+
|
|
926
|
+
prep/smokes/smoke_gdn_train: prep/smokes/smoke_gdn_train.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/rms_norm.rb lib/toy/llm/primitives/gdn.rb lib/toy/llm/blocks/gdn_block.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
|
|
927
|
+
$(SPINEL) $< -o $@
|
|
928
|
+
|
|
759
929
|
# toy#64 item 8 — the CUDA compute entry (lib/toy/compute_cuda.rb), the
|
|
760
930
|
# consumer-ish device-at-compile-time gate. Same shape as the CPU
|
|
761
931
|
# compute-surface gate but requires compute_cuda + links the CUDA
|
|
@@ -858,7 +1028,12 @@ examples/example_07_vit_tiny: examples/07_vit_tiny.rb lib/toy/compute.rb lib/toy
|
|
|
858
1028
|
example_07: examples/example_07_vit_tiny
|
|
859
1029
|
.PHONY: example_07
|
|
860
1030
|
|
|
861
|
-
examples
|
|
1031
|
+
examples/example_08_gdn_block: examples/08_gdn_block.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/rms_norm.rb lib/toy/llm/primitives/gdn.rb lib/toy/llm/blocks/gdn_block.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
|
|
1032
|
+
$(SPINEL) $< -o $@
|
|
1033
|
+
example_08: examples/example_08_gdn_block
|
|
1034
|
+
.PHONY: example_08
|
|
1035
|
+
|
|
1036
|
+
examples-curated: example_01 example_02 example_03 example_04 example_05 example_07 example_08
|
|
862
1037
|
.PHONY: examples-curated
|
|
863
1038
|
|
|
864
1039
|
# L4 LoRA recipe gate. Drives the same LoRA fine-tune config as the
|
|
@@ -1965,6 +2140,37 @@ bench-update: tinynn/libtinynn_ggml.a
|
|
|
1965
2140
|
bench-report: tinynn/libtinynn_ggml.a
|
|
1966
2141
|
ruby bench/check.rb --report
|
|
1967
2142
|
|
|
2143
|
+
# Metal perf leg (macOS only; #104 part C). Times the metal-vs-cpu infer
|
|
2144
|
+
# runners via N-differencing — steady-state decode ms/token plus the
|
|
2145
|
+
# metal-vs-cpu ratio on THIS machine. The baseline (bench/baselines_metal.csv)
|
|
2146
|
+
# is Mac-pinned, like the metal_gate float baseline; capture it with
|
|
2147
|
+
# `make bench-metal-update` on a QUIESCED machine (desktop load skews the
|
|
2148
|
+
# numbers badly). Skips green off macOS, exactly like gate-metal.
|
|
2149
|
+
.PHONY: bench-metal bench-metal-update bench-metal-report
|
|
2150
|
+
bench-metal:
|
|
2151
|
+
ifneq ($(UNAME_S),Darwin)
|
|
2152
|
+
@echo "bench-metal: Metal is macOS-only (uname -s = $(UNAME_S)) — skipping"; exit 0
|
|
2153
|
+
else
|
|
2154
|
+
$(MAKE) libexec/toy-infer-metal libexec/toy-infer
|
|
2155
|
+
ruby bench/check_metal.rb
|
|
2156
|
+
endif
|
|
2157
|
+
|
|
2158
|
+
bench-metal-update:
|
|
2159
|
+
ifneq ($(UNAME_S),Darwin)
|
|
2160
|
+
@echo "bench-metal-update: Metal is macOS-only (uname -s = $(UNAME_S)) — skipping"; exit 0
|
|
2161
|
+
else
|
|
2162
|
+
$(MAKE) libexec/toy-infer-metal libexec/toy-infer
|
|
2163
|
+
ruby bench/check_metal.rb --update
|
|
2164
|
+
endif
|
|
2165
|
+
|
|
2166
|
+
bench-metal-report:
|
|
2167
|
+
ifneq ($(UNAME_S),Darwin)
|
|
2168
|
+
@echo "bench-metal-report: Metal is macOS-only (uname -s = $(UNAME_S)) — skipping"; exit 0
|
|
2169
|
+
else
|
|
2170
|
+
$(MAKE) libexec/toy-infer-metal libexec/toy-infer
|
|
2171
|
+
ruby bench/check_metal.rb --report
|
|
2172
|
+
endif
|
|
2173
|
+
|
|
1968
2174
|
# Routine comparison vs PyTorch — the "old-stable" yardstick — in the
|
|
1969
2175
|
# single-machine single-GPU case. Runs ON gx10: toy CUDA benches run
|
|
1970
2176
|
# native, the PyTorch reference (bench/ref_pytorch.py) runs in the
|
data/README.md
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
<img src="toy_logo.png" alt="toy" width="240" />
|
|
5
5
|
</p>
|
|
6
6
|
|
|
7
|
-
**v0.
|
|
7
|
+
**v0.9.0** · Dragon / Gated-DeltaNet trainable hybrid arc · pre-1.0, not API-stable
|
|
8
8
|
· [CHANGELOG](CHANGELOG.md)
|
|
9
9
|
· [docs](docs/architecture.md)
|
|
10
10
|
· [framework guide](docs/framework.md)
|
data/lib/toy/compute.rb
CHANGED
|
@@ -123,6 +123,15 @@ module Toy
|
|
|
123
123
|
def self.warm_start_recipe
|
|
124
124
|
Toy::LLM::Recipes::WarmStart.new
|
|
125
125
|
end
|
|
126
|
+
|
|
127
|
+
# toy#90 — device teardown hook. CPU has no GPU-resource lifecycle to
|
|
128
|
+
# drain, so this is a deliberate no-op; it exists only so a
|
|
129
|
+
# device-agnostic experiment body can call Toy::Device.shutdown
|
|
130
|
+
# portably before exit (the Metal entry's override is the one that
|
|
131
|
+
# actually matters — see compute_metal.rb).
|
|
132
|
+
def self.shutdown
|
|
133
|
+
nil
|
|
134
|
+
end
|
|
126
135
|
end
|
|
127
136
|
end
|
|
128
137
|
|
data/lib/toy/compute_cuda.rb
CHANGED
|
@@ -92,6 +92,14 @@ module Toy
|
|
|
92
92
|
def self.warm_start_recipe
|
|
93
93
|
Toy::LLM::Recipes::WarmStartCuda.new
|
|
94
94
|
end
|
|
95
|
+
|
|
96
|
+
# toy#90 — device teardown hook. CUDA frees its GPU allocations on
|
|
97
|
+
# process exit without a residency-set assert (unlike Metal), so this
|
|
98
|
+
# is a deliberate no-op; it exists for parity so a device-agnostic
|
|
99
|
+
# experiment body can call Toy::Device.shutdown portably.
|
|
100
|
+
def self.shutdown
|
|
101
|
+
nil
|
|
102
|
+
end
|
|
95
103
|
end
|
|
96
104
|
end
|
|
97
105
|
|
data/lib/toy/compute_metal.rb
CHANGED
|
@@ -85,6 +85,23 @@ module Toy
|
|
|
85
85
|
def self.from_scratch_recipe
|
|
86
86
|
Toy::LLM::Recipes::FromScratchMetal.new
|
|
87
87
|
end
|
|
88
|
+
|
|
89
|
+
# toy#90 — device teardown hook (THE one that matters). ggml-metal
|
|
90
|
+
# keeps a process-lifetime residency-set collection on its singleton
|
|
91
|
+
# device and asserts at the C++ static-destructor device-free that the
|
|
92
|
+
# collection is empty (vendor/ggml/src/ggml-metal/ggml-metal-device.m
|
|
93
|
+
# :618). A consumer that builds experiment_metal (toy new --lib) runs
|
|
94
|
+
# the binary directly — it gets NO GGML_METAL_NO_RESIDENCY=1 (that env
|
|
95
|
+
# is injected only by toy's own CLI subprocesses), so any Metal buffer
|
|
96
|
+
# still alive at exit aborts the process (exit 134) AFTER correct
|
|
97
|
+
# compute (toy#27 runs 3-4). Spinel has no at_exit, so a device-
|
|
98
|
+
# agnostic body MUST call Toy::Device.shutdown before returning;
|
|
99
|
+
# tnn_shutdown_engines frees every live Metal session's weights_buf
|
|
100
|
+
# (removing it from the residency set), satisfying the assert.
|
|
101
|
+
# RUNTIME-UNVERIFIED on gx10 (Linux) — Mac gate proves the exit-0.
|
|
102
|
+
def self.shutdown
|
|
103
|
+
TinyNNMetal.tnn_shutdown_engines
|
|
104
|
+
end
|
|
88
105
|
end
|
|
89
106
|
end
|
|
90
107
|
|
data/lib/toy/core/cli/new.rb
CHANGED
|
@@ -284,6 +284,14 @@ module Toy
|
|
|
284
284
|
step = step + 1
|
|
285
285
|
end
|
|
286
286
|
puts "experiment: ok (device=" + Toy::Device.name + ")"
|
|
287
|
+
|
|
288
|
+
# toy#90 — release backend resources before exit. REQUIRED on
|
|
289
|
+
# Metal: ggml-metal asserts at device-free that its residency
|
|
290
|
+
# set is empty, and a directly-run experiment_metal gets no
|
|
291
|
+
# GGML_METAL_NO_RESIDENCY=1 (that env is injected only by toy's
|
|
292
|
+
# own CLI). No-op on cpu/cuda. Spinel has no at_exit, so this
|
|
293
|
+
# explicit call is the teardown seam.
|
|
294
|
+
Toy::Device.shutdown
|
|
287
295
|
RUBY
|
|
288
296
|
|
|
289
297
|
# Per-device entry shims — device chosen at COMPILE time by
|
data/lib/toy/ffi/tinynn.rb
CHANGED
|
@@ -392,6 +392,24 @@ module TinyNN
|
|
|
392
392
|
# C-SSM (#114): state-space model primitives.
|
|
393
393
|
ffi_func :tnn_ssm_conv, [:ptr, :ptr, :ptr], :ptr
|
|
394
394
|
ffi_func :tnn_ssm_scan, [:ptr, :ptr, :ptr, :ptr, :ptr, :ptr, :ptr, :ptr], :ptr
|
|
395
|
+
# Gated DeltaNet recurrence core (Dragon/Qwen3-Next; GDN Phase 1). Forward-only
|
|
396
|
+
# in ggml — see docs/roadmap/dragon-gdn-arch-2026-06-20.md.
|
|
397
|
+
ffi_func :tnn_gated_delta_net, [:ptr, :ptr, :ptr, :ptr, :ptr, :ptr, :ptr], :ptr
|
|
398
|
+
ffi_func :tnn_conv_1d, [:ptr, :ptr, :ptr, :int, :int, :int], :ptr
|
|
399
|
+
# Elementwise ops for GDN gate math / differential attention / gated output
|
|
400
|
+
# norm (GDN Phase 2). sigmoid(beta), exp/log (log-decay + softplus), sub
|
|
401
|
+
# (A1-λA2), neg, l2_norm(q,k for the delta rule). See dragon-gdn doc.
|
|
402
|
+
ffi_func :tnn_sigmoid, [:ptr, :ptr], :ptr
|
|
403
|
+
ffi_func :tnn_exp, [:ptr, :ptr], :ptr
|
|
404
|
+
ffi_func :tnn_log, [:ptr, :ptr], :ptr
|
|
405
|
+
ffi_func :tnn_neg, [:ptr, :ptr], :ptr
|
|
406
|
+
ffi_func :tnn_sub, [:ptr, :ptr, :ptr], :ptr
|
|
407
|
+
ffi_func :tnn_sqrt, [:ptr, :ptr], :ptr
|
|
408
|
+
ffi_func :tnn_repeat, [:ptr, :ptr, :ptr], :ptr
|
|
409
|
+
ffi_func :tnn_div, [:ptr, :ptr, :ptr], :ptr
|
|
410
|
+
ffi_func :tnn_l2_norm, [:ptr, :ptr, :double], :ptr
|
|
411
|
+
ffi_func :tnn_softplus, [:ptr, :ptr], :ptr
|
|
412
|
+
ffi_func :tnn_scale_bias, [:ptr, :ptr, :double, :double], :ptr
|
|
395
413
|
ffi_func :tnn_rms_norm, [:ptr, :ptr, :ptr, :double], :ptr
|
|
396
414
|
ffi_func :tnn_softmax, [:ptr, :ptr], :ptr
|
|
397
415
|
ffi_func :tnn_diag_mask_inf, [:ptr, :ptr, :int], :ptr
|
|
@@ -481,6 +499,7 @@ module TinyNN
|
|
|
481
499
|
ffi_func :tnn_input_3d_persistent_mmap, [:ptr, :int, :int, :int, :int, :size_t], :ptr
|
|
482
500
|
ffi_func :tnn_input_1d_persistent_mmap, [:ptr, :int, :int, :size_t], :ptr
|
|
483
501
|
ffi_func :tnn_input_1d_f32_persistent, [:ptr, :int], :ptr
|
|
502
|
+
ffi_func :tnn_input_1d_i32_persistent, [:ptr, :int], :ptr
|
|
484
503
|
ffi_func :tnn_finalize_weights, [:ptr], :int
|
|
485
504
|
ffi_func :tnn_zero_tensor, [:ptr, :ptr], :int
|
|
486
505
|
ffi_func :tnn_realize_b, [:ptr, :ptr], :int
|
data/lib/toy/ffi/tinynn_cuda.rb
CHANGED
|
@@ -238,6 +238,13 @@ module TinyNNCuda
|
|
|
238
238
|
ffi_func :tnn_input_2d_persistent_typed, [:ptr, :int, :int, :int], :ptr
|
|
239
239
|
ffi_func :tnn_row_size, [:int, :int], :long
|
|
240
240
|
ffi_func :tnn_input_1d_f32_persistent, [:ptr, :int], :ptr
|
|
241
|
+
# #1449 fix — the token-id index leaf allocated galloc-external in ctx_w (so
|
|
242
|
+
# galloc can't free its slot + reuse it for the loss output). Mirrors the CPU
|
|
243
|
+
# tinynn.rb decl; the C function lives in the shared tinynn_ggml.c (the CUDA
|
|
244
|
+
# binaries link libtinynn_ggml.a too). Without this, the mirrored CUDA engine's
|
|
245
|
+
# finalize call to tnn_input_1d_i32_persistent is an undefined method → CUDA
|
|
246
|
+
# training aborts (caught by the heavy CUDA bench, 2026-06-22).
|
|
247
|
+
ffi_func :tnn_input_1d_i32_persistent, [:ptr, :int], :ptr
|
|
241
248
|
# Phase 2 BYO-pointer mmap (CUDA path: ggml-cuda patched to expose
|
|
242
249
|
# ggml_backend_cuda_buffer_from_ptr; weight tensors reference
|
|
243
250
|
# cudaHostRegister'd pages and run via UVA on unified-memory SKUs).
|
data/lib/toy/ffi/tinynn_metal.rb
CHANGED
|
@@ -225,6 +225,11 @@ module TinyNNMetal
|
|
|
225
225
|
ffi_func :tnn_input_2d_persistent_typed, [:ptr, :int, :int, :int], :ptr
|
|
226
226
|
ffi_func :tnn_row_size, [:int, :int], :long
|
|
227
227
|
ffi_func :tnn_input_1d_f32_persistent, [:ptr, :int], :ptr
|
|
228
|
+
# #1449 fix — galloc-external token-id index leaf (ctx_w). Mirrors the CPU/CUDA
|
|
229
|
+
# decl; C function lives in the shared tinynn_ggml.c. Without it the mirrored
|
|
230
|
+
# Metal engine's finalize aborts (undefined method), same as the CUDA gap the
|
|
231
|
+
# heavy bench caught 2026-06-22. (Metal is Mac-only; not runnable here.)
|
|
232
|
+
ffi_func :tnn_input_1d_i32_persistent, [:ptr, :int], :ptr
|
|
228
233
|
# Phase 2 BYO-pointer mmap. On Metal the buffer-from-ptr path falls
|
|
229
234
|
# through to ggml_backend_cpu_buffer_from_ptr (no public Metal
|
|
230
235
|
# buffer_from_ptr API); the scheduler then copies host pages to
|
data/lib/toy/llm/archs/layer_spec.rb
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# lib/toy/llm/archs/layer_spec.rb — Phase 3 of the Dragon-GDN arc: the
|
|
2
|
+
# per-layer descriptor that lets one arch forward loop build a heterogeneous
|
|
3
|
+
# layer stack via flat-int kind dispatch (no polymorphic receiver). See
|
|
4
|
+
# docs/roadmap/dragon-gdn-arch-2026-06-20.md.
|
|
5
|
+
|
|
6
|
+
module Toy; module LLM; module Archs
|
|
7
|
+
# Per-layer descriptor — the seam that lets ONE arch forward loop build a
|
|
8
|
+
# heterogeneous layer stack (homogeneous Llama attention today; Dragon's
|
|
9
|
+
# Gated-DeltaNet + selective-attention mix from Phase 5) WITHOUT polymorphic
|
|
10
|
+
# method dispatch.
|
|
11
|
+
#
|
|
12
|
+
# The `kind` field is a FLAT INTEGER, deliberately not a class, symbol, or
|
|
13
|
+
# block object. The arch loop branches on `spec.kind == KIND_*` and then
|
|
14
|
+
# calls a CONCRETE typed block method inside each branch, so every
|
|
15
|
+
# `.build_forward` call site keeps a single receiver class. Funnelling
|
|
16
|
+
# heterogeneous receiver types through ONE call site is the Spinel
|
|
17
|
+
# poly-dispatch landmine (the #11/#12 family, matz/spinel#1043) the whole
|
|
18
|
+
# Dragon seam is shaped to avoid — see dragon-gdn-arch-2026-06-20.md
|
|
19
|
+
# "Phase 3 — the per-layer descriptor seam."
|
|
20
|
+
#
|
|
21
|
+
# Hand-written positional class, NEVER Struct.new (landmine #16 / #1043): a
|
|
22
|
+
# Struct's synthesized accessors unify across modules and miscompile
|
|
23
|
+
# unrelated callers, exactly like LlamaArchForwardOut / TransformerBlockCtx.
|
|
24
|
+
# Carries values, no behavior.
|
|
25
|
+
class LayerSpec
|
|
26
|
+
# Layer kinds. Flat ints so the dispatch branch stays monomorphic. The
|
|
27
|
+
# Phase-3 refactor gate only exercises KIND_ATTENTION (every layer); the
|
|
28
|
+
# GDN kind is reserved here so the seam shape is fixed before Phase 5
|
|
29
|
+
# actually wires a Gated-DeltaNet block into a branch.
|
|
30
|
+
KIND_ATTENTION = 0 # standard Llama-style attention + SwiGLU FFN
|
|
31
|
+
KIND_GDN = 1 # Dragon Gated-DeltaNet block (Phase 5)
|
|
32
|
+
|
|
33
|
+
attr_accessor :kind
|
|
34
|
+
|
|
35
|
+
def initialize(kind)
|
|
36
|
+
@kind = kind
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end; end; end
|
|
@@ -67,6 +67,21 @@ module Toy; module LLM; module Archs
|
|
|
67
67
|
class LlamaArch
|
|
68
68
|
attr_accessor :t_seq_token_embed, :t_seq_final_norm_gamma, :t_seq_output,
|
|
69
69
|
:t_seq_w_proj, :seq_blocks_ffi,
|
|
70
|
+
# Phase 3 — per-layer descriptor array, parallel to
|
|
71
|
+
# seq_blocks_ffi (same length == n_layers).
|
|
72
|
+
:seq_layer_specs,
|
|
73
|
+
# Phase 5 — the dispatch key is a plain INT array (one kind per
|
|
74
|
+
# layer), NOT LayerSpec.kind reads: constructing/mutating
|
|
75
|
+
# LayerSpec objects on a realize path trips a Spinel codegen
|
|
76
|
+
# miscompile (corrupts the token-id finalize). Mutating a plain
|
|
77
|
+
# int array element is proven-safe. build_forward dispatches on
|
|
78
|
+
# this; LayerSpec stays the descriptor type/constants home.
|
|
79
|
+
:seq_layer_kinds,
|
|
80
|
+
# Phase 5 — parallel GDN-block array (same length; entry is a
|
|
81
|
+
# GDNBlock at KIND_GDN positions, an unused GDNBlock placeholder elsewhere — never null). The KIND_GDN
|
|
82
|
+
# dispatch arm calls into THIS array — a concrete typed call,
|
|
83
|
+
# so the seam stays monomorphic per call site.
|
|
84
|
+
:seq_gdn_blocks_ffi,
|
|
70
85
|
# Orchestration-gating carriers — bare cache ivars with
|
|
71
86
|
# no accessor before P2.5. The lens-branch guard reads
|
|
72
87
|
# seq_donor_d_in; the shared ctx reads seq_rope_cfg.
|
|
@@ -81,6 +96,15 @@ module Toy; module LLM; module Archs
|
|
|
81
96
|
@t_seq_w_proj = TinyNN.tnn_null_ptr
|
|
82
97
|
# Seed with one block — matches the former cache init (L112).
|
|
83
98
|
@seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
|
|
99
|
+
# Phase 3 — parallel seed: one attention spec for the seed block.
|
|
100
|
+
@seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
|
|
101
|
+
# Phase 5 — parallel int dispatch keys (KIND_ATTENTION for the seed).
|
|
102
|
+
@seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
|
|
103
|
+
# Phase 5 — parallel GDN-block slots. Seeded with GDNBlock placeholders so
|
|
104
|
+
# the array is MONOMORPHIC (all GDNBlock) — the seam's KIND_GDN call site
|
|
105
|
+
# never sees a mixed null/object array (Spinel poly-array landmine). At
|
|
106
|
+
# KIND_ATTENTION layers the placeholder is simply never invoked.
|
|
107
|
+
@seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
|
|
84
108
|
@seq_donor_d_in = 0
|
|
85
109
|
# The cache overwrites seq_rope_cfg with the real RoPE::Cfg before
|
|
86
110
|
# build_forward runs (each realize prologue rebuilds it).
|
|
@@ -97,11 +121,33 @@ module Toy; module LLM; module Archs
|
|
|
97
121
|
# already constructs TransformerBlock.new there, so no new class /
|
|
98
122
|
# Struct / FFI :str at class load. Each realize path now calls this
|
|
99
123
|
# via the cache's seq_blocks_ffi delegator chain (self.seq_arch).
|
|
124
|
+
# Phase 5 hybrid — per-layer GDN selection. Layer kinds are NOT changed by
|
|
125
|
+
# rebuilding LayerSpec objects or via the .kind= setter: constructing/mutating
|
|
126
|
+
# LayerSpec on a realize path trips a Spinel codegen miscompile that
|
|
127
|
+
# corrupts the token-id finalize; the plain int array is used instead. Called
|
|
128
|
+
# after seed_blocks!, before alloc.
|
|
129
|
+
# Mark ONE layer as GDN. Takes an INT index (never an array param — a
|
|
130
|
+
# function-parameter array trips the Spinel #688 type-lock landmine, which
|
|
131
|
+
# here manifests as a token-id-finalize codegen miscompile). Mutates the
|
|
132
|
+
# plain int dispatch array element (proven-safe).
|
|
133
|
+
def set_gdn_layer!(idx)
|
|
134
|
+
@seq_layer_kinds[idx] = Toy::LLM::Archs::LayerSpec::KIND_GDN
|
|
135
|
+
end
|
|
136
|
+
|
|
100
137
|
def seed_blocks!(n_layers)
|
|
101
138
|
@seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
|
|
139
|
+
# Phase 3 — seed the parallel spec array in lockstep. Every layer is
|
|
140
|
+
# KIND_ATTENTION for now (the homogeneous-Llama refactor gate); Phase 5
|
|
141
|
+
# marks Dragon's GDN layers in seq_layer_kinds (set_gdn_layer!), not in these specs.
|
|
142
|
+
@seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
|
|
143
|
+
@seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
|
|
144
|
+
@seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
|
|
102
145
|
li_init = 1
|
|
103
146
|
while li_init < n_layers
|
|
104
147
|
@seq_blocks_ffi.push(Toy::LLM::Blocks::TransformerBlock.new)
|
|
148
|
+
@seq_layer_specs.push(Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION))
|
|
149
|
+
@seq_gdn_blocks_ffi.push(Toy::LLM::Blocks::GDNBlock.new)
|
|
150
|
+
@seq_layer_kinds.push(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)
|
|
105
151
|
li_init = li_init + 1
|
|
106
152
|
end
|
|
107
153
|
end
|
|
@@ -213,7 +259,22 @@ module Toy; module LLM; module Archs
|
|
|
213
259
|
end
|
|
214
260
|
li_g = 0
|
|
215
261
|
while li_g < seq_n_layers
|
|
216
|
-
|
|
262
|
+
# Phase 3 — per-layer descriptor dispatch. The branch compares a FLAT
|
|
263
|
+
# INT (seq_layer_kinds[li_g]) and each arm calls a CONCRETE typed block
|
|
264
|
+
# method, so every .build_forward call site stays monomorphic (one
|
|
265
|
+
# receiver class). KIND_ATTENTION reads seq_blocks_ffi; KIND_GDN (Phase 5)
|
|
266
|
+
# reads its own typed array, seq_gdn_blocks_ffi. Unknown kinds fail
|
|
267
|
+
# loud rather than silently building the wrong graph (never-mask rule).
|
|
268
|
+
spec_kind = self.seq_layer_kinds[li_g]
|
|
269
|
+
if spec_kind == Toy::LLM::Archs::LayerSpec::KIND_ATTENTION
|
|
270
|
+
t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
|
|
271
|
+
elsif spec_kind == Toy::LLM::Archs::LayerSpec::KIND_GDN
|
|
272
|
+
# Concrete typed call into the parallel GDN array — the GDN block reads
|
|
273
|
+
# its own dims (set at alloc); seq_t/eps come from the shared ctx.
|
|
274
|
+
t_cur = self.seq_gdn_blocks_ffi[li_g].build_forward(sess, t_cur, seq_t, eps)
|
|
275
|
+
else
|
|
276
|
+
raise "LlamaArch#build_forward: unsupported layer kind #{spec_kind} at layer #{li_g}"
|
|
277
|
+
end
|
|
217
278
|
li_g = li_g + 1
|
|
218
279
|
end
|
|
219
280
|
|