toy 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +31 -0
  3. data/Makefile +211 -5
  4. data/README.md +1 -1
  5. data/lib/toy/compute.rb +9 -0
  6. data/lib/toy/compute_cuda.rb +8 -0
  7. data/lib/toy/compute_metal.rb +17 -0
  8. data/lib/toy/core/cli/new.rb +8 -0
  9. data/lib/toy/ffi/tinynn.rb +19 -0
  10. data/lib/toy/ffi/tinynn_cuda.rb +7 -0
  11. data/lib/toy/ffi/tinynn_metal.rb +5 -0
  12. data/lib/toy/llm/archs/layer_spec.rb +39 -0
  13. data/lib/toy/llm/archs/llama_arch.rb +62 -1
  14. data/lib/toy/llm/archs/llama_arch_cuda.rb +62 -1
  15. data/lib/toy/llm/archs/llama_arch_metal.rb +62 -1
  16. data/lib/toy/llm/blocks/gdn_block.rb +176 -0
  17. data/lib/toy/llm/engine/gpt2_kv_engine.rb +11 -0
  18. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +11 -0
  19. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +11 -0
  20. data/lib/toy/llm/engine/llama_kv_engine.rb +10 -2
  21. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +10 -2
  22. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +10 -2
  23. data/lib/toy/llm/engine/llama_seq_engine.rb +16 -1
  24. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +16 -1
  25. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +16 -1
  26. data/lib/toy/llm/primitives/depth_scale.rb +33 -0
  27. data/lib/toy/llm/primitives/diff_attention.rb +71 -0
  28. data/lib/toy/llm/primitives/gdn.rb +188 -0
  29. data/lib/toy/llm/primitives/scalable_softmax.rb +37 -0
  30. data/lib/toy/run/eval_metal.rb +12 -0
  31. data/lib/toy/run/infer_metal.rb +19 -0
  32. data/lib/toy/run/train_gpt2_metal.rb +7 -0
  33. data/lib/toy/run/train_hybrid.rb +232 -0
  34. data/lib/toy/run/train_metal.rb +10 -0
  35. data/lib/toy/version.rb +4 -3
  36. data/tinynn/tinynn_backend_cuda.c +22 -0
  37. data/tinynn/tinynn_ggml.c +231 -0
  38. metadata +9 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d1f8e6c6264601b49a0efd757444f45ba0c6ef684a32b0a6b32d39ad6d65bf08
4
- data.tar.gz: 19de5005b49891d17e0f5fd8d45ae9f614f0240ab0754f914391547cbf4bcb7c
3
+ metadata.gz: e6344fb33638dcdc959b0e713aa08081d9082eecec81c095c55b633828d5f3e8
4
+ data.tar.gz: 0bfac0f0a5f6025cae9146f877b3d565e94a4ab0f8b69ecb60144ed4f9dab8e1
5
5
  SHA512:
6
- metadata.gz: 16990c36afd421bea60a8e7c71016523a8cc6295bf0b9c1590e00fda3fa7dfa2895d0343c8f86fe75b1ae1d533311cf1cd8d08ab9888820f7793df06262a7f37
7
- data.tar.gz: 1af72fb299d5eee1d6019833008098e21c94b8ec2f689c4bedf411964f20d766f53ba1a91da61a7d4cf3ff7d7001897133e6ec0dfa4ed91690cc1efec8a1d17b
6
+ metadata.gz: 4dc0eb7b7a049022bd5c86a4853fcc52ff1e803bd0d574cddd8ceb01742eff18d02d8703c8cdcff9a044c3d81c2bf4d1a60d42ef70680a944e86a908b853e91f
7
+ data.tar.gz: facf2141aebb4c5384eb25d1ea3c4fcdb00725184fdc0b176e23799a871f7f485a8564ebfa38bd2ba47f3832723476e568fd40e0ffa5057c6bda242dcbc09483
data/CHANGELOG.md CHANGED
@@ -1,5 +1,36 @@
1
1
  # Changelog
2
2
 
3
+ ## v0.9.0 — 2026-06-22
4
+
5
+ **The Dragon / Gated-DeltaNet trainable hybrid arc.** toy grows a second block
6
+ type and the seam to stack it heterogeneously with attention — built phase by
7
+ phase, each independently gated.
8
+
9
+ - **Trainable GDN (Path B)**: the gated delta rule expressed as an *unrolled
10
+ autograd composition* (`GDN.recur_unrolled`) of ops that each have a ggml
11
+ backward — so a Gated-DeltaNet layer trains with **no hand-written kernel
12
+ backward** (ggml has none for `GATED_DELTA_NET`); the fused kernel is kept for
13
+ inference. Gated by forward-parity (`recur_unrolled` == fused kernel to 1e-6,
14
+ incl. multi-head) + a differentiability proof.
15
+ - **L1 Dragon primitives**: `gdn` (l2/decay-gate/update-gate/recur/gated-out),
16
+ `diff_attention`, `scalable_softmax`, `depth_scale`; 8 elementwise ggml ops +
17
+ `tnn_gated_delta_net`/`tnn_conv_1d` wired (CPU-only this arc).
18
+ - **Per-layer `LayerSpec` seam**: a flat-int `seq_layer_kinds` dispatch (one arch
19
+ loop, monomorphic per-kind block call) — byte-exact on homogeneous Llama
20
+ (from-scratch / warm-start / lora unchanged).
21
+ - **`GDNBlock`** (L2) + **`libexec/toy-train-hybrid`**: a self-contained
22
+ from-scratch **attention+GDN hybrid** trains (CE loss decreases). Folding it
23
+ into the shared `toy train` engine is deferred behind a union-pin Spinel
24
+ codegen block — re-apply protocol in `docs/roadmap/gdn-hybrid-engine-reintegration.md`.
25
+ - **Fixes**: `#1449` whole-program training abort (backward `get_rows` index OOB)
26
+ root-caused as a latent ggml-alloc liveness bug and fixed toy-side
27
+ (`tnn_input_1d_i32_persistent`, a galloc-external token-id index) — *not* a
28
+ spinel codegen bug (matz closed it as resolved); CUDA/Metal training restored by
29
+ mirroring that FFI decl into the CUDA/Metal siblings. New backward-friendly
30
+ shims `tnn_sqrt`/`tnn_div`/`tnn_repeat`.
31
+ - **Performance**: CPU inference ~+27% tok/s and LoRA steady-state ~−24% vs the
32
+ v0.8.0-era baselines (heavy CUDA bench stable).
33
+
3
34
  ## v0.8.0 — 2026-06-12
4
35
 
5
36
  **The first published version** (RubyGems, gem name graciously transferred
data/Makefile CHANGED
@@ -59,7 +59,12 @@ endif
59
59
  # .a in tinynn/ combined with newer Spinel C codegen can produce
60
60
  # misaligned binaries that segfault at init (Tao hit this 2026-05-26
61
61
  # after pulling Spinel 2183a92 — the lib archives weren't rebuilt).
62
- SPINEL_DEPS := $(SPINEL_DIR)/spinel_analyze $(SPINEL_DIR)/spinel_codegen
62
+ # Track the compiler BINARY: post the Ruby→C rewrite there is no
63
+ # spinel_analyze/spinel_codegen at the checkout root (the Ruby backend
64
+ # moved to legacy/, oracle-only), just the single `spinel` binary —
65
+ # the right rebuild trigger, present on both the legacy and C layouts
66
+ # (verified byte-exact green on the union pin; toy#101 Part 1).
67
+ SPINEL_DEPS := $(SPINEL_BIN)
63
68
 
64
69
  CC ?= cc
65
70
  CFLAGS ?= -O2 -fPIC -Wall -Wextra
@@ -355,6 +360,16 @@ endif
355
360
  $(SPINEL) --cc='cc -Wl,-u,_tnn_metal_force_link -framework Foundation -framework Metal -framework MetalKit' $< -o $@
356
361
  toy-eval-metal: libexec/toy-eval-metal
357
362
 
363
+ # Convenience: run both functional gates on the pure CPU path (no parity arm).
364
+ # These are the byte-exact infer/eval baselines. Until this target existed the
365
+ # CPU eval gate only ran behind gate-cuda's TOY_GATE_CUDA=1, so a CPU-only eval
366
+ # regression could reach main unnoticed — and did once (the decode_step
367
+ # PolyArray OOB, #104/#105). Self-builds the runners via bin/toy.
368
+ .PHONY: gate-cpu
369
+ gate-cpu:
370
+ ruby prep/infer_gate.rb
371
+ ruby prep/eval_gate.rb
372
+
358
373
  # Convenience: run both functional gates with the CUDA parity arm enabled.
359
374
  .PHONY: gate-cuda
360
375
  gate-cuda:
@@ -449,8 +464,12 @@ gate-run-log:
449
464
  # turns the skip into a failure): MRI+Fiddle reproduces the recorded
450
465
  # Spinel from-scratch gate curve BIT-EXACT (train_baseline.txt) and the
451
466
  # smollm2-135m greedy decode ids byte-equal infer_baseline.txt.
467
+ # Prereq on the shared .so so a NEW FFI symbol (e.g. the #1449
468
+ # tnn_input_1d_i32_persistent) can't leave a STALE .so behind that
469
+ # fails the native leg with a missing-symbol NativeCallError — make
470
+ # rebuilds it from the .o's automatically.
452
471
  .PHONY: gate-mri
453
- gate-mri:
472
+ gate-mri: tinynn/libtinynn_ggml_shared.so
454
473
  ruby prep/mri_gate.rb
455
474
 
456
475
  # toy#60 item 4 — the COLD-START consumer gate: `toy new` scaffold →
@@ -486,6 +505,34 @@ gate-compute-surface-cuda: prep/smokes/smoke_compute_surface_cuda
486
505
  && echo "GATE PASS [compute-surface-cuda]: lib/toy/compute_cuda.rb device entry is live" \
487
506
  || { echo "GATE FAIL [compute-surface-cuda]"; exit 1; }
488
507
 
508
+ # Projection-lens gate: train through W_proj only (token_embd frozen) and
509
+ # assert the loss drops (the smoke's own "is learning" verdict). The CPU
510
+ # smoke was an ungated diagnostic; this wires it into the gate surface.
511
+ .PHONY: gate-projection-lens
512
+ gate-projection-lens: prep/smokes/smoke_projection_lens
513
+ @out="$$(STEPS=20 ./prep/smokes/smoke_projection_lens 2>&1)"; \
514
+ echo "$$out" | tail -2; \
515
+ echo "$$out" | grep -q "projection-lens training is learning" \
516
+ && echo "GATE PASS [projection-lens]: W_proj-only training learns (token_embd frozen)" \
517
+ || { echo "GATE FAIL [projection-lens]"; exit 1; }
518
+
519
+ # Metal twin of the projection-lens gate. The _metal smoke is an auto-
520
+ # generated mirror (MIRROR_METAL) that previously built but was reachable
521
+ # from no gate; this de-orphans it. macOS-only, skips green off Darwin
522
+ # exactly like gate-metal.
523
+ .PHONY: gate-projection-lens-metal
524
+ gate-projection-lens-metal:
525
+ ifneq ($(UNAME_S),Darwin)
526
+ @echo "gate-projection-lens-metal: Metal is macOS-only (uname -s = $(UNAME_S)) — skipping"; exit 0
527
+ else
528
+ $(MAKE) prep/smokes/smoke_projection_lens_metal
529
+ @out="$$(STEPS=20 ./prep/smokes/smoke_projection_lens_metal 2>&1)"; \
530
+ echo "$$out" | tail -2; \
531
+ echo "$$out" | grep -q "projection-lens training is learning" \
532
+ && echo "GATE PASS [projection-lens-metal]: W_proj-only training learns on Metal" \
533
+ || { echo "GATE FAIL [projection-lens-metal]"; exit 1; }
534
+ endif
535
+
489
536
  # K-quant MoE attention regression gate (the bug long misfiled as ggml#1506):
490
537
  # head_nbytes returned 0 for K-quant attention weights → per-head mmap stride
491
538
  # collapsed every head onto head 0 → degenerate repeating decode on OLMoE
@@ -564,7 +611,8 @@ libexec/toy-train: lib/toy/run/train.rb lib/toy/dev/toy_describe_flow.rb lib/toy
564
611
  lib/toy/train/toy_gguf_writer.rb lib/toy/train/toy_drift_grad.rb lib/toy/models/transformer.rb \
565
612
  lib/toy/llm/primitives/rms_norm.rb lib/toy/llm/primitives/rope.rb \
566
613
  lib/toy/llm/primitives/swiglu.rb lib/toy/llm/primitives/gqa.rb \
567
- lib/toy/llm/blocks/transformer_block.rb lib/toy/llm/archs/llama_arch.rb \
614
+ lib/toy/llm/blocks/transformer_block.rb lib/toy/llm/primitives/gdn.rb lib/toy/llm/blocks/gdn_block.rb \
615
+ lib/toy/llm/archs/layer_spec.rb lib/toy/llm/archs/llama_arch.rb \
568
616
  lib/toy/ffi/tinynn.rb tinynn/libtinynn_ggml.a | libexec
569
617
  $(SPINEL) $< -o $@
570
618
  toy-train: libexec/toy-train
@@ -579,7 +627,8 @@ libexec/toy-train-lora: lib/toy/run/train_lora.rb lib/toy/dev/toy_describe_flow.
579
627
  lib/toy/train/toy_gguf_writer.rb lib/toy/train/toy_drift_grad.rb lib/toy/models/transformer.rb \
580
628
  lib/toy/llm/primitives/rms_norm.rb lib/toy/llm/primitives/rope.rb \
581
629
  lib/toy/llm/primitives/swiglu.rb lib/toy/llm/primitives/gqa.rb \
582
- lib/toy/llm/blocks/transformer_block.rb lib/toy/llm/archs/llama_arch.rb \
630
+ lib/toy/llm/blocks/transformer_block.rb lib/toy/llm/primitives/gdn.rb lib/toy/llm/blocks/gdn_block.rb \
631
+ lib/toy/llm/archs/layer_spec.rb lib/toy/llm/archs/llama_arch.rb \
583
632
  lib/toy/ffi/tinynn.rb tinynn/libtinynn_ggml.a | libexec
584
633
  $(SPINEL) $< -o $@
585
634
  toy-train-lora: libexec/toy-train-lora
@@ -756,6 +805,127 @@ prep/smokes/smoke_projection_lens: prep/smokes/smoke_projection_lens.rb lib/toy/
756
805
  prep/smokes/smoke_compute_surface: prep/smokes/smoke_compute_surface.rb lib/toy/compute.rb lib/toy/llm/training_batch.rb lib/toy/llm/recipe_options.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
757
806
  $(SPINEL) $< -o $@
758
807
 
808
+ # Dragon/GDN Phase 1 (docs/roadmap/dragon-gdn-arch-2026-06-20.md): prove the
809
+ # newly-wired tnn_gated_delta_net + tnn_conv_1d FFI ops compute through toy's
810
+ # stack on the in-tree ggml. Forward-only shape gate (the recurrence runs and
811
+ # emits the documented output shape).
812
+ .PHONY: gate-gdn-forward
813
+ gate-gdn-forward: prep/smokes/smoke_gdn_forward
814
+ @out="$$(./prep/smokes/smoke_gdn_forward 2>&1)"; \
815
+ echo "$$out" | tail -2; \
816
+ echo "$$out" | grep -q "GDN smoke PASS" \
817
+ && echo "GATE PASS [gdn-forward]: tnn_gated_delta_net computes through the FFI" \
818
+ || { echo "GATE FAIL [gdn-forward]"; exit 1; }
819
+
820
+ prep/smokes/smoke_gdn_forward: prep/smokes/smoke_gdn_forward.rb lib/toy.rb lib/toy/ffi/tinynn.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
821
+ $(SPINEL) $< -o $@
822
+
823
+ # Dragon/GDN Phase 2: the Toy::LLM::Primitives::GDN L1 composition (l2-norm,
824
+ # log-decay + sigmoid gates, recurrence, gated output norm). The gate+l2+recur
825
+ # chain is computed end-to-end; gated_out is shape-checked.
826
+ .PHONY: gate-gdn-primitive
827
+ gate-gdn-primitive: prep/smokes/smoke_gdn_primitive
828
+ @out="$$(./prep/smokes/smoke_gdn_primitive 2>&1)"; \
829
+ echo "$$out" | tail -2; \
830
+ echo "$$out" | grep -q "GDN primitive smoke PASS" \
831
+ && echo "GATE PASS [gdn-primitive]: Toy::LLM::Primitives::GDN composes + computes" \
832
+ || { echo "GATE FAIL [gdn-primitive]"; exit 1; }
833
+
834
+ prep/smokes/smoke_gdn_primitive: prep/smokes/smoke_gdn_primitive.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/gdn.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
835
+ $(SPINEL) $< -o $@
836
+
837
+ # Dragon/GDN Phase 2: the Dragon attention-side L1 primitives (DiffAttention,
838
+ # ScalableSoftmax, DepthScale).
839
+ .PHONY: gate-dragon-attn-prims
840
+ gate-dragon-attn-prims: prep/smokes/smoke_dragon_attn_prims
841
+ @out="$$(./prep/smokes/smoke_dragon_attn_prims 2>&1)"; \
842
+ echo "$$out" | tail -2; \
843
+ echo "$$out" | grep -q "Dragon attn prims smoke PASS" \
844
+ && echo "GATE PASS [dragon-attn-prims]: diff-attn / ssmax / depth-scale compose" \
845
+ || { echo "GATE FAIL [dragon-attn-prims]"; exit 1; }
846
+
847
+ prep/smokes/smoke_dragon_attn_prims: prep/smokes/smoke_dragon_attn_prims.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/diff_attention.rb lib/toy/llm/primitives/scalable_softmax.rb lib/toy/llm/primitives/depth_scale.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
848
+ $(SPINEL) $< -o $@
849
+
850
+ # Dragon/GDN Phase 4 (Path B): numeric-parity gate — the UNROLLED,
851
+ # autograd-differentiable recurrence (GDN.recur_unrolled) reproduces the FUSED
852
+ # tnn_gated_delta_net token outputs within eps. This is what lets training use
853
+ # the composition (every op has a ggml backward) while inference keeps the fused
854
+ # kernel. See docs/roadmap/dragon-gdn-arch-2026-06-20.md (Phase 4).
855
+ .PHONY: gate-gdn-unrolled-parity
856
+ gate-gdn-unrolled-parity: prep/smokes/smoke_gdn_unrolled_parity
857
+ @out="$$(./prep/smokes/smoke_gdn_unrolled_parity 2>&1)"; \
858
+ echo "$$out" | tail -2; \
859
+ echo "$$out" | grep -q "GDN unrolled-parity smoke PASS" \
860
+ && echo "GATE PASS [gdn-unrolled-parity]: recur_unrolled == fused kernel (eps)" \
861
+ || { echo "GATE FAIL [gdn-unrolled-parity]"; exit 1; }
862
+
863
+ prep/smokes/smoke_gdn_unrolled_parity: prep/smokes/smoke_gdn_unrolled_parity.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/gdn.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
864
+ $(SPINEL) $< -o $@
865
+
866
+ # Dragon/GDN Phase 4 (Path B): the differentiability proof — ggml builds + runs
867
+ # a backward graph through recur_unrolled and yields finite non-zero dL/dq,k,v
868
+ # with NO hand-written fused-kernel backward. This is what makes GDN trainable.
869
+ .PHONY: gate-gdn-unrolled-backward
870
+ gate-gdn-unrolled-backward: prep/smokes/smoke_gdn_unrolled_backward
871
+ @out="$$(./prep/smokes/smoke_gdn_unrolled_backward 2>&1)"; \
872
+ echo "$$out" | tail -2; \
873
+ echo "$$out" | grep -q "GDN unrolled-backward smoke PASS" \
874
+ && echo "GATE PASS [gdn-unrolled-backward]: recur_unrolled is differentiable" \
875
+ || { echo "GATE FAIL [gdn-unrolled-backward]"; exit 1; }
876
+
877
+ prep/smokes/smoke_gdn_unrolled_backward: prep/smokes/smoke_gdn_unrolled_backward.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/gdn.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
878
+ $(SPINEL) $< -o $@
879
+
880
+ # Dragon/GDN Phase 5: multi-head parity — the per-head recur_unrolled looped over
881
+ # H heads + concat'd matches the fused kernel's head packing (strided slicing).
882
+ .PHONY: gate-gdn-unrolled-parity-mh
883
+ gate-gdn-unrolled-parity-mh: prep/smokes/smoke_gdn_unrolled_parity_mh
884
+ @out="$$(./prep/smokes/smoke_gdn_unrolled_parity_mh 2>&1)"; \
885
+ echo "$$out" | tail -2; \
886
+ echo "$$out" | grep -q "GDN unrolled-parity-mh smoke PASS" \
887
+ && echo "GATE PASS [gdn-unrolled-parity-mh]: H-head recur_unrolled == fused kernel" \
888
+ || { echo "GATE FAIL [gdn-unrolled-parity-mh]"; exit 1; }
889
+
890
+ prep/smokes/smoke_gdn_unrolled_parity_mh: prep/smokes/smoke_gdn_unrolled_parity_mh.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/gdn.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
891
+ $(SPINEL) $< -o $@
892
+
893
+ # Dragon/GDN Phase 5 capstone: a SELF-CONTAINED from-scratch HYBRID runner (one
894
+ # attention layer + one GDN layer, dispatched by the int-kind seam pattern) in
895
+ # its OWN compilation unit — CE loss decreases. Proves a heterogeneous
896
+ # attention+GDN stack trains from scratch. Separate unit so it can't corrupt the
897
+ # byte-exact llama engine (landmine #16). Reintegration into `toy train` waits on
898
+ # the union-pin Spinel codegen fix (master/spinelc).
899
+ .PHONY: gate-gdn-hybrid
900
+ gate-gdn-hybrid: libexec/toy-train-hybrid
901
+ @out="$$(./libexec/toy-train-hybrid 2>&1)"; \
902
+ echo "$$out" | tail -3; \
903
+ echo "$$out" | grep -q "HYBRID train smoke PASS" \
904
+ && echo "GATE PASS [gdn-hybrid]: attention+GDN from-scratch hybrid trains" \
905
+ || { echo "GATE FAIL [gdn-hybrid]"; exit 1; }
906
+
907
+ libexec/toy-train-hybrid: lib/toy/run/train_hybrid.rb lib/toy.rb lib/toy/ffi/tinynn.rb \
908
+ lib/toy/llm/primitives/rms_norm.rb lib/toy/llm/primitives/gdn.rb \
909
+ lib/toy/llm/blocks/gdn_block.rb lib/toy/llm/archs/layer_spec.rb \
910
+ tinynn/libtinynn_ggml.a $(SPINEL_DEPS) | libexec
911
+ $(SPINEL) $< -o $@
912
+ .PHONY: toy-train-hybrid
913
+ toy-train-hybrid: libexec/toy-train-hybrid
914
+
915
+ # Dragon/GDN Phase 5 (end-of-flow): a from-scratch model whose mixer is a
916
+ # trainable GDNBlock trains — CE loss decreases. Proves the GDN layer is an
917
+ # end-to-end trainable residual unit (no hand-written kernel backward).
918
+ .PHONY: gate-gdn-train
919
+ gate-gdn-train: prep/smokes/smoke_gdn_train
920
+ @out="$$(./prep/smokes/smoke_gdn_train 2>&1)"; \
921
+ echo "$$out" | tail -3; \
922
+ echo "$$out" | grep -q "GDN train smoke PASS" \
923
+ && echo "GATE PASS [gdn-train]: from-scratch GDN-layer model trains (loss decreases)" \
924
+ || { echo "GATE FAIL [gdn-train]"; exit 1; }
925
+
926
+ prep/smokes/smoke_gdn_train: prep/smokes/smoke_gdn_train.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/rms_norm.rb lib/toy/llm/primitives/gdn.rb lib/toy/llm/blocks/gdn_block.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
927
+ $(SPINEL) $< -o $@
928
+
759
929
  # toy#64 item 8 — the CUDA compute entry (lib/toy/compute_cuda.rb), the
760
930
  # consumer-ish device-at-compile-time gate. Same shape as the CPU
761
931
  # compute-surface gate but requires compute_cuda + links the CUDA
@@ -858,7 +1028,12 @@ examples/example_07_vit_tiny: examples/07_vit_tiny.rb lib/toy/compute.rb lib/toy
858
1028
  example_07: examples/example_07_vit_tiny
859
1029
  .PHONY: example_07
860
1030
 
861
- examples-curated: example_01 example_02 example_03 example_04 example_05 example_07
1031
+ examples/example_08_gdn_block: examples/08_gdn_block.rb lib/toy.rb lib/toy/ffi/tinynn.rb lib/toy/llm/primitives/rms_norm.rb lib/toy/llm/primitives/gdn.rb lib/toy/llm/blocks/gdn_block.rb tinynn/libtinynn_ggml.a $(SPINEL_DEPS)
1032
+ $(SPINEL) $< -o $@
1033
+ example_08: examples/example_08_gdn_block
1034
+ .PHONY: example_08
1035
+
1036
+ examples-curated: example_01 example_02 example_03 example_04 example_05 example_07 example_08
862
1037
  .PHONY: examples-curated
863
1038
 
864
1039
  # L4 LoRA recipe gate. Drives the same LoRA fine-tune config as the
@@ -1965,6 +2140,37 @@ bench-update: tinynn/libtinynn_ggml.a
1965
2140
  bench-report: tinynn/libtinynn_ggml.a
1966
2141
  ruby bench/check.rb --report
1967
2142
 
2143
+ # Metal perf leg (macOS only; #104 part C). Times the metal-vs-cpu infer
2144
+ # runners via N-differencing — steady-state decode ms/token plus the
2145
+ # metal-vs-cpu ratio on THIS machine. The baseline (bench/baselines_metal.csv)
2146
+ # is Mac-pinned, like the metal_gate float baseline; capture it with
2147
+ # `make bench-metal-update` on a QUIESCED machine (desktop load skews the
2148
+ # numbers badly). Skips green off macOS, exactly like gate-metal.
2149
+ .PHONY: bench-metal bench-metal-update bench-metal-report
2150
+ bench-metal:
2151
+ ifneq ($(UNAME_S),Darwin)
2152
+ @echo "bench-metal: Metal is macOS-only (uname -s = $(UNAME_S)) — skipping"; exit 0
2153
+ else
2154
+ $(MAKE) libexec/toy-infer-metal libexec/toy-infer
2155
+ ruby bench/check_metal.rb
2156
+ endif
2157
+
2158
+ bench-metal-update:
2159
+ ifneq ($(UNAME_S),Darwin)
2160
+ @echo "bench-metal-update: Metal is macOS-only (uname -s = $(UNAME_S)) — skipping"; exit 0
2161
+ else
2162
+ $(MAKE) libexec/toy-infer-metal libexec/toy-infer
2163
+ ruby bench/check_metal.rb --update
2164
+ endif
2165
+
2166
+ bench-metal-report:
2167
+ ifneq ($(UNAME_S),Darwin)
2168
+ @echo "bench-metal-report: Metal is macOS-only (uname -s = $(UNAME_S)) — skipping"; exit 0
2169
+ else
2170
+ $(MAKE) libexec/toy-infer-metal libexec/toy-infer
2171
+ ruby bench/check_metal.rb --report
2172
+ endif
2173
+
1968
2174
  # Routine comparison vs PyTorch — the "old-stable" yardstick — in the
1969
2175
  # single-machine single-GPU case. Runs ON gx10: toy CUDA benches run
1970
2176
  # native, the PyTorch reference (bench/ref_pytorch.py) runs in the
data/README.md CHANGED
@@ -4,7 +4,7 @@
4
4
  <img src="toy_logo.png" alt="toy" width="240" />
5
5
  </p>
6
6
 
7
- **v0.8.0** · first published gem · pre-1.0, not API-stable
7
+ **v0.9.0** · Dragon / Gated-DeltaNet trainable hybrid arc · pre-1.0, not API-stable
8
8
  &nbsp;·&nbsp; [CHANGELOG](CHANGELOG.md)
9
9
  &nbsp;·&nbsp; [docs](docs/architecture.md)
10
10
  &nbsp;·&nbsp; [framework guide](docs/framework.md)
data/lib/toy/compute.rb CHANGED
@@ -123,6 +123,15 @@ module Toy
123
123
  def self.warm_start_recipe
124
124
  Toy::LLM::Recipes::WarmStart.new
125
125
  end
126
+
127
+ # toy#90 — device teardown hook. CPU has no GPU-resource lifecycle to
128
+ # drain, so this is a deliberate no-op; it exists only so a
129
+ # device-agnostic experiment body can call Toy::Device.shutdown
130
+ # portably before exit (the Metal entry's override is the one that
131
+ # actually matters — see compute_metal.rb).
132
+ def self.shutdown
133
+ nil
134
+ end
126
135
  end
127
136
  end
128
137
 
data/lib/toy/compute_cuda.rb CHANGED
@@ -92,6 +92,14 @@ module Toy
92
92
  def self.warm_start_recipe
93
93
  Toy::LLM::Recipes::WarmStartCuda.new
94
94
  end
95
+
96
+ # toy#90 — device teardown hook. CUDA frees its GPU allocations on
97
+ # process exit without a residency-set assert (unlike Metal), so this
98
+ # is a deliberate no-op; it exists for parity so a device-agnostic
99
+ # experiment body can call Toy::Device.shutdown portably.
100
+ def self.shutdown
101
+ nil
102
+ end
95
103
  end
96
104
  end
97
105
 
data/lib/toy/compute_metal.rb CHANGED
@@ -85,6 +85,23 @@ module Toy
85
85
  def self.from_scratch_recipe
86
86
  Toy::LLM::Recipes::FromScratchMetal.new
87
87
  end
88
+
89
+ # toy#90 — device teardown hook (THE one that matters). ggml-metal
90
+ # keeps a process-lifetime residency-set collection on its singleton
91
+ # device and asserts at the C++ static-destructor device-free that the
92
+ # collection is empty (vendor/ggml/src/ggml-metal/ggml-metal-device.m
93
+ # :618). A consumer that builds experiment_metal (toy new --lib) runs
94
+ # the binary directly — it gets NO GGML_METAL_NO_RESIDENCY=1 (that env
95
+ # is injected only by toy's own CLI subprocesses), so any Metal buffer
96
+ # still alive at exit aborts the process (exit 134) AFTER correct
97
+ # compute (toy#27 runs 3-4). Spinel has no at_exit, so a device-
98
+ # agnostic body MUST call Toy::Device.shutdown before returning;
99
+ # tnn_shutdown_engines frees every live Metal session's weights_buf
100
+ # (removing it from the residency set), satisfying the assert.
101
+ # RUNTIME-UNVERIFIED on gx10 (Linux) — Mac gate proves the exit-0.
102
+ def self.shutdown
103
+ TinyNNMetal.tnn_shutdown_engines
104
+ end
88
105
  end
89
106
  end
90
107
 
data/lib/toy/core/cli/new.rb CHANGED
@@ -284,6 +284,14 @@ module Toy
284
284
  step = step + 1
285
285
  end
286
286
  puts "experiment: ok (device=" + Toy::Device.name + ")"
287
+
288
+ # toy#90 — release backend resources before exit. REQUIRED on
289
+ # Metal: ggml-metal asserts at device-free that its residency
290
+ # set is empty, and a directly-run experiment_metal gets no
291
+ # GGML_METAL_NO_RESIDENCY=1 (that env is injected only by toy's
292
+ # own CLI). No-op on cpu/cuda. Spinel has no at_exit, so this
293
+ # explicit call is the teardown seam.
294
+ Toy::Device.shutdown
287
295
  RUBY
288
296
 
289
297
  # Per-device entry shims — device chosen at COMPILE time by
data/lib/toy/ffi/tinynn.rb CHANGED
@@ -392,6 +392,24 @@ module TinyNN
392
392
  # C-SSM (#114): state-space model primitives.
393
393
  ffi_func :tnn_ssm_conv, [:ptr, :ptr, :ptr], :ptr
394
394
  ffi_func :tnn_ssm_scan, [:ptr, :ptr, :ptr, :ptr, :ptr, :ptr, :ptr, :ptr], :ptr
395
+ # Gated DeltaNet recurrence core (Dragon/Qwen3-Next; GDN Phase 1). Forward-only
396
+ # in ggml — see docs/roadmap/dragon-gdn-arch-2026-06-20.md.
397
+ ffi_func :tnn_gated_delta_net, [:ptr, :ptr, :ptr, :ptr, :ptr, :ptr, :ptr], :ptr
398
+ ffi_func :tnn_conv_1d, [:ptr, :ptr, :ptr, :int, :int, :int], :ptr
399
+ # Elementwise ops for GDN gate math / differential attention / gated output
400
+ # norm (GDN Phase 2). sigmoid(beta), exp/log (log-decay + softplus), sub
401
+ # (A1-λA2), neg, l2_norm(q,k for the delta rule). See dragon-gdn doc.
402
+ ffi_func :tnn_sigmoid, [:ptr, :ptr], :ptr
403
+ ffi_func :tnn_exp, [:ptr, :ptr], :ptr
404
+ ffi_func :tnn_log, [:ptr, :ptr], :ptr
405
+ ffi_func :tnn_neg, [:ptr, :ptr], :ptr
406
+ ffi_func :tnn_sub, [:ptr, :ptr, :ptr], :ptr
407
+ ffi_func :tnn_sqrt, [:ptr, :ptr], :ptr
408
+ ffi_func :tnn_repeat, [:ptr, :ptr, :ptr], :ptr
409
+ ffi_func :tnn_div, [:ptr, :ptr, :ptr], :ptr
410
+ ffi_func :tnn_l2_norm, [:ptr, :ptr, :double], :ptr
411
+ ffi_func :tnn_softplus, [:ptr, :ptr], :ptr
412
+ ffi_func :tnn_scale_bias, [:ptr, :ptr, :double, :double], :ptr
395
413
  ffi_func :tnn_rms_norm, [:ptr, :ptr, :ptr, :double], :ptr
396
414
  ffi_func :tnn_softmax, [:ptr, :ptr], :ptr
397
415
  ffi_func :tnn_diag_mask_inf, [:ptr, :ptr, :int], :ptr
@@ -481,6 +499,7 @@ module TinyNN
481
499
  ffi_func :tnn_input_3d_persistent_mmap, [:ptr, :int, :int, :int, :int, :size_t], :ptr
482
500
  ffi_func :tnn_input_1d_persistent_mmap, [:ptr, :int, :int, :size_t], :ptr
483
501
  ffi_func :tnn_input_1d_f32_persistent, [:ptr, :int], :ptr
502
+ ffi_func :tnn_input_1d_i32_persistent, [:ptr, :int], :ptr
484
503
  ffi_func :tnn_finalize_weights, [:ptr], :int
485
504
  ffi_func :tnn_zero_tensor, [:ptr, :ptr], :int
486
505
  ffi_func :tnn_realize_b, [:ptr, :ptr], :int
data/lib/toy/ffi/tinynn_cuda.rb CHANGED
@@ -238,6 +238,13 @@ module TinyNNCuda
238
238
  ffi_func :tnn_input_2d_persistent_typed, [:ptr, :int, :int, :int], :ptr
239
239
  ffi_func :tnn_row_size, [:int, :int], :long
240
240
  ffi_func :tnn_input_1d_f32_persistent, [:ptr, :int], :ptr
241
+ # #1449 fix — the token-id index leaf allocated galloc-external in ctx_w (so
242
+ # galloc can't free its slot + reuse it for the loss output). Mirrors the CPU
243
+ # tinynn.rb decl; the C function lives in the shared tinynn_ggml.c (the CUDA
244
+ # binaries link libtinynn_ggml.a too). Without this, the mirrored CUDA engine's
245
+ # finalize call to tnn_input_1d_i32_persistent is an undefined method → CUDA
246
+ # training aborts (caught by the heavy CUDA bench, 2026-06-22).
247
+ ffi_func :tnn_input_1d_i32_persistent, [:ptr, :int], :ptr
241
248
  # Phase 2 BYO-pointer mmap (CUDA path: ggml-cuda patched to expose
242
249
  # ggml_backend_cuda_buffer_from_ptr; weight tensors reference
243
250
  # cudaHostRegister'd pages and run via UVA on unified-memory SKUs).
data/lib/toy/ffi/tinynn_metal.rb CHANGED
@@ -225,6 +225,11 @@ module TinyNNMetal
225
225
  ffi_func :tnn_input_2d_persistent_typed, [:ptr, :int, :int, :int], :ptr
226
226
  ffi_func :tnn_row_size, [:int, :int], :long
227
227
  ffi_func :tnn_input_1d_f32_persistent, [:ptr, :int], :ptr
228
+ # #1449 fix — galloc-external token-id index leaf (ctx_w). Mirrors the CPU/CUDA
229
+ # decl; C function lives in the shared tinynn_ggml.c. Without it the mirrored
230
+ # Metal engine's finalize aborts (undefined method), same as the CUDA gap the
231
+ # heavy bench caught 2026-06-22. (Metal is Mac-only; not runnable here.)
232
+ ffi_func :tnn_input_1d_i32_persistent, [:ptr, :int], :ptr
228
233
  # Phase 2 BYO-pointer mmap. On Metal the buffer-from-ptr path falls
229
234
  # through to ggml_backend_cpu_buffer_from_ptr (no public Metal
230
235
  # buffer_from_ptr API); the scheduler then copies host pages to
data/lib/toy/llm/archs/layer_spec.rb ADDED
@@ -0,0 +1,39 @@
1
+ # lib/toy/llm/archs/layer_spec.rb — Phase 3 of the Dragon-GDN arc: the
2
+ # per-layer descriptor that lets one arch forward loop build a heterogeneous
3
+ # layer stack via flat-int kind dispatch (no polymorphic receiver). See
4
+ # docs/roadmap/dragon-gdn-arch-2026-06-20.md.
5
+
6
+ module Toy; module LLM; module Archs
7
+ # Per-layer descriptor — the seam that lets ONE arch forward loop build a
8
+ # heterogeneous layer stack (homogeneous Llama attention today; Dragon's
9
+ # Gated-DeltaNet + selective-attention mix from Phase 5) WITHOUT polymorphic
10
+ # method dispatch.
11
+ #
12
+ # The `kind` field is a FLAT INTEGER, deliberately not a class, symbol, or
13
+ # block object. The arch loop branches on `spec.kind == KIND_*` and then
14
+ # calls a CONCRETE typed block method inside each branch, so every
15
+ # `.build_forward` call site keeps a single receiver class. Funnelling
16
+ # heterogeneous receiver types through ONE call site is the Spinel
17
+ # poly-dispatch landmine (the #11/#12 family, matz/spinel#1043) the whole
18
+ # Dragon seam is shaped to avoid — see dragon-gdn-arch-2026-06-20.md
19
+ # "Phase 3 — the per-layer descriptor seam."
20
+ #
21
+ # Hand-written positional class, NEVER Struct.new (landmine #16 / #1043): a
22
+ # Struct's synthesized accessors unify across modules and miscompile
23
+ # unrelated callers, exactly like LlamaArchForwardOut / TransformerBlockCtx.
24
+ # Carries values, no behavior.
25
+ class LayerSpec
26
+ # Layer kinds. Flat ints so the dispatch branch stays monomorphic. The
27
+ # Phase-3 refactor gate only exercises KIND_ATTENTION (every layer); the
28
+ # GDN kind is reserved here so the seam shape is fixed before Phase 5
29
+ # actually wires a Gated-DeltaNet block into a branch.
30
+ KIND_ATTENTION = 0 # standard Llama-style attention + SwiGLU FFN
31
+ KIND_GDN = 1 # Dragon Gated-DeltaNet block (Phase 5)
32
+
33
+ attr_accessor :kind
34
+
35
+ def initialize(kind)
36
+ @kind = kind
37
+ end
38
+ end
39
+ end; end; end
data/lib/toy/llm/archs/llama_arch.rb CHANGED
@@ -67,6 +67,21 @@ module Toy; module LLM; module Archs
67
67
  class LlamaArch
68
68
  attr_accessor :t_seq_token_embed, :t_seq_final_norm_gamma, :t_seq_output,
69
69
  :t_seq_w_proj, :seq_blocks_ffi,
70
+ # Phase 3 — per-layer descriptor array, parallel to
71
+ # seq_blocks_ffi (same length == n_layers).
72
+ :seq_layer_specs,
73
+ # Phase 5 — the dispatch key is a plain INT array (one kind per
74
+ # layer), NOT LayerSpec.kind reads: constructing/mutating
75
+ # LayerSpec objects on a realize path trips a Spinel codegen
76
+ # miscompile (corrupts the token-id finalize). Mutating a plain
77
+ # int array element is proven-safe. build_forward dispatches on
78
+ # this; LayerSpec stays the descriptor type/constants home.
79
+ :seq_layer_kinds,
80
+ # Phase 5 — parallel GDN-block array (same length; entry is a
81
+ # GDNBlock at KIND_GDN positions, null elsewhere). The KIND_GDN
82
+ # dispatch arm calls into THIS array — a concrete typed call,
83
+ # so the seam stays monomorphic per call site.
84
+ :seq_gdn_blocks_ffi,
70
85
  # Orchestration-gating carriers — bare cache ivars with
71
86
  # no accessor before P2.5. The lens-branch guard reads
72
87
  # seq_donor_d_in; the shared ctx reads seq_rope_cfg.
@@ -81,6 +96,15 @@ module Toy; module LLM; module Archs
81
96
  @t_seq_w_proj = TinyNN.tnn_null_ptr
82
97
  # Seed with one block — matches the former cache init (L112).
83
98
  @seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
99
+ # Phase 3 — parallel seed: one attention spec for the seed block.
100
+ @seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
101
+ # Phase 5 — parallel int dispatch keys (KIND_ATTENTION for the seed).
102
+ @seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
103
+ # Phase 5 — parallel GDN-block slots. Seeded with GDNBlock placeholders so
104
+ # the array is MONOMORPHIC (all GDNBlock) — the seam's KIND_GDN call site
105
+ # never sees a mixed null/object array (Spinel poly-array landmine). At
106
+ # KIND_ATTENTION layers the placeholder is simply never invoked.
107
+ @seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
84
108
  @seq_donor_d_in = 0
85
109
  # The cache overwrites seq_rope_cfg with the real RoPE::Cfg before
86
110
  # build_forward runs (each realize prologue rebuilds it).
@@ -97,11 +121,33 @@ module Toy; module LLM; module Archs
97
121
  # already constructs TransformerBlock.new there, so no new class /
98
122
  # Struct / FFI :str at class load. Each realize path now calls this
99
123
  # via the cache's seq_blocks_ffi delegator chain (self.seq_arch).
124
+ # Phase 5 hybrid — flip individual layers to GDN after seeding. The mark is
125
+ # written to the plain int seq_layer_kinds array, never via LayerSpec's .kind=
126
+ # setter: mutating LayerSpec.kind while build_forward reads it trips a Spinel
127
+ # codegen miscompile that corrupts the token-id finalize. Call this after
128
+ # seed_blocks! (which resets every kind to KIND_ATTENTION), before alloc.
129
+ # Mark ONE layer as GDN. Takes an INT index (never an array param — a
130
+ # function-parameter array trips the Spinel #688 type-lock landmine, which
131
+ # here manifests as a token-id-finalize codegen miscompile). Mutates the
132
+ # plain int dispatch array element (proven-safe).
133
+ def set_gdn_layer!(idx)
134
+ @seq_layer_kinds[idx] = Toy::LLM::Archs::LayerSpec::KIND_GDN
135
+ end
136
+
100
137
  def seed_blocks!(n_layers)
101
138
  @seq_blocks_ffi = [Toy::LLM::Blocks::TransformerBlock.new]
139
+ # Phase 3 — seed the parallel spec array in lockstep. Every layer is
140
+ # KIND_ATTENTION for now (the homogeneous-Llama refactor gate); Phase 5
141
+ # overwrites seq_layer_kinds entries (set_gdn_layer!) with KIND_GDN for Dragon's pattern.
142
+ @seq_layer_specs = [Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)]
143
+ @seq_gdn_blocks_ffi = [Toy::LLM::Blocks::GDNBlock.new]
144
+ @seq_layer_kinds = [Toy::LLM::Archs::LayerSpec::KIND_ATTENTION]
102
145
  li_init = 1
103
146
  while li_init < n_layers
104
147
  @seq_blocks_ffi.push(Toy::LLM::Blocks::TransformerBlock.new)
148
+ @seq_layer_specs.push(Toy::LLM::Archs::LayerSpec.new(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION))
149
+ @seq_gdn_blocks_ffi.push(Toy::LLM::Blocks::GDNBlock.new)
150
+ @seq_layer_kinds.push(Toy::LLM::Archs::LayerSpec::KIND_ATTENTION)
105
151
  li_init = li_init + 1
106
152
  end
107
153
  end
@@ -213,7 +259,22 @@ module Toy; module LLM; module Archs
213
259
  end
214
260
  li_g = 0
215
261
  while li_g < seq_n_layers
216
- t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
262
+ # Phase 3 — per-layer descriptor dispatch. The branch compares a FLAT
263
+ # INT (seq_layer_kinds[li_g]) and each arm calls a CONCRETE typed block method, so
264
+ # every .build_forward call site stays monomorphic (one receiver
265
+ # class). KIND_ATTENTION calls into seq_blocks_ffi; KIND_GDN (Phase 5) has its
266
+ # own arm + its own typed block array (seq_gdn_blocks_ffi). Unknown kinds fail
267
+ # loud rather than silently building the wrong graph (never-mask rule).
268
+ spec_kind = self.seq_layer_kinds[li_g]
269
+ if spec_kind == Toy::LLM::Archs::LayerSpec::KIND_ATTENTION
270
+ t_cur = self.seq_blocks_ffi[li_g].build_forward(sess, t_cur, ctx)
271
+ elsif spec_kind == Toy::LLM::Archs::LayerSpec::KIND_GDN
272
+ # Concrete typed call into the parallel GDN array — the GDN block reads
273
+ # its own dims (set at alloc); seq_t/eps come from the shared ctx.
274
+ t_cur = self.seq_gdn_blocks_ffi[li_g].build_forward(sess, t_cur, seq_t, eps)
275
+ else
276
+ raise "LlamaArch#build_forward: unsupported layer kind #{spec_kind} at layer #{li_g}"
277
+ end
217
278
  li_g = li_g + 1
218
279
  end
219
280