toy 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +31 -0
  3. data/Makefile +211 -5
  4. data/README.md +1 -1
  5. data/lib/toy/compute.rb +9 -0
  6. data/lib/toy/compute_cuda.rb +8 -0
  7. data/lib/toy/compute_metal.rb +17 -0
  8. data/lib/toy/core/cli/new.rb +8 -0
  9. data/lib/toy/ffi/tinynn.rb +19 -0
  10. data/lib/toy/ffi/tinynn_cuda.rb +7 -0
  11. data/lib/toy/ffi/tinynn_metal.rb +5 -0
  12. data/lib/toy/llm/archs/layer_spec.rb +39 -0
  13. data/lib/toy/llm/archs/llama_arch.rb +62 -1
  14. data/lib/toy/llm/archs/llama_arch_cuda.rb +62 -1
  15. data/lib/toy/llm/archs/llama_arch_metal.rb +62 -1
  16. data/lib/toy/llm/blocks/gdn_block.rb +176 -0
  17. data/lib/toy/llm/engine/gpt2_kv_engine.rb +11 -0
  18. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +11 -0
  19. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +11 -0
  20. data/lib/toy/llm/engine/llama_kv_engine.rb +10 -2
  21. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +10 -2
  22. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +10 -2
  23. data/lib/toy/llm/engine/llama_seq_engine.rb +16 -1
  24. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +16 -1
  25. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +16 -1
  26. data/lib/toy/llm/primitives/depth_scale.rb +33 -0
  27. data/lib/toy/llm/primitives/diff_attention.rb +71 -0
  28. data/lib/toy/llm/primitives/gdn.rb +188 -0
  29. data/lib/toy/llm/primitives/scalable_softmax.rb +37 -0
  30. data/lib/toy/run/eval_metal.rb +12 -0
  31. data/lib/toy/run/infer_metal.rb +19 -0
  32. data/lib/toy/run/train_gpt2_metal.rb +7 -0
  33. data/lib/toy/run/train_hybrid.rb +232 -0
  34. data/lib/toy/run/train_metal.rb +10 -0
  35. data/lib/toy/version.rb +4 -3
  36. data/tinynn/tinynn_backend_cuda.c +22 -0
  37. data/tinynn/tinynn_ggml.c +231 -0
  38. metadata +9 -2
data/tinynn/tinynn_ggml.c CHANGED
@@ -90,6 +90,68 @@ static tnn_engine *g_engine_cpu = NULL;
90
90
  static tnn_engine *g_engine_cuda[TNN_MAX_CUDA_DEVICES] = { NULL };
91
91
  static tnn_engine *g_engine_metal = NULL;
92
92
 
93
+ /* toy#90 — Metal residency-set teardown drain.
94
+ *
95
+ * The ggml-metal backend keeps a process-lifetime residency-set
96
+ * collection on its (singleton) device. Each live Metal buffer adds
97
+ * itself to that collection on alloc and removes itself on free
98
+ * (vendor/ggml/src/ggml-metal/ggml-metal-device.m:1491/1594). The
99
+ * device is freed ONLY by a C++ static destructor at process exit
100
+ * (ggml-metal-device.cpp:12-26), at which point ggml asserts the
101
+ * collection is empty (ggml-metal-device.m:618 — a deliberate "you
102
+ * leaked GPU resources" guard). toy's Metal runners (lib/toy/run/)
103
+ * and the consumer scaffold (Toy::Device, lib/toy/compute_metal.rb)
104
+ * historically rely on process exit to reclaim everything and never
105
+ * call tnn_session_free, so a session's weights_buf is still alive at
106
+ * exit → the residency set is non-empty → SIGABRT (exit 134), AFTER
107
+ * compute already produced correct output. The CLI subprocess paths
108
+ * mask this with GGML_METAL_NO_RESIDENCY=1, but a directly-run
109
+ * consumer binary gets no such env (toy#27 runs 3-4).
110
+ *
111
+ * Fix: track every live Metal session so tnn_shutdown_engines can free
112
+ * them (draining their weights_buf, hence the residency set) before the
113
+ * static destructor runs. The registry is METAL-ONLY by construction —
114
+ * tnn_metal_session_register is called only when s->engine == g_engine_metal
115
+ * — so CPU and CUDA session/teardown semantics are byte-for-byte
116
+ * unchanged (their sessions are never registered, never drained here). */
117
+ #define TNN_MAX_METAL_SESSIONS 256
118
+ static void *g_metal_sessions[TNN_MAX_METAL_SESSIONS] = { NULL };
119
+ static int g_metal_session_count = 0;
120
+
121
+ static void tnn_metal_session_register(void *sess)
122
+ {
123
+ if (g_metal_session_count >= TNN_MAX_METAL_SESSIONS) {
124
+ /* Fail loud (never silently drop): an undrained session leaks a
125
+ * residency set and re-trips the device-free assert. */
126
+ fprintf(stderr,
127
+ "[tnn] WARNING: more than %d live Metal sessions; "
128
+ "tnn_shutdown_engines cannot track this one and the "
129
+ "ggml-metal device-free residency assert may fire at "
130
+ "exit. Bump TNN_MAX_METAL_SESSIONS or free sessions "
131
+ "explicitly with tnn_session_free.\n",
132
+ TNN_MAX_METAL_SESSIONS);
133
+ return;
134
+ }
135
+ g_metal_sessions[g_metal_session_count++] = sess;
136
+ }
137
+
138
+ static void tnn_metal_session_unregister(void *sess)
139
+ {
140
+ for (int i = 0; i < g_metal_session_count; ++i) {
141
+ if (g_metal_sessions[i] == sess) {
142
+ /* compact: move the tail entry into the hole */
143
+ g_metal_sessions[i] = g_metal_sessions[g_metal_session_count - 1];
144
+ g_metal_sessions[g_metal_session_count - 1] = NULL;
145
+ --g_metal_session_count;
146
+ return;
147
+ }
148
+ }
149
+ }
150
+
151
+ /* Forward decl: tnn_session_free is defined further down; the drain in
152
+ * tnn_shutdown_engines needs it. */
153
+ void tnn_session_free(void *sess);
154
+
93
155
  /* CUDA backend init with device selection. Weak stub returns NULL;
94
156
  * strong override lives in tinynn_backend_cuda.c. */
95
157
  __attribute__((weak))
@@ -199,6 +261,24 @@ static tnn_engine *tnn_engine_get_on(int backend_kind, int device)
199
261
  * GPU between phases. */
200
262
  void tnn_shutdown_engines(void)
201
263
  {
264
+ /* toy#90 — drain any live Metal sessions FIRST. Each session_free
265
+ * frees s->weights_buf, whose ggml-metal buffer removes itself from
266
+ * the device's residency-set collection; only once every Metal
267
+ * buffer is freed is the device-free assert (ggml-metal-device.m:618)
268
+ * satisfied. tnn_session_free unregisters as it goes, so we always
269
+ * drain index 0 until the list is empty (no iterator invalidation).
270
+ * Metal-only: CPU/CUDA sessions are never registered. */
271
+ while (g_metal_session_count > 0) {
272
+ void *sess = g_metal_sessions[0];
273
+ if (!sess) { /* defensive: drop a stale NULL slot */
274
+ g_metal_sessions[0] = g_metal_sessions[g_metal_session_count - 1];
275
+ g_metal_sessions[g_metal_session_count - 1] = NULL;
276
+ --g_metal_session_count;
277
+ continue;
278
+ }
279
+ tnn_session_free(sess);
280
+ }
281
+
202
282
  /* CPU + Metal: single slot each. */
203
283
  tnn_engine **scalar_slots[] = { &g_engine_cpu, &g_engine_metal };
204
284
  for (int i = 0; i < 2; ++i) {
@@ -387,6 +467,13 @@ void *tnn_session_new_on(int backend_kind, int device)
387
467
  s->weights_map_base = NULL;
388
468
  s->weights_map_size = 0;
389
469
  s->last_graph = 0;
470
+ /* toy#90 — register Metal sessions so tnn_shutdown_engines can drain
471
+ * their residency-set-carrying buffers before the ggml-metal static
472
+ * destructor runs. Gated to the Metal engine: CPU/CUDA sessions are
473
+ * never tracked, keeping their lifecycle unchanged. */
474
+ if (e == g_engine_metal) {
475
+ tnn_metal_session_register((void *)s);
476
+ }
390
477
  return (void *)s;
391
478
  }
392
479
 
@@ -394,6 +481,9 @@ void tnn_session_free(void *sess)
394
481
  {
395
482
  if (!sess) return;
396
483
  tnn_session *s = (tnn_session *)sess;
484
+ /* toy#90 — drop from the Metal drain registry (no-op for CPU/CUDA
485
+ * sessions, which were never registered). Idempotent. */
486
+ tnn_metal_session_unregister(sess);
397
487
  if (s->weights_buf) ggml_backend_buffer_free(s->weights_buf);
398
488
  if (s->weights_buf_mmap) ggml_backend_buffer_free(s->weights_buf_mmap);
399
489
  if (s->ctx) ggml_free(s->ctx);
@@ -815,6 +905,38 @@ void *tnn_ssm_scan(void *sess, void *state, void *x, void *dt,
815
905
  (struct ggml_tensor *)ids);
816
906
  }
817
907
 
908
+ /* Gated DeltaNet recurrence core (Dragon / Qwen3-Next family). The q/k/v/g/beta
909
+ * projections + the short causal conv are built by the Ruby GDN primitive; this
910
+ * is the fused recurrence op only. Shapes (all F32): v=[S_v,H,T,B];
911
+ * q,k contiguous-rows; g=[1|S_v,H,T,B]; beta ne0==1; state=[S_v*S_v*H,K,B,1].
912
+ * out=[S_v*H, T*B + K*S_v*B, 1, 1] (token outputs then trailing state snapshots).
913
+ * Forward only in ggml — training backward is a separate hand-written kernel. */
914
+ void *tnn_gated_delta_net(void *sess, void *q, void *k, void *v,
915
+ void *g, void *beta, void *state)
916
+ {
917
+ if (!sess || !q || !k || !v || !g || !beta || !state) return NULL;
918
+ tnn_session *s = (tnn_session *)sess;
919
+ return (void *)ggml_gated_delta_net(s->ctx,
920
+ (struct ggml_tensor *)q,
921
+ (struct ggml_tensor *)k,
922
+ (struct ggml_tensor *)v,
923
+ (struct ggml_tensor *)g,
924
+ (struct ggml_tensor *)beta,
925
+ (struct ggml_tensor *)state);
926
+ }
927
+
928
+ /* 1-D convolution (kernel a, data b) with stride/pad/dilation — the short
929
+ * causal conv inside a GDN block (also generally useful). */
930
+ void *tnn_conv_1d(void *sess, void *a, void *b, int s0, int p0, int d0)
931
+ {
932
+ if (!sess || !a || !b) return NULL;
933
+ tnn_session *s = (tnn_session *)sess;
934
+ return (void *)ggml_conv_1d(s->ctx,
935
+ (struct ggml_tensor *)a,
936
+ (struct ggml_tensor *)b,
937
+ s0, p0, d0);
938
+ }
939
+
818
940
  void *tnn_gelu(void *sess, void *a)
819
941
  {
820
942
  if (!sess || !a) return NULL;
@@ -825,6 +947,101 @@ void *tnn_gelu(void *sess, void *a)
825
947
  return (void *)ggml_gelu(s->ctx, (struct ggml_tensor *)a);
826
948
  }
827
949
 
950
+ /* Unary/binary elementwise ops used to compose the GDN gate math, differential
951
+ * attention, and gated output norm (Dragon/GDN Phase 2). All thin ggml wraps. */
952
+ void *tnn_sigmoid(void *sess, void *a)
953
+ {
954
+ if (!sess || !a) return NULL;
955
+ tnn_session *s = (tnn_session *)sess;
956
+ return (void *)ggml_sigmoid(s->ctx, (struct ggml_tensor *)a);
957
+ }
958
+
959
+ void *tnn_exp(void *sess, void *a)
960
+ {
961
+ if (!sess || !a) return NULL;
962
+ tnn_session *s = (tnn_session *)sess;
963
+ return (void *)ggml_exp(s->ctx, (struct ggml_tensor *)a);
964
+ }
965
+
966
+ void *tnn_log(void *sess, void *a)
967
+ {
968
+ if (!sess || !a) return NULL;
969
+ tnn_session *s = (tnn_session *)sess;
970
+ return (void *)ggml_log(s->ctx, (struct ggml_tensor *)a);
971
+ }
972
+
973
+ void *tnn_neg(void *sess, void *a)
974
+ {
975
+ if (!sess || !a) return NULL;
976
+ tnn_session *s = (tnn_session *)sess;
977
+ return (void *)ggml_neg(s->ctx, (struct ggml_tensor *)a);
978
+ }
979
+
980
+ void *tnn_sub(void *sess, void *a, void *b)
981
+ {
982
+ if (!sess || !a || !b) return NULL;
983
+ tnn_session *s = (tnn_session *)sess;
984
+ return (void *)ggml_sub(s->ctx, (struct ggml_tensor *)a, (struct ggml_tensor *)b);
985
+ }
986
+
987
+ void *tnn_sqrt(void *sess, void *a)
988
+ {
989
+ /* elementwise sqrt; has ggml backward (GGML_OP_SQRT). Used to compose a
990
+ * backward-friendly L2 norm (L2_NORM itself has no ggml backward). */
991
+ if (!sess || !a) return NULL;
992
+ tnn_session *s = (tnn_session *)sess;
993
+ return (void *)ggml_sqrt(s->ctx, (struct ggml_tensor *)a);
994
+ }
995
+
996
+ void *tnn_repeat(void *sess, void *a, void *b)
997
+ {
998
+ /* ggml_repeat: broadcast `a` up to the shape of `b`. Has ggml backward
999
+ * (GGML_OP_REPEAT → repeat_back, which SUMS the grad back down to a's
1000
+ * shape). Used to materialise a broadcast operand explicitly so a later
1001
+ * same-shape op (e.g. DIV, whose backward does NOT reduce a broadcast
1002
+ * src1) sees matching shapes and the grad reduction happens through the
1003
+ * well-formed REPEAT backward instead. */
1004
+ if (!sess || !a || !b) return NULL;
1005
+ tnn_session *s = (tnn_session *)sess;
1006
+ return (void *)ggml_repeat(s->ctx, (struct ggml_tensor *)a,
1007
+ (struct ggml_tensor *)b);
1008
+ }
1009
+
1010
+ void *tnn_div(void *sess, void *a, void *b)
1011
+ {
1012
+ /* elementwise a/b with ggml broadcast (b repeats into a); has ggml backward
1013
+ * (GGML_OP_DIV). The divisor in the composed L2 is [1,H,T] broadcasting over
1014
+ * the [S_v,H,T] numerator. */
1015
+ if (!sess || !a || !b) return NULL;
1016
+ tnn_session *s = (tnn_session *)sess;
1017
+ return (void *)ggml_div(s->ctx, (struct ggml_tensor *)a, (struct ggml_tensor *)b);
1018
+ }
1019
+
1020
+ /* L2-normalise rows along ne0 (q/k normalisation for the delta rule). */
1021
+ void *tnn_l2_norm(void *sess, void *a, double eps)
1022
+ {
1023
+ if (!sess || !a) return NULL;
1024
+ tnn_session *s = (tnn_session *)sess;
1025
+ return (void *)ggml_l2_norm(s->ctx, (struct ggml_tensor *)a, (float)eps);
1026
+ }
1027
+
1028
+ /* softplus(x) = log(1 + exp(x)) — the GDN log-decay gate. */
1029
+ void *tnn_softplus(void *sess, void *a)
1030
+ {
1031
+ if (!sess || !a) return NULL;
1032
+ tnn_session *s = (tnn_session *)sess;
1033
+ return (void *)ggml_softplus(s->ctx, (struct ggml_tensor *)a);
1034
+ }
1035
+
1036
+ /* scale + bias: s*x + b (compile-time scalars). Used for SSMax (s*log n + b)
1037
+ * and depth scaling. */
1038
+ void *tnn_scale_bias(void *sess, void *a, double s, double b)
1039
+ {
1040
+ if (!sess || !a) return NULL;
1041
+ tnn_session *sx = (tnn_session *)sess;
1042
+ return (void *)ggml_scale_bias(sx->ctx, (struct ggml_tensor *)a, (float)s, (float)b);
1043
+ }
1044
+
828
1045
  void *tnn_rms_norm(void *sess, void *x, void *gamma_row, double eps)
829
1046
  {
830
1047
  if (!sess || !x || !gamma_row) return NULL;
@@ -1316,6 +1533,20 @@ void *tnn_input_1d_i32(void *sess, int n)
1316
1533
  return (void *)ggml_new_tensor_1d(s->ctx, GGML_TYPE_I32, (int64_t)n);
1317
1534
  }
1318
1535
 
1536
+ /* Persistent i32 input in ctx_w (#1449): a graph INPUT read across the
1537
+ * forward->backward boundary must not live in the galloc compute arena, where
1538
+ * galloc (seeing it as dead after the forward gather) frees its offset and
1539
+ * reuses it for the loss output -> backward get_rows reads loss bits as a wild
1540
+ * index. ctx_w is galloc-external and survives reset_for_rebuild. Allocated
1541
+ * before tnn_finalize_weights; re-uploaded each step. */
1542
+ void *tnn_input_1d_i32_persistent(void *sess, int n)
1543
+ {
1544
+ if (!sess || n <= 0) return NULL;
1545
+ tnn_session *s = (tnn_session *)sess;
1546
+ if (s->weights_finalized) return NULL;
1547
+ return (void *)ggml_new_tensor_1d(s->ctx_w, GGML_TYPE_I32, (int64_t)n);
1548
+ }
1549
+
1319
1550
  void tnn_gelu_back_scratch(void *sess, int n)
1320
1551
  {
1321
1552
  if (!sess || n <= 0) return;
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: toy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ori Pekelman
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-06-12 00:00:00.000000000 Z
11
+ date: 2026-06-27 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: |-
14
14
  Toy is a pure-Ruby transformer LM that compiles to a native binary
@@ -74,9 +74,11 @@ files:
74
74
  - lib/toy/io/toy_events.rb
75
75
  - lib/toy/io/toy_image_loader.rb
76
76
  - lib/toy/llm/adamw.rb
77
+ - lib/toy/llm/archs/layer_spec.rb
77
78
  - lib/toy/llm/archs/llama_arch.rb
78
79
  - lib/toy/llm/archs/llama_arch_cuda.rb
79
80
  - lib/toy/llm/archs/llama_arch_metal.rb
81
+ - lib/toy/llm/blocks/gdn_block.rb
80
82
  - lib/toy/llm/blocks/transformer_block.rb
81
83
  - lib/toy/llm/blocks/transformer_block_cuda.rb
82
84
  - lib/toy/llm/blocks/transformer_block_metal.rb
@@ -98,6 +100,9 @@ files:
98
100
  - lib/toy/llm/engine/llama_seq_engine_metal.rb
99
101
  - lib/toy/llm/engine/vit_tiny_engine.rb
100
102
  - lib/toy/llm/labels.rb
103
+ - lib/toy/llm/primitives/depth_scale.rb
104
+ - lib/toy/llm/primitives/diff_attention.rb
105
+ - lib/toy/llm/primitives/gdn.rb
101
106
  - lib/toy/llm/primitives/gqa.rb
102
107
  - lib/toy/llm/primitives/gqa_cuda.rb
103
108
  - lib/toy/llm/primitives/gqa_metal.rb
@@ -107,6 +112,7 @@ files:
107
112
  - lib/toy/llm/primitives/rope.rb
108
113
  - lib/toy/llm/primitives/rope_cuda.rb
109
114
  - lib/toy/llm/primitives/rope_metal.rb
115
+ - lib/toy/llm/primitives/scalable_softmax.rb
110
116
  - lib/toy/llm/primitives/swiglu.rb
111
117
  - lib/toy/llm/primitives/swiglu_cuda.rb
112
118
  - lib/toy/llm/primitives/swiglu_metal.rb
@@ -146,6 +152,7 @@ files:
146
152
  - lib/toy/run/train_gpt2.rb
147
153
  - lib/toy/run/train_gpt2_cuda.rb
148
154
  - lib/toy/run/train_gpt2_metal.rb
155
+ - lib/toy/run/train_hybrid.rb
149
156
  - lib/toy/run/train_lora.rb
150
157
  - lib/toy/run/train_lora_cuda.rb
151
158
  - lib/toy/run/train_metal.rb