RubyGems - toy - Versions diffs - 0.8.0 → 0.9.0 - Mend

toy 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +31 -0
data/Makefile +211 -5
data/README.md +1 -1
data/lib/toy/compute.rb +9 -0
data/lib/toy/compute_cuda.rb +8 -0
data/lib/toy/compute_metal.rb +17 -0
data/lib/toy/core/cli/new.rb +8 -0
data/lib/toy/ffi/tinynn.rb +19 -0
data/lib/toy/ffi/tinynn_cuda.rb +7 -0
data/lib/toy/ffi/tinynn_metal.rb +5 -0
data/lib/toy/llm/archs/layer_spec.rb +39 -0
data/lib/toy/llm/archs/llama_arch.rb +62 -1
data/lib/toy/llm/archs/llama_arch_cuda.rb +62 -1
data/lib/toy/llm/archs/llama_arch_metal.rb +62 -1
data/lib/toy/llm/blocks/gdn_block.rb +176 -0
data/lib/toy/llm/engine/gpt2_kv_engine.rb +11 -0
data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +11 -0
data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +11 -0
data/lib/toy/llm/engine/llama_kv_engine.rb +10 -2
data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +10 -2
data/lib/toy/llm/engine/llama_kv_engine_metal.rb +10 -2
data/lib/toy/llm/engine/llama_seq_engine.rb +16 -1
data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +16 -1
data/lib/toy/llm/engine/llama_seq_engine_metal.rb +16 -1
data/lib/toy/llm/primitives/depth_scale.rb +33 -0
data/lib/toy/llm/primitives/diff_attention.rb +71 -0
data/lib/toy/llm/primitives/gdn.rb +188 -0
data/lib/toy/llm/primitives/scalable_softmax.rb +37 -0
data/lib/toy/run/eval_metal.rb +12 -0
data/lib/toy/run/infer_metal.rb +19 -0
data/lib/toy/run/train_gpt2_metal.rb +7 -0
data/lib/toy/run/train_hybrid.rb +232 -0
data/lib/toy/run/train_metal.rb +10 -0
data/lib/toy/version.rb +4 -3
data/tinynn/tinynn_backend_cuda.c +22 -0
data/tinynn/tinynn_ggml.c +231 -0
metadata +9 -2

data/tinynn/tinynn_ggml.c CHANGED Viewed

@@ -90,6 +90,68 @@ static tnn_engine *g_engine_cpu   = NULL;
 static tnn_engine *g_engine_cuda[TNN_MAX_CUDA_DEVICES] = { NULL };
 static tnn_engine *g_engine_metal = NULL;
+/* toy#90 — Metal residency-set teardown drain.
+ *
+ * The ggml-metal backend keeps a process-lifetime residency-set
+ * collection on its (singleton) device. Each live Metal buffer adds
+ * itself to that collection on alloc and removes itself on free
+ * (vendor/ggml/src/ggml-metal/ggml-metal-device.m:1491/1594). The
+ * device is freed ONLY by a C++ static destructor at process exit
+ * (ggml-metal-device.cpp:12-26), at which point ggml asserts the
+ * collection is empty (ggml-metal-device.m:618 — a deliberate "you
+ * leaked GPU resources" guard). toy's Metal runners (lib/toy/run/)
+ * and the consumer scaffold (Toy::Device, lib/toy/compute_metal.rb)
+ * historically rely on process exit to reclaim everything and never
+ * call tnn_session_free, so a session's weights_buf is still alive at
+ * exit → the residency set is non-empty → SIGABRT (exit 134), AFTER
+ * compute already produced correct output. The CLI subprocess paths
+ * mask this with GGML_METAL_NO_RESIDENCY=1, but a directly-run
+ * consumer binary gets no such env (toy#27 runs 3-4).
+ *
+ * Fix: track every live Metal session so tnn_shutdown_engines can free
+ * them (draining their weights_buf, hence the residency set) before the
+ * static destructor runs. The registry is METAL-ONLY by construction —
+ * tnn_metal_session_register is called only for sessions created on
+ * g_engine_metal — so CPU and CUDA session/teardown semantics are
+ * byte-for-byte unchanged (their sessions are never registered, never
+ * drained here). */
+#define TNN_MAX_METAL_SESSIONS 256 /* capacity of the Metal session registry below */
+static void *g_metal_sessions[TNN_MAX_METAL_SESSIONS] = { NULL }; /* tracked sessions, packed into slots [0, g_metal_session_count) */
+static int   g_metal_session_count = 0; /* number of occupied slots in g_metal_sessions */
+/* Record a live Metal session so tnn_shutdown_engines can free it.
+ *
+ * sess: opaque session pointer. NULL is ignored. A pointer that is
+ *       already in the registry is not added a second time: unregister
+ *       removes only the first match, so a duplicate entry would make the
+ *       shutdown drain call tnn_session_free on the same session twice.
+ *
+ * When the registry already holds TNN_MAX_METAL_SESSIONS entries the
+ * session is NOT tracked; a warning goes to stderr and the call returns. */
+static void tnn_metal_session_register(void *sess)
+{
+    if (!sess) return;
+    for (int i = 0; i < g_metal_session_count; ++i) {
+        if (g_metal_sessions[i] == sess) return; /* already tracked */
+    }
+    if (g_metal_session_count >= TNN_MAX_METAL_SESSIONS) {
+        /* Fail loud (never silently drop): an undrained session leaks a
+         * residency set and re-trips the device-free assert. */
+        fprintf(stderr,
+                "[tnn] WARNING: more than %d live Metal sessions; "
+                "tnn_shutdown_engines cannot track this one and the "
+                "ggml-metal device-free residency assert may fire at "
+                "exit. Bump TNN_MAX_METAL_SESSIONS or free sessions "
+                "explicitly with tnn_session_free.\n",
+                TNN_MAX_METAL_SESSIONS);
+        return;
+    }
+    g_metal_sessions[g_metal_session_count++] = sess;
+}
+/* Remove `sess` from the Metal session registry, if it is there.
+ * The vacated slot is refilled with the last entry, so the live entries
+ * stay packed at the front of the array (their order is not preserved).
+ * A pointer that is not registered leaves the registry untouched. */
+static void tnn_metal_session_unregister(void *sess)
+{
+    int slot = 0;
+    while (slot < g_metal_session_count && g_metal_sessions[slot] != sess) {
+        ++slot;
+    }
+    if (slot == g_metal_session_count) {
+        return; /* not tracked: nothing to remove */
+    }
+    const int last = g_metal_session_count - 1;
+    g_metal_sessions[slot] = g_metal_sessions[last];
+    g_metal_sessions[last] = NULL;
+    g_metal_session_count = last;
+}
+/* Forward declaration: tnn_session_free is defined later in this file,
+ * but the drain loop in tnn_shutdown_engines calls it before that point. */
+void tnn_session_free(void *sess);
 /* CUDA backend init with device selection. Weak stub returns NULL;
  * strong override lives in tinynn_backend_cuda.c. */
 __attribute__((weak))
@@ -199,6 +261,24 @@ static tnn_engine *tnn_engine_get_on(int backend_kind, int device)
  * GPU between phases. */
 void tnn_shutdown_engines(void)
 {
+    /* toy#90 — drain any live Metal sessions FIRST. Each session_free
+     * frees s->weights_buf, whose ggml-metal buffer removes itself from
+     * the device's residency-set collection; only once every Metal
+     * buffer is freed is the device-free assert (ggml-metal-device.m:618)
+     * satisfied. tnn_session_free unregisters as it goes, so we always
+     * drain index 0 until the list is empty (no iterator invalidation).
+     * Metal-only: CPU/CUDA sessions are never registered. */
+    while (g_metal_session_count > 0) {
+        void *sess = g_metal_sessions[0];
+        if (!sess) { /* defensive: drop a stale NULL slot */
+            g_metal_sessions[0] = g_metal_sessions[g_metal_session_count - 1];
+            g_metal_sessions[g_metal_session_count - 1] = NULL;
+            --g_metal_session_count;
+            continue;
+        }
+        tnn_session_free(sess);
+    }
     /* CPU + Metal: single slot each. */
     tnn_engine **scalar_slots[] = { &g_engine_cpu, &g_engine_metal };
     for (int i = 0; i < 2; ++i) {
@@ -387,6 +467,13 @@ void *tnn_session_new_on(int backend_kind, int device)
     s->weights_map_base  = NULL;
     s->weights_map_size  = 0;
     s->last_graph        = 0;
+    /* toy#90 — register Metal sessions so tnn_shutdown_engines can drain
+     * their residency-set-carrying buffers before the ggml-metal static
+     * destructor runs. Gated to the Metal engine: CPU/CUDA sessions are
+     * never tracked, keeping their lifecycle unchanged. */
+    if (e == g_engine_metal) {
+        tnn_metal_session_register((void *)s);
+    }
     return (void *)s;
 }
@@ -394,6 +481,9 @@ void tnn_session_free(void *sess)
 {
     if (!sess) return;
     tnn_session *s = (tnn_session *)sess;
+    /* toy#90 — drop from the Metal drain registry (no-op for CPU/CUDA
+     * sessions, which were never registered). The unregister step itself
+     * is idempotent: a pointer not in the registry is simply not found. */
+    tnn_metal_session_unregister(sess);
     if (s->weights_buf)      ggml_backend_buffer_free(s->weights_buf);
     if (s->weights_buf_mmap) ggml_backend_buffer_free(s->weights_buf_mmap);
     if (s->ctx)        ggml_free(s->ctx);
@@ -815,6 +905,38 @@ void *tnn_ssm_scan(void *sess, void *state, void *x, void *dt,
                                   (struct ggml_tensor *)ids);
 }
+/* Gated DeltaNet recurrence core (Dragon / Qwen3-Next family). The q/k/v/g/beta
+ * projections + the short causal conv are built by the Ruby GDN primitive; this
+ * is the fused recurrence op only. Shapes (all F32): v=[S_v,H,T,B];
+ * q,k contiguous-rows; g=[1|S_v,H,T,B]; beta ne0==1; state=[S_v*S_v*H,K,B,1].
+ * out=[S_v*H, T*B + K*S_v*B, 1, 1] (token outputs then trailing state snapshots).
+ * Forward only in ggml — training backward is a separate hand-written kernel. */
+void *tnn_gated_delta_net(void *sess, void *q, void *k, void *v,
+                          void *g, void *beta, void *state)
+{
+    /* Every argument is mandatory: a NULL session or tensor yields NULL. */
+    if (sess == NULL) return NULL;
+    if (q == NULL || k == NULL || v == NULL) return NULL;
+    if (g == NULL || beta == NULL || state == NULL) return NULL;
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *t_q     = (struct ggml_tensor *)q;
+    struct ggml_tensor *t_k     = (struct ggml_tensor *)k;
+    struct ggml_tensor *t_v     = (struct ggml_tensor *)v;
+    struct ggml_tensor *t_g     = (struct ggml_tensor *)g;
+    struct ggml_tensor *t_beta  = (struct ggml_tensor *)beta;
+    struct ggml_tensor *t_state = (struct ggml_tensor *)state;
+    /* The result of ggml_gated_delta_net is handed back as an opaque pointer. */
+    return (void *)ggml_gated_delta_net(session->ctx, t_q, t_k, t_v,
+                                        t_g, t_beta, t_state);
+}
+/* 1-D convolution (kernel a, data b) with stride/pad/dilation — the short
+ * causal conv inside a GDN block (also generally useful). */
+void *tnn_conv_1d(void *sess, void *a, void *b, int s0, int p0, int d0)
+{
+    /* a = kernel, b = data; s0/p0/d0 are passed to ggml_conv_1d unchanged.
+     * A NULL session or tensor yields NULL. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *kernel = (struct ggml_tensor *)a;
+    struct ggml_tensor *data   = (struct ggml_tensor *)b;
+    if (session == NULL || kernel == NULL || data == NULL) return NULL;
+    return (void *)ggml_conv_1d(session->ctx, kernel, data, s0, p0, d0);
+}
 void *tnn_gelu(void *sess, void *a)
 {
     if (!sess || !a) return NULL;
@@ -825,6 +947,101 @@ void *tnn_gelu(void *sess, void *a)
     return (void *)ggml_gelu(s->ctx, (struct ggml_tensor *)a);
 }
+/* Unary/binary elementwise ops used to compose the GDN gate math, differential
+ * attention, and gated output norm (Dragon/GDN Phase 2). All thin ggml wraps. */
+void *tnn_sigmoid(void *sess, void *a)
+{
+    /* Thin wrap of ggml_sigmoid on the session's ctx; NULL in, NULL out. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    if (session == NULL || input == NULL) return NULL;
+    return (void *)ggml_sigmoid(session->ctx, input);
+}
+void *tnn_exp(void *sess, void *a)
+{
+    /* Thin wrap of ggml_exp on the session's ctx; NULL in, NULL out. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    if (session == NULL || input == NULL) return NULL;
+    return (void *)ggml_exp(session->ctx, input);
+}
+void *tnn_log(void *sess, void *a)
+{
+    /* Thin wrap of ggml_log on the session's ctx; NULL in, NULL out. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    if (session == NULL || input == NULL) return NULL;
+    return (void *)ggml_log(session->ctx, input);
+}
+void *tnn_neg(void *sess, void *a)
+{
+    /* Thin wrap of ggml_neg on the session's ctx; NULL in, NULL out. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    if (session == NULL || input == NULL) return NULL;
+    return (void *)ggml_neg(session->ctx, input);
+}
+void *tnn_sub(void *sess, void *a, void *b)
+{
+    /* Thin wrap of ggml_sub(a, b) on the session's ctx; any NULL argument
+     * yields NULL. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *lhs = (struct ggml_tensor *)a;
+    struct ggml_tensor *rhs = (struct ggml_tensor *)b;
+    if (session == NULL || lhs == NULL || rhs == NULL) return NULL;
+    return (void *)ggml_sub(session->ctx, lhs, rhs);
+}
+void *tnn_sqrt(void *sess, void *a)
+{
+    /* Elementwise square root. ggml provides a backward for it
+     * (GGML_OP_SQRT), which is why it is used to build an L2 norm that can
+     * be differentiated (L2_NORM itself has no ggml backward).
+     * A NULL session or tensor yields NULL. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    if (session == NULL || input == NULL) return NULL;
+    return (void *)ggml_sqrt(session->ctx, input);
+}
+void *tnn_repeat(void *sess, void *a, void *b)
+{
+    /* ggml_repeat broadcasts `a` up to the shape of `b`. Its backward
+     * (GGML_OP_REPEAT → repeat_back) SUMS the gradient back down to a's
+     * shape. Materialising the broadcast operand this way lets a following
+     * same-shape op (e.g. DIV, whose backward does NOT reduce a broadcast
+     * src1) see matching shapes, so the gradient reduction goes through
+     * the well-formed REPEAT backward instead.
+     * Any NULL argument yields NULL. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *source = (struct ggml_tensor *)a;
+    struct ggml_tensor *shape_like = (struct ggml_tensor *)b;
+    if (session == NULL || source == NULL || shape_like == NULL) return NULL;
+    return (void *)ggml_repeat(session->ctx, source, shape_like);
+}
+void *tnn_div(void *sess, void *a, void *b)
+{
+    /* Elementwise a/b using ggml broadcast (b repeats into a); ggml has a
+     * backward for it (GGML_OP_DIV). In the composed L2 the divisor is
+     * [1,H,T], broadcasting over the [S_v,H,T] numerator.
+     * Any NULL argument yields NULL. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *numerator = (struct ggml_tensor *)a;
+    struct ggml_tensor *divisor   = (struct ggml_tensor *)b;
+    if (session == NULL || numerator == NULL || divisor == NULL) return NULL;
+    return (void *)ggml_div(session->ctx, numerator, divisor);
+}
+/* L2-normalise rows along ne0 (q/k normalisation for the delta rule). */
+void *tnn_l2_norm(void *sess, void *a, double eps)
+{
+    /* eps arrives as a double and is narrowed to float for ggml_l2_norm.
+     * A NULL session or tensor yields NULL. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    if (session == NULL || input == NULL) return NULL;
+    const float eps_f = (float)eps;
+    return (void *)ggml_l2_norm(session->ctx, input, eps_f);
+}
+/* softplus(x) = log(1 + exp(x)) — the GDN log-decay gate. */
+void *tnn_softplus(void *sess, void *a)
+{
+    /* Thin wrap of ggml_softplus on the session's ctx; NULL in, NULL out. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    if (session == NULL || input == NULL) return NULL;
+    return (void *)ggml_softplus(session->ctx, input);
+}
+/* scale + bias: s*x + b (compile-time scalars). Used for SSMax (s*log n + b)
+ * and depth scaling. */
+void *tnn_scale_bias(void *sess, void *a, double s, double b)
+{
+    /* Both scalars arrive as doubles and are narrowed to float for
+     * ggml_scale_bias. A NULL session or tensor yields NULL. */
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    if (session == NULL || input == NULL) return NULL;
+    const float scale = (float)s;
+    const float bias  = (float)b;
+    return (void *)ggml_scale_bias(session->ctx, input, scale, bias);
+}
 void *tnn_rms_norm(void *sess, void *x, void *gamma_row, double eps)
 {
     if (!sess || !x || !gamma_row) return NULL;
@@ -1316,6 +1533,20 @@ void *tnn_input_1d_i32(void *sess, int n)
     return (void *)ggml_new_tensor_1d(s->ctx, GGML_TYPE_I32, (int64_t)n);
 }
+/* Persistent i32 input in ctx_w (#1449): a graph INPUT read across the
+ * forward->backward boundary must not live in the galloc compute arena, where
+ * galloc (seeing it as dead after the forward gather) frees its offset and
+ * reuses it for the loss output -> backward get_rows reads loss bits as a wild
+ * index. ctx_w is galloc-external and survives reset_for_rebuild. Allocated
+ * before tnn_finalize_weights; re-uploaded each step. */
+void *tnn_input_1d_i32_persistent(void *sess, int n)
+{
+    /* Reject a missing session or a non-positive element count. */
+    if (sess == NULL || n < 1) return NULL;
+    tnn_session *session = (tnn_session *)sess;
+    /* Once the session is flagged weights_finalized, no further ctx_w
+     * tensor is handed out. */
+    if (session->weights_finalized) return NULL;
+    const int64_t length = (int64_t)n;
+    return (void *)ggml_new_tensor_1d(session->ctx_w, GGML_TYPE_I32, length);
+}
 void tnn_gelu_back_scratch(void *sess, int n)
 {
     if (!sess || n <= 0) return;

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: toy
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.9.0
 platform: ruby
 authors:
 - Ori Pekelman
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-06-12 00:00:00.000000000 Z
+date: 2026-06-27 00:00:00.000000000 Z
 dependencies: []
 description: |-
   Toy is a pure-Ruby transformer LM that compiles to a native binary
@@ -74,9 +74,11 @@ files:
 - lib/toy/io/toy_events.rb
 - lib/toy/io/toy_image_loader.rb
 - lib/toy/llm/adamw.rb
+- lib/toy/llm/archs/layer_spec.rb
 - lib/toy/llm/archs/llama_arch.rb
 - lib/toy/llm/archs/llama_arch_cuda.rb
 - lib/toy/llm/archs/llama_arch_metal.rb
+- lib/toy/llm/blocks/gdn_block.rb
 - lib/toy/llm/blocks/transformer_block.rb
 - lib/toy/llm/blocks/transformer_block_cuda.rb
 - lib/toy/llm/blocks/transformer_block_metal.rb
@@ -98,6 +100,9 @@ files:
 - lib/toy/llm/engine/llama_seq_engine_metal.rb
 - lib/toy/llm/engine/vit_tiny_engine.rb
 - lib/toy/llm/labels.rb
+- lib/toy/llm/primitives/depth_scale.rb
+- lib/toy/llm/primitives/diff_attention.rb
+- lib/toy/llm/primitives/gdn.rb
 - lib/toy/llm/primitives/gqa.rb
 - lib/toy/llm/primitives/gqa_cuda.rb
 - lib/toy/llm/primitives/gqa_metal.rb
@@ -107,6 +112,7 @@ files:
 - lib/toy/llm/primitives/rope.rb
 - lib/toy/llm/primitives/rope_cuda.rb
 - lib/toy/llm/primitives/rope_metal.rb
+- lib/toy/llm/primitives/scalable_softmax.rb
 - lib/toy/llm/primitives/swiglu.rb
 - lib/toy/llm/primitives/swiglu_cuda.rb
 - lib/toy/llm/primitives/swiglu_metal.rb
@@ -146,6 +152,7 @@ files:
 - lib/toy/run/train_gpt2.rb
 - lib/toy/run/train_gpt2_cuda.rb
 - lib/toy/run/train_gpt2_metal.rb
+- lib/toy/run/train_hybrid.rb
 - lib/toy/run/train_lora.rb
 - lib/toy/run/train_lora_cuda.rb
 - lib/toy/run/train_metal.rb