RubyGems - toy - Versions diffs - 0.8.0 - Mend

toy 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2107) hide show

data/tinynn/tinynn_ggml.c ADDED Viewed

@@ -0,0 +1,2460 @@
+#include "tinynn_ggml.h"
+#include "tinynn_trace.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+#include "gguf.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <unistd.h>
+/* Default (weak) CUDA backend initializer.
+ *
+ * The real implementation lives in tinynn_backend_cuda.c, which is only
+ * present when linking against libtinynn_ggml_cuda.a; its strong
+ * definition replaces this one at link time. Keeping a weak DEFINITION
+ * here lets a single tinynn_ggml.o serve both CPU-only and CUDA
+ * programs without symbol duplication, on both clang and gcc.
+ *
+ * Returns NULL, i.e. "no CUDA backend available".
+ */
+__attribute__((weak))
+ggml_backend_t tnn_backend_cuda_init_internal(void)
+{
+    return NULL;
+}
+/* Default (weak) Metal backend initializer, following the same
+ * weak-default / strong-override pattern as the CUDA one.
+ *
+ * The strong definition lives in tinynn/tinynn_backend_metal.m and is
+ * compiled only into libtinynn_ggml_metal.a (macOS-only). Builds
+ * without the Metal archive get this stub, which returns NULL so that
+ * callers fall through to the CPU backend.
+ */
+__attribute__((weak))
+ggml_backend_t tnn_backend_metal_init_internal(void)
+{
+    return NULL;
+}
+/* Weak hook for wrapping a host memory region (typically an mmap'd
+ * GGUF) in a CUDA-side backend buffer.
+ *
+ * The CUDA archive (tinynn_backend_cuda.c) overrides this with a strong
+ * definition that calls into the patched ggml-cuda
+ * (ggml_backend_cuda_buffer_from_ptr). In a CPU-only build this stub is
+ * what gets linked: it ignores every argument and returns NULL.
+ */
+__attribute__((weak))
+ggml_backend_buffer_t tnn_cuda_buffer_from_ptr_internal(void *host_ptr,
+                                                        size_t size,
+                                                        int device)
+{
+    /* Nothing to wrap without the CUDA archive; silence unused warnings. */
+    (void)host_ptr;
+    (void)size;
+    (void)device;
+    return NULL;
+}
+#define TNN_SCRATCH_BYTES (16 * 1024 * 1024)   /* per-session scratch buffer size: 16 MiB = 4M f32 values */
+/* P6: per-op timing hook installed as the scheduler's eval callback.
+ * While tnn_trace_op_capture_active() is true, each ggml node produces
+ * one begin/end pair (a Chrome-Trace duration event named after the
+ * node's op). When capture is off the cost is a single flag check per
+ * call.
+ *
+ * Callback protocol (ggml_backend_sched_eval_callback, ggml-backend.h):
+ *   ask=true  → return true to request a post call, false to skip it.
+ *   ask=false → return true to continue compute, false to ABORT.
+ * The post call always returns true: this hook never aborts a graph. */
+static bool tnn_sched_op_eval_cb(struct ggml_tensor *t, bool ask, void *user_data) {
+    (void)user_data;
+    const bool capturing = tnn_trace_op_capture_active();
+    if (ask) {
+        /* Pre-call: open the timing window only when tracing is on, and
+         * request the matching post-call in exactly that case. */
+        if (capturing) {
+            tnn_trace_op_record_begin();
+        }
+        return capturing;
+    }
+    /* Post-call: close the window (if tracing) and keep computing. */
+    if (capturing) {
+        tnn_trace_op_record_end(ggml_op_name(t->op));
+    }
+    return true;
+}
+/* Engine: persistent across the program's lifetime. Holds the backend
+ * objects + scheduler. Cached per backend flavor so multiple
+ * session_new calls share one backend init. Engines are created by
+ * tnn_engine_get_on and destroyed only by tnn_shutdown_engines;
+ * sessions merely reference them. */
+typedef struct {
+    ggml_backend_t       backend;        /* primary backend: CUDA / Metal / CPU */
+    ggml_backend_t       cpu_backend;    /* sched fallback when primary is GPU; NULL when primary is CPU */
+    ggml_backend_sched_t sched;          /* scheduler built over backend (+ cpu_backend) */
+    const char          *backend_name;   /* string literal: "cuda", "metal" or "cpu" */
+} tnn_engine;
+/* GH#3 — multi-GPU mode 1 (replicated inference). CUDA engine cache
+ * widened from a scalar to a per-device array so each GPU can have
+ * its own backend/sched and sessions can be pinned to a device via
+ * tnn_session_new_on(kind, device). CPU and Metal stay scalar (Metal
+ * = single Apple GPU; CPU = single host).
+ *
+ * Bound is conservative — 8 GPUs on one host is more than any
+ * realistic toy deployment. If you need more, bump and rebuild.
+ *
+ * All slots start out NULL, are filled lazily by tnn_engine_get_on and
+ * are reset to NULL by tnn_shutdown_engines. */
+#define TNN_MAX_CUDA_DEVICES 8
+static tnn_engine *g_engine_cpu   = NULL;
+static tnn_engine *g_engine_cuda[TNN_MAX_CUDA_DEVICES] = { NULL };
+static tnn_engine *g_engine_metal = NULL;
+/* Device-selecting CUDA backend initializer (weak default).
+ *
+ * The strong override lives in tinynn_backend_cuda.c. Without it this
+ * stub is linked instead: the device index is ignored and NULL is
+ * returned, which tnn_engine_get_on treats as "CUDA not available". */
+__attribute__((weak))
+ggml_backend_t tnn_backend_cuda_init_internal_on(int device)
+{
+    (void)device;   /* unused in the CPU-only stub */
+    return NULL;
+}
+/* GPU-count discovery, so Ruby can size its device list without
+ * hard-coding it. This weak stub reports 0 devices on CPU-only builds;
+ * the strong override in tinynn_backend_cuda.c calls the real ggml API
+ * (ggml_backend_cuda_get_device_count). */
+__attribute__((weak))
+int tnn_cuda_get_device_count_internal(void)
+{
+    return 0;
+}
+/* Public entry point: forwards to whichever *_internal definition the
+ * linker selected and returns its result unchanged. */
+int tnn_cuda_get_device_count(void)
+{
+    const int n_devices = tnn_cuda_get_device_count_internal();
+    return n_devices;
+}
+/* Return the cached engine for (backend_kind, device), creating and
+ * caching it on first use.
+ *
+ * backend_kind: 0 = CPU, 1 = CUDA, 2 = Metal (any other value is
+ *   treated as CPU).
+ * device: for CUDA, the GPU index (0..TNN_MAX_CUDA_DEVICES-1); ignored
+ *   for CPU and Metal.
+ *
+ * If the requested GPU backend isn't linked into the binary (its weak
+ * init stub returns NULL) the engine falls back to CPU with a loud
+ * warning; the CPU engine is then cached in the slot of the REQUESTED
+ * kind, and backend_name reads "cpu".
+ *
+ * Returns NULL when the CUDA device index is out of range, when an
+ * allocation fails, when no backend could be initialised, when a GPU
+ * engine cannot obtain its CPU companion backend, or when the scheduler
+ * cannot be created. Nothing is cached in those cases, so a later call
+ * retries from scratch. */
+static tnn_engine *tnn_engine_get_on(int backend_kind, int device)
+{
+    tnn_engine **slot;
+    switch (backend_kind) {
+        case 1: {
+            if (device < 0 || device >= TNN_MAX_CUDA_DEVICES) {
+                fprintf(stderr,
+                        "[tnn] tnn_engine_get_on: CUDA device=%d out of range "
+                        "[0,%d). Bump TNN_MAX_CUDA_DEVICES if you really have "
+                        "this many GPUs.\n", device, TNN_MAX_CUDA_DEVICES);
+                return NULL;
+            }
+            slot = &g_engine_cuda[device];
+            break;
+        }
+        case 2:  slot = &g_engine_metal; break;
+        default: slot = &g_engine_cpu;   break;
+    }
+    if (*slot) return *slot;
+    ggml_backend_load_all();
+    tnn_engine *e = calloc(1, sizeof *e);
+    if (!e) return NULL;
+    if (backend_kind == 1) {
+        e->backend = tnn_backend_cuda_init_internal_on(device);
+        if (e->backend) e->backend_name = "cuda";
+    } else if (backend_kind == 2) {
+        e->backend = tnn_backend_metal_init_internal();
+        if (e->backend) e->backend_name = "metal";
+    }
+    /* Only a successfully initialised CUDA/Metal backend counts as GPU;
+     * everything below keys off this flag rather than re-parsing
+     * backend_name. */
+    const int primary_is_gpu = (e->backend != NULL);
+    if (!primary_is_gpu) {
+        /* Fail loud on the GPU→CPU fallback: a consumer that asked for
+         * CUDA/Metal but didn't link the backend archive (e.g. missing
+         * -Wl,-u,tnn_cuda_force_link) would otherwise silently compute
+         * on CPU — the loss curve flips with no other symptom. */
+        if (backend_kind == 1 || backend_kind == 2) {
+            fprintf(stderr,
+                    "[tnn] WARNING: %s backend requested but not linked into "
+                    "this binary (init returned NULL) — falling back to CPU. "
+                    "CUDA consumers must link with "
+                    "-Wl,-u,tnn_cuda_force_link; see docs/consuming-toy.md.\n",
+                    backend_kind == 1 ? "CUDA" : "Metal");
+        }
+        e->backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+        e->backend_name = "cpu";
+    }
+    if (!e->backend) { free(e); return NULL; }
+    if (primary_is_gpu) {
+        /* A GPU engine needs a CPU backend for the scheduler to fall
+         * back on. Previously a failed init here was ignored and the
+         * scheduler was built over the GPU backend alone; refuse
+         * instead and release what we hold.
+         * NOTE(review): ggml_backend_sched_new is believed to require a
+         * CPU backend as the last entry — confirm against the vendored
+         * ggml-backend sources. */
+        e->cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+        if (!e->cpu_backend) {
+            fprintf(stderr,
+                    "[tnn] tnn_engine_get_on: could not initialise the CPU "
+                    "companion backend for %s.\n", e->backend_name);
+            ggml_backend_free(e->backend);
+            free(e);
+            return NULL;
+        }
+    }
+    ggml_backend_t backends[2];
+    int n_backends = 0;
+    backends[n_backends++] = e->backend;
+    if (e->cpu_backend) backends[n_backends++] = e->cpu_backend;
+    /* Scheduler graph-size hint. Must be >= n_nodes + n_leafs of the
+     * largest graph we'll alloc. 65536 covers seq-mode training of
+     * Qwen2.5-3B at T<=32 with LoRA + AdamW + pin_all_graph_b_nodes
+     * (~30K backward nodes once every grad-chain intermediate is
+     * pinned-as-output for the ggml-cpu sched-alias workaround).
+     * Older path (KV-cache decode, T=1) used 16384; we leave headroom. */
+    e->sched = ggml_backend_sched_new(backends, NULL, n_backends,
+                                       65536, false, true);
+    if (!e->sched) {
+        /* Don't cache a half-built engine or leak its backends. */
+        if (e->cpu_backend) ggml_backend_free(e->cpu_backend);
+        ggml_backend_free(e->backend);
+        free(e);
+        return NULL;
+    }
+    /* P6: per-op eval callback. Installed unconditionally; the callback
+     * itself early-outs when capture is off, so the overhead in the
+     * common (untraced) case is one branch per ggml node. */
+    ggml_backend_sched_set_eval_callback(e->sched, tnn_sched_op_eval_cb, NULL);
+    *slot = e;
+    return e;
+}
+/* Tear down the engine cached in *slot (if any) and clear the slot.
+ * Release order: scheduler, then the CPU companion backend, then the
+ * primary backend, then the engine struct itself. */
+static void tnn_engine_release_slot(tnn_engine **slot)
+{
+    tnn_engine *e = *slot;
+    if (e == NULL) {
+        return;
+    }
+    if (e->sched) {
+        ggml_backend_sched_free(e->sched);
+    }
+    if (e->cpu_backend) {
+        ggml_backend_free(e->cpu_backend);
+    }
+    if (e->backend) {
+        ggml_backend_free(e->backend);
+    }
+    free(e);
+    *slot = NULL;
+}
+/* Explicit teardown of every cached engine. Idempotent: empty slots
+ * are skipped and released slots are reset to NULL.
+ *
+ * Programs that want a clean exit should call this before main()
+ * returns — notably with Metal, whose ggml backend asserts in its
+ * static destructor if a residency set wasn't drained beforehand.
+ * CUDA and CPU don't strictly need it but tolerate the call.
+ *
+ * Because the caches are NULL afterwards, a fresh tnn_session_new will
+ * re-init the backend from scratch — handy if your program wants to
+ * release the GPU between phases. */
+void tnn_shutdown_engines(void)
+{
+    /* CPU + Metal: single slot each. */
+    tnn_engine_release_slot(&g_engine_cpu);
+    tnn_engine_release_slot(&g_engine_metal);
+    /* CUDA: one slot per device (GH#3 multi-GPU mode 1). */
+    for (int dev = 0; dev < TNN_MAX_CUDA_DEVICES; ++dev) {
+        tnn_engine_release_slot(&g_engine_cuda[dev]);
+    }
+}
+/* Session: per "compute frame" — owns its ctx + graph + scratch, but
+ * references a cached engine. tnn_session_free frees the per-frame
+ * resources only; the engine persists for reuse.
+ *
+ * Two contexts:
+ *  - ctx_w (weights_ctx): persistent tensors (parameters, moments).
+ *    Allocated once via ggml_backend_alloc_ctx_tensors into a stable
+ *    backend buffer that survives sched_reset cycles.
+ *  - ctx (compute_ctx): per-step tensors (inputs, intermediates).
+ *    Managed by backend_sched, re-allocated per compute cycle.
+ * A third, optional context (ctx_w_mmap) is created lazily by
+ * tnn_session_attach_weight_mmap for tensors that live inside a
+ * caller-owned mmap region.
+ *
+ * Cross-ctx tensors in a single graph are supported by ggml — nodes
+ * just hold tensor pointers. The compute graph references both ctxs;
+ * sched skips persistent tensors (they already have a buffer). */
+typedef struct {
+    tnn_engine             *engine;       /* unowned */
+    struct ggml_context    *ctx;          /* compute (no_alloc=true) */
+    struct ggml_context    *ctx_w;        /* weights  (no_alloc=true until finalized) */
+    struct ggml_context    *ctx_w_mmap;   /* mmap'd weights (no_alloc=true forever;
+                                           * tensors get data via
+                                           * ggml_backend_tensor_alloc against
+                                           * weight_buf_mmap) */
+    struct ggml_cgraph     *graph;        /* primary (e.g. forward) */
+    struct ggml_cgraph     *graph_b;      /* secondary (e.g. adam_step) */
+    uint8_t                *ctx_buf;      /* backing memory handed to ggml_init for ctx */
+    size_t                  ctx_buf_size;
+    uint8_t                *ctx_w_buf;    /* backing memory handed to ggml_init for ctx_w */
+    size_t                  ctx_w_buf_size;
+    uint8_t                *ctx_w_mmap_buf;   /* backing memory for ctx_w_mmap; NULL until attached */
+    size_t                  ctx_w_mmap_buf_size;
+    ggml_backend_buffer_t   weights_buf;       /* set by tnn_finalize_weights */
+    ggml_backend_buffer_t   weights_buf_mmap;  /* cpu_buffer_from_ptr wrapping
+                                                * a caller-owned mmap region. We
+                                                * free the buffer; we do NOT free
+                                                * the underlying memory. */
+    void                   *weights_map_base;  /* mmap base, caller-owned */
+    size_t                  weights_map_size;
+    float                  *scratch;           /* TNN_SCRATCH_BYTES from tnn_pinned_alloc */
+    int                     scratch_pinned;        /* 1 if cudaHostAlloc'd */
+    int                     realized;
+    int                     realized_b;
+    int                     weights_finalized;     /* 1 once tnn_finalize_weights succeeded */
+    int                     last_graph;            /* 0 = none, 1 = a, 2 = b */
+    int                     scratch_overflow_warned; /* once-per-session diag */
+    int                     graph_capacity;        /* GH#17: persists across rebuilds */
+} tnn_session;
+/* Pinned-memory allocator hooks.
+ *
+ * These weak defaults are plain calloc/free. The CUDA backend object
+ * overrides them with cudaHostAlloc/cudaFreeHost so that
+ * ggml_backend_tensor_set can DMA directly from the scratch buffer
+ * instead of staging through a pinned bounce buffer inside the driver.
+ * CPU-only builds keep these fallbacks and pay no extra cost. */
+__attribute__((weak))
+void *tnn_pinned_alloc(size_t bytes)
+{
+    /* Zero-filled block of `bytes` bytes, or NULL on failure. */
+    void *zeroed = calloc(1, bytes);
+    return zeroed;
+}
+__attribute__((weak))
+void tnn_pinned_free(void *p)
+{
+    free(p);
+}
+/* Source-compat wrapper for the pre-GH#3 single-device entry point.
+ * Existing callers keep working unchanged: CPU and Metal sessions never
+ * had a device choice, and CUDA sessions default to device 0. */
+void *tnn_session_new(int backend_kind)
+{
+    const int default_device = 0;
+    return tnn_session_new_on(backend_kind, default_device);
+}
+/* GH#3 — multi-GPU device-aware session constructor.
+ *
+ * backend_kind: 0 = CPU, 1 = CUDA, 2 = Metal.
+ * device: GPU index for CUDA; ignored for CPU/Metal.
+ *
+ * Returns an opaque session handle (release with tnn_session_free), or
+ * NULL if no engine could be obtained (e.g. CUDA device index out of
+ * range) or if any of the session's own allocations fails. Note that a
+ * GPU backend that is merely not linked does NOT yield NULL: the engine
+ * falls back to CPU with a warning, so check tnn_backend_name when the
+ * flavor matters. */
+void *tnn_session_new_on(int backend_kind, int device)
+{
+    struct ggml_init_params params;
+    struct ggml_init_params w_params;
+    tnn_engine *e = tnn_engine_get_on(backend_kind, device);
+    if (!e) return NULL;
+    /* calloc zero-fills the struct; the failure path below relies on
+     * not-yet-created members reading as NULL/0. */
+    tnn_session *s = calloc(1, sizeof *s);
+    if (!s) return NULL;
+    s->engine = e;
+    /* Reset the (shared) scheduler so any prior allocation state is
+     * wiped before this session builds its graph. */
+    ggml_backend_sched_reset(e->sched);
+    /* Compute ctx pool. Both cgraphs share ctx, and ctx grows
+     * monotonically across tnn_reset_for_rebuild cycles (each rebuild
+     * allocates new compute-tensor metadata in the same ctx). The
+     * budget is 262144 tensor headers, four default-size graph
+     * overheads and a further 32 MiB of slack. The ctx is no_alloc, so
+     * this is metadata only.
+     * NOTE(review): the two graphs below are created with
+     * graph_capacity (65536) nodes, not GGML_DEFAULT_GRAPH_SIZE, so
+     * their real overhead presumably comes out of the 32 MiB slack —
+     * confirm the budget against ggml_graph_overhead_custom. */
+    s->ctx_buf_size = ggml_tensor_overhead() * 262144
+                      + ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false) * 4
+                      + 32 * 1024 * 1024;
+    s->ctx_buf = calloc(1, s->ctx_buf_size);
+    if (!s->ctx_buf) goto fail;
+    params.mem_size   = s->ctx_buf_size;
+    params.mem_buffer = s->ctx_buf;
+    params.no_alloc   = true;
+    s->ctx = ggml_init(params);
+    if (!s->ctx) goto fail;
+    /* Graph node-count budget. Default GGML_DEFAULT_GRAPH_SIZE=2048
+     * is enough for distilgpt2 (6 layers, ~1200 nodes/step) but not
+     * for gpt2-small (12 layers, ~2500) and larger. 65536 covers
+     * seq-mode training (Qwen2.5-3B, T<=32, LoRA + AdamW + pinned
+     * graph_b) — matches the engine sched hash-set size. */
+    s->graph_capacity = 65536;
+    s->graph   = ggml_new_graph_custom(s->ctx, (size_t)s->graph_capacity, false);
+    s->graph_b = ggml_new_graph_custom(s->ctx, (size_t)s->graph_capacity, false);
+    if (!s->graph || !s->graph_b) goto fail;
+    /* Persistent-weights ctx. One slot per tensor declared via
+     * tnn_input_*_f32_persistent. GPT-2 sizes:
+     *   distilgpt2  6 layers  ~  636 tensors
+     *   gpt2-small 12 layers  ~ 1272 tensors
+     *   gpt2-large 36 layers  ~ 7560 tensors
+     *   gpt2-xl    48 layers  ~10080 tensors  (KV cache per head adds)
+     * 16384 covers up to gpt2-xl comfortably; the no_alloc ctx only
+     * holds metadata so the extra bytes cost nothing on small models. */
+    s->ctx_w_buf_size = ggml_tensor_overhead() * 16384;
+    s->ctx_w_buf = calloc(1, s->ctx_w_buf_size);
+    if (!s->ctx_w_buf) goto fail;
+    w_params.mem_size   = s->ctx_w_buf_size;
+    w_params.mem_buffer = s->ctx_w_buf;
+    w_params.no_alloc   = true;
+    s->ctx_w = ggml_init(w_params);
+    if (!s->ctx_w) goto fail;
+    /* ctx_w_mmap is created LAZILY (on first tnn_session_attach_weight_mmap
+     * call) rather than here. Eager creation has a CUDA regression:
+     * even an empty no_alloc ggml_context with no attached backend
+     * buffer causes ggml-cuda's scheduler to produce wrong matmul
+     * output for downstream ops on the SAME session (verified
+     * 2026-05-18 — CUDA inference goes from wrong (top=112919) to
+     * correct (top=71 matching CPU) when this ctx is absent). Lazy
+     * creation keeps the BYO-pointer path working when needed without
+     * poisoning sessions that don't use it. */
+    s->ctx_w_mmap_buf_size = 0;
+    s->ctx_w_mmap_buf      = NULL;
+    s->ctx_w_mmap          = NULL;
+    /* Pinned scratch on CUDA: cudaHostAlloc'd pages let
+     * ggml_backend_tensor_set DMA directly without staging through a
+     * pinned bounce buffer inside the driver. The pinned_alloc symbols
+     * are weak in this object; the CUDA backend archive overrides them
+     * with cudaHostAlloc, CPU-only binaries keep the calloc fallback.
+     * A session without scratch is unusable, so treat failure as
+     * fatal for the constructor instead of handing back a NULL
+     * scratch pointer. */
+    s->scratch = (float *)tnn_pinned_alloc(TNN_SCRATCH_BYTES);
+    if (!s->scratch) goto fail;
+    s->scratch_pinned    = 1;   /* must be released via tnn_pinned_free */
+    s->realized          = 0;
+    s->realized_b        = 0;
+    s->weights_finalized = 0;
+    s->weights_buf       = NULL;
+    s->weights_buf_mmap  = NULL;
+    s->weights_map_base  = NULL;
+    s->weights_map_size  = 0;
+    s->last_graph        = 0;
+    return (void *)s;
+fail:
+    /* Unwind whatever was created; the cached engine is left alone.
+     * The graphs live inside ctx and go away with it. */
+    if (s->ctx_w) ggml_free(s->ctx_w);
+    if (s->ctx)   ggml_free(s->ctx);
+    free(s->ctx_w_buf);
+    free(s->ctx_buf);
+    free(s);
+    return NULL;
+}
+/* Release everything a session owns: its backend buffers, its ggml
+ * contexts and their backing memory, the scratch buffer, and the
+ * session struct itself. A NULL handle is a no-op. The engine and its
+ * scheduler are cached globally and are NOT freed here. */
+void tnn_session_free(void *sess)
+{
+    tnn_session *s = (tnn_session *)sess;
+    if (s == NULL) {
+        return;
+    }
+    /* Backend buffers go first. weights_buf_mmap only wraps
+     * caller-owned memory; freeing the buffer does not free the
+     * mapping itself. */
+    if (s->weights_buf) {
+        ggml_backend_buffer_free(s->weights_buf);
+    }
+    if (s->weights_buf_mmap) {
+        ggml_backend_buffer_free(s->weights_buf_mmap);
+    }
+    /* Then the contexts ... */
+    if (s->ctx) {
+        ggml_free(s->ctx);
+    }
+    if (s->ctx_w) {
+        ggml_free(s->ctx_w);
+    }
+    if (s->ctx_w_mmap) {
+        ggml_free(s->ctx_w_mmap);
+    }
+    /* ... and the memory pools they were built on. */
+    free(s->ctx_buf);
+    free(s->ctx_w_buf);
+    free(s->ctx_w_mmap_buf);
+    /* Scratch must go back through the allocator it came from. */
+    if (s->scratch_pinned) {
+        tnn_pinned_free(s->scratch);
+    } else {
+        free(s->scratch);
+    }
+    free(s);
+}
+/* Name of the backend the session's engine actually runs on, as stored
+ * in the engine ("cuda", "metal" or "cpu"). Returns the literal
+ * "(null)" for a NULL session handle. */
+const char *tnn_backend_name(void *sess)
+{
+    const tnn_session *s = (const tnn_session *)sess;
+    return (s == NULL) ? "(null)" : s->engine->backend_name;
+}
+/* Link smoke test: returns the fixed sentinel 73 so a caller can verify
+ * that this object was linked and is callable. */
+int tnn_link_check(void)
+{
+    return 73;
+}
+/* Declare a per-step (compute-ctx) 2D F32 tensor of `rows` x `cols`.
+ * ggml takes the innermost dimension first, hence ne0 = cols and
+ * ne1 = rows. Returns the tensor as an opaque pointer, or NULL on a
+ * NULL session or a non-positive dimension. */
+void *tnn_input_2d_f32(void *sess, int rows, int cols)
+{
+    if (sess == NULL) return NULL;
+    if (rows <= 0 || cols <= 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    /* future: validate ctx hasn't been realized */
+    struct ggml_tensor *t = ggml_new_tensor_2d(s->ctx, GGML_TYPE_F32,
+                                               (int64_t)cols, (int64_t)rows);
+    return (void *)t;
+}
+/* Declare a PERSISTENT 2D F32 tensor (`rows` x `cols`) in ctx_w.
+ *
+ * Its backend buffer is allocated later by tnn_finalize_weights (call
+ * that once, after all persistent tensors are declared) and is kept
+ * across sched_reset cycles, so uploaded data survives multiple
+ * compute calls without re-upload.
+ *
+ * Returns NULL on a NULL session, a non-positive dimension, or when
+ * the weights have already been finalized. */
+void *tnn_input_2d_f32_persistent(void *sess, int rows, int cols)
+{
+    if (sess == NULL) return NULL;
+    if (rows <= 0 || cols <= 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    if (s->weights_finalized) {
+        return NULL;   /* too late: the weights buffer already exists */
+    }
+    struct ggml_tensor *t = ggml_new_tensor_2d(s->ctx_w, GGML_TYPE_F32,
+                                               (int64_t)cols, (int64_t)rows);
+    return (void *)t;
+}
+/* Same shape as tnn_input_2d_f32_persistent but with a caller-chosen
+ * ggml type (e.g. GGML_TYPE_Q8_0 for Q8-stays-Q8 inference).
+ *
+ * ggml_type arrives as a raw int across the FFI, so it is range-checked
+ * against GGML_TYPE_COUNT before being used as an enum; ids that ggml
+ * reports with a non-positive block size are rejected too. For
+ * block-quantized types the column count (ne0) must be a multiple of
+ * the type's block size.
+ *
+ * Returns NULL on a NULL session, a non-positive dimension, an unknown
+ * type, a misaligned column count, or when the weights have already
+ * been finalized; callers should sanity-check the result. */
+void *tnn_input_2d_persistent_typed(void *sess, int rows, int cols, int ggml_type)
+{
+    if (!sess || rows <= 0 || cols <= 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    if (s->weights_finalized) return NULL;
+    /* Validate before casting: an out-of-range id must never reach
+     * ggml's per-type lookups. */
+    if (ggml_type < 0 || ggml_type >= (int)GGML_TYPE_COUNT) return NULL;
+    enum ggml_type t = (enum ggml_type)ggml_type;
+    const int64_t blck = ggml_blck_size(t);
+    if (blck <= 0) return NULL;
+    if ((int64_t)cols % blck != 0) return NULL;
+    return (void *)ggml_new_tensor_2d(s->ctx_w, t,
+                                       (int64_t)cols, (int64_t)rows);
+}
+/* Byte size of one row of `ne0` elements of the given ggml type, as
+ * computed by ggml_row_size.
+ *
+ * Returns 0 for any input that has no valid row size: ne0 <= 0, a type
+ * id outside [0, GGML_TYPE_COUNT), a type ggml reports with a
+ * non-positive block size, or an ne0 that is not a multiple of the
+ * type's block size (ggml_row_size itself only asserts on that). */
+long tnn_row_size(int ggml_type, int ne0)
+{
+    if (ne0 <= 0) return 0;
+    if (ggml_type < 0 || ggml_type >= (int)GGML_TYPE_COUNT) return 0;
+    const enum ggml_type t = (enum ggml_type)ggml_type;
+    const int64_t blck = ggml_blck_size(t);
+    if (blck <= 0 || (int64_t)ne0 % blck != 0) return 0;
+    return (long)ggml_row_size(t, (int64_t)ne0);
+}
+/* Lazy-create ctx_w_mmap, the metadata-only context that holds tensors
+ * backed by the attached mmap region. Called from
+ * tnn_session_attach_weight_mmap (Phase 2 entry point), NOT from
+ * tnn_session_new_on — eager creation breaks CUDA inference (see the
+ * note in the constructor).
+ *
+ * Returns 0 if the context exists or was created, -1 on allocation or
+ * ggml_init failure (the session is left without a ctx_w_mmap). */
+static int ensure_ctx_w_mmap(tnn_session *s)
+{
+    if (s->ctx_w_mmap) return 0;
+    s->ctx_w_mmap_buf_size = ggml_tensor_overhead() * 16384;
+    s->ctx_w_mmap_buf = (uint8_t *)calloc(1, s->ctx_w_mmap_buf_size);
+    if (!s->ctx_w_mmap_buf) return -1;
+    struct ggml_init_params m_params = {
+        /*.mem_size   =*/ s->ctx_w_mmap_buf_size,
+        /*.mem_buffer =*/ s->ctx_w_mmap_buf,
+        /*.no_alloc   =*/ true,
+    };
+    s->ctx_w_mmap = ggml_init(m_params);
+    if (!s->ctx_w_mmap) {
+        free(s->ctx_w_mmap_buf);
+        s->ctx_w_mmap_buf = NULL;
+        s->ctx_w_mmap_buf_size = 0;
+        return -1;
+    }
+    return 0;
+}
+/* Recover the CUDA device index a session is pinned to by locating its
+ * engine in the per-device cache. Falls back to 0 (the pre-multi-GPU
+ * behaviour) if the engine is not found there. */
+static int tnn_session_cuda_device(const tnn_session *s)
+{
+    for (int dev = 0; dev < TNN_MAX_CUDA_DEVICES; ++dev) {
+        if (g_engine_cuda[dev] == s->engine) return dev;
+    }
+    return 0;
+}
+/* Phase 2 BYO-pointer: register an mmap'd region as the backing
+ * buffer for weight tensors created via tnn_input_*_persistent_mmap.
+ * The session does NOT own the underlying memory — the caller (e.g.
+ * a tnn_gguf_session) must keep `base` valid for the session's
+ * lifetime.
+ *
+ * Returns  0 on success,
+ *         -1 on bad args, if a region is already attached, or if the
+ *            mmap context / CPU buffer could not be created,
+ *         -2 if the session runs on CUDA but the CUDA buffer could not
+ *            be created (CUDA archive not linked, or a GPU error). */
+int tnn_session_attach_weight_mmap(void *sess, void *base, size_t size)
+{
+    if (!sess || !base || size == 0) return -1;
+    tnn_session *s = (tnn_session *)sess;
+    if (s->weights_buf_mmap) return -1;  /* already attached */
+    if (ensure_ctx_w_mmap(s) != 0) return -1;
+    /* The buffer_from_ptr APIs assert ptr % TENSOR_ALIGNMENT == 0.
+     * mmap returns page-aligned pointers (>= 4 KiB), so a GGUF mmap
+     * always satisfies this.
+     *
+     * CUDA sessions get the patched ggml_backend_cuda_buffer_from_ptr
+     * (vendored in this repo; see docs/cuda-byo-pointer-design.md).
+     * The host region is cudaHostRegister'd and made device-addressable
+     * via UVA; on GB10 unified memory the device pointer equals the
+     * host pointer and kernels read the mmap'd file pages directly. */
+    int is_cuda = (s->engine && s->engine->backend_name &&
+                    s->engine->backend_name[0] == 'c' &&
+                    s->engine->backend_name[1] == 'u');
+    if (is_cuda) {
+        /* Create the buffer on the GPU this session is pinned to
+         * (GH#3). Hard-coding device 0 here would hand a device-0
+         * buffer to a session whose backend runs on another GPU. */
+        const int device = tnn_session_cuda_device(s);
+        s->weights_buf_mmap = tnn_cuda_buffer_from_ptr_internal(base, size, device);
+        if (!s->weights_buf_mmap) return -2;  /* CUDA archive not linked / GPU error */
+    } else {
+        s->weights_buf_mmap = ggml_backend_cpu_buffer_from_ptr(base, size);
+        if (!s->weights_buf_mmap) return -1;
+    }
+    /* Store the buffer's "view" of the base, NOT the raw host pointer.
+     * On CPU these are the same; on CUDA the buffer's base is the
+     * UVA-mapped device pointer (equal to host_ptr on unified-memory
+     * SKUs, different on discrete GPUs). Tensor data pointers are
+     * computed as weights_map_base + offset; using the buffer's base
+     * keeps ggml_backend_tensor_alloc's range-check happy. */
+    s->weights_map_base = ggml_backend_buffer_get_base(s->weights_buf_mmap);
+    s->weights_map_size = size;
+    return 0;
+}
+/* Shared worker for the tnn_input_*_persistent_mmap constructors.
+ *
+ * Creates an `n_dims`-dimensional tensor of type `ggml_type` with
+ * extents `ne` in ctx_w_mmap and binds its data to
+ * weights_map_base + buf_offset inside the attached mmap buffer.
+ *
+ * Every condition that would otherwise trip an assertion further down
+ * is rejected here with NULL: no attached region, a type id outside
+ * [0, GGML_TYPE_COUNT) or with a non-positive block size, ne[0] not a
+ * multiple of the block size, an offset at or past the end of the
+ * region, or a tensor whose storage would run past the end of the
+ * region. */
+static void *tnn_mmap_tensor_bind(tnn_session *s, int n_dims, const int64_t *ne,
+                                  int ggml_type, size_t buf_offset)
+{
+    if (!s->weights_buf_mmap || !s->weights_map_base || !s->ctx_w_mmap) return NULL;
+    if (ggml_type < 0 || ggml_type >= (int)GGML_TYPE_COUNT) return NULL;
+    enum ggml_type t = (enum ggml_type)ggml_type;
+    const int64_t blck = ggml_blck_size(t);
+    if (blck <= 0 || ne[0] % blck != 0) return NULL;
+    if (buf_offset >= s->weights_map_size) return NULL;
+    struct ggml_tensor *tensor = ggml_new_tensor(s->ctx_w_mmap, t, n_dims, ne);
+    if (!tensor) return NULL;
+    /* Checking only the start offset is not enough: the whole tensor
+     * has to fit in what is left of the region. Ask the buffer for the
+     * size it will actually reserve so this mirrors the range check
+     * inside ggml_backend_tensor_alloc. */
+    const size_t need = ggml_backend_buffer_get_alloc_size(s->weights_buf_mmap, tensor);
+    if (need > s->weights_map_size - buf_offset) return NULL;
+    void *addr = (char *)s->weights_map_base + buf_offset;
+    enum ggml_status st = ggml_backend_tensor_alloc(s->weights_buf_mmap,
+                                                     tensor, addr);
+    if (st != GGML_STATUS_SUCCESS) return NULL;
+    return (void *)tensor;
+}
+/* Allocate a 2D persistent tensor in ctx_w_mmap whose `data` points
+ * at `base + buf_offset` in the attached mmap region. The tensor's
+ * `buffer` is set so the scheduler treats it as already-resident.
+ * Returns NULL on bad args, an unknown type, or when the tensor does
+ * not fit inside the region at that offset.
+ *
+ * For block-quantized types, `cols` (ne0) must be a multiple of the
+ * type's block size and `buf_offset` must land on a 32-byte boundary
+ * (GGUF guarantees this).
+ *
+ * Caller computes buf_offset as
+ *   gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, idx).
+ */
+void *tnn_input_2d_persistent_mmap(void *sess, int rows, int cols,
+                                    int ggml_type, size_t buf_offset)
+{
+    if (!sess || rows <= 0 || cols <= 0) return NULL;
+    /* ggml order: innermost dimension (columns) first. */
+    const int64_t ne[2] = { (int64_t)cols, (int64_t)rows };
+    return tnn_mmap_tensor_bind((tnn_session *)sess, 2, ne, ggml_type, buf_offset);
+}
+/* 3D variant for M2.3 MoE expert stacks. Per-expert weight matrices
+ * concatenated along ne[2] in the GGUF (e.g. ffn_gate_exps.weight has
+ * ne=[d_model, d_ff, n_experts]). Loads them in place via mmap so a
+ * Mixtral-8x7B Q4_K_M (26GB) doesn't require any RAM copy. Same
+ * validation and NULL-on-failure contract as the 2D variant. */
+void *tnn_input_3d_persistent_mmap(void *sess, int ne0, int ne1, int ne2,
+                                    int ggml_type, size_t buf_offset)
+{
+    if (!sess || ne0 <= 0 || ne1 <= 0 || ne2 <= 0) return NULL;
+    const int64_t ne[3] = { (int64_t)ne0, (int64_t)ne1, (int64_t)ne2 };
+    return tnn_mmap_tensor_bind((tnn_session *)sess, 3, ne, ggml_type, buf_offset);
+}
+/* 1D variant for norms / biases — same semantics, including the
+ * block-size multiple requirement on `n` for block-quantized types. */
+void *tnn_input_1d_persistent_mmap(void *sess, int n, int ggml_type,
+                                    size_t buf_offset)
+{
+    if (!sess || n <= 0) return NULL;
+    const int64_t ne[1] = { (int64_t)n };
+    return tnn_mmap_tensor_bind((tnn_session *)sess, 1, ne, ggml_type, buf_offset);
+}
+/* Declare a PERSISTENT 1D F32 tensor of `n` elements in ctx_w — the 1D
+ * counterpart of tnn_input_2d_f32_persistent, used for the 7-elem
+ * adamw_params vector. Returns NULL on a NULL session, n <= 0, or when
+ * the weights have already been finalized. */
+void *tnn_input_1d_f32_persistent(void *sess, int n)
+{
+    if (sess == NULL || n <= 0) {
+        return NULL;
+    }
+    tnn_session *s = (tnn_session *)sess;
+    if (s->weights_finalized) {
+        return NULL;
+    }
+    struct ggml_tensor *t = ggml_new_tensor_1d(s->ctx_w, GGML_TYPE_F32, (int64_t)n);
+    return (void *)t;
+}
+/* M2: declare a PERSISTENT 3-D F32 tensor in ctx_w — needed for MoE
+ * expert stacks, shape [d_in, d_out, n_experts]. Same lifecycle as the
+ * 2-D variant: must be declared before tnn_finalize_weights. Returns
+ * NULL on a NULL session, a non-positive extent, or finalized weights. */
+void *tnn_input_3d_f32_persistent(void *sess, int ne0, int ne1, int ne2)
+{
+    if (sess == NULL) return NULL;
+    if (ne0 <= 0 || ne1 <= 0 || ne2 <= 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    if (s->weights_finalized) {
+        return NULL;
+    }
+    struct ggml_tensor *t = ggml_new_tensor_3d(s->ctx_w, GGML_TYPE_F32,
+                                               (int64_t)ne0,
+                                               (int64_t)ne1,
+                                               (int64_t)ne2);
+    return (void *)t;
+}
+/* 3D variant of tnn_input_2d_persistent_typed for M2.3 MoE expert
+ * stacks. ne0/ne1 are the per-expert matrix dims; ne2 is n_experts.
+ *
+ * ggml_type arrives as a raw int across the FFI, so it is range-checked
+ * against GGML_TYPE_COUNT before being used as an enum; ids that ggml
+ * reports with a non-positive block size are rejected too. For
+ * block-quantized types ne0 must be a multiple of the type's block
+ * size (e.g. ne0 % 32 == 0 for Q8_0).
+ *
+ * Returns NULL on a NULL session, a non-positive extent, an unknown
+ * type, a misaligned ne0, or when the weights are already finalized. */
+void *tnn_input_3d_persistent_typed(void *sess, int ne0, int ne1, int ne2,
+                                      int ggml_type)
+{
+    if (!sess || ne0 <= 0 || ne1 <= 0 || ne2 <= 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    if (s->weights_finalized) return NULL;
+    /* Validate before casting: an out-of-range id must never reach
+     * ggml's per-type lookups. */
+    if (ggml_type < 0 || ggml_type >= (int)GGML_TYPE_COUNT) return NULL;
+    enum ggml_type t = (enum ggml_type)ggml_type;
+    const int64_t blck = ggml_blck_size(t);
+    if (blck <= 0) return NULL;
+    if ((int64_t)ne0 % blck != 0) return NULL;
+    return (void *)ggml_new_tensor_3d(s->ctx_w, t,
+                                       (int64_t)ne0, (int64_t)ne1, (int64_t)ne2);
+}
+/* E1.1 — declare a PERSISTENT 4D F32 tensor in ctx_w (conv kernels:
+ * ne=[KW, KH, IC, OC]). Returns NULL on a NULL session, a non-positive
+ * extent, or when the weights have already been finalized. */
+void *tnn_input_4d_f32_persistent(void *sess, int ne0, int ne1, int ne2, int ne3)
+{
+    if (sess == NULL) return NULL;
+    if (ne0 <= 0 || ne1 <= 0 || ne2 <= 0 || ne3 <= 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    if (s->weights_finalized) {
+        return NULL;
+    }
+    struct ggml_tensor *t = ggml_new_tensor_4d(s->ctx_w, GGML_TYPE_F32,
+                                               (int64_t)ne0, (int64_t)ne1,
+                                               (int64_t)ne2, (int64_t)ne3);
+    return (void *)t;
+}
+/* Allocate the backend buffer for all persistent tensors in ctx_w.
+ *
+ * Call this AFTER declaring every persistent tensor and BEFORE any
+ * tnn_realize/compute. Afterwards the persistent tensors have stable
+ * backend storage independent of sched, and further
+ * tnn_input_*_persistent declarations are refused.
+ *
+ * Returns  0 on success,
+ *         -1 for a NULL session,
+ *         -2 if the weights were already finalized,
+ *         -3 if ggml_backend_alloc_ctx_tensors returned no buffer. */
+int tnn_finalize_weights(void *sess)
+{
+    tnn_session *s = (tnn_session *)sess;
+    if (s == NULL) {
+        return -1;
+    }
+    if (s->weights_finalized) {
+        return -2;
+    }
+    ggml_backend_buffer_t buf =
+        ggml_backend_alloc_ctx_tensors(s->ctx_w, s->engine->backend);
+    s->weights_buf = buf;
+    if (buf == NULL) {
+        return -3;
+    }
+    s->weights_finalized = 1;
+    return 0;
+}
+/* Zero an entire persistent tensor via backend memset_tensor. Faster
+ * than building a Mat-of-zeros + upload_row_major when the tensor is
+ * big (e.g. Adam state for vocab×d_model embeddings: ~1 GB of zeros).
+ * Works on both CPU (memset) and CUDA (cudaMemsetAsync).
+ *
+ * Returns  0 on success (including the trivial zero-byte case),
+ *         -1 on a NULL session or tensor,
+ *         -2 if the tensor has no backend buffer yet — e.g. a
+ *            persistent tensor zeroed before tnn_finalize_weights.
+ *            ggml_backend_tensor_memset asserts on such tensors, so
+ *            this is reported instead of aborting the process. */
+int tnn_zero_tensor(void *sess, void *tensor)
+{
+    if (!sess || !tensor) return -1;
+    struct ggml_tensor *t = (struct ggml_tensor *)tensor;
+    const size_t nbytes = ggml_nbytes(t);
+    if (nbytes == 0) return 0;
+    /* A view borrows its storage from view_src; check whichever tensor
+     * actually owns the buffer. */
+    const struct ggml_tensor *owner = t->view_src ? t->view_src : t;
+    if (!owner->buffer) return -2;
+    ggml_backend_tensor_memset(t, 0, 0, nbytes);
+    return 0;
+}
+/* Append a ggml_mul_mat(a, b) node to the session's compute ctx and
+ * return the result tensor as an opaque pointer. Returns NULL if any
+ * argument is NULL; shape compatibility is left to ggml. */
+void *tnn_matmul(void *sess, void *a, void *b)
+{
+    if (sess == NULL || a == NULL || b == NULL) {
+        return NULL;
+    }
+    tnn_session *s = (tnn_session *)sess;
+    struct ggml_tensor *lhs = (struct ggml_tensor *)a;
+    struct ggml_tensor *rhs = (struct ggml_tensor *)b;
+    return (void *)ggml_mul_mat(s->ctx, lhs, rhs);
+}
+/* Append a ggml_out_prod(a, b) node to the session's compute ctx:
+ * result[m, n] = sum_k a[k, m] * b[k, n]. Used by ggml's autograd for
+ * weight-gradient computations; exposed here so A/B smokes can compare
+ * per-op cost vs ggml_mul_mat.
+ *
+ * NOTE(review): an earlier note here gave the shape constraint as
+ * "a.ne0 == b.ne0, same as ggml_mul_mat"; the formula above sums over
+ * the outer index k, which suggests the shared extent is ne[1] —
+ * confirm against ggml_can_out_prod in the vendored ggml.
+ *
+ * Returns NULL if any argument is NULL. */
+void *tnn_out_prod(void *sess, void *a, void *b)
+{
+    if (sess == NULL || a == NULL || b == NULL) {
+        return NULL;
+    }
+    tnn_session *s = (tnn_session *)sess;
+    struct ggml_tensor *lhs = (struct ggml_tensor *)a;
+    struct ggml_tensor *rhs = (struct ggml_tensor *)b;
+    return (void *)ggml_out_prod(s->ctx, lhs, rhs);
+}
+/* Wrap ggml_swiglu_split: silu(gate) * up as one fused node, i.e. the
+ * gating step of the Llama-family SwiGLU FFN. It stands in for the
+ * separate silu(gate) followed by mul(_, up) pair in toy's FFN block;
+ * on CUDA the fusion lets ggml-cuda issue one kernel instead of two.
+ * Returns NULL when sess, gate or up is NULL. */
+void *tnn_swiglu_split(void *sess, void *gate, void *up)
+{
+    if (sess == NULL || gate == NULL || up == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *gate_t = (struct ggml_tensor *)gate;
+    struct ggml_tensor *up_t = (struct ggml_tensor *)up;
+    struct ggml_tensor *gated = ggml_swiglu_split(session->ctx, gate_t, up_t);
+    return (void *)gated;
+}
+/* M2 MoE primitives. These are thin FFI entry points; ggml does the
+ * actual work. Shape documentation lives in tinynn_ggml.h. */
+/* Wrap ggml_mul_mat_id(as, b, ids).
+ * Returns NULL when any of sess, as, b, ids is NULL. */
+void *tnn_mul_mat_id(void *sess, void *as, void *b, void *ids)
+{
+    if (sess == NULL || as == NULL || b == NULL || ids == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *experts = (struct ggml_tensor *)as;
+    struct ggml_tensor *input = (struct ggml_tensor *)b;
+    struct ggml_tensor *selected = (struct ggml_tensor *)ids;
+    return (void *)ggml_mul_mat_id(session->ctx, experts, input, selected);
+}
+/* Wrap ggml_add_id(a, b, ids).
+ * Returns NULL when any of sess, a, b, ids is NULL. */
+void *tnn_add_id(void *sess, void *a, void *b, void *ids)
+{
+    if (sess == NULL || a == NULL || b == NULL || ids == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *base = (struct ggml_tensor *)a;
+    struct ggml_tensor *addend = (struct ggml_tensor *)b;
+    struct ggml_tensor *selected = (struct ggml_tensor *)ids;
+    return (void *)ggml_add_id(session->ctx, base, addend, selected);
+}
+/* Wrap ggml_argsort. A non-zero `descending` selects
+ * GGML_SORT_ORDER_DESC, zero selects GGML_SORT_ORDER_ASC.
+ * Returns NULL when sess or a is NULL. */
+void *tnn_argsort(void *sess, void *a, int descending)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    enum ggml_sort_order direction = GGML_SORT_ORDER_ASC;
+    if (descending) {
+        direction = GGML_SORT_ORDER_DESC;
+    }
+    return (void *)ggml_argsort(session->ctx, (struct ggml_tensor *)a, direction);
+}
+/* Wrap ggml_top_k: builds the top-k node over `a`.
+ *
+ * Returns NULL when sess or a is NULL, when k <= 0, or when k exceeds
+ * a->ne[0]. The last check turns what would be a ggml assertion
+ * failure (process abort) into an ordinary NULL return, matching how
+ * the other bad-argument cases are reported. */
+void *tnn_top_k(void *sess, void *a, int k)
+{
+    if (!sess || !a || k <= 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    struct ggml_tensor *t = (struct ggml_tensor *)a;
+    /* Cannot pick more entries per row than the row holds. */
+    if ((int64_t)k > t->ne[0]) return NULL;
+    return (void *)ggml_top_k(s->ctx, t, k);
+}
+/* Compute A . B without the caller transposing anything. ggml_mul_mat
+ * natively produces A . B^T, so B is transposed here first:
+ * ggml_transpose yields a stride-permuted view, and ggml_cont turns
+ * that view into contiguous storage, which mul_mat requires of its
+ * input. Returns NULL when sess, a or b is NULL. */
+void *tnn_matmul_axb(void *sess, void *a, void *b)
+{
+    if (sess == NULL || a == NULL || b == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *b_view = ggml_transpose(session->ctx, (struct ggml_tensor *)b);
+    struct ggml_tensor *b_transposed = ggml_cont(session->ctx, b_view);
+    struct ggml_tensor *product =
+        ggml_mul_mat(session->ctx, (struct ggml_tensor *)a, b_transposed);
+    return (void *)product;
+}
+/* Wrap ggml_add(a, b). Returns NULL when sess, a or b is NULL. */
+void *tnn_add(void *sess, void *a, void *b)
+{
+    if (sess == NULL || a == NULL || b == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *lhs = (struct ggml_tensor *)a;
+    struct ggml_tensor *rhs = (struct ggml_tensor *)b;
+    return (void *)ggml_add(session->ctx, lhs, rhs);
+}
+/* Element-wise tanh (ggml_tanh). Gemma 2's logit soft-cap uses it:
+ *   y = softcap * tanh(x / softcap)
+ * which the graph builder composes as tnn_scale + tnn_tanh + tnn_scale.
+ * Returns NULL when sess or a is NULL. */
+void *tnn_tanh(void *sess, void *a)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    return (void *)ggml_tanh(session->ctx, input);
+}
+/* Wrap ggml_ssm_conv(sx, c). Returns NULL when sess, sx or c is NULL. */
+void *tnn_ssm_conv(void *sess, void *sx, void *c)
+{
+    if (sess == NULL || sx == NULL || c == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *sx_t = (struct ggml_tensor *)sx;
+    struct ggml_tensor *c_t = (struct ggml_tensor *)c;
+    return (void *)ggml_ssm_conv(session->ctx, sx_t, c_t);
+}
+/* Wrap ggml_ssm_scan, forwarding the seven tensors in the order
+ * (state, x, dt, A, B, C, ids). Returns NULL when sess or any of the
+ * tensor arguments is NULL. */
+void *tnn_ssm_scan(void *sess, void *state, void *x, void *dt,
+                    void *A, void *B, void *C, void *ids)
+{
+    if (sess == NULL) {
+        return NULL;
+    }
+    if (state == NULL || x == NULL || dt == NULL) {
+        return NULL;
+    }
+    if (A == NULL || B == NULL || C == NULL || ids == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *state_t = (struct ggml_tensor *)state;
+    struct ggml_tensor *x_t = (struct ggml_tensor *)x;
+    struct ggml_tensor *dt_t = (struct ggml_tensor *)dt;
+    struct ggml_tensor *A_t = (struct ggml_tensor *)A;
+    struct ggml_tensor *B_t = (struct ggml_tensor *)B;
+    struct ggml_tensor *C_t = (struct ggml_tensor *)C;
+    struct ggml_tensor *ids_t = (struct ggml_tensor *)ids;
+    return (void *)ggml_ssm_scan(session->ctx, state_t, x_t, dt_t,
+                                  A_t, B_t, C_t, ids_t);
+}
+/* GeLU activation via ggml_gelu, which uses the tanh approximation
+ *   0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+ * and therefore matches the project's feed_forward GeLU exactly.
+ * Returns NULL when sess or a is NULL. */
+void *tnn_gelu(void *sess, void *a)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    return (void *)ggml_gelu(session->ctx, input);
+}
+/* RMSNorm with a learned scale. ggml_rms_norm normalizes along ne[0]
+ * (the feature dim) and returns the unscaled result; that result is
+ * then multiplied by gamma_row (shape 1 x feature), which ggml_mul
+ * broadcasts over the leading dim. eps is narrowed to float.
+ * Returns NULL when sess, x or gamma_row is NULL. */
+void *tnn_rms_norm(void *sess, void *x, void *gamma_row, double eps)
+{
+    if (sess == NULL || x == NULL || gamma_row == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)x;
+    struct ggml_tensor *gamma = (struct ggml_tensor *)gamma_row;
+    const float eps_f = (float)eps;
+    struct ggml_tensor *unit = ggml_rms_norm(session->ctx, input, eps_f);
+    return (void *)ggml_mul(session->ctx, unit, gamma);
+}
+/* LayerNorm: y = gamma * (x - mean) / sqrt(var + eps) + beta.
+ * ggml_norm supplies the (x - mean) / sqrt(var + eps) part; the gamma
+ * multiply and beta add are appended here. Needed for HF-style models
+ * (GPT-2 / GPT-Neo / TinyStories) that use LayerNorm instead of
+ * RMSNorm. eps is narrowed to float.
+ * Returns NULL when sess, x, gamma_row or beta_row is NULL. */
+void *tnn_layer_norm(void *sess, void *x, void *gamma_row, void *beta_row, double eps)
+{
+    if (sess == NULL || x == NULL || gamma_row == NULL || beta_row == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *gamma = (struct ggml_tensor *)gamma_row;
+    struct ggml_tensor *beta = (struct ggml_tensor *)beta_row;
+    const float eps_f = (float)eps;
+    struct ggml_tensor *centered =
+        ggml_norm(session->ctx, (struct ggml_tensor *)x, eps_f);
+    struct ggml_tensor *with_gamma = ggml_mul(session->ctx, centered, gamma);
+    struct ggml_tensor *with_beta = ggml_add(session->ctx, with_gamma, beta);
+    return (void *)with_beta;
+}
+/* Write `b` into `a` at byte offset, with row stride nb1. Result has
+ * `a`'s shape (unlike ggml_cpy which returns the small dst view) so
+ * downstream ops can read the modified `a` directly. Used for V[:, pos]
+ * column writes in KV cache (V layout = [max_T, d_head], offset =
+ * pos * 4, nb1 = max_T * 4).
+ *
+ * Returns NULL when sess, a or b is NULL, or when nb1 or offset is
+ * negative: both are converted to size_t for ggml, and a negative
+ * value would silently wrap to an enormous stride/offset. */
+void *tnn_set_2d(void *sess, void *a, void *b, long nb1, long offset)
+{
+    if (!sess || !a || !b) return NULL;
+    if (nb1 < 0 || offset < 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    return (void *)ggml_set_2d(s->ctx,
+                                 (struct ggml_tensor *)a,
+                                 (struct ggml_tensor *)b,
+                                 (size_t)nb1,
+                                 (size_t)offset);
+}
+/* Scatter the rows of `b` into `a` at the row indices held in `idx`.
+ * KV-cache usage:
+ *   a   = persistent K (ne=[d_head, max_T])
+ *   b   = compute k_new (ne=[d_head, 1])
+ *   idx = compute (1,) int32 holding the current decode position
+ * so the new k row is stored at K[idx[0]] and every other row is left
+ * alone; V follows the same shape pattern. Because the position is a
+ * RUNTIME tensor, the graph stays static across decode steps and
+ * never has to be rebuilt.
+ * Returns NULL when sess, a, b or idx is NULL. */
+void *tnn_set_rows(void *sess, void *a, void *b, void *idx)
+{
+    if (sess == NULL || a == NULL || b == NULL || idx == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *dst = (struct ggml_tensor *)a;
+    struct ggml_tensor *rows = (struct ggml_tensor *)b;
+    struct ggml_tensor *row_ids = (struct ggml_tensor *)idx;
+    return (void *)ggml_set_rows(session->ctx, dst, rows, row_ids);
+}
+/* Softmax-with-mask (ggml_soft_max_ext): `scale` and the additive
+ * `mask` are applied to `a`, then softmax runs along ne[0]. For
+ * KV-cache attention the scores, the mask and the result all have
+ * shape (max_T, 1). The mask is uploaded every step with 0.0 at
+ * positions up to and including pos and -inf beyond pos, so attention
+ * to future keys comes out as zero even though K's future-position
+ * slots may hold stale or uninitialised values.
+ * scale and max_bias are narrowed to float. `mask` is forwarded as
+ * given; only sess and a are NULL-checked (NULL return). */
+void *tnn_soft_max_ext(void *sess, void *a, void *mask, double scale, double max_bias)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *scores = (struct ggml_tensor *)a;
+    struct ggml_tensor *mask_t = (struct ggml_tensor *)mask;
+    const float scale_f = (float)scale;
+    const float max_bias_f = (float)max_bias;
+    return (void *)ggml_soft_max_ext(session->ctx, scores, mask_t,
+                                       scale_f, max_bias_f);
+}
+/* Hands back a NULL that is typed as :ptr on the FFI side. Handy as
+ * the seed value of an Array<:ptr>, so Spinel infers a PtrArray
+ * instead of typing the array from a `[nil]` literal (which can
+ * resolve to IntArray). */
+void *tnn_null_ptr(void)
+{
+    void *nothing = NULL;
+    return nothing;
+}
+/* 1-D view of a tensor at byte `offset`, of length `ne0`. Used to
+ * slice a single row out of a (max_T, d_head) KV buffer at a runtime
+ * position computed by the caller (offset = pos * d_head * 4).
+ *
+ * Returns NULL when sess or a is NULL, or when ne0 or offset is
+ * negative: offset is converted to size_t for ggml, so a negative
+ * value would silently wrap to an enormous byte offset. */
+void *tnn_view_1d(void *sess, void *a, int ne0, long offset)
+{
+    if (!sess || !a) return NULL;
+    if (ne0 < 0 || offset < 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    return (void *)ggml_view_1d(s->ctx, (struct ggml_tensor *)a,
+                                  (int64_t)ne0, (size_t)offset);
+}
+/* 2-D view of a tensor: rows of length ne0 stride nb1, ne1 rows
+ * total, starting at byte `offset`. Used for slicing K/V[0:pos+1] in
+ * attention. nb1 = d_head * 4 for our row-of-floats KV layout.
+ *
+ * Returns NULL when sess or a is NULL, or when any of ne0, ne1, nb1,
+ * offset is negative: nb1 and offset are converted to size_t for
+ * ggml, so a negative value would silently wrap to an enormous
+ * stride/offset. */
+void *tnn_view_2d(void *sess, void *a, int ne0, int ne1, long nb1, long offset)
+{
+    if (!sess || !a) return NULL;
+    if (ne0 < 0 || ne1 < 0 || nb1 < 0 || offset < 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    return (void *)ggml_view_2d(s->ctx, (struct ggml_tensor *)a,
+                                  (int64_t)ne0, (int64_t)ne1,
+                                  (size_t)nb1, (size_t)offset);
+}
+/* Reshape a contiguous tensor to (ne0, ne1, ne2); the total element
+ * count has to stay the same. The sequence-mode forward (M3) uses it
+ * to lift Q/K from ne=[d_head, T] to ne=[d_head, 1, T] ahead of
+ * ggml_rope_ext, which asserts a->ne[2] == positions->ne[0].
+ * Returns NULL when sess or a is NULL. */
+void *tnn_reshape_3d(void *sess, void *a, int ne0, int ne1, int ne2)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    const int64_t d0 = (int64_t)ne0;
+    const int64_t d1 = (int64_t)ne1;
+    const int64_t d2 = (int64_t)ne2;
+    return (void *)ggml_reshape_3d(session->ctx, (struct ggml_tensor *)a, d0, d1, d2);
+}
+/* Reshape a contiguous tensor to (ne0, ne1). Typical use: rope_ext
+ * ran on a [d_head, 1, T] tensor and the following matmul wants
+ * [d_head, T] back. Returns NULL when sess or a is NULL. */
+void *tnn_reshape_2d(void *sess, void *a, int ne0, int ne1)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    const int64_t d0 = (int64_t)ne0;
+    const int64_t d1 = (int64_t)ne1;
+    return (void *)ggml_reshape_2d(session->ctx, (struct ggml_tensor *)a, d0, d1);
+}
+/* Copy a -> b (ggml_cpy). Used to store k_new into a view of the
+ * persistent K buffer (b = view_2d(K, d_head, 1, ...,
+ * offset=pos*d_head*4)). Returns NULL when sess, a or b is NULL. */
+void *tnn_cpy(void *sess, void *a, void *b)
+{
+    if (sess == NULL || a == NULL || b == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *src = (struct ggml_tensor *)a;
+    struct ggml_tensor *dst = (struct ggml_tensor *)b;
+    return (void *)ggml_cpy(session->ctx, src, dst);
+}
+/* Cast a tensor to another dtype (GH#9 mixed-precision training).
+ * The result is a NEW tensor of the requested type with the same
+ * shape. `dtype` carries a ggml_type enum value (0=F32, 1=F16,
+ * 30=BF16, ...). ggml_cast is GGML_OP_CPY into a fresh dst of the
+ * target dtype under the hood, so backward flows correctly through
+ * the cpy backward case: grad of cast(src) = reshape(grad, src),
+ * which keeps src's dtype - the F32 master in the usual
+ * weight-cast-to-bf16 case.
+ * Returns NULL when sess or a is NULL. */
+void *tnn_cast(void *sess, void *a, int dtype)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    const enum ggml_type target = (enum ggml_type)dtype;
+    return (void *)ggml_cast(session->ctx, (struct ggml_tensor *)a, target);
+}
+/* Concatenate `a` and `b` along the given dim (0 = ne[0], 1 = ne[1]).
+ * Other dims must match. Used to glue per-head attention outputs into
+ * a single (d_model, T) tensor by stacking d_head slices along ne0.
+ *
+ * Returns NULL when sess, a or b is NULL, or when dim is outside
+ * [0, GGML_MAX_DIMS). The range check reports a bad axis through the
+ * normal NULL return instead of leaving it to ggml's own assertion. */
+void *tnn_concat(void *sess, void *a, void *b, int dim)
+{
+    if (!sess || !a || !b) return NULL;
+    if (dim < 0 || dim >= GGML_MAX_DIMS) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    return (void *)ggml_concat(s->ctx,
+                                (struct ggml_tensor *)a,
+                                (struct ggml_tensor *)b,
+                                dim);
+}
+/* Causal mask: every element ABOVE the diagonal (key_idx greater
+ * than query_idx + n_past) becomes -inf, so the softmax that follows
+ * zeroes it. n_past = 0 is the standard causal mask used in training.
+ * For KV-cache inference pass n_past = current position, so attention
+ * sees the cached keys plus the current token but no future tokens.
+ * Returns NULL when sess or a is NULL. */
+void *tnn_diag_mask_inf(void *sess, void *a, int n_past)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *scores = (struct ggml_tensor *)a;
+    return (void *)ggml_diag_mask_inf(session->ctx, scores, n_past);
+}
+/* --- Vision / Conv ops (E1.1) ------------------------------------------ */
+/* im2col: gathers the sliding kernel windows of the input image into
+ * a 2D matrix, so a convolution can be run as a matmul. With
+ * is_2D=true, ggml's im2col output has ne = [IC*KH*KW, OH*OW, N, 1].
+ *
+ * dst_type: 0=F32, 1=F16, 26=I32 (full enum in ggml.h:ggml_type).
+ * Any non-zero is_2D is treated as true.
+ * Returns NULL when sess, kernel or data is NULL. */
+void *tnn_im2col(void *sess, void *kernel, void *data,
+                 int s0, int s1, int p0, int p1, int d0, int d1,
+                 int is_2D, int dst_type)
+{
+    if (sess == NULL || kernel == NULL || data == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *kernel_t = (struct ggml_tensor *)kernel;
+    struct ggml_tensor *image_t = (struct ggml_tensor *)data;
+    const enum ggml_type out_type = (enum ggml_type)dst_type;
+    return (void *)ggml_im2col(session->ctx, kernel_t, image_t,
+                                s0, s1, p0, p1, d0, d1,
+                                is_2D != 0,
+                                out_type);
+}
+/* im2col_back: gradient of im2col with respect to the input image.
+ * The original image shape is supplied through input_w / input_h /
+ * input_c / input_n and packed, in that order, into the int64_t ne[4]
+ * that ggml's API expects. Any non-zero is_2D is treated as true.
+ * Returns NULL when sess, kernel or grad_im2col is NULL. */
+void *tnn_im2col_back(void *sess, void *kernel, void *grad_im2col,
+                      int input_w, int input_h, int input_c, int input_n,
+                      int s0, int s1, int p0, int p1, int d0, int d1,
+                      int is_2D)
+{
+    if (sess == NULL || kernel == NULL || grad_im2col == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    int64_t image_ne[4] = {
+        (int64_t)input_w,
+        (int64_t)input_h,
+        (int64_t)input_c,
+        (int64_t)input_n,
+    };
+    struct ggml_tensor *kernel_t = (struct ggml_tensor *)kernel;
+    struct ggml_tensor *grad_t = (struct ggml_tensor *)grad_im2col;
+    return (void *)ggml_im2col_back(session->ctx, kernel_t, grad_t,
+                                     image_ne,
+                                     s0, s1, p0, p1, d0, d1,
+                                     is_2D != 0);
+}
+/* conv_2d: composite op (im2col + matmul). ggml folds the kernel and
+ * the im2col output internally and produces the [OW, OH, OC, N]
+ * result. Returns NULL when sess, kernel or data is NULL. */
+void *tnn_conv_2d(void *sess, void *kernel, void *data,
+                  int s0, int s1, int p0, int p1, int d0, int d1)
+{
+    if (sess == NULL || kernel == NULL || data == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *kernel_t = (struct ggml_tensor *)kernel;
+    struct ggml_tensor *image_t = (struct ggml_tensor *)data;
+    return (void *)ggml_conv_2d(session->ctx, kernel_t, image_t,
+                                 s0, s1, p0, p1, d0, d1);
+}
+/* Reorder the dims as a view (no copy). The result has to go through
+ * ggml_cont before any op that needs contiguous memory.
+ * Returns NULL when sess or a is NULL. */
+void *tnn_permute(void *sess, void *a, int axis0, int axis1, int axis2, int axis3)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    return (void *)ggml_permute(session->ctx, input, axis0, axis1, axis2, axis3);
+}
+/* Make contiguous and reshape to 2D (ne0, ne1) in a single op. Used
+ * after a permute to flatten the spatial dims for the transformer
+ * input. Returns NULL when sess or a is NULL. */
+void *tnn_cont_2d(void *sess, void *a, int ne0, int ne1)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    const int64_t d0 = (int64_t)ne0;
+    const int64_t d1 = (int64_t)ne1;
+    return (void *)ggml_cont_2d(session->ctx, (struct ggml_tensor *)a, d0, d1);
+}
+/* --- Llama-family ops -------------------------------------------------- */
+/* SiLU activation, silu(x) = x * sigmoid(x), as used by SwiGLU FFNs
+ * (Llama / SmolLM2 / Qwen / Phi).
+ * Returns NULL when sess or a is NULL. */
+void *tnn_silu(void *sess, void *a)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    return (void *)ggml_silu(session->ctx, input);
+}
+/* Elementwise product c = a * b (ggml_mul). Combines SwiGLU's gate
+ * and up projections ahead of the down projection.
+ * Returns NULL when sess, a or b is NULL. */
+void *tnn_mul(void *sess, void *a, void *b)
+{
+    if (sess == NULL || a == NULL || b == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *lhs = (struct ggml_tensor *)a;
+    struct ggml_tensor *rhs = (struct ggml_tensor *)b;
+    return (void *)ggml_mul(session->ctx, lhs, rhs);
+}
+/* Rotary Position Embedding in rotate_half / NEOX mode, the variant
+ * used by Llama / SmolLM2 / Qwen2 / Mistral. Apply it to Q and K
+ * before the dot product.
+ *
+ *   a:         input tensor, shape [Dh, T, ...]   (one head's worth)
+ *   pos:       int32 tensor of length T, absolute position per token
+ *   n_dims:    how many dimensions get rotated (= Dh for full rotary,
+ *              fewer for partial - Pythia uses Dh/4)
+ *   freq_base: theta base. 10000 (Llama-1/2), 100000 (SmolLM2),
+ *              1000000 (Qwen2 long-context)
+ *
+ * The no-scaling default (vanilla GPT-2 / SmolLM2 / Qwen2 short
+ * context) is freq_scale=1.0, ext_factor=0.0, attn_factor=1.0,
+ * beta_fast=32.0, beta_slow=1.0, freq_factors=NULL. YaRN tunes the
+ * scalars; llama3 + LongRoPE provide freq_factors through
+ * tnn_rope_freq_factors_*.
+ *
+ * All double scalars are narrowed to float. freq_factors is forwarded
+ * as given (it may be NULL). Returns NULL when sess, a or pos is NULL. */
+void *tnn_rope_ext(void *sess, void *a, void *pos, int n_dims,
+                   double freq_base, double freq_scale,
+                   double ext_factor, double attn_factor,
+                   double beta_fast, double beta_slow,
+                   void *freq_factors)
+{
+    /* 2 is GGML_ROPE_TYPE_NEOX, which matches HF llama rotate_half. */
+    enum { ROPE_MODE_NEOX = 2 };
+    /* n_ctx_orig is consulted only when ext_factor != 0 (YaRN), so 0
+     * is passed here; callers using YaRN encode orig_ctx via the
+     * freq_factors path or pass it via attn_factor scaling. */
+    enum { ROPE_N_CTX_ORIG = 0 };
+
+    if (sess == NULL || a == NULL || pos == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    struct ggml_tensor *positions = (struct ggml_tensor *)pos;
+    struct ggml_tensor *factors = (struct ggml_tensor *)freq_factors;
+    struct ggml_tensor *rotated =
+        ggml_rope_ext(session->ctx, input, positions, factors,
+                      n_dims, ROPE_MODE_NEOX, ROPE_N_CTX_ORIG,
+                      (float)freq_base, (float)freq_scale,
+                      (float)ext_factor, (float)attn_factor,
+                      (float)beta_fast, (float)beta_slow);
+    return (void *)rotated;
+}
+/* Allocate a persistent (n_dims/2)-element F32 tensor in ctx_w to hold
+ * RoPE freq_factors for llama3-style or LongRoPE scaling. Must be
+ * called BEFORE tnn_finalize_weights, like any other persistent.
+ *
+ * The values are computed by the Ruby side (see
+ * Toy::RopeScaling.compute_llama3_freq_factors) and uploaded via the
+ * standard tnn_upload_from_float_array path after finalize. Doing the
+ * math in Ruby (i) keeps the C wrapper simple, (ii) avoids the
+ * "write to t->data with no_alloc=true" trap, and (iii) makes the
+ * scaling formula trivially testable from MRI without recompiling.
+ *
+ * Returns NULL when sess is NULL, when n_dims < 2 (n_dims / 2 would
+ * be zero, i.e. an empty tensor with nothing to upload into), or when
+ * the weights have already been finalized. An odd n_dims is rounded
+ * down by the integer division. */
+void *tnn_rope_freq_factors_alloc(void *sess, int n_dims)
+{
+    if (!sess || n_dims < 2) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    if (s->weights_finalized) return NULL;
+    return (void *)ggml_new_tensor_1d(s->ctx_w, GGML_TYPE_F32,
+                                      (int64_t)(n_dims / 2));
+}
+/* Allocate a 1-D int32 tensor in the *session* context. Used to hold
+ * RoPE position indices. The caller fills it via tnn_scratch_set_i32 +
+ * tnn_upload_int_array (or fills directly during graph build).
+ *
+ * Returns NULL when sess is NULL or n <= 0. The length check and the
+ * explicit int64_t widening mirror tnn_input_1d_i32, which allocates
+ * the same kind of tensor in the same context; without the check a
+ * zero or negative length was passed straight to ggml. */
+void *tnn_input_1d_i32_ctx(void *sess, int n)
+{
+    if (!sess || n <= 0) return NULL;
+    tnn_session *s = (tnn_session *)sess;
+    return (void *)ggml_new_tensor_1d(s->ctx, GGML_TYPE_I32, (int64_t)n);
+}
+/* Plain softmax (ggml_soft_max), normalized along ne[0]. Under this
+ * project's convention (ne0=cols, ne1=rows) that is a per-row softmax
+ * and matches the project's softmax_rows!.
+ * Returns NULL when sess or a is NULL. */
+void *tnn_softmax(void *sess, void *a)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *logits = (struct ggml_tensor *)a;
+    return (void *)ggml_soft_max(session->ctx, logits);
+}
+/* Wrap ggml_flash_attn_ext(q, k, v, mask, scale, max_bias,
+ * logit_softcap); the three doubles are narrowed to float.
+ * mask is allowed to be NULL when no causal/sequence mask is wanted
+ * (e.g. fully dense T_q=1 decode with no padding) - ggml's
+ * implementation handles NULL.
+ * Returns NULL when sess, q, k or v is NULL. */
+void *tnn_flash_attn_ext(void *sess, void *q, void *k, void *v, void *mask,
+                          double scale, double max_bias, double logit_softcap)
+{
+    if (sess == NULL || q == NULL || k == NULL || v == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *q_t = (struct ggml_tensor *)q;
+    struct ggml_tensor *k_t = (struct ggml_tensor *)k;
+    struct ggml_tensor *v_t = (struct ggml_tensor *)v;
+    struct ggml_tensor *mask_t = (struct ggml_tensor *)mask;
+    const float scale_f = (float)scale;
+    const float max_bias_f = (float)max_bias;
+    const float softcap_f = (float)logit_softcap;
+    return (void *)ggml_flash_attn_ext(session->ctx, q_t, k_t, v_t, mask_t,
+                                        scale_f, max_bias_f, softcap_f);
+}
+/* Transpose, materialized. ggml_transpose alone is just a
+ * stride-permutation view (no data movement); it is wrapped in
+ * ggml_cont so the result is contiguous f32 and downloadable.
+ * Returns NULL when sess or a is NULL. */
+void *tnn_transpose(void *sess, void *a)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *view = ggml_transpose(session->ctx, (struct ggml_tensor *)a);
+    struct ggml_tensor *dense = ggml_cont(session->ctx, view);
+    return (void *)dense;
+}
+/* Wrap ggml_scale: multiply `a` by the scalar `scale` (narrowed to
+ * float). Returns NULL when sess or a is NULL. */
+void *tnn_scale(void *sess, void *a, double scale)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    const float factor = (float)scale;
+    return (void *)ggml_scale(session->ctx, (struct ggml_tensor *)a, factor);
+}
+/* Wrap ggml_rms_norm_back(x, dy, eps); eps is narrowed to float.
+ * Returns NULL when sess, x or dy is NULL. */
+void *tnn_rms_norm_back(void *sess, void *x, void *dy, double eps)
+{
+    if (sess == NULL || x == NULL || dy == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)x;
+    struct ggml_tensor *upstream = (struct ggml_tensor *)dy;
+    const float eps_f = (float)eps;
+    return (void *)ggml_rms_norm_back(session->ctx, input, upstream, eps_f);
+}
+/* Backward of the plain softmax, built on ggml_soft_max_ext_back with
+ * scale fixed at 1.0 and max_bias fixed at 0.0 (no ALiBi). The
+ * tensors are forwarded in the order (a, dy).
+ * Returns NULL when sess, a or dy is NULL. */
+void *tnn_softmax_back(void *sess, void *a, void *dy)
+{
+    if (sess == NULL || a == NULL || dy == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    const float unit_scale = 1.0f;
+    const float no_alibi_bias = 0.0f;
+    struct ggml_tensor *first = (struct ggml_tensor *)a;
+    struct ggml_tensor *second = (struct ggml_tensor *)dy;
+    return (void *)ggml_soft_max_ext_back(session->ctx, first, second,
+                                           unit_scale, no_alibi_bias);
+}
+/* Backward pass of SiLU. With SiLU(x) = x * sigmoid(x),
+ *   dSiLU/dx = sigmoid(x) * (1 + x * (1 - sigmoid(x))).
+ * Takes x and dy (the upstream gradient) and returns dx.
+ *
+ * NOTE: the comment in ggml's public header for ggml_silu_back lists
+ * the arguments as "a - x, b - dy", but the actual CPU op reads
+ * src[0]=dy and src[1]=x. The call below therefore passes (dy, x) to
+ * agree with the implementation, not with the header comment.
+ * Returns NULL when sess, x or dy is NULL. */
+void *tnn_silu_back(void *sess, void *x, void *dy)
+{
+    if (sess == NULL || x == NULL || dy == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *upstream = (struct ggml_tensor *)dy;
+    struct ggml_tensor *input = (struct ggml_tensor *)x;
+    return (void *)ggml_silu_back(session->ctx, upstream, input);
+}
+/* Backward pass of RoPE-NEOX. The argument convention is that of
+ * tnn_rope_ext, except that the tensor argument is dy (the gradient
+ * of the rope_ext output); the result is dx. The YaRN/scaling
+ * arguments must be the ones used in the forward call - a mismatch
+ * silently corrupts the gradients.
+ * All double scalars are narrowed to float. freq_factors is forwarded
+ * as given. Returns NULL when sess, dy or pos is NULL. */
+void *tnn_rope_ext_back(void *sess, void *dy, void *pos, int n_dims,
+                        double freq_base, double freq_scale,
+                        double ext_factor, double attn_factor,
+                        double beta_fast, double beta_slow,
+                        void *freq_factors)
+{
+    /* 2 is GGML_ROPE_TYPE_NEOX; n_ctx_orig is passed as 0. */
+    enum { ROPE_MODE_NEOX = 2, ROPE_N_CTX_ORIG = 0 };
+
+    if (sess == NULL || dy == NULL || pos == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *upstream = (struct ggml_tensor *)dy;
+    struct ggml_tensor *positions = (struct ggml_tensor *)pos;
+    struct ggml_tensor *factors = (struct ggml_tensor *)freq_factors;
+    struct ggml_tensor *dx =
+        ggml_rope_ext_back(session->ctx, upstream, positions, factors,
+                           n_dims, ROPE_MODE_NEOX, ROPE_N_CTX_ORIG,
+                           (float)freq_base, (float)freq_scale,
+                           (float)ext_factor, (float)attn_factor,
+                           (float)beta_fast, (float)beta_slow);
+    return (void *)dx;
+}
+/* Wrap ggml_get_rows(table, idx).
+ * Returns NULL when sess, table or idx is NULL. */
+void *tnn_get_rows(void *sess, void *table, void *idx)
+{
+    if (sess == NULL || table == NULL || idx == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *table_t = (struct ggml_tensor *)table;
+    struct ggml_tensor *row_ids = (struct ggml_tensor *)idx;
+    return (void *)ggml_get_rows(session->ctx, table_t, row_ids);
+}
+/* Wrap ggml_get_rows_back(d_out, idx, table_shape).
+ * Returns NULL when sess, d_out, idx or table_shape is NULL. */
+void *tnn_get_rows_back(void *sess, void *d_out, void *idx, void *table_shape)
+{
+    if (sess == NULL || d_out == NULL || idx == NULL || table_shape == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *upstream = (struct ggml_tensor *)d_out;
+    struct ggml_tensor *row_ids = (struct ggml_tensor *)idx;
+    struct ggml_tensor *shape_like = (struct ggml_tensor *)table_shape;
+    return (void *)ggml_get_rows_back(session->ctx, upstream, row_ids, shape_like);
+}
+/* Create a 1-D GGML_TYPE_I32 tensor with n elements in the session's
+ * ctx. Returns NULL when sess is NULL or n <= 0. */
+void *tnn_input_1d_i32(void *sess, int n)
+{
+    if (sess == NULL) {
+        return NULL;
+    }
+    if (n <= 0) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    const int64_t length = (int64_t)n;
+    return (void *)ggml_new_tensor_1d(session->ctx, GGML_TYPE_I32, length);
+}
+/* GeLU backward (tanh approximation), computed in place on the
+ * session's float scratch area. Layout, in floats:
+ *   [0,  n)   x   input of the forward GeLU          (read)
+ *   [n,  2n)  dh  upstream gradient                  (read)
+ *   [2n, 3n)  dx  result, dx[i] = dh[i] * gelu'(x[i]) (written)
+ *
+ * Does nothing when sess is NULL, n <= 0, or the three n-float regions
+ * do not fit in TNN_SCRATCH_BYTES. The capacity test divides instead
+ * of computing 3 * n: for large n that product overflows int, which
+ * is undefined behaviour and could let an oversized n through. */
+void tnn_gelu_back_scratch(void *sess, int n)
+{
+    if (!sess || n <= 0) return;
+    tnn_session *s = (tnn_session *)sess;
+    const int max_slots = TNN_SCRATCH_BYTES / (int)sizeof(float);
+    if (n > max_slots / 3) return;     /* not enough scratch */
+    const float *x  = s->scratch + 0;
+    const float *dh = s->scratch + n;
+    float       *dx = s->scratch + 2 * n;
+    const float c = 0.7978845608028654f;    /* sqrt(2/pi) */
+    const float k = 0.044715f;
+    for (int i = 0; i < n; ++i) {
+        float xi  = x[i];
+        float xi2 = xi * xi;
+        /* u is the argument of tanh in the forward approximation. */
+        float u   = c * (xi + k * xi * xi2);
+        float tu  = tanhf(u);
+        float sech2 = 1.0f - tu * tu;       /* d tanh(u) / du */
+        float dudx  = c * (1.0f + 3.0f * k * xi2);
+        float dgelu = 0.5f * (1.0f + tu) + 0.5f * xi * sech2 * dudx;
+        dx[i] = dh[i] * dgelu;
+    }
+}
+/* One Adam update, computed in place on the session's float scratch
+ * area. Layout, in floats:
+ *   [0,  n)   p  parameters        (read and updated)
+ *   [n,  2n)  g  gradients         (read)
+ *   [2n, 3n)  m  first moment      (read and updated)
+ *   [3n, 4n)  v  second moment     (read and updated)
+ *
+ * Per element:
+ *   m = b1 * m + (1 - b1) * g
+ *   v = b2 * v + (1 - b2) * g * g
+ *   p = p - lr * (m / omc1) / (sqrt(v / omc2) + eps)
+ * omc1 / omc2 are the bias-correction divisors supplied by the caller
+ * (presumably 1 - b1^t and 1 - b2^t - confirm against the caller).
+ * All hyper-parameters are narrowed to float before use.
+ *
+ * Does nothing when sess is NULL, n <= 0, or the four n-float regions
+ * do not fit in TNN_SCRATCH_BYTES. The capacity test divides instead
+ * of computing 4 * n: for large n that product overflows int, which
+ * is undefined behaviour and could let an oversized n through. */
+void tnn_adam_step_scratch(void *sess, int n,
+                            double lr, double b1, double b2, double eps,
+                            double omc1, double omc2)
+{
+    if (!sess || n <= 0) return;
+    tnn_session *s = (tnn_session *)sess;
+    const int max_slots = TNN_SCRATCH_BYTES / (int)sizeof(float);
+    if (n > max_slots / 4) return;     /* not enough scratch */
+    float *p = s->scratch + 0;
+    const float *g = s->scratch + n;
+    float *m = s->scratch + 2 * n;
+    float *v = s->scratch + 3 * n;
+    const float one_minus_b1 = (float)(1.0 - b1);
+    const float one_minus_b2 = (float)(1.0 - b2);
+    const float fb1   = (float)b1;
+    const float fb2   = (float)b2;
+    const float flr   = (float)lr;
+    const float feps  = (float)eps;
+    const float fomc1 = (float)omc1;
+    const float fomc2 = (float)omc2;
+    for (int i = 0; i < n; ++i) {
+        float gi = g[i];
+        float new_m = fb1 * m[i] + one_minus_b1 * gi;
+        float new_v = fb2 * v[i] + one_minus_b2 * gi * gi;
+        m[i] = new_m;
+        v[i] = new_v;
+        float m_hat = new_m / fomc1;
+        float v_hat = new_v / fomc2;
+        p[i] = p[i] - flr * m_hat / (sqrtf(v_hat) + feps);
+    }
+}
+/* Flag `tensor` as a graph output via ggml_set_output. A NULL tensor
+ * is ignored. */
+void tnn_set_output(void *tensor)
+{
+    struct ggml_tensor *t = (struct ggml_tensor *)tensor;
+    if (t != NULL) {
+        ggml_set_output(t);
+    }
+}
+/* Reduce every element of `a` to a single scalar (ggml_sum). Handy
+ * for turning a vector output into a loss, e.g. sum(y * y) for an L2
+ * squared loss. Returns NULL when sess or a is NULL. */
+void *tnn_sum(void *sess, void *a)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    return (void *)ggml_sum(session->ctx, input);
+}
+/* Wrap ggml_sum_rows(a). Returns NULL when sess or a is NULL. */
+void *tnn_sum_rows(void *sess, void *a)
+{
+    if (sess == NULL || a == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *input = (struct ggml_tensor *)a;
+    return (void *)ggml_sum_rows(session->ctx, input);
+}
+/* Cross-entropy loss of logits `a` against a label tensor `b` that
+ * holds probability distributions. Wraps ggml_cross_entropy_loss and
+ * yields a scalar. `b` has the same shape as the logits and should be
+ * a probability dist: one-hot for hard targets, label-smoothed for
+ * soft ones. The value is the mean negative log-likelihood across
+ * the columns of `a` (one column = one example). Used for F1.2
+ * SmolLM2 LoRA fine-tuning.
+ * Returns NULL when sess, a or b is NULL. */
+void *tnn_cross_entropy_loss(void *sess, void *a, void *b)
+{
+    if (sess == NULL || a == NULL || b == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *logits = (struct ggml_tensor *)a;
+    struct ggml_tensor *labels = (struct ggml_tensor *)b;
+    return (void *)ggml_cross_entropy_loss(session->ctx, logits, labels);
+}
+/* Flag `tensor` as a trainable parameter via ggml_set_param. A NULL
+ * tensor is ignored. */
+void tnn_set_param(void *tensor)
+{
+    struct ggml_tensor *t = (struct ggml_tensor *)tensor;
+    if (t != NULL) {
+        ggml_set_param(t);
+    }
+}
+/* Flag `tensor` as the training loss (ggml_set_loss). Autograd via
+ * ggml_build_backward_expand needs this: it asserts that at least one
+ * node is marked as loss and at least one as param. The usual
+ * candidate is the scalar produced by a sum-reduce or cross-entropy.
+ * A NULL tensor is ignored. */
+void tnn_set_loss(void *tensor)
+{
+    struct ggml_tensor *t = (struct ggml_tensor *)tensor;
+    if (t != NULL) {
+        ggml_set_loss(t);
+    }
+}
+/* Phase F0.4 autograd: once the forward graph is built and the params
+ * and the loss are marked, this extends the graph with backward nodes.
+ *
+ * Caller-side preparation:
+ *   1. tnn_input_*_persistent(...) for params, each marked with
+ *      tnn_set_param
+ *   2. forward ops (matmul, gelu, ...) ending in a scalar loss
+ *   3. tnn_set_loss(loss_tensor); tnn_set_output(loss_tensor)
+ *   4. tnn_realize(sess, loss_tensor)
+ *
+ * Building is split from allocation so that optimizer-step nodes can
+ * be appended in between. Typical in-graph-optimizer flow:
+ *
+ *   tnn_realize(sess, loss)            // forward graph
+ *   tnn_build_backward(sess)           // dup + build_backward_expand
+ *   for each param:
+ *     opt_node = tnn_opt_step_adamw(sess, p, grad, m, v, hp)
+ *     tnn_extend_backward_graph(sess, opt_node)
+ *   tnn_realize_backward(sess)         // sched-alloc the final graph
+ *   loop:
+ *     tnn_compute_backward(sess)       // fwd + bwd + adam in one call
+ *     read scalar loss; repeat
+ *
+ * Gradients are retrieved per param with tnn_tensor_grad(param).
+ *
+ * s->graph stays forward-only for inference use; the backward nodes
+ * go into s->graph_b, a fresh dup of s->graph taken with grads=true.
+ *
+ * Return codes:
+ *    0  s->graph_b built (NOT allocated yet)
+ *   -1  sess is NULL
+ *   -2  the forward graph has not been realized
+ *   -3  ggml_graph_dup returned NULL */
+int tnn_build_backward(void *sess)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return -1;
+    }
+    if (!session->realized) {
+        return -2;   /* the forward graph has to exist first */
+    }
+    /* ggml_build_backward_expand needs cgraph->grads and grad_accs to
+     * be non-NULL, and ggml_new_graph_custom only allocates them when
+     * asked for grads. The session graph is created forward-only
+     * (grads=false), hence the dup with force_grads=true. The dup
+     * SHARES its tensor pointers - leaves and compute nodes alike -
+     * with the original graph. */
+    session->graph_b = ggml_graph_dup(session->ctx, session->graph, /*force_grads=*/true);
+    if (session->graph_b == NULL) {
+        return -3;
+    }
+    /* Add backward nodes for every node tagged as param. No allocation
+     * happens here: the caller may still append opt_step nodes and
+     * then finalize with tnn_realize_backward. */
+    ggml_build_backward_expand(session->ctx, session->graph_b, NULL);
+    return 0;
+}
+/* Append one more node (typically the output of an opt_step op) and
+ * its compute tree to the backward graph. Meant to be called between
+ * tnn_build_backward and tnn_realize_backward.
+ * Returns 0 on success, -1 on a NULL argument, -2 when the session has
+ * no backward graph. */
+int tnn_extend_backward_graph(void *sess, void *node)
+{
+    if (sess == NULL || node == NULL) {
+        return -1;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_cgraph *backward = session->graph_b;
+    if (backward == NULL) {
+        return -2;
+    }
+    ggml_build_forward_expand(backward, (struct ggml_tensor *)node);
+    return 0;
+}
+/* Allocate the finished backward graph on the scheduler. Call once,
+ * after every opt_step node has been appended; later
+ * tnn_compute_backward calls are then cheap re-runs.
+ * Returns 0 on success, -1 on NULL session, -2 when there is no
+ * backward graph, -3 when the scheduler could not allocate it. */
+int tnn_realize_backward(void *sess)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return -1;
+    }
+    if (session->graph_b == NULL) {
+        return -2;
+    }
+    int64_t trace_token = tnn_trace_begin("realize_backward");
+    /* Reset first so the allocation starts from a clean scheduler. */
+    ggml_backend_sched_reset(session->engine->sched);
+    int allocated = 0;
+    if (ggml_backend_sched_alloc_graph(session->engine->sched, session->graph_b)) {
+        allocated = 1;
+    }
+    tnn_trace_end("realize_backward", trace_token);
+    if (allocated == 0) {
+        return -3;
+    }
+    session->realized_b = 1;
+    return 0;
+}
+/* Put the backward graph into its initial state by delegating to
+ * ggml_graph_reset: gradient accumulators and the Adam moments (m, v)
+ * of any opt_step nodes are zeroed, and the incoming gradient of the
+ * loss tensor is set to 1.0. Call ONCE, between tnn_realize_backward
+ * and the first tnn_compute_backward; afterwards compute calls
+ * accumulate normally and the momenta persist across steps.
+ * Returns 0 on success, -1 on NULL session, -2 when there is no
+ * backward graph. */
+int tnn_graph_reset(void *sess)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return -1;
+    }
+    if (session->graph_b == NULL) {
+        return -2;
+    }
+    ggml_graph_reset(session->graph_b);
+    return 0;
+}
+/* F1.2 step 5: clear the gradient accumulators (and re-seed the loss
+ * gradient with 1) while leaving opt_step's m / v momenta untouched.
+ * This lets AdamW keep its momentum across training steps, yet the
+ * next compute_backward still recomputes gradients from scratch
+ * instead of accumulating onto the previous ones.
+ *
+ * It is ggml_graph_reset without the GGML_OP_OPT_STEP_ADAMW arm that
+ * zeros src[2] (m) and src[3] (v). For SGD the two primitives behave
+ * identically; for AdamW the difference is load-bearing, because
+ * tnn_graph_reset would wipe the momentum on every step.
+ *
+ * Returns 0 on success, -1 on NULL session, -2 when there is no
+ * backward graph. */
+int tnn_graph_reset_grads_only(void *sess)
+{
+    if (sess == NULL) {
+        return -1;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_cgraph *backward = session->graph_b;
+    if (backward == NULL) {
+        return -2;
+    }
+    const int node_count = ggml_graph_n_nodes(backward);
+    for (int idx = 0; idx < node_count; ++idx) {
+        struct ggml_tensor *cur = ggml_graph_node(backward, idx);
+        struct ggml_tensor *acc = ggml_graph_get_grad_acc(backward, cur);
+        if (acc == NULL) {
+            continue;   /* node carries no gradient accumulator */
+        }
+        if ((cur->flags & GGML_TENSOR_FLAG_LOSS) == 0) {
+            ggml_set_zero(acc);
+            continue;
+        }
+        /* Loss node: its incoming gradient starts at 1. Write through
+         * the backend when the accumulator has a buffer, otherwise
+         * directly into host memory if there is any. */
+        const float seed = 1.0f;
+        if (acc->buffer) {
+            ggml_backend_tensor_set(acc, &seed, 0, sizeof(float));
+        } else if (acc->data) {
+            *((float *) acc->data) = seed;
+        }
+    }
+    return 0;
+}
+/* Task #70 diagnostic — mark EVERY node of graph_b as an output so the
+ * scheduler may not reuse any intermediate's buffer slot after that
+ * node has been computed. It exists to test the hypothesis that the
+ * CPU/CUDA training divergence comes from sched aliasing of
+ * intermediate grad tensors in long backward chains.
+ *
+ * Ordering: call AFTER tnn_build_backward (the backward nodes must
+ * exist) and BEFORE tnn_realize_backward (the sched must see the
+ * output flags when it allocates buffers).
+ *
+ * Diagnostic primitive only, NOT a recommended training path: pinning
+ * every node defeats the sched's buffer-reuse optimization and
+ * inflates memory by ~node-count tensors. Use it solely to localize
+ * sched aliasing as the cause.
+ *
+ * Returns the node count of graph_b, -1 on NULL session, -2 when there
+ * is no backward graph. */
+int tnn_pin_all_graph_b_nodes(void *sess)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return -1;
+    }
+    if (session->graph_b == NULL) {
+        return -2;
+    }
+    const int total = ggml_graph_n_nodes(session->graph_b);
+    for (int idx = 0; idx < total; ++idx) {
+        struct ggml_tensor *node = ggml_graph_node(session->graph_b, idx);
+        if (node != NULL) {
+            ggml_set_output(node);
+        }
+    }
+    return total;
+}
+/* Execute graph_b on the scheduler — forward and backward in a single
+ * compute call. Returns 0 on success, -1 on NULL session, -2 when the
+ * backward graph has not been realized, otherwise the non-success
+ * ggml_status value cast to int. */
+int tnn_compute_backward(void *sess)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return -1;
+    }
+    if (session->realized_b == 0) {
+        return -2;
+    }
+    int64_t trace_token = tnn_trace_begin("compute_backward");
+    enum ggml_status status =
+        ggml_backend_sched_graph_compute(session->engine->sched, session->graph_b);
+    tnn_trace_end("compute_backward", trace_token);
+    if (status != GGML_STATUS_SUCCESS) {
+        return (int)status;
+    }
+    return 0;
+}
+/* Look up the gradient tensor that graph_b holds for `tensor`. The
+ * caller can read its contents afterwards via tnn_download. Yields
+ * NULL when an argument is NULL, when the session has no backward
+ * graph, or when no gradient exists for the tensor (param wasn't
+ * marked, or backward wasn't built/computed). */
+void *tnn_tensor_grad(void *sess, void *tensor)
+{
+    if (sess == NULL || tensor == NULL) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    if (session->graph_b == NULL) {
+        return NULL;
+    }
+    struct ggml_tensor *grad =
+        ggml_graph_get_grad(session->graph_b, (struct ggml_tensor *)tensor);
+    return (void *)grad;
+}
+/* Create a 1-D F32 tensor with `n` elements in the session's compute
+ * context. Returns NULL for a NULL session or a non-positive n. */
+void *tnn_input_1d_f32(void *sess, int n)
+{
+    if (sess == NULL) {
+        return NULL;
+    }
+    if (n <= 0) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    const int64_t length = (int64_t)n;
+    struct ggml_tensor *created = ggml_new_tensor_1d(session->ctx, GGML_TYPE_F32, length);
+    return (void *)created;
+}
+/* Build an AdamW optimizer-step node in the session's compute context
+ * by forwarding the five tensor handles to ggml_opt_step_adamw in the
+ * order (a, grad, m, v, params). Returns the new node, or NULL when
+ * any argument is NULL. */
+void *tnn_opt_step_adamw(void *sess, void *a, void *grad, void *m, void *v, void *params)
+{
+    const int all_present = sess && a && grad && m && v && params;
+    if (!all_present) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *weights = (struct ggml_tensor *)a;
+    struct ggml_tensor *gradient = (struct ggml_tensor *)grad;
+    struct ggml_tensor *moment_m = (struct ggml_tensor *)m;
+    struct ggml_tensor *moment_v = (struct ggml_tensor *)v;
+    struct ggml_tensor *hyper = (struct ggml_tensor *)params;
+    struct ggml_tensor *step = ggml_opt_step_adamw(session->ctx, weights, gradient,
+                                                   moment_m, moment_v, hyper);
+    return (void *)step;
+}
+/* Build an SGD optimizer-step node: w = w - alpha * grad - alpha * wd * w.
+ * Simpler than Adam and handy for sanity-checking the direction of the
+ * autograd gradient, since there is no momentum to obscure things.
+ * `params` is a 1-D, 2-element tensor: [alpha, weight_decay].
+ * Returns the new node, or NULL when any argument is NULL. */
+void *tnn_opt_step_sgd(void *sess, void *a, void *grad, void *params)
+{
+    const int all_present = sess && a && grad && params;
+    if (!all_present) {
+        return NULL;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *weights = (struct ggml_tensor *)a;
+    struct ggml_tensor *gradient = (struct ggml_tensor *)grad;
+    struct ggml_tensor *hyper = (struct ggml_tensor *)params;
+    struct ggml_tensor *step = ggml_opt_step_sgd(session->ctx, weights, gradient, hyper);
+    return (void *)step;
+}
+/* Finish the forward graph: append `result`'s compute tree to
+ * s->graph, reset the scheduler and allocate the graph on it. On
+ * success the session is flagged realized and last_graph is set to 1.
+ * Returns 0 on success, -1 on a NULL argument, -2 when the session is
+ * already realized, -3 when the scheduler could not allocate. */
+int tnn_realize(void *sess, void *result)
+{
+    if (sess == NULL || result == NULL) {
+        return -1;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    if (session->realized) {
+        return -2;
+    }
+    int64_t trace_token = tnn_trace_begin("realize");
+    ggml_build_forward_expand(session->graph, (struct ggml_tensor *)result);
+    ggml_backend_sched_reset(session->engine->sched);
+    int allocated = 0;
+    if (ggml_backend_sched_alloc_graph(session->engine->sched, session->graph)) {
+        allocated = 1;
+    }
+    tnn_trace_end("realize", trace_token);
+    if (allocated == 0) {
+        return -3;
+    }
+    session->last_graph = 1;
+    session->realized   = 1;
+    return 0;
+}
+/* tnn_realize without the sched-alloc step. This is the entry point
+ * for training callers:
+ *   tnn_build_forward_only(sess, loss) → tnn_build_backward(sess) →
+ *   (optional tnn_extend_backward_graph for opt_step) →
+ *   tnn_realize_backward.
+ * The trailing tnn_realize_backward performs the one and only
+ * sched-alloc, on the combined graph_b.
+ *
+ * Why not tnn_realize followed by tnn_realize_backward: that sequence
+ * is broken. The sched_reset between the two leaves tensor buffer
+ * pointers stale, and the second alloc lands tensors on freed-pool
+ * memory (validated 2026-05-20 with a standalone ggml POC reproducing
+ * micro5's failure byte-for-byte; see docs/design/phase-f1-status.md).
+ *
+ * Returns 0 on success, -1 on a NULL argument, -2 when the session is
+ * already realized. */
+int tnn_build_forward_only(void *sess, void *result)
+{
+    if (sess == NULL || result == NULL) {
+        return -1;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    if (session->realized) {
+        return -2;
+    }
+    struct ggml_tensor *target = (struct ggml_tensor *)result;
+    ggml_build_forward_expand(session->graph, target);
+    /* Same bookkeeping as tnn_realize, just without the allocation. */
+    session->last_graph = 1;
+    session->realized   = 1;
+    return 0;
+}
+/* (This note documents tnn_add_to_graph, which is defined further
+ * down in this file.) Add an extra tensor's compute tree to the graph
+ * BEFORE tnn_realize. Use it for side-effect ops (ggml_cpy into a
+ * view) that aren't reachable from the final result tensor — without
+ * this they'd be pruned. The realize-target's tree is appended later
+ * by tnn_realize itself. */
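+/* E2.4 — streaming corpus loader primitives (tnn_read_f32_file and
+ * tnn_read_i32_file, both below). The description that follows is
+ * written for the i32 variant; the f32 variant has the same shape but
+ * reads floats and widens them to double.
+ * Reads n_ints int32s from `path` starting at byte_offset
+ * (byte-addressed, not token-addressed — the caller computes
+ * offset = token_offset * 4).
+ * Widens the on-disk i32s to int64 to match Spinel's :int_array
+ * ABI (Ruby Integers are 64-bit on this platform).
+ * Returns the count of i32s actually read (== n_ints on a full read,
+ * < n_ints at EOF), or a negative value on open/seek/alloc failure. */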
+/* Read up to n_floats 32-bit floats from `path`, starting at
+ * byte_offset (byte-addressed), and widen each one to double in dst.
+ * dst must have room for n_floats doubles.
+ *
+ * Returns the number of floats actually read (== n_floats on a full
+ * read, fewer when the file ends first), or a negative code:
+ *   -1 bad argument (NULL path/dst or n_floats <= 0),
+ *   -2 file could not be opened,
+ *   -3 seek to byte_offset failed,
+ *   -4 temporary buffer could not be allocated,
+ *   -5 the stream reported a read error (as opposed to plain EOF). */
+int tnn_read_f32_file(const char *path, int byte_offset, int n_floats, double *dst)
+{
+    if (!path || !dst || n_floats <= 0) return -1;
+    FILE *f = fopen(path, "rb");
+    if (!f) return -2;
+    if (fseek(f, (long)byte_offset, SEEK_SET) != 0) {
+        fclose(f);
+        return -3;
+    }
+    float *tmp = malloc((size_t)n_floats * sizeof *tmp);
+    if (!tmp) {
+        fclose(f);
+        return -4;
+    }
+    size_t got = fread(tmp, sizeof *tmp, (size_t)n_floats, f);
+    /* A short count means either EOF or an I/O error; only ferror can
+     * tell them apart, and it must be asked before the stream closes. */
+    int read_failed = (got < (size_t)n_floats) && ferror(f);
+    fclose(f);
+    if (read_failed) {
+        free(tmp);
+        return -5;
+    }
+    for (size_t i = 0; i < got; i++) {
+        dst[i] = (double)tmp[i];
+    }
+    free(tmp);
+    return (int)got;
+}
+/* Read up to n_ints 32-bit integers from `path`, starting at
+ * byte_offset (byte-addressed), and widen each one to int64_t in dst.
+ * dst must have room for n_ints int64_t values.
+ *
+ * Returns the number of integers actually read (== n_ints on a full
+ * read, fewer when the file ends first), or a negative code:
+ *   -1 bad argument (NULL path/dst or n_ints <= 0),
+ *   -2 file could not be opened,
+ *   -3 seek to byte_offset failed,
+ *   -4 temporary buffer could not be allocated,
+ *   -5 the stream reported a read error (as opposed to plain EOF). */
+int tnn_read_i32_file(const char *path, int byte_offset, int n_ints, int64_t *dst)
+{
+    if (!path || !dst || n_ints <= 0) return -1;
+    FILE *f = fopen(path, "rb");
+    if (!f) return -2;
+    if (fseek(f, (long)byte_offset, SEEK_SET) != 0) {
+        fclose(f);
+        return -3;
+    }
+    int32_t *tmp = malloc((size_t)n_ints * sizeof *tmp);
+    if (!tmp) {
+        fclose(f);
+        return -4;
+    }
+    size_t got = fread(tmp, sizeof *tmp, (size_t)n_ints, f);
+    /* A short count means either EOF or an I/O error; only ferror can
+     * tell them apart, and it must be asked before the stream closes. */
+    int read_failed = (got < (size_t)n_ints) && ferror(f);
+    fclose(f);
+    if (read_failed) {
+        free(tmp);
+        return -5;
+    }
+    for (size_t i = 0; i < got; i++) {
+        dst[i] = (int64_t)tmp[i];
+    }
+    free(tmp);
+    return (int)got;
+}
+/* toy#embed-api (#145) — dequantize-aware single-row read from a
+ * 2-D tensor whose data lives in CPU-readable memory (mmap'd GGUF
+ * pages are the common case). Reads row `row_idx` of `tensor`,
+ * dequantizes via the per-type to_float, and writes d_model doubles
+ * into dst.
+ *
+ * Returns 0 on success, negative on failure:
+ *   -1 null arg, -2 bad row_idx, -3 mismatched d_model,
+ *   -4 t->data is NULL (GPU-resident; needs download path instead),
+ *   -5 type has no to_float (no known dequantizer),
+ *   -6 the float scratch row could not be allocated.
+ *
+ * Use case: Tep's future /v1/embeddings. The mmap'd token_embd table
+ * is CPU-readable regardless of compute backend, so this primitive
+ * works under :cpu, :cuda, and :metal sessions. */
+int tnn_embed_lookup_to_doubles(void *sess, void *tensor, int row_idx,
+                                 double *dst, int d_model)
+{
+    (void)sess;  /* not consulted; embed table lives in mmap region */
+    if (!tensor || !dst) return -1;
+    struct ggml_tensor *t = (struct ggml_tensor *)tensor;
+    /* ne[] is 64-bit: compare in 64 bits so that an extent above
+     * INT_MAX cannot be truncated into a wrong bound. */
+    if (row_idx < 0 || (int64_t)row_idx >= t->ne[1]) return -2;
+    if ((int64_t)d_model != t->ne[0]) return -3;
+    if (!t->data) return -4;
+    /* Row offset in bytes: stride along ne[1] is nb[1]. */
+    const uint8_t *src = (const uint8_t *)t->data + (size_t)row_idx * t->nb[1];
+    /* F32 needs no dequant; ggml's type_traits.to_float is NULL for it. */
+    if (t->type == GGML_TYPE_F32) {
+        const float *frow = (const float *)src;
+        for (int j = 0; j < d_model; j++) dst[j] = (double)frow[j];
+        return 0;
+    }
+    const struct ggml_type_traits *tr = ggml_get_type_traits(t->type);
+    if (!tr || !tr->to_float) return -5;
+    /* Dequantize into a float scratch then widen to double. */
+    float *fbuf = malloc((size_t)d_model * sizeof *fbuf);
+    if (!fbuf) return -6;
+    tr->to_float(src, fbuf, (int64_t)d_model);
+    for (int j = 0; j < d_model; j++) dst[j] = (double)fbuf[j];
+    free(fbuf);
+    return 0;
+}
+/* GH#17 — re-allocate the session's forward + backward graphs with a
+ * larger node-count budget. Must be called BEFORE realize so the ctx
+ * hasn't yet stored any compute tensors.
+ *
+ * Why this exists: per-head attention decomposition makes node count
+ * scale as O(n_layers × n_heads); the default 65536 cap overflows on
+ * 24L × 16-head Qwen-shape models at backward-expand time. Callers in
+ * realize_for_random_init / _mmap pass a size derived from cfg.
+ *
+ * Implementation: when the current buffer is too small, the compute
+ * ctx is replaced by one with a buffer large enough to hold:
+ *   - the forward graph (capacity nodes)
+ *   - the backward graph (capacity nodes, with grads → 2× tensor-ptr
+ *     arrays + a hash_set sized proportional to capacity)
+ *   - rebuild headroom for many decode steps (the original 32 MB slack
+ *     served distil-GPT-2 at 10k rebuilds; we keep that slack additive)
+ * The persistent-weights ctx (ctx_w) is untouched.
+ *
+ * Returns 0 on success, -1 on NULL session, -2 on a non-positive
+ * capacity, -3 when the session is already realized, -4 when the new
+ * buffer could not be allocated, -5 when the new ctx could not be
+ * created. On -4 / -5 the session keeps its previous ctx and graphs. */
+int tnn_session_set_graph_capacity(void *sess, int capacity)
+{
+    if (!sess) return -1;
+    tnn_session *s = (tnn_session *)sess;
+    if (capacity <= 0) return -2;
+    if (s->realized) return -3;
+    /* Size the buffer so two grad-flagged graphs at this capacity fit
+     * comfortably, plus the original rebuild slack. */
+    size_t graph_bytes = ggml_graph_overhead_custom((size_t)capacity, true);
+    size_t needed = graph_bytes * 2
+                  + ggml_tensor_overhead() * 262144   /* preserve original tensor-header slack */
+                  + 32 * 1024 * 1024;                 /* rebuild headroom */
+    if (needed > s->ctx_buf_size) {
+        /* Build the replacement BEFORE tearing the old ctx down, so a
+         * failed allocation leaves the session usable instead of
+         * holding a freed ctx. Costs a brief peak of old + new buffer. */
+        uint8_t *new_buf = (uint8_t *)calloc(1, needed);
+        if (!new_buf) return -4;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ needed,
+            /*.mem_buffer =*/ new_buf,
+            /*.no_alloc   =*/ true,
+        };
+        struct ggml_context *new_ctx = ggml_init(params);
+        if (!new_ctx) {
+            free(new_buf);
+            return -5;
+        }
+        ggml_free(s->ctx);
+        free(s->ctx_buf);
+        s->ctx_buf_size = needed;
+        s->ctx_buf      = new_buf;
+        s->ctx          = new_ctx;
+    }
+    s->graph_capacity = capacity;
+    s->graph   = ggml_new_graph_custom(s->ctx, (size_t)s->graph_capacity, false);
+    s->graph_b = ggml_new_graph_custom(s->ctx, (size_t)s->graph_capacity, false);
+    /* graph_b is a brand-new empty graph now; any earlier "realized"
+     * claim referred to the graph that was just replaced. */
+    s->realized_b = 0;
+    return 0;
+}
+/* Append `tensor`'s compute tree to the forward graph ahead of the
+ * realize step. Returns 0 on success, -1 on a NULL argument, -2 when
+ * the session is already realized. */
+int tnn_add_to_graph(void *sess, void *tensor)
+{
+    if (sess == NULL || tensor == NULL) {
+        return -1;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    if (session->realized) {
+        return -2;
+    }
+    struct ggml_tensor *extra = (struct ggml_tensor *)tensor;
+    ggml_build_forward_expand(session->graph, extra);
+    return 0;
+}
+/* Prepare the session for building a new forward graph. The
+ * persistent ctx_w and its backend buffer are never touched, so the
+ * weights keep their data.
+ *
+ * History: an earlier version only swapped graphs inside the same
+ * ctx. That grew monotonically and overflowed after ~80 decode steps
+ * at gpt2-small + max_T=1024 (each step creates ~1300 new tensor
+ * headers and none are reclaimed), so the compute ctx has to be torn
+ * down to bound the metadata footprint.
+ *
+ * The scheduler also keeps internal state tied to tensor pointers; it
+ * is reset before realize, so this is safe. Per decode step:
+ *   tnn_reset_for_rebuild(sess)
+ *   ... build ops with current pos baked in ...
+ *   tnn_realize(sess, result_tensor)
+ *   ... upload, compute, download ...
+ *
+ * Returns 0 on success, -1 on NULL session. */
+int tnn_reset_for_rebuild(void *sess)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return -1;
+    }
+    /* Profiling showed that free()+init() of the (now 130-ish MB)
+     * ctx_buf costs ~500 ms per call and dominates compute, so the
+     * teardown happens ONLY when the ctx is getting full. The (small)
+     * pile of dead headers left between teardowns is bounded by
+     * ctx_used / ctx_buf_size, checked here through ggml_used_mem
+     * before each rebuild.
+     *
+     * Threshold: half the buffer, which leaves headroom for the *next*
+     * step's graph build to finish without overflowing. */
+    const size_t bytes_in_use = ggml_used_mem(session->ctx);
+    const size_t teardown_threshold = session->ctx_buf_size / 2;
+    if (bytes_in_use > teardown_threshold) {
+        ggml_free(session->ctx);
+        struct ggml_init_params fresh = {
+            .mem_size   = session->ctx_buf_size,
+            .mem_buffer = session->ctx_buf,
+            .no_alloc   = true,
+        };
+        session->ctx        = ggml_init(fresh);
+        /* The old graph_b died with the old ctx; start an empty one. */
+        session->graph_b    = ggml_new_graph_custom(session->ctx,
+                                                    (size_t)session->graph_capacity, false);
+        session->realized_b = 0;
+    }
+    session->realized   = 0;
+    session->last_graph = 0;
+    session->graph      = ggml_new_graph_custom(session->ctx,
+                                                (size_t)session->graph_capacity, false);
+    return 0;
+}
+/* Execute the primary graph on the scheduler. Returns 0 on success,
+ * -1 on NULL session, -2 when the graph has not been realized,
+ * otherwise the non-success ggml_status value cast to int. */
+int tnn_compute(void *sess)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return -1;
+    }
+    if (session->realized == 0) {
+        return -2;
+    }
+    int64_t trace_token = tnn_trace_begin("compute");
+    enum ggml_status status =
+        ggml_backend_sched_graph_compute(session->engine->sched, session->graph);
+    tnn_trace_end("compute", trace_token);
+    if (status != GGML_STATUS_SUCCESS) {
+        return (int)status;
+    }
+    return 0;
+}
+/* Build the SECONDARY graph (graph_b) of a session; it shares ctx and
+ * tensors with the primary graph. No allocation happens here — call
+ * tnn_switch_b before tnn_compute_b on every cycle.
+ * Returns 0 on success, -1 on a NULL argument, -2 when graph_b is
+ * already flagged realized. */
+int tnn_realize_b(void *sess, void *result)
+{
+    if (sess == NULL || result == NULL) {
+        return -1;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    if (session->realized_b) {
+        return -2;
+    }
+    struct ggml_tensor *target = (struct ggml_tensor *)result;
+    ggml_build_forward_expand(session->graph_b, target);
+    session->realized_b = 1;
+    return 0;
+}
+/* Point the scheduler's allocation at graph_b (tnn_switch_a is the
+ * way back to the primary graph). The scheduler is reset and buffer
+ * slots are then allocated for graph_b's compute tensors. Persistent
+ * tensors (allocated via ctx_w) keep their stable buffer locations.
+ * Compute tensors (h, intermediates) get fresh slots that may differ
+ * from earlier cycles -- the caller MUST re-upload any compute inputs
+ * before tnn_compute*.
+ * Returns 0 on success, -1 on NULL session, -2 when graph_b has not
+ * been realized, -3 when the scheduler could not allocate. */
+int tnn_switch_b(void *sess)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return -1;
+    }
+    if (session->realized_b == 0) {
+        return -2;
+    }
+    ggml_backend_sched_reset(session->engine->sched);
+    if (ggml_backend_sched_alloc_graph(session->engine->sched, session->graph_b)) {
+        session->last_graph = 2;
+        return 0;
+    }
+    return -3;
+}
+/* Counterpart of tnn_switch_b for the primary graph: reset the
+ * scheduler, allocate s->graph on it and record last_graph = 1.
+ * Returns 0 on success, -1 on NULL session, -2 when the primary graph
+ * has not been realized, -3 when the scheduler could not allocate. */
+int tnn_switch_a(void *sess)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return -1;
+    }
+    if (session->realized == 0) {
+        return -2;
+    }
+    ggml_backend_sched_reset(session->engine->sched);
+    if (ggml_backend_sched_alloc_graph(session->engine->sched, session->graph)) {
+        session->last_graph = 1;
+        return 0;
+    }
+    return -3;
+}
+/* Execute graph_b on the scheduler (no tracing). Returns 0 on success,
+ * -1 on NULL session, -2 when graph_b has not been realized, otherwise
+ * the non-success ggml_status value cast to int. */
+int tnn_compute_b(void *sess)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return -1;
+    }
+    if (session->realized_b == 0) {
+        return -2;
+    }
+    enum ggml_status status =
+        ggml_backend_sched_graph_compute(session->engine->sched, session->graph_b);
+    if (status != GGML_STATUS_SUCCESS) {
+        return (int)status;
+    }
+    return 0;
+}
+/* Store `v` (narrowed to float) into scratch slot `idx`.
+ *
+ * An out-of-range index used to drop the write silently: a
+ * stage+upload pair working on a tensor larger than the scratch
+ * buffer would truncate at the boundary, and `tnn_upload` would then
+ * memcpy past the scratch end into the next backend buffer. That bug
+ * bit Qwen2.5-0.5B (ffn_gate = 4.36M floats > 4M scratch slots; 17.4 MB
+ * upload past a 16 MiB scratch) and produced NaN logits at L=1 with
+ * no visible error. The write is still dropped, but the FIRST
+ * out-of-range access per session now prints a one-line warning —
+ * noisy enough to catch future regressions without spamming the logs. */
+void tnn_scratch_set(void *sess, int idx, double v)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return;
+    }
+    int slot_count = TNN_SCRATCH_BYTES / (int)sizeof(float);
+    if (idx >= 0 && idx < slot_count) {
+        session->scratch[idx] = (float)v;
+        return;
+    }
+    /* Out of range: warn once per session, never write. */
+    if (session->scratch_overflow_warned) {
+        return;
+    }
+    fprintf(stderr, "[tnn] WARN: tnn_scratch_set idx=%d out of range "
+                    "(max=%d, scratch=%d bytes). Subsequent uploads "
+                    "from this scratch are corrupt — use a chunked "
+                    "uploader (e.g. tnn_upload_transposed_f64).\n",
+                    idx, slot_count, TNN_SCRATCH_BYTES);
+    session->scratch_overflow_warned = 1;
+}
+/* Fetch scratch slot `idx` as a double.
+ *
+ * An out-of-range read used to return 0.0 silently, which cannot be
+ * told apart from a legitimate zero stored in the slot. For backward
+ * compatibility the 0.0 is still returned, but a once-per-session
+ * warning now makes the failure visible. Callers that must tell a real
+ * zero from an out-of-range access should check the bounds themselves.
+ * A NULL session also yields 0.0. */
+double tnn_scratch_get(void *sess, int idx)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return 0.0;
+    }
+    int slot_count = TNN_SCRATCH_BYTES / (int)sizeof(float);
+    if (idx >= 0 && idx < slot_count) {
+        return (double)session->scratch[idx];
+    }
+    if (!session->scratch_overflow_warned) {
+        fprintf(stderr, "[tnn] WARN: tnn_scratch_get idx=%d out of range "
+                        "(max=%d). Returning 0.0 — but this is now a "
+                        "silent zero, not a real one. Check your indexing.\n",
+                        idx, slot_count);
+        session->scratch_overflow_warned = 1;
+    }
+    return 0.0;
+}
+/* Store a 32-bit integer into scratch slot `idx`. The scratch buffer
+ * is plain bytes, so i32 values share it with the floats; the caller
+ * must not mix i32 and f32 writes inside a single tensor's upload
+ * window. Out-of-range indices are dropped with the same
+ * once-per-session fprintf warning as tnn_scratch_set. */
+void tnn_scratch_set_i32(void *sess, int idx, int value)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return;
+    }
+    int slot_count = TNN_SCRATCH_BYTES / (int)sizeof(int32_t);
+    if (idx >= 0 && idx < slot_count) {
+        int32_t *as_i32 = (int32_t *)session->scratch;
+        as_i32[idx] = (int32_t)value;
+        return;
+    }
+    if (session->scratch_overflow_warned) {
+        return;
+    }
+    fprintf(stderr, "[tnn] WARN: tnn_scratch_set_i32 idx=%d out of "
+                    "range (max=%d). Use a chunked uploader.\n",
+                    idx, slot_count);
+    session->scratch_overflow_warned = 1;
+}
+/* Fetch scratch slot `idx` interpreted as a 32-bit integer. A NULL
+ * session or an out-of-range index yields 0; the first out-of-range
+ * access per session also prints a warning to stderr. */
+int tnn_scratch_get_i32(void *sess, int idx)
+{
+    tnn_session *session = (tnn_session *)sess;
+    if (session == NULL) {
+        return 0;
+    }
+    int slot_count = TNN_SCRATCH_BYTES / (int)sizeof(int32_t);
+    if (idx >= 0 && idx < slot_count) {
+        const int32_t *as_i32 = (int32_t *)session->scratch;
+        return (int)as_i32[idx];
+    }
+    if (!session->scratch_overflow_warned) {
+        fprintf(stderr, "[tnn] WARN: tnn_scratch_get_i32 idx=%d out of "
+                        "range (max=%d). Returning 0 — but this is a "
+                        "silent zero, not a real one.\n",
+                        idx, slot_count);
+        session->scratch_overflow_warned = 1;
+    }
+    return 0;
+}
+/* Copy the session scratch into `tensor`, with a bounds check: the
+ * tensor must fit in the 16 MiB scratch. Larger tensors used to cause
+ * the silent UB behind the NaN logits at L=1 on Qwen2.5-0.5B
+ * (ffn_gate = 17.4 MB > 16 MB scratch), where the memcpy past the
+ * scratch end overwrote adjacent heap. For anything that might be
+ * large, use a chunked uploader instead:
+ *   - tnn_upload_from_float_array (chunked f32 upload)
+ *   - tnn_upload_transposed_f64   (chunked transposed f64 upload)
+ * Returns 0 on success, -1 on null sess/tensor, -2 on size overflow
+ * (warned once per session, nothing uploaded). */
+int tnn_upload(void *sess, void *tensor)
+{
+    if (sess == NULL || tensor == NULL) {
+        return -1;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *dest = (struct ggml_tensor *)tensor;
+    const size_t payload = ggml_nbytes(dest);
+    if (payload <= (size_t)TNN_SCRATCH_BYTES) {
+        int64_t trace_token = tnn_trace_begin("upload");
+        ggml_backend_tensor_set(dest, session->scratch, 0, payload);
+        tnn_trace_end("upload", trace_token);
+        return 0;
+    }
+    if (!session->scratch_overflow_warned) {
+        fprintf(stderr, "[tnn] WARN: tnn_upload tensor=%zu bytes exceeds "
+                        "scratch=%d bytes. Skipping upload (was: silent UB). "
+                        "Use tnn_upload_from_float_array or "
+                        "tnn_upload_transposed_f64 for tensors > 16 MiB.\n",
+                        payload, TNN_SCRATCH_BYTES);
+        session->scratch_overflow_warned = 1;
+    }
+    return -2;
+}
+/* Copy `tensor`'s bytes into the session scratch. Carries the same
+ * bounds check as tnn_upload — downloading an oversized tensor would
+ * memcpy past the scratch end into adjacent heap.
+ * Returns 0 on success, -1 on null sess/tensor, -2 when the tensor is
+ * larger than the scratch (warned once per session, nothing copied). */
+int tnn_download(void *sess, void *tensor)
+{
+    if (sess == NULL || tensor == NULL) {
+        return -1;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *source = (struct ggml_tensor *)tensor;
+    const size_t payload = ggml_nbytes(source);
+    if (payload <= (size_t)TNN_SCRATCH_BYTES) {
+        int64_t trace_token = tnn_trace_begin("download");
+        ggml_backend_tensor_get(source, session->scratch, 0, payload);
+        tnn_trace_end("download", trace_token);
+        return 0;
+    }
+    if (!session->scratch_overflow_warned) {
+        fprintf(stderr, "[tnn] WARN: tnn_download tensor=%zu bytes exceeds "
+                        "scratch=%d bytes. Skipping download (was: silent UB). "
+                        "Use tnn_download_to_f64_array for tensors > 16 MiB.\n",
+                        payload, TNN_SCRATCH_BYTES);
+        session->scratch_overflow_warned = 1;
+    }
+    return -2;
+}
+/* Transpose-and-upload a row-major f64 Mat into a ggml f32 tensor of
+ * shape ne=[br, bc], in chunked passes so that it also works for
+ * tensors larger than the 16 MiB scratch buffer.
+ *
+ * Source layout: src[i*bc + j] = (i, j) of an (br × bc) row-major Mat.
+ * Destination ggml layout: T[ne0=k0, ne1=k1] sits at float position
+ * k1*br + k0. We want T[i, j] = src[i, j] (the transpose semantics
+ * live in the *consumer* — ggml_mul_mat treats (br, bc) as (K, M) where
+ * the K axis is contracted; we get B^T · h that way).
+ *
+ * Chunking: `cols_per_chunk` = scratch_slots / br columns are staged
+ * at a time. For each chunk [j_start, j_end) of columns, src[i, j] is
+ * staged to scratch[(j - j_start)*br + i] for i ∈ [0, br) and
+ * j ∈ [j_start, j_end); that contiguous slice is then uploaded into
+ * the tensor at byte offset j_start*br*sizeof(float).
+ *
+ * Same shape as tnn_upload_from_float_array's chunking, but for the
+ * transposed-input case used by stage_transposed_and_upload. Fixes the
+ * scratch-overflow bug that produced garbage uploads for Qwen's
+ * ffn_gate / ffn_up / ffn_down (each ~17 MB, scratch is 16 MB).
+ *
+ * Returns 0 on success, -1 on a NULL argument or non-positive br/bc,
+ * -2 when br*bc floats exceed the tensor's byte size, -3 when a single
+ * column (br floats) does not fit in the scratch. */
+int tnn_upload_transposed_f64(void *sess, void *tensor,
+                              const double *src, int br, int bc)
+{
+    if (sess == NULL || tensor == NULL || src == NULL) return -1;
+    if (br <= 0 || bc <= 0) return -1;
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *dest = (struct ggml_tensor *)tensor;
+    const size_t rows = (size_t)br;
+    const size_t cols = (size_t)bc;
+    if (rows * cols * sizeof(float) > ggml_nbytes(dest)) return -2;
+    const int scratch_slots = TNN_SCRATCH_BYTES / (int)sizeof(float);
+    const int chunk_cols = scratch_slots / br;
+    if (chunk_cols <= 0) return -3;   /* br > scratch — wider than ~4M */
+    for (int first = 0; first < bc; ) {
+        int limit = first + chunk_cols;
+        if (limit > bc) limit = bc;
+        /* Stage columns [first, limit) column-major into the scratch. */
+        for (int col = first; col < limit; ++col) {
+            float *staged = session->scratch + (size_t)(col - first) * rows;
+            for (int row = 0; row < br; ++row) {
+                staged[row] = (float)src[(size_t)row * cols + (size_t)col];
+            }
+        }
+        const size_t dest_offset = (size_t)first * rows * sizeof(float);
+        const size_t slice_bytes = (size_t)(limit - first) * rows * sizeof(float);
+        ggml_backend_tensor_set(dest, session->scratch, dest_offset, slice_bytes);
+        first = limit;
+    }
+    return 0;
+}
+/* Upload `n` doubles from `data` into `tensor` as f32, going through
+ * the session scratch in scratch-sized chunks.
+ * Returns 0 on success, -1 on a NULL argument, -2 when n floats do not
+ * fit in the tensor (same convention as tnn_download_to_f64_array;
+ * nothing is uploaded in that case). */
+int tnn_upload_from_float_array(void *sess, void *tensor, const double *data, size_t n)
+{
+    if (!sess || !tensor || !data) return -1;
+    tnn_session *s = (tnn_session *)sess;
+    struct ggml_tensor *t = (struct ggml_tensor *)tensor;
+    /* Reject writes that would run past the end of the tensor instead
+     * of handing them to the backend. Compared as element counts so
+     * the check itself cannot overflow. Done before the trace starts
+     * so begin/end stay paired. */
+    if (n > ggml_nbytes(t) / sizeof(float)) return -2;
+    int64_t _trace = tnn_trace_begin("upload_from_float_array");
+    const size_t chunk_floats = TNN_SCRATCH_BYTES / sizeof(float);
+    /* Chunked f64 → f32 conversion into scratch, then ggml_backend_tensor_set
+     * per chunk at the right byte offset. Lets us upload tensors larger
+     * than scratch (e.g. distilgpt2's 38.6 M-element token_embd) without
+     * growing the scratch buffer for everyone. */
+    size_t off = 0;
+    while (off < n) {
+        size_t this_chunk = (n - off) < chunk_floats ? (n - off) : chunk_floats;
+        for (size_t i = 0; i < this_chunk; ++i) {
+            s->scratch[i] = (float)data[off + i];
+        }
+        ggml_backend_tensor_set(t, s->scratch,
+                                  off * sizeof(float),
+                                  this_chunk * sizeof(float));
+        off += this_chunk;
+    }
+    tnn_trace_end("upload_from_float_array", _trace);
+    return 0;
+}
+/* Counterpart of tnn_upload_from_float_array: copy a tensor's f32
+ * contents back into a host f64 buffer, one scratch-sized chunk at a
+ * time. This makes a full Mat round-trip possible for weights loaded
+ * via the direct GGUF→FFI path — required by the user-stated rule that
+ * the API mustn't paint into an inference-only corner.
+ * Returns 0 on success, -1 on a NULL argument, -2 when `n` exceeds the
+ * tensor's element count. */
+int tnn_download_to_f64_array(void *sess, void *tensor, double *dst, size_t n)
+{
+    if (sess == NULL || tensor == NULL || dst == NULL) {
+        return -1;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    struct ggml_tensor *source = (struct ggml_tensor *)tensor;
+    size_t element_count = ggml_nelements(source);
+    if (n > element_count) {
+        return -2;
+    }
+    const size_t max_batch = TNN_SCRATCH_BYTES / sizeof(float);
+    for (size_t done = 0; done < n; ) {
+        size_t batch = n - done;
+        if (batch >= max_batch) {
+            batch = max_batch;
+        }
+        /* Pull the next slice into scratch, then widen it to double. */
+        ggml_backend_tensor_get(source, session->scratch,
+                                done * sizeof(float),
+                                batch * sizeof(float));
+        double *out = dst + done;
+        for (size_t k = 0; k < batch; ++k) {
+            out[k] = (double)session->scratch[k];
+        }
+        done += batch;
+    }
+    return 0;
+}
+/* Upload `n` integers from `data` into `tensor` as 32-bit values in a
+ * single pass through the session scratch (no chunking).
+ * Returns 0 on success, -1 on a NULL argument, -2 when n values do not
+ * fit in the scratch, -3 when they do not fit in the tensor (nothing
+ * is uploaded in either case). */
+int tnn_upload_from_int_array(void *sess, void *tensor, const long *data, size_t n)
+{
+    if (!sess || !tensor || !data) return -1;
+    tnn_session *s = (tnn_session *)sess;
+    struct ggml_tensor *t = (struct ggml_tensor *)tensor;
+    size_t max_n = TNN_SCRATCH_BYTES / sizeof(int32_t);
+    if (n > max_n) return -2;
+    /* Reject writes that would run past the end of the tensor instead
+     * of handing them to the backend. */
+    if (n > ggml_nbytes(t) / sizeof(int32_t)) return -3;
+    int64_t _trace = tnn_trace_begin("upload_from_int_array");
+    int32_t *dst = (int32_t *)s->scratch;
+    /* i64 → i32 narrowing. Spinel's :int_array is `const int64_t *`; ggml's
+     * GGML_TYPE_I32 row-index tensors are 32-bit. Caller responsibility
+     * not to pass out-of-range indices (vocab fits easily in int32). */
+    for (size_t i = 0; i < n; ++i) dst[i] = (int32_t)data[i];
+    ggml_backend_tensor_set(t, dst, 0, n * sizeof(int32_t));
+    tnn_trace_end("upload_from_int_array", _trace);
+    return 0;
+}
+/* Scratch-buffer stats. These helpers expect the caller to have just
+ * run tnn_download(sess, t), leaving a tensor's f32 contents in the
+ * session's scratch buffer. Each one reduces over the first `n` floats
+ * entirely in C, so the Ruby side pays one FFI call per statistic
+ * rather than one per element. Used by the trace-tap diagnostic path;
+ * not on any production hot path. */
+/* Smallest of the first `n` scratch floats, widened to double. Returns
+ * 0.0 when `sess` is NULL or `n` is not positive. */
+double tnn_scratch_min_f32(void *sess, int n)
+{
+    if (sess == NULL || n < 1) {
+        return 0.0;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    float lowest = session->scratch[0];
+    for (int k = 1; k < n; ++k) {
+        if (session->scratch[k] < lowest) {
+            lowest = session->scratch[k];
+        }
+    }
+    return (double)lowest;
+}
+/* Largest of the first `n` floats in the session scratch buffer,
+ * widened to double. Returns 0.0 when `sess` is NULL or `n` is not
+ * positive. */
+double tnn_scratch_max_f32(void *sess, int n)
+{
+    if (sess == NULL || n < 1) {
+        return 0.0;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    float highest = session->scratch[0];
+    for (int k = 1; k < n; ++k) {
+        if (session->scratch[k] > highest) {
+            highest = session->scratch[k];
+        }
+    }
+    return (double)highest;
+}
+/* Sum of absolute values over the first `n` floats in the session
+ * scratch buffer, accumulated in double. Returns 0.0 when `sess` is
+ * NULL or `n` is not positive. */
+double tnn_scratch_sum_abs_f32(void *sess, int n)
+{
+    if (sess == NULL || n < 1) {
+        return 0.0;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    double total = 0.0;
+    for (int k = 0; k < n; ++k) {
+        const float v = session->scratch[k];
+        /* Flip the sign after widening so the negation happens in double. */
+        if (v < 0.0f) {
+            total += -(double)v;
+        } else {
+            total += (double)v;
+        }
+    }
+    return total;
+}
+/* Sum of squares over the first `n` floats in the session scratch
+ * buffer, accumulated in double. For an L2 norm, take sqrt() of the
+ * result on the Ruby side. Returns 0.0 when `sess` is NULL or `n` is
+ * not positive. */
+double tnn_scratch_sum_sq_f32(void *sess, int n)
+{
+    if (sess == NULL || n < 1) {
+        return 0.0;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    double total = 0.0;
+    for (int k = 0; k < n; ++k) {
+        /* Widen first, then square, so the product is computed in double. */
+        const double widened = (double)session->scratch[k];
+        total += widened * widened;
+    }
+    return total;
+}
+/* Plain sum over the first `n` floats in the session scratch buffer,
+ * accumulated in double (divide by n on the caller side for the mean).
+ * Returns 0.0 when `sess` is NULL or `n` is not positive. */
+double tnn_scratch_sum_f32(void *sess, int n)
+{
+    if (sess == NULL || n < 1) {
+        return 0.0;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    double total = 0.0;
+    for (int k = 0; k < n; ++k) {
+        total += (double)session->scratch[k];
+    }
+    return total;
+}
+/* Count the elements among the first `n` scratch floats that are NaN
+ * or have a magnitude above 1e30. NaN is detected with v != v, which
+ * holds only for NaN. The 1e30 cutoff is a deliberately conservative
+ * stand-in for an infinity test (the largest finite f32 is roughly
+ * 3.4e38), so extremely large finite values are counted as well.
+ * Returns 0 when `sess` is NULL or `n` is not positive. */
+int tnn_scratch_nan_count_f32(void *sess, int n)
+{
+    if (sess == NULL || n < 1) {
+        return 0;
+    }
+    tnn_session *session = (tnn_session *)sess;
+    int flagged = 0;
+    for (int k = 0; k < n; ++k) {
+        const float v = session->scratch[k];
+        const float magnitude = (v < 0.0f) ? -v : v;
+        const int is_nan = (v != v);
+        if (is_nan || magnitude > 1.0e30f) {
+            flagged++;
+        }
+    }
+    return flagged;
+}
+/* Per-dimension extent accessors: tnn_tensor_neK returns ne[K] of the
+ * tensor cast to int, or 0 when the handle is NULL. */
+int tnn_tensor_ne0(void *t) {
+    if (!t) return 0;
+    const struct ggml_tensor *tensor = (const struct ggml_tensor *)t;
+    return (int)tensor->ne[0];
+}
+int tnn_tensor_ne1(void *t) {
+    if (!t) return 0;
+    const struct ggml_tensor *tensor = (const struct ggml_tensor *)t;
+    return (int)tensor->ne[1];
+}
+int tnn_tensor_ne2(void *t) {
+    if (!t) return 0;
+    const struct ggml_tensor *tensor = (const struct ggml_tensor *)t;
+    return (int)tensor->ne[2];
+}
+int tnn_tensor_ne3(void *t) {
+    if (!t) return 0;
+    const struct ggml_tensor *tensor = (const struct ggml_tensor *)t;
+    return (int)tensor->ne[3];
+}
+/* Size of the tensor in bytes as reported by ggml_nbytes; 0 for a NULL
+ * handle. */
+size_t tnn_tensor_nbytes(void *t) {
+    if (!t) return 0;
+    return ggml_nbytes((struct ggml_tensor *)t);
+}
+/* Element count as reported by ggml_nelements, cast to int; 0 for a
+ * NULL handle. */
+int tnn_tensor_nelements(void *t) {
+    if (!t) return 0;
+    return (int)ggml_nelements((struct ggml_tensor *)t);
+}
+/* Introspection primitives for kv.describe_flow (tao#kv-describe-flow).
+ * Every one of them is a read-only walk over the built compute graph +
+ * leaf set, cheap enough to call ad-hoc once a graph has been
+ * realized. */
+/* The tensor's name field, or the empty string for a NULL handle. The
+ * returned pointer refers to storage inside the tensor itself. */
+const char *tnn_tensor_name(void *t) {
+    if (!t) return "";
+    const struct ggml_tensor *tensor = (const struct ggml_tensor *)t;
+    return tensor->name;
+}
+/* The tensor's ggml_type enum value as an int: 0=F32, 8=Q8_0, etc. See
+ * vendor/ggml/include/ggml.h. A NULL handle also yields 0, so callers
+ * cannot tell it apart from F32 by the return value alone. */
+int tnn_tensor_dtype(void *t) {
+    if (!t) return 0;
+    const struct ggml_tensor *tensor = (const struct ggml_tensor *)t;
+    return (int)tensor->type;
+}
+/* The tensor's flags field as an int — a bitmask of
+ * GGML_TENSOR_FLAG_INPUT(1) | OUTPUT(2) | PARAM(4) | LOSS(8) | COMPUTE(16).
+ * Returns 0 for a NULL handle. */
+int tnn_tensor_flags(void *t) {
+    if (!t) return 0;
+    const struct ggml_tensor *tensor = (const struct ggml_tensor *)t;
+    return (int)tensor->flags;
+}
+/* Op id of the tensor (ggml_op enum) as an int — 0=NONE, then MUL_MAT,
+ * ADD, …. Lets the description label compute nodes by op kind. Returns
+ * 0 for a NULL handle. */
+int tnn_tensor_op(void *t) {
+    if (!t) return 0;
+    const struct ggml_tensor *tensor = (const struct ggml_tensor *)t;
+    return (int)tensor->op;
+}
+/* Human-readable name of the tensor's op, as returned by ggml_op_name;
+ * the empty string for a NULL handle. */
+const char *tnn_tensor_op_name(void *t) {
+    const struct ggml_tensor *node = (const struct ggml_tensor *)t;
+    return node ? ggml_op_name(node->op) : "";
+}
+/* Source-tensor pointer src[i] of an op node. Returns NULL for a NULL
+ * handle, for an index outside [0, GGML_MAX_SRC), and — because unused
+ * slots hold NULL — past the node's last source. ggml caps at
+ * GGML_MAX_SRC=10 — typical ops use 2 srcs, opt_step_adamw uses 5, no
+ * current op uses more than 10. */
+void *tnn_tensor_src(void *t, int i) {
+    const int in_range = (i >= 0) && (i < GGML_MAX_SRC);
+    if (!t || !in_range) {
+        return NULL;
+    }
+    struct ggml_tensor *node = (struct ggml_tensor *)t;
+    return (void *)node->src[i];
+}
+/* Graph walk: number of compute nodes in the session's primary graph
+ * (graph_a) — the one populated by tnn_build_forward_only or
+ * tnn_realize. Use tnn_graph_b_n_nodes / tnn_graph_b_node for the
+ * backward graph when needed. Returns 0 when the session handle is
+ * NULL or no graph is present. */
+int tnn_graph_n_nodes(void *sess) {
+    tnn_session *session = (tnn_session *)sess;
+    if (!session || !session->graph) {
+        return 0;
+    }
+    return ggml_graph_n_nodes(session->graph);
+}
+/* Indexed accessor for the session graph: the i-th compute node as an
+ * opaque tensor handle. Returns NULL when the session handle is NULL,
+ * no graph is present, or `i` is outside [0, node count). */
+void *tnn_graph_node(void *sess, int i) {
+    tnn_session *session = (tnn_session *)sess;
+    if (!session || i < 0 || !session->graph) {
+        return NULL;
+    }
+    const int node_count = ggml_graph_n_nodes(session->graph);
+    return (i < node_count) ? (void *)ggml_graph_node(session->graph, i) : NULL;
+}
+/* No tnn_graph_n_leafs / tnn_graph_leaf: ggml's cgraph leafs[] is
+ * private (no public accessor). The describe_flow walker discovers
+ * leaves from the Ruby side by scanning node srcs that aren't
+ * themselves nodes — same set, just computed differently. */
+/* tao#gguf-checkpoint-writer thin wrappers over ggml's gguf writer
+ * API. The lifecycle is:
+ *   ctx = tnn_gguf_w_init()
+ *   tnn_gguf_w_set_str/u32/f32/bool(ctx, key, value) — metadata
+ *   tnn_tensor_set_name(t, "...")                  — name each param
+ *   tnn_gguf_w_add_tensor(ctx, t)                  — record + data ptr
+ *   tnn_gguf_w_finalize(ctx, path)                 — write file via gguf_write_to_file
+ *   tnn_gguf_w_free(ctx)
+ *
+ * `tnn_gguf_w_add_tensor` reads the tensor's `data` field; for the CPU
+ * backend that's the host pointer in the persistent backend buffer.
+ * For the CUDA backend a download step (not implemented here — see
+ * toy#gguf-checkpoint-writer-cuda) would be required. */
+/* Assign `name` to the tensor through ggml_set_name. Does nothing when
+ * either argument is NULL. */
+void tnn_tensor_set_name(void *t, const char *name) {
+    if (t != NULL && name != NULL) {
+        struct ggml_tensor *tensor = (struct ggml_tensor *)t;
+        ggml_set_name(tensor, name);
+    }
+}
+/* Create an empty gguf writer context via gguf_init_empty and hand it
+ * back as an opaque pointer for the other tnn_gguf_w_* calls. */
+void *tnn_gguf_w_init(void) {
+    struct gguf_context *writer = gguf_init_empty();
+    return (void *)writer;
+}
+/* Store a string metadata value under `key` in the writer context.
+ * Does nothing when any argument is NULL. */
+void tnn_gguf_w_set_str(void *ctx, const char *key, const char *val) {
+    if (ctx != NULL && key != NULL && val != NULL) {
+        struct gguf_context *writer = (struct gguf_context *)ctx;
+        gguf_set_val_str(writer, key, val);
+    }
+}
+/* Store `val`, converted to uint32_t, as a u32 metadata value under
+ * `key`. Does nothing when `ctx` or `key` is NULL. */
+void tnn_gguf_w_set_u32(void *ctx, const char *key, int val) {
+    if (ctx != NULL && key != NULL) {
+        struct gguf_context *writer = (struct gguf_context *)ctx;
+        gguf_set_val_u32(writer, key, (uint32_t)val);
+    }
+}
+/* Store `val`, narrowed from double to float, as an f32 metadata value
+ * under `key`. Does nothing when `ctx` or `key` is NULL. */
+void tnn_gguf_w_set_f32(void *ctx, const char *key, double val) {
+    if (ctx != NULL && key != NULL) {
+        struct gguf_context *writer = (struct gguf_context *)ctx;
+        gguf_set_val_f32(writer, key, (float)val);
+    }
+}
+/* Store a boolean metadata value under `key`; any non-zero `val` is
+ * written as true. Does nothing when `ctx` or `key` is NULL. */
+void tnn_gguf_w_set_bool(void *ctx, const char *key, int val) {
+    if (ctx != NULL && key != NULL) {
+        struct gguf_context *writer = (struct gguf_context *)ctx;
+        gguf_set_val_bool(writer, key, val != 0);
+    }
+}
+/* Register tensor `t` with the writer context through gguf_add_tensor.
+ * Does nothing when either argument is NULL. */
+void tnn_gguf_w_add_tensor(void *ctx, void *t) {
+    if (ctx != NULL && t != NULL) {
+        struct gguf_context *writer = (struct gguf_context *)ctx;
+        const struct ggml_tensor *tensor = (const struct ggml_tensor *)t;
+        gguf_add_tensor(writer, tensor);
+    }
+}
+/* Write the writer context — metadata and tensor data (only_meta is
+ * false) — to `path` with gguf_write_to_file. Returns 0 on success, -1
+ * on null args, -2 on file write failure. */
+int tnn_gguf_w_finalize(void *ctx, const char *path) {
+    if (ctx == NULL || path == NULL) {
+        return -1;
+    }
+    const struct gguf_context *writer = (const struct gguf_context *)ctx;
+    if (!gguf_write_to_file(writer, path, /*only_meta=*/ false)) {
+        return -2;
+    }
+    return 0;
+}
+/* Release a writer context with gguf_free. A NULL context is ignored. */
+void tnn_gguf_w_free(void *ctx) {
+    if (ctx != NULL) {
+        struct gguf_context *writer = (struct gguf_context *)ctx;
+        gguf_free(writer);
+    }
+}
+/* Atomic symlink replace (sym_path → target). Used by the checkpoint
+ * writer to maintain `weights/latest`. Returns 0 on success, -1 on
+ * failure.
+ *
+ * The new link is first created under a temporary sibling name and
+ * then rename()d over `sym_path`. rename() operates on the link itself
+ * and replaces an existing destination in one step, so readers see
+ * either the previous link or the new one, never a missing path. If
+ * the temporary-name route cannot be used (allocation failure, name
+ * too long, ...), the helper falls back to unlink + symlink, which has
+ * a brief window with no link at all. */
+int tnn_filesystem_symlink(const char *target, const char *sym_path) {
+    if (!target || !sym_path) return -1;
+    int rc = -1;
+    /* Sibling name keeps the rename on one filesystem; the pid suffix
+     * keeps concurrent writer processes off each other's temp link. */
+    size_t tmp_len = strlen(sym_path) + 32;
+    char *tmp_path = malloc(tmp_len);
+    if (tmp_path) {
+        int written = snprintf(tmp_path, tmp_len, "%s.tmp.%ld",
+                               sym_path, (long)getpid());
+        if (written > 0 && (size_t)written < tmp_len) {
+            unlink(tmp_path);         /* stale leftover; absence is fine */
+            if (symlink(target, tmp_path) == 0) {
+                if (rename(tmp_path, sym_path) == 0) {
+                    rc = 0;
+                } else {
+                    unlink(tmp_path); /* don't leave the temp link behind */
+                }
+            }
+        }
+        free(tmp_path);
+    }
+    if (rc != 0) {
+        /* Non-atomic fallback. */
+        unlink(sym_path);             /* may not exist; ignore ENOENT */
+        rc = symlink(target, sym_path) == 0 ? 0 : -1;
+    }
+    return rc;
+}
+/* Idempotent single-level mkdir: creates `path` with mode 0755 if it is
+ * missing. Returns 0 when the directory was created or already exists
+ * as a directory, -1 on a NULL path or any other failure — including
+ * when `path` exists but is not a directory. Parent directories are
+ * NOT created: the caller is responsible for them (typically
+ * TAO_RUN_DIR already exists because Tao created it). */
+int tnn_filesystem_mkdir(const char *path) {
+    if (!path) return -1;
+    if (mkdir(path, 0755) == 0) return 0;
+    if (errno != EEXIST) return -1;
+    /* mkdir reports EEXIST for any existing entry, including a regular
+     * file or a dangling symlink; only an actual directory (stat
+     * follows symlinks) counts as success. */
+    struct stat st;
+    if (stat(path, &st) != 0) return -1;
+    return S_ISDIR(st.st_mode) ? 0 : -1;
+}