tensorgrad 0.0.15 → 0.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,39 +1,2208 @@
1
- // Public surface. Bulb code imports from here.
2
- //
3
- // Phase 1 exports: IR types, op surface, trace driver. Autograd (Phase 2) and
4
- // codegen / compile() (Phase 3+) come later.
5
- export { ShapeError } from './shape.js';
6
- export { trace, traceInto, paramInput, tensorInput, stateInput } from './trace.js';
7
- export { capture } from './capture.js';
8
- export {
9
- // Element-wise arithmetic. The binops accept Tensor or JS-number for the second arg.
10
- add, sub, mul, div,
11
- // Element-wise unary
12
- sqrt, rsqrt, log, exp, relu,
13
- // Comparisons + select
14
- less, greater, where,
15
- // Reductions over the last axis (other axes via reshape/transpose first)
16
- meanLast, sumLast, sumAll,
17
- // Shape ops
18
- reshape, transpose, swapAxes,
19
- // Linear algebra
20
- matmul, matmulBatched,
21
- // Indexing / casting
22
- oneHot, arange, embedding,
23
- // ML primitives — fused for the transformer
24
- softmaxCausalLast, logSoftmaxLast, whereCausal,
25
- // Slicing
26
- sliceLastRange, } from './ops.js';
27
- // Note: addScalar/mulScalar/broadcastTo/sumToShape/constScalar/reluGrad/adam_update_*
28
- // are autograd/optimizer building blocks. They live in ops.ts (so grad.ts and
29
- // adam.ts can import them) but aren't part of the public API — `add`/`mul`
30
- // overload on JS numbers, `where` subsumes the rest.
31
- export { appendGrad } from './grad.js';
32
- export { appendAdam } from './adam.js';
33
- export { planBuffers } from './buffers.js';
34
- export { emitKernels } from './codegen.js';
35
- export { createRuntime, createForwardRuntime, Captures } from './runtime.js';
36
- export { compile, compileToIR, compileModule, compileForward, } from './compile.js';
37
- export { Module, materializeParams } from './module.js';
38
- export * as nn from './nn.js';
39
- //# sourceMappingURL=index.js.map
1
// esbuild module helpers: expose each entry of `all` on `target` as an
// enumerable getter (so re-exports stay live bindings).
var __defProp = Object.defineProperty;
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { enumerable: true, get: all[name] });
  }
};
6
+
7
+ // src/ir.ts
8
/** Create an empty trace graph: op list, tensor table, output ids, named captures. */
function makeGraph() {
  return { ops: [], tensors: [], outputs: [], captures: /* @__PURE__ */ new Map() };
}

/**
 * Append a tensor record to the graph and return it. `source` is the index of
 * the producing op (see addOp); `site` is the captured call site for errors.
 */
function addTensor(g, shape, dtype, source, site) {
  const t = { id: g.tensors.length, shape, dtype, source, site };
  g.tensors.push(t);
  return t;
}

/**
 * Append an op node that produces exactly one fresh output tensor; returns
 * that tensor. `fields` carries the op-specific payload (input ids, scalars).
 */
function addOp(g, kind, shape, dtype, site, fields) {
  // The new tensor's `source` is this op's index (current length, pre-push).
  const out = addTensor(g, shape, dtype, g.ops.length, site);
  g.ops.push({ kind, out: out.id, ...fields });
  return out;
}
24
/** Snapshot the op name plus the current JS stack so errors can point at user code. */
function captureSite(opName) {
  return { opName, stack: new Error().stack ?? "" };
}

/**
 * Render a site as "[opName]" followed by up to 3 trimmed stack frames,
 * skipping frames that come from tensorgrad's own src/ tree (either path style).
 */
function formatSite(site) {
  const frames = [];
  const stackLines = site.stack.split("\n");
  for (const raw of stackLines.slice(1)) {
    const internal = raw.includes("/tensorgrad/src/") || raw.includes("\\tensorgrad\\src\\");
    if (internal) continue;
    frames.push(raw.trim());
    if (frames.length >= 3) break;
  }
  if (frames.length === 0) return `[${site.opName}] (no user frame found)`;
  return `[${site.opName}]
 ${frames.join("\n ")}`;
}
40
+
41
+ // src/shape.ts
42
/**
 * Error raised for shape/dtype violations. When a call site is provided the
 * formatted site is appended to the message.
 * NOTE(review): the continuation whitespace before "at" was reconstructed from
 * a diff rendering that strips leading spaces — verify against upstream source.
 */
var ShapeError = class extends Error {
  constructor(message, site) {
    super(site ? `${message}
 at ${formatSite(site)}` : message);
    this.name = "ShapeError";
  }
};

/** Throw a ShapeError for `message` at `site`; never returns. */
function fail(message, site) {
  throw new ShapeError(message, site);
}
53
/** True when two shapes have the same rank and identical dims. */
function shapesEqual(a, b) {
  return a.length === b.length && a.every((dim, i) => dim === b[i]);
}

/** Total element count of a shape (1 for rank-0). */
function shapeSize(shape) {
  return shape.reduce((acc, dim) => acc * dim, 1);
}

/** Human-readable rendering, e.g. "[2, 3]", for error messages. */
function showShape(shape) {
  return `[${shape.join(", ")}]`;
}
66
/**
 * Trailing (right-aligned) broadcast of two shapes. Missing leading axes act
 * as size 1 and a size-1 axis broadcasts against any size. Returns the
 * broadcast shape, or null when some aligned axis pair is incompatible.
 */
function broadcastTrailing(a, b) {
  const rank = Math.max(a.length, b.length);
  const out = new Array(rank);
  // Walk from the last axis backwards; k is the distance from the end.
  for (let k = 1; k <= rank; k++) {
    const av = k <= a.length ? a[a.length - k] : 1;
    const bv = k <= b.length ? b[b.length - k] : 1;
    let dim;
    if (av === bv || bv === 1) dim = av;
    else if (av === 1) dim = bv;
    else return null;
    out[rank - k] = dim;
  }
  return out;
}
81
/** Broadcast two operand shapes for an element-wise binop; fails loudly on mismatch. */
function inferElementwiseBinop(opName, aShape, bShape, site) {
  const broadcast = broadcastTrailing(aShape, bShape);
  if (broadcast) return broadcast;
  fail(
    `${opName}: incompatible shapes ${showShape(aShape)} and ${showShape(bShape)}. Trailing-suffix broadcasting only \u2014 the smaller shape must be a suffix of the larger, with size-1 axes broadcasting to any size.`,
    site
  );
  return broadcast;
}

/** Element-wise unary ops keep their input shape. */
function inferUnary(_opName, aShape, _site) {
  return aShape;
}

/** Mean over the last axis keeps that axis with size 1 (keepdim-style). */
function inferMeanLast(opName, aShape, site) {
  if (aShape.length === 0) fail(`${opName}: cannot reduce a 0-d tensor`, site);
  const kept = aShape.slice(0, -1);
  kept.push(1);
  return kept;
}

/** Sum over the last axis drops that axis entirely. */
function inferSumLast(opName, aShape, site) {
  if (aShape.length === 0) fail(`${opName}: cannot reduce a 0-d tensor`, site);
  const out = aShape.slice();
  out.pop();
  return out;
}
102
/**
 * Validate a reshape target and resolve an optional single -1 wildcard dim.
 * Fails when more than one -1 appears, when any other dim is non-positive, or
 * when the element counts cannot be made to match.
 */
function inferReshape(opName, aShape, newShape, site) {
  let wildcard = -1;       // index of the single allowed -1, or -1 if none
  let fixedProduct = 1;    // product of all explicitly given dims
  newShape.forEach((dim, i) => {
    if (dim === -1) {
      if (wildcard !== -1) fail(`${opName}: at most one -1 dim allowed in newShape ${showShape(newShape)}`, site);
      wildcard = i;
    } else if (dim <= 0) {
      fail(`${opName}: invalid dim ${dim} in newShape ${showShape(newShape)}`, site);
    } else {
      fixedProduct *= dim;
    }
  });
  const totalIn = shapeSize(aShape);
  const resolved = [...newShape];
  if (wildcard === -1) {
    if (fixedProduct !== totalIn) {
      fail(`${opName}: size mismatch \u2014 input ${showShape(aShape)} has ${totalIn} elements but newShape ${showShape(newShape)} has ${fixedProduct}`, site);
    }
    return resolved;
  }
  if (totalIn % fixedProduct !== 0) {
    fail(`${opName}: cannot reshape ${showShape(aShape)} (size ${totalIn}) to ${showShape(newShape)} \u2014 known dims multiply to ${fixedProduct}`, site);
  }
  resolved[wildcard] = totalIn / fixedProduct;
  return resolved;
}
128
/** Validate a permutation (right length, in-range, no duplicates) and permute the shape. */
function inferTranspose(opName, aShape, perm, site) {
  const rank = aShape.length;
  if (perm.length !== rank) {
    fail(`${opName}: perm length ${perm.length} must equal input rank ${rank}`, site);
  }
  const seen = new Set();
  for (const axis of perm) {
    if (axis < 0 || axis >= rank) fail(`${opName}: perm index ${axis} out of range for rank ${rank}`, site);
    if (seen.has(axis)) fail(`${opName}: perm has duplicate index ${axis}`, site);
    seen.add(axis);
  }
  const out = [];
  for (const axis of perm) out.push(aShape[axis]);
  return out;
}
140
/**
 * [..., M, K] x [K, N] -> [..., M, N]. The lhs may carry leading batch dims;
 * the rhs must be a plain 2-d matrix (batched rhs goes through matmulBatched).
 */
function inferMatmul(opName, aShape, bShape, site) {
  if (aShape.length < 2) fail(`${opName}: lhs must have rank >= 2, got ${showShape(aShape)}`, site);
  if (bShape.length !== 2) fail(`${opName}: rhs must have rank 2, got ${showShape(bShape)} \u2014 use matmulBatched for batched rhs`, site);
  const [Kb, N] = bShape;
  const M = aShape[aShape.length - 2];
  const Ka = aShape[aShape.length - 1];
  if (Ka !== Kb) fail(`${opName}: inner dims don't match \u2014 ${showShape(aShape)} \xB7 ${showShape(bShape)} (last axis of lhs = ${Ka}, first axis of rhs = ${Kb})`, site);
  return [...aShape.slice(0, -2), M, N];
}
150
/** [B..., M, K] x [B..., K, N] -> [B..., M, N]; ranks and batch dims must match exactly. */
function inferMatmulBatched(opName, aShape, bShape, site) {
  if (aShape.length < 2 || bShape.length < 2) {
    fail(`${opName}: both inputs must have rank >= 2, got ${showShape(aShape)} and ${showShape(bShape)}`, site);
  }
  if (aShape.length !== bShape.length) {
    fail(`${opName}: ranks must match (got ${aShape.length} vs ${bShape.length}). Reshape if you need different batch dims.`, site);
  }
  const batch = aShape.slice(0, -2);
  const otherBatch = bShape.slice(0, -2);
  batch.forEach((dim, i) => {
    if (dim !== otherBatch[i]) {
      fail(`${opName}: batch dims must match \u2014 ${showShape(aShape)} vs ${showShape(bShape)}`, site);
    }
  });
  const M = aShape[aShape.length - 2];
  const Ka = aShape[aShape.length - 1];
  const Kb = bShape[bShape.length - 2];
  const N = bShape[bShape.length - 1];
  if (Ka !== Kb) fail(`${opName}: inner dims don't match \u2014 last axis of lhs = ${Ka}, second-to-last of rhs = ${Kb}`, site);
  return [...batch, M, N];
}
171
/** one_hot appends a new trailing axis of size `depth`. */
function inferOneHot(opName, indicesShape, depth, site) {
  if (depth <= 0) fail(`${opName}: depth must be positive, got ${depth}`, site);
  return [...indicesShape, depth];
}

/** Causal-mask ops need a square [..., T, T] input; the shape passes through. */
function inferWhereCausal(opName, aShape, site) {
  if (aShape.length < 2) fail(`${opName}: requires rank >= 2, got ${showShape(aShape)}`, site);
  const rows = aShape[aShape.length - 2];
  const cols = aShape[aShape.length - 1];
  if (rows !== cols) fail(`${opName}: last two axes must be equal (square mask), got ${showShape(aShape)}`, site);
  return aShape;
}

/** Half-open slice [start, end) along the last axis; the range must be non-empty. */
function inferSliceLastRange(opName, aShape, start, end, site) {
  if (aShape.length === 0) fail(`${opName}: cannot slice 0-d tensor`, site);
  const lastDim = aShape[aShape.length - 1];
  const rangeOk = start >= 0 && end <= lastDim && start < end;
  if (!rangeOk) {
    fail(`${opName}: invalid range [${start}, ${end}) for last axis of size ${lastDim}`, site);
  }
  return [...aShape.slice(0, -1), end - start];
}
190
/**
 * Check that aShape can be broadcast (right-aligned) up to targetShape:
 * each source axis must equal the aligned target axis or be 1. Returns targetShape.
 */
function inferBroadcastTo(opName, aShape, targetShape, site) {
  if (aShape.length > targetShape.length) {
    fail(`${opName}: source rank ${aShape.length} > target rank ${targetShape.length}`, site);
  }
  const offset = targetShape.length - aShape.length;
  aShape.forEach((av, i) => {
    const tv = targetShape[offset + i];
    if (av !== tv && av !== 1) {
      fail(`${opName}: cannot broadcast ${showShape(aShape)} to ${showShape(targetShape)} \u2014 axis ${i} (size ${av}) doesn't match target axis ${offset + i} (size ${tv}) and isn't 1`, site);
    }
  });
  return targetShape;
}

/**
 * Inverse check of broadcastTo: aShape can be sum-reduced down to targetShape
 * when each aligned target axis is 1 or matches the source. Returns targetShape.
 */
function inferSumToShape(opName, aShape, targetShape, site) {
  if (targetShape.length > aShape.length) {
    fail(`${opName}: target rank ${targetShape.length} > source rank ${aShape.length}`, site);
  }
  const offset = aShape.length - targetShape.length;
  targetShape.forEach((tv, i) => {
    const av = aShape[offset + i];
    if (av !== tv && tv !== 1) {
      fail(`${opName}: cannot sum-reduce ${showShape(aShape)} to ${showShape(targetShape)} \u2014 target axis ${i} (size ${tv}) must be 1 or match source`, site);
    }
  });
  return targetShape;
}
218
/** where(cond, a, b): broadcast a against b, then that result against cond. */
function inferWhere(opName, condShape, aShape, bShape, site) {
  const ab = broadcastTrailing(aShape, bShape);
  if (!ab) fail(`${opName}: a/b incompatible: ${showShape(aShape)} vs ${showShape(bShape)}`, site);
  const full = broadcastTrailing(condShape, ab);
  if (!full) fail(`${opName}: cond ${showShape(condShape)} incompatible with broadcast(a, b) ${showShape(ab)}`, site);
  return full;
}

/** relu backward: x and dy must agree exactly (no broadcasting); returns x's shape. */
function inferReluGrad(opName, xShape, dyShape, site) {
  if (shapesEqual(xShape, dyShape)) return xShape;
  fail(`${opName}: x and dy must have matching shapes, got ${showShape(xShape)} and ${showShape(dyShape)}`, site);
  return xShape;
}
231
+
232
+ // src/trace.ts
233
// Module-level trace state: at most one graph is live at any time.
var _current = null;
var _captureEnabled = false;

/** The graph of the active trace; throws when no trace is running. */
function currentGraph() {
  if (_current) return _current;
  throw new Error(
    "tensorgrad: ops can only be called inside trace(). Did you forget to wrap your forward pass?"
  );
}

/** True only while inside a top-level trace() (traceInto does not flip this). */
function isCaptureEnabled() {
  return _captureEnabled;
}
246
/**
 * Run `fn` while recording ops into a fresh graph; the tensor (or array of
 * tensors) it returns becomes the graph's outputs. Nested traces are rejected.
 * Returns the completed graph.
 */
function trace(fn) {
  if (_current) {
    throw new Error("tensorgrad: nested trace() is not supported");
  }
  const g = makeGraph();
  _current = g;
  _captureEnabled = true;
  try {
    const result = fn();
    const outs = Array.isArray(result) ? result : [result];
    for (const tensor of outs) {
      g.outputs.push(tensor.id);
    }
  } finally {
    // Always tear down the module state, even when fn throws,
    // so a later trace() can start cleanly.
    _current = null;
    _captureEnabled = false;
  }
  return g;
}
266
/**
 * Append ops to an existing graph while `body` runs (capture mode stays off).
 * Rejects overlap with an already-active trace; returns body's result.
 */
function traceInto(graph, body) {
  if (_current) {
    throw new Error("tensorgrad: traceInto() called while another trace is active");
  }
  _current = graph;
  try {
    return body();
  } finally {
    _current = null;
  }
}
277
/** Reject a duplicate input/state name among ops of the given kinds. */
function assertNameUnused(g, name, kinds, label) {
  const clash = g.ops.some((op) => kinds.includes(op.kind) && op.name === name);
  if (clash) {
    throw new Error(`tensorgrad: ${label} name '${name}' already used in this trace`);
  }
}
282
/** Declare a named trainable-parameter input; name must be unique among inputs. */
function paramInput(name, shape, dtype = "f32") {
  const graph = currentGraph();
  assertNameUnused(graph, name, ["param_input", "tensor_input"], "input");
  const site = captureSite("paramInput");
  return addOp(graph, "param_input", shape, dtype, site, { name });
}

/** Declare a named non-parameter tensor input; shares the input namespace with params. */
function tensorInput(name, shape, dtype = "f32") {
  const graph = currentGraph();
  assertNameUnused(graph, name, ["param_input", "tensor_input"], "input");
  const site = captureSite("tensorInput");
  return addOp(graph, "tensor_input", shape, dtype, site, { name });
}

/** Declare a named state input; `initValue` is stored on the op (consumed downstream). */
function stateInput(name, shape, dtype = "f32", initValue = 0) {
  const graph = currentGraph();
  assertNameUnused(graph, name, ["state_input"], "state");
  const site = captureSite("stateInput");
  return addOp(graph, "state_input", shape, dtype, site, { name, initValue });
}
300
+
301
+ // src/capture.ts
302
+ function capture(name, t) {
303
+ if (!isCaptureEnabled()) return t;
304
+ const g = currentGraph();
305
+ if (g.captures.has(name)) {
306
+ throw new Error(
307
+ `capture: name '${name}' already registered. Use unique names (e.g. \`attn.\${layerIdx}\`) when capturing across a loop.`
308
+ );
309
+ }
310
+ g.captures.set(name, t.id);
311
+ return t;
312
+ }
313
+
314
+ // src/ops.ts
315
/** Shared builder for element-wise tensor-tensor binops (trailing broadcast). */
function binopOp(name, kind, a, b, outDtype = a.dtype) {
  const site = captureSite(name);
  if (a.dtype !== b.dtype) throw new ShapeError(`${name}: dtype mismatch (${a.dtype} vs ${b.dtype})`, site);
  const outShape = inferElementwiseBinop(name, a.shape, b.shape, site);
  return addOp(currentGraph(), kind, outShape, outDtype, site, { a: a.id, b: b.id });
}

/** a + b; a JS-number b is folded into an add_scalar op. */
function add(a, b) {
  if (typeof b === "number") return addScalar(a, b);
  return binopOp("add", "add", a, b);
}

/** a - b; a JS-number b becomes add_scalar(a, -b). */
function sub(a, b) {
  if (typeof b === "number") return addScalar(a, -b);
  return binopOp("sub", "sub", a, b);
}

/** a * b; a JS-number b becomes mul_scalar. */
function mul(a, b) {
  if (typeof b === "number") return mulScalar(a, b);
  return binopOp("mul", "mul", a, b);
}

/** a / b; a JS-number b must be non-zero and becomes mul_scalar(a, 1 / b). */
function div(a, b) {
  if (typeof b !== "number") return binopOp("div", "div", a, b);
  if (b === 0) throw new ShapeError(`div: scalar divisor cannot be zero`, captureSite("div"));
  return mulScalar(a, 1 / b);
}
337
/** out = a * scalar; the JS number is baked into the op node. */
function mulScalar(a, scalar) {
  return addOp(currentGraph(), "mul_scalar", a.shape, a.dtype, captureSite("mulScalar"), { a: a.id, scalar });
}

/** out = a + scalar. */
function addScalar(a, scalar) {
  return addOp(currentGraph(), "add_scalar", a.shape, a.dtype, captureSite("addScalar"), { a: a.id, scalar });
}

/** Shared builder for f32-only element-wise unary ops; the op kind equals `name`. */
function unary(name, a) {
  const site = captureSite(name);
  if (a.dtype !== "f32") throw new ShapeError(`${name}: requires f32, got ${a.dtype}`, site);
  return addOp(currentGraph(), name, inferUnary(name, a.shape, site), "f32", site, { a: a.id });
}

var sqrt = (t) => unary("sqrt", t);
var rsqrt = (t) => unary("rsqrt", t);
var log = (t) => unary("log", t);
var exp = (t) => unary("exp", t);
var relu = (t) => unary("relu", t);
355
/** Mean over the last axis, keeping it as size 1. f32 only. */
function meanLast(a) {
  const site = captureSite("meanLast");
  if (a.dtype !== "f32") throw new ShapeError(`meanLast: requires f32, got ${a.dtype}`, site);
  const outShape = inferMeanLast("meanLast", a.shape, site);
  return addOp(currentGraph(), "mean_last", outShape, a.dtype, site, { a: a.id });
}

/** Sum over the last axis, dropping it. f32 only. */
function sumLast(a) {
  const site = captureSite("sumLast");
  if (a.dtype !== "f32") throw new ShapeError(`sumLast: requires f32, got ${a.dtype}`, site);
  const outShape = inferSumLast("sumLast", a.shape, site);
  return addOp(currentGraph(), "sum_last", outShape, a.dtype, site, { a: a.id });
}

/** Sum of every element: flatten to 1-d, then sum_last -> rank-0 scalar. */
function sumAll(a) {
  const flat = reshape(a, [-1]);
  return sumLast(flat);
}
370
/** Reshape; newShape may contain one -1 wildcard (resolved by inferReshape). */
function reshape(a, newShape) {
  const site = captureSite("reshape");
  const outShape = inferReshape("reshape", a.shape, newShape, site);
  // The node stores the *resolved* shape (wildcard already replaced).
  return addOp(currentGraph(), "reshape", outShape, a.dtype, site, { a: a.id, newShape: outShape });
}

/** Permute the axes of `a` by `perm`. */
function transpose(a, perm) {
  const site = captureSite("transpose");
  const outShape = inferTranspose("transpose", a.shape, perm, site);
  return addOp(currentGraph(), "transpose", outShape, a.dtype, site, { a: a.id, perm });
}

/** Swap two axes (negative indices count from the end); returns `a` itself when equal. */
function swapAxes(a, axis1, axis2) {
  const rank = a.shape.length;
  const resolve = (axis) => (axis < 0 ? rank + axis : axis);
  const first = resolve(axis1);
  const second = resolve(axis2);
  const site = captureSite("swapAxes");
  const outOfRange = first < 0 || first >= rank || second < 0 || second >= rank;
  if (outOfRange) {
    throw new ShapeError(`swapAxes: axis out of range \u2014 got (${axis1}, ${axis2}) for rank-${rank} tensor`, site);
  }
  if (first === second) return a;
  const perm = Array.from({ length: rank }, (_, k) => k);
  perm[first] = second;
  perm[second] = first;
  return transpose(a, perm);
}
395
/** Throw unless both operands are f32 (shared by the two matmul builders). */
function requireF32Pair(opName, a, b, site) {
  if (a.dtype !== "f32" || b.dtype !== "f32") {
    throw new ShapeError(`${opName}: requires f32, got ${a.dtype} and ${b.dtype}`, site);
  }
}

/** Matrix product with an optionally-batched lhs and a strictly 2-d rhs. */
function matmul(a, b) {
  const site = captureSite("matmul");
  requireF32Pair("matmul", a, b, site);
  const outShape = inferMatmul("matmul", a.shape, b.shape, site);
  return addOp(currentGraph(), "matmul", outShape, "f32", site, { a: a.id, b: b.id });
}

/** Batched matrix product; batch dims of both sides must match exactly. */
function matmulBatched(a, b) {
  const site = captureSite("matmulBatched");
  requireF32Pair("matmulBatched", a, b, site);
  const outShape = inferMatmulBatched("matmulBatched", a.shape, b.shape, site);
  return addOp(currentGraph(), "matmul_batched", outShape, "f32", site, { a: a.id, b: b.id });
}
411
/** Expand i32 indices to one-hot vectors along a new trailing axis of size `depth`. */
function oneHot(indices, depth, dtype = "f32") {
  const site = captureSite("oneHot");
  if (indices.dtype !== "i32") {
    throw new ShapeError(`oneHot: indices must be i32, got ${indices.dtype}`, site);
  }
  const outShape = inferOneHot("oneHot", indices.shape, depth, site);
  return addOp(currentGraph(), "one_hot", outShape, dtype, site, { indices: indices.id, depth, dtype });
}

/** Embedding lookup, implemented as one_hot(indices) @ table. Table is [vocab, dim]. */
function embedding(table, indices) {
  const site = captureSite("embedding");
  if (table.shape.length !== 2) {
    throw new ShapeError(`embedding: table must be 2-d [vocab, dim], got ${showShape(table.shape)}`, site);
  }
  if (indices.dtype !== "i32") {
    throw new ShapeError(`embedding: indices must be i32, got ${indices.dtype}`, site);
  }
  const hot = oneHot(indices, table.shape[0], "f32");
  return matmul(hot, table);
}

/** 1-d tensor [0, 1, ..., n-1]; n must be a positive integer. */
function arange(n, dtype = "i32") {
  const site = captureSite("arange");
  if (!Number.isInteger(n) || n <= 0) {
    throw new ShapeError(`arange: n must be a positive integer, got ${n}`, site);
  }
  return addOp(currentGraph(), "arange", [n], dtype, site, { n, dtype });
}
436
/** Fused causal softmax over the last axis; input must be a square [..., T, T] f32 tensor. */
function softmaxCausalLast(a) {
  const site = captureSite("softmaxCausalLast");
  if (a.dtype !== "f32") throw new ShapeError(`softmaxCausalLast: requires f32, got ${a.dtype}`, site);
  inferWhereCausal("softmaxCausalLast", a.shape, site); // validation only; shape passes through
  return addOp(currentGraph(), "softmax_causal_last", a.shape, "f32", site, { a: a.id });
}

/** log-softmax over the last axis; shape passes through. f32 only. */
function logSoftmaxLast(a) {
  const site = captureSite("logSoftmaxLast");
  if (a.dtype !== "f32") throw new ShapeError(`logSoftmaxLast: requires f32, got ${a.dtype}`, site);
  return addOp(currentGraph(), "log_softmax_last", a.shape, "f32", site, { a: a.id });
}

/**
 * Causal masking on a square [..., T, T] f32 tensor: masked positions receive
 * fillValue (which positions count as masked is defined by the backend kernel,
 * not visible in this module).
 */
function whereCausal(a, fillValue) {
  const site = captureSite("whereCausal");
  if (a.dtype !== "f32") throw new ShapeError(`whereCausal: requires f32, got ${a.dtype}`, site);
  inferWhereCausal("whereCausal", a.shape, site); // validation only
  return addOp(currentGraph(), "where_causal", a.shape, "f32", site, { a: a.id, fillValue });
}
453
/** Slice the half-open range [start, end) along the last axis. */
function sliceLastRange(a, start, end) {
  const site = captureSite("sliceLastRange");
  const outShape = inferSliceLastRange("sliceLastRange", a.shape, start, end, site);
  return addOp(currentGraph(), "slice_last_range", outShape, a.dtype, site, { a: a.id, start, end });
}

/** Broadcast up to targetShape (right-aligned; each source axis must match or be 1). */
function broadcastTo(a, targetShape) {
  const site = captureSite("broadcastTo");
  inferBroadcastTo("broadcastTo", a.shape, targetShape, site); // validation only
  return addOp(currentGraph(), "broadcast_to", targetShape, a.dtype, site, { a: a.id, targetShape });
}

/** Sum-reduce down to targetShape (the inverse of broadcastTo; autograd plumbing). */
function sumToShape(a, targetShape) {
  const site = captureSite("sumToShape");
  inferSumToShape("sumToShape", a.shape, targetShape, site); // validation only
  return addOp(currentGraph(), "sum_to_shape", targetShape, a.dtype, site, { a: a.id, targetShape });
}

/** Rank-0 constant tensor holding `value`. */
function constScalar(value, dtype = "f32") {
  return addOp(currentGraph(), "const_scalar", [], dtype, captureSite("constScalar"), { value, dtype });
}
472
// Element-wise comparisons; the result dtype is bool.
var less = (a, b) => binopOp("less", "less", a, b, "bool");
var greater = (a, b) => binopOp("greater", "greater", a, b, "bool");

/** Select from a where cond is true, else from b; all three shapes broadcast together. */
function where(cond, a, b) {
  const site = captureSite("where");
  if (cond.dtype !== "bool") throw new ShapeError(`where: cond must be bool, got ${cond.dtype}`, site);
  if (a.dtype !== b.dtype) throw new ShapeError(`where: a/b dtype mismatch (${a.dtype} vs ${b.dtype})`, site);
  const outShape = inferWhere("where", cond.shape, a.shape, b.shape, site);
  return addOp(currentGraph(), "where", outShape, a.dtype, site, { cond: cond.id, a: a.id, b: b.id });
}

/** relu backward building block; x and dy must be f32 with identical shapes. */
function reluGrad(x, dy) {
  const site = captureSite("reluGrad");
  if (x.dtype !== "f32" || dy.dtype !== "f32") {
    throw new ShapeError(`reluGrad: requires f32, got ${x.dtype} and ${dy.dtype}`, site);
  }
  const outShape = inferReluGrad("reluGrad", x.shape, dy.shape, site);
  return addOp(currentGraph(), "relu_grad", outShape, "f32", site, { x: x.id, dy: dy.id });
}
489
/** Exact element-wise shape equality, used by the Adam update builders. */
function adamShapesMatch(a, b) {
  return a.length === b.length && a.every((d, i) => d === b[i]);
}

/** Adam first-moment update op: m' from m, gradient g, and decay b1. */
function adamUpdateM(m, g, b1) {
  const site = captureSite("adamUpdateM");
  if (m.dtype !== "f32" || g.dtype !== "f32") throw new ShapeError(`adamUpdateM: requires f32`, site);
  if (!adamShapesMatch(m.shape, g.shape)) {
    throw new ShapeError(`adamUpdateM: shape mismatch`, site);
  }
  return addOp(currentGraph(), "adam_update_m", m.shape, "f32", site, { m: m.id, g: g.id, b1 });
}

/** Adam second-moment update op: v' from v, gradient g, and decay b2. */
function adamUpdateV(v, g, b2) {
  const site = captureSite("adamUpdateV");
  if (v.dtype !== "f32" || g.dtype !== "f32") throw new ShapeError(`adamUpdateV: requires f32`, site);
  if (!adamShapesMatch(v.shape, g.shape)) {
    throw new ShapeError(`adamUpdateV: shape mismatch`, site);
  }
  return addOp(currentGraph(), "adam_update_v", v.shape, "f32", site, { v: v.id, g: g.id, b2 });
}
505
/**
 * Adam parameter update op: p' from p, the updated moments mNew/vNew, a 0-d
 * f32 learning-rate tensor `lrt`, eps, and an optional decayShrink that is
 * either a JS number (baked into the node) or a 0-d f32 tensor.
 */
function adamUpdateP(p, mNew, vNew, lrt, eps, decayShrink = 1) {
  const site = captureSite("adamUpdateP");
  if (p.dtype !== "f32") throw new ShapeError(`adamUpdateP: requires f32`, site);
  if (lrt.dtype !== "f32" || lrt.shape.length !== 0) {
    throw new ShapeError(`adamUpdateP: lrt must be a 0-d f32 scalar`, site);
  }
  if (p.shape.length !== mNew.shape.length || p.shape.some((d, i) => d !== mNew.shape[i])) {
    throw new ShapeError(`adamUpdateP: p/mNew shape mismatch`, site);
  }
  // Fix: vNew was previously accepted unchecked; validate it like mNew for
  // consistency with adamUpdateM/adamUpdateV.
  if (p.shape.length !== vNew.shape.length || p.shape.some((d, i) => d !== vNew.shape[i])) {
    throw new ShapeError(`adamUpdateP: p/vNew shape mismatch`, site);
  }
  const isTensor = typeof decayShrink === "object";
  if (isTensor) {
    if (decayShrink.dtype !== "f32" || decayShrink.shape.length !== 0) {
      throw new ShapeError(`adamUpdateP: decayShrink tensor must be a 0-d f32 scalar`, site);
    }
  }
  return addOp(currentGraph(), "adam_update_p", p.shape, "f32", site, {
    p: p.id,
    mNew: mNew.id,
    vNew: vNew.id,
    lrt: lrt.id,
    eps,
    // A tensor decayShrink is carried by id; the numeric slot then holds the
    // neutral value 1 (matching the original encoding).
    decayShrink: isTensor ? 1 : decayShrink,
    decayShrinkTensor: isTensor ? decayShrink.id : null
  });
}
530
+
531
+ // src/grad.ts
532
/**
 * Reverse-mode autodiff: append backward ops to `graph`, whose single rank-0
 * output is treated as the loss. Returns { graph, paramGrads, loss } where
 * paramGrads maps each param_input name to its gradient tensor (an explicit
 * zero tensor for parameters the loss never touched).
 */
function appendGrad(graph) {
  if (graph.outputs.length !== 1) {
    throw new Error(`autograd: expected graph with exactly 1 output (the loss); got ${graph.outputs.length}`);
  }
  const lossId = graph.outputs[0];
  const lossTensor = graph.tensors[lossId];
  if (lossTensor.shape.length !== 0) {
    throw new Error(
      `autograd: loss must be a rank-0 scalar; got shape [${lossTensor.shape.join(", ")}]. Reduce with sumLast / mulScalar to a scalar before calling appendGrad.`
    );
  }
  // Snapshot the forward ops: the backward pass appends to graph.ops as it runs.
  const forwardOps = graph.ops.slice();
  const cotangents = /* @__PURE__ */ new Map();
  return traceInto(graph, () => {
    // Seed: d(loss)/d(loss) = 1.
    cotangents.set(lossId, constScalar(1, "f32"));
    // Walk the forward ops in reverse, pushing cotangents onto each op's inputs.
    for (let i = forwardOps.length - 1; i >= 0; i--) {
      const op = forwardOps[i];
      const outCotan = cotangents.get(op.out);
      if (outCotan) runTransposeRule(op, outCotan, graph, cotangents);
    }
    // Collect per-parameter gradients; untouched params get an explicit zero.
    const paramGrads = {};
    for (const op of forwardOps) {
      if (op.kind !== "param_input") continue;
      const cotan = cotangents.get(op.out);
      if (cotan) {
        paramGrads[op.name] = cotan;
      } else {
        const t = graph.tensors[op.out];
        paramGrads[op.name] = broadcastTo(constScalar(0, t.dtype), t.shape);
      }
    }
    return { graph, paramGrads, loss: lossTensor };
  });
}
568
/** Fold `contribution` into the running cotangent for tensor `inputId` (sum on repeat). */
function accumulate(cotangents, inputId, contribution) {
  const existing = cotangents.get(inputId);
  cotangents.set(inputId, existing ? add(existing, contribution) : contribution);
}

/** Reduce a cotangent back to an operand's pre-broadcast shape (no-op when equal). */
function unbroadcast(cotan, toShape) {
  return shapesEqual(cotan.shape, toShape) ? cotan : sumToShape(cotan, toShape);
}
580
+ function runTransposeRule(op, outCotan, graph, cotangents) {
581
+ const tensorOf = (id) => graph.tensors[id];
582
+ switch (op.kind) {
583
+ // ---- Leaves: no inputs to accumulate into. -----------------------------
584
+ case "param_input":
585
+ case "tensor_input":
586
+ case "state_input":
587
+ case "arange":
588
+ case "const_scalar":
589
+ return;
590
+ // ---- Element-wise binops (with broadcast) ------------------------------
591
+ // c = a op b; reduce cotan back to each operand's shape.
592
+ case "add": {
593
+ const a = tensorOf(op.a), b = tensorOf(op.b);
594
+ accumulate(cotangents, op.a, unbroadcast(outCotan, a.shape));
595
+ accumulate(cotangents, op.b, unbroadcast(outCotan, b.shape));
596
+ return;
597
+ }
598
+ case "sub": {
599
+ const a = tensorOf(op.a), b = tensorOf(op.b);
600
+ accumulate(cotangents, op.a, unbroadcast(outCotan, a.shape));
601
+ accumulate(cotangents, op.b, unbroadcast(mulScalar(outCotan, -1), b.shape));
602
+ return;
603
+ }
604
+ case "mul": {
605
+ const a = tensorOf(op.a), b = tensorOf(op.b);
606
+ accumulate(cotangents, op.a, unbroadcast(mul(outCotan, b), a.shape));
607
+ accumulate(cotangents, op.b, unbroadcast(mul(outCotan, a), b.shape));
608
+ return;
609
+ }
610
+ case "div": {
611
+ const a = tensorOf(op.a), b = tensorOf(op.b);
612
+ accumulate(cotangents, op.a, unbroadcast(div(outCotan, b), a.shape));
613
+ const numer = mul(outCotan, a);
614
+ const bSq = mul(b, b);
615
+ accumulate(cotangents, op.b, unbroadcast(mulScalar(div(numer, bSq), -1), b.shape));
616
+ return;
617
+ }
618
+ // ---- Element-wise scalar binops (scalar is a JS number, not a tensor) -
619
+ case "mul_scalar": {
620
+ accumulate(cotangents, op.a, mulScalar(outCotan, op.scalar));
621
+ return;
622
+ }
623
+ case "add_scalar": {
624
+ accumulate(cotangents, op.a, outCotan);
625
+ return;
626
+ }
627
+ // ---- Unary -------------------------------------------------------------
628
+ case "sqrt": {
629
+ const c = tensorOf(op.out);
630
+ accumulate(cotangents, op.a, mulScalar(div(outCotan, c), 0.5));
631
+ return;
632
+ }
633
+ case "rsqrt": {
634
+ const c = tensorOf(op.out);
635
+ const c3 = mul(mul(c, c), c);
636
+ accumulate(cotangents, op.a, mulScalar(mul(outCotan, c3), -0.5));
637
+ return;
638
+ }
639
+ case "log": {
640
+ const a = tensorOf(op.a);
641
+ accumulate(cotangents, op.a, div(outCotan, a));
642
+ return;
643
+ }
644
+ case "exp": {
645
+ const c = tensorOf(op.out);
646
+ accumulate(cotangents, op.a, mul(outCotan, c));
647
+ return;
648
+ }
649
+ case "relu": {
650
+ const a = tensorOf(op.a);
651
+ accumulate(cotangents, op.a, reluGrad(a, outCotan));
652
+ return;
653
+ }
654
+ // ---- Reductions over last axis ---------------------------------------
655
+ case "mean_last": {
656
+ const a = tensorOf(op.a);
657
+ const D = a.shape[a.shape.length - 1];
658
+ const expanded = broadcastTo(outCotan, a.shape);
659
+ accumulate(cotangents, op.a, mulScalar(expanded, 1 / D));
660
+ return;
661
+ }
662
+ case "sum_last": {
663
+ const a = tensorOf(op.a);
664
+ const withKeep = reshape(outCotan, [...outCotan.shape, 1]);
665
+ accumulate(cotangents, op.a, broadcastTo(withKeep, a.shape));
666
+ return;
667
+ }
668
+ // ---- Shape ------------------------------------------------------------
669
+ case "reshape": {
670
+ const a = tensorOf(op.a);
671
+ accumulate(cotangents, op.a, reshape(outCotan, a.shape));
672
+ return;
673
+ }
674
+ case "transpose": {
675
+ const inv = invertPerm(op.perm);
676
+ accumulate(cotangents, op.a, transpose(outCotan, inv));
677
+ return;
678
+ }
679
+ // ---- Linear algebra ---------------------------------------------------
680
+ case "matmul": {
681
+ const a = tensorOf(op.a), b = tensorOf(op.b);
682
+ accumulate(cotangents, op.a, matmul(outCotan, swapAxes(b, -1, -2)));
683
+ const aT = swapAxes(a, -1, -2);
684
+ let perBatchDb;
685
+ if (a.shape.length > 2) {
686
+ perBatchDb = matmulBatched(aT, outCotan);
687
+ } else {
688
+ perBatchDb = matmul(aT, outCotan);
689
+ }
690
+ accumulate(cotangents, op.b, sumToShape(perBatchDb, b.shape));
691
+ return;
692
+ }
693
+ case "matmul_batched": {
694
+ const a = tensorOf(op.a), b = tensorOf(op.b);
695
+ accumulate(cotangents, op.a, matmulBatched(outCotan, swapAxes(b, -1, -2)));
696
+ accumulate(cotangents, op.b, matmulBatched(swapAxes(a, -1, -2), outCotan));
697
+ return;
698
+ }
699
+ // ---- Indexing / casting (no gradient through integer indices) --------
700
+ case "one_hot":
701
+ return;
702
+ // ---- Slicing ---------------------------------------------------------
703
+ case "slice_last_range": {
704
+ const a = tensorOf(op.a);
705
+ throw new Error(
706
+ `autograd: slice_last_range backward not implemented yet (would need a scatter-style op or a Concat op). Workaround for now: avoid taking gradients through slices by using separate matmuls for Q/K/V instead of a fused W_qkv. Tensor: ${a.shape} -> ${tensorOf(op.out).shape}`
707
+ );
708
+ }
709
+ // ---- Broadcast / un-broadcast (autograd infrastructure) ---------------
710
+ case "broadcast_to": {
711
+ const a = tensorOf(op.a);
712
+ accumulate(cotangents, op.a, sumToShape(outCotan, a.shape));
713
+ return;
714
+ }
715
+ case "sum_to_shape": {
716
+ const a = tensorOf(op.a);
717
+ accumulate(cotangents, op.a, broadcastTo(outCotan, a.shape));
718
+ return;
719
+ }
720
+ // ---- ML primitives ---------------------------------------------------
721
+ case "log_softmax_last": {
722
+ const c = tensorOf(op.out);
723
+ const sm = exp(c);
724
+ const sumDc = sumLast(outCotan);
725
+ const sumDcKeep = reshape(sumDc, [...sumDc.shape, 1]);
726
+ const term = mul(sm, broadcastTo(sumDcKeep, c.shape));
727
+ accumulate(cotangents, op.a, sub(outCotan, term));
728
+ return;
729
+ }
730
+ case "softmax_causal_last": {
731
+ const c = tensorOf(op.out);
732
+ const dcXc = mul(outCotan, c);
733
+ const s = sumLast(dcXc);
734
+ const sKeep = reshape(s, [...s.shape, 1]);
735
+ const inner = sub(outCotan, broadcastTo(sKeep, c.shape));
736
+ accumulate(cotangents, op.a, mul(inner, c));
737
+ return;
738
+ }
739
+ // ---- Comparisons + select ---------------------------------------------
740
+ case "less":
741
+ case "greater":
742
+ return;
743
+ case "where": {
744
+ const cond = tensorOf(op.cond);
745
+ const a = tensorOf(op.a);
746
+ const b = tensorOf(op.b);
747
+ const zeroA = broadcastTo(constScalar(0, a.dtype), outCotan.shape);
748
+ const zeroB = broadcastTo(constScalar(0, b.dtype), outCotan.shape);
749
+ accumulate(cotangents, op.a, unbroadcast(where(cond, outCotan, zeroA), a.shape));
750
+ accumulate(cotangents, op.b, unbroadcast(where(cond, zeroB, outCotan), b.shape));
751
+ return;
752
+ }
753
+ case "where_causal": {
754
+ throw new Error(
755
+ `autograd: where_causal backward not yet implemented. Use softmax_causal_last (which fuses the mask + softmax) instead.`
756
+ );
757
+ }
758
+ // ---- Adam ops are post-autograd; no backward through them. ----------
759
+ case "adam_update_m":
760
+ case "adam_update_v":
761
+ case "adam_update_p":
762
+ throw new Error(`autograd: cannot differentiate through ${op.kind}`);
763
+ // ---- relu_grad has no further backward (autograd-internal) ----------
764
+ case "relu_grad": {
765
+ throw new Error(
766
+ `autograd: cannot take second-order gradient through relu_grad. Phase 2 does not support higher-order autodiff.`
767
+ );
768
+ }
769
+ default: {
770
+ const _exhaustive = op;
771
+ void _exhaustive;
772
+ throw new Error(`autograd: unhandled op kind ${op.kind}`);
773
+ }
774
+ }
775
+ }
776
/** Invert a permutation: if `perm` maps output axis -> source axis, the
 * result maps source axis -> output axis (perm[inv[k]] === k). */
function invertPerm(perm) {
  const inv = new Array(perm.length);
  perm.forEach((srcAxis, outAxis) => {
    inv[srcAxis] = outAxis;
  });
  return inv;
}
781
+
782
+ // src/adam.ts
783
// Learning-rate schedule constructors. Each returns a plain tagged object
// consumed by resolveLR(); `kind` comes first so caller-supplied keys win.
var lr = {
  /** Fixed learning rate for the whole run. */
  constant(value) {
    return { kind: "constant", value };
  },
  /** Linearly interpolate from `peak` at step 1 to `final` at step `steps`,
   * then hold at `final`. Matches `peak + (final - peak) * min(step/steps, 1)`. */
  linearDecay(opts) {
    return { kind: "linearDecay", ...opts };
  },
  /** Half-cosine from `peak` at step 1 down to `final` at step `steps`,
   * then hold at `final`. */
  cosineDecay(opts) {
    return { kind: "cosineDecay", ...opts };
  },
  /** Linear ramp from 0 to `peakLr` over `warmupSteps` steps, then hand off
   * to `after` (offset so step 1 of `after` = first post-warmup step). */
  warmup(opts) {
    return { kind: "warmup", ...opts };
  }
};
795
/** Evaluate a learning-rate schedule at a 1-based step number.
 * A bare number is treated as a constant rate. Unknown schedule kinds
 * yield `undefined` (matching the original switch fall-through). */
function resolveLR(schedule, step) {
  if (typeof schedule === "number") return schedule;
  if (schedule.kind === "constant") {
    return schedule.value;
  }
  if (schedule.kind === "linearDecay") {
    const frac = Math.min(step / schedule.steps, 1);
    return schedule.peak + (schedule.final - schedule.peak) * frac;
  }
  if (schedule.kind === "cosineDecay") {
    const frac = Math.min(step / schedule.steps, 1);
    const amplitude = 0.5 * (schedule.peak - schedule.final);
    return schedule.final + amplitude * (1 + Math.cos(Math.PI * frac));
  }
  if (schedule.kind === "warmup") {
    // Ramp linearly to peakLr, then delegate to `after` with the step
    // counter re-based so `after` sees step 1 on the first post-warmup step.
    if (step <= schedule.warmupSteps) {
      return schedule.peakLr * (step / schedule.warmupSteps);
    }
    return resolveLR(schedule.after, step - schedule.warmupSteps);
  }
}
814
/** True when the schedule's value can change between steps (i.e. it is
 * neither a plain number nor a `constant` schedule). */
function isLRDynamic(schedule) {
  return typeof schedule !== "number" && schedule.kind !== "constant";
}
818
/**
 * Append a fused Adam(W) update to an already-traced graph.
 *
 * For every entry in `paramGrads` this traces adam_update_m / adam_update_v /
 * adam_update_p ops plus per-parameter m/v state inputs (zero-initialized),
 * and records writebacks so the runtime copies the new m, v, and parameter
 * values over the old ones after each step.
 *
 * @param graph        traced graph to extend (via traceInto)
 * @param paramGrads   name -> gradient tensor (from the autograd pass)
 * @param paramTensors name -> parameter tensor; must cover every paramGrads key
 * @param config       { lr, b1?, b2?, eps?, weightDecay?, decayFilter? } —
 *                     defaults: b1 0.9, b2 0.999, eps 1e-8, weightDecay 0,
 *                     decayFilter decays everything
 * @param decayFlags   optional name -> bool override; when a name is present
 *                     here it takes precedence over config.decayFilter
 * @returns the traceInto result: { writebacks, lrtInputName,
 *          decayShrinkInputName, config } where lrtInputName ("_adam_lrt") is
 *          the runtime-supplied scalar input holding the (bias-corrected?)
 *          per-step learning rate — TODO confirm what the runtime feeds it.
 * @throws if a name in paramGrads is missing from paramTensors (or vice versa
 *         a gradient entry is falsy)
 */
function appendAdam(graph, paramGrads, paramTensors, config, decayFlags) {
  const lrIsScheduled = isLRDynamic(config.lr);
  // LR at step 1; only used to bake a constant decay-shrink factor when the
  // schedule itself is constant.
  const initialLr = resolveLR(config.lr, 1);
  const fullConfig = {
    lr: config.lr,
    b1: config.b1 ?? 0.9,
    b2: config.b2 ?? 0.999,
    eps: config.eps ?? 1e-8,
    weightDecay: config.weightDecay ?? 0,
    decayFilter: config.decayFilter ?? (() => true),
    lrIsScheduled
  };
  const writebacks = [];
  const lrtInputName = "_adam_lrt";
  // Set inside the trace closure only when a dynamic shrink input is needed.
  let decayShrinkInputName = null;
  return traceInto(graph, () => {
    // Scalar learning-rate input supplied by the runtime each step.
    const lrt = tensorInput(lrtInputName, [], "f32");
    // Names subject to weight decay: explicit decayFlags wins over the filter.
    const decayedNames = new Set(
      fullConfig.weightDecay > 0 ? Object.keys(paramGrads).filter((name) => decayFlags && name in decayFlags ? decayFlags[name] : fullConfig.decayFilter(name)) : []
    );
    let decayShrinkScalar = null;
    // With a scheduled LR the decay factor (1 - lr_t * wd) changes per step,
    // so it must come in as a runtime scalar instead of a baked constant.
    if (lrIsScheduled && decayedNames.size > 0) {
      decayShrinkInputName = "_adam_decay_shrink";
      decayShrinkScalar = tensorInput(decayShrinkInputName, [], "f32");
    }
    for (const name of Object.keys(paramGrads)) {
      const p = paramTensors[name];
      const g = paramGrads[name];
      if (!p) throw new Error(`appendAdam: missing param tensor for '${name}'`);
      if (!g) throw new Error(`appendAdam: missing gradient for '${name}'`);
      // Optimizer state, one m and one v buffer per parameter, init to 0.
      const mState = stateInput(`adam_m_${name}`, p.shape, "f32", 0);
      const vState = stateInput(`adam_v_${name}`, p.shape, "f32", 0);
      // 1 = no decay; otherwise dynamic scalar, or constant (1 - lr*wd).
      const decayShrink = !decayedNames.has(name) ? 1 : decayShrinkScalar !== null ? decayShrinkScalar : 1 - initialLr * fullConfig.weightDecay;
      const newM = adamUpdateM(mState, g, fullConfig.b1);
      const newV = adamUpdateV(vState, g, fullConfig.b2);
      const newP = adamUpdateP(p, newM, newV, lrt, fullConfig.eps, decayShrink);
      // Runtime copies these back after each step so state persists.
      writebacks.push({ source: newM, destName: `adam_m_${name}`, destKind: "state" });
      writebacks.push({ source: newV, destName: `adam_v_${name}`, destKind: "state" });
      writebacks.push({ source: newP, destName: name, destKind: "param" });
    }
    return { writebacks, lrtInputName, decayShrinkInputName, config: fullConfig };
  });
}
861
+
862
// src/buffers.ts
// Bytes per element in storage buffers. `bool` is 4 because booleans are
// stored as u32 lanes on the GPU (see wgslDtype in src/codegen.ts).
var dtypeBytes = { f32: 4, i32: 4, bool: 4 };
864
/**
 * Build the GPU buffer plan for a traced graph: one buffer per tensor
 * (no aliasing/reuse), classified by role, plus name->buffer indexes and
 * resolved writeback copies.
 *
 * NOTE(review): buffers are later looked up as `buffers[bufId]`, and
 * `tensorToBuffer` maps each tensor id to itself — this assumes tensor ids
 * are dense 0..n-1 in `graph.tensors` order. Looks true for graphs built by
 * the tracer; confirm before reordering or pruning tensors.
 *
 * @param graph          traced graph ({ tensors, ops, outputs, captures })
 * @param paramGrads     name -> gradient tensor; marks those buffers "param_grad"
 * @param writebackDecls [{ source, destName, destKind }] from appendAdam
 * @returns { buffers, tensorToBuffer, paramsByName, inputsByName,
 *            paramGradsByName, statesByName, capturesByName,
 *            outputBufferIds, writebacks }
 * @throws on writeback source/dest not found, source/dest byte-size mismatch,
 *         or a capture referencing an unknown tensor
 */
function planBuffers(graph, paramGrads, writebackDecls = []) {
  const buffers = [];
  const tensorToBuffer = /* @__PURE__ */ new Map();
  const paramsByName = /* @__PURE__ */ new Map();
  const inputsByName = /* @__PURE__ */ new Map();
  const paramGradsByName = /* @__PURE__ */ new Map();
  const statesByName = /* @__PURE__ */ new Map();
  // Reverse index: gradient tensor id -> parameter name.
  const gradTensorIdToName = /* @__PURE__ */ new Map();
  for (const [name, tensor] of Object.entries(paramGrads)) {
    gradTensorIdToName.set(tensor.id, name);
  }
  // The op that produces each tensor determines the buffer's role.
  const opByOutId = /* @__PURE__ */ new Map();
  for (const op of graph.ops) opByOutId.set(op.out, op);
  const outputSet = new Set(graph.outputs);
  for (const t of graph.tensors) {
    const op = opByOutId.get(t.id);
    // Role precedence: input kind > param_grad > output > intermediate.
    let kind = "intermediate";
    let name = null;
    let initValue;
    if (op?.kind === "param_input") {
      kind = "param";
      name = op.name;
    } else if (op?.kind === "tensor_input") {
      kind = "tensor_input";
      name = op.name;
    } else if (op?.kind === "state_input") {
      kind = "state";
      name = op.name;
      initValue = op.initValue;
    } else if (gradTensorIdToName.has(t.id)) {
      kind = "param_grad";
      name = gradTensorIdToName.get(t.id);
    } else if (outputSet.has(t.id)) {
      kind = "output";
    }
    const spec = {
      id: t.id,
      // Minimum 4 bytes so zero-element (scalar-shaped) tensors still get a
      // valid storage buffer.
      byteSize: Math.max(4, shapeSize(t.shape) * dtypeBytes[t.dtype]),
      dtype: t.dtype,
      shape: t.shape,
      kind,
      name,
      // Only state buffers carry an initValue; omit the key otherwise.
      ...initValue !== void 0 ? { initValue } : {}
    };
    buffers.push(spec);
    // Identity mapping: buffer id == tensor id (see NOTE above).
    tensorToBuffer.set(t.id, t.id);
    if (kind === "param") paramsByName.set(name, t.id);
    if (kind === "tensor_input") inputsByName.set(name, t.id);
    if (kind === "param_grad") paramGradsByName.set(name, t.id);
    if (kind === "state") statesByName.set(name, t.id);
  }
  const outputBufferIds = graph.outputs.map((id) => tensorToBuffer.get(id));
  // Resolve declared writebacks (new param/state values) into buffer copies,
  // validating both endpoints exist and agree on size.
  const writebacks = writebackDecls.map((decl) => {
    const sourceBufId = tensorToBuffer.get(decl.source.id);
    if (sourceBufId === void 0) {
      throw new Error(`planBuffers: writeback source tensor #${decl.source.id} not in graph`);
    }
    const destBufId = decl.destKind === "param" ? paramsByName.get(decl.destName) : statesByName.get(decl.destName);
    if (destBufId === void 0) {
      throw new Error(`planBuffers: writeback dest ${decl.destKind}:'${decl.destName}' not found`);
    }
    const sourceSpec = buffers[sourceBufId];
    const destSpec = buffers[destBufId];
    if (sourceSpec.byteSize !== destSpec.byteSize) {
      throw new Error(
        `planBuffers: writeback size mismatch for ${decl.destKind}:'${decl.destName}' (source ${sourceSpec.byteSize} bytes vs dest ${destSpec.byteSize})`
      );
    }
    return { source: sourceBufId, dest: destBufId, bytes: sourceSpec.byteSize };
  });
  // Captured tensors (debug/inspection hooks) also resolve to buffer ids.
  const capturesByName = /* @__PURE__ */ new Map();
  for (const [name, tensorId] of graph.captures) {
    const bufId = tensorToBuffer.get(tensorId);
    if (bufId === void 0) {
      throw new Error(`planBuffers: capture '${name}' references unknown tensor #${tensorId}`);
    }
    capturesByName.set(name, bufId);
  }
  return { buffers, tensorToBuffer, paramsByName, inputsByName, paramGradsByName, statesByName, capturesByName, outputBufferIds, writebacks };
}
944
+
945
// src/codegen.ts
// Threads per workgroup for all element-wise kernels.
var WG_SIZE = 256;
// Flatten a 2D dispatch into one linear thread index. 16776960 = 65535 * 256,
// i.e. the per-dimension workgroup limit times WG_SIZE, so gid.y steps over
// whole x-dimension "slabs" when a dispatch needs more than 65535 workgroups.
var GID_LINE = "let i = gid.x + gid.y * 16776960u;";
948
/** Lower every op in the graph to a kernel spec, in op order.
 * (Leaf input ops yield empty specs with no WGSL.) */
function emitKernels(graph, plan) {
  return graph.ops.map((op, index) => emitKernel(op, graph, plan, index));
}
957
/**
 * Lower a single IR op to a WGSL compute-kernel spec.
 *
 * @param op      the IR op to lower
 * @param graph   traced graph; `graph.tensors` is indexed by tensor id
 * @param plan    buffer plan from planBuffers(); maps tensor ids -> buffer ids
 * @param opIndex position of `op` in `graph.ops`, recorded on the spec
 * @returns { opIndex, opKind, wgsl, bindings, threads, workgroupSize };
 *          input ops return an empty spec (data arrives from outside, no kernel)
 * @throws on an unhandled op kind — previously the switch had no default and
 *         silently returned `undefined` for unknown kinds
 */
function emitKernel(op, graph, plan, opIndex) {
  const tof = (id) => graph.tensors[id];
  const buf = (tensorId) => plan.tensorToBuffer.get(tensorId);
  const empty = () => ({ opIndex, opKind: op.kind, wgsl: "", bindings: [], threads: 0, workgroupSize: WG_SIZE });
  switch (op.kind) {
    // ---- Leaves: data is supplied externally; no kernel ---------------------
    case "param_input":
    case "tensor_input":
    case "state_input":
      return empty();
    // ---- arange / const_scalar: kernel that fills the buffer once -----------
    case "arange": {
      const out = tof(op.out);
      const wgsl = `
@group(0) @binding(0) var<storage, read_write> buf : array<${wgslDtype(out.dtype)}>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${op.n}u) { return; }
  buf[i] = ${castFromI32("i32(i)", out.dtype)};
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.out)], threads: op.n, workgroupSize: WG_SIZE };
    }
    case "const_scalar": {
      // Single-thread kernel: writes one literal into element 0.
      const wgsl = `
@group(0) @binding(0) var<storage, read_write> buf : array<${wgslDtype(op.dtype)}>;
@compute @workgroup_size(1)
fn main() {
  buf[0] = ${wgslLiteral(op.value, op.dtype)};
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.out)], threads: 1, workgroupSize: 1 };
    }
    // ---- Element-wise binops with broadcast --------------------------------
    case "add":
    case "sub":
    case "mul":
    case "div": {
      const out = tof(op.out);
      const a = tof(op.a);
      const b = tof(op.b);
      const opStr = { add: "+", sub: "-", mul: "*", div: "/" }[op.kind];
      const total = shapeSize(out.shape);
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<${wgslDtype(a.dtype)}>;
@group(0) @binding(1) var<storage, read> b : array<${wgslDtype(b.dtype)}>;
@group(0) @binding(2) var<storage, read_write> out : array<${wgslDtype(out.dtype)}>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  ${broadcastIndexBlock("i", out.shape, a.shape, "aIdx")}
  ${broadcastIndexBlock("i", out.shape, b.shape, "bIdx")}
  out[i] = a[aIdx] ${opStr} b[bIdx];
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.b), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // ---- Element-wise scalar binops (scalar baked into WGSL) ---------------
    case "mul_scalar":
    case "add_scalar": {
      const out = tof(op.out);
      const a = tof(op.a);
      const opStr = op.kind === "mul_scalar" ? "*" : "+";
      const total = shapeSize(out.shape);
      const lit = wgslLiteral(op.scalar, out.dtype);
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<${wgslDtype(a.dtype)}>;
@group(0) @binding(1) var<storage, read_write> out : array<${wgslDtype(out.dtype)}>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  out[i] = a[i] ${opStr} ${lit};
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // ---- Unary -------------------------------------------------------------
    case "sqrt":
    case "rsqrt":
    case "log":
    case "exp":
    case "relu": {
      const out = tof(op.out);
      const a = tof(op.a);
      const total = shapeSize(out.shape);
      const expr = op.kind === "sqrt" ? "sqrt(x)" : op.kind === "rsqrt" ? "1.0 / sqrt(x)" : op.kind === "log" ? "log(x)" : op.kind === "exp" ? "exp(x)" : (
        /* relu */
        "max(x, 0.0)"
      );
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<${wgslDtype(a.dtype)}>;
@group(0) @binding(1) var<storage, read_write> out : array<${wgslDtype(out.dtype)}>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  let x = a[i];
  out[i] = ${expr};
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // ---- Comparisons + select --------------------------------------------
    case "less":
    case "greater": {
      // Result is a bool tensor stored as u32 (0/1).
      const out = tof(op.out);
      const a = tof(op.a);
      const b = tof(op.b);
      const opStr = op.kind === "less" ? "<" : ">";
      const total = shapeSize(out.shape);
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<${wgslDtype(a.dtype)}>;
@group(0) @binding(1) var<storage, read> b : array<${wgslDtype(b.dtype)}>;
@group(0) @binding(2) var<storage, read_write> out : array<u32>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  ${broadcastIndexBlock("i", out.shape, a.shape, "aIdx")}
  ${broadcastIndexBlock("i", out.shape, b.shape, "bIdx")}
  out[i] = select(0u, 1u, a[aIdx] ${opStr} b[bIdx]);
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.b), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    case "where": {
      const out = tof(op.out);
      const cond = tof(op.cond);
      const a = tof(op.a);
      const b = tof(op.b);
      const total = shapeSize(out.shape);
      const wgsl = `
@group(0) @binding(0) var<storage, read> cond : array<u32>;
@group(0) @binding(1) var<storage, read> a : array<${wgslDtype(a.dtype)}>;
@group(0) @binding(2) var<storage, read> b : array<${wgslDtype(b.dtype)}>;
@group(0) @binding(3) var<storage, read_write> out : array<${wgslDtype(out.dtype)}>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  ${broadcastIndexBlock("i", out.shape, cond.shape, "cIdx")}
  ${broadcastIndexBlock("i", out.shape, a.shape, "aIdx")}
  ${broadcastIndexBlock("i", out.shape, b.shape, "bIdx")}
  out[i] = select(b[bIdx], a[aIdx], cond[cIdx] != 0u);
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.cond), buf(op.a), buf(op.b), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    case "relu_grad": {
      // dy passes through where the forward input x was positive, else 0.
      const out = tof(op.out);
      const total = shapeSize(out.shape);
      const wgsl = `
@group(0) @binding(0) var<storage, read> x : array<f32>;
@group(0) @binding(1) var<storage, read> dy : array<f32>;
@group(0) @binding(2) var<storage, read_write> out : array<f32>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  out[i] = select(0.0, dy[i], x[i] > 0.0);
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.x), buf(op.dy), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // ---- Reductions over last axis -----------------------------------------
    case "mean_last":
    case "sum_last": {
      // One thread per row of the last axis; sequential D-element loop.
      const a = tof(op.a);
      const D = a.shape[a.shape.length - 1];
      const outerSize = shapeSize(a.shape) / D;
      const divisor = op.kind === "mean_last" ? `f32(${D}u)` : "1.0";
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read_write> out : array<f32>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${outerSize}u) { return; }
  let base = i * ${D}u;
  var s : f32 = 0.0;
  for (var j : u32 = 0u; j < ${D}u; j = j + 1u) {
    s = s + a[base + j];
  }
  out[i] = s / ${divisor};
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: outerSize, workgroupSize: WG_SIZE };
    }
    // ---- Shape ---------------------------------------------------------------
    // reshape: no kernel needed if buffers can alias (shape change only). For
    // v1 simplicity we emit a memcpy-style kernel rather than aliasing buffers,
    // because aliasing complicates the buffer plan and we have memory headroom.
    case "reshape": {
      const out = tof(op.out);
      const a = tof(op.a);
      const total = shapeSize(out.shape);
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<${wgslDtype(a.dtype)}>;
@group(0) @binding(1) var<storage, read_write> out : array<${wgslDtype(out.dtype)}>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  out[i] = a[i];
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    case "transpose": {
      const out = tof(op.out);
      const a = tof(op.a);
      const total = shapeSize(out.shape);
      const aStrides = computeStrides(a.shape);
      // Decompose the flat output index into per-axis indices oIdx_*, then
      // gather: source axis k reads output axis perm^-1(k).
      const outDimDecls = decomposeFlatIndexBlock("i", out.shape, "oIdx");
      const srcExpr = [];
      for (let k = 0; k < a.shape.length; k++) {
        const srcAxis = op.perm.indexOf(k);
        srcExpr.push(`oIdx_${srcAxis} * ${aStrides[k]}u`);
      }
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<${wgslDtype(a.dtype)}>;
@group(0) @binding(1) var<storage, read_write> out : array<${wgslDtype(out.dtype)}>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  ${outDimDecls}
  let srcIdx = ${srcExpr.join(" + ")};
  out[i] = a[srcIdx];
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // ---- Linear algebra ----------------------------------------------------
    // matmul: a [..., M, K] · b [K, N] -> [..., M, N]. b is unbatched.
    case "matmul": {
      const a = tof(op.a);
      const b = tof(op.b);
      const M = a.shape[a.shape.length - 2];
      const K = a.shape[a.shape.length - 1];
      const N = b.shape[1];
      const batch = shapeSize(a.shape) / (M * K);
      const total = batch * M * N;
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read> b : array<f32>;
@group(0) @binding(2) var<storage, read_write> c : array<f32>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  let bi = i / ${M * N}u; // batch index
  let mn = i % ${M * N}u;
  let m = mn / ${N}u;
  let n = mn % ${N}u;
  let aBase = bi * ${M * K}u + m * ${K}u;
  var s : f32 = 0.0;
  for (var k : u32 = 0u; k < ${K}u; k = k + 1u) {
    s = s + a[aBase + k] * b[k * ${N}u + n];
  }
  c[i] = s;
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.b), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // matmul_batched: a [..., M, K] · b [..., K, N], batch dims match.
    case "matmul_batched": {
      const a = tof(op.a);
      const b = tof(op.b);
      const M = a.shape[a.shape.length - 2];
      const K = a.shape[a.shape.length - 1];
      const N = b.shape[b.shape.length - 1];
      const batch = shapeSize(a.shape) / (M * K);
      const total = batch * M * N;
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read> b : array<f32>;
@group(0) @binding(2) var<storage, read_write> c : array<f32>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  let bi = i / ${M * N}u;
  let mn = i % ${M * N}u;
  let m = mn / ${N}u;
  let n = mn % ${N}u;
  let aBase = bi * ${M * K}u + m * ${K}u;
  let bBase = bi * ${K * N}u;
  var s : f32 = 0.0;
  for (var k : u32 = 0u; k < ${K}u; k = k + 1u) {
    s = s + a[aBase + k] * b[bBase + k * ${N}u + n];
  }
  c[i] = s;
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.b), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // ---- One-hot ------------------------------------------------------------
    case "one_hot": {
      const out = tof(op.out);
      const indices = tof(op.indices);
      const total = shapeSize(out.shape);
      const depth = op.depth;
      const zeroLit = wgslLiteral(0, out.dtype);
      const oneLit = wgslLiteral(1, out.dtype);
      const wgsl = `
@group(0) @binding(0) var<storage, read> indices : array<i32>;
@group(0) @binding(1) var<storage, read_write> out : array<${wgslDtype(out.dtype)}>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  let outerIdx = i / ${depth}u;
  let depthIdx = i % ${depth}u;
  let tgt = u32(indices[outerIdx]);
  out[i] = select(${zeroLit}, ${oneLit}, tgt == depthIdx);
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.indices), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // ---- ML primitives -----------------------------------------------------
    case "log_softmax_last": {
      // Numerically-stable log-softmax: subtract the row max before exp.
      const a = tof(op.a);
      const D = a.shape[a.shape.length - 1];
      const outerSize = shapeSize(a.shape) / D;
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read_write> out : array<f32>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${outerSize}u) { return; }
  let base = i * ${D}u;
  var m : f32 = -1.0e30;
  for (var j : u32 = 0u; j < ${D}u; j = j + 1u) {
    let v = a[base + j];
    if (v > m) { m = v; }
  }
  var s : f32 = 0.0;
  for (var j : u32 = 0u; j < ${D}u; j = j + 1u) {
    s = s + exp(a[base + j] - m);
  }
  let logZ = m + log(s);
  for (var j : u32 = 0u; j < ${D}u; j = j + 1u) {
    out[base + j] = a[base + j] - logZ;
  }
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: outerSize, workgroupSize: WG_SIZE };
    }
    case "softmax_causal_last": {
      const a = tof(op.a);
      const T = a.shape[a.shape.length - 1];
      const outerSize = shapeSize(a.shape) / T;
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read_write> out : array<f32>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  // Each thread handles one (..., qpos)-row, softmaxing over kpos\u2208[0..qpos].
  ${GID_LINE}
  if (i >= ${outerSize}u) { return; }
  let qpos = i % ${T}u;
  let base = i * ${T}u;
  var m : f32 = -1.0e30;
  for (var k : u32 = 0u; k <= qpos; k = k + 1u) {
    let v = a[base + k];
    if (v > m) { m = v; }
  }
  var s : f32 = 0.0;
  for (var k : u32 = 0u; k <= qpos; k = k + 1u) {
    let e = exp(a[base + k] - m);
    out[base + k] = e;
    s = s + e;
  }
  for (var k : u32 = 0u; k <= qpos; k = k + 1u) {
    out[base + k] = out[base + k] / s;
  }
  for (var k : u32 = qpos + 1u; k < ${T}u; k = k + 1u) {
    out[base + k] = 0.0;
  }
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: outerSize, workgroupSize: WG_SIZE };
    }
    case "where_causal": {
      // Copies a, writing fillValue above the causal diagonal (kpos > qpos).
      const a = tof(op.a);
      const T = a.shape[a.shape.length - 1];
      const total = shapeSize(a.shape);
      const fillLit = wgslLiteral(op.fillValue, "f32");
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read_write> out : array<f32>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  let kpos = i % ${T}u;
  let qpos = (i / ${T}u) % ${T}u;
  if (kpos > qpos) {
    out[i] = ${fillLit};
  } else {
    out[i] = a[i];
  }
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // ---- Slicing -----------------------------------------------------------
    case "slice_last_range": {
      const out = tof(op.out);
      const a = tof(op.a);
      const D_in = a.shape[a.shape.length - 1];
      const D_out = op.end - op.start;
      const total = shapeSize(out.shape);
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<${wgslDtype(a.dtype)}>;
@group(0) @binding(1) var<storage, read_write> out : array<${wgslDtype(out.dtype)}>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  let outer = i / ${D_out}u;
  let inner = i % ${D_out}u;
  out[i] = a[outer * ${D_in}u + ${op.start}u + inner];
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // ---- Broadcast / un-broadcast (autograd infrastructure) ----------------
    case "broadcast_to": {
      const out = tof(op.out);
      const a = tof(op.a);
      const total = shapeSize(out.shape);
      const wgsl = `
@group(0) @binding(0) var<storage, read> a : array<${wgslDtype(a.dtype)}>;
@group(0) @binding(1) var<storage, read_write> out : array<${wgslDtype(out.dtype)}>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  ${broadcastIndexBlock("i", out.shape, a.shape, "srcIdx")}
  out[i] = a[srcIdx];
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // ---- Adam (fused per-element) -----------------------------------------
    case "adam_update_m": {
      // m' = b1*m + (1-b1)*g, with both coefficients baked into the shader.
      const out = tof(op.out);
      const total = shapeSize(out.shape);
      const b1 = op.b1;
      const oneMinusB1 = 1 - b1;
      const wgsl = `
@group(0) @binding(0) var<storage, read> m : array<f32>;
@group(0) @binding(1) var<storage, read> g : array<f32>;
@group(0) @binding(2) var<storage, read_write> out : array<f32>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  out[i] = ${wgslLiteral(b1, "f32")} * m[i] + ${wgslLiteral(oneMinusB1, "f32")} * g[i];
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.m), buf(op.g), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    case "adam_update_v": {
      // v' = b2*v + (1-b2)*g^2
      const out = tof(op.out);
      const total = shapeSize(out.shape);
      const b2 = op.b2;
      const oneMinusB2 = 1 - b2;
      const wgsl = `
@group(0) @binding(0) var<storage, read> v : array<f32>;
@group(0) @binding(1) var<storage, read> g : array<f32>;
@group(0) @binding(2) var<storage, read_write> out : array<f32>;
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  let gv = g[i];
  out[i] = ${wgslLiteral(b2, "f32")} * v[i] + ${wgslLiteral(oneMinusB2, "f32")} * gv * gv;
}`.trim();
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.v), buf(op.g), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    case "adam_update_p": {
      // p' = shrink*p - lrt*m/(sqrt(v)+eps). The weight-decay shrink is either
      // a baked constant or a runtime scalar buffer (scheduled LR), which
      // shifts the `out` binding index by one.
      const out = tof(op.out);
      const total = shapeSize(out.shape);
      const dynamicShrink = op.decayShrinkTensor !== null;
      const shrinkExpr = dynamicShrink ? "decayShrink[0]" : wgslLiteral(op.decayShrink, "f32");
      const shrinkBinding = dynamicShrink ? `@group(0) @binding(4) var<storage, read> decayShrink : array<f32>;
@group(0) @binding(5) var<storage, read_write> out : array<f32>;` : `@group(0) @binding(4) var<storage, read_write> out : array<f32>;`;
      const wgsl = `
@group(0) @binding(0) var<storage, read> p : array<f32>;
@group(0) @binding(1) var<storage, read> mNew : array<f32>;
@group(0) @binding(2) var<storage, read> vNew : array<f32>;
@group(0) @binding(3) var<storage, read> lrt : array<f32>;
${shrinkBinding}
@compute @workgroup_size(${WG_SIZE})
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  ${GID_LINE}
  if (i >= ${total}u) { return; }
  out[i] = ${shrinkExpr} * p[i] - lrt[0] * mNew[i] / (sqrt(vNew[i]) + ${wgslLiteral(op.eps, "f32")});
}`.trim();
      const bindings = dynamicShrink ? [buf(op.p), buf(op.mNew), buf(op.vNew), buf(op.lrt), buf(op.decayShrinkTensor), buf(op.out)] : [buf(op.p), buf(op.mNew), buf(op.vNew), buf(op.lrt), buf(op.out)];
      return { opIndex, opKind: op.kind, wgsl, bindings, threads: total, workgroupSize: WG_SIZE };
    }
    case "sum_to_shape": {
      const out = tof(op.out);
      const a = tof(op.a);
      const wgsl = emitSumToShape(a.shape, out.shape, a.dtype);
      const total = shapeSize(out.shape);
      return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.a), buf(op.out)], threads: total, workgroupSize: WG_SIZE };
    }
    // Fail loudly instead of returning undefined (mirrors the autograd
    // dispatcher's exhaustive default).
    default: {
      const _exhaustive = op;
      void _exhaustive;
      throw new Error(`codegen: unhandled op kind ${op.kind}`);
    }
  }
}
1456
+ function wgslDtype(d) {
1457
+ if (d === "bool") return "u32";
1458
+ return d;
1459
+ }
1460
+ function wgslLiteral(value, dtype) {
1461
+ if (dtype === "f32") {
1462
+ if (Number.isFinite(value)) {
1463
+ return value.toString().includes(".") || value.toString().includes("e") ? `${value}f` : `${value}.0f`;
1464
+ }
1465
+ return value > 0 ? "1.0e30f" : "-1.0e30f";
1466
+ }
1467
+ if (dtype === "i32") return `${Math.trunc(value)}i`;
1468
+ return value ? "1u" : "0u";
1469
+ }
1470
+ function castFromI32(expr, dtype) {
1471
+ if (dtype === "f32") return `f32(${expr})`;
1472
+ if (dtype === "i32") return `i32(${expr})`;
1473
+ return `u32(${expr})`;
1474
+ }
1475
+ function computeStrides(shape) {
1476
+ const strides = new Array(shape.length).fill(1);
1477
+ for (let i = shape.length - 2; i >= 0; i--) {
1478
+ strides[i] = strides[i + 1] * shape[i + 1];
1479
+ }
1480
+ return strides;
1481
+ }
1482
+ function decomposeFlatIndexBlock(flatVar, shape, outVar) {
1483
+ if (shape.length === 0) return ` let ${outVar}_0 : u32 = 0u;`;
1484
+ const strides = computeStrides(shape);
1485
+ const lines = [];
1486
+ let remaining = flatVar;
1487
+ for (let i = 0; i < shape.length; i++) {
1488
+ if (i === shape.length - 1) {
1489
+ lines.push(` let ${outVar}_${i} = ${remaining};`);
1490
+ } else {
1491
+ lines.push(` let ${outVar}_${i} = ${remaining} / ${strides[i]}u;`);
1492
+ const newRem = `${outVar}_rem${i}`;
1493
+ lines.push(` let ${newRem} = ${remaining} % ${strides[i]}u;`);
1494
+ remaining = newRem;
1495
+ }
1496
+ }
1497
+ return lines.join("\n");
1498
+ }
1499
+ function broadcastIndexBlock(flatVar, outShape, srcShape, srcVar) {
1500
+ const prefix = `${srcVar}_ax`;
1501
+ const decompose = decomposeFlatIndexBlock(flatVar, outShape, prefix);
1502
+ const offset = outShape.length - srcShape.length;
1503
+ if (srcShape.length === 0) {
1504
+ return `${decompose}
1505
+ let ${srcVar} : u32 = 0u;`;
1506
+ }
1507
+ const srcStrides = computeStrides(srcShape);
1508
+ const terms = [];
1509
+ for (let i = 0; i < srcShape.length; i++) {
1510
+ const outAxis = i + offset;
1511
+ const srcDim = srcShape[i];
1512
+ const term = srcDim === 1 ? "0u" : `${prefix}_${outAxis} * ${srcStrides[i]}u`;
1513
+ terms.push(term);
1514
+ }
1515
+ return `${decompose}
1516
+ let ${srcVar} = ${terms.join(" + ")};`;
1517
+ }
1518
+ function emitSumToShape(srcShape, tgtShape, dtype) {
1519
+ const srcStrides = computeStrides(srcShape);
1520
+ const tgtStrides = computeStrides(tgtShape);
1521
+ const offset = srcShape.length - tgtShape.length;
1522
+ const decompose = decomposeFlatIndexBlock("i", tgtShape, "tgt");
1523
+ const reducedAxes = [];
1524
+ for (let k = 0; k < srcShape.length; k++) {
1525
+ if (k < offset) {
1526
+ reducedAxes.push(k);
1527
+ continue;
1528
+ }
1529
+ const tDim = tgtShape[k - offset];
1530
+ const sDim = srcShape[k];
1531
+ if (tDim === 1 && sDim > 1) reducedAxes.push(k);
1532
+ }
1533
+ const baseTerms = [];
1534
+ for (let k = 0; k < srcShape.length; k++) {
1535
+ if (reducedAxes.includes(k)) continue;
1536
+ const tAxis = k - offset;
1537
+ baseTerms.push(`tgt_${tAxis} * ${srcStrides[k]}u`);
1538
+ }
1539
+ const baseExpr = baseTerms.length > 0 ? baseTerms.join(" + ") : "0u";
1540
+ const indent = (depth) => " ".repeat(depth + 1);
1541
+ const loops = [];
1542
+ for (let depth = 0; depth < reducedAxes.length; depth++) {
1543
+ const k = reducedAxes[depth];
1544
+ const dim = srcShape[k];
1545
+ loops.push(`${indent(depth)}for (var r${k} : u32 = 0u; r${k} < ${dim}u; r${k} = r${k} + 1u) {`);
1546
+ }
1547
+ const reducedTerms = reducedAxes.map((k) => `r${k} * ${srcStrides[k]}u`);
1548
+ const fullExpr = reducedTerms.length > 0 ? `${baseExpr} + ${reducedTerms.join(" + ")}` : baseExpr;
1549
+ loops.push(`${indent(reducedAxes.length)}s = s + a[${fullExpr}];`);
1550
+ for (let depth = reducedAxes.length - 1; depth >= 0; depth--) {
1551
+ loops.push(`${indent(depth)}}`);
1552
+ }
1553
+ const total = tgtShape.length === 0 ? 1 : tgtStrides[0] * tgtShape[0];
1554
+ const loopBody = reducedAxes.length === 0 ? ` s = s + a[${baseExpr}];` : loops.join("\n");
1555
+ return `
1556
+ @group(0) @binding(0) var<storage, read> a : array<${wgslDtype(dtype)}>;
1557
+ @group(0) @binding(1) var<storage, read_write> out : array<${wgslDtype(dtype)}>;
1558
+ @compute @workgroup_size(${WG_SIZE})
1559
+ fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
1560
+ ${GID_LINE}
1561
+ if (i >= ${total}u) { return; }
1562
+ ${decompose}
1563
+ var s : ${wgslDtype(dtype)} = ${dtype === "f32" ? "0.0f" : dtype === "i32" ? "0i" : "0u"};
1564
+ ${loopBody}
1565
+ out[i] = s;
1566
+ }`.trim();
1567
+ }
1568
+
1569
+ // src/runtime.ts
1570
+ var Captures = class {
1571
+ constructor(shapes, data) {
1572
+ this.shapes = shapes;
1573
+ this.data = data;
1574
+ }
1575
+ shapes;
1576
+ data;
1577
+ get(name) {
1578
+ const d = this.data.get(name);
1579
+ if (!d) {
1580
+ const known = [...this.data.keys()].sort().join(", ");
1581
+ const detail = known ? `Known this call: ${known}` : `(call run/step with { withCaptures: true } to populate)`;
1582
+ throw new Error(`Captures.get: '${name}' not present. ${detail}`);
1583
+ }
1584
+ return d;
1585
+ }
1586
+ shapeOf(name) {
1587
+ const s = this.shapes[name];
1588
+ if (!s) {
1589
+ const known = Object.keys(this.shapes).sort().join(", ") || "(none registered)";
1590
+ throw new Error(`Captures.shapeOf: '${name}' not registered. Known: ${known}`);
1591
+ }
1592
+ return s;
1593
+ }
1594
+ has(name) {
1595
+ return this.data.has(name);
1596
+ }
1597
+ names() {
1598
+ return [...this.data.keys()].sort();
1599
+ }
1600
+ };
1601
+ var STORAGE_RW = 128 | 8 | 4;
1602
+ var READBACK = 1 | 8;
1603
+
1604
+ // src/module.ts
1605
+ var init = {
1606
+ randn: (opts = {}) => ({ kind: "randn", scale: opts.scale ?? 0.02 }),
1607
+ kaiming: (opts = {}) => opts.gain !== void 0 ? { kind: "kaiming", gain: opts.gain } : { kind: "kaiming" },
1608
+ literal: (data) => ({ kind: "literal", data })
1609
+ };
1610
+ function boxMuller() {
1611
+ return Math.sqrt(-2 * Math.log(Math.max(1e-10, Math.random()))) * Math.cos(2 * Math.PI * Math.random());
1612
+ }
1613
+ function randnFn(scale) {
1614
+ return (size) => {
1615
+ const arr = new Float32Array(size);
1616
+ for (let i = 0; i < size; i++) arr[i] = boxMuller() * scale;
1617
+ return arr;
1618
+ };
1619
+ }
1620
+ function resolveInit(spec) {
1621
+ if (!spec || spec === "randn") return randnFn(0.02);
1622
+ if (spec === "zeros") return (size) => new Float32Array(size);
1623
+ if (spec === "ones") return (size) => {
1624
+ const a = new Float32Array(size);
1625
+ a.fill(1);
1626
+ return a;
1627
+ };
1628
+ switch (spec.kind) {
1629
+ case "randn":
1630
+ return randnFn(spec.scale);
1631
+ case "kaiming": {
1632
+ const gain = spec.gain ?? Math.sqrt(2);
1633
+ return (size, shape) => {
1634
+ const fanIn = shape[0] ?? size;
1635
+ const std = gain / Math.sqrt(fanIn);
1636
+ const arr = new Float32Array(size);
1637
+ for (let i = 0; i < size; i++) arr[i] = boxMuller() * std;
1638
+ return arr;
1639
+ };
1640
+ }
1641
+ case "literal": {
1642
+ const data = spec.data;
1643
+ return (size) => {
1644
+ if (data.length !== size) {
1645
+ throw new Error(`init.literal: data length ${data.length} doesn't match param size ${size}`);
1646
+ }
1647
+ return new Float32Array(data);
1648
+ };
1649
+ }
1650
+ }
1651
+ }
1652
+ function resolveDecay(opts) {
1653
+ if (opts?.decay !== void 0) return opts.decay;
1654
+ const spec = opts?.init ?? "randn";
1655
+ return spec !== "zeros" && spec !== "ones";
1656
+ }
1657
+ var ParamSentinel = class {
1658
+ constructor(shape, dtype, initFn, decay) {
1659
+ this.shape = shape;
1660
+ this.dtype = dtype;
1661
+ this.initFn = initFn;
1662
+ this.decay = decay;
1663
+ }
1664
+ shape;
1665
+ dtype;
1666
+ initFn;
1667
+ decay;
1668
+ };
1669
+ var Module = class {
1670
+ /**
1671
+ * Declare a learnable parameter at this module. Must be called from inside
1672
+ * the constructor (typically as a field assignment). Returns a placeholder
1673
+ * that gets replaced with a real Tensor at compile time.
1674
+ *
1675
+ * The parameter's name is auto-derived from its property path in the model
1676
+ * tree (e.g. `layers.0.attn.W_q`). Init metadata travels with the param;
1677
+ * call `compiled.uploadInitialParams()` to apply it after compile.
1678
+ */
1679
+ param(shape, opts) {
1680
+ const dtype = opts?.dtype ?? "f32";
1681
+ return new ParamSentinel(shape, dtype, resolveInit(opts?.init), resolveDecay(opts));
1682
+ }
1683
+ };
1684
+ function materializeParams(root) {
1685
+ const tensors = {};
1686
+ const initFns = {};
1687
+ const decayFlags = {};
1688
+ visit(root, "", (path, val, owner, key) => {
1689
+ if (val instanceof ParamSentinel) {
1690
+ const t = paramInput(path, val.shape, val.dtype);
1691
+ owner[key] = t;
1692
+ tensors[path] = t;
1693
+ initFns[path] = val.initFn;
1694
+ decayFlags[path] = val.decay;
1695
+ }
1696
+ });
1697
+ return { tensors, initFns, decayFlags };
1698
+ }
1699
+ function visit(node, path, visitor) {
1700
+ if (node === null || node === void 0) return;
1701
+ if (typeof node !== "object") return;
1702
+ if (node instanceof Module) {
1703
+ for (const key of Object.keys(node)) {
1704
+ const child = node[key];
1705
+ const childPath = path ? `${path}.${key}` : key;
1706
+ visitChild(child, childPath, node, key, visitor);
1707
+ }
1708
+ return;
1709
+ }
1710
+ if (Array.isArray(node)) {
1711
+ node.forEach((item, i) => {
1712
+ const childPath = path ? `${path}.${i}` : String(i);
1713
+ visitChild(item, childPath, node, i, visitor);
1714
+ });
1715
+ return;
1716
+ }
1717
+ }
1718
+ function visitChild(child, path, owner, key, visitor) {
1719
+ if (child instanceof Module || Array.isArray(child)) {
1720
+ visit(child, path, visitor);
1721
+ } else {
1722
+ visitor(path, child, owner, key);
1723
+ }
1724
+ }
1725
+
1726
+ // src/worker-protocol.ts
1727
+ function transferablesOfRecord(rec) {
1728
+ const out = [];
1729
+ for (const v of Object.values(rec)) out.push(v.buffer);
1730
+ return out;
1731
+ }
1732
+ function reconstituteError(w) {
1733
+ const err = new Error(w.message);
1734
+ err.name = w.name;
1735
+ err.stack = w.stack;
1736
+ return err;
1737
+ }
1738
+
1739
+ // src/worker-proxy.ts
1740
+ var WorkerProxy = class {
1741
+ worker;
1742
+ nextId = 1;
1743
+ pending = /* @__PURE__ */ new Map();
1744
+ terminated = false;
1745
+ constructor(workerSource) {
1746
+ const blob = new Blob([workerSource], { type: "application/javascript" });
1747
+ const url = URL.createObjectURL(blob);
1748
+ this.worker = new Worker(url, { type: "module" });
1749
+ URL.revokeObjectURL(url);
1750
+ this.worker.onmessage = (ev) => {
1751
+ const reply = ev.data;
1752
+ const handlers = this.pending.get(reply.id);
1753
+ if (!handlers) return;
1754
+ this.pending.delete(reply.id);
1755
+ if (reply.ok) handlers.resolve(reply.result);
1756
+ else handlers.reject(reconstituteError(reply.error));
1757
+ };
1758
+ this.worker.onerror = (ev) => {
1759
+ const err = new Error(`tensorgrad worker error: ${ev.message || "unknown"}`);
1760
+ const wire = { name: "WorkerError", message: err.message, stack: err.stack ?? "" };
1761
+ for (const handlers of this.pending.values()) handlers.reject(reconstituteError(wire));
1762
+ this.pending.clear();
1763
+ };
1764
+ }
1765
+ /** Send a request and await its matching response. `transfer` lists the
1766
+ * ArrayBuffers to move (zero-copy) into the worker. */
1767
+ request(req, transfer = []) {
1768
+ if (this.terminated) return Promise.reject(new Error("tensorgrad: worker has been terminated"));
1769
+ const id = this.nextId++;
1770
+ return new Promise((resolve, reject) => {
1771
+ this.pending.set(id, { resolve, reject });
1772
+ this.worker.postMessage({ ...req, id }, transfer);
1773
+ });
1774
+ }
1775
+ /** Fire-and-forget variant for cases where the caller doesn't need a reply
1776
+ * (currently unused; keep for symmetry / future use). */
1777
+ send(req, transfer = []) {
1778
+ if (this.terminated) return;
1779
+ const id = this.nextId++;
1780
+ this.worker.postMessage({ ...req, id }, transfer);
1781
+ }
1782
+ terminate() {
1783
+ if (this.terminated) return;
1784
+ this.terminated = true;
1785
+ this.worker.terminate();
1786
+ const err = new Error("tensorgrad: worker terminated");
1787
+ for (const handlers of this.pending.values()) handlers.reject(err);
1788
+ this.pending.clear();
1789
+ }
1790
+ };
1791
+
1792
+ // src/compile.ts
1793
+ function compileToIR(traceFn) {
1794
+ const graph = trace(traceFn);
1795
+ const { paramGrads, loss } = appendGrad(graph);
1796
+ const plan = planBuffers(graph, paramGrads);
1797
+ const kernels = emitKernels(graph, plan);
1798
+ return { graph, paramGrads, loss, plan, kernels };
1799
+ }
1800
+ async function compileModule(modelFactory, forward, opts = {}) {
1801
+ const { graph, materialized } = traceModule(modelFactory, forward, opts.inputs ?? {});
1802
+ const { paramGrads, loss } = appendGrad(graph);
1803
+ const adamResult = opts.adam ? appendAdam(graph, paramGrads, materialized.tensors, opts.adam, materialized.decayFlags) : void 0;
1804
+ const plan = planBuffers(graph, paramGrads, adamResult?.writebacks ?? []);
1805
+ const kernels = emitKernels(graph, plan);
1806
+ const ir = { graph, paramGrads, loss, plan, kernels };
1807
+ const initialParams = buildInitialParams(plan, materialized.initFns);
1808
+ const proxy = new WorkerProxy('// src/runtime.ts\nvar Captures = class {\n constructor(shapes, data) {\n this.shapes = shapes;\n this.data = data;\n }\n shapes;\n data;\n get(name) {\n const d = this.data.get(name);\n if (!d) {\n const known = [...this.data.keys()].sort().join(", ");\n const detail = known ? `Known this call: ${known}` : `(call run/step with { withCaptures: true } to populate)`;\n throw new Error(`Captures.get: \'${name}\' not present. ${detail}`);\n }\n return d;\n }\n shapeOf(name) {\n const s = this.shapes[name];\n if (!s) {\n const known = Object.keys(this.shapes).sort().join(", ") || "(none registered)";\n throw new Error(`Captures.shapeOf: \'${name}\' not registered. Known: ${known}`);\n }\n return s;\n }\n has(name) {\n return this.data.has(name);\n }\n names() {\n return [...this.data.keys()].sort();\n }\n};\nvar STORAGE_RW = 128 | 8 | 4;\nvar READBACK = 1 | 8;\nasync function createRuntime(plan, kernels, lossBufferId, opts = {}) {\n const device2 = opts.device ?? await acquireDevice();\n const queue = device2.queue;\n const buffers = /* @__PURE__ */ new Map();\n const ownedBufferIds = /* @__PURE__ */ new Set();\n const sharedParams = opts.sharedParams;\n for (const spec of plan.buffers) {\n const shared = spec.kind === "param" ? sharedParams?.get(spec.name) : void 0;\n if (shared) {\n if (shared.size !== spec.byteSize) {\n throw new Error(\n `sharedParams: size mismatch for \'${spec.name}\' \\u2014 supplied ${shared.size} bytes, compiled graph expects ${spec.byteSize}.`\n );\n }\n buffers.set(spec.id, shared);\n continue;\n }\n const buf = device2.createBuffer({\n size: spec.byteSize,\n usage: STORAGE_RW,\n label: spec.name ?? 
`t${spec.id}-${spec.kind}`\n });\n buffers.set(spec.id, buf);\n ownedBufferIds.add(spec.id);\n if (spec.kind === "state") fillStateBuffer(spec, buf);\n }\n const moduleCache = /* @__PURE__ */ new Map();\n const pipelines = [];\n const probes = [];\n for (const k of kernels) {\n if (!k.wgsl) {\n pipelines.push(null);\n continue;\n }\n let module = moduleCache.get(k.wgsl);\n if (!module) {\n module = device2.createShaderModule({ code: k.wgsl, label: k.opKind });\n moduleCache.set(k.wgsl, module);\n }\n device2.pushErrorScope("validation");\n const pipeline = device2.createComputePipeline({\n layout: "auto",\n compute: { module, entryPoint: "main" },\n label: k.opKind\n });\n pipelines.push(pipeline);\n probes.push(device2.popErrorScope().then((err) => err ? { k, module, err } : null));\n }\n const probeResults = await Promise.all(probes);\n const failures = probeResults.filter((p) => p != null);\n if (failures.length > 0) {\n const reports = [];\n for (const { k, module, err } of failures) {\n const info = await module.getCompilationInfo();\n const messages = info.messages.map((m) => ` L${m.lineNum}:${m.linePos} [${m.type}] ${m.message}`).join("\\n");\n reports.push(\n `[shader compile error] ${k.opKind} (op #${k.opIndex}): ${err.message}\n` + (messages || " (no compilation messages)") + `\n--- WGSL ---\n${k.wgsl}\n-----------`\n );\n }\n console.error(reports.join("\\n\\n"));\n throw new Error(`tensorgrad: ${failures.length} shader(s) failed to compile (see console).`);\n }\n const bindGroups = kernels.map((k, i) => {\n const pipeline = pipelines[i];\n if (!pipeline) return null;\n return device2.createBindGroup({\n layout: pipeline.getBindGroupLayout(0),\n entries: k.bindings.map((bufId, idx) => ({\n binding: idx,\n resource: { buffer: buffers.get(bufId) }\n }))\n });\n });\n const outputSpec = plan.buffers[lossBufferId];\n const outputReadback = device2.createBuffer({ size: outputSpec.byteSize, usage: READBACK });\n let captureStaging = null;\n function 
ensureCaptureStaging() {\n if (captureStaging) return captureStaging;\n let totalBytes = 0;\n const slices = [];\n for (const [name, bufId] of plan.capturesByName) {\n const spec = plan.buffers[bufId];\n slices.push({ name, bufId, offset: totalBytes, byteSize: spec.byteSize });\n totalBytes += spec.byteSize;\n }\n const buffer = device2.createBuffer({ size: totalBytes, usage: READBACK, label: "captures-staging" });\n captureStaging = { buffer, slices };\n return captureStaging;\n }\n let pending = Promise.resolve();\n async function dispatch(inputs, opts2) {\n const turn = pending.catch(() => {\n }).then(() => dispatchUnsynchronized(inputs, opts2));\n pending = turn;\n return turn;\n }\n async function dispatchUnsynchronized(inputs, opts2) {\n const wantCaptures = opts2.wantCaptures;\n if (wantCaptures && plan.capturesByName.size === 0) {\n throw new Error(\n `withCaptures=true but no capture(...) calls were registered during the trace. Add capture(\'name\', tensor) inside your forward pass for the intermediates you want read back.`\n );\n }\n for (const [name, bufId] of plan.inputsByName) {\n const data = inputs[name];\n if (!data) throw new Error(`tensorgrad: missing input \'${name}\'`);\n const expectedBytes = plan.buffers[bufId].byteSize;\n if (data.byteLength !== expectedBytes) {\n throw new Error(`tensorgrad: input \'${name}\' has ${data.byteLength} bytes, expected ${expectedBytes}`);\n }\n queue.writeBuffer(buffers.get(bufId), 0, data);\n }\n const encoder = device2.createCommandEncoder({ label: "tensorgrad-step" });\n for (let i = 0; i < kernels.length; i++) {\n const k = kernels[i];\n if (!k.wgsl || k.threads === 0) continue;\n const pipeline = pipelines[i];\n const bindGroup = bindGroups[i];\n const pass = encoder.beginComputePass({ label: k.opKind });\n pass.setPipeline(pipeline);\n pass.setBindGroup(0, bindGroup);\n const wgCount = Math.max(1, Math.ceil(k.threads / k.workgroupSize));\n const MAX_X = 65535;\n const wgX = Math.min(wgCount, MAX_X);\n const 
wgY = Math.ceil(wgCount / MAX_X);\n pass.dispatchWorkgroups(wgX, wgY, 1);\n pass.end();\n }\n for (const wb of plan.writebacks) {\n encoder.copyBufferToBuffer(buffers.get(wb.source), 0, buffers.get(wb.dest), 0, wb.bytes);\n }\n encoder.copyBufferToBuffer(buffers.get(lossBufferId), 0, outputReadback, 0, outputSpec.byteSize);\n let layout = null;\n if (wantCaptures) {\n layout = ensureCaptureStaging();\n for (const s of layout.slices) {\n encoder.copyBufferToBuffer(buffers.get(s.bufId), 0, layout.buffer, s.offset, s.byteSize);\n }\n }\n queue.submit([encoder.finish()]);\n if (!opts2.readback) return null;\n await outputReadback.mapAsync(GPUMapMode.READ);\n const output = new Float32Array(outputReadback.getMappedRange().slice(0));\n outputReadback.unmap();\n const captures = /* @__PURE__ */ new Map();\n if (layout) {\n await layout.buffer.mapAsync(GPUMapMode.READ);\n const range = layout.buffer.getMappedRange();\n for (const s of layout.slices) {\n captures.set(s.name, new Float32Array(range, s.offset, s.byteSize / 4).slice());\n }\n layout.buffer.unmap();\n }\n return { output, captures };\n }\n async function step(inputs, opts2) {\n if (opts2?.readLoss === false) {\n await dispatch(inputs, { wantCaptures: false, readback: false });\n return;\n }\n const r = await dispatch(inputs, { wantCaptures: opts2?.withCaptures === true, readback: true });\n if (opts2?.withCaptures) return { loss: r.output[0], captures: new Captures(captureShapes, r.captures) };\n return r.output[0];\n }\n async function readLoss() {\n const turn = pending.catch(() => {\n }).then(async () => {\n await outputReadback.mapAsync(GPUMapMode.READ);\n const v = new Float32Array(outputReadback.getMappedRange())[0];\n outputReadback.unmap();\n return v;\n });\n pending = turn;\n return turn;\n }\n async function run(inputs, opts2) {\n const r = await dispatch(inputs, { wantCaptures: opts2?.withCaptures === true, readback: true });\n if (opts2?.withCaptures) return { output: r.output, captures: new 
Captures(captureShapes, r.captures) };\n return r.output;\n }\n function uploadParams(params2, opts2) {\n const partial = opts2?.partial ?? false;\n for (const name of Object.keys(params2)) {\n if (!plan.paramsByName.has(name)) {\n throw new Error(\n `uploadParams: unknown param \'${name}\'. Known: ${[...plan.paramsByName.keys()].sort().join(", ")}`\n );\n }\n }\n if (!partial) {\n for (const name of plan.paramsByName.keys()) {\n if (!(name in params2)) {\n throw new Error(\n `uploadParams: missing param \'${name}\'. Pass { partial: true } if you mean to update only some params.`\n );\n }\n }\n }\n for (const [name, bufId] of plan.paramsByName) {\n const data = params2[name];\n if (!data) continue;\n const expected = plan.buffers[bufId].byteSize / 4;\n if (data.length !== expected) {\n throw new Error(`uploadParams: \'${name}\' has ${data.length} elements, expected ${expected}`);\n }\n queue.writeBuffer(buffers.get(bufId), 0, data);\n }\n }\n async function downloadFromMap(map) {\n const stagings = [];\n const encoder = device2.createCommandEncoder({ label: "tensorgrad-download" });\n for (const [name, bufId] of map) {\n const spec = plan.buffers[bufId];\n const staging = device2.createBuffer({ size: spec.byteSize, usage: READBACK });\n encoder.copyBufferToBuffer(buffers.get(bufId), 0, staging, 0, spec.byteSize);\n stagings.push({ name, buf: staging, bytes: spec.byteSize });\n }\n queue.submit([encoder.finish()]);\n const out = {};\n for (const s of stagings) {\n await s.buf.mapAsync(GPUMapMode.READ);\n out[s.name] = new Float32Array(s.buf.getMappedRange().slice(0));\n s.buf.unmap();\n s.buf.destroy();\n }\n return out;\n }\n function fillStateBuffer(spec, target) {\n const elements = spec.byteSize / 4;\n const init = spec.dtype === "f32" ? new Float32Array(elements).fill(spec.initValue ?? 0) : new Int32Array(elements).fill(Math.trunc(spec.initValue ?? 
0));\n queue.writeBuffer(target, 0, init);\n }\n function resetOptimizerState() {\n for (const spec of plan.buffers) {\n if (spec.kind === "state") fillStateBuffer(spec, buffers.get(spec.id));\n }\n }\n const params = /* @__PURE__ */ new Map();\n for (const [name, bufId] of plan.paramsByName) {\n params.set(name, buffers.get(bufId));\n }\n const captureShapes = {};\n for (const [name, bufId] of plan.capturesByName) {\n captureShapes[name] = [...plan.buffers[bufId].shape];\n }\n const outputShape = [...plan.buffers[lossBufferId].shape];\n const destroy = () => {\n for (const [id, b] of buffers) {\n if (ownedBufferIds.has(id)) b.destroy();\n }\n outputReadback.destroy();\n if (captureStaging) captureStaging.buffer.destroy();\n };\n return {\n device: device2,\n params,\n outputShape,\n uploadParams,\n downloadParams: () => downloadFromMap(plan.paramsByName),\n downloadParamGrads: () => downloadFromMap(plan.paramGradsByName),\n step,\n run,\n readLoss,\n resetOptimizerState,\n destroy\n };\n}\nasync function acquireDevice() {\n if (typeof navigator === "undefined" || !navigator.gpu) {\n throw new Error("tensorgrad: WebGPU not available in this environment");\n }\n const adapter = await navigator.gpu.requestAdapter();\n if (!adapter) throw new Error("tensorgrad: no WebGPU adapter");\n return await adapter.requestDevice();\n}\n\n// src/adam.ts\nfunction resolveLR(schedule, step) {\n if (typeof schedule === "number") return schedule;\n switch (schedule.kind) {\n case "constant":\n return schedule.value;\n case "linearDecay": {\n const f = Math.min(step / schedule.steps, 1);\n return schedule.peak + (schedule.final - schedule.peak) * f;\n }\n case "cosineDecay": {\n const f = Math.min(step / schedule.steps, 1);\n return schedule.final + 0.5 * (schedule.peak - schedule.final) * (1 + Math.cos(Math.PI * f));\n }\n case "warmup": {\n if (step <= schedule.warmupSteps) return schedule.peakLr * (step / schedule.warmupSteps);\n return resolveLR(schedule.after, step - 
schedule.warmupSteps);\n }\n }\n}\n\n// src/worker-protocol.ts\nfunction wireError(e) {\n if (e instanceof Error) {\n return { name: e.name, message: e.message, stack: e.stack ?? "" };\n }\n return { name: "Error", message: String(e), stack: "" };\n}\n\n// src/worker.ts\nvar graphs = /* @__PURE__ */ new Map();\nvar device = null;\nasync function ensureDevice() {\n if (device) return device;\n if (typeof navigator === "undefined" || !navigator.gpu) {\n throw new Error("tensorgrad worker: WebGPU not available in this environment");\n }\n const adapter = await navigator.gpu.requestAdapter();\n if (!adapter) throw new Error("tensorgrad worker: no WebGPU adapter");\n device = await adapter.requestDevice();\n return device;\n}\nasync function handleCreateRuntime(payload) {\n const dev = await ensureDevice();\n const { graph, plan, kernels } = payload.ir;\n const outputTensorId = graph.outputs[0];\n const outputBufferId = plan.tensorToBuffer.get(outputTensorId);\n const opts = { device: dev };\n const runtime = await createRuntime(plan, kernels, outputBufferId, opts);\n if (Object.keys(payload.initialParams).length > 0) {\n runtime.uploadParams(payload.initialParams);\n }\n const captureShapes = {};\n for (const [name, bufId] of plan.capturesByName) {\n captureShapes[name] = [...plan.buffers[bufId].shape];\n }\n const slot = {\n runtime,\n paramNames: [...plan.paramsByName.keys()],\n outputShape: [...runtime.outputShape],\n kernelCount: kernels.filter((k) => k.wgsl).length,\n captureShapes,\n adam: payload.adam ? 
createAdamState(payload.adam) : null\n };\n graphs.set(payload.graphId, slot);\n return {\n paramNames: [...slot.paramNames],\n outputShape: slot.outputShape,\n kernelCount: slot.kernelCount,\n captureShapes: slot.captureShapes\n };\n}\nasync function handleCompileForward(payload) {\n const dev = await ensureDevice();\n const parent = graphs.get(payload.parentGraphId);\n if (!parent) throw new Error(`compileForward: parent graph ${payload.parentGraphId} not found`);\n const { graph, plan, kernels } = payload.ir;\n const outputTensorId = graph.outputs[0];\n const outputBufferId = plan.tensorToBuffer.get(outputTensorId);\n const opts = { device: dev, sharedParams: parent.runtime.params };\n const runtime = await createRuntime(plan, kernels, outputBufferId, opts);\n const captureShapes = {};\n for (const [name, bufId] of plan.capturesByName) {\n captureShapes[name] = [...plan.buffers[bufId].shape];\n }\n const slot = {\n runtime,\n paramNames: [...plan.paramsByName.keys()],\n outputShape: [...runtime.outputShape],\n kernelCount: kernels.filter((k) => k.wgsl).length,\n captureShapes,\n adam: null\n };\n graphs.set(payload.graphId, slot);\n return {\n paramNames: [...slot.paramNames],\n outputShape: slot.outputShape,\n kernelCount: slot.kernelCount,\n captureShapes: slot.captureShapes\n };\n}\nfunction createAdamState(cfg) {\n return {\n config: cfg,\n t: 0,\n lrtBuf: new Float32Array(1),\n decayShrinkBuf: cfg.decayShrinkInputName ? 
new Float32Array(1) : null\n };\n}\nfunction injectAdamScalars(slot, inputs) {\n const a = slot.adam;\n if (!a) return inputs;\n a.t++;\n const lrNow = resolveLR(a.config.lr, a.t);\n a.lrtBuf[0] = lrNow * Math.sqrt(1 - Math.pow(a.config.b2, a.t)) / (1 - Math.pow(a.config.b1, a.t));\n const merged = { ...inputs, [a.config.lrtInputName]: a.lrtBuf };\n if (a.decayShrinkBuf && a.config.decayShrinkInputName) {\n a.decayShrinkBuf[0] = 1 - lrNow * a.config.weightDecay;\n merged[a.config.decayShrinkInputName] = a.decayShrinkBuf;\n }\n return merged;\n}\nasync function handleStep(payload) {\n const slot = mustGet(payload.graphId);\n const merged = injectAdamScalars(slot, payload.inputs);\n if (payload.withCaptures) {\n const r = await slot.runtime.step(merged, { withCaptures: true });\n return { loss: r.loss, captures: capturesToRecord(r.captures, slot.captureShapes) };\n }\n const loss = await slot.runtime.step(merged);\n return { loss, captures: null };\n}\nasync function handleRun(payload) {\n const slot = mustGet(payload.graphId);\n if (payload.withCaptures) {\n const r = await slot.runtime.run(payload.inputs, { withCaptures: true });\n return { output: r.output, captures: capturesToRecord(r.captures, slot.captureShapes) };\n }\n const output = await slot.runtime.run(payload.inputs);\n return { output, captures: null };\n}\nfunction capturesToRecord(captures, shapes) {\n const out = {};\n for (const name of Object.keys(shapes)) {\n if (captures.has(name)) out[name] = captures.get(name);\n }\n return out;\n}\nfunction handleUploadParams(payload) {\n const slot = mustGet(payload.graphId);\n slot.runtime.uploadParams(payload.params, { partial: payload.partial });\n}\nasync function handleDownloadParams(payload) {\n const slot = mustGet(payload.graphId);\n return { params: await slot.runtime.downloadParams() };\n}\nasync function handleDownloadParamGrads(payload) {\n const slot = mustGet(payload.graphId);\n return { params: await slot.runtime.downloadParamGrads() 
};\n}\nfunction handleResetOptimizer(payload) {\n const slot = mustGet(payload.graphId);\n slot.runtime.resetOptimizerState();\n if (slot.adam) slot.adam.t = 0;\n}\nfunction handleDestroy(payload) {\n const slot = graphs.get(payload.graphId);\n if (!slot) return;\n slot.runtime.destroy();\n graphs.delete(payload.graphId);\n}\nfunction mustGet(graphId) {\n const slot = graphs.get(graphId);\n if (!slot) throw new Error(`tensorgrad worker: graph ${graphId} not found`);\n return slot;\n}\nself.onmessage = async (ev) => {\n const req = ev.data;\n try {\n let result;\n let transferList = [];\n switch (req.kind) {\n case "createRuntime":\n result = await handleCreateRuntime(req.payload);\n break;\n case "compileForward":\n result = await handleCompileForward(req.payload);\n break;\n case "step":\n result = await handleStep(req.payload);\n transferList = collectTransfers(result.captures);\n break;\n case "run": {\n const r = await handleRun(req.payload);\n result = r;\n transferList = [r.output.buffer, ...collectTransfers(r.captures)];\n break;\n }\n case "uploadParams":\n handleUploadParams(req.payload);\n result = null;\n break;\n case "downloadParams": {\n const r = await handleDownloadParams(req.payload);\n result = r;\n transferList = collectTransfers(r.params);\n break;\n }\n case "downloadParamGrads": {\n const r = await handleDownloadParamGrads(req.payload);\n result = r;\n transferList = collectTransfers(r.params);\n break;\n }\n case "resetOptimizer":\n handleResetOptimizer(req.payload);\n result = null;\n break;\n case "destroy":\n handleDestroy(req.payload);\n result = null;\n break;\n default:\n throw new Error(`unknown request kind: ${req.kind}`);\n }\n const reply = { id: req.id, ok: true, result };\n self.postMessage(reply, { transfer: transferList });\n } catch (e) {\n const error = wireError(e);\n const reply = { id: req.id, ok: false, error };\n self.postMessage(reply);\n }\n};\nfunction collectTransfers(rec) {\n if (!rec) return [];\n const out = [];\n 
for (const v of Object.values(rec)) out.push(v.buffer);\n return out;\n}\n');
1809
+ const wireIR = { graph, plan, kernels };
1810
+ const wireAdam = adamResult ? wireAdamConfig(adamResult) : null;
1811
+ const transfers = transferablesOfRecord(initialParams);
1812
+ let meta;
1813
+ try {
1814
+ meta = await proxy.request(
1815
+ { kind: "createRuntime", payload: { graphId: 0, ir: wireIR, initialParams, adam: wireAdam } },
1816
+ transfers
1817
+ );
1818
+ } catch (e) {
1819
+ proxy.terminate();
1820
+ throw e;
1821
+ }
1822
+ return new CompiledModuleProxy(
1823
+ proxy,
1824
+ /* graphId */
1825
+ 0,
1826
+ ir,
1827
+ meta,
1828
+ modelFactory,
1829
+ /* initFns */
1830
+ materialized.initFns,
1831
+ /* nextGraphId */
1832
+ { v: 1 }
1833
+ );
1834
+ }
1835
// Compiles an inference-only module: traces `forward` over a fresh model,
// plans buffers with no parameter gradients, emits kernels, and boots a
// dedicated worker (source inlined below as a string) that owns the runtime.
// The returned proxy owns the worker and terminates it on destroy().
+ async function compileForward(modelFactory, forward, opts = {}) {
1836
+ const { graph, materialized } = traceModule(modelFactory, forward, opts.inputs ?? {});
1837
+ const outputTensor = graph.tensors[graph.outputs[0]];
1838
+ const plan = planBuffers(
1839
+ graph,
1840
+ /* paramGrads */
1841
+ {}
1842
+ );
1843
+ const kernels = emitKernels(graph, plan);
1844
+ const ir = { graph, paramGrads: {}, loss: outputTensor, plan, kernels };
1845
+ const initialParams = buildInitialParams(plan, materialized.initFns);
1846
// The WorkerProxy argument is the entire worker bundle (runtime + adam +
// message loop) inlined as one string by the build — do not edit by hand.
+ const proxy = new WorkerProxy('// src/runtime.ts\nvar Captures = class {\n constructor(shapes, data) {\n this.shapes = shapes;\n this.data = data;\n }\n shapes;\n data;\n get(name) {\n const d = this.data.get(name);\n if (!d) {\n const known = [...this.data.keys()].sort().join(", ");\n const detail = known ? `Known this call: ${known}` : `(call run/step with { withCaptures: true } to populate)`;\n throw new Error(`Captures.get: \'${name}\' not present. ${detail}`);\n }\n return d;\n }\n shapeOf(name) {\n const s = this.shapes[name];\n if (!s) {\n const known = Object.keys(this.shapes).sort().join(", ") || "(none registered)";\n throw new Error(`Captures.shapeOf: \'${name}\' not registered. Known: ${known}`);\n }\n return s;\n }\n has(name) {\n return this.data.has(name);\n }\n names() {\n return [...this.data.keys()].sort();\n }\n};\nvar STORAGE_RW = 128 | 8 | 4;\nvar READBACK = 1 | 8;\nasync function createRuntime(plan, kernels, lossBufferId, opts = {}) {\n const device2 = opts.device ?? await acquireDevice();\n const queue = device2.queue;\n const buffers = /* @__PURE__ */ new Map();\n const ownedBufferIds = /* @__PURE__ */ new Set();\n const sharedParams = opts.sharedParams;\n for (const spec of plan.buffers) {\n const shared = spec.kind === "param" ? sharedParams?.get(spec.name) : void 0;\n if (shared) {\n if (shared.size !== spec.byteSize) {\n throw new Error(\n `sharedParams: size mismatch for \'${spec.name}\' \\u2014 supplied ${shared.size} bytes, compiled graph expects ${spec.byteSize}.`\n );\n }\n buffers.set(spec.id, shared);\n continue;\n }\n const buf = device2.createBuffer({\n size: spec.byteSize,\n usage: STORAGE_RW,\n label: spec.name ?? 
`t${spec.id}-${spec.kind}`\n });\n buffers.set(spec.id, buf);\n ownedBufferIds.add(spec.id);\n if (spec.kind === "state") fillStateBuffer(spec, buf);\n }\n const moduleCache = /* @__PURE__ */ new Map();\n const pipelines = [];\n const probes = [];\n for (const k of kernels) {\n if (!k.wgsl) {\n pipelines.push(null);\n continue;\n }\n let module = moduleCache.get(k.wgsl);\n if (!module) {\n module = device2.createShaderModule({ code: k.wgsl, label: k.opKind });\n moduleCache.set(k.wgsl, module);\n }\n device2.pushErrorScope("validation");\n const pipeline = device2.createComputePipeline({\n layout: "auto",\n compute: { module, entryPoint: "main" },\n label: k.opKind\n });\n pipelines.push(pipeline);\n probes.push(device2.popErrorScope().then((err) => err ? { k, module, err } : null));\n }\n const probeResults = await Promise.all(probes);\n const failures = probeResults.filter((p) => p != null);\n if (failures.length > 0) {\n const reports = [];\n for (const { k, module, err } of failures) {\n const info = await module.getCompilationInfo();\n const messages = info.messages.map((m) => ` L${m.lineNum}:${m.linePos} [${m.type}] ${m.message}`).join("\\n");\n reports.push(\n `[shader compile error] ${k.opKind} (op #${k.opIndex}): ${err.message}\n` + (messages || " (no compilation messages)") + `\n--- WGSL ---\n${k.wgsl}\n-----------`\n );\n }\n console.error(reports.join("\\n\\n"));\n throw new Error(`tensorgrad: ${failures.length} shader(s) failed to compile (see console).`);\n }\n const bindGroups = kernels.map((k, i) => {\n const pipeline = pipelines[i];\n if (!pipeline) return null;\n return device2.createBindGroup({\n layout: pipeline.getBindGroupLayout(0),\n entries: k.bindings.map((bufId, idx) => ({\n binding: idx,\n resource: { buffer: buffers.get(bufId) }\n }))\n });\n });\n const outputSpec = plan.buffers[lossBufferId];\n const outputReadback = device2.createBuffer({ size: outputSpec.byteSize, usage: READBACK });\n let captureStaging = null;\n function 
ensureCaptureStaging() {\n if (captureStaging) return captureStaging;\n let totalBytes = 0;\n const slices = [];\n for (const [name, bufId] of plan.capturesByName) {\n const spec = plan.buffers[bufId];\n slices.push({ name, bufId, offset: totalBytes, byteSize: spec.byteSize });\n totalBytes += spec.byteSize;\n }\n const buffer = device2.createBuffer({ size: totalBytes, usage: READBACK, label: "captures-staging" });\n captureStaging = { buffer, slices };\n return captureStaging;\n }\n let pending = Promise.resolve();\n async function dispatch(inputs, opts2) {\n const turn = pending.catch(() => {\n }).then(() => dispatchUnsynchronized(inputs, opts2));\n pending = turn;\n return turn;\n }\n async function dispatchUnsynchronized(inputs, opts2) {\n const wantCaptures = opts2.wantCaptures;\n if (wantCaptures && plan.capturesByName.size === 0) {\n throw new Error(\n `withCaptures=true but no capture(...) calls were registered during the trace. Add capture(\'name\', tensor) inside your forward pass for the intermediates you want read back.`\n );\n }\n for (const [name, bufId] of plan.inputsByName) {\n const data = inputs[name];\n if (!data) throw new Error(`tensorgrad: missing input \'${name}\'`);\n const expectedBytes = plan.buffers[bufId].byteSize;\n if (data.byteLength !== expectedBytes) {\n throw new Error(`tensorgrad: input \'${name}\' has ${data.byteLength} bytes, expected ${expectedBytes}`);\n }\n queue.writeBuffer(buffers.get(bufId), 0, data);\n }\n const encoder = device2.createCommandEncoder({ label: "tensorgrad-step" });\n for (let i = 0; i < kernels.length; i++) {\n const k = kernels[i];\n if (!k.wgsl || k.threads === 0) continue;\n const pipeline = pipelines[i];\n const bindGroup = bindGroups[i];\n const pass = encoder.beginComputePass({ label: k.opKind });\n pass.setPipeline(pipeline);\n pass.setBindGroup(0, bindGroup);\n const wgCount = Math.max(1, Math.ceil(k.threads / k.workgroupSize));\n const MAX_X = 65535;\n const wgX = Math.min(wgCount, MAX_X);\n const 
wgY = Math.ceil(wgCount / MAX_X);\n pass.dispatchWorkgroups(wgX, wgY, 1);\n pass.end();\n }\n for (const wb of plan.writebacks) {\n encoder.copyBufferToBuffer(buffers.get(wb.source), 0, buffers.get(wb.dest), 0, wb.bytes);\n }\n encoder.copyBufferToBuffer(buffers.get(lossBufferId), 0, outputReadback, 0, outputSpec.byteSize);\n let layout = null;\n if (wantCaptures) {\n layout = ensureCaptureStaging();\n for (const s of layout.slices) {\n encoder.copyBufferToBuffer(buffers.get(s.bufId), 0, layout.buffer, s.offset, s.byteSize);\n }\n }\n queue.submit([encoder.finish()]);\n if (!opts2.readback) return null;\n await outputReadback.mapAsync(GPUMapMode.READ);\n const output = new Float32Array(outputReadback.getMappedRange().slice(0));\n outputReadback.unmap();\n const captures = /* @__PURE__ */ new Map();\n if (layout) {\n await layout.buffer.mapAsync(GPUMapMode.READ);\n const range = layout.buffer.getMappedRange();\n for (const s of layout.slices) {\n captures.set(s.name, new Float32Array(range, s.offset, s.byteSize / 4).slice());\n }\n layout.buffer.unmap();\n }\n return { output, captures };\n }\n async function step(inputs, opts2) {\n if (opts2?.readLoss === false) {\n await dispatch(inputs, { wantCaptures: false, readback: false });\n return;\n }\n const r = await dispatch(inputs, { wantCaptures: opts2?.withCaptures === true, readback: true });\n if (opts2?.withCaptures) return { loss: r.output[0], captures: new Captures(captureShapes, r.captures) };\n return r.output[0];\n }\n async function readLoss() {\n const turn = pending.catch(() => {\n }).then(async () => {\n await outputReadback.mapAsync(GPUMapMode.READ);\n const v = new Float32Array(outputReadback.getMappedRange())[0];\n outputReadback.unmap();\n return v;\n });\n pending = turn;\n return turn;\n }\n async function run(inputs, opts2) {\n const r = await dispatch(inputs, { wantCaptures: opts2?.withCaptures === true, readback: true });\n if (opts2?.withCaptures) return { output: r.output, captures: new 
Captures(captureShapes, r.captures) };\n return r.output;\n }\n function uploadParams(params2, opts2) {\n const partial = opts2?.partial ?? false;\n for (const name of Object.keys(params2)) {\n if (!plan.paramsByName.has(name)) {\n throw new Error(\n `uploadParams: unknown param \'${name}\'. Known: ${[...plan.paramsByName.keys()].sort().join(", ")}`\n );\n }\n }\n if (!partial) {\n for (const name of plan.paramsByName.keys()) {\n if (!(name in params2)) {\n throw new Error(\n `uploadParams: missing param \'${name}\'. Pass { partial: true } if you mean to update only some params.`\n );\n }\n }\n }\n for (const [name, bufId] of plan.paramsByName) {\n const data = params2[name];\n if (!data) continue;\n const expected = plan.buffers[bufId].byteSize / 4;\n if (data.length !== expected) {\n throw new Error(`uploadParams: \'${name}\' has ${data.length} elements, expected ${expected}`);\n }\n queue.writeBuffer(buffers.get(bufId), 0, data);\n }\n }\n async function downloadFromMap(map) {\n const stagings = [];\n const encoder = device2.createCommandEncoder({ label: "tensorgrad-download" });\n for (const [name, bufId] of map) {\n const spec = plan.buffers[bufId];\n const staging = device2.createBuffer({ size: spec.byteSize, usage: READBACK });\n encoder.copyBufferToBuffer(buffers.get(bufId), 0, staging, 0, spec.byteSize);\n stagings.push({ name, buf: staging, bytes: spec.byteSize });\n }\n queue.submit([encoder.finish()]);\n const out = {};\n for (const s of stagings) {\n await s.buf.mapAsync(GPUMapMode.READ);\n out[s.name] = new Float32Array(s.buf.getMappedRange().slice(0));\n s.buf.unmap();\n s.buf.destroy();\n }\n return out;\n }\n function fillStateBuffer(spec, target) {\n const elements = spec.byteSize / 4;\n const init = spec.dtype === "f32" ? new Float32Array(elements).fill(spec.initValue ?? 0) : new Int32Array(elements).fill(Math.trunc(spec.initValue ?? 
0));\n queue.writeBuffer(target, 0, init);\n }\n function resetOptimizerState() {\n for (const spec of plan.buffers) {\n if (spec.kind === "state") fillStateBuffer(spec, buffers.get(spec.id));\n }\n }\n const params = /* @__PURE__ */ new Map();\n for (const [name, bufId] of plan.paramsByName) {\n params.set(name, buffers.get(bufId));\n }\n const captureShapes = {};\n for (const [name, bufId] of plan.capturesByName) {\n captureShapes[name] = [...plan.buffers[bufId].shape];\n }\n const outputShape = [...plan.buffers[lossBufferId].shape];\n const destroy = () => {\n for (const [id, b] of buffers) {\n if (ownedBufferIds.has(id)) b.destroy();\n }\n outputReadback.destroy();\n if (captureStaging) captureStaging.buffer.destroy();\n };\n return {\n device: device2,\n params,\n outputShape,\n uploadParams,\n downloadParams: () => downloadFromMap(plan.paramsByName),\n downloadParamGrads: () => downloadFromMap(plan.paramGradsByName),\n step,\n run,\n readLoss,\n resetOptimizerState,\n destroy\n };\n}\nasync function acquireDevice() {\n if (typeof navigator === "undefined" || !navigator.gpu) {\n throw new Error("tensorgrad: WebGPU not available in this environment");\n }\n const adapter = await navigator.gpu.requestAdapter();\n if (!adapter) throw new Error("tensorgrad: no WebGPU adapter");\n return await adapter.requestDevice();\n}\n\n// src/adam.ts\nfunction resolveLR(schedule, step) {\n if (typeof schedule === "number") return schedule;\n switch (schedule.kind) {\n case "constant":\n return schedule.value;\n case "linearDecay": {\n const f = Math.min(step / schedule.steps, 1);\n return schedule.peak + (schedule.final - schedule.peak) * f;\n }\n case "cosineDecay": {\n const f = Math.min(step / schedule.steps, 1);\n return schedule.final + 0.5 * (schedule.peak - schedule.final) * (1 + Math.cos(Math.PI * f));\n }\n case "warmup": {\n if (step <= schedule.warmupSteps) return schedule.peakLr * (step / schedule.warmupSteps);\n return resolveLR(schedule.after, step - 
schedule.warmupSteps);\n }\n }\n}\n\n// src/worker-protocol.ts\nfunction wireError(e) {\n if (e instanceof Error) {\n return { name: e.name, message: e.message, stack: e.stack ?? "" };\n }\n return { name: "Error", message: String(e), stack: "" };\n}\n\n// src/worker.ts\nvar graphs = /* @__PURE__ */ new Map();\nvar device = null;\nasync function ensureDevice() {\n if (device) return device;\n if (typeof navigator === "undefined" || !navigator.gpu) {\n throw new Error("tensorgrad worker: WebGPU not available in this environment");\n }\n const adapter = await navigator.gpu.requestAdapter();\n if (!adapter) throw new Error("tensorgrad worker: no WebGPU adapter");\n device = await adapter.requestDevice();\n return device;\n}\nasync function handleCreateRuntime(payload) {\n const dev = await ensureDevice();\n const { graph, plan, kernels } = payload.ir;\n const outputTensorId = graph.outputs[0];\n const outputBufferId = plan.tensorToBuffer.get(outputTensorId);\n const opts = { device: dev };\n const runtime = await createRuntime(plan, kernels, outputBufferId, opts);\n if (Object.keys(payload.initialParams).length > 0) {\n runtime.uploadParams(payload.initialParams);\n }\n const captureShapes = {};\n for (const [name, bufId] of plan.capturesByName) {\n captureShapes[name] = [...plan.buffers[bufId].shape];\n }\n const slot = {\n runtime,\n paramNames: [...plan.paramsByName.keys()],\n outputShape: [...runtime.outputShape],\n kernelCount: kernels.filter((k) => k.wgsl).length,\n captureShapes,\n adam: payload.adam ? 
createAdamState(payload.adam) : null\n };\n graphs.set(payload.graphId, slot);\n return {\n paramNames: [...slot.paramNames],\n outputShape: slot.outputShape,\n kernelCount: slot.kernelCount,\n captureShapes: slot.captureShapes\n };\n}\nasync function handleCompileForward(payload) {\n const dev = await ensureDevice();\n const parent = graphs.get(payload.parentGraphId);\n if (!parent) throw new Error(`compileForward: parent graph ${payload.parentGraphId} not found`);\n const { graph, plan, kernels } = payload.ir;\n const outputTensorId = graph.outputs[0];\n const outputBufferId = plan.tensorToBuffer.get(outputTensorId);\n const opts = { device: dev, sharedParams: parent.runtime.params };\n const runtime = await createRuntime(plan, kernels, outputBufferId, opts);\n const captureShapes = {};\n for (const [name, bufId] of plan.capturesByName) {\n captureShapes[name] = [...plan.buffers[bufId].shape];\n }\n const slot = {\n runtime,\n paramNames: [...plan.paramsByName.keys()],\n outputShape: [...runtime.outputShape],\n kernelCount: kernels.filter((k) => k.wgsl).length,\n captureShapes,\n adam: null\n };\n graphs.set(payload.graphId, slot);\n return {\n paramNames: [...slot.paramNames],\n outputShape: slot.outputShape,\n kernelCount: slot.kernelCount,\n captureShapes: slot.captureShapes\n };\n}\nfunction createAdamState(cfg) {\n return {\n config: cfg,\n t: 0,\n lrtBuf: new Float32Array(1),\n decayShrinkBuf: cfg.decayShrinkInputName ? 
new Float32Array(1) : null\n };\n}\nfunction injectAdamScalars(slot, inputs) {\n const a = slot.adam;\n if (!a) return inputs;\n a.t++;\n const lrNow = resolveLR(a.config.lr, a.t);\n a.lrtBuf[0] = lrNow * Math.sqrt(1 - Math.pow(a.config.b2, a.t)) / (1 - Math.pow(a.config.b1, a.t));\n const merged = { ...inputs, [a.config.lrtInputName]: a.lrtBuf };\n if (a.decayShrinkBuf && a.config.decayShrinkInputName) {\n a.decayShrinkBuf[0] = 1 - lrNow * a.config.weightDecay;\n merged[a.config.decayShrinkInputName] = a.decayShrinkBuf;\n }\n return merged;\n}\nasync function handleStep(payload) {\n const slot = mustGet(payload.graphId);\n const merged = injectAdamScalars(slot, payload.inputs);\n if (payload.withCaptures) {\n const r = await slot.runtime.step(merged, { withCaptures: true });\n return { loss: r.loss, captures: capturesToRecord(r.captures, slot.captureShapes) };\n }\n const loss = await slot.runtime.step(merged);\n return { loss, captures: null };\n}\nasync function handleRun(payload) {\n const slot = mustGet(payload.graphId);\n if (payload.withCaptures) {\n const r = await slot.runtime.run(payload.inputs, { withCaptures: true });\n return { output: r.output, captures: capturesToRecord(r.captures, slot.captureShapes) };\n }\n const output = await slot.runtime.run(payload.inputs);\n return { output, captures: null };\n}\nfunction capturesToRecord(captures, shapes) {\n const out = {};\n for (const name of Object.keys(shapes)) {\n if (captures.has(name)) out[name] = captures.get(name);\n }\n return out;\n}\nfunction handleUploadParams(payload) {\n const slot = mustGet(payload.graphId);\n slot.runtime.uploadParams(payload.params, { partial: payload.partial });\n}\nasync function handleDownloadParams(payload) {\n const slot = mustGet(payload.graphId);\n return { params: await slot.runtime.downloadParams() };\n}\nasync function handleDownloadParamGrads(payload) {\n const slot = mustGet(payload.graphId);\n return { params: await slot.runtime.downloadParamGrads() 
};\n}\nfunction handleResetOptimizer(payload) {\n const slot = mustGet(payload.graphId);\n slot.runtime.resetOptimizerState();\n if (slot.adam) slot.adam.t = 0;\n}\nfunction handleDestroy(payload) {\n const slot = graphs.get(payload.graphId);\n if (!slot) return;\n slot.runtime.destroy();\n graphs.delete(payload.graphId);\n}\nfunction mustGet(graphId) {\n const slot = graphs.get(graphId);\n if (!slot) throw new Error(`tensorgrad worker: graph ${graphId} not found`);\n return slot;\n}\nself.onmessage = async (ev) => {\n const req = ev.data;\n try {\n let result;\n let transferList = [];\n switch (req.kind) {\n case "createRuntime":\n result = await handleCreateRuntime(req.payload);\n break;\n case "compileForward":\n result = await handleCompileForward(req.payload);\n break;\n case "step":\n result = await handleStep(req.payload);\n transferList = collectTransfers(result.captures);\n break;\n case "run": {\n const r = await handleRun(req.payload);\n result = r;\n transferList = [r.output.buffer, ...collectTransfers(r.captures)];\n break;\n }\n case "uploadParams":\n handleUploadParams(req.payload);\n result = null;\n break;\n case "downloadParams": {\n const r = await handleDownloadParams(req.payload);\n result = r;\n transferList = collectTransfers(r.params);\n break;\n }\n case "downloadParamGrads": {\n const r = await handleDownloadParamGrads(req.payload);\n result = r;\n transferList = collectTransfers(r.params);\n break;\n }\n case "resetOptimizer":\n handleResetOptimizer(req.payload);\n result = null;\n break;\n case "destroy":\n handleDestroy(req.payload);\n result = null;\n break;\n default:\n throw new Error(`unknown request kind: ${req.kind}`);\n }\n const reply = { id: req.id, ok: true, result };\n self.postMessage(reply, { transfer: transferList });\n } catch (e) {\n const error = wireError(e);\n const reply = { id: req.id, ok: false, error };\n self.postMessage(reply);\n }\n};\nfunction collectTransfers(rec) {\n if (!rec) return [];\n const out = [];\n 
for (const v of Object.values(rec)) out.push(v.buffer);\n return out;\n}\n');
1847
+ const wireIR = { graph, plan, kernels };
1848
// Ship the initial param arrays as transferables to avoid a structured-clone copy.
+ const transfers = transferablesOfRecord(initialParams);
1849
+ let meta;
1850
+ try {
1851
+ meta = await proxy.request(
1852
+ { kind: "createRuntime", payload: { graphId: 0, ir: wireIR, initialParams, adam: null } },
1853
+ transfers
1854
+ );
1855
+ } catch (e) {
1856
// Failed to boot the worker-side runtime: don't leak the worker.
+ proxy.terminate();
1857
+ throw e;
1858
+ }
1859
+ return new CompiledForwardModuleProxy(
1860
+ proxy,
1861
+ /* graphId */
1862
+ 0,
1863
+ ir,
1864
+ meta,
1865
+ /* ownsWorker */
1866
+ true
1867
+ );
1868
+ }
1869
// Main-thread facade over a compiled training graph living in a Web Worker.
// Every method is thin RPC plumbing: it forwards a typed request through the
// WorkerProxy and unwraps the reply. Forward-only graphs compiled via
// compileForward() share this graph's worker and parameter buffers, drawing
// their ids from the shared `nextGraphId` counter.
var CompiledModuleProxy = class {
  constructor(proxy, graphId, ir, meta, modelFactory, initFns, nextGraphId) {
    this.proxy = proxy;
    this.graphId = graphId;
    this.ir = ir;
    this.meta = meta;
    this.modelFactory = modelFactory;
    this.initFns = initFns;
    this.nextGraphId = nextGraphId;
  }
  proxy;
  graphId;
  ir;
  meta;
  modelFactory;
  initFns;
  nextGraphId;
  // Number of non-empty kernels in the compiled graph (from worker metadata).
  get kernelCount() {
    return this.meta.kernelCount;
  }
  get outputShape() {
    return this.meta.outputShape;
  }
  get paramNames() {
    return this.meta.paramNames;
  }
  // Runs one training step. Resolves to the scalar loss, or to
  // { loss, captures } when opts.withCaptures is set.
  async step(inputs, opts) {
    const withCaptures = opts?.withCaptures === true;
    const reply = await this.proxy.request({
      kind: "step",
      payload: { graphId: this.graphId, inputs, withCaptures }
    });
    if (!withCaptures) return reply.loss;
    return { loss: reply.loss, captures: makeCaptures(reply.captures, this.meta.captureShapes) };
  }
  // Runs the forward pass only. Resolves to the output array, or to
  // { output, captures } when opts.withCaptures is set.
  async run(inputs, opts) {
    const withCaptures = opts?.withCaptures === true;
    const reply = await this.proxy.request({
      kind: "run",
      payload: { graphId: this.graphId, inputs, withCaptures }
    });
    if (!withCaptures) return reply.output;
    return { output: reply.output, captures: makeCaptures(reply.captures, this.meta.captureShapes) };
  }
  // Uploads parameter values; pass { partial: true } to update a subset.
  async uploadParams(params, opts) {
    await this.proxy.request({
      kind: "uploadParams",
      payload: { graphId: this.graphId, params, partial: !!opts?.partial }
    });
  }
  async downloadParams() {
    const reply = await this.proxy.request({
      kind: "downloadParams",
      payload: { graphId: this.graphId }
    });
    return reply.params;
  }
  async downloadParamGrads() {
    const reply = await this.proxy.request({
      kind: "downloadParamGrads",
      payload: { graphId: this.graphId }
    });
    return reply.params;
  }
  // Full restart without recompiling: re-runs the recorded initializers and
  // clears optimizer state.
  async reset() {
    await this.uploadParams(buildInitialParams(this.ir.plan, this.initFns));
    await this.resetOptimizerState();
  }
  async resetOptimizerState() {
    await this.proxy.request({
      kind: "resetOptimizer",
      payload: { graphId: this.graphId }
    });
  }
  // Compiles an inference-only graph in the SAME worker, sharing this graph's
  // parameter buffers. The child proxy does not own the worker.
  async compileForward(forward, opts = {}) {
    const { graph } = traceModule(this.modelFactory, forward, opts.inputs ?? {});
    const outputTensor = graph.tensors[graph.outputs[0]];
    const plan = planBuffers(
      graph,
      /* paramGrads */
      {}
    );
    const kernels = emitKernels(graph, plan);
    const ir = { graph, paramGrads: {}, loss: outputTensor, plan, kernels };
    const childGraphId = this.nextGraphId.v++;
    const meta = await this.proxy.request({
      kind: "compileForward",
      payload: { graphId: childGraphId, parentGraphId: this.graphId, ir: { graph, plan, kernels } }
    });
    return new CompiledForwardModuleProxy(
      this.proxy,
      childGraphId,
      ir,
      meta,
      /* ownsWorker */
      false
    );
  }
  // Fire-and-forget teardown of the worker-side runtime, then kill the worker.
  destroy() {
    this.proxy.send({ kind: "destroy", payload: { graphId: this.graphId } });
    this.proxy.terminate();
  }
};
1969
// Main-thread facade over an inference-only graph in the worker. When the
// graph was compiled standalone (ownsWorker=true), destroy() also terminates
// the worker; a child graph leaves its parent's worker running.
var CompiledForwardModuleProxy = class {
  constructor(proxy, graphId, ir, meta, ownsWorker) {
    this.proxy = proxy;
    this.graphId = graphId;
    this.ir = ir;
    this.meta = meta;
    this.ownsWorker = ownsWorker;
  }
  proxy;
  graphId;
  ir;
  meta;
  ownsWorker;
  get kernelCount() {
    return this.meta.kernelCount;
  }
  get outputShape() {
    return this.meta.outputShape;
  }
  get paramNames() {
    return this.meta.paramNames;
  }
  // Runs the forward pass. Resolves to the output array, or to
  // { output, captures } when opts.withCaptures is set.
  async run(inputs, opts) {
    const withCaptures = opts?.withCaptures === true;
    const reply = await this.proxy.request({
      kind: "run",
      payload: { graphId: this.graphId, inputs, withCaptures }
    });
    if (!withCaptures) return reply.output;
    return { output: reply.output, captures: makeCaptures(reply.captures, this.meta.captureShapes) };
  }
  // Uploads parameter values; pass { partial: true } to update a subset.
  async uploadParams(params, opts) {
    await this.proxy.request({
      kind: "uploadParams",
      payload: { graphId: this.graphId, params, partial: !!opts?.partial }
    });
  }
  async downloadParams() {
    const reply = await this.proxy.request({
      kind: "downloadParams",
      payload: { graphId: this.graphId }
    });
    return reply.params;
  }
  destroy() {
    this.proxy.send({ kind: "destroy", payload: { graphId: this.graphId } });
    if (this.ownsWorker) this.proxy.terminate();
  }
};
2016
// Runs `forward` under the tracer: instantiates a fresh model, materializes
// its parameters inside the trace (so param tensors are recorded), declares
// each requested named input, and records the resulting op graph.
// Returns { graph, materialized } where `materialized` carries the param
// tensors, initializer functions, and weight-decay flags.
function traceModule(modelFactory, forward, inputDecls) {
  const model = modelFactory();
  let materialized = { tensors: {}, initFns: {}, decayFlags: {} };
  const graph = trace(() => {
    materialized = materializeParams(model);
    const inputs = {};
    for (const name of Object.keys(inputDecls)) {
      const decl = inputDecls[name];
      // dtype defaults to f32 when the declaration omits it.
      inputs[name] = tensorInput(name, decl.shape, decl.dtype ?? "f32");
    }
    return forward(model, inputs);
  });
  return { graph, materialized };
}
2029
// Builds the initial value for every trainable parameter in the plan by
// calling the user-supplied initializer registered under the param's name.
// Each initializer receives (elementCount, shape) and returns the array to
// upload. Throws if any planned param lacks an initializer.
function buildInitialParams(plan, initFns) {
  const out = {};
  for (const [name, bufId] of plan.paramsByName) {
    const { shape } = plan.buffers[bufId];
    const init = initFns[name];
    if (!init) throw new Error(`compile: no init for param '${name}'`);
    let size = 1;
    for (const dim of shape) size *= dim;
    out[name] = init(size, shape);
  }
  return out;
}
2040
// Flattens an Adam compilation result into the plain record sent over the
// worker boundary: the optimizer hyperparameters plus the names of the two
// scalar inputs the worker injects each step (lrt, and optionally the
// weight-decay shrink factor).
function wireAdamConfig(r) {
  const { lr, b1, b2, eps, weightDecay, lrIsScheduled } = r.config;
  return {
    lr,
    b1,
    b2,
    eps,
    weightDecay,
    lrIsScheduled,
    lrtInputName: r.lrtInputName,
    decayShrinkInputName: r.decayShrinkInputName
  };
}
2053
// Rehydrates a wire-format capture record (plain object, possibly null) into
// a Captures instance backed by a Map, keyed by capture name.
function makeCaptures(captures, captureShapes) {
  const data = new Map(Object.entries(captures ?? {}));
  return new Captures(captureShapes, data);
}
2060
+
2061
// Namespace object re-exported below as `nn`: neural-net layers and
// attention-head helpers. __export installs live getters for each name.
+ // src/nn.ts
2062
+ var nn_exports = {};
2063
+ __export(nn_exports, {
2064
+ LayerNorm: () => LayerNorm,
2065
+ Linear: () => Linear,
2066
+ crossEntropyLast: () => crossEntropyLast,
2067
+ mergeHeads: () => mergeHeads,
2068
+ splitHeads: () => splitHeads,
2069
+ unsplitHeads: () => unsplitHeads
2070
+ });
2071
// Dense layer: y = x @ W (+ b unless constructed with { bias: false }).
// W is [inDim, outDim] using the Module's default initializer; b, when
// present, is [outDim] and zero-initialized.
var Linear = class extends Module {
  constructor(inDim, outDim, opts = {}) {
    super();
    this.inDim = inDim;
    this.outDim = outDim;
    this.W = this.param([inDim, outDim]);
    this.b = opts.bias === false ? null : this.param([outDim], { init: "zeros" });
  }
  inDim;
  outDim;
  W;
  b;
  fwd(x) {
    const projected = matmul(x, this.W);
    if (this.b === null) return projected;
    return add(projected, this.b);
  }
};
2088
// Layer normalization over the last axis: (x - mean) / sqrt(var + eps),
// then an affine transform with learned gain g (ones) and bias b (zeros).
var LayerNorm = class extends Module {
  constructor(d, eps = 1e-5) {
    super();
    this.d = d;
    this.eps = eps;
    this.g = this.param([d], { init: "ones" });
    this.b = this.param([d], { init: "zeros" });
  }
  d;
  eps;
  g;
  b;
  fwd(x) {
    // Op emission order matches the canonical mean/center/variance recipe.
    const mean = meanLast(x);
    const centered = sub(x, mean);
    const variance = meanLast(mul(centered, centered));
    const stdev = sqrt(add(variance, this.eps));
    return add(mul(div(centered, stdev), this.g), this.b);
  }
};
2108
// Splits the last (model) dimension into attention heads:
// [..., T, D] -> [..., nHeads, T, D / nHeads]. Requires rank >= 2 and
// D divisible by nHeads; shape errors carry the caller's capture site.
function splitHeads(x, nHeads) {
  const site = captureSite("splitHeads");
  const rank = x.shape.length;
  if (rank < 2) throw new ShapeError(`splitHeads: requires rank >= 2, got ${rank}`, site);
  const seqLen = x.shape[rank - 2];
  const dModel = x.shape[rank - 1];
  if (dModel % nHeads !== 0) {
    throw new ShapeError(`splitHeads: last dim ${dModel} not divisible by nHeads ${nHeads}`, site);
  }
  const lead = x.shape.slice(0, rank - 2);
  // Carve D into [nHeads, headDim], then move the head axis ahead of T.
  const headed = reshape(x, [...lead, seqLen, nHeads, dModel / nHeads]);
  return swapAxes(headed, lead.length, lead.length + 1);
}
2121
// Inverse of splitHeads: [..., H, T, d] -> [..., T, H * d].
// Requires rank >= 3; the head axis is swapped back next to d, then folded in.
function mergeHeads(x) {
  const site = captureSite("mergeHeads");
  const rank = x.shape.length;
  if (rank < 3) throw new ShapeError(`mergeHeads: requires rank >= 3, got ${rank}`, site);
  const heads = x.shape[rank - 3];
  const seqLen = x.shape[rank - 2];
  const headDim = x.shape[rank - 1];
  const lead = x.shape.slice(0, rank - 3);
  const swapped = swapAxes(x, rank - 3, rank - 2);
  return reshape(swapped, [...lead, seqLen, heads * headDim]);
}
2132
// Host-side helper: slices a flat captured array back into per-head views.
// Uses the capture's registered shape; a leading batch dim of 1 is dropped,
// then the first remaining dim is taken as the head count. Returns an array
// of length H whose entries are contiguous per-head slices of the flat data.
function unsplitHeads(captures, name) {
  const flat = captures.get(name);
  const shape = captures.shapeOf(name);
  if (shape.length < 2) {
    throw new Error(`unsplitHeads: '${name}' shape needs >= 2 dims, got [${shape.join(", ")}]`);
  }
  const dims = shape[0] === 1 ? shape.slice(1) : shape;
  const heads = dims[0];
  const perHead = dims.slice(1).reduce((acc, d) => acc * d, 1);
  const expected = heads * perHead;
  if (flat.length !== expected) {
    throw new Error(`unsplitHeads: '${name}' length ${flat.length} doesn't match shape product ${expected}`);
  }
  const out = [];
  for (let h = 0; h < heads; h++) {
    out.push(flat.slice(h * perHead, (h + 1) * perHead));
  }
  return out;
}
2148
// Per-position cross-entropy over the last (vocab) axis:
// -log softmax(logits)[target]. Targets must be i32 class indices; the
// one-hot mask selects the target log-prob, which is then negated.
function crossEntropyLast(logits, targets) {
  const site = captureSite("crossEntropyLast");
  if (targets.dtype !== "i32") {
    throw new ShapeError(`crossEntropyLast: targets must be i32, got ${targets.dtype}`, site);
  }
  const vocab = logits.shape.at(-1);
  const logProbs = logSoftmaxLast(logits);
  const pickedLp = sumLast(mul(logProbs, oneHot(targets, vocab, "f32")));
  return mul(pickedLp, -1);
}
2158
+ export {
2159
+ Captures,
2160
+ Module,
2161
+ ShapeError,
2162
+ add,
2163
+ appendAdam,
2164
+ appendGrad,
2165
+ arange,
2166
+ capture,
2167
+ compileForward,
2168
+ compileModule,
2169
+ compileToIR,
2170
+ div,
2171
+ embedding,
2172
+ emitKernels,
2173
+ exp,
2174
+ greater,
2175
+ init,
2176
+ less,
2177
+ log,
2178
+ logSoftmaxLast,
2179
+ lr,
2180
+ materializeParams,
2181
+ matmul,
2182
+ matmulBatched,
2183
+ meanLast,
2184
+ mul,
2185
+ nn_exports as nn,
2186
+ oneHot,
2187
+ paramInput,
2188
+ planBuffers,
2189
+ relu,
2190
+ reshape,
2191
+ resolveLR,
2192
+ rsqrt,
2193
+ sliceLastRange,
2194
+ softmaxCausalLast,
2195
+ sqrt,
2196
+ stateInput,
2197
+ sub,
2198
+ sumAll,
2199
+ sumLast,
2200
+ swapAxes,
2201
+ tensorInput,
2202
+ trace,
2203
+ traceInto,
2204
+ transpose,
2205
+ where,
2206
+ whereCausal
2207
+ };
2208
+ //# sourceMappingURL=index.js.map