tensorgrad 0.0.5 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/adam.ts CHANGED
@@ -95,6 +95,11 @@ export function appendAdam(
95
95
  paramGrads: Record<string, Tensor>,
96
96
  paramTensors: Record<string, Tensor>,
97
97
  config: AdamConfig,
98
+ /** Per-param decay flags from `materializeParams`. When supplied, overrides
99
+ * `config.decayFilter` for any name in the map; falls back to `decayFilter`
100
+ * for names not present (e.g., for low-level callers using `compile()`
101
+ * directly without a Module). */
102
+ decayFlags?: Record<string, boolean>,
98
103
  ): AdamResult {
99
104
  const lrIsScheduled = typeof config.lr === 'function'
100
105
  const lrFn = lrIsScheduled
@@ -119,13 +124,22 @@ export function appendAdam(
119
124
  return traceInto(graph, () => {
120
125
  const lrt = tensorInput(lrtInputName, [], 'f32')
121
126
 
122
- // Decide up-front whether we need a runtime decayShrink scalar. Only does
123
- // something when both (a) lr varies per step and (b) some param is decayed.
124
- const needsDynamicShrink = lrIsScheduled
125
- && fullConfig.weightDecay > 0
126
- && Object.keys(paramGrads).some(name => fullConfig.decayFilter(name))
127
+ // Up-front: which params receive weight decay? Per-param decayFlags (set
128
+ // by Module.param's options) wins; falls back to decayFilter for names
129
+ // not in the map. Empty when weightDecay = 0 so the rest of the function
130
+ // can just ask "is this name in the set?".
131
+ const decayedNames = new Set<string>(
132
+ fullConfig.weightDecay > 0
133
+ ? Object.keys(paramGrads).filter(name =>
134
+ (decayFlags && name in decayFlags) ? decayFlags[name]! : fullConfig.decayFilter(name))
135
+ : [],
136
+ )
137
+
138
+ // We only need a runtime decayShrink scalar when lr varies per step AND
139
+ // at least one param is being decayed. Otherwise the value is constant
140
+ // and bakes into the kernel as a literal.
127
141
  let decayShrinkScalar: Tensor | null = null
128
- if (needsDynamicShrink) {
142
+ if (lrIsScheduled && decayedNames.size > 0) {
129
143
  decayShrinkInputName = '_adam_decay_shrink'
130
144
  decayShrinkScalar = tensorInput(decayShrinkInputName, [], 'f32')
131
145
  }
@@ -141,17 +155,12 @@ export function appendAdam(
141
155
 
142
156
  // Choose the decayShrink form per param:
143
157
  // - non-decayed params: literal 1 (kernel multiply folds out).
144
- // - decayed + static lr: literal `1 - lr * wd` baked at compile.
145
158
  // - decayed + scheduled lr: tensor input updated per step.
146
- const isDecayed = fullConfig.weightDecay > 0 && fullConfig.decayFilter(name)
147
- let decayShrink: number | Tensor
148
- if (!isDecayed) {
149
- decayShrink = 1
150
- } else if (decayShrinkScalar !== null) {
151
- decayShrink = decayShrinkScalar
152
- } else {
153
- decayShrink = 1 - initialLr * fullConfig.weightDecay
154
- }
159
+ // - decayed + static lr: literal `1 - lr * wd` baked at compile.
160
+ const decayShrink: number | Tensor =
161
+ !decayedNames.has(name) ? 1
162
+ : decayShrinkScalar !== null ? decayShrinkScalar
163
+ : 1 - initialLr * fullConfig.weightDecay
155
164
 
156
165
  // Three fused kernels per parameter — one for each of m_new / v_new / p_new.
157
166
  const newM = adamUpdateM(mState, g, fullConfig.b1)
package/src/compile.ts CHANGED
@@ -93,7 +93,7 @@ export async function compileModule<M extends Module>(
93
93
  ): Promise<CompiledRuntime & { ir: CompiledIR; uploadInitialParams: () => void }> {
94
94
  const inputDecls = opts.inputs ?? []
95
95
  const model = modelFactory()
96
- let materialized: ReturnType<typeof materializeParams> = { tensors: {}, initFns: {} }
96
+ let materialized: ReturnType<typeof materializeParams> = { tensors: {}, initFns: {}, decayFlags: {} }
97
97
  const graph = trace(() => {
98
98
  materialized = materializeParams(model)
99
99
  const inputTensors = inputDecls.map(d => tensorInput(d.name, d.shape, d.dtype ?? 'f32'))
@@ -104,7 +104,7 @@ export async function compileModule<M extends Module>(
104
104
 
105
105
  let adamResult: ReturnType<typeof appendAdam> | undefined
106
106
  if (opts.adam) {
107
- adamResult = appendAdam(graph, paramGrads, materialized.tensors, opts.adam)
107
+ adamResult = appendAdam(graph, paramGrads, materialized.tensors, opts.adam, materialized.decayFlags)
108
108
  }
109
109
 
110
110
  const plan = planBuffers(graph, paramGrads, adamResult?.writebacks ?? [])
@@ -144,16 +144,8 @@ export async function compileModule<M extends Module>(
144
144
  }
145
145
  }
146
146
 
147
- const { initFns } = materialized
148
147
  const uploadInitialParams = () => {
149
- const out: Record<string, Float32Array> = {}
150
- for (const [name, bufId] of plan.paramsByName) {
151
- const shape = plan.buffers[bufId]!.shape
152
- const size = shape.reduce((a, b) => a * b, 1)
153
- const initFn = initFns[name]
154
- if (!initFn) throw new Error(`uploadInitialParams: no init for param '${name}'`)
155
- out[name] = initFn(size, shape)
156
- }
148
+ const out = buildInitialParamUploads(plan, materialized.initFns)
157
149
  runtime.uploadParams(out)
158
150
  }
159
151
 
@@ -161,6 +153,28 @@ export async function compileModule<M extends Module>(
161
153
  return Object.assign(runtime, { ir, uploadInitialParams })
162
154
  }
163
155
 
156
+ // Build a Record<paramName, Float32Array> by running each param's init
157
+ // function against its shape. Shared by compileModule and compileForward.
158
+ // `sharedParams`, when supplied, skips any name it covers (those are owned
159
+ // by the sibling compile and already initialized there).
160
+ type InitFn = (size: number, shape: readonly number[]) => Float32Array
161
+ function buildInitialParamUploads(
162
+ plan: BufferPlan,
163
+ initFns: Record<string, InitFn>,
164
+ sharedParams?: Map<string, GPUBuffer>,
165
+ ): Record<string, Float32Array> {
166
+ const out: Record<string, Float32Array> = {}
167
+ for (const [name, bufId] of plan.paramsByName) {
168
+ if (sharedParams?.has(name)) continue
169
+ const shape = plan.buffers[bufId]!.shape
170
+ const size = shape.reduce((a, b) => a * b, 1)
171
+ const initFn = initFns[name]
172
+ if (!initFn) throw new Error(`uploadInitialParams: no init for param '${name}'`)
173
+ out[name] = initFn(size, shape)
174
+ }
175
+ return out
176
+ }
177
+
164
178
  // ============================================================================
165
179
  // Forward-only compile
166
180
  // ============================================================================
@@ -187,7 +201,7 @@ export async function compileForward<M extends Module>(
187
201
  ): Promise<CompiledForward & { ir: CompiledIR; uploadInitialParams: () => void }> {
188
202
  const inputDecls = opts.inputs ?? []
189
203
  const model = modelFactory()
190
- let materialized: ReturnType<typeof materializeParams> = { tensors: {}, initFns: {} }
204
+ let materialized: ReturnType<typeof materializeParams> = { tensors: {}, initFns: {}, decayFlags: {} }
191
205
  const graph = trace(() => {
192
206
  materialized = materializeParams(model)
193
207
  const inputTensors = inputDecls.map(d => tensorInput(d.name, d.shape, d.dtype ?? 'f32'))
@@ -201,22 +215,9 @@ export async function compileForward<M extends Module>(
201
215
  const runtime = await createForwardRuntime(plan, kernels, outputBufferId, opts)
202
216
 
203
217
  const sharedParams = opts.sharedParams
204
- const { initFns } = materialized
205
218
  const uploadInitialParams = () => {
206
- const out: Record<string, Float32Array> = {}
207
- let needsUpload = false
208
- for (const [name, bufId] of plan.paramsByName) {
209
- // Skip params covered by sharedParams — those are owned by the providing
210
- // compile and already initialized there.
211
- if (sharedParams?.has(name)) continue
212
- const shape = plan.buffers[bufId]!.shape
213
- const size = shape.reduce((a, b) => a * b, 1)
214
- const initFn = initFns[name]
215
- if (!initFn) throw new Error(`uploadInitialParams: no init for param '${name}'`)
216
- out[name] = initFn(size, shape)
217
- needsUpload = true
218
- }
219
- if (needsUpload) runtime.uploadParams(out, { partial: !!sharedParams })
219
+ const out = buildInitialParamUploads(plan, materialized.initFns, sharedParams)
220
+ if (Object.keys(out).length > 0) runtime.uploadParams(out, { partial: !!sharedParams })
220
221
  }
221
222
 
222
223
  // CompiledIR.loss is the field name; for forward-only, it carries the user's
package/src/grad.ts CHANGED
@@ -18,7 +18,7 @@
18
18
  import type { Graph, OpNode, Tensor, Shape } from './ir.js'
19
19
  import {
20
20
  add, sub, mul, div, mulScalar,
21
- matmul, matmulBatched, transpose, reshape,
21
+ matmul, matmulBatched, transpose, swapAxes, reshape,
22
22
  exp,
23
23
  broadcastTo, sumToShape,
24
24
  constScalar, reluGrad,
@@ -280,14 +280,10 @@ function runTransposeRule(
280
280
  // leading batch dims to get [K, N].
281
281
  const a = tensorOf(op.a), b = tensorOf(op.b)
282
282
  // dA = dC @ B^T
283
- const bT = transpose(b, [1, 0])
284
- accumulate(cotangents, op.a, matmul(outCotan, bT))
283
+ accumulate(cotangents, op.a, matmul(outCotan, swapAxes(b, -1, -2)))
285
284
  // dB: per-batch A^T @ dC, then sum over batch dims.
286
285
  // A is [..., M, K]; transpose last two axes.
287
- const aTPerm = identityPerm(a.shape.length)
288
- ;[aTPerm[a.shape.length - 1], aTPerm[a.shape.length - 2]] =
289
- [aTPerm[a.shape.length - 2]!, aTPerm[a.shape.length - 1]!]
290
- const aT = transpose(a, aTPerm) // [..., K, M]
286
+ const aT = swapAxes(a, -1, -2) // [..., K, M]
291
287
  // matmul_batched needs same rank on both sides. dC has rank `a.rank`;
292
288
  // aT has rank `a.rank`; use matmul_batched if rank > 2, else matmul.
293
289
  let perBatchDb: Tensor
@@ -305,15 +301,8 @@ function runTransposeRule(
305
301
  // dA = dC @ B^T (per-batch, all batch dims preserved)
306
302
  // dB = A^T @ dC (per-batch)
307
303
  const a = tensorOf(op.a), b = tensorOf(op.b)
308
- const lastTwoSwap = (rank: number) => {
309
- const p = identityPerm(rank)
310
- ;[p[rank - 1], p[rank - 2]] = [p[rank - 2]!, p[rank - 1]!]
311
- return p
312
- }
313
- const bT = transpose(b, lastTwoSwap(b.shape.length))
314
- const aT = transpose(a, lastTwoSwap(a.shape.length))
315
- accumulate(cotangents, op.a, matmulBatched(outCotan, bT))
316
- accumulate(cotangents, op.b, matmulBatched(aT, outCotan))
304
+ accumulate(cotangents, op.a, matmulBatched(outCotan, swapAxes(b, -1, -2)))
305
+ accumulate(cotangents, op.b, matmulBatched(swapAxes(a, -1, -2), outCotan))
317
306
  return
318
307
  }
319
308
 
package/src/index.ts CHANGED
@@ -15,13 +15,13 @@ export {
15
15
  // Comparisons + select
16
16
  less, greater, where,
17
17
  // Reductions over the last axis (other axes via reshape/transpose first)
18
- meanLast, sumLast,
18
+ meanLast, sumLast, sumAll,
19
19
  // Shape ops
20
- reshape, transpose,
20
+ reshape, transpose, swapAxes,
21
21
  // Linear algebra
22
22
  matmul, matmulBatched,
23
23
  // Indexing / casting
24
- oneHot, arange,
24
+ oneHot, arange, embedding,
25
25
  // ML primitives — fused for the transformer
26
26
  softmaxCausalLast, logSoftmaxLast, whereCausal,
27
27
  // Slicing
package/src/module.ts CHANGED
@@ -52,6 +52,11 @@ export interface ParamOptions {
52
52
  init?: InitSpec
53
53
  /** Std dev for `'randn'`. Default 0.02. Ignored for non-randn init. */
54
54
  scale?: number
55
+ /** Whether AdamW (when `weightDecay > 0`) should apply decoupled weight
56
+ * decay to this param. Default: `true` for `'randn'` init (weight matrices,
57
+ * embeddings), `false` for `'zeros'` / `'ones'` (biases, LN gains). Override
58
+ * to force or skip. Replaces `adam.decayFilter` for the common case. */
59
+ decay?: boolean
55
60
  }
56
61
 
57
62
  type InitFn = (size: number, shape: readonly number[]) => Float32Array
@@ -76,6 +81,17 @@ function resolveInit(opts: ParamOptions | undefined): InitFn {
76
81
  throw new Error(`Unknown init: ${String(init)}`)
77
82
  }
78
83
 
84
+ /** Resolve the decay default for a param. Decay weight matrices and
85
+ * embedding tables (randn-initialized); skip biases (zeros) and LN gains
86
+ * (ones). Custom init functions default to "decay" — most user-supplied
87
+ * inits are weight-shaped (Kaiming etc.). Explicit `decay: false` overrides. */
88
+ function resolveDecay(opts: ParamOptions | undefined): boolean {
89
+ if (opts?.decay !== undefined) return opts.decay
90
+ const init = opts?.init ?? 'randn'
91
+ if (init === 'zeros' || init === 'ones') return false
92
+ return true // 'randn' or function
93
+ }
94
+
79
95
  // ============================================================================
80
96
  // Internals: param sentinel
81
97
  // ============================================================================
@@ -90,6 +106,7 @@ class ParamSentinel {
90
106
  public readonly shape: Shape,
91
107
  public readonly dtype: Dtype,
92
108
  public readonly initFn: InitFn,
109
+ public readonly decay: boolean,
93
110
  ) {}
94
111
  }
95
112
 
@@ -110,7 +127,7 @@ export abstract class Module {
110
127
  protected param(shape: Shape, opts?: ParamOptions): Tensor {
111
128
  const dtype = opts?.dtype ?? 'f32'
112
129
  // Lie to TypeScript: the sentinel becomes a Tensor at materialize time.
113
- return new ParamSentinel(shape, dtype, resolveInit(opts)) as unknown as Tensor
130
+ return new ParamSentinel(shape, dtype, resolveInit(opts), resolveDecay(opts)) as unknown as Tensor
114
131
  }
115
132
  }
116
133
 
@@ -123,6 +140,9 @@ export interface MaterializedParams {
123
140
  tensors: Record<string, Tensor>
124
141
  /** Init function per param path. Used by `uploadInitialParams`. */
125
142
  initFns: Record<string, InitFn>
143
+ /** Whether this param should receive AdamW weight decay. Resolved at
144
+ * `param()` time from `ParamOptions.decay` (with init-based default). */
145
+ decayFlags: Record<string, boolean>
126
146
  }
127
147
 
128
148
  /**
@@ -136,15 +156,17 @@ export interface MaterializedParams {
136
156
  export function materializeParams(root: Module): MaterializedParams {
137
157
  const tensors: Record<string, Tensor> = {}
138
158
  const initFns: Record<string, InitFn> = {}
159
+ const decayFlags: Record<string, boolean> = {}
139
160
  visit(root, '', (path, val, owner, key) => {
140
161
  if (val instanceof ParamSentinel) {
141
162
  const t = paramInput(path, val.shape, val.dtype)
142
163
  ;(owner as any)[key] = t
143
164
  tensors[path] = t
144
165
  initFns[path] = val.initFn
166
+ decayFlags[path] = val.decay
145
167
  }
146
168
  })
147
- return { tensors, initFns }
169
+ return { tensors, initFns, decayFlags }
148
170
  }
149
171
 
150
172
  // ----------------------------------------------------------------------------
package/src/nn.ts CHANGED
@@ -15,7 +15,9 @@
15
15
 
16
16
  import { Module } from './module.js'
17
17
  import type { Tensor } from './ir.js'
18
- import { add, matmul, sub, mul, div, sqrt, meanLast } from './ops.js'
18
+ import { add, matmul, sub, mul, div, sqrt, meanLast, sumLast, reshape, swapAxes, oneHot, logSoftmaxLast } from './ops.js'
19
+ import { ShapeError } from './shape.js'
20
+ import { captureSite } from './ir.js'
19
21
 
20
22
  // ----------------------------------------------------------------------------
21
23
  // Linear: y = x @ W (+ b)
@@ -57,3 +59,84 @@ export function layerNormFwd(p: LayerNorm, x: Tensor): Tensor {
57
59
  const stdev = sqrt(add(v, p.eps))
58
60
  return add(mul(div(c, stdev), p.g), p.b)
59
61
  }
62
+
63
+ // ----------------------------------------------------------------------------
64
+ // Multi-head attention shape helpers — split the last (model) axis into
65
+ // [nHeads, headDim] and bring heads ahead of the sequence axis.
66
+ // ----------------------------------------------------------------------------
67
+
68
+ /** [..., T, D] → [..., H, T, D/H]. Folds the standard
69
+ * `transpose(reshape(x, [..., T, H, d]), [..., H, T, d])` pattern into one
70
+ * call. Last dim of `x` must divide evenly by `nHeads`. */
71
+ export function splitHeads(x: Tensor, nHeads: number): Tensor {
72
+ const site = captureSite('splitHeads')
73
+ const r = x.shape.length
74
+ if (r < 2) throw new ShapeError(`splitHeads: requires rank >= 2, got ${r}`, site)
75
+ const T = x.shape[r - 2]!
76
+ const D = x.shape[r - 1]!
77
+ if (D % nHeads !== 0) {
78
+ throw new ShapeError(`splitHeads: last dim ${D} not divisible by nHeads ${nHeads}`, site)
79
+ }
80
+ const lead = x.shape.slice(0, r - 2)
81
+ const reshaped = reshape(x, [...lead, T, nHeads, D / nHeads])
82
+ // Swap T (axis lead.length) with H (axis lead.length + 1).
83
+ return swapAxes(reshaped, lead.length, lead.length + 1)
84
+ }
85
+
86
+ /** Inverse of `splitHeads`: [..., H, T, d] → [..., T, H*d]. */
87
+ export function mergeHeads(x: Tensor): Tensor {
88
+ const site = captureSite('mergeHeads')
89
+ const r = x.shape.length
90
+ if (r < 3) throw new ShapeError(`mergeHeads: requires rank >= 3, got ${r}`, site)
91
+ const H = x.shape[r - 3]!
92
+ const T = x.shape[r - 2]!
93
+ const d = x.shape[r - 1]!
94
+ const lead = x.shape.slice(0, r - 3)
95
+ // Swap H (axis r-3) and T (axis r-2): [..., H, T, d] → [..., T, H, d]
96
+ const swapped = swapAxes(x, r - 3, r - 2)
97
+ return reshape(swapped, [...lead, T, H * d])
98
+ }
99
+
100
+ /** Slice a flat capture readback of shape `[H, ..., ...]` into one
101
+ * Float32Array per head. The leading axis is treated as the head axis;
102
+ * pass the shape from `compiled.captureShapes[name]`. Result: `H` arrays,
103
+ * each holding the row-major data for that head (size = product of trailing
104
+ * axes). For B>1 graphs, prefix the result by the batch — this helper
105
+ * assumes the leading axis is heads, which matches how `splitHeads` lays
106
+ * out captures at B=1 (the typical capture-readback shape). */
107
+ export function unsplitHeads(flat: Float32Array, shape: readonly number[]): Float32Array[] {
108
+ if (shape.length < 2) {
109
+ throw new Error(`unsplitHeads: shape needs >= 2 dims, got [${shape.join(', ')}]`)
110
+ }
111
+ // For inference graphs at B=1, captures have shape [1, H, ..., ...]. Strip
112
+ // the leading 1 if present so callers can pass captureShapes[name] directly.
113
+ const s = shape[0] === 1 ? shape.slice(1) : shape
114
+ const H = s[0]!
115
+ let stride = 1
116
+ for (let i = 1; i < s.length; i++) stride *= s[i]!
117
+ const expected = H * stride
118
+ if (flat.length !== expected) {
119
+ throw new Error(`unsplitHeads: flat length ${flat.length} doesn't match shape product ${expected}`)
120
+ }
121
+ return Array.from({ length: H }, (_, h) => flat.slice(h * stride, (h + 1) * stride))
122
+ }
123
+
124
+ // ----------------------------------------------------------------------------
125
+ // Loss helpers
126
+ // ----------------------------------------------------------------------------
127
+
128
+ /** Per-position cross-entropy along the last (vocab) axis: returns
129
+ * `-log p(target)` at each position. `logits` is `[..., V]`; `targets` is
130
+ * `[...]` of i32; result is `[...]` (one rank less than logits). The user
131
+ * applies their own masking + reduction downstream — useful when only some
132
+ * positions contribute (e.g. result-digit masking) or for label smoothing. */
133
+ export function crossEntropyLast(logits: Tensor, targets: Tensor): Tensor {
134
+ const site = captureSite('crossEntropyLast')
135
+ if (targets.dtype !== 'i32') {
136
+ throw new ShapeError(`crossEntropyLast: targets must be i32, got ${targets.dtype}`, site)
137
+ }
138
+ const vocab = logits.shape[logits.shape.length - 1]!
139
+ const lp = logSoftmaxLast(logits) // [..., V]
140
+ const targetLp = sumLast(mul(lp, oneHot(targets, vocab, 'f32'))) // [...]
141
+ return mul(targetLp, -1)
142
+ }
package/src/ops.ts CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  inferReshape, inferTranspose, inferMatmul, inferMatmulBatched,
18
18
  inferOneHot, inferWhereCausal, inferSliceLastRange,
19
19
  inferBroadcastTo, inferSumToShape, inferReluGrad, inferWhere,
20
- ShapeError,
20
+ ShapeError, showShape,
21
21
  } from './shape.js'
22
22
 
23
23
  // ----------------------------------------------------------------------------
@@ -112,6 +112,11 @@ export function sumLast(a: Tensor): Tensor {
112
112
  return addOp(currentGraph(), 'sum_last', outShape, a.dtype, site, { a: a.id })
113
113
  }
114
114
 
115
+ /** Reduce all elements to a 0-d scalar. Composes `reshape` + `sumLast`. */
116
+ export function sumAll(a: Tensor): Tensor {
117
+ return sumLast(reshape(a, [-1]))
118
+ }
119
+
115
120
  // ----------------------------------------------------------------------------
116
121
  // Shape ops.
117
122
  // ----------------------------------------------------------------------------
@@ -128,6 +133,26 @@ export function transpose(a: Tensor, perm: readonly number[]): Tensor {
128
133
  return addOp(currentGraph(), 'transpose', outShape, a.dtype, site, { a: a.id, perm })
129
134
  }
130
135
 
136
+ /** Swap two axes of a tensor. Negative indices count from the end (so
137
+ * `swapAxes(x, -1, -2)` swaps the last two — the common attention pattern).
138
+ * All other axes keep their position. Implemented as `transpose` with the
139
+ * permutation `[0, 1, ..., axis2, ..., axis1, ..., n-1]`. */
140
+ export function swapAxes(a: Tensor, axis1: number, axis2: number): Tensor {
141
+ const r = a.shape.length
142
+ const norm = (axis: number): number => axis < 0 ? r + axis : axis
143
+ const i1 = norm(axis1)
144
+ const i2 = norm(axis2)
145
+ const site = captureSite('swapAxes')
146
+ if (i1 < 0 || i1 >= r || i2 < 0 || i2 >= r) {
147
+ throw new ShapeError(`swapAxes: axis out of range — got (${axis1}, ${axis2}) for rank-${r} tensor`, site)
148
+ }
149
+ if (i1 === i2) return a
150
+ const perm = Array.from({ length: r }, (_, k) => k)
151
+ perm[i1] = i2
152
+ perm[i2] = i1
153
+ return transpose(a, perm)
154
+ }
155
+
131
156
  // ----------------------------------------------------------------------------
132
157
  // Linear algebra.
133
158
  // ----------------------------------------------------------------------------
@@ -163,6 +188,22 @@ export function oneHot(indices: Tensor, depth: number, dtype: Dtype = 'f32'): Te
163
188
  return addOp(currentGraph(), 'one_hot', outShape, dtype, site, { indices: indices.id, depth, dtype })
164
189
  }
165
190
 
191
+ /** Embedding lookup: pull rows from `table` indexed by `indices`. Decomposes
192
+ * to `oneHot(indices, vocab) @ table` so autograd works without a dedicated
193
+ * scatter-with-atomic-add backward — the matmul transpose rule handles it.
194
+ * `table` is `[vocab, dim]`; `indices` is any shape `[...]` of i32; result
195
+ * is `[..., dim]`. The vocab size is taken from `table.shape[0]`. */
196
+ export function embedding(table: Tensor, indices: Tensor): Tensor {
197
+ const site = captureSite('embedding')
198
+ if (table.shape.length !== 2) {
199
+ throw new ShapeError(`embedding: table must be 2-d [vocab, dim], got ${showShape(table.shape)}`, site)
200
+ }
201
+ if (indices.dtype !== 'i32') {
202
+ throw new ShapeError(`embedding: indices must be i32, got ${indices.dtype}`, site)
203
+ }
204
+ return matmul(oneHot(indices, table.shape[0]!, 'f32'), table)
205
+ }
206
+
166
207
  // arange(n) → [n] of values [0, 1, ..., n-1]. Used for position embeddings.
167
208
  export function arange(n: number, dtype: Dtype = 'i32'): Tensor {
168
209
  const site = captureSite('arange')
package/src/runtime.ts CHANGED
@@ -43,17 +43,38 @@ export interface RunWithCaptures {
43
43
  captures: Record<string, Float32Array>
44
44
  }
45
45
 
46
- export interface CompiledRuntime {
47
- /** Map of param name -> the underlying GPUBuffer. Pass to a sibling compile
48
- * via `sharedParams` to share without copies every step on this runtime
49
- * is immediately visible to anyone reading these buffers. */
46
+ /** Common surface for both training and forward-only compiled runtimes. */
47
+ export interface CompiledBase {
48
+ /** Param name -> the underlying GPUBuffer. Pass to a sibling compile via
49
+ * `sharedParams` to share without copies. */
50
50
  params: Map<string, GPUBuffer>
51
+ /** Shape of each tensor registered via `capture(name, t)`. Static after
52
+ * compile — reshape readbacks without recomputing strides. */
53
+ captureShapes: Record<string, number[]>
54
+ /** Shape of the graph's output (loss scalar `[]` for training; the user's
55
+ * returned tensor for forward-only compiles). */
56
+ outputShape: number[]
51
57
  /** Upload parameter Float32Arrays to their GPU buffers. By default, requires
52
58
  * *all* params to be present; throws on any unknown or missing key. Pass
53
59
  * `{ partial: true }` to skip the missing-key check. */
54
60
  uploadParams(params: Record<string, Float32Array>, opts?: UploadParamsOptions): void
55
61
  /** Read all parameters back as Float32Arrays — used for UI panels. */
56
62
  downloadParams(): Promise<Record<string, Float32Array>>
63
+ /** Free GPU resources. */
64
+ destroy(): void
65
+ }
66
+
67
+ /** Run a dispatch and read back the full output tensor (and any registered
68
+ * captures if requested). Forward-only compiles use this as their primary
69
+ * surface; training compiles also expose it but `step()` is more convenient
70
+ * there because the output is a scalar loss. */
71
+ export interface RunFn {
72
+ (inputs: Record<string, Int32Array | Float32Array>): Promise<Float32Array>
73
+ (inputs: Record<string, Int32Array | Float32Array>, opts: { withCaptures: true }): Promise<RunWithCaptures>
74
+ (inputs: Record<string, Int32Array | Float32Array>, opts: RunOptions): Promise<Float32Array | RunWithCaptures>
75
+ }
76
+
77
+ export interface CompiledRuntime extends CompiledBase {
57
78
  /** Read all parameter gradients back. Mostly for verification / debugging. */
58
79
  downloadParamGrads(): Promise<Record<string, Float32Array>>
59
80
  /**
@@ -68,32 +89,19 @@ export interface CompiledRuntime {
68
89
  step(inputs: Record<string, Int32Array | Float32Array>): Promise<number>
69
90
  step(inputs: Record<string, Int32Array | Float32Array>, opts: { withCaptures: true }): Promise<StepWithCaptures>
70
91
  step(inputs: Record<string, Int32Array | Float32Array>, opts: StepOptions): Promise<number | StepWithCaptures>
71
- /** Like `step()` but returns the full output Float32Array instead of just
72
- * its first element. For training graphs this is rarely useful (the output
73
- * *is* a scalar loss); it's the primary API for forward-only compiles. */
74
- run(inputs: Record<string, Int32Array | Float32Array>): Promise<Float32Array>
75
- run(inputs: Record<string, Int32Array | Float32Array>, opts: { withCaptures: true }): Promise<RunWithCaptures>
76
- run(inputs: Record<string, Int32Array | Float32Array>, opts: RunOptions): Promise<Float32Array | RunWithCaptures>
92
+ /** Same dispatch as step() but returns the full output Float32Array. For
93
+ * training graphs the output is a scalar loss, so step() is usually more
94
+ * convenient. Provided for parity with `compileForward`. */
95
+ run: RunFn
77
96
  /** Re-zero all optimizer state buffers (Adam's m/v) in place. Pair with
78
97
  * `uploadInitialParams()` for a full training reset without recompile. */
79
98
  resetOptimizerState(): void
80
- /** Free GPU resources. */
81
- destroy(): void
82
99
  }
83
100
 
84
101
  /** Forward-only compiled runtime — produced by `compileForward`. No optimizer,
85
102
  * no backward. Returns the output tensor (not just a scalar) per `run()` call. */
86
- export interface CompiledForward {
87
- params: Map<string, GPUBuffer>
88
- uploadParams(params: Record<string, Float32Array>, opts?: UploadParamsOptions): void
89
- downloadParams(): Promise<Record<string, Float32Array>>
90
- /** Forward-only dispatch. Returns the graph's output tensor as a Float32Array
91
- * (the user's returned tensor from the forward function, in row-major order).
92
- * With `{ withCaptures: true }`, returns `{ output, captures }`. */
93
- run(inputs: Record<string, Int32Array | Float32Array>): Promise<Float32Array>
94
- run(inputs: Record<string, Int32Array | Float32Array>, opts: { withCaptures: true }): Promise<RunWithCaptures>
95
- run(inputs: Record<string, Int32Array | Float32Array>, opts: RunOptions): Promise<Float32Array | RunWithCaptures>
96
- destroy(): void
103
+ export interface CompiledForward extends CompiledBase {
104
+ run: RunFn
97
105
  }
98
106
 
99
107
  export interface RuntimeOpts {
@@ -147,14 +155,7 @@ export async function createRuntime(
147
155
  label: spec.name ?? `t${spec.id}-${spec.kind}`,
148
156
  })
149
157
  buffers.set(spec.id, buf)
150
- if (spec.kind === 'state') {
151
- // Fill with initValue (typically 0). Float and int both 4 bytes per element.
152
- const elements = spec.byteSize / 4
153
- const init = spec.dtype === 'f32'
154
- ? new Float32Array(elements).fill(spec.initValue ?? 0)
155
- : new Int32Array(elements).fill(Math.trunc(spec.initValue ?? 0))
156
- queue.writeBuffer(buf, 0, init as unknown as BufferSource)
157
- }
158
+ if (spec.kind === 'state') fillStateBuffer(spec, buf)
158
159
  }
159
160
  // Track which params are externally owned — those are skipped on destroy().
160
161
  const ownedBufferIds = new Set<number>()
@@ -404,14 +405,20 @@ export async function createRuntime(
404
405
  return out
405
406
  }
406
407
 
408
+ // Fill a state buffer with its declared initValue (typically 0). Float and
409
+ // int both serialize to 4 bytes per element. Used at allocation time and on
410
+ // resetOptimizerState() — same logic, two callers.
411
+ function fillStateBuffer(spec: { byteSize: number; dtype: 'f32' | 'i32' | 'bool'; initValue?: number }, target: GPUBuffer): void {
412
+ const elements = spec.byteSize / 4
413
+ const init = spec.dtype === 'f32'
414
+ ? new Float32Array(elements).fill(spec.initValue ?? 0)
415
+ : new Int32Array(elements).fill(Math.trunc(spec.initValue ?? 0))
416
+ queue.writeBuffer(target, 0, init as unknown as BufferSource)
417
+ }
418
+
407
419
  function resetOptimizerState() {
408
420
  for (const spec of plan.buffers) {
409
- if (spec.kind !== 'state') continue
410
- const elements = spec.byteSize / 4
411
- const init = spec.dtype === 'f32'
412
- ? new Float32Array(elements).fill(spec.initValue ?? 0)
413
- : new Int32Array(elements).fill(Math.trunc(spec.initValue ?? 0))
414
- queue.writeBuffer(buffers.get(spec.id)!, 0, init as unknown as BufferSource)
421
+ if (spec.kind === 'state') fillStateBuffer(spec, buffers.get(spec.id)!)
415
422
  }
416
423
  }
417
424
 
@@ -421,6 +428,13 @@ export async function createRuntime(
421
428
  for (const [name, bufId] of plan.paramsByName) {
422
429
  params.set(name, buffers.get(bufId)!)
423
430
  }
431
+ // Static-after-compile shape metadata so users don't have to recompute
432
+ // strides to interpret a flat capture readback.
433
+ const captureShapes: Record<string, number[]> = {}
434
+ for (const [name, bufId] of plan.capturesByName) {
435
+ captureShapes[name] = [...plan.buffers[bufId]!.shape]
436
+ }
437
+ const outputShape = [...plan.buffers[lossBufferId]!.shape]
424
438
 
425
439
  const destroy = () => {
426
440
  for (const [id, b] of buffers) {
@@ -432,6 +446,8 @@ export async function createRuntime(
432
446
 
433
447
  return {
434
448
  params,
449
+ captureShapes,
450
+ outputShape,
435
451
  uploadParams,
436
452
  downloadParams: () => downloadFromMap(plan.paramsByName),
437
453
  downloadParamGrads: () => downloadFromMap(plan.paramGradsByName),
@@ -442,22 +458,17 @@ export async function createRuntime(
442
458
  }
443
459
  }
444
460
 
445
- /** Same machinery as `createRuntime`, narrower public API: no step,
446
- * no resetOptimizerState, no downloadParamGrads. Used by `compileForward`. */
461
+ /** Same machinery as `createRuntime`, narrower public type: a forward-only
462
+ * graph exposes `run()` instead of `step()` (no optimizer state, no scalar-
463
+ * loss readback). The full runtime object is built once and projected by
464
+ * `compileForward` to the public shape. */
447
465
  export async function createForwardRuntime(
448
466
  plan: BufferPlan,
449
467
  kernels: KernelSpec[],
450
468
  outputBufferId: number,
451
469
  opts: RuntimeOpts = {},
452
470
  ): Promise<CompiledForward> {
453
- const full = await createRuntime(plan, kernels, outputBufferId, opts)
454
- return {
455
- params: full.params,
456
- uploadParams: full.uploadParams,
457
- downloadParams: full.downloadParams,
458
- run: full.run,
459
- destroy: full.destroy,
460
- }
471
+ return await createRuntime(plan, kernels, outputBufferId, opts)
461
472
  }
462
473
 
463
474
  async function acquireDevice(): Promise<GPUDevice> {