npm - tensorgrad - Versions diffs - 0.0.12 → 0.0.14 - Mend

tensorgrad 0.0.12 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -1,14 +1,740 @@
-export type { Tensor, Shape, Dtype, OpNode, Graph, CallSite } from './ir.js';
-export { ShapeError } from './shape.js';
-export { trace, traceInto, paramInput, tensorInput, stateInput } from './trace.js';
-export { capture } from './capture.js';
-export { add, sub, mul, div, sqrt, rsqrt, log, exp, relu, less, greater, where, meanLast, sumLast, sumAll, reshape, transpose, swapAxes, matmul, matmulBatched, oneHot, arange, embedding, softmaxCausalLast, logSoftmaxLast, whereCausal, sliceLastRange, } from './ops.js';
-export { appendGrad, type GradResult } from './grad.js';
-export { appendAdam, type AdamConfig, type AdamResult } from './adam.js';
-export { planBuffers, type BufferPlan, type BufferSpec, type Writeback, type WritebackDecl } from './buffers.js';
-export { emitKernels, type KernelSpec } from './codegen.js';
-export { createRuntime, createForwardRuntime, Captures, type CompiledRuntime, type CompiledForward, type RuntimeOpts, type RunOptions, type StepResult, type RunResult } from './runtime.js';
-export { compile, compileToIR, compileModule, compileForward, type CompiledIR, type CompileModuleOptions, type CompileForwardOptions, type CompileForwardMethodOptions, type CompiledModule, type CompiledForwardModule, type InputDecl, type InputDecls, type InputsTensors, type ForwardFn, } from './compile.js';
-export { Module, materializeParams, type InitSpec, type ParamOptions, type MaterializedParams } from './module.js';
-export * as nn from './nn.js';
-//# sourceMappingURL=index.d.ts.map
+type Dtype = 'f32' | 'i32' | 'bool';
+type Shape = readonly number[];
+interface Tensor {
+    readonly id: number;
+    readonly shape: Shape;
+    readonly dtype: Dtype;
+    readonly source: number | null;
+    readonly site: CallSite | null;
+}
+interface CallSite {
+    readonly opName: string;
+    readonly stack: string;
+}
+type OpNode = {
+    kind: 'param_input';
+    out: number;
+    name: string;
+} | {
+    kind: 'tensor_input';
+    out: number;
+    name: string;
+} | {
+    kind: 'state_input';
+    out: number;
+    name: string;
+    initValue: number;
+} | {
+    kind: 'add';
+    out: number;
+    a: number;
+    b: number;
+} | {
+    kind: 'sub';
+    out: number;
+    a: number;
+    b: number;
+} | {
+    kind: 'mul';
+    out: number;
+    a: number;
+    b: number;
+} | {
+    kind: 'div';
+    out: number;
+    a: number;
+    b: number;
+} | {
+    kind: 'mul_scalar';
+    out: number;
+    a: number;
+    scalar: number;
+} | {
+    kind: 'add_scalar';
+    out: number;
+    a: number;
+    scalar: number;
+} | {
+    kind: 'sqrt';
+    out: number;
+    a: number;
+} | {
+    kind: 'rsqrt';
+    out: number;
+    a: number;
+} | {
+    kind: 'log';
+    out: number;
+    a: number;
+} | {
+    kind: 'exp';
+    out: number;
+    a: number;
+} | {
+    kind: 'relu';
+    out: number;
+    a: number;
+} | {
+    kind: 'mean_last';
+    out: number;
+    a: number;
+} | {
+    kind: 'sum_last';
+    out: number;
+    a: number;
+} | {
+    kind: 'reshape';
+    out: number;
+    a: number;
+    newShape: Shape;
+} | {
+    kind: 'transpose';
+    out: number;
+    a: number;
+    perm: readonly number[];
+} | {
+    kind: 'matmul';
+    out: number;
+    a: number;
+    b: number;
+} | {
+    kind: 'matmul_batched';
+    out: number;
+    a: number;
+    b: number;
+} | {
+    kind: 'one_hot';
+    out: number;
+    indices: number;
+    depth: number;
+    dtype: Dtype;
+} | {
+    kind: 'arange';
+    out: number;
+    n: number;
+    dtype: Dtype;
+} | {
+    kind: 'softmax_causal_last';
+    out: number;
+    a: number;
+} | {
+    kind: 'log_softmax_last';
+    out: number;
+    a: number;
+} | {
+    kind: 'where_causal';
+    out: number;
+    a: number;
+    fillValue: number;
+} | {
+    kind: 'less';
+    out: number;
+    a: number;
+    b: number;
+} | {
+    kind: 'greater';
+    out: number;
+    a: number;
+    b: number;
+} | {
+    kind: 'where';
+    out: number;
+    cond: number;
+    a: number;
+    b: number;
+} | {
+    kind: 'adam_update_m';
+    out: number;
+    m: number;
+    g: number;
+    b1: number;
+} | {
+    kind: 'adam_update_v';
+    out: number;
+    v: number;
+    g: number;
+    b2: number;
+} | {
+    kind: 'adam_update_p';
+    out: number;
+    p: number;
+    mNew: number;
+    vNew: number;
+    lrt: number;
+    eps: number;
+    decayShrink: number;
+    decayShrinkTensor: number | null;
+} | {
+    kind: 'slice_last_range';
+    out: number;
+    a: number;
+    start: number;
+    end: number;
+} | {
+    kind: 'broadcast_to';
+    out: number;
+    a: number;
+    targetShape: Shape;
+} | {
+    kind: 'sum_to_shape';
+    out: number;
+    a: number;
+    targetShape: Shape;
+} | {
+    kind: 'const_scalar';
+    out: number;
+    value: number;
+    dtype: Dtype;
+} | {
+    kind: 'relu_grad';
+    out: number;
+    x: number;
+    dy: number;
+};
+interface Graph {
+    readonly ops: OpNode[];
+    readonly tensors: Tensor[];
+    readonly outputs: number[];
+    readonly captures: Map<string, number>;
+}
+declare class ShapeError extends Error {
+    constructor(message: string, site: CallSite | null);
+}
+declare function trace(fn: () => Tensor | Tensor[]): Graph;
+declare function traceInto<T>(g: Graph, fn: () => T): T;
+declare function paramInput(name: string, shape: Shape, dtype?: Dtype): Tensor;
+declare function tensorInput(name: string, shape: Shape, dtype?: Dtype): Tensor;
+declare function stateInput(name: string, shape: Shape, dtype?: Dtype, initValue?: number): Tensor;
+declare function capture<T extends Tensor>(name: string, t: T): T;
+declare function add(a: Tensor, b: Tensor | number): Tensor;
+declare function sub(a: Tensor, b: Tensor | number): Tensor;
+declare function mul(a: Tensor, b: Tensor | number): Tensor;
+declare function div(a: Tensor, b: Tensor | number): Tensor;
+declare const sqrt: (a: Tensor) => Tensor;
+declare const rsqrt: (a: Tensor) => Tensor;
+declare const log: (a: Tensor) => Tensor;
+declare const exp: (a: Tensor) => Tensor;
+declare const relu: (a: Tensor) => Tensor;
+declare function meanLast(a: Tensor): Tensor;
+declare function sumLast(a: Tensor): Tensor;
+/** Reduce all elements to a 0-d scalar. Composes `reshape` + `sumLast`. */
+declare function sumAll(a: Tensor): Tensor;
+declare function reshape(a: Tensor, newShape: Shape): Tensor;
+declare function transpose(a: Tensor, perm: readonly number[]): Tensor;
+/** Swap two axes of a tensor. Negative indices count from the end (so
+ *  `swapAxes(x, -1, -2)` swaps the last two — the common attention pattern).
+ *  All other axes keep their position. Implemented as `transpose` with the
+ *  permutation `[0, 1, ..., axis2, ..., axis1, ..., n-1]`. */
+declare function swapAxes(a: Tensor, axis1: number, axis2: number): Tensor;
+declare function matmul(a: Tensor, b: Tensor): Tensor;
+declare function matmulBatched(a: Tensor, b: Tensor): Tensor;
+declare function oneHot(indices: Tensor, depth: number, dtype?: Dtype): Tensor;
+/** Embedding lookup: pull rows from `table` indexed by `indices`. Decomposes
+ *  to `oneHot(indices, vocab) @ table` so autograd works without a dedicated
+ *  scatter-with-atomic-add backward — the matmul transpose rule handles it.
+ *  `table` is `[vocab, dim]`; `indices` is any shape `[...]` of i32; result
+ *  is `[..., dim]`. The vocab size is taken from `table.shape[0]`. */
+declare function embedding(table: Tensor, indices: Tensor): Tensor;
+declare function arange(n: number, dtype?: Dtype): Tensor;
+declare function softmaxCausalLast(a: Tensor): Tensor;
+declare function logSoftmaxLast(a: Tensor): Tensor;
+declare function whereCausal(a: Tensor, fillValue: number): Tensor;
+declare function sliceLastRange(a: Tensor, start: number, end: number): Tensor;
+declare const less: (a: Tensor, b: Tensor) => Tensor;
+declare const greater: (a: Tensor, b: Tensor) => Tensor;
+declare function where(cond: Tensor, a: Tensor, b: Tensor): Tensor;
+interface GradResult {
+    readonly graph: Graph;
+    readonly paramGrads: Record<string, Tensor>;
+    readonly loss: Tensor;
+}
+declare function appendGrad(graph: Graph): GradResult;
+interface BufferSpec {
+    /** Matches tensor.id. */
+    id: number;
+    byteSize: number;
+    dtype: Dtype;
+    shape: Shape;
+    kind: 'param' | 'param_grad' | 'tensor_input' | 'state' | 'intermediate' | 'output';
+    /** External name for param/param_grad/tensor_input/state bindings. null otherwise. */
+    name: string | null;
+    /** For state buffers: the value to fill on initial allocation. 0 by default. */
+    initValue?: number;
+}
+/**
+ * After step(), copy `source`'s buffer into `dest`'s buffer.
+ * Used to write back updated optimizer state and updated parameters into
+ * their persistent home buffers.
+ */
+interface Writeback {
+    source: number;
+    dest: number;
+    bytes: number;
+}
+interface BufferPlan {
+    buffers: BufferSpec[];
+    /** Tensor id -> buffer id (currently 1:1 but kept opaque for future pooling). */
+    tensorToBuffer: Map<number, number>;
+    /** Easy lookup tables for the runtime. */
+    paramsByName: Map<string, number>;
+    inputsByName: Map<string, number>;
+    paramGradsByName: Map<string, number>;
+    statesByName: Map<string, number>;
+    capturesByName: Map<string, number>;
+    outputBufferIds: number[];
+    /** End-of-step writebacks (Adam updates for params, m, v, etc.) */
+    writebacks: Writeback[];
+}
+/**
+ * Caller-supplied writeback declarations: "after each step, copy this Tensor's
+ * buffer into the persistent home of this param/state."
+ */
+interface WritebackDecl {
+    /** The Tensor (output of some op) holding the new value to write back. */
+    source: Tensor;
+    /** Either a param name (writes to that param's home buffer) or a state name. */
+    destName: string;
+    destKind: 'param' | 'state';
+}
+/**
+ * Build a BufferPlan from a graph + the param-grad map produced by appendGrad.
+ * @param graph the full graph (forward + backward + any optimizer ops)
+ * @param paramGrads map from param name -> the Tensor that holds its gradient
+ * @param writebackDecls list of end-of-step writebacks (e.g. from appendAdam).
+ *                       Empty when there's no optimizer in the graph.
+ */
+declare function planBuffers(graph: Graph, paramGrads: Record<string, Tensor>, writebackDecls?: WritebackDecl[]): BufferPlan;
+interface AdamConfig {
+    /** Constant scalar (e.g., `0.005`) or a per-step schedule function
+     *  `(step) => lr`. Schedule fn lets the user implement linear/cosine decay
+     *  or warmup; first call passes `step=1`. Decay-shrink (AdamW) updates
+     *  per-step automatically when this is a function. */
+    lr: number | ((step: number) => number);
+    b1?: number;
+    b2?: number;
+    eps?: number;
+    /** AdamW: decoupled weight decay coefficient. Default 0 (plain Adam).
+     *  When non-zero, every step shrinks each decayed param by a factor of
+     *  `1 - lr * weightDecay` before the gradient update. */
+    weightDecay?: number;
+    /** Filter deciding which params get weight decay. Only consulted when
+     *  weightDecay > 0. Default: decay every param. Override for the standard
+     *  transformer convention (decay weights/embeddings, skip biases + LN gains).
+     *  Example: `(name) => name.includes('.W') || name.endsWith('_emb')`. */
+    decayFilter?: (paramName: string) => boolean;
+}
+/** Resolved hyperparameters: lr is the schedule fn (constants are wrapped). */
+interface AdamResolvedConfig {
+    lr: (step: number) => number;
+    b1: number;
+    b2: number;
+    eps: number;
+    weightDecay: number;
+    decayFilter: (name: string) => boolean;
+    /** True iff the user supplied an lr function (vs a constant). When false,
+     *  decayShrink is baked at compile time and never updated. */
+    lrIsScheduled: boolean;
+}
+interface AdamResult {
+    /** Writebacks the buffer planner should wire into the runtime. */
+    writebacks: WritebackDecl[];
+    /** Name of the per-step scalar tensor_input. The runtime fills this each call
+     * with `lr * sqrt(1-b2^t)/(1-b1^t)` (Adam's bias-corrected effective LR). */
+    lrtInputName: string;
+    /** Name of the per-step decayShrink scalar tensor_input, or null when lr is
+     *  static (decayShrink baked into the kernel) or no params are decayed. */
+    decayShrinkInputName: string | null;
+    /** Hyperparameters as captured (so the runtime can compute lrt and decayShrink). */
+    config: AdamResolvedConfig;
+}
+/**
+ * Append Adam update ops to `graph`. Must be called inside an active trace
+ * context (or after a trace, since traceInto re-enters the graph).
+ *
+ * @param graph the graph (already containing forward + backward)
+ * @param paramGrads param name -> gradient tensor (output of `appendGrad`)
+ * @param paramTensors param name -> the param's leaf Tensor (the param_input).
+ *                     Needed because the param_input lives in the graph but we
+ *                     don't have a direct map by name in `Graph` — caller passes it.
+ * @param config Adam hyperparameters. Set `weightDecay > 0` for AdamW; an
+ *               optional `decayFilter` selects which params receive decay.
+ * @param decayFlags optional per-param decay flags (as produced by
+ *               `materializeParams`). An entry here takes precedence over
+ *               `config.decayFilter` for that name; names not present fall
+ *               back to `decayFilter`.
+ * @returns an `AdamResult`: the writeback declarations to hand to the buffer
+ *          planner, the names of the per-step scalar inputs (`lrtInputName`,
+ *          `decayShrinkInputName`), and the resolved hyperparameters.
+ */
+declare function appendAdam(graph: Graph, paramGrads: Record<string, Tensor>, paramTensors: Record<string, Tensor>, config: AdamConfig,
+/** Per-param decay flags from `materializeParams`. When supplied, overrides
+ *  `config.decayFilter` for any name in the map; falls back to `decayFilter`
+ *  for names not present (e.g., for low-level callers using `compile()`
+ *  directly without a Module). */
+decayFlags?: Record<string, boolean>): AdamResult;
+interface KernelSpec {
+    /** Index into graph.ops. */
+    opIndex: number;
+    /** Op kind (for debugging / pipeline cache key). */
+    opKind: OpNode['kind'];
+    /** Generated WGSL source. Empty string for "logical" ops with no kernel. */
+    wgsl: string;
+    /**
+     * Buffer ids in binding-index order. The runtime creates a bind group with
+     * these in @binding(0..N) on @group(0). Inputs come first (read), output last
+     * (read_write).
+     */
+    bindings: number[];
+    /** Number of threads to dispatch (1-D). 0 means "skip" (e.g. reshape no-op). */
+    threads: number;
+    /** Workgroup size; usually WG_SIZE. */
+    workgroupSize: number;
+}
+/** Generate a KernelSpec per compute op in graph.ops (in dispatch order). */
+declare function emitKernels(graph: Graph, plan: BufferPlan): KernelSpec[];
+interface UploadParamsOptions {
+    /** Skip the "missing param" check, allowing the caller to update only some
+     *  params and leave the rest at their current GPU values. Extra (unknown)
+     *  keys are still rejected — that's always a typo. Default: false. */
+    partial?: boolean;
+}
+/**
+ * Activation readbacks for one `step()`/`run()` call. Keyed by the names
+ * passed to `capture(name, t)` during the trace. `get(name)` throws if the
+ * name isn't registered or wasn't read back this call (i.e., the call was
+ * made without `{ withCaptures: true }`); use `has(name)` if you need to
+ * branch. `shapeOf(name)` returns the static-after-compile shape and works
+ * regardless of whether captures were read back.
+ */
+declare class Captures {
+    private readonly shapes;
+    private readonly data;
+    constructor(shapes: Record<string, readonly number[]>, data: Map<string, Float32Array>);
+    get(name: string): Float32Array;
+    shapeOf(name: string): readonly number[];
+    has(name: string): boolean;
+    names(): string[];
+}
+interface RunResult {
+    output: Float32Array;
+    captures: Captures;
+}
+interface StepResult {
+    loss: number;
+    captures: Captures;
+}
+interface RunOptions {
+    /** Read back tensors registered via `capture(name, t)` during the trace.
+     *  Default false. When false, the returned `captures` is empty (calling
+     *  `.get` throws); when true, captures are read back and accessible. */
+    withCaptures?: boolean;
+}
+/** Common surface for both training and forward-only compiled runtimes. */
+interface CompiledBase {
+    /** The GPUDevice this runtime is bound to. Pass to sibling compiles to
+     *  share the device, or use directly for other GPU work. */
+    device: GPUDevice;
+    /** Param name -> the underlying GPUBuffer. Pass to a sibling compile via
+     *  `sharedParams` to share without copies. */
+    params: Map<string, GPUBuffer>;
+    /** Shape of the graph's output (loss scalar `[]` for training; the user's
+     *  returned tensor for forward-only compiles). */
+    outputShape: number[];
+    /** Upload parameter Float32Arrays to their GPU buffers. By default, requires
+     *  *all* params to be present; throws on any unknown or missing key. Pass
+     *  `{ partial: true }` to skip the missing-key check. */
+    uploadParams(params: Record<string, Float32Array>, opts?: UploadParamsOptions): void;
+    /** Read all parameters back as Float32Arrays — used for UI panels. */
+    downloadParams(): Promise<Record<string, Float32Array>>;
+    /** Free GPU resources. */
+    destroy(): void;
+}
+/** Run a dispatch and read back the full output tensor. Default returns the
+ *  output as a `Float32Array`; with `{ withCaptures: true }` returns
+ *  `{ output, captures }`. Same shape as `step()`'s overloads. */
+interface RunFn {
+    (inputs: Record<string, Int32Array | Float32Array>): Promise<Float32Array>;
+    (inputs: Record<string, Int32Array | Float32Array>, opts: {
+        withCaptures: true;
+    }): Promise<RunResult>;
+    (inputs: Record<string, Int32Array | Float32Array>, opts: RunOptions): Promise<Float32Array | RunResult>;
+}
+interface CompiledRuntime extends CompiledBase {
+    /** Read all parameter gradients back. Mostly for verification / debugging. */
+    downloadParamGrads(): Promise<Record<string, Float32Array>>;
+    /**
+     * One full forward+backward step.
+     *   1. Uploads `inputs` (tokens, targets, masks) to input buffers.
+     *   2. Dispatches every kernel in order.
+     *   3. Reads back the loss scalar (and any registered captures, if requested).
+     * Default returns the loss as a JS number; with `{ withCaptures: true }`
+     * returns `{ loss, captures }`. When `opts` is only known as a general
+     * `RunOptions` (i.e. `withCaptures` is not the literal `true`), the result
+     * is typed as the union `number | StepResult`.
+     */
+    step(inputs: Record<string, Int32Array | Float32Array>): Promise<number>;
+    step(inputs: Record<string, Int32Array | Float32Array>, opts: {
+        withCaptures: true;
+    }): Promise<StepResult>;
+    step(inputs: Record<string, Int32Array | Float32Array>, opts: RunOptions): Promise<number | StepResult>;
+    /** Same dispatch as step() but returns the full output Float32Array — for
+     *  training graphs the output is a scalar loss, so step() is usually more
+     *  convenient. Provided for parity with `compileForward`. */
+    run: RunFn;
+    /** Re-zero all optimizer state buffers (Adam's m/v) in place. Pair with
+     *  `uploadParams()` to restore parameter values for a full training reset
+     *  without recompile; on a `compileModule` result, `reset()` re-initializes
+     *  the params and zeroes the optimizer state in one call. */
+    resetOptimizerState(): void;
+}
+/** Forward-only compiled runtime — produced by `compileForward`. No optimizer,
+ *  no backward. Returns the output tensor (not just a scalar) per `run()` call. */
+interface CompiledForward extends CompiledBase {
+    run: RunFn;
+}
+interface RuntimeOpts {
+    /** Pre-acquired GPUDevice. If omitted, runtime requests its own. */
+    device?: GPUDevice;
+    /** External param buffers to bind in place of allocating fresh ones, keyed
+     *  by param name. Used to share params between a training compile and a
+     *  sibling forward-only compile (e.g., a B=1 inference graph). When a name
+     *  is in this map, the runtime reuses the provided GPUBuffer; otherwise it
+     *  allocates as usual. */
+    sharedParams?: Map<string, GPUBuffer>;
+}
+declare function createRuntime(plan: BufferPlan, kernels: KernelSpec[], lossBufferId: number, opts?: RuntimeOpts): Promise<CompiledRuntime>;
+/** Same machinery as `createRuntime`, narrower public type: a forward-only
+ *  graph exposes `run()` instead of `step()` (no optimizer state, no scalar-
+ *  loss readback). The full runtime object is built once and projected by
+ *  `compileForward` to the public shape. */
+declare function createForwardRuntime(plan: BufferPlan, kernels: KernelSpec[], outputBufferId: number, opts?: RuntimeOpts): Promise<CompiledForward>;
+/** How a parameter's initial values are produced.
+ *  - `'randn'` — Gaussian, with `scale` (default 0.02). The common case for
+ *    weight matrices and embeddings.
+ *  - `'zeros'` — fill with 0. Common for biases and LayerNorm beta.
+ *  - `'ones'`  — fill with 1. Common for LayerNorm gain.
+ *  - Custom function — receives total element count and shape, returns the
+ *    Float32Array. Use for fan-in scaling or any non-standard scheme.
+ */
+type InitSpec = 'randn' | 'zeros' | 'ones' | ((size: number, shape: readonly number[]) => Float32Array);
+interface ParamOptions {
+    dtype?: Dtype;
+    /** Init kind. Default: `'randn'`. */
+    init?: InitSpec;
+    /** Std dev for `'randn'`. Default 0.02. Ignored for non-randn init. */
+    scale?: number;
+    /** Whether AdamW (when `weightDecay > 0`) should apply decoupled weight
+     *  decay to this param. Default: `true` for `'randn'` init (weight matrices,
+     *  embeddings), `false` for `'zeros'` / `'ones'` (biases, LN gains). Override
+     *  to force or skip. Replaces `adam.decayFilter` for the common case. */
+    decay?: boolean;
+}
+type InitFn = (size: number, shape: readonly number[]) => Float32Array;
+declare abstract class Module {
+    /**
+     * Declare a learnable parameter at this module. Must be called from inside
+     * the constructor (typically as a field assignment). Returns a placeholder
+     * that gets replaced with a real Tensor at compile time.
+     *
+     * The parameter's name is auto-derived from its property path in the model
+     * tree (e.g. `layers.0.attn.W_q`). Init metadata travels with the param;
+     * `compileModule` / `compileForward` upload the initial values as part of
+     * compilation (params supplied via `sharedParams` excepted), and `reset()`
+     * on a compiled module re-applies them.
+     *
+     * @param shape static shape of the parameter.
+     * @param opts optional dtype, init scheme, randn scale and weight-decay
+     *             flag; see `ParamOptions` for the defaults.
+     * @returns a placeholder typed as `Tensor`.
+     */
+    protected param(shape: Shape, opts?: ParamOptions): Tensor;
+}
+interface MaterializedParams {
+    /** Map from auto-derived path (e.g. `layers.0.attn.W_q`) to its Tensor. */
+    tensors: Record<string, Tensor>;
+    /** Init function per param path. Each one takes the element count and
+     *  shape and returns the initial Float32Array for that param. */
+    initFns: Record<string, InitFn>;
+    /** Whether this param should receive AdamW weight decay. Resolved at
+     *  `param()` time from `ParamOptions.decay` (with init-based default). */
+    decayFlags: Record<string, boolean>;
+}
+/**
+ * Walk the module tree and replace every ParamSentinel with a real Tensor
+ * created via `paramInput(autoName, ...)`. Must be called inside an active
+ * trace context (paramInput appends to the current graph).
+ *
+ * @param root the top-level module whose tree is walked.
+ * @returns the param tensors keyed by path, plus the per-param init
+ * functions and weight-decay flags (see `MaterializedParams`).
+ */
+declare function materializeParams(root: Module): MaterializedParams;
+/** Declares one input tensor of the model's forward function. The name is the
+ *  key in the `inputs:` Record at compile time and the key on the `step()`/
+ *  `run()` data object at runtime. */
+interface InputDecl {
+    shape: Shape;
+    dtype?: Dtype;
+}
+/** Inputs declaration: a Record from input name to its shape/dtype. The name
+ *  doubles as the key the forward fn destructures and the key the runtime
+ *  expects in `step({...})` / `run({...})`. */
+type InputDecls = Record<string, InputDecl>;
+/** Maps an `InputDecls` Record to its forward-time tensor counterpart —
+ *  same keys, each value is a Tensor. Used to type the forward function's
+ *  `inputs` argument from the declared shape Record. */
+type InputsTensors<I extends InputDecls> = {
+    [K in keyof I]: Tensor;
+};
+/** Forward function shape: takes the materialized model and a Record of
+ *  named input tensors (matching the declared `inputs:` keys), returns the
+ *  output tensor (loss for compileModule; logits/etc. for compileForward).
+ *  The second generic flows from the inputs declaration so destructuring
+ *  the input record stays typed. */
+type ForwardFn<M extends Module, I extends InputDecls = InputDecls> = (m: M, inputs: InputsTensors<I>) => Tensor;
+interface CompiledIR {
+    graph: GradResult['graph'];
+    paramGrads: GradResult['paramGrads'];
+    loss: Tensor;
+    plan: BufferPlan;
+    kernels: KernelSpec[];
+}
+/** Trace + autograd + buffer-plan + codegen, without touching WebGPU. */
+declare function compileToIR(traceFn: () => Tensor): CompiledIR;
+/** Full compile pipeline. Browser-only because it creates a GPUDevice. */
+declare function compile(traceFn: () => Tensor, opts?: RuntimeOpts): Promise<CompiledRuntime & {
+    ir: CompiledIR;
+}>;
+interface CompileModuleOptions<I extends InputDecls = InputDecls> extends RuntimeOpts {
+    /** Per-step data inputs to the forward function, keyed by name. The forward
+     *  fn destructures these out of its second argument; runtime calls to
+     *  `step()` / `run()` pass typed arrays under the same keys. */
+    inputs?: I;
+    /** Adam hyperparameters. If omitted, no optimizer is appended (forward-only). */
+    adam?: AdamConfig;
+}
+interface CompileForwardOptions<I extends InputDecls = InputDecls> extends RuntimeOpts {
+    /** Per-step data inputs to the forward function, keyed by name. */
+    inputs?: I;
+}
+/** Forward-only compile options as taken by the `compileForward` *method* on
+ *  a training runtime — no `device` (inherited) and no `sharedParams`
+ *  (auto-supplied from the train graph's params). */
+interface CompileForwardMethodOptions<I extends InputDecls = InputDecls> {
+    inputs?: I;
+}
+/** Returned by `compileModule`. Adds training-graph extras (auto-init, reset,
+ *  sibling-graph compile) on top of the base runtime. */
+interface CompiledModule<M extends Module> extends CompiledRuntime {
+    ir: CompiledIR;
+    /** Number of dispatchable kernels (excludes leaf no-ops). */
+    kernelCount: number;
+    /** Re-initialize all params from their declared init specs and zero the
+     *  optimizer state. Use to start training over without recompiling. */
+    reset(): void;
+    /** Compile a sibling forward-only graph (e.g., a B=1 inference graph or a
+     *  B=N held-out eval graph) that shares this runtime's device and param
+     *  buffers. Pass the forward fn (typically distinct from your loss fn —
+     *  it returns logits, not a scalar) and any shape changes via `inputs`.
+     *  Auto-initialization is a no-op since params are shared. */
+    compileForward<I extends InputDecls>(forward: ForwardFn<M, I>, opts?: CompileForwardMethodOptions<I>): Promise<CompiledForwardModule>;
+}
+/** Returned by `compileForward` (and by the `compileForward` method). */
+interface CompiledForwardModule extends CompiledForward {
+    ir: CompiledIR;
+    /** Number of dispatchable kernels (excludes leaf no-ops). */
+    kernelCount: number;
+}
+/**
+ * Compile a Module-based model. Pass a *factory* `() => new Model()`, not the
+ * model instance itself: compilation mutates the tree (every `ParamSentinel`
+ * field becomes a real `Tensor`), so the instance is consumed and shouldn't be
+ * referenced afterwards. Re-call the factory if you need a fresh tree.
+ *
+ * The forward function takes the materialized model and a Record of named
+ * input tensors, returns the loss tensor. Inputs are matched by name with the
+ * `inputs:` declaration:
+ *
+ *   inputs: {
+ *     tokens:  { shape: [B, T], dtype: 'i32' },
+ *     targets: { shape: [B, T], dtype: 'i32' },
+ *   }
+ *   forward: (m, { tokens, targets }) => …
+ *
+ * Walks the module tree to materialize params with auto-derived names, then
+ * runs trace → grad → adam → buffer plan → codegen → runtime. Initial
+ * parameter values are uploaded automatically before this function returns;
+ * call `reset()` later to re-randomize.
+ *
+ * If `opts.adam` is set, the runtime's `step()` automatically tracks an
+ * internal step count and injects the bias-corrected `lrt` scalar each call;
+ * users don't need to provide it themselves. With a scheduled `lr` (a
+ * function) and decayed params, the AdamW decay-shrink factor is likewise
+ * updated per step.
+ *
+ * @param modelFactory zero-argument factory returning a fresh model tree.
+ * @param forward loss function over the materialized model and named inputs.
+ * @param opts input declarations, optional Adam config, and runtime options
+ *             (`device`, `sharedParams`).
+ * @returns a promise for the compiled training module.
+ */
+declare function compileModule<M extends Module, I extends InputDecls = InputDecls>(modelFactory: () => M, forward: ForwardFn<M, I>, opts?: CompileModuleOptions<I>): Promise<CompiledModule<M>>;
+/**
+ * Compile a Module-based model in forward-only mode (no autograd, no Adam).
+ * The forward function returns the output tensor (e.g., logits) instead of a
+ * scalar loss; runtime exposes `run(inputs)` returning the full output as a
+ * `Float32Array`.
+ *
+ * **Prefer the `compileForward` method on a training runtime** when both
+ * graphs use the same Module class — it auto-supplies `device` and
+ * `sharedParams`. This standalone form is for forward-only models with no
+ * training graph at all, or for sharing params across a different model.
+ *
+ * **Sharing params with a training compile.** Pass `opts.sharedParams =
+ * trainCompiled.params` to bind this graph's param buffers to an existing
+ * training runtime's GPU buffers — every train step is then immediately
+ * visible to `run()` calls here, no copies.
+ *
+ * Initial param values are uploaded automatically for params *not* covered
+ * by `sharedParams` (those are owned by the sibling compile).
+ */
+declare function compileForward<M extends Module, I extends InputDecls = InputDecls>(modelFactory: () => M, forward: ForwardFn<M, I>, opts?: CompileForwardOptions<I>): Promise<CompiledForwardModule>;
+interface LinearOptions {
+    /** Include a bias term (default true). */
+    bias?: boolean;
+}
+declare class Linear extends Module {
+    readonly inDim: number;
+    readonly outDim: number;
+    W: Tensor;
+    b: Tensor | null;
+    constructor(inDim: number, outDim: number, opts?: LinearOptions);
+    fwd(x: Tensor): Tensor;
+}
+declare class LayerNorm extends Module {
+    readonly d: number;
+    readonly eps: number;
+    g: Tensor;
+    b: Tensor;
+    constructor(d: number, eps?: number);
+    fwd(x: Tensor): Tensor;
+}
+/** [..., T, D] → [..., H, T, D/H]. Folds the standard
+ *  `transpose(reshape(x, [..., T, H, d]), [..., H, T, d])` pattern into one
+ *  call. Last dim of `x` must divide evenly by `nHeads`. */
+declare function splitHeads(x: Tensor, nHeads: number): Tensor;
+/** Inverse of `splitHeads`: [..., H, T, d] → [..., T, H*d]. */
+declare function mergeHeads(x: Tensor): Tensor;
+/** Slice a captured tensor named `name` into one Float32Array per head, using
+ *  the static shape registered at compile time. The leading axis is treated as
+ *  heads (matching `splitHeads` layout at B=1); a leading singleton batch is
+ *  stripped if present so callers can pass capture names directly. Throws if
+ *  the capture isn't registered or wasn't read back this call. */
+declare function unsplitHeads(captures: Captures, name: string): Float32Array[];
+/** Per-position cross-entropy along the last (vocab) axis: returns
+ *  `-log p(target)` at each position. `logits` is `[..., V]`; `targets` is
+ *  `[...]` of i32; result is `[...]` (one rank less than logits). The user
+ *  applies their own masking + reduction downstream — useful when only some
+ *  positions contribute (e.g. result-digit masking) or for label smoothing. */
+declare function crossEntropyLast(logits: Tensor, targets: Tensor): Tensor;
+type nn_d_LayerNorm = LayerNorm;
+declare const nn_d_LayerNorm: typeof LayerNorm;
+type nn_d_Linear = Linear;
+declare const nn_d_Linear: typeof Linear;
+type nn_d_LinearOptions = LinearOptions;
+declare const nn_d_crossEntropyLast: typeof crossEntropyLast;
+declare const nn_d_mergeHeads: typeof mergeHeads;
+declare const nn_d_splitHeads: typeof splitHeads;
+declare const nn_d_unsplitHeads: typeof unsplitHeads;
+declare namespace nn_d {
+  export { nn_d_LayerNorm as LayerNorm, nn_d_Linear as Linear, nn_d_crossEntropyLast as crossEntropyLast, nn_d_mergeHeads as mergeHeads, nn_d_splitHeads as splitHeads, nn_d_unsplitHeads as unsplitHeads };
+  export type { nn_d_LinearOptions as LinearOptions };
+}
+export { Captures, Module, ShapeError, add, appendAdam, appendGrad, arange, capture, compile, compileForward, compileModule, compileToIR, createForwardRuntime, createRuntime, div, embedding, emitKernels, exp, greater, less, log, logSoftmaxLast, materializeParams, matmul, matmulBatched, meanLast, mul, nn_d as nn, oneHot, paramInput, planBuffers, relu, reshape, rsqrt, sliceLastRange, softmaxCausalLast, sqrt, stateInput, sub, sumAll, sumLast, swapAxes, tensorInput, trace, traceInto, transpose, where, whereCausal };
+export type { AdamConfig, AdamResult, BufferPlan, BufferSpec, CallSite, CompileForwardMethodOptions, CompileForwardOptions, CompileModuleOptions, CompiledForward, CompiledForwardModule, CompiledIR, CompiledModule, CompiledRuntime, Dtype, ForwardFn, GradResult, Graph, InitSpec, InputDecl, InputDecls, InputsTensors, KernelSpec, MaterializedParams, OpNode, ParamOptions, RunOptions, RunResult, RuntimeOpts, Shape, StepResult, Tensor, Writeback, WritebackDecl };