tensorgrad 0.0.15 → 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +154 -193
- package/dist/index.js +2208 -39
- package/dist/index.js.map +7 -1
- package/dist/worker.debug.js +553 -0
- package/package.json +60 -58
- package/src/adam.ts +69 -15
- package/src/compile.ts +334 -156
- package/src/index.ts +8 -4
- package/src/module.ts +72 -34
- package/src/worker-protocol.ts +183 -0
- package/src/worker-proxy.ts +76 -0
- package/src/worker.ts +281 -0
- package/dist/adam.js +0 -111
- package/dist/adam.js.map +0 -1
- package/dist/buffers.js +0 -120
- package/dist/buffers.js.map +0 -1
- package/dist/capture.js +0 -33
- package/dist/capture.js.map +0 -1
- package/dist/codegen.js +0 -724
- package/dist/codegen.js.map +0 -1
- package/dist/compile.js +0 -184
- package/dist/compile.js.map +0 -1
- package/dist/grad.js +0 -380
- package/dist/grad.js.map +0 -1
- package/dist/ir.js +0 -60
- package/dist/ir.js.map +0 -1
- package/dist/module.js +0 -155
- package/dist/module.js.map +0 -1
- package/dist/nn.js +0 -135
- package/dist/nn.js.map +0 -1
- package/dist/ops.js +0 -326
- package/dist/ops.js.map +0 -1
- package/dist/runtime.js +0 -402
- package/dist/runtime.js.map +0 -1
- package/dist/shape.js +0 -259
- package/dist/shape.js.map +0 -1
- package/dist/trace.js +0 -100
- package/dist/trace.js.map +0 -1
package/dist/index.d.ts
CHANGED
@@ -311,12 +311,62 @@ interface WritebackDecl {
  */
 declare function planBuffers(graph: Graph, paramGrads: Record<string, Tensor>, writebackDecls?: WritebackDecl[]): BufferPlan;

+/** Per-step learning-rate schedule. Either a fixed number or one of the
+ * serializable shape forms below. Functions/closures are not supported —
+ * the schedule needs to cross thread boundaries and survive serialization
+ * for the worker-internal runtime, and every realistic LR pattern (constant,
+ * linear decay, cosine, warmup-then-decay) maps to a finite set of shapes.
+ * Use the `lr` helper namespace to construct shapes ergonomically. */
+type LRSchedule = number | {
+    readonly kind: 'constant';
+    readonly value: number;
+} | {
+    readonly kind: 'linearDecay';
+    readonly peak: number;
+    readonly final: number;
+    readonly steps: number;
+} | {
+    readonly kind: 'cosineDecay';
+    readonly peak: number;
+    readonly final: number;
+    readonly steps: number;
+} | {
+    readonly kind: 'warmup';
+    readonly peakLr: number;
+    readonly warmupSteps: number;
+    readonly after: LRSchedule;
+};
+/** Ergonomic constructors for LRSchedule shapes. */
+declare const lr: {
+    constant: (value: number) => LRSchedule;
+    /** Linearly interpolate from `peak` at step 1 to `final` at step `steps`,
+     * then hold at `final`. Matches `peak + (final - peak) * min(step/steps, 1)`. */
+    linearDecay: (opts: {
+        peak: number;
+        final: number;
+        steps: number;
+    }) => LRSchedule;
+    /** Half-cosine from `peak` at step 1 down to `final` at step `steps`,
+     * then hold at `final`. */
+    cosineDecay: (opts: {
+        peak: number;
+        final: number;
+        steps: number;
+    }) => LRSchedule;
+    /** Linear ramp from 0 to `peakLr` over `warmupSteps` steps, then hand off
+     * to `after` (offset so step 1 of `after` = first post-warmup step). */
+    warmup: (opts: {
+        peakLr: number;
+        warmupSteps: number;
+        after: LRSchedule;
+    }) => LRSchedule;
+};
+/** Resolve a schedule to its scalar value at a given 1-based step. */
+declare function resolveLR(schedule: LRSchedule, step: number): number;
 interface AdamConfig {
-    /**
-     * `(
-
-     * per-step automatically when this is a function. */
-    lr: number | ((step: number) => number);
+    /** Learning rate schedule. Pass a number for fixed lr, or a shape from
+     * the `lr` helpers (e.g., `lr.linearDecay({ peak: 0.005, final: 0.0005, steps: 1500 })`). */
+    lr: LRSchedule;
     b1?: number;
     b2?: number;
     eps?: number;
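The schedule added in this hunk is plain data plus helper constructors, so it can serialize across the worker boundary. A minimal usage sketch, assuming the package root re-exports `lr`, `resolveLR`, and `LRSchedule` (the export hunk at the end of this file lists them); the commented values are approximate, per the documented semantics:

```ts
import { lr, resolveLR, type LRSchedule } from 'tensorgrad';

// 100 warmup steps ramping 0 -> 3e-4, then half-cosine decay 3e-4 -> 3e-5 over 1500 steps.
const schedule: LRSchedule = lr.warmup({
  peakLr: 3e-4,
  warmupSteps: 100,
  after: lr.cosineDecay({ peak: 3e-4, final: 3e-5, steps: 1500 }),
});

// resolveLR evaluates a schedule shape at a 1-based step.
resolveLR(schedule, 10);    // still ramping toward 3e-4
resolveLR(schedule, 100);   // ~3e-4, end of warmup
resolveLR(schedule, 1600);  // decaying toward 3e-5

// linearDecay follows the documented formula: peak + (final - peak) * min(step / steps, 1).
resolveLR(lr.linearDecay({ peak: 0.005, final: 0.0005, steps: 1500 }), 1500); // 0.0005
```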
@@ -330,16 +380,17 @@ interface AdamConfig {
      * Example: `(name) => name.includes('.W') || name.endsWith('_emb')`. */
     decayFilter?: (paramName: string) => boolean;
 }
-/** Resolved hyperparameters
+/** Resolved hyperparameters with all fields populated. `lr` stays as the
+ * shape (not pre-resolved) so the runtime can compute per-step values. */
 interface AdamResolvedConfig {
-    lr:
+    lr: LRSchedule;
     b1: number;
     b2: number;
     eps: number;
     weightDecay: number;
     decayFilter: (name: string) => boolean;
-    /** True iff the
-     * decayShrink is baked at compile time
+    /** True iff the lr shape varies with step (linearDecay, cosineDecay,
+     * warmup). When false, decayShrink is baked at compile time. */
     lrIsScheduled: boolean;
 }
 interface AdamResult {
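For callers, the practical change is that the `adam` block now takes a schedule shape instead of a closure. A hedged sketch of the new `AdamConfig` (the `weightDecay` field is assumed to be accepted on the config, since the resolved config above carries it):

```ts
import { lr, type AdamConfig } from 'tensorgrad';

const adam: AdamConfig = {
  // 0.0.15 accepted `lr: (step) => ...`; closures cannot cross the worker boundary,
  // so 0.0.16 takes a serializable shape instead.
  lr: lr.linearDecay({ peak: 0.005, final: 0.0005, steps: 1500 }),
  b1: 0.9,
  b2: 0.999,
  weightDecay: 0.01,
};
```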
@@ -431,124 +482,52 @@ interface RunOptions {
      * `.get` throws); when true, captures are read back and accessible. */
     withCaptures?: boolean;
 }
-interface StepOptions extends RunOptions {
-    /** If false, the training submit is queued but the JS thread does not
-     * await `mapAsync` of the loss buffer. Returns `void` immediately.
-     * Use `runtime.readLoss()` to read the latest loss explicitly when
-     * you want it (e.g., every Nth step for UI display).
-     *
-     * Why: each `mapAsync` round-trip is ~1 ms on desktop but 10–30 ms on
-     * Android Chrome. A training loop that awaits per step pays N × that
-     * on the main thread, which on mobile starves the OS compositor and
-     * causes visible UI sluggishness. With `readLoss: false` plus a
-     * `requestAnimationFrame` yield between steps, the main thread stays
-     * responsive while training runs at GPU speed.
-     *
-     * Implies `withCaptures: false`. Default: true. */
-    readLoss?: boolean;
-}
-/** Common surface for both training and forward-only compiled runtimes. */
-interface CompiledBase {
-    /** The GPUDevice this runtime is bound to. Pass to sibling compiles to
-     * share the device, or use directly for other GPU work. */
-    device: GPUDevice;
-    /** Param name -> the underlying GPUBuffer. Pass to a sibling compile via
-     * `sharedParams` to share without copies. */
-    params: Map<string, GPUBuffer>;
-    /** Shape of the graph's output (loss scalar `[]` for training; the user's
-     * returned tensor for forward-only compiles). */
-    outputShape: number[];
-    /** Upload parameter Float32Arrays to their GPU buffers. By default, requires
-     * *all* params to be present; throws on any unknown or missing key. Pass
-     * `{ partial: true }` to skip the missing-key check. */
-    uploadParams(params: Record<string, Float32Array>, opts?: UploadParamsOptions): void;
-    /** Read all parameters back as Float32Arrays — used for UI panels. */
-    downloadParams(): Promise<Record<string, Float32Array>>;
-    /** Free GPU resources. */
-    destroy(): void;
-}
-/** Run a dispatch and read back the full output tensor. Default returns the
- * output as a `Float32Array`; with `{ withCaptures: true }` returns
- * `{ output, captures }`. Same shape as `step()`'s overloads. */
-interface RunFn {
-    (inputs: Record<string, Int32Array | Float32Array>): Promise<Float32Array>;
-    (inputs: Record<string, Int32Array | Float32Array>, opts: {
-        withCaptures: true;
-    }): Promise<RunResult>;
-    (inputs: Record<string, Int32Array | Float32Array>, opts: RunOptions): Promise<Float32Array | RunResult>;
-}
-interface CompiledRuntime extends CompiledBase {
-    /** Read all parameter gradients back. Mostly for verification / debugging. */
-    downloadParamGrads(): Promise<Record<string, Float32Array>>;
-    /**
-     * One full forward+backward step.
-     * 1. Uploads `inputs` (tokens, targets, masks) to input buffers.
-     * 2. Dispatches every kernel in order.
-     * 3. Reads back the loss scalar (and any registered captures, if requested).
-     * Default returns the loss as a JS number; with `{ withCaptures: true }`
-     * returns `{ loss, captures }`.
-     */
-    step(inputs: Record<string, Int32Array | Float32Array>): Promise<number>;
-    step(inputs: Record<string, Int32Array | Float32Array>, opts: {
-        withCaptures: true;
-    }): Promise<StepResult>;
-    step(inputs: Record<string, Int32Array | Float32Array>, opts: {
-        readLoss: false;
-    }): Promise<void>;
-    step(inputs: Record<string, Int32Array | Float32Array>, opts: StepOptions): Promise<number | StepResult | void>;
-    /** Same dispatch as step() but returns the full output Float32Array — for
-     * training graphs the output is a scalar loss, so step() is usually more
-     * convenient. Provided for parity with `compileForward`. */
-    run: RunFn;
-    /** Read the latest loss value from the GPU. Pair with `step({ readLoss: false })`
-     * fire-and-forget training: every Nth iteration, call `readLoss()` for the
-     * UI, but most iterations don't pay the `mapAsync` cost. */
-    readLoss(): Promise<number>;
-    /** Re-zero all optimizer state buffers (Adam's m/v) in place. Pair with
-     * `uploadInitialParams()` for a full training reset without recompile. */
-    resetOptimizerState(): void;
-}
-/** Forward-only compiled runtime — produced by `compileForward`. No optimizer,
- * no backward. Returns the output tensor (not just a scalar) per `run()` call. */
-interface CompiledForward extends CompiledBase {
-    run: RunFn;
-}
-interface RuntimeOpts {
-    /** Pre-acquired GPUDevice. If omitted, runtime requests its own. */
-    device?: GPUDevice;
-    /** External param buffers to bind in place of allocating fresh ones, keyed
-     * by param name. Used to share params between a training compile and a
-     * sibling forward-only compile (e.g., a B=1 inference graph). When a name
-     * is in this map, the runtime reuses the provided GPUBuffer; otherwise it
-     * allocates as usual. */
-    sharedParams?: Map<string, GPUBuffer>;
-}
-declare function createRuntime(plan: BufferPlan, kernels: KernelSpec[], lossBufferId: number, opts?: RuntimeOpts): Promise<CompiledRuntime>;
-/** Same machinery as `createRuntime`, narrower public type: a forward-only
- * graph exposes `run()` instead of `step()` (no optimizer state, no scalar-
- * loss readback). The full runtime object is built once and projected by
- * `compileForward` to the public shape. */
-declare function createForwardRuntime(plan: BufferPlan, kernels: KernelSpec[], outputBufferId: number, opts?: RuntimeOpts): Promise<CompiledForward>;

-/** How a parameter's initial values are produced.
- *
- *
- *
- *
- * -
- *
+/** How a parameter's initial values are produced. Serializable shape — no
+ * closures, since the initial values cross the worker boundary at compile
+ * time. Use the `init` helpers for ergonomic construction.
+ *
+ * String shorthands:
+ * - `'randn'` — Gaussian with std 0.02 (the common weight-matrix init).
+ * - `'zeros'` — fill with 0 (biases, LayerNorm beta).
+ * - `'ones'` — fill with 1 (LayerNorm gain).
+ *
+ * Object shapes:
+ * - `{ kind: 'randn', scale }` — randn with explicit std.
+ * - `{ kind: 'kaiming', gain? }` — `std = gain / sqrt(fan_in)`. Default
+ *   gain `sqrt(2)` (good for ReLU). `fan_in = shape[0]`.
+ * - `{ kind: 'literal', data }` — explicit Float32Array; length must
+ *   match the parameter's element count.
  */
-type InitSpec = 'randn' | 'zeros' | 'ones' |
+type InitSpec = 'randn' | 'zeros' | 'ones' | {
+    readonly kind: 'randn';
+    readonly scale: number;
+} | {
+    readonly kind: 'kaiming';
+    readonly gain?: number;
+} | {
+    readonly kind: 'literal';
+    readonly data: Float32Array;
+};
+/** Ergonomic constructors for InitSpec object shapes. */
+declare const init: {
+    randn: (opts?: {
+        scale?: number;
+    }) => InitSpec;
+    kaiming: (opts?: {
+        gain?: number;
+    }) => InitSpec;
+    literal: (data: Float32Array) => InitSpec;
+};
 interface ParamOptions {
     dtype?: Dtype;
-    /** Init
+    /** Init shape. Default: `'randn'` (std 0.02). */
     init?: InitSpec;
-    /** Std dev for `'randn'`. Default 0.02. Ignored for non-randn init. */
-    scale?: number;
     /** Whether AdamW (when `weightDecay > 0`) should apply decoupled weight
-     * decay to this param. Default: `true` for
-     * embeddings)
-     * to force or skip. Replaces `adam.decayFilter` for
+     * decay to this param. Default: `true` for randn/kaiming/literal init
+     * (weight matrices, embeddings); `false` for zeros/ones (biases, LN
+     * gains). Override to force or skip. Replaces `adam.decayFilter` for
+     * the common case. */
     decay?: boolean;
 }
 type InitFn = (size: number, shape: readonly number[]) => Float32Array;
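The init surface follows the same pattern: serializable shapes plus `init` helpers. A short sketch of the shorthands and helpers, with `ParamOptions` wiring shown for illustration (the `param(...)` call that consumes these options lives in `module.ts` and is not part of this hunk):

```ts
import { init, type InitSpec, type ParamOptions } from 'tensorgrad';

const lnGain: InitSpec = 'ones';                         // fill with 1 (LayerNorm gain)
const bias: InitSpec = 'zeros';                          // fill with 0
const wOut: InitSpec = init.randn({ scale: 0.01 });      // Gaussian with explicit std
const wIn: InitSpec = init.kaiming();                    // std = sqrt(2) / sqrt(fan_in)
const eye: InitSpec = init.literal(new Float32Array([1, 0, 0, 1])); // length must match element count

// The per-param `scale` option is gone in 0.0.16; fold it into `init.randn`.
// `decay` replaces `adam.decayFilter` for the common case.
const weightOpts: ParamOptions = { init: wIn, decay: true };
const biasOpts: ParamOptions = { init: bias, decay: false };
```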
@@ -590,21 +569,14 @@ interface InputDecl {
     shape: Shape;
     dtype?: Dtype;
 }
-/** Inputs declaration: a Record from input name to its shape/dtype.
- * doubles as the key the forward fn destructures and the key the runtime
- * expects in `step({...})` / `run({...})`. */
+/** Inputs declaration: a Record from input name to its shape/dtype. */
 type InputDecls = Record<string, InputDecl>;
 /** Maps an `InputDecls` Record to its forward-time tensor counterpart —
- * same keys, each value is a Tensor.
- * `inputs` argument from the declared shape Record. */
+ * same keys, each value is a Tensor. */
 type InputsTensors<I extends InputDecls> = {
     [K in keyof I]: Tensor;
 };
-/** Forward function shape
- * named input tensors (matching the declared `inputs:` keys), returns the
- * output tensor (loss for compileModule; logits/etc. for compileForward).
- * The second generic flows from the inputs declaration so destructuring
- * the input record stays typed. */
+/** Forward function shape. */
 type ForwardFn<M extends Module, I extends InputDecls = InputDecls> = (m: M, inputs: InputsTensors<I>) => Tensor;
 interface CompiledIR {
     graph: GradResult['graph'];
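A hedged sketch of how the slimmed-down `InputDecls` / `ForwardFn` pair fits together. `DemoModel` and `crossEntropy` are placeholders for a real Module subclass and loss helper, and the sketch assumes `Shape` accepts a plain number array, as the `compileModule` doc example below suggests:

```ts
import { Module } from 'tensorgrad';
import type { ForwardFn, InputDecls, Tensor } from 'tensorgrad';

// Placeholders for illustration only.
declare class DemoModel extends Module { forward(tokens: Tensor): Tensor; }
declare function crossEntropy(logits: Tensor, targets: Tensor): Tensor;

const B = 8, T = 64;

// Keys double as the names the forward fn destructures and the keys
// step()/run() expect for typed-array inputs.
const inputs = {
  tokens:  { shape: [B, T], dtype: 'i32' },
  targets: { shape: [B, T], dtype: 'i32' },
} satisfies InputDecls;

// The second generic flows from the declaration, so destructuring stays typed.
const forward: ForwardFn<DemoModel, typeof inputs> = (m, { tokens, targets }) =>
  crossEntropy(m.forward(tokens), targets);
```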
@@ -615,59 +587,67 @@ interface CompiledIR {
 }
 /** Trace + autograd + buffer-plan + codegen, without touching WebGPU. */
 declare function compileToIR(traceFn: () => Tensor): CompiledIR;
-
-declare function compile(traceFn: () => Tensor, opts?: RuntimeOpts): Promise<CompiledRuntime & {
-    ir: CompiledIR;
-}>;
-interface CompileModuleOptions<I extends InputDecls = InputDecls> extends RuntimeOpts {
-    /** Per-step data inputs to the forward function, keyed by name. The forward
-     * fn destructures these out of its second argument; runtime calls to
-     * `step()` / `run()` pass typed arrays under the same keys. */
+interface CompileModuleOptions<I extends InputDecls = InputDecls> {
     inputs?: I;
-    /** Adam hyperparameters. If omitted, no optimizer is appended (forward-only). */
     adam?: AdamConfig;
 }
-interface CompileForwardOptions<I extends InputDecls = InputDecls>
-    /** Per-step data inputs to the forward function, keyed by name. */
+interface CompileForwardOptions<I extends InputDecls = InputDecls> {
     inputs?: I;
 }
-/** Forward-only compile options as taken by the `compileForward` *method* on
- * a training runtime — no `device` (inherited) and no `sharedParams`
- * (auto-supplied from the train graph's params). */
 interface CompileForwardMethodOptions<I extends InputDecls = InputDecls> {
     inputs?: I;
 }
-/** Returned by `compileModule`.
- *
-interface CompiledModule<M extends Module>
-    ir: CompiledIR;
-
-
-    /**
-     *
-
-
-
-
-
-
+/** Returned by `compileModule`. Proxies all GPU work to a worker held
+ * internally; user code awaits Promises and never sees the worker. */
+interface CompiledModule<M extends Module> {
+    readonly ir: CompiledIR;
+    readonly kernelCount: number;
+    readonly outputShape: readonly number[];
+    /** Names of the model's parameters, in materialization order. The actual
+     * GPUBuffers live in the worker; use `downloadParams()` for values. */
+    readonly paramNames: readonly string[];
+    step(inputs: Record<string, Int32Array | Float32Array>): Promise<number>;
+    step(inputs: Record<string, Int32Array | Float32Array>, opts: {
+        withCaptures: true;
+    }): Promise<StepResult>;
+    run(inputs: Record<string, Int32Array | Float32Array>): Promise<Float32Array>;
+    run(inputs: Record<string, Int32Array | Float32Array>, opts: {
+        withCaptures: true;
+    }): Promise<RunResult>;
+    uploadParams(params: Record<string, Float32Array>, opts?: UploadParamsOptions): Promise<void>;
+    downloadParams(): Promise<Record<string, Float32Array>>;
+    downloadParamGrads(): Promise<Record<string, Float32Array>>;
+    /** Re-initialize all params + zero optimizer state. */
+    reset(): Promise<void>;
+    resetOptimizerState(): Promise<void>;
+    /** Compile a sibling forward-only graph that shares this runtime's worker
+     * (and therefore its param GPUBuffers). */
     compileForward<I extends InputDecls>(forward: ForwardFn<M, I>, opts?: CompileForwardMethodOptions<I>): Promise<CompiledForwardModule>;
+    /** Free the runtime's GPU resources and terminate the worker. */
+    destroy(): void;
 }
 /** Returned by `compileForward` (and by the `compileForward` method). */
-interface CompiledForwardModule
-    ir: CompiledIR;
-
-
+interface CompiledForwardModule {
+    readonly ir: CompiledIR;
+    readonly kernelCount: number;
+    readonly outputShape: readonly number[];
+    readonly paramNames: readonly string[];
+    run(inputs: Record<string, Int32Array | Float32Array>): Promise<Float32Array>;
+    run(inputs: Record<string, Int32Array | Float32Array>, opts: {
+        withCaptures: true;
+    }): Promise<RunResult>;
+    uploadParams(params: Record<string, Float32Array>, opts?: UploadParamsOptions): Promise<void>;
+    downloadParams(): Promise<Record<string, Float32Array>>;
+    destroy(): void;
 }
 /**
  * Compile a Module-based model. Pass a *factory* `() => new Model()`, not the
  * model instance itself: compilation mutates the tree (every `ParamSentinel`
  * field becomes a real `Tensor`), so the instance is consumed and shouldn't be
- * referenced afterwards.
+ * referenced afterwards.
  *
  * The forward function takes the materialized model and a Record of named
- * input tensors, returns the loss tensor
- * `inputs:` declaration:
+ * input tensors, returns the loss tensor:
  *
  *   inputs: {
 *     tokens: { shape: [B, T], dtype: 'i32' },
@@ -675,34 +655,15 @@ interface CompiledForwardModule extends CompiledForward {
 *   }
 *   forward: (m, { tokens, targets }) => …
 *
- *
- *
- *
- * call `reset()` later to re-randomize.
- *
- * If `opts.adam` is set, the runtime's `step()` automatically tracks an
- * internal step count and injects the bias-corrected `lrt` scalar each call;
- * users don't need to provide it themselves.
+ * Returns a `CompiledModule` proxy. All GPU work (createRuntime, step, run,
+ * mapAsync) happens in an internal worker; calls return Promises that resolve
+ * when the worker replies.
  */
 declare function compileModule<M extends Module, I extends InputDecls = InputDecls>(modelFactory: () => M, forward: ForwardFn<M, I>, opts?: CompileModuleOptions<I>): Promise<CompiledModule<M>>;
 /**
- *
- *
- *
- * `Float32Array`.
- *
- * **Prefer the `compileForward` method on a training runtime** when both
- * graphs use the same Module class — it auto-supplies `device` and
- * `sharedParams`. This standalone form is for forward-only models with no
- * training graph at all, or for sharing params across a different model.
- *
- * **Sharing params with a training compile.** Pass `opts.sharedParams =
- * trainCompiled.params` to bind this graph's param buffers to an existing
- * training runtime's GPU buffers — every train step is then immediately
- * visible to `run()` calls here, no copies.
- *
- * Initial param values are uploaded automatically for params *not* covered
- * by `sharedParams` (those are owned by the sibling compile).
+ * Forward-only compile. Spawns its own worker. For sibling graphs that share
+ * params with a training graph, prefer the `compileForward` method on the
+ * CompiledModule returned by `compileModule()`.
  */
 declare function compileForward<M extends Module, I extends InputDecls = InputDecls>(modelFactory: () => M, forward: ForwardFn<M, I>, opts?: CompileForwardOptions<I>): Promise<CompiledForwardModule>;

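A hedged end-to-end sketch of the new worker-backed flow. `GPT`, `nextBatch`, and the forward bodies are placeholders; only the `compileModule` / `step` / `compileForward` / `run` / `destroy` call shapes come from the declarations above:

```ts
import { Module, compileModule, lr } from 'tensorgrad';
import type { Tensor } from 'tensorgrad';

// Placeholders for illustration only; a real model comes from the Module/nn helpers.
declare class GPT extends Module { loss(t: Tensor, y: Tensor): Tensor; logits(t: Tensor): Tensor; }
declare function nextBatch(b: number, t: number): { tokens: Int32Array; targets: Int32Array };

const B = 8, T = 64;
const compiled = await compileModule(
  () => new GPT(),                                       // factory, not an instance
  (m, { tokens, targets }) => m.loss(tokens, targets),   // returns the loss Tensor
  {
    inputs: {
      tokens:  { shape: [B, T], dtype: 'i32' },
      targets: { shape: [B, T], dtype: 'i32' },
    },
    adam: { lr: lr.cosineDecay({ peak: 3e-4, final: 3e-5, steps: 2000 }) },
  },
);

for (let step = 1; step <= 2000; step++) {
  const { tokens, targets } = nextBatch(B, T);           // Int32Arrays
  const loss = await compiled.step({ tokens, targets }); // GPU work runs in the worker
  if (step % 100 === 0) console.log(`step ${step}: loss ${loss.toFixed(4)}`);
}

// Sibling forward-only graph in the same worker, sharing param GPUBuffers.
const sampler = await compiled.compileForward(
  (m, { tokens }) => m.logits(tokens),
  { inputs: { tokens: { shape: [1, T], dtype: 'i32' } } },
);
const logits = await sampler.run({ tokens: new Int32Array(T) });

const snapshot = await compiled.downloadParams();        // Record<string, Float32Array>
compiled.destroy();                                      // frees GPU resources, terminates the worker
```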
@@ -759,5 +720,5 @@ declare namespace nn_d {
     export type { nn_d_LinearOptions as LinearOptions };
 }

-export { Captures, Module, ShapeError, add, appendAdam, appendGrad, arange, capture,
-export type { AdamConfig, AdamResult, BufferPlan, BufferSpec, CallSite, CompileForwardMethodOptions, CompileForwardOptions, CompileModuleOptions,
+export { Captures, Module, ShapeError, add, appendAdam, appendGrad, arange, capture, compileForward, compileModule, compileToIR, div, embedding, emitKernels, exp, greater, init, less, log, logSoftmaxLast, lr, materializeParams, matmul, matmulBatched, meanLast, mul, nn_d as nn, oneHot, paramInput, planBuffers, relu, reshape, resolveLR, rsqrt, sliceLastRange, softmaxCausalLast, sqrt, stateInput, sub, sumAll, sumLast, swapAxes, tensorInput, trace, traceInto, transpose, where, whereCausal };
+export type { AdamConfig, AdamResult, BufferPlan, BufferSpec, CallSite, CompileForwardMethodOptions, CompileForwardOptions, CompileModuleOptions, CompiledForwardModule, CompiledIR, CompiledModule, Dtype, ForwardFn, GradResult, Graph, InitSpec, InputDecl, InputDecls, InputsTensors, KernelSpec, LRSchedule, MaterializedParams, OpNode, ParamOptions, RunOptions, RunResult, Shape, StepResult, Tensor, UploadParamsOptions, Writeback, WritebackDecl };