tensorgrad 0.0.14 → 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +154 -170
- package/dist/index.js +2208 -39
- package/dist/index.js.map +7 -1
- package/dist/worker.debug.js +553 -0
- package/package.json +60 -58
- package/src/adam.ts +69 -15
- package/src/compile.ts +334 -154
- package/src/index.ts +8 -4
- package/src/module.ts +72 -34
- package/src/runtime.ts +64 -11
- package/src/worker-protocol.ts +183 -0
- package/src/worker-proxy.ts +76 -0
- package/src/worker.ts +281 -0
- package/dist/adam.js +0 -111
- package/dist/adam.js.map +0 -1
- package/dist/buffers.js +0 -120
- package/dist/buffers.js.map +0 -1
- package/dist/capture.js +0 -33
- package/dist/capture.js.map +0 -1
- package/dist/codegen.js +0 -724
- package/dist/codegen.js.map +0 -1
- package/dist/compile.js +0 -180
- package/dist/compile.js.map +0 -1
- package/dist/grad.js +0 -380
- package/dist/grad.js.map +0 -1
- package/dist/ir.js +0 -60
- package/dist/ir.js.map +0 -1
- package/dist/module.js +0 -155
- package/dist/module.js.map +0 -1
- package/dist/nn.js +0 -135
- package/dist/nn.js.map +0 -1
- package/dist/ops.js +0 -326
- package/dist/ops.js.map +0 -1
- package/dist/runtime.js +0 -375
- package/dist/runtime.js.map +0 -1
- package/dist/shape.js +0 -259
- package/dist/shape.js.map +0 -1
- package/dist/trace.js +0 -100
- package/dist/trace.js.map +0 -1
package/src/module.ts
CHANGED
|
@@ -32,30 +32,47 @@ import { paramInput } from './trace.js'
|
|
|
32
32
|
// Init metadata
|
|
33
33
|
// ============================================================================
|
|
34
34
|
|
|
35
|
-
/** How a parameter's initial values are produced.
|
|
36
|
-
*
|
|
37
|
-
*
|
|
38
|
-
*
|
|
39
|
-
*
|
|
40
|
-
* -
|
|
41
|
-
*
|
|
35
|
+
/** How a parameter's initial values are produced. Serializable shape — no
|
|
36
|
+
* closures, since the initial values cross the worker boundary at compile
|
|
37
|
+
* time. Use the `init` helpers for ergonomic construction.
|
|
38
|
+
*
|
|
39
|
+
* String shorthands:
|
|
40
|
+
* - `'randn'` — Gaussian with std 0.02 (the common weight-matrix init).
|
|
41
|
+
* - `'zeros'` — fill with 0 (biases, LayerNorm beta).
|
|
42
|
+
* - `'ones'` — fill with 1 (LayerNorm gain).
|
|
43
|
+
*
|
|
44
|
+
* Object shapes:
|
|
45
|
+
* - `{ kind: 'randn', scale }` — randn with explicit std.
|
|
46
|
+
* - `{ kind: 'kaiming', gain? }` — `std = gain / sqrt(fan_in)`. Default
|
|
47
|
+
* gain `sqrt(2)` (good for ReLU). `fan_in = shape[0]`.
|
|
48
|
+
* - `{ kind: 'literal', data }` — explicit Float32Array; length must
|
|
49
|
+
* match the parameter's element count.
|
|
42
50
|
*/
|
|
43
51
|
export type InitSpec =
|
|
44
52
|
| 'randn'
|
|
45
53
|
| 'zeros'
|
|
46
54
|
| 'ones'
|
|
47
|
-
|
|
|
55
|
+
| { readonly kind: 'randn'; readonly scale: number }
|
|
56
|
+
| { readonly kind: 'kaiming'; readonly gain?: number }
|
|
57
|
+
| { readonly kind: 'literal'; readonly data: Float32Array }
|
|
58
|
+
|
|
59
|
+
/** Ergonomic constructors for InitSpec object shapes. */
|
|
60
|
+
export const init = {
|
|
61
|
+
randn: (opts: { scale?: number } = {}): InitSpec => ({ kind: 'randn', scale: opts.scale ?? 0.02 }),
|
|
62
|
+
kaiming: (opts: { gain?: number } = {}): InitSpec =>
|
|
63
|
+
opts.gain !== undefined ? { kind: 'kaiming', gain: opts.gain } : { kind: 'kaiming' },
|
|
64
|
+
literal: (data: Float32Array): InitSpec => ({ kind: 'literal', data }),
|
|
65
|
+
}
|
|
48
66
|
|
|
49
67
|
export interface ParamOptions {
|
|
50
68
|
dtype?: Dtype
|
|
51
|
-
/** Init
|
|
69
|
+
/** Init shape. Default: `'randn'` (std 0.02). */
|
|
52
70
|
init?: InitSpec
|
|
53
|
-
/** Std dev for `'randn'`. Default 0.02. Ignored for non-randn init. */
|
|
54
|
-
scale?: number
|
|
55
71
|
/** Whether AdamW (when `weightDecay > 0`) should apply decoupled weight
|
|
56
|
-
* decay to this param. Default: `true` for
|
|
57
|
-
* embeddings)
|
|
58
|
-
* to force or skip. Replaces `adam.decayFilter` for
|
|
72
|
+
* decay to this param. Default: `true` for randn/kaiming/literal init
|
|
73
|
+
* (weight matrices, embeddings); `false` for zeros/ones (biases, LN
|
|
74
|
+
* gains). Override to force or skip. Replaces `adam.decayFilter` for
|
|
75
|
+
* the common case. */
|
|
59
76
|
decay?: boolean
|
|
60
77
|
}
|
|
61
78
|
|
|
@@ -65,31 +82,52 @@ function boxMuller(): number {
|
|
|
65
82
|
return Math.sqrt(-2 * Math.log(Math.max(1e-10, Math.random()))) * Math.cos(2 * Math.PI * Math.random())
|
|
66
83
|
}
|
|
67
84
|
|
|
68
|
-
function
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
return
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
85
|
+
function randnFn(scale: number): InitFn {
|
|
86
|
+
return (size) => {
|
|
87
|
+
const arr = new Float32Array(size)
|
|
88
|
+
for (let i = 0; i < size; i++) arr[i] = boxMuller() * scale
|
|
89
|
+
return arr
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
/** Compile-time-only: resolve an InitSpec shape into the closure that
|
|
94
|
+
* generates the initial Float32Array for a given parameter shape. Runs
|
|
95
|
+
* on the main thread before initial values are transferred to the worker. */
|
|
96
|
+
function resolveInit(spec: InitSpec | undefined): InitFn {
|
|
97
|
+
if (!spec || spec === 'randn') return randnFn(0.02)
|
|
98
|
+
if (spec === 'zeros') return (size) => new Float32Array(size)
|
|
99
|
+
if (spec === 'ones') return (size) => { const a = new Float32Array(size); a.fill(1); return a }
|
|
100
|
+
switch (spec.kind) {
|
|
101
|
+
case 'randn': return randnFn(spec.scale)
|
|
102
|
+
case 'kaiming': {
|
|
103
|
+
const gain = spec.gain ?? Math.sqrt(2)
|
|
104
|
+
return (size, shape) => {
|
|
105
|
+
const fanIn = shape[0] ?? size
|
|
106
|
+
const std = gain / Math.sqrt(fanIn)
|
|
107
|
+
const arr = new Float32Array(size)
|
|
108
|
+
for (let i = 0; i < size; i++) arr[i] = boxMuller() * std
|
|
109
|
+
return arr
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
case 'literal': {
|
|
113
|
+
const data = spec.data
|
|
114
|
+
return (size) => {
|
|
115
|
+
if (data.length !== size) {
|
|
116
|
+
throw new Error(`init.literal: data length ${data.length} doesn't match param size ${size}`)
|
|
117
|
+
}
|
|
118
|
+
return new Float32Array(data)
|
|
119
|
+
}
|
|
76
120
|
}
|
|
77
121
|
}
|
|
78
|
-
if (init === 'zeros') return (size) => new Float32Array(size)
|
|
79
|
-
if (init === 'ones') return (size) => { const a = new Float32Array(size); a.fill(1); return a }
|
|
80
|
-
if (typeof init === 'function') return init
|
|
81
|
-
throw new Error(`Unknown init: ${String(init)}`)
|
|
82
122
|
}
|
|
83
123
|
|
|
84
|
-
/** Resolve the decay default for a param.
|
|
85
|
-
*
|
|
86
|
-
* (
|
|
87
|
-
* inits are weight-shaped (Kaiming etc.). Explicit `decay: false` overrides. */
|
|
124
|
+
/** Resolve the decay default for a param. Weight-shaped inits (randn,
|
|
125
|
+
* kaiming, literal) default to decay=true; ones/zeros default to false
|
|
126
|
+
* (biases, LN gains). Explicit `decay` opt overrides. */
|
|
88
127
|
function resolveDecay(opts: ParamOptions | undefined): boolean {
|
|
89
128
|
if (opts?.decay !== undefined) return opts.decay
|
|
90
|
-
const
|
|
91
|
-
|
|
92
|
-
return true // 'randn' or function
|
|
129
|
+
const spec = opts?.init ?? 'randn'
|
|
130
|
+
return spec !== 'zeros' && spec !== 'ones'
|
|
93
131
|
}
|
|
94
132
|
|
|
95
133
|
// ============================================================================
|
|
@@ -127,7 +165,7 @@ export abstract class Module {
|
|
|
127
165
|
protected param(shape: Shape, opts?: ParamOptions): Tensor {
|
|
128
166
|
const dtype = opts?.dtype ?? 'f32'
|
|
129
167
|
// Lie to TypeScript: the sentinel becomes a Tensor at materialize time.
|
|
130
|
-
return new ParamSentinel(shape, dtype, resolveInit(opts), resolveDecay(opts)) as unknown as Tensor
|
|
168
|
+
return new ParamSentinel(shape, dtype, resolveInit(opts?.init), resolveDecay(opts)) as unknown as Tensor
|
|
131
169
|
}
|
|
132
170
|
}
|
|
133
171
|
|
package/src/runtime.ts
CHANGED
|
@@ -69,6 +69,23 @@ export interface RunOptions {
|
|
|
69
69
|
withCaptures?: boolean
|
|
70
70
|
}
|
|
71
71
|
|
|
72
|
+
export interface StepOptions extends RunOptions {
|
|
73
|
+
/** If false, the training submit is queued but the JS thread does not
|
|
74
|
+
* await `mapAsync` of the loss buffer. Returns `void` immediately.
|
|
75
|
+
* Use `runtime.readLoss()` to read the latest loss explicitly when
|
|
76
|
+
* you want it (e.g., every Nth step for UI display).
|
|
77
|
+
*
|
|
78
|
+
* Why: each `mapAsync` round-trip is ~1 ms on desktop but 10–30 ms on
|
|
79
|
+
* Android Chrome. A training loop that awaits per step pays N × that
|
|
80
|
+
* on the main thread, which on mobile starves the OS compositor and
|
|
81
|
+
* causes visible UI sluggishness. With `readLoss: false` plus a
|
|
82
|
+
* `requestAnimationFrame` yield between steps, the main thread stays
|
|
83
|
+
* responsive while training runs at GPU speed.
|
|
84
|
+
*
|
|
85
|
+
* Implies `withCaptures: false`. Default: true. */
|
|
86
|
+
readLoss?: boolean
|
|
87
|
+
}
|
|
88
|
+
|
|
72
89
|
/** Common surface for both training and forward-only compiled runtimes. */
|
|
73
90
|
export interface CompiledBase {
|
|
74
91
|
/** The GPUDevice this runtime is bound to. Pass to sibling compiles to
|
|
@@ -112,11 +129,16 @@ export interface CompiledRuntime extends CompiledBase {
|
|
|
112
129
|
*/
|
|
113
130
|
step(inputs: Record<string, Int32Array | Float32Array>): Promise<number>
|
|
114
131
|
step(inputs: Record<string, Int32Array | Float32Array>, opts: { withCaptures: true }): Promise<StepResult>
|
|
115
|
-
step(inputs: Record<string, Int32Array | Float32Array>, opts:
|
|
132
|
+
step(inputs: Record<string, Int32Array | Float32Array>, opts: { readLoss: false }): Promise<void>
|
|
133
|
+
step(inputs: Record<string, Int32Array | Float32Array>, opts: StepOptions): Promise<number | StepResult | void>
|
|
116
134
|
/** Same dispatch as step() but returns the full output Float32Array — for
|
|
117
135
|
* training graphs the output is a scalar loss, so step() is usually more
|
|
118
136
|
* convenient. Provided for parity with `compileForward`. */
|
|
119
137
|
run: RunFn
|
|
138
|
+
/** Read the latest loss value from the GPU. Pair with `step({ readLoss: false })`
|
|
139
|
+
* for fire-and-forget training: every Nth iteration, call `readLoss()` for the
|
|
140
|
+
* UI, but most iterations don't pay the `mapAsync` cost. */
|
|
141
|
+
readLoss(): Promise<number>
|
|
120
142
|
/** Re-zero all optimizer state buffers (Adam's m/v) in place. Pair with
|
|
121
143
|
* `uploadInitialParams()` for a full training reset without recompile. */
|
|
122
144
|
resetOptimizerState(): void
|
|
@@ -292,18 +314,21 @@ export async function createRuntime(
|
|
|
292
314
|
// run sequentially even when fired from independent async paths (e.g., a
|
|
293
315
|
// training loop's auxiliary `refreshPrediction()` + `writeDiagnostic()`).
|
|
294
316
|
let pending: Promise<unknown> = Promise.resolve()
|
|
317
|
+
type DispatchOpts = { wantCaptures: boolean; readback: boolean }
|
|
318
|
+
type DispatchResult = { output: Float32Array; captures: Map<string, Float32Array> } | null
|
|
295
319
|
async function dispatch(
|
|
296
320
|
inputs: Record<string, Int32Array | Float32Array>,
|
|
297
|
-
|
|
298
|
-
): Promise<
|
|
299
|
-
const turn = pending.catch(() => {}).then(() => dispatchUnsynchronized(inputs,
|
|
321
|
+
opts: DispatchOpts,
|
|
322
|
+
): Promise<DispatchResult> {
|
|
323
|
+
const turn = pending.catch(() => {}).then(() => dispatchUnsynchronized(inputs, opts))
|
|
300
324
|
pending = turn
|
|
301
325
|
return turn
|
|
302
326
|
}
|
|
303
327
|
async function dispatchUnsynchronized(
|
|
304
328
|
inputs: Record<string, Int32Array | Float32Array>,
|
|
305
|
-
|
|
306
|
-
): Promise<
|
|
329
|
+
opts: DispatchOpts,
|
|
330
|
+
): Promise<DispatchResult> {
|
|
331
|
+
const wantCaptures = opts.wantCaptures
|
|
307
332
|
if (wantCaptures && plan.capturesByName.size === 0) {
|
|
308
333
|
throw new Error(
|
|
309
334
|
`withCaptures=true but no capture(...) calls were registered during ` +
|
|
@@ -360,6 +385,12 @@ export async function createRuntime(
|
|
|
360
385
|
}
|
|
361
386
|
queue.submit([encoder.finish()])
|
|
362
387
|
|
|
388
|
+
// readback=false: training fire-and-forget. The encoder still copied
|
|
389
|
+
// loss → outputReadback (and captures → staging), but we don't await
|
|
390
|
+
// mapAsync. The caller can read the latest loss later via readLoss()
|
|
391
|
+
// when it actually wants to display it.
|
|
392
|
+
if (!opts.readback) return null
|
|
393
|
+
|
|
363
394
|
await outputReadback.mapAsync(GPUMapMode.READ)
|
|
364
395
|
const output = new Float32Array(outputReadback.getMappedRange().slice(0))
|
|
365
396
|
outputReadback.unmap()
|
|
@@ -381,16 +412,37 @@ export async function createRuntime(
|
|
|
381
412
|
// ---- step() — training-mode wrapper, returns scalar [0] of output ---------
|
|
382
413
|
function step(inputs: Record<string, Int32Array | Float32Array>): Promise<number>
|
|
383
414
|
function step(inputs: Record<string, Int32Array | Float32Array>, opts: { withCaptures: true }): Promise<StepResult>
|
|
384
|
-
function step(inputs: Record<string, Int32Array | Float32Array>, opts:
|
|
415
|
+
function step(inputs: Record<string, Int32Array | Float32Array>, opts: { readLoss: false }): Promise<void>
|
|
416
|
+
function step(inputs: Record<string, Int32Array | Float32Array>, opts: StepOptions): Promise<number | StepResult | void>
|
|
385
417
|
async function step(
|
|
386
418
|
inputs: Record<string, Int32Array | Float32Array>,
|
|
387
|
-
opts?:
|
|
388
|
-
): Promise<number | StepResult> {
|
|
389
|
-
|
|
419
|
+
opts?: StepOptions,
|
|
420
|
+
): Promise<number | StepResult | void> {
|
|
421
|
+
if (opts?.readLoss === false) {
|
|
422
|
+
await dispatch(inputs, { wantCaptures: false, readback: false })
|
|
423
|
+
return
|
|
424
|
+
}
|
|
425
|
+
const r = (await dispatch(inputs, { wantCaptures: opts?.withCaptures === true, readback: true }))!
|
|
390
426
|
if (opts?.withCaptures) return { loss: r.output[0]!, captures: new Captures(captureShapes, r.captures) }
|
|
391
427
|
return r.output[0]!
|
|
392
428
|
}
|
|
393
429
|
|
|
430
|
+
// ---- readLoss() — explicit late readback for fire-and-forget training -----
|
|
431
|
+
// Maps the output buffer (which step() always copies the latest loss into,
|
|
432
|
+
// even when readLoss:false) and returns the value. Goes through the same
|
|
433
|
+
// serialization chain as step()/run() so two readLoss() calls don't both
|
|
434
|
+
// try to mapAsync the same buffer.
|
|
435
|
+
async function readLoss(): Promise<number> {
|
|
436
|
+
const turn = pending.catch(() => {}).then(async () => {
|
|
437
|
+
await outputReadback.mapAsync(GPUMapMode.READ)
|
|
438
|
+
const v = new Float32Array(outputReadback.getMappedRange())[0]!
|
|
439
|
+
outputReadback.unmap()
|
|
440
|
+
return v
|
|
441
|
+
})
|
|
442
|
+
pending = turn
|
|
443
|
+
return turn
|
|
444
|
+
}
|
|
445
|
+
|
|
394
446
|
// ---- run() — forward-mode wrapper, returns Float32Array by default -------
|
|
395
447
|
// Same overloaded shape as step(): scalar-shaped result (here Float32Array,
|
|
396
448
|
// there a JS number) is the default; { ..., captures } is the opt-in form.
|
|
@@ -401,7 +453,7 @@ export async function createRuntime(
|
|
|
401
453
|
inputs: Record<string, Int32Array | Float32Array>,
|
|
402
454
|
opts?: RunOptions,
|
|
403
455
|
): Promise<Float32Array | RunResult> {
|
|
404
|
-
const r = await dispatch(inputs, opts?.withCaptures === true)
|
|
456
|
+
const r = (await dispatch(inputs, { wantCaptures: opts?.withCaptures === true, readback: true }))!
|
|
405
457
|
if (opts?.withCaptures) return { output: r.output, captures: new Captures(captureShapes, r.captures) }
|
|
406
458
|
return r.output
|
|
407
459
|
}
|
|
@@ -507,6 +559,7 @@ export async function createRuntime(
|
|
|
507
559
|
downloadParamGrads: () => downloadFromMap(plan.paramGradsByName),
|
|
508
560
|
step,
|
|
509
561
|
run,
|
|
562
|
+
readLoss,
|
|
510
563
|
resetOptimizerState,
|
|
511
564
|
destroy,
|
|
512
565
|
}
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
// Wire format for the main-thread ↔ worker postMessage channel.
|
|
2
|
+
//
|
|
3
|
+
// All requests carry a numeric `id` assigned by the main thread; responses
|
|
4
|
+
// echo it back so the proxy can match concurrent in-flight calls. Every
|
|
5
|
+
// response is either `{ ok: true, result }` or `{ ok: false, error }`.
|
|
6
|
+
// Errors carry serialized name/message/stack so the proxy can reconstitute
|
|
7
|
+
// an Error with a working `instanceof` check on the receiving side.
|
|
8
|
+
//
|
|
9
|
+
// Inputs (typed arrays) and outputs (typed arrays, captures) are transferred
|
|
10
|
+
// rather than copied — see the per-request notes for which fields go on the
|
|
11
|
+
// transfer list. A single worker may host multiple compiled graphs (a train
|
|
12
|
+
// graph plus sibling forward graphs); each has a `graphId` issued by the
|
|
13
|
+
// main thread at compile time.
|
|
14
|
+
|
|
15
|
+
import type { Graph } from './ir.js'
|
|
16
|
+
import type { BufferPlan } from './buffers.js'
|
|
17
|
+
import type { KernelSpec } from './codegen.js'
|
|
18
|
+
import type { LRSchedule } from './adam.js'
|
|
19
|
+
|
|
20
|
+
// ============================================================================
|
|
21
|
+
// Serializable config (subset of AdamResolvedConfig that crosses the wire).
|
|
22
|
+
// `decayFilter` (a function, used only at compile time) is NOT part of this —
|
|
23
|
+
// the per-param decay decision is already baked into the IR by appendAdam
|
|
24
|
+
// before the IR ships to the worker.
|
|
25
|
+
// ============================================================================
|
|
26
|
+
|
|
27
|
+
export interface WireAdamConfig {
|
|
28
|
+
lr: LRSchedule
|
|
29
|
+
b1: number
|
|
30
|
+
b2: number
|
|
31
|
+
eps: number
|
|
32
|
+
weightDecay: number
|
|
33
|
+
lrIsScheduled: boolean
|
|
34
|
+
/** Names of the per-step scalar inputs the worker must populate before
|
|
35
|
+
* every step (`_adam_lrt`, optionally `_adam_decay_shrink`). Mirrors
|
|
36
|
+
* AdamResult so the worker can update them without re-deriving. */
|
|
37
|
+
lrtInputName: string
|
|
38
|
+
decayShrinkInputName: string | null
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/** Compile output that crosses to the worker. Same fields as CompiledIR
|
|
42
|
+
* minus the `loss` tensor (carried by graph.outputs[0]). */
|
|
43
|
+
export interface WireIR {
|
|
44
|
+
graph: Graph
|
|
45
|
+
plan: BufferPlan
|
|
46
|
+
kernels: KernelSpec[]
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
// ============================================================================
|
|
50
|
+
// Requests (main → worker)
|
|
51
|
+
// ============================================================================
|
|
52
|
+
|
|
53
|
+
export type Req =
|
|
54
|
+
| { id: number; kind: 'createRuntime'; payload: CreateRuntimePayload }
|
|
55
|
+
| { id: number; kind: 'compileForward'; payload: CompileForwardPayload }
|
|
56
|
+
| { id: number; kind: 'step'; payload: StepPayload }
|
|
57
|
+
| { id: number; kind: 'run'; payload: RunPayload }
|
|
58
|
+
| { id: number; kind: 'uploadParams'; payload: UploadParamsPayload }
|
|
59
|
+
| { id: number; kind: 'downloadParams'; payload: { graphId: number } }
|
|
60
|
+
| { id: number; kind: 'downloadParamGrads'; payload: { graphId: number } }
|
|
61
|
+
| { id: number; kind: 'resetOptimizer'; payload: { graphId: number } }
|
|
62
|
+
| { id: number; kind: 'destroy'; payload: { graphId: number } }
|
|
63
|
+
|
|
64
|
+
/** Build the training runtime. Always graphId=0 for a fresh worker. */
|
|
65
|
+
export interface CreateRuntimePayload {
|
|
66
|
+
graphId: number
|
|
67
|
+
ir: WireIR
|
|
68
|
+
/** Initial param values per name. Transferred (zero-copy) — the main
|
|
69
|
+
* thread loses access after postMessage. */
|
|
70
|
+
initialParams: Record<string, Float32Array>
|
|
71
|
+
/** Adam config when training; absent for forward-only compiles. */
|
|
72
|
+
adam: WireAdamConfig | null
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/** Build a sibling forward-only graph that shares param buffers with an
|
|
76
|
+
* existing graph (typically the training graph at graphId=0). */
|
|
77
|
+
export interface CompileForwardPayload {
|
|
78
|
+
graphId: number
|
|
79
|
+
parentGraphId: number
|
|
80
|
+
ir: WireIR
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/** One training step. Inputs are transferred; the caller's typed arrays
|
|
84
|
+
* become detached after postMessage. */
|
|
85
|
+
export interface StepPayload {
|
|
86
|
+
graphId: number
|
|
87
|
+
inputs: Record<string, Int32Array | Float32Array>
|
|
88
|
+
withCaptures: boolean
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
/** Forward-only run. Same transfer semantics as `step`. */
|
|
92
|
+
export interface RunPayload {
|
|
93
|
+
graphId: number
|
|
94
|
+
inputs: Record<string, Int32Array | Float32Array>
|
|
95
|
+
withCaptures: boolean
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
export interface UploadParamsPayload {
|
|
99
|
+
graphId: number
|
|
100
|
+
params: Record<string, Float32Array> // transferred
|
|
101
|
+
partial: boolean
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// ============================================================================
|
|
105
|
+
// Responses (worker → main)
|
|
106
|
+
// ============================================================================
|
|
107
|
+
|
|
108
|
+
export type Res<R = unknown> =
|
|
109
|
+
| { id: number; ok: true; result: R }
|
|
110
|
+
| { id: number; ok: false; error: WireError }
|
|
111
|
+
|
|
112
|
+
export interface WireError {
|
|
113
|
+
name: string
|
|
114
|
+
message: string
|
|
115
|
+
stack: string
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// Per-request result shapes:
|
|
119
|
+
|
|
120
|
+
export interface CreateRuntimeResult {
|
|
121
|
+
paramNames: string[]
|
|
122
|
+
outputShape: number[]
|
|
123
|
+
kernelCount: number
|
|
124
|
+
captureShapes: Record<string, number[]>
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
export interface CompileForwardResult {
|
|
128
|
+
paramNames: string[]
|
|
129
|
+
outputShape: number[]
|
|
130
|
+
kernelCount: number
|
|
131
|
+
captureShapes: Record<string, number[]>
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
/** Step without `withCaptures` returns just `loss`. With captures, also
|
|
135
|
+
* populates `captures` (per-name Float32Array, all transferred back). */
|
|
136
|
+
export interface StepResultWire {
|
|
137
|
+
loss: number
|
|
138
|
+
captures: Record<string, Float32Array> | null
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/** Run without `withCaptures` returns `{ output, captures: null }`.
|
|
142
|
+
* With captures, also populates `captures`. */
|
|
143
|
+
export interface RunResultWire {
|
|
144
|
+
output: Float32Array
|
|
145
|
+
captures: Record<string, Float32Array> | null
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
export interface DownloadParamsResult {
|
|
149
|
+
params: Record<string, Float32Array> // transferred
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
// ============================================================================
|
|
153
|
+
// Transfer-list helpers
|
|
154
|
+
// ============================================================================
|
|
155
|
+
|
|
156
|
+
/** Collect the underlying ArrayBuffers from a Record of typed arrays so we
|
|
157
|
+
* can pass them on `postMessage`'s transfer list. The values themselves
|
|
158
|
+
* stay in the Record; only their backing buffers move. */
|
|
159
|
+
export function transferablesOfRecord(
|
|
160
|
+
rec: Record<string, Int32Array | Float32Array>,
|
|
161
|
+
): ArrayBuffer[] {
|
|
162
|
+
const out: ArrayBuffer[] = []
|
|
163
|
+
for (const v of Object.values(rec)) out.push(v.buffer as ArrayBuffer)
|
|
164
|
+
return out
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
/** Serialize an Error to a wire-friendly shape, preserving stack + name so
|
|
168
|
+
* the receiving side can reconstitute an Error that an `instanceof`-aware
|
|
169
|
+
* caller (e.g., for `ShapeError`) can still pattern-match by name. */
|
|
170
|
+
export function wireError(e: unknown): WireError {
|
|
171
|
+
if (e instanceof Error) {
|
|
172
|
+
return { name: e.name, message: e.message, stack: e.stack ?? '' }
|
|
173
|
+
}
|
|
174
|
+
return { name: 'Error', message: String(e), stack: '' }
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
/** Reconstitute an Error from the wire shape on the receiving (main) side. */
|
|
178
|
+
export function reconstituteError(w: WireError): Error {
|
|
179
|
+
const err = new Error(w.message)
|
|
180
|
+
err.name = w.name
|
|
181
|
+
err.stack = w.stack
|
|
182
|
+
return err
|
|
183
|
+
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
// Main-thread half of the worker channel: request/response correlation,
|
|
2
|
+
// promise wiring, error reconstitution. Knows nothing about Adam, captures,
|
|
3
|
+
// IR, etc. — just shuttles typed messages.
|
|
4
|
+
|
|
5
|
+
import type { Req, Res, WireError } from './worker-protocol.js'
|
|
6
|
+
import { reconstituteError } from './worker-protocol.js'
|
|
7
|
+
|
|
8
|
+
interface PendingHandlers {
|
|
9
|
+
resolve: (v: unknown) => void
|
|
10
|
+
reject: (e: Error) => void
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
/** Spawn a worker from an inlined source string and provide a typed
|
|
14
|
+
* request/response channel. One WorkerProxy = one Worker = one GPUDevice
|
|
15
|
+
* on the worker side. Sibling graphs share the same WorkerProxy. */
|
|
16
|
+
export class WorkerProxy {
|
|
17
|
+
private worker: Worker
|
|
18
|
+
private nextId = 1
|
|
19
|
+
private pending = new Map<number, PendingHandlers>()
|
|
20
|
+
private terminated = false
|
|
21
|
+
|
|
22
|
+
constructor(workerSource: string) {
|
|
23
|
+
const blob = new Blob([workerSource], { type: 'application/javascript' })
|
|
24
|
+
const url = URL.createObjectURL(blob)
|
|
25
|
+
this.worker = new Worker(url, { type: 'module' })
|
|
26
|
+
// The Blob URL keeps memory alive as long as it's referenced; revoke
|
|
27
|
+
// once the worker has loaded its source. Browsers tolerate revoke
|
|
28
|
+
// immediately after construction in practice.
|
|
29
|
+
URL.revokeObjectURL(url)
|
|
30
|
+
|
|
31
|
+
this.worker.onmessage = (ev: MessageEvent<Res>) => {
|
|
32
|
+
const reply = ev.data
|
|
33
|
+
const handlers = this.pending.get(reply.id)
|
|
34
|
+
if (!handlers) return // stale reply; ignore
|
|
35
|
+
this.pending.delete(reply.id)
|
|
36
|
+
if (reply.ok) handlers.resolve(reply.result)
|
|
37
|
+
else handlers.reject(reconstituteError(reply.error))
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
this.worker.onerror = (ev: ErrorEvent) => {
|
|
41
|
+
const err = new Error(`tensorgrad worker error: ${ev.message || 'unknown'}`)
|
|
42
|
+
const wire: WireError = { name: 'WorkerError', message: err.message, stack: err.stack ?? '' }
|
|
43
|
+
// Reject everything in flight; subsequent calls will fail too.
|
|
44
|
+
for (const handlers of this.pending.values()) handlers.reject(reconstituteError(wire))
|
|
45
|
+
this.pending.clear()
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
/** Send a request and await its matching response. `transfer` lists the
|
|
50
|
+
* ArrayBuffers to move (zero-copy) into the worker. */
|
|
51
|
+
request<R>(req: Omit<Req, 'id'>, transfer: ArrayBuffer[] = []): Promise<R> {
|
|
52
|
+
if (this.terminated) return Promise.reject(new Error('tensorgrad: worker has been terminated'))
|
|
53
|
+
const id = this.nextId++
|
|
54
|
+
return new Promise<R>((resolve, reject) => {
|
|
55
|
+
this.pending.set(id, { resolve: resolve as (v: unknown) => void, reject })
|
|
56
|
+
this.worker.postMessage({ ...req, id } as Req, transfer)
|
|
57
|
+
})
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/** Fire-and-forget variant for cases where the caller doesn't need a reply
|
|
61
|
+
* (currently unused; keep for symmetry / future use). */
|
|
62
|
+
send(req: Omit<Req, 'id'>, transfer: ArrayBuffer[] = []): void {
|
|
63
|
+
if (this.terminated) return
|
|
64
|
+
const id = this.nextId++
|
|
65
|
+
this.worker.postMessage({ ...req, id } as Req, transfer)
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
terminate(): void {
|
|
69
|
+
if (this.terminated) return
|
|
70
|
+
this.terminated = true
|
|
71
|
+
this.worker.terminate()
|
|
72
|
+
const err = new Error('tensorgrad: worker terminated')
|
|
73
|
+
for (const handlers of this.pending.values()) handlers.reject(err)
|
|
74
|
+
this.pending.clear()
|
|
75
|
+
}
|
|
76
|
+
}
|