npm - tensorgrad - Versions diffs - 0.0.15 → 0.0.16 - Mend

tensorgrad 0.0.15 → 0.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

package/src/module.ts CHANGED Viewed

@@ -32,30 +32,47 @@ import { paramInput } from './trace.js'
 // Init metadata
 // ============================================================================
-/** How a parameter's initial values are produced.
- *  - `'randn'` — Gaussian, with `scale` (default 0.02). The common case for
- *    weight matrices and embeddings.
- *  - `'zeros'` — fill with 0. Common for biases and LayerNorm beta.
- *  - `'ones'`  — fill with 1. Common for LayerNorm gain.
- *  - Custom function — receives total element count and shape, returns the
- *    Float32Array. Use for fan-in scaling or any non-standard scheme.
+/** How a parameter's initial values are produced. Serializable shape — no
+ *  closures, since the initial values cross the worker boundary at compile
+ *  time. Use the `init` helpers for ergonomic construction.
+ *
+ *  String shorthands:
+ *  - `'randn'` — Gaussian with std 0.02 (the common weight-matrix init).
+ *  - `'zeros'` — fill with 0 (biases, LayerNorm beta).
+ *  - `'ones'`  — fill with 1 (LayerNorm gain).
+ *
+ *  Object shapes:
+ *  - `{ kind: 'randn', scale }` — randn with explicit std.
+ *  - `{ kind: 'kaiming', gain? }` — `std = gain / sqrt(fan_in)`. Default
+ *    gain `sqrt(2)` (good for ReLU). `fan_in = shape[0]`, falling back to
+ *    the element count when the shape has no dimensions.
+ *  - `{ kind: 'literal', data }` — explicit Float32Array; length must
+ *    match the parameter's element count.
  */
 export type InitSpec =
   | 'randn'
   | 'zeros'
   | 'ones'
-  | ((size: number, shape: readonly number[]) => Float32Array)
+  | { readonly kind: 'randn'; readonly scale: number }
+  | { readonly kind: 'kaiming'; readonly gain?: number }
+  | { readonly kind: 'literal'; readonly data: Float32Array }
+/** Helper constructors that build the object-shaped `InitSpec` variants. */
+export const init = {
+  /** `{ kind: 'randn' }` with the given std; 0.02 when `scale` is omitted. */
+  randn: (opts: { scale?: number } = {}): InitSpec => {
+    const scale = opts.scale ?? 0.02
+    return { kind: 'randn', scale }
+  },
+  /** `{ kind: 'kaiming' }`; the `gain` key is present only when supplied. */
+  kaiming: (opts: { gain?: number } = {}): InitSpec => {
+    if (opts.gain === undefined) return { kind: 'kaiming' }
+    return { kind: 'kaiming', gain: opts.gain }
+  },
+  /** `{ kind: 'literal' }` wrapping `data` as-is (no copy is made here). */
+  literal: (data: Float32Array): InitSpec => {
+    return { kind: 'literal', data }
+  },
+}
 export interface ParamOptions {
   dtype?: Dtype
-  /** Init kind. Default: `'randn'`. */
+  /** Init shape. Default: `'randn'` (std 0.02). */
   init?: InitSpec
-  /** Std dev for `'randn'`. Default 0.02. Ignored for non-randn init. */
-  scale?: number
   /** Whether AdamW (when `weightDecay > 0`) should apply decoupled weight
-   *  decay to this param. Default: `true` for `'randn'` init (weight matrices,
-   *  embeddings), `false` for `'zeros'` / `'ones'` (biases, LN gains). Override
-   *  to force or skip. Replaces `adam.decayFilter` for the common case. */
+   *  decay to this param. Default: `true` for randn/kaiming/literal init
+   *  (weight matrices, embeddings); `false` for zeros/ones (biases, LN
+   *  gains). Override to force or skip. Replaces `adam.decayFilter` for
+   *  the common case. */
   decay?: boolean
 }
@@ -65,31 +82,52 @@ function boxMuller(): number {
   return Math.sqrt(-2 * Math.log(Math.max(1e-10, Math.random()))) * Math.cos(2 * Math.PI * Math.random())
 }
-function resolveInit(opts: ParamOptions | undefined): InitFn {
-  const init = opts?.init ?? 'randn'
-  if (init === 'randn') {
-    const scale = opts?.scale ?? 0.02
-    return (size) => {
-      const arr = new Float32Array(size)
-      for (let i = 0; i < size; i++) arr[i] = boxMuller() * scale
-      return arr
+/** Build an initializer that fills a fresh Float32Array of `size` elements
+ *  with `boxMuller()` samples, each multiplied by `scale`. */
+function randnFn(scale: number): InitFn {
+  return (size) => {
+    const samples = new Float32Array(size)
+    let i = 0
+    while (i < size) {
+      samples[i] = boxMuller() * scale
+      i += 1
+    }
+    return samples
+  }
+}
+/** Compile-time-only: resolve an InitSpec shape into the closure that
+ *  generates the initial Float32Array for a given parameter shape. Runs
+ *  on the main thread before initial values are transferred to the worker. */
+function resolveInit(spec: InitSpec | undefined): InitFn {
+  if (!spec || spec === 'randn') return randnFn(0.02)
+  if (spec === 'zeros') return (size) => new Float32Array(size)
+  if (spec === 'ones') return (size) => { const a = new Float32Array(size); a.fill(1); return a }
+  switch (spec.kind) {
+    case 'randn': return randnFn(spec.scale)
+    case 'kaiming': {
+      const gain = spec.gain ?? Math.sqrt(2)
+      return (size, shape) => {
+        const fanIn = shape[0] ?? size
+        const std = gain / Math.sqrt(fanIn)
+        const arr = new Float32Array(size)
+        for (let i = 0; i < size; i++) arr[i] = boxMuller() * std
+        return arr
+      }
+    }
+    case 'literal': {
+      const data = spec.data
+      return (size) => {
+        if (data.length !== size) {
+          throw new Error(`init.literal: data length ${data.length} doesn't match param size ${size}`)
+        }
+        return new Float32Array(data)
+      }
     }
   }
-  if (init === 'zeros') return (size) => new Float32Array(size)
-  if (init === 'ones') return (size) => { const a = new Float32Array(size); a.fill(1); return a }
-  if (typeof init === 'function') return init
-  throw new Error(`Unknown init: ${String(init)}`)
 }
-/** Resolve the decay default for a param. Decay weight matrices and
- *  embedding tables (randn-initialized); skip biases (zeros) and LN gains
- *  (ones). Custom init functions default to "decay" — most user-supplied
- *  inits are weight-shaped (Kaiming etc.). Explicit `decay: false` overrides. */
+/** Resolve the decay default for a param. Weight-shaped inits (randn,
+ *  kaiming, literal) default to decay=true; ones/zeros default to false
+ *  (biases, LN gains). Explicit `decay` opt overrides. */
 function resolveDecay(opts: ParamOptions | undefined): boolean {
   if (opts?.decay !== undefined) return opts.decay
-  const init = opts?.init ?? 'randn'
-  if (init === 'zeros' || init === 'ones') return false
-  return true   // 'randn' or function
+  const spec = opts?.init ?? 'randn'
+  return spec !== 'zeros' && spec !== 'ones'
 }
 // ============================================================================
@@ -127,7 +165,7 @@ export abstract class Module {
   protected param(shape: Shape, opts?: ParamOptions): Tensor {
     const dtype = opts?.dtype ?? 'f32'
     // Lie to TypeScript: the sentinel becomes a Tensor at materialize time.
-    return new ParamSentinel(shape, dtype, resolveInit(opts), resolveDecay(opts)) as unknown as Tensor
+    return new ParamSentinel(shape, dtype, resolveInit(opts?.init), resolveDecay(opts)) as unknown as Tensor
   }
 }

package/src/worker-protocol.ts ADDED Viewed

@@ -0,0 +1,183 @@
+// Wire format for the main-thread ↔ worker postMessage channel.
+//
+// All requests carry a numeric `id` assigned by the main thread; responses
+// echo it back so the proxy can match concurrent in-flight calls. Every
+// response is either `{ ok: true, result }` or `{ ok: false, error }`.
+// Errors carry serialized name/message/stack so the proxy can reconstitute
+// a plain `Error` carrying the original `name`; receivers should match on
+// `err.name`, since `instanceof` against Error subclasses will not hold.
+//
+// Inputs (typed arrays) and outputs (typed arrays, captures) are transferred
+// rather than copied — see the per-request notes for which fields go on the
+// transfer list. A single worker may host multiple compiled graphs (a train
+// graph plus sibling forward graphs); each has a `graphId` issued by the
+// main thread at compile time.
+import type { Graph } from './ir.js'
+import type { BufferPlan } from './buffers.js'
+import type { KernelSpec } from './codegen.js'
+import type { LRSchedule } from './adam.js'
+// ============================================================================
+// Serializable config (subset of AdamResolvedConfig that crosses the wire).
+// `decayFilter` (a function, used only at compile time) is NOT part of this —
+// the per-param decay decision is already baked into the IR by appendAdam
+// before the IR ships to the worker.
+// ============================================================================
+/** Adam settings shipped to the worker alongside a training graph. On the
+ *  worker side `lr`, `b1`, `b2` and `weightDecay` feed the per-step scalars
+ *  written under `lrtInputName` / `decayShrinkInputName`. */
+export interface WireAdamConfig {
+  lr: LRSchedule
+  b1: number
+  b2: number
+  eps: number
+  weightDecay: number
+  lrIsScheduled: boolean
+  /** Names of the per-step scalar inputs the worker must populate before
+   *  every step (`_adam_lrt`, optionally `_adam_decay_shrink`). Mirrors
+   *  AdamResult so the worker can update them without re-deriving. */
+  lrtInputName: string
+  decayShrinkInputName: string | null
+}
+/** Compile output that crosses to the worker. Same fields as CompiledIR
+ *  minus the `loss` tensor (carried by graph.outputs[0]). */
+export interface WireIR {
+  /** IR graph; the worker reads `graph.outputs[0]` as the output tensor id. */
+  graph: Graph
+  /** Buffer plan; the worker looks up the output buffer, captures and
+   *  params through it. */
+  plan: BufferPlan
+  /** Kernel specs handed to `createRuntime` on the worker side. */
+  kernels: KernelSpec[]
+}
+// ============================================================================
+// Requests (main → worker)
+// ============================================================================
+/** Every message the main thread can post to the worker. `kind` is the
+ *  discriminant that selects the payload shape; `id` is echoed back in the
+ *  matching response. */
+export type Req =
+  | { id: number; kind: 'createRuntime'; payload: CreateRuntimePayload }
+  | { id: number; kind: 'compileForward'; payload: CompileForwardPayload }
+  | { id: number; kind: 'step'; payload: StepPayload }
+  | { id: number; kind: 'run'; payload: RunPayload }
+  | { id: number; kind: 'uploadParams'; payload: UploadParamsPayload }
+  | { id: number; kind: 'downloadParams'; payload: { graphId: number } }
+  | { id: number; kind: 'downloadParamGrads'; payload: { graphId: number } }
+  | { id: number; kind: 'resetOptimizer'; payload: { graphId: number } }
+  | { id: number; kind: 'destroy'; payload: { graphId: number } }
+/** Build the training runtime. Always graphId=0 for a fresh worker. */
+export interface CreateRuntimePayload {
+  /** Key under which the worker registers the new graph. */
+  graphId: number
+  /** Compiled graph, buffer plan and kernels for this runtime. */
+  ir: WireIR
+  /** Initial param values per name. Transferred (zero-copy) — the main
+   *  thread loses access after postMessage. */
+  initialParams: Record<string, Float32Array>
+  /** Adam config when training; `null` for forward-only compiles. */
+  adam: WireAdamConfig | null
+}
+/** Build a sibling forward-only graph that shares param buffers with an
+ *  existing graph (typically the training graph at graphId=0). */
+export interface CompileForwardPayload {
+  /** Key under which the worker registers the new forward graph. */
+  graphId: number
+  /** Graph whose params are shared; the worker fails the request when no
+   *  graph is registered under this id. */
+  parentGraphId: number
+  /** Compiled graph, buffer plan and kernels for the forward runtime. */
+  ir: WireIR
+}
+/** One training step. Inputs are transferred; the caller's typed arrays
+ *  become detached after postMessage. */
+export interface StepPayload {
+  /** Graph to step. */
+  graphId: number
+  /** Input arrays keyed by name. On training graphs the worker adds the
+   *  Adam scalar inputs itself, so they are not supplied here. */
+  inputs: Record<string, Int32Array | Float32Array>
+  /** When true the reply's `captures` is populated; otherwise it is `null`. */
+  withCaptures: boolean
+}
+/** Forward-only run. Same transfer semantics as `step`. */
+export interface RunPayload {
+  /** Graph to run. */
+  graphId: number
+  /** Input arrays keyed by name; passed to the runtime unchanged. */
+  inputs: Record<string, Int32Array | Float32Array>
+  /** When true the reply's `captures` is populated; otherwise it is `null`. */
+  withCaptures: boolean
+}
+/** Upload param values to the runtime of an already-registered graph. */
+export interface UploadParamsPayload {
+  graphId: number
+  params: Record<string, Float32Array>  // transferred
+  /** Forwarded to the runtime's `uploadParams` as its `partial` option. */
+  partial: boolean
+}
+// ============================================================================
+// Responses (worker → main)
+// ============================================================================
+/** Reply envelope. `id` echoes the request; `ok` discriminates between a
+ *  successful `result` and a serialized `error`. */
+export type Res<R = unknown> =
+  | { id: number; ok: true; result: R }
+  | { id: number; ok: false; error: WireError }
+/** Plain-object form of a thrown value, as produced by `wireError`.
+ *  `stack` is the empty string when the original carried none. */
+export interface WireError {
+  name: string
+  message: string
+  stack: string
+}
+// Per-request result shapes:
+/** Metadata returned once a `createRuntime` request has built its graph. */
+export interface CreateRuntimeResult {
+  /** Param names taken from the buffer plan. */
+  paramNames: string[]
+  /** Shape reported by the runtime for its output. */
+  outputShape: number[]
+  /** Number of kernel specs that carry WGSL source. */
+  kernelCount: number
+  /** Shape of each named capture buffer. */
+  captureShapes: Record<string, number[]>
+}
+/** Metadata returned once a `compileForward` request has built its graph.
+ *  Field-for-field the same shape as the `createRuntime` reply. */
+export interface CompileForwardResult {
+  /** Param names taken from the buffer plan. */
+  paramNames: string[]
+  /** Shape reported by the runtime for its output. */
+  outputShape: number[]
+  /** Number of kernel specs that carry WGSL source. */
+  kernelCount: number
+  /** Shape of each named capture buffer. */
+  captureShapes: Record<string, number[]>
+}
+/** Step without `withCaptures` returns `{ loss, captures: null }`. With
+ *  captures, also populates `captures` (per-name Float32Array, all
+ *  transferred back). */
+export interface StepResultWire {
+  loss: number
+  captures: Record<string, Float32Array> | null
+}
+/** Run without `withCaptures` returns `{ output, captures: null }`.
+ *  With captures, also populates `captures`. */
+export interface RunResultWire {
+  /** Output values; the backing buffer is transferred back to the caller. */
+  output: Float32Array
+  captures: Record<string, Float32Array> | null
+}
+/** Reply shape used by both `downloadParams` and `downloadParamGrads`. */
+export interface DownloadParamsResult {
+  params: Record<string, Float32Array>  // transferred
+}
+// ============================================================================
+// Transfer-list helpers
+// ============================================================================
+/** Collect the underlying ArrayBuffers from a Record of typed arrays so we
+ *  can pass them on `postMessage`'s transfer list. The values themselves
+ *  stay in the Record; only their backing buffers move.
+ *
+ *  Each buffer is listed once even when several views share it, because
+ *  `postMessage` throws a DataCloneError when the transfer list names the
+ *  same buffer twice. First-seen order is preserved. */
+export function transferablesOfRecord(
+  rec: Record<string, Int32Array | Float32Array>,
+): ArrayBuffer[] {
+  const unique = new Set<ArrayBuffer>()
+  for (const view of Object.values(rec)) unique.add(view.buffer as ArrayBuffer)
+  return [...unique]
+}
+/** Flatten any thrown value into a `WireError`. Error instances keep their
+ *  `name`, `message` and `stack` (empty string when there is no stack);
+ *  anything else is stringified into `message` under the name `'Error'`.
+ *  The receiving side can match on `name` to recognise specific error
+ *  kinds (e.g. `ShapeError`). */
+export function wireError(e: unknown): WireError {
+  if (!(e instanceof Error)) {
+    return { name: 'Error', message: String(e), stack: '' }
+  }
+  const stack = e.stack ?? ''
+  return { name: e.name, message: e.message, stack }
+}
+/** Reconstitute an Error from the wire shape on the receiving (main) side.
+ *  The result is a plain `Error` whose `name` and `message` come from the
+ *  wire, so callers should match on `name` rather than on `instanceof` a
+ *  subclass. The remote stack replaces the local one only when it is
+ *  non-empty; an error serialized without a stack keeps the local trace
+ *  instead of ending up with an empty string. */
+export function reconstituteError(w: WireError): Error {
+  const err = new Error(w.message)
+  err.name = w.name
+  if (w.stack) err.stack = w.stack
+  return err
+}

package/src/worker-proxy.ts ADDED Viewed

@@ -0,0 +1,76 @@
+// Main-thread half of the worker channel: request/response correlation,
+// promise wiring, error reconstitution. Knows nothing about Adam, captures,
+// IR, etc. — just shuttles typed messages.
+import type { Req, Res, WireError } from './worker-protocol.js'
+import { reconstituteError } from './worker-protocol.js'
+/** Settle callbacks of one in-flight request, stored under its request id
+ *  until the matching reply arrives or the proxy is torn down. */
+interface PendingHandlers {
+  resolve: (v: unknown) => void
+  reject: (e: Error) => void
+}
+/** Spawn a worker from an inlined source string and provide a typed
+ *  request/response channel. One WorkerProxy = one Worker = one GPUDevice
+ *  on the worker side. Sibling graphs share the same WorkerProxy. */
+export class WorkerProxy {
+  private worker: Worker
+  private nextId = 1
+  private pending = new Map<number, PendingHandlers>()
+  private terminated = false
+  /** Start a module worker from `workerSource` and wire up reply/error
+   *  handling for the requests issued through this proxy. */
+  constructor(workerSource: string) {
+    const blob = new Blob([workerSource], { type: 'application/javascript' })
+    const url = URL.createObjectURL(blob)
+    this.worker = new Worker(url, { type: 'module' })
+    // The Blob URL keeps memory alive as long as it's referenced; revoke
+    // once the worker has loaded its source. Browsers tolerate revoke
+    // immediately after construction in practice.
+    URL.revokeObjectURL(url)
+    this.worker.onmessage = (ev: MessageEvent<Res>) => {
+      const reply = ev.data
+      const handlers = this.pending.get(reply.id)
+      if (!handlers) return  // stale reply; ignore
+      this.pending.delete(reply.id)
+      if (reply.ok) handlers.resolve(reply.result)
+      else handlers.reject(reconstituteError(reply.error))
+    }
+    this.worker.onerror = (ev: ErrorEvent) => {
+      const err = new Error(`tensorgrad worker error: ${ev.message || 'unknown'}`)
+      const wire: WireError = { name: 'WorkerError', message: err.message, stack: err.stack ?? '' }
+      // Reject everything in flight. The proxy is not marked terminated
+      // here, so later requests are still posted to the worker.
+      for (const handlers of this.pending.values()) handlers.reject(reconstituteError(wire))
+      this.pending.clear()
+    }
+  }
+  /** Send a request and await its matching response. `transfer` lists the
+   *  ArrayBuffers to move (zero-copy) into the worker. Rejects immediately
+   *  when the proxy has been terminated or when `postMessage` itself throws. */
+  request<R>(req: Omit<Req, 'id'>, transfer: ArrayBuffer[] = []): Promise<R> {
+    if (this.terminated) return Promise.reject(new Error('tensorgrad: worker has been terminated'))
+    const id = this.nextId++
+    return new Promise<R>((resolve, reject) => {
+      this.pending.set(id, { resolve: resolve as (v: unknown) => void, reject })
+      try {
+        this.worker.postMessage({ ...req, id } as Req, transfer)
+      } catch (e) {
+        // postMessage throws synchronously (e.g. DataCloneError) when the
+        // payload cannot be cloned or transferred. No reply will ever
+        // arrive for this id, so drop the entry rather than leak it.
+        this.pending.delete(id)
+        reject(e)
+      }
+    })
+  }
+  /** Fire-and-forget variant for cases where the caller doesn't need a reply
+   *  (currently unused; keep for symmetry / future use). No pending entry is
+   *  recorded, so any reply carrying this id is dropped as stale. */
+  send(req: Omit<Req, 'id'>, transfer: ArrayBuffer[] = []): void {
+    if (this.terminated) return
+    const id = this.nextId++
+    this.worker.postMessage({ ...req, id } as Req, transfer)
+  }
+  /** Stop the worker and reject every request still in flight. Idempotent;
+   *  `request` rejects and `send` is a no-op afterwards. */
+  terminate(): void {
+    if (this.terminated) return
+    this.terminated = true
+    this.worker.terminate()
+    const err = new Error('tensorgrad: worker terminated')
+    for (const handlers of this.pending.values()) handlers.reject(err)
+    this.pending.clear()
+  }
+}

package/src/worker.ts ADDED Viewed

@@ -0,0 +1,281 @@
+// Worker entry point. Holds the GPUDevice + CompiledRuntime for one or more
+// graphs and proxies main-thread requests via postMessage. See
+// specs/WorkerArchitecture.md for the rationale.
+//
+// Keep this file dependency-free of anything DOM-y: it bundles into a Blob
+// URL and runs in a Web Worker context where `window`/`document` don't
+// exist. WebGPU IS available in workers (Chrome 113+, Safari 17.4+).
+import { createRuntime, type CompiledRuntime, type RuntimeOpts } from './runtime.js'
+import { resolveLR, type LRSchedule } from './adam.js'
+import type { Req, Res, WireIR, WireAdamConfig, WireError } from './worker-protocol.js'
+import { wireError } from './worker-protocol.js'
+// ----------------------------------------------------------------------------
+// Per-graph state
+// ----------------------------------------------------------------------------
+/** Everything the worker keeps for one registered graph: its runtime plus
+ *  the metadata that was reported back when the graph was built. */
+interface GraphSlot {
+  runtime: CompiledRuntime
+  paramNames: readonly string[]
+  outputShape: number[]
+  /** Number of kernel specs that carry WGSL source. */
+  kernelCount: number
+  captureShapes: Record<string, number[]>
+  /** Adam state for this graph, if it's a training graph. The wrapped step
+   *  uses these to populate the per-step lrt and decayShrink scalars. */
+  adam: AdamState | null
+}
+/** Worker-side Adam bookkeeping for one training graph. */
+interface AdamState {
+  config: WireAdamConfig
+  /** Step counter: incremented before each step, zeroed on optimizer reset. */
+  t: number
+  /** One-element scratch array holding the current lrt scalar. */
+  lrtBuf: Float32Array
+  /** One-element scratch array for the decay-shrink scalar; `null` when the
+   *  config names no decay-shrink input. */
+  decayShrinkBuf: Float32Array | null
+}
+/** Registered graphs, keyed by the `graphId` chosen by the main thread. */
+const graphs = new Map<number, GraphSlot>()
+// Worker holds one device shared across all graphs (sibling forward graphs
+// must share param GPUBuffers, which means sharing a device).
+// Stays `null` until `ensureDevice` has acquired one.
+let device: GPUDevice | null = null
+/** Return the worker-wide GPUDevice, requesting it on first use and caching
+ *  it in `device` afterwards. Throws when WebGPU or an adapter is missing. */
+async function ensureDevice(): Promise<GPUDevice> {
+  if (device) return device
+  const webgpuMissing = typeof navigator === 'undefined' || !navigator.gpu
+  if (webgpuMissing) {
+    throw new Error('tensorgrad worker: WebGPU not available in this environment')
+  }
+  const adapter = await navigator.gpu.requestAdapter()
+  if (!adapter) {
+    throw new Error('tensorgrad worker: no WebGPU adapter')
+  }
+  const acquired = await adapter.requestDevice()
+  device = acquired
+  return acquired
+}
+// ----------------------------------------------------------------------------
+// Request handlers
+// ----------------------------------------------------------------------------
+/** Build the runtime for a new graph and register it under
+ *  `payload.graphId`. Initial params are uploaded only when at least one is
+ *  supplied; Adam state is created only when `payload.adam` is set. Returns
+ *  the metadata stored on the slot (param names, output shape, number of
+ *  kernels that carry WGSL, capture shapes).
+ *  Throws when the graph names no output tensor or the plan maps that tensor
+ *  to no buffer, rather than handing `undefined` to `createRuntime`. */
+async function handleCreateRuntime(payload: {
+  graphId: number
+  ir: WireIR
+  initialParams: Record<string, Float32Array>
+  adam: WireAdamConfig | null
+}): Promise<{ paramNames: string[]; outputShape: number[]; kernelCount: number; captureShapes: Record<string, number[]> }> {
+  const dev = await ensureDevice()
+  const { graph, plan, kernels } = payload.ir
+  const outputTensorId = graph.outputs[0]
+  if (outputTensorId === undefined) {
+    throw new Error('tensorgrad worker: createRuntime: graph has no output tensor')
+  }
+  const outputBufferId = plan.tensorToBuffer.get(outputTensorId)
+  if (outputBufferId === undefined) {
+    throw new Error(`tensorgrad worker: createRuntime: no buffer planned for output tensor ${String(outputTensorId)}`)
+  }
+  const opts: RuntimeOpts = { device: dev }
+  const runtime = await createRuntime(plan, kernels, outputBufferId, opts)
+  // Upload initial params.
+  if (Object.keys(payload.initialParams).length > 0) {
+    runtime.uploadParams(payload.initialParams)
+  }
+  // Capture shape metadata for return.
+  const captureShapes: Record<string, number[]> = {}
+  for (const [name, bufId] of plan.capturesByName) {
+    captureShapes[name] = [...plan.buffers[bufId]!.shape]
+  }
+  const slot: GraphSlot = {
+    runtime,
+    paramNames: [...plan.paramsByName.keys()],
+    outputShape: [...runtime.outputShape],
+    kernelCount: kernels.filter(k => k.wgsl).length,
+    captureShapes,
+    adam: payload.adam ? createAdamState(payload.adam) : null,
+  }
+  graphs.set(payload.graphId, slot)
+  return {
+    paramNames: [...slot.paramNames],
+    outputShape: slot.outputShape,
+    kernelCount: slot.kernelCount,
+    captureShapes: slot.captureShapes,
+  }
+}
+/** Build a forward-only runtime that reuses the params of the graph
+ *  registered under `payload.parentGraphId`, and register it under
+ *  `payload.graphId` with no Adam state. Returns the metadata stored on the
+ *  slot (param names, output shape, number of kernels that carry WGSL,
+ *  capture shapes).
+ *  Throws when the parent graph is unknown, when the graph names no output
+ *  tensor, or when the plan maps that tensor to no buffer. */
+async function handleCompileForward(payload: {
+  graphId: number
+  parentGraphId: number
+  ir: WireIR
+}): Promise<{ paramNames: string[]; outputShape: number[]; kernelCount: number; captureShapes: Record<string, number[]> }> {
+  const dev = await ensureDevice()
+  const parent = graphs.get(payload.parentGraphId)
+  if (!parent) throw new Error(`compileForward: parent graph ${payload.parentGraphId} not found`)
+  const { graph, plan, kernels } = payload.ir
+  const outputTensorId = graph.outputs[0]
+  if (outputTensorId === undefined) {
+    throw new Error('compileForward: graph has no output tensor')
+  }
+  const outputBufferId = plan.tensorToBuffer.get(outputTensorId)
+  if (outputBufferId === undefined) {
+    throw new Error(`compileForward: no buffer planned for output tensor ${String(outputTensorId)}`)
+  }
+  const opts: RuntimeOpts = { device: dev, sharedParams: parent.runtime.params }
+  const runtime = await createRuntime(plan, kernels, outputBufferId, opts)
+  // No initial-param upload — sharedParams covers everything.
+  const captureShapes: Record<string, number[]> = {}
+  for (const [name, bufId] of plan.capturesByName) {
+    captureShapes[name] = [...plan.buffers[bufId]!.shape]
+  }
+  const slot: GraphSlot = {
+    runtime,
+    paramNames: [...plan.paramsByName.keys()],
+    outputShape: [...runtime.outputShape],
+    kernelCount: kernels.filter(k => k.wgsl).length,
+    captureShapes,
+    adam: null,
+  }
+  graphs.set(payload.graphId, slot)
+  return {
+    paramNames: [...slot.paramNames],
+    outputShape: slot.outputShape,
+    kernelCount: slot.kernelCount,
+    captureShapes: slot.captureShapes,
+  }
+}
+/** Fresh Adam bookkeeping for a training graph: step counter at 0, a
+ *  one-element scratch array for the lrt scalar, and one for decay-shrink
+ *  only when the config names a decay-shrink input. */
+function createAdamState(cfg: WireAdamConfig): AdamState {
+  const wantsDecayShrink = Boolean(cfg.decayShrinkInputName)
+  const state: AdamState = {
+    config: cfg,
+    t: 0,
+    lrtBuf: new Float32Array(1),
+    decayShrinkBuf: wantsDecayShrink ? new Float32Array(1) : null,
+  }
+  return state
+}
+/** Advance the graph's Adam step counter and return `inputs` extended with
+ *  the per-step scalars: the bias-corrected learning rate under the config's
+ *  lrt input name and, when decay-shrink is configured, `1 - lr * weightDecay`
+ *  under its input name. Graphs without Adam state get `inputs` back
+ *  untouched. The scalar arrays are the slot's reusable scratch buffers. */
+function injectAdamScalars(slot: GraphSlot, inputs: Record<string, Int32Array | Float32Array>): Record<string, Int32Array | Float32Array> {
+  const adam = slot.adam
+  if (!adam) return inputs
+  adam.t += 1
+  const step = adam.t
+  const { lr, b1, b2, weightDecay, lrtInputName, decayShrinkInputName } = adam.config
+  const lrNow = resolveLR(lr as LRSchedule, step)
+  // Keep the original evaluation order: (lr * sqrt(1 - b2^t)) / (1 - b1^t).
+  const numerator = lrNow * Math.sqrt(1 - Math.pow(b2, step))
+  const denominator = 1 - Math.pow(b1, step)
+  adam.lrtBuf[0] = numerator / denominator
+  const withScalars: Record<string, Int32Array | Float32Array> = { ...inputs }
+  withScalars[lrtInputName] = adam.lrtBuf
+  if (adam.decayShrinkBuf && decayShrinkInputName) {
+    adam.decayShrinkBuf[0] = 1 - lrNow * weightDecay
+    withScalars[decayShrinkInputName] = adam.decayShrinkBuf
+  }
+  return withScalars
+}
+/** Run one training step on graph `graphId`, after adding the Adam scalars
+ *  to the inputs. Replies with the loss and, when captures were requested,
+ *  a plain record of them (otherwise `captures` is `null`). */
+async function handleStep(payload: {
+  graphId: number
+  inputs: Record<string, Int32Array | Float32Array>
+  withCaptures: boolean
+}): Promise<{ loss: number; captures: Record<string, Float32Array> | null }> {
+  const slot = mustGet(payload.graphId)
+  const merged = injectAdamScalars(slot, payload.inputs)
+  if (!payload.withCaptures) {
+    const loss = await slot.runtime.step(merged)
+    return { loss, captures: null }
+  }
+  const stepped = await slot.runtime.step(merged, { withCaptures: true })
+  const captures = capturesToRecord(stepped.captures, slot.captureShapes)
+  return { loss: stepped.loss, captures }
+}
+/** Run graph `graphId` forward on the given inputs. Replies with the output
+ *  and, when captures were requested, a plain record of them (otherwise
+ *  `captures` is `null`). */
+async function handleRun(payload: {
+  graphId: number
+  inputs: Record<string, Int32Array | Float32Array>
+  withCaptures: boolean
+}): Promise<{ output: Float32Array; captures: Record<string, Float32Array> | null }> {
+  const slot = mustGet(payload.graphId)
+  if (!payload.withCaptures) {
+    const output = await slot.runtime.run(payload.inputs)
+    return { output, captures: null }
+  }
+  const ran = await slot.runtime.run(payload.inputs, { withCaptures: true })
+  const captures = capturesToRecord(ran.captures, slot.captureShapes)
+  return { output: ran.output, captures }
+}
+/** Copy captures out of the runtime's capture container into a plain Record
+ *  so the Float32Arrays can be transferred without serializing the class. */
+function capturesToRecord(
+  captures: { get(name: string): Float32Array; has(name: string): boolean; names(): string[] },
+  // Names are taken from `shapes` rather than `captures.names()`, in case
+  // the latter is ever filtered (it isn't, but be safe).
+  shapes: Record<string, number[]>,
+): Record<string, Float32Array> {
+  const record: Record<string, Float32Array> = {}
+  Object.keys(shapes).forEach((name) => {
+    if (!captures.has(name)) return
+    record[name] = captures.get(name)
+  })
+  return record
+}
+/** Forward a param upload to the runtime of graph `graphId`, passing
+ *  `partial` through as an option. Throws when the graph is not registered. */
+function handleUploadParams(payload: {
+  graphId: number
+  params: Record<string, Float32Array>
+  partial: boolean
+}): void {
+  const { graphId, params, partial } = payload
+  mustGet(graphId).runtime.uploadParams(params, { partial })
+}
+/** Read back the params of graph `graphId`, wrapped as `{ params }`. */
+async function handleDownloadParams(payload: { graphId: number }): Promise<{ params: Record<string, Float32Array> }> {
+  const { runtime } = mustGet(payload.graphId)
+  const params = await runtime.downloadParams()
+  return { params }
+}
+/** Read back the param gradients of graph `graphId`. The reply reuses the
+ *  `{ params }` shape of the param download. */
+async function handleDownloadParamGrads(payload: { graphId: number }): Promise<{ params: Record<string, Float32Array> }> {
+  const { runtime } = mustGet(payload.graphId)
+  const grads = await runtime.downloadParamGrads()
+  return { params: grads }
+}
+/** Reset the runtime's optimizer state for graph `graphId` and, when the
+ *  graph has Adam state, rewind its step counter to 0. */
+function handleResetOptimizer(payload: { graphId: number }): void {
+  const { runtime, adam } = mustGet(payload.graphId)
+  runtime.resetOptimizerState()
+  if (adam) {
+    adam.t = 0
+  }
+}
+/** Destroy the runtime of graph `graphId` and forget the graph. Unknown ids
+ *  are ignored, so destroying twice is harmless. */
+function handleDestroy(payload: { graphId: number }): void {
+  const { graphId } = payload
+  const slot = graphs.get(graphId)
+  if (slot === undefined) return
+  slot.runtime.destroy()
+  graphs.delete(graphId)
+}
+/** Look up the slot registered under `graphId`, throwing when there is none. */
+function mustGet(graphId: number): GraphSlot {
+  const slot = graphs.get(graphId)
+  if (slot) return slot
+  throw new Error(`tensorgrad worker: graph ${graphId} not found`)
+}
+// ----------------------------------------------------------------------------
+// Message dispatch
+// ----------------------------------------------------------------------------
+/** Dispatch one request to its handler and reply with the same `id`:
+ *  `{ ok: true, result }` on success, `{ ok: false, error }` when the
+ *  handler (or posting the success reply) throws. Typed arrays contained in
+ *  a result are moved back through the transfer list. */
+self.onmessage = async (ev: MessageEvent<Req>) => {
+  const req = ev.data
+  try {
+    let result: unknown = null
+    let transferList: ArrayBuffer[] = []
+    switch (req.kind) {
+      case 'createRuntime':
+        result = await handleCreateRuntime(req.payload)
+        break
+      case 'compileForward':
+        result = await handleCompileForward(req.payload)
+        break
+      case 'step': {
+        const r = await handleStep(req.payload)
+        result = r
+        transferList = collectTransfers(r.captures)
+        break
+      }
+      case 'run': {
+        const r = await handleRun(req.payload)
+        result = r
+        transferList = [r.output.buffer as ArrayBuffer, ...collectTransfers(r.captures)]
+        break
+      }
+      case 'uploadParams':
+        handleUploadParams(req.payload)
+        break
+      case 'downloadParams': {
+        const r = await handleDownloadParams(req.payload)
+        result = r
+        transferList = collectTransfers(r.params)
+        break
+      }
+      case 'downloadParamGrads': {
+        const r = await handleDownloadParamGrads(req.payload)
+        result = r
+        transferList = collectTransfers(r.params)
+        break
+      }
+      case 'resetOptimizer':
+        handleResetOptimizer(req.payload)
+        break
+      case 'destroy':
+        handleDestroy(req.payload)
+        break
+      default: throw new Error(`unknown request kind: ${(req as { kind: string }).kind}`)
+    }
+    const reply: Res = { id: req.id, ok: true, result }
+    // One buffer may back several views (e.g. the output and a capture);
+    // postMessage throws when the transfer list names a buffer twice.
+    self.postMessage(reply, { transfer: [...new Set(transferList)] })
+  } catch (e) {
+    const error: WireError = wireError(e)
+    const reply: Res = { id: req.id, ok: false, error }
+    self.postMessage(reply)
+  }
+}
+/** Backing ArrayBuffers of every typed array in `rec`, for use as a
+ *  `postMessage` transfer list. Returns `[]` for `null`/`undefined`. Each
+ *  buffer appears once even when several views share it, because
+ *  `postMessage` throws a DataCloneError on duplicate transferables.
+ *  First-seen order is preserved. */
+function collectTransfers(rec: Record<string, Float32Array> | null | undefined): ArrayBuffer[] {
+  if (!rec) return []
+  const unique = new Set<ArrayBuffer>()
+  for (const view of Object.values(rec)) unique.add(view.buffer as ArrayBuffer)
+  return [...unique]
+}