tensorgrad 0.0.15 → 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/module.ts CHANGED
@@ -32,30 +32,47 @@ import { paramInput } from './trace.js'
 // Init metadata
 // ============================================================================
 
-/** How a parameter's initial values are produced.
- * - `'randn'` — Gaussian, with `scale` (default 0.02). The common case for
- *   weight matrices and embeddings.
- * - `'zeros'` — fill with 0. Common for biases and LayerNorm beta.
- * - `'ones'` — fill with 1. Common for LayerNorm gain.
- * - Custom function receives total element count and shape, returns the
- *   Float32Array. Use for fan-in scaling or any non-standard scheme.
+/** How a parameter's initial values are produced. Serializable shape — no
+ * closures, since the initial values cross the worker boundary at compile
+ * time. Use the `init` helpers for ergonomic construction.
+ *
+ * String shorthands:
+ * - `'randn'` — Gaussian with std 0.02 (the common weight-matrix init).
+ * - `'zeros'` — fill with 0 (biases, LayerNorm beta).
+ * - `'ones'` — fill with 1 (LayerNorm gain).
+ *
+ * Object shapes:
+ * - `{ kind: 'randn', scale }` — randn with explicit std.
+ * - `{ kind: 'kaiming', gain? }` — `std = gain / sqrt(fan_in)`. Default
+ *   gain `sqrt(2)` (good for ReLU). `fan_in = shape[0]`.
+ * - `{ kind: 'literal', data }` — explicit Float32Array; length must
+ *   match the parameter's element count.
  */
 export type InitSpec =
   | 'randn'
   | 'zeros'
   | 'ones'
-  | ((size: number, shape: readonly number[]) => Float32Array)
+  | { readonly kind: 'randn'; readonly scale: number }
+  | { readonly kind: 'kaiming'; readonly gain?: number }
+  | { readonly kind: 'literal'; readonly data: Float32Array }
+
+/** Ergonomic constructors for InitSpec object shapes. */
+export const init = {
+  randn: (opts: { scale?: number } = {}): InitSpec => ({ kind: 'randn', scale: opts.scale ?? 0.02 }),
+  kaiming: (opts: { gain?: number } = {}): InitSpec =>
+    opts.gain !== undefined ? { kind: 'kaiming', gain: opts.gain } : { kind: 'kaiming' },
+  literal: (data: Float32Array): InitSpec => ({ kind: 'literal', data }),
+}
 
 export interface ParamOptions {
   dtype?: Dtype
-  /** Init kind. Default: `'randn'`. */
+  /** Init shape. Default: `'randn'` (std 0.02). */
   init?: InitSpec
-  /** Std dev for `'randn'`. Default 0.02. Ignored for non-randn init. */
-  scale?: number
   /** Whether AdamW (when `weightDecay > 0`) should apply decoupled weight
-   * decay to this param. Default: `true` for `'randn'` init (weight matrices,
-   * embeddings), `false` for `'zeros'` / `'ones'` (biases, LN gains). Override
-   * to force or skip. Replaces `adam.decayFilter` for the common case. */
+   * decay to this param. Default: `true` for randn/kaiming/literal init
+   * (weight matrices, embeddings); `false` for zeros/ones (biases, LN
+   * gains). Override to force or skip. Replaces `adam.decayFilter` for
+   * the common case. */
   decay?: boolean
 }
 
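For orientation, the string shorthands and the `init` helpers produce interchangeable `InitSpec` values. A minimal usage sketch (only names from the diff above are used; the import path is an assumption):

    import { init, type InitSpec } from './module.js' // path assumed

    const a: InitSpec = 'zeros'                          // bias-style; decay defaults to false
    const b: InitSpec = init.randn()                     // { kind: 'randn', scale: 0.02 }
    const c: InitSpec = init.randn({ scale: 0.01 })      // explicit std
    const d: InitSpec = init.kaiming()                   // { kind: 'kaiming' }; std resolved from fan_in later
    const e: InitSpec = init.literal(new Float32Array([1, 0, 0, 1])) // length must equal element count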
@@ -65,31 +82,52 @@ function boxMuller(): number {
   return Math.sqrt(-2 * Math.log(Math.max(1e-10, Math.random()))) * Math.cos(2 * Math.PI * Math.random())
 }
 
-function resolveInit(opts: ParamOptions | undefined): InitFn {
-  const init = opts?.init ?? 'randn'
-  if (init === 'randn') {
-    const scale = opts?.scale ?? 0.02
-    return (size) => {
-      const arr = new Float32Array(size)
-      for (let i = 0; i < size; i++) arr[i] = boxMuller() * scale
-      return arr
+function randnFn(scale: number): InitFn {
+  return (size) => {
+    const arr = new Float32Array(size)
+    for (let i = 0; i < size; i++) arr[i] = boxMuller() * scale
+    return arr
+  }
+}
+
+/** Compile-time-only: resolve an InitSpec shape into the closure that
+ * generates the initial Float32Array for a given parameter shape. Runs
+ * on the main thread before initial values are transferred to the worker. */
+function resolveInit(spec: InitSpec | undefined): InitFn {
+  if (!spec || spec === 'randn') return randnFn(0.02)
+  if (spec === 'zeros') return (size) => new Float32Array(size)
+  if (spec === 'ones') return (size) => { const a = new Float32Array(size); a.fill(1); return a }
+  switch (spec.kind) {
+    case 'randn': return randnFn(spec.scale)
+    case 'kaiming': {
+      const gain = spec.gain ?? Math.sqrt(2)
+      return (size, shape) => {
+        const fanIn = shape[0] ?? size
+        const std = gain / Math.sqrt(fanIn)
+        const arr = new Float32Array(size)
+        for (let i = 0; i < size; i++) arr[i] = boxMuller() * std
+        return arr
+      }
+    }
+    case 'literal': {
+      const data = spec.data
+      return (size) => {
+        if (data.length !== size) {
+          throw new Error(`init.literal: data length ${data.length} doesn't match param size ${size}`)
+        }
+        return new Float32Array(data)
+      }
     }
   }
-  if (init === 'zeros') return (size) => new Float32Array(size)
-  if (init === 'ones') return (size) => { const a = new Float32Array(size); a.fill(1); return a }
-  if (typeof init === 'function') return init
-  throw new Error(`Unknown init: ${String(init)}`)
 }
 
-/** Resolve the decay default for a param. Decay weight matrices and
- * embedding tables (randn-initialized); skip biases (zeros) and LN gains
- * (ones). Custom init functions default to "decay" — most user-supplied
- * inits are weight-shaped (Kaiming etc.). Explicit `decay: false` overrides. */
+/** Resolve the decay default for a param. Weight-shaped inits (randn,
+ * kaiming, literal) default to decay=true; ones/zeros default to false
+ * (biases, LN gains). Explicit `decay` opt overrides. */
 function resolveDecay(opts: ParamOptions | undefined): boolean {
   if (opts?.decay !== undefined) return opts.decay
-  const init = opts?.init ?? 'randn'
-  if (init === 'zeros' || init === 'ones') return false
-  return true // 'randn' or function
+  const spec = opts?.init ?? 'randn'
+  return spec !== 'zeros' && spec !== 'ones'
 }
 
 // ============================================================================
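To make the kaiming arithmetic concrete: a `[256, 512]` parameter has `fan_in = shape[0] = 256`, so the default gain of `sqrt(2)` yields `std = sqrt(2)/16 ≈ 0.088`. A sketch of the resolution step, illustrative only since `resolveInit` is module-private:

    // Hypothetical call, assuming access to the private resolveInit:
    const fn = resolveInit({ kind: 'kaiming' })   // gain defaults to Math.sqrt(2)
    const values = fn(256 * 512, [256, 512])      // Float32Array with std ≈ 1.4142 / 16 ≈ 0.088
    // An explicit gain of 1 (e.g. for tanh) would give std = 1 / sqrt(256) = 0.0625:
    const fn2 = resolveInit({ kind: 'kaiming', gain: 1 })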
@@ -127,7 +165,7 @@ export abstract class Module {
   protected param(shape: Shape, opts?: ParamOptions): Tensor {
     const dtype = opts?.dtype ?? 'f32'
     // Lie to TypeScript: the sentinel becomes a Tensor at materialize time.
-    return new ParamSentinel(shape, dtype, resolveInit(opts), resolveDecay(opts)) as unknown as Tensor
+    return new ParamSentinel(shape, dtype, resolveInit(opts?.init), resolveDecay(opts)) as unknown as Tensor
   }
 }
 
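Putting the pieces together, a module might declare parameters like this (a usage sketch: `Linear`, the dimensions, and the import path are invented for illustration, while `param`, `init`, and the decay defaults come from the diff above):

    import { Module, init } from 'tensorgrad' // import path assumed

    const inDim = 256, outDim = 512

    class Linear extends Module {
      // kaiming is weight-shaped, so decay defaults to true
      w = this.param([inDim, outDim], { init: init.kaiming() })
      // zeros is bias-shaped, so decay defaults to false
      b = this.param([outDim], { init: 'zeros' })
    }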
package/src/runtime.ts CHANGED
@@ -348,42 +348,67 @@ export async function createRuntime(
     queue.writeBuffer(buffers.get(bufId)!, 0, data as unknown as BufferSource)
   }
 
-  const encoder = device.createCommandEncoder({ label: 'tensorgrad-step' })
-  for (let i = 0; i < kernels.length; i++) {
-    const k = kernels[i]!
-    if (!k.wgsl || k.threads === 0) continue
-    const pipeline = pipelines[i]!
-    const bindGroup = bindGroups[i]!
-    const pass = encoder.beginComputePass({ label: k.opKind })
-    pass.setPipeline(pipeline)
-    pass.setBindGroup(0, bindGroup)
-    // WebGPU caps each dispatch dimension at 65535 workgroups. Split into 2D
-    // when a kernel needs more than that on the X axis. Kernels compute their
-    // global index as `gid.x + gid.y * (65535 * workgroup_size)`, matching the
-    // stride we set here. For dispatches that fit in one row, gid.y is 0.
-    const wgCount = Math.max(1, Math.ceil(k.threads / k.workgroupSize))
-    const MAX_X = 65535
-    const wgX = Math.min(wgCount, MAX_X)
-    const wgY = Math.ceil(wgCount / MAX_X)
-    pass.dispatchWorkgroups(wgX, wgY, 1)
-    pass.end()
-  }
-  // After all dispatches: writebacks (Adam state, updated params). Empty for
-  // forward-only compiles.
-  for (const wb of plan.writebacks) {
-    encoder.copyBufferToBuffer(buffers.get(wb.source)!, 0, buffers.get(wb.dest)!, 0, wb.bytes)
-  }
-  encoder.copyBufferToBuffer(buffers.get(lossBufferId)!, 0, outputReadback, 0, outputSpec.byteSize)
-  // Capture readbacks (only when opted in). All captures concatenate into
-  // a single staging buffer so we mapAsync once instead of N times.
+  // Chunked submit. One queue.submit() of all 240 kernels monopolizes the
+  // GPU for the full step duration, blocking compositor frames the entire
+  // time. Splitting into chunks with an explicit GPU-drain await between
+  // them gives the compositor a slot at each chunk boundary. On graphs
+  // smaller than CHUNK_SIZE this collapses to a single submit (no
+  // overhead). See specs/WorkerArchitecture.md / mobile-jank investigation.
+  const CHUNK_SIZE = 32
   let layout: CaptureLayout | null = null
   if (wantCaptures) {
+    // Compute layout up front so the last chunk can append capture copies.
    layout = ensureCaptureStaging()
-    for (const s of layout.slices) {
-      encoder.copyBufferToBuffer(buffers.get(s.bufId)!, 0, layout.buffer, s.offset, s.byteSize)
+  }
+
+  let kernelIdx = 0
+  while (kernelIdx < kernels.length) {
+    const chunkEnd = Math.min(kernelIdx + CHUNK_SIZE, kernels.length)
+    const isLast = chunkEnd === kernels.length
+    const encoder = device.createCommandEncoder({
+      label: kernels.length > CHUNK_SIZE ? `tensorgrad-chunk-${kernelIdx}` : 'tensorgrad-step',
+    })
+    for (let i = kernelIdx; i < chunkEnd; i++) {
+      const k = kernels[i]!
+      if (!k.wgsl || k.threads === 0) continue
+      const pipeline = pipelines[i]!
+      const bindGroup = bindGroups[i]!
+      const pass = encoder.beginComputePass({ label: k.opKind })
+      pass.setPipeline(pipeline)
+      pass.setBindGroup(0, bindGroup)
+      // WebGPU caps each dispatch dimension at 65535 workgroups. Split into 2D
+      // when a kernel needs more than that on the X axis. Kernels compute their
+      // global index as `gid.x + gid.y * (65535 * workgroup_size)`, matching the
+      // stride we set here. For dispatches that fit in one row, gid.y is 0.
+      const wgCount = Math.max(1, Math.ceil(k.threads / k.workgroupSize))
+      const MAX_X = 65535
+      const wgX = Math.min(wgCount, MAX_X)
+      const wgY = Math.ceil(wgCount / MAX_X)
+      pass.dispatchWorkgroups(wgX, wgY, 1)
+      pass.end()
    }
+    if (isLast) {
+      // Writebacks (Adam state, updated params; empty for forward-only) +
+      // output readback copy + capture readback copies all go into the
+      // final chunk so a single mapAsync below sees everything.
+      for (const wb of plan.writebacks) {
+        encoder.copyBufferToBuffer(buffers.get(wb.source)!, 0, buffers.get(wb.dest)!, 0, wb.bytes)
+      }
+      encoder.copyBufferToBuffer(buffers.get(lossBufferId)!, 0, outputReadback, 0, outputSpec.byteSize)
+      if (layout) {
+        for (const s of layout.slices) {
+          encoder.copyBufferToBuffer(buffers.get(s.bufId)!, 0, layout.buffer, s.offset, s.byteSize)
+        }
+      }
+    }
+    queue.submit([encoder.finish()])
+    if (!isLast) {
+      // Drain the chunk before queuing the next one. This is the moment
+      // the compositor can interleave its own frame work onto the GPU.
+      await queue.onSubmittedWorkDone()
+    }
+    kernelIdx = chunkEnd
  }
-  queue.submit([encoder.finish()])
 
   // readback=false: training fire-and-forget. The encoder still copied
   // loss → outputReadback (and captures → staging), but we don't await
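Isolated from the buffer plumbing, the chunked-submit technique reduces to the following (a standalone sketch, not the library's actual code; `encodeKernel` stands in for the per-kernel pass encoding above):

    async function submitChunked(
      device: GPUDevice,
      kernelCount: number,
      encodeKernel: (enc: GPUCommandEncoder, i: number) => void,
    ): Promise<void> {
      const CHUNK_SIZE = 32
      for (let start = 0; start < kernelCount; start += CHUNK_SIZE) {
        const end = Math.min(start + CHUNK_SIZE, kernelCount)
        const enc = device.createCommandEncoder()
        for (let i = start; i < end; i++) encodeKernel(enc, i)
        device.queue.submit([enc.finish()])
        // Drain between chunks (never after the last) so the compositor can
        // slot frame work in; graphs under one chunk never hit this await.
        if (end < kernelCount) await device.queue.onSubmittedWorkDone()
      }
    }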
package/src/worker-protocol.ts ADDED
@@ -0,0 +1,183 @@
+// Wire format for the main-thread ↔ worker postMessage channel.
+//
+// All requests carry a numeric `id` assigned by the main thread; responses
+// echo it back so the proxy can match concurrent in-flight calls. Every
+// response is either `{ ok: true, result }` or `{ ok: false, error }`.
+// Errors carry serialized name/message/stack so the proxy can reconstitute
+// an Error with a working `instanceof` check on the receiving side.
+//
+// Inputs (typed arrays) and outputs (typed arrays, captures) are transferred
+// rather than copied — see the per-request notes for which fields go on the
+// transfer list. A single worker may host multiple compiled graphs (a train
+// graph plus sibling forward graphs); each has a `graphId` issued by the
+// main thread at compile time.
+
+import type { Graph } from './ir.js'
+import type { BufferPlan } from './buffers.js'
+import type { KernelSpec } from './codegen.js'
+import type { LRSchedule } from './adam.js'
+
+// ============================================================================
+// Serializable config (subset of AdamResolvedConfig that crosses the wire).
+// `decayFilter` (a function, used only at compile time) is NOT part of this —
+// the per-param decay decision is already baked into the IR by appendAdam
+// before the IR ships to the worker.
+// ============================================================================
+
+export interface WireAdamConfig {
+  lr: LRSchedule
+  b1: number
+  b2: number
+  eps: number
+  weightDecay: number
+  lrIsScheduled: boolean
+  /** Names of the per-step scalar inputs the worker must populate before
+   * every step (`_adam_lrt`, optionally `_adam_decay_shrink`). Mirrors
+   * AdamResult so the worker can update them without re-deriving. */
+  lrtInputName: string
+  decayShrinkInputName: string | null
+}
+
+/** Compile output that crosses to the worker. Same fields as CompiledIR
+ * minus the `loss` tensor (carried by graph.outputs[0]). */
+export interface WireIR {
+  graph: Graph
+  plan: BufferPlan
+  kernels: KernelSpec[]
+}
+
+// ============================================================================
+// Requests (main → worker)
+// ============================================================================
+
+export type Req =
+  | { id: number; kind: 'createRuntime'; payload: CreateRuntimePayload }
+  | { id: number; kind: 'compileForward'; payload: CompileForwardPayload }
+  | { id: number; kind: 'step'; payload: StepPayload }
+  | { id: number; kind: 'run'; payload: RunPayload }
+  | { id: number; kind: 'uploadParams'; payload: UploadParamsPayload }
+  | { id: number; kind: 'downloadParams'; payload: { graphId: number } }
+  | { id: number; kind: 'downloadParamGrads'; payload: { graphId: number } }
+  | { id: number; kind: 'resetOptimizer'; payload: { graphId: number } }
+  | { id: number; kind: 'destroy'; payload: { graphId: number } }
+
+/** Build the training runtime. Always graphId=0 for a fresh worker. */
+export interface CreateRuntimePayload {
+  graphId: number
+  ir: WireIR
+  /** Initial param values per name. Transferred (zero-copy) — the main
+   * thread loses access after postMessage. */
+  initialParams: Record<string, Float32Array>
+  /** Adam config when training; absent for forward-only compiles. */
+  adam: WireAdamConfig | null
+}
+
+/** Build a sibling forward-only graph that shares param buffers with an
+ * existing graph (typically the training graph at graphId=0). */
+export interface CompileForwardPayload {
+  graphId: number
+  parentGraphId: number
+  ir: WireIR
+}
+
+/** One training step. Inputs are transferred; the caller's typed arrays
+ * become detached after postMessage. */
+export interface StepPayload {
+  graphId: number
+  inputs: Record<string, Int32Array | Float32Array>
+  withCaptures: boolean
+}
+
+/** Forward-only run. Same transfer semantics as `step`. */
+export interface RunPayload {
+  graphId: number
+  inputs: Record<string, Int32Array | Float32Array>
+  withCaptures: boolean
+}
+
+export interface UploadParamsPayload {
+  graphId: number
+  params: Record<string, Float32Array> // transferred
+  partial: boolean
+}
+
+// ============================================================================
+// Responses (worker → main)
+// ============================================================================
+
+export type Res<R = unknown> =
+  | { id: number; ok: true; result: R }
+  | { id: number; ok: false; error: WireError }
+
+export interface WireError {
+  name: string
+  message: string
+  stack: string
+}
+
+// Per-request result shapes:
+
+export interface CreateRuntimeResult {
+  paramNames: string[]
+  outputShape: number[]
+  kernelCount: number
+  captureShapes: Record<string, number[]>
+}
+
+export interface CompileForwardResult {
+  paramNames: string[]
+  outputShape: number[]
+  kernelCount: number
+  captureShapes: Record<string, number[]>
+}
+
+/** Step without `withCaptures` returns just `loss`. With captures, also
+ * populates `captures` (per-name Float32Array, all transferred back). */
+export interface StepResultWire {
+  loss: number
+  captures: Record<string, Float32Array> | null
+}
+
+/** Run without `withCaptures` returns `{ output, captures: null }`.
+ * With captures, also populates `captures`. */
+export interface RunResultWire {
+  output: Float32Array
+  captures: Record<string, Float32Array> | null
+}
+
+export interface DownloadParamsResult {
+  params: Record<string, Float32Array> // transferred
+}
+
+// ============================================================================
+// Transfer-list helpers
+// ============================================================================
+
+/** Collect the underlying ArrayBuffers from a Record of typed arrays so we
+ * can pass them on `postMessage`'s transfer list. The values themselves
+ * stay in the Record; only their backing buffers move. */
+export function transferablesOfRecord(
+  rec: Record<string, Int32Array | Float32Array>,
+): ArrayBuffer[] {
+  const out: ArrayBuffer[] = []
+  for (const v of Object.values(rec)) out.push(v.buffer as ArrayBuffer)
+  return out
+}
+
+/** Serialize an Error to a wire-friendly shape, preserving stack + name so
+ * the receiving side can reconstitute an Error that an `instanceof`-aware
+ * caller (e.g., for `ShapeError`) can still pattern-match by name. */
+export function wireError(e: unknown): WireError {
+  if (e instanceof Error) {
+    return { name: e.name, message: e.message, stack: e.stack ?? '' }
+  }
+  return { name: 'Error', message: String(e), stack: '' }
+}
+
+/** Reconstitute an Error from the wire shape on the receiving (main) side. */
+export function reconstituteError(w: WireError): Error {
+  const err = new Error(w.message)
+  err.name = w.name
+  err.stack = w.stack
+  return err
+}
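The worker side of this protocol reduces to a dispatch loop of roughly this shape (a sketch: `handle` is a hypothetical dispatcher standing in for the real per-request handlers, and results carrying typed arrays would also need a transfer list):

    import type { Req, Res } from './worker-protocol.js'
    import { wireError } from './worker-protocol.js'

    // Hypothetical dispatcher mapping request kinds to handlers:
    declare function handle(kind: Req['kind'], payload: unknown): Promise<unknown>

    self.onmessage = async (ev: MessageEvent<Req>) => {
      const { id, kind, payload } = ev.data
      let res: Res
      try {
        res = { id, ok: true, result: await handle(kind, payload) }
      } catch (e) {
        res = { id, ok: false, error: wireError(e) } // serialized name/message/stack
      }
      ;(self as unknown as Worker).postMessage(res)
    }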
package/src/worker-proxy.ts ADDED
@@ -0,0 +1,76 @@
+// Main-thread half of the worker channel: request/response correlation,
+// promise wiring, error reconstitution. Knows nothing about Adam, captures,
+// IR, etc. — just shuttles typed messages.
+
+import type { Req, Res, WireError } from './worker-protocol.js'
+import { reconstituteError } from './worker-protocol.js'
+
+interface PendingHandlers {
+  resolve: (v: unknown) => void
+  reject: (e: Error) => void
+}
+
+/** Spawn a worker from an inlined source string and provide a typed
+ * request/response channel. One WorkerProxy = one Worker = one GPUDevice
+ * on the worker side. Sibling graphs share the same WorkerProxy. */
+export class WorkerProxy {
+  private worker: Worker
+  private nextId = 1
+  private pending = new Map<number, PendingHandlers>()
+  private terminated = false
+
+  constructor(workerSource: string) {
+    const blob = new Blob([workerSource], { type: 'application/javascript' })
+    const url = URL.createObjectURL(blob)
+    this.worker = new Worker(url, { type: 'module' })
+    // The Blob URL keeps memory alive as long as it's referenced; revoke
+    // once the worker has loaded its source. Browsers tolerate revoking
+    // immediately after construction in practice.
+    URL.revokeObjectURL(url)
+
+    this.worker.onmessage = (ev: MessageEvent<Res>) => {
+      const reply = ev.data
+      const handlers = this.pending.get(reply.id)
+      if (!handlers) return // stale reply; ignore
+      this.pending.delete(reply.id)
+      if (reply.ok) handlers.resolve(reply.result)
+      else handlers.reject(reconstituteError(reply.error))
+    }
+
+    this.worker.onerror = (ev: ErrorEvent) => {
+      const err = new Error(`tensorgrad worker error: ${ev.message || 'unknown'}`)
+      const wire: WireError = { name: 'WorkerError', message: err.message, stack: err.stack ?? '' }
+      // Reject everything in flight; subsequent calls will fail too.
+      for (const handlers of this.pending.values()) handlers.reject(reconstituteError(wire))
+      this.pending.clear()
+    }
+  }
+
+  /** Send a request and await its matching response. `transfer` lists the
+   * ArrayBuffers to move (zero-copy) into the worker. */
+  request<R>(req: Omit<Req, 'id'>, transfer: ArrayBuffer[] = []): Promise<R> {
+    if (this.terminated) return Promise.reject(new Error('tensorgrad: worker has been terminated'))
+    const id = this.nextId++
+    return new Promise<R>((resolve, reject) => {
+      this.pending.set(id, { resolve: resolve as (v: unknown) => void, reject })
+      this.worker.postMessage({ ...req, id } as Req, transfer)
+    })
+  }
+
+  /** Fire-and-forget variant for cases where the caller doesn't need a reply
+   * (currently unused; kept for symmetry / future use). */
+  send(req: Omit<Req, 'id'>, transfer: ArrayBuffer[] = []): void {
+    if (this.terminated) return
+    const id = this.nextId++
+    this.worker.postMessage({ ...req, id } as Req, transfer)
+  }
+
+  terminate(): void {
+    if (this.terminated) return
+    this.terminated = true
+    this.worker.terminate()
+    const err = new Error('tensorgrad: worker terminated')
+    for (const handlers of this.pending.values()) handlers.reject(err)
+    this.pending.clear()
+  }
+}
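A round trip from the main thread then looks like this (a usage sketch: `workerSource` stands for the inlined worker bundle, and the input name `tokens` is a placeholder):

    import { WorkerProxy } from './worker-proxy.js' // file name assumed
    import { transferablesOfRecord, type StepResultWire } from './worker-protocol.js'

    declare const workerSource: string // inlined worker bundle (placeholder)

    const proxy = new WorkerProxy(workerSource)
    const inputs = { tokens: new Int32Array(1024) } // backing buffer detaches on postMessage
    const step = await proxy.request<StepResultWire>(
      { kind: 'step', payload: { graphId: 0, inputs, withCaptures: false } },
      transferablesOfRecord(inputs),
    )
    console.log(step.loss)
    proxy.terminate()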