tensorgrad 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +121 -0
  3. package/SPEC.md +293 -0
  4. package/dist/adam.d.ts +31 -0
  5. package/dist/adam.d.ts.map +1 -0
  6. package/dist/adam.js +66 -0
  7. package/dist/adam.js.map +1 -0
  8. package/dist/buffers.d.ts +56 -0
  9. package/dist/buffers.d.ts.map +1 -0
  10. package/dist/buffers.js +114 -0
  11. package/dist/buffers.js.map +1 -0
  12. package/dist/codegen.d.ts +23 -0
  13. package/dist/codegen.d.ts.map +1 -0
  14. package/dist/codegen.js +709 -0
  15. package/dist/codegen.js.map +1 -0
  16. package/dist/compile.d.ts +53 -0
  17. package/dist/compile.d.ts.map +1 -0
  18. package/dist/compile.js +76 -0
  19. package/dist/compile.js.map +1 -0
  20. package/dist/grad.d.ts +8 -0
  21. package/dist/grad.d.ts.map +1 -0
  22. package/dist/grad.js +404 -0
  23. package/dist/grad.js.map +1 -0
  24. package/dist/index.d.ts +12 -0
  25. package/dist/index.d.ts.map +1 -0
  26. package/dist/index.js +37 -0
  27. package/dist/index.js.map +1 -0
  28. package/dist/ir.d.ts +204 -0
  29. package/dist/ir.d.ts.map +1 -0
  30. package/dist/ir.js +60 -0
  31. package/dist/ir.js.map +1 -0
  32. package/dist/module.d.ts +21 -0
  33. package/dist/module.d.ts.map +1 -0
  34. package/dist/module.js +113 -0
  35. package/dist/module.js.map +1 -0
  36. package/dist/ops.d.ts +35 -0
  37. package/dist/ops.d.ts.map +1 -0
  38. package/dist/ops.js +270 -0
  39. package/dist/ops.js.map +1 -0
  40. package/dist/runtime.d.ts +26 -0
  41. package/dist/runtime.d.ts.map +1 -0
  42. package/dist/runtime.js +190 -0
  43. package/dist/runtime.js.map +1 -0
  44. package/dist/shape.d.ts +24 -0
  45. package/dist/shape.d.ts.map +1 -0
  46. package/dist/shape.js +259 -0
  47. package/dist/shape.js.map +1 -0
  48. package/dist/trace.d.ts +8 -0
  49. package/dist/trace.d.ts.map +1 -0
  50. package/dist/trace.js +93 -0
  51. package/dist/trace.js.map +1 -0
  52. package/package.json +62 -0
  53. package/src/adam.ts +95 -0
  54. package/src/buffers.ts +173 -0
  55. package/src/codegen.ts +758 -0
  56. package/src/compile.ts +120 -0
  57. package/src/grad.ts +459 -0
  58. package/src/index.ts +40 -0
  59. package/src/ir.ts +197 -0
  60. package/src/module.ts +126 -0
  61. package/src/ops.ts +311 -0
  62. package/src/runtime.ts +232 -0
  63. package/src/shape.ts +263 -0
  64. package/src/trace.ts +101 -0
package/src/runtime.ts ADDED
@@ -0,0 +1,232 @@
1
+ // WebGPU runtime. Reads a BufferPlan + KernelSpec[] (produced by codegen),
2
+ // allocates real GPU buffers and pipelines, and provides a `step()` method
3
+ // that uploads inputs, dispatches all kernels, and reads back outputs.
4
+ //
5
+ // Browser-only: this module needs `navigator.gpu` at runtime.
6
+
7
+ import type { BufferPlan } from './buffers.js'
8
+ import type { KernelSpec } from './codegen.js'
9
+
10
+ // TS lib.dom defines WebGPU types but not the GPUMapMode runtime constant.
11
+ // Provided by the browser per WebGPU spec; declare just what we use.
12
+ declare const GPUMapMode: { readonly READ: number; readonly WRITE: number }
13
+
14
/**
 * Handle returned by `createRuntime`. Owns every GPU buffer and pipeline for
 * one compiled graph and exposes upload / step / download operations. All
 * methods assume single-threaded, non-overlapping use (see `step`'s readback
 * buffer reuse in the implementation).
 */
export interface CompiledRuntime {
  /** Upload one or more parameter Float32Arrays to their GPU buffers. */
  uploadParams(params: Record<string, Float32Array>): void
  /** Read all parameters back as Float32Arrays — used for UI panels. */
  downloadParams(): Promise<Record<string, Float32Array>>
  /** Read all parameter gradients back. Mostly for verification / debugging. */
  downloadParamGrads(): Promise<Record<string, Float32Array>>
  /**
   * One full forward+backward step.
   * 1. Uploads `inputs` (tokens, targets, masks) to input buffers.
   * 2. Dispatches every kernel in order.
   * 3. Reads back the loss scalar.
   * Returns the loss as a JS number.
   */
  step(inputs: Record<string, Int32Array | Float32Array>): Promise<number>
  /** Free GPU resources. */
  destroy(): void
}
32
+
33
/** Options accepted by `createRuntime`. */
export interface RuntimeOpts {
  /** Pre-acquired GPUDevice. If omitted, runtime requests its own. */
  device?: GPUDevice
}
37
+
38
+ // Inlined numeric values (per WebGPU spec) so this module is importable in Node
39
+ // for codegen-only usage. The browser provides GPUBufferUsage as a global, but
40
+ // referencing it at module scope would crash before any browser code runs.
41
+ const STORAGE_RW = 0x80 /*STORAGE*/ | 0x8 /*COPY_DST*/ | 0x4 /*COPY_SRC*/
42
+ const READBACK = 0x1 /*MAP_READ*/ | 0x8 /*COPY_DST*/
43
+
44
/**
 * Build a live WebGPU runtime from a BufferPlan and the codegen'd kernels.
 *
 * Allocates one GPUBuffer per BufferSpec (state buffers pre-filled with their
 * initValue), compiles every distinct WGSL module once, builds static bind
 * groups, and returns a `CompiledRuntime` whose `step()` uploads inputs,
 * dispatches all kernels in order, applies writebacks, and reads back the loss.
 *
 * @param plan         Buffer layout + input/param/writeback metadata from codegen.
 * @param kernels      Ordered kernel list; entries without WGSL are skipped.
 * @param lossBufferId Id of the buffer holding the scalar loss to read back.
 * @param opts         Optional pre-acquired device.
 * @throws Error if any shader fails validation (details logged to console).
 */
export async function createRuntime(
  plan: BufferPlan,
  kernels: KernelSpec[],
  lossBufferId: number,
  opts: RuntimeOpts = {},
): Promise<CompiledRuntime> {
  const device = opts.device ?? await acquireDevice()
  const queue = device.queue

  // ---- Allocate one GPUBuffer per BufferSpec --------------------------------
  // State buffers also get filled with their initValue at allocation time.
  const buffers = new Map<number, GPUBuffer>()
  for (const spec of plan.buffers) {
    const buf = device.createBuffer({
      size: spec.byteSize,
      usage: STORAGE_RW,
      label: spec.name ?? `t${spec.id}-${spec.kind}`,
    })
    buffers.set(spec.id, buf)
    if (spec.kind === 'state') {
      // Fill with initValue (typically 0). Float and int both 4 bytes per element.
      const elements = spec.byteSize / 4
      const init = spec.dtype === 'f32'
        ? new Float32Array(elements).fill(spec.initValue ?? 0)
        : new Int32Array(elements).fill(Math.trunc(spec.initValue ?? 0))
      queue.writeBuffer(buf, 0, init as unknown as BufferSource)
    }
  }

  // ---- Compile pipelines per kernel; cache by WGSL source -------------------
  // Push an error scope around each shader+pipeline creation so we can surface
  // the actual compile error rather than the cryptic "previous error" that
  // comes from using an invalid pipeline at dispatch time.
  const moduleCache = new Map<string, GPUShaderModule>()
  const pipelines: (GPUComputePipeline | null)[] = []
  type ErrorProbe = Promise<{ k: KernelSpec; module: GPUShaderModule; err: GPUError } | null>
  const probes: ErrorProbe[] = []
  for (const k of kernels) {
    // Kernels without WGSL (e.g. no-op entries) get a null pipeline slot so
    // indices stay aligned with `kernels`.
    if (!k.wgsl) { pipelines.push(null); continue }
    let module = moduleCache.get(k.wgsl)
    if (!module) {
      module = device.createShaderModule({ code: k.wgsl, label: k.opKind })
      moduleCache.set(k.wgsl, module)
    }
    device.pushErrorScope('validation')
    const pipeline = device.createComputePipeline({
      layout: 'auto',
      compute: { module, entryPoint: 'main' },
      label: k.opKind,
    })
    pipelines.push(pipeline)
    // popErrorScope resolves asynchronously; collect the probes and await them
    // all at once below instead of serializing on each pipeline.
    probes.push(device.popErrorScope().then(err => err ? { k, module: module!, err } : null))
  }
  const probeResults = await Promise.all(probes)
  const failures = probeResults.filter((p): p is { k: KernelSpec; module: GPUShaderModule; err: GPUError } => p != null)
  if (failures.length > 0) {
    const reports: string[] = []
    for (const { k, module, err } of failures) {
      const info = await module.getCompilationInfo()
      const messages = info.messages
        .map(m => ` L${m.lineNum}:${m.linePos} [${m.type}] ${m.message}`)
        .join('\n')
      reports.push(
        `[shader compile error] ${k.opKind} (op #${k.opIndex}): ${err.message}\n` +
        (messages || ' (no compilation messages)') +
        `\n--- WGSL ---\n${k.wgsl}\n-----------`,
      )
    }
    // eslint-disable-next-line no-console
    console.error(reports.join('\n\n'))
    throw new Error(`tensorgrad: ${failures.length} shader(s) failed to compile (see console).`)
  }

  // ---- Pre-build bind groups (static — buffer ids don't change per step) ---
  const bindGroups: (GPUBindGroup | null)[] = kernels.map((k, i) => {
    const pipeline = pipelines[i]
    if (!pipeline) return null
    return device.createBindGroup({
      layout: pipeline.getBindGroupLayout(0),
      entries: k.bindings.map((bufId, idx) => ({
        binding: idx,
        resource: { buffer: buffers.get(bufId)! },
      })),
    })
  })

  // ---- Loss readback staging buffer -----------------------------------------
  // NOTE(review): indexes plan.buffers by buffer id — assumes spec ids equal
  // array positions (step() below relies on the same convention for inputs);
  // confirm that invariant in buffers.ts.
  const lossSpec = plan.buffers[lossBufferId]!
  const lossReadback = device.createBuffer({ size: lossSpec.byteSize, usage: READBACK })

  // ---- step() ---------------------------------------------------------------
  // NOTE(review): reuses a single `lossReadback` staging buffer, so overlapping
  // step() calls would race on mapAsync — callers must await each step.
  async function step(inputs: Record<string, Int32Array | Float32Array>): Promise<number> {
    for (const [name, bufId] of plan.inputsByName) {
      const data = inputs[name]
      if (!data) throw new Error(`tensorgrad: missing input '${name}'`)
      const expectedBytes = plan.buffers[bufId]!.byteSize
      if (data.byteLength !== expectedBytes) {
        throw new Error(`tensorgrad: input '${name}' has ${data.byteLength} bytes, expected ${expectedBytes}`)
      }
      // Cast to BufferSource: typed arrays are accepted by writeBuffer at runtime
      // but TS may infer ArrayBufferLike (vs ArrayBuffer) under strict configs.
      queue.writeBuffer(buffers.get(bufId)!, 0, data as unknown as BufferSource)
    }

    const encoder = device.createCommandEncoder({ label: 'tensorgrad-step' })
    for (let i = 0; i < kernels.length; i++) {
      const k = kernels[i]!
      if (!k.wgsl || k.threads === 0) continue
      const pipeline = pipelines[i]!
      const bindGroup = bindGroups[i]!
      const pass = encoder.beginComputePass({ label: k.opKind })
      pass.setPipeline(pipeline)
      pass.setBindGroup(0, bindGroup)
      // WebGPU caps each dispatch dimension at 65535 workgroups. Split into 2D
      // when a kernel needs more than that on the X axis. Kernels compute their
      // global index as `gid.x + gid.y * (65535 * workgroup_size)`, matching the
      // stride we set here. For dispatches that fit in one row, gid.y is 0.
      const wgCount = Math.max(1, Math.ceil(k.threads / k.workgroupSize))
      const MAX_X = 65535
      const wgX = Math.min(wgCount, MAX_X)
      const wgY = Math.ceil(wgCount / MAX_X)
      pass.dispatchWorkgroups(wgX, wgY, 1)
      pass.end()
    }
    // After all dispatches: writebacks (Adam state, updated params).
    // copyBufferToBuffer is queued onto the same encoder so it's ordered after
    // all kernel dispatches.
    for (const wb of plan.writebacks) {
      encoder.copyBufferToBuffer(buffers.get(wb.source)!, 0, buffers.get(wb.dest)!, 0, wb.bytes)
    }
    encoder.copyBufferToBuffer(buffers.get(lossBufferId)!, 0, lossReadback, 0, lossSpec.byteSize)
    queue.submit([encoder.finish()])

    // slice(0) copies out of the mapped range so the view stays valid after unmap.
    await lossReadback.mapAsync(GPUMapMode.READ)
    const view = new Float32Array(lossReadback.getMappedRange().slice(0))
    lossReadback.unmap()
    return view[0]!
  }

  // ---- uploadParams ---------------------------------------------------------
  // Params absent from the record are silently skipped (partial upload is allowed).
  function uploadParams(params: Record<string, Float32Array>) {
    for (const [name, bufId] of plan.paramsByName) {
      const data = params[name]
      if (!data) continue
      queue.writeBuffer(buffers.get(bufId)!, 0, data as unknown as BufferSource)
    }
  }

  // ---- download helpers -----------------------------------------------------
  // One throwaway staging buffer per entry; all copies go through one encoder,
  // then each staging buffer is mapped, copied out, and destroyed.
  async function downloadFromMap(map: Map<string, number>): Promise<Record<string, Float32Array>> {
    const stagings: { name: string; buf: GPUBuffer; bytes: number }[] = []
    const encoder = device.createCommandEncoder({ label: 'tensorgrad-download' })
    for (const [name, bufId] of map) {
      const spec = plan.buffers[bufId]!
      const staging = device.createBuffer({ size: spec.byteSize, usage: READBACK })
      encoder.copyBufferToBuffer(buffers.get(bufId)!, 0, staging, 0, spec.byteSize)
      stagings.push({ name, buf: staging, bytes: spec.byteSize })
    }
    queue.submit([encoder.finish()])
    const out: Record<string, Float32Array> = {}
    for (const s of stagings) {
      await s.buf.mapAsync(GPUMapMode.READ)
      out[s.name] = new Float32Array(s.buf.getMappedRange().slice(0))
      s.buf.unmap()
      s.buf.destroy()
    }
    return out
  }

  return {
    uploadParams,
    downloadParams: () => downloadFromMap(plan.paramsByName),
    downloadParamGrads: () => downloadFromMap(plan.paramGradsByName),
    step,
    destroy: () => {
      for (const b of buffers.values()) b.destroy()
      lossReadback.destroy()
    },
  }
}
224
+
225
+ async function acquireDevice(): Promise<GPUDevice> {
226
+ if (typeof navigator === 'undefined' || !navigator.gpu) {
227
+ throw new Error('tensorgrad: WebGPU not available in this environment')
228
+ }
229
+ const adapter = await navigator.gpu.requestAdapter()
230
+ if (!adapter) throw new Error('tensorgrad: no WebGPU adapter')
231
+ return await adapter.requestDevice()
232
+ }
package/src/shape.ts ADDED
@@ -0,0 +1,263 @@
1
+ // Shape inference and validation for each op kind.
2
+ //
3
+ // Every op in src/ops.ts validates its inputs and computes its output shape
4
+ // through helpers here. Errors throw with the captured call-site so the
5
+ // stack trace points at the user's line, not into the library.
6
+ //
7
+ // Broadcasting rules (deliberately limited):
8
+ //   * For element-wise binops (add/sub/mul/div), `broadcastTrailing` applies
9
+ //     standard right-aligned (NumPy-style) broadcasting: shapes are aligned
10
+ //     from the trailing axis, and per axis the dims must match or one must
11
+ //     be 1. Examples ALLOWED:   [B, T, D] op [D]       → [B, T, D]
12
+ //                               [B, T, D] op [1, D]    → [B, T, D]
13
+ //                               [B, T, D] op [B, T, D] → [B, T, D]
14
+ //                               [B, T, D] op [T, D]    → [B, T, D] (T, D match)
15
+ //     Example REJECTED:         [B, T, D] op [B]       (trailing axes differ)
16
+ // The restriction makes codegen and autograd much simpler and covers every
17
+ // broadcast pattern in our transformer (biases, layernorm gain/bias, masks).
18
+
19
+ import type { Shape, CallSite } from './ir.js'
20
+ import { formatSite } from './ir.js'
21
+
22
+ // ============================================================================
23
+ // Errors
24
+ // ============================================================================
25
+
26
+ export class ShapeError extends Error {
27
+ constructor(message: string, site: CallSite | null) {
28
+ const formatted = site ? `${message}\n at ${formatSite(site)}` : message
29
+ super(formatted)
30
+ this.name = 'ShapeError'
31
+ }
32
+ }
33
+
34
// Convenience thrower. Annotated `never` so TypeScript treats calls as
// terminating control flow (callers rely on narrowing after `fail`).
function fail(message: string, site: CallSite | null): never {
  throw new ShapeError(message, site)
}
37
+
38
+ // ============================================================================
39
+ // Shape utilities
40
+ // ============================================================================
41
+
42
+ export function shapesEqual(a: Shape, b: Shape): boolean {
43
+ if (a.length !== b.length) return false
44
+ for (let i = 0; i < a.length; i++) if (a[i] !== b[i]) return false
45
+ return true
46
+ }
47
+
48
+ export function shapeSize(shape: Shape): number {
49
+ let n = 1
50
+ for (const d of shape) n *= d
51
+ return n
52
+ }
53
+
54
+ export function showShape(shape: Shape): string {
55
+ return `[${shape.join(', ')}]`
56
+ }
57
+
58
+ // Standard right-aligned NumPy-style broadcasting. Pad the shorter shape with
59
+ // leading 1s, then per-axis: equal dims unify, size-1 dims broadcast on either
60
+ // side, otherwise incompatible. Returns the resulting shape or null.
61
+ export function broadcastTrailing(a: Shape, b: Shape): Shape | null {
62
+ const rank = Math.max(a.length, b.length)
63
+ const out: number[] = new Array(rank)
64
+ for (let i = 0; i < rank; i++) {
65
+ const ai = i - (rank - a.length)
66
+ const bi = i - (rank - b.length)
67
+ const av = ai < 0 ? 1 : a[ai]!
68
+ const bv = bi < 0 ? 1 : b[bi]!
69
+ if (av === bv) out[i] = av
70
+ else if (av === 1) out[i] = bv
71
+ else if (bv === 1) out[i] = av
72
+ else return null
73
+ }
74
+ return out
75
+ }
76
+
77
+ // ============================================================================
78
+ // Per-op shape rules
79
+ // ============================================================================
80
+ //
81
+ // Each rule takes the input shapes and returns the output shape, or throws.
82
+ // All rules accept a `site` for error attribution.
83
+
84
+ export function inferElementwiseBinop(
85
+ opName: string, aShape: Shape, bShape: Shape, site: CallSite | null,
86
+ ): Shape {
87
+ const result = broadcastTrailing(aShape, bShape)
88
+ if (!result) {
89
+ fail(
90
+ `${opName}: incompatible shapes ${showShape(aShape)} and ${showShape(bShape)}. ` +
91
+ `Trailing-suffix broadcasting only — the smaller shape must be a suffix of the larger, ` +
92
+ `with size-1 axes broadcasting to any size.`,
93
+ site,
94
+ )
95
+ }
96
+ return result
97
+ }
98
+
99
// Unary element-wise ops preserve the input shape exactly. opName/site are
// unused because shape inference for a unary op cannot fail; they are kept
// so all rules share one call signature.
export function inferUnary(_opName: string, aShape: Shape, _site: CallSite | null): Shape {
  return aShape
}
102
+
103
+ export function inferMeanLast(opName: string, aShape: Shape, site: CallSite | null): Shape {
104
+ if (aShape.length === 0) fail(`${opName}: cannot reduce a 0-d tensor`, site)
105
+ // keepdims=true: replace last axis with 1.
106
+ return [...aShape.slice(0, -1), 1]
107
+ }
108
+
109
+ export function inferSumLast(opName: string, aShape: Shape, site: CallSite | null): Shape {
110
+ if (aShape.length === 0) fail(`${opName}: cannot reduce a 0-d tensor`, site)
111
+ // keepdims=false: drop the last axis.
112
+ return aShape.slice(0, -1)
113
+ }
114
+
115
// reshape: validate `newShape` against the input's element count and resolve
// an optional single -1 placeholder to the inferred dimension.
export function inferReshape(opName: string, aShape: Shape, newShape: Shape, site: CallSite | null): Shape {
  // Validate -1 placeholder (at most one allowed) and total size match.
  let inferIdx = -1   // position of the -1 placeholder, if any
  let knownSize = 1   // product of all explicitly-given dims
  for (let i = 0; i < newShape.length; i++) {
    const d = newShape[i]!
    if (d === -1) {
      if (inferIdx !== -1) fail(`${opName}: at most one -1 dim allowed in newShape ${showShape(newShape)}`, site)
      inferIdx = i
    } else if (d <= 0) {
      // Zero-sized dims are rejected too, not just negatives.
      fail(`${opName}: invalid dim ${d} in newShape ${showShape(newShape)}`, site)
    } else {
      knownSize *= d
    }
  }
  const totalIn = shapeSize(aShape)
  const out = [...newShape]
  if (inferIdx !== -1) {
    // The inferred dim must divide the input size evenly.
    if (totalIn % knownSize !== 0) {
      fail(`${opName}: cannot reshape ${showShape(aShape)} (size ${totalIn}) to ${showShape(newShape)} — known dims multiply to ${knownSize}`, site)
    }
    out[inferIdx] = totalIn / knownSize
  } else if (knownSize !== totalIn) {
    fail(`${opName}: size mismatch — input ${showShape(aShape)} has ${totalIn} elements but newShape ${showShape(newShape)} has ${knownSize}`, site)
  }
  return out
}
142
+
143
+ export function inferTranspose(opName: string, aShape: Shape, perm: readonly number[], site: CallSite | null): Shape {
144
+ if (perm.length !== aShape.length) {
145
+ fail(`${opName}: perm length ${perm.length} must equal input rank ${aShape.length}`, site)
146
+ }
147
+ const seen = new Set<number>()
148
+ for (const p of perm) {
149
+ if (p < 0 || p >= aShape.length) fail(`${opName}: perm index ${p} out of range for rank ${aShape.length}`, site)
150
+ if (seen.has(p)) fail(`${opName}: perm has duplicate index ${p}`, site)
151
+ seen.add(p)
152
+ }
153
+ return perm.map(p => aShape[p]!)
154
+ }
155
+
156
+ // matmul: a [..., M, K] · b [K, N] → [..., M, N]. b is unbatched.
157
+ export function inferMatmul(opName: string, aShape: Shape, bShape: Shape, site: CallSite | null): Shape {
158
+ if (aShape.length < 2) fail(`${opName}: lhs must have rank >= 2, got ${showShape(aShape)}`, site)
159
+ if (bShape.length !== 2) fail(`${opName}: rhs must have rank 2, got ${showShape(bShape)} — use matmulBatched for batched rhs`, site)
160
+ const M = aShape[aShape.length - 2]!
161
+ const Ka = aShape[aShape.length - 1]!
162
+ const Kb = bShape[0]!
163
+ const N = bShape[1]!
164
+ if (Ka !== Kb) fail(`${opName}: inner dims don't match — ${showShape(aShape)} · ${showShape(bShape)} (last axis of lhs = ${Ka}, first axis of rhs = ${Kb})`, site)
165
+ return [...aShape.slice(0, -2), M, N]
166
+ }
167
+
168
// matmul_batched: a [..., M, K] · b [..., K, N] → [..., M, N]. Both have leading batch dims.
// Ranks must be equal and batch dims must match exactly — no batch broadcasting.
export function inferMatmulBatched(opName: string, aShape: Shape, bShape: Shape, site: CallSite | null): Shape {
  if (aShape.length < 2 || bShape.length < 2) {
    fail(`${opName}: both inputs must have rank >= 2, got ${showShape(aShape)} and ${showShape(bShape)}`, site)
  }
  if (aShape.length !== bShape.length) {
    fail(`${opName}: ranks must match (got ${aShape.length} vs ${bShape.length}). Reshape if you need different batch dims.`, site)
  }
  // Batch dims = everything except the last two axes; compared element-wise.
  const aBatch = aShape.slice(0, -2)
  const bBatch = bShape.slice(0, -2)
  for (let i = 0; i < aBatch.length; i++) {
    if (aBatch[i] !== bBatch[i]) {
      fail(`${opName}: batch dims must match — ${showShape(aShape)} vs ${showShape(bShape)}`, site)
    }
  }
  const M = aShape[aShape.length - 2]!
  const Ka = aShape[aShape.length - 1]!
  const Kb = bShape[bShape.length - 2]!
  const N = bShape[bShape.length - 1]!
  if (Ka !== Kb) fail(`${opName}: inner dims don't match — last axis of lhs = ${Ka}, second-to-last of rhs = ${Kb}`, site)
  return [...aBatch, M, N]
}
190
+
191
+ export function inferOneHot(opName: string, indicesShape: Shape, depth: number, site: CallSite | null): Shape {
192
+ if (depth <= 0) fail(`${opName}: depth must be positive, got ${depth}`, site)
193
+ return [...indicesShape, depth]
194
+ }
195
+
196
+ // where_causal preserves shape but requires the last two axes to be square.
197
+ export function inferWhereCausal(opName: string, aShape: Shape, site: CallSite | null): Shape {
198
+ if (aShape.length < 2) fail(`${opName}: requires rank >= 2, got ${showShape(aShape)}`, site)
199
+ const m = aShape[aShape.length - 2]!
200
+ const n = aShape[aShape.length - 1]!
201
+ if (m !== n) fail(`${opName}: last two axes must be equal (square mask), got ${showShape(aShape)}`, site)
202
+ return aShape
203
+ }
204
+
205
+ export function inferSliceLastRange(opName: string, aShape: Shape, start: number, end: number, site: CallSite | null): Shape {
206
+ if (aShape.length === 0) fail(`${opName}: cannot slice 0-d tensor`, site)
207
+ const last = aShape[aShape.length - 1]!
208
+ if (start < 0 || end > last || start >= end) {
209
+ fail(`${opName}: invalid range [${start}, ${end}) for last axis of size ${last}`, site)
210
+ }
211
+ return [...aShape.slice(0, -1), end - start]
212
+ }
213
+
214
+ // broadcast_to: validate that `aShape` can broadcast to `targetShape` under
215
+ // right-aligned NumPy rules. Returns targetShape on success.
216
+ export function inferBroadcastTo(opName: string, aShape: Shape, targetShape: Shape, site: CallSite | null): Shape {
217
+ if (aShape.length > targetShape.length) {
218
+ fail(`${opName}: source rank ${aShape.length} > target rank ${targetShape.length}`, site)
219
+ }
220
+ const offset = targetShape.length - aShape.length
221
+ for (let i = 0; i < aShape.length; i++) {
222
+ const av = aShape[i]!
223
+ const tv = targetShape[offset + i]!
224
+ if (av !== tv && av !== 1) {
225
+ fail(`${opName}: cannot broadcast ${showShape(aShape)} to ${showShape(targetShape)} — axis ${i} (size ${av}) doesn't match target axis ${offset + i} (size ${tv}) and isn't 1`, site)
226
+ }
227
+ }
228
+ return targetShape
229
+ }
230
+
231
+ // sum_to_shape: validate that `targetShape` is a valid right-aligned reduction
232
+ // of `aShape` (i.e., aShape can have been produced by broadcasting targetShape).
233
+ export function inferSumToShape(opName: string, aShape: Shape, targetShape: Shape, site: CallSite | null): Shape {
234
+ if (targetShape.length > aShape.length) {
235
+ fail(`${opName}: target rank ${targetShape.length} > source rank ${aShape.length}`, site)
236
+ }
237
+ const offset = aShape.length - targetShape.length
238
+ for (let i = 0; i < targetShape.length; i++) {
239
+ const av = aShape[offset + i]!
240
+ const tv = targetShape[i]!
241
+ if (av !== tv && tv !== 1) {
242
+ fail(`${opName}: cannot sum-reduce ${showShape(aShape)} to ${showShape(targetShape)} — target axis ${i} (size ${tv}) must be 1 or match source`, site)
243
+ }
244
+ }
245
+ return targetShape
246
+ }
247
+
248
+ // Three-way broadcast for `where(cond, a, b)`. All three shapes must broadcast
249
+ // to a common shape under standard NumPy rules.
250
+ export function inferWhere(opName: string, condShape: Shape, aShape: Shape, bShape: Shape, site: CallSite | null): Shape {
251
+ const ab = broadcastTrailing(aShape, bShape)
252
+ if (!ab) fail(`${opName}: a/b incompatible: ${showShape(aShape)} vs ${showShape(bShape)}`, site)
253
+ const result = broadcastTrailing(condShape, ab)
254
+ if (!result) fail(`${opName}: cond ${showShape(condShape)} incompatible with broadcast(a, b) ${showShape(ab)}`, site)
255
+ return result
256
+ }
257
+
258
+ export function inferReluGrad(opName: string, xShape: Shape, dyShape: Shape, site: CallSite | null): Shape {
259
+ if (!shapesEqual(xShape, dyShape)) {
260
+ fail(`${opName}: x and dy must have matching shapes, got ${showShape(xShape)} and ${showShape(dyShape)}`, site)
261
+ }
262
+ return xShape
263
+ }
package/src/trace.ts ADDED
@@ -0,0 +1,101 @@
1
+ // Trace driver. Holds the "current graph" in module-local state so user code
2
+ // can call ops without threading a graph parameter through every function.
3
+ //
4
+ // Usage:
5
+ //
6
+ // const graph = trace(() => {
7
+ // const x = tensorInput('x', [B, T], 'i32')
8
+ // const w = paramInput('w', [V, D], 'f32')
9
+ // // ... user computation building tensors ...
10
+ // return finalLossTensor
11
+ // })
12
+ //
13
+ // `trace` is single-threaded and re-entrant only via nested calls (which share
14
+ // the outer graph — but we don't currently have a use for nesting). Calling an
15
+ // op outside a `trace(...)` block is an error.
16
+
17
+ import type { Graph, Tensor, Shape, Dtype } from './ir.js'
18
+ import { makeGraph, addOp, captureSite } from './ir.js'
19
+
20
+ // Module-local: the graph being built right now, or null if no trace is active.
21
+ let _current: Graph | null = null
22
+
23
+ export function currentGraph(): Graph {
24
+ if (!_current) {
25
+ throw new Error(
26
+ 'tensorgrad: ops can only be called inside trace(). ' +
27
+ 'Did you forget to wrap your forward pass?',
28
+ )
29
+ }
30
+ return _current
31
+ }
32
+
33
+ // Run `fn` with a fresh graph as the current one; capture and return the graph.
34
+ // `fn` must return the tensor (or array of tensors) to mark as graph outputs.
35
+ export function trace(fn: () => Tensor | Tensor[]): Graph {
36
+ if (_current) {
37
+ throw new Error('tensorgrad: nested trace() is not supported')
38
+ }
39
+ const g = makeGraph()
40
+ _current = g
41
+ try {
42
+ const result = fn()
43
+ const outputs = Array.isArray(result) ? result : [result]
44
+ for (const t of outputs) {
45
+ ;(g.outputs as number[]).push(t.id)
46
+ }
47
+ } finally {
48
+ _current = null
49
+ }
50
+ return g
51
+ }
52
+
53
// Re-enter an existing graph to append more ops. Used by autograd to add
// backward ops to a graph that's already been traced. `fn` runs with the
// supplied graph as the current one; any ops it calls append to that graph.
// Returns whatever `fn` returns. The current-graph slot is cleared even
// when `fn` throws.
export function traceInto<T>(g: Graph, fn: () => T): T {
  if (_current) {
    throw new Error('tensorgrad: traceInto() called while another trace is active')
  }
  _current = g
  try {
    return fn()
  } finally {
    _current = null
  }
}
68
+
69
+ // ---- Leaf tensor builders --------------------------------------------------
70
+ // Inputs are added to the graph as `param_input` or `tensor_input` op nodes.
71
+ // Their .source on the Tensor points at that node so codegen knows where to
72
+ // bind external data.
73
+
74
+ export function paramInput(name: string, shape: Shape, dtype: Dtype = 'f32'): Tensor {
75
+ const g = currentGraph()
76
+ if (g.ops.some(op => (op.kind === 'param_input' || op.kind === 'tensor_input') && op.name === name)) {
77
+ throw new Error(`tensorgrad: input name '${name}' already used in this trace`)
78
+ }
79
+ const site = captureSite('paramInput')
80
+ return addOp(g, 'param_input', shape, dtype, site, { name } as any)
81
+ }
82
+
83
+ export function tensorInput(name: string, shape: Shape, dtype: Dtype = 'f32'): Tensor {
84
+ const g = currentGraph()
85
+ if (g.ops.some(op => (op.kind === 'param_input' || op.kind === 'tensor_input') && op.name === name)) {
86
+ throw new Error(`tensorgrad: input name '${name}' already used in this trace`)
87
+ }
88
+ const site = captureSite('tensorInput')
89
+ return addOp(g, 'tensor_input', shape, dtype, site, { name } as any)
90
+ }
91
+
92
+ // Persistent state buffer. Allocated at compile time, zero-(or initValue-)initialized,
93
+ // and updated across step() calls via writebacks declared by the optimizer helper.
94
+ export function stateInput(name: string, shape: Shape, dtype: Dtype = 'f32', initValue = 0): Tensor {
95
+ const g = currentGraph()
96
+ if (g.ops.some(op => op.kind === 'state_input' && op.name === name)) {
97
+ throw new Error(`tensorgrad: state name '${name}' already used in this trace`)
98
+ }
99
+ const site = captureSite('stateInput')
100
+ return addOp(g, 'state_input', shape, dtype, site, { name, initValue } as any)
101
+ }