tensorgrad 0.0.1
- package/LICENSE +21 -0
- package/README.md +121 -0
- package/SPEC.md +293 -0
- package/dist/adam.d.ts +31 -0
- package/dist/adam.d.ts.map +1 -0
- package/dist/adam.js +66 -0
- package/dist/adam.js.map +1 -0
- package/dist/buffers.d.ts +56 -0
- package/dist/buffers.d.ts.map +1 -0
- package/dist/buffers.js +114 -0
- package/dist/buffers.js.map +1 -0
- package/dist/codegen.d.ts +23 -0
- package/dist/codegen.d.ts.map +1 -0
- package/dist/codegen.js +709 -0
- package/dist/codegen.js.map +1 -0
- package/dist/compile.d.ts +53 -0
- package/dist/compile.d.ts.map +1 -0
- package/dist/compile.js +76 -0
- package/dist/compile.js.map +1 -0
- package/dist/grad.d.ts +8 -0
- package/dist/grad.d.ts.map +1 -0
- package/dist/grad.js +404 -0
- package/dist/grad.js.map +1 -0
- package/dist/index.d.ts +12 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +37 -0
- package/dist/index.js.map +1 -0
- package/dist/ir.d.ts +204 -0
- package/dist/ir.d.ts.map +1 -0
- package/dist/ir.js +60 -0
- package/dist/ir.js.map +1 -0
- package/dist/module.d.ts +21 -0
- package/dist/module.d.ts.map +1 -0
- package/dist/module.js +113 -0
- package/dist/module.js.map +1 -0
- package/dist/ops.d.ts +35 -0
- package/dist/ops.d.ts.map +1 -0
- package/dist/ops.js +270 -0
- package/dist/ops.js.map +1 -0
- package/dist/runtime.d.ts +26 -0
- package/dist/runtime.d.ts.map +1 -0
- package/dist/runtime.js +190 -0
- package/dist/runtime.js.map +1 -0
- package/dist/shape.d.ts +24 -0
- package/dist/shape.d.ts.map +1 -0
- package/dist/shape.js +259 -0
- package/dist/shape.js.map +1 -0
- package/dist/trace.d.ts +8 -0
- package/dist/trace.d.ts.map +1 -0
- package/dist/trace.js +93 -0
- package/dist/trace.js.map +1 -0
- package/package.json +62 -0
- package/src/adam.ts +95 -0
- package/src/buffers.ts +173 -0
- package/src/codegen.ts +758 -0
- package/src/compile.ts +120 -0
- package/src/grad.ts +459 -0
- package/src/index.ts +40 -0
- package/src/ir.ts +197 -0
- package/src/module.ts +126 -0
- package/src/ops.ts +311 -0
- package/src/runtime.ts +232 -0
- package/src/shape.ts +263 -0
- package/src/trace.ts +101 -0
package/src/ir.ts
ADDED
@@ -0,0 +1,197 @@
// Intermediate representation for tensor computations.
//
// A `Graph` is a flat array of `OpNode`s in topological (= construction) order.
// A `Tensor` is an opaque handle: shape + dtype + a pointer back to the OpNode
// that produced it (or `null` for graph leaves — params and external inputs).
//
// This is the data structure everything else operates on:
// - tracing builds it (src/trace.ts)
// - autograd walks it in reverse to add backward nodes (src/grad.ts, later)
// - codegen reads it to emit WGSL kernels and a dispatch plan (src/codegen.ts, later)
//
// Design intent: keep this file boring. No tracing logic, no shape inference,
// no codegen — those live in their own modules and consume `Graph` / `OpNode`.

export type Dtype = 'f32' | 'i32' | 'bool'
export type Shape = readonly number[]

// A Tensor is just metadata + a unique id. The actual storage doesn't exist
// until the graph is compiled and run on a device.
export interface Tensor {
  readonly id: number
  readonly shape: Shape
  readonly dtype: Dtype
  // null for leaves (params, external inputs); otherwise the index into Graph.ops.
  readonly source: number | null
  // Captured at op-call time so shape errors blame the user's frame, not the
  // library's. Lazy: only formatted on demand.
  readonly site: CallSite | null
}

export interface CallSite {
  readonly opName: string
  // Full Error stack at the point of op invocation. Format on demand.
  readonly stack: string
}

// Discriminated union over every op the IR knows about. Adding an op means:
// 1. add a variant here,
// 2. add a shape rule in src/shape.ts,
// 3. add a transpose rule in src/grad.ts (later),
// 4. add a kernel template in src/codegen.ts (later).
// The kinds intentionally match the surface API in src/ops.ts one-to-one.
export type OpNode =
  // ---- Leaves ----------------------------------------------------------------
  // A trainable parameter, supplied by the caller as a Float32Array at runtime.
  | { kind: 'param_input'; out: number; name: string }
  // A non-trainable input (tokens, targets, constants). Bound at runtime.
  | { kind: 'tensor_input'; out: number; name: string }
  // Persistent state buffer (e.g. Adam's m/v). Allocated and zero-initialized
  // at compile time; survives across step() calls. Updated via writebacks
  // declared in the compile result.
  | { kind: 'state_input'; out: number; name: string; initValue: number }

  // ---- Element-wise --------------------------------------------------------
  | { kind: 'add'; out: number; a: number; b: number }
  | { kind: 'sub'; out: number; a: number; b: number }
  | { kind: 'mul'; out: number; a: number; b: number }
  | { kind: 'div'; out: number; a: number; b: number }
  | { kind: 'mul_scalar'; out: number; a: number; scalar: number }
  | { kind: 'add_scalar'; out: number; a: number; scalar: number }

  // ---- Unary ---------------------------------------------------------------
  | { kind: 'sqrt'; out: number; a: number }
  | { kind: 'rsqrt'; out: number; a: number }
  | { kind: 'log'; out: number; a: number }
  | { kind: 'exp'; out: number; a: number }
  | { kind: 'relu'; out: number; a: number }

  // ---- Reductions (over last axis only; reshape if you need other axes) ----
  | { kind: 'mean_last'; out: number; a: number } // keepdims=true
  | { kind: 'sum_last'; out: number; a: number }  // keepdims=false

  // ---- Shape ---------------------------------------------------------------
  | { kind: 'reshape'; out: number; a: number; newShape: Shape }
  | { kind: 'transpose'; out: number; a: number; perm: readonly number[] }

  // ---- Linear algebra -----------------------------------------------------
  // matmul: a [..., M, K] · b [K, N] -> [..., M, N]. b is unbatched.
  // (Batched-on-both-sides matmul, e.g. for attention scores, is a separate kind
  // to keep autograd transpose rules simple.)
  | { kind: 'matmul'; out: number; a: number; b: number }
  // matmul_batched: a [..., M, K] · b [..., K, N] -> [..., M, N]. Used by attention.
  | { kind: 'matmul_batched'; out: number; a: number; b: number }

  // ---- Indexing / casting --------------------------------------------------
  | { kind: 'one_hot'; out: number; indices: number; depth: number; dtype: Dtype }
  | { kind: 'arange'; out: number; n: number; dtype: Dtype }

  // ---- ML primitives (fused for cleaner autograd) -------------------------
  | { kind: 'softmax_causal_last'; out: number; a: number }
  | { kind: 'log_softmax_last'; out: number; a: number }
  // Sets cells where (i < j) on the last two axes to `fillValue`; for masking
  // attention scores *before* softmax. Lower-triangle entries pass through;
  // upper-triangle entries become `fillValue` (typically -inf or a large negative number).
  | { kind: 'where_causal'; out: number; a: number; fillValue: number }

  // ---- Comparisons + selection -------------------------------------------
  // Element-wise comparison; result is bool (lowered to u32 in storage).
  // Supports the same trailing-axis broadcast as element-wise binops.
  | { kind: 'less'; out: number; a: number; b: number }
  | { kind: 'greater'; out: number; a: number; b: number }
  // Element-wise select: out[i] = cond[i] ? a[i] : b[i]. cond must be bool.
  // a, b, cond all broadcast-compatible to out's shape.
  | { kind: 'where'; out: number; cond: number; a: number; b: number }

  // ---- Optimizer-fused ops (Adam) ----------------------------------------
  // Each is a single kernel doing the full per-element math, baking in the
  // hyperparameter constant. Used by appendAdam() to avoid decomposing the
  // update into ~12 element-wise dispatches per param.
  | { kind: 'adam_update_m'; out: number; m: number; g: number; b1: number }
  | { kind: 'adam_update_v'; out: number; v: number; g: number; b2: number }
  // adam_update_p: p_new = p - lrt[0] * m_new / (sqrt(v_new) + eps).
  // `lrt` is a scalar tensor (provided as a tensor_input updated per step) that
  // already includes Adam's bias-correction factor: lrt = lr * sqrt(1-b2^t) / (1-b1^t).
  // Only `eps` is baked in.
  | { kind: 'adam_update_p'; out: number; p: number; mNew: number; vNew: number; lrt: number; eps: number }

  // ---- Slicing / broadcasting / autograd infrastructure -------------------
  // Slice [start, end) along the last axis. Output shape: input shape with
  // last axis replaced by (end - start). Used for splitting Q/K/V from a
  // single fused QKV matmul.
  | { kind: 'slice_last_range'; out: number; a: number; start: number; end: number }
  // Broadcast `a` to `targetShape`. Standard right-aligned NumPy broadcast.
  // Used by autograd to expand cotangents back over reduced/broadcast axes.
  | { kind: 'broadcast_to'; out: number; a: number; targetShape: Shape }
  // Inverse of broadcast_to: sum-reduce `a` to `targetShape`. Used by autograd
  // to "un-broadcast" a cotangent back to the smaller operand's shape.
  | { kind: 'sum_to_shape'; out: number; a: number; targetShape: Shape }
  // 0-d tensor with a constant value. Used to seed loss cotangent (1.0).
  | { kind: 'const_scalar'; out: number; value: number; dtype: Dtype }
  // ReLU's backward: passes `dy` through where `x > 0`, else 0. Output shape = x's.
  | { kind: 'relu_grad'; out: number; x: number; dy: number }

// A Graph collects ops and tensors during tracing, then becomes the input to
// autograd and codegen. Once tracing is done it should be treated as immutable.
export interface Graph {
  readonly ops: OpNode[]
  readonly tensors: Tensor[]
  // Ids of tensors that should be exposed as outputs of the compiled function.
  // Set by the trace driver; for a loss function, this is `[lossTensor.id]`.
  readonly outputs: number[]
}

export function makeGraph(): Graph {
  return { ops: [], tensors: [], outputs: [] }
}

// Internal: register a fresh tensor in the graph and return it.
export function addTensor(g: Graph, shape: Shape, dtype: Dtype, source: number | null, site: CallSite | null): Tensor {
  const id = g.tensors.length
  const t: Tensor = { id, shape, dtype, source, site }
  g.tensors.push(t)
  return t
}

// Internal: append an op and the tensor it produces. Returns the produced tensor.
// Generic over the specific op kind so callers don't need `as any` casts.
// `Extract<OpNode, { kind: K }>` narrows the union to the chosen variant, then
// `Omit` strips the parts addOp itself supplies (the kind tag and out tensor id).
export function addOp<K extends OpNode['kind']>(
  g: Graph,
  kind: K,
  shape: Shape,
  dtype: Dtype,
  site: CallSite | null,
  fields: Omit<Extract<OpNode, { kind: K }>, 'kind' | 'out'>,
): Tensor {
  const opIndex = g.ops.length
  const out = addTensor(g, shape, dtype, opIndex, site)
  const node = { kind, out: out.id, ...fields } as Extract<OpNode, { kind: K }>
  g.ops.push(node)
  return out
}

// Capture a call site without paying full Error formatting cost up-front.
// The stack is materialised but parsing/trimming is deferred to error reporting.
export function captureSite(opName: string): CallSite {
  // Skip our own frame plus the op wrapper's frame; user's frame is what's left.
  const stack = (new Error()).stack ?? ''
  return { opName, stack }
}

// Format a CallSite for inclusion in a thrown error. Strips Tensorgrad frames
// and library internals so the user sees their code first.
export function formatSite(site: CallSite): string {
  const lines = site.stack.split('\n')
  // Stack starts with "Error" line; drop it. Then drop frames from this file
  // and from src/ops.ts so the first surviving frame is user code.
  const userFrames: string[] = []
  for (const line of lines.slice(1)) {
    if (line.includes('/tensorgrad/src/') || line.includes('\\tensorgrad\\src\\')) continue
    userFrames.push(line.trim())
    if (userFrames.length >= 3) break
  }
  if (userFrames.length === 0) return `[${site.opName}] (no user frame found)`
  return `[${site.opName}]\n ${userFrames.join('\n ')}`
}
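To make the IR concrete, here is a minimal sketch (not part of the package) that builds a two-op graph directly against this API, the way src/trace.ts and src/ops.ts do. The leaf wiring follows the `Tensor` doc comment (`source = null` for leaves); the real `paramInput` lives in src/trace.ts and may differ in detail.

// Sketch only: a tiny hand-built graph against the ir.ts API above.
import { makeGraph, addTensor, addOp } from './ir.js'

const g = makeGraph()

// Leaf: a [2, 3] f32 parameter named 'x'. Per the Tensor docs, leaves carry
// source = null even though the param_input node sits in g.ops.
const x = addTensor(g, [2, 3], 'f32', null, null)
g.ops.push({ kind: 'param_input', out: x.id, name: 'x' })

// y = relu(x); z = y * 2. addOp appends each node and registers its output
// tensor, whose `source` points back at the producing op's index.
const y = addOp(g, 'relu', x.shape, 'f32', null, { a: x.id })
const z = addOp(g, 'mul_scalar', y.shape, 'f32', null, { a: y.id, scalar: 2 })
g.outputs.push(z.id)

// g.ops is now in topological order: param_input -> relu -> mul_scalar.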
package/src/module.ts
ADDED
@@ -0,0 +1,126 @@
// Module abstraction — a Domeleon-style component layer for parameter trees.
//
// User code defines a model as nested classes:
//
//   class Linear extends Module {
//     W: Tensor; b: Tensor
//     constructor(inDim: number, outDim: number) {
//       super()
//       this.W = this.param([inDim, outDim])
//       this.b = this.param([outDim])
//     }
//   }
//   class Block extends Module {
//     attn = new Attention(D)
//     mlp = new MLP(D, 4 * D)
//   }
//   class Model extends Module {
//     embed = new Linear(VOCAB, D)
//     layers = range(N).map(() => new Block())
//   }
//
// The param tree is discovered automatically at compile time by walking
// enumerable instance properties. Each parameter gets a name auto-derived
// from its path (`layers.0.attn.W_q`); names are used for upload/download
// and writeback wiring. Forward functions are pure and stateless — they
// take the materialized model and inputs, return a Tensor.

import type { Tensor, Shape, Dtype } from './ir.js'
import { paramInput } from './trace.js'

// ============================================================================
// Internals: param sentinel
// ============================================================================
//
// `this.param(shape)` returns a placeholder that's replaced by a real Tensor
// during `materializeParams`. We type-cheat by declaring the return type as
// `Tensor` so user code can write `this.W` and have TS happy; the cheat is
// only valid post-materialization (which is always before forward runs).

class ParamSentinel {
  constructor(public readonly shape: Shape, public readonly dtype: Dtype) {}
}

// ============================================================================
// Module base class
// ============================================================================

export abstract class Module {
  /**
   * Declare a learnable parameter at this module. Must be called from inside
   * the constructor (typically as a field assignment). Returns a placeholder
   * that gets replaced with a real Tensor at compile time.
   *
   * The parameter's name is auto-derived from its property path in the model
   * tree (e.g. `layers.0.attn.W_q`).
   */
  protected param(shape: Shape, dtype: Dtype = 'f32'): Tensor {
    // Lie to TypeScript: the sentinel becomes a Tensor at materialize time.
    return new ParamSentinel(shape, dtype) as unknown as Tensor
  }
}

// ============================================================================
// Tree walking
// ============================================================================

/**
 * Walk the module tree and replace every ParamSentinel with a real Tensor
 * created via `paramInput(autoName, ...)`. Must be called inside an active
 * trace context (paramInput appends to the current graph).
 *
 * Returns a flat record of `{ path: tensor }` for every materialized param.
 */
export function materializeParams(root: Module): Record<string, Tensor> {
  const out: Record<string, Tensor> = {}
  visit(root, '', (path, val, owner, key) => {
    if (val instanceof ParamSentinel) {
      const t = paramInput(path, val.shape, val.dtype)
      ;(owner as any)[key] = t
      out[path] = t
    }
  })
  return out
}

// ----------------------------------------------------------------------------
// Visitor
// ----------------------------------------------------------------------------
//
// Walks enumerable own properties recursively, building a path string. Recurses
// into nested Modules and arrays of Modules (or arrays of arrays, etc.).
// Calls `visitor` on every leaf — including ParamSentinels (pre-materialize)
// and real Tensor leaves (post-materialize).

type Visitor = (path: string, val: unknown, owner: object, key: string | number) => void

function visit(node: unknown, path: string, visitor: Visitor): void {
  if (node === null || node === undefined) return
  if (typeof node !== 'object') return

  if (node instanceof Module) {
    for (const key of Object.keys(node as object)) {
      const child = (node as any)[key]
      const childPath = path ? `${path}.${key}` : key
      visitChild(child, childPath, node, key, visitor)
    }
    return
  }
  if (Array.isArray(node)) {
    node.forEach((item, i) => {
      const childPath = path ? `${path}.${i}` : String(i)
      visitChild(item, childPath, node as unknown as object, i, visitor)
    })
    return
  }
  // Plain leaf object (sentinel / tensor / something else): visitor decides.
  // No deeper recursion.
}

function visitChild(child: unknown, path: string, owner: object, key: string | number, visitor: Visitor): void {
  if (child instanceof Module || Array.isArray(child)) {
    visit(child, path, visitor)
  } else {
    visitor(path, child, owner, key)
  }
}
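A minimal usage sketch (not from the package), mirroring the example in the file header. It assumes an active trace context, since `materializeParams` calls `paramInput`, which appends to the current graph; `MLP` is a hypothetical container added to show that arrays of Modules are walked too.

// Sketch only: declare params via sentinels, then materialize them.
import { Module, materializeParams } from './module.js'
import type { Tensor } from './ir.js'

class Linear extends Module {
  W: Tensor
  b: Tensor
  constructor(inDim: number, outDim: number) {
    super()
    this.W = this.param([inDim, outDim])
    this.b = this.param([outDim])
  }
}

// Hypothetical container: the visitor recurses into the `layers` array.
class MLP extends Module {
  layers = [new Linear(8, 16), new Linear(16, 8)]
}

const params = materializeParams(new MLP())
// Paths are auto-derived from the property tree:
// ['layers.0.W', 'layers.0.b', 'layers.1.W', 'layers.1.b']
console.log(Object.keys(params))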
package/src/ops.ts
ADDED
@@ -0,0 +1,311 @@
// User-facing op surface.
//
// Each function here is a thin wrapper:
// 1. capture the call site (for error attribution)
// 2. validate input shapes via src/shape.ts (which throws on mismatch)
// 3. compute the output shape and dtype
// 4. append the op to the current Graph (held in module state by src/trace.ts)
// 5. return the produced Tensor handle
//
// No actual numeric work happens here. These calls just build the IR.

import type { Tensor, Shape, Dtype, OpNode } from './ir.js'
import { addOp, captureSite } from './ir.js'
import { currentGraph } from './trace.js'
import {
  inferElementwiseBinop, inferUnary, inferMeanLast, inferSumLast,
  inferReshape, inferTranspose, inferMatmul, inferMatmulBatched,
  inferOneHot, inferWhereCausal, inferSliceLastRange,
  inferBroadcastTo, inferSumToShape, inferReluGrad, inferWhere,
  ShapeError,
} from './shape.js'

// ----------------------------------------------------------------------------
// Element-wise binops (add/sub/mul/div). Trailing-suffix broadcast.
// ----------------------------------------------------------------------------

/**
 * Build an element-wise binop node (forward declaration only — appends to the
 * graph). Used by both arithmetic ops (add/sub/mul/div, output dtype = input
 * dtype) and comparisons (less/greater, output dtype = bool).
 */
function binopOp(
  name: string,
  kind: OpNode['kind'],
  a: Tensor, b: Tensor,
  outDtype: Dtype = a.dtype,
): Tensor {
  const site = captureSite(name)
  if (a.dtype !== b.dtype) throw new ShapeError(`${name}: dtype mismatch (${a.dtype} vs ${b.dtype})`, site)
  const outShape = inferElementwiseBinop(name, a.shape, b.shape, site)
  return addOp(currentGraph(), kind, outShape, outDtype, site, { a: a.id, b: b.id })
}

// Element-wise binops. Second arg can be a Tensor or a JS number; the latter
// dispatches to scalar-fused IR ops internally. `mul(x, 2)` and `mul(x, y)`
// both work — matches every NumPy-shaped library.
export function add(a: Tensor, b: Tensor | number): Tensor {
  return typeof b === 'number' ? addScalar(a, b) : binopOp('add', 'add', a, b)
}
export function sub(a: Tensor, b: Tensor | number): Tensor {
  return typeof b === 'number' ? addScalar(a, -b) : binopOp('sub', 'sub', a, b)
}
export function mul(a: Tensor, b: Tensor | number): Tensor {
  return typeof b === 'number' ? mulScalar(a, b) : binopOp('mul', 'mul', a, b)
}
export function div(a: Tensor, b: Tensor | number): Tensor {
  if (typeof b === 'number') {
    if (b === 0) throw new ShapeError(`div: scalar divisor cannot be zero`, captureSite('div'))
    return mulScalar(a, 1 / b)
  }
  return binopOp('div', 'div', a, b)
}

// ----------------------------------------------------------------------------
// Element-wise scalar binops (mul/add by JS number). Used for things like
// `scores * (1/sqrt(d))` and `logits + 1e-5` where allocating a 0-d tensor
// for the scalar is wasteful.
// ----------------------------------------------------------------------------

export function mulScalar(a: Tensor, scalar: number): Tensor {
  const site = captureSite('mulScalar')
  return addOp(currentGraph(), 'mul_scalar', a.shape, a.dtype, site, { a: a.id, scalar })
}

export function addScalar(a: Tensor, scalar: number): Tensor {
  const site = captureSite('addScalar')
  return addOp(currentGraph(), 'add_scalar', a.shape, a.dtype, site, { a: a.id, scalar })
}

// ----------------------------------------------------------------------------
// Unary ops.
// ----------------------------------------------------------------------------

function unary(name: 'sqrt' | 'rsqrt' | 'log' | 'exp' | 'relu', a: Tensor): Tensor {
  const site = captureSite(name)
  if (a.dtype !== 'f32') throw new ShapeError(`${name}: requires f32, got ${a.dtype}`, site)
  return addOp(currentGraph(), name, inferUnary(name, a.shape, site), 'f32', site, { a: a.id })
}

export const sqrt = (a: Tensor): Tensor => unary('sqrt', a)
export const rsqrt = (a: Tensor): Tensor => unary('rsqrt', a)
export const log = (a: Tensor): Tensor => unary('log', a)
export const exp = (a: Tensor): Tensor => unary('exp', a)
export const relu = (a: Tensor): Tensor => unary('relu', a)

// ----------------------------------------------------------------------------
// Reductions over the last axis. To reduce along other axes, transpose first.
// (This is intentional — keeps codegen and autograd small.)
// ----------------------------------------------------------------------------

export function meanLast(a: Tensor): Tensor {
  const site = captureSite('meanLast')
  if (a.dtype !== 'f32') throw new ShapeError(`meanLast: requires f32, got ${a.dtype}`, site)
  const outShape = inferMeanLast('meanLast', a.shape, site)
  return addOp(currentGraph(), 'mean_last', outShape, a.dtype, site, { a: a.id })
}

export function sumLast(a: Tensor): Tensor {
  const site = captureSite('sumLast')
  if (a.dtype !== 'f32') throw new ShapeError(`sumLast: requires f32, got ${a.dtype}`, site)
  const outShape = inferSumLast('sumLast', a.shape, site)
  return addOp(currentGraph(), 'sum_last', outShape, a.dtype, site, { a: a.id })
}

// ----------------------------------------------------------------------------
// Shape ops.
// ----------------------------------------------------------------------------

export function reshape(a: Tensor, newShape: Shape): Tensor {
  const site = captureSite('reshape')
  const outShape = inferReshape('reshape', a.shape, newShape, site)
  return addOp(currentGraph(), 'reshape', outShape, a.dtype, site, { a: a.id, newShape: outShape })
}

export function transpose(a: Tensor, perm: readonly number[]): Tensor {
  const site = captureSite('transpose')
  const outShape = inferTranspose('transpose', a.shape, perm, site)
  return addOp(currentGraph(), 'transpose', outShape, a.dtype, site, { a: a.id, perm })
}

// ----------------------------------------------------------------------------
// Linear algebra.
// ----------------------------------------------------------------------------

export function matmul(a: Tensor, b: Tensor): Tensor {
  const site = captureSite('matmul')
  if (a.dtype !== 'f32' || b.dtype !== 'f32') {
    throw new ShapeError(`matmul: requires f32, got ${a.dtype} and ${b.dtype}`, site)
  }
  const outShape = inferMatmul('matmul', a.shape, b.shape, site)
  return addOp(currentGraph(), 'matmul', outShape, 'f32', site, { a: a.id, b: b.id })
}

export function matmulBatched(a: Tensor, b: Tensor): Tensor {
  const site = captureSite('matmulBatched')
  if (a.dtype !== 'f32' || b.dtype !== 'f32') {
    throw new ShapeError(`matmulBatched: requires f32, got ${a.dtype} and ${b.dtype}`, site)
  }
  const outShape = inferMatmulBatched('matmulBatched', a.shape, b.shape, site)
  return addOp(currentGraph(), 'matmul_batched', outShape, 'f32', site, { a: a.id, b: b.id })
}

// ----------------------------------------------------------------------------
// Indexing / casting.
// ----------------------------------------------------------------------------

export function oneHot(indices: Tensor, depth: number, dtype: Dtype = 'f32'): Tensor {
  const site = captureSite('oneHot')
  if (indices.dtype !== 'i32') {
    throw new ShapeError(`oneHot: indices must be i32, got ${indices.dtype}`, site)
  }
  const outShape = inferOneHot('oneHot', indices.shape, depth, site)
  return addOp(currentGraph(), 'one_hot', outShape, dtype, site, { indices: indices.id, depth, dtype })
}

// arange(n) → [n] of values [0, 1, ..., n-1]. Used for position embeddings.
export function arange(n: number, dtype: Dtype = 'i32'): Tensor {
  const site = captureSite('arange')
  if (n <= 0 || !Number.isInteger(n)) {
    throw new ShapeError(`arange: n must be a positive integer, got ${n}`, site)
  }
  return addOp(currentGraph(), 'arange', [n], dtype, site, { n, dtype })
}

// ----------------------------------------------------------------------------
// ML primitives. Fused so autograd's transpose rule is straightforward and the
// kernels can be hand-tuned for our specific shapes.
// ----------------------------------------------------------------------------

// Causal-masked softmax along the last axis. Shape preserved. Last two axes
// must be square (TxT attention scores).
export function softmaxCausalLast(a: Tensor): Tensor {
  const site = captureSite('softmaxCausalLast')
  if (a.dtype !== 'f32') throw new ShapeError(`softmaxCausalLast: requires f32, got ${a.dtype}`, site)
  inferWhereCausal('softmaxCausalLast', a.shape, site) // shape check (square last 2 axes)
  return addOp(currentGraph(), 'softmax_causal_last', a.shape, 'f32', site, { a: a.id })
}

// Numerically-stable log-softmax along the last axis. Shape preserved.
export function logSoftmaxLast(a: Tensor): Tensor {
  const site = captureSite('logSoftmaxLast')
  if (a.dtype !== 'f32') throw new ShapeError(`logSoftmaxLast: requires f32, got ${a.dtype}`, site)
  return addOp(currentGraph(), 'log_softmax_last', a.shape, 'f32', site, { a: a.id })
}

// Pre-softmax causal mask. Sets cells where (i < j) on the last two axes to
// `fillValue` (typically -1e30). Lower-triangle entries pass through.
// Use this when you want the masked scores explicitly (e.g. for capture);
// for the common case, prefer softmaxCausalLast which fuses both.
export function whereCausal(a: Tensor, fillValue: number): Tensor {
  const site = captureSite('whereCausal')
  if (a.dtype !== 'f32') throw new ShapeError(`whereCausal: requires f32, got ${a.dtype}`, site)
  inferWhereCausal('whereCausal', a.shape, site)
  return addOp(currentGraph(), 'where_causal', a.shape, 'f32', site, { a: a.id, fillValue })
}

// ----------------------------------------------------------------------------
// Slicing.
// ----------------------------------------------------------------------------

// sliceLastRange(a, start, end): slice [start, end) along the last axis.
// Used for splitting Q/K/V from a fused QKV matmul.
export function sliceLastRange(a: Tensor, start: number, end: number): Tensor {
  const site = captureSite('sliceLastRange')
  const outShape = inferSliceLastRange('sliceLastRange', a.shape, start, end, site)
  return addOp(currentGraph(), 'slice_last_range', outShape, a.dtype, site, { a: a.id, start, end })
}

// ----------------------------------------------------------------------------
// Broadcast / un-broadcast. Mostly used by autograd, but exposed in case user
// code needs them (e.g. explicit broadcasting for clarity).
// ----------------------------------------------------------------------------

export function broadcastTo(a: Tensor, targetShape: Shape): Tensor {
  const site = captureSite('broadcastTo')
  inferBroadcastTo('broadcastTo', a.shape, targetShape, site)
  return addOp(currentGraph(), 'broadcast_to', targetShape, a.dtype, site, { a: a.id, targetShape })
}

export function sumToShape(a: Tensor, targetShape: Shape): Tensor {
  const site = captureSite('sumToShape')
  inferSumToShape('sumToShape', a.shape, targetShape, site)
  return addOp(currentGraph(), 'sum_to_shape', targetShape, a.dtype, site, { a: a.id, targetShape })
}

// ----------------------------------------------------------------------------
// Constants.
// ----------------------------------------------------------------------------

// 0-d tensor with a constant value. Used by autograd to seed the loss cotangent.
export function constScalar(value: number, dtype: Dtype = 'f32'): Tensor {
  const site = captureSite('constScalar')
  return addOp(currentGraph(), 'const_scalar', [], dtype, site, { value, dtype })
}

// ----------------------------------------------------------------------------
// Comparisons, selection, and autograd-internal helpers (exposed for users
// writing custom transpose rules).
// ----------------------------------------------------------------------------

// Comparisons reuse the binop helper but return bool.
export const less = (a: Tensor, b: Tensor): Tensor => binopOp('less', 'less', a, b, 'bool')
export const greater = (a: Tensor, b: Tensor): Tensor => binopOp('greater', 'greater', a, b, 'bool')

// where(cond, a, b): elementwise select. cond is bool; a and b can be any matching dtype.
export function where(cond: Tensor, a: Tensor, b: Tensor): Tensor {
  const site = captureSite('where')
  if (cond.dtype !== 'bool') throw new ShapeError(`where: cond must be bool, got ${cond.dtype}`, site)
  if (a.dtype !== b.dtype) throw new ShapeError(`where: a/b dtype mismatch (${a.dtype} vs ${b.dtype})`, site)
  const outShape = inferWhere('where', cond.shape, a.shape, b.shape, site)
  return addOp(currentGraph(), 'where', outShape, a.dtype, site, { cond: cond.id, a: a.id, b: b.id })
}

// reluGrad(x, dy) = dy where x > 0, else 0. Same shape as x. This is the
// transpose rule for relu, exposed as an op so codegen can emit it.
export function reluGrad(x: Tensor, dy: Tensor): Tensor {
  const site = captureSite('reluGrad')
  if (x.dtype !== 'f32' || dy.dtype !== 'f32') {
    throw new ShapeError(`reluGrad: requires f32, got ${x.dtype} and ${dy.dtype}`, site)
  }
  const outShape = inferReluGrad('reluGrad', x.shape, dy.shape, site)
  return addOp(currentGraph(), 'relu_grad', outShape, 'f32', site, { x: x.id, dy: dy.id })
}

// ----------------------------------------------------------------------------
// Adam-fused ops. Each does its full per-element update in one kernel.
// ----------------------------------------------------------------------------

export function adamUpdateM(m: Tensor, g: Tensor, b1: number): Tensor {
  const site = captureSite('adamUpdateM')
  if (m.dtype !== 'f32' || g.dtype !== 'f32') throw new ShapeError(`adamUpdateM: requires f32`, site)
  if (m.shape.length !== g.shape.length || m.shape.some((d, i) => d !== g.shape[i])) {
    throw new ShapeError(`adamUpdateM: shape mismatch`, site)
  }
  return addOp(currentGraph(), 'adam_update_m', m.shape, 'f32', site, { m: m.id, g: g.id, b1 })
}

export function adamUpdateV(v: Tensor, g: Tensor, b2: number): Tensor {
  const site = captureSite('adamUpdateV')
  if (v.dtype !== 'f32' || g.dtype !== 'f32') throw new ShapeError(`adamUpdateV: requires f32`, site)
  if (v.shape.length !== g.shape.length || v.shape.some((d, i) => d !== g.shape[i])) {
    throw new ShapeError(`adamUpdateV: shape mismatch`, site)
  }
  return addOp(currentGraph(), 'adam_update_v', v.shape, 'f32', site, { v: v.id, g: g.id, b2 })
}

export function adamUpdateP(p: Tensor, mNew: Tensor, vNew: Tensor, lrt: Tensor, eps: number): Tensor {
  const site = captureSite('adamUpdateP')
  if (p.dtype !== 'f32') throw new ShapeError(`adamUpdateP: requires f32`, site)
  if (lrt.dtype !== 'f32' || lrt.shape.length !== 0) {
    throw new ShapeError(`adamUpdateP: lrt must be a 0-d f32 scalar`, site)
  }
  if (p.shape.length !== mNew.shape.length || p.shape.some((d, i) => d !== mNew.shape[i])) {
    throw new ShapeError(`adamUpdateP: p/mNew shape mismatch`, site)
  }
  return addOp(currentGraph(), 'adam_update_p', p.shape, 'f32', site,
    { p: p.id, mNew: mNew.id, vNew: vNew.id, lrt: lrt.id, eps })
}
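To close, a minimal sketch (not from the package) of the op surface in use: a one-layer ReLU network with a scalar mean-squared loss, traced into the current graph. `paramInput(name, shape, dtype)` is the signature visible in src/module.ts; real non-trainable inputs would come from a tensor_input helper in src/trace.ts that this diff doesn't show, so params stand in for them here.

// Sketch only: compose ops inside an active trace context.
import { matmul, relu, sub, mul, meanLast, sumLast, mulScalar } from './ops.js'
import { paramInput } from './trace.js'

const x = paramInput('x', [32, 8], 'f32')  // stand-in for a real input batch
const W = paramInput('W', [8, 4], 'f32')
const y = paramInput('y', [32, 4], 'f32')  // stand-in for targets

const pred = relu(matmul(x, W))            // [32, 4]
const err = sub(pred, y)                   // element-wise, shapes must agree
const rowMse = meanLast(mul(err, err))     // [32, 1] — mean_last keeps the axis
// sum_last drops the axis, so two sums collapse [32, 1] -> [32] -> [] (0-d);
// scale by 1/32 with the scalar-fused op instead of allocating a 0-d tensor.
const loss = mulScalar(sumLast(sumLast(rowMse)), 1 / 32)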