tensorgrad 0.0.1 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -9
- package/dist/adam.d.ts +14 -2
- package/dist/adam.d.ts.map +1 -1
- package/dist/adam.js +19 -8
- package/dist/adam.js.map +1 -1
- package/dist/buffers.d.ts +1 -0
- package/dist/buffers.d.ts.map +1 -1
- package/dist/buffers.js +12 -1
- package/dist/buffers.js.map +1 -1
- package/dist/capture.d.ts +3 -0
- package/dist/capture.d.ts.map +1 -0
- package/dist/capture.js +33 -0
- package/dist/capture.js.map +1 -0
- package/dist/codegen.js +4 -2
- package/dist/codegen.js.map +1 -1
- package/dist/compile.d.ts +33 -5
- package/dist/compile.d.ts.map +1 -1
- package/dist/compile.js +96 -11
- package/dist/compile.js.map +1 -1
- package/dist/index.d.ts +5 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -2
- package/dist/index.js.map +1 -1
- package/dist/ir.d.ts +2 -0
- package/dist/ir.d.ts.map +1 -1
- package/dist/ir.js +1 -1
- package/dist/ir.js.map +1 -1
- package/dist/module.d.ts +30 -4
- package/dist/module.d.ts.map +1 -1
- package/dist/module.js +39 -13
- package/dist/module.js.map +1 -1
- package/dist/nn.d.ts +19 -0
- package/dist/nn.d.ts.map +1 -0
- package/dist/nn.js +60 -0
- package/dist/nn.js.map +1 -0
- package/dist/ops.d.ts +1 -1
- package/dist/ops.d.ts.map +1 -1
- package/dist/ops.js +2 -2
- package/dist/ops.js.map +1 -1
- package/dist/runtime.d.ts +79 -4
- package/dist/runtime.d.ts.map +1 -1
- package/dist/runtime.js +153 -19
- package/dist/runtime.js.map +1 -1
- package/dist/trace.d.ts +1 -0
- package/dist/trace.d.ts.map +1 -1
- package/dist/trace.js +12 -0
- package/dist/trace.js.map +1 -1
- package/package.json +1 -2
- package/src/adam.ts +31 -10
- package/src/buffers.ts +14 -1
- package/src/capture.ts +36 -0
- package/src/codegen.ts +4 -2
- package/src/compile.ts +112 -13
- package/src/index.ts +5 -3
- package/src/ir.ts +10 -4
- package/src/module.ts +75 -11
- package/src/nn.ts +59 -0
- package/src/ops.ts +2 -2
- package/src/runtime.ts +260 -22
- package/src/trace.ts +13 -0
- package/SPEC.md +0 -293
package/src/adam.ts
CHANGED
```diff
@@ -1,4 +1,4 @@
-// Adam optimizer, in-graph.
+// Adam / AdamW optimizer, in-graph.
 //
 // `appendAdam` extends a graph that already has a forward pass + autograd-emitted
 // backward (i.e., has paramGrads from `appendGrad`) with the Adam update math.
@@ -6,12 +6,14 @@
 // Per parameter P with gradient g:
 //   m_new = b1 * m + (1 - b1) * g
 //   v_new = b2 * v + (1 - b2) * g²
-//   p_new = p - lrt * m_new / (sqrt(v_new) + eps)
+//   p_new = decayShrink * p - lrt * m_new / (sqrt(v_new) + eps)
 //
-//
-//
-//
-//
+// `decayShrink = 1 - lr * weightDecay` when the param is being decayed
+// (Loshchilov & Hutter, "AdamW") and 1 otherwise — at which point the
+// multiply folds out and you're left with plain Adam. `lrt` is supplied
+// per-step from CPU and includes the bias-correction factor
+// `sqrt(1-b2^t)/(1-b1^t)`; that's why convergence isn't affected by the
+// first-step warmup that bias-correction-free Adam suffers.
 //
 // Returns writeback declarations the buffer planner uses to wire up the
 // "after step, copy the new value into the persistent home" path. m and v
@@ -29,6 +31,15 @@ export interface AdamConfig {
   b1?: number // default 0.9
   b2?: number // default 0.999
   eps?: number // default 1e-8
+  /** AdamW: decoupled weight decay coefficient. Default 0 (plain Adam).
+   * When non-zero, every step shrinks each decayed param by a factor of
+   * `1 - lr * weightDecay` before the gradient update. */
+  weightDecay?: number
+  /** Filter deciding which params get weight decay. Only consulted when
+   * weightDecay > 0. Default: decay every param. Override for the standard
+   * transformer convention (decay weights/embeddings, skip biases + LN gains).
+   * Example: `(name) => name.includes('.W') || name.endsWith('_emb')`. */
+  decayFilter?: (paramName: string) => boolean
 }
 
 export interface AdamResult {
@@ -38,7 +49,7 @@ export interface AdamResult {
    * with `lr * sqrt(1-b2^t)/(1-b1^t)` (Adam's bias-corrected effective LR). */
   lrtInputName: string
   /** Hyperparameters as captured (so the runtime can compute lrt). */
-  config: Required<AdamConfig>
+  config: Required<Omit<AdamConfig, 'decayFilter'>> & { decayFilter: (name: string) => boolean }
 }
 
 /**
@@ -50,7 +61,8 @@ export interface AdamResult {
  * @param paramTensors param name -> the param's leaf Tensor (the param_input).
  *                     Needed because the param_input lives in the graph but we
  *                     don't have a direct map by name in `Graph` — caller passes it.
- * @param config Adam hyperparameters
+ * @param config Adam hyperparameters. Set `weightDecay > 0` for AdamW; an
+ *               optional `decayFilter` selects which params receive decay.
  */
 export function appendAdam(
   graph: Graph,
@@ -58,11 +70,13 @@ export function appendAdam(
   paramTensors: Record<string, Tensor>,
   config: AdamConfig,
 ): AdamResult {
-  const fullConfig
+  const fullConfig = {
     lr: config.lr,
     b1: config.b1 ?? 0.9,
     b2: config.b2 ?? 0.999,
     eps: config.eps ?? 1e-8,
+    weightDecay: config.weightDecay ?? 0,
+    decayFilter: config.decayFilter ?? (() => true),
   }
   const writebacks: WritebackDecl[] = []
   const lrtInputName = '_adam_lrt'
@@ -81,10 +95,17 @@
     const mState = stateInput(`adam_m_${name}`, p.shape, 'f32', 0)
     const vState = stateInput(`adam_v_${name}`, p.shape, 'f32', 0)
 
+    // decayShrink baked at compile time. 1.0 for plain Adam (no extra cost
+    // — the WGSL compiler folds the constant multiply); 1 - lr * weightDecay
+    // for the params the filter selects.
+    const decayShrink = (fullConfig.weightDecay > 0 && fullConfig.decayFilter(name))
+      ? 1 - fullConfig.lr * fullConfig.weightDecay
+      : 1
+
     // Three fused kernels per parameter — one for each of m_new / v_new / p_new.
     const newM = adamUpdateM(mState, g, fullConfig.b1)
     const newV = adamUpdateV(vState, g, fullConfig.b2)
-    const newP = adamUpdateP(p, newM, newV, lrt, fullConfig.eps)
+    const newP = adamUpdateP(p, newM, newV, lrt, fullConfig.eps, decayShrink)
 
     writebacks.push({ source: newM, destName: `adam_m_${name}`, destKind: 'state' })
     writebacks.push({ source: newV, destName: `adam_v_${name}`, destKind: 'state' })
```
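For orientation, here is a minimal sketch of the new AdamW knobs from the caller's side, together with the constants the math above produces. The hyperparameter values and the naming convention in the filter are illustrative, not package defaults; only `AdamConfig`, `weightDecay`, and `decayFilter` come from this diff.

```ts
import type { AdamConfig } from 'tensorgrad'

// Illustrative AdamW config: decay weight matrices and embeddings, skip
// biases and LayerNorm gains (the transformer convention the docs mention).
const adam: AdamConfig = {
  lr: 3e-4,
  weightDecay: 0.1, // > 0 switches the update from plain Adam to AdamW
  decayFilter: (name) => name.includes('.W') || name.endsWith('_emb'),
}

// What this bakes into the kernels / the per-step lrt upload:
//   decayShrink = 1 - lr * weightDecay = 1 - 3e-4 * 0.1 = 0.99997   (decayed params only)
//   lrt(t=1)    = lr * sqrt(1 - b2^1) / (1 - b1^1)
//               = 3e-4 * sqrt(0.001) / 0.1 ≈ 9.49e-5                (with b1=0.9, b2=0.999)
```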
package/src/buffers.ts
CHANGED
```diff
@@ -47,6 +47,7 @@ export interface BufferPlan {
   inputsByName: Map<string, number>     // name -> buffer id
   paramGradsByName: Map<string, number> // name -> buffer id
   statesByName: Map<string, number>     // name -> buffer id (persistent state homes)
+  capturesByName: Map<string, number>   // name -> buffer id (activation captures)
   outputBufferIds: number[]             // graph.outputs mapped through
   /** End-of-step writebacks (Adam updates for params, m, v, etc.) */
   writebacks: Writeback[]
@@ -169,5 +170,17 @@ export function planBuffers(
     return { source: sourceBufId, dest: destBufId, bytes: sourceSpec.byteSize }
   })
 
-  return { buffers, tensorToBuffer, paramsByName, inputsByName, paramGradsByName, statesByName, outputBufferIds, writebacks }
+  // Resolve graph.captures (name -> tensor id) to (name -> buffer id).
+  // No pinning needed at the planner level: each tensor already has its own
+  // buffer (see "v1 strategy" comment at top — no pooling yet).
+  const capturesByName = new Map<string, number>()
+  for (const [name, tensorId] of graph.captures) {
+    const bufId = tensorToBuffer.get(tensorId)
+    if (bufId === undefined) {
+      throw new Error(`planBuffers: capture '${name}' references unknown tensor #${tensorId}`)
+    }
+    capturesByName.set(name, bufId)
+  }
+
+  return { buffers, tensorToBuffer, paramsByName, inputsByName, paramGradsByName, statesByName, capturesByName, outputBufferIds, writebacks }
 }
```
package/src/capture.ts
ADDED
```diff
@@ -0,0 +1,36 @@
+// Activation capture — opt-in readback of intermediate tensors at training step.
+//
+// Usage (inside the user's forward pass):
+//
+//   import { capture } from 'tensorgrad'
+//
+//   function attentionFwd(p, x) {
+//     const scores = mul(matmulBatched(q, kT), SCALE_QK)
+//     const attn = capture(`attn.${layerIdx}`, softmaxCausalLast(scores))
+//     return matmulBatched(attn, v)
+//   }
+//
+// Pass-through return type: `capture(name, t)` returns `t` unchanged so it
+// inlines at the point of computation. Behind the scenes it registers `t.id`
+// against `name` on the current graph; runtime exposes the registered tensors
+// via `step(inputs, { withCaptures: true })`.
+//
+// Outside the user's forward trace (during `appendGrad` / `appendAdam`'s
+// `traceInto` re-entry), `capture()` is a no-op — gradient and optimizer
+// internals shouldn't accidentally publish themselves to the UI.
+
+import type { Tensor } from './ir.js'
+import { currentGraph, isCaptureEnabled } from './trace.js'
+
+export function capture<T extends Tensor>(name: string, t: T): T {
+  if (!isCaptureEnabled()) return t
+  const g = currentGraph()
+  if (g.captures.has(name)) {
+    throw new Error(
+      `capture: name '${name}' already registered. Use unique names ` +
+      `(e.g. \`attn.\${layerIdx}\`) when capturing across a loop.`,
+    )
+  }
+  g.captures.set(name, t.id)
+  return t
+}
```
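As a usage sketch of the capture round trip, the fragment below tags one intermediate tensor and reads it back through the new `withCaptures` step option. The forward fragment, `compiled`, and `batch` are placeholders, and it assumes `matmul`/`add` are re-exported from the package root as the ops export list suggests.

```ts
import { capture, matmul, add, type Tensor } from 'tensorgrad'

// Placeholders for a compiled training runtime and one batch of inputs.
declare const compiled: {
  step(inputs: Record<string, Float32Array | Int32Array>, o?: { withCaptures?: boolean }):
    Promise<number | { loss: number; captures: Record<string, Float32Array> }>
}
declare const batch: Record<string, Float32Array | Int32Array>

// Forward-pass fragment: tag the pre-bias projection for readback.
// `p.W` / `p.b` are illustrative param tensors.
function projFwd(p: { W: Tensor; b: Tensor }, x: Tensor): Tensor {
  const h = capture('proj.pre_bias', matmul(x, p.W)) // pass-through, returns the matmul result
  return add(h, p.b)
}

// Training loop: captures are only read back when explicitly requested.
const res = await compiled.step(batch, { withCaptures: true })
if (typeof res !== 'number') {
  console.log('loss', res.loss)
  console.log('proj.pre_bias[0]', res.captures['proj.pre_bias']?.[0])
}
```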
package/src/codegen.ts
CHANGED
```diff
@@ -555,8 +555,10 @@ fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
       return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.v), buf(op.g), buf(op.out)], threads: total, workgroupSize: WG_SIZE }
     }
     case 'adam_update_p': {
-      // p_new = p - lrt[0] * m_new / (sqrt(v_new) + eps).
+      // p_new = decayShrink * p - lrt[0] * m_new / (sqrt(v_new) + eps).
       // lrt is supplied per-step from CPU (already includes bias correction).
+      // decayShrink encodes AdamW's decoupled weight decay; when no decay is
+      // requested it's exactly 1.0 and the WGSL compiler folds the multiply away.
       const out = tof(op.out)
       const total = shapeSize(out.shape)
       const wgsl = `
@@ -569,7 +571,7 @@ fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
 fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
   let i = gid.x + gid.y * 16776960u;
   if (i >= ${total}u) { return; }
-  out[i] = p[i] - lrt[0] * mNew[i] / (sqrt(vNew[i]) + ${wgslLiteral(op.eps, 'f32')});
+  out[i] = ${wgslLiteral(op.decayShrink, 'f32')} * p[i] - lrt[0] * mNew[i] / (sqrt(vNew[i]) + ${wgslLiteral(op.eps, 'f32')});
 }`.trim()
       return { opIndex, opKind: op.kind, wgsl, bindings: [buf(op.p), buf(op.mNew), buf(op.vNew), buf(op.lrt), buf(op.out)], threads: total, workgroupSize: WG_SIZE }
     }
```
package/src/compile.ts
CHANGED
```diff
@@ -14,7 +14,7 @@ import { appendGrad, type GradResult } from './grad.js'
 import { appendAdam, type AdamConfig } from './adam.js'
 import { planBuffers, type BufferPlan } from './buffers.js'
 import { emitKernels, type KernelSpec } from './codegen.js'
-import { createRuntime, type CompiledRuntime, type RuntimeOpts } from './runtime.js'
+import { createRuntime, createForwardRuntime, type CompiledRuntime, type CompiledForward, type RuntimeOpts } from './runtime.js'
 import { Module, materializeParams } from './module.js'
 
 /** Declares one input tensor of the model's forward function. Order matches
@@ -65,10 +65,19 @@ export interface CompileModuleOptions extends RuntimeOpts {
   adam?: AdamConfig
 }
 
+export interface CompileForwardOptions extends RuntimeOpts {
+  /** Per-step data inputs to the forward function. */
+  inputs?: InputDecl[]
+}
+
 /**
- * Compile a Module-based model.
- * model
- *
+ * Compile a Module-based model. Pass a *factory* `() => new Model()`, not the
+ * model instance itself: compilation mutates the tree (every `ParamSentinel`
+ * field becomes a real `Tensor`), so the instance is consumed and shouldn't be
+ * referenced afterwards. Re-call the factory if you need a fresh tree.
+ *
+ * The forward function takes the materialized model and returns the loss
+ * tensor.
  *
  * Walks the module tree to materialize params with auto-derived names, then
  * runs trace → grad → adam → buffer plan → codegen → runtime.
@@ -78,14 +87,15 @@ export interface CompileModuleOptions extends RuntimeOpts {
  * users don't need to provide it themselves.
  */
 export async function compileModule<M extends Module>(
-
+  modelFactory: () => M,
   forward: (m: M, ...inputs: Tensor[]) => Tensor,
   opts: CompileModuleOptions = {},
-): Promise<CompiledRuntime & { ir: CompiledIR }> {
+): Promise<CompiledRuntime & { ir: CompiledIR; uploadInitialParams: () => void }> {
   const inputDecls = opts.inputs ?? []
-
+  const model = modelFactory()
+  let materialized: ReturnType<typeof materializeParams> = { tensors: {}, initFns: {} }
   const graph = trace(() => {
-
+    materialized = materializeParams(model)
    const inputTensors = inputDecls.map(d => tensorInput(d.name, d.shape, d.dtype ?? 'f32'))
     return forward(model, ...inputTensors)
   })
@@ -94,7 +104,7 @@ export async function compileModule<M extends Module>(
 
   let adamResult: ReturnType<typeof appendAdam> | undefined
   if (opts.adam) {
-    adamResult = appendAdam(graph, paramGrads,
+    adamResult = appendAdam(graph, paramGrads, materialized.tensors, opts.adam)
   }
 
   const plan = planBuffers(graph, paramGrads, adamResult?.writebacks ?? [])
@@ -103,18 +113,107 @@ export async function compileModule<M extends Module>(
   const runtime = await createRuntime(plan, kernels, lossBufferId, opts)
 
   // If Adam is enabled, wrap step() to track the step count and supply lrt.
+  // Wrap resetOptimizerState() too, so a reset zeros m/v *and* the bias-correction
+  // counter — otherwise the next step would skip Adam's warmup phase.
   if (adamResult) {
     const { lrtInputName, config } = adamResult
     let t = 0
     const lrtBuf = new Float32Array(1)
-    const innerStep = runtime.step.bind(runtime)
-
+    const innerStep = runtime.step.bind(runtime) as CompiledRuntime['step']
+    const innerReset = runtime.resetOptimizerState.bind(runtime)
+    const wrappedStep = (
+      inputs: Record<string, Int32Array | Float32Array>,
+      opts?: { withCaptures?: boolean },
+    ): Promise<number | { loss: number; captures: Record<string, Float32Array> }> => {
       t++
       lrtBuf[0] = config.lr * Math.sqrt(1 - Math.pow(config.b2, t)) / (1 - Math.pow(config.b1, t))
-
+      const merged = { ...inputs, [lrtInputName]: lrtBuf }
+      return opts?.withCaptures ? innerStep(merged, { withCaptures: true }) : innerStep(merged)
+    }
+    runtime.step = wrappedStep as CompiledRuntime['step']
+    runtime.resetOptimizerState = () => {
+      t = 0
+      innerReset()
+    }
+  }
+
+  const { initFns } = materialized
+  const uploadInitialParams = () => {
+    const out: Record<string, Float32Array> = {}
+    for (const [name, bufId] of plan.paramsByName) {
+      const shape = plan.buffers[bufId]!.shape
+      const size = shape.reduce((a, b) => a * b, 1)
+      const initFn = initFns[name]
+      if (!initFn) throw new Error(`uploadInitialParams: no init for param '${name}'`)
+      out[name] = initFn(size, shape)
     }
+    runtime.uploadParams(out)
   }
 
   const ir: CompiledIR = { graph, paramGrads, loss, plan, kernels }
-  return Object.assign(runtime, { ir })
+  return Object.assign(runtime, { ir, uploadInitialParams })
+}
+
+// ============================================================================
+// Forward-only compile
+// ============================================================================
+
+/**
+ * Compile a Module-based model in forward-only mode (no autograd, no Adam).
+ * The forward function returns the output tensor (e.g., logits) instead of a
+ * scalar loss; runtime exposes `run(inputs)` returning the full output as a
+ * `Float32Array`.
+ *
+ * **Sharing params with a training compile.** Pass `opts.sharedParams =
+ * trainCompiled.params` to bind this graph's param buffers to an existing
+ * training runtime's GPU buffers — every train step is then immediately
+ * visible to `run()` calls here, no copies. The forward graph's
+ * `uploadInitialParams()` skips any param covered by `sharedParams`.
+ *
+ * Typical use: a B=1 inference graph alongside a B=512 training graph,
+ * built from the same `Module` factory.
+ */
+export async function compileForward<M extends Module>(
+  modelFactory: () => M,
+  forward: (m: M, ...inputs: Tensor[]) => Tensor,
+  opts: CompileForwardOptions = {},
+): Promise<CompiledForward & { ir: CompiledIR; uploadInitialParams: () => void }> {
+  const inputDecls = opts.inputs ?? []
+  const model = modelFactory()
+  let materialized: ReturnType<typeof materializeParams> = { tensors: {}, initFns: {} }
+  const graph = trace(() => {
+    materialized = materializeParams(model)
+    const inputTensors = inputDecls.map(d => tensorInput(d.name, d.shape, d.dtype ?? 'f32'))
+    return forward(model, ...inputTensors)
+  })
+
+  const plan = planBuffers(graph, /* paramGrads */ {})
+  const kernels = emitKernels(graph, plan)
+  const outputTensor = graph.tensors[graph.outputs[0]!]!
+  const outputBufferId = plan.tensorToBuffer.get(outputTensor.id)!
+  const runtime = await createForwardRuntime(plan, kernels, outputBufferId, opts)
+
+  const sharedParams = opts.sharedParams
+  const { initFns } = materialized
+  const uploadInitialParams = () => {
+    const out: Record<string, Float32Array> = {}
+    let needsUpload = false
+    for (const [name, bufId] of plan.paramsByName) {
+      // Skip params covered by sharedParams — those are owned by the providing
+      // compile and already initialized there.
+      if (sharedParams?.has(name)) continue
+      const shape = plan.buffers[bufId]!.shape
+      const size = shape.reduce((a, b) => a * b, 1)
+      const initFn = initFns[name]
+      if (!initFn) throw new Error(`uploadInitialParams: no init for param '${name}'`)
+      out[name] = initFn(size, shape)
+      needsUpload = true
+    }
+    if (needsUpload) runtime.uploadParams(out, { partial: !!sharedParams })
+  }
+
+  // CompiledIR.loss is the field name; for forward-only, it carries the user's
+  // returned tensor (e.g., logits). Same shape conceptually; just no autograd.
+  const ir: CompiledIR = { graph, paramGrads: {}, loss: outputTensor, plan, kernels }
+  return Object.assign(runtime, { ir, uploadInitialParams })
 }
```
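A sketch of the factory-based call and the train/inference pairing the comments above describe. The model, forwards, and shapes are placeholders; `sharedParams: train.params` follows the `compileForward` doc comment, and the loss forward is left abstract since this diff does not show a loss op.

```ts
import { compileModule, compileForward, nn, Module, type Tensor } from 'tensorgrad'

// Placeholder model; the factory is re-invoked per compile so each graph gets
// a fresh, un-mutated module tree.
class TinyModel extends Module {
  lin = new nn.Linear(16, 4)
}
const factory = () => new TinyModel()

// Placeholder forwards: lossFwd is assumed to reduce to a scalar loss tensor.
declare function lossFwd(m: TinyModel, x: Tensor, y: Tensor): Tensor
const logitsFwd = (m: TinyModel, x: Tensor) => nn.linearFwd(m.lin, x)

// Training compile (B = 512): autograd + AdamW, init params once on the GPU.
const train = await compileModule(factory, lossFwd, {
  inputs: [{ name: 'x', shape: [512, 16] }, { name: 'y', shape: [512, 4] }],
  adam: { lr: 1e-3, weightDecay: 0.01 },
})
train.uploadInitialParams()

// Inference compile (B = 1) bound to the training params: train steps are
// immediately visible to run(), and uploadInitialParams() skips shared params.
const infer = await compileForward(factory, logitsFwd, {
  inputs: [{ name: 'x', shape: [1, 16] }],
  sharedParams: train.params, // per the compileForward doc comment
})
infer.uploadInitialParams()
```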
package/src/index.ts
CHANGED
```diff
@@ -6,6 +6,7 @@
 export type { Tensor, Shape, Dtype, OpNode, Graph, CallSite } from './ir.js'
 export { ShapeError } from './shape.js'
 export { trace, traceInto, paramInput, tensorInput, stateInput } from './trace.js'
+export { capture } from './capture.js'
 export {
   // Element-wise arithmetic. The binops accept Tensor or JS-number for the second arg.
   add, sub, mul, div,
@@ -35,6 +36,7 @@ export { appendGrad, type GradResult } from './grad.js'
 export { appendAdam, type AdamConfig, type AdamResult } from './adam.js'
 export { planBuffers, type BufferPlan, type BufferSpec, type Writeback, type WritebackDecl } from './buffers.js'
 export { emitKernels, type KernelSpec } from './codegen.js'
-export { createRuntime, type CompiledRuntime, type RuntimeOpts } from './runtime.js'
-export { compile, compileToIR, compileModule, type CompiledIR, type CompileModuleOptions, type InputDecl } from './compile.js'
-export { Module, materializeParams } from './module.js'
+export { createRuntime, createForwardRuntime, type CompiledRuntime, type CompiledForward, type RuntimeOpts, type StepOptions, type StepWithCaptures, type RunOptions, type RunWithCaptures } from './runtime.js'
+export { compile, compileToIR, compileModule, compileForward, type CompiledIR, type CompileModuleOptions, type CompileForwardOptions, type InputDecl } from './compile.js'
+export { Module, materializeParams, type InitSpec, type ParamOptions, type MaterializedParams } from './module.js'
+export * as nn from './nn.js'
```
package/src/ir.ts
CHANGED
```diff
@@ -109,11 +109,13 @@ export type OpNode =
   // update into ~12 element-wise dispatches per param.
   | { kind: 'adam_update_m'; out: number; m: number; g: number; b1: number }
   | { kind: 'adam_update_v'; out: number; v: number; g: number; b2: number }
-  // adam_update_p: p_new = p - lrt[0] * m_new / (sqrt(v_new) + eps).
+  // adam_update_p: p_new = decayShrink * p - lrt[0] * m_new / (sqrt(v_new) + eps).
   // `lrt` is a scalar tensor (provided as a tensor_input updated per step) that
   // already includes Adam's bias-correction factor: lrt = lr * sqrt(1-b2^t) / (1-b1^t).
-  //
-  | { kind: 'adam_update_p'; out: number; p: number; mNew: number; vNew: number; lrt: number; eps: number }
+  // `decayShrink` is the decoupled-weight-decay factor (Loshchilov & Hutter,
+  // "AdamW") baked at compile time: 1 - lr * weightDecay when the param is being
+  // decayed, 1 otherwise. eps and decayShrink are both baked into the kernel.
+  | { kind: 'adam_update_p'; out: number; p: number; mNew: number; vNew: number; lrt: number; eps: number; decayShrink: number }
 
   // ---- Slicing / broadcasting / autograd infrastructure -------------------
   // Slice [start, end) along the last axis. Output shape: input shape with
@@ -139,10 +141,14 @@ export interface Graph {
   // Names of tensors that should be exposed as outputs of the compiled function.
   // Set by the trace driver; for a loss function, this is `[lossTensor]`.
   readonly outputs: number[]
+  // Tensors registered for activation readback via `capture(name, t)`.
+  // Keyed by user-supplied name; insertion order preserved. Empty when no
+  // captures registered (the common training case — zero overhead).
+  readonly captures: Map<string, number>
 }
 
 export function makeGraph(): Graph {
-  return { ops: [], tensors: [], outputs: [] }
+  return { ops: [], tensors: [], outputs: [], captures: new Map() }
 }
 
 // Internal: register a fresh tensor in the graph and return its id.
```
package/src/module.ts
CHANGED
```diff
@@ -6,8 +6,8 @@
 //   W: Tensor; b: Tensor
 //   constructor(inDim: number, outDim: number) {
 //     super()
-//     this.W = this.param([inDim, outDim])
-//     this.b = this.param([outDim])
+//     this.W = this.param([inDim, outDim])              // randn, scale 0.02
+//     this.b = this.param([outDim], { init: 'zeros' })
 //   }
 // }
 // class Block extends Module {
@@ -28,6 +28,54 @@
 import type { Tensor, Shape, Dtype } from './ir.js'
 import { paramInput } from './trace.js'
 
+// ============================================================================
+// Init metadata
+// ============================================================================
+
+/** How a parameter's initial values are produced.
+ * - `'randn'` — Gaussian, with `scale` (default 0.02). The common case for
+ *   weight matrices and embeddings.
+ * - `'zeros'` — fill with 0. Common for biases and LayerNorm beta.
+ * - `'ones'` — fill with 1. Common for LayerNorm gain.
+ * - Custom function — receives total element count and shape, returns the
+ *   Float32Array. Use for fan-in scaling or any non-standard scheme.
+ */
+export type InitSpec =
+  | 'randn'
+  | 'zeros'
+  | 'ones'
+  | ((size: number, shape: readonly number[]) => Float32Array)
+
+export interface ParamOptions {
+  dtype?: Dtype
+  /** Init kind. Default: `'randn'`. */
+  init?: InitSpec
+  /** Std dev for `'randn'`. Default 0.02. Ignored for non-randn init. */
+  scale?: number
+}
+
+type InitFn = (size: number, shape: readonly number[]) => Float32Array
+
+function boxMuller(): number {
+  return Math.sqrt(-2 * Math.log(Math.max(1e-10, Math.random()))) * Math.cos(2 * Math.PI * Math.random())
+}
+
+function resolveInit(opts: ParamOptions | undefined): InitFn {
+  const init = opts?.init ?? 'randn'
+  if (init === 'randn') {
+    const scale = opts?.scale ?? 0.02
+    return (size) => {
+      const arr = new Float32Array(size)
+      for (let i = 0; i < size; i++) arr[i] = boxMuller() * scale
+      return arr
+    }
+  }
+  if (init === 'zeros') return (size) => new Float32Array(size)
+  if (init === 'ones') return (size) => { const a = new Float32Array(size); a.fill(1); return a }
+  if (typeof init === 'function') return init
+  throw new Error(`Unknown init: ${String(init)}`)
+}
+
 // ============================================================================
 // Internals: param sentinel
 // ============================================================================
@@ -38,7 +86,11 @@ import { paramInput } from './trace.js'
 // only valid post-materialization (which is always before forward runs).
 
 class ParamSentinel {
-  constructor(
+  constructor(
+    public readonly shape: Shape,
+    public readonly dtype: Dtype,
+    public readonly initFn: InitFn,
+  ) {}
 }
 
 // ============================================================================
@@ -52,11 +104,13 @@ export abstract class Module {
    * that gets replaced with a real Tensor at compile time.
    *
    * The parameter's name is auto-derived from its property path in the model
-   * tree (e.g. `layers.0.attn.W_q`).
+   * tree (e.g. `layers.0.attn.W_q`). Init metadata travels with the param;
+   * call `compiled.uploadInitialParams()` to apply it after compile.
    */
-  protected param(shape: Shape,
+  protected param(shape: Shape, opts?: ParamOptions): Tensor {
+    const dtype = opts?.dtype ?? 'f32'
     // Lie to TypeScript: the sentinel becomes a Tensor at materialize time.
-    return new ParamSentinel(shape, dtype) as unknown as Tensor
+    return new ParamSentinel(shape, dtype, resolveInit(opts)) as unknown as Tensor
   }
 }
 
@@ -64,23 +118,33 @@
 // Tree walking
 // ============================================================================
 
+export interface MaterializedParams {
+  /** Map from auto-derived path (e.g. `layers.0.attn.W_q`) to its Tensor. */
+  tensors: Record<string, Tensor>
+  /** Init function per param path. Used by `uploadInitialParams`. */
+  initFns: Record<string, InitFn>
+}
+
 /**
  * Walk the module tree and replace every ParamSentinel with a real Tensor
  * created via `paramInput(autoName, ...)`. Must be called inside an active
  * trace context (paramInput appends to the current graph).
  *
- * Returns
+ * Returns the param tensors keyed by path, plus init functions for use by
+ * `uploadInitialParams`.
  */
-export function materializeParams(root: Module):
-  const
+export function materializeParams(root: Module): MaterializedParams {
+  const tensors: Record<string, Tensor> = {}
+  const initFns: Record<string, InitFn> = {}
   visit(root, '', (path, val, owner, key) => {
     if (val instanceof ParamSentinel) {
       const t = paramInput(path, val.shape, val.dtype)
       ;(owner as any)[key] = t
-
+      tensors[path] = t
+      initFns[path] = val.initFn
     }
   })
-  return
+  return { tensors, initFns }
 }
 
 // ----------------------------------------------------------------------------
```
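The new `ParamOptions` surface is small; here is a short sketch of the built-in inits plus a custom fan-in-scaled initializer. The scaling choice and the module are illustrative only.

```ts
import { Module, type InitSpec, type Tensor } from 'tensorgrad'

// Custom initializer: Gaussian scaled by 1/sqrt(fan_in), taking fan_in from
// the first dimension of the param's shape (same Box-Muller draw as 'randn').
const fanInInit: InitSpec = (size, shape) => {
  const std = 1 / Math.sqrt(shape[0] ?? 1)
  const arr = new Float32Array(size)
  for (let i = 0; i < size; i++) {
    const u = Math.max(1e-10, Math.random())
    arr[i] = Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * Math.random()) * std
  }
  return arr
}

class MLPLayer extends Module {
  W: Tensor
  b: Tensor
  gain: Tensor
  constructor(inDim: number, outDim: number) {
    super()
    this.W = this.param([inDim, outDim], { init: fanInInit })
    this.b = this.param([outDim], { init: 'zeros' })
    this.gain = this.param([outDim], { init: 'ones' })
  }
}
```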
package/src/nn.ts
ADDED
```diff
@@ -0,0 +1,59 @@
+// Standard "batteries-included" Module subclasses for the most common layers.
+//
+// JAX-style: each class declares its params (and their init); the forward is a
+// plain function the user calls with `(module, x)`. No subclassing, no method
+// dispatch — keeps the autograd-traced computation visible at the call site.
+//
+// Import as a namespace:
+//
+//   import { nn } from 'tensorgrad'
+//   class Block extends Module {
+//     ln = new nn.LayerNorm(D)
+//     ffn = new nn.Linear(D, 4 * D)
+//   }
+//   const y = nn.linearFwd(p.ffn, nn.layerNormFwd(p.ln, x))
+
+import { Module } from './module.js'
+import type { Tensor } from './ir.js'
+import { add, matmul, sub, mul, div, sqrt, meanLast } from './ops.js'
+
+// ----------------------------------------------------------------------------
+// Linear: y = x @ W (+ b)
+// ----------------------------------------------------------------------------
+
+export class Linear extends Module {
+  W: Tensor
+  b: Tensor | null
+  constructor(public readonly inDim: number, public readonly outDim: number, withBias = true) {
+    super()
+    this.W = this.param([inDim, outDim]) // randn, scale 0.02
+    this.b = withBias ? this.param([outDim], { init: 'zeros' }) : null
+  }
+}
+
+export function linearFwd(p: Linear, x: Tensor): Tensor {
+  const out = matmul(x, p.W)
+  return p.b ? add(out, p.b) : out
+}
+
+// ----------------------------------------------------------------------------
+// LayerNorm — normalizes over the last axis. eps defaults to 1e-5.
+// ----------------------------------------------------------------------------
+
+export class LayerNorm extends Module {
+  g: Tensor
+  b: Tensor
+  constructor(public readonly d: number, public readonly eps: number = 1e-5) {
+    super()
+    this.g = this.param([d], { init: 'ones' })
+    this.b = this.param([d], { init: 'zeros' })
+  }
+}
+
+export function layerNormFwd(p: LayerNorm, x: Tensor): Tensor {
+  const m = meanLast(x)
+  const c = sub(x, m)
+  const v = meanLast(mul(c, c))
+  const stdev = sqrt(add(v, p.eps))
+  return add(mul(div(c, stdev), p.g), p.b)
+}
```
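A composition sketch using the two layers together in a residual block; the block itself is illustrative, and no nonlinearity is shown because this diff does not surface an activation op at the package root.

```ts
import { nn, Module, add, type Tensor } from 'tensorgrad'

// Illustrative pre-norm residual MLP block built from the new nn layers.
class ResidualMLP extends Module {
  ln = new nn.LayerNorm(64)
  up = new nn.Linear(64, 256)
  down = new nn.Linear(256, 64, /* withBias */ false)
}

function residualMLPFwd(p: ResidualMLP, x: Tensor): Tensor {
  // Normalize, project up, project back down, then add the residual.
  const h = nn.linearFwd(p.down, nn.linearFwd(p.up, nn.layerNormFwd(p.ln, x)))
  return add(x, h)
}
```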
package/src/ops.ts
CHANGED
```diff
@@ -297,7 +297,7 @@ export function adamUpdateV(v: Tensor, g: Tensor, b2: number): Tensor {
   return addOp(currentGraph(), 'adam_update_v', v.shape, 'f32', site, { v: v.id, g: g.id, b2 })
 }
 
-export function adamUpdateP(p: Tensor, mNew: Tensor, vNew: Tensor, lrt: Tensor, eps: number): Tensor {
+export function adamUpdateP(p: Tensor, mNew: Tensor, vNew: Tensor, lrt: Tensor, eps: number, decayShrink: number = 1): Tensor {
   const site = captureSite('adamUpdateP')
   if (p.dtype !== 'f32') throw new ShapeError(`adamUpdateP: requires f32`, site)
   if (lrt.dtype !== 'f32' || lrt.shape.length !== 0) {
@@ -307,5 +307,5 @@ export function adamUpdateP(p: Tensor, mNew: Tensor, vNew: Tensor, lrt: Tensor,
     throw new ShapeError(`adamUpdateP: p/mNew shape mismatch`, site)
   }
   return addOp(currentGraph(), 'adam_update_p', p.shape, 'f32', site,
-    { p: p.id, mNew: mNew.id, vNew: vNew.id, lrt: lrt.id, eps })
+    { p: p.id, mNew: mNew.id, vNew: vNew.id, lrt: lrt.id, eps, decayShrink })
 }
```