@danielsimonjr/mathts-autograd 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/dist/index.d.ts +321 -2
  2. package/dist/index.js +1504 -5
  3. package/package.json +58 -58
package/dist/index.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { Tensor } from '@danielsimonjr/mathts-tensor';
1
+ import { Tensor, Index } from '@danielsimonjr/mathts-tensor';
2
2
 
3
3
  /**
4
4
  * DualTensor — a Tensor + per-element tangent component for forward-mode AD.
@@ -109,6 +109,7 @@ declare function forwardGrad(fn: (x: Tensor) => Tensor, x: Tensor): {
109
109
  * tape in reverse, accumulating gradients into each input slot.
110
110
  *
111
111
  * v0.1.0 supports the same ops as DualTensor: add, sub, mul, scale.
112
+ * v0.2.0 adds: contract (named-index), matmul (batched rank-N).
112
113
  */
113
114
 
114
115
  type BackwardFn = (outputGrad: Float64Array) => void;
@@ -135,7 +136,12 @@ declare class TapedTensor {
135
136
  readonly primal: Float64Array;
136
137
  readonly tape: Tape;
137
138
  readonly id: number;
138
- constructor(shape: ReadonlyArray<number>, primal: Float64Array, tape: Tape, id: number);
139
+ /**
140
+ * Optional per-axis Index labels. When set, enables `contract`.
141
+ * Must have the same length as `shape` when present.
142
+ */
143
+ readonly axisLabels?: ReadonlyArray<Index>;
144
+ constructor(shape: ReadonlyArray<number>, primal: Float64Array, tape: Tape, id: number, axisLabels?: ReadonlyArray<Index>);
139
145
  /**
140
146
  * S5 fix: existing engine ops (e.g. lower, pderiv, contract) reach into
141
147
  * `.data`. The getter returns the primal so those ops still work when a
@@ -145,11 +151,324 @@ declare class TapedTensor {
145
151
  get data(): Float64Array;
146
152
  static fromTensorAsInput(t: Tensor, tape: Tape): TapedTensor;
147
153
  toPrimalTensor(): Tensor;
154
+ /**
155
+ * Reconstruct the Tensor primal with the given axisLabels (used internally
156
+ * when the primal was computed from an op that produces labelled output).
157
+ */
158
+ private toPrimalTensorWith;
148
159
  add(other: TapedTensor): TapedTensor;
149
160
  sub(other: TapedTensor): TapedTensor;
161
+ /**
162
+ * Elementwise division: this / other.
163
+ *
164
+ * Adjoints (quotient rule):
165
+ * dA = dY / b
166
+ * dB = −dY · a / b²
167
+ *
168
+ * The alias case (a.divide(a)) is handled explicitly: the combined gradient
169
+ * is dA + dB = dY/a − dY·a/a² = dY/a − dY/a = 0. This is correct since
170
+ * a/a = 1 everywhere and d(1)/da = 0.
171
+ */
172
+ divide(other: TapedTensor): TapedTensor;
150
173
  mul(other: TapedTensor): TapedTensor;
151
174
  scale(k: number): TapedTensor;
175
+ /**
176
+ * Reverse-mode AD over `Tensor.contract`.
177
+ *
178
+ * Both operands must carry `axisLabels`; the resulting TapedTensor inherits
179
+ * the contracted-output axisLabels (non-shared axis concatenation).
180
+ *
181
+ * Adjoint derivation (T-notation):
182
+ * Let Y = A.contract(B) (contraction over shared indices S).
183
+ * dA = dY.contract(B') where B' = B with its free axes re-labelled to
184
+ * match the shared positions in A. Equivalently: for each element of A,
185
+ * dA[...a_free, ...s] = Σ_{...b_free} dY[...a_free, ...b_free] · B[...s, ...b_free]
186
+ * dB = A'.contract(dY) symmetrically.
187
+ *
188
+ * Implementation: because `Tensor.contract` matches by Index identity, we
189
+ * build the backward contraction by re-labelling the free axes of dY with
190
+ * the Index objects from the other operand — so `Tensor.contract` automatically
191
+ * finds the right shared axes.
192
+ */
193
+ contract(other: TapedTensor): TapedTensor;
194
+ /**
195
+ * Reverse-mode AD over batched matmul.
196
+ *
197
+ * Accepts operands of rank ≥ 2. Convention: trailing 2 axes are the matrix
198
+ * dims (rows × cols); all leading axes are batch dims (must be broadcast-
199
+ * compatible). For rank-2 inputs this is classical matrix multiplication.
200
+ *
201
+ * Adjoint derivation (classical matrix calc, extended to batched):
202
+ * Y = A @ B (A: ...×m×k, B: ...×k×n → Y: ...×m×n)
203
+ * dA = dY @ Bᵀ (dA: ...×m×k)
204
+ * dB = Aᵀ @ dY (dB: ...×k×n)
205
+ *
206
+ * Implemented via `Tensor.einsum` with a dynamically-built spec:
207
+ * forward: '...ik,...kj->...ij'
208
+ * dA: '...ij,...kj->...ik' (contract dY with B on j)
209
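+ * dB: '...ki,...kj->...ij' (contract A's row axis with dY's row axis; letters are re-used, so `k` here is the forward spec's row index `i`, not the inner dim)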
210
+ */
211
+ matmul(other: TapedTensor): TapedTensor;
152
212
  private checkSameShape;
213
+ /**
214
+ * Sum elements along the given axis/axes (or all axes if omitted).
215
+ *
216
+ * Adjoint: dX[...] = dY[reduced(idx)] broadcast back to input shape.
217
+ * Each input element receives the output-gradient entry from its reduced
218
+ * counterpart (the non-reduced coordinates select the dY element; the
219
+ * reduced coordinates are dropped when keepDims=false, or fixed at 0 when keepDims=true).
220
+ */
221
+ sum(axis?: number | ReadonlyArray<number>, opts?: {
222
+ keepDims?: boolean;
223
+ }): TapedTensor;
224
+ /**
225
+ * Arithmetic mean along the given axis/axes (or all axes if omitted).
226
+ *
227
+ * Adjoint: dX[...] = dY[reduced(idx)] / N, broadcast back to input shape.
228
+ * N = product of reduced-axis dimensions.
229
+ */
230
+ mean(axis?: number | ReadonlyArray<number>, opts?: {
231
+ keepDims?: boolean;
232
+ }): TapedTensor;
233
+ /**
234
+ * Product of elements along the given axis/axes (or all axes if omitted).
235
+ *
236
+ * Adjoint: dX_i = dY * (prod_over_axes(x) / x_i) per element.
237
+ *
238
+ * Zero-element corners:
239
+ * - Exactly one x_i = 0 in the reduced group: d/dx_i = product of all others
240
+ * (which is the full product / x_i evaluated via alternate product),
241
+ * and d/dx_j = 0 for all j ≠ i where x_j ≠ 0.
242
+ * - Two or more zeros in the reduced group: gradient is 0 everywhere for that
243
+ * group (because changing any single zero cannot change a product that is
244
+ * zero due to another zero).
245
+ *
246
+ * Implementation: uses prefix/suffix products to handle zeros robustly.
247
+ */
248
+ prod(axis?: number | ReadonlyArray<number>, opts?: {
249
+ keepDims?: boolean;
250
+ }): TapedTensor;
251
+ /**
252
+ * Maximum value along the given axis/axes (or all axes if omitted).
253
+ *
254
+ * Adjoint: dY is scattered to the argmax position(s).
255
+ * Tie-breaking: "first-wins" — the gradient flows to the first (smallest
256
+ * flat-index) element among those that attain the maximum.
257
+ */
258
+ max(axis?: number | ReadonlyArray<number>, opts?: {
259
+ keepDims?: boolean;
260
+ }): TapedTensor;
261
+ /**
262
+ * Minimum value along the given axis/axes (or all axes if omitted).
263
+ *
264
+ * Adjoint: dY is scattered to the argmin position(s).
265
+ * Tie-breaking: "first-wins" — gradient flows to the first (smallest
266
+ * flat-index) element that attains the minimum.
267
+ */
268
+ min(axis?: number | ReadonlyArray<number>, opts?: {
269
+ keepDims?: boolean;
270
+ }): TapedTensor;
271
+ /**
272
+ * p-norm of the tensor.
273
+ *
274
+ * Supported p values: 1, 2, 'fro', 'inf'. Default p = 2.
275
+ * When `opts.axis` is given, reduces along that axis; otherwise reduces all axes.
276
+ *
277
+ * Adjoints:
278
+ * - p=2 / p='fro': dX = dY · x / ‖x‖₂ (Frobenius is the 2-norm of the flattened tensor)
279
+ * - p=1: dX = dY · sign(x) (subgradient = 0 at exact zero)
280
+ * - p='inf': dX scattered to the element(s) of max absolute value;
281
+ * tie-breaking: first-wins. Sign of the scattered gradient
282
+ * matches sign(x_max).
283
+ */
284
+ norm(opts?: {
285
+ p?: 1 | 2 | 'fro' | 'inf';
286
+ axis?: number;
287
+ keepDims?: boolean;
288
+ }): TapedTensor;
289
+ /**
290
+ * Elementwise natural logarithm.
291
+ *
292
+ * Adjoint: dX = dY / x
293
+ */
294
+ log(): TapedTensor;
295
+ /**
296
+ * Elementwise exponential.
297
+ *
298
+ * Adjoint: dX = dY · y where y = exp(x). Primal output is cached.
299
+ */
300
+ exp(): TapedTensor;
301
+ /**
302
+ * Elementwise sine.
303
+ *
304
+ * Adjoint: dX = dY · cos(x)
305
+ */
306
+ sin(): TapedTensor;
307
+ /**
308
+ * Elementwise cosine.
309
+ *
310
+ * Adjoint: dX = −dY · sin(x)
311
+ */
312
+ cos(): TapedTensor;
313
+ /**
314
+ * Elementwise tangent.
315
+ *
316
+ * Adjoint: dX = dY / cos²(x) (= dY · sec²(x))
317
+ */
318
+ tan(): TapedTensor;
319
+ /**
320
+ * Elementwise square root.
321
+ *
322
+ * Adjoint: dX = dY / (2·y) where y = sqrt(x). Primal output is cached.
323
+ */
324
+ sqrt(): TapedTensor;
325
+ /**
326
+ * Elementwise square (x²).
327
+ *
328
+ * Adjoint: dX = dY · 2x
329
+ */
330
+ square(): TapedTensor;
331
+ /**
332
+ * Elementwise fixed-exponent power: x^k.
333
+ *
334
+ * Only fixed (non-TapedTensor) exponents are supported. Variable-exponent
335
+ * pow(taped, taped) is a follow-up slice.
336
+ *
337
+ * Adjoint: dX = dY · k · x^(k−1)
338
+ */
339
+ pow(k: number): TapedTensor;
340
+ /**
341
+ * Elementwise reciprocal: 1 / x.
342
+ *
343
+ * Adjoint: dX = −dY / x²
344
+ */
345
+ reciprocal(): TapedTensor;
346
+ /**
347
+ * Elementwise absolute value: |x|.
348
+ *
349
+ * Adjoint: dX = dY · sign(x)
350
+ * Subgradient at exact zero is defined as 0 (rather than undefined).
351
+ */
352
+ abs(): TapedTensor;
353
+ /**
354
+ * Reverse-mode AD over `Tensor.tensordot`.
355
+ *
356
+ * `axes[i] = [a, b]` contracts axis `a` of `this` with axis `b` of `other`.
357
+ * The result shape is `this`'s non-contracted axes (in original order)
358
+ * followed by `other`'s non-contracted axes (in original order).
359
+ *
360
+ * Adjoint derivation (NumPy/PyTorch tensordot backward, see Townsend 2016 §6,
361
+ * and PyTorch's `tensordot`, a composite op implemented in
362
+ * `aten/src/ATen/native/Linear.cpp` and differentiated through its permute/reshape/mm steps):
363
+ *
364
+ * Z = tensordot(A, B, axes)
365
+ * dA = tensordot(dZ, B, [axes_of_dZ_corresponding_to_B's_free, B's_free])
366
+ * then permute back into A's original axis order.
367
+ * dB = tensordot(A, dZ, [A's_free, axes_of_dZ_corresponding_to_A's_free])
368
+ * then permute back into B's original axis order.
369
+ *
370
+ * The axis-permutation bookkeeping is the trickiest part: Tensor.tensordot
371
+ * produces output axes in the order [A's free axes (original A order),
372
+ * B's free axes (original B order)] — and after the backward contractions
373
+ * the survivors come out in B's original (resp. A's original) axis order
374
+ * for the contracted side, which then needs to be permuted into pair order
375
+ * (so axis k of the contracted block matches the kth pair) and finally
376
+ * scattered back into A's (resp. B's) full original axis order.
377
+ *
378
+ * For the rank-2 × rank-2 single-axis case (i.e. ordinary matmul A·B with
379
+ * axes = [[1, 0]]), this reduces to dA = dZ · Bᵀ, dB = Aᵀ · dZ — the same
380
+ * adjoint as `TapedTensor.matmul`.
381
+ */
382
+ tensordot(other: TapedTensor, axes: ReadonlyArray<readonly [number, number]>): TapedTensor;
383
+ /**
384
+ * Reverse-mode AD over the thin (reduced) SVD of a rank-2 matrix.
385
+ *
386
+ * Forward: `A = U · diag(S) · Vt`, where for input shape [m, n], k = min(m, n):
387
+ * - U has shape [m, k]
388
+ * - S has shape [k]
389
+ * - Vt has shape [k, n] (Vt is V^T in the standard A = U Σ V^T convention,
390
+ * i.e. its rows are right-singular-vector components)
391
+ *
392
+ * Returned TapedTensors share a single backward closure. When backward()
393
+ * runs, it pulls dU, dS, dV from each output's gradient slot, assembles
394
+ * dA, and writes to the input's gradient slot.
395
+ *
396
+ * Adjoint (real, distinct nonzero singular values, m = n square case;
397
+ * extended to rectangular below). Derived directly from the forward
398
+ * Jacobian; equivalent to PyTorch's `svd_backward`
399
+ * (`torch/csrc/autograd/FunctionsManual.cpp`) and Townsend (2016)
400
+ * "Differentiating the Singular Value Decomposition" §3:
401
+ *
402
+ * Let α = skew(U^T · dU), β = skew(V^T · dV) (k×k, antisymmetric)
403
+ * Build C (k×k):
404
+ * C[i,i] = dS[i]
405
+ * C[i,j] = (α[i,j] + β[i,j]) / (s_j − s_i)
406
+ * + (α[i,j] − β[i,j]) / (s_j + s_i) for i ≠ j
407
+ * dA_in = U · C · V^T (m×n in-subspace part)
408
+ *
409
+ * Rectangular correction (when m > k, i.e. m > n):
410
+ * dA += (I − U U^T) · dU · diag(1/s) · V^T
411
+ * Rectangular correction (when n > k, i.e. n > m):
412
+ * dA += U · diag(1/s) · dV^T · (I − V V^T)
413
+ *
414
+ * Regularisation at repeated/near-zero singular values (PyTorch-equivalent
415
+ * subgradient choice): the (i,j) entry of C is masked to 0 whenever
416
+ * `|s_j − s_i| < REL_TOL · max(|s|)` (the "difference" denominator) or
417
+ * `|s_j + s_i| < REL_TOL · max(|s|)` (the "sum" denominator, only relevant
418
+ * when both are ~0). REL_TOL = 1e-10. This makes the gradient a subgradient
419
+ * at exact degeneracy — finite, but not the unique true derivative (which
420
+ * does not exist at degeneracies). The rectangular correction also masks
421
+ * 1/s_i when |s_i| < REL_TOL · max(|s|).
422
+ *
423
+ * Throws if input is not rank-2. For rank > 2 inputs the user should
424
+ * reshape first.
425
+ */
426
+ svd(): {
427
+ U: TapedTensor;
428
+ S: TapedTensor;
429
+ V: TapedTensor;
430
+ };
431
+ /**
432
+ * Reverse-mode AD over the eigendecomposition of a rank-2 matrix.
433
+ *
434
+ * Symmetric path (`symmetric: true`):
435
+ * Forward: `A = U · diag(Λ) · U^T` for symmetric A (n×n).
436
+ * Adjoint (Magnus & Neudecker 1999 §10.6.6; PyTorch `linalg_eigh_backward`):
437
+ * F[i,j] = 1 / (λ_j − λ_i) for i ≠ j, 0 otherwise (with degeneracy mask)
438
+ * dA_raw = U · (diag(dΛ) + F ∘ (U^T · dU)) · U^T
439
+ * dA = (dA_raw + dA_raw^T) / 2 (symmetrise)
440
+ *
441
+ * Non-symmetric path (`symmetric: false`):
442
+ * Forward: `A = V · diag(λ) · V^{-1}` (V columns are right eigenvectors).
443
+ * Adjoint (Magnus & Neudecker 1999 §10.6 / Giles 2008 §3.2 / Townsend 2016 §4;
444
+ * cross-check: PyTorch `linalg_eig_backward`):
445
+ * E[i,j] = 1 / (λ_j − λ_i) for i ≠ j, 0 otherwise (with degeneracy mask)
446
+ * dA = V^{-T} · ( E ∘ (V^T · dV) + diag(dλ) ) · V^T
447
+ *
448
+ * Restrictions (all enforced — throw a clear error otherwise):
449
+ * 1. Eigenvalues must be real. The underlying matrix-eig primitive returns
450
+ * placeholder eigenvectors (not actual complex vectors) when complex
451
+ * eigenvalues arise, so the adjoint formula cannot be evaluated. Real-
452
+ * Schur differentiation would require complex arithmetic infrastructure
453
+ * throughout the Tape/TapedTensor stack, which is out of scope.
454
+ * 2. A must be diagonalisable (non-defective). The adjoint assumes V is
455
+ * invertible; defective inputs have algebraic > geometric multiplicity
456
+ * so V is rank-deficient. Detected by cond_∞(V) > 1e14.
457
+ *
458
+ * Regularisation at repeated eigenvalues (subgradient choice): mask
459
+ * F/E[i,j] = 0 when `|λ_i − λ_j| < REL_TOL · max(|λ|)`. REL_TOL = 1e-10.
460
+ *
461
+ * Throws if the input is not a square rank-2 matrix, if `symmetric` is missing, or
462
+ * (non-symmetric path) on complex eigenvalues / defective input.
463
+ */
464
+ eig(opts: {
465
+ symmetric: boolean;
466
+ }): {
467
+ eigvals: TapedTensor;
468
+ eigvecs: TapedTensor;
469
+ };
470
+ private _eigSymmetric;
471
+ private _eigGeneral;
153
472
  }
154
473
 
155
474
  /**