npm - @danielsimonjr/mathts-autograd - Versions diffs - 0.1.0 → 0.1.1 - Mend

@danielsimonjr/mathts-autograd 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { Tensor } from '@danielsimonjr/mathts-tensor';
+import { Tensor, Index } from '@danielsimonjr/mathts-tensor';
 /**
  * DualTensor — a Tensor + per-element tangent component for forward-mode AD.
@@ -109,6 +109,7 @@ declare function forwardGrad(fn: (x: Tensor) => Tensor, x: Tensor): {
  * tape in reverse, accumulating gradients into each input slot.
  *
  * v0.1.0 supports the same ops as DualTensor: add, sub, mul, scale.
+ * v0.2.0 adds: contract (named-index), matmul (batched rank-N).
  */
 type BackwardFn = (outputGrad: Float64Array) => void;
@@ -135,7 +136,12 @@ declare class TapedTensor {
     readonly primal: Float64Array;
     readonly tape: Tape;
     readonly id: number;
-    constructor(shape: ReadonlyArray<number>, primal: Float64Array, tape: Tape, id: number);
+    /**
+     * Optional per-axis Index labels. When set, enables `contract`.
+     * Must have the same length as `shape` when present.
+     */
+    readonly axisLabels?: ReadonlyArray<Index>;
+    constructor(shape: ReadonlyArray<number>, primal: Float64Array, tape: Tape, id: number, axisLabels?: ReadonlyArray<Index>);
     /**
      * S5 fix: existing engine ops (e.g. lower, pderiv, contract) reach into
      * `.data`. The getter returns the primal so those ops still work when a
@@ -145,11 +151,324 @@ declare class TapedTensor {
     get data(): Float64Array;
     static fromTensorAsInput(t: Tensor, tape: Tape): TapedTensor;
     toPrimalTensor(): Tensor;
+    /**
+     * Reconstruct the Tensor primal with the given axisLabels (used internally
+     * when the primal was computed from an op that produces labelled output).
+     */
+    private toPrimalTensorWith;
     add(other: TapedTensor): TapedTensor;
     sub(other: TapedTensor): TapedTensor;
+    /**
+     * Elementwise division: this / other.
+     *
+     * Adjoints (quotient rule):
+     *   dA = dY / b
+     *   dB = −dY · a / b²
+     *
+     * The alias case (a.divide(a)) is handled explicitly: the combined gradient
+     * is dA + dB = dY/a − dY·a/a² = dY/a − dY/a = 0. This is correct since
+     * a/a = 1 everywhere and d(1)/da = 0.
+     */
+    divide(other: TapedTensor): TapedTensor;
     mul(other: TapedTensor): TapedTensor;
     scale(k: number): TapedTensor;
+    /**
+     * Reverse-mode AD over `Tensor.contract`.
+     *
+     * Both operands must carry `axisLabels`; the resulting TapedTensor inherits
+     * the contracted-output axisLabels (non-shared axis concatenation).
+     *
+     * Adjoint derivation (T-notation):
+     *   Let Y = A.contract(B)  (contraction over shared indices S).
+     *   dA = dY.contract(B')  where B' = B with its free axes re-labelled to
+     *        match the shared positions in A. Equivalently: for each element of A,
+     *        dA[...a_free, ...s] = Σ_{...b_free} dY[...a_free, ...b_free] · B[...s, ...b_free]
+     *   dB = A'.contract(dY)  symmetrically.
+     *
+     * Implementation: because `Tensor.contract` matches by Index identity, we
+     * build the backward contraction by re-labelling the free axes of dY with
+     * the Index objects from the other operand — so `Tensor.contract` automatically
+     * finds the right shared axes.
+     */
+    contract(other: TapedTensor): TapedTensor;
+    /**
+     * Reverse-mode AD over batched matmul.
+     *
+     * Accepts operands of rank ≥ 2. Convention: trailing 2 axes are the matrix
+     * dims (rows × cols); all leading axes are batch dims (must be broadcast-
+     * compatible). For rank-2 inputs this is classical matrix multiplication.
+     *
+     * Adjoint derivation (classical matrix calc, extended to batched):
+     *   Y = A @ B  (A: ...×m×k, B: ...×k×n  →  Y: ...×m×n)
+     *   dA = dY @ Bᵀ       (dA: ...×m×k)
+     *   dB = Aᵀ @ dY       (dB: ...×k×n)
+     *
+     * Implemented via `Tensor.einsum` with a dynamically-built spec:
+     *   forward:  '...ik,...kj->...ij'
+     *   dA:       '...ij,...kj->...ik'   (contract dY with B on j)
+     *   dB:       '...ki,...kj->...ij'   (contract A on k with dY on k)
+     */
+    matmul(other: TapedTensor): TapedTensor;
     private checkSameShape;
+    /**
+     * Sum elements along the given axis/axes (or all axes if omitted).
+     *
+     * Adjoint: dX[...] = dY[reduced(idx)] broadcast back to input shape.
+     * Each input element receives the output-gradient entry of the reduced
+     * group it belongs to: its non-reduced coordinates select the dY element, and
+     * its reduced coordinates are ignored (dropped if keepDims=false, 0 if true).
+     */
+    sum(axis?: number | ReadonlyArray<number>, opts?: {
+        keepDims?: boolean;
+    }): TapedTensor;
+    /**
+     * Arithmetic mean along the given axis/axes (or all axes if omitted).
+     *
+     * Adjoint: dX[...] = dY[reduced(idx)] / N, broadcast back to input shape.
+     * N = product of reduced-axis dimensions.
+     */
+    mean(axis?: number | ReadonlyArray<number>, opts?: {
+        keepDims?: boolean;
+    }): TapedTensor;
+    /**
+     * Product of elements along the given axis/axes (or all axes if omitted).
+     *
+     * Adjoint: dX_i = dY * (prod_over_axes(x) / x_i) per element.
+     *
+     * Zero-element corners:
+     * - Exactly one x_i = 0 in the reduced group: d/dx_i = product of all the
+     *   other elements (computed directly, because full product / x_i would be
+     *   0/0), and d/dx_j = 0 for every j ≠ i.
+     * - Two or more zeros in the reduced group: the gradient is 0 everywhere in
+     *   that group (changing any single element cannot change a product that
+     *   is still zero because of another zero).
+     *
+     * Implementation: uses prefix/suffix products to handle zeros robustly.
+     */
+    prod(axis?: number | ReadonlyArray<number>, opts?: {
+        keepDims?: boolean;
+    }): TapedTensor;
+    /**
+     * Maximum value along the given axis/axes (or all axes if omitted).
+     *
+     * Adjoint: dY is scattered to the argmax position(s).
+     * Tie-breaking: "first-wins" — the gradient flows to the first (smallest
+     * flat-index) element among those that attain the maximum.
+     */
+    max(axis?: number | ReadonlyArray<number>, opts?: {
+        keepDims?: boolean;
+    }): TapedTensor;
+    /**
+     * Minimum value along the given axis/axes (or all axes if omitted).
+     *
+     * Adjoint: dY is scattered to the argmin position(s).
+     * Tie-breaking: "first-wins" — gradient flows to the first (smallest
+     * flat-index) element that attains the minimum.
+     */
+    min(axis?: number | ReadonlyArray<number>, opts?: {
+        keepDims?: boolean;
+    }): TapedTensor;
+    /**
+     * p-norm of the tensor.
+     *
+     * Supported p values: 1, 2, 'fro', 'inf'. Default p = 2.
+     * When `opts.axis` is given, reduces along that axis; otherwise reduces all axes.
+     *
+     * Adjoints:
+     * - p=2 / p='fro': dX = dY · x / ‖x‖₂  (Frobenius is the 2-norm of the flattened tensor)
+     * - p=1:           dX = dY · sign(x)  (subgradient = 0 at exact zero)
+     * - p='inf':       dX scattered to the element(s) of max absolute value;
+     *                  tie-breaking: first-wins. Sign of the scattered gradient
+     *                  matches sign(x_max).
+     */
+    norm(opts?: {
+        p?: 1 | 2 | 'fro' | 'inf';
+        axis?: number;
+        keepDims?: boolean;
+    }): TapedTensor;
+    /**
+     * Elementwise natural logarithm.
+     *
+     * Adjoint: dX = dY / x
+     */
+    log(): TapedTensor;
+    /**
+     * Elementwise exponential.
+     *
+     * Adjoint: dX = dY · y  where y = exp(x). Primal output is cached.
+     */
+    exp(): TapedTensor;
+    /**
+     * Elementwise sine.
+     *
+     * Adjoint: dX = dY · cos(x)
+     */
+    sin(): TapedTensor;
+    /**
+     * Elementwise cosine.
+     *
+     * Adjoint: dX = −dY · sin(x)
+     */
+    cos(): TapedTensor;
+    /**
+     * Elementwise tangent.
+     *
+     * Adjoint: dX = dY / cos²(x)  (= dY · sec²(x))
+     */
+    tan(): TapedTensor;
+    /**
+     * Elementwise square root.
+     *
+     * Adjoint: dX = dY / (2·y)  where y = sqrt(x). Primal output is cached.
+     */
+    sqrt(): TapedTensor;
+    /**
+     * Elementwise square (x²).
+     *
+     * Adjoint: dX = dY · 2x
+     */
+    square(): TapedTensor;
+    /**
+     * Elementwise fixed-exponent power: x^k.
+     *
+     * Only fixed (non-TapedTensor) exponents are supported. Variable-exponent
+     * pow(taped, taped) is a follow-up slice.
+     *
+     * Adjoint: dX = dY · k · x^(k−1)
+     */
+    pow(k: number): TapedTensor;
+    /**
+     * Elementwise reciprocal: 1 / x.
+     *
+     * Adjoint: dX = −dY / x²
+     */
+    reciprocal(): TapedTensor;
+    /**
+     * Elementwise absolute value: |x|.
+     *
+     * Adjoint: dX = dY · sign(x)
+     * Subgradient at exact zero is defined as 0 (rather than undefined).
+     */
+    abs(): TapedTensor;
+    /**
+     * Reverse-mode AD over `Tensor.tensordot`.
+     *
+     * `axes[i] = [a, b]` contracts axis `a` of `this` with axis `b` of `other`.
+     * The result shape is `this`'s non-contracted axes (in original order)
+     * followed by `other`'s non-contracted axes (in original order).
+     *
+     * Adjoint derivation (NumPy/PyTorch tensordot backward, see Townsend 2016 §6,
+     * and the canonical PyTorch implementation `TensorDotBackward0` in
+     * `torch/csrc/autograd/generated/Functions.cpp`):
+     *
+     *   Z = tensordot(A, B, axes)
+     *   dA = tensordot(dZ, B, [axes_of_dZ_corresponding_to_B's_free, B's_free])
+     *        then permute back into A's original axis order.
+     *   dB = tensordot(A, dZ, [A's_free, axes_of_dZ_corresponding_to_A's_free])
+     *        then permute back into B's original axis order.
+     *
+     * The axis-permutation bookkeeping is the trickiest part. Tensor.tensordot
+     * emits output axes as [A's free axes (original A order), B's free axes
+     * (original B order)]. After the backward contraction for dA (resp. dB),
+     * the surviving axes of the contracted side come out in B's (resp. A's)
+     * original axis order; they must first be permuted into pair order (so
+     * axis k of the contracted block matches the kth pair) and then scattered
+     * back into A's (resp. B's) full original axis order.
+     *
+     * For the rank-2 × rank-2 single-axis case (i.e. ordinary matmul A·B with
+     * axes = [[1, 0]]), this reduces to dA = dZ · Bᵀ, dB = Aᵀ · dZ — the same
+     * adjoint as `TapedTensor.matmul`.
+     */
+    tensordot(other: TapedTensor, axes: ReadonlyArray<readonly [number, number]>): TapedTensor;
+    /**
+     * Reverse-mode AD over the full SVD of a rank-2 matrix.
+     *
+     * Forward: `A = U · diag(S) · Vt`, where for input shape [m, n], k = min(m, n):
+     *   - U  has shape [m, k]
+     *   - S  has shape [k]
+     *   - Vt has shape [k, n]    (V^T in the standard A = U Σ V^T convention, i.e.
+     *                              its rows are the right singular vectors)
+     *
+     * The returned TapedTensors share one backward closure: on backward() it
+     * reads dU, dS, dV from the outputs' gradient slots, assembles dA and writes
+     * it to the input's slot. NOTE(review): the result field is `V`, not `Vt` — confirm.
+     *
+     * Adjoint (real, distinct nonzero singular values, m = n square case;
+     * extended to rectangular below). Derived directly from the forward
+     * Jacobian; equivalent to PyTorch's `svd_backward`
+     * (`aten/src/ATen/native/BatchLinearAlgebra.cpp`) and Townsend (2016)
+     * "Differentiating the Singular Value Decomposition" §3:
+     *
+     *   Let α = skew(U^T · dU),  β = skew(V^T · dV)        (k×k, antisymmetric)
+     *   Build C (k×k):
+     *     C[i,i] = dS[i]
+     *     C[i,j] = (α[i,j] + β[i,j]) / (s_j − s_i)
+     *            + (α[i,j] − β[i,j]) / (s_j + s_i)         for i ≠ j
+     *   dA_in = U · C · V^T                                 (m×n in-subspace part)
+     *
+     * Rectangular correction (when m > k, i.e. m > n):
+     *   dA += (I − U U^T) · dU · diag(1/s) · V^T
+     * Rectangular correction (when n > k, i.e. n > m):
+     *   dA += U · diag(1/s) · dV^T · (I − V V^T)
+     *
+     * Regularisation at repeated/near-zero singular values (PyTorch-equivalent
+     * subgradient choice): the (i,j) entry of C is masked to 0 whenever
+     * `|s_j − s_i| < REL_TOL · max(|s|)` (the "difference" denominator) or
+     * `|s_j + s_i| < REL_TOL · max(|s|)` (the "sum" denominator, only relevant
+     * when both are ~0). REL_TOL = 1e-10. This makes the gradient a subgradient
+     * at exact degeneracy — finite, but not the unique true derivative (which
+     * does not exist at degeneracies). The rectangular correction also masks
+     * 1/s_i when |s_i| < REL_TOL · max(|s|).
+     *
+     * Throws if input is not rank-2. For rank > 2 inputs the user should
+     * reshape first.
+     */
+    svd(): {
+        U: TapedTensor;
+        S: TapedTensor;
+        V: TapedTensor;
+    };
+    /**
+     * Reverse-mode AD over the eigendecomposition of a rank-2 matrix.
+     *
+     * Symmetric path (`symmetric: true`):
+     *   Forward: `A = U · diag(Λ) · U^T` for symmetric A (n×n).
+     *   Adjoint (Magnus & Neudecker 1999 §10.6.6; PyTorch `linalg_eigh_backward`):
+     *     F[i,j] = 1 / (λ_j − λ_i)   for i ≠ j, 0 otherwise (with degeneracy mask)
+     *     dA_raw = U · (diag(dΛ) + F ∘ (U^T · dU)) · U^T
+     *     dA     = (dA_raw + dA_raw^T) / 2          (symmetrise)
+     *
+     * Non-symmetric path (`symmetric: false`):
+     *   Forward: `A = V · diag(λ) · V^{-1}` (V columns are right eigenvectors).
+     *   Adjoint (Magnus & Neudecker 1999 §10.6 / Giles 2008 §3.2 / Townsend 2016 §4;
+     *   cross-check: PyTorch `linalg_eig_backward`):
+     *     E[i,j] = 1 / (λ_j − λ_i)  for i ≠ j, 0 otherwise (with degeneracy mask)
+     *     dA = V^{-T} · ( E ∘ (V^T · dV) + diag(dλ) ) · V^T
+     *
+     *   Restrictions (all enforced — throw a clear error otherwise):
+     *   1. Eigenvalues must be real. The underlying matrix-eig primitive returns
+     *      placeholder eigenvectors (not actual complex vectors) when complex
+     *      eigenvalues arise, so the adjoint formula cannot be evaluated. Real-
+     *      Schur differentiation would require complex arithmetic infrastructure
+     *      throughout the Tape/TapedTensor stack, which is out of scope.
+     *   2. A must be diagonalisable (non-defective). The adjoint assumes V is
+     *      invertible; defective inputs have algebraic > geometric multiplicity
+     *      so V is rank-deficient. Detected by cond_∞(V) > 1e14.
+     *
+     * Regularisation at repeated eigenvalues (subgradient choice): mask
+     * F/E[i,j] = 0 when `|λ_i − λ_j| < REL_TOL · max(|λ|)`. REL_TOL = 1e-10.
+     *
+     * Throws if input is not rank-2 or square, if `symmetric` is missing, or
+     * (non-symmetric path) on complex eigenvalues / defective input.
+     */
+    eig(opts: {
+        symmetric: boolean;
+    }): {
+        eigvals: TapedTensor;
+        eigvecs: TapedTensor;
+    };
+    private _eigSymmetric;
+    private _eigGeneral;
 }
 /**