npm - @jax-js/jax - Versions diffs - 0.1.5 → 0.1.7 - Mend

@jax-js/jax 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/README.md +60 -7
package/dist/{backend-DziQSaoQ.cjs → backend-B3foXiV_.cjs} +25 -6
package/dist/{backend-DaqL-MNz.js → backend-nEolvdLv.js} +20 -7
package/dist/index.cjs +450 -129
package/dist/index.d.cts +1669 -1467
package/dist/index.d.ts +1669 -1467
package/dist/index.js +450 -130
package/dist/{webgl-ClIYb8jP.cjs → webgl-DIIbKJ0G.cjs} +1 -1
package/dist/{webgl-RSuZKvgc.js → webgl-DweKSWEm.js} +1 -1
package/dist/{webgpu-Dh7k9io0.js → webgpu-B96vzWGE.js} +1 -1
package/dist/{webgpu-Db2JrNBr.cjs → webgpu-BykvF26B.cjs} +1 -1
package/package.json +1 -1

package/dist/index.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { __export } from "./chunk-Cl8Af3a2.js";
-import { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, FpHash, Kernel, PPrint, Reduction, Routine, Routines, ShapeTracker, accessorAluExp, accessorGlobal, assertNonNull, byteWidth, checkAxis, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, generalBroadcast, getBackend, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, normalizeAxis, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, toposort, unravelAlu, unzip2, zip, zipn } from "./backend-DaqL-MNz.js";
+import { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, FpHash, Kernel, PPrint, Reduction, Routine, Routines, ShapeTracker, accessorAluExp, accessorGlobal, assertNonNull, byteWidth, checkAxis, checkInts, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, generalBroadcast, getBackend, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, normalizeAxis, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, toposort, unravelAlu, unzip2, zip, zipn } from "./backend-nEolvdLv.js";
 //#region src/frontend/convolution.ts
 /**
@@ -209,7 +209,7 @@ __export(tree_exports, {
 	structure: () => structure,
 	unflatten: () => unflatten
 });
-const JsArray$1 = globalThis.Array;
+const JsArray$2 = globalThis.Array;
 let NodeType = /* @__PURE__ */ function(NodeType$1) {
 	NodeType$1["Array"] = "Array";
 	NodeType$1["Object"] = "Object";
@@ -257,7 +257,7 @@ function flatten(tree) {
 	return [leaves$1, treedef];
 }
 function _flatten(tree, leaves$1) {
-	if (JsArray$1.isArray(tree)) {
+	if (JsArray$2.isArray(tree)) {
 		const childTrees = tree.map((c) => _flatten(c, leaves$1));
 		return new JsTreeDef(NodeType.Array, null, childTrees);
 	} else if (typeof tree === "object" && tree !== null && tree.constructor === Object) {
@@ -306,11 +306,11 @@ function map(fn, tree, ...rest) {
 }
 /** Take a reference of every array in a tree. */
 function ref(tree) {
-	return map((x) => x.ref, tree);
+	return map((x) => x instanceof Tracer ? x.ref : x, tree);
 }
 /** Dispose every array in a tree. */
 function dispose(tree) {
-	if (tree) map((x) => x.dispose(), tree);
+	if (tree) map((x) => x instanceof Tracer ? x.dispose() : void 0, tree);
 }
 //#endregion
@@ -381,6 +381,13 @@ let CompareOp = /* @__PURE__ */ function(CompareOp$1) {
 	CompareOp$1["LessEqual"] = "less_equal";
 	return CompareOp$1;
 }({});
+const routinePrimitives = new Map([
+	[Primitive.Sort, Routines.Sort],
+	[Primitive.Argsort, Routines.Argsort],
+	[Primitive.TriangularSolve, Routines.TriangularSolve],
+	[Primitive.Cholesky, Routines.Cholesky],
+	[Primitive.LU, Routines.LU]
+]);
 function add$1(x, y) {
 	return bind1(Primitive.Add, [x, y]);
 }
@@ -577,14 +584,20 @@ function shrink(x, slice) {
 }
 function pad$1(x, width) {
 	const nd = ndim$1(x);
-	if (typeof width === "number") width = [[width, width]];
-	else if (isNumberPair(width)) width = [width];
-	else if (!Array.isArray(width) || !width.every(isNumberPair)) throw new TypeError(`Invalid pad() type: ${JSON.stringify(width)}`);
-	if (width.length === 1) {
-		const [w0, w1] = width[0];
-		width = rep(nd, () => [w0, w1]);
-	} else if (width.length !== nd) throw new Error(`Invalid pad(): expected ${nd} axes, got ${width.length}`);
-	return bind1(Primitive.Pad, [x], { width });
+	let w;
+	if (typeof width === "number") w = [[width, width]];
+	else if (isNumberPair(width)) w = [width];
+	else if (!Array.isArray(width)) {
+		const indicesAndPairs = Object.entries(width);
+		w = rep(nd, [0, 0]);
+		for (const [k, v] of indicesAndPairs) w[checkAxis(parseInt(k), nd)] = v;
+	} else if (!width.every(isNumberPair)) throw new TypeError(`Invalid pad() type: ${JSON.stringify(width)}`);
+	else w = width;
+	if (w.length === 1) {
+		const [w0, w1] = w[0];
+		w = rep(nd, () => [w0, w1]);
+	} else if (w.length !== nd) throw new Error(`Invalid pad(): expected ${nd} axes, got ${w.length}`);
+	return bind1(Primitive.Pad, [x], { width: w });
 }
 function triangularSolve$1(a, b, { lower = false, unitDiagonal = false } = {}) {
 	const as = getShape(a);
@@ -654,6 +667,9 @@ function newDynamic(main) {
 		dynamicTrace = prevDynamicTrace;
 	} };
 }
+function currentTraceLevel() {
+	return traceStack[traceStack.length - 1].level;
+}
 var Trace = class {
 	constructor(main) {
 		this.main = main;
@@ -757,6 +773,22 @@ var Tracer = class Tracer {
 		const result = reduce(this.astype(castDtype), AluOp.Add, axis, opts);
 		return result.mul(1 / n).astype(originalDtype);
 	}
+	/** Minimum of the elements of the array along a given axis. */
+	min(axis = null, opts) {
+		return reduce(this, AluOp.Min, axis, opts);
+	}
+	/** Maximum of the elements of the array along a given axis. */
+	max(axis = null, opts) {
+		return reduce(this, AluOp.Max, axis, opts);
+	}
+	/** Test whether all array elements along a given axis evaluate to true. */
+	all(axis = null, opts) {
+		return this.astype(DType.Bool).min(axis, opts);
+	}
+	/** Test whether any array element along a given axis evaluates to true. */
+	any(axis = null, opts) {
+		return this.astype(DType.Bool).max(axis, opts);
+	}
 	/** Permute the dimensions of an array. Defaults to reversing the axis order. */
 	transpose(perm) {
 		return transpose$1(this, perm);
@@ -1031,6 +1063,7 @@ var TreeMismatchError = class extends TypeError {
 		super(`Mismatched tree structures in ${where$2}: ${left} != ${right}`);
 	}
 };
+/** Flatten a function of `JsTree` input/output for use in tracing. */
 function flattenFun(f, inTree) {
 	const store = { value: void 0 };
 	const flatFun = (...argsFlat) => {
@@ -1042,6 +1075,26 @@ function flattenFun(f, inTree) {
 	};
 	return [flatFun, store];
 }
+/** Like flattenFun, but expects f to return [main, aux] tuple. */
+function flattenFunWithAux(f, inTree) {
+	const store = { value: void 0 };
+	const auxStore = { value: void 0 };
+	const flatFun = (...argsFlat) => {
+		const pytreeArgs = unflatten(inTree, argsFlat);
+		const result = f(...pytreeArgs);
+		if (!Array.isArray(result) || result.length !== 2) throw new Error("Function with `hasAux: true` must return [output, aux] tuple");
+		const [out, aux] = result;
+		const [outFlat, outTree] = flatten(out);
+		store.value = outTree;
+		auxStore.value = aux;
+		return outFlat;
+	};
+	return [
+		flatFun,
+		store,
+		auxStore
+	];
+}
 var UseAfterFreeError = class extends ReferenceError {
 	constructor(tracer) {
 		super(`Referenced tracer ${tracer.toString()} freed, please use .ref move semantics`);
@@ -1771,13 +1824,6 @@ function jit$1(f, opts) {
 //#endregion
 //#region src/frontend/jit.ts
-const routinePrimitives = new Map([
-	[Primitive.Sort, Routines.Sort],
-	[Primitive.Argsort, Routines.Argsort],
-	[Primitive.TriangularSolve, Routines.TriangularSolve],
-	[Primitive.Cholesky, Routines.Cholesky],
-	[Primitive.LU, Routines.LU]
-]);
 /** Result of compiling a Jaxpr. Can be evaluated on a series of inputs. */
 var JitProgram = class {
 	constructor(backend, steps, inputs, outputs) {
@@ -2166,12 +2212,13 @@ const jitRules = {
 		const ndim$2 = avals[0].ndim;
 		const sizes = avals.map((x) => x.shape[axis]);
 		const finalSize = sizes.reduce((a, b) => a + b, 0);
+		const { dtype: dtypeOut } = avals.map((x) => x.scalar()).reduce(promoteAvals);
 		const makePadAxis = (start, end) => range(ndim$2).map((i) => i === axis ? [start, end] : [0, 0]);
 		let cum = 0;
 		const src = [];
 		for (let i = 0; i < exps.length; i++) {
 			const padding = makePadAxis(cum, finalSize - cum - sizes[i]);
-			src.push(reshapeViews(exps[i], (st) => st.pad(padding)));
+			src.push(reshapeViews(AluExp.cast(dtypeOut, exps[i]), (st) => st.pad(padding)));
 			cum += sizes[i];
 		}
 		return { exp: [src.reduce(AluExp.add)] };
@@ -2309,7 +2356,7 @@ function splitGraphDataflow(backend, jaxpr) {
 		p1NextBlack.set(v, v);
 	}
 	const heterogeneousViewPrimitives = [Primitive.RandomBits, Primitive.Gather];
-	const needsCleanShapePrimitives = [Primitive.Pad];
+	const needsCleanShapePrimitives = [Primitive.Concatenate, Primitive.Pad];
 	for (let i = jaxpr.eqns.length - 1; i >= 0; i--) {
 		const eqn = jaxpr.eqns[i];
 		if (reductionEndpointEqns.has(i) || heterogeneousViewPrimitives.includes(eqn.primitive) || routinePrimitives.has(eqn.primitive) || eqn.outBinders.some((v) => blackNodes.has(v))) {
@@ -2379,7 +2426,7 @@ function splitGraphDataflow(backend, jaxpr) {
 //#endregion
 //#region src/frontend/array.ts
-const JsArray = globalThis.Array;
+const JsArray$1 = globalThis.Array;
 const inlineArrayLimit = 128;
 /** Version of pureArray with fudged types. */
 const fudgeArray = pureArray;
@@ -2777,25 +2824,35 @@ var Array$1 = class Array$1 extends Tracer {
 		});
 	}
 	/** Apply an operation with custom lowering to this array. */
-	static #routine(routine, arrays, outputWeakType) {
-		const { backend, committed } = Array$1.#computeBackend(routine.name, arrays);
-		for (const ar of arrays) ar.#realize();
-		const inputs = arrays.map((ar) => ar.#source);
-		const outputs = routine.type.outputDtypes.map((dtype, i) => backend.malloc(byteWidth(dtype) * prod(routine.type.outputShapes[i])));
-		const pending = arrays.flatMap((ar) => ar.#pending);
-		for (const exe of pending) exe.updateRc(+outputs.length);
-		pending.push(new PendingExecute(backend, routine, inputs, outputs));
-		pending[pending.length - 1].updateRc(+outputs.length - 1);
-		arrays.forEach((ar) => ar.dispose());
-		return outputs.map((output, i) => new Array$1({
-			source: output,
-			st: ShapeTracker.fromShape(routine.type.outputShapes[i]),
-			dtype: routine.type.outputDtypes[i],
-			weakType: outputWeakType[i],
-			backend,
-			committed,
-			pending
-		}));
+	static #routine(prim) {
+		return (arrays, params) => {
+			const { backend, committed } = Array$1.#computeBackend(prim, arrays);
+			for (const ar of arrays) ar.#realize();
+			const avals = arrays.map((ar) => ar.aval);
+			const avalsOut = abstractEvalRules[prim](avals, params);
+			const routine = new Routine(routinePrimitives.get(prim), {
+				inputShapes: avals.map((a) => a.shape),
+				inputDtypes: avals.map((a) => a.dtype),
+				outputShapes: avalsOut.map((a) => a.shape),
+				outputDtypes: avalsOut.map((a) => a.dtype)
+			}, params);
+			const inputs = arrays.map((ar) => ar.#source);
+			const outputs = avalsOut.map((x) => backend.malloc(byteWidth(x.dtype) * x.size));
+			const pending = arrays.flatMap((ar) => ar.#pending);
+			for (const exe of pending) exe.updateRc(+outputs.length);
+			pending.push(new PendingExecute(backend, routine, inputs, outputs));
+			pending[pending.length - 1].updateRc(+outputs.length - 1);
+			arrays.forEach((ar) => ar.dispose());
+			return outputs.map((output, i) => new Array$1({
+				source: output,
+				st: ShapeTracker.fromShape(avalsOut[i].shape),
+				dtype: avalsOut[i].dtype,
+				weakType: avalsOut[i].weakType,
+				backend,
+				committed,
+				pending
+			}));
+		};
 	}
 	/**
 	* Normalizes this array into one backed by a `Slot`.
@@ -3129,65 +3186,11 @@ var Array$1 = class Array$1 extends Tracer {
 			[Primitive.Pad]([x], { width }) {
 				return [x.#reshape(x.#st.pad(width))];
 			},
-			[Primitive.Sort]([x]) {
-				const routine = new Routine(Routines.Sort, {
-					inputShapes: [x.shape],
-					inputDtypes: [x.dtype],
-					outputShapes: [x.shape],
-					outputDtypes: [x.dtype]
-				});
-				return Array$1.#routine(routine, [x], [x.#weakType]);
-			},
-			[Primitive.Argsort]([x]) {
-				const routine = new Routine(Routines.Argsort, {
-					inputShapes: [x.shape],
-					inputDtypes: [x.dtype],
-					outputShapes: [x.shape, x.shape],
-					outputDtypes: [x.dtype, DType.Int32]
-				});
-				return Array$1.#routine(routine, [x], [x.#weakType, false]);
-			},
-			[Primitive.TriangularSolve]([a, b], { unitDiagonal }) {
-				const routine = new Routine(Routines.TriangularSolve, {
-					inputShapes: [a.shape, b.shape],
-					inputDtypes: [a.dtype, b.dtype],
-					outputShapes: [b.shape],
-					outputDtypes: [b.dtype]
-				}, { unitDiagonal });
-				return Array$1.#routine(routine, [a, b], [a.#weakType && b.#weakType]);
-			},
-			[Primitive.Cholesky]([a]) {
-				const routine = new Routine(Routines.Cholesky, {
-					inputShapes: [a.shape],
-					inputDtypes: [a.dtype],
-					outputShapes: [a.shape],
-					outputDtypes: [a.dtype]
-				});
-				return Array$1.#routine(routine, [a], [a.#weakType]);
-			},
-			[Primitive.LU]([a]) {
-				const batch = a.shape.slice(0, -2);
-				const [m, n] = a.shape.slice(-2);
-				const routine = new Routine(Routines.LU, {
-					inputShapes: [a.shape],
-					inputDtypes: [a.dtype],
-					outputShapes: [
-						a.shape,
-						[...batch, Math.min(m, n)],
-						[...batch, m]
-					],
-					outputDtypes: [
-						a.dtype,
-						DType.Int32,
-						DType.Int32
-					]
-				});
-				return Array$1.#routine(routine, [a], [
-					a.#weakType,
-					false,
-					false
-				]);
-			},
+			[Primitive.Sort]: Array$1.#routine(Primitive.Sort),
+			[Primitive.Argsort]: Array$1.#routine(Primitive.Argsort),
+			[Primitive.TriangularSolve]: Array$1.#routine(Primitive.TriangularSolve),
+			[Primitive.Cholesky]: Array$1.#routine(Primitive.Cholesky),
+			[Primitive.LU]: Array$1.#routine(Primitive.LU),
 			[Primitive.Jit](args, { jaxpr }) {
 				if (jaxpr.inBinders.length !== args.length) throw new Error(`jit expects ${jaxpr.inBinders.length} args, got ${args.length}`);
 				const { backend, committed } = Array$1.#computeBackend("jit", args);
@@ -3269,7 +3272,7 @@ function array(values, { shape: shape$1, dtype, device } = {}) {
 		if (!shape$1) {
 			shape$1 = [];
 			let cur = values;
-			while (JsArray.isArray(cur)) {
+			while (JsArray$1.isArray(cur)) {
 				shape$1.push(cur.length);
 				cur = cur[0];
 			}
@@ -4223,17 +4226,39 @@ function jvpFlat(f, primals, tangents) {
 		_usingCtx$1.d();
 	}
 }
-function jvp$1(f, primals, tangents) {
+function jvp$1(f, primals, tangents, { hasAux = false } = {}) {
 	const [primalsFlat, inTree] = flatten(primals);
 	const [tangentsFlat, inTree2] = flatten(tangents);
 	if (!inTree.equals(inTree2)) throw new TreeMismatchError("jvp", inTree, inTree2);
-	const [flatFun, outTree] = flattenFun(f, inTree);
+	let flatFun, outTree, aux;
+	if (hasAux) [flatFun, outTree, aux] = flattenFunWithAux(f, inTree);
+	else [flatFun, outTree] = flattenFun(f, inTree);
 	const [primalsOutFlat, tangentsOutFlat] = jvpFlat(flatFun, primalsFlat, tangentsFlat);
 	if (outTree.value === void 0) throw new Error("outTree was not set in jvp");
 	const primalsOut = unflatten(outTree.value, primalsOutFlat);
 	const tangentsOut = unflatten(outTree.value, tangentsOutFlat);
+	if (hasAux) return [
+		primalsOut,
+		tangentsOut,
+		lowerAux(aux.value)
+	];
 	return [primalsOut, tangentsOut];
 }
+/** Lowering for auxiliary data returned in `hasAux: true` methods. */
+function lowerAux(aux) {
+	const level = currentTraceLevel();
+	return map((x) => {
+		if (x instanceof Tracer) while (x._trace.main.level > level) if (x instanceof JVPTracer) {
+			x.tangent.dispose();
+			x = x.primal;
+		} else {
+			const y = x.fullLower();
+			if (y._trace.main.level >= x._trace.main.level) throw new Error("internal: lowerAux did not reduce trace level");
+			x = y;
+		}
+		return x;
+	}, aux);
+}
 //#endregion
 //#region src/frontend/linearize.ts
@@ -4304,9 +4329,11 @@ function linearizeFlat(f, primalsIn) {
 		dispose$1
 	];
 }
-function linearize$1(f, ...primalsIn) {
+function linearize$1(f, primalsIn, { hasAux = false } = {}) {
 	const [primalsInFlat, inTree] = flatten(primalsIn);
-	const [fFlat, outTree] = flattenFun(f, inTree);
+	let fFlat, outTree, aux;
+	if (hasAux) [fFlat, outTree, aux] = flattenFunWithAux(f, inTree);
+	else [fFlat, outTree] = flattenFun(f, inTree);
 	const [primalsOutFlat, fLinFlat, dispose$1] = linearizeFlat(fFlat, primalsInFlat.map(pureArray));
 	if (outTree.value === void 0) throw new Error("outTree was not set in linearize");
 	const primalsOut = unflatten(outTree.value, primalsOutFlat);
@@ -4317,6 +4344,11 @@ function linearize$1(f, ...primalsIn) {
 		return unflatten(outTree.value, tangentsOutFlat);
 	});
 	fLin.dispose = dispose$1;
+	if (hasAux) return [
+		primalsOut,
+		fLin,
+		lowerAux(aux.value)
+	];
 	return [primalsOut, fLin];
 }
 var PartialEvalTracer = class extends Tracer {
@@ -4817,9 +4849,11 @@ function vjpFlat(f, primalsIn) {
 		dispose$1
 	];
 }
-function vjp$1(f, ...primalsIn) {
+function vjp$1(f, primalsIn, { hasAux = false } = {}) {
 	const [primalsInFlat, inTree] = flatten(primalsIn);
-	const [fFlat, outTree] = flattenFun(f, inTree);
+	let fFlat, outTree, aux;
+	if (hasAux) [fFlat, outTree, aux] = flattenFunWithAux(f, inTree);
+	else [fFlat, outTree] = flattenFun(f, inTree);
 	const [primalsOutFlat, fVjpFlat, dispose$1] = vjpFlat(fFlat, primalsInFlat.map(pureArray));
 	if (outTree.value === void 0) throw new Error("outTree was not set in vjp");
 	const primalsOut = unflatten(outTree.value, primalsOutFlat);
@@ -4830,26 +4864,43 @@ function vjp$1(f, ...primalsIn) {
 		return unflatten(inTree, cotangentsInFlat);
 	});
 	fVjp.dispose = dispose$1;
+	if (hasAux) return [
+		primalsOut,
+		fVjp,
+		lowerAux(aux.value)
+	];
 	return [primalsOut, fVjp];
 }
-function grad$1(f) {
-	const valueAndGradFn = valueAndGrad$1(f);
+function grad$1(f, opts) {
+	const valueAndGradFn = valueAndGrad$1(f, opts);
 	return (...x) => {
-		const [y, dx] = valueAndGradFn(...x);
-		y.dispose();
-		return dx;
+		if (opts?.hasAux) {
+			const [[y, aux], dx] = valueAndGradFn(...x);
+			y.dispose();
+			return [dx, aux];
+		} else {
+			const [y, dx] = valueAndGradFn(...x);
+			y.dispose();
+			return dx;
+		}
 	};
 }
-function valueAndGrad$1(f) {
+function valueAndGrad$1(f, opts) {
+	const argnums = opts?.argnums ?? 0;
+	const hasAux = opts?.hasAux ?? false;
+	checkInts(argnums);
+	const argnumsSet = new Set(typeof argnums === "number" ? [argnums] : argnums);
 	return (...x) => {
 		if (x.length === 0) throw new Error("grad requires at least one argument to differentiate");
-		const [y, fVjp] = vjp$1(f, x[0], ...x.slice(1).map(stopGradient));
+		for (let i = 0; i < x.length; i++) if (!argnumsSet.has(i)) x[i] = map(stopGradient, x[i]);
+		const [y, fVjp, aux] = vjp$1(f, x, { hasAux });
 		if (!(y instanceof Tracer) || ndim$1(y) !== 0) throw new TypeError("grad requires a scalar output");
 		if (!isFloatDtype(y.dtype)) throw new TypeError("grad only supports floating-point dtypes");
-		const [ct, ...rest] = fVjp(onesLike$1(y.ref));
-		for (const r of rest) dispose(r);
+		const cts = fVjp(onesLike$1(y.ref));
 		fVjp.dispose();
-		return [y, ct];
+		for (let i = 0; i < cts.length; i++) if (!argnumsSet.has(i)) dispose(cts[i]);
+		const grads = typeof argnums === "number" ? cts[argnums] : argnums.map((i) => cts[i]);
+		return hasAux ? [[y, aux], grads] : [y, grads];
 	};
 }
 function jacrev$1(f) {
@@ -4857,7 +4908,7 @@ function jacrev$1(f) {
 		if (x.shape.length !== 1) throw new TypeError("jacrev only supports 1D inputs");
 		const [size$1] = x.shape;
 		const pullback = (ct) => {
-			const [y, fVjp] = vjp$1(f, x);
+			const [y, fVjp] = vjp$1(f, [x]);
 			y.dispose();
 			const [ret] = fVjp(ct);
 			fVjp.dispose();
@@ -4866,6 +4917,9 @@ function jacrev$1(f) {
 		return vmap$1(pullback, [1])(eye(size$1, void 0, { dtype: x.dtype }));
 	};
 }
+function hessian$1(f) {
+	return jacfwd$1(grad$1(f));
+}
 //#endregion
 //#region src/library/numpy/einsum.ts
@@ -5538,6 +5592,7 @@ __export(numpy_exports, {
 	moveaxis: () => moveaxis$1,
 	multiply: () => multiply,
 	nan: () => nan,
+	nanToNum: () => nanToNum,
 	ndim: () => ndim,
 	negative: () => negative,
 	notEqual: () => notEqual,
@@ -5575,6 +5630,7 @@ __export(numpy_exports, {
 	std: () => std,
 	subtract: () => subtract,
 	sum: () => sum,
+	swapaxes: () => swapaxes,
 	take: () => take,
 	tan: () => tan,
 	tanh: () => tanh,
@@ -5734,24 +5790,22 @@ function max(a, axis = null, opts) {
 	return reduce(a, AluOp.Max, axis, opts);
 }
 /**
-* Test whether all array elements along a given axis evaluate to True.
+* Test whether any array element along a given axis evaluates to True.
 *
 * Returns a boolean array with the same shape as `a` with the specified axis
 * removed. If axis is None, returns a scalar.
 */
-function all(a, axis = null, opts) {
-	a = fudgeArray(a).astype(DType.Bool);
-	return min(a, axis, opts);
+function any(a, axis = null, opts) {
+	return fudgeArray(a).any(axis, opts);
 }
 /**
-* Test whether any array element along a given axis evaluates to True.
+* Test whether all array elements along a given axis evaluate to True.
 *
 * Returns a boolean array with the same shape as `a` with the specified axis
 * removed. If axis is None, returns a scalar.
 */
-function any(a, axis = null, opts) {
-	a = fudgeArray(a).astype(DType.Bool);
-	return max(a, axis, opts);
+function all(a, axis = null, opts) {
+	return fudgeArray(a).all(axis, opts);
 }
 /** Return the peak-to-peak range along a given axis (`max - min`). */
 function ptp(a, axis = null, opts) {
@@ -5852,7 +5906,7 @@ function split$1(a, indicesOrSections, axis = 0) {
 		const partSize = size$1 / indicesOrSections;
 		sizes = rep(indicesOrSections, partSize);
 	} else {
-		const indices = indicesOrSections;
+		const indices = indicesOrSections.map((i) => i < 0 ? i + size$1 : i);
 		sizes = [indices[0]];
 		for (let i = 1; i < indices.length; i++) sizes.push(indices[i] - indices[i - 1]);
 		sizes.push(size$1 - indices[indices.length - 1]);
@@ -5973,6 +6027,17 @@ function flipud(x) {
 function fliplr(x) {
 	return flip(x, 1);
 }
+/** Interchange two axes of an array. */
+function swapaxes(a, axis1, axis2) {
+	a = fudgeArray(a);
+	axis1 = checkAxis(axis1, a.ndim);
+	axis2 = checkAxis(axis2, a.ndim);
+	if (axis1 === axis2) return a;
+	const perm = range(a.ndim);
+	perm[axis1] = axis2;
+	perm[axis2] = axis1;
+	return transpose(a, perm);
+}
 /** Transpose the last two dimensions of an array. */
 function matrixTranspose(a) {
 	if (ndim(a) < 2) throw new Error(`matrixTranspose: input array must be at least 2D`);
@@ -6789,6 +6854,21 @@ function isposinf(x) {
 	return isFloatDtype(x.dtype) ? x.equal(Infinity) : fullLike$1(x, false);
 }
 /**
+* Replace NaN and infinite entries in an array.
+*
+* By default, NaNs are replaced with `0.0`, and infinities are substituted
+* with the corresponding maximum or minimum finite values.
+*/
+function nanToNum(x, { nan: nan$1 = 0, posinf = null, neginf = null } = {}) {
+	x = fudgeArray(x);
+	x = where(isnan(x.ref), nan$1, x);
+	posinf ??= isFloatDtype(x.dtype) ? finfo(x.dtype).max : iinfo(x.dtype).max;
+	neginf ??= isFloatDtype(x.dtype) ? finfo(x.dtype).min : iinfo(x.dtype).min;
+	x = where(isposinf(x.ref), posinf, x);
+	x = where(isneginf(x.ref), neginf, x);
+	return x;
+}
+/**
 * @function
 * Test element-wise for finite values (not infinity or NaN).
 */
@@ -6901,6 +6981,7 @@ var lax_exports = {};
 __export(lax_exports, {
 	conv: () => conv,
 	convGeneralDilated: () => convGeneralDilated,
+	convTranspose: () => convTranspose,
 	convWithGeneralPadding: () => convWithGeneralPadding,
 	dot: () => dot,
 	erf: () => erf,
@@ -6909,6 +6990,7 @@ __export(lax_exports, {
 	reduceWindow: () => reduceWindow,
 	stopGradient: () => stopGradient$1
 });
+const JsArray = globalThis.Array;
 /**
 * General dot product/contraction operator.
 *
@@ -6980,7 +7062,11 @@ function padtypeToPads(inShape, filterShape, strides, dilation, padding) {
 * The semantics of this operation mimic the `jax.lax.conv_general_dilated`
 * function in JAX, which wraps XLA's general convolution operator.
 *
-* Grouped convolutions are not supported right now.
+* @param lhs - Input tensor; shape `[N, C_in, ...xs]`
+* @param rhs - Convolution kernel; shape `[C_out, C_in / G, ...ks]`
+* @param windowStrides - Strides for each spatial dimension
+* @param padding - Padding for each spatial dimension, or a string
+*   (`"VALID"`, `"SAME"`, or `"SAME_LOWER"`)
 */
 function convGeneralDilated(lhs, rhs, windowStrides, padding, { lhsDilation, rhsDilation, featureGroupCount = 1 } = {}) {
 	if (lhs.ndim < 2) throw new Error("lhs must have at least 2 dimensions");
@@ -7040,6 +7126,60 @@ function convWithGeneralPadding(lhs, rhs, windowStrides, padding, lhsDilation, r
 function conv(lhs, rhs, windowStrides, padding) {
 	return convGeneralDilated(lhs, rhs, windowStrides, padding);
 }
+/**
+* Convenience wrapper for calculating the N-d convolution "transpose".
+*
+* This function directly calculates a fractionally strided conv rather than
+* indirectly calculating the gradient (transpose) of a forward convolution.
+* It is equivalent to the JAX version, except:
+*
+* - The `use_consistent_padding` option is not available. We only have the
+*   consistent padding case (JAX version >0.8.4).
+* - The order of dimensions matches `lax.conv_general_dilated`.
+*
+* Unlike PyTorch/TensorFlow, by default we don't reverse the kernel's spatial
+* dimensions or the `(C_out, C_in)` axis order. To get this behavior, set
+* `transposeKernel` to true.
+*
+* @param lhs - Input tensor; shape `[N, C_in, ...xs]`
+* @param rhs - Convolution kernel; shape `[C_out, C_in, ...ks]`
+* @param strides - Sequence of n integers, sets fractional stride
+* @param padding - Apply padding of `dilation * (kernel_size - 1) - padding` to
+*   each side of the input, so it acts like gradient of `conv()`
+* @param rhsDilation - Atrous dilation for the kernel
+* @param transposeKernel - Flip spatial axes and swap the input/output channels
+*   of the kernel; its shape should be `[C_in, C_out, ...ks]`
+*/
+function convTranspose(lhs, rhs, strides, padding, { rhsDilation, transposeKernel = false } = {}) {
+	const kernelShape = rhs.shape.slice(2);
+	rhsDilation = rhsDilation ?? rep(kernelShape.length, 1);
+	const effectiveKernel = kernelShape.map((k, i) => Math.max(0, (k - 1) * rhsDilation[i] + 1));
+	const pads = effectiveKernel.map((k, i) => convTransposePadding(k, strides[i], typeof padding === "string" ? padding : padding[i]));
+	if (transposeKernel) {
+		rhs = flip$1(rhs, range(2, rhs.ndim));
+		rhs = moveaxis(rhs, 0, 1);
+	}
+	return convGeneralDilated(lhs, rhs, rep(lhs.ndim - 2, 1), pads, {
+		lhsDilation: strides,
+		rhsDilation
+	});
+}
+function convTransposePadding(k, s, padding) {
+	let padLen;
+	let pad1;
+	if (padding === "SAME") {
+		padLen = k + s - 2;
+		pad1 = s > k - 1 ? k - 1 : Math.ceil(padLen / 2);
+	} else if (padding === "VALID") {
+		padLen = k + s - 2 + Math.max(k - s, 0);
+		pad1 = k - 1;
+	} else if (JsArray.isArray(padding)) {
+		const pads = [k - 1 - padding[0], k - 1 - padding[1]];
+		pad1 = pads[0];
+		padLen = pads[0] + pads[1];
+	} else throw new Error(`convTranspose: Invalid padding type ${padding}`);
+	return [pad1, padLen - pad1];
+}
 /** Reduce a computation over padded windows. */
 function reduceWindow(operand, computation, windowDimensions, windowStrides) {
 	if (operand.ndim < windowDimensions.length) throw new Error(`Operand dimensions ${operand.ndim} < window ${windowDimensions.length}`);
@@ -7078,6 +7218,7 @@ function stopGradient$1(x) {
 var nn_exports = {};
 __export(nn_exports, {
 	celu: () => celu,
+	dotProductAttention: () => dotProductAttention,
 	elu: () => elu,
 	gelu: () => gelu,
 	glu: () => glu,
@@ -7394,6 +7535,125 @@ function oneHot(x, numClasses) {
 	if (isFloatDtype(x.dtype)) throw new TypeError(`oneHot expects integers, got ${x.dtype}`);
 	return eye(numClasses, void 0, { device: x.device }).slice(x);
 }
+/**
+* Scaled dot product attention (SDPA).
+*
+* Computes `softmax((Q @ K^T) / sqrt(d) + bias) @ V`, where `Q` is the query,
+* `K` is the key, `V` is the value, and `d` is the dimensionality of each key
+* and query vector.
+*
+* Multi-query attention is applied when input `key` and `value` tensors have
+* fewer heads than `query`.
+*
+* We use the following uppercase letters to denote array shapes:
+* - `B` = batch size
+* - `S` = length of key/value sequences (source)
+* - `L` = length of query sequences
+* - `N` = number of attention heads
+* - `H` = dimensionality of each attention head
+* - `K` = number of key/value heads (for grouped-query attention)
+*
+* The batch size `B` may be omitted, which is equivalent to `B = 1`. In this
+* case it must be omitted from all inputs.
+*
+* @param query - Query array; shape `[B, L, N, H]`
+* @param key - Key array; shape `[B, S, K, H]`
+* @param value - Value array; same shape as `key`
+* @param opts.bias - Optional bias to add to the attention logits; shape
+*   `[B, N, L, S]` or broadcastable to it.
+* @param opts.mask - Optional mask to apply to the attention logits; should be
+*   a boolean array broadcastable to `[B, N, L, S]`, where `true` indicates
+*   the element should take part in attention.
+* @param opts.scale - Scaling factor override, default is `1 / sqrt(H)`.
+* @param opts.isCausal - If true, applies a causal mask.
+* @param opts.querySeqLengths - Optional sequence lengths for the queries;
+*   shape `(B,)`. Taken from the beginning of the tensor.
+* @param opts.keyValueSeqLengths - Optional sequence lengths for the keys and
+*   values; shape `(B,)`. Taken from the beginning of the tensor.
+* @param opts.localWindowSize - If specified, applies a local attention window
+*   of the given size. Can be a single number or a tuple `[left, right]`.
+*
+* @returns The result of the attention operation; shape is the same as query
+*   `[B, L, N, H]`, or `[L, N, H]` if `B` is omitted.
+*/
+function dotProductAttention(query, key$1, value, opts = {}) {
+	query = fudgeArray(query);
+	key$1 = fudgeArray(key$1);
+	value = fudgeArray(value);
+	if (query.ndim !== 3 && query.ndim !== 4 || query.ndim !== key$1.ndim || query.ndim !== value.ndim) throw new Error(`dotProductAttention: expected all tensors to have rank 3 or 4, got Q=${query.aval}, K=${key$1.aval}, V=${value.aval}`);
+	if (!deepEqual(key$1.shape, value.shape)) throw new Error(`dotProductAttention: key and value shapes must match, got K=${key$1.shape}, V=${value.shape}`);
+	const isRank3 = query.ndim === 3;
+	if (isRank3) {
+		query = expandDims(query, 0);
+		key$1 = expandDims(key$1, 0);
+		value = expandDims(value, 0);
+	}
+	const [B, L, N, H] = query.shape;
+	if (key$1.shape[0] !== B || key$1.shape[3] !== H) throw new Error(`dotProductAttention: query and key shapes mismatch, got Q=${query.aval}, K=${key$1.aval}`);
+	const S = key$1.shape[1];
+	const K = key$1.shape[2];
+	if (N < K || N != K && N % K !== 0) throw new Error(`dotProductAttention: number of query heads N=${N} must be divisible by number of key/value heads K=${K} for GQA`);
+	const G = N / K;
+	key$1 = tile(key$1, [
+		1,
+		1,
+		G,
+		1
+	]);
+	value = tile(value, [
+		1,
+		1,
+		G,
+		1
+	]);
+	const scale = opts.scale ?? 1 / Math.sqrt(H);
+	let scores = einsum("BLNH,BSNH->BNLS", query, key$1).mul(scale);
+	if (opts.bias !== void 0) scores = scores.add(opts.bias);
+	if (opts.mask !== void 0) scores = where(opts.mask, scores, -Infinity);
+	if (opts.isCausal) {
+		const causalMask = tri(L, S, 0, { dtype: DType.Bool });
+		scores = where(causalMask, scores, -Infinity);
+	}
+	if (opts.localWindowSize !== void 0) {
+		const [before, after] = typeof opts.localWindowSize === "number" ? [opts.localWindowSize, opts.localWindowSize] : opts.localWindowSize;
+		if (before < 0 || after < 0 || !Number.isInteger(before) || !Number.isInteger(after)) throw new Error(`dotProductAttention: localWindowSize values must be non-negative, got ${opts.localWindowSize}`);
+		const localMask = tri(L, S, after, { dtype: DType.Bool }).mul(tri(L, S, -before - 1, { dtype: DType.Bool }).notEqual(true));
+		scores = where(localMask, scores, -Infinity);
+	}
+	if (opts.querySeqLengths !== void 0) {
+		const sl = expandDims(opts.querySeqLengths, [
+			-1,
+			-2,
+			-3
+		]);
+		scores = where(arange(L).reshape([
+			1,
+			1,
+			L,
+			1
+		]).less(sl), scores, -Infinity);
+	}
+	if (opts.keyValueSeqLengths !== void 0) {
+		const sl = expandDims(opts.keyValueSeqLengths, [
+			-1,
+			-2,
+			-3
+		]);
+		scores = where(arange(S).reshape([
+			1,
+			1,
+			1,
+			S
+		]).less(sl), scores, -Infinity);
+	}
+	const attn = softmax(scores, -1);
+	const out = einsum("BNLS,BSNH->BLNH", attn, value);
+	return isRank3 ? out.reshape([
+		L,
+		N,
+		H
+	]) : out;
+}
 //#endregion
 //#region src/library/random.ts
@@ -7629,17 +7889,62 @@ const linearize = linearize$1;
 /**
 * @function
 * Calculate the reverse-mode vector-Jacobian product for a function.
+*
+* The return value is a tuple of `[out, vjpFn]`, where `out` is the output of
+* `f(primals)`, and `vjpFn` is a function that takes in cotangents for each
+* output and returns the cotangents for each input.
+*
+* When `{ hasAux: true }` is passed, the function `f` is expected to return an
+* `[out, aux]` tuple, and `vjp` returns `[out, vjpFn, aux]`.
+*
+* @returns `[out, vjpFn]` by default, or `[out, vjpFn, aux]` when
+* `{ hasAux: true }` is passed.
+*
+* @example
+* ```ts
+* const [y, vjpFn] = vjp(f, [x]);
+*
+* // With hasAux
+* const [y, vjpFn, aux] = vjp(f, [x], { hasAux: true });
+* ```
 */
 const vjp = vjp$1;
 /**
 * @function
 * Compute the gradient of a scalar-valued function `f` with respect to its
 * first argument.
+*
+* Pass in different `argnums` to differentiate with respect to other
+* arguments. If a tuple is provided, the return value will be a tuple of
+* gradients corresponding to each argument index.
+*
+* When `{ hasAux: true }` is passed, the function `f` is expected to return a
+* `[out, aux]` tuple, and the return value will be `[gradient, aux]`.
+*
+* @returns A function that is called with the arguments of `f` and returns
+* the gradient, a tuple of gradients (when `argnums` is a tuple), or
+* `[gradient, aux]` (when `hasAux` is set), as described above.
+*
+* @example
+* ```ts
+* const gradient = grad(f)(x);
+*
+* // With `argnums`
+* const [gradientX, gradientZ] = grad(f, { argnums: [0, 2] })(x, y, z);
+*
+* // With `hasAux`
+* const [gradient, aux] = grad(f, { hasAux: true })(x);
+* ```
 */
 const grad = grad$1;
 /**
 * @function
 * Create a function that evaluates both `f` and the gradient of `f`.
+*
+* By default, the created function returns a `[value, gradient]` tuple.
+*
+* When `{ hasAux: true }` is passed, the function `f` is expected to return an
+* `[out, aux]` tuple, and the return value will be `[[out, aux], gradient]`.
+*
+* @returns A function that returns `[value, gradient]`, or
+* `[[value, aux], gradient]` when `{ hasAux: true }` is passed.
+*
+* @example
+* ```ts
+* // Without hasAux
+* const [value, gradient] = valueAndGrad(f)(x);
+*
+* // With hasAux
+* const [[value, aux], gradient] = valueAndGrad(f, { hasAux: true })(x);
+* ```
 */
 const valueAndGrad = valueAndGrad$1;
 /**
@@ -7648,6 +7953,21 @@ const valueAndGrad = valueAndGrad$1;
 */
 const jacrev = jacrev$1;
 /**
+* @function
+* Compute the Hessian matrix of a scalar-valued function.
+*
+* The Hessian is the matrix of second-order partial derivatives of a function.
+* This is implemented as `jacfwd(grad(f))`.
+*
+* @returns A function that, given the input of `f`, returns the Hessian `H`
+* with `H[i,j] = d^2f / dx_i dx_j`.
+*
+* @example
+* ```ts
+* const f = (x: np.Array) => np.sum(x.ref.mul(x.ref).mul(x)); // sum(x^3)
+* const H = hessian(f)(np.array([1, 2, 3]));
+* // H[i,j] = d^2f / dx_i dx_j
+* // Mathematically, for this f: H is diagonal with H[i,i] = 6 * x_i,
+* // i.e. diag(6, 12, 18) for the input above.
+* ```
+*/
+const hessian = hessian$1;
+/**
 * Wait until all `Array` leaves are ready by calling `Array.blockUntilReady()`.
 *
 * This can be used to wait for the results of an intermediate computation to
@@ -7682,4 +8002,4 @@ async function devicePut(x, device) {
 }
 //#endregion
-export { Array$1 as Array, ClosedJaxpr, DType, Jaxpr, blockUntilReady, defaultDevice, devicePut, devices, grad, init, jacfwd, jacrev as jacobian, jacrev, jit, jvp, lax_exports as lax, linearize, makeJaxpr, nn_exports as nn, numpy_exports as numpy, random_exports as random, scipy_special_exports as scipySpecial, setDebug, tree_exports as tree, valueAndGrad, vjp, vmap };
+export { Array$1 as Array, ClosedJaxpr, DType, Jaxpr, blockUntilReady, defaultDevice, devicePut, devices, grad, hessian, init, jacfwd, jacrev as jacobian, jacrev, jit, jvp, lax_exports as lax, linearize, makeJaxpr, nn_exports as nn, numpy_exports as numpy, random_exports as random, scipy_special_exports as scipySpecial, setDebug, tree_exports as tree, valueAndGrad, vjp, vmap };