npm - @jax-js/jax - Versions diffs - 0.1.5 → 0.1.6 - Mend

@jax-js/jax 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/dist/{backend-DziQSaoQ.cjs → backend-D7s-Retx.cjs} +23 -4
package/dist/{backend-DaqL-MNz.js → backend-Dx6Ob2D1.js} +18 -5
package/dist/index.cjs +365 -110
package/dist/index.d.cts +192 -13
package/dist/index.d.ts +192 -13
package/dist/index.js +365 -111
package/dist/{webgl-RSuZKvgc.js → webgl-CLLvzJlO.js} +1 -1
package/dist/{webgl-ClIYb8jP.cjs → webgl-CyfzNW8T.cjs} +1 -1
package/dist/{webgpu-Dh7k9io0.js → webgpu-C-VfevQW.js} +1 -1
package/dist/{webgpu-Db2JrNBr.cjs → webgpu-rraa6dfz.cjs} +1 -1
package/package.json +1 -1

package/dist/index.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { __export } from "./chunk-Cl8Af3a2.js";
-import { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, FpHash, Kernel, PPrint, Reduction, Routine, Routines, ShapeTracker, accessorAluExp, accessorGlobal, assertNonNull, byteWidth, checkAxis, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, generalBroadcast, getBackend, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, normalizeAxis, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, toposort, unravelAlu, unzip2, zip, zipn } from "./backend-DaqL-MNz.js";
+import { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, FpHash, Kernel, PPrint, Reduction, Routine, Routines, ShapeTracker, accessorAluExp, accessorGlobal, assertNonNull, byteWidth, checkAxis, checkInts, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, generalBroadcast, getBackend, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, normalizeAxis, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, toposort, unravelAlu, unzip2, zip, zipn } from "./backend-Dx6Ob2D1.js";
 //#region src/frontend/convolution.ts
 /**
@@ -209,7 +209,7 @@ __export(tree_exports, {
 	structure: () => structure,
 	unflatten: () => unflatten
 });
-const JsArray$1 = globalThis.Array;
+const JsArray$2 = globalThis.Array;
 let NodeType = /* @__PURE__ */ function(NodeType$1) {
 	NodeType$1["Array"] = "Array";
 	NodeType$1["Object"] = "Object";
@@ -257,7 +257,7 @@ function flatten(tree) {
 	return [leaves$1, treedef];
 }
 function _flatten(tree, leaves$1) {
-	if (JsArray$1.isArray(tree)) {
+	if (JsArray$2.isArray(tree)) {
 		const childTrees = tree.map((c) => _flatten(c, leaves$1));
 		return new JsTreeDef(NodeType.Array, null, childTrees);
 	} else if (typeof tree === "object" && tree !== null && tree.constructor === Object) {
@@ -381,6 +381,13 @@ let CompareOp = /* @__PURE__ */ function(CompareOp$1) {
 	CompareOp$1["LessEqual"] = "less_equal";
 	return CompareOp$1;
 }({});
+const routinePrimitives = new Map([
+	[Primitive.Sort, Routines.Sort],
+	[Primitive.Argsort, Routines.Argsort],
+	[Primitive.TriangularSolve, Routines.TriangularSolve],
+	[Primitive.Cholesky, Routines.Cholesky],
+	[Primitive.LU, Routines.LU]
+]);
 function add$1(x, y) {
 	return bind1(Primitive.Add, [x, y]);
 }
@@ -654,6 +661,9 @@ function newDynamic(main) {
 		dynamicTrace = prevDynamicTrace;
 	} };
 }
+function currentTraceLevel() {
+	return traceStack[traceStack.length - 1].level;
+}
 var Trace = class {
 	constructor(main) {
 		this.main = main;
@@ -1031,6 +1041,7 @@ var TreeMismatchError = class extends TypeError {
 		super(`Mismatched tree structures in ${where$2}: ${left} != ${right}`);
 	}
 };
+/** Flatten a function of `JsTree` input/output for use in tracing. */
 function flattenFun(f, inTree) {
 	const store = { value: void 0 };
 	const flatFun = (...argsFlat) => {
@@ -1042,6 +1053,26 @@ function flattenFun(f, inTree) {
 	};
 	return [flatFun, store];
 }
+/** Like `flattenFun`, but expects `f` to return an `[output, aux]` tuple. */
+function flattenFunWithAux(f, inTree) {
+	const store = { value: void 0 };
+	const auxStore = { value: void 0 };
+	const flatFun = (...argsFlat) => {
+		const pytreeArgs = unflatten(inTree, argsFlat);
+		const result = f(...pytreeArgs);
+		if (!Array.isArray(result) || result.length !== 2) throw new Error("Function with `hasAux: true` must return [output, aux] tuple");
+		const [out, aux] = result;
+		const [outFlat, outTree] = flatten(out);
+		store.value = outTree;
+		auxStore.value = aux;
+		return outFlat;
+	};
+	return [
+		flatFun,
+		store,
+		auxStore
+	];
+}
 var UseAfterFreeError = class extends ReferenceError {
 	constructor(tracer) {
 		super(`Referenced tracer ${tracer.toString()} freed, please use .ref move semantics`);
@@ -1771,13 +1802,6 @@ function jit$1(f, opts) {
 //#endregion
 //#region src/frontend/jit.ts
-const routinePrimitives = new Map([
-	[Primitive.Sort, Routines.Sort],
-	[Primitive.Argsort, Routines.Argsort],
-	[Primitive.TriangularSolve, Routines.TriangularSolve],
-	[Primitive.Cholesky, Routines.Cholesky],
-	[Primitive.LU, Routines.LU]
-]);
 /** Result of compiling a Jaxpr. Can be evaluated on a series of inputs. */
 var JitProgram = class {
 	constructor(backend, steps, inputs, outputs) {
@@ -2166,12 +2190,13 @@ const jitRules = {
 		const ndim$2 = avals[0].ndim;
 		const sizes = avals.map((x) => x.shape[axis]);
 		const finalSize = sizes.reduce((a, b) => a + b, 0);
+		const { dtype: dtypeOut } = avals.map((x) => x.scalar()).reduce(promoteAvals);
 		const makePadAxis = (start, end) => range(ndim$2).map((i) => i === axis ? [start, end] : [0, 0]);
 		let cum = 0;
 		const src = [];
 		for (let i = 0; i < exps.length; i++) {
 			const padding = makePadAxis(cum, finalSize - cum - sizes[i]);
-			src.push(reshapeViews(exps[i], (st) => st.pad(padding)));
+			src.push(reshapeViews(AluExp.cast(dtypeOut, exps[i]), (st) => st.pad(padding)));
 			cum += sizes[i];
 		}
 		return { exp: [src.reduce(AluExp.add)] };
@@ -2309,7 +2334,7 @@ function splitGraphDataflow(backend, jaxpr) {
 		p1NextBlack.set(v, v);
 	}
 	const heterogeneousViewPrimitives = [Primitive.RandomBits, Primitive.Gather];
-	const needsCleanShapePrimitives = [Primitive.Pad];
+	const needsCleanShapePrimitives = [Primitive.Concatenate, Primitive.Pad];
 	for (let i = jaxpr.eqns.length - 1; i >= 0; i--) {
 		const eqn = jaxpr.eqns[i];
 		if (reductionEndpointEqns.has(i) || heterogeneousViewPrimitives.includes(eqn.primitive) || routinePrimitives.has(eqn.primitive) || eqn.outBinders.some((v) => blackNodes.has(v))) {
@@ -2379,7 +2404,7 @@ function splitGraphDataflow(backend, jaxpr) {
 //#endregion
 //#region src/frontend/array.ts
-const JsArray = globalThis.Array;
+const JsArray$1 = globalThis.Array;
 const inlineArrayLimit = 128;
 /** Version of pureArray with fudged types. */
 const fudgeArray = pureArray;
@@ -2777,25 +2802,35 @@ var Array$1 = class Array$1 extends Tracer {
 		});
 	}
 	/** Apply an operation with custom lowering to this array. */
-	static #routine(routine, arrays, outputWeakType) {
-		const { backend, committed } = Array$1.#computeBackend(routine.name, arrays);
-		for (const ar of arrays) ar.#realize();
-		const inputs = arrays.map((ar) => ar.#source);
-		const outputs = routine.type.outputDtypes.map((dtype, i) => backend.malloc(byteWidth(dtype) * prod(routine.type.outputShapes[i])));
-		const pending = arrays.flatMap((ar) => ar.#pending);
-		for (const exe of pending) exe.updateRc(+outputs.length);
-		pending.push(new PendingExecute(backend, routine, inputs, outputs));
-		pending[pending.length - 1].updateRc(+outputs.length - 1);
-		arrays.forEach((ar) => ar.dispose());
-		return outputs.map((output, i) => new Array$1({
-			source: output,
-			st: ShapeTracker.fromShape(routine.type.outputShapes[i]),
-			dtype: routine.type.outputDtypes[i],
-			weakType: outputWeakType[i],
-			backend,
-			committed,
-			pending
-		}));
+	static #routine(prim) {
+		return (arrays, params) => {
+			const { backend, committed } = Array$1.#computeBackend(prim, arrays);
+			for (const ar of arrays) ar.#realize();
+			const avals = arrays.map((ar) => ar.aval);
+			const avalsOut = abstractEvalRules[prim](avals, params);
+			const routine = new Routine(routinePrimitives.get(prim), {
+				inputShapes: avals.map((a) => a.shape),
+				inputDtypes: avals.map((a) => a.dtype),
+				outputShapes: avalsOut.map((a) => a.shape),
+				outputDtypes: avalsOut.map((a) => a.dtype)
+			}, params);
+			const inputs = arrays.map((ar) => ar.#source);
+			const outputs = avalsOut.map((x) => backend.malloc(byteWidth(x.dtype) * x.size));
+			const pending = arrays.flatMap((ar) => ar.#pending);
+			for (const exe of pending) exe.updateRc(+outputs.length);
+			pending.push(new PendingExecute(backend, routine, inputs, outputs));
+			pending[pending.length - 1].updateRc(+outputs.length - 1);
+			arrays.forEach((ar) => ar.dispose());
+			return outputs.map((output, i) => new Array$1({
+				source: output,
+				st: ShapeTracker.fromShape(avalsOut[i].shape),
+				dtype: avalsOut[i].dtype,
+				weakType: avalsOut[i].weakType,
+				backend,
+				committed,
+				pending
+			}));
+		};
 	}
 	/**
 	* Normalizes this array into one backed by a `Slot`.
@@ -3129,65 +3164,11 @@ var Array$1 = class Array$1 extends Tracer {
 			[Primitive.Pad]([x], { width }) {
 				return [x.#reshape(x.#st.pad(width))];
 			},
-			[Primitive.Sort]([x]) {
-				const routine = new Routine(Routines.Sort, {
-					inputShapes: [x.shape],
-					inputDtypes: [x.dtype],
-					outputShapes: [x.shape],
-					outputDtypes: [x.dtype]
-				});
-				return Array$1.#routine(routine, [x], [x.#weakType]);
-			},
-			[Primitive.Argsort]([x]) {
-				const routine = new Routine(Routines.Argsort, {
-					inputShapes: [x.shape],
-					inputDtypes: [x.dtype],
-					outputShapes: [x.shape, x.shape],
-					outputDtypes: [x.dtype, DType.Int32]
-				});
-				return Array$1.#routine(routine, [x], [x.#weakType, false]);
-			},
-			[Primitive.TriangularSolve]([a, b], { unitDiagonal }) {
-				const routine = new Routine(Routines.TriangularSolve, {
-					inputShapes: [a.shape, b.shape],
-					inputDtypes: [a.dtype, b.dtype],
-					outputShapes: [b.shape],
-					outputDtypes: [b.dtype]
-				}, { unitDiagonal });
-				return Array$1.#routine(routine, [a, b], [a.#weakType && b.#weakType]);
-			},
-			[Primitive.Cholesky]([a]) {
-				const routine = new Routine(Routines.Cholesky, {
-					inputShapes: [a.shape],
-					inputDtypes: [a.dtype],
-					outputShapes: [a.shape],
-					outputDtypes: [a.dtype]
-				});
-				return Array$1.#routine(routine, [a], [a.#weakType]);
-			},
-			[Primitive.LU]([a]) {
-				const batch = a.shape.slice(0, -2);
-				const [m, n] = a.shape.slice(-2);
-				const routine = new Routine(Routines.LU, {
-					inputShapes: [a.shape],
-					inputDtypes: [a.dtype],
-					outputShapes: [
-						a.shape,
-						[...batch, Math.min(m, n)],
-						[...batch, m]
-					],
-					outputDtypes: [
-						a.dtype,
-						DType.Int32,
-						DType.Int32
-					]
-				});
-				return Array$1.#routine(routine, [a], [
-					a.#weakType,
-					false,
-					false
-				]);
-			},
+			[Primitive.Sort]: Array$1.#routine(Primitive.Sort),
+			[Primitive.Argsort]: Array$1.#routine(Primitive.Argsort),
+			[Primitive.TriangularSolve]: Array$1.#routine(Primitive.TriangularSolve),
+			[Primitive.Cholesky]: Array$1.#routine(Primitive.Cholesky),
+			[Primitive.LU]: Array$1.#routine(Primitive.LU),
 			[Primitive.Jit](args, { jaxpr }) {
 				if (jaxpr.inBinders.length !== args.length) throw new Error(`jit expects ${jaxpr.inBinders.length} args, got ${args.length}`);
 				const { backend, committed } = Array$1.#computeBackend("jit", args);
@@ -3269,7 +3250,7 @@ function array(values, { shape: shape$1, dtype, device } = {}) {
 		if (!shape$1) {
 			shape$1 = [];
 			let cur = values;
-			while (JsArray.isArray(cur)) {
+			while (JsArray$1.isArray(cur)) {
 				shape$1.push(cur.length);
 				cur = cur[0];
 			}
@@ -4223,17 +4204,39 @@ function jvpFlat(f, primals, tangents) {
 		_usingCtx$1.d();
 	}
 }
-function jvp$1(f, primals, tangents) {
+function jvp$1(f, primals, tangents, { hasAux = false } = {}) {
 	const [primalsFlat, inTree] = flatten(primals);
 	const [tangentsFlat, inTree2] = flatten(tangents);
 	if (!inTree.equals(inTree2)) throw new TreeMismatchError("jvp", inTree, inTree2);
-	const [flatFun, outTree] = flattenFun(f, inTree);
+	let flatFun, outTree, aux;
+	if (hasAux) [flatFun, outTree, aux] = flattenFunWithAux(f, inTree);
+	else [flatFun, outTree] = flattenFun(f, inTree);
 	const [primalsOutFlat, tangentsOutFlat] = jvpFlat(flatFun, primalsFlat, tangentsFlat);
 	if (outTree.value === void 0) throw new Error("outTree was not set in jvp");
 	const primalsOut = unflatten(outTree.value, primalsOutFlat);
 	const tangentsOut = unflatten(outTree.value, tangentsOutFlat);
+	if (hasAux) return [
+		primalsOut,
+		tangentsOut,
+		lowerAux(aux.value)
+	];
 	return [primalsOut, tangentsOut];
 }
+/** Lowering for auxiliary data returned in `hasAux: true` methods. */
+function lowerAux(aux) {
+	const level = currentTraceLevel();
+	return map((x) => {
+		if (x instanceof Tracer) while (x._trace.main.level > level) if (x instanceof JVPTracer) {
+			x.tangent.dispose();
+			x = x.primal;
+		} else {
+			const y = x.fullLower();
+			if (y._trace.main.level >= x._trace.main.level) throw new Error("internal: lowerAux did not reduce trace level");
+			x = y;
+		}
+		return x;
+	}, aux);
+}
 //#endregion
 //#region src/frontend/linearize.ts
@@ -4304,9 +4307,11 @@ function linearizeFlat(f, primalsIn) {
 		dispose$1
 	];
 }
-function linearize$1(f, ...primalsIn) {
+function linearize$1(f, primalsIn, { hasAux = false } = {}) {
 	const [primalsInFlat, inTree] = flatten(primalsIn);
-	const [fFlat, outTree] = flattenFun(f, inTree);
+	let fFlat, outTree, aux;
+	if (hasAux) [fFlat, outTree, aux] = flattenFunWithAux(f, inTree);
+	else [fFlat, outTree] = flattenFun(f, inTree);
 	const [primalsOutFlat, fLinFlat, dispose$1] = linearizeFlat(fFlat, primalsInFlat.map(pureArray));
 	if (outTree.value === void 0) throw new Error("outTree was not set in linearize");
 	const primalsOut = unflatten(outTree.value, primalsOutFlat);
@@ -4317,6 +4322,11 @@ function linearize$1(f, ...primalsIn) {
 		return unflatten(outTree.value, tangentsOutFlat);
 	});
 	fLin.dispose = dispose$1;
+	if (hasAux) return [
+		primalsOut,
+		fLin,
+		lowerAux(aux.value)
+	];
 	return [primalsOut, fLin];
 }
 var PartialEvalTracer = class extends Tracer {
@@ -4817,9 +4827,11 @@ function vjpFlat(f, primalsIn) {
 		dispose$1
 	];
 }
-function vjp$1(f, ...primalsIn) {
+function vjp$1(f, primalsIn, { hasAux = false } = {}) {
 	const [primalsInFlat, inTree] = flatten(primalsIn);
-	const [fFlat, outTree] = flattenFun(f, inTree);
+	let fFlat, outTree, aux;
+	if (hasAux) [fFlat, outTree, aux] = flattenFunWithAux(f, inTree);
+	else [fFlat, outTree] = flattenFun(f, inTree);
 	const [primalsOutFlat, fVjpFlat, dispose$1] = vjpFlat(fFlat, primalsInFlat.map(pureArray));
 	if (outTree.value === void 0) throw new Error("outTree was not set in vjp");
 	const primalsOut = unflatten(outTree.value, primalsOutFlat);
@@ -4830,26 +4842,43 @@ function vjp$1(f, ...primalsIn) {
 		return unflatten(inTree, cotangentsInFlat);
 	});
 	fVjp.dispose = dispose$1;
+	if (hasAux) return [
+		primalsOut,
+		fVjp,
+		lowerAux(aux.value)
+	];
 	return [primalsOut, fVjp];
 }
-function grad$1(f) {
-	const valueAndGradFn = valueAndGrad$1(f);
+function grad$1(f, opts) {
+	const valueAndGradFn = valueAndGrad$1(f, opts);
 	return (...x) => {
-		const [y, dx] = valueAndGradFn(...x);
-		y.dispose();
-		return dx;
+		if (opts?.hasAux) {
+			const [[y, aux], dx] = valueAndGradFn(...x);
+			y.dispose();
+			return [dx, aux];
+		} else {
+			const [y, dx] = valueAndGradFn(...x);
+			y.dispose();
+			return dx;
+		}
 	};
 }
-function valueAndGrad$1(f) {
+function valueAndGrad$1(f, opts) {
+	const argnums = opts?.argnums ?? 0;
+	const hasAux = opts?.hasAux ?? false;
+	checkInts(argnums);
+	const argnumsSet = new Set(typeof argnums === "number" ? [argnums] : argnums);
 	return (...x) => {
 		if (x.length === 0) throw new Error("grad requires at least one argument to differentiate");
-		const [y, fVjp] = vjp$1(f, x[0], ...x.slice(1).map(stopGradient));
+		for (let i = 0; i < x.length; i++) if (!argnumsSet.has(i)) x[i] = map(stopGradient, x[i]);
+		const [y, fVjp, aux] = vjp$1(f, x, { hasAux });
 		if (!(y instanceof Tracer) || ndim$1(y) !== 0) throw new TypeError("grad requires a scalar output");
 		if (!isFloatDtype(y.dtype)) throw new TypeError("grad only supports floating-point dtypes");
-		const [ct, ...rest] = fVjp(onesLike$1(y.ref));
-		for (const r of rest) dispose(r);
+		const cts = fVjp(onesLike$1(y.ref));
 		fVjp.dispose();
-		return [y, ct];
+		for (let i = 0; i < cts.length; i++) if (!argnumsSet.has(i)) dispose(cts[i]);
+		const grads = typeof argnums === "number" ? cts[argnums] : argnums.map((i) => cts[i]);
+		return hasAux ? [[y, aux], grads] : [y, grads];
 	};
 }
 function jacrev$1(f) {
@@ -4857,7 +4886,7 @@ function jacrev$1(f) {
 		if (x.shape.length !== 1) throw new TypeError("jacrev only supports 1D inputs");
 		const [size$1] = x.shape;
 		const pullback = (ct) => {
-			const [y, fVjp] = vjp$1(f, x);
+			const [y, fVjp] = vjp$1(f, [x]);
 			y.dispose();
 			const [ret] = fVjp(ct);
 			fVjp.dispose();
@@ -4866,6 +4895,9 @@ function jacrev$1(f) {
 		return vmap$1(pullback, [1])(eye(size$1, void 0, { dtype: x.dtype }));
 	};
 }
+function hessian$1(f) {
+	return jacfwd$1(grad$1(f));
+}
 //#endregion
 //#region src/library/numpy/einsum.ts
@@ -5575,6 +5607,7 @@ __export(numpy_exports, {
 	std: () => std,
 	subtract: () => subtract,
 	sum: () => sum,
+	swapaxes: () => swapaxes,
 	take: () => take,
 	tan: () => tan,
 	tanh: () => tanh,
@@ -5973,6 +6006,17 @@ function flipud(x) {
 function fliplr(x) {
 	return flip(x, 1);
 }
+/** Interchange two axes of an array. */
+function swapaxes(a, axis1, axis2) {
+	a = fudgeArray(a);
+	axis1 = checkAxis(axis1, a.ndim);
+	axis2 = checkAxis(axis2, a.ndim);
+	if (axis1 === axis2) return a;
+	const perm = range(a.ndim);
+	perm[axis1] = axis2;
+	perm[axis2] = axis1;
+	return transpose(a, perm);
+}
 /** Transpose the last two dimensions of an array. */
 function matrixTranspose(a) {
 	if (ndim(a) < 2) throw new Error(`matrixTranspose: input array must be at least 2D`);
@@ -6901,6 +6945,7 @@ var lax_exports = {};
 __export(lax_exports, {
 	conv: () => conv,
 	convGeneralDilated: () => convGeneralDilated,
+	convTranspose: () => convTranspose,
 	convWithGeneralPadding: () => convWithGeneralPadding,
 	dot: () => dot,
 	erf: () => erf,
@@ -6909,6 +6954,7 @@ __export(lax_exports, {
 	reduceWindow: () => reduceWindow,
 	stopGradient: () => stopGradient$1
 });
+const JsArray = globalThis.Array;
 /**
 * General dot product/contraction operator.
 *
@@ -6980,7 +7026,11 @@ function padtypeToPads(inShape, filterShape, strides, dilation, padding) {
 * The semantics of this operation mimic the `jax.lax.conv_general_dilated`
 * function in JAX, which wraps XLA's general convolution operator.
 *
-* Grouped convolutions are not supported right now.
+* @param lhs - Input tensor; shape `[N, C_in, ...xs]`
+* @param rhs - Convolution kernel; shape `[C_out, C_in / G, ...ks]`
+* @param windowStrides - Strides for each spatial dimension
+* @param padding - Padding for each spatial dimension, or a string
+*   (`"VALID"`, `"SAME"`, or `"SAME_LOWER"`)
 */
 function convGeneralDilated(lhs, rhs, windowStrides, padding, { lhsDilation, rhsDilation, featureGroupCount = 1 } = {}) {
 	if (lhs.ndim < 2) throw new Error("lhs must have at least 2 dimensions");
@@ -7040,6 +7090,60 @@ function convWithGeneralPadding(lhs, rhs, windowStrides, padding, lhsDilation, r
 function conv(lhs, rhs, windowStrides, padding) {
 	return convGeneralDilated(lhs, rhs, windowStrides, padding);
 }
+/**
+* Convenience wrapper for calculating the N-d convolution "transpose".
+*
+* This function directly calculates a fractionally strided conv rather than
+* indirectly calculating the gradient (transpose) of a forward convolution.
+* It is equivalent to the JAX version, except:
+*
+* - The `use_consistent_padding` option is not available. We only have the
+*   consistent padding case (JAX version >0.8.4).
+* - The order of dimensions matches `lax.conv_general_dilated`.
+*
+* Unlike PyTorch/TensorFlow, by default we don't reverse the kernel's spatial
+* dimensions or the `(C_out, C_in)` axis order. To get this behavior, set
+* `transposeKernel` to true.
+*
+* @param lhs - Input tensor; shape `[N, C_in, ...xs]`
+* @param rhs - Convolution kernel; shape `[C_out, C_in, ...ks]`
+* @param strides - Sequence of n integers, sets fractional stride
+* @param padding - Apply padding of `dilation * (kernel_size - 1) - padding` to
+*   each side of the input, so it acts like gradient of `conv()`
+* @param rhsDilation - Atrous dilation for the kernel
+* @param transposeKernel - Flip spatial axes and swap the input/output channels
+*   of the kernel; its shape should be `[C_in, C_out, ...ks]`
+*/
+function convTranspose(lhs, rhs, strides, padding, { rhsDilation, transposeKernel = false } = {}) {
+	const kernelShape = rhs.shape.slice(2);
+	rhsDilation = rhsDilation ?? rep(kernelShape.length, 1);
+	const effectiveKernel = kernelShape.map((k, i) => Math.max(0, (k - 1) * rhsDilation[i] + 1));
+	const pads = effectiveKernel.map((k, i) => convTransposePadding(k, strides[i], typeof padding === "string" ? padding : padding[i]));
+	if (transposeKernel) {
+		rhs = flip$1(rhs, range(2, rhs.ndim));
+		rhs = moveaxis(rhs, 0, 1);
+	}
+	return convGeneralDilated(lhs, rhs, rep(lhs.ndim - 2, 1), pads, {
+		lhsDilation: strides,
+		rhsDilation
+	});
+}
+function convTransposePadding(k, s, padding) {
+	let padLen;
+	let pad1;
+	if (padding === "SAME") {
+		padLen = k + s - 2;
+		pad1 = s > k - 1 ? k - 1 : Math.ceil(padLen / 2);
+	} else if (padding === "VALID") {
+		padLen = k + s - 2 + Math.max(k - s, 0);
+		pad1 = k - 1;
+	} else if (JsArray.isArray(padding)) {
+		const pads = [k - 1 - padding[0], k - 1 - padding[1]];
+		pad1 = pads[0];
+		padLen = pads[0] + pads[1];
+	} else throw new Error(`convTranspose: Invalid padding type ${padding}`);
+	return [pad1, padLen - pad1];
+}
 /** Reduce a computation over padded windows. */
 function reduceWindow(operand, computation, windowDimensions, windowStrides) {
 	if (operand.ndim < windowDimensions.length) throw new Error(`Operand dimensions ${operand.ndim} < window ${windowDimensions.length}`);
@@ -7078,6 +7182,7 @@ function stopGradient$1(x) {
 var nn_exports = {};
 __export(nn_exports, {
 	celu: () => celu,
+	dotProductAttention: () => dotProductAttention,
 	elu: () => elu,
 	gelu: () => gelu,
 	glu: () => glu,
@@ -7394,6 +7499,95 @@ function oneHot(x, numClasses) {
 	if (isFloatDtype(x.dtype)) throw new TypeError(`oneHot expects integers, got ${x.dtype}`);
 	return eye(numClasses, void 0, { device: x.device }).slice(x);
 }
+/**
+* Scaled dot product attention (SDPA).
+*
+* Computes `softmax((Q @ K^T) / sqrt(d) + bias) @ V`, where `Q` is the query,
+* `K` is the key, `V` is the value, and `d` is the dimensionality of each key
+* and query vector.
+*
+* Grouped-query attention (GQA) is applied when input `key` and `value`
+* tensors have fewer heads than `query` (multi-query attention is `K = 1`).
+*
+* We use the following uppercase letters to denote array shapes:
+* - `B` = batch size
+* - `S` = length of key/value sequences (source)
+* - `L` = length of query sequences
+* - `N` = number of attention heads
+* - `H` = dimensionality of each attention head
+* - `K` = number of key/value heads (for grouped-query attention)
+*
+* The batch size `B` may be omitted, which is equivalent to `B = 1`. In this
+* case it must be omitted from all inputs.
+*
+* @param query - Query array; shape `[B, L, N, H]`
+* @param key - Key array; shape `[B, S, K, H]`
+* @param value - Value array; same shape as `key`
+* @param opts.bias - Optional bias to add to the attention logits; shape
+*   `[B, N, L, S]` or broadcastable to it.
+* @param opts.mask - Optional mask to apply to the attention logits; should be
+*   a boolean array broadcastable to `[B, N, L, S]`, where `true` indicates
+*   the element should take part in attention.
+* @param opts.scale - Scaling factor override, default is `1 / sqrt(H)`.
+* @param opts.isCausal - If true, applies a causal mask.
+* @param opts.querySeqLengths - Optional sequence lengths for the queries;
+*   shape `(B,)`. Taken from the beginning of the tensor.
+* @param opts.keyValueSeqLengths - Optional sequence lengths for the keys and
+*   values; shape `(B,)`. Taken from the beginning of the tensor.
+* @param opts.localWindowSize - If specified, applies a local attention window
+*   of the given size. Can be a single number or a tuple `[left, right]`.
+*
+* @returns The result of the attention operation; shape is the same as query
+*   `[B, L, N, H]`, or `[L, N, H]` if `B` is omitted.
+*/
+function dotProductAttention(query, key$1, value, opts = {}) {
+	if (opts.querySeqLengths !== void 0 || opts.keyValueSeqLengths !== void 0) throw new Error("Sequence length masking is not yet implemented");
+	if (opts.localWindowSize !== void 0) throw new Error("Local attention is not yet implemented");
+	query = fudgeArray(query);
+	key$1 = fudgeArray(key$1);
+	value = fudgeArray(value);
+	if (query.ndim !== 3 && query.ndim !== 4 || query.ndim !== key$1.ndim || query.ndim !== value.ndim) throw new Error(`dotProductAttention: expected all tensors to have rank 3 or 4, got Q=${query.aval}, K=${key$1.aval}, V=${value.aval}`);
+	if (!deepEqual(key$1.shape, value.shape)) throw new Error(`dotProductAttention: key and value shapes must match, got K=${key$1.shape}, V=${value.shape}`);
+	const isRank3 = query.ndim === 3;
+	if (isRank3) {
+		query = expandDims(query, 0);
+		key$1 = expandDims(key$1, 0);
+		value = expandDims(value, 0);
+	}
+	const [B, L, N, H] = query.shape;
+	if (key$1.shape[0] !== B || key$1.shape[3] !== H) throw new Error(`dotProductAttention: query and key shapes mismatch, got Q=${query.aval}, K=${key$1.aval}`);
+	const S = key$1.shape[1];
+	const K = key$1.shape[2];
+	if (N < K || N != K && N % K !== 0) throw new Error(`dotProductAttention: number of query heads N=${N} must be divisible by number of key/value heads K=${K} for GQA`);
+	const G = N / K;
+	key$1 = tile(key$1, [
+		1,
+		1,
+		G,
+		1
+	]);
+	value = tile(value, [
+		1,
+		1,
+		G,
+		1
+	]);
+	const scale = opts.scale ?? 1 / Math.sqrt(H);
+	let scores = einsum("BLNH,BSNH->BNLS", query, key$1).mul(scale);
+	if (opts.bias !== void 0) scores = scores.add(opts.bias);
+	if (opts.mask !== void 0) scores = where(opts.mask, scores, -Infinity);
+	if (opts.isCausal) {
+		const causalMask = tri(L, S, 0, { dtype: DType.Bool });
+		scores = where(causalMask, scores, -Infinity);
+	}
+	const attn = softmax(scores, -1);
+	const out = einsum("BNLS,BSNH->BLNH", attn, value);
+	return isRank3 ? out.reshape([
+		L,
+		N,
+		H
+	]) : out;
+}
 //#endregion
 //#region src/library/random.ts
@@ -7629,17 +7823,62 @@ const linearize = linearize$1;
 /**
 * @function
 * Calculate the reverse-mode vector-Jacobian product for a function.
+*
+* The return value is a tuple of `[out, vjpFn]`, where `out` is the output of
+* `f(primals)`, and `vjpFn` is a function that takes in cotangents for each
+* output and returns the cotangents for each input.
+*
+* When `{ hasAux: true }` is passed, the function `f` is expected to return an
+* `[out, aux]` tuple, and `vjp` returns `[out, vjpFn, aux]`.
+*
+* @example
+* ```ts
+* const [y, vjpFn] = vjp(f, [x]);
+*
+* // With hasAux
+* const [y, vjpFn, aux] = vjp(f, [x], { hasAux: true });
+* ```
 */
 const vjp = vjp$1;
 /**
 * @function
 * Compute the gradient of a scalar-valued function `f` with respect to its
 * first argument.
+*
+* Pass in different `argnums` to differentiate with respect to other
+* arguments. If a tuple is provided, the return value will be a tuple of
+* gradients corresponding to each argument index.
+*
+* When `{ hasAux: true }` is passed, the function `f` is expected to return a
+* `[out, aux]` tuple, and the return value will be `[gradient, aux]`.
+*
+* @example
+* ```ts
+* const gradient = grad(f)(x);
+*
+* // With `argnums`
+* const [gradientX, gradientZ] = grad(f, { argnums: [0, 2] })(x, y, z);
+*
+* // With `hasAux`
+* const [gradient, aux] = grad(f, { hasAux: true })(x);
+* ```
 */
 const grad = grad$1;
 /**
 * @function
 * Create a function that evaluates both `f` and the gradient of `f`.
+*
+* When `{ hasAux: true }` is passed, the function `f` is expected to return an
+* `[out, aux]` tuple, and the return value will be `[[out, aux], gradient]`.
+*
+* @example
+* ```ts
+* // Without hasAux
+* const [value, gradient] = valueAndGrad(f)(x);
+*
+* // With hasAux
+* const [[value, aux], gradient] = valueAndGrad(f, { hasAux: true })(x);
+* ```
 */
 const valueAndGrad = valueAndGrad$1;
 /**
@@ -7648,6 +7887,21 @@ const valueAndGrad = valueAndGrad$1;
 */
 const jacrev = jacrev$1;
 /**
+* @function
+* Compute the Hessian matrix of a scalar-valued function.
+*
+* The Hessian is the matrix of second-order partial derivatives of a function.
+* This is implemented as `jacfwd(grad(f))`.
+*
+* @example
+* ```ts
+* const f = (x: np.Array) => np.sum(x.ref.mul(x.ref).mul(x)); // x^3
+* const H = hessian(f)(np.array([1, 2, 3]));
+* // H[i,j] = d^2f / dx_i dx_j
+* ```
+*/
+const hessian = hessian$1;
+/**
 * Wait until all `Array` leaves are ready by calling `Array.blockUntilReady()`.
 *
 * This can be used to wait for the results of an intermediate computation to
@@ -7682,4 +7936,4 @@ async function devicePut(x, device) {
 }
 //#endregion
-export { Array$1 as Array, ClosedJaxpr, DType, Jaxpr, blockUntilReady, defaultDevice, devicePut, devices, grad, init, jacfwd, jacrev as jacobian, jacrev, jit, jvp, lax_exports as lax, linearize, makeJaxpr, nn_exports as nn, numpy_exports as numpy, random_exports as random, scipy_special_exports as scipySpecial, setDebug, tree_exports as tree, valueAndGrad, vjp, vmap };
+export { Array$1 as Array, ClosedJaxpr, DType, Jaxpr, blockUntilReady, defaultDevice, devicePut, devices, grad, hessian, init, jacfwd, jacrev as jacobian, jacrev, jit, jvp, lax_exports as lax, linearize, makeJaxpr, nn_exports as nn, numpy_exports as numpy, random_exports as random, scipy_special_exports as scipySpecial, setDebug, tree_exports as tree, valueAndGrad, vjp, vmap };