npm - @jax-js/jax - Versions diffs - 0.1.2 → 0.1.3 - Mend

@jax-js/jax 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/README.md +11 -32
package/dist/{backend-BqymqzuU.js → backend-BY8wlLEl.js} +58 -20
package/dist/{backend-DeVfWEFS.cjs → backend-CmaidnkQ.cjs} +58 -20
package/dist/index.cjs +298 -134
package/dist/index.d.cts +21 -5
package/dist/index.d.ts +21 -5
package/dist/index.js +298 -134
package/dist/{webgpu-CcGP160M.cjs → webgpu-BVns4DbI.cjs} +14 -6
package/dist/{webgpu-BGuG58KZ.js → webgpu-C9iAP5h5.js} +14 -6
package/package.json +1 -1

package/dist/index.cjs CHANGED

@@ -30,30 +30,38 @@ var __toESM = (mod$1, isNodeMode, target) => (target = mod$1 != null ? __create(
 }) : target, mod$1));
 //#endregion
-const require_backend = require('./backend-DeVfWEFS.cjs');
+const require_backend = require('./backend-CmaidnkQ.cjs');
 //#region src/frontend/convolution.ts
 /**
 * Check that the shapes and parameters passed to convolution are valid.
+* Expected shapes of the lhs and rhs of the convolution are:
+*
+* - `lhsShape = [*vmapDims, batchSize, inChannels, spatialDims...]`
+* - `rhsShape = [*vmapDims, outChannels, inChannels, kernelSize...]`
 *
 * If the check succeeds, returns the output shape.
 */
-function checkConvShape(lhsShape, rhsShape, { strides, padding, lhsDilation, rhsDilation }) {
+function checkConvShape(lhsShape, rhsShape, { vmapDims, strides, padding, lhsDilation, rhsDilation }) {
 	if (lhsShape.length !== rhsShape.length) throw new Error(`conv() requires inputs with the same number of dimensions, got ${lhsShape.length} and ${rhsShape.length}`);
-	const n = lhsShape.length - 2;
+	const n = lhsShape.length - 2 - vmapDims;
 	if (n < 0) throw new Error("conv() requires at least 2D inputs");
 	if (strides.length !== n) throw new Error("conv() strides != spatial dims");
 	if (padding.length !== n) throw new Error("conv() padding != spatial dims");
 	if (lhsDilation.length !== n) throw new Error("conv() lhsDilation != spatial dimensions");
 	if (rhsDilation.length !== n) throw new Error("conv() rhsDilation != spatial dimensions");
-	if (lhsShape[1] !== rhsShape[1]) throw new Error(`conv() input channels: ${lhsShape[1]} != ${rhsShape[1]}`);
-	const outShape = [lhsShape[0], rhsShape[0]];
+	if (lhsShape[vmapDims + 1] !== rhsShape[vmapDims + 1]) throw new Error(`conv() input channels: ${lhsShape[1]} != ${rhsShape[1]}`);
+	const outShape = [
+		...require_backend.generalBroadcast(lhsShape.slice(0, vmapDims), rhsShape.slice(0, vmapDims)),
+		lhsShape[vmapDims],
+		rhsShape[vmapDims]
+	];
 	for (let i = 0; i < n; i++) {
 		if (strides[i] <= 0 || !Number.isInteger(strides[i])) throw new Error(`conv() strides[${i}] must be a positive integer`);
 		if (padding[i].length !== 2 || !padding[i].every(Number.isInteger)) throw new Error(`conv() padding[${i}] must be a 2-tuple of integers`);
 		if (lhsDilation[i] <= 0 || !Number.isInteger(lhsDilation[i])) throw new Error(`conv() lhsDilation[${i}] must be a positive integer`);
 		if (rhsDilation[i] <= 0 || !Number.isInteger(rhsDilation[i])) throw new Error(`conv() rhsDilation[${i}] must be a positive integer`);
-		const [x, k] = [lhsShape[i + 2], rhsShape[i + 2]];
+		const [x, k] = [lhsShape[i + vmapDims + 2], rhsShape[i + vmapDims + 2]];
 		if (k <= 0) throw new Error("conv() kernel size must be positive");
 		const [pl, pr] = padding[i];
 		if (pl < -x || pr < -x || pl + pr < -x) throw new Error(`conv() padding[${i}]=(${pl},${pr}) is too negative for input size ${x}`);
@@ -178,27 +186,13 @@ function poolTranspose(st, inShape, ks, strides = 1, dilation = 1) {
 function applyDilation(st, dilation) {
 	if (dilation.every((s) => s === 1)) return st;
 	const s_ = dilation;
-	const [a, b, ...k_] = st.shape;
-	st = st.reshape([
-		a,
-		b,
-		...k_.flatMap((k) => [k, 1])
-	]);
-	st = st.pad([
-		[0, 0],
-		[0, 0],
-		...s_.flatMap((s) => [[0, 0], [0, s - 1]])
-	]);
-	st = st.reshape([
-		a,
-		b,
-		...k_.map((k, i) => k * s_[i])
-	]);
-	st = st.shrink([
-		[0, a],
-		[0, b],
-		...k_.map((k, i) => [0, (k - 1) * s_[i] + 1])
-	]);
+	const n = s_.length;
+	const prefix = st.shape.slice(0, -n);
+	const k_ = st.shape.slice(-n);
+	st = st.reshape([...prefix, ...k_.flatMap((k) => [k, 1])]);
+	st = st.pad([...prefix.map(() => [0, 0]), ...s_.flatMap((s) => [[0, 0], [0, s - 1]])]);
+	st = st.reshape([...prefix, ...k_.map((k, i) => k * s_[i])]);
+	st = st.shrink([...prefix.map((p) => [0, p]), ...k_.map((k, i) => [0, (k - 1) * s_[i] + 1])]);
 	return st;
 }
 /**
@@ -208,25 +202,26 @@ function applyDilation(st, dilation) {
 * beforehand using `checkConvShape()`.
 */
 function prepareConv(stX, stY, params) {
-	const n = stX.shape.length - 2;
+	const v = params.vmapDims;
+	const n = stX.shape.length - 2 - v;
+	const vmapShape = stX.shape.slice(0, v);
 	stX = applyDilation(stX, params.lhsDilation);
-	const ks = stY.shape.slice(2);
-	stX = stX.padOrShrink([
-		[0, 0],
-		[0, 0],
-		...params.padding
-	]);
+	const ks = stY.shape.slice(v + 2);
+	stX = stX.padOrShrink([...require_backend.rep(v + 2, [0, 0]), ...params.padding]);
 	stX = pool(stX, ks, params.strides, params.rhsDilation);
-	stX = stX.moveaxis(1, n + 1).reshape([
-		stX.shape[0],
+	stX = stX.moveaxis(v + 1, v + n + 1).reshape([
+		...vmapShape,
+		stX.shape[v],
 		1,
-		...stX.shape.slice(2, n + 2),
-		stX.shape[1] * require_backend.prod(ks)
+		...stX.shape.slice(v + 2, v + n + 2),
+		stX.shape[v + 1] * require_backend.prod(ks)
 	]);
 	stY = stY.reshape([
-		stY.shape[0],
+		...vmapShape,
+		1,
+		stY.shape[v],
 		...require_backend.rep(n, 1),
-		stY.shape[1] * require_backend.prod(ks)
+		stY.shape[v + 1] * require_backend.prod(ks)
 	]);
 	return [stX, stY];
 }
@@ -498,9 +493,11 @@ function dot$2(x, y) {
 }
 function conv$1(x, y, params = {}) {
 	if (x.ndim !== y.ndim) throw new Error(`conv() requires inputs with the same number of dimensions, got ${x.ndim} and ${y.ndim}`);
-	const n = x.ndim - 2;
+	const vmapDims = params.vmapDims ?? 0;
+	const n = x.ndim - 2 - vmapDims;
 	if (n < 0) throw new Error("conv() requires at least 2D inputs");
 	return bind1(Primitive.Conv, [x, y], {
+		vmapDims,
 		strides: params.strides ?? require_backend.rep(n, 1),
 		padding: params.padding ?? require_backend.rep(n, [0, 0]),
 		lhsDilation: params.lhsDilation ?? require_backend.rep(n, 1),
@@ -724,8 +721,10 @@ var Tracer = class Tracer {
 		axis = require_backend.normalizeAxis(axis, this.ndim);
 		const n = axis.reduce((acc, a) => acc * this.shape[a], 1);
 		if (n === 0) throw new Error("mean: cannot compute mean over zero-length axis");
-		const result = reduce(this, require_backend.AluOp.Add, axis, opts);
-		return result.mul(1 / n);
+		const originalDtype = this.dtype;
+		const castDtype = require_backend.promoteTypes(originalDtype, require_backend.DType.Float32);
+		const result = reduce(this.astype(castDtype), require_backend.AluOp.Add, axis, opts);
+		return result.mul(1 / n).astype(originalDtype);
 	}
 	/** Permute the dimensions of an array. Defaults to reversing the axis order. */
 	transpose(perm) {
@@ -1205,7 +1204,7 @@ var Jaxpr = class Jaxpr {
 			} else if (eqn.primitive === Primitive.Idiv) {
 				const [a, b] = inputs;
 				const c = eqn.outBinders[0];
-				if (atomIsLit(b, 1)) context.set(c, a);
+				if (atomIsLit(b, 1) && !require_backend.isFloatDtype(a.aval.dtype)) context.set(c, a);
 				else newEqns.push(eqn);
 			} else if ((eqn.primitive === Primitive.Broadcast || eqn.primitive === Primitive.Reshape) && require_backend.deepEqual(eqn.params.shape, eqn.inputs[0].aval.shape) || eqn.primitive === Primitive.Transpose && eqn.params.perm.every((p, i) => p === i) || eqn.primitive === Primitive.Flip && eqn.params.axis.length === 0 || eqn.primitive === Primitive.Shrink && eqn.params.slice.every(([s, e$2], i) => s === 0 && e$2 === eqn.inputs[0].aval.shape[i]) || eqn.primitive === Primitive.Pad && eqn.params.width.every(([w0, w1]) => w0 === 0 && w1 === 0)) context.set(eqn.outBinders[0], eqn.inputs[0]);
 			else newEqns.push(eqn);
@@ -1790,48 +1789,73 @@ function jitCompile(backend, jaxpr, consts) {
 		const inputExps = [];
 		const inputAvals = [];
 		const inputArgs = [];
-		for (const input of eqn.inputs) if (input instanceof Var) {
-			const jitValue = ctx.get(input);
-			if (jitValue.type === "exp") {
-				const gidMap = /* @__PURE__ */ new Map();
-				for (const [gid, jitId] of jitValue.args.entries()) {
-					let newGid = inputArgs.indexOf(jitId);
-					if (newGid === -1) {
-						newGid = inputArgs.length;
-						inputArgs.push(jitId);
-					}
-					gidMap.set(gid, newGid);
-				}
-				inputExps.push(jitValue.exp.reindexGids(gidMap));
-			} else if (jitValue.type === "imm") {
-				let gid = inputArgs.indexOf(jitValue.arg);
-				if (gid === -1) {
-					gid = inputArgs.length;
-					inputArgs.push(jitValue.arg);
+		let inputReduction = null;
+		const addArgs = (args) => {
+			const newGids = [];
+			for (const jitId of args) {
+				let newGid = inputArgs.indexOf(jitId);
+				if (newGid === -1) {
+					newGid = inputArgs.length;
+					inputArgs.push(jitId);
 				}
+				newGids.push(newGid);
+			}
+			return newGids;
+		};
+		for (const input of eqn.inputs) if (input instanceof Var) {
+			const jv = ctx.get(input);
+			if (jv.type === "exp") {
+				const newGids = addArgs(jv.args);
+				inputExps.push(jv.exp.reindexGids(newGids));
+			} else if (jv.type === "imm") {
+				const [gid] = addArgs([jv.arg]);
 				const st = require_backend.ShapeTracker.fromShape(input.aval.shape);
 				const indices = require_backend.unravelAlu(st.shape, require_backend.AluVar.gidx);
 				inputExps.push(require_backend.AluExp.globalView(input.aval.dtype, gid, st, indices));
+			} else if (jv.type === "red") {
+				if (inputReduction) throw new Error("jit: unexpected, multiple red inputs");
+				const newGids = addArgs(jv.args);
+				inputExps.push(jv.reduction.epilogue.reindexGids(newGids));
+				inputReduction = jv;
 			}
 			inputAvals.push(input.aval);
 		} else if (input instanceof Lit) {
 			inputExps.push(require_backend.AluExp.const(input.dtype, input.value));
 			inputAvals.push(input.aval);
 		} else throw new TypeError(`Unexpected input in Jaxpr: ${input}`);
-		const nargs$1 = inputArgs.length;
 		const rule = jitRules[eqn.primitive];
 		if (!rule) throw new TypeError(`JIT not implemented for primitive ${eqn.primitive}`);
-		const kernel = rule(nargs$1, inputExps, inputAvals, eqn.params);
+		let exp$2;
+		let reduction;
+		if (inputReduction) {
+			const jv = inputReduction;
+			const newEpilogue = rule(inputExps, inputAvals, eqn.params).exp;
+			exp$2 = jv.exp.reindexGids(addArgs(jv.args));
+			reduction = new require_backend.Reduction(jv.reduction.dtype, jv.reduction.op, jv.reduction.size, newEpilogue);
+		} else {
+			const ruleOutput = rule(inputExps, inputAvals, eqn.params);
+			exp$2 = ruleOutput.exp;
+			reduction = ruleOutput.reduction;
+		}
 		const outVar = eqn.outBinders[0];
-		if (kernel.reduction || blackNodes.has(outVar)) {
+		if (blackNodes.has(outVar)) {
+			const nargs$1 = inputArgs.length;
+			const size$1 = require_backend.prod(outVar.aval.shape);
+			const kernel = new require_backend.Kernel(nargs$1, size$1, exp$2, reduction);
 			const outId = builder.pushKernel(kernel, inputArgs);
 			ctx.set(outVar, {
 				type: "imm",
 				arg: outId
 			});
-		} else ctx.set(outVar, {
+		} else if (reduction) ctx.set(outVar, {
+			type: "red",
+			exp: exp$2,
+			reduction,
+			args: inputArgs
+		});
+		else ctx.set(outVar, {
 			type: "exp",
-			exp: kernel.exp,
+			exp: exp$2,
 			args: inputArgs
 		});
 	}
@@ -1863,31 +1887,28 @@ function reshapeViews(exp$2, mapping, reduceAxis = false) {
 	});
 }
 function broadcastedJit(fn, opts) {
-	return (nargs, exps, avals, params) => {
+	return (exps, avals, params) => {
 		let { shape: newShape, dtype: newDtype } = avals.reduce(promoteAvals);
 		const skipCastIdx = opts?.skipCastIdx ?? [];
 		if (skipCastIdx.length) newDtype = avals.filter((_, i) => !skipCastIdx.includes(i)).reduce(promoteAvals).dtype;
-		exps = exps.map((exp$3, i) => {
-			exp$3 = reshapeViews(exp$3, (st) => {
+		exps = exps.map((exp$2, i) => {
+			exp$2 = reshapeViews(exp$2, (st) => {
 				if (!require_backend.deepEqual(st.shape, newShape)) return st.broadcast(newShape, require_backend.range(newShape.length - st.shape.length));
 			});
-			if (exp$3.dtype !== newDtype && !skipCastIdx.includes(i)) exp$3 = require_backend.AluExp.cast(newDtype, exp$3);
-			return exp$3;
+			if (exp$2.dtype !== newDtype && !skipCastIdx.includes(i)) exp$2 = require_backend.AluExp.cast(newDtype, exp$2);
+			return exp$2;
 		});
-		const exp$2 = fn(exps, params);
-		return new require_backend.Kernel(nargs, require_backend.prod(newShape), exp$2);
+		return { exp: fn(exps, params) };
 	};
 }
 function unopJit(fn) {
-	return (nargs, [a], [as], params) => {
-		return new require_backend.Kernel(nargs, require_backend.prod(as.shape), fn(a, params));
+	return ([a], [_as], params) => {
+		return { exp: fn(a, params) };
 	};
 }
 function reshapeJit(fn) {
-	return (nargs, [a], [as], params) => {
-		a = reshapeViews(a, (st) => fn(st, params));
-		const newShape = fn(require_backend.ShapeTracker.fromShape(as.shape), params).shape;
-		return new require_backend.Kernel(nargs, require_backend.prod(newShape), a);
+	return ([a], [_as], params) => {
+		return { exp: reshapeViews(a, (st) => fn(st, params)) };
 	};
 }
 const jitRules = {
@@ -1902,7 +1923,7 @@ const jitRules = {
 	[Primitive.StopGradient]: unopJit((a) => a),
 	[Primitive.Cast]: unopJit((a, { dtype }) => require_backend.AluExp.cast(dtype, a)),
 	[Primitive.Bitcast]: unopJit((a, { dtype }) => require_backend.AluExp.bitcast(dtype, a)),
-	[Primitive.RandomBits]: (nargs, keys, keyShapes, { shape: shape$1, mode }) => {
+	[Primitive.RandomBits]: (keys, keyShapes, { shape: shape$1, mode }) => {
 		const mapping = (st) => {
 			if (!require_backend.deepEqual(st.shape, shape$1)) return st.broadcast(shape$1, require_backend.range(shape$1.length - st.shape.length));
 		};
@@ -1911,7 +1932,7 @@ const jitRules = {
 		const c0 = require_backend.AluExp.u32(0);
 		const c1 = require_backend.AluExp.cast(require_backend.DType.Uint32, require_backend.AluVar.gidx);
 		const exp$2 = require_backend.AluExp.threefry2x32(k0, k1, c0, c1, mode);
-		return new require_backend.Kernel(nargs, require_backend.prod(shape$1), exp$2);
+		return { exp: exp$2 };
 	},
 	[Primitive.Sin]: unopJit(require_backend.AluExp.sin),
 	[Primitive.Cos]: unopJit(require_backend.AluExp.cos),
@@ -1924,7 +1945,7 @@ const jitRules = {
 	[Primitive.Sqrt]: unopJit(require_backend.AluExp.sqrt),
 	[Primitive.Min]: broadcastedJit(([a, b]) => require_backend.AluExp.min(a, b)),
 	[Primitive.Max]: broadcastedJit(([a, b]) => require_backend.AluExp.max(a, b)),
-	[Primitive.Reduce](nargs, [a], [as], { op, axis }) {
+	[Primitive.Reduce]([a], [as], { op, axis }) {
 		const keptAxes = [];
 		const shiftedAxes = [];
 		const newShape = [];
@@ -1933,39 +1954,43 @@ const jitRules = {
 			keptAxes.push(i);
 			newShape.push(as.shape[i]);
 		}
-		const size$1 = require_backend.prod(newShape);
 		const reductionSize = require_backend.prod(shiftedAxes.map((ax) => as.shape[ax]));
 		newShape.push(reductionSize);
 		const perm = keptAxes.concat(shiftedAxes);
 		a = reshapeViews(a, (st) => st.permute(perm).reshape(newShape), true);
 		const reduction = new require_backend.Reduction(a.dtype, op, reductionSize);
-		return new require_backend.Kernel(nargs, size$1, a, reduction);
+		return {
+			exp: a,
+			reduction
+		};
 	},
 	[Primitive.Pool]: reshapeJit((st, { window, strides }) => pool(st, window, strides)),
-	[Primitive.PoolTranspose](nargs, [a], [as], { inShape, window, strides }) {
+	[Primitive.PoolTranspose]([a], [as], { inShape, window, strides }) {
 		let stX = poolTranspose(require_backend.ShapeTracker.fromShape(as.shape), inShape, window, strides);
-		const size$1 = require_backend.prod(inShape);
 		stX = stX.reshape([...inShape, require_backend.prod(stX.shape.slice(inShape.length))]);
 		a = reshapeViews(a, (st) => st.compose(stX), true);
 		const reduction = new require_backend.Reduction(a.dtype, require_backend.AluOp.Add, stX.shape[stX.shape.length - 1]);
-		return new require_backend.Kernel(nargs, size$1, a, reduction);
+		return {
+			exp: a,
+			reduction
+		};
 	},
-	[Primitive.Dot](nargs, [a, b], [as, bs]) {
-		const k1 = jitRules[Primitive.Mul](nargs, [a, b], [as, bs], {});
+	[Primitive.Dot]([a, b], [as, bs]) {
+		const k1 = jitRules[Primitive.Mul]([a, b], [as, bs], {});
 		const c = k1.exp;
 		const cs = promoteAvals(as, bs);
-		return jitRules[Primitive.Reduce](nargs, [c], [cs], {
+		return jitRules[Primitive.Reduce]([c], [cs], {
 			op: require_backend.AluOp.Add,
 			axis: [cs.ndim - 1]
 		});
 	},
-	[Primitive.Conv](nargs, [a, b], [as, bs], params) {
+	[Primitive.Conv]([a, b], [as, bs], params) {
 		const [stX, stY] = prepareConv(require_backend.ShapeTracker.fromShape(as.shape), require_backend.ShapeTracker.fromShape(bs.shape), params);
 		a = reshapeViews(a, (st) => st.compose(stX));
 		b = reshapeViews(b, (st) => st.compose(stY));
 		as = new ShapedArray(stX.shape, as.dtype, as.weakType);
 		bs = new ShapedArray(stY.shape, bs.dtype, bs.weakType);
-		return jitRules[Primitive.Dot](nargs, [a, b], [as, bs], {});
+		return jitRules[Primitive.Dot]([a, b], [as, bs], {});
 	},
 	[Primitive.Compare]: broadcastedJit(([a, b], { op }) => aluCompare(a, b, op)),
 	[Primitive.Where]: broadcastedJit(([cond, a, b]) => require_backend.AluExp.where(cond, a, b), { skipCastIdx: [0] }),
@@ -1979,7 +2004,7 @@ const jitRules = {
 	}),
 	[Primitive.Shrink]: reshapeJit((st, { slice }) => st.shrink(slice)),
 	[Primitive.Pad]: reshapeJit((st, { width }) => st.pad(width)),
-	[Primitive.Gather](nargs, [x, ...indices], [xs, ...indicesShapes], { axis, outDim }) {
+	[Primitive.Gather]([x, ...indices], [xs, ...indicesShapes], { axis, outDim }) {
 		const axisSet = new Set(axis);
 		const indexShape = indicesShapes.map((c) => c.shape).reduce(require_backend.generalBroadcast);
 		const finalShape = xs.shape.filter((_, i) => !axisSet.has(i));
@@ -1992,7 +2017,7 @@ const jitRules = {
 		for (const [i, iexp] of indices.entries()) src[axis[i]] = require_backend.AluExp.cast(require_backend.DType.Int32, reshapeViews(iexp, (st) => st.broadcast(finalShape, [...require_backend.range(outDim + indexShape.length - st.shape.length), ...require_backend.range(outDim + indexShape.length, finalShape.length)])));
 		const [index, valid] = require_backend.ShapeTracker.fromShape(xs.shape).toAluExp(src);
 		if (!valid.resolve()) throw new Error("internal: expected full validity mask in Gather");
-		return new require_backend.Kernel(nargs, require_backend.prod(finalShape), x.substitute({ gidx: index }));
+		return { exp: x.substitute({ gidx: index }) };
 	},
 	[Primitive.JitCall]() {
 		throw new Error("internal: JitCall should have been flattened before JIT compilation");
@@ -2000,16 +2025,16 @@ const jitRules = {
 };
 /** Determines how to split the Jaxpr into kernels via dataflow analysis. */
 function splitGraphDataflow(backend, jaxpr) {
-	const varToEqn = /* @__PURE__ */ new Map();
+	const varToDefn = /* @__PURE__ */ new Map();
+	const varToUsages = /* @__PURE__ */ new Map();
 	for (let i = 0; i < jaxpr.eqns.length; i++) {
 		const eqn = jaxpr.eqns[i];
-		for (const v of eqn.outBinders) if (v instanceof Var) varToEqn.set(v, i);
-	}
-	const blackNodes = /* @__PURE__ */ new Set();
-	const p1NextBlack = /* @__PURE__ */ new Map();
-	for (const v of jaxpr.outs) if (v instanceof Var) {
-		blackNodes.add(v);
-		p1NextBlack.set(v, v);
+		for (const v of eqn.outBinders) if (v instanceof Var) varToDefn.set(v, i);
+		for (const input of eqn.inputs) if (input instanceof Var) {
+			const usages = varToUsages.get(input);
+			if (usages) usages.push(i);
+			else varToUsages.set(input, [i]);
+		}
 	}
 	const reducePrimitives = [
 		Primitive.Reduce,
@@ -2017,10 +2042,68 @@ function splitGraphDataflow(backend, jaxpr) {
 		Primitive.Conv,
 		Primitive.PoolTranspose
 	];
+	const reductionEpilogueEqns = /* @__PURE__ */ new Set();
+	const reductionEndpointEqns = /* @__PURE__ */ new Set();
+	for (let i = 0; i < jaxpr.eqns.length; i++) {
+		const eqn = jaxpr.eqns[i];
+		if (reducePrimitives.includes(eqn.primitive)) {
+			let head = i;
+			while (true) {
+				reductionEpilogueEqns.add(head);
+				const outVar = jaxpr.eqns[head].outBinders[0];
+				const usages = varToUsages.get(outVar) ?? [];
+				if (jaxpr.outs.includes(outVar) || usages.length !== 1) break;
+				if (reductionEpilogueEqns.has(usages[0])) break;
+				const nextEqn = jaxpr.eqns[usages[0]];
+				switch (nextEqn.primitive) {
+					case Primitive.Neg:
+					case Primitive.Reciprocal:
+					case Primitive.Floor:
+					case Primitive.Ceil:
+					case Primitive.StopGradient:
+					case Primitive.Cast:
+					case Primitive.Bitcast:
+					case Primitive.Sin:
+					case Primitive.Cos:
+					case Primitive.Asin:
+					case Primitive.Atan:
+					case Primitive.Exp:
+					case Primitive.Log:
+					case Primitive.Erf:
+					case Primitive.Erfc:
+					case Primitive.Sqrt:
+						head = usages[0];
+						continue;
+					case Primitive.Add:
+					case Primitive.Mul:
+					case Primitive.Idiv:
+					case Primitive.Mod:
+					case Primitive.Max:
+					case Primitive.Min: {
+						const otherInput = nextEqn.inputs.find((v) => v !== outVar);
+						if (otherInput instanceof Lit || require_backend.deepEqual(require_backend.generalBroadcast(otherInput.aval.shape, outVar.aval.shape), outVar.aval.shape)) {
+							head = usages[0];
+							continue;
+						}
+						break;
+					}
+				}
+				break;
+			}
+			reductionEndpointEqns.add(head);
+		}
+	}
+	const blackNodes = /* @__PURE__ */ new Set();
+	const p1NextBlack = /* @__PURE__ */ new Map();
+	for (const v of jaxpr.outs) if (v instanceof Var) {
+		blackNodes.add(v);
+		p1NextBlack.set(v, v);
+	}
 	const heterogeneousViewPrimitives = [Primitive.Gather, Primitive.RandomBits];
+	const needsCleanShapePrimitives = [Primitive.Pad];
 	for (let i = jaxpr.eqns.length - 1; i >= 0; i--) {
 		const eqn = jaxpr.eqns[i];
-		if (reducePrimitives.includes(eqn.primitive) || heterogeneousViewPrimitives.includes(eqn.primitive) || eqn.outBinders.some((v) => blackNodes.has(v))) {
+		if (reductionEndpointEqns.has(i) || heterogeneousViewPrimitives.includes(eqn.primitive) || eqn.outBinders.some((v) => blackNodes.has(v))) {
 			for (const v of eqn.outBinders) {
 				blackNodes.add(v);
 				p1NextBlack.set(v, v);
@@ -2028,17 +2111,25 @@ function splitGraphDataflow(backend, jaxpr) {
 			continue;
 		}
 		const reach = /* @__PURE__ */ new Set();
-		for (let j = i + 1; j < jaxpr.eqns.length; j++) for (const v of jaxpr.eqns[j].inputs) if (v instanceof Var && eqn.outBinders.includes(v)) for (const o of jaxpr.eqns[j].outBinders) {
-			const u = p1NextBlack.get(o);
-			if (u) reach.add(u);
+		let needsCleanOutput = false;
+		outer: for (const v of eqn.outBinders) for (const j of varToUsages.get(v) ?? []) {
+			if (needsCleanShapePrimitives.includes(jaxpr.eqns[j].primitive)) {
+				needsCleanOutput = true;
+				break outer;
+			}
+			for (const o of jaxpr.eqns[j].outBinders) {
+				const u = p1NextBlack.get(o);
+				if (u) reach.add(u);
+			}
 		}
-		if (reach.size === 1) {
-			const b = reach.values().next().value;
-			for (const v of eqn.outBinders) p1NextBlack.set(v, b);
-		} else if (reach.size > 1) for (const v of eqn.outBinders) {
+		if (reach.size > 1 || needsCleanOutput) for (const v of eqn.outBinders) {
 			blackNodes.add(v);
 			p1NextBlack.set(v, v);
 		}
+		else if (reach.size === 1) {
+			const b = reach.values().next().value;
+			for (const v of eqn.outBinders) p1NextBlack.set(v, b);
+		}
 	}
 	const p2Deps = /* @__PURE__ */ new Map();
 	for (const v of jaxpr.inBinders) p2Deps.set(v, new Set([v]));
@@ -2057,7 +2148,7 @@ function splitGraphDataflow(backend, jaxpr) {
 			let assocInput = -1;
 			for (let i = 0; i < eqn.inputs.length; i++) {
 				const input = eqn.inputs[i];
-				if (input instanceof Var && varToEqn.has(input)) {
+				if (input instanceof Var && varToDefn.has(input)) {
 					let uniqueDeps = 0;
 					for (const dep of deps[i]) if (depCounter.get(dep) === 1) uniqueDeps++;
 					if (uniqueDeps > maxUniqueDeps) {
@@ -2068,7 +2159,7 @@ function splitGraphDataflow(backend, jaxpr) {
 			}
 			if (assocInput === -1) throw new Error(`internal: maxArgs, no input found to mark as black in Jaxpr equation ${eqn}`);
 			const assocVar = eqn.inputs[assocInput];
-			p2idx = varToEqn.get(assocVar);
+			p2idx = varToDefn.get(assocVar);
 			for (const out of jaxpr.eqns[p2idx].outBinders) blackNodes.add(out);
 		} else {
 			const s = new Set(depCounter.keys());
@@ -3497,6 +3588,15 @@ const vmapRules = {
 		const z = dot$2(x, y);
 		return [[z], [z.ndim - 1]];
 	},
+	[Primitive.Conv](axisSize, [x, y], [xBdim, yBdim], params) {
+		x = moveBatchAxis(axisSize, xBdim, 0, x);
+		y = moveBatchAxis(axisSize, yBdim, 0, y);
+		const z = conv$1(x, y, {
+			...params,
+			vmapDims: params.vmapDims + 1
+		});
+		return [[z], [0]];
+	},
 	[Primitive.Compare](axisSize, args, dims, { op }) {
 		return broadcastBatcher((x, y) => compare(x, y, op))(axisSize, args, dims, {});
 	},
@@ -3941,7 +4041,7 @@ function partialEvalGraphToJaxpr(tracersIn, tracersOut) {
 	for (const t of tracersIn) t.dispose();
 	for (const t of tracersOut) t.dispose();
 	jaxpr = jaxpr.simplify();
-	if (require_backend.DEBUG >= 5) console.log("jaxpr from partial evaluation:\n" + jaxpr.toString());
+	if (require_backend.DEBUG >= 5) console.info("jaxpr from partial evaluation:\n" + jaxpr.toString());
 	return {
 		jaxpr,
 		consts
@@ -4075,22 +4175,25 @@ const transposeRules = {
 	},
 	[Primitive.Conv]([ct], [lhs, rhs], params) {
 		if (lhs instanceof UndefPrimal === rhs instanceof UndefPrimal) throw new NonlinearError(Primitive.Conv);
+		const v = params.vmapDims;
 		const rev01 = [
-			1,
-			0,
-			...require_backend.range(2, ct.ndim)
+			...require_backend.range(v),
+			v + 1,
+			v,
+			...require_backend.range(v + 2, ct.ndim)
 		];
 		if (lhs instanceof UndefPrimal) {
 			let kernel = rhs;
 			kernel = transpose$1(kernel, rev01);
-			kernel = flip$1(kernel, require_backend.range(2, kernel.ndim));
+			kernel = flip$1(kernel, require_backend.range(v + 2, kernel.ndim));
 			const result = conv$1(ct, kernel, {
+				vmapDims: v,
 				strides: params.lhsDilation,
 				padding: params.padding.map(([pl, _pr], i) => {
-					const dilatedKernel = (kernel.shape[i + 2] - 1) * params.rhsDilation[i] + 1;
-					const dilatedCt = (ct.shape[i + 2] - 1) * params.strides[i] + 1;
+					const dilatedKernel = (kernel.shape[i + v + 2] - 1) * params.rhsDilation[i] + 1;
+					const dilatedCt = (ct.shape[i + v + 2] - 1) * params.strides[i] + 1;
 					const padBefore = dilatedKernel - 1 - pl;
-					const dilatedLhs = (lhs.aval.shape[i + 2] - 1) * params.lhsDilation[i] + 1;
+					const dilatedLhs = (lhs.aval.shape[i + v + 2] - 1) * params.lhsDilation[i] + 1;
 					const padAfter = dilatedLhs + dilatedKernel - 1 - dilatedCt - padBefore;
 					return [padBefore, padAfter];
 				}),
@@ -4102,11 +4205,12 @@ const transposeRules = {
 			const newLhs = transpose$1(lhs, rev01);
 			const newRhs = transpose$1(ct, rev01);
 			let result = conv$1(newLhs, newRhs, {
+				vmapDims: v,
 				strides: params.rhsDilation,
 				padding: params.padding.map(([pl, _pr], i) => {
-					const dilatedLhs = (lhs.aval.shape[i + 2] - 1) * params.lhsDilation[i] + 1;
-					const dilatedKernel = (rhs.aval.shape[i + 2] - 1) * params.rhsDilation[i] + 1;
-					const dilatedCt = (ct.shape[i + 2] - 1) * params.strides[i] + 1;
+					const dilatedLhs = (lhs.aval.shape[i + v + 2] - 1) * params.lhsDilation[i] + 1;
+					const dilatedKernel = (rhs.aval.shape[i + v + 2] - 1) * params.rhsDilation[i] + 1;
+					const dilatedCt = (ct.shape[i + v + 2] - 1) * params.strides[i] + 1;
 					const padFromLhs = dilatedCt - dilatedLhs;
 					const padFromRhs = dilatedKernel - pl - 1;
 					return [pl, padFromLhs + padFromRhs];
@@ -4355,13 +4459,46 @@ function padtypeToPads(inShape, filterShape, strides, dilation, padding) {
 *
 * Grouped convolutions are not supported right now.
 */
-function convGeneralDilated(lhs, rhs, windowStrides, padding, { lhsDilation, rhsDilation } = {}) {
+function convGeneralDilated(lhs, rhs, windowStrides, padding, { lhsDilation, rhsDilation, featureGroupCount = 1 } = {}) {
 	if (lhs.ndim < 2) throw new Error("lhs must have at least 2 dimensions");
 	if (rhs.ndim < 2) throw new Error("rhs must have at least 2 dimensions");
 	if (typeof padding === "string") {
 		if (lhsDilation?.some((d) => d !== 1)) throw new Error("String padding is not supported for transposed convolutions");
 		padding = padtypeToPads(lhs.shape.slice(2), rhs.shape.slice(2), windowStrides, rhsDilation ?? require_backend.rep(rhs.ndim - 2, 1), padding);
 	}
+	if (featureGroupCount !== 1) {
+		const G = featureGroupCount;
+		const [N, C_in, ...xs] = lhs.shape;
+		const [C_out, C_in_per_group, ...ks] = rhs.shape;
+		if (C_in % G !== 0) throw new Error(`featureGroupCount=${G} must divide input channels=${C_in}`);
+		if (C_out % G !== 0) throw new Error(`featureGroupCount=${G} must divide output channels=${C_out}`);
+		if (C_in / G !== C_in_per_group) throw new Error(`rhs input channels=${C_in_per_group} must equal lhs input channels / groups=${C_in / G}`);
+		const lhsGrouped = moveaxis(lhs.reshape([
+			N,
+			G,
+			C_in / G,
+			...xs
+		]), 1, 0);
+		const rhsGrouped = rhs.reshape([
+			G,
+			C_out / G,
+			C_in_per_group,
+			...ks
+		]);
+		const result = conv$1(lhsGrouped, rhsGrouped, {
+			vmapDims: 1,
+			strides: windowStrides,
+			padding,
+			lhsDilation,
+			rhsDilation
+		});
+		const ys = result.shape.slice(3);
+		return moveaxis(result, 0, 1).reshape([
+			N,
+			C_out,
+			...ys
+		]);
+	}
 	return conv$1(lhs, rhs, {
 		strides: windowStrides,
 		padding,
@@ -4647,6 +4784,8 @@ __export(numpy_exports, {
 	concatenate: () => concatenate,
 	cos: () => cos,
 	cosh: () => cosh,
+	cumsum: () => cumsum,
+	cumulativeSum: () => cumulativeSum,
 	deg2rad: () => deg2rad,
 	degrees: () => degrees,
 	diag: () => diag,
@@ -4955,6 +5094,25 @@ function argmax(a, axis, opts) {
 	}).reshape([shape$1[axis], ...require_backend.rep(shape$1.length - axis - 1, 1)]));
 	return length.sub(max(idx, axis, opts));
 }
+/**
+* Cumulative sum of elements along an axis.
+*
+* Currently this function is `O(n^2)`, we'll improve this later on with a
+* two-phase parallel reduction algorithm.
+*/
+function cumsum(a, axis) {
+	a = fudgeArray(a);
+	if (axis === void 0) {
+		a = a.ravel();
+		axis = 0;
+	} else axis = require_backend.checkAxis(axis, a.ndim);
+	const n = a.shape[axis];
+	a = moveaxis$1(a, axis, -1);
+	a = broadcast(a, a.shape.concat(n), [-2]);
+	return moveaxis$1(tril(a).sum(-1), -1, axis);
+}
+/** @function Alternative name for `jax.numpy.cumsum()`. */
+const cumulativeSum = cumsum;
 /** Reverse the elements in an array along the given axes. */
 function flip(x, axis = null) {
 	const nd = ndim(x);
@@ -5190,7 +5348,10 @@ function allclose(actual, expected, options) {
 	if (!require_backend.deepEqual(x.shape, y.shape)) return false;
 	const xData = x.dataSync();
 	const yData = y.dataSync();
-	for (let i = 0; i < xData.length; i++) if (Math.abs(xData[i] - yData[i]) > atol + rtol * Math.abs(yData[i])) return false;
+	for (let i = 0; i < xData.length; i++) {
+		if (isNaN(xData[i]) !== isNaN(yData[i])) return false;
+		if (Math.abs(xData[i] - yData[i]) > atol + rtol * Math.abs(yData[i])) return false;
+	}
 	return true;
 }
 /** Matrix product of two arrays. */
@@ -5649,7 +5810,10 @@ const degrees = rad2deg;
 * Computes first array raised to power of second array, element-wise.
 */
 const power = jit$1(function power$1(x1, x2) {
-	return exp(log(x1).mul(x2));
+	const x2i = trunc(x2.ref);
+	const shouldBeNaN = multiply(x2.ref.notEqual(x2i.ref), x1.ref.less(0));
+	const resultSign = where(mod(x2i, 2).notEqual(0), where(x1.ref.less(0), -1, 1), 1);
+	return where(shouldBeNaN, nan, exp(log(abs(x1)).mul(x2)).mul(resultSign));
 });
 /** @function Alias of `jax.numpy.power()`. */
 const pow = power;
@@ -6005,22 +6169,22 @@ function logSoftmax(x, axis = -1) {
 *
 * Reference: https://en.wikipedia.org/wiki/LogSumExp
 */
-function logsumexp(x, axis = null) {
+function logsumexp(x, axis = null, opts) {
 	x = fudgeArray(x);
 	axis = require_backend.normalizeAxis(axis, x.ndim);
 	if (axis.length === 0) return x;
-	const xMax = stopGradient(max(x.ref, axis));
-	const xMaxDims = broadcast(xMax.ref, x.shape, axis);
-	const shifted = x.sub(xMaxDims);
-	return xMax.add(log(exp(shifted).sum(axis)));
+	const xMax = stopGradient(max(x.ref, axis, { keepdims: true }));
+	const shifted = x.sub(xMax.ref);
+	const result = xMax.add(log(exp(shifted).sum(axis, { keepdims: true })));
+	return opts?.keepdims ? result : squeeze(result, axis);
 }
 /** Log-mean-exp reduction, like `jax.nn.logsumexp()` but subtracts `log(n)`. */
-function logmeanexp(x, axis = null) {
+function logmeanexp(x, axis = null, opts) {
 	x = fudgeArray(x);
 	axis = require_backend.normalizeAxis(axis, x.ndim);
 	if (axis.length === 0) return x;
 	const n = axis.reduce((acc, a) => acc * x.shape[a], 1);
-	return logsumexp(x, axis).sub(Math.log(n));
+	return logsumexp(x, axis, opts).sub(Math.log(n));
 }
 /**
 * Standardizes input to zero mean and unit variance.