npm - @jax-js/jax - Versions diffs - 0.0.5 → 0.1.0 - Mend

@jax-js/jax 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/README.md +267 -92
package/dist/{backend-CdcTZEOF.js → backend-DwIAd0AG.js} +205 -112
package/dist/{backend-yEU0L_ig.cjs → backend-FtkbO6pI.cjs} +217 -118
package/dist/index.cjs +344 -67
package/dist/index.d.cts +96 -18
package/dist/index.d.ts +96 -18
package/dist/index.js +337 -67
package/dist/{webgpu-CNOpiO5T.cjs → webgpu-BE7zA_01.cjs} +181 -151
package/dist/{webgpu-CM-xNYzW.js → webgpu-LGi2A3mS.js} +181 -151
package/package.json +7 -5

package/dist/index.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { __export } from "./chunk-Cl8Af3a2.js";
-import { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, FpHash, Kernel, PPrint, Reduction, ShapeTracker, accessorAluExp, accessorGlobal, byteWidth, checkAxis, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, generalBroadcast, getBackend, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, normalizeAxis, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, toposort, unravelAlu, unzip2, zip, zipn } from "./backend-CdcTZEOF.js";
+import { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, FpHash, Kernel, PPrint, Reduction, ShapeTracker, accessorAluExp, accessorGlobal, assertNonNull, byteWidth, checkAxis, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, generalBroadcast, getBackend, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, normalizeAxis, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, toposort, unravelAlu, unzip2, zip, zipn } from "./backend-DwIAd0AG.js";
 //#region src/tree.ts
 var tree_exports = {};
@@ -29,6 +29,10 @@ var JsTreeDef = class JsTreeDef {
 		this.nodeMetadata = nodeMetadata;
 		this.childTreedefs = childTreedefs;
 	}
+	/** Get the total number of leaves in the tree. */
+	get size() {
+		return this.nodeType === NodeType.Leaf ? 1 : this.childTreedefs.reduce((a, b) => a + b.size, 0);
+	}
 	/** Returns a string representation of this tree definition. */
 	toString(root = true) {
 		if (root) return "JsTreeDef(" + this.toString(false) + ")";
@@ -184,6 +188,16 @@ function pool(st, ks, strides = 1, dilation = 1) {
 	const s_ = strides;
 	const d_ = dilation;
 	const o_ = zipn(i_, d_, ks, s_).map(([i, d, k, s]) => Math.ceil((i - d * (k - 1)) / s));
+	if (d_.every((d) => d === 1) && ks.every((k, j) => k <= s_[j])) {
+		st = st.padOrShrink([...noop.map(() => [0, 0]), ...zipn(i_, o_, s_).map(([i, o, s]) => [0, o * s - i])]);
+		st = st.reshape([...noop, ...zip(o_, s_).flatMap(([o, s]) => [o, s])]).shrink([...noop.map((x) => [0, x]), ...zip(o_, ks).flatMap(([o, k]) => [[0, o], [0, k]])]);
+		st = st.permute([
+			...range(noop.length),
+			...ks.map((_, j) => noop.length + 2 * j),
+			...ks.map((_, j) => noop.length + 2 * j + 1)
+		]);
+		return st;
+	}
 	const f_ = zipn(o_, s_, i_, d_, ks).map(([o, s, i, d, k]) => 1 + Number(o * s > i - d * (k - 1)));
 	const kidf = zipn(ks, i_, d_, f_);
 	st = st.repeat([...rep(noop.length, 1), ...kidf.map(([k, i, d, f]) => Math.ceil(k * (i * f + d) / i))]);
@@ -218,6 +232,12 @@ function poolTranspose(st, inShape, ks, strides = 1, dilation = 1) {
 	const s_ = strides;
 	const d_ = dilation;
 	const o_ = zipn(i_, d_, ks, s_).map(([i, d, k, s]) => Math.ceil((i - d * (k - 1)) / s));
+	if (d_.every((d) => d === 1) && ks.every((k, j) => k <= s_[j])) {
+		st = st.permute([...range(noop.length), ...ks.flatMap((_, j) => [noop.length + j, noop.length + o_.length + j])]);
+		st = st.pad([...noop.map(() => [0, 0]), ...zip(s_, ks).flatMap(([s, k]) => [[0, 0], [0, s - k]])]).reshape([...noop, ...zip(o_, s_).map(([o, s]) => o * s)]);
+		st = st.padOrShrink([...noop.map(() => [0, 0]), ...zipn(i_, o_, s_).map(([i, o, s]) => [0, i - o * s])]);
+		return st.reshape(st.shape.concat(rep(ks.length, 1)));
+	}
 	if (!deepEqual(o_, st.shape.slice(noop.length, noop.length + ks.length))) throw new Error("poolTranspose() called with mismatched output shape");
 	const f_ = zipn(o_, s_, i_, d_, ks).map(([o, s, i, d, k]) => 1 + Number(o * s > i - d * (k - 1)));
 	const kidf = zipn(ks, i_, d_, f_);
@@ -327,6 +347,8 @@ let Primitive = /* @__PURE__ */ function(Primitive$1) {
 	Primitive$1["Atan"] = "atan";
 	Primitive$1["Exp"] = "exp";
 	Primitive$1["Log"] = "log";
+	Primitive$1["Erf"] = "erf";
+	Primitive$1["Erfc"] = "erfc";
 	Primitive$1["Sqrt"] = "sqrt";
 	Primitive$1["Min"] = "min";
 	Primitive$1["Max"] = "max";
@@ -404,6 +426,12 @@ function exp$1(x) {
 function log$1(x) {
 	return bind1(Primitive.Log, [x]);
 }
+function erf$1(x) {
+	return bind1(Primitive.Erf, [x]);
+}
+function erfc$1(x) {
+	return bind1(Primitive.Erfc, [x]);
+}
 function sqrt$1(x) {
 	return bind1(Primitive.Sqrt, [x]);
 }
@@ -1146,12 +1174,18 @@ function reshapeViews(exp$2, mapping, reduceAxis = false) {
 		} else if (exp$3.op === AluOp.GlobalIndex) throw new Error("internal: reshapeViews() called with GlobalIndex op");
 	});
 }
-function broadcastedJit(fn) {
+function broadcastedJit(fn, opts) {
 	return (nargs, exps, avals, params) => {
-		const newShape = avals.map((aval) => aval.shape).reduce(generalBroadcast);
-		exps = exps.map((exp$3) => reshapeViews(exp$3, (st) => {
-			if (!deepEqual(st.shape, newShape)) return st.broadcast(newShape, range(newShape.length - st.shape.length));
-		}));
+		let { shape: newShape, dtype: newDtype } = avals.reduce(promoteAvals);
+		const skipCastIdx = opts?.skipCastIdx ?? [];
+		if (skipCastIdx.length) newDtype = avals.filter((_, i) => !skipCastIdx.includes(i)).reduce(promoteAvals).dtype;
+		exps = exps.map((exp$3, i) => {
+			exp$3 = reshapeViews(exp$3, (st) => {
+				if (!deepEqual(st.shape, newShape)) return st.broadcast(newShape, range(newShape.length - st.shape.length));
+			});
+			if (exp$3.dtype !== newDtype && !skipCastIdx.includes(i)) exp$3 = AluExp.cast(newDtype, exp$3);
+			return exp$3;
+		});
 		const exp$2 = fn(exps, params);
 		return new Kernel(nargs, prod(newShape), exp$2);
 	};
@@ -1194,6 +1228,8 @@ const jitRules = {
 	[Primitive.Atan]: unopJit(AluExp.atan),
 	[Primitive.Exp]: unopJit(AluExp.exp),
 	[Primitive.Log]: unopJit(AluExp.log),
+	[Primitive.Erf]: unopJit(AluExp.erf),
+	[Primitive.Erfc]: unopJit(AluExp.erfc),
 	[Primitive.Sqrt]: unopJit(AluExp.sqrt),
 	[Primitive.Min]: broadcastedJit(([a, b]) => AluExp.min(a, b)),
 	[Primitive.Max]: broadcastedJit(([a, b]) => AluExp.max(a, b)),
@@ -1241,7 +1277,7 @@ const jitRules = {
 		return jitRules[Primitive.Dot](nargs, [a, b], [as, bs], {});
 	},
 	[Primitive.Compare]: broadcastedJit(([a, b], { op }) => aluCompare(a, b, op)),
-	[Primitive.Where]: broadcastedJit(([cond, a, b]) => AluExp.where(cond, a, b)),
+	[Primitive.Where]: broadcastedJit(([cond, a, b]) => AluExp.where(cond, a, b), { skipCastIdx: [0] }),
 	[Primitive.Transpose]: reshapeJit((st, { perm }) => st.permute(perm)),
 	[Primitive.Broadcast]: reshapeJit((st, { shape: shape$1, axis }) => st.broadcast(shape$1, axis)),
 	[Primitive.Reshape]: reshapeJit((st, { shape: shape$1 }) => st.reshape(shape$1)),
@@ -1412,7 +1448,7 @@ var PendingExecute = class {
 /**
 * A multidimensional numeric array with data stored on CPU or GPU.
 *
-* This is the library's core data type. Equivalent to `jnp.Array` from JAX, or
+* This is the library's core data type. Equivalent to `jax.Array` from JAX, or
 * `torch.Tensor`.
 *
 * Not to be confused with the JavaScript "Array" constructor. Avoid importing
@@ -1427,6 +1463,7 @@ var Array$1 = class Array$1 extends Tracer {
 	#source;
 	#st;
 	#backend;
+	#committed;
 	#rc;
 	#pendingSet;
 	/**
@@ -1443,6 +1480,7 @@ var Array$1 = class Array$1 extends Tracer {
 		this.#source = args.source;
 		this.#st = args.st;
 		this.#backend = args.backend;
+		this.#committed = args.committed;
 		this.#rc = 1;
 		this.#pendingSet = new Set(args.pending);
 		if (this.#pendingSet.size === 0) this.#pendingSet = null;
@@ -1470,6 +1508,7 @@ var Array$1 = class Array$1 extends Tracer {
 			dtype: args.dtype ?? this.#dtype,
 			weakType: this.#weakType,
 			backend: args.backend ?? this.#backend,
+			committed: args.committed ?? this.#committed,
 			pending: args.pending ?? this.#pending ?? void 0
 		});
 	}
@@ -1525,9 +1564,10 @@ var Array$1 = class Array$1 extends Tracer {
 	*/
 	#gather(indices, axis, outDim) {
 		this.#check();
-		if (indices.some((a) => a.#backend !== this.#backend)) throw new TypeError(`Gather indices must have the same backend: ${this.#backend.type}`);
 		const axisSet = new Set(axis);
 		if (axisSet.size !== axis.length) throw new TypeError("Gather axis must not have duplicates");
+		if (indices.some((a) => a.#committed && a.#backend !== this.#backend)) throw new TypeError(`Gather indices must have the same backend: ${this.#backend.type}`);
+		indices = indices.map((ar) => ar._putSync(this.#backend));
 		indices = Array$1.#broadcastArrays(indices);
 		const indexShape = indices[0].shape;
 		const finalShape = this.shape.filter((_, i) => !axisSet.has(i));
@@ -1596,6 +1636,7 @@ var Array$1 = class Array$1 extends Tracer {
 		this.#check();
 		if (this.#source instanceof AluExp) {
 			const exp$3 = new AluExp(op, dtypeOutput, [this.#source]);
+			this.dispose();
 			return this.#newArrayFrom({
 				source: exp$3.simplify(),
 				dtype: dtypeOutput,
@@ -1624,21 +1665,19 @@ var Array$1 = class Array$1 extends Tracer {
 	}
 	static #naryCustom(name, custom, arrays, { dtypeOverride, strongTypeOutput, reduceAxis } = {}) {
 		const n = arrays.length;
-		const backend = arrays[0].#backend;
 		if (n === 0) throw new TypeError(`No inputs for ${name}`);
 		for (const ar of arrays) ar.#check();
 		let castDtype;
 		let castWeakType = true;
-		for (let i = 0; i < n; i++) {
-			if (dtypeOverride?.[i]) {
-				if (arrays[i].#dtype !== dtypeOverride[i]) throw new TypeError(`Wrong dtype in ${name}: expected ${dtypeOverride[i]}, got ${arrays[i].#dtype}`);
-			} else if (castDtype === void 0) {
-				castDtype = arrays[i].#dtype;
-				castWeakType = arrays[i].#weakType;
-			} else ({dtype: castDtype, weakType: castWeakType} = promoteAvals(new ShapedArray([], castDtype, castWeakType), new ShapedArray([], arrays[i].#dtype, arrays[i].#weakType)));
-			if (arrays[i].#backend !== backend) throw new TypeError(`Backend mismatch in ${name}: ${backend.type} vs ${arrays[i].#backend.type}`);
-		}
+		for (let i = 0; i < n; i++) if (dtypeOverride?.[i]) {
+			if (arrays[i].#dtype !== dtypeOverride[i]) throw new TypeError(`Wrong dtype in ${name}: expected ${dtypeOverride[i]}, got ${arrays[i].#dtype}`);
+		} else if (castDtype === void 0) {
+			castDtype = arrays[i].#dtype;
+			castWeakType = arrays[i].#weakType;
+		} else ({dtype: castDtype, weakType: castWeakType} = promoteAvals(new ShapedArray([], castDtype, castWeakType), new ShapedArray([], arrays[i].#dtype, arrays[i].#weakType)));
 		const weakType = castWeakType && !strongTypeOutput;
+		const { backend, committed } = Array$1.#computeBackend(name, arrays);
+		arrays = arrays.map((ar) => ar._putSync(backend));
 		arrays = Array$1.#broadcastArrays(arrays);
 		const newShape = [...arrays[0].shape];
 		if (arrays.every((ar) => ar.#source instanceof AluExp) && !reduceAxis) {
@@ -1648,12 +1687,14 @@ var Array$1 = class Array$1 extends Tracer {
 			});
 			if (arrays.every((ar) => deepEqual(ar.#st, arrays[0].#st))) {
 				const exp$4 = custom(sources);
+				arrays.forEach((ar) => ar.dispose());
 				return new Array$1({
 					source: exp$4.simplify(),
 					st: arrays[0].#st,
 					dtype: exp$4.dtype,
 					weakType,
-					backend
+					backend,
+					committed
 				});
 			}
 			const exp$3 = custom(arrays.map((ar, i) => {
@@ -1662,12 +1703,14 @@ var Array$1 = class Array$1 extends Tracer {
 				return accessorAluExp(src$1, ar.#st, unravelAlu(newShape, AluVar.idx));
 			}));
 			const st = ShapeTracker.fromShape(newShape);
+			arrays.forEach((ar) => ar.dispose());
 			return new Array$1({
 				source: exp$3.simplify(),
 				st,
 				dtype: exp$3.dtype,
 				weakType,
-				backend
+				backend,
+				committed
 			});
 		}
 		let indices;
@@ -1703,13 +1746,14 @@ var Array$1 = class Array$1 extends Tracer {
 		const pending = new Set([...arrays.flatMap((ar) => ar.#pending)]);
 		for (const exe of pending) exe.updateRc(1);
 		pending.add(new PendingExecute(backend, kernel, inputs, [output]));
-		for (const ar of arrays) ar.dispose();
+		arrays.forEach((ar) => ar.dispose());
 		return new Array$1({
 			source: output,
 			st: ShapeTracker.fromShape(newShape),
 			dtype: kernel.dtype,
 			weakType,
 			backend,
+			committed,
 			pending
 		});
 	}
@@ -1787,6 +1831,23 @@ var Array$1 = class Array$1 extends Tracer {
 			return ar.#reshape(ar.#st.broadcast(newShape, range(newShape.length - ar.ndim)));
 		});
 	}
+	static #computeBackend(name, arrays) {
+		const committed = arrays.filter((ar) => ar.#committed);
+		if (committed.length > 0) {
+			const backend = committed[0].#backend;
+			for (const ar of committed) if (ar.#backend !== backend) throw new Error(`Device mismatch in ${name} between committed arrays on (${backend.type}, ${ar.#backend.type}), please move to the same device with devicePut()`);
+			return {
+				backend,
+				committed: true
+			};
+		} else {
+			const backend = arrays.length > 0 ? arrays[0].#backend : getBackend();
+			return {
+				backend,
+				committed: false
+			};
+		}
+	}
 	/** Realize the array and return it as data. */
 	async data() {
 		if (this.#source instanceof AluExp && this.size < inlineArrayLimit && this.device !== "cpu") return this.#dataInline();
@@ -1946,6 +2007,12 @@ var Array$1 = class Array$1 extends Tracer {
 			[Primitive.Log]([x]) {
 				return [x.#unary(AluOp.Log)];
 			},
+			[Primitive.Erf]([x]) {
+				return [x.#unary(AluOp.Erf)];
+			},
+			[Primitive.Erfc]([x]) {
+				return [x.#unary(AluOp.Erfc)];
+			},
 			[Primitive.Sqrt]([x]) {
 				return [x.#unary(AluOp.Sqrt)];
 			},
@@ -2014,7 +2081,8 @@ var Array$1 = class Array$1 extends Tracer {
 			},
 			[Primitive.JitCall](args, { jaxpr, numConsts }) {
 				if (jaxpr.inBinders.length !== args.length) throw new Error(`jit_call expects ${jaxpr.inBinders.length} args, got ${args.length}`);
-				const backend = getBackend();
+				const { backend, committed } = Array$1.#computeBackend("jit_call", args);
+				args = args.map((ar) => ar._putSync(backend));
 				const consts = args.slice(0, numConsts);
 				const tracers = args.slice(numConsts);
 				const jp = jitCompile(backend, jaxpr, consts);
@@ -2031,16 +2099,54 @@ var Array$1 = class Array$1 extends Tracer {
 						dtype: jaxpr.outs[i].aval.dtype,
 						weakType: jaxpr.outs[i].aval.weakType,
 						backend,
+						committed,
 						pending
 					});
 				});
 			}
 		};
 	}
+	/** @private */
 	_realizeSource() {
 		this.#realize();
 		return this.#source;
 	}
+	/** @private Put this array on a new backend, asynchronously. */
+	async _put(backend) {
+		if (this.#backend === backend) return this;
+		if (this.#source instanceof AluExp) {
+			const ar = this.#newArrayFrom({
+				backend,
+				committed: true
+			});
+			this.dispose();
+			return ar;
+		} else {
+			const data = await this.data();
+			return arrayFromData(data, this.shape, {
+				dtype: this.#dtype,
+				device: backend.type
+			}, this.#weakType);
+		}
+	}
+	/** @private Put this array on a new backend, synchronously. */
+	_putSync(backend) {
+		if (this.#backend === backend) return this;
+		if (this.#source instanceof AluExp) {
+			const ar = this.#newArrayFrom({
+				backend,
+				committed: true
+			});
+			this.dispose();
+			return ar;
+		} else {
+			const data = this.dataSync();
+			return arrayFromData(data, this.shape, {
+				dtype: this.#dtype,
+				device: backend.type
+			}, this.#weakType);
+		}
+	}
 };
 /** Constructor for creating a new array from data. */
 function array(values, { shape: shape$1, dtype, device } = {}) {
@@ -2123,7 +2229,8 @@ function arrayFromData(data, shape$1, { dtype, device }, weakType = false) {
 		st: ShapeTracker.fromShape(shape$1),
 		dtype,
 		weakType,
-		backend
+		backend,
+		committed: device != void 0
 	});
 }
 function dataToJs(dtype, data, shape$1) {
@@ -2157,7 +2264,8 @@ function fullInternal(aval, fillValue, device) {
 		st: ShapeTracker.fromShape(aval.shape),
 		dtype: aval.dtype,
 		weakType: aval.weakType,
-		backend: getBackend(device)
+		backend: getBackend(device),
+		committed: device != void 0
 	});
 }
 function zerosLike$1(val, dtype) {
@@ -2225,7 +2333,8 @@ function eye(numRows, numCols, { dtype, device } = {}) {
 		st: ShapeTracker.fromShape([numRows, numCols]),
 		dtype,
 		weakType,
-		backend: getBackend(device)
+		backend: getBackend(device),
+		committed: device != void 0
 	});
 }
 /** Return the identity matrix, with ones on the main diagonal. */
@@ -2268,7 +2377,8 @@ function arange(start, stop, step = 1, { dtype, device } = {}) {
 		st,
 		dtype,
 		weakType: false,
-		backend: getBackend(device)
+		backend: getBackend(device),
+		committed: device != void 0
 	});
 }
 /**
@@ -2304,7 +2414,8 @@ function linspace(start, stop, num = 50, endpoint = true, { dtype, device } = {}
 		st,
 		dtype,
 		weakType: false,
-		backend: getBackend(device)
+		backend: getBackend(device),
+		committed: device != void 0
 	});
 }
 function aluCompare(a, b, op) {
@@ -2812,6 +2923,8 @@ const abstractEvalRules = {
 	[Primitive.Atan]: vectorizedUnopAbstractEval,
 	[Primitive.Exp]: vectorizedUnopAbstractEval,
 	[Primitive.Log]: vectorizedUnopAbstractEval,
+	[Primitive.Erf]: vectorizedUnopAbstractEval,
+	[Primitive.Erfc]: vectorizedUnopAbstractEval,
 	[Primitive.Sqrt]: vectorizedUnopAbstractEval,
 	[Primitive.Min]: binopAbstractEval,
 	[Primitive.Max]: binopAbstractEval,
@@ -3064,6 +3177,16 @@ const jvpRules = {
 	[Primitive.Log]([x], [dx]) {
 		return [[log$1(x.ref)], [reciprocal$1(x).mul(dx)]];
 	},
+	[Primitive.Erf]([x], [dx]) {
+		const coeff = 2 / Math.sqrt(Math.PI);
+		const expTerm = exp$1(neg(x.ref.mul(x.ref)));
+		return [[erf$1(x)], [expTerm.mul(coeff).mul(dx)]];
+	},
+	[Primitive.Erfc]([x], [dx]) {
+		const coeff = -2 / Math.sqrt(Math.PI);
+		const expTerm = exp$1(neg(x.ref.mul(x.ref)));
+		return [[erfc$1(x)], [expTerm.mul(coeff).mul(dx)]];
+	},
 	[Primitive.Sqrt]([x], [dx]) {
 		const z = sqrt$1(x);
 		return [[z.ref], [reciprocal$1(z.mul(2)).mul(dx)]];
@@ -3225,6 +3348,10 @@ var BatchTrace = class extends Trace {
 		const [valsIn, bdimsIn] = unzip2(tracers.map((t) => [t.val, t.batchDim]));
 		const vmapRule = vmapRules[primitive];
 		if (vmapRule === void 0) throw new Error(`No vmap rule for: ${primitive}`);
+		if (bdimsIn.every((d) => d === null)) {
+			const valOuts$1 = bind(primitive, valsIn, params);
+			return valOuts$1.map((x) => new BatchTracer(this, x, null));
+		}
 		const [valOuts, bdimOuts] = vmapRule(this.axisSize, valsIn, bdimsIn, params);
 		return zip(valOuts, bdimOuts).map(([x, bd]) => new BatchTracer(this, x, bd));
 	}
@@ -3232,24 +3359,28 @@ var BatchTrace = class extends Trace {
 		return this.main.globalData;
 	}
 };
-function handleScalarBroadcasting(nd, x, d) {
-	if (d === null || nd === ndim$1(x)) return x;
-	else {
-		const axis = range(ndim$1(x), nd);
-		const shape$1 = [...x.shape, ...axis.map(() => 1)];
-		return broadcast(x, shape$1, axis);
-	}
-}
-/** Process a primitive with built-in broadcasting. */
+/**
+* Process a primitive with built-in broadcasting.
+*
+* Reference: https://github.com/jax-ml/jax/blob/jax-v0.8.1/jax/_src/interpreters/batching.py#L1029
+*/
 function broadcastBatcher(op) {
 	return (axisSize, args, dims) => {
 		if (args.length === 0) throw new Error("Empty list in broadcastBatcher");
-		const idx = dims.findIndex((d) => d !== null);
-		if (idx === -1) return [[op(...args)], [null]];
-		if (zip(args, dims).every(([x, d]) => ndim$1(x) === 0 || deepEqual(x.shape, args[idx].shape) && d === dims[idx])) return [[op(...args)], [dims[idx]]];
-		args = args.map((x, i) => ndim$1(x) > 0 ? moveBatchAxis(axisSize, dims[i], 0, x) : x);
-		const nd = Math.max(...args.map(ndim$1));
-		args = args.map((x, i) => handleScalarBroadcasting(nd, x, dims[i]));
+		const nd = Math.max(...args.map((x, i) => ndim$1(x) + (dims[i] === null ? 1 : 0)));
+		const firstIdx = dims.findIndex((d) => d !== null);
+		const firstBdim = dims[firstIdx] - args[firstIdx].ndim;
+		if (zip(args, dims).every(([x, d]) => d === null && ndim$1(x) < -firstBdim || d !== null && d - x.ndim === firstBdim)) return [[op(...args)], [nd + firstBdim]];
+		args = args.map((x, i) => {
+			if (dims[i] === null) return x;
+			x = moveBatchAxis(axisSize, dims[i], 0, x);
+			if (x.ndim < nd) x = x.reshape([
+				x.shape[0],
+				...rep(nd - x.ndim, 1),
+				...x.shape.slice(1)
+			]);
+			return x;
+		});
 		return [[op(...args)], [0]];
 	};
 }
@@ -3273,17 +3404,18 @@ const vmapRules = {
 	[Primitive.Atan]: unopBatcher(atan$1),
 	[Primitive.Exp]: unopBatcher(exp$1),
 	[Primitive.Log]: unopBatcher(log$1),
+	[Primitive.Erf]: unopBatcher(erf$1),
+	[Primitive.Erfc]: unopBatcher(erfc$1),
 	[Primitive.Sqrt]: unopBatcher(sqrt$1),
 	[Primitive.Min]: broadcastBatcher(min$1),
 	[Primitive.Max]: broadcastBatcher(max$1),
 	[Primitive.Reduce](axisSize, [x], [xBdim], { op, axis }) {
-		if (xBdim === null) return [[reduce(x, op, axis)], [null]];
+		assertNonNull(xBdim);
 		const newAxis = axis.map((ax) => ax + (xBdim <= ax ? 1 : 0));
 		const outBdim = xBdim - axis.filter((ax) => ax < xBdim).length;
 		return [[reduce(x, op, newAxis)], [outBdim]];
 	},
 	[Primitive.Dot](axisSize, [x, y], [xBdim, yBdim]) {
-		if (xBdim === null && yBdim === null) return [[dot$1(x, y)], [null]];
 		x = moveBatchAxis(axisSize, xBdim, x.ndim - (xBdim === null ? 1 : 2), x);
 		y = moveBatchAxis(axisSize, yBdim, y.ndim - (yBdim === null ? 1 : 2), y);
 		const z = dot$1(x, y);
@@ -3292,26 +3424,68 @@ const vmapRules = {
 	[Primitive.Compare](axisSize, args, dims, { op }) {
 		return broadcastBatcher((x, y) => compare(x, y, op))(axisSize, args, dims, {});
 	},
+	[Primitive.Where]: broadcastBatcher(where$1),
+	[Primitive.Transpose](axisSize, [x], [xBdim], { perm }) {
+		assertNonNull(xBdim);
+		const newPerm = perm.map((p) => p + (xBdim <= p ? 1 : 0));
+		newPerm.splice(xBdim, 0, xBdim);
+		return [[transpose$1(x, newPerm)], [xBdim]];
+	},
+	[Primitive.Broadcast](axisSize, [x], [xBdim], { shape: shape$1, axis }) {
+		assertNonNull(xBdim);
+		const newShape = shape$1.toSpliced(xBdim, 0, axisSize);
+		const newAxis = axis.map((ax) => ax + (xBdim <= ax ? 1 : 0));
+		return [[broadcast(x, newShape, newAxis)], [xBdim]];
+	},
 	[Primitive.Reshape](axisSize, [x], [xBdim], { shape: shape$1 }) {
-		if (xBdim === null) return [[reshape$1(x, shape$1)], [null]];
 		x = moveBatchAxis(axisSize, xBdim, 0, x);
 		return [[reshape$1(x, [axisSize, ...shape$1])], [0]];
 	},
 	[Primitive.Flip](axisSize, [x], [xBdim], { axis }) {
-		if (xBdim === null) return [[flip$1(x, axis)], [null]];
+		assertNonNull(xBdim);
 		const newAxis = axis.map((ax) => ax + (xBdim <= ax ? 1 : 0));
 		return [[flip$1(x, newAxis)], [xBdim]];
 	},
 	[Primitive.Shrink](axisSize, [x], [xBdim], { slice }) {
-		if (xBdim === null) return [[shrink(x, slice)], [null]];
+		assertNonNull(xBdim);
 		const newSlice = slice.toSpliced(xBdim, 0, [0, axisSize]);
 		return [[shrink(x, newSlice)], [xBdim]];
 	},
 	[Primitive.Pad](axisSize, [x], [xBdim], { width }) {
-		if (xBdim === null) return [[pad$1(x, width)], [null]];
+		assertNonNull(xBdim);
 		const newWidth = width.toSpliced(xBdim, 0, [0, 0]);
 		return [[pad$1(x, newWidth)], [xBdim]];
 	},
+	[Primitive.Gather](axisSize, [x, ...indices], [xBdim, ...indicesBdim], { axis, outDim }) {
+		if (indicesBdim.every((d) => d === null)) {
+			assertNonNull(xBdim);
+			const newAxis = axis.map((ax) => ax + (xBdim <= ax ? 1 : 0));
+			let newBdim = xBdim - axis.filter((ax) => ax < xBdim).length;
+			let newOutDim = outDim;
+			if (newOutDim < newBdim) newBdim += axis.length;
+			else newOutDim += 1;
+			return [[gather(x, indices, newAxis, newOutDim)], [newBdim]];
+		}
+		const nd = Math.max(...indices.map((m, i) => ndim$1(m) + (indicesBdim[i] === null ? 1 : 0)));
+		indices = indices.map((m, i) => {
+			if (indicesBdim[i] === null) return m;
+			m = moveBatchAxis(axisSize, indicesBdim[i], 0, m);
+			if (m.ndim < nd) m = m.reshape([
+				m.shape[0],
+				...rep(nd - m.ndim, 1),
+				...m.shape.slice(1)
+			]);
+			return m;
+		});
+		if (xBdim === null) return [[gather(x, indices, axis, outDim)], [outDim]];
+		else {
+			x = moveBatchAxis(axisSize, xBdim, 0, x);
+			const newAxis = [0, ...axis.map((ax) => ax + 1)];
+			const extraBatchIndex = arange(axisSize).reshape([-1, ...rep(nd - 1, 1)]);
+			indices.splice(0, 0, extraBatchIndex);
+			return [[gather(x, indices, newAxis, outDim)], [outDim]];
+		}
+	},
 	[Primitive.JitCall](axisSize, args, dims, { name, jaxpr }) {
 		const { newJaxpr, newConsts } = vmapJaxpr(jaxpr, axisSize, dims);
 		const outs = bind(Primitive.JitCall, [...newConsts.map((c) => c.ref), ...args], {
@@ -3371,12 +3545,14 @@ function vmapFlat(f, inAxes, args) {
 function vmap$1(f, inAxes = 0) {
 	return (...args) => {
 		const [argsFlat, inTree] = flatten(args);
-		let inAxesFlat;
+		let inAxesFlat = [];
 		if (typeof inAxes === "number") inAxesFlat = rep(argsFlat.length, inAxes);
+		else for (let i = 0; i < args.length; i++) if (inAxes[i] == null) inAxesFlat.push(...rep(inTree.childTreedefs[i].size, null));
+		else if (typeof inAxes[i] === "number") inAxesFlat.push(...rep(inTree.childTreedefs[i].size, inAxes[i]));
 		else {
-			let inTree2;
-			[inAxesFlat, inTree2] = flatten(inAxes);
-			if (!inTree.equals(inTree2)) throw new TreeMismatchError("vmap", inTree, inTree2);
+			const [axesFlat, axesTreeDef] = flatten(inAxes[i]);
+			if (!inTree.childTreedefs[i].equals(axesTreeDef)) throw new TreeMismatchError("vmap", inTree.childTreedefs[i], axesTreeDef);
+			inAxesFlat.push(...axesFlat);
 		}
 		const [fFlat, outTree] = flattenFun(f, inTree);
 		const outsFlat = vmapFlat(fFlat, inAxesFlat, argsFlat);
@@ -3996,7 +4172,7 @@ function valueAndGrad$1(f) {
 		const [y, fVjp] = vjp$1(f, x[0], ...x.slice(1).map(stopGradient));
 		if (!(y instanceof Tracer) || ndim$1(y) !== 0) throw new TypeError("grad requires a scalar output");
 		if (!isFloatDtype(y.dtype)) throw new TypeError("grad only supports floating-point dtypes");
-		const [ct, ...rest] = fVjp(array(1, { dtype: y.dtype }));
+		const [ct, ...rest] = fVjp(onesLike$1(y.ref));
 		for (const r of rest) dispose(r);
 		fVjp.dispose();
 		return [y, ct];
@@ -4024,7 +4200,10 @@ __export(lax_exports, {
 	conv: () => conv$1,
 	convGeneralDilated: () => convGeneralDilated,
 	convWithGeneralPadding: () => convWithGeneralPadding,
-	reduceWindow: () => reduceWindow
+	erf: () => erf,
+	erfc: () => erfc,
+	reduceWindow: () => reduceWindow,
+	stopGradient: () => stopGradient$1
 });
 function padtypeToPads(inShape, filterShape, strides, dilation, padding) {
 	const padType = padding.toUpperCase();
@@ -4083,6 +4262,28 @@ function reduceWindow(operand, computation, windowDimensions, windowStrides) {
 		strides: windowStrides
 	}));
 }
+/** The error function: `erf(x) = 2/sqrt(pi) * int[0..x] exp(-t^2) dt`. */
+function erf(x) {
+	return erf$1(x);
+}
+/**
+* The complementary error function: `erfc(x) = 1 - erf(x)`.
+*
+* This function is more accurate than `1 - erf(x)` for large values of `x`,
+* where `erf(x)` is very close to 1.
+*/
+function erfc(x) {
+	return erfc$1(x);
+}
+/**
+* Stops gradient computation.
+*
+* Behaves as the identity function but prevents the flow of gradients during
+* forward or reverse-mode automatic differentiation.
+*/
+function stopGradient$1(x) {
+	return stopGradient(x);
+}
 //#endregion
 //#region src/numpy.ts
@@ -4145,6 +4346,9 @@ __export(numpy_exports, {
 	fullLike: () => fullLike$1,
 	greater: () => greater,
 	greaterEqual: () => greaterEqual,
+	hamming: () => hamming,
+	hann: () => hann,
+	heaviside: () => heaviside,
 	hstack: () => hstack,
 	hypot: () => hypot,
 	identity: () => identity$1,
@@ -4784,6 +4988,32 @@ function sign(x) {
 	x = fudgeArray(x);
 	return where(notEqual(x.ref, 0), where(less(x.ref, 0), -1, 1), 0);
 }
+/**
+* Return the Hamming window of size M, a taper with a weighted cosine bell.
+*
+* `w(n) = 0.54 - 0.46 * cos(2πn/(M-1))` for `0 <= n <= M-1`.
+*/
+function hamming(M) {
+	return cos(linspace(0, 2 * Math.PI, M)).mul(-.46).add(.54);
+}
+/**
+* Return the Hann window of size M, a taper with a weighted cosine bell.
+*
+* `w(n) = 0.5 - 0.5 * cos(2πn/(M-1))` for `0 <= n <= M-1`.
+*/
+function hann(M) {
+	return cos(linspace(0, 2 * Math.PI, M)).mul(-.5).add(.5);
+}
+/**
+* @function
+* Compute the Heaviside step function. It is defined piecewise:
+* - `heaviside(x1, x2) = 0` for `x1 < 0`,
+* - `heaviside(x1, x2) = x2` for `x1 == 0`,
+* - `heaviside(x1, x2) = 1` for `x1 > 0`.
+*/
+const heaviside = jit$1(function heaviside$1(x1, x2) {
+	return where(less(x1.ref, 0), 0, where(equal(x1, 0), x2, 1));
+});
 /** Calculate element-wise square of the input array. */
 function square(x) {
 	x = fudgeArray(x);
@@ -4803,8 +5033,8 @@ function acos(x) {
 * Return element-wise hypotenuse for the given legs of a right triangle.
 *
 * In the original NumPy/JAX implementation, this function is more numerically
-* stable than sqrt(x1**2 + x2**2). We don't currently implement those stability
-* improvements.
+* stable than `sqrt(x1**2 + x2**2)`. We don't currently implement those
+* stability improvements.
 */
 const hypot = jit$1(function hypot$1(x1, x2) {
 	return sqrt(square(x1).add(square(x2)));
@@ -5128,18 +5358,20 @@ function celu(x, alpha = 1) {
 * @function
* Gaussian error linear unit (GELU) activation function.
 *
-* This is computed element-wise. Currently jax-js does not support the erf() or
-* gelu() functions exactly as primitives, so an approximation is used:
-* `gelu(x) ~= x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`.
+* This is computed element-wise. There are two variants depending on whether
+* `approximate` is set (default true):
 *
-* Reference: https://ml-explore.github.io/mlx/build/html/python/nn/_autosummary_functions/mlx.nn.gelu_approx.html
+* - Approximate: `gelu(x) ~= x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`
+* - Exact: `gelu(x) = x * 0.5 * erfc(-x / sqrt(2))`
 *
-* This will be improved in the future.
+* Reference: https://ml-explore.github.io/mlx/build/html/python/nn/_autosummary_functions/mlx.nn.gelu_approx.html
 */
-const gelu = jit$1(function gelu$1(x) {
-	const SQRT_2_OVER_PI = Math.sqrt(2 / Math.PI);
-	return x.ref.mul(.5).mul(tanh(x.ref.mul(x.ref.mul(x).mul(.044715).add(1)).mul(SQRT_2_OVER_PI)).add(1));
-});
+const gelu = jit$1(function gelu$1(x, opts) {
+	if (opts?.approximate ?? true) {
+		const SQRT_2_OVER_PI = Math.sqrt(2 / Math.PI);
+		return x.ref.mul(.5).mul(tanh(x.ref.mul(x.ref.mul(x).mul(.044715).add(1)).mul(SQRT_2_OVER_PI)).add(1));
+	} else return x.ref.mul(.5).mul(erfc$1(negative(x.ref.mul(Math.SQRT1_2))));
+}, { staticArgnums: [1] });
 /**
 * Gated linear unit (GLU) activation function.
 *
@@ -5360,6 +5592,25 @@ const normal = jit$1(function normal$1(key$1, shape$1 = []) {
 	return radius.mul(cos(theta));
 }, { staticArgnums: [1] });
+//#endregion
+//#region src/scipy-special.ts
+var scipy_special_exports = {};
+__export(scipy_special_exports, {
+	erf: () => erf,
+	erfc: () => erfc,
+	logSoftmax: () => logSoftmax,
+	logit: () => logit,
+	logsumexp: () => logsumexp,
+	softmax: () => softmax
+});
+/**
+* @function
+* The logit function, `logit(p) = log(p / (1-p))`.
+*/
+const logit = jit$1(function logit$1(x) {
+	return log(x.ref.div(subtract(1, x)));
+});
 //#endregion
 //#region src/polyfills.ts
 /** @file Polyfills for using this library. */
@@ -5453,6 +5704,25 @@ async function blockUntilReady(x) {
 	await Promise.all(promises);
 	return x;
 }
+/**
+* Transfer `x` to `device`.
+*
+* `x` may be a nested container of arrays or scalars. The resulting structure
+* is committed to the device.
+*
+* If `device` is not specified, this function behaves as identity if the input
+* is already an `Array`, otherwise it places the scalar uncommitted on the
+* default device.
+*/
+async function devicePut(x, device) {
+	const [xflat, structure$1] = flatten(x);
+	const yflat = await Promise.all(xflat.map((leaf) => {
+		if (leaf instanceof Array$1) return device ? leaf._put(getBackend(device)) : Promise.resolve(leaf);
+		else return Promise.resolve(array(leaf, { device }));
+	}));
+	return unflatten(structure$1, yflat);
+}
 //#endregion
-export { Array$1 as Array, DType, Jaxpr, blockUntilReady, defaultDevice, devices, grad, init, jacfwd, jacobian, jacrev, jit, jvp, lax_exports as lax, linearize, makeJaxpr, nn_exports as nn, numpy_exports as numpy, random_exports as random, setDebug, tree_exports as tree, valueAndGrad, vjp, vmap };
+export { Array$1 as Array, DType, Jaxpr, blockUntilReady, defaultDevice, devicePut, devices, grad, init, jacfwd, jacobian, jacrev, jit, jvp, lax_exports as lax, linearize, makeJaxpr, nn_exports as nn, numpy_exports as numpy, random_exports as random, scipy_special_exports as scipySpecial, setDebug, tree_exports as tree, valueAndGrad, vjp, vmap };
+//# sourceMappingURL=index.js.map