npm - @jax-js/jax - Versions diffs - 0.1.2 → 0.1.3 - Mend

@jax-js/jax 0.1.2 → 0.1.3

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/README.md +11 -32
package/dist/{backend-BqymqzuU.js → backend-BY8wlLEl.js} +58 -20
package/dist/{backend-DeVfWEFS.cjs → backend-CmaidnkQ.cjs} +58 -20
package/dist/index.cjs +298 -134
package/dist/index.d.cts +21 -5
package/dist/index.d.ts +21 -5
package/dist/index.js +298 -134
package/dist/{webgpu-CcGP160M.cjs → webgpu-BVns4DbI.cjs} +14 -6
package/dist/{webgpu-BGuG58KZ.js → webgpu-C9iAP5h5.js} +14 -6
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -257,36 +257,12 @@ await devicePut(ar, "webgpu"); // Now device="webgpu"
 There are other libraries in the `@jax-js` namespace that can work with jax-js, or be used in a
 self-contained way in other projects.
-**`@jax-js/optax`** provides implementations of optimizers like Adam and SGD.
-```ts
-import { adam } from "@jax-js/optax";
-let params = np.array([1.0, 2.0, 3.0]);
-const solver = adam(1e-3);
-let optState = solver.init(params.ref);
-let updates: np.Array;
-const f = (x: np.Array) => squaredError(x, np.ones([3])).sum();
-for (let i = 0; i < 100; i++) {
-  const paramsGrad = grad(f)(params.ref);
-  [updates, optState] = solver.update(paramsGrad, optState);
-  params = applyUpdates(params, updates);
-}
-```
-**`@jax-js/loaders`** can load tensors from various formats like Safetensors, includes a fast and
-compliant implementation of BPE, and caches HTTP requests for large assets like model weights in
-OPFS.
-```ts
-import { tokenizers } from "@jax-js/loaders";
-const enc = await tokenizers.getBpe("clip");
-const tokens = enc.encode("Hello, world!"); // => [ 49406, 3306, 267, 1002, ... ]
-```
+- [**`@jax-js/loaders`**](packages/loaders) can load tensors from various formats like Safetensors,
+  includes a fast and compliant implementation of BPE, and caches HTTP requests for large assets
+  like model weights in OPFS.
+- [**`@jax-js/onnx`**](packages/onnx) is a model loader from the [ONNX](https://onnx.ai/) format
+  into native jax-js functions.
+- [**`@jax-js/optax`**](packages/optax) provides implementations of optimizers like Adam and SGD.
 ### Performance
@@ -311,6 +287,7 @@ If you make something cool with jax-js, don't be a stranger! We can feature it h
 - [Training neural networks on MNIST](https://jax-js.com/mnist)
 - [CLIP embeddings for books in-browser](https://jax-js.com/mobileclip)
+- [Object detection with DETR ResNet-50 (ONNX)](https://jax-js.com/detr-resnet-50)
 - [In-browser REPL](https://jax-js.com/repl)
 - [Matmul benchmark](https://jax-js.com/bench/matmul)
 - [Conv2d benchmark](https://jax-js.com/bench/conv2d)
@@ -351,7 +328,9 @@ Contributions are welcomed! Especially in:
 - Adding support for more JAX functions and operations, see [compatibility table](./FEATURES.md).
 - Improving performance of the WebGPU and Wasm runtimes, generating better kernels, and using SIMD
   and multithreading. (Even single-threaded Wasm could be ~20x faster.)
-- Helping the JIT compiler to fuse operations in more cases, like `tanh` branches and adding
-  epilogue to reductions.
+- Adding support for `jax.profiling`, in particular the start and end trace functions. We should be
+  able to generate `traceEvents` from backends (especially on GPU, with precise timestamp queries)
+  to help with model performance debugging.
+- Helping the JIT compiler to fuse operations in more cases, like `tanh` branches.
 - Adding WebGL runtime for older browsers that don't support WebGPU.
 - Making a fast transformer inference engine, comparing against onnxruntime-web.

package/dist/{backend-BqymqzuU.js → backend-BY8wlLEl.js} RENAMED Viewed

@@ -557,16 +557,16 @@ var AluExp = class AluExp {
 		});
 	}
 	/** Reindex gid values in this expression as needed. */
-	reindexGids(gidMap) {
+	reindexGids(newGids) {
 		return this.rewrite((exp) => {
 			if (exp.op === AluOp.GlobalIndex) {
 				const [gid, len] = exp.arg;
-				const newGid = gidMap.get(gid);
-				if (newGid !== void 0 && newGid !== gid) return AluExp.globalIndex(exp.dtype, newGid, len, exp.src[0]);
+				const newGid = newGids[gid];
+				if (newGid !== gid) return AluExp.globalIndex(exp.dtype, newGid, len, exp.src[0]);
 			} else if (exp.op === AluOp.GlobalView) {
 				const gid = exp.arg[0];
-				const newGid = gidMap.get(gid);
-				if (newGid !== void 0 && newGid !== gid) return AluExp.globalView(exp.dtype, newGid, exp.arg[1], exp.src);
+				const newGid = newGids[gid];
+				if (newGid !== gid) return AluExp.globalView(exp.dtype, newGid, exp.arg[1], exp.src);
 			}
 		});
 	}
@@ -780,7 +780,7 @@ var AluExp = class AluExp {
 			if (op === AluOp.Sub && i === 1 && x === 0) return src[1 - i];
 			if (op === AluOp.Mul && x === 1) return src[1 - i];
 			if (op === AluOp.Mul && x === 0) return AluExp.const(this.dtype, 0);
-			if (op === AluOp.Idiv && i === 1 && x === 1) return src[1 - i];
+			if (op === AluOp.Idiv && i === 1 && x === 1 && !isFloatDtype(this.dtype)) return src[1 - i];
 			if (op === AluOp.Cmpne && src[i].dtype === DType.Bool && x === 0) return src[1 - i];
 		}
 		if ((op === AluOp.Add || op === AluOp.Sub) && src[1].op === AluOp.Mul) {
@@ -2066,7 +2066,8 @@ function tuneNullopt(kernel) {
 	if (kernel.reduction) vars.ridx = AluExp.special(DType.Int32, "ridx", kernel.reduction.size);
 	return {
 		exp: kernel.exp.substitute(vars).rewriteGlobalViews().simplify(),
-		outputIdxExp: AluExp.special(DType.Int32, "gidx", kernel.size),
+		epilogue: kernel.reduction?.epilogue.substitute({ gidx: vars.gidx }).rewriteGlobalViews().simplify(),
+		outputIdxExp: vars.gidx,
 		threadCount: kernel.size,
 		size: { reduce: kernel.reduction ? kernel.reduction.size : 0 }
 	};
@@ -2099,7 +2100,11 @@ function tuneWebgpu(kernel) {
 	while (prod(dim.st.shape.slice(0, dim.groups)) >= 1024) {
 		const choices = [];
 		const composedSts = sts.map((st) => st.compose(dim.st));
-		for (let axis = 0; axis < dim.groups; axis++) for (const amount of [3, 4]) if (!upcastedAxis.has(axis) && dim.st.shape[axis] % amount === 0 && composedSts.some((st) => st.lastStrides[axis] === 0 && st.lastStrides.slice(dim.unroll).every((stride) => stride > 0))) {
+		for (let axis = 0; axis < dim.groups; axis++) for (const amount of [
+			3,
+			4,
+			5
+		]) if (!upcastedAxis.has(axis) && dim.st.shape[axis] % amount === 0 && composedSts.some((st) => st.lastStrides[axis] === 0 && st.lastStrides.slice(dim.unroll).every((stride) => stride > 0))) {
 			let nonzeroStrides = 0;
 			let totalStrides = 0;
 			for (const st of composedSts) {
@@ -2175,7 +2180,15 @@ function tuneWebgpu(kernel) {
 	});
 	const outputGidx = dim.outputSt.shape.slice(0, dim.groups);
 	const outputUpcast = dim.outputSt.shape.slice(dim.groups);
-	const [outputIdxExp, _] = dim.outputSt.toAluExp([...unravelAlu(outputGidx, AluExp.special(DType.Int32, "gidx", prod(outputGidx))), ...unravelAlu(outputUpcast, AluVar.upcast)]);
+	const outputIndices = [...unravelAlu(outputGidx, AluExp.special(DType.Int32, "gidx", prod(outputGidx))), ...unravelAlu(outputUpcast, AluVar.upcast)];
+	const [outputIdxExp, _] = dim.outputSt.toAluExp(outputIndices);
+	const newEpilogue = reduction.epilogue.rewrite((exp$1) => {
+		if (exp$1.op === AluOp.GlobalView) {
+			const gid = exp$1.arg[0];
+			const st = exp$1.arg[1];
+			return accessorGlobal(exp$1.dtype, gid, st.compose(dim.outputSt), outputIndices);
+		}
+	});
 	if (prod(dim.st.shape.slice(dim.groups, dim.upcast)) !== reduction.size) throw new Error(`Invariant violation: reduction size ${reduction.size} does not match tuned dims ${JSON.stringify(dim.st.shape.slice(dim.groups, dim.upcast))}`);
 	const size = {
 		groups: prod(dim.st.shape.slice(dim.groups, dim.reduce)),
@@ -2185,6 +2198,7 @@ function tuneWebgpu(kernel) {
 	};
 	return {
 		exp: newExp.simplify(),
+		epilogue: newEpilogue.simplify(),
 		outputIdxExp: outputIdxExp.simplify(),
 		threadCount: kernel.size / size.upcast * size.groups,
 		size
@@ -2243,10 +2257,10 @@ var CpuBackend = class {
 		return new Executable(kernel, void 0);
 	}
 	dispatch({ kernel }, inputs, outputs) {
-		const { exp } = tuneNullopt(kernel);
+		const { exp, epilogue } = tuneNullopt(kernel);
 		const inputBuffers = inputs.map((slot) => this.#getBuffer(slot));
 		const outputBuffers = outputs.map((slot) => this.#getBuffer(slot));
-		const usedArgs = new Map(exp.collect((exp$1) => exp$1.op === AluOp.GlobalIndex).map((exp$1) => [exp$1.arg[0], exp$1.dtype]));
+		const usedArgs = new Map([...exp.collect((exp$1) => exp$1.op === AluOp.GlobalIndex), ...epilogue ? epilogue.collect((exp$1) => exp$1.op === AluOp.GlobalIndex) : []].map((exp$1) => [exp$1.arg[0], exp$1.dtype]));
 		const inputArrays = inputBuffers.map((buf, i) => {
 			const dtype = usedArgs.get(i);
 			if (!dtype) return null;
@@ -2268,7 +2282,10 @@ var CpuBackend = class {
 				}, globals);
 				acc = kernel.reduction.evaluate(acc, item);
 			}
-			outputArray[i] = kernel.reduction.epilogue.evaluate({ acc });
+			outputArray[i] = epilogue.evaluate({
+				acc,
+				gidx: i
+			}, globals);
 		}
 	}
 	#getBuffer(slot) {
@@ -2431,7 +2448,7 @@ function wasm_log(cg) {
 		const t2 = cg.local.declare(cg.f32);
 		cg.local.get(0);
 		cg.f32.const(0);
-		cg.f32.le();
+		cg.f32.lt();
 		cg.if(cg.void);
 		cg.f32.const(NaN);
 		cg.return();
@@ -2446,6 +2463,20 @@ function wasm_log(cg) {
 		cg.i32.const(127);
 		cg.i32.sub();
 		cg.local.set(e);
+		cg.local.get(e);
+		cg.i32.const(-127);
+		cg.i32.eq();
+		cg.if(cg.void);
+		cg.f32.const(-Infinity);
+		cg.return();
+		cg.end();
+		cg.local.get(e);
+		cg.i32.const(128);
+		cg.i32.eq();
+		cg.if(cg.void);
+		cg.local.get(0);
+		cg.return();
+		cg.end();
 		cg.local.get(bits);
 		cg.i32.const(8388607);
 		cg.i32.and();
@@ -2511,7 +2542,7 @@ function _sincos(cg) {
 	cg.f32.mul();
 	cg.f32.nearest();
 	cg.local.tee(qf);
-	cg.i32.trunc_f32_s();
+	cg.i32.trunc_sat_f32_s();
 	cg.local.set(q);
 	cg.local.get(y);
 	cg.local.get(qf);
@@ -3598,6 +3629,7 @@ var F32x4 = class extends V128 {
 //#endregion
 //#region src/backend/wasm.ts
+const moduleCache = /* @__PURE__ */ new Map();
 /** Backend that compiles into WebAssembly bytecode for immediate execution. */
 var WasmBackend = class {
 	type = "wasm";
@@ -3653,8 +3685,11 @@ var WasmBackend = class {
 		return this.prepareSync(kernel);
 	}
 	prepareSync(kernel) {
-		const bytes = codegenWasm(kernel);
-		const module = new WebAssembly.Module(bytes);
+		const kernelHash = FpHash.hash(kernel);
+		const module = runWithCache(moduleCache, kernelHash.toString(), () => {
+			const bytes = codegenWasm(kernel);
+			return new WebAssembly.Module(bytes);
+		});
 		return new Executable(kernel, { module });
 	}
 	dispatch(exe, inputs, outputs) {
@@ -3675,7 +3710,7 @@ function codegenWasm(kernel) {
 	if (DEBUG >= 3) console.info(`kernel.exp: ${kernel.exp}\ntune.exp: ${tune.exp}`);
 	const cg = new CodeGenerator();
 	cg.memory.import("env", "memory");
-	const distinctOps = mapSetUnion(tune.exp.distinctOps(), re?.epilogue.distinctOps());
+	const distinctOps = mapSetUnion(tune.exp.distinctOps(), tune.epilogue?.distinctOps());
 	const funcs = {};
 	if (distinctOps.has(AluOp.Sin)) funcs.sin = wasm_sin(cg);
 	if (distinctOps.has(AluOp.Cos)) funcs.cos = wasm_cos(cg);
@@ -3753,7 +3788,10 @@ function codegenWasm(kernel) {
 			cg.br(1);
 			cg.end();
 			cg.end();
-			translateExp(cg, funcs, kernel.reduction.epilogue, { acc });
+			translateExp(cg, funcs, tune.epilogue, {
+				acc,
+				gidx
+			});
 		} else translateExp(cg, funcs, tune.exp, { gidx });
 		dty(cg, null, kernel.dtype).store(Math.log2(byteWidth(kernel.dtype)));
 		cg.local.get(gidx);
@@ -4002,7 +4040,7 @@ async function createBackend(device) {
 		if (!navigator.gpu) return null;
 		const adapter = await navigator.gpu.requestAdapter({ powerPreference: "high-performance" });
 		if (!adapter) return null;
-		const { WebGPUBackend } = await import("./webgpu-BGuG58KZ.js");
+		const { WebGPUBackend } = await import("./webgpu-C9iAP5h5.js");
 		const importantLimits = [
 			"maxBufferSize",
 			"maxComputeInvocationsPerWorkgroup",
@@ -4056,4 +4094,4 @@ var UnsupportedOpError = class extends Error {
 //#endregion
 export { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, Executable, FpHash, Kernel, PPrint, Reduction, ShapeTracker, SlotError, UnsupportedOpError, accessorAluExp, accessorGlobal, assertNonNull, byteWidth, checkAxis, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, findPow2, generalBroadcast, getBackend, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, mapSetUnion, normalizeAxis, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, strip1, toposort, tuneWebgpu, unravelAlu, unzip2, zip, zipn };
-//# sourceMappingURL=backend-BqymqzuU.js.map
+//# sourceMappingURL=backend-BY8wlLEl.js.map

package/dist/{backend-DeVfWEFS.cjs → backend-CmaidnkQ.cjs} RENAMED Viewed

@@ -558,16 +558,16 @@ var AluExp = class AluExp {
 		});
 	}
 	/** Reindex gid values in this expression as needed. */
-	reindexGids(gidMap) {
+	reindexGids(newGids) {
 		return this.rewrite((exp) => {
 			if (exp.op === AluOp.GlobalIndex) {
 				const [gid, len] = exp.arg;
-				const newGid = gidMap.get(gid);
-				if (newGid !== void 0 && newGid !== gid) return AluExp.globalIndex(exp.dtype, newGid, len, exp.src[0]);
+				const newGid = newGids[gid];
+				if (newGid !== gid) return AluExp.globalIndex(exp.dtype, newGid, len, exp.src[0]);
 			} else if (exp.op === AluOp.GlobalView) {
 				const gid = exp.arg[0];
-				const newGid = gidMap.get(gid);
-				if (newGid !== void 0 && newGid !== gid) return AluExp.globalView(exp.dtype, newGid, exp.arg[1], exp.src);
+				const newGid = newGids[gid];
+				if (newGid !== gid) return AluExp.globalView(exp.dtype, newGid, exp.arg[1], exp.src);
 			}
 		});
 	}
@@ -781,7 +781,7 @@ var AluExp = class AluExp {
 			if (op === AluOp.Sub && i === 1 && x === 0) return src[1 - i];
 			if (op === AluOp.Mul && x === 1) return src[1 - i];
 			if (op === AluOp.Mul && x === 0) return AluExp.const(this.dtype, 0);
-			if (op === AluOp.Idiv && i === 1 && x === 1) return src[1 - i];
+			if (op === AluOp.Idiv && i === 1 && x === 1 && !isFloatDtype(this.dtype)) return src[1 - i];
 			if (op === AluOp.Cmpne && src[i].dtype === DType.Bool && x === 0) return src[1 - i];
 		}
 		if ((op === AluOp.Add || op === AluOp.Sub) && src[1].op === AluOp.Mul) {
@@ -2067,7 +2067,8 @@ function tuneNullopt(kernel) {
 	if (kernel.reduction) vars.ridx = AluExp.special(DType.Int32, "ridx", kernel.reduction.size);
 	return {
 		exp: kernel.exp.substitute(vars).rewriteGlobalViews().simplify(),
-		outputIdxExp: AluExp.special(DType.Int32, "gidx", kernel.size),
+		epilogue: kernel.reduction?.epilogue.substitute({ gidx: vars.gidx }).rewriteGlobalViews().simplify(),
+		outputIdxExp: vars.gidx,
 		threadCount: kernel.size,
 		size: { reduce: kernel.reduction ? kernel.reduction.size : 0 }
 	};
@@ -2100,7 +2101,11 @@ function tuneWebgpu(kernel) {
 	while (prod(dim.st.shape.slice(0, dim.groups)) >= 1024) {
 		const choices = [];
 		const composedSts = sts.map((st) => st.compose(dim.st));
-		for (let axis = 0; axis < dim.groups; axis++) for (const amount of [3, 4]) if (!upcastedAxis.has(axis) && dim.st.shape[axis] % amount === 0 && composedSts.some((st) => st.lastStrides[axis] === 0 && st.lastStrides.slice(dim.unroll).every((stride) => stride > 0))) {
+		for (let axis = 0; axis < dim.groups; axis++) for (const amount of [
+			3,
+			4,
+			5
+		]) if (!upcastedAxis.has(axis) && dim.st.shape[axis] % amount === 0 && composedSts.some((st) => st.lastStrides[axis] === 0 && st.lastStrides.slice(dim.unroll).every((stride) => stride > 0))) {
 			let nonzeroStrides = 0;
 			let totalStrides = 0;
 			for (const st of composedSts) {
@@ -2176,7 +2181,15 @@ function tuneWebgpu(kernel) {
 	});
 	const outputGidx = dim.outputSt.shape.slice(0, dim.groups);
 	const outputUpcast = dim.outputSt.shape.slice(dim.groups);
-	const [outputIdxExp, _] = dim.outputSt.toAluExp([...unravelAlu(outputGidx, AluExp.special(DType.Int32, "gidx", prod(outputGidx))), ...unravelAlu(outputUpcast, AluVar.upcast)]);
+	const outputIndices = [...unravelAlu(outputGidx, AluExp.special(DType.Int32, "gidx", prod(outputGidx))), ...unravelAlu(outputUpcast, AluVar.upcast)];
+	const [outputIdxExp, _] = dim.outputSt.toAluExp(outputIndices);
+	const newEpilogue = reduction.epilogue.rewrite((exp$1) => {
+		if (exp$1.op === AluOp.GlobalView) {
+			const gid = exp$1.arg[0];
+			const st = exp$1.arg[1];
+			return accessorGlobal(exp$1.dtype, gid, st.compose(dim.outputSt), outputIndices);
+		}
+	});
 	if (prod(dim.st.shape.slice(dim.groups, dim.upcast)) !== reduction.size) throw new Error(`Invariant violation: reduction size ${reduction.size} does not match tuned dims ${JSON.stringify(dim.st.shape.slice(dim.groups, dim.upcast))}`);
 	const size = {
 		groups: prod(dim.st.shape.slice(dim.groups, dim.reduce)),
@@ -2186,6 +2199,7 @@ function tuneWebgpu(kernel) {
 	};
 	return {
 		exp: newExp.simplify(),
+		epilogue: newEpilogue.simplify(),
 		outputIdxExp: outputIdxExp.simplify(),
 		threadCount: kernel.size / size.upcast * size.groups,
 		size
@@ -2244,10 +2258,10 @@ var CpuBackend = class {
 		return new Executable(kernel, void 0);
 	}
 	dispatch({ kernel }, inputs, outputs) {
-		const { exp } = tuneNullopt(kernel);
+		const { exp, epilogue } = tuneNullopt(kernel);
 		const inputBuffers = inputs.map((slot) => this.#getBuffer(slot));
 		const outputBuffers = outputs.map((slot) => this.#getBuffer(slot));
-		const usedArgs = new Map(exp.collect((exp$1) => exp$1.op === AluOp.GlobalIndex).map((exp$1) => [exp$1.arg[0], exp$1.dtype]));
+		const usedArgs = new Map([...exp.collect((exp$1) => exp$1.op === AluOp.GlobalIndex), ...epilogue ? epilogue.collect((exp$1) => exp$1.op === AluOp.GlobalIndex) : []].map((exp$1) => [exp$1.arg[0], exp$1.dtype]));
 		const inputArrays = inputBuffers.map((buf, i) => {
 			const dtype = usedArgs.get(i);
 			if (!dtype) return null;
@@ -2269,7 +2283,10 @@ var CpuBackend = class {
 				}, globals);
 				acc = kernel.reduction.evaluate(acc, item);
 			}
-			outputArray[i] = kernel.reduction.epilogue.evaluate({ acc });
+			outputArray[i] = epilogue.evaluate({
+				acc,
+				gidx: i
+			}, globals);
 		}
 	}
 	#getBuffer(slot) {
@@ -2432,7 +2449,7 @@ function wasm_log(cg) {
 		const t2 = cg.local.declare(cg.f32);
 		cg.local.get(0);
 		cg.f32.const(0);
-		cg.f32.le();
+		cg.f32.lt();
 		cg.if(cg.void);
 		cg.f32.const(NaN);
 		cg.return();
@@ -2447,6 +2464,20 @@ function wasm_log(cg) {
 		cg.i32.const(127);
 		cg.i32.sub();
 		cg.local.set(e);
+		cg.local.get(e);
+		cg.i32.const(-127);
+		cg.i32.eq();
+		cg.if(cg.void);
+		cg.f32.const(-Infinity);
+		cg.return();
+		cg.end();
+		cg.local.get(e);
+		cg.i32.const(128);
+		cg.i32.eq();
+		cg.if(cg.void);
+		cg.local.get(0);
+		cg.return();
+		cg.end();
 		cg.local.get(bits);
 		cg.i32.const(8388607);
 		cg.i32.and();
@@ -2512,7 +2543,7 @@ function _sincos(cg) {
 	cg.f32.mul();
 	cg.f32.nearest();
 	cg.local.tee(qf);
-	cg.i32.trunc_f32_s();
+	cg.i32.trunc_sat_f32_s();
 	cg.local.set(q);
 	cg.local.get(y);
 	cg.local.get(qf);
@@ -3599,6 +3630,7 @@ var F32x4 = class extends V128 {
 //#endregion
 //#region src/backend/wasm.ts
+const moduleCache = /* @__PURE__ */ new Map();
 /** Backend that compiles into WebAssembly bytecode for immediate execution. */
 var WasmBackend = class {
 	type = "wasm";
@@ -3654,8 +3686,11 @@ var WasmBackend = class {
 		return this.prepareSync(kernel);
 	}
 	prepareSync(kernel) {
-		const bytes = codegenWasm(kernel);
-		const module$1 = new WebAssembly.Module(bytes);
+		const kernelHash = FpHash.hash(kernel);
+		const module$1 = runWithCache(moduleCache, kernelHash.toString(), () => {
+			const bytes = codegenWasm(kernel);
+			return new WebAssembly.Module(bytes);
+		});
 		return new Executable(kernel, { module: module$1 });
 	}
 	dispatch(exe, inputs, outputs) {
@@ -3676,7 +3711,7 @@ function codegenWasm(kernel) {
 	if (DEBUG >= 3) console.info(`kernel.exp: ${kernel.exp}\ntune.exp: ${tune.exp}`);
 	const cg = new CodeGenerator();
 	cg.memory.import("env", "memory");
-	const distinctOps = mapSetUnion(tune.exp.distinctOps(), re?.epilogue.distinctOps());
+	const distinctOps = mapSetUnion(tune.exp.distinctOps(), tune.epilogue?.distinctOps());
 	const funcs = {};
 	if (distinctOps.has(AluOp.Sin)) funcs.sin = wasm_sin(cg);
 	if (distinctOps.has(AluOp.Cos)) funcs.cos = wasm_cos(cg);
@@ -3754,7 +3789,10 @@ function codegenWasm(kernel) {
 			cg.br(1);
 			cg.end();
 			cg.end();
-			translateExp(cg, funcs, kernel.reduction.epilogue, { acc });
+			translateExp(cg, funcs, tune.epilogue, {
+				acc,
+				gidx
+			});
 		} else translateExp(cg, funcs, tune.exp, { gidx });
 		dty(cg, null, kernel.dtype).store(Math.log2(byteWidth(kernel.dtype)));
 		cg.local.get(gidx);
@@ -4003,7 +4041,7 @@ async function createBackend(device) {
 		if (!navigator.gpu) return null;
 		const adapter = await navigator.gpu.requestAdapter({ powerPreference: "high-performance" });
 		if (!adapter) return null;
-		const { WebGPUBackend } = await Promise.resolve().then(() => require("./webgpu-CcGP160M.cjs"));
+		const { WebGPUBackend } = await Promise.resolve().then(() => require("./webgpu-BVns4DbI.cjs"));
 		const importantLimits = [
 			"maxBufferSize",
 			"maxComputeInvocationsPerWorkgroup",
@@ -4350,4 +4388,4 @@ Object.defineProperty(exports, 'zipn', {
     return zipn;
   }
 });
-//# sourceMappingURL=backend-DeVfWEFS.cjs.map
+//# sourceMappingURL=backend-CmaidnkQ.cjs.map