npm - @jax-js/jax - Versions diffs - 0.1.9 → 0.1.11 - Mend

@jax-js/jax 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/README.md +35 -19
package/dist/{backend-BId79r5b.js → backend-DZvR7mZV.js} +831 -26
package/dist/{backend-DpI0riom.cjs → backend-DlYlOYqN.cjs} +872 -25
package/dist/index.cjs +364 -20
package/dist/index.d.cts +175 -11
package/dist/index.d.ts +175 -11
package/dist/index.js +363 -21
package/dist/{webgl-DnGrclTz.js → webgl-D8-14NzA.js} +7 -1
package/dist/{webgl-C5NjXc1p.cjs → webgl-Ovaaa-Qx.cjs} +7 -1
package/dist/{webgpu-AN0cG_nB.js → webgpu-Dg8FpYrH.js} +141 -6
package/dist/{webgpu-CdjiJSa7.cjs → webgpu-uU9nnttc.cjs} +141 -6
package/package.json +5 -16

package/dist/{backend-BId79r5b.js → backend-DZvR7mZV.js} RENAMED Viewed

@@ -313,6 +313,16 @@ function runWithCache(cache, key, thunk) {
 		return value;
 	}
 }
+/**
+* Async version of `runWithCache`.
+*
+* Looks up `JSON.stringify(key)` in `cache`. On a hit the stored value is
+* returned; on a miss `thunk()` is awaited, its result is stored under that
+* string, and the result is returned.
+*
+* NOTE(review): the entry is written only after `thunk()` resolves, so two
+* overlapping calls with the same key will each run `thunk` — confirm that
+* callers tolerate the duplicate work.
+*/
+async function runWithCacheAsync(cache, key, thunk) {
+	const keyStr = JSON.stringify(key);
+	if (cache.has(keyStr)) return cache.get(keyStr);
+	else {
+		const value = await thunk();
+		cache.set(keyStr, value);
+		return value;
+	}
+}
 //#endregion
 //#region src/alu.ts
@@ -415,8 +425,23 @@ var AluExp = class AluExp {
 		this.src = src;
 		this.arg = arg;
 		if (AluGroup.RequiredFloat.has(op) && !isFloatDtype(dtype)) throw new TypeError(`Unsupported dtype for ${op}: ${dtype}`);
-		if (op === AluOp.Bitcast && (dtype === DType.Bool || src[0].dtype === DType.Bool || byteWidth(dtype) !== byteWidth(src[0].dtype))) throw new TypeError(`Bitcast from ${src[0].dtype} -> ${dtype}`);
-		if (op === AluOp.Threefry2x32 && (dtype !== DType.Uint32 || src.some((x) => x.dtype !== DType.Uint32))) throw new TypeError("Threefry2x32 requires uint32 types");
+		switch (op) {
+			case AluOp.Bitcast:
+				if (dtype === DType.Bool || src[0].dtype === DType.Bool || byteWidth(dtype) !== byteWidth(src[0].dtype)) throw new TypeError(`Bitcast from ${src[0].dtype} -> ${dtype}`);
+				break;
+			case AluOp.Threefry2x32:
+				if (dtype !== DType.Uint32 || src.some((x) => x.dtype !== DType.Uint32)) throw new TypeError("Threefry2x32 requires uint32 types");
+				break;
+			case AluOp.BitCombine:
+				if (src[0].dtype !== src[1].dtype || isFloatDtype(src[0].dtype)) throw new TypeError(`BitCombine[${arg}] requires matching integral dtype, got ${src[0].dtype} and ${src[1].dtype}`);
+				break;
+			case AluOp.BitShift:
+				if (src[0].dtype === DType.Bool || src[1].dtype === DType.Bool || isFloatDtype(src[0].dtype) || isFloatDtype(src[1].dtype)) throw new TypeError(`BitShift[${arg}] requires two integral, non-bool dtypes, got ${src[0].dtype} and ${src[1].dtype}`);
+				break;
+			case AluOp.BitInvert:
+				if (isFloatDtype(src[0].dtype)) throw new TypeError(`BitInvert requires an integral dtype, got ${src[0].dtype}`);
+				break;
+		}
 	}
 	static add(a, b) {
 		return new AluExp(AluOp.Add, a.dtype, [a, b]);
@@ -493,6 +518,12 @@ var AluExp = class AluExp {
 			c1
 		], mode);
 	}
+	static bitCombine(a, b, mode) { // Build a BitCombine node over `a` and `b`; `mode` is passed as the node's arg ("and" / "or" are handled explicitly by evaluation, anything else is evaluated as xor).
+		return new AluExp(AluOp.BitCombine, a.dtype, [a, b], mode); // result dtype is taken from `a`
+	}
+	static bitShift(a, b, mode) { // Build a BitShift node shifting `a` by `b`; `mode` is passed as the node's arg ("shl" means left shift, any other value is treated as a right shift).
+		return new AluExp(AluOp.BitShift, a.dtype, [a, b], mode); // result dtype is taken from `a`
+	}
 	static cmplt(a, b) {
 		return new AluExp(AluOp.Cmplt, DType.Bool, [a, b]);
 	}
@@ -965,6 +996,16 @@ var AluExp = class AluExp {
 				case AluOp.Mod: return x % y;
 				case AluOp.Min: return Math.min(x, y);
 				case AluOp.Max: return Math.max(x, y);
+				case AluOp.BitCombine: {
+					let r;
+					if (this.arg === "and") r = x & y;
+					else if (this.arg === "or") r = x | y;
+					else r = x ^ y;
+					return this.dtype === DType.Int32 ? r | 0 : r >>> 0;
+				}
+				case AluOp.BitShift:
+					if (this.arg === "shl") return this.dtype === DType.Int32 ? x << y | 0 : x << y >>> 0;
+					return x >>> y;
 				case AluOp.Cmplt: return Number(x < y);
 				case AluOp.Cmpne: return Number(x != y);
 				default: throw new Error(`Missing implemementation for ${this.op}`);
@@ -1086,6 +1127,18 @@ var AluExp = class AluExp {
 			}
 			if (BIN_SYM[node.op]) return `(${parts[0]} ${BIN_SYM[node.op]} ${parts[1]})`;
 			if (CMP_SYM[node.op]) return `(${parts[0]} ${CMP_SYM[node.op]} ${parts[1]})`;
+			if (node.op === AluOp.BitCombine) {
+				const sym = {
+					and: "&",
+					or: "|",
+					xor: "^"
+				}[node.arg];
+				return `(${parts[0]} ${sym} ${parts[1]})`;
+			}
+			if (node.op === AluOp.BitShift) {
+				const sym = node.arg === "shl" ? "<<" : ">>";
+				return `(${parts[0]} ${sym} ${parts[1]})`;
+			}
 			if (UNARY_SYM[node.op]) return `${UNARY_SYM[node.op]}${parts[0]}`;
 			if (node.op === AluOp.Cast) return `Cast<${node.dtype}>(${strip1(parts[0])})`;
 			if (node.op === AluOp.Bitcast) return `Bitcast<${node.dtype}>(${strip1(parts[0])})`;
@@ -1178,6 +1231,9 @@ let AluOp = /* @__PURE__ */ function(AluOp$1) {
 	AluOp$1["Reciprocal"] = "Reciprocal";
 	AluOp$1["Cast"] = "Cast";
 	AluOp$1["Bitcast"] = "Bitcast";
+	AluOp$1["BitCombine"] = "BitCombine";
+	AluOp$1["BitInvert"] = "BitInvert";
+	AluOp$1["BitShift"] = "BitShift";
 	AluOp$1["Cmplt"] = "Cmplt";
 	AluOp$1["Cmpne"] = "Cmpne";
 	AluOp$1["Where"] = "Where";
@@ -1197,7 +1253,9 @@ const AluGroup = {
 		AluOp.Idiv,
 		AluOp.Mod,
 		AluOp.Min,
-		AluOp.Max
+		AluOp.Max,
+		AluOp.BitCombine,
+		AluOp.BitShift
 	]),
 	Unary: new Set([
 		AluOp.Sin,
@@ -1312,6 +1370,10 @@ var Reduction = class {
 		this.epilogue = epilogue;
 		if (!AluGroup.Reduce.has(op)) throw new TypeError(`Unsupported reduction: ${op}`);
 		this.epilogue = epilogue.simplify();
+		if (this.dtype === DType.Float16 && this.op === AluOp.Add) {
+			this.epilogue = this.epilogue.substitute({ acc: AluExp.cast(this.dtype, AluVar.acc(DType.Float32)) });
+			this.dtype = DType.Float32;
+		}
 	}
 	hash(state) {
 		state.update(this.dtype).update(this.op).update(this.size).update(this.epilogue);
@@ -2266,11 +2328,15 @@ var TuneDims = class {
 };
 /** Tuning step that does not apply any optimization. */
 function tuneNullopt(kernel) {
+	let exp = kernel.exp;
 	const vars = {};
 	vars.gidx = AluExp.special(DType.Int32, "gidx", kernel.size);
-	if (kernel.reduction) vars.ridx = AluExp.special(DType.Int32, "ridx", kernel.reduction.size);
+	if (kernel.reduction) {
+		vars.ridx = AluExp.special(DType.Int32, "ridx", kernel.reduction.size);
+		if (exp.dtype !== kernel.reduction.dtype) exp = AluExp.cast(kernel.reduction.dtype, exp);
+	}
 	return {
-		exp: kernel.exp.substitute(vars).rewriteGlobalViews().simplify(),
+		exp: exp.substitute(vars).rewriteGlobalViews().simplify(),
 		epilogue: kernel.reduction?.epilogue.substitute({ gidx: vars.gidx }).rewriteGlobalViews().simplify(),
 		outputIdxExp: vars.gidx,
 		threadCount: kernel.size,
@@ -2279,8 +2345,9 @@ function tuneNullopt(kernel) {
 }
 /** Tuning for WebGPU kernels. */
 function tuneWebgpu(kernel) {
-	const { exp, reduction } = kernel;
+	const reduction = kernel.reduction;
 	if (!reduction) return tuneNullopt(kernel);
+	const exp = AluExp.cast(reduction.dtype, kernel.exp);
 	const globalIndexes = exp.collect((exp$1) => exp$1.op === AluOp.GlobalIndex);
 	if (globalIndexes.length > 0) {
 		if (DEBUG >= 4) console.info("Tuning: Found GlobalIndex ops, skipping opt.");
@@ -2508,6 +2575,85 @@ var CpuBackend = class {
 	}
 };
+//#endregion
+//#region src/tracing.ts
+let traceEnabled = false; // set by startTrace(), cleared by stopTrace(), read by isTracing()
+const flushCallbacks = []; // filled by onFlushTrace(); every entry is invoked by stopTrace()
+/**
+* Start collecting kernel traces.
+*
+* Traces appear in developer tools under the "Performance" tab, and they are
+* useful for measuring fine-grained kernel execution time.
+*
+* This only flips the module-level flag reported by `isTracing()`; it is safe
+* to call when tracing is already enabled.
+*/
+function startTrace() {
+	traceEnabled = true;
+}
+/**
+* Stop collecting kernel traces and flush pending trace data.
+*
+* Clears the tracing flag, then synchronously calls every callback that was
+* registered through `onFlushTrace`, in registration order.
+*
+* Traces appear in developer tools under the "Performance" tab, and they are
+* useful for measuring fine-grained kernel execution time.
+*/
+function stopTrace() {
+	traceEnabled = false;
+	for (const cb of flushCallbacks) cb(); // callbacks run even if tracing was never started
+}
+/**
+* Check if tracing is currently enabled.
+*
+* Returns the flag last written by `startTrace()` / `stopTrace()`.
+*/
+function isTracing() {
+	return traceEnabled;
+}
+/**
+* Register a callback to flush pending trace data when tracing stops.
+*
+* The callback is appended to a module-level list and is called with no
+* arguments on every `stopTrace()`. No way to unregister is visible in this
+* region.
+*/
+function onFlushTrace(cb) {
+	flushCallbacks.push(cb);
+}
+function humanSize(n) { // Format a count for trace labels: three significant digits plus a B / M / K suffix; values below 1000 are printed unchanged.
+	if (n >= 1e9) return `${(n / 1e9).toPrecision(3)}B`; // NOTE(review): toPrecision(3) switches to exponent notation once the scaled value rounds to >= 1000 (n >= ~1e12 here)
+	if (n >= 1e6) return `${(n / 1e6).toPrecision(3)}M`;
+	if (n >= 1e3) return `${(n / 1e3).toPrecision(3)}K`; // NOTE(review): same rounding edge, e.g. n = 999999 yields "1.00e+3K" — confirm this is acceptable for labels
+	return `${n}`;
+}
+/**
+* Build a trace label, properties, and color from a kernel or routine source.
+*
+* Returns `{ label, color, properties }`, where `properties` is a list of
+* `[name, stringValue]` pairs. `Kernel` instances are colored "primary"
+* (no reduction) or "secondary" (with reduction); any other source is
+* colored "tertiary" and labelled with its `name`.
+*/
+function traceSourceInfo(source) {
+	const properties = [];
+	let label;
+	let color;
+	if (source instanceof Kernel) {
+		label = `Kernel[${humanSize(source.size)}]`;
+		properties.push(["exp", `${source.exp}`]);
+		properties.push(["size", `${source.size}`]);
+		properties.push(["nargs", `${source.nargs}`]);
+		if (!source.reduction) color = "primary";
+		else {
+			color = "secondary";
+			properties.push(["reduction", `${source.reduction.op}:${source.reduction.size}`]);
+		}
+	} else { // presumably a Routine (the only other source kind dispatched in this file) — verify against callers
+		color = "tertiary";
+		label = source.name;
+		properties.push(["inputShapes", source.type.inputShapes.map((s) => `[${s}]`).join(", ")]);
+		properties.push(["outputShapes", source.type.outputShapes.map((s) => `[${s}]`).join(", ")]);
+		properties.push(["dtype", source.type.inputDtypes.join(", ")]); // only the input dtypes are reported
+	}
+	return {
+		label,
+		color,
+		properties
+	};
+}
+/**
+* Emit a trace entry as a `performance.measure` with devtools metadata.
+*
+* `track` names the track inside the "JAX Profiler" track group, `info` is
+* the object returned by `traceSourceInfo`, and `start` / `end` are passed
+* through as the measure's start and end (callers in this file pass
+* `performance.now()` readings).
+*/
+function emitTrace(track, info, start, end) {
+	performance.measure(info.label, {
+		detail: { devtools: {
+			trackGroup: "JAX Profiler",
+			track,
+			color: info.color,
+			properties: info.properties
+		} },
+		start,
+		end
+	});
+}
 //#endregion
 //#region src/backend/wasm/allocator.ts
 /** Simple tensor memory allocator for WebAssembly linear memory. */
@@ -3070,6 +3216,147 @@ function wasm_threefry2x32(cg) {
 	});
 }
+//#endregion
+//#region src/backend/wasm/parallel.ts
+/**
+* Check if both `SharedArrayBuffer` and `Worker` are defined as globals.
+*
+* Despite the name, the result is `false` when either one is missing. The
+* `typeof` tests avoid a ReferenceError when a global does not exist.
+*/
+function hasSharedArrayBuffer() {
+	return typeof SharedArrayBuffer !== "undefined" && typeof Worker !== "undefined";
+}
+const MIN_ELEMS_PER_THREAD = 256; // a dispatch uses at most ceil(size / 256) workers, so small kernels are not split too finely
+const WORKER_SOURCE = /* script text for each pool worker: stores the memory on "init", then instantiates (and caches) the received module and calls its `kernel` export with (...ptrs, begin, end) */ `
+let memory = null;
+let cachedModule = null;
+let cachedFunc = null;
+self.onmessage = (e) => {
+  const msg = e.data;
+  if (msg.type === "init") {
+    memory = msg.memory;
+    postMessage({ type: "ready" });
+    return;
+  }
+  try {
+    const { module, ptrs, begin, end } = msg;
+    if (module !== cachedModule) {
+      cachedModule = module;
+      const instance = new WebAssembly.Instance(module, { env: { memory } });
+      cachedFunc = instance.exports.kernel;
+    }
+    cachedFunc(...ptrs, begin, end);
+    postMessage({ type: "done", ok: true });
+  } catch (err) {
+    postMessage({ type: "done", ok: false, error: String(err) });
+  }
+};
+`;
+/**
+* Pool of Web Workers for parallel WASM kernel dispatch.
+*
+* Workers are created lazily on the first `dispatch()`. Dispatches are chained
+* on an internal promise queue so they run one after another, and each one is
+* identified by a BigInt epoch that callers can wait on via `waitForEpoch()`.
+*/
+var WasmWorkerPool = class {
+	#memory; // memory object sent to every worker in its "init" message
+	#numWorkers; // number of workers to create on first use
+	#workers = []; // empty until #ensureInit() has run
+	#ready = Promise.resolve(); // replaced by a promise that settles once every worker has answered "init"
+	/** Serializes dispatches so concurrent read() calls don't clobber onmessage. */
+	#queue = Promise.resolve();
+	#epoch = 0n; // number of dispatches that have finished (successfully or not)
+	#epochEnd = 0n; // number of dispatches that have been requested
+	#hooks = /* @__PURE__ */ new Map(); // epoch -> list of resolve functions waiting for that epoch
+	constructor(memory, numWorkers) {
+		if (numWorkers <= 0) throw new Error("numWorkers must be positive");
+		this.#memory = memory;
+		this.#numWorkers = numWorkers;
+	}
+	get epoch() { // epoch of the most recently finished dispatch
+		return this.#epoch;
+	}
+	waitForEpoch(target) { // Returns a promise that resolves once the finished-dispatch counter has reached `target`.
+		if (target <= this.#epoch) return Promise.resolve();
+		return new Promise((resolve) => {
+			if (target <= this.#epoch) return resolve();
+			const hooks = this.#hooks.get(target);
+			if (hooks) hooks.push(resolve);
+			else this.#hooks.set(target, [resolve]);
+		});
+	}
+	#ensureInit() { // Create the workers from a Blob URL on first use; later calls return immediately.
+		if (this.#workers.length > 0) return;
+		const blob = new Blob([WORKER_SOURCE], { type: "application/javascript" });
+		const url = URL.createObjectURL(blob);
+		this.#workers = [];
+		const readyPromises = [];
+		for (let i = 0; i < this.#numWorkers; i++) {
+			const worker = new Worker(url, { type: "module" });
+			this.#workers.push(worker);
+			readyPromises.push(new Promise((resolve, reject) => {
+				worker.onmessage = () => resolve(); // the first message of any kind counts as "ready"
+				worker.onerror = (e) => reject(new Error(e.message || "Worker failed to load"));
+			}));
+			worker.postMessage({
+				type: "init",
+				memory: this.#memory
+			});
+		}
+		this.#ready = Promise.all(readyPromises).then(() => {
+			URL.revokeObjectURL(url); // only revoked when every worker became ready
+		});
+		this.#queue = this.#ready; // the first dispatch waits for initialization
+	}
+	/**
+	* Dispatch a kernel across multiple workers.
+	*
+	* Returns the epoch assigned to this dispatch, which can be passed to
+	* `waitForEpoch()` to wait for the work to complete. Returned epochs increase
+	* by one per call, so they are monotonically increasing.
+	*
+	* NOTE(review): a rejection from the queued work (worker error or failed
+	* initialization) is absorbed by the `() => {}` handler below and the epoch
+	* still advances, so failures are not visible to callers — confirm that this
+	* is intended.
+	*/
+	dispatch(module, ptrs, size) {
+		this.#ensureInit();
+		this.#epochEnd++;
+		const result = this.#queue.then(() => this.#dispatchNow(module, ptrs, size));
+		this.#queue = result.then(() => {}, () => {}).then(() => {
+			this.#epoch++;
+			const hooks = this.#hooks.get(this.#epoch);
+			if (hooks) {
+				for (const hook of hooks) hook();
+				this.#hooks.delete(this.#epoch);
+			}
+		});
+		return this.#epochEnd;
+	}
+	async #dispatchNow(module, ptrs, size) { // Split [0, size) into per-worker ranges and wait until every used worker reports back.
+		if (size === 0) return;
+		const n = Math.min(this.#workers.length, Math.ceil(size / MIN_ELEMS_PER_THREAD));
+		const chunkSize = Math.ceil(size / n / 16) * 16; // per-worker range length, rounded up to a multiple of 16
+		const promises = [];
+		for (let i = 0; i < n; i++) {
+			const begin = i * chunkSize;
+			const end = Math.min(begin + chunkSize, size);
+			if (begin >= size) break; // rounding up can leave trailing workers with nothing to do
+			const worker = this.#workers[i];
+			promises.push(new Promise((resolve, reject) => {
+				worker.onmessage = (e) => { // NOTE(review): no onerror handler is installed here; an uncaught worker failure would leave this promise pending — confirm
+					if (e.data.ok) resolve();
+					else reject(/* @__PURE__ */ new Error(`Worker error: ${e.data.error}`));
+				};
+				worker.postMessage({
+					module,
+					ptrs,
+					begin,
+					end
+				});
+			}));
+		}
+		await Promise.all(promises);
+	}
+};
+/**
+* Try to create a worker pool. Returns null if workers are unavailable.
+*
+* The pool size is `navigator.hardwareConcurrency` when that is available and
+* non-zero, otherwise 4. Returns null when `hasSharedArrayBuffer()` is false
+* or when constructing the pool throws.
+*/
+function createWorkerPool(memory) {
+	if (!hasSharedArrayBuffer()) return null;
+	try {
+		const numWorkers = Math.max(1, typeof navigator !== "undefined" && navigator.hardwareConcurrency || 4);
+		return new WasmWorkerPool(memory, numWorkers);
+	} catch {
+		return null;
+	}
+}
 //#endregion
 //#region src/backend/wasm/wasmblr.ts
 /**
@@ -3407,7 +3694,7 @@ var CodeGenerator = class {
 				concat(importSectionBytes, encodeString(this.memory.aString));
 				concat(importSectionBytes, encodeString(this.memory.bString));
 				importSectionBytes.push(2);
-				if (this.memory.min && this.memory.max) {
+				if (this.memory.max) {
 					if (this.memory.isShared) importSectionBytes.push(3);
 					else importSectionBytes.push(1);
 					concat(importSectionBytes, encodeUnsigned(this.memory.min));
@@ -3814,6 +4101,8 @@ var I32x4 = class extends V128 {
 	min_u = VECTOR_OP("min_u", 183, ["v128", "v128"], "v128");
 	max_s = VECTOR_OP("max_s", 184, ["v128", "v128"], "v128");
 	max_u = VECTOR_OP("max_u", 185, ["v128", "v128"], "v128");
+	trunc_sat_f32x4_s = VECTOR_OP("trunc_sat_f32x4_s", 248, ["v128"], "v128");
+	trunc_sat_f32x4_u = VECTOR_OP("trunc_sat_f32x4_u", 249, ["v128"], "v128");
 };
 var F32x4 = class extends V128 {
 	splat = VECTOR_OP("splat", 19, ["f32"], "v128");
@@ -3840,10 +4129,333 @@ var F32x4 = class extends V128 {
 	max = VECTOR_OP("max", 233, ["v128", "v128"], "v128");
 	pmin = VECTOR_OP("pmin", 234, ["v128", "v128"], "v128");
 	pmax = VECTOR_OP("pmax", 235, ["v128", "v128"], "v128");
+	convert_i32x4_s = VECTOR_OP("convert_i32x4_s", 250, ["v128"], "v128");
+	convert_i32x4_u = VECTOR_OP("convert_i32x4_u", 251, ["v128"], "v128");
 };
 //#endregion
 //#region src/backend/wasm.ts
+/**
+* SIMD version of translateExp: emits v128 (f32x4 or i32x4) instructions instead of scalar.
+* gidx always steps by 4. strideMap classifies each GlobalIndex as broadcast/contiguous/gather.
+*
+* Parameters:
+* - `cg`: code generator that receives the instructions.
+* - `funcs`: helper-function table, only forwarded to the scalar `translateExp`.
+* - `exp`: root expression to translate; its v128 value is left on the stack.
+* - `ctx`: map from special-variable name to local; the gather path reads and
+*   temporarily overwrites `ctx["gidx"]`.
+* - `strideMap`: Map from GlobalIndex node to its stride classification; nodes
+*   without an entry are treated as gather.
+*
+* Nodes referenced more than once are stored in a local after their first
+* emission and re-read from it afterwards. Throws for Variable / Special nodes
+* and for any op without a branch below.
+*/
+function translateExpSimd(cg, funcs, exp, ctx, strideMap) {
+	const references = /* @__PURE__ */ new Map(); // node -> number of times it is reached from the root
+	const seen = /* @__PURE__ */ new Set();
+	const countReferences = (exp$1) => {
+		references.set(exp$1, (references.get(exp$1) ?? 0) + 1);
+		if (!seen.has(exp$1)) { // children are only walked on the first visit
+			seen.add(exp$1);
+			for (const src of exp$1.src) countReferences(src);
+		}
+	};
+	const expContext = /* @__PURE__ */ new Map(); // node -> local that already holds its vector value
+	const gen = (exp$1) => {
+		if (expContext.has(exp$1)) return cg.local.get(expContext.get(exp$1));
+		const { op, src, arg, dtype } = exp$1;
+		const isInt = dtype === DType.Int32 || dtype === DType.Uint32 || dtype === DType.Bool; // Bool values travel in i32x4 lanes
+		const isSigned = dtype === DType.Int32;
+		if (op === AluOp.Add) {
+			gen(src[0]);
+			gen(src[1]);
+			if (isInt) cg.i32x4.add();
+			else cg.f32x4.add();
+		} else if (op === AluOp.Sub) {
+			gen(src[0]);
+			gen(src[1]);
+			if (isInt) cg.i32x4.sub();
+			else cg.f32x4.sub();
+		} else if (op === AluOp.Mul) {
+			gen(src[0]);
+			gen(src[1]);
+			if (isInt) cg.i32x4.mul();
+			else cg.f32x4.mul();
+		} else if (op === AluOp.Min) {
+			gen(src[0]);
+			gen(src[1]);
+			if (isInt) if (isSigned) cg.i32x4.min_s();
+			else cg.i32x4.min_u();
+			else cg.f32x4.min();
+		} else if (op === AluOp.Max) {
+			gen(src[0]);
+			gen(src[1]);
+			if (isInt) if (isSigned) cg.i32x4.max_s();
+			else cg.i32x4.max_u();
+			else cg.f32x4.max();
+		} else if (op === AluOp.Sqrt) {
+			gen(src[0]);
+			cg.f32x4.sqrt();
+		} else if (op === AluOp.Floor) {
+			gen(src[0]);
+			cg.f32x4.floor();
+		} else if (op === AluOp.Ceil) {
+			gen(src[0]);
+			cg.f32x4.ceil();
+		} else if (op === AluOp.Const) if (isInt) {
+			cg.i32.const(arg);
+			cg.i32x4.splat();
+		} else {
+			cg.f32.const(arg);
+			cg.f32x4.splat();
+		}
+		else if (op === AluOp.Cast) {
+			gen(src[0]);
+			const dtype0 = src[0].dtype;
+			const src0IsInt = dtype0 === DType.Int32 || dtype0 === DType.Uint32 || dtype0 === DType.Bool;
+			if (isInt && !src0IsInt) if (isSigned) cg.i32x4.trunc_sat_f32x4_s(); // float -> int
+			else cg.i32x4.trunc_sat_f32x4_u();
+			else if (!isInt && src0IsInt) if (dtype0 === DType.Int32 || dtype0 === DType.Bool) cg.f32x4.convert_i32x4_s(); // int -> float; int -> int and float -> float emit nothing
+			else cg.f32x4.convert_i32x4_u();
+		} else if (op === AluOp.Cmplt) {
+			gen(src[0]);
+			gen(src[1]);
+			const srcDtype = src[0].dtype;
+			if (srcDtype === DType.Float32) cg.f32x4.lt();
+			else if (srcDtype === DType.Int32) cg.i32x4.lt_s();
+			else if (srcDtype === DType.Uint32) cg.i32x4.lt_u();
+			else throw new UnsupportedOpError(op, dtype, "wasm");
+			cg.i32.const(1);
+			cg.i32x4.splat();
+			cg.v128.and(); // AND the comparison result with 1 in every lane
+		} else if (op === AluOp.Cmpne) {
+			gen(src[0]);
+			gen(src[1]);
+			const srcDtype = src[0].dtype;
+			if (srcDtype === DType.Float32) cg.f32x4.ne();
+			else cg.i32x4.ne();
+			cg.i32.const(1);
+			cg.i32x4.splat();
+			cg.v128.and(); // AND the comparison result with 1 in every lane
+		} else if (op === AluOp.Where) {
+			gen(src[1]);
+			gen(src[2]);
+			gen(src[0]); // the condition is emitted last so it becomes the bitselect mask
+			cg.i32.const(0);
+			cg.i32x4.splat();
+			cg.i32x4.ne(); // condition != 0, per lane
+			cg.v128.bitselect();
+		} else if (op === AluOp.Variable || op === AluOp.Special) throw new Error(`translateExpSimd: unexpected ${op}(${arg})`);
+		else if (op === AluOp.GlobalIndex) {
+			const [gid, len] = arg;
+			const indexSubtree = src[0]; // index expressions are always evaluated by the scalar translateExp
+			const stride = strideMap.get(exp$1) ?? GATHER;
+			if (stride.kind === "contiguous") { // one 128-bit load starting at the computed index
+				translateExp(cg, funcs, indexSubtree, ctx);
+				{
+					const maxIdx = Math.max(len - SIMD_LANES, 0);
+					const wideIdx = cg.local.declare(cg.i32);
+					cg.local.set(wideIdx);
+					cg.local.get(wideIdx);
+					cg.i32.const(maxIdx);
+					cg.local.get(wideIdx);
+					cg.i32.const(maxIdx);
+					cg.i32.lt_u();
+					cg.select(); // keep the index if it is below maxIdx (unsigned), otherwise use maxIdx
+				}
+				cg.i32.const(byteWidth(dtype));
+				cg.i32.mul();
+				cg.local.get(gid); // local `gid` is added as the base of the byte offset — presumably the buffer pointer argument; verify against the kernel signature
+				cg.i32.add();
+				if (isInt) cg.i32x4.load(4);
+				else cg.f32x4.load(4);
+			} else if (stride.kind === "broadcast") { // one scalar load splatted to all lanes
+				translateExp(cg, funcs, indexSubtree, ctx);
+				const local = cg.local.declare(cg.i32);
+				cg.local.tee(local);
+				cg.i32.const(0);
+				cg.local.get(local), cg.i32.const(len), cg.i32.lt_u();
+				cg.select(); // keep the index if it is below len (unsigned), otherwise use 0
+				cg.i32.const(byteWidth(dtype));
+				cg.i32.mul();
+				cg.local.get(gid);
+				cg.i32.add();
+				if (isInt) {
+					cg.i32.load(2);
+					cg.i32x4.splat();
+				} else {
+					cg.f32.load(2);
+					cg.f32x4.splat();
+				}
+			} else { // gather: one scalar load per lane
+				const steppingLocal = ctx["gidx"];
+				const origValue = cg.local.declare(cg.i32);
+				cg.local.get(steppingLocal);
+				cg.local.set(origValue); // remember gidx so it can be restored after the lanes are filled
+				if (isInt) {
+					cg.i32.const(0);
+					cg.i32x4.splat();
+				} else {
+					cg.f32.const(0);
+					cg.f32x4.splat();
+				}
+				const vec = cg.local.declare(isInt ? cg.i32x4 : cg.f32x4);
+				cg.local.set(vec);
+				const idx = cg.local.declare(cg.i32);
+				const scalarVal = cg.local.declare(isInt ? cg.i32 : cg.f32);
+				for (let lane = 0; lane < SIMD_LANES; lane++) { // unrolled at codegen time: one instruction sequence per lane
+					cg.local.get(origValue);
+					if (lane > 0) {
+						cg.i32.const(lane);
+						cg.i32.add();
+					}
+					cg.local.set(steppingLocal); // gidx = original gidx + lane while this lane's index is evaluated
+					translateExp(cg, funcs, indexSubtree, ctx);
+					cg.local.tee(idx);
+					cg.i32.const(0);
+					cg.local.get(idx), cg.i32.const(len), cg.i32.lt_u();
+					cg.select(); // keep the index if it is below len (unsigned), otherwise use 0
+					cg.i32.const(byteWidth(dtype));
+					cg.i32.mul();
+					cg.local.get(gid);
+					cg.i32.add();
+					if (isInt) cg.i32.load(2);
+					else cg.f32.load(2);
+					cg.local.set(scalarVal);
+					cg.local.get(vec);
+					cg.local.get(scalarVal);
+					if (isInt) cg.i32x4.replace_lane(lane);
+					else cg.f32x4.replace_lane(lane);
+					cg.local.set(vec);
+				}
+				cg.local.get(origValue);
+				cg.local.set(steppingLocal); // restore gidx
+				cg.local.get(vec);
+			}
+		} else throw new Error(`translateExpSimd: unsupported op ${op}`);
+		if ((references.get(exp$1) ?? 0) > 1) { // shared node: keep a copy in a local for later uses
+			const local = cg.local.declare(isInt ? cg.i32x4 : cg.f32x4);
+			cg.local.tee(local);
+			expContext.set(exp$1, local);
+		}
+	};
+	countReferences(exp);
+	gen(exp);
+}
+/**
+* Number of SIMD lanes (f32x4 / i32x4 = 4 lanes).
+*
+* Also the amount by which the SIMD kernel loop advances gidx and the basis
+* of the alignment mask in `emitAlignmentGuard`.
+*/
+const SIMD_LANES = 4;
+function referencesGidx(exp) { // True when `exp` or any node reachable through `src` is a Special node whose arg[0] is "gidx".
+	if (exp.op === AluOp.Special && exp.arg[0] === "gidx") return true;
+	return exp.src.some(referencesGidx); // plain recursion; shared subtrees are revisited
+}
+/**
+* When tileSize > N but doesn't divide evenly, the last group before the
+* inner reset is shorter than N — a SIMD group could straddle it.
+*
+* Returns true only for a finite `tileSize` that is greater than `N` and not
+* a multiple of `N`; an infinite `tileSize` never reports a risk.
+*/
+function hasFragmentRisk(tileSize, N) {
+	return isFinite(tileSize) && tileSize > N && tileSize % N !== 0;
+}
+const GATHER = { kind: "gather" }; // shared "no usable pattern" result of analyzeStride; also the fallback in translateExpSimd when a GlobalIndex has no strideMap entry
+/**
+* Classify how a GlobalIndex's index expression behaves as gidx increments.
+*
+* Returns either `GATHER` or `{ kind, tileSize }` where `kind` is
+* "broadcast" or "contiguous" and `tileSize` is `Infinity` or the smallest
+* constant divisor / modulus applied so far.
+*
+* Rules, in order:
+* - no reference to gidx: broadcast with infinite tile size;
+* - gidx itself: contiguous with infinite tile size;
+* - `x / const` (Idiv) and `x % const` (Mod) over a contiguous `x`: broadcast
+*   and contiguous respectively, unless `hasFragmentRisk` reports a risk;
+* - `const * x`: the inner result if it is broadcast, otherwise gather;
+* - `a + b` where exactly one side references gidx: the result for that side;
+* - anything else: gather.
+*/
+function analyzeStride(exp) {
+	if (!referencesGidx(exp)) return {
+		kind: "broadcast",
+		tileSize: Infinity
+	};
+	if (exp.op === AluOp.Special && exp.arg[0] === "gidx") return {
+		kind: "contiguous",
+		tileSize: Infinity
+	};
+	if (exp.op === AluOp.Idiv && exp.src[1].op === AluOp.Const) {
+		const N = exp.src[1].arg;
+		const inner = analyzeStride(exp.src[0]);
+		if (inner.kind === "broadcast") return inner;
+		if (inner.kind !== "contiguous") return GATHER;
+		if (hasFragmentRisk(inner.tileSize, N)) return GATHER;
+		return {
+			kind: "broadcast",
+			tileSize: Math.min(inner.tileSize, N)
+		};
+	}
+	if (exp.op === AluOp.Mod && exp.src[1].op === AluOp.Const) {
+		const N = exp.src[1].arg;
+		const inner = analyzeStride(exp.src[0]);
+		if (inner.kind === "broadcast") return inner;
+		if (inner.kind !== "contiguous") return GATHER;
+		if (hasFragmentRisk(inner.tileSize, N)) return GATHER;
+		return {
+			kind: "contiguous",
+			tileSize: Math.min(inner.tileSize, N)
+		};
+	}
+	if (exp.op === AluOp.Mul) {
+		for (let i = 0; i < 2; i++) if (exp.src[i].op === AluOp.Const) { // returns on the first constant operand found
+			const inner = analyzeStride(exp.src[1 - i]);
+			if (inner.kind === "broadcast") return inner;
+			return GATHER;
+		}
+	}
+	if (exp.op === AluOp.Add) {
+		const lhsHasGidx = referencesGidx(exp.src[0]);
+		const rhsHasGidx = referencesGidx(exp.src[1]);
+		if (lhsHasGidx && !rhsHasGidx) return analyzeStride(exp.src[0]); // the gidx-free side is ignored
+		if (!lhsHasGidx && rhsHasGidx) return analyzeStride(exp.src[1]);
+	}
+	return GATHER;
+}
+/**
+* Ops that have direct SIMD (f32x4) instruction variants.
+*
+* Returned by `simdSupportedOpsForDtype` for Float32 nodes.
+*/
+const simdF32Ops = new Set([
+	AluOp.Add,
+	AluOp.Sub,
+	AluOp.Mul,
+	AluOp.Floor,
+	AluOp.Ceil,
+	AluOp.Min,
+	AluOp.Max,
+	AluOp.Sqrt,
+	AluOp.Cast,
+	AluOp.Where,
+	AluOp.Const,
+	AluOp.GlobalIndex
+]);
+/**
+* Ops that have direct SIMD (i32x4) instruction variants.
+*
+* Returned by `simdSupportedOpsForDtype` for Int32 and Uint32 nodes.
+*/
+const simdI32Ops = new Set([
+	AluOp.Add,
+	AluOp.Sub,
+	AluOp.Mul,
+	AluOp.Min,
+	AluOp.Max,
+	AluOp.Cast,
+	AluOp.Where,
+	AluOp.Const,
+	AluOp.GlobalIndex
+]);
+/**
+* Ops that produce Bool (i32x4 bitmask) in SIMD.
+*
+* Returned by `simdSupportedOpsForDtype` for Bool nodes. Cast and Where are
+* not listed, so a Bool-typed Cast or Where makes a kernel ineligible.
+*/
+const simdBoolOps = new Set([
+	AluOp.Cmplt,
+	AluOp.Cmpne,
+	AluOp.Const,
+	AluOp.GlobalIndex
+]);
+/**
+* Check if a kernel is eligible for SIMD codegen.
+*
+* A kernel qualifies when:
+* - size >= 4 (need at least 4 elements for a SIMD group)
+* - For reductions: the reduction op has a SIMD variant for its dtype
+* - All nodes have a supported dtype (f32, i32, u32, bool) with SIMD variants
+*
+* Only `tunedExp` is walked. The walk stops at GlobalIndex nodes, so their
+* index subtrees are not checked here.
+*/
+function isSimdEligible(tunedExp, kernel) {
+	if (kernel.size < SIMD_LANES) return false;
+	if (kernel.reduction) {
+		if (!simdSupportedOpsForDtype(kernel.reduction.dtype)?.has(kernel.reduction.op)) return false;
+	}
+	const check = (exp, visited) => {
+		if (visited.has(exp)) return true; // shared node already accepted (or currently being checked)
+		visited.add(exp);
+		const supportedOps = simdSupportedOpsForDtype(exp.dtype);
+		if (!supportedOps || !supportedOps.has(exp.op)) return false;
+		if (exp.op === AluOp.GlobalIndex) return true;
+		for (const child of exp.src) if (!check(child, visited)) return false;
+		return true;
+	};
+	return check(tunedExp, /* @__PURE__ */ new Set());
+}
+function simdSupportedOpsForDtype(dtype) { // Map a dtype to the set of ops that may produce it under SIMD codegen; null means the dtype has no SIMD path.
+	if (dtype === DType.Float32) return simdF32Ops;
+	if (dtype === DType.Int32 || dtype === DType.Uint32) return simdI32Ops;
+	if (dtype === DType.Bool) return simdBoolOps;
+	return null;
+}
 const moduleCache = /* @__PURE__ */ new Map();
 /** Backend that compiles into WebAssembly bytecode for immediate execution. */
 var WasmBackend = class {
@@ -3853,11 +4465,18 @@ var WasmBackend = class {
 	#nextSlot;
 	#allocator;
 	#buffers;
+	#workerPool;
+	#pendingWork = /* @__PURE__ */ new Map();
 	constructor() {
-		this.#memory = new WebAssembly.Memory({ initial: 0 });
+		this.#memory = hasSharedArrayBuffer() ? new WebAssembly.Memory({
+			initial: 0,
+			maximum: 65536,
+			shared: true
+		}) : new WebAssembly.Memory({ initial: 0 });
 		this.#allocator = new WasmAllocator(this.#memory);
 		this.#nextSlot = 1;
 		this.#buffers = /* @__PURE__ */ new Map();
+		this.#workerPool = createWorkerPool(this.#memory);
 	}
 	malloc(size, initialData) {
 		const ptr = this.#allocator.malloc(size);
@@ -3888,37 +4507,70 @@ var WasmBackend = class {
 		}
 	}
 	async read(slot, start, count) {
-		return this.readSync(slot, start, count);
+		const epoch = this.#pendingWork.get(slot);
+		if (epoch) await this.#workerPool.waitForEpoch(epoch);
+		return this.#readData(slot, start, count);
 	}
 	readSync(slot, start, count) {
+		const epoch = this.#pendingWork.get(slot);
+		if (epoch && this.#workerPool.epoch < epoch) throw new Error("cannot read synchronously from a slot with async work");
+		return this.#readData(slot, start, count);
+	}
+	#readData(slot, start, count) {
 		const buffer = this.#getBuffer(slot);
 		if (start === void 0) start = 0;
 		if (count === void 0) count = buffer.byteLength - start;
-		return buffer.slice(start, start + count);
+		if (buffer.buffer instanceof SharedArrayBuffer) return new Uint8Array(buffer.slice(start, start + count));
+		else return buffer.slice(start, start + count);
 	}
 	async prepareKernel(kernel) {
-		return this.prepareKernelSync(kernel);
+		const kernelHash = FpHash.hash(kernel);
+		const module = await runWithCacheAsync(moduleCache, kernelHash.toString(), () => WebAssembly.compile(codegenWasm(kernel)));
+		return new Executable(kernel, {
+			module,
+			parallel: this.#workerPool !== null
+		});
 	}
 	prepareKernelSync(kernel) {
 		const kernelHash = FpHash.hash(kernel);
-		const module = runWithCache(moduleCache, kernelHash.toString(), () => {
-			const bytes = codegenWasm(kernel);
-			return new WebAssembly.Module(bytes);
+		const module = runWithCache(moduleCache, kernelHash.toString(), () => new WebAssembly.Module(codegenWasm(kernel)));
+		return new Executable(kernel, {
+			module,
+			parallel: false
 		});
-		return new Executable(kernel, { module });
 	}
 	async prepareRoutine(routine) {
 		return this.prepareRoutineSync(routine);
 	}
 	prepareRoutineSync(routine) {
-		return new Executable(routine, void 0);
+		return new Executable(routine, {
+			module: void 0,
+			parallel: false
+		});
 	}
 	dispatch(exe, inputs, outputs) {
-		if (exe.source instanceof Routine) return runCpuRoutine(exe.source, inputs.map((slot) => this.#getBuffer(slot)), outputs.map((slot) => this.#getBuffer(slot)));
-		const instance = new WebAssembly.Instance(exe.data.module, { env: { memory: this.#memory } });
-		const func = instance.exports.kernel;
-		const ptrs = [...inputs, ...outputs].map((slot) => this.#buffers.get(slot).ptr);
-		func(...ptrs);
+		const tracing = isTracing();
+		const start = tracing ? performance.now() : 0;
+		if (exe.source instanceof Routine) runCpuRoutine(exe.source, inputs.map((slot) => this.#getBuffer(slot)), outputs.map((slot) => this.#getBuffer(slot)));
+		else {
+			const ptrs = [...inputs, ...outputs].map((slot) => this.#buffers.get(slot).ptr);
+			if (exe.data.parallel && this.#workerPool) {
+				const epoch = this.#workerPool.dispatch(exe.data.module, ptrs, exe.source.size);
+				for (const slot of outputs) this.#pendingWork.set(slot, epoch);
+			} else {
+				if (inputs.some((slot) => {
+					const epoch = this.#pendingWork.get(slot);
+					return epoch && this.#workerPool.epoch < epoch;
+				})) throw new Error("cannot dispatch synchronously with pending async work");
+				const instance = new WebAssembly.Instance(exe.data.module, { env: { memory: this.#memory } });
+				const func = instance.exports.kernel;
+				func(...ptrs, 0, exe.source.size);
+			}
+		}
+		if (tracing) {
+			const info = traceSourceInfo(exe.source);
+			emitTrace("wasm", info, start, performance.now());
+		}
 	}
 	#getBuffer(slot) {
 		const buffer = this.#buffers.get(slot);
@@ -3926,12 +4578,36 @@ var WasmBackend = class {
 		return new Uint8Array(this.#memory.buffer, buffer.ptr, buffer.size);
 	}
 };
+/**
+* Emit a runtime guard: enter the if-block only when [begin, end) is SIMD-aligned.
+*
+* The emitted condition is `((end - begin) & mask) == 0 && (begin & mask) == 0`
+* with `mask = SIMD_LANES - 1`, read from the locals `paramBegin` and
+* `paramEnd`. Only the opening `if` is emitted here; no matching end is
+* emitted in this function.
+*/
+function emitAlignmentGuard(cg, paramBegin, paramEnd) {
+	const mask = SIMD_LANES - 1;
+	cg.local.get(paramEnd);
+	cg.local.get(paramBegin);
+	cg.i32.sub();
+	cg.i32.const(mask);
+	cg.i32.and();
+	cg.i32.eqz(); // 1 when the range length is a multiple of SIMD_LANES
+	cg.local.get(paramBegin);
+	cg.i32.const(mask);
+	cg.i32.and();
+	cg.i32.eqz(); // 1 when begin is a multiple of SIMD_LANES
+	cg.i32.and();
+	cg.if(cg.void);
+}
 function codegenWasm(kernel) {
 	const tune = tuneNullopt(kernel);
 	const re = kernel.reduction;
 	if (DEBUG >= 3) console.info(`kernel.exp: ${kernel.exp}\ntune.exp: ${tune.exp}`);
+	const useSimd = isSimdEligible(tune.exp, kernel);
+	const bufferStrides = /* @__PURE__ */ new Map();
+	if (useSimd) tune.exp.collect((e) => e.op === AluOp.GlobalIndex).forEach((gi) => {
+		const result = analyzeStride(gi.src[0]);
+		if (result.kind !== "gather" && (result.tileSize < SIMD_LANES || isFinite(result.tileSize) && result.tileSize % SIMD_LANES !== 0)) bufferStrides.set(gi, GATHER);
+		else bufferStrides.set(gi, result);
+	});
 	const cg = new CodeGenerator();
 	cg.memory.import("env", "memory");
+	if (hasSharedArrayBuffer()) cg.memory.pages(0, 65536).shared(true);
 	const distinctOps = mapSetUnion(tune.exp.distinctOps(), tune.epilogue?.distinctOps());
 	const funcs = {};
 	if (distinctOps.has(AluOp.Sin)) funcs.sin = wasm_sin(cg);
@@ -3943,12 +4619,127 @@ function codegenWasm(kernel) {
 	if (distinctOps.has(AluOp.Erf)) funcs.erf = wasm_erf(cg, funcs.exp);
 	if (distinctOps.has(AluOp.Erfc)) funcs.erfc = wasm_erfc(cg, funcs.exp);
 	if (distinctOps.has(AluOp.Threefry2x32)) funcs.threefry2x32 = wasm_threefry2x32(cg);
-	const kernelFunc = cg.function(rep(kernel.nargs + 1, cg.i32), [], () => {
+	const paramBegin = kernel.nargs + 1;
+	const paramEnd = kernel.nargs + 2;
+	const kernelFunc = cg.function(rep(kernel.nargs + 3, cg.i32), [], () => {
 		const gidx = cg.local.declare(cg.i32);
+		cg.local.get(paramBegin);
+		cg.local.set(gidx);
+		if (useSimd) {
+			emitAlignmentGuard(cg, paramBegin, paramEnd);
+			cg.loop(cg.void);
+			if (!re) {
+				cg.block(cg.void);
+				cg.local.get(gidx);
+				cg.local.get(paramEnd);
+				cg.i32.ge_u();
+				cg.br_if(0);
+				cg.local.get(kernel.nargs);
+				cg.local.get(gidx);
+				cg.i32.const(byteWidth(kernel.dtype));
+				cg.i32.mul();
+				cg.i32.add();
+				translateExpSimd(cg, funcs, tune.exp, { gidx }, bufferStrides);
+				cg.v128.store(4);
+				cg.local.get(gidx);
+				cg.i32.const(SIMD_LANES);
+				cg.i32.add();
+				cg.local.set(gidx);
+				cg.br(1);
+				cg.end();
+			} else {
+				const reIsInt = kernel.exp.dtype === DType.Int32 || kernel.exp.dtype === DType.Uint32;
+				cg.block(cg.void);
+				cg.local.get(gidx);
+				cg.local.get(paramEnd);
+				cg.i32.ge_u();
+				cg.br_if(0);
+				const vecAcc = cg.local.declare(reIsInt ? cg.i32x4 : cg.f32x4);
+				if (reIsInt) {
+					cg.i32.const(re.identity);
+					cg.i32x4.splat();
+				} else {
+					cg.f32.const(re.identity);
+					cg.f32x4.splat();
+				}
+				cg.local.set(vecAcc);
+				const ridx = cg.local.declare(cg.i32);
+				cg.i32.const(0);
+				cg.local.set(ridx);
+				cg.loop(cg.void);
+				cg.block(cg.void);
+				cg.local.get(ridx);
+				cg.i32.const(re.size);
+				cg.i32.ge_u();
+				cg.br_if(0);
+				translateExpSimd(cg, funcs, tune.exp, {
+					gidx,
+					ridx
+				}, bufferStrides);
+				cg.local.get(vecAcc);
+				if (reIsInt) if (re.op === AluOp.Add) cg.i32x4.add();
+				else if (re.op === AluOp.Mul) cg.i32x4.mul();
+				else if (re.op === AluOp.Min) if (re.dtype === DType.Int32) cg.i32x4.min_s();
+				else cg.i32x4.min_u();
+				else if (re.op === AluOp.Max) if (re.dtype === DType.Int32) cg.i32x4.max_s();
+				else cg.i32x4.max_u();
+				else throw new Error(`invalid SIMD reduction op: ${re.op}`);
+				else if (re.op === AluOp.Add) cg.f32x4.add();
+				else if (re.op === AluOp.Mul) cg.f32x4.mul();
+				else if (re.op === AluOp.Min) cg.f32x4.min();
+				else if (re.op === AluOp.Max) cg.f32x4.max();
+				else throw new Error(`invalid SIMD reduction op: ${re.op}`);
+				cg.local.set(vecAcc);
+				cg.local.get(ridx);
+				cg.i32.const(1);
+				cg.i32.add();
+				cg.local.set(ridx);
+				cg.br(1);
+				cg.end();
+				cg.end();
+				for (let lane = 0; lane < SIMD_LANES; lane++) {
+					cg.local.get(kernel.nargs);
+					cg.local.get(gidx);
+					if (lane > 0) {
+						cg.i32.const(lane);
+						cg.i32.add();
+					}
+					cg.i32.const(byteWidth(kernel.dtype));
+					cg.i32.mul();
+					cg.i32.add();
+					const acc = cg.local.declare(reIsInt ? cg.i32 : cg.f32);
+					cg.local.get(vecAcc);
+					if (reIsInt) cg.i32x4.extract_lane(lane);
+					else cg.f32x4.extract_lane(lane);
+					cg.local.set(acc);
+					const laneGidx = cg.local.declare(cg.i32);
+					cg.local.get(gidx);
+					if (lane > 0) {
+						cg.i32.const(lane);
+						cg.i32.add();
+					}
+					cg.local.set(laneGidx);
+					translateExp(cg, funcs, tune.epilogue, {
+						acc,
+						gidx: laneGidx
+					});
+					dty(cg, null, kernel.dtype).store(Math.log2(byteWidth(kernel.dtype)));
+				}
+				cg.local.get(gidx);
+				cg.i32.const(SIMD_LANES);
+				cg.i32.add();
+				cg.local.set(gidx);
+				cg.br(1);
+				cg.end();
+			}
+			cg.end();
+			cg.return();
+			cg.end();
+		}
 		cg.loop(cg.void);
 		cg.block(cg.void);
 		cg.local.get(gidx);
-		cg.i32.const(kernel.size);
+		cg.local.get(paramEnd);
 		cg.i32.ge_u();
 		cg.br_if(0);
 		cg.local.get(kernel.nargs);
@@ -4087,6 +4878,11 @@ function translateExp(cg, funcs, exp, ctx) {
 				else cg.i32.gt_u();
 				cg.select();
 			} else throw new UnsupportedOpError(op, dtype, "wasm");
+			else if (op === AluOp.BitCombine) if (arg === "and") cg.i32.and();
+			else if (arg === "or") cg.i32.or();
+			else cg.i32.xor();
+			else if (op === AluOp.BitShift) if (arg === "shl") cg.i32.shl();
+			else cg.i32.shr_u();
 			else if (op === AluOp.Cmplt) {
 				const srcDtype = src[0].dtype;
 				if (isFloatDtype(srcDtype)) dtyF(cg, op, srcDtype).lt();
@@ -4263,7 +5059,7 @@ async function createBackend(device) {
 		if (!navigator.gpu) return null;
 		const adapter = await navigator.gpu.requestAdapter({ powerPreference: "high-performance" });
 		if (!adapter) return null;
-		const { WebGPUBackend } = await import("./webgpu-AN0cG_nB.js");
+		const { WebGPUBackend } = await import("./webgpu-Dg8FpYrH.js");
 		const importantLimits = [
 			"maxBufferSize",
 			"maxComputeInvocationsPerWorkgroup",
@@ -4301,7 +5097,7 @@ async function createBackend(device) {
 		});
 		if (!gl) return null;
 		if (!gl.getExtension("EXT_color_buffer_float")) return null;
-		const { WebGLBackend } = await import("./webgl-DnGrclTz.js");
+		const { WebGLBackend } = await import("./webgl-D8-14NzA.js");
 		return new WebGLBackend(gl);
 	} else throw new Error(`Backend not found: ${device}`);
 }
@@ -4335,6 +5131,15 @@ var UnsupportedRoutineError = class extends Error {
 		super(`routine '${name}' is not supported in ${device} backend`);
 	}
 };
+/**
+* Return the `GPUDevice` that the WebGPU backend runs on. This is useful for sharing
+* buffers. Throws an `Error` if the WebGPU backend has not been initialized yet.
+*/
+function getWebGPUDevice() {
+	const backend = initializedBackends.get("webgpu"); // look up the already-initialized backend, if any
+	if (!backend) throw new Error("WebGPU backend not initialized, call init('webgpu') first");
+	return backend.device; // the backend object exposes its device as `.device`
+}
 //#endregion
-export { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, Executable, FpHash, Kernel, PPrint, Reduction, Routine, Routines, ShapeTracker, SlotError, UnsupportedOpError, UnsupportedRoutineError, accessorAluExp, accessorGlobal, assertNonNull, byteWidth, checkAxis, checkInts, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, findPow2, generalBroadcast, getBackend, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, mapSetUnion, normalizeAxis, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, strip1, toposort, tuneNullopt, tuneWebgpu, unravelAlu, unzip2, zip, zipn };
+export { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, Executable, FpHash, Kernel, PPrint, Reduction, Routine, Routines, ShapeTracker, SlotError, UnsupportedOpError, UnsupportedRoutineError, accessorAluExp, accessorGlobal, assertNonNull, byteWidth, checkAxis, checkInts, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, emitTrace, findPow2, generalBroadcast, getBackend, getWebGPUDevice, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, isTracing, mapSetUnion, normalizeAxis, onFlushTrace, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, startTrace, stopTrace, strip1, toposort, traceSourceInfo, tuneNullopt, tuneWebgpu, unravelAlu, unzip2, zip, zipn };