npm - @jax-js/jax - Versions diffs - 0.1.12 → 0.1.14 - Mend

@jax-js/jax 0.1.12 → 0.1.14

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (10)

package/README.md +11 -7
package/dist/{backend-x-6vqzIM.cjs → backend-VlXzdQvR.cjs} +2111 -1557
package/dist/{backend-DI-V78Rk.js → backend-apsUOPzb.js} +2111 -1557
package/dist/index.cjs +10 -1
package/dist/index.js +10 -1
package/dist/{webgl-CD3WK_Me.cjs → webgl-C6rCbloA.cjs} +1 -1
package/dist/{webgl-BhsnpeB0.js → webgl-Hh0FX6oV.js} +1 -1
package/dist/{webgpu-C2kLdkUh.js → webgpu-BRv5r9Sl.js} +84 -31
package/dist/{webgpu-C4S8Uq9e.cjs → webgpu-pWnE96Xc.cjs} +84 -31
package/package.json +1 -1

package/dist/index.cjs CHANGED Viewed

@@ -30,7 +30,7 @@ var __toESM = (mod$1, isNodeMode, target) => (target = mod$1 != null ? __create(
 }) : target, mod$1));
 //#endregion
-const require_backend = require('./backend-x-6vqzIM.cjs');
+const require_backend = require('./backend-VlXzdQvR.cjs');
 //#region src/frontend/convolution.ts
 /**
@@ -3224,6 +3224,15 @@ var Array$1 = class Array$1 extends Tracer {
 			},
 			[Primitive.Conv]([x, y], params) {
 				checkConvShape(x.shape, y.shape, params);
+				const shouldMaterializePadding = x.#backend.type === "wasm" && params.lhsDilation.every((d) => d === 1) && params.padding.some(([left, right]) => left > 0 || right > 0);
+				if (shouldMaterializePadding) {
+					x = x.#reshape(x.#st.padOrShrink([...require_backend.rep(params.vmapDims + 2, [0, 0]), ...params.padding]));
+					x.#realize();
+					params = {
+						...params,
+						padding: require_backend.rep(params.padding.length, [0, 0])
+					};
+				}
 				const [stX, stY] = prepareConv(x.#st, y.#st, params);
 				return [Array$1.#naryCustom("conv", ([x$1, y$1]) => require_backend.AluExp.mul(x$1, y$1), [x.#reshape(stX), y.#reshape(stY)], { reduceAxis: true })];
 			},

package/dist/index.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { __export } from "./chunk-Cl8Af3a2.js";
-import { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, FpHash, Kernel, PPrint, Reduction, Routine, Routines, ShapeTracker, accessorAluExp, accessorGlobal, assertNonNull, byteWidth, checkAxis, checkInts, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, generalBroadcast, getBackend, getWebGPUDevice, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, normalizeAxis, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, startTrace, stopTrace, toposort, unravelAlu, unzip2, zip, zipn } from "./backend-DI-V78Rk.js";
+import { AluExp, AluGroup, AluOp, AluVar, DEBUG, DType, FpHash, Kernel, PPrint, Reduction, Routine, Routines, ShapeTracker, accessorAluExp, accessorGlobal, assertNonNull, byteWidth, checkAxis, checkInts, deepEqual, defaultDevice, devices, dtypedArray, dtypedJsArray, generalBroadcast, getBackend, getWebGPUDevice, init, invertPermutation, isFloatDtype, isNumberPair, isPermutation, normalizeAxis, partitionList, prod, promoteTypes, range, recursiveFlatten, rep, runWithCache, setDebug, startTrace, stopTrace, toposort, unravelAlu, unzip2, zip, zipn } from "./backend-apsUOPzb.js";
 //#region src/frontend/convolution.ts
 /**
@@ -3189,6 +3189,15 @@ var Array$1 = class Array$1 extends Tracer {
 			},
 			[Primitive.Conv]([x, y], params) {
 				checkConvShape(x.shape, y.shape, params);
+				const shouldMaterializePadding = x.#backend.type === "wasm" && params.lhsDilation.every((d) => d === 1) && params.padding.some(([left, right]) => left > 0 || right > 0);
+				if (shouldMaterializePadding) {
+					x = x.#reshape(x.#st.padOrShrink([...rep(params.vmapDims + 2, [0, 0]), ...params.padding]));
+					x.#realize();
+					params = {
+						...params,
+						padding: rep(params.padding.length, [0, 0])
+					};
+				}
 				const [stX, stY] = prepareConv(x.#st, y.#st, params);
 				return [Array$1.#naryCustom("conv", ([x$1, y$1]) => AluExp.mul(x$1, y$1), [x.#reshape(stX), y.#reshape(stY)], { reduceAxis: true })];
 			},

package/dist/{webgl-CD3WK_Me.cjs → webgl-C6rCbloA.cjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-const require_backend = require('./backend-x-6vqzIM.cjs');
+const require_backend = require('./backend-VlXzdQvR.cjs');
 //#region src/backend/webgl/builtins.ts
 const threefrySrc = `

package/dist/{webgl-BhsnpeB0.js → webgl-Hh0FX6oV.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { AluGroup, AluOp, DEBUG, DType, Executable, SlotError, UnsupportedOpError, UnsupportedRoutineError, isFloatDtype, range, strip1, tuneNullopt } from "./backend-DI-V78Rk.js";
+import { AluGroup, AluOp, DEBUG, DType, Executable, SlotError, UnsupportedOpError, UnsupportedRoutineError, isFloatDtype, range, strip1, tuneNullopt } from "./backend-apsUOPzb.js";
 //#region src/backend/webgl/builtins.ts
 const threefrySrc = `

package/dist/{webgpu-C2kLdkUh.js → webgpu-BRv5r9Sl.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { AluExp, AluGroup, AluOp, DEBUG, DType, Executable, FpHash, Routines, SlotError, UnsupportedOpError, UnsupportedRoutineError, emitTrace, findPow2, isFloatDtype, isTracing, mapSetUnion, onFlushTrace, prod, range, strip1, traceSourceInfo, tuneWebgpu } from "./backend-DI-V78Rk.js";
+import { AluExp, AluGroup, AluOp, DEBUG, DType, Executable, FpHash, Routines, SlotError, UnsupportedOpError, UnsupportedRoutineError, emitTrace, findPow2, isFloatDtype, isTracing, mapSetUnion, onFlushTrace, prod, range, strip1, traceSourceInfo, tuneWebgpu } from "./backend-apsUOPzb.js";
 //#region src/backend/webgpu/builtins.ts
 const threefrySrc = `
@@ -147,6 +147,13 @@ function constToWgsl(dtype, value) {
 	}
 	throw new Error(`Unsupported const dtype: ${dtype}`);
 }
+function reduceOpWgsl(op, dtype, a, b) {
+	if (op === AluOp.Add) return `(${a} + ${b})`;
+	if (op === AluOp.Mul) return `(${a} * ${b})`;
+	if (op === AluOp.Min) return dtype === DType.Bool ? `(${a} && ${b})` : `min(${a}, ${b})`;
+	if (op === AluOp.Max) return dtype === DType.Bool ? `(${a} || ${b})` : `max(${a}, ${b})`;
+	throw new Error(`Unsupported reduction op: ${op}`);
+}
 /** Codegen for WebGPU expressions, linearizing AluOp into a kernel. */
 var WgslExpCodegen = class {
 	#gensymCount = 0;
@@ -1099,6 +1106,8 @@ function flushTracingBatch(device, batch) {
 //#endregion
 //#region src/backend/webgpu.ts
+const MAX_REUSABLE_BUFFER_BYTES = 64 * 1024 * 1024;
+const MAX_REUSABLE_BUFFERS_PER_SIZE = 64;
 /** Implementation of `Backend` that uses WebGPU in browsers. */
 var WebGPUBackend = class {
 	type = "webgpu";
@@ -1109,6 +1118,7 @@ var WebGPUBackend = class {
 	nextSlot;
 	#cachedShaderMap = /* @__PURE__ */ new Map();
 	#reusableZsb;
+	#bufferPool = /* @__PURE__ */ new Map();
 	constructor(device) {
 		this.device = device;
 		if (DEBUG >= 3 && device.adapterInfo) console.info("webgpu adapter:", device.adapterInfo.vendor, device.adapterInfo.architecture);
@@ -1123,31 +1133,22 @@ var WebGPUBackend = class {
 		});
 	}
 	malloc(size, initialData) {
-		let buffer;
-		const paddedSize = Math.ceil(size / 4) * 4;
-		if (size === 0) buffer = this.#reusableZsb;
-		else if (initialData) {
-			if (initialData.byteLength !== size) throw new Error("initialData size does not match buffer size");
-			if (initialData.byteLength < 4096) {
-				buffer = this.#createBuffer(paddedSize, { mapped: true });
-				new Uint8Array(buffer.getMappedRange(), 0, size).set(initialData);
-				buffer.unmap();
-			} else {
-				buffer = this.#createBuffer(paddedSize);
-				if (initialData.byteLength % 4 === 0) this.device.queue.writeBuffer(buffer, 0, initialData);
-				else {
-					const aligned = initialData.byteLength - initialData.byteLength % 4;
-					this.device.queue.writeBuffer(buffer, 0, initialData, 0, aligned);
-					const remainder = new Uint8Array(4);
-					remainder.set(initialData.subarray(aligned));
-					this.device.queue.writeBuffer(buffer, aligned, remainder);
-				}
-			}
-		} else buffer = this.#createBuffer(paddedSize);
+		if (initialData && initialData.byteLength !== size) throw new Error("initialData size does not match buffer size");
+		const allocatedSize = Math.ceil(size / 4) * 4 || 4;
+		const buffer = size === 0 ? this.#reusableZsb : this.#acquireBuffer(allocatedSize);
+		if (initialData && size > 0) if (initialData.byteLength % 4 === 0) this.device.queue.writeBuffer(buffer, 0, initialData);
+		else {
+			const aligned = initialData.byteLength - initialData.byteLength % 4;
+			if (aligned > 0) this.device.queue.writeBuffer(buffer, 0, initialData, 0, aligned);
+			const remainder = new Uint8Array(4);
+			remainder.set(initialData.subarray(aligned));
+			this.device.queue.writeBuffer(buffer, aligned, remainder);
+		}
 		const slot = this.nextSlot++;
 		this.buffers.set(slot, {
 			buffer,
 			size,
+			allocatedSize,
 			ref: 1
 		});
 		return slot;
@@ -1163,7 +1164,7 @@ var WebGPUBackend = class {
 		buffer.ref--;
 		if (buffer.ref === 0) {
 			this.buffers.delete(slot);
-			if (buffer.buffer !== this.#reusableZsb) buffer.buffer.destroy();
+			if (buffer.buffer !== this.#reusableZsb) this.#releaseBuffer(buffer.buffer, buffer.allocatedSize);
 		}
 	}
 	async read(slot, start, count) {
@@ -1251,6 +1252,29 @@ var WebGPUBackend = class {
 			size: buffer.size
 		};
 	}
+	#acquireBuffer(size) {
+		if (size > MAX_REUSABLE_BUFFER_BYTES) return this.#createBuffer(size);
+		const bucket = this.#bufferPool.get(size);
+		const buffer = bucket?.pop();
+		if (bucket && bucket.length === 0) this.#bufferPool.delete(size);
+		return buffer ?? this.#createBuffer(size);
+	}
+	#releaseBuffer(buffer, size) {
+		if (size > MAX_REUSABLE_BUFFER_BYTES) {
+			buffer.destroy();
+			return;
+		}
+		const bucket = this.#bufferPool.get(size);
+		if (!bucket) {
+			this.#bufferPool.set(size, [buffer]);
+			return;
+		}
+		if (bucket.length >= MAX_REUSABLE_BUFFERS_PER_SIZE) {
+			buffer.destroy();
+			return;
+		}
+		bucket.push(buffer);
+	}
 	/**
 	* Create a GPU buffer.
 	*
@@ -1299,14 +1323,30 @@ function pipelineSource(device, kernel) {
 	}
 	const resultTy = dtypeToWgsl(kernel.dtype, true);
 	wb.emit(`@group(0) @binding(${nargs}) var<storage, read_write> result : array<${resultTy}>;`);
-	const workgroupSize = findPow2(tune.threadCount, 256);
-	const gridSize = Math.ceil(tune.threadCount / workgroupSize);
+	const groupCount = re ? tune.size.groups ?? 1 : 1;
+	const groupedReduction = re && groupCount > 1;
+	if (groupedReduction && tune.threadCount % groupCount !== 0) throw new Error("WebGPU grouped reduction has invalid thread count");
+	if (groupedReduction && groupCount > device.limits.maxComputeWorkgroupSizeX) throw new Error("WebGPU grouped reduction exceeds workgroup size limit");
+	const workgroupSize = groupedReduction ? groupCount : findPow2(tune.threadCount, 256);
+	const gridSize = groupedReduction ? tune.threadCount / groupCount : Math.ceil(tune.threadCount / workgroupSize);
 	const [gridX, gridY] = calculateGrid(gridSize);
-	wb.emit("", `@compute @workgroup_size(${workgroupSize})`, "fn main(@builtin(global_invocation_id) id : vec3<u32>) {", wb.pushIndent);
-	if (gridY === 1) wb.emit(`if (id.x >= ${tune.threadCount}) { return; }`, "let gidx: i32 = i32(id.x);");
-	else {
-		const sizeX = gridX * workgroupSize;
-		wb.emit(`if (${sizeX} * id.y + id.x >= ${tune.threadCount}) { return; }`, `let gidx: i32 = i32(${sizeX} * id.y + id.x);`);
+	if (groupedReduction) {
+		const partialTy = dtypeToWgsl(re.dtype);
+		for (let i = 0; i < (tune.size.upcast ?? 1); i++) wb.emit(`var<workgroup> partial${i}: array<${partialTy}, ${groupCount}>;`);
+	}
+	wb.emit("", `@compute @workgroup_size(${workgroupSize})`);
+	if (groupedReduction) {
+		wb.emit("fn main(", wb.pushIndent, "@builtin(local_invocation_id) lid : vec3<u32>,", "@builtin(workgroup_id) wg_id : vec3<u32>,", wb.popIndent, ") {", wb.pushIndent);
+		if (gridY === 1) wb.emit(`if (wg_id.x >= ${gridSize}u) { return; }`, "let gidx: i32 = i32(wg_id.x);");
+		else wb.emit(`if (${gridX}u * wg_id.y + wg_id.x >= ${gridSize}u) { return; }`, `let gidx: i32 = i32(${gridX}u * wg_id.y + wg_id.x);`);
+		wb.emit("let group: i32 = i32(lid.x);");
+	} else {
+		wb.emit("fn main(@builtin(global_invocation_id) id : vec3<u32>) {", wb.pushIndent);
+		if (gridY === 1) wb.emit(`if (id.x >= ${tune.threadCount}) { return; }`, "let gidx: i32 = i32(id.x);");
+		else {
+			const sizeX = gridX * workgroupSize;
+			wb.emit(`if (${sizeX} * id.y + id.x >= ${tune.threadCount}) { return; }`, `let gidx: i32 = i32(${sizeX} * id.y + id.x);`);
+		}
 	}
 	wb.emitPhonyAssignments(args);
 	const gen = new WgslExpCodegen(wb, args);
@@ -1316,7 +1356,6 @@ function pipelineSource(device, kernel) {
 		if (resultTy !== dtypeToWgsl(tune.exp.dtype)) rhs = `${resultTy}(${rhs})`;
 		wb.emit(`result[gidx] = ${rhs};`);
 	} else {
-		if ((tune.size.groups ?? 1) > 1) throw new Error("WebGPU backend does not support group optimization yet");
 		const unroll = tune.size.unroll ?? 1;
 		const upcast = tune.size.upcast ?? 1;
 		const acc = [...Array(upcast)].map((_, i) => `acc${i}`);
@@ -1352,6 +1391,15 @@ function pipelineSource(device, kernel) {
 			else throw new Error(`Unsupported reduction op: ${re.op}`);
 		}
 		wb.emit(wb.popIndent, "}");
+		if (groupedReduction) {
+			for (let i = 0; i < upcast; i++) wb.emit(`partial${i}[lid.x] = ${acc[i]};`);
+			wb.emit("workgroupBarrier();");
+			for (let stride = groupCount / 2; stride >= 1; stride /= 2) {
+				wb.emit(`if (lid.x < ${stride}u) {`, wb.pushIndent);
+				for (let i = 0; i < upcast; i++) wb.emit(`partial${i}[lid.x] = ${reduceOpWgsl(re.op, re.dtype, `partial${i}[lid.x]`, `partial${i}[lid.x + ${stride}u]`)};`);
+				wb.emit(wb.popIndent, "}", "workgroupBarrier();");
+			}
+		}
 		gen.reset();
 		const outputIdxExps = [];
 		const fusionExps = [];
@@ -1365,12 +1413,17 @@ function pipelineSource(device, kernel) {
 			}).simplify(cache));
 			gen.countReferences(fusionExps[i]);
 		}
+		if (groupedReduction) {
+			wb.emit("if (lid.x == 0u) {", wb.pushIndent);
+			for (let i = 0; i < upcast; i++) wb.emit(`${acc[i]} = partial${i}[0u];`);
+		}
 		for (let i = 0; i < upcast; i++) {
 			const index = strip1(gen.run(outputIdxExps[i]));
 			let rhs = strip1(gen.run(fusionExps[i]));
 			if (resultTy !== dtypeToWgsl(fusionExps[i].dtype)) rhs = `${resultTy}(${rhs})`;
 			wb.emit(`result[${index}] = ${rhs};`);
 		}
+		if (groupedReduction) wb.emit(wb.popIndent, "}");
 	}
 	wb.emit(wb.popIndent, "}");
 	return {

package/dist/{webgpu-C4S8Uq9e.cjs → webgpu-pWnE96Xc.cjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-const require_backend = require('./backend-x-6vqzIM.cjs');
+const require_backend = require('./backend-VlXzdQvR.cjs');
 //#region src/backend/webgpu/builtins.ts
 const threefrySrc = `
@@ -147,6 +147,13 @@ function constToWgsl(dtype, value) {
 	}
 	throw new Error(`Unsupported const dtype: ${dtype}`);
 }
+function reduceOpWgsl(op, dtype, a, b) {
+	if (op === require_backend.AluOp.Add) return `(${a} + ${b})`;
+	if (op === require_backend.AluOp.Mul) return `(${a} * ${b})`;
+	if (op === require_backend.AluOp.Min) return dtype === require_backend.DType.Bool ? `(${a} && ${b})` : `min(${a}, ${b})`;
+	if (op === require_backend.AluOp.Max) return dtype === require_backend.DType.Bool ? `(${a} || ${b})` : `max(${a}, ${b})`;
+	throw new Error(`Unsupported reduction op: ${op}`);
+}
 /** Codegen for WebGPU expressions, linearizing AluOp into a kernel. */
 var WgslExpCodegen = class {
 	#gensymCount = 0;
@@ -1099,6 +1106,8 @@ function flushTracingBatch(device, batch) {
 //#endregion
 //#region src/backend/webgpu.ts
+const MAX_REUSABLE_BUFFER_BYTES = 64 * 1024 * 1024;
+const MAX_REUSABLE_BUFFERS_PER_SIZE = 64;
 /** Implementation of `Backend` that uses WebGPU in browsers. */
 var WebGPUBackend = class {
 	type = "webgpu";
@@ -1109,6 +1118,7 @@ var WebGPUBackend = class {
 	nextSlot;
 	#cachedShaderMap = /* @__PURE__ */ new Map();
 	#reusableZsb;
+	#bufferPool = /* @__PURE__ */ new Map();
 	constructor(device) {
 		this.device = device;
 		if (require_backend.DEBUG >= 3 && device.adapterInfo) console.info("webgpu adapter:", device.adapterInfo.vendor, device.adapterInfo.architecture);
@@ -1123,31 +1133,22 @@ var WebGPUBackend = class {
 		});
 	}
 	malloc(size, initialData) {
-		let buffer;
-		const paddedSize = Math.ceil(size / 4) * 4;
-		if (size === 0) buffer = this.#reusableZsb;
-		else if (initialData) {
-			if (initialData.byteLength !== size) throw new Error("initialData size does not match buffer size");
-			if (initialData.byteLength < 4096) {
-				buffer = this.#createBuffer(paddedSize, { mapped: true });
-				new Uint8Array(buffer.getMappedRange(), 0, size).set(initialData);
-				buffer.unmap();
-			} else {
-				buffer = this.#createBuffer(paddedSize);
-				if (initialData.byteLength % 4 === 0) this.device.queue.writeBuffer(buffer, 0, initialData);
-				else {
-					const aligned = initialData.byteLength - initialData.byteLength % 4;
-					this.device.queue.writeBuffer(buffer, 0, initialData, 0, aligned);
-					const remainder = new Uint8Array(4);
-					remainder.set(initialData.subarray(aligned));
-					this.device.queue.writeBuffer(buffer, aligned, remainder);
-				}
-			}
-		} else buffer = this.#createBuffer(paddedSize);
+		if (initialData && initialData.byteLength !== size) throw new Error("initialData size does not match buffer size");
+		const allocatedSize = Math.ceil(size / 4) * 4 || 4;
+		const buffer = size === 0 ? this.#reusableZsb : this.#acquireBuffer(allocatedSize);
+		if (initialData && size > 0) if (initialData.byteLength % 4 === 0) this.device.queue.writeBuffer(buffer, 0, initialData);
+		else {
+			const aligned = initialData.byteLength - initialData.byteLength % 4;
+			if (aligned > 0) this.device.queue.writeBuffer(buffer, 0, initialData, 0, aligned);
+			const remainder = new Uint8Array(4);
+			remainder.set(initialData.subarray(aligned));
+			this.device.queue.writeBuffer(buffer, aligned, remainder);
+		}
 		const slot = this.nextSlot++;
 		this.buffers.set(slot, {
 			buffer,
 			size,
+			allocatedSize,
 			ref: 1
 		});
 		return slot;
@@ -1163,7 +1164,7 @@ var WebGPUBackend = class {
 		buffer.ref--;
 		if (buffer.ref === 0) {
 			this.buffers.delete(slot);
-			if (buffer.buffer !== this.#reusableZsb) buffer.buffer.destroy();
+			if (buffer.buffer !== this.#reusableZsb) this.#releaseBuffer(buffer.buffer, buffer.allocatedSize);
 		}
 	}
 	async read(slot, start, count) {
@@ -1251,6 +1252,29 @@ var WebGPUBackend = class {
 			size: buffer.size
 		};
 	}
+	#acquireBuffer(size) {
+		if (size > MAX_REUSABLE_BUFFER_BYTES) return this.#createBuffer(size);
+		const bucket = this.#bufferPool.get(size);
+		const buffer = bucket?.pop();
+		if (bucket && bucket.length === 0) this.#bufferPool.delete(size);
+		return buffer ?? this.#createBuffer(size);
+	}
+	#releaseBuffer(buffer, size) {
+		if (size > MAX_REUSABLE_BUFFER_BYTES) {
+			buffer.destroy();
+			return;
+		}
+		const bucket = this.#bufferPool.get(size);
+		if (!bucket) {
+			this.#bufferPool.set(size, [buffer]);
+			return;
+		}
+		if (bucket.length >= MAX_REUSABLE_BUFFERS_PER_SIZE) {
+			buffer.destroy();
+			return;
+		}
+		bucket.push(buffer);
+	}
 	/**
 	* Create a GPU buffer.
 	*
@@ -1299,14 +1323,30 @@ function pipelineSource(device, kernel) {
 	}
 	const resultTy = dtypeToWgsl(kernel.dtype, true);
 	wb.emit(`@group(0) @binding(${nargs}) var<storage, read_write> result : array<${resultTy}>;`);
-	const workgroupSize = require_backend.findPow2(tune.threadCount, 256);
-	const gridSize = Math.ceil(tune.threadCount / workgroupSize);
+	const groupCount = re ? tune.size.groups ?? 1 : 1;
+	const groupedReduction = re && groupCount > 1;
+	if (groupedReduction && tune.threadCount % groupCount !== 0) throw new Error("WebGPU grouped reduction has invalid thread count");
+	if (groupedReduction && groupCount > device.limits.maxComputeWorkgroupSizeX) throw new Error("WebGPU grouped reduction exceeds workgroup size limit");
+	const workgroupSize = groupedReduction ? groupCount : require_backend.findPow2(tune.threadCount, 256);
+	const gridSize = groupedReduction ? tune.threadCount / groupCount : Math.ceil(tune.threadCount / workgroupSize);
 	const [gridX, gridY] = calculateGrid(gridSize);
-	wb.emit("", `@compute @workgroup_size(${workgroupSize})`, "fn main(@builtin(global_invocation_id) id : vec3<u32>) {", wb.pushIndent);
-	if (gridY === 1) wb.emit(`if (id.x >= ${tune.threadCount}) { return; }`, "let gidx: i32 = i32(id.x);");
-	else {
-		const sizeX = gridX * workgroupSize;
-		wb.emit(`if (${sizeX} * id.y + id.x >= ${tune.threadCount}) { return; }`, `let gidx: i32 = i32(${sizeX} * id.y + id.x);`);
+	if (groupedReduction) {
+		const partialTy = dtypeToWgsl(re.dtype);
+		for (let i = 0; i < (tune.size.upcast ?? 1); i++) wb.emit(`var<workgroup> partial${i}: array<${partialTy}, ${groupCount}>;`);
+	}
+	wb.emit("", `@compute @workgroup_size(${workgroupSize})`);
+	if (groupedReduction) {
+		wb.emit("fn main(", wb.pushIndent, "@builtin(local_invocation_id) lid : vec3<u32>,", "@builtin(workgroup_id) wg_id : vec3<u32>,", wb.popIndent, ") {", wb.pushIndent);
+		if (gridY === 1) wb.emit(`if (wg_id.x >= ${gridSize}u) { return; }`, "let gidx: i32 = i32(wg_id.x);");
+		else wb.emit(`if (${gridX}u * wg_id.y + wg_id.x >= ${gridSize}u) { return; }`, `let gidx: i32 = i32(${gridX}u * wg_id.y + wg_id.x);`);
+		wb.emit("let group: i32 = i32(lid.x);");
+	} else {
+		wb.emit("fn main(@builtin(global_invocation_id) id : vec3<u32>) {", wb.pushIndent);
+		if (gridY === 1) wb.emit(`if (id.x >= ${tune.threadCount}) { return; }`, "let gidx: i32 = i32(id.x);");
+		else {
+			const sizeX = gridX * workgroupSize;
+			wb.emit(`if (${sizeX} * id.y + id.x >= ${tune.threadCount}) { return; }`, `let gidx: i32 = i32(${sizeX} * id.y + id.x);`);
+		}
 	}
 	wb.emitPhonyAssignments(args);
 	const gen = new WgslExpCodegen(wb, args);
@@ -1316,7 +1356,6 @@ function pipelineSource(device, kernel) {
 		if (resultTy !== dtypeToWgsl(tune.exp.dtype)) rhs = `${resultTy}(${rhs})`;
 		wb.emit(`result[gidx] = ${rhs};`);
 	} else {
-		if ((tune.size.groups ?? 1) > 1) throw new Error("WebGPU backend does not support group optimization yet");
 		const unroll = tune.size.unroll ?? 1;
 		const upcast = tune.size.upcast ?? 1;
 		const acc = [...Array(upcast)].map((_, i) => `acc${i}`);
@@ -1352,6 +1391,15 @@ function pipelineSource(device, kernel) {
 			else throw new Error(`Unsupported reduction op: ${re.op}`);
 		}
 		wb.emit(wb.popIndent, "}");
+		if (groupedReduction) {
+			for (let i = 0; i < upcast; i++) wb.emit(`partial${i}[lid.x] = ${acc[i]};`);
+			wb.emit("workgroupBarrier();");
+			for (let stride = groupCount / 2; stride >= 1; stride /= 2) {
+				wb.emit(`if (lid.x < ${stride}u) {`, wb.pushIndent);
+				for (let i = 0; i < upcast; i++) wb.emit(`partial${i}[lid.x] = ${reduceOpWgsl(re.op, re.dtype, `partial${i}[lid.x]`, `partial${i}[lid.x + ${stride}u]`)};`);
+				wb.emit(wb.popIndent, "}", "workgroupBarrier();");
+			}
+		}
 		gen.reset();
 		const outputIdxExps = [];
 		const fusionExps = [];
@@ -1365,12 +1413,17 @@ function pipelineSource(device, kernel) {
 			}).simplify(cache));
 			gen.countReferences(fusionExps[i]);
 		}
+		if (groupedReduction) {
+			wb.emit("if (lid.x == 0u) {", wb.pushIndent);
+			for (let i = 0; i < upcast; i++) wb.emit(`${acc[i]} = partial${i}[0u];`);
+		}
 		for (let i = 0; i < upcast; i++) {
 			const index = require_backend.strip1(gen.run(outputIdxExps[i]));
 			let rhs = require_backend.strip1(gen.run(fusionExps[i]));
 			if (resultTy !== dtypeToWgsl(fusionExps[i].dtype)) rhs = `${resultTy}(${rhs})`;
 			wb.emit(`result[${index}] = ${rhs};`);
 		}
+		if (groupedReduction) wb.emit(wb.popIndent, "}");
 	}
 	wb.emit(wb.popIndent, "}");
 	return {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@jax-js/jax",
-  "version": "0.1.12",
+  "version": "0.1.14",
   "description": "Numerical computing and ML in the browser",
   "keywords": [
     "machine learning",