npm - @jax-js/jax - Versions diffs - 0.1.2 → 0.1.4 - Mend

@jax-js/jax 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/README.md +16 -34
package/dist/{backend-DeVfWEFS.cjs → backend-Bu9GY6sK.cjs} +222 -36
package/dist/{backend-BqymqzuU.js → backend-tngXtWe4.js} +204 -36
package/dist/index.cjs +1798 -955
package/dist/index.d.cts +383 -97
package/dist/index.d.ts +383 -97
package/dist/index.js +1791 -949
package/dist/{webgpu-BGuG58KZ.js → webgpu-ChVgx3b6.js} +410 -97
package/dist/{webgpu-CcGP160M.cjs → webgpu-Oj3Kd-kd.cjs} +410 -97
package/package.json +1 -1

package/dist/{webgpu-BGuG58KZ.js → webgpu-ChVgx3b6.js} RENAMED

@@ -1,4 +1,4 @@
-import { AluExp, AluGroup, AluOp, DEBUG, DType, Executable, FpHash, SlotError, UnsupportedOpError, findPow2, isFloatDtype, mapSetUnion, strip1, tuneWebgpu } from "./backend-BqymqzuU.js";
+import { AluExp, AluGroup, AluOp, DEBUG, DType, Executable, FpHash, Routines, SlotError, UnsupportedOpError, UnsupportedRoutineError, findPow2, isFloatDtype, mapSetUnion, prod, range, strip1, tuneWebgpu } from "./backend-tngXtWe4.js";
 //#region src/backend/webgpu/builtins.ts
 const threefrySrc = `
@@ -66,6 +66,59 @@ fn erfc(x: f32) -> f32 {
   return select(2.0 - E, E, x >= 0.0);
 }`;
+//#endregion
+//#region src/backend/webgpu/codegen.ts
+const headerWgsl = String.raw`
+fn nan() -> f32 { let bits = 0xffffffffu; return bitcast<f32>(bits); }
+fn inf() -> f32 { let bits = 0x7f800000u; return bitcast<f32>(bits); }
+`.trim();
+function dtypeToWgsl(dtype, storage = false) {
+	switch (dtype) {
+		case DType.Bool: return storage ? "i32" : "bool";
+		case DType.Int32: return "i32";
+		case DType.Uint32: return "u32";
+		case DType.Float32: return "f32";
+		case DType.Float16: return "f16";
+		default: throw new Error(`Unsupported dtype for WebGPU: ${dtype}`);
+	}
+}
+function maxValueWgsl(dtype) {
+	switch (dtype) {
+		case DType.Bool: return "1";
+		case DType.Int32: return "2147483647";
+		case DType.Uint32: return "4294967295u";
+		case DType.Float32: return "inf()";
+		case DType.Float16: return "f16(inf())";
+		default: throw new Error(`Unsupported dtype for WebGPU: ${dtype}`);
+	}
+}
+function constToWgsl(dtype, value) {
+	if (dtype === DType.Bool) return value ? "true" : "false";
+	if (dtype === DType.Int32) return value.toString();
+	if (dtype === DType.Uint32) return value.toString() + "u";
+	if (dtype === DType.Float32) {
+		if (Number.isNaN(value)) return "nan()";
+		if (!Number.isFinite(value)) return value > 0 ? "inf()" : "-inf()";
+		return "f32(" + value.toString() + ")";
+	}
+	if (dtype === DType.Float16) {
+		if (Number.isNaN(value)) return "f16(nan())";
+		if (!Number.isFinite(value)) return value > 0 ? "f16(inf())" : "f16(-inf())";
+		return "f16(" + value.toString() + ")";
+	}
+	throw new Error(`Unsupported const dtype: ${dtype}`);
+}
+const gridOffsetY = 16384;
+function calculateGrid(gridSize) {
+	let gridX = gridSize;
+	let gridY = 1;
+	if (gridSize > 65535) {
+		gridX = gridOffsetY;
+		gridY = Math.ceil(gridSize / gridOffsetY);
+	}
+	return [gridX, gridY];
+}
 //#endregion
 //#region src/backend/webgpu/reader.ts
 /**
@@ -170,6 +223,205 @@ var SyncReader = class SyncReader {
 	}
 };
+//#endregion
+//#region src/backend/webgpu/routines.ts
+function bitonicSortUniform(pass) {
+	const ar = new Uint32Array(3);
+	ar[0] = pass.kind === "sort" ? 0 : 1;
+	ar[1] = pass.mergeStep ?? 0;
+	ar[2] = pass.mergeStage ?? 0;
+	return new Uint8Array(ar.buffer);
+}
+/**
+* Generate a bitonic sort shader.
+*
+* We implement a variant of bitonic sort that [only has forward comparators](
+* <https://sortingalgos.miraheze.org/wiki/Bitonic_Sort#Bitonic_Sort_using_Forward_Comparators>),
+* so we don't need to allocate memory for power-of-two padding.
+*
+* This uses workgroup shared memory up to `2*workgroupSize` elements, for each
+* array in `batches`. For larger arrays, multiple passes are done:
+*
+* - Initial "sort" pass: each workgroup sorts its `2*workgroupSize` elements.
+* - Subsequent "merge" passes: each pass merges sorted sequences of size
+*   `2^(step+1)` with multiple workgroups. This doesn't use shared memory.
+*
+* The total number of passes is roughly `log2(n / workgroupSize)^2 / 2`.
+*/
+function bitonicSortShader(device, dtype, n, batches, outputIndices) {
+	const ty = dtypeToWgsl(dtype, true);
+	const paddedN = 1 << Math.ceil(Math.log2(n || 1));
+	const numThreads = Math.ceil(paddedN / 2);
+	const workgroupSize = findPow2(numThreads, device.limits.maxComputeWorkgroupSizeX);
+	const workgroupsPerBatch = numThreads / workgroupSize;
+	const numStages = Math.log2(paddedN);
+	const numLocalStages = Math.min(numStages, Math.log2(workgroupSize * 2));
+	const needsF16 = dtype === DType.Float16;
+	const padValue = isFloatDtype(dtype) ? `${ty}(nan())` : maxValueWgsl(dtype);
+	const code = `
+${needsF16 ? "enable f16;" : ""}
+${headerWgsl}
+struct Uniforms {
+  kind: u32, // 0 = sort, 1 = merge
+  merge_step: u32, // half_block = 2^step
+  merge_stage: u32, // only used for merge
+}
+@group(0) @binding(0) var<storage, read> input: array<${ty}>;
+@group(0) @binding(1) var<storage, read_write> output: array<${ty}>;
+${outputIndices ? `@group(0) @binding(2) var<storage, read_write> output_idx: array<i32>;` : ""}
+@group(1) @binding(0) var<uniform> uniforms: Uniforms;
+var<workgroup> shared_vals: array<${ty}, ${workgroupSize * 2}>;
+${outputIndices ? `var<workgroup> shared_idx: array<i32, ${workgroupSize * 2}>;` : ""}
+fn compare(a: ${ty}, b: ${ty}) -> bool {
+${isFloatDtype(dtype) ? `
+  let min_value = min(a, b);
+  return a == min_value && b != min_value;` : "  return a < b;"}
+}
+fn compare_and_swap(i: u32, j: u32) {
+  let val_i = shared_vals[i];
+  let val_j = shared_vals[j];
+  if (compare(val_j, val_i)) {
+    shared_vals[i] = val_j;
+    shared_vals[j] = val_i;
+${outputIndices ? `
+    let tmp_idx = shared_idx[i];
+    shared_idx[i] = shared_idx[j];
+    shared_idx[j] = tmp_idx;` : ""}
+  }
+}
+@compute @workgroup_size(${workgroupSize})
+fn main(
+  @builtin(workgroup_id) wg_id: vec3<u32>,
+  @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+  let blockid = wg_id.x + wg_id.y * ${gridOffsetY}u;
+  let batch = blockid / ${workgroupsPerBatch}u;
+  let wg_in_batch = blockid % ${workgroupsPerBatch}u;
+  let tid = local_id.x;
+  let base = batch * ${n}u;
+  if (uniforms.kind == 0u || (uniforms.kind == 1u && uniforms.merge_step == ${numLocalStages - 1}u)) {
+    let wg_base = wg_in_batch * ${workgroupSize * 2}u;
+    // Load data into shared memory (2 elements per thread)
+    let idx0 = tid * 2u;
+    let idx1 = tid * 2u + 1u;
+    // Load from input for initial 'sort' pass, then from output (read-write) for 'merge' passes.
+    if (uniforms.kind == 0u) {
+      shared_vals[idx0] = select(${padValue}, input[base + wg_base + idx0], wg_base + idx0 < ${n}u);
+      shared_vals[idx1] = select(${padValue}, input[base + wg_base + idx1], wg_base + idx1 < ${n}u);
+${outputIndices ? `
+      shared_idx[idx0] = i32(wg_base + idx0);
+      shared_idx[idx1] = i32(wg_base + idx1);` : ""}
+    } else {
+      shared_vals[idx0] = select(${padValue}, output[base + wg_base + idx0], wg_base + idx0 < ${n}u);
+      shared_vals[idx1] = select(${padValue}, output[base + wg_base + idx1], wg_base + idx1 < ${n}u);
+${outputIndices ? `
+      shared_idx[idx0] = select(${n}, output_idx[base + wg_base + idx0], wg_base + idx0 < ${n}u);
+      shared_idx[idx1] = select(${n}, output_idx[base + wg_base + idx1], wg_base + idx1 < ${n}u);` : ""}
+    }
+    workgroupBarrier();
+    let initial_stage = select(0u, ${numLocalStages - 1}u, uniforms.kind != 0u);
+    for (var stage = initial_stage; stage < ${numLocalStages}u; stage++) {
+      for (var step1 = stage + 1u; step1 > 0u; step1--) {
+        let step = step1 - 1u;
+        let half_block = 1u << step;
+        let is_first_step = uniforms.kind == 0u && step == stage;
+        let block_offset = (tid / half_block) * half_block;
+        let local_offset = tid % half_block;
+        let i = block_offset * 2u + local_offset;
+        let j = select(i + half_block, i ^ (half_block * 2u - 1u), is_first_step);
+        compare_and_swap(i, j);
+        workgroupBarrier();
+      }
+    }
+    if (wg_base + idx0 < ${n}u) {
+      output[base + wg_base + idx0] = shared_vals[idx0];
+      ${outputIndices ? `output_idx[base + wg_base + idx0] = shared_idx[idx0];` : ""}
+    }
+    if (wg_base + idx1 < ${n}u) {
+      output[base + wg_base + idx1] = shared_vals[idx1];
+      ${outputIndices ? `output_idx[base + wg_base + idx1] = shared_idx[idx1];` : ""}
+    }
+  } else {
+    // Execute single merge pass for a step >= numLocalStages.
+    let half_block = 1u << uniforms.merge_step;  // half_block >= workgroupSize * 2
+    let thread_in_batch = wg_in_batch * ${workgroupSize} + tid;
+    let is_first_step = uniforms.merge_step == uniforms.merge_stage;
+    let block_offset = (thread_in_batch / half_block) * half_block;
+    let local_offset = thread_in_batch % half_block;
+    let i = block_offset * 2u + local_offset;
+    let j = select(i + half_block, i ^ (half_block * 2u - 1u), is_first_step);
+    // Global version of compare_and_swap()
+    if (j < ${n}u) {
+      let val_i = output[base + i];
+      let val_j = output[base + j];
+      if (compare(val_j, val_i)) {
+        output[base + i] = val_j;
+        output[base + j] = val_i;
+${outputIndices ? `
+        let tmp_idx = output_idx[base + i];
+        output_idx[base + i] = output_idx[base + j];
+        output_idx[base + j] = tmp_idx;` : ""}
+      }
+    }
+  }
+}
+`.trim();
+	const grid = calculateGrid(batches * workgroupsPerBatch);
+	const passes = [{ kind: "sort" }];
+	for (let mergeStage = numLocalStages; mergeStage < numStages; mergeStage++) for (let mergeStep = mergeStage; mergeStep >= numLocalStages - 1; mergeStep--) passes.push({
+		kind: "merge",
+		mergeStep,
+		mergeStage
+	});
+	return [{
+		code,
+		numInputs: 1,
+		numOutputs: outputIndices ? 2 : 1,
+		hasUniform: true,
+		passes: passes.map((pass) => ({
+			grid,
+			uniform: bitonicSortUniform(pass)
+		}))
+	}];
+}
+function createSort(device, type) {
+	const dtype = type.inputDtypes[0];
+	const shape = type.inputShapes[0];
+	const n = shape[shape.length - 1];
+	const batches = prod(shape.slice(0, -1));
+	return bitonicSortShader(device, dtype, n, batches, false);
+}
+function createArgsort(device, type) {
+	const dtype = type.inputDtypes[0];
+	const shape = type.inputShapes[0];
+	const n = shape[shape.length - 1];
+	const batches = prod(shape.slice(0, -1));
+	return bitonicSortShader(device, dtype, n, batches, true);
+}
+function createRoutineShader(device, routine) {
+	switch (routine.name) {
+		case Routines.Sort: return createSort(device, routine.type);
+		case Routines.Argsort: return createArgsort(device, routine.type);
+		default: throw new UnsupportedRoutineError(routine.name, "webgpu");
+	}
+}
 //#endregion
 //#region src/backend/webgpu.ts
 /** Implementation of `Backend` that uses WebGPU in browsers. */
@@ -181,6 +433,7 @@ var WebGPUBackend = class {
 	buffers;
 	nextSlot;
 	#cachedShaderMap = /* @__PURE__ */ new Map();
+	#reusableZsb;
 	constructor(device) {
 		this.device = device;
 		if (DEBUG >= 3 && device.adapterInfo) console.info("webgpu adapter:", device.adapterInfo.vendor, device.adapterInfo.architecture);
@@ -189,11 +442,16 @@ var WebGPUBackend = class {
 		this.syncReader = new SyncReader(device);
 		this.buffers = /* @__PURE__ */ new Map();
 		this.nextSlot = 1;
+		this.#reusableZsb = this.#createBuffer(4);
+		device.addEventListener("uncapturederror", (event) => {
+			console.error("Uncaptured error in WebGPU backend:", event.error.message);
+		});
 	}
 	malloc(size, initialData) {
 		let buffer;
 		const paddedSize = Math.ceil(size / 4) * 4;
-		if (initialData) {
+		if (size === 0) buffer = this.#reusableZsb;
+		else if (initialData) {
 			if (initialData.byteLength !== size) throw new Error("initialData size does not match buffer size");
 			if (initialData.byteLength < 4096) {
 				buffer = this.#createBuffer(paddedSize, { mapped: true });
@@ -230,11 +488,12 @@ var WebGPUBackend = class {
 		buffer.ref--;
 		if (buffer.ref === 0) {
 			this.buffers.delete(slot);
-			buffer.buffer.destroy();
+			if (buffer.buffer !== this.#reusableZsb) buffer.buffer.destroy();
 		}
 	}
 	async read(slot, start, count) {
 		const { buffer, size } = this.#getBuffer(slot);
+		if (buffer === this.#reusableZsb) return new Uint8Array();
 		if (start === void 0) start = 0;
 		if (count === void 0) count = size - start;
 		const paddedSize = Math.ceil(count / 4) * 4;
@@ -252,6 +511,7 @@ var WebGPUBackend = class {
 	}
 	readSync(slot, start, count) {
 		const { buffer, size } = this.#getBuffer(slot);
+		if (buffer === this.#reusableZsb) return new Uint8Array();
 		if (start === void 0) start = 0;
 		if (count === void 0) count = size - start;
 		return this.syncReader.read(buffer, start, count);
@@ -265,23 +525,43 @@ var WebGPUBackend = class {
 		}
 		return result;
 	}
-	async prepare(kernel) {
-		const { shader, grid } = this.#cachedShader(kernel);
+	async prepareKernel(kernel) {
+		const shader = this.#cachedShader(kernel);
 		const pipeline = await this.pipelines.prepare(shader);
-		return new Executable(kernel, {
-			shader,
-			grid,
+		return new Executable(kernel, [{
+			...shader,
 			pipeline
-		});
+		}]);
 	}
-	prepareSync(kernel) {
-		const { shader, grid } = this.#cachedShader(kernel);
+	prepareKernelSync(kernel) {
+		const shader = this.#cachedShader(kernel);
 		const pipeline = this.pipelines.prepareSync(shader);
-		return new Executable(kernel, {
-			shader,
-			grid,
+		return new Executable(kernel, [{
+			...shader,
 			pipeline
+		}]);
+	}
+	async prepareRoutine(routine) {
+		const shaders = createRoutineShader(this.device, routine);
+		const dispatches = await Promise.all(shaders.map(async (shader) => {
+			const pipeline = await this.pipelines.prepare(shader);
+			return {
+				...shader,
+				pipeline
+			};
+		}));
+		return new Executable(routine, dispatches);
+	}
+	prepareRoutineSync(routine) {
+		const shaders = createRoutineShader(this.device, routine);
+		const dispatches = shaders.map((shader) => {
+			const pipeline = this.pipelines.prepareSync(shader);
+			return {
+				...shader,
+				pipeline
+			};
 		});
+		return new Executable(routine, dispatches);
 	}
 	dispatch(exe, inputs, outputs) {
 		const inputBuffers = inputs.map((slot) => this.#getBuffer(slot).buffer);
@@ -316,32 +596,6 @@ var WebGPUBackend = class {
 		return buffer;
 	}
 };
-function dtypeToWgsl(dtype, storage = false) {
-	switch (dtype) {
-		case DType.Bool: return storage ? "i32" : "bool";
-		case DType.Int32: return "i32";
-		case DType.Uint32: return "u32";
-		case DType.Float32: return "f32";
-		case DType.Float16: return "f16";
-		default: throw new Error(`Unsupported dtype for WebGPU: ${dtype}`);
-	}
-}
-function constToWgsl(dtype, value) {
-	if (dtype === DType.Bool) return value ? "true" : "false";
-	if (dtype === DType.Int32) return value.toString();
-	if (dtype === DType.Uint32) return value.toString() + "u";
-	if (dtype === DType.Float32) {
-		if (Number.isNaN(value)) return "nan()";
-		if (!Number.isFinite(value)) return value > 0 ? "inf()" : "-inf()";
-		return "f32(" + value.toString() + ")";
-	}
-	if (dtype === DType.Float16) {
-		if (Number.isNaN(value)) return "f16(nan())";
-		if (!Number.isFinite(value)) return value > 0 ? "f16(inf())" : "f16(-inf())";
-		return "f16(" + value.toString() + ")";
-	}
-	throw new Error(`Unsupported const dtype: ${dtype}`);
-}
 /**
 * Compiles an expression into WebGPU shader source code.
 *
@@ -362,12 +616,12 @@ function pipelineSource(device, kernel) {
 		else if (line === popIndent) indent = indent.slice(0, -2);
 		else shader.push(line ? indent + line : line);
 	};
-	if (tune.exp.some((exp) => exp.dtype === DType.Float16) || re?.epilogue.some((exp) => exp.dtype === DType.Float16)) {
+	if (tune.exp.some((exp) => exp.dtype === DType.Float16) || tune.epilogue?.some((exp) => exp.dtype === DType.Float16)) {
 		if (!device.features.has("shader-f16")) throw new Error("WebGPU device does not support shader-f16 feature");
 		emit("enable f16;");
 	}
-	emit("fn nan() -> f32 { let bits = 0xffffffffu; return bitcast<f32>(bits); }", "fn inf() -> f32 { let bits = 0x7f800000u; return bitcast<f32>(bits); }");
-	const distinctOps = mapSetUnion(tune.exp.distinctOps(), re?.epilogue.distinctOps());
+	emit(headerWgsl);
+	const distinctOps = mapSetUnion(tune.exp.distinctOps(), tune.epilogue?.distinctOps());
 	if (distinctOps.has(AluOp.Threefry2x32)) emit(threefrySrc);
 	if (distinctOps.has(AluOp.Erf) || distinctOps.has(AluOp.Erfc)) emit(erfSrc);
 	emit("");
@@ -375,6 +629,9 @@ function pipelineSource(device, kernel) {
 	tune.exp.fold((exp) => {
 		if (exp.op === AluOp.GlobalIndex) usedArgs[exp.arg[0]] = exp.dtype;
 	});
+	tune.epilogue?.fold((exp) => {
+		if (exp.op === AluOp.GlobalIndex) usedArgs[exp.arg[0]] = exp.dtype;
+	});
 	for (let i = 0; i < nargs; i++) {
 		const ty = dtypeToWgsl(usedArgs[i] ?? DType.Float32, true);
 		emit(`@group(0) @binding(${i}) var<storage, read> ${args[i]} : array<${ty}>;`);
@@ -383,12 +640,7 @@ function pipelineSource(device, kernel) {
 	emit(`@group(0) @binding(${nargs}) var<storage, read_write> result : array<${resultTy}>;`);
 	const workgroupSize = findPow2(tune.threadCount, 256);
 	const gridSize = Math.ceil(tune.threadCount / workgroupSize);
-	let gridX = gridSize;
-	let gridY = 1;
-	if (gridSize > device.limits.maxComputeWorkgroupsPerDimension) {
-		gridX = 16384;
-		gridY = Math.ceil(gridSize / gridX);
-	}
+	const [gridX, gridY] = calculateGrid(gridSize);
 	emit("", `@compute @workgroup_size(${workgroupSize})`, "fn main(@builtin(global_invocation_id) id : vec3<u32>) {", pushIndent);
 	if (gridY === 1) emit(`if (id.x >= ${tune.threadCount}) { return; }`, "let gidx: i32 = i32(id.x);");
 	else {
@@ -398,7 +650,7 @@ function pipelineSource(device, kernel) {
 	let gensymCount = 0;
 	const gensym = () => `alu${gensymCount++}`;
 	const isGensym = (text) => text.match(/^alu[0-9]+$/);
-	for (let i = 0; i < args.length; i++) if (!usedArgs[i]) emit(`_ = &${args[i]};`);
+	if (args.length > 0) emit(args.map((arg) => `_ = &${arg};`).join(" "));
 	const references = /* @__PURE__ */ new Map();
 	const seen = /* @__PURE__ */ new Set();
 	const countReferences = (exp) => {
@@ -511,13 +763,15 @@ function pipelineSource(device, kernel) {
 			let rhs = items[i][0];
 			for (let j = 1; j < unroll; j++) if (re.op === AluOp.Add) rhs = `${rhs} + ${items[i][j]}`;
 			else if (re.op === AluOp.Mul) rhs = `${rhs} * ${items[i][j]}`;
-			else if (re.op === AluOp.Min) rhs = `min(${rhs}, ${items[i][j]})`;
-			else if (re.op === AluOp.Max) rhs = `max(${rhs}, ${items[i][j]})`;
+			else if (re.op === AluOp.Min) rhs = re.dtype === DType.Bool ? `(${rhs} && ${items[i][j]})` : `min(${rhs}, ${items[i][j]})`;
+			else if (re.op === AluOp.Max) rhs = re.dtype === DType.Bool ? `(${rhs} || ${items[i][j]})` : `max(${rhs}, ${items[i][j]})`;
 			else throw new Error(`Unsupported reduction op: ${re.op}`);
 			if (re.op === AluOp.Add) emit(`${acc[i]} += ${rhs};`);
 			else if (re.op === AluOp.Mul) emit(`${acc[i]} *= ${rhs};`);
-			else if (re.op === AluOp.Min) emit(`${acc[i]} = min(${acc[i]}, ${rhs});`);
-			else if (re.op === AluOp.Max) emit(`${acc[i]} = max(${acc[i]}, ${rhs});`);
+			else if (re.op === AluOp.Min) if (re.dtype === DType.Bool) emit(`${acc[i]} = ${acc[i]} && ${rhs};`);
+			else emit(`${acc[i]} = min(${acc[i]}, ${rhs});`);
+			else if (re.op === AluOp.Max) if (re.dtype === DType.Bool) emit(`${acc[i]} = ${acc[i]} || ${rhs};`);
+			else emit(`${acc[i]} = max(${acc[i]}, ${rhs});`);
 			else throw new Error(`Unsupported reduction op: ${re.op}`);
 		}
 		emit(popIndent, "}");
@@ -530,7 +784,10 @@ function pipelineSource(device, kernel) {
 			const exp = tune.outputIdxExp.substitute({ upcast: AluExp.i32(i) });
 			outputIdxExps.push(exp.simplify(cache));
 			countReferences(outputIdxExps[i]);
-			fusionExps.push(re.epilogue.substitute({ acc: AluExp.variable(re.dtype, acc[i]) }).simplify(cache));
+			fusionExps.push(tune.epilogue.substitute({
+				acc: AluExp.variable(re.dtype, acc[i]),
+				upcast: AluExp.i32(i)
+			}).simplify(cache));
 			countReferences(fusionExps[i]);
 		}
 		for (let i = 0; i < upcast; i++) {
@@ -542,36 +799,72 @@ function pipelineSource(device, kernel) {
 	}
 	emit(popIndent, "}");
 	return {
-		shader: shader.join("\n"),
-		grid: [gridX, gridY]
+		code: shader.join("\n"),
+		numInputs: nargs,
+		numOutputs: 1,
+		hasUniform: false,
+		passes: [{ grid: [gridX, gridY] }]
 	};
 }
-function pipelineSubmit(device, { pipeline, grid }, inputs, outputs) {
-	if (inputs.length + outputs.length > device.limits.maxStorageBuffersPerShaderStage) {
-		const actual = inputs.length + outputs.length;
-		const max = device.limits.maxStorageBuffersPerShaderStage;
-		throw new Error(`Too many buffers (${actual}) for WebGPU pipeline (max: ${max})`);
-	}
-	const bindGroup = device.createBindGroup({
-		layout: pipeline.getBindGroupLayout(0),
-		entries: [...inputs.map((buffer, i) => {
-			return {
+function pipelineSubmit(device, pipelines, inputs, outputs) {
+	const commandEncoder = device.createCommandEncoder();
+	for (const { pipeline,...shader } of pipelines) {
+		if (inputs.length !== shader.numInputs || outputs.length !== shader.numOutputs) throw new Error(`webgpu: expected ${shader.numInputs} inputs and ${shader.numOutputs} outputs, got ${inputs.length} inputs and ${outputs.length} outputs`);
+		const filteredPasses = shader.passes.filter(({ grid }) => prod(grid) > 0);
+		if (filteredPasses.length === 0) continue;
+		const bindGroup = device.createBindGroup({
+			layout: pipeline.getBindGroupLayout(0),
+			entries: [...inputs.map((buffer, i) => ({
 				binding: i,
 				resource: { buffer }
-			};
-		}), {
-			binding: inputs.length,
-			resource: { buffer: outputs[0] }
-		}]
-	});
-	const commandEncoder = device.createCommandEncoder();
-	const passEncoder = commandEncoder.beginComputePass();
-	passEncoder.setPipeline(pipeline);
-	passEncoder.setBindGroup(0, bindGroup);
-	passEncoder.dispatchWorkgroups(grid[0], grid[1]);
-	passEncoder.end();
+			})), ...outputs.map((buffer, i) => ({
+				binding: inputs.length + i,
+				resource: { buffer }
+			}))]
+		});
+		let uniformBindGroup = null;
+		let uniformAlignment = 0;
+		if (shader.hasUniform) {
+			const uniforms = filteredPasses.map(({ uniform }) => uniform);
+			const [uniformBuffer, alignment] = combineUniforms(device, uniforms);
+			uniformAlignment = alignment;
+			uniformBindGroup = device.createBindGroup({
+				layout: pipeline.getBindGroupLayout(1),
+				entries: [{
+					binding: 0,
+					resource: {
+						buffer: uniformBuffer,
+						size: alignment
+					}
+				}]
+			});
+		}
+		for (let i = 0; i < filteredPasses.length; i++) {
+			const { grid } = filteredPasses[i];
+			const passEncoder = commandEncoder.beginComputePass();
+			passEncoder.setPipeline(pipeline);
+			passEncoder.setBindGroup(0, bindGroup);
+			if (uniformBindGroup) passEncoder.setBindGroup(1, uniformBindGroup, [i * uniformAlignment]);
+			passEncoder.dispatchWorkgroups(grid[0], grid[1]);
+			passEncoder.end();
+		}
+	}
 	device.queue.submit([commandEncoder.finish()]);
 }
+function combineUniforms(device, uniforms) {
+	for (const buf of uniforms) if (!buf || buf.byteLength === 0 || buf.byteLength !== uniforms[0].byteLength) throw new Error("webgpu: Uniform mismatch between shader passes");
+	const minAlign = device.limits.minUniformBufferOffsetAlignment;
+	const alignment = Math.ceil(uniforms[0].byteLength / minAlign) * minAlign;
+	const buffer = device.createBuffer({
+		size: alignment * uniforms.length,
+		usage: GPUBufferUsage.UNIFORM,
+		mappedAtCreation: true
+	});
+	const bufferMapped = new Uint8Array(buffer.getMappedRange());
+	for (let i = 0; i < uniforms.length; i++) bufferMapped.set(uniforms[i], i * alignment);
+	buffer.unmap();
+	return [buffer, alignment];
+}
 /**
 * A cache for compiled GPU compute pipelines, keyed by the shader source.
 *
@@ -588,18 +881,39 @@ var ShaderPipelineCache = class {
 		this.cache = /* @__PURE__ */ new Map();
 		this.inProgress = /* @__PURE__ */ new Map();
 	}
-	async prepare(code) {
-		const existingPipeline = this.cache.get(code);
+	#getLayout(shader) {
+		if (shader.numInputs + shader.numOutputs > this.device.limits.maxStorageBuffersPerShaderStage) {
+			const actual = shader.numInputs + shader.numOutputs;
+			const max = this.device.limits.maxStorageBuffersPerShaderStage;
+			throw new Error(`Too many buffers (${actual}) for WebGPU pipeline (max: ${max})`);
+		}
+		const bindGroupLayouts = [this.device.createBindGroupLayout({ entries: range(shader.numInputs + shader.numOutputs).map((i) => ({
+			binding: i,
+			visibility: GPUShaderStage.COMPUTE,
+			buffer: { type: i < shader.numInputs ? "read-only-storage" : "storage" }
+		})) })];
+		if (shader.hasUniform) bindGroupLayouts.push(this.device.createBindGroupLayout({ entries: [{
+			binding: 0,
+			visibility: GPUShaderStage.COMPUTE,
+			buffer: {
+				type: "uniform",
+				hasDynamicOffset: true
+			}
+		}] }));
+		return this.device.createPipelineLayout({ bindGroupLayouts });
+	}
+	async prepare(shader) {
+		const existingPipeline = this.cache.get(shader.code);
 		if (existingPipeline) return existingPipeline;
-		const existingPromise = this.inProgress.get(code);
+		const existingPromise = this.inProgress.get(shader.code);
 		if (existingPromise) return await existingPromise;
-		if (DEBUG >= 2) console.info("=========== WebGPU shader ===========\n" + code);
-		const shaderModule = this.device.createShaderModule({ code });
+		if (DEBUG >= 2) console.info("=========== WebGPU shader ===========\n" + shader.code);
+		const shaderModule = this.device.createShaderModule({ code: shader.code });
 		const promise = (async () => {
 			this.device.pushErrorScope("validation");
 			try {
 				const pipeline$1 = await this.device.createComputePipelineAsync({
-					layout: "auto",
+					layout: this.#getLayout(shader),
 					compute: {
 						module: shaderModule,
 						entryPoint: "main"
@@ -609,23 +923,23 @@ var ShaderPipelineCache = class {
 				return pipeline$1;
 			} catch (_error) {
 				const scope = await this.device.popErrorScope();
-				const emsg = await compileError(shaderModule, scope, code);
+				const emsg = await compileError(shaderModule, scope, shader.code);
 				throw new Error(emsg);
 			}
 		})();
-		this.inProgress.set(code, promise);
+		this.inProgress.set(shader.code, promise);
 		const pipeline = await promise;
-		this.cache.set(code, pipeline);
+		this.cache.set(shader.code, pipeline);
 		return pipeline;
 	}
-	prepareSync(code) {
-		const existingPipeline = this.cache.get(code);
+	prepareSync(shader) {
+		const existingPipeline = this.cache.get(shader.code);
 		if (existingPipeline) return existingPipeline;
-		if (DEBUG >= 2) console.info("=========== WebGPU shader ===========\n" + code);
-		const shaderModule = this.device.createShaderModule({ code });
+		if (DEBUG >= 2) console.info("=========== WebGPU shader ===========\n" + shader.code);
+		const shaderModule = this.device.createShaderModule({ code: shader.code });
 		this.device.pushErrorScope("validation");
 		const pipeline = this.device.createComputePipeline({
-			layout: "auto",
+			layout: this.#getLayout(shader),
 			compute: {
 				module: shaderModule,
 				entryPoint: "main"
@@ -633,11 +947,11 @@ var ShaderPipelineCache = class {
 		});
 		this.device.popErrorScope().then(async (scope) => {
 			if (scope !== null) {
-				const emsg = await compileError(shaderModule, scope, code);
+				const emsg = await compileError(shaderModule, scope, shader.code);
 				console.error(emsg);
 			}
 		});
-		this.cache.set(code, pipeline);
+		this.cache.set(shader.code, pipeline);
 		return pipeline;
 	}
 };
@@ -651,5 +965,4 @@ async function compileError(shaderModule, scope, code) {
 }
 //#endregion
-export { WebGPUBackend };
-//# sourceMappingURL=webgpu-BGuG58KZ.js.map
+export { WebGPUBackend };