npm - @jax-js/jax - Versions diffs - 0.1.8 → 0.1.10 - Mend

@jax-js/jax 0.1.8 → 0.1.10

This diff shows the changes between two publicly available versions of the package, exactly as they were released to one of the supported public registries. It is provided for informational purposes only.

Files changed (12) hide show

package/README.md +46 -29
package/dist/{backend-nEolvdLv.js → backend-Ctqs8la1.js} +122 -15
package/dist/{backend-B3foXiV_.cjs → backend-DMauYnfl.cjs} +157 -14
package/dist/index.cjs +331 -46
package/dist/index.d.cts +175 -31
package/dist/index.d.ts +175 -31
package/dist/index.js +331 -47
package/dist/{webgl-DweKSWEm.js → webgl-CvQ1QBX1.js} +1 -1
package/dist/{webgl-DIIbKJ0G.cjs → webgl-kvVt7-T7.cjs} +1 -1
package/dist/{webgpu-BykvF26B.cjs → webgpu-DMSx7a6M.cjs} +160 -15
package/dist/{webgpu-B96vzWGE.js → webgpu-v_W_-oKw.js} +160 -15
package/package.json +5 -16

package/dist/{webgpu-BykvF26B.cjs → webgpu-DMSx7a6M.cjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-const require_backend = require('./backend-B3foXiV_.cjs');
+const require_backend = require('./backend-DMauYnfl.cjs');
 //#region src/backend/webgpu/builtins.ts
 const threefrySrc = `
@@ -152,6 +152,7 @@ var SyncReader = class SyncReader {
 		this.device = device;
 	}
 	#init() {
+		if (typeof OffscreenCanvas === "undefined") throw new Error("OffscreenCanvas is not available in this environment, so you cannot read data from WebGPU synchronously. Consider using the async API.");
 		const makeCanvas = () => new OffscreenCanvas(SyncReader.width, SyncReader.height);
 		this.deviceStorage = SyncReader.alphaModes.map(makeCanvas);
 		this.deviceContexts = this.deviceStorage.map((canvas, i) => {
@@ -247,6 +248,10 @@ function bitonicSortUniform(pass) {
 *   `2^(step+1)` with multiple workgroups. This doesn't use shared memory.
 *
 * The total number of passes is roughly `log2(n / workgroupSize)^2 / 2`.
+*
+* If `outputIndices` is true, the shader also tracks the original indices of
+* the sorted elements (argsort) and outputs them to a separate buffer. This
+* also makes the sorting algorithm stable.
 */
 function bitonicSortShader(device, dtype, n, batches, outputIndices) {
 	const ty = dtypeToWgsl(dtype, true);
@@ -286,14 +291,21 @@ ${require_backend.isFloatDtype(dtype) ? `
 fn compare_and_swap(i: u32, j: u32) {
   let val_i = shared_vals[i];
   let val_j = shared_vals[j];
-  if (compare(val_j, val_i)) {
+${outputIndices ? `
+  if (
+    compare(val_j, val_i) ||
+    (!compare(val_i, val_j) && shared_idx[j] < shared_idx[i])
+  ) {
     shared_vals[i] = val_j;
     shared_vals[j] = val_i;
-${outputIndices ? `
     let tmp_idx = shared_idx[i];
     shared_idx[i] = shared_idx[j];
-    shared_idx[j] = tmp_idx;` : ""}
-  }
+    shared_idx[j] = tmp_idx;
+  }` : `
+  if (compare(val_j, val_i)) {
+    shared_vals[i] = val_j;
+    shared_vals[j] = val_i;
+  }`}
 }
 @compute @workgroup_size(${workgroupSize})
@@ -370,13 +382,17 @@ ${outputIndices ? `
     if (j < ${n}u) {
       let val_i = output[base + i];
       let val_j = output[base + j];
-      if (compare(val_j, val_i)) {
+${outputIndices ? `
+      let idx_i = output_idx[base + i];
+      let idx_j = output_idx[base + j];
+      if (compare(val_j, val_i) || (!compare(val_i, val_j) && idx_j < idx_i)) {
         output[base + i] = val_j;
         output[base + j] = val_i;
-${outputIndices ? `
-        let tmp_idx = output_idx[base + i];
-        output_idx[base + i] = output_idx[base + j];
-        output_idx[base + j] = tmp_idx;` : ""}
+        output_idx[base + i] = idx_j;
+        output_idx[base + j] = idx_i;` : `
+      if (compare(val_j, val_i)) {
+        output[base + i] = val_j;
+        output[base + j] = val_i;`}
       }
     }
   }
@@ -713,6 +729,120 @@ function createRoutineShader(device, routine) {
 	}
 }
+//#endregion
+//#region src/backend/webgpu/tracing.ts
+const MAX_TIMESTAMP_QUERIES = 4096;
+const activeBatch = /* @__PURE__ */ new WeakMap();
+function createTracingBatch(device) {
+	return {
+		querySet: device.createQuerySet({
+			type: "timestamp",
+			count: MAX_TIMESTAMP_QUERIES
+		}),
+		resolve: device.createBuffer({
+			size: MAX_TIMESTAMP_QUERIES * 8,
+			usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC
+		}),
+		dst: device.createBuffer({
+			size: MAX_TIMESTAMP_QUERIES * 8,
+			usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST
+		}),
+		nextIndex: 0,
+		entries: []
+	};
+}
+function acquireTracingSlot(device) {
+	if (!device.features.has("timestamp-query")) return void 0;
+	let batch = activeBatch.get(device);
+	if (batch && batch.nextIndex >= MAX_TIMESTAMP_QUERIES) {
+		flushTracingBatch(device, batch);
+		batch = void 0;
+	}
+	if (!batch) {
+		batch = createTracingBatch(device);
+		activeBatch.set(device, batch);
+		require_backend.onFlushTrace(() => {
+			const b = activeBatch.get(device);
+			if (b && b.entries.length > 0) flushTracingBatch(device, b);
+			activeBatch.delete(device);
+		});
+	}
+	const beginIndex = batch.nextIndex;
+	const endIndex = beginIndex + 1;
+	batch.nextIndex += 2;
+	return {
+		batch,
+		beginIndex,
+		endIndex
+	};
+}
+/**
+* If tracing is active, acquire a slot for timestamp queries.
+*
+* Returns undefined if tracing is not active or the device doesn't support
+* timestamp queries.
+*/
+function maybeAcquireTracingSlot(device) {
+	if (!require_backend.isTracing()) return void 0;
+	return acquireTracingSlot(device);
+}
+/**
+* Record a tracing entry for a pipeline dispatch and schedule an auto-flush.
+*/
+function recordTrace(device, slot, source, numPasses, wgslSource) {
+	const info = require_backend.traceSourceInfo(source);
+	info.properties.push(["passes", `${numPasses}`]);
+	info.properties.push(["source", wgslSource]);
+	slot.batch.entries.push({
+		...info,
+		beginIndex: slot.beginIndex,
+		endIndex: slot.endIndex
+	});
+	scheduleAutoFlush(device);
+}
+/**
+* If the active batch has pending entries, flush and replace it so traces
+* are emitted without waiting for the batch to fill or stopTrace().
+*
+* Called after each dispatch records its entry via a microtask so that
+* synchronous back-to-back dispatches are still batched together.
+*/
+function scheduleAutoFlush(device) {
+	queueMicrotask(() => {
+		const batch = activeBatch.get(device);
+		if (batch && batch.entries.length > 0) {
+			flushTracingBatch(device, batch);
+			activeBatch.set(device, createTracingBatch(device));
+		}
+	});
+}
+function flushTracingBatch(device, batch) {
+	if (batch.entries.length === 0) return;
+	const usedQueries = batch.nextIndex;
+	const encoder = device.createCommandEncoder();
+	encoder.resolveQuerySet(batch.querySet, 0, usedQueries, batch.resolve, 0);
+	encoder.copyBufferToBuffer(batch.resolve, 0, batch.dst, 0, usedQueries * 8);
+	device.queue.submit([encoder.finish()]);
+	const { entries } = batch;
+	batch.dst.mapAsync(GPUMapMode.READ).then(() => {
+		try {
+			const times = new BigInt64Array(batch.dst.getMappedRange());
+			const anchorGpuNs = times[entries[entries.length - 1].endIndex];
+			const anchorCpuMs = performance.now();
+			for (const entry of entries) {
+				const startMs = anchorCpuMs + Number(times[entry.beginIndex] - anchorGpuNs) / 1e6;
+				const endMs = anchorCpuMs + Number(times[entry.endIndex] - anchorGpuNs) / 1e6;
+				require_backend.emitTrace("webgpu", entry, startMs, endMs);
+			}
+		} finally {
+			batch.dst.unmap();
+			batch.querySet.destroy();
+			batch.resolve.destroy();
+			batch.dst.destroy();
+		}
+	});
+}
 //#endregion
 //#region src/backend/webgpu.ts
 /** Implementation of `Backend` that uses WebGPU in browsers. */
@@ -857,7 +987,7 @@ var WebGPUBackend = class {
 	dispatch(exe, inputs, outputs) {
 		const inputBuffers = inputs.map((slot) => this.#getBuffer(slot).buffer);
 		const outputBuffers = outputs.map((slot) => this.#getBuffer(slot).buffer);
-		pipelineSubmit(this.device, exe.data, inputBuffers, outputBuffers);
+		pipelineSubmit(this.device, exe, inputBuffers, outputBuffers);
 	}
 	#getBuffer(slot) {
 		const buffer = this.buffers.get(slot);
@@ -995,8 +1125,16 @@ function pipelineSource(device, kernel) {
 			else if (op === require_backend.AluOp.Reciprocal) source = `(1.0 / ${a})`;
 			else if (op === require_backend.AluOp.Floor) source = `floor(${require_backend.strip1(a)})`;
 			else if (op === require_backend.AluOp.Ceil) source = `ceil(${require_backend.strip1(a)})`;
-			else if (op === require_backend.AluOp.Cast) source = `${dtypeToWgsl(dtype)}(${require_backend.strip1(a)})`;
-			else if (op === require_backend.AluOp.Bitcast) source = `bitcast<${dtypeToWgsl(dtype)}>(${require_backend.strip1(a)})`;
+			else if (op === require_backend.AluOp.Cast) {
+				const srcTy = dtypeToWgsl(src[0].dtype);
+				const dstTy = dtypeToWgsl(dtype);
+				if (require_backend.isFloatDtype(src[0].dtype) && !(require_backend.isFloatDtype(dtype) || dtype === require_backend.DType.Bool)) {
+					const maxVal = maxValueWgsl(dtype);
+					const x = isGensym(a) ? a : gensym();
+					if (x !== a) emit(`let ${x}: ${srcTy} = ${require_backend.strip1(a)};`);
+					source = `select(${dstTy}(${x}), ${maxVal}, ${x} >= ${srcTy}(${maxVal}))`;
+				} else source = `${dstTy}(${require_backend.strip1(a)})`;
+			} else if (op === require_backend.AluOp.Bitcast) source = `bitcast<${dtypeToWgsl(dtype)}>(${require_backend.strip1(a)})`;
 		}
 		else if (op === require_backend.AluOp.Where) source = `select(${require_backend.strip1(gen(src[2]))}, ${require_backend.strip1(gen(src[1]))}, ${require_backend.strip1(gen(src[0]))})`;
 		else if (op === require_backend.AluOp.Threefry2x32) {
@@ -1099,12 +1237,14 @@ function pipelineSource(device, kernel) {
 		passes: [{ grid: [gridX, gridY] }]
 	};
 }
-function pipelineSubmit(device, pipelines, inputs, outputs) {
+function pipelineSubmit(device, exe, inputs, outputs) {
+	const { data: pipelines, source } = exe;
 	const commandEncoder = device.createCommandEncoder();
 	for (const { pipeline,...shader } of pipelines) {
 		if (inputs.length !== shader.numInputs || outputs.length !== shader.numOutputs) throw new Error(`webgpu: expected ${shader.numInputs} inputs and ${shader.numOutputs} outputs, got ${inputs.length} inputs and ${outputs.length} outputs`);
 		const filteredPasses = shader.passes.filter(({ grid }) => require_backend.prod(grid) > 0);
 		if (filteredPasses.length === 0) continue;
+		const slot = maybeAcquireTracingSlot(device);
 		const bindGroup = device.createBindGroup({
 			layout: pipeline.getBindGroupLayout(0),
 			entries: [...inputs.map((buffer, i) => ({
@@ -1134,13 +1274,18 @@ function pipelineSubmit(device, pipelines, inputs, outputs) {
 		}
 		for (let i = 0; i < filteredPasses.length; i++) {
 			const { grid } = filteredPasses[i];
-			const passEncoder = commandEncoder.beginComputePass();
+			const passEncoder = commandEncoder.beginComputePass({ timestampWrites: slot ? {
+				querySet: slot.batch.querySet,
+				beginningOfPassWriteIndex: i === 0 ? slot.beginIndex : void 0,
+				endOfPassWriteIndex: i === filteredPasses.length - 1 ? slot.endIndex : void 0
+			} : void 0 });
 			passEncoder.setPipeline(pipeline);
 			passEncoder.setBindGroup(0, bindGroup);
 			if (uniformBindGroup) passEncoder.setBindGroup(1, uniformBindGroup, [i * uniformAlignment]);
 			passEncoder.dispatchWorkgroups(grid[0], grid[1]);
 			passEncoder.end();
 		}
+		if (slot) recordTrace(device, slot, source, filteredPasses.length, shader.code);
 	}
 	device.queue.submit([commandEncoder.finish()]);
 }

package/dist/{webgpu-B96vzWGE.js → webgpu-v_W_-oKw.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { AluExp, AluGroup, AluOp, DEBUG, DType, Executable, FpHash, Routines, SlotError, UnsupportedOpError, UnsupportedRoutineError, findPow2, isFloatDtype, mapSetUnion, prod, range, strip1, tuneWebgpu } from "./backend-nEolvdLv.js";
+import { AluExp, AluGroup, AluOp, DEBUG, DType, Executable, FpHash, Routines, SlotError, UnsupportedOpError, UnsupportedRoutineError, emitTrace, findPow2, isFloatDtype, isTracing, mapSetUnion, onFlushTrace, prod, range, strip1, traceSourceInfo, tuneWebgpu } from "./backend-Ctqs8la1.js";
 //#region src/backend/webgpu/builtins.ts
 const threefrySrc = `
@@ -152,6 +152,7 @@ var SyncReader = class SyncReader {
 		this.device = device;
 	}
 	#init() {
+		if (typeof OffscreenCanvas === "undefined") throw new Error("OffscreenCanvas is not available in this environment, so you cannot read data from WebGPU synchronously. Consider using the async API.");
 		const makeCanvas = () => new OffscreenCanvas(SyncReader.width, SyncReader.height);
 		this.deviceStorage = SyncReader.alphaModes.map(makeCanvas);
 		this.deviceContexts = this.deviceStorage.map((canvas, i) => {
@@ -247,6 +248,10 @@ function bitonicSortUniform(pass) {
 *   `2^(step+1)` with multiple workgroups. This doesn't use shared memory.
 *
 * The total number of passes is roughly `log2(n / workgroupSize)^2 / 2`.
+*
+* If `outputIndices` is true, the shader also tracks the original indices of
+* the sorted elements (argsort) and outputs them to a separate buffer. This
+* also makes the sorting algorithm stable.
 */
 function bitonicSortShader(device, dtype, n, batches, outputIndices) {
 	const ty = dtypeToWgsl(dtype, true);
@@ -286,14 +291,21 @@ ${isFloatDtype(dtype) ? `
 fn compare_and_swap(i: u32, j: u32) {
   let val_i = shared_vals[i];
   let val_j = shared_vals[j];
-  if (compare(val_j, val_i)) {
+${outputIndices ? `
+  if (
+    compare(val_j, val_i) ||
+    (!compare(val_i, val_j) && shared_idx[j] < shared_idx[i])
+  ) {
     shared_vals[i] = val_j;
     shared_vals[j] = val_i;
-${outputIndices ? `
     let tmp_idx = shared_idx[i];
     shared_idx[i] = shared_idx[j];
-    shared_idx[j] = tmp_idx;` : ""}
-  }
+    shared_idx[j] = tmp_idx;
+  }` : `
+  if (compare(val_j, val_i)) {
+    shared_vals[i] = val_j;
+    shared_vals[j] = val_i;
+  }`}
 }
 @compute @workgroup_size(${workgroupSize})
@@ -370,13 +382,17 @@ ${outputIndices ? `
     if (j < ${n}u) {
       let val_i = output[base + i];
       let val_j = output[base + j];
-      if (compare(val_j, val_i)) {
+${outputIndices ? `
+      let idx_i = output_idx[base + i];
+      let idx_j = output_idx[base + j];
+      if (compare(val_j, val_i) || (!compare(val_i, val_j) && idx_j < idx_i)) {
         output[base + i] = val_j;
         output[base + j] = val_i;
-${outputIndices ? `
-        let tmp_idx = output_idx[base + i];
-        output_idx[base + i] = output_idx[base + j];
-        output_idx[base + j] = tmp_idx;` : ""}
+        output_idx[base + i] = idx_j;
+        output_idx[base + j] = idx_i;` : `
+      if (compare(val_j, val_i)) {
+        output[base + i] = val_j;
+        output[base + j] = val_i;`}
       }
     }
   }
@@ -713,6 +729,120 @@ function createRoutineShader(device, routine) {
 	}
 }
+//#endregion
+//#region src/backend/webgpu/tracing.ts
+const MAX_TIMESTAMP_QUERIES = 4096;
+const activeBatch = /* @__PURE__ */ new WeakMap();
+function createTracingBatch(device) {
+	return {
+		querySet: device.createQuerySet({
+			type: "timestamp",
+			count: MAX_TIMESTAMP_QUERIES
+		}),
+		resolve: device.createBuffer({
+			size: MAX_TIMESTAMP_QUERIES * 8,
+			usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC
+		}),
+		dst: device.createBuffer({
+			size: MAX_TIMESTAMP_QUERIES * 8,
+			usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST
+		}),
+		nextIndex: 0,
+		entries: []
+	};
+}
+function acquireTracingSlot(device) {
+	if (!device.features.has("timestamp-query")) return void 0;
+	let batch = activeBatch.get(device);
+	if (batch && batch.nextIndex >= MAX_TIMESTAMP_QUERIES) {
+		flushTracingBatch(device, batch);
+		batch = void 0;
+	}
+	if (!batch) {
+		batch = createTracingBatch(device);
+		activeBatch.set(device, batch);
+		onFlushTrace(() => {
+			const b = activeBatch.get(device);
+			if (b && b.entries.length > 0) flushTracingBatch(device, b);
+			activeBatch.delete(device);
+		});
+	}
+	const beginIndex = batch.nextIndex;
+	const endIndex = beginIndex + 1;
+	batch.nextIndex += 2;
+	return {
+		batch,
+		beginIndex,
+		endIndex
+	};
+}
+/**
+* If tracing is active, acquire a slot for timestamp queries.
+*
+* Returns undefined if tracing is not active or the device doesn't support
+* timestamp queries.
+*/
+function maybeAcquireTracingSlot(device) {
+	if (!isTracing()) return void 0;
+	return acquireTracingSlot(device);
+}
+/**
+* Record a tracing entry for a pipeline dispatch and schedule an auto-flush.
+*/
+function recordTrace(device, slot, source, numPasses, wgslSource) {
+	const info = traceSourceInfo(source);
+	info.properties.push(["passes", `${numPasses}`]);
+	info.properties.push(["source", wgslSource]);
+	slot.batch.entries.push({
+		...info,
+		beginIndex: slot.beginIndex,
+		endIndex: slot.endIndex
+	});
+	scheduleAutoFlush(device);
+}
+/**
+* If the active batch has pending entries, flush and replace it so traces
+* are emitted without waiting for the batch to fill or stopTrace().
+*
+* Called after each dispatch records its entry via a microtask so that
+* synchronous back-to-back dispatches are still batched together.
+*/
+function scheduleAutoFlush(device) {
+	queueMicrotask(() => {
+		const batch = activeBatch.get(device);
+		if (batch && batch.entries.length > 0) {
+			flushTracingBatch(device, batch);
+			activeBatch.set(device, createTracingBatch(device));
+		}
+	});
+}
+function flushTracingBatch(device, batch) {
+	if (batch.entries.length === 0) return;
+	const usedQueries = batch.nextIndex;
+	const encoder = device.createCommandEncoder();
+	encoder.resolveQuerySet(batch.querySet, 0, usedQueries, batch.resolve, 0);
+	encoder.copyBufferToBuffer(batch.resolve, 0, batch.dst, 0, usedQueries * 8);
+	device.queue.submit([encoder.finish()]);
+	const { entries } = batch;
+	batch.dst.mapAsync(GPUMapMode.READ).then(() => {
+		try {
+			const times = new BigInt64Array(batch.dst.getMappedRange());
+			const anchorGpuNs = times[entries[entries.length - 1].endIndex];
+			const anchorCpuMs = performance.now();
+			for (const entry of entries) {
+				const startMs = anchorCpuMs + Number(times[entry.beginIndex] - anchorGpuNs) / 1e6;
+				const endMs = anchorCpuMs + Number(times[entry.endIndex] - anchorGpuNs) / 1e6;
+				emitTrace("webgpu", entry, startMs, endMs);
+			}
+		} finally {
+			batch.dst.unmap();
+			batch.querySet.destroy();
+			batch.resolve.destroy();
+			batch.dst.destroy();
+		}
+	});
+}
 //#endregion
 //#region src/backend/webgpu.ts
 /** Implementation of `Backend` that uses WebGPU in browsers. */
@@ -857,7 +987,7 @@ var WebGPUBackend = class {
 	dispatch(exe, inputs, outputs) {
 		const inputBuffers = inputs.map((slot) => this.#getBuffer(slot).buffer);
 		const outputBuffers = outputs.map((slot) => this.#getBuffer(slot).buffer);
-		pipelineSubmit(this.device, exe.data, inputBuffers, outputBuffers);
+		pipelineSubmit(this.device, exe, inputBuffers, outputBuffers);
 	}
 	#getBuffer(slot) {
 		const buffer = this.buffers.get(slot);
@@ -995,8 +1125,16 @@ function pipelineSource(device, kernel) {
 			else if (op === AluOp.Reciprocal) source = `(1.0 / ${a})`;
 			else if (op === AluOp.Floor) source = `floor(${strip1(a)})`;
 			else if (op === AluOp.Ceil) source = `ceil(${strip1(a)})`;
-			else if (op === AluOp.Cast) source = `${dtypeToWgsl(dtype)}(${strip1(a)})`;
-			else if (op === AluOp.Bitcast) source = `bitcast<${dtypeToWgsl(dtype)}>(${strip1(a)})`;
+			else if (op === AluOp.Cast) {
+				const srcTy = dtypeToWgsl(src[0].dtype);
+				const dstTy = dtypeToWgsl(dtype);
+				if (isFloatDtype(src[0].dtype) && !(isFloatDtype(dtype) || dtype === DType.Bool)) {
+					const maxVal = maxValueWgsl(dtype);
+					const x = isGensym(a) ? a : gensym();
+					if (x !== a) emit(`let ${x}: ${srcTy} = ${strip1(a)};`);
+					source = `select(${dstTy}(${x}), ${maxVal}, ${x} >= ${srcTy}(${maxVal}))`;
+				} else source = `${dstTy}(${strip1(a)})`;
+			} else if (op === AluOp.Bitcast) source = `bitcast<${dtypeToWgsl(dtype)}>(${strip1(a)})`;
 		}
 		else if (op === AluOp.Where) source = `select(${strip1(gen(src[2]))}, ${strip1(gen(src[1]))}, ${strip1(gen(src[0]))})`;
 		else if (op === AluOp.Threefry2x32) {
@@ -1099,12 +1237,14 @@ function pipelineSource(device, kernel) {
 		passes: [{ grid: [gridX, gridY] }]
 	};
 }
-function pipelineSubmit(device, pipelines, inputs, outputs) {
+function pipelineSubmit(device, exe, inputs, outputs) {
+	const { data: pipelines, source } = exe;
 	const commandEncoder = device.createCommandEncoder();
 	for (const { pipeline,...shader } of pipelines) {
 		if (inputs.length !== shader.numInputs || outputs.length !== shader.numOutputs) throw new Error(`webgpu: expected ${shader.numInputs} inputs and ${shader.numOutputs} outputs, got ${inputs.length} inputs and ${outputs.length} outputs`);
 		const filteredPasses = shader.passes.filter(({ grid }) => prod(grid) > 0);
 		if (filteredPasses.length === 0) continue;
+		const slot = maybeAcquireTracingSlot(device);
 		const bindGroup = device.createBindGroup({
 			layout: pipeline.getBindGroupLayout(0),
 			entries: [...inputs.map((buffer, i) => ({
@@ -1134,13 +1274,18 @@ function pipelineSubmit(device, pipelines, inputs, outputs) {
 		}
 		for (let i = 0; i < filteredPasses.length; i++) {
 			const { grid } = filteredPasses[i];
-			const passEncoder = commandEncoder.beginComputePass();
+			const passEncoder = commandEncoder.beginComputePass({ timestampWrites: slot ? {
+				querySet: slot.batch.querySet,
+				beginningOfPassWriteIndex: i === 0 ? slot.beginIndex : void 0,
+				endOfPassWriteIndex: i === filteredPasses.length - 1 ? slot.endIndex : void 0
+			} : void 0 });
 			passEncoder.setPipeline(pipeline);
 			passEncoder.setBindGroup(0, bindGroup);
 			if (uniformBindGroup) passEncoder.setBindGroup(1, uniformBindGroup, [i * uniformAlignment]);
 			passEncoder.dispatchWorkgroups(grid[0], grid[1]);
 			passEncoder.end();
 		}
+		if (slot) recordTrace(device, slot, source, filteredPasses.length, shader.code);
 	}
 	device.queue.submit([commandEncoder.finish()]);
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@jax-js/jax",
-  "version": "0.1.8",
+  "version": "0.1.10",
   "description": "Numerical computing and ML in the browser",
   "keywords": [
     "machine learning",
@@ -38,15 +38,13 @@
   "devDependencies": {
     "@eslint/js": "^9.31.0",
     "@types/debug": "^4.1.12",
-    "@vitest/browser-playwright": "^4.0.9",
-    "@vitest/coverage-v8": "4.0.9",
+    "@vitest/browser-playwright": "^4.1.0",
+    "@vitest/coverage-v8": "^4.1.0",
     "@webgpu/types": "^0.1.68",
     "eslint": "^9.31.0",
     "eslint-plugin-import": "^2.32.0",
     "globals": "^16.0.0",
-    "husky": "^9.1.7",
-    "lint-staged": "^16.2.7",
-    "playwright": "~1.52.0",
+    "playwright": "~1.58.2",
     "prettier": "^3.6.2",
     "prettier-plugin-svelte": "^3.4.0",
     "tsdown": "^0.13.2",
@@ -55,7 +53,7 @@
     "typedoc-theme-fresh": "^0.2.3",
     "typescript": "~5.9.3",
     "typescript-eslint": "^8.46.4",
-    "vitest": "^4.0.9"
+    "vitest": "^4.1.0"
   },
   "engines": {
     "pnpm": ">=10.0.0"
@@ -76,15 +74,6 @@
     ],
     "proseWrap": "always"
   },
-  "lint-staged": {
-    "*.{ts,tsx,js,jsx}": [
-      "eslint --fix",
-      "prettier --write"
-    ],
-    "*.{json,md,yml,yaml,css,svelte,html}": [
-      "prettier --write"
-    ]
-  },
   "scripts": {
     "build": "tsdown",
     "build:watch": "TSDOWN_WATCH_MODE=1 tsdown",