npm - @jax-js/jax - Versions diffs - 0.1.4 → 0.1.5 - Mend

@jax-js/jax 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/README.md +10 -7
package/dist/{backend-tngXtWe4.js → backend-DaqL-MNz.js} +96 -7
package/dist/{backend-Bu9GY6sK.cjs → backend-DziQSaoQ.cjs} +101 -6
package/dist/index.cjs +737 -141
package/dist/index.d.cts +238 -9
package/dist/index.d.ts +238 -9
package/dist/index.js +737 -141
package/dist/webgl-ClIYb8jP.cjs +522 -0
package/dist/webgl-RSuZKvgc.js +522 -0
package/dist/{webgpu-Oj3Kd-kd.cjs → webgpu-Db2JrNBr.cjs} +296 -3
package/dist/{webgpu-ChVgx3b6.js → webgpu-Dh7k9io0.js} +296 -3
package/package.json +1 -1

package/dist/{webgpu-Oj3Kd-kd.cjs → webgpu-Db2JrNBr.cjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-const require_backend = require('./backend-Bu9GY6sK.cjs');
+const require_backend = require('./backend-DziQSaoQ.cjs');
 //#region src/backend/webgpu/builtins.ts
 const threefrySrc = `
@@ -414,10 +414,301 @@ function createArgsort(device, type) {
 	const batches = require_backend.prod(shape.slice(0, -1));
 	return bitonicSortShader(device, dtype, n, batches, true);
 }
+/**
+* Generate a triangular solve shader.
+*
+* Solves A @ X.T = B.T for X, where A is upper-triangular.
+* Uses a parallelized back-substitution:
+*   1. Copy b to x
+*   2. For j = n-1 down to 0:
+*      - Divide x[j] by a[j,j] (single thread; skipped when params.unitDiagonal is set)
+*      - All threads subtract x[j] * a[i,j] from x[i] for i < j in parallel
+*/
+function createTriangularSolve(device, type, params) {
+	const dtype = type.inputDtypes[0];
+	const aShape = type.inputShapes[0];
+	const bShape = type.inputShapes[1];
+	const n = aShape[aShape.length - 1];
+	const numRhs = bShape[bShape.length - 2];
+	const numMatrices = require_backend.prod(aShape.slice(0, -2));
+	const needsF16 = dtype === require_backend.DType.Float16;
+	const ty = dtypeToWgsl(dtype, true);
+	const workgroupSize = require_backend.findPow2(n, device.limits.maxComputeWorkgroupSizeX);
+	const code = `
+${needsF16 ? "enable f16;" : ""}
+${headerWgsl}
+@group(0) @binding(0) var<storage, read> a: array<${ty}>;
+@group(0) @binding(1) var<storage, read> b: array<${ty}>;
+@group(0) @binding(2) var<storage, read_write> x: array<${ty}>;
+// Shared memory for the current pivot value x[j]
+var<workgroup> x_j: ${ty};
+@compute @workgroup_size(${workgroupSize})
+fn main(
+  @builtin(workgroup_id) wg_id: vec3<u32>,
+  @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+  let wg_idx = wg_id.x + wg_id.y * ${gridOffsetY}u;
+  let mat_idx = wg_idx / ${numRhs}u;
+  let rhs_idx = wg_idx % ${numRhs}u;
+  if (mat_idx >= ${numMatrices}u) {
+    return;
+  }
+  let a_base = mat_idx * ${n * n}u;
+  let bx_base = (mat_idx * ${numRhs}u + rhs_idx) * ${n}u;
+  let tid = local_id.x;
+  // Step 1: Copy b to x (threads collaborate)
+  for (var idx = tid; idx < ${n}u; idx += ${workgroupSize}u) {
+    x[bx_base + idx] = b[bx_base + idx];
+  }
+  storageBarrier();
+  // Step 2: Back-substitution from j = n-1 down to 0
+  for (var jj = 0u; jj < ${n}u; jj++) {
+    let j = ${n - 1}u - jj;
+    // Thread 0 computes x[j] = x[j] / a[j,j]
+    if (tid == 0u) {
+      ${params.unitDiagonal ? `x_j = x[bx_base + j];` : `x_j = x[bx_base + j] / a[a_base + j * ${n}u + j];`}
+      x[bx_base + j] = x_j;
+    }
+    workgroupBarrier();  // Sync shared memory x_j
+    // All threads subtract x[j] * a[i,j] from x[i] for i < j
+    for (var i = tid; i < j; i += ${workgroupSize}u) {
+      x[bx_base + i] -= x_j * a[a_base + i * ${n}u + j];
+    }
+    workgroupBarrier();
+    storageBarrier();
+  }
+}
+`.trim();
+	const totalWorkgroups = numMatrices * numRhs;
+	const grid = calculateGrid(totalWorkgroups);
+	return [{
+		code,
+		numInputs: 2,
+		numOutputs: 1,
+		hasUniform: false,
+		passes: [{ grid }]
+	}];
+}
+/**
+* Generate a Cholesky decomposition shader.
+*
+* Computes the lower triangular matrix L such that A = L * L^T for each
+* symmetric positive-definite matrix in the batch. Uses the Cholesky-Crout
+* algorithm, which processes the matrix column by column.
+*
+* For each column j:
+*   1. All threads compute their row's sum in parallel and store to output
+*   2. Thread 0 computes L[j][j] = sqrt(output[j][j]) and stores to shared memory
+*   3. All threads divide their output[i][j] by L[j][j] in parallel
+*/
+function createCholesky(device, type) {
+	const dtype = type.inputDtypes[0];
+	const shape = type.inputShapes[0];
+	const n = shape[shape.length - 1];
+	const batches = require_backend.prod(shape.slice(0, -2));
+	const needsF16 = dtype === require_backend.DType.Float16;
+	const ty = dtypeToWgsl(dtype, true);
+	const workgroupSize = require_backend.findPow2(n, device.limits.maxComputeWorkgroupSizeX);
+	const code = `
+${needsF16 ? "enable f16;" : ""}
+${headerWgsl}
+@group(0) @binding(0) var<storage, read> input: array<${ty}>;
+@group(0) @binding(1) var<storage, read_write> output: array<${ty}>;
+// Shared memory for the diagonal element
+var<workgroup> L_jj: ${ty};
+@compute @workgroup_size(${workgroupSize})
+fn main(
+  @builtin(workgroup_id) wg_id: vec3<u32>,
+  @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+  let batch = wg_id.x + wg_id.y * ${gridOffsetY}u;
+  if (batch >= ${batches}u) {
+    return;
+  }
+  let base = batch * ${n * n}u;
+  let tid = local_id.x;
+  // Zero out output and copy lower triangle from input (threads collaborate)
+  for (var idx = tid; idx < ${n * n}u; idx += ${workgroupSize}u) {
+    let row = idx / ${n}u;
+    let col = idx % ${n}u;
+    output[base + idx] = select(0, input[base + idx], col <= row);
+  }
+  storageBarrier();
+  // Cholesky-Crout algorithm: process column by column
+  for (var j = 0u; j < ${n}u; j++) {
+    // Step 1: All threads compute sum for their rows i >= j in parallel
+    // sum = A[i][j] - sum(L[i][k] * L[j][k] for k < j)
+    for (var i = j + tid; i < ${n}u; i += ${workgroupSize}u) {
+      var sum = output[base + i * ${n}u + j];
+      for (var k = 0u; k < j; k++) {
+        sum -= output[base + i * ${n}u + k] * output[base + j * ${n}u + k];
+      }
+      output[base + i * ${n}u + j] = sum;
+    }
+    storageBarrier();
+    // Step 2: Thread 0 computes L[j][j] = sqrt(output[j][j])
+    if (tid == 0u) {
+      L_jj = sqrt(output[base + j * ${n}u + j]);
+      output[base + j * ${n}u + j] = L_jj;
+    }
+    workgroupBarrier();
+    // Step 3: All threads divide output[i][j] by L[j][j] for i > j
+    for (var i = j + 1u + tid; i < ${n}u; i += ${workgroupSize}u) {
+      output[base + i * ${n}u + j] /= L_jj;
+    }
+    storageBarrier();
+  }
+}
+`.trim();
+	const grid = calculateGrid(batches);
+	return [{
+		code,
+		numInputs: 1,
+		numOutputs: 1,
+		hasUniform: false,
+		passes: [{ grid }]
+	}];
+}
+/**
+* Generate an LU decomposition shader with partial pivoting.
+*
+* Computes PA = LU where P is a permutation matrix, L is lower triangular
+* with unit diagonal, and U is upper triangular.
+*
+* For each column j:
+*   1. Find pivot row (max absolute value in column j, rows >= j)
+*   2. Swap rows j and pivot row
+*   3. Compute L[i][j] = A[i][j] / A[j][j] for i > j
+*   4. Update submatrix: A[i][k] -= L[i][j] * A[j][k] for i > j, k > j
+*/
+function createLU(device, type) {
+	const dtype = type.inputDtypes[0];
+	const shape = type.inputShapes[0];
+	const m = shape[shape.length - 2];
+	const n = shape[shape.length - 1];
+	const r = Math.min(m, n);
+	const batches = require_backend.prod(shape.slice(0, -2));
+	const needsF16 = dtype === require_backend.DType.Float16;
+	const ty = dtypeToWgsl(dtype, true);
+	const workgroupSize = require_backend.findPow2(Math.max(m, n), device.limits.maxComputeWorkgroupSizeX);
+	const code = `
+${needsF16 ? "enable f16;" : ""}
+${headerWgsl}
+@group(0) @binding(0) var<storage, read> input: array<${ty}>;
+@group(0) @binding(1) var<storage, read_write> lu: array<${ty}>;
+@group(0) @binding(2) var<storage, read_write> pivots: array<i32>;
+@group(0) @binding(3) var<storage, read_write> perm: array<i32>;
+var<workgroup> pivot_row: u32;
+var<workgroup> pivot_val: ${ty};
+@compute @workgroup_size(${workgroupSize})
+fn main(
+  @builtin(workgroup_id) wg_id: vec3<u32>,
+  @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+  let batch = wg_id.x + wg_id.y * ${gridOffsetY}u;
+  if (batch >= ${batches}u) {
+    return;
+  }
+  let lu_base = batch * ${m * n}u;
+  let piv_base = batch * ${r}u;
+  let perm_base = batch * ${m}u;
+  let tid = local_id.x;
+  // Copy input to lu
+  for (var idx = tid; idx < ${m * n}u; idx += ${workgroupSize}u) {
+    lu[lu_base + idx] = input[lu_base + idx];
+  }
+  // Initialize permutation
+  for (var idx = tid; idx < ${m}u; idx += ${workgroupSize}u) {
+    perm[perm_base + idx] = i32(idx);
+  }
+  storageBarrier();
+  // LU decomposition with partial pivoting
+  for (var j = 0u; j < ${r}u; j++) {
+    // Step 1: Thread 0 finds pivot (max abs value in column j, rows >= j)
+    if (tid == 0u) {
+      var max_val = abs(lu[lu_base + j * ${n}u + j]);
+      var max_row = j;
+      for (var i = j + 1u; i < ${m}u; i++) {
+        let val = abs(lu[lu_base + i * ${n}u + j]);
+        if (val > max_val) {
+          max_val = val;
+          max_row = i;
+        }
+      }
+      pivot_row = max_row;
+      pivot_val = lu[lu_base + max_row * ${n}u + j];
+      pivots[piv_base + j] = i32(max_row);
+    }
+    workgroupBarrier();
+    // Step 2: Swap rows j and pivot_row (threads collaborate)
+    let pr = pivot_row;
+    if (pr != j) {
+      for (var col = tid; col < ${n}u; col += ${workgroupSize}u) {
+        let tmp = lu[lu_base + j * ${n}u + col];
+        lu[lu_base + j * ${n}u + col] = lu[lu_base + pr * ${n}u + col];
+        lu[lu_base + pr * ${n}u + col] = tmp;
+      }
+      if (tid == 0u) {
+        let tmp_p = perm[perm_base + j];
+        perm[perm_base + j] = perm[perm_base + pr];
+        perm[perm_base + pr] = tmp_p;
+      }
+    }
+    storageBarrier();
+    // Step 3: Compute L[i][j] and update submatrix
+    // Each thread handles one row i > j
+    for (var i = j + 1u + tid; i < ${m}u; i += ${workgroupSize}u) {
+      let factor = lu[lu_base + i * ${n}u + j] / pivot_val;
+      lu[lu_base + i * ${n}u + j] = factor; // L[i][j]
+      for (var k = j + 1u; k < ${n}u; k++) {
+        lu[lu_base + i * ${n}u + k] -= factor * lu[lu_base + j * ${n}u + k];
+      }
+    }
+    storageBarrier();
+  }
+}
+`.trim();
+	const grid = calculateGrid(batches);
+	return [{
+		code,
+		numInputs: 1,
+		numOutputs: 3,
+		hasUniform: false,
+		passes: [{ grid }]
+	}];
+}
 function createRoutineShader(device, routine) {
 	switch (routine.name) {
 		case require_backend.Routines.Sort: return createSort(device, routine.type);
 		case require_backend.Routines.Argsort: return createArgsort(device, routine.type);
+		case require_backend.Routines.TriangularSolve: return createTriangularSolve(device, routine.type, routine.params);
+		case require_backend.Routines.Cholesky: return createCholesky(device, routine.type);
+		case require_backend.Routines.LU: return createLU(device, routine.type);
 		default: throw new require_backend.UnsupportedRoutineError(routine.name, "webgpu");
 	}
 }
@@ -675,8 +966,10 @@ function pipelineSource(device, kernel) {
 			else source = `(${a} * ${b})`;
 			else if (op === require_backend.AluOp.Idiv) source = require_backend.isFloatDtype(dtype) ? `trunc(${a} / ${b})` : `(${a} / ${b})`;
 			else if (op === require_backend.AluOp.Mod) source = `(${a} % ${b})`;
-			else if (op === require_backend.AluOp.Min) source = `min(${require_backend.strip1(a)}, ${require_backend.strip1(b)})`;
-			else if (op === require_backend.AluOp.Max) source = `max(${require_backend.strip1(a)}, ${require_backend.strip1(b)})`;
+			else if (op === require_backend.AluOp.Min) if (dtype === require_backend.DType.Bool) source = `(${a} && ${b})`;
+			else source = `min(${require_backend.strip1(a)}, ${require_backend.strip1(b)})`;
+			else if (op === require_backend.AluOp.Max) if (dtype === require_backend.DType.Bool) source = `(${a} || ${b})`;
+			else source = `max(${require_backend.strip1(a)}, ${require_backend.strip1(b)})`;
 			else if (op === require_backend.AluOp.Cmplt) source = `(${a} < ${b})`;
 			else if (op === require_backend.AluOp.Cmpne) if (require_backend.isFloatDtype(src[0].dtype)) {
 				const x = isGensym(a) ? a : gensym();

package/dist/{webgpu-ChVgx3b6.js → webgpu-Dh7k9io0.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { AluExp, AluGroup, AluOp, DEBUG, DType, Executable, FpHash, Routines, SlotError, UnsupportedOpError, UnsupportedRoutineError, findPow2, isFloatDtype, mapSetUnion, prod, range, strip1, tuneWebgpu } from "./backend-tngXtWe4.js";
+import { AluExp, AluGroup, AluOp, DEBUG, DType, Executable, FpHash, Routines, SlotError, UnsupportedOpError, UnsupportedRoutineError, findPow2, isFloatDtype, mapSetUnion, prod, range, strip1, tuneWebgpu } from "./backend-DaqL-MNz.js";
 //#region src/backend/webgpu/builtins.ts
 const threefrySrc = `
@@ -414,10 +414,301 @@ function createArgsort(device, type) {
 	const batches = prod(shape.slice(0, -1));
 	return bitonicSortShader(device, dtype, n, batches, true);
 }
+/**
+* Generate a triangular solve shader.
+*
+* Solves A @ X.T = B.T for X, where A is upper-triangular.
+* Uses a parallelized back-substitution:
+*   1. Copy b to x
+*   2. For j = n-1 down to 0:
+*      - Divide x[j] by a[j,j] (single thread; skipped when params.unitDiagonal is set)
+*      - All threads subtract x[j] * a[i,j] from x[i] for i < j in parallel
+*/
+function createTriangularSolve(device, type, params) {
+	const dtype = type.inputDtypes[0];
+	const aShape = type.inputShapes[0];
+	const bShape = type.inputShapes[1];
+	const n = aShape[aShape.length - 1];
+	const numRhs = bShape[bShape.length - 2];
+	const numMatrices = prod(aShape.slice(0, -2));
+	const needsF16 = dtype === DType.Float16;
+	const ty = dtypeToWgsl(dtype, true);
+	const workgroupSize = findPow2(n, device.limits.maxComputeWorkgroupSizeX);
+	const code = `
+${needsF16 ? "enable f16;" : ""}
+${headerWgsl}
+@group(0) @binding(0) var<storage, read> a: array<${ty}>;
+@group(0) @binding(1) var<storage, read> b: array<${ty}>;
+@group(0) @binding(2) var<storage, read_write> x: array<${ty}>;
+// Shared memory for the current pivot value x[j]
+var<workgroup> x_j: ${ty};
+@compute @workgroup_size(${workgroupSize})
+fn main(
+  @builtin(workgroup_id) wg_id: vec3<u32>,
+  @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+  let wg_idx = wg_id.x + wg_id.y * ${gridOffsetY}u;
+  let mat_idx = wg_idx / ${numRhs}u;
+  let rhs_idx = wg_idx % ${numRhs}u;
+  if (mat_idx >= ${numMatrices}u) {
+    return;
+  }
+  let a_base = mat_idx * ${n * n}u;
+  let bx_base = (mat_idx * ${numRhs}u + rhs_idx) * ${n}u;
+  let tid = local_id.x;
+  // Step 1: Copy b to x (threads collaborate)
+  for (var idx = tid; idx < ${n}u; idx += ${workgroupSize}u) {
+    x[bx_base + idx] = b[bx_base + idx];
+  }
+  storageBarrier();
+  // Step 2: Back-substitution from j = n-1 down to 0
+  for (var jj = 0u; jj < ${n}u; jj++) {
+    let j = ${n - 1}u - jj;
+    // Thread 0 computes x[j] = x[j] / a[j,j]
+    if (tid == 0u) {
+      ${params.unitDiagonal ? `x_j = x[bx_base + j];` : `x_j = x[bx_base + j] / a[a_base + j * ${n}u + j];`}
+      x[bx_base + j] = x_j;
+    }
+    workgroupBarrier();  // Sync shared memory x_j
+    // All threads subtract x[j] * a[i,j] from x[i] for i < j
+    for (var i = tid; i < j; i += ${workgroupSize}u) {
+      x[bx_base + i] -= x_j * a[a_base + i * ${n}u + j];
+    }
+    workgroupBarrier();
+    storageBarrier();
+  }
+}
+`.trim();
+	const totalWorkgroups = numMatrices * numRhs;
+	const grid = calculateGrid(totalWorkgroups);
+	return [{
+		code,
+		numInputs: 2,
+		numOutputs: 1,
+		hasUniform: false,
+		passes: [{ grid }]
+	}];
+}
+/**
+* Generate a Cholesky decomposition shader.
+*
+* Computes the lower triangular matrix L such that A = L * L^T for each
+* symmetric positive-definite matrix in the batch. Uses the Cholesky-Crout
+* algorithm, which processes the matrix column by column.
+*
+* For each column j:
+*   1. All threads compute their row's sum in parallel and store to output
+*   2. Thread 0 computes L[j][j] = sqrt(output[j][j]) and stores to shared memory
+*   3. All threads divide their output[i][j] by L[j][j] in parallel
+*/
+function createCholesky(device, type) {
+	const dtype = type.inputDtypes[0];
+	const shape = type.inputShapes[0];
+	const n = shape[shape.length - 1];
+	const batches = prod(shape.slice(0, -2));
+	const needsF16 = dtype === DType.Float16;
+	const ty = dtypeToWgsl(dtype, true);
+	const workgroupSize = findPow2(n, device.limits.maxComputeWorkgroupSizeX);
+	const code = `
+${needsF16 ? "enable f16;" : ""}
+${headerWgsl}
+@group(0) @binding(0) var<storage, read> input: array<${ty}>;
+@group(0) @binding(1) var<storage, read_write> output: array<${ty}>;
+// Shared memory for the diagonal element
+var<workgroup> L_jj: ${ty};
+@compute @workgroup_size(${workgroupSize})
+fn main(
+  @builtin(workgroup_id) wg_id: vec3<u32>,
+  @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+  let batch = wg_id.x + wg_id.y * ${gridOffsetY}u;
+  if (batch >= ${batches}u) {
+    return;
+  }
+  let base = batch * ${n * n}u;
+  let tid = local_id.x;
+  // Zero out output and copy lower triangle from input (threads collaborate)
+  for (var idx = tid; idx < ${n * n}u; idx += ${workgroupSize}u) {
+    let row = idx / ${n}u;
+    let col = idx % ${n}u;
+    output[base + idx] = select(0, input[base + idx], col <= row);
+  }
+  storageBarrier();
+  // Cholesky-Crout algorithm: process column by column
+  for (var j = 0u; j < ${n}u; j++) {
+    // Step 1: All threads compute sum for their rows i >= j in parallel
+    // sum = A[i][j] - sum(L[i][k] * L[j][k] for k < j)
+    for (var i = j + tid; i < ${n}u; i += ${workgroupSize}u) {
+      var sum = output[base + i * ${n}u + j];
+      for (var k = 0u; k < j; k++) {
+        sum -= output[base + i * ${n}u + k] * output[base + j * ${n}u + k];
+      }
+      output[base + i * ${n}u + j] = sum;
+    }
+    storageBarrier();
+    // Step 2: Thread 0 computes L[j][j] = sqrt(output[j][j])
+    if (tid == 0u) {
+      L_jj = sqrt(output[base + j * ${n}u + j]);
+      output[base + j * ${n}u + j] = L_jj;
+    }
+    workgroupBarrier();
+    // Step 3: All threads divide output[i][j] by L[j][j] for i > j
+    for (var i = j + 1u + tid; i < ${n}u; i += ${workgroupSize}u) {
+      output[base + i * ${n}u + j] /= L_jj;
+    }
+    storageBarrier();
+  }
+}
+`.trim();
+	const grid = calculateGrid(batches);
+	return [{
+		code,
+		numInputs: 1,
+		numOutputs: 1,
+		hasUniform: false,
+		passes: [{ grid }]
+	}];
+}
+/**
+* Generate an LU decomposition shader with partial pivoting.
+*
+* Computes PA = LU where P is a permutation matrix, L is lower triangular
+* with unit diagonal, and U is upper triangular.
+*
+* For each column j:
+*   1. Find pivot row (max absolute value in column j, rows >= j)
+*   2. Swap rows j and pivot row
+*   3. Compute L[i][j] = A[i][j] / A[j][j] for i > j
+*   4. Update submatrix: A[i][k] -= L[i][j] * A[j][k] for i > j, k > j
+*/
+function createLU(device, type) {
+	const dtype = type.inputDtypes[0];
+	const shape = type.inputShapes[0];
+	const m = shape[shape.length - 2];
+	const n = shape[shape.length - 1];
+	const r = Math.min(m, n);
+	const batches = prod(shape.slice(0, -2));
+	const needsF16 = dtype === DType.Float16;
+	const ty = dtypeToWgsl(dtype, true);
+	const workgroupSize = findPow2(Math.max(m, n), device.limits.maxComputeWorkgroupSizeX);
+	const code = `
+${needsF16 ? "enable f16;" : ""}
+${headerWgsl}
+@group(0) @binding(0) var<storage, read> input: array<${ty}>;
+@group(0) @binding(1) var<storage, read_write> lu: array<${ty}>;
+@group(0) @binding(2) var<storage, read_write> pivots: array<i32>;
+@group(0) @binding(3) var<storage, read_write> perm: array<i32>;
+var<workgroup> pivot_row: u32;
+var<workgroup> pivot_val: ${ty};
+@compute @workgroup_size(${workgroupSize})
+fn main(
+  @builtin(workgroup_id) wg_id: vec3<u32>,
+  @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+  let batch = wg_id.x + wg_id.y * ${gridOffsetY}u;
+  if (batch >= ${batches}u) {
+    return;
+  }
+  let lu_base = batch * ${m * n}u;
+  let piv_base = batch * ${r}u;
+  let perm_base = batch * ${m}u;
+  let tid = local_id.x;
+  // Copy input to lu
+  for (var idx = tid; idx < ${m * n}u; idx += ${workgroupSize}u) {
+    lu[lu_base + idx] = input[lu_base + idx];
+  }
+  // Initialize permutation
+  for (var idx = tid; idx < ${m}u; idx += ${workgroupSize}u) {
+    perm[perm_base + idx] = i32(idx);
+  }
+  storageBarrier();
+  // LU decomposition with partial pivoting
+  for (var j = 0u; j < ${r}u; j++) {
+    // Step 1: Thread 0 finds pivot (max abs value in column j, rows >= j)
+    if (tid == 0u) {
+      var max_val = abs(lu[lu_base + j * ${n}u + j]);
+      var max_row = j;
+      for (var i = j + 1u; i < ${m}u; i++) {
+        let val = abs(lu[lu_base + i * ${n}u + j]);
+        if (val > max_val) {
+          max_val = val;
+          max_row = i;
+        }
+      }
+      pivot_row = max_row;
+      pivot_val = lu[lu_base + max_row * ${n}u + j];
+      pivots[piv_base + j] = i32(max_row);
+    }
+    workgroupBarrier();
+    // Step 2: Swap rows j and pivot_row (threads collaborate)
+    let pr = pivot_row;
+    if (pr != j) {
+      for (var col = tid; col < ${n}u; col += ${workgroupSize}u) {
+        let tmp = lu[lu_base + j * ${n}u + col];
+        lu[lu_base + j * ${n}u + col] = lu[lu_base + pr * ${n}u + col];
+        lu[lu_base + pr * ${n}u + col] = tmp;
+      }
+      if (tid == 0u) {
+        let tmp_p = perm[perm_base + j];
+        perm[perm_base + j] = perm[perm_base + pr];
+        perm[perm_base + pr] = tmp_p;
+      }
+    }
+    storageBarrier();
+    // Step 3: Compute L[i][j] and update submatrix
+    // Each thread handles one row i > j
+    for (var i = j + 1u + tid; i < ${m}u; i += ${workgroupSize}u) {
+      let factor = lu[lu_base + i * ${n}u + j] / pivot_val;
+      lu[lu_base + i * ${n}u + j] = factor; // L[i][j]
+      for (var k = j + 1u; k < ${n}u; k++) {
+        lu[lu_base + i * ${n}u + k] -= factor * lu[lu_base + j * ${n}u + k];
+      }
+    }
+    storageBarrier();
+  }
+}
+`.trim();
+	const grid = calculateGrid(batches);
+	return [{
+		code,
+		numInputs: 1,
+		numOutputs: 3,
+		hasUniform: false,
+		passes: [{ grid }]
+	}];
+}
 function createRoutineShader(device, routine) {
 	switch (routine.name) {
 		case Routines.Sort: return createSort(device, routine.type);
 		case Routines.Argsort: return createArgsort(device, routine.type);
+		case Routines.TriangularSolve: return createTriangularSolve(device, routine.type, routine.params);
+		case Routines.Cholesky: return createCholesky(device, routine.type);
+		case Routines.LU: return createLU(device, routine.type);
 		default: throw new UnsupportedRoutineError(routine.name, "webgpu");
 	}
 }
@@ -675,8 +966,10 @@ function pipelineSource(device, kernel) {
 			else source = `(${a} * ${b})`;
 			else if (op === AluOp.Idiv) source = isFloatDtype(dtype) ? `trunc(${a} / ${b})` : `(${a} / ${b})`;
 			else if (op === AluOp.Mod) source = `(${a} % ${b})`;
-			else if (op === AluOp.Min) source = `min(${strip1(a)}, ${strip1(b)})`;
-			else if (op === AluOp.Max) source = `max(${strip1(a)}, ${strip1(b)})`;
+			else if (op === AluOp.Min) if (dtype === DType.Bool) source = `(${a} && ${b})`;
+			else source = `min(${strip1(a)}, ${strip1(b)})`;
+			else if (op === AluOp.Max) if (dtype === DType.Bool) source = `(${a} || ${b})`;
+			else source = `max(${strip1(a)}, ${strip1(b)})`;
 			else if (op === AluOp.Cmplt) source = `(${a} < ${b})`;
 			else if (op === AluOp.Cmpne) if (isFloatDtype(src[0].dtype)) {
 				const x = isGensym(a) ? a : gensym();

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@jax-js/jax",
-  "version": "0.1.4",
+  "version": "0.1.5",
   "description": "Numerical computing and ML in the browser",
   "keywords": [
     "machine learning",