npm - @simulatte/doppler - Versions diffs - 0.1.5 → 0.1.6 - Mend

@simulatte/doppler 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

package/README.md +23 -8
package/package.json +7 -4
package/src/config/kernels/kernel-ref-digests.js +39 -39
package/src/config/kernels/registry.json +42 -2
package/src/config/loader.js +31 -2
package/src/config/merge.js +18 -0
package/src/config/presets/models/qwen3.json +9 -2
package/src/config/presets/models/transformer.json +5 -0
package/src/config/required-inference-fields-contract-check.js +6 -0
package/src/config/schema/inference-defaults.schema.js +3 -0
package/src/config/schema/inference.schema.d.ts +9 -0
package/src/config/schema/kernel-path.schema.d.ts +6 -0
package/src/config/schema/manifest.schema.d.ts +6 -0
package/src/config/schema/manifest.schema.js +3 -0
package/src/converter/rope-config.js +42 -0
package/src/gpu/device.js +58 -0
package/src/gpu/kernels/attention.js +98 -0
package/src/gpu/kernels/bias_add.wgsl +8 -6
package/src/gpu/kernels/bias_add_f16.wgsl +8 -5
package/src/gpu/kernels/conv2d.js +1 -1
package/src/gpu/kernels/conv2d.wgsl +7 -8
package/src/gpu/kernels/conv2d_f16.wgsl +7 -8
package/src/gpu/kernels/depthwise_conv2d.js +2 -1
package/src/gpu/kernels/depthwise_conv2d.wgsl +6 -9
package/src/gpu/kernels/depthwise_conv2d_f16.wgsl +6 -9
package/src/gpu/kernels/grouped_pointwise_conv2d.js +2 -1
package/src/gpu/kernels/grouped_pointwise_conv2d.wgsl +6 -9
package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl +6 -9
package/src/gpu/kernels/matmul.js +25 -0
package/src/gpu/kernels/pixel_shuffle.js +1 -1
package/src/gpu/kernels/pixel_shuffle.wgsl +4 -5
package/src/gpu/kernels/pixel_shuffle_f16.wgsl +4 -5
package/src/gpu/kernels/relu.js +15 -2
package/src/gpu/kernels/relu.wgsl +2 -1
package/src/gpu/kernels/relu_f16.wgsl +2 -1
package/src/gpu/kernels/repeat_channels.js +1 -1
package/src/gpu/kernels/repeat_channels.wgsl +4 -5
package/src/gpu/kernels/repeat_channels_f16.wgsl +4 -5
package/src/gpu/kernels/residual.js +44 -8
package/src/gpu/kernels/residual.wgsl +6 -3
package/src/gpu/kernels/residual_f16.wgsl +2 -1
package/src/gpu/kernels/residual_f16_vec4.wgsl +2 -1
package/src/gpu/kernels/residual_vec4.wgsl +2 -1
package/src/gpu/kernels/rmsnorm.js +58 -6
package/src/gpu/kernels/rmsnorm.wgsl +14 -6
package/src/gpu/kernels/rmsnorm_f16.wgsl +10 -2
package/src/gpu/kernels/rope.d.ts +2 -0
package/src/gpu/kernels/rope.js +11 -1
package/src/gpu/kernels/rope.wgsl +56 -40
package/src/gpu/kernels/sana_linear_attention.js +1 -2
package/src/gpu/kernels/sana_linear_attention_apply.wgsl +4 -5
package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl +4 -5
package/src/gpu/kernels/sana_linear_attention_summary.wgsl +4 -0
package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl +4 -0
package/src/gpu/kernels/silu.d.ts +1 -0
package/src/gpu/kernels/silu.js +32 -14
package/src/gpu/kernels/silu.wgsl +19 -9
package/src/gpu/kernels/silu_f16.wgsl +19 -9
package/src/gpu/kernels/transpose.js +15 -2
package/src/gpu/kernels/transpose.wgsl +5 -6
package/src/gpu/kernels/upsample2d.js +2 -1
package/src/gpu/kernels/upsample2d.wgsl +6 -9
package/src/gpu/kernels/upsample2d_f16.wgsl +6 -9
package/src/gpu/kernels/utils.js +16 -1
package/src/inference/browser-harness.js +47 -1
package/src/inference/pipelines/diffusion/pipeline.js +15 -6
package/src/inference/pipelines/diffusion/text-encoder-gpu.d.ts +5 -0
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +27 -15
package/src/inference/pipelines/text/attention/record.js +11 -2
package/src/inference/pipelines/text/attention/run.js +11 -2
package/src/inference/pipelines/text/chat-format.js +25 -1
package/src/inference/pipelines/text/config.d.ts +4 -0
package/src/inference/pipelines/text/config.js +68 -1
package/src/inference/pipelines/text/execution-plan.js +23 -31
package/src/inference/pipelines/text/execution-v0.js +29 -2
package/src/inference/pipelines/text/ffn/standard.js +3 -0
package/src/inference/pipelines/text/init.d.ts +4 -0
package/src/inference/pipelines/text/init.js +56 -9
package/src/inference/pipelines/text/layer.js +11 -0
package/src/inference/pipelines/text.js +4 -0
package/src/inference/tokenizers/bundled.js +156 -33
package/src/rules/tooling/command-runtime.rules.json +18 -0
package/src/tooling/command-api.d.ts +27 -1
package/src/tooling/command-api.js +142 -3
package/src/tooling/node-browser-command-runner.d.ts +4 -0
package/src/tooling/node-browser-command-runner.js +58 -3
package/src/tooling/node-command-runner.js +15 -0
package/src/tooling/node-webgpu.js +9 -87
package/src/training/checkpoint-watch.d.ts +7 -0
package/src/training/checkpoint-watch.js +106 -0
package/src/training/checkpoint.d.ts +6 -1
package/src/training/checkpoint.js +12 -2
package/src/training/distillation/artifacts.d.ts +71 -0
package/src/training/distillation/artifacts.js +132 -0
package/src/training/distillation/checkpoint-watch.d.ts +10 -0
package/src/training/distillation/checkpoint-watch.js +57 -0
package/src/training/distillation/dataset.d.ts +59 -0
package/src/training/distillation/dataset.js +337 -0
package/src/training/distillation/eval.d.ts +34 -0
package/src/training/distillation/eval.js +310 -0
package/src/training/distillation/index.d.ts +29 -0
package/src/training/distillation/index.js +29 -0
package/src/training/distillation/runtime.d.ts +20 -0
package/src/training/distillation/runtime.js +121 -0
package/src/training/distillation/scoreboard.d.ts +6 -0
package/src/training/distillation/scoreboard.js +8 -0
package/src/training/distillation/stage-a.d.ts +45 -0
package/src/training/distillation/stage-a.js +338 -0
package/src/training/distillation/stage-b.d.ts +24 -0
package/src/training/distillation/stage-b.js +20 -0
package/src/training/index.d.ts +10 -0
package/src/training/index.js +10 -0
package/src/training/lora-pipeline.d.ts +40 -0
package/src/training/lora-pipeline.js +796 -0
package/src/training/operator-artifacts.d.ts +62 -0
package/src/training/operator-artifacts.js +140 -0
package/src/training/operator-command.d.ts +5 -0
package/src/training/operator-command.js +453 -0
package/src/training/operator-eval.d.ts +48 -0
package/src/training/operator-eval.js +230 -0
package/src/training/operator-scoreboard.d.ts +5 -0
package/src/training/operator-scoreboard.js +44 -0
package/src/training/runner.d.ts +52 -0
package/src/training/runner.js +29 -4
package/src/training/suite.d.ts +112 -0
package/src/training/suite.js +9 -9
package/src/training/workloads.d.ts +164 -0
package/src/training/workloads.js +539 -0
package/src/version.js +1 -1
package/tools/doppler-cli.js +137 -40

package/src/gpu/device.js CHANGED Viewed

@@ -28,6 +28,62 @@ function advanceDeviceEpoch() {
   deviceEpoch += 1;
 }
+function isValidGPUBuffer(value) {
+  if (!value) {
+    return false;
+  }
+  if (typeof GPUBuffer === 'undefined') {
+    return true;
+  }
+  return value instanceof GPUBuffer;
+}
+function describeBindGroupBufferValue(value) {
+  if (value === null) return 'null';
+  if (value === undefined) return 'undefined';
+  if (typeof GPUBuffer !== 'undefined' && value instanceof GPUBuffer) return 'GPUBuffer';
+  if (typeof value === 'object') {
+    return value.constructor?.name || 'object';
+  }
+  return typeof value;
+}
+function validateBindGroupDescriptor(descriptor) {
+  const label = descriptor?.label || 'unlabeled_bind_group';
+  const entries = Array.isArray(descriptor?.entries) ? descriptor.entries : [];
+  for (const entry of entries) {
+    const resource = entry?.resource;
+    if (!resource || typeof resource !== 'object' || !('buffer' in resource)) {
+      continue;
+    }
+    if (isValidGPUBuffer(resource.buffer)) {
+      continue;
+    }
+    throw new Error(
+      `[${label}] binding ${entry.binding} requires a GPUBuffer; ` +
+      `got ${describeBindGroupBufferValue(resource.buffer)}.`
+    );
+  }
+}
+function wrapDeviceCreateBindGroup(device) {
+  if (!device || device.__dopplerBindGroupValidationWrapped) {
+    return device;
+  }
+  const originalCreateBindGroup = device.createBindGroup.bind(device);
+  device.createBindGroup = (descriptor) => {
+    validateBindGroupDescriptor(descriptor);
+    return originalCreateBindGroup(descriptor);
+  };
+  Object.defineProperty(device, '__dopplerBindGroupValidationWrapped', {
+    value: true,
+    configurable: true,
+    enumerable: false,
+    writable: false,
+  });
+  return device;
+}
 export const FEATURES =  ({
   SHADER_F16: 'shader-f16',
@@ -201,6 +257,7 @@ export async function initDevice() {
   if (!gpuDevice) {
     throw createDopplerError(ERROR_CODES.GPU_DEVICE_FAILED, 'Failed to create WebGPU device');
   }
+  wrapDeviceCreateBindGroup(gpuDevice);
   advanceDeviceEpoch();
   // Set up device lost handler
@@ -253,6 +310,7 @@ export function setDevice(device, options = {}) {
   }
   gpuDevice = device;
+  wrapDeviceCreateBindGroup(gpuDevice);
   advanceDeviceEpoch();
   wrapQueueForTracking(gpuDevice.queue);

package/src/gpu/kernels/attention.js CHANGED Viewed

@@ -780,6 +780,23 @@ function resolveAttentionExecution(recorder) {
   };
 }
+function assertAttentionBindGroupBuffer(kernelName, variant, bindingIndex, bindingLabel, buffer, details = []) {
+  const isGpuBuffer = buffer && (
+    typeof GPUBuffer === 'undefined'
+      ? true
+      : buffer instanceof GPUBuffer
+  );
+  if (isGpuBuffer) {
+    return;
+  }
+  const detailText = details.filter(Boolean).join(', ');
+  throw new Error(
+    `[${kernelName}] variant="${variant}" binding ${bindingIndex} "${bindingLabel}" requires a GPUBuffer` +
+    (detailText ? ` (${detailText})` : '') +
+    '.'
+  );
+}
 function releaseAttentionUniform(execution, uniformBuffer) {
   if (!execution.recorder) {
     releaseUniformBuffer(uniformBuffer);
@@ -867,6 +884,26 @@ async function executeAttentionBDPA(
     slidingWindow,
   });
+  assertAttentionBindGroupBuffer('attention_bdpa', variant, 0, 'uniforms', uniformBuffer);
+  assertAttentionBindGroupBuffer('attention_bdpa', variant, 1, 'Q', Q?.buffer, [
+    `QLabel=${Q?.label ?? 'unknown'}`,
+    `QDtype=${Q?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_bdpa', variant, 2, 'basisK', basisK?.buffer, [
+    `basisKLabel=${basisK?.label ?? 'unknown'}`,
+    `basisKDtype=${basisK?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_bdpa', variant, 3, 'basisV', basisV?.buffer, [
+    `basisVLabel=${basisV?.label ?? 'unknown'}`,
+    `basisVDtype=${basisV?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_bdpa', variant, 4, 'pagedK', pagedK);
+  assertAttentionBindGroupBuffer('attention_bdpa', variant, 5, 'pagedV', pagedV);
+  assertAttentionBindGroupBuffer('attention_bdpa', variant, 6, 'index', index);
+  assertAttentionBindGroupBuffer('attention_bdpa', variant, 7, 'ropeCos', ropeCos);
+  assertAttentionBindGroupBuffer('attention_bdpa', variant, 8, 'ropeSin', ropeSin);
+  assertAttentionBindGroupBuffer('attention_bdpa', variant, 9, 'output', outputBuf);
   const bindGroup = execution.device.createBindGroup({
     label: 'attention_bdpa_bind_group',
     layout: pipeline.getBindGroupLayout(0),
@@ -982,6 +1019,24 @@ async function executeAttention(
   const kvLenBinding = kvLenBuffer || getKvLenFallbackBuffer(execution.device);
   const pageTableBinding = kvPageTable || getPageTableFallbackBuffer(execution.device);
+  assertAttentionBindGroupBuffer('attention', plan.variant, 0, 'uniforms', uniformBuffer);
+  assertAttentionBindGroupBuffer('attention', plan.variant, 1, 'Q', Q?.buffer, [
+    `QLabel=${Q?.label ?? 'unknown'}`,
+    `QDtype=${Q?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention', plan.variant, 2, 'K', K?.buffer, [
+    `KLabel=${K?.label ?? 'unknown'}`,
+    `KDtype=${K?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention', plan.variant, 3, 'V', V?.buffer, [
+    `VLabel=${V?.label ?? 'unknown'}`,
+    `VDtype=${V?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention', plan.variant, 4, 'output', outputBuf);
+  assertAttentionBindGroupBuffer('attention', plan.variant, 5, 'kvLen', kvLenBinding);
+  assertAttentionBindGroupBuffer('attention', plan.variant, 6, 'pageTable', pageTableBinding, [
+    `kvLayout=${kvLayout}`,
+  ]);
   const bindGroup = execution.device.createBindGroup({
     label: 'attention_bind_group',
     layout: pipeline.getBindGroupLayout(0),
@@ -1099,6 +1154,31 @@ async function executeAttentionTiered(
   });
   const pageTableBinding = coldPageTable || getPageTableFallbackBuffer(execution.device);
+  assertAttentionBindGroupBuffer('attention_tiered', variant, 0, 'uniforms', uniformBuffer);
+  assertAttentionBindGroupBuffer('attention_tiered', variant, 1, 'Q', Q?.buffer, [
+    `QLabel=${Q?.label ?? 'unknown'}`,
+    `QDtype=${Q?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_tiered', variant, 2, 'hotK', hotK?.buffer, [
+    `hotKLabel=${hotK?.label ?? 'unknown'}`,
+    `hotKDtype=${hotK?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_tiered', variant, 3, 'hotV', hotV?.buffer, [
+    `hotVLabel=${hotV?.label ?? 'unknown'}`,
+    `hotVDtype=${hotV?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_tiered', variant, 4, 'coldK', coldK?.buffer, [
+    `coldKLabel=${coldK?.label ?? 'unknown'}`,
+    `coldKDtype=${coldK?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_tiered', variant, 5, 'coldV', coldV?.buffer, [
+    `coldVLabel=${coldV?.label ?? 'unknown'}`,
+    `coldVDtype=${coldV?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_tiered', variant, 6, 'output', outputBuf);
+  assertAttentionBindGroupBuffer('attention_tiered', variant, 7, 'pageTable', pageTableBinding, [
+    `coldLayout=${coldLayout}`,
+  ]);
   const bindGroup = execution.device.createBindGroup({
     label: 'attention_tiered_bind_group',
     layout: pipeline.getBindGroupLayout(0),
@@ -1200,6 +1280,24 @@ async function executeAttentionTieredQuant(
     packedStride,
   });
+  assertAttentionBindGroupBuffer('attention_tiered_quant', variant, 0, 'uniforms', uniformBuffer);
+  assertAttentionBindGroupBuffer('attention_tiered_quant', variant, 1, 'Q', Q?.buffer, [
+    `QLabel=${Q?.label ?? 'unknown'}`,
+    `QDtype=${Q?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_tiered_quant', variant, 2, 'hotK', hotK?.buffer, [
+    `hotKLabel=${hotK?.label ?? 'unknown'}`,
+    `hotKDtype=${hotK?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_tiered_quant', variant, 3, 'hotV', hotV?.buffer, [
+    `hotVLabel=${hotV?.label ?? 'unknown'}`,
+    `hotVDtype=${hotV?.dtype ?? 'unknown'}`,
+  ]);
+  assertAttentionBindGroupBuffer('attention_tiered_quant', variant, 4, 'coldPackedK', coldPackedK);
+  assertAttentionBindGroupBuffer('attention_tiered_quant', variant, 5, 'coldPackedV', coldPackedV);
+  assertAttentionBindGroupBuffer('attention_tiered_quant', variant, 6, 'coldScalesK', coldScalesK);
+  assertAttentionBindGroupBuffer('attention_tiered_quant', variant, 7, 'coldScalesV', coldScalesV);
+  assertAttentionBindGroupBuffer('attention_tiered_quant', variant, 8, 'output', outputBuf);
   const bindGroup = execution.device.createBindGroup({
     label: 'attention_tiered_quant_bind_group',
     layout: pipeline.getBindGroupLayout(0),

package/src/gpu/kernels/bias_add.wgsl CHANGED Viewed

@@ -14,6 +14,10 @@ struct Uniforms {
     dim: u32,
     data_offset: u32,  // byte offset into data buffer (divide by 4 for F32)
     bias_offset: u32,  // byte offset into bias buffer (divide by 4 for F32)
+    token_stride: u32,
+    _pad0: u32,
+    _pad1: u32,
+    _pad2: u32,
 }
 override WORKGROUP_SIZE: u32 = 256u;
@@ -24,17 +28,15 @@ override WORKGROUP_SIZE: u32 = 256u;
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
-    let total = u.num_tokens * u.dim;
-    if (idx >= total) {
+    let d = gid.x;
+    let token = gid.z * max(u.token_stride, 1u) + gid.y;
+    if (token >= u.num_tokens || d >= u.dim) {
         return;
     }
     // Convert byte offsets to F32 indices
     let data_base = u.data_offset / 4u;
     let bias_base = u.bias_offset / 4u;
-    let d = idx % u.dim;
+    let idx = token * u.dim + d;
     data[data_base + idx] = data[data_base + idx] + bias[bias_base + d];
 }

package/src/gpu/kernels/bias_add_f16.wgsl CHANGED Viewed

@@ -18,6 +18,10 @@ struct Uniforms {
     dim: u32,
     data_offset: u32,  // byte offset into data buffer (divide by 2 for F16)
     bias_offset: u32,  // byte offset into bias buffer (divide by 2 for F16)
+    token_stride: u32,
+    _pad0: u32,
+    _pad1: u32,
+    _pad2: u32,
 }
 override WORKGROUP_SIZE: u32 = 256u;
@@ -28,17 +32,16 @@ override WORKGROUP_SIZE: u32 = 256u;
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
-    let total = u.num_tokens * u.dim;
-    if (idx >= total) {
+    let d = gid.x;
+    let token = gid.z * max(u.token_stride, 1u) + gid.y;
+    if (token >= u.num_tokens || d >= u.dim) {
         return;
     }
     // Convert byte offsets to F16 indices
     let data_base = u.data_offset / 2u;
     let bias_base = u.bias_offset / 2u;
-    let d = idx % u.dim;
+    let idx = token * u.dim + d;
     let out = f32(data[data_base + idx]) + f32(bias[bias_base + d]);
     data[data_base + idx] = f16(out);
 }

package/src/gpu/kernels/conv2d.js CHANGED Viewed

@@ -58,7 +58,7 @@ async function _conv2d(target, input, weight, bias, options = {}) {
       kernel_h: kernelH, kernel_w: kernelW,
       stride, pad, _pad0: 0, _pad1: 0,
     },
-    Math.ceil((outChannels * outHeight * outWidth) / WORKGROUP_SIZES.DEFAULT)
+    [Math.ceil((outHeight * outWidth) / WORKGROUP_SIZES.DEFAULT), outChannels, 1]
   );
   if (tempBias) {

package/src/gpu/kernels/conv2d.wgsl CHANGED Viewed

@@ -27,19 +27,18 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let out_height = u.out_height;
     let out_width = u.out_width;
-    let out_size = u.out_channels * out_height * out_width;
-    if (idx >= out_size) {
+    let out_spatial = out_height * out_width;
+    let out_spatial_idx = gid.x;
+    let out_c = gid.y;
+    if (out_c >= u.out_channels || out_spatial_idx >= out_spatial) {
         return;
     }
-    let out_spatial = out_height * out_width;
-    let out_c = idx / out_spatial;
-    let rem = idx - out_c * out_spatial;
-    let out_y = rem / out_width;
-    let out_x = rem - out_y * out_width;
+    let out_y = out_spatial_idx / out_width;
+    let out_x = out_spatial_idx - out_y * out_width;
+    let idx = out_c * out_spatial + out_spatial_idx;
     var sum: f32 = bias[out_c];

package/src/gpu/kernels/conv2d_f16.wgsl CHANGED Viewed

@@ -29,19 +29,18 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let out_height = u.out_height;
     let out_width = u.out_width;
-    let out_size = u.out_channels * out_height * out_width;
-    if (idx >= out_size) {
+    let out_spatial = out_height * out_width;
+    let out_spatial_idx = gid.x;
+    let out_c = gid.y;
+    if (out_c >= u.out_channels || out_spatial_idx >= out_spatial) {
         return;
     }
-    let out_spatial = out_height * out_width;
-    let out_c = idx / out_spatial;
-    let rem = idx - out_c * out_spatial;
-    let out_y = rem / out_width;
-    let out_x = rem - out_y * out_width;
+    let out_y = out_spatial_idx / out_width;
+    let out_x = out_spatial_idx - out_y * out_width;
+    let idx = out_c * out_spatial + out_spatial_idx;
     var sum: f32 = f32(bias[out_c]);

package/src/gpu/kernels/depthwise_conv2d.js CHANGED Viewed

@@ -45,6 +45,7 @@ async function _depthwiseConv2D(target, input, weight, bias, options = {}) {
   const bytesPerElement = dtypeBytes(input.dtype);
   const outputSize = channels * outHeight * outWidth * bytesPerElement;
   const output = outputBuffer || acquireBuffer(outputSize, undefined, 'depthwise_conv2d_output');
+  const outSpatial = outHeight * outWidth;
   const weightBuffer = getBuffer(weight);
   let biasBuffer = getBuffer(bias);
@@ -75,7 +76,7 @@ async function _depthwiseConv2D(target, input, weight, bias, options = {}) {
       _pad0: 0,
       _pad1: 0,
     },
-    Math.ceil((channels * outHeight * outWidth) / WORKGROUP_SIZES.DEFAULT)
+    [Math.ceil(outSpatial / WORKGROUP_SIZES.DEFAULT), channels, 1]
   );
   if (tempBias) {

package/src/gpu/kernels/depthwise_conv2d.wgsl CHANGED Viewed

@@ -23,17 +23,14 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let out_spatial = u.out_height * u.out_width;
-    let out_size = u.channels * out_spatial;
-    if (idx >= out_size) {
+    let spatial_idx = gid.x;
+    let channel = gid.y;
+    if (spatial_idx >= out_spatial || channel >= u.channels) {
         return;
     }
-    let channel = idx / out_spatial;
-    let rem = idx - channel * out_spatial;
-    let out_y = rem / u.out_width;
-    let out_x = rem - out_y * u.out_width;
+    let out_y = spatial_idx / u.out_width;
+    let out_x = spatial_idx - out_y * u.out_width;
     var sum: f32 = bias[channel];
     let pad = i32(u.pad);
@@ -54,5 +51,5 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
         }
     }
-    output[idx] = sum;
+    output[channel * out_spatial + spatial_idx] = sum;
 }

package/src/gpu/kernels/depthwise_conv2d_f16.wgsl CHANGED Viewed

@@ -27,17 +27,14 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let out_spatial = u.out_height * u.out_width;
-    let out_size = u.channels * out_spatial;
-    if (idx >= out_size) {
+    let spatial_idx = gid.x;
+    let channel = gid.y;
+    if (spatial_idx >= out_spatial || channel >= u.channels) {
         return;
     }
-    let channel = idx / out_spatial;
-    let rem = idx - channel * out_spatial;
-    let out_y = rem / u.out_width;
-    let out_x = rem - out_y * u.out_width;
+    let out_y = spatial_idx / u.out_width;
+    let out_x = spatial_idx - out_y * u.out_width;
     var sum: f32 = f32(bias[channel]);
     let pad = i32(u.pad);
@@ -58,5 +55,5 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
         }
     }
-    output[idx] = f16(sum);
+    output[channel * out_spatial + spatial_idx] = f16(sum);
 }

package/src/gpu/kernels/grouped_pointwise_conv2d.js CHANGED Viewed

@@ -42,6 +42,7 @@ async function _groupedPointwiseConv2D(target, input, weight, bias, options = {}
   const bytesPerElement = dtypeBytes(input.dtype);
   const outputSize = outChannels * height * width * bytesPerElement;
   const output = outputBuffer || acquireBuffer(outputSize, undefined, 'grouped_pointwise_conv2d_output');
+  const spatial = height * width;
   const weightBuffer = getBuffer(weight);
   let biasBuffer = getBuffer(bias);
@@ -69,7 +70,7 @@ async function _groupedPointwiseConv2D(target, input, weight, bias, options = {}
       _pad1: 0,
       _pad2: 0,
     },
-    Math.ceil((outChannels * height * width) / WORKGROUP_SIZES.DEFAULT)
+    [Math.ceil(spatial / WORKGROUP_SIZES.DEFAULT), outChannels, 1]
   );
   if (tempBias) {

package/src/gpu/kernels/grouped_pointwise_conv2d.wgsl CHANGED Viewed

@@ -19,17 +19,14 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let spatial = u.height * u.width;
-    let out_size = u.out_channels * spatial;
-    if (idx >= out_size) {
+    let spatial_idx = gid.x;
+    let out_channel = gid.y;
+    if (spatial_idx >= spatial || out_channel >= u.out_channels) {
         return;
     }
-    let out_channel = idx / spatial;
-    let rem = idx - out_channel * spatial;
-    let y = rem / u.width;
-    let x = rem - y * u.width;
+    let y = spatial_idx / u.width;
+    let x = spatial_idx - y * u.width;
     let in_per_group = u.in_channels / u.groups;
     let out_per_group = u.out_channels / u.groups;
@@ -43,5 +40,5 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
         sum = sum + input[input_idx] * weight[weight_idx];
     }
-    output[idx] = sum;
+    output[out_channel * spatial + spatial_idx] = sum;
 }

package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl CHANGED Viewed

@@ -23,17 +23,14 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let spatial = u.height * u.width;
-    let out_size = u.out_channels * spatial;
-    if (idx >= out_size) {
+    let spatial_idx = gid.x;
+    let out_channel = gid.y;
+    if (spatial_idx >= spatial || out_channel >= u.out_channels) {
         return;
     }
-    let out_channel = idx / spatial;
-    let rem = idx - out_channel * spatial;
-    let y = rem / u.width;
-    let x = rem - y * u.width;
+    let y = spatial_idx / u.width;
+    let x = spatial_idx - y * u.width;
     let in_per_group = u.in_channels / u.groups;
     let out_per_group = u.out_channels / u.groups;
@@ -47,5 +44,5 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
         sum = sum + f32(input[input_idx]) * f32(weight[weight_idx]);
     }
-    output[idx] = f16(sum);
+    output[out_channel * spatial + spatial_idx] = f16(sum);
 }

package/src/gpu/kernels/matmul.js CHANGED Viewed

@@ -52,6 +52,23 @@ function buildProfileLabel(options = {}) {
   return `matmul${roleLabel}${layerLabel}`;
 }
+function assertBindGroupBuffer(kernelName, variant, bindingIndex, bindingLabel, buffer, details = []) {
+  const isGpuBuffer = buffer && (
+    typeof GPUBuffer === 'undefined'
+      ? true
+      : buffer instanceof GPUBuffer
+  );
+  if (isGpuBuffer) {
+    return;
+  }
+  const detailText = details.filter(Boolean).join(', ');
+  throw new Error(
+    `[${kernelName}] variant="${variant}" binding ${bindingIndex} "${bindingLabel}" requires a GPUBuffer` +
+    (detailText ? ` (${detailText})` : '') +
+    '.'
+  );
+}
 function createMatmulBindGroupEntries(variant, uniformBuffer, matmulInput, bBuffer, outputBuffer, offsets, bindingSizes) {
   const isQ4KF16 = variant === 'q4_fused_multicol_f16'
     || variant === 'q4_fused_f16a'
@@ -59,6 +76,14 @@ function createMatmulBindGroupEntries(variant, uniformBuffer, matmulInput, bBuff
     || variant === 'q4_fused_multicol_f16a'
     || variant === 'q4_fused_batched_f16a';
+  assertBindGroupBuffer('matmul', variant, 0, 'uniforms', uniformBuffer);
+  assertBindGroupBuffer('matmul', variant, 1, 'input', matmulInput?.buffer, [
+    `inputLabel=${matmulInput?.label ?? 'unknown'}`,
+    `inputDtype=${matmulInput?.dtype ?? 'unknown'}`,
+  ]);
+  assertBindGroupBuffer('matmul', variant, 2, 'weights', bBuffer);
+  assertBindGroupBuffer('matmul', variant, isQ4KF16 ? 4 : 3, 'output', outputBuffer);
   const entries = [
     { binding: 0, resource: { buffer: uniformBuffer } },
     { binding: 1, resource: { buffer: matmulInput.buffer, offset: offsets.aOffset, size: bindingSizes.aBindingSize } },

package/src/gpu/kernels/pixel_shuffle.js CHANGED Viewed

@@ -34,7 +34,7 @@ async function _pixelShuffle(target, input, options = {}) {
       grid_width: gridWidth, grid_height: gridHeight, patch_size: patchSize,
       patch_channels: inferredPatchChannels, _pad0: 0,
     },
-    Math.ceil((outChannels * outHeight * outWidth) / WORKGROUP_SIZES.DEFAULT)
+    [Math.ceil((outHeight * outWidth) / WORKGROUP_SIZES.DEFAULT), outChannels, 1]
   );
   return createTensor(output, input.dtype, [outChannels, outHeight, outWidth], 'pixel_shuffle_output');

package/src/gpu/kernels/pixel_shuffle.wgsl CHANGED Viewed

@@ -19,17 +19,16 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let spatial_size = u.out_height * u.out_width;
-    let total = u.out_channels * spatial_size;
-    if (idx >= total) {
+    let spatial = gid.x;
+    let c = gid.y;
+    if (c >= u.out_channels || spatial >= spatial_size) {
         return;
     }
-    let c = idx / spatial_size;
-    let spatial = idx - c * spatial_size;
     let y = spatial / u.out_width;
     let x = spatial - y * u.out_width;
+    let idx = c * spatial_size + spatial;
     let grid_y = y / u.patch_size;
     let grid_x = x / u.patch_size;

package/src/gpu/kernels/pixel_shuffle_f16.wgsl CHANGED Viewed

@@ -22,17 +22,16 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let spatial_size = u.out_height * u.out_width;
-    let total = u.out_channels * spatial_size;
-    if (idx >= total) {
+    let spatial = gid.x;
+    let c = gid.y;
+    if (c >= u.out_channels || spatial >= spatial_size) {
         return;
     }
-    let c = idx / spatial_size;
-    let spatial = idx - c * spatial_size;
     let y = spatial / u.out_width;
     let x = spatial - y * u.out_width;
+    let idx = c * spatial_size + spatial;
     let grid_y = y / u.patch_size;
     let grid_x = x / u.patch_size;

package/src/gpu/kernels/relu.js CHANGED Viewed

@@ -18,19 +18,32 @@ function resolveCount(input, countOverride) {
   return Math.floor(input.buffer.size / dtypeBytes(input.dtype));
 }
+function planReluDispatch(target, size) {
+  const device = target?.device;
+  const maxPerDim = Number.isFinite(device?.limits?.maxComputeWorkgroupsPerDimension)
+    ? device.limits.maxComputeWorkgroupsPerDimension
+    : 65535;
+  const dispatchStride = Math.min(size, maxPerDim * WORKGROUP_SIZES.DEFAULT);
+  return {
+    dispatchStride,
+    workgroups: [Math.ceil(dispatchStride / WORKGROUP_SIZES.DEFAULT), 1, 1],
+  };
+}
 async function _relu(target, input, options = {}) {
   const { count = null, outputBuffer = null } = options;
   const size = resolveCount(input, count);
   const variant = selectReluVariant(input.dtype);
   const output = outputBuffer || acquireBuffer(size * dtypeBytes(input.dtype), undefined, 'relu_output');
+  const dispatchPlan = planReluDispatch(target, size);
   await unifiedKernelWrapper(
     'relu',
     target,
     variant,
     [input, output],
-    { size, _pad0: 0, _pad1: 0, _pad2: 0 },
-    Math.ceil(size / WORKGROUP_SIZES.DEFAULT)
+    { size, _pad0: dispatchPlan.dispatchStride, _pad1: 0, _pad2: 0 },
+    dispatchPlan.workgroups
   );
   return createTensor(output, input.dtype, [...input.shape], 'relu_output');

package/src/gpu/kernels/relu.wgsl CHANGED Viewed

@@ -13,7 +13,8 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
+    let dispatch_stride = max(u._pad0, 1u);
+    let idx = gid.y * dispatch_stride + gid.x;
     if (idx >= u.size) {
         return;
     }

package/src/gpu/kernels/relu_f16.wgsl CHANGED Viewed

@@ -15,7 +15,8 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
+    let dispatch_stride = max(u._pad0, 1u);
+    let idx = gid.y * dispatch_stride + gid.x;
     if (idx >= u.size) {
         return;
     }

package/src/gpu/kernels/repeat_channels.js CHANGED Viewed

@@ -45,7 +45,7 @@ async function _repeatChannels(target, input, options = {}) {
       repeats,
       _pad0: 0,
     },
-    Math.ceil((outChannels * height * width) / WORKGROUP_SIZES.DEFAULT)
+    [Math.ceil((height * width) / WORKGROUP_SIZES.DEFAULT), outChannels, 1]
   );
   return createTensor(output, input.dtype, [outChannels, height, width], 'repeat_channels_output');