@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +25 -6
  3. package/package.json +25 -38
  4. package/src/browser/browser-converter.js +5 -0
  5. package/src/client/doppler-api.browser.js +6 -0
  6. package/src/client/doppler-api.d.ts +3 -0
  7. package/src/client/doppler-api.js +11 -2
  8. package/src/client/doppler-registry.js +3 -5
  9. package/src/client/doppler-registry.json +2 -2
  10. package/src/config/kernel-path-loader.d.ts +5 -0
  11. package/src/config/kernel-path-loader.js +13 -0
  12. package/src/config/kernels/kernel-ref-digests.js +23 -21
  13. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  14. package/src/config/kernels/registry.json +74 -0
  15. package/src/config/loader.js +9 -0
  16. package/src/config/merge-contract-check.js +7 -0
  17. package/src/config/platforms/loader.js +3 -1
  18. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  19. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  20. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  21. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  22. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  23. package/src/config/presets/kernel-paths/registry.json +21 -0
  24. package/src/config/presets/models/gemma2.json +2 -1
  25. package/src/config/presets/models/gemma3.json +4 -1
  26. package/src/config/presets/models/gemma4.json +61 -0
  27. package/src/config/presets/models/granite-docling.json +70 -0
  28. package/src/config/presets/models/lfm2.json +6 -1
  29. package/src/config/presets/models/qwen3.json +4 -3
  30. package/src/config/presets/models/qwen3_5.json +16 -0
  31. package/src/config/presets/models/qwen3_vl.json +40 -0
  32. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  33. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  34. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  35. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  36. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  37. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  38. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  39. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  40. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  41. package/src/config/runtime.js +3 -0
  42. package/src/config/schema/conversion.schema.d.ts +1 -0
  43. package/src/config/schema/debug.schema.d.ts +40 -0
  44. package/src/config/schema/debug.schema.js +28 -0
  45. package/src/config/schema/index.js +2 -0
  46. package/src/config/schema/inference-defaults.schema.js +1 -1
  47. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  48. package/src/config/schema/manifest.schema.d.ts +1 -1
  49. package/src/config/schema/manifest.schema.js +1 -1
  50. package/src/config/schema/memory-limits.schema.js +2 -2
  51. package/src/config/schema/storage.schema.js +2 -2
  52. package/src/converter/conversion-plan.js +11 -3
  53. package/src/converter/core.js +19 -8
  54. package/src/converter/manifest-inference.js +12 -22
  55. package/src/converter/parsers/transformer.js +4 -0
  56. package/src/converter/quantization-info.js +5 -1
  57. package/src/converter/quantizer.d.ts +5 -0
  58. package/src/converter/quantizer.js +34 -12
  59. package/src/converter/rope-config.js +8 -6
  60. package/src/converter/tokenizer-utils.d.ts +1 -0
  61. package/src/converter/tokenizer-utils.js +4 -1
  62. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  63. package/src/distribution/shard-delivery.js +40 -1
  64. package/src/formats/rdrr/classification.js +32 -0
  65. package/src/formats/rdrr/parsing.d.ts +4 -0
  66. package/src/formats/rdrr/parsing.js +14 -1
  67. package/src/gpu/kernel-runtime.js +4 -2
  68. package/src/gpu/kernels/attention.js +2 -1
  69. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  70. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  71. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  72. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  73. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  74. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  75. package/src/gpu/kernels/gated-short-conv.js +284 -0
  76. package/src/gpu/kernels/index.d.ts +8 -0
  77. package/src/gpu/kernels/index.js +6 -0
  78. package/src/gpu/kernels/linear-attention-core.js +37 -17
  79. package/src/gpu/kernels/matmul-selection.js +48 -4
  80. package/src/gpu/kernels/matmul.d.ts +5 -0
  81. package/src/gpu/kernels/matmul.js +71 -2
  82. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  83. package/src/gpu/kernels/rmsnorm.js +9 -2
  84. package/src/gpu/kernels/sample.js +1 -3
  85. package/src/gpu/kernels/sample.wgsl +39 -9
  86. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  87. package/src/gpu/kernels/shader-cache.js +9 -4
  88. package/src/gpu/kernels/split_qg.d.ts +50 -0
  89. package/src/gpu/kernels/split_qg.js +46 -0
  90. package/src/gpu/kernels/split_qg.wgsl +58 -0
  91. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  92. package/src/gpu/weight-buffer.d.ts +1 -1
  93. package/src/gpu/weight-buffer.js +1 -1
  94. package/src/inference/browser-harness.d.ts +2 -0
  95. package/src/inference/browser-harness.js +20 -1
  96. package/src/inference/kv-cache/base.js +3 -10
  97. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  98. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  99. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
  100. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  101. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  102. package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
  103. package/src/inference/pipelines/text/attention/projections.js +54 -13
  104. package/src/inference/pipelines/text/attention/record.js +16 -6
  105. package/src/inference/pipelines/text/attention/run.js +59 -6
  106. package/src/inference/pipelines/text/config.d.ts +1 -0
  107. package/src/inference/pipelines/text/config.js +46 -4
  108. package/src/inference/pipelines/text/embed.js +26 -7
  109. package/src/inference/pipelines/text/execution-plan.js +5 -4
  110. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  111. package/src/inference/pipelines/text/execution-v0.js +12 -1
  112. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  113. package/src/inference/pipelines/text/generator-runtime.js +19 -0
  114. package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
  115. package/src/inference/pipelines/text/generator-steps.js +71 -26
  116. package/src/inference/pipelines/text/generator.d.ts +5 -0
  117. package/src/inference/pipelines/text/generator.js +353 -166
  118. package/src/inference/pipelines/text/init.d.ts +15 -0
  119. package/src/inference/pipelines/text/init.js +35 -10
  120. package/src/inference/pipelines/text/layer.js +38 -8
  121. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  122. package/src/inference/pipelines/text/linear-attention.js +33 -3
  123. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  124. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  125. package/src/inference/pipelines/text/logits/index.js +3 -1
  126. package/src/inference/pipelines/text/model-load.js +3 -0
  127. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  128. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  129. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  130. package/src/inference/pipelines/text/ops.js +123 -53
  131. package/src/inference/pipelines/text/probes.js +1 -0
  132. package/src/inference/pipelines/text/sampling.js +52 -6
  133. package/src/inference/pipelines/text/state.js +2 -0
  134. package/src/inference/pipelines/text.d.ts +5 -0
  135. package/src/inference/pipelines/text.js +59 -1
  136. package/src/inference/pipelines/vision/encoder.js +386 -0
  137. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  138. package/src/inference/pipelines/vision/index.js +173 -0
  139. package/src/inference/pipelines/vision/ops.js +78 -0
  140. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  141. package/src/inference/test-harness.js +11 -9
  142. package/src/loader/doppler-loader.d.ts +3 -0
  143. package/src/loader/doppler-loader.js +20 -3
  144. package/src/loader/experts/expert-cache.js +6 -2
  145. package/src/loader/experts/expert-loader.js +6 -2
  146. package/src/loader/final-weights-loader.js +2 -0
  147. package/src/loader/layer-loader.js +42 -3
  148. package/src/loader/manifest-config.js +3 -1
  149. package/src/loader/shard-cache.js +3 -2
  150. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  151. package/src/loader/tensors/tensor-loader.js +130 -4
  152. package/src/rules/inference/dtype.rules.json +5 -0
  153. package/src/rules/inference/kernel-path.rules.json +2 -2
  154. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  155. package/src/rules/kernels/softmax.rules.json +2 -0
  156. package/src/rules/kernels/split-qg.rules.json +6 -0
  157. package/src/rules/rule-registry.d.ts +1 -0
  158. package/src/rules/rule-registry.js +4 -0
  159. package/src/storage/downloader.js +2 -1
  160. package/src/storage/quickstart-downloader.d.ts +3 -0
  161. package/src/storage/quickstart-downloader.js +27 -30
  162. package/src/storage/shard-manager.js +4 -3
  163. package/src/tooling/conversion-config-materializer.js +3 -5
  164. package/src/tooling/node-converter.js +28 -7
  165. package/src/tooling/node-source-runtime.js +65 -5
  166. package/src/tooling/node-webgpu.js +24 -7
  167. package/src/types/model.d.ts +5 -0
  168. package/src/utils/hf-resolve-url.d.ts +16 -0
  169. package/src/utils/hf-resolve-url.js +17 -0
  170. package/src/version.js +1 -1
  171. package/tools/doppler-cli.js +6 -1
  172. package/src/tooling/node-convert.d.ts +0 -54
@@ -10,7 +10,7 @@ import {
10
10
  import { recordDispatch } from './dispatch.js';
11
11
 
12
12
  const CONV_WORKGROUP_SIZE = WORKGROUP_SIZES.DEFAULT;
13
- const HEAD_WORKGROUP_SIZE = 64;
13
+ const HEAD_WORKGROUP_SIZE = 128;
14
14
 
15
15
  const CONV_SHADER = /* wgsl */ `
16
16
  override WORKGROUP_SIZE: u32 = 256u;
@@ -79,7 +79,7 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
79
79
  `;
80
80
 
81
81
  const RECURRENT_SHADER = /* wgsl */ `
82
- override WORKGROUP_SIZE: u32 = 64u;
82
+ override WORKGROUP_SIZE: u32 = 128u;
83
83
 
84
84
  struct LinearAttentionParams {
85
85
  num_tokens: u32,
@@ -111,6 +111,8 @@ struct LinearAttentionParams {
111
111
  @group(0) @binding(8) var<storage, read_write> recurrent_state: array<f32>;
112
112
  @group(0) @binding(9) var<storage, read_write> output: array<f32>;
113
113
 
114
+ var<workgroup> shared_sq: array<f32, WORKGROUP_SIZE>;
115
+
114
116
  fn softplus(x: f32) -> f32 {
115
117
  if (x > 20.0) {
116
118
  return x;
@@ -131,17 +133,19 @@ fn silu(x: f32) -> f32 {
131
133
  }
132
134
 
133
135
  @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
134
- fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
135
- let head = gid.x;
136
+ fn main(@builtin(workgroup_id) wid: vec3<u32>,
137
+ @builtin(local_invocation_id) lid: vec3<u32>) {
138
+ let head = wid.x;
139
+ let vd = lid.x;
136
140
  if (head >= params.num_v_heads) {
137
141
  return;
138
142
  }
139
143
 
140
144
  let head_k_dim = params.head_k_dim;
141
145
  let head_v_dim = params.head_v_dim;
146
+ let is_active = vd < head_v_dim;
142
147
  let head_scale = inverseSqrt(f32(head_k_dim));
143
148
  let recurrent_head_base = head * head_k_dim * head_v_dim;
144
- let recurrent_head_size = head_k_dim * head_v_dim;
145
149
  let q_rep = max(params.q_rep, 1u);
146
150
  let src_head = head / q_rep;
147
151
  let q_base = src_head * head_k_dim;
@@ -154,6 +158,7 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
154
158
  let ab_row_base = token_idx * params.num_v_heads + head;
155
159
  let out_row_base = token_idx * params.value_dim + head * head_v_dim;
156
160
 
161
+ // L2 norm for Q and K (redundant across threads but avoids shared memory)
157
162
  var q_norm_sq = 0.0;
158
163
  var k_norm_sq = 0.0;
159
164
  for (var d: u32 = 0u; d < head_k_dim; d = d + 1u) {
@@ -169,11 +174,16 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
169
174
  let g = a_neg_exp[head] * softplus(a_proj[ab_row_base] + dt_bias[head]);
170
175
  let g_exp = exp(g);
171
176
 
172
- for (var i: u32 = 0u; i < recurrent_head_size; i = i + 1u) {
173
- recurrent_state[recurrent_head_base + i] = recurrent_state[recurrent_head_base + i] * g_exp;
177
+ // Decay state each thread handles head_k_dim elements at stride head_v_dim
178
+ if (is_active) {
179
+ for (var kd: u32 = 0u; kd < head_k_dim; kd = kd + 1u) {
180
+ let state_idx = recurrent_head_base + kd * head_v_dim + vd;
181
+ recurrent_state[state_idx] = recurrent_state[state_idx] * g_exp;
182
+ }
174
183
  }
175
184
 
176
- for (var vd: u32 = 0u; vd < head_v_dim; vd = vd + 1u) {
185
+ // Delta update each thread handles one vd slice (no cross-thread dependency)
186
+ if (is_active) {
177
187
  var kv_mem = 0.0;
178
188
  for (var kd: u32 = 0u; kd < head_k_dim; kd = kd + 1u) {
179
189
  let k_normed = conv_out[conv_row_base + k_base + kd] * k_norm_scale;
@@ -188,21 +198,31 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
188
198
  }
189
199
  }
190
200
 
191
- var mean_sq = 0.0;
192
- for (var vd: u32 = 0u; vd < head_v_dim; vd = vd + 1u) {
193
- var out_value = 0.0;
201
+ // Output each thread computes one vd element
202
+ var out_value = 0.0;
203
+ if (is_active) {
194
204
  for (var kd: u32 = 0u; kd < head_k_dim; kd = kd + 1u) {
195
205
  let q_normed = conv_out[conv_row_base + q_base + kd] * q_norm_scale;
196
206
  let state_idx = recurrent_head_base + kd * head_v_dim + vd;
197
207
  out_value = out_value + recurrent_state[state_idx] * q_normed;
198
208
  }
199
209
  output[out_row_base + vd] = out_value;
200
- let value = out_value;
201
- mean_sq = mean_sq + value * value;
202
210
  }
203
- let inv_rms = inverseSqrt(mean_sq / f32(head_v_dim) + params.rms_norm_eps);
204
211
 
205
- for (var vd: u32 = 0u; vd < head_v_dim; vd = vd + 1u) {
212
+ // RMS norm reduction across vd (workgroup-level)
213
+ shared_sq[vd] = select(0.0, out_value * out_value, is_active);
214
+ workgroupBarrier();
215
+ // Tree reduction
216
+ for (var stride: u32 = WORKGROUP_SIZE / 2u; stride > 0u; stride = stride / 2u) {
217
+ if (vd < stride) {
218
+ shared_sq[vd] = shared_sq[vd] + shared_sq[vd + stride];
219
+ }
220
+ workgroupBarrier();
221
+ }
222
+ let inv_rms = inverseSqrt(shared_sq[0] / f32(head_v_dim) + params.rms_norm_eps);
223
+
224
+ // Apply norm + gate
225
+ if (is_active) {
206
226
  let gate = silu(z_proj[z_row_base + vd]);
207
227
  let norm_index = select(vd, head * head_v_dim + vd, params.norm_mode == 1u);
208
228
  output[out_row_base + vd] = (output[out_row_base + vd] * inv_rms) * norm_weight[norm_index] * gate;
@@ -415,7 +435,7 @@ export async function runLinearAttentionCoreGPU(qkvTensor, zTensor, aTensor, bTe
415
435
  recorder,
416
436
  recurrentPipeline,
417
437
  recurrentBindGroup,
418
- [Math.ceil(layerState.numVHeads / HEAD_WORKGROUP_SIZE), 1, 1],
438
+ [layerState.numVHeads, 1, 1],
419
439
  'linear_attention_recurrent'
420
440
  );
421
441
 
@@ -502,7 +522,7 @@ export async function runLinearAttentionCoreGPU(qkvTensor, zTensor, aTensor, bTe
502
522
  const pass = encoder.beginComputePass({ label: 'linear_attention_recurrent_pass' });
503
523
  pass.setPipeline(recurrentPipeline);
504
524
  pass.setBindGroup(0, recurrentBindGroup);
505
- pass.dispatchWorkgroups(Math.ceil(layerState.numVHeads / HEAD_WORKGROUP_SIZE), 1, 1);
525
+ pass.dispatchWorkgroups(layerState.numVHeads, 1, 1);
506
526
  pass.end();
507
527
  }
508
528
 
@@ -29,7 +29,13 @@ function selectQ4KFusedVariant(isM1, wantF16Output, aDtype) {
29
29
  }
30
30
 
31
31
 
32
- export function resolveMatmulPhase(M) {
32
+ export function resolveMatmulPhase(M, phaseOverride = null) {
33
+ if (phaseOverride != null) {
34
+ if (phaseOverride !== 'decode' && phaseOverride !== 'prefill') {
35
+ throw new Error(`[Matmul] Invalid phase override "${phaseOverride}". Expected "decode" or "prefill".`);
36
+ }
37
+ return phaseOverride;
38
+ }
33
39
  return selectKernelRuleValue('matmul', 'phase', { isDecode: M === 1 });
34
40
  }
35
41
 
@@ -86,6 +92,7 @@ export function getMatmulConfig(variant, constants) {
86
92
 
87
93
 
88
94
  export function isFusedQ4KDisabled(options = {}) {
95
+ if (options.disableFusedQ4K === true) return true;
89
96
  const capabilities = getKernelCapabilities();
90
97
  const hasSubgroups = capabilities?.hasSubgroups === true;
91
98
 
@@ -125,7 +132,9 @@ export function selectMatmulKernel(options = {}) {
125
132
  const { tiledPrefillMinRows } = getKernelThresholds().matmul;
126
133
 
127
134
  const inputsAreF16 = aDtype === 'f16' && bDtype === 'f16';
128
- const weightsAreF16 = bDtype === 'f16' && aDtype !== 'f16';
135
+ // F16 weights needing F32a path: weights are F16 and either activation is already F32,
136
+ // or both inputs are F16 but output is F32 (activation will be cast to F32 by executeMatmul)
137
+ const weightsAreF16 = bDtype === 'f16' && (aDtype !== 'f16' || outputDtype !== 'f16');
129
138
  const useF16Matmul = outputDtype === 'f16' && preferF16 && inputsAreF16 && capabilities.hasF16;
130
139
  const useF16wF32a = preferF16 && weightsAreF16 && capabilities.hasF16;
131
140
  const useTiled = isPrefill
@@ -244,6 +253,30 @@ export function requiresF32Input(variant) {
244
253
  return !supportsF16Input(variant);
245
254
  }
246
255
 
256
+ function resolveRequiredWeightDtype(config) {
257
+ const shaderFile = String(config?.shaderFile ?? config?.wgsl ?? '');
258
+ if (!shaderFile) {
259
+ return null;
260
+ }
261
+ if (shaderFile.startsWith('fused_matmul_q4')) {
262
+ return 'q4k';
263
+ }
264
+ if (
265
+ shaderFile === 'matmul_f16.wgsl'
266
+ || shaderFile === 'matmul_f16_tiled.wgsl'
267
+ || shaderFile === 'matmul_f16w_f32a.wgsl'
268
+ || shaderFile === 'matmul_f16w_f32a_tiled.wgsl'
269
+ || shaderFile === 'matmul_gemv_subgroup.wgsl'
270
+ || shaderFile === 'matmul_gemv_subgroup_f16a.wgsl'
271
+ ) {
272
+ return 'f16';
273
+ }
274
+ if (shaderFile === 'matmul_f32.wgsl') {
275
+ return 'f32';
276
+ }
277
+ return null;
278
+ }
279
+
247
280
 
248
281
  function resolveMatmulOverride(
249
282
  variantOverride,
@@ -287,6 +320,16 @@ function resolveMatmulOverride(
287
320
  );
288
321
  }
289
322
 
323
+ const requiredWeightDtype = resolveRequiredWeightDtype(config);
324
+ const weightDtypeOk = !requiredWeightDtype
325
+ || bDtype === requiredWeightDtype
326
+ || (requiredWeightDtype === 'f16' && bDtype === 'q4k');
327
+ if (!weightDtypeOk) {
328
+ return failOrWarn(
329
+ `Matmul kernel "${variantOverride}" requires ${requiredWeightDtype} weights but B dtype is ${bDtype}.`
330
+ );
331
+ }
332
+
290
333
  if (supportsF16Input(override) && aDtype !== 'f16') {
291
334
  return failOrWarn(`Matmul kernel "${variantOverride}" requires f16 activations but A dtype is ${aDtype}.`);
292
335
  }
@@ -341,7 +384,7 @@ function selectGemvVariant(useF16Gemv, useF32Gemv, hasSubgroups, useVec4, N, mul
341
384
  export function selectMatmulVariantAndFlags(mode, M, N, K, aDtype, bDtype, transposeB, requestedOutputDtype, options) {
342
385
  const capabilities = getKernelCapabilities();
343
386
  const strict = getKernelPathStrict();
344
- const phase = resolveMatmulPhase(M);
387
+ const phase = resolveMatmulPhase(M, options.phaseOverride ?? null);
345
388
  let pathVariant = getKernelPathMatmulVariant(options.role, phase, options.layerIdx, options.kernelPath);
346
389
  const hadPathVariant = Boolean(pathVariant);
347
390
 
@@ -426,7 +469,8 @@ export function selectMatmulVariantAndFlags(mode, M, N, K, aDtype, bDtype, trans
426
469
 
427
470
  const canGemv = M === 1 && effectiveBDtype === 'f16' && capabilities.hasF16;
428
471
  const useF16Gemv = canGemv && aDtype === 'f16' && wantF16Output;
429
- const useF32Gemv = canGemv && aDtype === 'f32';
472
+ // F32 GEMV: activation is F32, or activation is F16 with F32 output (will be cast to F32)
473
+ const useF32Gemv = canGemv && (aDtype === 'f32' || (aDtype === 'f16' && !wantF16Output));
430
474
  const useGemv = useF16Gemv || useF32Gemv;
431
475
  const useVec4 = (K % 4 === 0);
432
476
  const { multicolThreshold } = getKernelThresholds().matmul;
@@ -13,6 +13,7 @@ import type { WeightBuffer } from '../weight-buffer.js';
13
13
  import type { CommandRecorder } from '../command-recorder.js';
14
14
  import type { OutputBufferOptions, OutputDtypeOptions, Vec4Options } from './types.js';
15
15
  import type { KernelPathSchema } from '../../config/schema/index.js';
16
+ import type { MatmulDebugConfigSchema } from '../../config/schema/debug.schema.js';
16
17
 
17
18
  /** Matmul kernel options */
18
19
  export interface MatmulOptions extends OutputBufferOptions, OutputDtypeOptions, Vec4Options {
@@ -23,6 +24,8 @@ export interface MatmulOptions extends OutputBufferOptions, OutputDtypeOptions,
23
24
  layerIdx?: number;
24
25
  /** Explicit kernel path context for variant selection (avoids global path state). */
25
26
  kernelPath?: KernelPathSchema | null;
27
+ /** Optional explicit phase for kernel-path lookup when the runtime rewrites rows (for example prefill last-position logits). */
28
+ phaseOverride?: 'decode' | 'prefill' | null;
26
29
  /**
27
30
  * Whether B matrix is stored transposed.
28
31
  * - true: B is [N,K] (SafeTensors/row-major), needs transpose
@@ -38,6 +41,8 @@ export interface MatmulOptions extends OutputBufferOptions, OutputDtypeOptions,
38
41
  preferF16?: boolean;
39
42
  /** WGSL override constants for pipeline creation */
40
43
  constants?: Record<string, number | boolean>;
44
+ /** Runtime debug controls for attention projection diagnostics. */
45
+ matmulDebug?: MatmulDebugConfigSchema | null;
41
46
  }
42
47
 
43
48
  /** Context for base matmul kernel selection rules. */
@@ -2,7 +2,7 @@ import { getDevice, getKernelCapabilities } from '../device.js';
2
2
  import { createTensor } from '../tensor.js';
3
3
  import { getBuffer, getLayout, getWeightDtype } from '../weight-buffer.js';
4
4
  import { log, trace, isTraceEnabled } from '../../debug/index.js';
5
- import { releaseBuffer } from '../../memory/buffer-pool.js';
5
+ import { releaseBuffer, readBuffer } from '../../memory/buffer-pool.js';
6
6
  import { releaseUniformBuffer } from '../uniform-cache.js';
7
7
  import { castF16ToF32, recordCastF16ToF32 } from './cast.js';
8
8
  import {
@@ -34,6 +34,24 @@ export { createMatmulBindGroupLayout };
34
34
  let _runMatmulDebugCount = 0;
35
35
  let _recordMatmulDebugCount = 0;
36
36
 
37
+ function normalizeMatmulDebugConfig(config) {
38
+ if (!config || typeof config !== 'object') {
39
+ return null;
40
+ }
41
+ return {
42
+ enabled: config.enabled === true,
43
+ forceSplitQKV: config.forceSplitQKV === true,
44
+ validateAttentionWeightBuffer: config.validateAttentionWeightBuffer === true,
45
+ failOnSmallAttentionWeightBuffer: config.failOnSmallAttentionWeightBuffer === true,
46
+ logAttentionWeightBuffer: config.logAttentionWeightBuffer === true,
47
+ logProjectionValues: config.logProjectionValues === true,
48
+ };
49
+ }
50
+
51
+ function isAttentionProjectionRole(role = '') {
52
+ return role === 'qkv_proj' || role === 'q_proj' || role === 'k_proj' || role === 'v_proj';
53
+ }
54
+
37
55
  function getDebugCounter(isRecord) {
38
56
  return isRecord ? _recordMatmulDebugCount : _runMatmulDebugCount;
39
57
  }
@@ -126,6 +144,12 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
126
144
  const weightLabel = (B && typeof B === 'object' ? B.label : null) ?? bBuffer?.label ?? null;
127
145
  const weightLayout = getLayout(B);
128
146
  const weightShape = B?.shape ? `[${B.shape.join(', ')}]` : null;
147
+ const matmulDebug = normalizeMatmulDebugConfig(options.matmulDebug);
148
+ const debugAttention = matmulDebug?.enabled === true;
149
+ const isAttnProj = isAttentionProjectionRole(options.role ?? '');
150
+ const shouldValidateAttentionWeightBuffer = debugAttention && matmulDebug.validateAttentionWeightBuffer;
151
+ const shouldFailOnSmallAttentionWeightBuffer = debugAttention && matmulDebug.failOnSmallAttentionWeightBuffer;
152
+ const shouldLogAttentionWeightBuffer = debugAttention && matmulDebug.logAttentionWeightBuffer;
129
153
 
130
154
  if (isTraceEnabled('kernels') && getDebugCounter(isRecord) < 20) {
131
155
  incrementDebugCounter(isRecord);
@@ -165,7 +189,7 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
165
189
  options
166
190
  );
167
191
 
168
- const phase = resolveMatmulPhase(M);
192
+ const phase = resolveMatmulPhase(M, options.phaseOverride ?? null);
169
193
  const constants = resolveMatmulConstants(options, phase);
170
194
 
171
195
  let matmulInput = A;
@@ -201,6 +225,27 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
201
225
  bOffset
202
226
  );
203
227
  } catch (err) {
228
+ if (shouldValidateAttentionWeightBuffer && isAttnProj && err instanceof Error && err.message.includes('B buffer too small')) {
229
+ const detailParts = [
230
+ `role=${options.role ?? ''}`,
231
+ `layer=${Number.isFinite(options.layerIdx) ? options.layerIdx : '?'}`,
232
+ `M=${M}`,
233
+ `N=${N}`,
234
+ `K=${K}`,
235
+ ];
236
+ if (weightDtype) detailParts.push(`weightDtype=${weightDtype}`);
237
+ if (weightLayout) detailParts.push(`weightLayout=${weightLayout}`);
238
+ if (weightShape) detailParts.push(`shape=${weightShape}`);
239
+ if (weightLabel) detailParts.push(`label=${weightLabel}`);
240
+ if (Number.isFinite(bBuffer?.size)) detailParts.push(`bSize=${bBuffer.size}`);
241
+ const detail = detailParts.join(' ');
242
+ if (shouldLogAttentionWeightBuffer) {
243
+ log.warn('MatmulQKVProbe', `${err.message} | ${detail}`);
244
+ }
245
+ if (shouldFailOnSmallAttentionWeightBuffer) {
246
+ throw new Error(`${err.message}${detail ? ` (${detail})` : ''}`);
247
+ }
248
+ }
204
249
  if (!isRecord && err instanceof Error && err.message.includes('B buffer too small')) {
205
250
  const detailParts = [];
206
251
  if (weightLabel) detailParts.push(`label=${weightLabel}`);
@@ -226,6 +271,15 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
226
271
  trace.kernels(`MATMUL_LARGE: N=${N}, variant=${variant}, aDtype=${aDtype}, bDtype=${bDtype}, transposeB=${transposeB}`);
227
272
  }
228
273
 
274
+ if (isAttnProj && shouldLogAttentionWeightBuffer) {
275
+ log.warn('MatmulQKVProbe',
276
+ `role=${options.role ?? ''} layer=${Number.isFinite(options.layerIdx) ? options.layerIdx : '?'} ` +
277
+ `M=${M} N=${N} K=${K} transposeB=${transposeB} bSize=${bBuffer?.size ?? 0} ` +
278
+ `requiredB=${bindingSizes?.bBindingSize ?? 'n/a'} weightShape=${weightShape ?? 'n/a'} ` +
279
+ `weightDtype=${weightDtype ?? 'unknown'} weightLayout=${weightLayout ?? 'unknown'}`
280
+ );
281
+ }
282
+
229
283
  const config = getMatmulConfig(variant, constants);
230
284
  const kernel = new MatmulKernel(device);
231
285
  const pipeline = await getMatmulPipeline(variant, constants);
@@ -238,6 +292,14 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
238
292
  );
239
293
  const ownsOutput = outputBuffer == null;
240
294
 
295
+ if (isAttnProj && shouldLogAttentionWeightBuffer) {
296
+ log.warn('MatmulVariantDiag',
297
+ `role=${options.role ?? ''} layer=${Number.isFinite(options.layerIdx) ? options.layerIdx : '?'} mode=${mode} ` +
298
+ `variant=${variant} useQ4KFused=${useQ4KFused} useGemv=${useGemv} ` +
299
+ `aDtype=${aDtype} bDtype=${bDtype} output=${actualOutputDtype}`
300
+ );
301
+ }
302
+
241
303
  if (!Number.isFinite(outputSize) || outputSize <= 0) {
242
304
  throw new Error(`[${opLabel}] Invalid output size: ${outputSize} (M=${M}, N=${N})`);
243
305
  }
@@ -290,6 +352,13 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
290
352
  kernel.dispatch(pipeline, bindGroup, dispatchPlan.workgroups);
291
353
  }
292
354
  completed = true;
355
+ if (!isRecord && matmulDebug?.logProjectionValues && isAttnProj && M === 1 && options.layerIdx === 0) {
356
+ await device.queue.onSubmittedWorkDone();
357
+ const raw = await readBuffer(C);
358
+ const numVals = Math.min(8, Math.floor(raw.byteLength / 4));
359
+ const vals = numVals > 0 ? new Float32Array(raw, 0, numVals) : [];
360
+ log.warn('ProjectionProbe', `role=${options.role ?? ''} L0 M1 first8_f32: ${Array.from(vals).map(v => v.toFixed(5)).join(' ')}`);
361
+ }
293
362
  return createTensor(C, actualOutputDtype, [M, N], 'matmul_output');
294
363
  } finally {
295
364
  if (!isRecord && uniformBuffer) {
@@ -5,7 +5,11 @@
5
5
  // 1. Use subgroupAdd() for reduction - much faster than shared memory
6
6
  // 2. Vectorized vec4 loads for weights
7
7
  // 3. Each workgroup processes multiple output columns
8
- // 4. Loop unrolling for better ILP
8
+ // 4. Warp-stride loop for row-major (transpose_b=1): all threads in a column
9
+ // step through K together, so adjacent threads load adjacent addresses.
10
+ // At each step, 64 threads × 8 bytes = 512 bytes from 4 consecutive cache
11
+ // lines → 100% cache-line utilization vs ~10% for the old contiguous-range
12
+ // pattern (where threads were 80 bytes apart in the same iteration).
9
13
  //
10
14
  // A is f32 (activations), B is f16 (weights), C is f32.
11
15
  // transpose_b=0: B is [K, N] (GGUF/column-major), access B[k * N + col]
@@ -69,40 +73,29 @@ fn main(
69
73
  // Each thread computes partial sum for its assigned k values
70
74
  var partial_sum: f32 = 0.0;
71
75
 
72
- // Only do work if this column is valid
73
76
  if (is_valid) {
74
- // Process K in chunks, each thread handles K/64 elements
75
- let k_per_thread = (u.K + THREADS_PER_COL - 1u) / THREADS_PER_COL;
76
- let k_start = thread_in_col * k_per_thread;
77
- let k_end = min(k_start + k_per_thread, u.K);
78
-
79
- // Main loop - process 4 elements at a time when aligned
80
- var k = k_start;
81
- let k_aligned_end = k_start + ((k_end - k_start) / 4u) * 4u;
82
-
83
77
  if (u.transpose_b == 1u) {
84
- // B is [N, K] (SafeTensors/row-major): B[col, k] = B[col * K + k]
78
+ // B is [N, K] (row-major): B[col, k] = B[col * K + k]
79
+ // Warp-stride: step THREADS_PER_COL elements per outer iteration so that
80
+ // all wavefront threads load consecutive addresses simultaneously.
81
+ // At each step, 64 threads × 2 bytes = 128 bytes = exactly 1 cache line → 100% utilization.
85
82
  let b_row_offset = col * u.K;
86
-
87
- for (; k < k_aligned_end; k = k + 4u) {
88
- let a0 = A[k];
89
- let a1 = A[k + 1u];
90
- let a2 = A[k + 2u];
91
- let a3 = A[k + 3u];
92
-
93
- let b0 = f32(B[b_row_offset + k]);
94
- let b1 = f32(B[b_row_offset + k + 1u]);
95
- let b2 = f32(B[b_row_offset + k + 2u]);
96
- let b3 = f32(B[b_row_offset + k + 3u]);
97
-
98
- partial_sum = partial_sum + a0 * b0 + a1 * b1 + a2 * b2 + a3 * b3;
99
- }
100
-
101
- for (; k < k_end; k = k + 1u) {
102
- partial_sum = partial_sum + A[k] * f32(B[b_row_offset + k]);
83
+ for (var k_base: u32 = 0u; k_base < u.K; k_base = k_base + THREADS_PER_COL) {
84
+ let k = k_base + thread_in_col;
85
+ if (k < u.K) {
86
+ partial_sum = partial_sum + A[k] * f32(B[b_row_offset + k]);
87
+ }
103
88
  }
104
89
  } else {
105
- // B is [K, N] (GGUF/column-major): B[k, col] = B[k * N + col]
90
+ // B is [K, N] (column-major): B[k, col] = B[k * N + col]
91
+ // Contiguous-range per thread: sequential access within each thread.
92
+ let k_per_thread = (u.K + THREADS_PER_COL - 1u) / THREADS_PER_COL;
93
+ let k_start = thread_in_col * k_per_thread;
94
+ let k_end = min(k_start + k_per_thread, u.K);
95
+
96
+ var k = k_start;
97
+ let k_aligned_end = k_start + ((k_end - k_start) / 4u) * 4u;
98
+
106
99
  for (; k < k_aligned_end; k = k + 4u) {
107
100
  let a0 = A[k];
108
101
  let a1 = A[k + 1u];
@@ -189,38 +182,36 @@ fn main_multicol(
189
182
  var partial_sum: f32 = 0.0;
190
183
 
191
184
  if (is_valid) {
192
- // Each of 8 threads splits K
193
- let k_per_thread = (u.K + MULTICOL_THREADS_PER_COL - 1u) / MULTICOL_THREADS_PER_COL;
194
- let k_start = thread_in_col * k_per_thread;
195
- let k_end = min(k_start + k_per_thread, u.K);
196
-
197
- // Unroll by 4 for ILP
198
- var k = k_start;
199
- let k_aligned_end = k_start + ((k_end - k_start) / 4u) * 4u;
200
-
201
185
  if (u.transpose_b == 1u) {
202
- // B is [N, K] (SafeTensors/row-major): B[col, k] = B[col * K + k]
186
+ // B is [N, K] (row-major): B[col, k] = B[col * K + k]
187
+ // Warp-stride: step MULTICOL_THREADS_PER_COL vec4 groups per outer iteration.
188
+ // Adjacent threads in the same column load adjacent vec4 groups → coalesced.
189
+ let K4 = u.K / 4u;
203
190
  let b_row_offset = col * u.K;
204
-
205
- for (; k < k_aligned_end; k = k + 4u) {
206
- let a0 = A[k];
207
- let a1 = A[k + 1u];
208
- let a2 = A[k + 2u];
209
- let a3 = A[k + 3u];
210
-
211
- let b0 = f32(B[b_row_offset + k]);
212
- let b1 = f32(B[b_row_offset + k + 1u]);
213
- let b2 = f32(B[b_row_offset + k + 2u]);
214
- let b3 = f32(B[b_row_offset + k + 3u]);
215
-
216
- partial_sum = partial_sum + a0 * b0 + a1 * b1 + a2 * b2 + a3 * b3;
217
- }
218
-
219
- for (; k < k_end; k = k + 1u) {
220
- partial_sum = partial_sum + A[k] * f32(B[b_row_offset + k]);
191
+ for (var k4_base: u32 = 0u; k4_base < K4; k4_base = k4_base + MULTICOL_THREADS_PER_COL) {
192
+ let k4 = k4_base + thread_in_col;
193
+ if (k4 < K4) {
194
+ let k = k4 * 4u;
195
+ let a0 = A[k];
196
+ let a1 = A[k + 1u];
197
+ let a2 = A[k + 2u];
198
+ let a3 = A[k + 3u];
199
+ let b0 = f32(B[b_row_offset + k]);
200
+ let b1 = f32(B[b_row_offset + k + 1u]);
201
+ let b2 = f32(B[b_row_offset + k + 2u]);
202
+ let b3 = f32(B[b_row_offset + k + 3u]);
203
+ partial_sum = partial_sum + a0 * b0 + a1 * b1 + a2 * b2 + a3 * b3;
204
+ }
221
205
  }
222
206
  } else {
223
- // B is [K, N] (GGUF/column-major): B[k, col] = B[k * N + col]
207
+ // B is [K, N] (column-major): B[k, col] = B[k * N + col]
208
+ let k_per_thread = (u.K + MULTICOL_THREADS_PER_COL - 1u) / MULTICOL_THREADS_PER_COL;
209
+ let k_start = thread_in_col * k_per_thread;
210
+ let k_end = min(k_start + k_per_thread, u.K);
211
+
212
+ var k = k_start;
213
+ let k_aligned_end = k_start + ((k_end - k_start) / 4u) * 4u;
214
+
224
215
  for (; k < k_aligned_end; k = k + 4u) {
225
216
  let a0 = A[k];
226
217
  let a1 = A[k + 1u];
@@ -245,7 +236,7 @@ fn main_multicol(
245
236
  multicol_wg_sums[local_id] = partial_sum;
246
237
  workgroupBarrier();
247
238
 
248
- // Thread 0 of each column reduces its 8 values
239
+ // Thread 0 of each column reduces its MULTICOL_THREADS_PER_COL values
249
240
  if (thread_in_col == 0u && is_valid) {
250
241
  var final_sum: f32 = 0.0;
251
242
  let base = col_in_wg * MULTICOL_THREADS_PER_COL;
@@ -282,30 +273,37 @@ fn main_vec4(
282
273
  if (is_valid) {
283
274
  // K is guaranteed to be multiple of 4
284
275
  let K4 = u.K / 4u;
285
- let k4_per_thread = (K4 + THREADS_PER_COL - 1u) / THREADS_PER_COL;
286
- let k4_start = thread_in_col * k4_per_thread;
287
- let k4_end = min(k4_start + k4_per_thread, K4);
288
276
 
289
277
  if (u.transpose_b == 1u) {
290
- // B is [N, K] (SafeTensors/row-major): B[col, k] = B[col * K + k]
278
+ // B is [N, K] (row-major): B[col, k] = B[col * K + k]
279
+ // Warp-stride: step THREADS_PER_COL vec4 groups per outer iteration so that
280
+ // adjacent threads load adjacent groups → 100% cache-line utilization.
281
+ // At each step: 64 threads × 4 f16 × 2 bytes = 512 bytes from 4 consecutive
282
+ // cache lines, vs the old contiguous-range pattern (~10% utilization).
291
283
  let b_row_offset = col * u.K;
292
-
293
- for (var k4: u32 = k4_start; k4 < k4_end; k4 = k4 + 1u) {
294
- let k = k4 * 4u;
295
-
296
- let a = vec4<f32>(A[k], A[k + 1u], A[k + 2u], A[k + 3u]);
297
-
298
- let b = vec4<f32>(
299
- f32(B[b_row_offset + k]),
300
- f32(B[b_row_offset + k + 1u]),
301
- f32(B[b_row_offset + k + 2u]),
302
- f32(B[b_row_offset + k + 3u])
303
- );
304
-
305
- partial_sum = partial_sum + dot(a, b);
284
+ for (var k4_base: u32 = 0u; k4_base < K4; k4_base = k4_base + THREADS_PER_COL) {
285
+ let k4 = k4_base + thread_in_col;
286
+ if (k4 < K4) {
287
+ let k = k4 * 4u;
288
+
289
+ let a = vec4<f32>(A[k], A[k + 1u], A[k + 2u], A[k + 3u]);
290
+
291
+ let b = vec4<f32>(
292
+ f32(B[b_row_offset + k]),
293
+ f32(B[b_row_offset + k + 1u]),
294
+ f32(B[b_row_offset + k + 2u]),
295
+ f32(B[b_row_offset + k + 3u])
296
+ );
297
+
298
+ partial_sum = partial_sum + dot(a, b);
299
+ }
306
300
  }
307
301
  } else {
308
- // B is [K, N] (GGUF/column-major): B[k, col] = B[k * N + col]
302
+ // B is [K, N] (column-major): B[k, col] = B[k * N + col]
303
+ // Contiguous-range per thread: sequential access within each thread.
304
+ let k4_per_thread = (K4 + THREADS_PER_COL - 1u) / THREADS_PER_COL;
305
+ let k4_start = thread_in_col * k4_per_thread;
306
+ let k4_end = min(k4_start + k4_per_thread, K4);
309
307
  for (var k4: u32 = k4_start; k4 < k4_end; k4 = k4 + 1u) {
310
308
  let k = k4 * 4u;
311
309
 
@@ -342,4 +340,4 @@ fn main_vec4(
342
340
  }
343
341
  C[col] = final_sum * u.alpha;
344
342
  }
345
- }
343
+ }
@@ -9,6 +9,9 @@ import { selectRuleValue as selectLoaderRule } from '../../rules/rule-registry.j
9
9
  import { getBuffer, getWeightDtype, getBufferDtype } from '../weight-buffer.js';
10
10
  import { unifiedKernelWrapper } from './utils.js';
11
11
 
12
+ // Conservative fallback dtype for norm weight inference when metadata is unavailable.
13
+ const DEFAULT_DTYPE = 'f32';
14
+
12
15
  function inferHiddenSize(input, hiddenSize) {
13
16
  if (hiddenSize != null) return hiddenSize;
14
17
  const shape = input?.shape;
@@ -39,9 +42,12 @@ function resolveNormWeightDtype(weight, hiddenSize) {
39
42
  return taggedDtype;
40
43
  }
41
44
 
45
+ // Conservative fallback: f32 avoids precision loss when dtype cannot be determined.
46
+ // This path fires for non-GPU buffers or missing hiddenSize, both of which prevent
47
+ // size-based dtype inference below.
42
48
  const hasGPUBufferType = typeof GPUBuffer !== 'undefined';
43
49
  if (!hasGPUBufferType || !(weightBuffer instanceof GPUBuffer) || hiddenSize == null || hiddenSize <= 0) {
44
- return 'f32';
50
+ return DEFAULT_DTYPE;
45
51
  }
46
52
 
47
53
  const byteSize = getBufferRequestedSize(weightBuffer);
@@ -55,7 +61,8 @@ function resolveNormWeightDtype(weight, hiddenSize) {
55
61
  sizeMatchesF32,
56
62
  });
57
63
  }
58
- return 'f32';
64
+ // Buffer size matches neither f16 nor f32 for given hiddenSize; fall back to f32.
65
+ return DEFAULT_DTYPE;
59
66
  }
60
67
 
61
68
  function assertRMSNormWeightBuffer(weight, weightBuffer, hiddenSize) {
@@ -7,7 +7,6 @@ import { createPipeline, createUniformBufferWithView, getOrCreateBindGroupLayout
7
7
  import { allowReadback } from '../perf-guards.js';
8
8
  import { selectRuleValue as selectKernelRuleValue } from './rule-registry.js';
9
9
  import { selectRuleValue as selectSharedRuleValue } from '../../rules/rule-registry.js';
10
- import { getKernelThresholds } from '../../config/schema/index.js';
11
10
 
12
11
 
13
12
  function getSampleBindGroupLayout(device) {
@@ -96,8 +95,7 @@ async function resolveArgmaxPipelines(device, vocabSize, variants) {
96
95
  const argmaxPipeline = await createSamplePipeline(device, variants.argmax);
97
96
  const numWorkgroups = Math.min(WORKGROUP_SIZES.DEFAULT, Math.ceil(vocabSize / WORKGROUP_SIZES.DEFAULT));
98
97
  const useSinglePassArgmax = numWorkgroups === 1;
99
- const argmaxReduceVocabThreshold = getKernelThresholds().sample.argmaxReduceVocabThreshold;
100
- const reducePipeline = useSinglePassArgmax || vocabSize <= argmaxReduceVocabThreshold
98
+ const reducePipeline = useSinglePassArgmax
101
99
  ? null
102
100
  : await createSamplePipeline(device, variants.argmaxReduce);
103
101
  const singlePassPipeline = useSinglePassArgmax