@simulatte/doppler 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. package/CHANGELOG.md +19 -0
  2. package/package.json +21 -36
  3. package/src/browser/browser-converter.js +5 -0
  4. package/src/client/doppler-registry.json +1 -17
  5. package/src/config/kernel-path-loader.d.ts +5 -0
  6. package/src/config/kernel-path-loader.js +13 -0
  7. package/src/config/kernels/registry.json +74 -0
  8. package/src/config/loader.js +3 -0
  9. package/src/config/merge-contract-check.js +7 -0
  10. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  11. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  12. package/src/config/presets/kernel-paths/registry.json +14 -0
  13. package/src/config/presets/models/gemma2.json +2 -1
  14. package/src/config/presets/models/gemma3.json +2 -0
  15. package/src/config/presets/models/qwen3.json +4 -3
  16. package/src/config/presets/models/qwen3_5.json +16 -0
  17. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  18. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  19. package/src/config/schema/conversion.schema.d.ts +1 -0
  20. package/src/config/schema/manifest.schema.d.ts +1 -1
  21. package/src/config/schema/manifest.schema.js +1 -1
  22. package/src/config/schema/storage.schema.js +1 -1
  23. package/src/converter/conversion-plan.js +10 -2
  24. package/src/converter/core.js +2 -0
  25. package/src/converter/manifest-inference.js +12 -22
  26. package/src/converter/parsers/transformer.js +4 -0
  27. package/src/converter/quantization-info.js +5 -1
  28. package/src/converter/quantizer.js +19 -12
  29. package/src/converter/rope-config.js +8 -6
  30. package/src/converter/tokenizer-utils.d.ts +1 -0
  31. package/src/converter/tokenizer-utils.js +4 -1
  32. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  33. package/src/distribution/shard-delivery.js +6 -1
  34. package/src/formats/rdrr/parsing.d.ts +4 -0
  35. package/src/formats/rdrr/parsing.js +14 -1
  36. package/src/gpu/kernels/index.d.ts +8 -0
  37. package/src/gpu/kernels/index.js +6 -0
  38. package/src/gpu/kernels/matmul-selection.js +47 -4
  39. package/src/gpu/kernels/matmul.d.ts +2 -0
  40. package/src/gpu/kernels/matmul.js +1 -1
  41. package/src/gpu/kernels/rmsnorm.js +9 -2
  42. package/src/gpu/kernels/split_qg.d.ts +50 -0
  43. package/src/gpu/kernels/split_qg.js +46 -0
  44. package/src/gpu/kernels/split_qg.wgsl +58 -0
  45. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  46. package/src/gpu/weight-buffer.d.ts +1 -1
  47. package/src/gpu/weight-buffer.js +1 -1
  48. package/src/inference/browser-harness.d.ts +2 -0
  49. package/src/inference/browser-harness.js +20 -1
  50. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  51. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
  52. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  53. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  54. package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
  55. package/src/inference/pipelines/text/attention/projections.js +41 -11
  56. package/src/inference/pipelines/text/attention/record.js +15 -6
  57. package/src/inference/pipelines/text/attention/run.js +50 -6
  58. package/src/inference/pipelines/text/config.js +14 -0
  59. package/src/inference/pipelines/text/execution-plan.js +5 -4
  60. package/src/inference/pipelines/text/generator-runtime.js +5 -0
  61. package/src/inference/pipelines/text/generator-steps.d.ts +6 -0
  62. package/src/inference/pipelines/text/generator-steps.js +43 -15
  63. package/src/inference/pipelines/text/generator.js +50 -17
  64. package/src/inference/pipelines/text/init.d.ts +13 -0
  65. package/src/inference/pipelines/text/init.js +16 -5
  66. package/src/inference/pipelines/text/layer.js +1 -0
  67. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  68. package/src/inference/pipelines/text/linear-attention.js +33 -3
  69. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  70. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  71. package/src/inference/pipelines/text/logits/index.js +3 -1
  72. package/src/inference/pipelines/text/model-load.js +3 -0
  73. package/src/inference/pipelines/text/sampling.js +52 -6
  74. package/src/inference/test-harness.js +2 -2
  75. package/src/loader/final-weights-loader.js +2 -0
  76. package/src/loader/shard-cache.js +3 -2
  77. package/src/loader/tensors/tensor-loader.js +6 -1
  78. package/src/rules/inference/dtype.rules.json +5 -0
  79. package/src/rules/inference/kernel-path.rules.json +2 -2
  80. package/src/rules/kernels/split-qg.rules.json +6 -0
  81. package/src/rules/rule-registry.js +2 -0
  82. package/src/storage/downloader.js +2 -1
  83. package/src/storage/shard-manager.js +4 -3
  84. package/src/tooling/conversion-config-materializer.js +3 -5
  85. package/src/tooling/node-converter.js +3 -0
  86. package/src/tooling/node-source-runtime.js +36 -0
  87. package/src/types/model.d.ts +5 -0
  88. package/tools/doppler-cli.js +6 -1
package/src/debug/reference/hf_qwen35_linear_attn_debug.py
@@ -0,0 +1,268 @@
+ #!/usr/bin/env python3
+ """
+ Dump intermediate values from Qwen3.5 linear attention (GatedDeltaNet) for comparison with Doppler.
+
+ Usage:
+     HF_HOME=/media/x/models/huggingface_cache python3 src/debug/reference/hf_qwen35_linear_attn_debug.py
+ """
+
+ import os
+ import torch
+ import numpy as np
+
+ os.environ.setdefault("HF_HOME", "/media/x/models/huggingface_cache")
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ MODEL_ID = "Qwen/Qwen3.5-0.8B"
+ PROMPT = "Hello"
+
+
+ def stats(name, tensor):
+     t = tensor.float().detach().flatten()
+     print(f"  {name}: shape={list(tensor.shape)}, "
+           f"min={t.min().item():.6f}, max={t.max().item():.6f}, "
+           f"mean={t.mean().item():.6f}, absMax={t.abs().max().item():.6f}")
+     first8 = t[:8].tolist()
+     print(f"    first8: {[f'{v:.6f}' for v in first8]}")
+
+
+ def main():
+     print(f"Loading {MODEL_ID}...")
+     model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype=torch.float32)
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+     model.eval()
+
+     inputs = tokenizer(PROMPT, return_tensors="pt")
+     input_ids = inputs["input_ids"]
+     print(f"Prompt: '{PROMPT}', Token IDs: {input_ids[0].tolist()}")
+     num_tokens = input_ids.shape[1]
+
+     # Dump key weight values for layer 0
+     layer0 = model.model.layers[0]
+     attn = layer0.linear_attn
+
+     print(f"\n=== Layer 0 weights ===")
+     if hasattr(attn, 'A_log'):
+         a_log = attn.A_log.detach().float()
+         a_neg_exp = -torch.exp(a_log)
+         stats("A_log", a_log)
+         stats("a_neg_exp", a_neg_exp)
+     if hasattr(attn, 'dt_bias'):
+         stats("dt_bias", attn.dt_bias.detach().float())
+     stats("conv1d.weight", attn.conv1d.weight.detach().float())
+     stats("norm.weight", attn.norm.weight.detach().float())
+
+     # Hook into the linear_attn module to capture its input and output
+     captured = {}
+
+     def hook_linear_attn_input(module, args, kwargs):
+         if len(args) > 0:
+             captured['linear_attn_input'] = args[0].detach().clone()
+         return None
+
+     def hook_linear_attn_output(module, args, kwargs, output):
+         if isinstance(output, tuple):
+             captured['linear_attn_output'] = output[0].detach().clone()
+         else:
+             captured['linear_attn_output'] = output.detach().clone()
+         return None
+
+     # Hook into individual projection layers
+     def make_hook(name):
+         def hook(module, input, output):
+             captured[name] = output.detach().clone()
+         return hook
+
+     hooks = []
+     hooks.append(attn.register_forward_pre_hook(hook_linear_attn_input, with_kwargs=True))
+     hooks.append(attn.register_forward_hook(hook_linear_attn_output, with_kwargs=True))
+     hooks.append(attn.in_proj_qkv.register_forward_hook(make_hook('qkv_proj')))
+     hooks.append(attn.in_proj_z.register_forward_hook(make_hook('z_proj')))
+     hooks.append(attn.in_proj_a.register_forward_hook(make_hook('a_proj')))
+     hooks.append(attn.in_proj_b.register_forward_hook(make_hook('b_proj')))
+     hooks.append(attn.out_proj.register_forward_hook(make_hook('out_proj')))
+     hooks.append(attn.conv1d.register_forward_hook(make_hook('conv1d_raw')))
+     hooks.append(attn.norm.register_forward_hook(make_hook('gated_norm')))
+
+     # Also hook input_layernorm
+     hooks.append(layer0.input_layernorm.register_forward_hook(make_hook('input_layernorm')))
+
+     print(f"\n=== Running forward pass ===")
+     with torch.no_grad():
+         outputs = model(input_ids, output_hidden_states=True)
+
+     # Remove hooks
+     for h in hooks:
+         h.remove()
+
+     print(f"\n=== Captured intermediates ===")
+     for name in ['input_layernorm', 'qkv_proj', 'z_proj', 'a_proj', 'b_proj',
+                  'conv1d_raw', 'gated_norm', 'linear_attn_input', 'linear_attn_output', 'out_proj']:
+         if name in captured:
+             stats(name, captured[name])
+         else:
+             print(f"  {name}: NOT CAPTURED")
+
+     # Hidden states per layer
+     print(f"\n=== Hidden states per layer (last token) ===")
+     for i in range(min(6, len(outputs.hidden_states) - 1)):
+         hs = outputs.hidden_states[i + 1]
+         t = hs[0, -1]  # last token
+         vals = t[:8].tolist()
+         max_abs = t.abs().max().item()
+         mean_abs = t.abs().mean().item()
+         layer_type = type(model.model.layers[i]).__name__
+         attn_type = "linear" if hasattr(model.model.layers[i], 'linear_attn') else "full"
+         print(f"  Layer {i} ({attn_type}): first8={[f'{v:.4f}' for v in vals]}, "
+               f"maxAbs={max_abs:.4f}, meanAbs={mean_abs:.4f}")
+
+     # Logits
+     logits = outputs.logits[0, -1]
+     top5 = torch.topk(logits, 5)
+     print(f"\nTop-5 logits: {[(tokenizer.decode([idx.item()]), f'{val.item():.2f}') for val, idx in zip(top5.values, top5.indices)]}")
+
+     # Also trace through the linear attention manually to compare with Doppler's kernel
+     print(f"\n=== Manual linear attention trace (layer 0) ===")
+     with torch.no_grad():
+         embed = model.model.embed_tokens(input_ids)
+         normed = layer0.input_layernorm(embed)
+         stats("normed_input", normed)
+
+         qkv = attn.in_proj_qkv(normed)
+         stats("qkv", qkv)
+
+         # The HF Qwen3.5 GatedDeltaNet does conv1d on the QKV, then applies SiLU.
+         # The conv1d expects [batch, channels, seq_len] format.
+         qkv_t = qkv.transpose(1, 2)  # [1, 6144, 1]
+
+         # Use the conv1d module directly (it has padding configured)
+         conv_raw = attn.conv1d(qkv_t)
+         stats("conv_raw (from module)", conv_raw.transpose(1, 2))
+
+         # Truncate to seq_len (causal conv padding)
+         conv_causal = conv_raw[..., :num_tokens]
+         stats("conv_causal (truncated)", conv_causal.transpose(1, 2))
+
+         # Apply SiLU
+         conv_silu = torch.nn.functional.silu(conv_causal)
+         stats("conv_silu", conv_silu.transpose(1, 2))
+
+         # Split Q, K, V
+         conv_out = conv_silu.transpose(1, 2)  # [1, seq_len, 6144]
+         num_k_heads = 16
+         head_k_dim = 128
+         head_v_dim = 128
+         num_v_heads = 16
+         q_size = num_k_heads * head_k_dim  # 2048
+         k_size = q_size
+         v_size = num_v_heads * head_v_dim  # 2048
+
+         q = conv_out[..., :q_size]
+         k = conv_out[..., q_size:q_size + k_size]
+         v = conv_out[..., q_size + k_size:]
+         stats("Q (raw)", q)
+         stats("K (raw)", k)
+         stats("V (raw)", v)
+
+         # Reshape for per-head processing
+         # Q and K: [batch, seq, num_k_heads, head_k_dim]
+         q_heads = q.view(1, num_tokens, num_k_heads, head_k_dim)
+         k_heads = k.view(1, num_tokens, num_k_heads, head_k_dim)
+         v_heads = v.view(1, num_tokens, num_v_heads, head_v_dim)
+
+         # L2 normalize Q and K
+         eps = 1e-6
+         q_norm = torch.nn.functional.normalize(q_heads, p=2, dim=-1, eps=eps)
+         k_norm = torch.nn.functional.normalize(k_heads, p=2, dim=-1, eps=eps)
+
+         # Scale Q by 1/sqrt(head_k_dim)
+         head_scale = 1.0 / (head_k_dim ** 0.5)
+         q_scaled = q_norm * head_scale
+
+         stats("Q_normed_scaled (per-head)", q_scaled.reshape(1, num_tokens, -1))
+         stats("K_normed (per-head)", k_norm.reshape(1, num_tokens, -1))
+
+         # Projections for gating
+         z = attn.in_proj_z(normed)
+         a_out = attn.in_proj_a(normed)
+         b_out = attn.in_proj_b(normed)
+         stats("z", z)
+         stats("a", a_out)
+         stats("b", b_out)
+
+         # Compute gating values
+         a_log = attn.A_log.detach().float()
+         a_neg_exp = -torch.exp(a_log)
+         dt_bias = attn.dt_bias.detach().float()
+
+         softplus_input = a_out.squeeze(0).squeeze(0) + dt_bias
+         softplus_val = torch.nn.functional.softplus(softplus_input)
+         g = a_neg_exp * softplus_val
+         g_exp = torch.exp(g)
+         beta = torch.sigmoid(b_out.squeeze(0).squeeze(0))
+
+         stats("softplus(a + dt_bias)", softplus_val.unsqueeze(0).unsqueeze(0))
+         stats("g (decay)", g.unsqueeze(0).unsqueeze(0))
+         stats("g_exp (decay factor)", g_exp.unsqueeze(0).unsqueeze(0))
+         stats("beta (sigmoid(b))", beta.unsqueeze(0).unsqueeze(0))
+
+         # Recurrent state update (for the first token, the state is all zeros):
+         #   state[head, kd, vd] = state * g_exp + k[kd] * delta[vd]
+         #   where delta[vd] = (v[vd] - (state^T @ k)[vd]) * beta
+         # For zero state: delta[vd] = v[vd] * beta, state = k ⊗ delta
+         state = torch.zeros(num_v_heads, head_k_dim, head_v_dim)
+
+         # Apply decay (no-op for zero state)
+         for head in range(num_v_heads):
+             state[head] *= g_exp[head].item()
+
+             k_head = k_norm[0, 0, head % num_k_heads]  # broadcast q_rep
+             v_head = v_heads[0, 0, head]
+
+             # kv_mem = state^T @ k
+             kv_mem = state[head].t() @ k_head  # [head_v_dim]
+
+             # delta = (v - kv_mem) * beta
+             delta = (v_head - kv_mem) * beta[head].item()
+
+             # state += outer(k, delta)
+             state[head] += torch.outer(k_head, delta)
+
+         # Output: out = state^T @ q
+         output_per_head = torch.zeros(1, num_tokens, num_v_heads, head_v_dim)
+         for head in range(num_v_heads):
+             q_head = q_scaled[0, 0, head % num_k_heads]
+             out_head = state[head].t() @ q_head  # [head_v_dim]
+             output_per_head[0, 0, head] = out_head
+
+         raw_out = output_per_head.reshape(1, num_tokens, num_v_heads * head_v_dim)
+         stats("Recurrent output (raw)", raw_out)
+
+         # RMS norm per head + SiLU gate
+         z_reshaped = z.view(1, num_tokens, num_v_heads, head_v_dim)
+         norm_weight = attn.norm.weight.detach().float()  # [head_v_dim] (shared mode)
+         rms_eps = 1e-6
+
+         for head in range(num_v_heads):
+             head_out = output_per_head[0, 0, head]  # [head_v_dim]
+             mean_sq = (head_out ** 2).mean()
+             inv_rms = 1.0 / torch.sqrt(mean_sq + rms_eps)
+             z_gate = torch.nn.functional.silu(z_reshaped[0, 0, head])
+             output_per_head[0, 0, head] = head_out * inv_rms * norm_weight * z_gate
+
+         gated_out = output_per_head.reshape(1, num_tokens, num_v_heads * head_v_dim)
+         stats("After RMSNorm + SiLU gate", gated_out)
+
+         # Output projection
+         o_result = torch.nn.functional.linear(gated_out, attn.out_proj.weight)
+         stats("After out_proj", o_result)
+
+         # Compare with captured output
+         if 'linear_attn_output' in captured:
+             diff = (o_result - captured['linear_attn_output']).abs()
+             print(f"\n  Diff vs captured output: maxDiff={diff.max().item():.6f}")
+
+
+ if __name__ == "__main__":
+     main()
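
Note: the manual trace above covers a single token, so the update never exercises the decay against a non-zero state. As a reference for how the same gated delta rule chains across tokens, here is a minimal single-head sketch in plain JavaScript (illustrative only, not part of @simulatte/doppler; gatedDeltaStep and its argument names are hypothetical):

  // Hypothetical single-head sketch of the GatedDeltaNet recurrence traced above.
  // state: Dk x Dv matrix; k, q: length-Dk arrays (L2-normalized); v: length-Dv array;
  // gExp = exp(-exp(A_log) * softplus(a + dt_bias)); beta = sigmoid(b).
  function gatedDeltaStep(state, k, v, q, gExp, beta) {
    const Dk = k.length, Dv = v.length;
    for (let i = 0; i < Dk; i++)
      for (let j = 0; j < Dv; j++) state[i][j] *= gExp;           // decay old memory
    const kvMem = new Array(Dv).fill(0);                          // kv_mem = state^T @ k
    for (let i = 0; i < Dk; i++)
      for (let j = 0; j < Dv; j++) kvMem[j] += state[i][j] * k[i];
    for (let i = 0; i < Dk; i++)                                  // state += k ⊗ ((v - kv_mem) * beta)
      for (let j = 0; j < Dv; j++) state[i][j] += k[i] * (v[j] - kvMem[j]) * beta;
    const out = new Array(Dv).fill(0);                            // out = state^T @ q
    for (let i = 0; i < Dk; i++)
      for (let j = 0; j < Dv; j++) out[j] += state[i][j] * q[i];
    return out;
  }

Calling this once per token with that token's gExp and beta reproduces the zero-state loop above for token 0 and extends it to longer prompts.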
package/src/distribution/shard-delivery.js
@@ -1,4 +1,5 @@
  import { log } from '../debug/index.js';
+ import { getExpectedShardHash } from '../formats/rdrr/index.js';
  import {
    computeHash,
    createStreamingHasher,
@@ -2018,7 +2019,11 @@ export async function downloadShard(
    onDeliveryMetrics,
    signal,
    requiredEncoding: requiredEncoding ?? activeConfig.requiredContentEncoding ?? null,
-   expectedHash: options.expectedHash ?? shardInfo?.hash ?? activeConfig.expectedHash ?? null,
+   expectedHash:
+     options.expectedHash
+     ?? getExpectedShardHash(shardInfo, algorithm)
+     ?? activeConfig.expectedHash
+     ?? null,
    expectedSize: expectedSize ?? shardInfo?.size ?? null,
    expectedManifestVersionSet: options.expectedManifestVersionSet ?? null,
    writeToStore,
package/src/formats/rdrr/parsing.d.ts
@@ -7,6 +7,10 @@
  import type { RDRRManifest, ShardInfo, TensorMap } from './types.js';

  export declare function parseManifest(jsonString: string): RDRRManifest;
+ export declare function getExpectedShardHash(
+   shard: Partial<ShardInfo> | Record<string, unknown> | null | undefined,
+   manifestHashAlgorithm?: string | null
+ ): string;

  export declare function parseTensorMap(jsonString: string): TensorMap;

package/src/formats/rdrr/parsing.js
@@ -4,6 +4,19 @@ import { validateManifest } from './validation.js';

  let currentManifest = null;

+ export function getExpectedShardHash(shard, manifestHashAlgorithm = null) {
+   if (!shard || typeof shard !== 'object' || Array.isArray(shard)) {
+     return '';
+   }
+   const algorithm = typeof manifestHashAlgorithm === 'string'
+     ? manifestHashAlgorithm.trim().toLowerCase()
+     : '';
+   if (algorithm === 'blake3') {
+     return shard.blake3 || shard.hash || '';
+   }
+   return shard.hash || shard.blake3 || '';
+ }
+
  export function parseManifest(jsonString) {
    let manifest;

@@ -21,7 +34,7 @@ export function parseManifest(jsonString) {
      index: shard.index ?? i,
      filename: shard.filename || shard.fileName || '',
      size: shard.size,
-     hash: shard.hash || shard.blake3 || '',
+     hash: getExpectedShardHash(shard, manifest.hashAlgorithm),
      blake3: shard.blake3 || shard.hash,
      offset: shard.offset ?? offset,
      hashAlgorithm: shard.hashAlgorithm,
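
To make the new helper's precedence concrete, a short hypothetical call (the shard object and digest strings are placeholders, not real manifest data):

  import { getExpectedShardHash } from './parsing.js';

  const shard = { hash: 'abc123', blake3: 'def456' }; // placeholder digests
  getExpectedShardHash(shard, 'blake3'); // 'def456': blake3 manifests prefer shard.blake3
  getExpectedShardHash(shard, 'sha256'); // 'abc123': other algorithms prefer shard.hash
  getExpectedShardHash(null);            // '': non-object input

This keeps the expectedHash used by downloadShard consistent with whichever algorithm the manifest declares, instead of always reading shardInfo.hash.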
package/src/gpu/kernels/index.d.ts
@@ -326,6 +326,14 @@ export {
    type SplitQKVResult,
  } from './split_qkv.js';

+ // Split Q and Gate (de-interleave attentionOutputGate q_proj output)
+ export {
+   runSplitQG,
+   recordSplitQG,
+   type SplitQGOptions,
+   type SplitQGResult,
+ } from './split_qg.js';
+
  // Transpose
  export {
    runTranspose,
package/src/gpu/kernels/index.js
@@ -268,6 +268,12 @@ export {
    recordSplitQKV,
  } from './split_qkv.js';

+ // Split Q and Gate (de-interleave attentionOutputGate q_proj output)
+ export {
+   runSplitQG,
+   recordSplitQG,
+ } from './split_qg.js';
+
  // Transpose
  export {
    runTranspose,
package/src/gpu/kernels/matmul-selection.js
@@ -29,7 +29,13 @@ function selectQ4KFusedVariant(isM1, wantF16Output, aDtype) {
  }


- export function resolveMatmulPhase(M) {
+ export function resolveMatmulPhase(M, phaseOverride = null) {
+   if (phaseOverride != null) {
+     if (phaseOverride !== 'decode' && phaseOverride !== 'prefill') {
+       throw new Error(`[Matmul] Invalid phase override "${phaseOverride}". Expected "decode" or "prefill".`);
+     }
+     return phaseOverride;
+   }
    return selectKernelRuleValue('matmul', 'phase', { isDecode: M === 1 });
  }

@@ -125,7 +131,9 @@ export function selectMatmulKernel(options = {}) {
    const { tiledPrefillMinRows } = getKernelThresholds().matmul;

    const inputsAreF16 = aDtype === 'f16' && bDtype === 'f16';
-   const weightsAreF16 = bDtype === 'f16' && aDtype !== 'f16';
+   // F16 weights needing F32a path: weights are F16 and either activation is already F32,
+   // or both inputs are F16 but output is F32 (activation will be cast to F32 by executeMatmul)
+   const weightsAreF16 = bDtype === 'f16' && (aDtype !== 'f16' || outputDtype !== 'f16');
    const useF16Matmul = outputDtype === 'f16' && preferF16 && inputsAreF16 && capabilities.hasF16;
    const useF16wF32a = preferF16 && weightsAreF16 && capabilities.hasF16;
    const useTiled = isPrefill
@@ -244,6 +252,30 @@ export function requiresF32Input(variant) {
    return !supportsF16Input(variant);
  }

+ function resolveRequiredWeightDtype(config) {
+   const shaderFile = String(config?.shaderFile ?? config?.wgsl ?? '');
+   if (!shaderFile) {
+     return null;
+   }
+   if (shaderFile.startsWith('fused_matmul_q4')) {
+     return 'q4k';
+   }
+   if (
+     shaderFile === 'matmul_f16.wgsl'
+     || shaderFile === 'matmul_f16_tiled.wgsl'
+     || shaderFile === 'matmul_f16w_f32a.wgsl'
+     || shaderFile === 'matmul_f16w_f32a_tiled.wgsl'
+     || shaderFile === 'matmul_gemv_subgroup.wgsl'
+     || shaderFile === 'matmul_gemv_subgroup_f16a.wgsl'
+   ) {
+     return 'f16';
+   }
+   if (shaderFile === 'matmul_f32.wgsl') {
+     return 'f32';
+   }
+   return null;
+ }
+

  function resolveMatmulOverride(
    variantOverride,
@@ -287,6 +319,16 @@ function resolveMatmulOverride(
      );
    }

+   const requiredWeightDtype = resolveRequiredWeightDtype(config);
+   const weightDtypeOk = !requiredWeightDtype
+     || bDtype === requiredWeightDtype
+     || (requiredWeightDtype === 'f16' && bDtype === 'q4k');
+   if (!weightDtypeOk) {
+     return failOrWarn(
+       `Matmul kernel "${variantOverride}" requires ${requiredWeightDtype} weights but B dtype is ${bDtype}.`
+     );
+   }
+
    if (supportsF16Input(override) && aDtype !== 'f16') {
      return failOrWarn(`Matmul kernel "${variantOverride}" requires f16 activations but A dtype is ${aDtype}.`);
    }
@@ -341,7 +383,7 @@ function selectGemvVariant(useF16Gemv, useF32Gemv, hasSubgroups, useVec4, N, mul
  export function selectMatmulVariantAndFlags(mode, M, N, K, aDtype, bDtype, transposeB, requestedOutputDtype, options) {
    const capabilities = getKernelCapabilities();
    const strict = getKernelPathStrict();
-   const phase = resolveMatmulPhase(M);
+   const phase = resolveMatmulPhase(M, options.phaseOverride ?? null);
    let pathVariant = getKernelPathMatmulVariant(options.role, phase, options.layerIdx, options.kernelPath);
    const hadPathVariant = Boolean(pathVariant);

@@ -426,7 +468,8 @@ export function selectMatmulVariantAndFlags(mode, M, N, K, aDtype, bDtype, trans

    const canGemv = M === 1 && effectiveBDtype === 'f16' && capabilities.hasF16;
    const useF16Gemv = canGemv && aDtype === 'f16' && wantF16Output;
-   const useF32Gemv = canGemv && aDtype === 'f32';
+   // F32 GEMV: activation is F32, or activation is F16 with F32 output (will be cast to F32)
+   const useF32Gemv = canGemv && (aDtype === 'f32' || (aDtype === 'f16' && !wantF16Output));
    const useGemv = useF16Gemv || useF32Gemv;
    const useVec4 = (K % 4 === 0);
    const { multicolThreshold } = getKernelThresholds().matmul;
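
A condensed standalone sketch of the widened GEMV predicate (it mirrors the two lines above; this is not the package's literal code):

  // Hypothetical restatement of the decode GEMV selection.
  function gemvPath(aDtype, bDtype, outputDtype, M, hasF16) {
    const canGemv = M === 1 && bDtype === 'f16' && hasF16;
    const wantF16Output = outputDtype === 'f16';
    if (canGemv && aDtype === 'f16' && wantF16Output) return 'f16-gemv';
    // New in 0.1.8: f16 activations with an f32 output also take the f32 GEMV.
    if (canGemv && (aDtype === 'f32' || (aDtype === 'f16' && !wantF16Output))) return 'f32-gemv';
    return 'generic-matmul';
  }
  // gemvPath('f16', 'f16', 'f32', 1, true) now returns 'f32-gemv'; previously that
  // combination satisfied neither GEMV condition and fell through to a full matmul.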
package/src/gpu/kernels/matmul.d.ts
@@ -23,6 +23,8 @@ export interface MatmulOptions extends OutputBufferOptions, OutputDtypeOptions,
    layerIdx?: number;
    /** Explicit kernel path context for variant selection (avoids global path state). */
    kernelPath?: KernelPathSchema | null;
+   /** Optional explicit phase for kernel-path lookup when the runtime rewrites rows (for example prefill last-position logits). */
+   phaseOverride?: 'decode' | 'prefill' | null;
    /**
     * Whether B matrix is stored transposed.
     * - true: B is [N,K] (SafeTensors/row-major), needs transpose
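
A hedged usage sketch of the new option: the entry-point and tensor names below are assumptions for illustration, but phaseOverride is typed exactly as declared above.

  // Illustrative call site (function and tensor names assumed, not verbatim package API).
  // Prefill computed logits only for the last position, so M === 1 here even though
  // the pipeline is still in the prefill phase; pin the kernel-path lookup.
  const logits = await runMatmul(lastHidden, lmHeadWeights, 1, vocabSize, hiddenSize, {
    role: 'logits',
    phaseOverride: 'prefill',
  });

Without the override, resolveMatmulPhase(M) would classify M === 1 as decode and could pick a decode-tuned kernel-path variant for a prefill-time logits matmul.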
package/src/gpu/kernels/matmul.js
@@ -165,7 +165,7 @@ async function executeMatmul(recorder, A, B, M, N, K, options = {}) {
      options
    );

-   const phase = resolveMatmulPhase(M);
+   const phase = resolveMatmulPhase(M, options.phaseOverride ?? null);
    const constants = resolveMatmulConstants(options, phase);

    let matmulInput = A;
package/src/gpu/kernels/rmsnorm.js
@@ -9,6 +9,9 @@ import { selectRuleValue as selectLoaderRule } from '../../rules/rule-registry.j
  import { getBuffer, getWeightDtype, getBufferDtype } from '../weight-buffer.js';
  import { unifiedKernelWrapper } from './utils.js';

+ // Conservative fallback dtype for norm weight inference when metadata is unavailable.
+ const DEFAULT_DTYPE = 'f32';
+
  function inferHiddenSize(input, hiddenSize) {
    if (hiddenSize != null) return hiddenSize;
    const shape = input?.shape;
@@ -39,9 +42,12 @@ function resolveNormWeightDtype(weight, hiddenSize) {
      return taggedDtype;
    }

+   // Conservative fallback: f32 avoids precision loss when dtype cannot be determined.
+   // This path fires for non-GPU buffers or missing hiddenSize, both of which prevent
+   // size-based dtype inference below.
    const hasGPUBufferType = typeof GPUBuffer !== 'undefined';
    if (!hasGPUBufferType || !(weightBuffer instanceof GPUBuffer) || hiddenSize == null || hiddenSize <= 0) {
-     return 'f32';
+     return DEFAULT_DTYPE;
    }

    const byteSize = getBufferRequestedSize(weightBuffer);
@@ -55,7 +61,8 @@ function resolveNormWeightDtype(weight, hiddenSize) {
        sizeMatchesF32,
      });
    }
-   return 'f32';
+   // Buffer size matches neither f16 nor f32 for given hiddenSize; fall back to f32.
+   return DEFAULT_DTYPE;
  }

  function assertRMSNormWeightBuffer(weight, weightBuffer, hiddenSize) {
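
The size comparison itself sits outside these hunks; as a condensed sketch, the inference presumably reduces to checking the buffer's byte size against the two candidate element widths (the f16 branch here is an assumption inferred from the sizeMatchesF32 log field, not shown in the diff):

  // Hypothetical condensed form of the size-based norm-weight dtype inference.
  function inferNormWeightDtype(byteSize, hiddenSize) {
    if (hiddenSize == null || hiddenSize <= 0) return 'f32'; // conservative default
    if (byteSize === hiddenSize * 2) return 'f16';           // 2 bytes per element
    if (byteSize === hiddenSize * 4) return 'f32';           // 4 bytes per element
    return 'f32'; // matches neither; fall back, as in the hunk above
  }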
package/src/gpu/kernels/split_qg.d.ts
@@ -0,0 +1,50 @@
+ /**
+  * Split Q and Gate Kernel
+  *
+  * De-interleaves Q and Gate projections from q_proj output for attentionOutputGate models.
+  * Models like Qwen 3.5 store q_proj weights in per-head interleaved layout:
+  *   rows [h*headDim*2 : h*headDim*2+headDim]     = Q for head h
+  *   rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
+  * This kernel separates the full matmul output into contiguous Q and Gate tensors.
+  */
+
+ import type { Tensor } from '../tensor.js';
+ import type { CommandRecorder } from '../command-recorder.js';
+
+ /** Split Q and Gate options */
+ export interface SplitQGOptions {
+   numTokens: number;
+   numHeads: number;
+   headDim: number;
+   /** Pre-allocated Q output tensor */
+   qTensor?: Tensor | null;
+   /** Pre-allocated Gate output tensor */
+   gTensor?: Tensor | null;
+ }
+
+ /** Split Q and Gate result */
+ export interface SplitQGResult {
+   Q: Tensor;
+   G: Tensor;
+ }
+
+ /**
+  * De-interleave Q and Gate from q_proj output.
+  *
+  * @param qgTensor - Full q_proj output [numTokens, numHeads * headDim * 2] (interleaved)
+  * @param options - Split configuration
+  * @returns Separate Q and Gate tensors, each [numTokens, numHeads * headDim]
+  */
+ export declare function runSplitQG(
+   qgTensor: Tensor,
+   options: SplitQGOptions
+ ): Promise<SplitQGResult>;
+
+ /**
+  * Record split Q and Gate (batched, no submit).
+  */
+ export declare function recordSplitQG(
+   recorder: CommandRecorder,
+   qgTensor: Tensor,
+   options: SplitQGOptions
+ ): Promise<SplitQGResult>;
package/src/gpu/kernels/split_qg.js
@@ -0,0 +1,46 @@
+
+ import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
+ import { createTensor, dtypeBytes } from '../tensor.js';
+ import { WORKGROUP_SIZES } from './constants.js';
+ import { unifiedKernelWrapper } from './utils.js';
+ import { selectRuleValue } from './rule-registry.js';
+
+ async function _splitQG(target, qgTensor, options) {
+   const { numTokens, numHeads, headDim, qTensor = null, gTensor = null } = options;
+   const ownsQ = qTensor == null;
+   const ownsG = gTensor == null;
+
+   const outputDtype = qgTensor.dtype;
+   const pipelineVariant = selectRuleValue('splitQg', 'variant', { outputDtype });
+   const bytesPerElement = dtypeBytes(outputDtype);
+   const qSize = numHeads * headDim;
+
+   const qBuffer = qTensor?.buffer || acquireBuffer(numTokens * qSize * bytesPerElement, undefined, 'Q');
+   const gBuffer = gTensor?.buffer || acquireBuffer(numTokens * qSize * bytesPerElement, undefined, 'Q_gate');
+
+   try {
+     await unifiedKernelWrapper(
+       'split_qg', target, pipelineVariant,
+       [qgTensor, qBuffer, gBuffer],
+       { num_tokens: numTokens, num_heads: numHeads, head_dim: headDim, _pad: 0 },
+       Math.ceil((numTokens * qSize) / WORKGROUP_SIZES.DEFAULT)
+     );
+
+     const Q = qTensor || createTensor(qBuffer, outputDtype, [numTokens, qSize], 'Q');
+     const G = gTensor || createTensor(gBuffer, outputDtype, [numTokens, qSize], 'Q_gate');
+
+     return { Q, G };
+   } catch (error) {
+     if (ownsQ) releaseBuffer(qBuffer);
+     if (ownsG) releaseBuffer(gBuffer);
+     throw error;
+   }
+ }
+
+ export async function runSplitQG(qgTensor, options) {
+   return _splitQG(null, qgTensor, options);
+ }
+
+ export async function recordSplitQG(recorder, qgTensor, options) {
+   return _splitQG(recorder, qgTensor, options);
+ }
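
A usage sketch of the exported helper (the head geometry is Qwen 3.5's as documented in split_qg.d.ts; the qgTensor handle is illustrative):

  import { runSplitQG } from './src/gpu/kernels/index.js';

  // qgTensor: q_proj output [numTokens, numHeads * headDim * 2], per-head interleaved.
  const { Q, G } = await runSplitQG(qgTensor, {
    numTokens: 1,   // single decode step
    numHeads: 16,
    headDim: 128,
  });
  // Q and G are each [numTokens, numHeads * headDim]; Q proceeds to attention,
  // G carries the per-head output-gate values.

recordSplitQG takes the same arguments with a CommandRecorder first, batching the dispatch without submitting.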
package/src/gpu/kernels/split_qg.wgsl
@@ -0,0 +1,58 @@
+ // split_qg.wgsl
+
+ /**
+  * De-interleave Q and Gate projections from q_proj output for attentionOutputGate models.
+  *
+  * Models like Qwen 3.5 store q_proj weights with interleaved head layout:
+  *   rows [h*headDim*2 : h*headDim*2+headDim]     = Q for head h
+  *   rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
+  *
+  * A single full matmul over all 2*qSize rows produces interleaved output:
+  *   input[token, h*headDim*2 : h*headDim*2+headDim]     = Q head h
+  *   input[token, h*headDim*2+headDim : (h+1)*headDim*2] = Gate head h
+  *
+  * This kernel separates them into contiguous Q and G outputs:
+  *   Q[token, h*headDim + dim] = input[token, h*headDim*2 + dim]
+  *   G[token, h*headDim + dim] = input[token, h*headDim*2 + headDim + dim]
+  *
+  * Input layout (row-major):    [numTokens, numHeads * headDim * 2]
+  * Output Q layout (row-major): [numTokens, numHeads * headDim]
+  * Output G layout (row-major): [numTokens, numHeads * headDim]
+  */
+
+ struct Params {
+   num_tokens: u32,
+   num_heads: u32,
+   head_dim: u32,
+   _pad: u32,
+ }
+
+ override WORKGROUP_SIZE: u32 = 256u;
+
+ @group(0) @binding(0) var<uniform> params: Params;
+ @group(0) @binding(1) var<storage, read> input: array<f32>;
+ @group(0) @binding(2) var<storage, read_write> Q: array<f32>;
+ @group(0) @binding(3) var<storage, read_write> G: array<f32>;
+
+ @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+   let idx = gid.x;
+   let q_size = params.num_heads * params.head_dim;
+   let total_elements = params.num_tokens * q_size;
+
+   if (idx >= total_elements) {
+     return;
+   }
+
+   let token = idx / q_size;
+   let elem = idx % q_size;
+   let head = elem / params.head_dim;
+   let dim = elem % params.head_dim;
+
+   // Input is interleaved per head: [Q_h (headDim elems), G_h (headDim elems)]
+   let src_q = token * (q_size * 2u) + head * (params.head_dim * 2u) + dim;
+   let src_g = src_q + params.head_dim;
+
+   Q[idx] = input[src_q];
+   G[idx] = input[src_g];
+ }
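
For a CPU cross-check of the shader's index math, an equivalent JavaScript reference (a test-style sketch, not shipped in the package):

  // Mirrors src_q / src_g in split_qg.wgsl; compare against GPU readback.
  function splitQGReference(input, numTokens, numHeads, headDim) {
    const qSize = numHeads * headDim;
    const Q = new Float32Array(numTokens * qSize);
    const G = new Float32Array(numTokens * qSize);
    for (let idx = 0; idx < numTokens * qSize; idx++) {
      const token = Math.floor(idx / qSize);
      const elem = idx % qSize;
      const head = Math.floor(elem / headDim);
      const dim = elem % headDim;
      const srcQ = token * qSize * 2 + head * headDim * 2 + dim;
      Q[idx] = input[srcQ];
      G[idx] = input[srcQ + headDim]; // gate sits headDim after Q within each head block
    }
    return { Q, G };
  }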