npm - @simulatte/doppler - Versions diffs - 0.1.7 → 0.1.9 - Mend

@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (172) hide show

package/CHANGELOG.md +32 -0
package/README.md +25 -6
package/package.json +25 -38
package/src/browser/browser-converter.js +5 -0
package/src/client/doppler-api.browser.js +6 -0
package/src/client/doppler-api.d.ts +3 -0
package/src/client/doppler-api.js +11 -2
package/src/client/doppler-registry.js +3 -5
package/src/client/doppler-registry.json +2 -2
package/src/config/kernel-path-loader.d.ts +5 -0
package/src/config/kernel-path-loader.js +13 -0
package/src/config/kernels/kernel-ref-digests.js +23 -21
package/src/config/kernels/moe/mixtral.paths.json +46 -0
package/src/config/kernels/registry.json +74 -0
package/src/config/loader.js +9 -0
package/src/config/merge-contract-check.js +7 -0
package/src/config/platforms/loader.js +3 -1
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
package/src/config/presets/kernel-paths/registry.json +21 -0
package/src/config/presets/models/gemma2.json +2 -1
package/src/config/presets/models/gemma3.json +4 -1
package/src/config/presets/models/gemma4.json +61 -0
package/src/config/presets/models/granite-docling.json +70 -0
package/src/config/presets/models/lfm2.json +6 -1
package/src/config/presets/models/qwen3.json +4 -3
package/src/config/presets/models/qwen3_5.json +16 -0
package/src/config/presets/models/qwen3_vl.json +40 -0
package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
package/src/config/presets/runtime/modes/trace-layers.json +1 -0
package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
package/src/config/runtime.js +3 -0
package/src/config/schema/conversion.schema.d.ts +1 -0
package/src/config/schema/debug.schema.d.ts +40 -0
package/src/config/schema/debug.schema.js +28 -0
package/src/config/schema/index.js +2 -0
package/src/config/schema/inference-defaults.schema.js +1 -1
package/src/config/schema/kernel-path.schema.d.ts +1 -0
package/src/config/schema/manifest.schema.d.ts +1 -1
package/src/config/schema/manifest.schema.js +1 -1
package/src/config/schema/memory-limits.schema.js +2 -2
package/src/config/schema/storage.schema.js +2 -2
package/src/converter/conversion-plan.js +11 -3
package/src/converter/core.js +19 -8
package/src/converter/manifest-inference.js +12 -22
package/src/converter/parsers/transformer.js +4 -0
package/src/converter/quantization-info.js +5 -1
package/src/converter/quantizer.d.ts +5 -0
package/src/converter/quantizer.js +34 -12
package/src/converter/rope-config.js +8 -6
package/src/converter/tokenizer-utils.d.ts +1 -0
package/src/converter/tokenizer-utils.js +4 -1
package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
package/src/distribution/shard-delivery.js +40 -1
package/src/formats/rdrr/classification.js +32 -0
package/src/formats/rdrr/parsing.d.ts +4 -0
package/src/formats/rdrr/parsing.js +14 -1
package/src/gpu/kernel-runtime.js +4 -2
package/src/gpu/kernels/attention.js +2 -1
package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
package/src/gpu/kernels/dequant_shared.wgsl +4 -2
package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
package/src/gpu/kernels/gated-short-conv.js +284 -0
package/src/gpu/kernels/index.d.ts +8 -0
package/src/gpu/kernels/index.js +6 -0
package/src/gpu/kernels/linear-attention-core.js +37 -17
package/src/gpu/kernels/matmul-selection.js +48 -4
package/src/gpu/kernels/matmul.d.ts +5 -0
package/src/gpu/kernels/matmul.js +71 -2
package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
package/src/gpu/kernels/rmsnorm.js +9 -2
package/src/gpu/kernels/sample.js +1 -3
package/src/gpu/kernels/sample.wgsl +39 -9
package/src/gpu/kernels/sample_f16.wgsl +38 -8
package/src/gpu/kernels/shader-cache.js +9 -4
package/src/gpu/kernels/split_qg.d.ts +50 -0
package/src/gpu/kernels/split_qg.js +46 -0
package/src/gpu/kernels/split_qg.wgsl +58 -0
package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
package/src/gpu/weight-buffer.d.ts +1 -1
package/src/gpu/weight-buffer.js +1 -1
package/src/inference/browser-harness.d.ts +2 -0
package/src/inference/browser-harness.js +20 -1
package/src/inference/kv-cache/base.js +3 -10
package/src/inference/pipelines/diffusion/helpers.js +3 -0
package/src/inference/pipelines/diffusion/pipeline.js +2 -1
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
package/src/inference/pipelines/text/attention/output-projection.js +8 -0
package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
package/src/inference/pipelines/text/attention/projections.js +54 -13
package/src/inference/pipelines/text/attention/record.js +16 -6
package/src/inference/pipelines/text/attention/run.js +59 -6
package/src/inference/pipelines/text/config.d.ts +1 -0
package/src/inference/pipelines/text/config.js +46 -4
package/src/inference/pipelines/text/embed.js +26 -7
package/src/inference/pipelines/text/execution-plan.js +5 -4
package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
package/src/inference/pipelines/text/execution-v0.js +12 -1
package/src/inference/pipelines/text/generator-helpers.js +1 -0
package/src/inference/pipelines/text/generator-runtime.js +19 -0
package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
package/src/inference/pipelines/text/generator-steps.js +71 -26
package/src/inference/pipelines/text/generator.d.ts +5 -0
package/src/inference/pipelines/text/generator.js +353 -166
package/src/inference/pipelines/text/init.d.ts +15 -0
package/src/inference/pipelines/text/init.js +35 -10
package/src/inference/pipelines/text/layer.js +38 -8
package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
package/src/inference/pipelines/text/linear-attention.js +33 -3
package/src/inference/pipelines/text/logits/gpu.js +2 -2
package/src/inference/pipelines/text/logits/index.d.ts +6 -1
package/src/inference/pipelines/text/logits/index.js +3 -1
package/src/inference/pipelines/text/model-load.js +3 -0
package/src/inference/pipelines/text/moe-gpu.js +21 -3
package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
package/src/inference/pipelines/text/ops.js +123 -53
package/src/inference/pipelines/text/probes.js +1 -0
package/src/inference/pipelines/text/sampling.js +52 -6
package/src/inference/pipelines/text/state.js +2 -0
package/src/inference/pipelines/text.d.ts +5 -0
package/src/inference/pipelines/text.js +59 -1
package/src/inference/pipelines/vision/encoder.js +386 -0
package/src/inference/pipelines/vision/image-preprocess.js +151 -0
package/src/inference/pipelines/vision/index.js +173 -0
package/src/inference/pipelines/vision/ops.js +78 -0
package/src/inference/pipelines/vision/patch-embed.js +151 -0
package/src/inference/test-harness.js +11 -9
package/src/loader/doppler-loader.d.ts +3 -0
package/src/loader/doppler-loader.js +20 -3
package/src/loader/experts/expert-cache.js +6 -2
package/src/loader/experts/expert-loader.js +6 -2
package/src/loader/final-weights-loader.js +2 -0
package/src/loader/layer-loader.js +42 -3
package/src/loader/manifest-config.js +3 -1
package/src/loader/shard-cache.js +3 -2
package/src/loader/tensors/tensor-loader.d.ts +3 -0
package/src/loader/tensors/tensor-loader.js +130 -4
package/src/rules/inference/dtype.rules.json +5 -0
package/src/rules/inference/kernel-path.rules.json +2 -2
package/src/rules/kernels/moe.rules.mixtral.json +75 -0
package/src/rules/kernels/softmax.rules.json +2 -0
package/src/rules/kernels/split-qg.rules.json +6 -0
package/src/rules/rule-registry.d.ts +1 -0
package/src/rules/rule-registry.js +4 -0
package/src/storage/downloader.js +2 -1
package/src/storage/quickstart-downloader.d.ts +3 -0
package/src/storage/quickstart-downloader.js +27 -30
package/src/storage/shard-manager.js +4 -3
package/src/tooling/conversion-config-materializer.js +3 -5
package/src/tooling/node-converter.js +28 -7
package/src/tooling/node-source-runtime.js +65 -5
package/src/tooling/node-webgpu.js +24 -7
package/src/types/model.d.ts +5 -0
package/src/utils/hf-resolve-url.d.ts +16 -0
package/src/utils/hf-resolve-url.js +17 -0
package/src/version.js +1 -1
package/tools/doppler-cli.js +6 -1
package/src/tooling/node-convert.d.ts +0 -54

package/src/inference/pipelines/text/attention/output-projection.js ADDED Viewed

@@ -0,0 +1,8 @@
+export async function prepareAttentionProjectionInput(attnForProjection, matmulOutputDtype, castToF16) {
+  if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
+    const casted = await castToF16(attnForProjection);
+    return { oProjInput: casted, oProjInputTemp: casted };
+  }
+  return { oProjInput: attnForProjection, oProjInputTemp: null };
+}

package/src/inference/pipelines/text/attention/projections.d.ts CHANGED Viewed

@@ -3,6 +3,7 @@ import type { Tensor } from '../../../../gpu/tensor.js';
 import type { WeightBuffer, CpuWeightBuffer } from '../../../../gpu/weight-buffer.js';
 import type { LayerWeights } from '../types.js';
 import type { LoRAAdapter } from '../lora.js';
+import type { MatmulDebugConfigSchema } from '../../../../config/schema/debug.schema.js';
 export interface AttentionInputInfo {
   phase: 'prefill' | 'decode';
@@ -46,7 +47,16 @@ export function recordAttentionInputs(
   info: AttentionInputInfo | null | undefined
 ): void;
-export function resolveAttentionProjectionOutputDtype(attentionInputDtype: string): 'f16' | 'f32' | string;
+export function shouldForceF32AttentionProjectionForRoPE(options: {
+  attentionInputDtype: string;
+  headDim: number;
+  rotaryDim?: number;
+  interleaved?: boolean;
+}): boolean;
+export function resolveAttentionProjectionOutputDtype(
+  attentionInputDtype: string,
+  options?: { forceF32?: boolean }
+): 'f16' | 'f32' | string;
 export function resolveProjectionSliceOffsetBytes(
   weightBuffer: WeightBuffer | Tensor | GPUBuffer | null | undefined,
   outputRows: number,
@@ -67,11 +77,13 @@ export interface ProjectAttentionQKVOptions {
   getWeightBuffer?: (weight: GPUBuffer | WeightBuffer | Float32Array | ArrayBuffer | CpuWeightBuffer, label: string) => GPUBuffer | WeightBuffer;
   lora?: LoRAAdapter | null;
   releaseTemporary: (buffer: GPUBuffer) => void;
+  matmulDebug?: MatmulDebugConfigSchema | null;
   onFusedQKV?: ((info: { qSize: number; kSize: number; vSize: number; totalSize: number }) => void) | null;
 }
 export interface ProjectAttentionQKVResult {
   qTensor: Tensor;
+  qGateTensor: Tensor | null;
   kTensor: Tensor;
   vTensor: Tensor;
   usedFusedQKV: boolean;

package/src/inference/pipelines/text/attention/projections.js CHANGED Viewed

@@ -5,6 +5,8 @@ import {
   recordMatmul,
   runSplitQKV,
   recordSplitQKV,
+  runSplitQG,
+  recordSplitQG,
   runRMSNorm,
   recordRMSNorm,
 } from '../../../../gpu/kernel-selector.js';
@@ -28,6 +30,13 @@ function getSplitRunner(recorder) {
   return (qkvTensor, options) => recordSplitQKV(recorder, qkvTensor, options);
 }
+function getSplitQGRunner(recorder) {
+  if (!recorder) {
+    return (qgTensor, options) => runSplitQG(qgTensor, options);
+  }
+  return (qgTensor, options) => recordSplitQG(recorder, qgTensor, options);
+}
 function getRmsNormRunner(recorder) {
   if (!recorder) {
     return (input, weight, eps, options) => runRMSNorm(input, weight, eps, options);
@@ -62,9 +71,10 @@ async function projectSingleQkvTensor({
   matmulOutputDtype,
   getWeightBuffer,
   lora,
+  matmulDebug,
   releaseTemporary,
 }) {
-  const runMatmulForMode = getMatmulRunner(recorder);
+    const runMatmulForMode = getMatmulRunner(recorder);
   const layerWeight = layerWeights?.[weightKey];
   if (!layerWeight) {
     throw new Error(`Attention projection requires ${weightKey}.`);
@@ -82,6 +92,7 @@ async function projectSingleQkvTensor({
       layerIdx,
       kernelPath,
       outputDtype: matmulOutputDtype,
+      matmulDebug,
     });
   } finally {
     releaseOwnedWeightBuffer(layerWeight, projBuffer, releaseTemporary);
@@ -169,6 +180,7 @@ async function projectQueryWithOptionalGate({
   matmulOutputDtype,
   getWeightBuffer,
   lora,
+  matmulDebug,
   releaseTemporary,
   attentionOutputGate,
 }) {
@@ -196,34 +208,44 @@ async function projectQueryWithOptionalGate({
       matmulOutputDtype,
       getWeightBuffer,
       lora,
+      matmulDebug,
       releaseTemporary,
     });
     return { qTensor, qGateTensor: null };
   }
+  // q_proj weights are stored with interleaved head layout: for head h,
+  // rows [h*headDim*2 : h*headDim*2+headDim] = Q, rows [h*headDim*2+headDim : (h+1)*headDim*2] = gate.
+  // Compute the full 2*qSize matmul, then de-interleave into separate Q and gate tensors.
   const runMatmulForMode = getMatmulRunner(recorder);
+  const runSplitQGForMode = getSplitQGRunner(recorder);
   const qWeightBuffer = getWeightBuffer(qWeight, 'q_proj');
-  const gateOffset = resolveProjectionSliceOffsetBytes(qWeightBuffer, qSize, hiddenSize);
+  let fullQGTensor = null;
   let qTensor = null;
   let qGateTensor = null;
   try {
-    qTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize, hiddenSize, {
+    fullQGTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize * 2, hiddenSize, {
       transposeB: 'auto',
       role: 'q_proj',
       layerIdx,
       kernelPath,
       outputDtype: matmulOutputDtype,
+      matmulDebug,
     });
-    qGateTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize, hiddenSize, {
-      transposeB: 'auto',
-      role: 'q_proj_gate',
-      layerIdx,
-      kernelPath,
-      bOffset: gateOffset,
-      outputDtype: matmulOutputDtype,
+    const split = await runSplitQGForMode(fullQGTensor, {
+      numTokens,
+      numHeads,
+      headDim,
     });
+    releaseTemporary(fullQGTensor.buffer);
+    fullQGTensor = null;
+    qTensor = split.Q;
+    qGateTensor = split.G;
   } catch (error) {
+    if (fullQGTensor) {
+      releaseTemporary(fullQGTensor.buffer);
+    }
     if (qTensor) {
       releaseTemporary(qTensor.buffer);
     }
@@ -277,9 +299,22 @@ export function recordAttentionInputs(state, info) {
   state.stats.attentionInputs.push(info);
 }
-export function resolveAttentionProjectionOutputDtype(attentionInputDtype) {
+export function shouldForceF32AttentionProjectionForRoPE({
+  attentionInputDtype,
+  headDim,
+  rotaryDim = headDim,
+  interleaved = false,
+}) {
+  return attentionInputDtype === 'f16'
+    && Number.isFinite(headDim)
+    && Number.isFinite(rotaryDim)
+    && (rotaryDim !== headDim || interleaved === true);
+}
+export function resolveAttentionProjectionOutputDtype(attentionInputDtype, options = {}) {
   const useF16Activations = attentionInputDtype === 'f16';
-  return selectRuleValue('shared', 'dtype', 'f16OrFallbackByFlag', {
+  return selectRuleValue('inference', 'dtype', 'attentionProjectionOutputDtype', {
+    forceF32: options.forceF32 === true,
     useF16: useF16Activations,
     fallback: attentionInputDtype,
   });
@@ -299,6 +334,7 @@ export async function projectAttentionQKV({
   matmulOutputDtype,
   getWeightBuffer,
   lora,
+  matmulDebug,
   releaseTemporary,
   onFusedQKV = null,
   attentionOutputGate = false,
@@ -309,7 +345,8 @@ export async function projectAttentionQKV({
   const hasLoRA = getLoRAModule(lora, layerIdx, 'q_proj')
     || getLoRAModule(lora, layerIdx, 'k_proj')
     || getLoRAModule(lora, layerIdx, 'v_proj');
-  const useFusedQKV = selectRuleValue('inference', 'attention', 'useFusedQkv', {
+  const forceSplitQKV = Boolean(matmulDebug?.enabled) && matmulDebug?.forceSplitQKV === true;
+  const useFusedQKV = !forceSplitQKV && selectRuleValue('inference', 'attention', 'useFusedQkv', {
     hasQkvProj: Boolean(layerWeights.qkvProj),
     hasQkvSizes: Boolean(layerWeights.qkvSizes),
     hasLoRA: Boolean(hasLoRA),
@@ -326,6 +363,7 @@ export async function projectAttentionQKV({
         layerIdx,
         kernelPath,
         outputDtype: matmulOutputDtype,
+        matmulDebug,
       });
       const split = await runSplitForMode(qkvTensor, {
         numTokens,
@@ -364,6 +402,7 @@ export async function projectAttentionQKV({
       matmulOutputDtype,
       getWeightBuffer,
       lora,
+      matmulDebug,
       releaseTemporary,
       attentionOutputGate,
     }));
@@ -384,6 +423,7 @@ export async function projectAttentionQKV({
       matmulOutputDtype,
       getWeightBuffer,
       lora,
+      matmulDebug,
       releaseTemporary,
     });
@@ -403,6 +443,7 @@ export async function projectAttentionQKV({
       matmulOutputDtype,
       getWeightBuffer,
       lora,
+      matmulDebug,
       releaseTemporary,
     });

package/src/inference/pipelines/text/attention/record.js CHANGED Viewed

@@ -24,10 +24,12 @@ import { selectRuleValue } from '../../../../rules/rule-registry.js';
 import { SlidingWindowKVCache } from '../../../kv-cache.js';
 import {
   recordAttentionInputs,
+  shouldForceF32AttentionProjectionForRoPE,
   resolveAttentionProjectionOutputDtype,
   projectAttentionQKV,
   applyAttentionQKNorm,
 } from './projections.js';
+import { prepareAttentionProjectionInput } from './output-projection.js';
 import { releaseOrTrack, shouldDebugLayer } from './types.js';
@@ -142,7 +144,14 @@ export async function recordLayerAttentionGPU(
   }
   // 2. Q/K/V projections
-  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype);
+  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
+    forceF32: shouldForceF32AttentionProjectionForRoPE({
+      attentionInputDtype: desiredOutputDtype,
+      headDim,
+      rotaryDim: config.ropeRotaryDim,
+      interleaved: config.ropeInterleaved,
+    }),
+  });
   let usedFusedQKV = false;
   ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
     recorder,
@@ -158,6 +167,7 @@ export async function recordLayerAttentionGPU(
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
     attentionOutputGate: config.attentionOutputGate === true,
     releaseTemporary: (buffer) => releaseOrTrack(recorder, buffer),
     onFusedQKV: layerIdx === 0 && isPrefill
@@ -535,14 +545,14 @@ export async function recordLayerAttentionGPU(
   let oProjInput = attnForProjection;
   oProjInputTemp = null;
   if (layerWeights.oProj && getWeightBuffer) {
+    ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
+      attnForProjection,
+      matmulOutputDtype,
+      (tensor) => recordCastF32ToF16(recorder, tensor)
+    ));
     const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
     const loraO = getLoRAModule(lora, layerIdx, 'o_proj');
-    if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
-      oProjInput = await recordCastF32ToF16(recorder, attnForProjection);
-      oProjInputTemp = oProjInput;
-    }
     // Use fused o_proj + residual for decode when possible
     // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
     const oProjDtype = getWeightDtype(oProjBuf);

package/src/inference/pipelines/text/attention/run.js CHANGED Viewed

@@ -28,10 +28,12 @@ import { runProbes } from '../probes.js';
 import { SlidingWindowKVCache } from '../../../kv-cache.js';
 import {
   recordAttentionInputs,
+  shouldForceF32AttentionProjectionForRoPE,
   resolveAttentionProjectionOutputDtype,
   projectAttentionQKV,
   applyAttentionQKNorm,
 } from './projections.js';
+import { prepareAttentionProjectionInput } from './output-projection.js';
 import {
   shouldDebugLayer,
@@ -164,6 +166,14 @@ export async function runLayerAttentionGPU(
         dtype: normed.dtype,
       });
     }
+    await runProbes('post_input_norm', normed.buffer, {
+      layerIdx,
+      numTokens,
+      hiddenSize,
+      probes: state.debugProbes,
+      dtype: normed.dtype,
+    });
   }
   // Debug: Check normed input for L0 prefill
@@ -193,7 +203,14 @@ export async function runLayerAttentionGPU(
   }
   // 2. Q/K/V projections
-  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype);
+  const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
+    forceF32: shouldForceF32AttentionProjectionForRoPE({
+      attentionInputDtype: desiredOutputDtype,
+      headDim,
+      rotaryDim: config.ropeRotaryDim,
+      interleaved: config.ropeInterleaved,
+    }),
+  });
   let usedFusedQKV = false;
   ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
     recorder: null,
@@ -209,6 +226,7 @@ export async function runLayerAttentionGPU(
     matmulOutputDtype,
     getWeightBuffer,
     lora,
+    matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
     attentionOutputGate: config.attentionOutputGate === true,
     releaseTemporary: (buffer) => releaseBuffer(buffer),
     onFusedQKV: layerIdx === 0 && isPrefill
@@ -224,6 +242,27 @@ export async function runLayerAttentionGPU(
     await traceStep('matmul', `L${layerIdx}.k_proj`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
     await traceStep('matmul', `L${layerIdx}.v_proj`, layerIdx, vTensor.buffer, [numTokens, numKVHeads * headDim]);
   }
+  await runProbes('q_proj', qTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numHeads * headDim,
+    probes: state.debugProbes,
+    dtype: qTensor.dtype,
+  });
+  await runProbes('k_proj', kTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numKVHeads * headDim,
+    probes: state.debugProbes,
+    dtype: kTensor.dtype,
+  });
+  await runProbes('v_proj', vTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numKVHeads * headDim,
+    probes: state.debugProbes,
+    dtype: vTensor.dtype,
+  });
   // Kernel step debug: Q/K/V projections
   if (isKernelDebugEnabled(layerIdx)) {
@@ -331,6 +370,20 @@ export async function runLayerAttentionGPU(
       await traceStep('rope', `L${layerIdx}.k_rope`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
     }
   }
+  await runProbes('q_rope', qTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numHeads * headDim,
+    probes: state.debugProbes,
+    dtype: qTensor.dtype,
+  });
+  await runProbes('k_rope', kTensor.buffer, {
+    layerIdx,
+    numTokens,
+    hiddenSize: numKVHeads * headDim,
+    probes: state.debugProbes,
+    dtype: kTensor.dtype,
+  });
   if (isKernelDebugEnabled(layerIdx)) {
     logKernelStep('rope', { layerIdx, label: `startPos=${currentSeqLen}` });
     await dumpTokenVector(qTensor.buffer, 'Q_rope', {
@@ -723,14 +776,14 @@ export async function runLayerAttentionGPU(
   let oProjInput = attnForProjection;
   oProjInputTemp = null;
   if (layerWeights.oProj && getWeightBuffer) {
+    ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
+      attnForProjection,
+      matmulOutputDtype,
+      castF32ToF16
+    ));
     const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
     const loraO = getLoRAModule(lora, layerIdx, 'o_proj');
-    if (matmulOutputDtype === 'f16' && attnOutput.dtype !== 'f16') {
-      oProjInput = await castF32ToF16(attnOutput);
-      oProjInputTemp = oProjInput;
-    }
     // Use fused o_proj + residual for decode when possible
     // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
     const oProjDtype = getWeightDtype(oProjBuf);

package/src/inference/pipelines/text/config.d.ts CHANGED Viewed

@@ -150,6 +150,7 @@ export interface ParsedModelConfig {
   ropeLocalTheta: number | null;
   ropeRotaryDim: number;
   ropeInterleaved: boolean;
+  mropeInterleaved: boolean;
   mropeSection: number[] | null;
   partialRotaryFactor: number | null;
   ropeScale: number;

package/src/inference/pipelines/text/config.js CHANGED Viewed

@@ -349,6 +349,24 @@ function normalizeLayerTypeTag(value) {
   return null;
 }
+function resolveVisionConfig(rawConfig, manifest) {
+  const vc = rawConfig?.vision_config ?? manifest?.config?.vision_config;
+  if (!vc || typeof vc !== 'object') return null;
+  return {
+    depth: vc.depth ?? 24,
+    hiddenSize: vc.hidden_size ?? 1024,
+    intermediateSize: vc.intermediate_size ?? 4096,
+    numHeads: vc.num_heads ?? 16,
+    outHiddenSize: vc.out_hidden_size ?? vc.hidden_size ?? 1024,
+    patchSize: vc.patch_size ?? 16,
+    spatialMergeSize: vc.spatial_merge_size ?? 2,
+    temporalPatchSize: vc.temporal_patch_size ?? 2,
+    eps: vc.eps ?? 1e-6,
+    deepstackVisualIndexes: Array.isArray(vc.deepstack_visual_indexes) ? vc.deepstack_visual_indexes : [],
+    imageTokenId: rawConfig?.image_token_id ?? manifest?.image_token_id ?? null,
+  };
+}
 function parseCustomLayerTypes(layerTypes, numLayers, modelId) {
   if (!Array.isArray(layerTypes) || layerTypes.length === 0) {
     throw new Error(
@@ -482,6 +500,20 @@ export function toParsedConfigFromMerged(merged, manifest) {
   const queryPreAttnScalar = inf.attention.queryPreAttnScalar;
   const causalAttention = inf.attention.causal;
+  // Cross-field sanity: queryPreAttnScalar should typically equal headDim.
+  // A value of sqrt(headDim) indicates a known converter bug that produces
+  // attnScale = 1/sqrt(sqrt(headDim)) instead of the correct 1/sqrt(headDim).
+  if (queryPreAttnScalar != null && headDim != null
+      && queryPreAttnScalar !== headDim
+      && Math.abs(queryPreAttnScalar - Math.sqrt(headDim)) < 0.01) {
+    throw new Error(
+      `Model "${merged.modelId}": queryPreAttnScalar (${queryPreAttnScalar}) ` +
+      `equals sqrt(headDim) instead of headDim (${headDim}). ` +
+      `This is a known converter bug — the manifest must be regenerated ` +
+      `with the corrected converter.`
+    );
+  }
   // Get stop token IDs (cast to Manifest for compatibility)
   const stopTokenIds = getStopTokenIds(manifest);
@@ -498,10 +530,18 @@ export function toParsedConfigFromMerged(merged, manifest) {
   // RoPE scaling - use manifest inference as source of truth (not raw config)
   const ropeScale = inf.rope.ropeScalingFactor;
   const ropeScalingType = inf.rope.ropeScalingType;
-  const ropeLocalScale = inf.rope.ropeLocalScalingFactor ?? ropeScale;
-  const ropeLocalScalingType = inf.rope.ropeLocalScalingType ?? ropeScalingType;
+  const ropeLocalScale = inf.rope.ropeLocalScalingFactor;
+  const ropeLocalScalingType = inf.rope.ropeLocalScalingType;
   const partialRotaryFactor = inf.rope.partialRotaryFactor;
-  const ropeInterleaved = inf.rope.mropeInterleaved === true;
+  const mropeInterleaved = inf.rope.mropeInterleaved === true;
+  const ropeInterleaved = false;
+  if (ropeLocalScale == null && (inf.rope.ropeLocalTheta != null || inf.rope.mropeSection != null)) {
+    throw new Error(
+      `Model "${merged.modelId}" uses hybrid/mRoPE but is missing rope.ropeLocalScalingFactor in manifest. ` +
+      `Re-convert the model using the latest converter or update the manifest to include an explicit scale.`
+    );
+  }
   const mropeSection = Array.isArray(inf.rope.mropeSection)
     ? inf.rope.mropeSection.map((entry) => Math.trunc(Number(entry)))
     : null;
@@ -511,7 +551,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
       `Manifest "${merged.modelId}" has invalid rope.mropeSection; expected positive integers.`
     );
   }
-  if (ropeInterleaved && mropeSection) {
+  if (mropeInterleaved && mropeSection) {
     const doubledMropeDim = mropeSection.reduce((sum, entry) => sum + entry, 0) * 2;
     if (doubledMropeDim !== ropeRotaryDim) {
       throw new Error(
@@ -596,6 +636,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
     ropeLocalTheta: inf.rope.ropeLocalTheta,
     ropeRotaryDim,
     ropeInterleaved,
+    mropeInterleaved,
     mropeSection,
     partialRotaryFactor,
     ropeScale,
@@ -636,6 +677,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
     chatTemplateType,
     chatTemplateEnabled,
     kernelPath: inf.defaultKernelPath,
+    visionConfig: resolveVisionConfig(config, manifest),
   };
 }

package/src/inference/pipelines/text/embed.js CHANGED Viewed

@@ -9,6 +9,7 @@ import { decodeReadback } from './debug-utils/index.js';
 import { createTensor } from '../../../gpu/tensor.js';
 import { castF32ToF16, recordCastF32ToF16 } from '../../../gpu/kernels/cast.js';
 import { isCpuWeightBuffer } from '../../../gpu/weight-buffer.js';
+import { f16ToF32 } from '../../../loader/dtype-utils.js';
 import { selectRuleValue } from '../../../rules/rule-registry.js';
 const scaleShaderCode = `
@@ -202,11 +203,19 @@ export async function embed(tokenIds, embedBuffer, config) {
   const dtype = selectRuleValue('inference', 'dtype', 'f16OrF32', { useF16 });
-  const cpuEmbeddings = isCpuWeightBuffer(embedBuffer)
-    ? embedBuffer.data
-    : embedBuffer instanceof Float32Array
-      ? embedBuffer
-      : null;
+  let cpuEmbeddings = null;
+  if (isCpuWeightBuffer(embedBuffer)) {
+    const bufDtype = embedBuffer.dtype;
+    if (bufDtype !== 'f32' && bufDtype !== 'f16') {
+      throw new Error(
+        `[Embed] CPU embedding buffer has unsupported dtype '${bufDtype}'; ` +
+        `only 'f32' and 'f16' are supported in the CPU gather path.`
+      );
+    }
+    cpuEmbeddings = embedBuffer.data;
+  } else if (embedBuffer instanceof Float32Array) {
+    cpuEmbeddings = embedBuffer;
+  }
   if (debug) {
     trace.embed(`tokens=${numTokens}, hidden=${hiddenSize}, vocab=${vocabSize}, scaleEmbeddings=${scaleEmbeddings}, transpose=${transpose}, indexOffset=${indexOffset}, activationDtype=${activationDtype}, useF16=${useF16}`);
@@ -226,18 +235,28 @@ export async function embed(tokenIds, embedBuffer, config) {
     }
     const output = new Float32Array(numTokens * hiddenSize);
+    // Check actual data type: loader's f16_to_f32 CPU path already decodes F16 into Float32Array,
+    // so dtype='f16' does not reliably indicate raw F16 bytes. Only Uint16Array needs per-element decoding.
+    const isF16Cpu = cpuEmbeddings instanceof Uint16Array;
     if (!transpose) {
       for (let t = 0; t < numTokens; t++) {
         const tokenId =  (tokenIdArray)[t];
         const srcOffset = tokenId * hiddenSize;
-        output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
+        if (isF16Cpu) {
+          for (let h = 0; h < hiddenSize; h++) {
+            output[t * hiddenSize + h] = f16ToF32(cpuEmbeddings[srcOffset + h]);
+          }
+        } else {
+          output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
+        }
       }
     } else {
       for (let t = 0; t < numTokens; t++) {
         const tokenId =  (tokenIdArray)[t];
         const dstOffset = t * hiddenSize;
         for (let h = 0; h < hiddenSize; h++) {
-          output[dstOffset + h] = cpuEmbeddings[h * vocabSize + tokenId];
+          const raw = cpuEmbeddings[h * vocabSize + tokenId];
+          output[dstOffset + h] = isF16Cpu ? f16ToF32(raw) : raw;
         }
       }
     }

package/src/inference/pipelines/text/execution-plan.js CHANGED Viewed

@@ -58,10 +58,11 @@ function resolveFallbackActivationDtype(primaryActivationDtype) {
 function resolveFallbackKernelPath(primaryKernelPath) {
   const primaryKernelPathId = primaryKernelPath?.id ?? null;
   if (!primaryKernelPathId) {
-    throw new Error(
-      '[ExecutionPlan] F16 finiteness fallback requires a primary kernel path with a stable id. ' +
-      'Add a registered kernelPath id and a finiteness fallback rule.'
-    );
+    return {
+      kernelPath: null,
+      kernelPathId: null,
+      kernelPathSource: 'none',
+    };
   }
   const explicitFallbackKernelPathId = typeof primaryKernelPath?.finitenessFallbackKernelPathId === 'string'

package/src/inference/pipelines/text/execution-v0-runtime-builders.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import { selectRuleValue } from '../../../rules/rule-registry.js';
 import { cloneJson, isPhaseMatch, normalizeDtype, requireSessionActivationDtype, stepHasLayer } from './execution-v0-contract-helpers.js';
-const PIPELINE_COMPATIBLE_OPS = new Set([
+export const PIPELINE_COMPATIBLE_OPS = new Set([
   'save',
   'load',
   'conv',
@@ -191,8 +191,15 @@ export function buildLayerPipelineFromExecution(steps) {
   if (layerSectionSteps.length === 0) {
     return null;
   }
-  if (layerSectionSteps.some((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op))) {
-    return null;
+  const incompatibleOps = [
+    ...new Set(
+      layerSectionSteps
+        .filter((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op))
+        .map((step) => step.op)
+    ),
+  ];
+  if (incompatibleOps.length > 0) {
+    return { incompatibleOps };
   }
   const layerSteps = layerSectionSteps

package/src/inference/pipelines/text/execution-v0.js CHANGED Viewed

@@ -31,6 +31,7 @@ import {
   buildModelRuntimeOverrides,
   buildSessionRuntimePatch,
   resolveFinitenessFallbackKernelPathId,
+  PIPELINE_COMPATIBLE_OPS,
 } from './execution-v0-runtime-builders.js';
 export function hasExecutionV0(manifestInference) {
@@ -152,7 +153,17 @@ export function compileExecutionV0(options = {}) {
     numLayers,
     finitenessFallbackKernelPathId
   );
-  const layerPipeline = buildLayerPipelineFromExecution(resolvedSteps);
+  const layerPipelineResult = buildLayerPipelineFromExecution(resolvedSteps);
+  if (layerPipelineResult?.incompatibleOps && !kernelPath) {
+    throw new Error(
+      `[ExecutionV0] manifest.inference.execution.steps contains layer ops that are not ` +
+      `compatible with the JS layer pipeline and no inline kernelPath was built to cover execution. ` +
+      `Unsupported ops: ${layerPipelineResult.incompatibleOps.join(', ')}. ` +
+      `Either add explicit kernel references to each step (for inline-kernel execution) ` +
+      `or restrict layer ops to: ${[...PIPELINE_COMPATIBLE_OPS].join(', ')}.`
+    );
+  }
+  const layerPipeline = layerPipelineResult?.incompatibleOps ? null : layerPipelineResult;
   const sessionPatch = buildSessionRuntimePatch(resolvedSession);
   const modelOverrides = buildModelRuntimeOverrides(manifestInference);
   for (const [path, source] of sessionSourceByPath.entries()) {

package/src/inference/pipelines/text/generator-helpers.js CHANGED Viewed

@@ -111,6 +111,7 @@ export function buildLayerContext(state, recorder, isDecodeMode, debugLayers, de
     ropeLocalCos: state.ropeLocalCos,
     ropeLocalSin: state.ropeLocalSin,
     linearAttentionRuntime: state.linearAttentionRuntime,
+    convLayerStates: state.convLayerStates,
     weightConfig: getWeightBufferConfig(state),
     debugFlags: state.debugFlags,
     debugProbes: state.runtimeConfig.shared.debug.probes,