@simulatte/doppler 0.1.7 → 0.1.8

Files changed (88)
  1. package/CHANGELOG.md +19 -0
  2. package/package.json +21 -36
  3. package/src/browser/browser-converter.js +5 -0
  4. package/src/client/doppler-registry.json +1 -17
  5. package/src/config/kernel-path-loader.d.ts +5 -0
  6. package/src/config/kernel-path-loader.js +13 -0
  7. package/src/config/kernels/registry.json +74 -0
  8. package/src/config/loader.js +3 -0
  9. package/src/config/merge-contract-check.js +7 -0
  10. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  11. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  12. package/src/config/presets/kernel-paths/registry.json +14 -0
  13. package/src/config/presets/models/gemma2.json +2 -1
  14. package/src/config/presets/models/gemma3.json +2 -0
  15. package/src/config/presets/models/qwen3.json +4 -3
  16. package/src/config/presets/models/qwen3_5.json +16 -0
  17. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  18. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  19. package/src/config/schema/conversion.schema.d.ts +1 -0
  20. package/src/config/schema/manifest.schema.d.ts +1 -1
  21. package/src/config/schema/manifest.schema.js +1 -1
  22. package/src/config/schema/storage.schema.js +1 -1
  23. package/src/converter/conversion-plan.js +10 -2
  24. package/src/converter/core.js +2 -0
  25. package/src/converter/manifest-inference.js +12 -22
  26. package/src/converter/parsers/transformer.js +4 -0
  27. package/src/converter/quantization-info.js +5 -1
  28. package/src/converter/quantizer.js +19 -12
  29. package/src/converter/rope-config.js +8 -6
  30. package/src/converter/tokenizer-utils.d.ts +1 -0
  31. package/src/converter/tokenizer-utils.js +4 -1
  32. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  33. package/src/distribution/shard-delivery.js +6 -1
  34. package/src/formats/rdrr/parsing.d.ts +4 -0
  35. package/src/formats/rdrr/parsing.js +14 -1
  36. package/src/gpu/kernels/index.d.ts +8 -0
  37. package/src/gpu/kernels/index.js +6 -0
  38. package/src/gpu/kernels/matmul-selection.js +47 -4
  39. package/src/gpu/kernels/matmul.d.ts +2 -0
  40. package/src/gpu/kernels/matmul.js +1 -1
  41. package/src/gpu/kernels/rmsnorm.js +9 -2
  42. package/src/gpu/kernels/split_qg.d.ts +50 -0
  43. package/src/gpu/kernels/split_qg.js +46 -0
  44. package/src/gpu/kernels/split_qg.wgsl +58 -0
  45. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  46. package/src/gpu/weight-buffer.d.ts +1 -1
  47. package/src/gpu/weight-buffer.js +1 -1
  48. package/src/inference/browser-harness.d.ts +2 -0
  49. package/src/inference/browser-harness.js +20 -1
  50. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  51. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
  52. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  53. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  54. package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
  55. package/src/inference/pipelines/text/attention/projections.js +41 -11
  56. package/src/inference/pipelines/text/attention/record.js +15 -6
  57. package/src/inference/pipelines/text/attention/run.js +50 -6
  58. package/src/inference/pipelines/text/config.js +14 -0
  59. package/src/inference/pipelines/text/execution-plan.js +5 -4
  60. package/src/inference/pipelines/text/generator-runtime.js +5 -0
  61. package/src/inference/pipelines/text/generator-steps.d.ts +6 -0
  62. package/src/inference/pipelines/text/generator-steps.js +43 -15
  63. package/src/inference/pipelines/text/generator.js +50 -17
  64. package/src/inference/pipelines/text/init.d.ts +13 -0
  65. package/src/inference/pipelines/text/init.js +16 -5
  66. package/src/inference/pipelines/text/layer.js +1 -0
  67. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  68. package/src/inference/pipelines/text/linear-attention.js +33 -3
  69. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  70. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  71. package/src/inference/pipelines/text/logits/index.js +3 -1
  72. package/src/inference/pipelines/text/model-load.js +3 -0
  73. package/src/inference/pipelines/text/sampling.js +52 -6
  74. package/src/inference/test-harness.js +2 -2
  75. package/src/loader/final-weights-loader.js +2 -0
  76. package/src/loader/shard-cache.js +3 -2
  77. package/src/loader/tensors/tensor-loader.js +6 -1
  78. package/src/rules/inference/dtype.rules.json +5 -0
  79. package/src/rules/inference/kernel-path.rules.json +2 -2
  80. package/src/rules/kernels/split-qg.rules.json +6 -0
  81. package/src/rules/rule-registry.js +2 -0
  82. package/src/storage/downloader.js +2 -1
  83. package/src/storage/shard-manager.js +4 -3
  84. package/src/tooling/conversion-config-materializer.js +3 -5
  85. package/src/tooling/node-converter.js +3 -0
  86. package/src/tooling/node-source-runtime.js +36 -0
  87. package/src/types/model.d.ts +5 -0
  88. package/tools/doppler-cli.js +6 -1
@@ -0,0 +1,62 @@
+ // AUTO-GENERATED from src/gpu/kernels/split_qg.wgsl.
+ // Edit the source kernel and tools/configs/wgsl-variants.js, then run `npm run kernels:generate`.
+ // split_qg_f16.wgsl
+
+ /**
+  * De-interleave Q and Gate projections from q_proj output for attentionOutputGate models (f16).
+  *
+  * Models like Qwen 3.5 store q_proj weights with interleaved head layout:
+  *   rows [h*headDim*2 : h*headDim*2+headDim]     = Q for head h
+  *   rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
+  *
+  * A single full matmul over all 2*qSize rows produces interleaved output:
+  *   input[token, h*headDim*2 : h*headDim*2+headDim]     = Q head h
+  *   input[token, h*headDim*2+headDim : (h+1)*headDim*2] = Gate head h
+  *
+  * This kernel separates them into contiguous Q and G outputs:
+  *   Q[token, h*headDim + dim] = input[token, h*headDim*2 + dim]
+  *   G[token, h*headDim + dim] = input[token, h*headDim*2 + headDim + dim]
+  *
+  * Input layout (row-major):    [numTokens, numHeads * headDim * 2]
+  * Output Q layout (row-major): [numTokens, numHeads * headDim]
+  * Output G layout (row-major): [numTokens, numHeads * headDim]
+  */
+
+ enable f16;
+
+ struct Params {
+   num_tokens: u32,
+   num_heads: u32,
+   head_dim: u32,
+   _pad: u32,
+ }
+
+ override WORKGROUP_SIZE: u32 = 256u;
+
+ @group(0) @binding(0) var<uniform> params: Params;
+ @group(0) @binding(1) var<storage, read> input: array<f16>;
+ @group(0) @binding(2) var<storage, read_write> Q: array<f16>;
+ @group(0) @binding(3) var<storage, read_write> G: array<f16>;
+
+ @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+   let idx = gid.x;
+   let q_size = params.num_heads * params.head_dim;
+   let total_elements = params.num_tokens * q_size;
+
+   if (idx >= total_elements) {
+     return;
+   }
+
+   let token = idx / q_size;
+   let elem = idx % q_size;
+   let head = elem / params.head_dim;
+   let dim = elem % params.head_dim;
+
+   // Input is interleaved per head: [Q_h (headDim elems), G_h (headDim elems)]
+   let src_q = token * (q_size * 2u) + head * (params.head_dim * 2u) + dim;
+   let src_g = src_q + params.head_dim;
+
+   Q[idx] = input[src_q];
+   G[idx] = input[src_g];
+ }
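For reference, the same de-interleave written as plain JavaScript on the CPU; a sketch to make the index math above concrete (illustrative, not part of the package):

// CPU reference for the split_qg index math.
// input is row-major [numTokens, numHeads * headDim * 2] with Q/Gate interleaved per head;
// returns contiguous Q and G of shape [numTokens, numHeads * headDim].
function splitQGReference(input, numTokens, numHeads, headDim) {
  const qSize = numHeads * headDim;
  const Q = new Float32Array(numTokens * qSize);
  const G = new Float32Array(numTokens * qSize);
  for (let token = 0; token < numTokens; token++) {
    for (let head = 0; head < numHeads; head++) {
      for (let dim = 0; dim < headDim; dim++) {
        const dst = token * qSize + head * headDim + dim;
        const srcQ = token * qSize * 2 + head * headDim * 2 + dim;
        Q[dst] = input[srcQ];
        G[dst] = input[srcQ + headDim]; // the gate half follows the Q half within each head
      }
    }
  }
  return { Q, G };
}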
@@ -110,6 +110,6 @@ export function getBuffer(weight: GPUBuffer | WeightBuffer | TensorLike): GPUBuf
  export function getLayout(weight: GPUBuffer | WeightBuffer | TensorLike): WeightLayout | null;

  /**
-  * Get dtype from WeightBuffer, or null for raw GPUBuffer.
+  * Get dtype from WeightBuffer, tagged raw GPUBuffer, or TensorLike.
   */
  export function getWeightDtype(weight: GPUBuffer | WeightBuffer | TensorLike): WeightDtype | TensorLike['dtype'] | null;
@@ -114,5 +114,5 @@ export function getLayout(weight) {
  export function getWeightDtype(weight) {
    if (isWeightBuffer(weight)) return weight.dtype;
    if (isTensorLike(weight)) return weight.dtype;
-   return null;
+   return getBufferDtype(weight);
  }
@@ -9,6 +9,7 @@ import type { InferencePipeline } from './pipelines/text.js';
  import type { DiffusionPipeline } from './pipelines/diffusion/pipeline.js';
  import type { EnergyPipeline } from './pipelines/energy/pipeline.js';
  import type { SavedReportInfo, SaveReportOptions } from '../storage/reports.js';
+ import type { DebugSnapshot } from '../debug/history.js';

  export interface BrowserHarnessOptions extends InferenceHarnessOptions {
    modelUrl: string;
@@ -143,6 +144,7 @@ export interface BrowserSuiteResult extends SuiteSummary {
    output?: string | DiffusionOutput | null;
    deviceInfo?: Record<string, unknown> | null;
    memoryStats?: ReturnType<InferencePipeline['getMemoryStats']> | null;
+   debugSnapshot?: DebugSnapshot | null;
    pipeline?: InferencePipeline | DiffusionPipeline | EnergyPipeline | null;
    report: Record<string, unknown>;
    reportInfo: SavedReportInfo;
@@ -2,6 +2,7 @@
  import { initializeInference } from './test-harness.js';
  import { saveReport } from '../storage/reports.js';
  import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
+ import { clearLogHistory, getDebugSnapshot } from '../debug/history.js';
  import { computeSampleStats } from '../debug/stats.js';
  import {
    setActiveKernelPath,
@@ -846,15 +847,32 @@ async function dispatchBrowserSuite(suite, options) {
    return null;
  }

+ function shouldCaptureDebugSnapshot(suite, runtimeConfig) {
+   const debug = runtimeConfig?.shared?.debug ?? {};
+   const logLevel = String(debug.logLevel?.defaultLogLevel ?? '').toLowerCase();
+   return suite === 'debug'
+     || debug.trace?.enabled === true
+     || debug.pipeline?.enabled === true
+     || (Array.isArray(debug.probes) && debug.probes.length > 0)
+     || debug.profiler?.enabled === true
+     || logLevel === 'debug'
+     || logLevel === 'verbose';
+ }
+
  export async function runBrowserSuite(options = {}) {
    return runWithRuntimeIsolationForSuite(async () => {
      const suiteTimestamp = resolveReportTimestamp(options.timestamp, 'runBrowserSuite timestamp');
      const suiteContext = resolveSuiteContext(options);
      const suite = normalizeSuite(options.suite, suiteContext);
+     const captureDebugSnapshot = shouldCaptureDebugSnapshot(suite, getRuntimeConfig());
+     if (captureDebugSnapshot) {
+       clearLogHistory();
+     }
      const suiteResult = await dispatchBrowserSuite(suite, options);
      if (!suiteResult) {
        throw createUnsupportedSuiteError(suite, suiteContext);
      }
+     const debugSnapshot = captureDebugSnapshot ? getDebugSnapshot() : null;

      if (suite === 'bench' && suiteResult?.metrics?.workloadType === 'training') {
        const trainingReport = suiteResult?.metrics?.trainingMetricsReport;
@@ -886,6 +904,7 @@ export async function runBrowserSuite(options = {}) {
      metrics: suiteResult.metrics ?? null,
      output: reportOutput,
      memory: suiteResult.memoryStats ?? null,
+     debugSnapshot,
      ...options.report,
    };
    if (ulArtifacts.length > 0 || distillArtifacts.length > 0 || checkpointResumeTimeline.length > 0) {
@@ -907,7 +926,7 @@ export async function runBrowserSuite(options = {}) {
      report.timestamp = suiteTimestamp;
    }
    const reportInfo = await saveReport(modelId, report, { timestamp: report.timestamp });
-   return { ...suiteResult, report, reportInfo };
+   return { ...suiteResult, debugSnapshot, report, reportInfo };
  });
}

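shouldCaptureDebugSnapshot keys entirely off the shared debug config and the suite name; a quick check of when capture triggers (config shapes inferred from the function above, probe entries illustrative):

// Evaluating shouldCaptureDebugSnapshot against sample runtime configs.
const quiet = { shared: { debug: { logLevel: { defaultLogLevel: 'info' } } } };
const probed = { shared: { debug: { probes: ['q_proj'] } } };
const verbose = { shared: { debug: { logLevel: { defaultLogLevel: 'verbose' } } } };

shouldCaptureDebugSnapshot('bench', quiet);   // false: nothing debug-related is enabled
shouldCaptureDebugSnapshot('debug', quiet);   // true:  the 'debug' suite always captures
shouldCaptureDebugSnapshot('bench', probed);  // true:  non-empty probes array
shouldCaptureDebugSnapshot('bench', verbose); // true:  verbose log level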
@@ -89,6 +89,9 @@ export function normalizeDiffusionMatmulLocationDtype(dtype) {
    return normalized;
  }

+ // Artifact-derived dtype inference: determines actual storage dtype from buffer byte size.
+ // This is NOT a config-bypass — it reads physical buffer dimensions (artifact-derived config),
+ // which is a valid merge layer per the config merge contract.
  export function inferDiffusionMatmulDtypeFromBuffer(weight, N, K, preferred) {
    const buffer = getBuffer(weight);
    if (!buffer || !Number.isFinite(N) || !Number.isFinite(K)) return preferred;
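The rest of the function is elided here, but the comment implies the dtype is inferred from physical buffer size; a minimal sketch of that idea under the common f16/f32 assumption (illustrative, not the package's actual implementation):

// Sketch: infer storage dtype from buffer byte size for an [N, K] weight.
// f16 storage occupies N*K*2 bytes; f32 storage occupies N*K*4 bytes.
function sketchInferDtypeFromByteLength(byteLength, N, K, preferred) {
  const elements = N * K;
  if (byteLength === elements * 2) return 'f16';
  if (byteLength === elements * 4) return 'f32';
  return preferred; // size matches neither layout; keep the configured preference
}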
@@ -45,6 +45,8 @@ import { processLayerGPU } from '../text/layer.js';

  const QUICK_GELU_ALPHA = 1.702;
  const SUPPORTED_CLIP_HIDDEN_ACTIVATIONS = new Set(['gelu', 'quick_gelu']);
+ // Standard CLIP hidden activation per OpenAI CLIP specification.
+ const DEFAULT_CLIP_HIDDEN_ACT = 'gelu';

  function padTokens(tokens, maxLength, padTokenId) {
    if (!Number.isFinite(maxLength) || maxLength <= 0) {
@@ -100,11 +102,15 @@ function createVectorTensor(device, data, dtype, label) {
    return createTensor(buffer, dtype, [1, length], label);
  }

+ // Conservative fallback dtype for diffusion bias tensors when no dtype
+ // metadata is available. F32 avoids precision loss in bias additions.
+ const DEFAULT_BIAS_DTYPE = 'f32';
+
  function resolveBiasDtype(weight, weightsEntry, key) {
    if (weight && weight.dtype) return weight.dtype;
    const locationDtype = weightsEntry?.dtypes?.get(key);
    const mapped = normalizeDiffusionLocationDtype(locationDtype);
-   return mapped || 'f32';
+   return mapped || DEFAULT_BIAS_DTYPE;
  }

  function createBiasTensorWithDtype(weight, weightsEntry, key, size, label) {
@@ -145,7 +151,7 @@ function createKernelOps(recorder) {
  }

  function resolveClipHiddenActivation(config) {
-   const hiddenAct = config?.hidden_act ?? 'gelu';
+   const hiddenAct = config?.hidden_act ?? DEFAULT_CLIP_HIDDEN_ACT;
    if (!SUPPORTED_CLIP_HIDDEN_ACTIVATIONS.has(hiddenAct)) {
      throw new Error(
        `Unsupported CLIP hidden_act "${hiddenAct}". ` +
@@ -0,0 +1,12 @@
+ import type { Tensor } from '../../../../gpu/tensor.js';
+
+ export interface AttentionProjectionInputResult {
+   oProjInput: Tensor;
+   oProjInputTemp: Tensor | null;
+ }
+
+ export function prepareAttentionProjectionInput(
+   attnForProjection: Tensor,
+   matmulOutputDtype: string,
+   castToF16: (tensor: Tensor) => Promise<Tensor>
+ ): Promise<AttentionProjectionInputResult>;
@@ -0,0 +1,8 @@
+ export async function prepareAttentionProjectionInput(attnForProjection, matmulOutputDtype, castToF16) {
+   if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
+     const casted = await castToF16(attnForProjection);
+     return { oProjInput: casted, oProjInputTemp: casted };
+   }
+
+   return { oProjInput: attnForProjection, oProjInputTemp: null };
+ }
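A usage sketch matching the call sites further down, with stand-ins for the tensor and the cast helper (names illustrative):

// prepareAttentionProjectionInput with a stubbed cast.
const castStub = async (tensor) => ({ ...tensor, dtype: 'f16' }); // stands in for castF32ToF16
const attn = { dtype: 'f32' };

const { oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(attn, 'f16', castStub);
// oProjInput.dtype === 'f16' and oProjInputTemp === oProjInput, so the caller
// releases the casted temporary after the o_proj matmul. With matmulOutputDtype
// 'f32' (or an already-f16 input), the tensor passes through and oProjInputTemp
// is null, so there is nothing extra to release.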
@@ -46,7 +46,16 @@ export function recordAttentionInputs(
    info: AttentionInputInfo | null | undefined
  ): void;

- export function resolveAttentionProjectionOutputDtype(attentionInputDtype: string): 'f16' | 'f32' | string;
+ export function shouldForceF32AttentionProjectionForRoPE(options: {
+   attentionInputDtype: string;
+   headDim: number;
+   rotaryDim?: number;
+   interleaved?: boolean;
+ }): boolean;
+ export function resolveAttentionProjectionOutputDtype(
+   attentionInputDtype: string,
+   options?: { forceF32?: boolean }
+ ): 'f16' | 'f32' | string;
  export function resolveProjectionSliceOffsetBytes(
    weightBuffer: WeightBuffer | Tensor | GPUBuffer | null | undefined,
    outputRows: number,
@@ -5,6 +5,8 @@ import {
    recordMatmul,
    runSplitQKV,
    recordSplitQKV,
+   runSplitQG,
+   recordSplitQG,
    runRMSNorm,
    recordRMSNorm,
  } from '../../../../gpu/kernel-selector.js';
@@ -28,6 +30,13 @@ function getSplitRunner(recorder) {
    return (qkvTensor, options) => recordSplitQKV(recorder, qkvTensor, options);
  }

+ function getSplitQGRunner(recorder) {
+   if (!recorder) {
+     return (qgTensor, options) => runSplitQG(qgTensor, options);
+   }
+   return (qgTensor, options) => recordSplitQG(recorder, qgTensor, options);
+ }
+
  function getRmsNormRunner(recorder) {
    if (!recorder) {
      return (input, weight, eps, options) => runRMSNorm(input, weight, eps, options);
@@ -201,13 +210,17 @@ async function projectQueryWithOptionalGate({
    return { qTensor, qGateTensor: null };
  }

+ // q_proj weights are stored with interleaved head layout: for head h,
+ // rows [h*headDim*2 : h*headDim*2+headDim] = Q, rows [h*headDim*2+headDim : (h+1)*headDim*2] = gate.
+ // Compute the full 2*qSize matmul, then de-interleave into separate Q and gate tensors.
  const runMatmulForMode = getMatmulRunner(recorder);
+ const runSplitQGForMode = getSplitQGRunner(recorder);
  const qWeightBuffer = getWeightBuffer(qWeight, 'q_proj');
- const gateOffset = resolveProjectionSliceOffsetBytes(qWeightBuffer, qSize, hiddenSize);
+ let fullQGTensor = null;
  let qTensor = null;
  let qGateTensor = null;
  try {
-   qTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize, hiddenSize, {
+   fullQGTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize * 2, hiddenSize, {
      transposeB: 'auto',
      role: 'q_proj',
      layerIdx,
@@ -215,15 +228,19 @@ async function projectQueryWithOptionalGate({
      outputDtype: matmulOutputDtype,
    });

-   qGateTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize, hiddenSize, {
-     transposeB: 'auto',
-     role: 'q_proj_gate',
-     layerIdx,
-     kernelPath,
-     bOffset: gateOffset,
-     outputDtype: matmulOutputDtype,
+   const split = await runSplitQGForMode(fullQGTensor, {
+     numTokens,
+     numHeads,
+     headDim,
    });
+   releaseTemporary(fullQGTensor.buffer);
+   fullQGTensor = null;
+   qTensor = split.Q;
+   qGateTensor = split.G;
  } catch (error) {
+   if (fullQGTensor) {
+     releaseTemporary(fullQGTensor.buffer);
+   }
    if (qTensor) {
      releaseTemporary(qTensor.buffer);
    }
@@ -277,9 +294,22 @@ export function recordAttentionInputs(state, info) {
    state.stats.attentionInputs.push(info);
  }

- export function resolveAttentionProjectionOutputDtype(attentionInputDtype) {
+ export function shouldForceF32AttentionProjectionForRoPE({
+   attentionInputDtype,
+   headDim,
+   rotaryDim = headDim,
+   interleaved = false,
+ }) {
+   return attentionInputDtype === 'f16'
+     && Number.isFinite(headDim)
+     && Number.isFinite(rotaryDim)
+     && (rotaryDim !== headDim || interleaved === true);
+ }
+
+ export function resolveAttentionProjectionOutputDtype(attentionInputDtype, options = {}) {
    const useF16Activations = attentionInputDtype === 'f16';
-   return selectRuleValue('shared', 'dtype', 'f16OrFallbackByFlag', {
+   return selectRuleValue('inference', 'dtype', 'attentionProjectionOutputDtype', {
+     forceF32: options.forceF32 === true,
      useF16: useF16Activations,
      fallback: attentionInputDtype,
    });
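The predicate fires only for f16 activations whose RoPE variant is precision-sensitive: a partial rotary dimension or interleaved rotation. A few worked calls (dimensions illustrative):

shouldForceF32AttentionProjectionForRoPE({ attentionInputDtype: 'f16', headDim: 128 });
// => false: rotaryDim defaults to headDim and interleaved defaults to false
shouldForceF32AttentionProjectionForRoPE({ attentionInputDtype: 'f16', headDim: 128, rotaryDim: 64 });
// => true: partial rotary dimension
shouldForceF32AttentionProjectionForRoPE({ attentionInputDtype: 'f16', headDim: 128, interleaved: true });
// => true: interleaved RoPE
shouldForceF32AttentionProjectionForRoPE({ attentionInputDtype: 'f32', headDim: 128, rotaryDim: 64 });
// => false: activations are already f32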
@@ -24,10 +24,12 @@ import { selectRuleValue } from '../../../../rules/rule-registry.js';
  import { SlidingWindowKVCache } from '../../../kv-cache.js';
  import {
    recordAttentionInputs,
+   shouldForceF32AttentionProjectionForRoPE,
    resolveAttentionProjectionOutputDtype,
    projectAttentionQKV,
    applyAttentionQKNorm,
  } from './projections.js';
+ import { prepareAttentionProjectionInput } from './output-projection.js';

  import { releaseOrTrack, shouldDebugLayer } from './types.js';

@@ -142,7 +144,14 @@ export async function recordLayerAttentionGPU(
  }

  // 2. Q/K/V projections
- const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype);
+ const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
+   forceF32: shouldForceF32AttentionProjectionForRoPE({
+     attentionInputDtype: desiredOutputDtype,
+     headDim,
+     rotaryDim: config.ropeRotaryDim,
+     interleaved: config.ropeInterleaved,
+   }),
+ });
  let usedFusedQKV = false;
  ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
    recorder,
@@ -535,14 +544,14 @@ export async function recordLayerAttentionGPU(
  let oProjInput = attnForProjection;
  oProjInputTemp = null;
  if (layerWeights.oProj && getWeightBuffer) {
+   ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
+     attnForProjection,
+     matmulOutputDtype,
+     (tensor) => recordCastF32ToF16(recorder, tensor)
+   ));
    const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
    const loraO = getLoRAModule(lora, layerIdx, 'o_proj');

-   if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
-     oProjInput = await recordCastF32ToF16(recorder, attnForProjection);
-     oProjInputTemp = oProjInput;
-   }
-
    // Use fused o_proj + residual for decode when possible
    // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
    const oProjDtype = getWeightDtype(oProjBuf);
@@ -28,10 +28,12 @@ import { runProbes } from '../probes.js';
  import { SlidingWindowKVCache } from '../../../kv-cache.js';
  import {
    recordAttentionInputs,
+   shouldForceF32AttentionProjectionForRoPE,
    resolveAttentionProjectionOutputDtype,
    projectAttentionQKV,
    applyAttentionQKNorm,
  } from './projections.js';
+ import { prepareAttentionProjectionInput } from './output-projection.js';

  import {
    shouldDebugLayer,
@@ -193,7 +195,14 @@ export async function runLayerAttentionGPU(
  }

  // 2. Q/K/V projections
- const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype);
+ const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
+   forceF32: shouldForceF32AttentionProjectionForRoPE({
+     attentionInputDtype: desiredOutputDtype,
+     headDim,
+     rotaryDim: config.ropeRotaryDim,
+     interleaved: config.ropeInterleaved,
+   }),
+ });
  let usedFusedQKV = false;
  ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
    recorder: null,
@@ -224,6 +233,27 @@ export async function runLayerAttentionGPU(
    await traceStep('matmul', `L${layerIdx}.k_proj`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
    await traceStep('matmul', `L${layerIdx}.v_proj`, layerIdx, vTensor.buffer, [numTokens, numKVHeads * headDim]);
  }
+ await runProbes('q_proj', qTensor.buffer, {
+   layerIdx,
+   numTokens,
+   hiddenSize: numHeads * headDim,
+   probes: state.debugProbes,
+   dtype: qTensor.dtype,
+ });
+ await runProbes('k_proj', kTensor.buffer, {
+   layerIdx,
+   numTokens,
+   hiddenSize: numKVHeads * headDim,
+   probes: state.debugProbes,
+   dtype: kTensor.dtype,
+ });
+ await runProbes('v_proj', vTensor.buffer, {
+   layerIdx,
+   numTokens,
+   hiddenSize: numKVHeads * headDim,
+   probes: state.debugProbes,
+   dtype: vTensor.dtype,
+ });

  // Kernel step debug: Q/K/V projections
  if (isKernelDebugEnabled(layerIdx)) {
@@ -331,6 +361,20 @@ export async function runLayerAttentionGPU(
      await traceStep('rope', `L${layerIdx}.k_rope`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
    }
  }
+ await runProbes('q_rope', qTensor.buffer, {
+   layerIdx,
+   numTokens,
+   hiddenSize: numHeads * headDim,
+   probes: state.debugProbes,
+   dtype: qTensor.dtype,
+ });
+ await runProbes('k_rope', kTensor.buffer, {
+   layerIdx,
+   numTokens,
+   hiddenSize: numKVHeads * headDim,
+   probes: state.debugProbes,
+   dtype: kTensor.dtype,
+ });
  if (isKernelDebugEnabled(layerIdx)) {
    logKernelStep('rope', { layerIdx, label: `startPos=${currentSeqLen}` });
    await dumpTokenVector(qTensor.buffer, 'Q_rope', {
@@ -723,14 +767,14 @@ export async function runLayerAttentionGPU(
  let oProjInput = attnForProjection;
  oProjInputTemp = null;
  if (layerWeights.oProj && getWeightBuffer) {
+   ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
+     attnForProjection,
+     matmulOutputDtype,
+     castF32ToF16
+   ));
    const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
    const loraO = getLoRAModule(lora, layerIdx, 'o_proj');

-   if (matmulOutputDtype === 'f16' && attnOutput.dtype !== 'f16') {
-     oProjInput = await castF32ToF16(attnOutput);
-     oProjInputTemp = oProjInput;
-   }
-
    // Use fused o_proj + residual for decode when possible
    // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
    const oProjDtype = getWeightDtype(oProjBuf);
@@ -482,6 +482,20 @@ export function toParsedConfigFromMerged(merged, manifest) {
  const queryPreAttnScalar = inf.attention.queryPreAttnScalar;
  const causalAttention = inf.attention.causal;

+ // Cross-field sanity: queryPreAttnScalar should typically equal headDim.
+ // A value of sqrt(headDim) indicates a known converter bug that produces
+ // attnScale = 1/sqrt(sqrt(headDim)) instead of the correct 1/sqrt(headDim).
+ if (queryPreAttnScalar != null && headDim != null
+     && queryPreAttnScalar !== headDim
+     && Math.abs(queryPreAttnScalar - Math.sqrt(headDim)) < 0.01) {
+   throw new Error(
+     `Model "${merged.modelId}": queryPreAttnScalar (${queryPreAttnScalar}) ` +
+     `equals sqrt(headDim) instead of headDim (${headDim}). ` +
+     `This is a known converter bug — the manifest must be regenerated ` +
+     `with the corrected converter.`
+   );
+ }
+
  // Get stop token IDs (cast to Manifest for compatibility)
  const stopTokenIds = getStopTokenIds(manifest);

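Concretely, with headDim = 256 the correct attnScale is 1/sqrt(256) = 0.0625; a buggy manifest carrying queryPreAttnScalar = sqrt(256) = 16 instead yields 1/sqrt(16) = 0.25, four times too large. The guard's arithmetic (headDim illustrative):

const headDim = 256;
Math.abs(16 - Math.sqrt(headDim)) < 0.01;   // true:  queryPreAttnScalar = 16 trips the guard
Math.abs(256 - Math.sqrt(headDim)) < 0.01;  // false: queryPreAttnScalar = headDim passes
1 / Math.sqrt(256);                         // 0.0625, the intended attention scale
1 / Math.sqrt(16);                          // 0.25, what the buggy manifest would produce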
@@ -58,10 +58,11 @@ function resolveFallbackActivationDtype(primaryActivationDtype) {
  function resolveFallbackKernelPath(primaryKernelPath) {
    const primaryKernelPathId = primaryKernelPath?.id ?? null;
    if (!primaryKernelPathId) {
-     throw new Error(
-       '[ExecutionPlan] F16 finiteness fallback requires a primary kernel path with a stable id. ' +
-       'Add a registered kernelPath id and a finiteness fallback rule.'
-     );
+     return {
+       kernelPath: null,
+       kernelPathId: null,
+       kernelPathSource: 'none',
+     };
    }

    const explicitFallbackKernelPathId = typeof primaryKernelPath?.finitenessFallbackKernelPathId === 'string'
@@ -213,6 +213,10 @@ export function resolvePrefillEmbeddingOptions(state, options = {}) {
    ? state.manifest.modelType.toLowerCase()
    : '';
  const generationDefaults = state.runtimeConfig.inference.generation;
+ // Embedding models default to 'mean' pooling — this is a model-category behavior,
+ // not a model-family identity check. Ideally embedding model presets would set
+ // generation.embeddingMode='mean' in their runtime config; the modelType fallback
+ // provides this default for manifests that predate runtime-preset embedding mode.
  const defaultEmbeddingMode = modelType === 'embedding'
    ? 'mean'
    : generationDefaults.embeddingMode;
@@ -226,6 +230,7 @@ export function resolveAdvanceEmbeddingMode(state, options = {}) {
  const modelType = typeof state.manifest?.modelType === 'string'
    ? state.manifest.modelType.toLowerCase()
    : '';
+ // See resolvePrefillEmbeddingOptions for embedding-model pooling rationale.
  const configuredMode = state.runtimeConfig.inference.generation.embeddingMode;
  return resolveConfiguredValue(
    options.embeddingMode,
@@ -19,6 +19,12 @@ export declare function resolveBatchStop(
    eosTokenId: number | undefined | null
  ): number;

+ export declare function findInvalidGeneratedToken(
+   tokens: number[],
+   vocabSize: number,
+   padTokenId?: number | null
+ ): { index: number; tokenId: number } | null;
+
  export interface SampledTokenStagingBuffer {
    mapAsync(mode: number): Promise<void>;
    getMappedRange(): ArrayBufferLike;
@@ -113,6 +113,20 @@ export function resolveBatchStop(tokens, stopFlags, stopTokenIds, eosTokenId) {
    return actualCount;
  }

+ export function findInvalidGeneratedToken(tokens, vocabSize, padTokenId = null) {
+   for (let i = 0; i < tokens.length; i++) {
+     const tokenId = tokens[i];
+     const isInvalid = !Number.isFinite(tokenId)
+       || tokenId < 0
+       || tokenId >= vocabSize
+       || (padTokenId != null ? tokenId === padTokenId : tokenId === 0);
+     if (isInvalid) {
+       return { index: i, tokenId };
+     }
+   }
+   return null;
+ }
+
  export async function readSampledTokenFromStagingBuffer(stagingBuffer, options = {}) {
    const ownsStagingBuffer = options.ownsStagingBuffer === true;
    const hasFinitenessBuffer = options.hasFinitenessBuffer === true;
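Worked calls against findInvalidGeneratedToken (token IDs illustrative):

findInvalidGeneratedToken([12, 873, 44], 1000);     // null: all tokens valid
findInvalidGeneratedToken([12, 1000, 44], 1000);    // { index: 1, tokenId: 1000 }: >= vocabSize
findInvalidGeneratedToken([12, 0, 44], 1000);       // { index: 1, tokenId: 0 }: 0 treated as pad when padTokenId is null
findInvalidGeneratedToken([12, 0, 44], 1000, 7);    // null: explicit padTokenId makes 0 a legal token
findInvalidGeneratedToken([12, 7, 44], 1000, 7);    // { index: 1, tokenId: 7 }: matches padTokenId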
@@ -240,11 +254,9 @@ async function runDecodeLayers(state, tokenId, opts, helpers) {
    throw new Error('Embed buffer not found or not a supported buffer type');
  }
  const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
- const embedDtype = isWeightBuffer(embedBufferRaw)
-   ? getWeightDtype(embedBufferRaw)
-   : isCpuWeightBuffer(embedBufferRaw)
-     ? embedBufferRaw.dtype
-     : null;
+ const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+   ? embedBufferRaw.dtype
+   : getWeightDtype(embedBufferRaw);
  const activationDtype = getEffectiveActivationDtype(state, opts);

  const embedTensor = await embed([tokenId], embedBuffer, {
@@ -326,11 +338,9 @@ export async function decodeStep(state, currentIds, opts, helpers) {
    throw new Error('Embed buffer not found or not a supported buffer type');
  }
  const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
- const embedDtype = isWeightBuffer(embedBufferRaw)
-   ? getWeightDtype(embedBufferRaw)
-   : isCpuWeightBuffer(embedBufferRaw)
-     ? embedBufferRaw.dtype
-     : null;
+ const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+   ? embedBufferRaw.dtype
+   : getWeightDtype(embedBufferRaw);
  const activationDtype = getEffectiveActivationDtype(state, opts);
  const activationBytes = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: activationDtype });

@@ -636,11 +646,21 @@ export async function decodeStep(state, currentIds, opts, helpers) {
    });

    releaseBuffer(logitsBuffer);
-   if (!context.decodeBuffers?.ownsBuffer(hiddenStates)) {
-     releaseBuffer(hiddenStates);
+   const invalidGpuToken = nextToken >= config.vocabSize
+     || (padTokenId != null && nextToken === padTokenId)
+     || (padTokenId == null && nextToken === 0);
+   if (!invalidGpuToken) {
+     if (!context.decodeBuffers?.ownsBuffer(hiddenStates)) {
+       releaseBuffer(hiddenStates);
+     }
+     state.currentSeqLen++;
+     return nextToken;
    }
-   state.currentSeqLen++;
-   return nextToken;
+   state.disableFusedDecode = true;
+   log.warn(
+     'Decode',
+     `GPU sampling produced invalid token ${nextToken} (vocabSize=${config.vocabSize}, step=${state.decodeStepCount}); falling back to CPU sampling.`
+   );
  }
}

@@ -981,7 +1001,7 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
    throw new Error('Embed buffer not found or not a GPUBuffer/WeightBuffer');
  }
  const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
- const embedDtype = isWeightBuffer(embedBufferRaw) ? getWeightDtype(embedBufferRaw) : null;
+ const embedDtype = getWeightDtype(embedBufferRaw);
  const activationDtype = getEffectiveActivationDtype(state, opts);

  for (let i = 0; i < N; i++) {
@@ -1125,10 +1145,18 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,

  const actualCount = resolveBatchStop(tokens, stopFlags, stopTokenIds, eosToken);
  const generatedTokens = tokens.slice(0, actualCount);
+ const invalidToken = findInvalidGeneratedToken(generatedTokens, config.vocabSize, padTokenId);

  if (isInfinite) {
    throw new FinitenessError(`F16 bounds exceeded during batch generation${metadata}`);
  }
+ if (invalidToken) {
+   state.disableFusedDecode = true;
+   throw new Error(
+     `[Pipeline] Batch decode produced invalid token ${invalidToken.tokenId} ` +
+     `at batch index ${invalidToken.index} (vocabSize=${config.vocabSize}, padTokenId=${padTokenId ?? 'none'}).`
+   );
+ }

  if (opts.profile && recorder.isProfilingEnabled()) {
    const timings = await recorder.resolveProfileTimings();
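The inlined guard in decodeStep above checks a single freshly sampled token and omits the Number.isFinite and negative-range checks that findInvalidGeneratedToken applies to batches. Its edge cases as a standalone sketch (token IDs illustrative):

const isInvalidGpuToken = (nextToken, vocabSize, padTokenId) =>
  nextToken >= vocabSize
  || (padTokenId != null && nextToken === padTokenId)
  || (padTokenId == null && nextToken === 0);

isInvalidGpuToken(42, 32000, null);     // false: in-vocab, nonzero
isInvalidGpuToken(32000, 32000, null);  // true:  out of range
isInvalidGpuToken(0, 32000, null);      // true:  0 treated as pad when padTokenId is unknown
isInvalidGpuToken(0, 32000, 2);         // false: explicit padTokenId, so 0 is a legal token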