@simulatte/doppler 0.1.7 → 0.1.8

Files changed (88)
  1. package/CHANGELOG.md +19 -0
  2. package/package.json +21 -36
  3. package/src/browser/browser-converter.js +5 -0
  4. package/src/client/doppler-registry.json +1 -17
  5. package/src/config/kernel-path-loader.d.ts +5 -0
  6. package/src/config/kernel-path-loader.js +13 -0
  7. package/src/config/kernels/registry.json +74 -0
  8. package/src/config/loader.js +3 -0
  9. package/src/config/merge-contract-check.js +7 -0
  10. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  11. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  12. package/src/config/presets/kernel-paths/registry.json +14 -0
  13. package/src/config/presets/models/gemma2.json +2 -1
  14. package/src/config/presets/models/gemma3.json +2 -0
  15. package/src/config/presets/models/qwen3.json +4 -3
  16. package/src/config/presets/models/qwen3_5.json +16 -0
  17. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  18. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  19. package/src/config/schema/conversion.schema.d.ts +1 -0
  20. package/src/config/schema/manifest.schema.d.ts +1 -1
  21. package/src/config/schema/manifest.schema.js +1 -1
  22. package/src/config/schema/storage.schema.js +1 -1
  23. package/src/converter/conversion-plan.js +10 -2
  24. package/src/converter/core.js +2 -0
  25. package/src/converter/manifest-inference.js +12 -22
  26. package/src/converter/parsers/transformer.js +4 -0
  27. package/src/converter/quantization-info.js +5 -1
  28. package/src/converter/quantizer.js +19 -12
  29. package/src/converter/rope-config.js +8 -6
  30. package/src/converter/tokenizer-utils.d.ts +1 -0
  31. package/src/converter/tokenizer-utils.js +4 -1
  32. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  33. package/src/distribution/shard-delivery.js +6 -1
  34. package/src/formats/rdrr/parsing.d.ts +4 -0
  35. package/src/formats/rdrr/parsing.js +14 -1
  36. package/src/gpu/kernels/index.d.ts +8 -0
  37. package/src/gpu/kernels/index.js +6 -0
  38. package/src/gpu/kernels/matmul-selection.js +47 -4
  39. package/src/gpu/kernels/matmul.d.ts +2 -0
  40. package/src/gpu/kernels/matmul.js +1 -1
  41. package/src/gpu/kernels/rmsnorm.js +9 -2
  42. package/src/gpu/kernels/split_qg.d.ts +50 -0
  43. package/src/gpu/kernels/split_qg.js +46 -0
  44. package/src/gpu/kernels/split_qg.wgsl +58 -0
  45. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  46. package/src/gpu/weight-buffer.d.ts +1 -1
  47. package/src/gpu/weight-buffer.js +1 -1
  48. package/src/inference/browser-harness.d.ts +2 -0
  49. package/src/inference/browser-harness.js +20 -1
  50. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  51. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +8 -2
  52. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  53. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  54. package/src/inference/pipelines/text/attention/projections.d.ts +10 -1
  55. package/src/inference/pipelines/text/attention/projections.js +41 -11
  56. package/src/inference/pipelines/text/attention/record.js +15 -6
  57. package/src/inference/pipelines/text/attention/run.js +50 -6
  58. package/src/inference/pipelines/text/config.js +14 -0
  59. package/src/inference/pipelines/text/execution-plan.js +5 -4
  60. package/src/inference/pipelines/text/generator-runtime.js +5 -0
  61. package/src/inference/pipelines/text/generator-steps.d.ts +6 -0
  62. package/src/inference/pipelines/text/generator-steps.js +43 -15
  63. package/src/inference/pipelines/text/generator.js +50 -17
  64. package/src/inference/pipelines/text/init.d.ts +13 -0
  65. package/src/inference/pipelines/text/init.js +16 -5
  66. package/src/inference/pipelines/text/layer.js +1 -0
  67. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  68. package/src/inference/pipelines/text/linear-attention.js +33 -3
  69. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  70. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  71. package/src/inference/pipelines/text/logits/index.js +3 -1
  72. package/src/inference/pipelines/text/model-load.js +3 -0
  73. package/src/inference/pipelines/text/sampling.js +52 -6
  74. package/src/inference/test-harness.js +2 -2
  75. package/src/loader/final-weights-loader.js +2 -0
  76. package/src/loader/shard-cache.js +3 -2
  77. package/src/loader/tensors/tensor-loader.js +6 -1
  78. package/src/rules/inference/dtype.rules.json +5 -0
  79. package/src/rules/inference/kernel-path.rules.json +2 -2
  80. package/src/rules/kernels/split-qg.rules.json +6 -0
  81. package/src/rules/rule-registry.js +2 -0
  82. package/src/storage/downloader.js +2 -1
  83. package/src/storage/shard-manager.js +4 -3
  84. package/src/tooling/conversion-config-materializer.js +3 -5
  85. package/src/tooling/node-converter.js +3 -0
  86. package/src/tooling/node-source-runtime.js +36 -0
  87. package/src/types/model.d.ts +5 -0
  88. package/tools/doppler-cli.js +6 -1
@@ -0,0 +1,62 @@
+ // AUTO-GENERATED from src/gpu/kernels/split_qg.wgsl.
+ // Edit the source kernel and tools/configs/wgsl-variants.js, then run `npm run kernels:generate`.
+ // split_qg_f16.wgsl
+
+ /**
+  * De-interleave Q and Gate projections from q_proj output for attentionOutputGate models (f16).
+  *
+  * Models like Qwen 3.5 store q_proj weights with interleaved head layout:
+  *   rows [h*headDim*2 : h*headDim*2+headDim]     = Q for head h
+  *   rows [h*headDim*2+headDim : (h+1)*headDim*2] = Gate for head h
+  *
+  * A single full matmul over all 2*qSize rows produces interleaved output:
+  *   input[token, h*headDim*2 : h*headDim*2+headDim]     = Q head h
+  *   input[token, h*headDim*2+headDim : (h+1)*headDim*2] = Gate head h
+  *
+  * This kernel separates them into contiguous Q and G outputs:
+  *   Q[token, h*headDim + dim] = input[token, h*headDim*2 + dim]
+  *   G[token, h*headDim + dim] = input[token, h*headDim*2 + headDim + dim]
+  *
+  * Input layout (row-major):    [numTokens, numHeads * headDim * 2]
+  * Output Q layout (row-major): [numTokens, numHeads * headDim]
+  * Output G layout (row-major): [numTokens, numHeads * headDim]
+  */
+
+ enable f16;
+
+ struct Params {
+   num_tokens: u32,
+   num_heads: u32,
+   head_dim: u32,
+   _pad: u32,
+ }
+
+ override WORKGROUP_SIZE: u32 = 256u;
+
+ @group(0) @binding(0) var<uniform> params: Params;
+ @group(0) @binding(1) var<storage, read> input: array<f16>;
+ @group(0) @binding(2) var<storage, read_write> Q: array<f16>;
+ @group(0) @binding(3) var<storage, read_write> G: array<f16>;
+
+ @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+   let idx = gid.x;
+   let q_size = params.num_heads * params.head_dim;
+   let total_elements = params.num_tokens * q_size;
+
+   if (idx >= total_elements) {
+     return;
+   }
+
+   let token = idx / q_size;
+   let elem = idx % q_size;
+   let head = elem / params.head_dim;
+   let dim = elem % params.head_dim;
+
+   // Input is interleaved per head: [Q_h (headDim elems), G_h (headDim elems)]
+   let src_q = token * (q_size * 2u) + head * (params.head_dim * 2u) + dim;
+   let src_g = src_q + params.head_dim;
+
+   Q[idx] = input[src_q];
+   G[idx] = input[src_g];
+ }
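For reference, the same de-interleave written as plain JavaScript on the CPU; a sketch to make the index math above concrete (illustrative, not part of the package):

// CPU reference for the split_qg index math.
// input is row-major [numTokens, numHeads * headDim * 2] with Q/Gate interleaved per head;
// returns contiguous Q and G of shape [numTokens, numHeads * headDim].
function splitQGReference(input, numTokens, numHeads, headDim) {
  const qSize = numHeads * headDim;
  const Q = new Float32Array(numTokens * qSize);
  const G = new Float32Array(numTokens * qSize);
  for (let token = 0; token < numTokens; token++) {
    for (let head = 0; head < numHeads; head++) {
      for (let dim = 0; dim < headDim; dim++) {
        const dst = token * qSize + head * headDim + dim;
        const srcQ = token * qSize * 2 + head * headDim * 2 + dim;
        Q[dst] = input[srcQ];
        G[dst] = input[srcQ + headDim]; // the gate half follows the Q half within each head
      }
    }
  }
  return { Q, G };
}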
@@ -110,6 +110,6 @@ export function getBuffer(weight: GPUBuffer | WeightBuffer | TensorLike): GPUBuf
  export function getLayout(weight: GPUBuffer | WeightBuffer | TensorLike): WeightLayout | null;

  /**
-  * Get dtype from WeightBuffer, or null for raw GPUBuffer.
+  * Get dtype from WeightBuffer, tagged raw GPUBuffer, or TensorLike.
   */
  export function getWeightDtype(weight: GPUBuffer | WeightBuffer | TensorLike): WeightDtype | TensorLike['dtype'] | null;
@@ -114,5 +114,5 @@ export function getLayout(weight) {
  export function getWeightDtype(weight) {
    if (isWeightBuffer(weight)) return weight.dtype;
    if (isTensorLike(weight)) return weight.dtype;
-   return null;
+   return getBufferDtype(weight);
  }
@@ -9,6 +9,7 @@ import type { InferencePipeline } from './pipelines/text.js';
  import type { DiffusionPipeline } from './pipelines/diffusion/pipeline.js';
  import type { EnergyPipeline } from './pipelines/energy/pipeline.js';
  import type { SavedReportInfo, SaveReportOptions } from '../storage/reports.js';
+ import type { DebugSnapshot } from '../debug/history.js';

  export interface BrowserHarnessOptions extends InferenceHarnessOptions {
    modelUrl: string;
@@ -143,6 +144,7 @@ export interface BrowserSuiteResult extends SuiteSummary {
    output?: string | DiffusionOutput | null;
    deviceInfo?: Record<string, unknown> | null;
    memoryStats?: ReturnType<InferencePipeline['getMemoryStats']> | null;
+   debugSnapshot?: DebugSnapshot | null;
    pipeline?: InferencePipeline | DiffusionPipeline | EnergyPipeline | null;
    report: Record<string, unknown>;
    reportInfo: SavedReportInfo;
@@ -2,6 +2,7 @@
  import { initializeInference } from './test-harness.js';
  import { saveReport } from '../storage/reports.js';
  import { getRuntimeConfig, setRuntimeConfig } from '../config/runtime.js';
+ import { clearLogHistory, getDebugSnapshot } from '../debug/history.js';
  import { computeSampleStats } from '../debug/stats.js';
  import {
    setActiveKernelPath,
@@ -846,15 +847,32 @@ async function dispatchBrowserSuite(suite, options) {
    return null;
  }

+ function shouldCaptureDebugSnapshot(suite, runtimeConfig) {
+   const debug = runtimeConfig?.shared?.debug ?? {};
+   const logLevel = String(debug.logLevel?.defaultLogLevel ?? '').toLowerCase();
+   return suite === 'debug'
+     || debug.trace?.enabled === true
+     || debug.pipeline?.enabled === true
+     || (Array.isArray(debug.probes) && debug.probes.length > 0)
+     || debug.profiler?.enabled === true
+     || logLevel === 'debug'
+     || logLevel === 'verbose';
+ }
+
  export async function runBrowserSuite(options = {}) {
    return runWithRuntimeIsolationForSuite(async () => {
      const suiteTimestamp = resolveReportTimestamp(options.timestamp, 'runBrowserSuite timestamp');
      const suiteContext = resolveSuiteContext(options);
      const suite = normalizeSuite(options.suite, suiteContext);
+     const captureDebugSnapshot = shouldCaptureDebugSnapshot(suite, getRuntimeConfig());
+     if (captureDebugSnapshot) {
+       clearLogHistory();
+     }
      const suiteResult = await dispatchBrowserSuite(suite, options);
      if (!suiteResult) {
        throw createUnsupportedSuiteError(suite, suiteContext);
      }
+     const debugSnapshot = captureDebugSnapshot ? getDebugSnapshot() : null;

      if (suite === 'bench' && suiteResult?.metrics?.workloadType === 'training') {
        const trainingReport = suiteResult?.metrics?.trainingMetricsReport;
@@ -886,6 +904,7 @@ export async function runBrowserSuite(options = {}) {
      metrics: suiteResult.metrics ?? null,
      output: reportOutput,
      memory: suiteResult.memoryStats ?? null,
+     debugSnapshot,
      ...options.report,
    };
    if (ulArtifacts.length > 0 || distillArtifacts.length > 0 || checkpointResumeTimeline.length > 0) {
@@ -907,7 +926,7 @@ export async function runBrowserSuite(options = {}) {
      report.timestamp = suiteTimestamp;
    }
    const reportInfo = await saveReport(modelId, report, { timestamp: report.timestamp });
-   return { ...suiteResult, report, reportInfo };
+   return { ...suiteResult, debugSnapshot, report, reportInfo };
  });
}

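shouldCaptureDebugSnapshot keys entirely off the shared debug config and the suite name; a quick check of when capture triggers (config shapes inferred from the function above, probe entries illustrative):

// Evaluating shouldCaptureDebugSnapshot against sample runtime configs.
const quiet = { shared: { debug: { logLevel: { defaultLogLevel: 'info' } } } };
const probed = { shared: { debug: { probes: ['q_proj'] } } };
const verbose = { shared: { debug: { logLevel: { defaultLogLevel: 'verbose' } } } };

shouldCaptureDebugSnapshot('bench', quiet);   // false: nothing debug-related is enabled
shouldCaptureDebugSnapshot('debug', quiet);   // true:  the 'debug' suite always captures
shouldCaptureDebugSnapshot('bench', probed);  // true:  non-empty probes array
shouldCaptureDebugSnapshot('bench', verbose); // true:  verbose log level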
@@ -89,6 +89,9 @@ export function normalizeDiffusionMatmulLocationDtype(dtype) {
    return normalized;
  }

+ // Artifact-derived dtype inference: determines actual storage dtype from buffer byte size.
+ // This is NOT a config-bypass — it reads physical buffer dimensions (artifact-derived config),
+ // which is a valid merge layer per the config merge contract.
  export function inferDiffusionMatmulDtypeFromBuffer(weight, N, K, preferred) {
    const buffer = getBuffer(weight);
    if (!buffer || !Number.isFinite(N) || !Number.isFinite(K)) return preferred;
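The rest of the function is elided here, but the comment implies the dtype is inferred from physical buffer size; a minimal sketch of that idea under the common f16/f32 assumption (illustrative, not the package's actual implementation):

// Sketch: infer storage dtype from buffer byte size for an [N, K] weight.
// f16 storage occupies N*K*2 bytes; f32 storage occupies N*K*4 bytes.
function sketchInferDtypeFromByteLength(byteLength, N, K, preferred) {
  const elements = N * K;
  if (byteLength === elements * 2) return 'f16';
  if (byteLength === elements * 4) return 'f32';
  return preferred; // size matches neither layout; keep the configured preference
}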
@@ -45,6 +45,8 @@ import { processLayerGPU } from '../text/layer.js';

  const QUICK_GELU_ALPHA = 1.702;
  const SUPPORTED_CLIP_HIDDEN_ACTIVATIONS = new Set(['gelu', 'quick_gelu']);
+ // Standard CLIP hidden activation per OpenAI CLIP specification.
+ const DEFAULT_CLIP_HIDDEN_ACT = 'gelu';

  function padTokens(tokens, maxLength, padTokenId) {
    if (!Number.isFinite(maxLength) || maxLength <= 0) {
@@ -100,11 +102,15 @@ function createVectorTensor(device, data, dtype, label) {
    return createTensor(buffer, dtype, [1, length], label);
  }

+ // Conservative fallback dtype for diffusion bias tensors when no dtype
+ // metadata is available. F32 avoids precision loss in bias additions.
+ const DEFAULT_BIAS_DTYPE = 'f32';
+
  function resolveBiasDtype(weight, weightsEntry, key) {
    if (weight && weight.dtype) return weight.dtype;
    const locationDtype = weightsEntry?.dtypes?.get(key);
    const mapped = normalizeDiffusionLocationDtype(locationDtype);
-   return mapped || 'f32';
+   return mapped || DEFAULT_BIAS_DTYPE;
  }

  function createBiasTensorWithDtype(weight, weightsEntry, key, size, label) {
@@ -145,7 +151,7 @@ function createKernelOps(recorder) {
  }

  function resolveClipHiddenActivation(config) {
-   const hiddenAct = config?.hidden_act ?? 'gelu';
+   const hiddenAct = config?.hidden_act ?? DEFAULT_CLIP_HIDDEN_ACT;
    if (!SUPPORTED_CLIP_HIDDEN_ACTIVATIONS.has(hiddenAct)) {
      throw new Error(
        `Unsupported CLIP hidden_act "${hiddenAct}". ` +
@@ -0,0 +1,12 @@
+ import type { Tensor } from '../../../../gpu/tensor.js';
+
+ export interface AttentionProjectionInputResult {
+   oProjInput: Tensor;
+   oProjInputTemp: Tensor | null;
+ }
+
+ export function prepareAttentionProjectionInput(
+   attnForProjection: Tensor,
+   matmulOutputDtype: string,
+   castToF16: (tensor: Tensor) => Promise<Tensor>
+ ): Promise<AttentionProjectionInputResult>;
@@ -0,0 +1,8 @@
+ export async function prepareAttentionProjectionInput(attnForProjection, matmulOutputDtype, castToF16) {
+   if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
+     const casted = await castToF16(attnForProjection);
+     return { oProjInput: casted, oProjInputTemp: casted };
+   }
+
+   return { oProjInput: attnForProjection, oProjInputTemp: null };
+ }
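A usage sketch matching the call sites further down, with stand-ins for the tensor and the cast helper (names illustrative):

// prepareAttentionProjectionInput with a stubbed cast.
const castStub = async (tensor) => ({ ...tensor, dtype: 'f16' }); // stands in for castF32ToF16
const attn = { dtype: 'f32' };

const { oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(attn, 'f16', castStub);
// oProjInput.dtype === 'f16' and oProjInputTemp === oProjInput, so the caller
// releases the casted temporary after the o_proj matmul. With matmulOutputDtype
// 'f32' (or an already-f16 input), the tensor passes through and oProjInputTemp
// is null, so there is nothing extra to release.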
@@ -46,7 +46,16 @@ export function recordAttentionInputs(
    info: AttentionInputInfo | null | undefined
  ): void;

- export function resolveAttentionProjectionOutputDtype(attentionInputDtype: string): 'f16' | 'f32' | string;
+ export function shouldForceF32AttentionProjectionForRoPE(options: {
+   attentionInputDtype: string;
+   headDim: number;
+   rotaryDim?: number;
+   interleaved?: boolean;
+ }): boolean;
+ export function resolveAttentionProjectionOutputDtype(
+   attentionInputDtype: string,
+   options?: { forceF32?: boolean }
+ ): 'f16' | 'f32' | string;
  export function resolveProjectionSliceOffsetBytes(
    weightBuffer: WeightBuffer | Tensor | GPUBuffer | null | undefined,
    outputRows: number,
@@ -5,6 +5,8 @@ import {
    recordMatmul,
    runSplitQKV,
    recordSplitQKV,
+   runSplitQG,
+   recordSplitQG,
    runRMSNorm,
    recordRMSNorm,
  } from '../../../../gpu/kernel-selector.js';
@@ -28,6 +30,13 @@ function getSplitRunner(recorder) {
    return (qkvTensor, options) => recordSplitQKV(recorder, qkvTensor, options);
  }

+ function getSplitQGRunner(recorder) {
+   if (!recorder) {
+     return (qgTensor, options) => runSplitQG(qgTensor, options);
+   }
+   return (qgTensor, options) => recordSplitQG(recorder, qgTensor, options);
+ }
+
  function getRmsNormRunner(recorder) {
    if (!recorder) {
      return (input, weight, eps, options) => runRMSNorm(input, weight, eps, options);
@@ -201,13 +210,17 @@ async function projectQueryWithOptionalGate({
    return { qTensor, qGateTensor: null };
  }

+ // q_proj weights are stored with interleaved head layout: for head h,
+ // rows [h*headDim*2 : h*headDim*2+headDim] = Q, rows [h*headDim*2+headDim : (h+1)*headDim*2] = gate.
+ // Compute the full 2*qSize matmul, then de-interleave into separate Q and gate tensors.
  const runMatmulForMode = getMatmulRunner(recorder);
+ const runSplitQGForMode = getSplitQGRunner(recorder);
  const qWeightBuffer = getWeightBuffer(qWeight, 'q_proj');
- const gateOffset = resolveProjectionSliceOffsetBytes(qWeightBuffer, qSize, hiddenSize);
+ let fullQGTensor = null;
  let qTensor = null;
  let qGateTensor = null;
  try {
-   qTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize, hiddenSize, {
+   fullQGTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize * 2, hiddenSize, {
      transposeB: 'auto',
      role: 'q_proj',
      layerIdx,
@@ -215,15 +228,19 @@ async function projectQueryWithOptionalGate({
      outputDtype: matmulOutputDtype,
    });

-   qGateTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize, hiddenSize, {
-     transposeB: 'auto',
-     role: 'q_proj_gate',
-     layerIdx,
-     kernelPath,
-     bOffset: gateOffset,
-     outputDtype: matmulOutputDtype,
+   const split = await runSplitQGForMode(fullQGTensor, {
+     numTokens,
+     numHeads,
+     headDim,
    });
+   releaseTemporary(fullQGTensor.buffer);
+   fullQGTensor = null;
+   qTensor = split.Q;
+   qGateTensor = split.G;
  } catch (error) {
+   if (fullQGTensor) {
+     releaseTemporary(fullQGTensor.buffer);
+   }
    if (qTensor) {
      releaseTemporary(qTensor.buffer);
    }
@@ -277,9 +294,22 @@ export function recordAttentionInputs(state, info) {
    state.stats.attentionInputs.push(info);
  }

- export function resolveAttentionProjectionOutputDtype(attentionInputDtype) {
+ export function shouldForceF32AttentionProjectionForRoPE({
+   attentionInputDtype,
+   headDim,
+   rotaryDim = headDim,
+   interleaved = false,
+ }) {
+   return attentionInputDtype === 'f16'
+     && Number.isFinite(headDim)
+     && Number.isFinite(rotaryDim)
+     && (rotaryDim !== headDim || interleaved === true);
+ }
+
+ export function resolveAttentionProjectionOutputDtype(attentionInputDtype, options = {}) {
    const useF16Activations = attentionInputDtype === 'f16';
-   return selectRuleValue('shared', 'dtype', 'f16OrFallbackByFlag', {
+   return selectRuleValue('inference', 'dtype', 'attentionProjectionOutputDtype', {
+     forceF32: options.forceF32 === true,
      useF16: useF16Activations,
      fallback: attentionInputDtype,
    });
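The predicate fires only for f16 activations whose RoPE variant is precision-sensitive: a partial rotary dimension or interleaved rotation. A few worked calls (dimensions illustrative):

shouldForceF32AttentionProjectionForRoPE({ attentionInputDtype: 'f16', headDim: 128 });
// => false: rotaryDim defaults to headDim and interleaved defaults to false
shouldForceF32AttentionProjectionForRoPE({ attentionInputDtype: 'f16', headDim: 128, rotaryDim: 64 });
// => true: partial rotary dimension
shouldForceF32AttentionProjectionForRoPE({ attentionInputDtype: 'f16', headDim: 128, interleaved: true });
// => true: interleaved RoPE
shouldForceF32AttentionProjectionForRoPE({ attentionInputDtype: 'f32', headDim: 128, rotaryDim: 64 });
// => false: activations are already f32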
@@ -24,10 +24,12 @@ import { selectRuleValue } from '../../../../rules/rule-registry.js';
  import { SlidingWindowKVCache } from '../../../kv-cache.js';
  import {
    recordAttentionInputs,
+   shouldForceF32AttentionProjectionForRoPE,
    resolveAttentionProjectionOutputDtype,
    projectAttentionQKV,
    applyAttentionQKNorm,
  } from './projections.js';
+ import { prepareAttentionProjectionInput } from './output-projection.js';

  import { releaseOrTrack, shouldDebugLayer } from './types.js';

@@ -142,7 +144,14 @@ export async function recordLayerAttentionGPU(
  }

  // 2. Q/K/V projections
- const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype);
+ const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
+   forceF32: shouldForceF32AttentionProjectionForRoPE({
+     attentionInputDtype: desiredOutputDtype,
+     headDim,
+     rotaryDim: config.ropeRotaryDim,
+     interleaved: config.ropeInterleaved,
+   }),
+ });
  let usedFusedQKV = false;
  ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
    recorder,
@@ -535,14 +544,14 @@ export async function recordLayerAttentionGPU(
  let oProjInput = attnForProjection;
  oProjInputTemp = null;
  if (layerWeights.oProj && getWeightBuffer) {
+   ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
+     attnForProjection,
+     matmulOutputDtype,
+     (tensor) => recordCastF32ToF16(recorder, tensor)
+   ));
    const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
    const loraO = getLoRAModule(lora, layerIdx, 'o_proj');

-   if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
-     oProjInput = await recordCastF32ToF16(recorder, attnForProjection);
-     oProjInputTemp = oProjInput;
-   }
-
    // Use fused o_proj + residual for decode when possible
    // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
    const oProjDtype = getWeightDtype(oProjBuf);
@@ -28,10 +28,12 @@ import { runProbes } from '../probes.js';
  import { SlidingWindowKVCache } from '../../../kv-cache.js';
  import {
    recordAttentionInputs,
+   shouldForceF32AttentionProjectionForRoPE,
    resolveAttentionProjectionOutputDtype,
    projectAttentionQKV,
    applyAttentionQKNorm,
  } from './projections.js';
+ import { prepareAttentionProjectionInput } from './output-projection.js';

  import {
    shouldDebugLayer,
@@ -193,7 +195,14 @@ export async function runLayerAttentionGPU(
  }

  // 2. Q/K/V projections
- const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype);
+ const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
+   forceF32: shouldForceF32AttentionProjectionForRoPE({
+     attentionInputDtype: desiredOutputDtype,
+     headDim,
+     rotaryDim: config.ropeRotaryDim,
+     interleaved: config.ropeInterleaved,
+   }),
+ });
  let usedFusedQKV = false;
  ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
    recorder: null,
@@ -224,6 +233,27 @@ export async function runLayerAttentionGPU(
    await traceStep('matmul', `L${layerIdx}.k_proj`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
    await traceStep('matmul', `L${layerIdx}.v_proj`, layerIdx, vTensor.buffer, [numTokens, numKVHeads * headDim]);
  }
+ await runProbes('q_proj', qTensor.buffer, {
+   layerIdx,
+   numTokens,
+   hiddenSize: numHeads * headDim,
+   probes: state.debugProbes,
+   dtype: qTensor.dtype,
+ });
+ await runProbes('k_proj', kTensor.buffer, {
+   layerIdx,
+   numTokens,
+   hiddenSize: numKVHeads * headDim,
+   probes: state.debugProbes,
+   dtype: kTensor.dtype,
+ });
+ await runProbes('v_proj', vTensor.buffer, {
+   layerIdx,
+   numTokens,
+   hiddenSize: numKVHeads * headDim,
+   probes: state.debugProbes,
+   dtype: vTensor.dtype,
+ });

  // Kernel step debug: Q/K/V projections
  if (isKernelDebugEnabled(layerIdx)) {
@@ -331,6 +361,20 @@ export async function runLayerAttentionGPU(
      await traceStep('rope', `L${layerIdx}.k_rope`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
    }
  }
+ await runProbes('q_rope', qTensor.buffer, {
+   layerIdx,
+   numTokens,
+   hiddenSize: numHeads * headDim,
+   probes: state.debugProbes,
+   dtype: qTensor.dtype,
+ });
+ await runProbes('k_rope', kTensor.buffer, {
+   layerIdx,
+   numTokens,
+   hiddenSize: numKVHeads * headDim,
+   probes: state.debugProbes,
+   dtype: kTensor.dtype,
+ });
  if (isKernelDebugEnabled(layerIdx)) {
    logKernelStep('rope', { layerIdx, label: `startPos=${currentSeqLen}` });
    await dumpTokenVector(qTensor.buffer, 'Q_rope', {
@@ -723,14 +767,14 @@ export async function runLayerAttentionGPU(
  let oProjInput = attnForProjection;
  oProjInputTemp = null;
  if (layerWeights.oProj && getWeightBuffer) {
+   ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
+     attnForProjection,
+     matmulOutputDtype,
+     castF32ToF16
+   ));
    const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
    const loraO = getLoRAModule(lora, layerIdx, 'o_proj');

-   if (matmulOutputDtype === 'f16' && attnOutput.dtype !== 'f16') {
-     oProjInput = await castF32ToF16(attnOutput);
-     oProjInputTemp = oProjInput;
-   }
-
    // Use fused o_proj + residual for decode when possible
    // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
    const oProjDtype = getWeightDtype(oProjBuf);
@@ -482,6 +482,20 @@ export function toParsedConfigFromMerged(merged, manifest) {
  const queryPreAttnScalar = inf.attention.queryPreAttnScalar;
  const causalAttention = inf.attention.causal;

+ // Cross-field sanity: queryPreAttnScalar should typically equal headDim.
+ // A value of sqrt(headDim) indicates a known converter bug that produces
+ // attnScale = 1/sqrt(sqrt(headDim)) instead of the correct 1/sqrt(headDim).
+ if (queryPreAttnScalar != null && headDim != null
+     && queryPreAttnScalar !== headDim
+     && Math.abs(queryPreAttnScalar - Math.sqrt(headDim)) < 0.01) {
+   throw new Error(
+     `Model "${merged.modelId}": queryPreAttnScalar (${queryPreAttnScalar}) ` +
+     `equals sqrt(headDim) instead of headDim (${headDim}). ` +
+     `This is a known converter bug — the manifest must be regenerated ` +
+     `with the corrected converter.`
+   );
+ }
+
  // Get stop token IDs (cast to Manifest for compatibility)
  const stopTokenIds = getStopTokenIds(manifest);

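Concretely, with headDim = 256 the correct attnScale is 1/sqrt(256) = 0.0625; a buggy manifest carrying queryPreAttnScalar = sqrt(256) = 16 instead yields 1/sqrt(16) = 0.25, four times too large. The guard's arithmetic (headDim illustrative):

const headDim = 256;
Math.abs(16 - Math.sqrt(headDim)) < 0.01;   // true:  queryPreAttnScalar = 16 trips the guard
Math.abs(256 - Math.sqrt(headDim)) < 0.01;  // false: queryPreAttnScalar = headDim passes
1 / Math.sqrt(256);                         // 0.0625, the intended attention scale
1 / Math.sqrt(16);                          // 0.25, what the buggy manifest would produce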
@@ -58,10 +58,11 @@ function resolveFallbackActivationDtype(primaryActivationDtype) {
  function resolveFallbackKernelPath(primaryKernelPath) {
    const primaryKernelPathId = primaryKernelPath?.id ?? null;
    if (!primaryKernelPathId) {
-     throw new Error(
-       '[ExecutionPlan] F16 finiteness fallback requires a primary kernel path with a stable id. ' +
-       'Add a registered kernelPath id and a finiteness fallback rule.'
-     );
+     return {
+       kernelPath: null,
+       kernelPathId: null,
+       kernelPathSource: 'none',
+     };
    }

    const explicitFallbackKernelPathId = typeof primaryKernelPath?.finitenessFallbackKernelPathId === 'string'
@@ -213,6 +213,10 @@ export function resolvePrefillEmbeddingOptions(state, options = {}) {
    ? state.manifest.modelType.toLowerCase()
    : '';
  const generationDefaults = state.runtimeConfig.inference.generation;
+ // Embedding models default to 'mean' pooling — this is a model-category behavior,
+ // not a model-family identity check. Ideally embedding model presets would set
+ // generation.embeddingMode='mean' in their runtime config; the modelType fallback
+ // provides this default for manifests that predate runtime-preset embedding mode.
  const defaultEmbeddingMode = modelType === 'embedding'
    ? 'mean'
    : generationDefaults.embeddingMode;
@@ -226,6 +230,7 @@ export function resolveAdvanceEmbeddingMode(state, options = {}) {
  const modelType = typeof state.manifest?.modelType === 'string'
    ? state.manifest.modelType.toLowerCase()
    : '';
+ // See resolvePrefillEmbeddingOptions for embedding-model pooling rationale.
  const configuredMode = state.runtimeConfig.inference.generation.embeddingMode;
  return resolveConfiguredValue(
    options.embeddingMode,
@@ -19,6 +19,12 @@ export declare function resolveBatchStop(
    eosTokenId: number | undefined | null
  ): number;

+ export declare function findInvalidGeneratedToken(
+   tokens: number[],
+   vocabSize: number,
+   padTokenId?: number | null
+ ): { index: number; tokenId: number } | null;
+
  export interface SampledTokenStagingBuffer {
    mapAsync(mode: number): Promise<void>;
    getMappedRange(): ArrayBufferLike;
@@ -113,6 +113,20 @@ export function resolveBatchStop(tokens, stopFlags, stopTokenIds, eosTokenId) {
    return actualCount;
  }

+ export function findInvalidGeneratedToken(tokens, vocabSize, padTokenId = null) {
+   for (let i = 0; i < tokens.length; i++) {
+     const tokenId = tokens[i];
+     const isInvalid = !Number.isFinite(tokenId)
+       || tokenId < 0
+       || tokenId >= vocabSize
+       || (padTokenId != null ? tokenId === padTokenId : tokenId === 0);
+     if (isInvalid) {
+       return { index: i, tokenId };
+     }
+   }
+   return null;
+ }
+
  export async function readSampledTokenFromStagingBuffer(stagingBuffer, options = {}) {
    const ownsStagingBuffer = options.ownsStagingBuffer === true;
    const hasFinitenessBuffer = options.hasFinitenessBuffer === true;
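Worked calls against findInvalidGeneratedToken (token IDs illustrative):

findInvalidGeneratedToken([12, 873, 44], 1000);     // null: all tokens valid
findInvalidGeneratedToken([12, 1000, 44], 1000);    // { index: 1, tokenId: 1000 }: >= vocabSize
findInvalidGeneratedToken([12, 0, 44], 1000);       // { index: 1, tokenId: 0 }: 0 treated as pad when padTokenId is null
findInvalidGeneratedToken([12, 0, 44], 1000, 7);    // null: explicit padTokenId makes 0 a legal token
findInvalidGeneratedToken([12, 7, 44], 1000, 7);    // { index: 1, tokenId: 7 }: matches padTokenId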
@@ -240,11 +254,9 @@ async function runDecodeLayers(state, tokenId, opts, helpers) {
    throw new Error('Embed buffer not found or not a supported buffer type');
  }
  const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
- const embedDtype = isWeightBuffer(embedBufferRaw)
-   ? getWeightDtype(embedBufferRaw)
-   : isCpuWeightBuffer(embedBufferRaw)
-     ? embedBufferRaw.dtype
-     : null;
+ const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+   ? embedBufferRaw.dtype
+   : getWeightDtype(embedBufferRaw);
  const activationDtype = getEffectiveActivationDtype(state, opts);

  const embedTensor = await embed([tokenId], embedBuffer, {
@@ -326,11 +338,9 @@ export async function decodeStep(state, currentIds, opts, helpers) {
    throw new Error('Embed buffer not found or not a supported buffer type');
  }
  const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
- const embedDtype = isWeightBuffer(embedBufferRaw)
-   ? getWeightDtype(embedBufferRaw)
-   : isCpuWeightBuffer(embedBufferRaw)
-     ? embedBufferRaw.dtype
-     : null;
+ const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+   ? embedBufferRaw.dtype
+   : getWeightDtype(embedBufferRaw);
  const activationDtype = getEffectiveActivationDtype(state, opts);
  const activationBytes = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: activationDtype });

@@ -636,11 +646,21 @@ export async function decodeStep(state, currentIds, opts, helpers) {
    });

    releaseBuffer(logitsBuffer);
-   if (!context.decodeBuffers?.ownsBuffer(hiddenStates)) {
-     releaseBuffer(hiddenStates);
+   const invalidGpuToken = nextToken >= config.vocabSize
+     || (padTokenId != null && nextToken === padTokenId)
+     || (padTokenId == null && nextToken === 0);
+   if (!invalidGpuToken) {
+     if (!context.decodeBuffers?.ownsBuffer(hiddenStates)) {
+       releaseBuffer(hiddenStates);
+     }
+     state.currentSeqLen++;
+     return nextToken;
    }
-   state.currentSeqLen++;
-   return nextToken;
+   state.disableFusedDecode = true;
+   log.warn(
+     'Decode',
+     `GPU sampling produced invalid token ${nextToken} (vocabSize=${config.vocabSize}, step=${state.decodeStepCount}); falling back to CPU sampling.`
+   );
  }
}

@@ -981,7 +1001,7 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
    throw new Error('Embed buffer not found or not a GPUBuffer/WeightBuffer');
  }
  const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
- const embedDtype = isWeightBuffer(embedBufferRaw) ? getWeightDtype(embedBufferRaw) : null;
+ const embedDtype = getWeightDtype(embedBufferRaw);
  const activationDtype = getEffectiveActivationDtype(state, opts);

  for (let i = 0; i < N; i++) {
@@ -1125,10 +1145,18 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,

  const actualCount = resolveBatchStop(tokens, stopFlags, stopTokenIds, eosToken);
  const generatedTokens = tokens.slice(0, actualCount);
+ const invalidToken = findInvalidGeneratedToken(generatedTokens, config.vocabSize, padTokenId);

  if (isInfinite) {
    throw new FinitenessError(`F16 bounds exceeded during batch generation${metadata}`);
  }
+ if (invalidToken) {
+   state.disableFusedDecode = true;
+   throw new Error(
+     `[Pipeline] Batch decode produced invalid token ${invalidToken.tokenId} ` +
+     `at batch index ${invalidToken.index} (vocabSize=${config.vocabSize}, padTokenId=${padTokenId ?? 'none'}).`
+   );
+ }

  if (opts.profile && recorder.isProfilingEnabled()) {
    const timings = await recorder.resolveProfileTimings();
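The inlined guard in decodeStep above checks a single freshly sampled token and omits the Number.isFinite and negative-range checks that findInvalidGeneratedToken applies to batches. Its edge cases as a standalone sketch (token IDs illustrative):

const isInvalidGpuToken = (nextToken, vocabSize, padTokenId) =>
  nextToken >= vocabSize
  || (padTokenId != null && nextToken === padTokenId)
  || (padTokenId == null && nextToken === 0);

isInvalidGpuToken(42, 32000, null);     // false: in-vocab, nonzero
isInvalidGpuToken(32000, 32000, null);  // true:  out of range
isInvalidGpuToken(0, 32000, null);      // true:  0 treated as pad when padTokenId is unknown
isInvalidGpuToken(0, 32000, 2);         // false: explicit padTokenId, so 0 is a legal token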