@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +25 -6
  3. package/package.json +25 -38
  4. package/src/browser/browser-converter.js +5 -0
  5. package/src/client/doppler-api.browser.js +6 -0
  6. package/src/client/doppler-api.d.ts +3 -0
  7. package/src/client/doppler-api.js +11 -2
  8. package/src/client/doppler-registry.js +3 -5
  9. package/src/client/doppler-registry.json +2 -2
  10. package/src/config/kernel-path-loader.d.ts +5 -0
  11. package/src/config/kernel-path-loader.js +13 -0
  12. package/src/config/kernels/kernel-ref-digests.js +23 -21
  13. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  14. package/src/config/kernels/registry.json +74 -0
  15. package/src/config/loader.js +9 -0
  16. package/src/config/merge-contract-check.js +7 -0
  17. package/src/config/platforms/loader.js +3 -1
  18. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  19. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  20. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  21. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  22. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  23. package/src/config/presets/kernel-paths/registry.json +21 -0
  24. package/src/config/presets/models/gemma2.json +2 -1
  25. package/src/config/presets/models/gemma3.json +4 -1
  26. package/src/config/presets/models/gemma4.json +61 -0
  27. package/src/config/presets/models/granite-docling.json +70 -0
  28. package/src/config/presets/models/lfm2.json +6 -1
  29. package/src/config/presets/models/qwen3.json +4 -3
  30. package/src/config/presets/models/qwen3_5.json +16 -0
  31. package/src/config/presets/models/qwen3_vl.json +40 -0
  32. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  33. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  34. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  35. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  36. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  37. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  38. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  39. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  40. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  41. package/src/config/runtime.js +3 -0
  42. package/src/config/schema/conversion.schema.d.ts +1 -0
  43. package/src/config/schema/debug.schema.d.ts +40 -0
  44. package/src/config/schema/debug.schema.js +28 -0
  45. package/src/config/schema/index.js +2 -0
  46. package/src/config/schema/inference-defaults.schema.js +1 -1
  47. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  48. package/src/config/schema/manifest.schema.d.ts +1 -1
  49. package/src/config/schema/manifest.schema.js +1 -1
  50. package/src/config/schema/memory-limits.schema.js +2 -2
  51. package/src/config/schema/storage.schema.js +2 -2
  52. package/src/converter/conversion-plan.js +11 -3
  53. package/src/converter/core.js +19 -8
  54. package/src/converter/manifest-inference.js +12 -22
  55. package/src/converter/parsers/transformer.js +4 -0
  56. package/src/converter/quantization-info.js +5 -1
  57. package/src/converter/quantizer.d.ts +5 -0
  58. package/src/converter/quantizer.js +34 -12
  59. package/src/converter/rope-config.js +8 -6
  60. package/src/converter/tokenizer-utils.d.ts +1 -0
  61. package/src/converter/tokenizer-utils.js +4 -1
  62. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  63. package/src/distribution/shard-delivery.js +40 -1
  64. package/src/formats/rdrr/classification.js +32 -0
  65. package/src/formats/rdrr/parsing.d.ts +4 -0
  66. package/src/formats/rdrr/parsing.js +14 -1
  67. package/src/gpu/kernel-runtime.js +4 -2
  68. package/src/gpu/kernels/attention.js +2 -1
  69. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  70. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  71. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  72. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  73. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  74. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  75. package/src/gpu/kernels/gated-short-conv.js +284 -0
  76. package/src/gpu/kernels/index.d.ts +8 -0
  77. package/src/gpu/kernels/index.js +6 -0
  78. package/src/gpu/kernels/linear-attention-core.js +37 -17
  79. package/src/gpu/kernels/matmul-selection.js +48 -4
  80. package/src/gpu/kernels/matmul.d.ts +5 -0
  81. package/src/gpu/kernels/matmul.js +71 -2
  82. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  83. package/src/gpu/kernels/rmsnorm.js +9 -2
  84. package/src/gpu/kernels/sample.js +1 -3
  85. package/src/gpu/kernels/sample.wgsl +39 -9
  86. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  87. package/src/gpu/kernels/shader-cache.js +9 -4
  88. package/src/gpu/kernels/split_qg.d.ts +50 -0
  89. package/src/gpu/kernels/split_qg.js +46 -0
  90. package/src/gpu/kernels/split_qg.wgsl +58 -0
  91. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  92. package/src/gpu/weight-buffer.d.ts +1 -1
  93. package/src/gpu/weight-buffer.js +1 -1
  94. package/src/inference/browser-harness.d.ts +2 -0
  95. package/src/inference/browser-harness.js +20 -1
  96. package/src/inference/kv-cache/base.js +3 -10
  97. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  98. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  99. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
  100. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  101. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  102. package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
  103. package/src/inference/pipelines/text/attention/projections.js +54 -13
  104. package/src/inference/pipelines/text/attention/record.js +16 -6
  105. package/src/inference/pipelines/text/attention/run.js +59 -6
  106. package/src/inference/pipelines/text/config.d.ts +1 -0
  107. package/src/inference/pipelines/text/config.js +46 -4
  108. package/src/inference/pipelines/text/embed.js +26 -7
  109. package/src/inference/pipelines/text/execution-plan.js +5 -4
  110. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  111. package/src/inference/pipelines/text/execution-v0.js +12 -1
  112. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  113. package/src/inference/pipelines/text/generator-runtime.js +19 -0
  114. package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
  115. package/src/inference/pipelines/text/generator-steps.js +71 -26
  116. package/src/inference/pipelines/text/generator.d.ts +5 -0
  117. package/src/inference/pipelines/text/generator.js +353 -166
  118. package/src/inference/pipelines/text/init.d.ts +15 -0
  119. package/src/inference/pipelines/text/init.js +35 -10
  120. package/src/inference/pipelines/text/layer.js +38 -8
  121. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  122. package/src/inference/pipelines/text/linear-attention.js +33 -3
  123. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  124. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  125. package/src/inference/pipelines/text/logits/index.js +3 -1
  126. package/src/inference/pipelines/text/model-load.js +3 -0
  127. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  128. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  129. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  130. package/src/inference/pipelines/text/ops.js +123 -53
  131. package/src/inference/pipelines/text/probes.js +1 -0
  132. package/src/inference/pipelines/text/sampling.js +52 -6
  133. package/src/inference/pipelines/text/state.js +2 -0
  134. package/src/inference/pipelines/text.d.ts +5 -0
  135. package/src/inference/pipelines/text.js +59 -1
  136. package/src/inference/pipelines/vision/encoder.js +386 -0
  137. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  138. package/src/inference/pipelines/vision/index.js +173 -0
  139. package/src/inference/pipelines/vision/ops.js +78 -0
  140. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  141. package/src/inference/test-harness.js +11 -9
  142. package/src/loader/doppler-loader.d.ts +3 -0
  143. package/src/loader/doppler-loader.js +20 -3
  144. package/src/loader/experts/expert-cache.js +6 -2
  145. package/src/loader/experts/expert-loader.js +6 -2
  146. package/src/loader/final-weights-loader.js +2 -0
  147. package/src/loader/layer-loader.js +42 -3
  148. package/src/loader/manifest-config.js +3 -1
  149. package/src/loader/shard-cache.js +3 -2
  150. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  151. package/src/loader/tensors/tensor-loader.js +130 -4
  152. package/src/rules/inference/dtype.rules.json +5 -0
  153. package/src/rules/inference/kernel-path.rules.json +2 -2
  154. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  155. package/src/rules/kernels/softmax.rules.json +2 -0
  156. package/src/rules/kernels/split-qg.rules.json +6 -0
  157. package/src/rules/rule-registry.d.ts +1 -0
  158. package/src/rules/rule-registry.js +4 -0
  159. package/src/storage/downloader.js +2 -1
  160. package/src/storage/quickstart-downloader.d.ts +3 -0
  161. package/src/storage/quickstart-downloader.js +27 -30
  162. package/src/storage/shard-manager.js +4 -3
  163. package/src/tooling/conversion-config-materializer.js +3 -5
  164. package/src/tooling/node-converter.js +28 -7
  165. package/src/tooling/node-source-runtime.js +65 -5
  166. package/src/tooling/node-webgpu.js +24 -7
  167. package/src/types/model.d.ts +5 -0
  168. package/src/utils/hf-resolve-url.d.ts +16 -0
  169. package/src/utils/hf-resolve-url.js +17 -0
  170. package/src/version.js +1 -1
  171. package/tools/doppler-cli.js +6 -1
  172. package/src/tooling/node-convert.d.ts +0 -54
@@ -0,0 +1,8 @@
1
// prepareAttentionProjectionInput(attnForProjection, matmulOutputDtype, castToF16)
// Chooses the tensor that is handed to the attention output projection (o_proj).
//   - If matmulOutputDtype is 'f16' and attnForProjection.dtype is not 'f16', awaits
//     castToF16(attnForProjection) and returns the cast result as BOTH oProjInput and
//     oProjInputTemp.
//   - Otherwise returns attnForProjection untouched with oProjInputTemp = null.
// castToF16 is injected by the caller; it is invoked with the tensor as its only argument.
// Returns (via Promise): { oProjInput, oProjInputTemp }.
// NOTE(review): oProjInputTemp presumably identifies a temporary the caller must release
// after the projection — confirm at the call sites.
+ export async function prepareAttentionProjectionInput(attnForProjection, matmulOutputDtype, castToF16) {
2
+ if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
3
+ const casted = await castToF16(attnForProjection);
4
+ return { oProjInput: casted, oProjInputTemp: casted };
5
+ }
6
+
7
+ return { oProjInput: attnForProjection, oProjInputTemp: null };
8
+ }
@@ -3,6 +3,7 @@ import type { Tensor } from '../../../../gpu/tensor.js';
3
3
  import type { WeightBuffer, CpuWeightBuffer } from '../../../../gpu/weight-buffer.js';
4
4
  import type { LayerWeights } from '../types.js';
5
5
  import type { LoRAAdapter } from '../lora.js';
6
+ import type { MatmulDebugConfigSchema } from '../../../../config/schema/debug.schema.js';
6
7
 
7
8
  export interface AttentionInputInfo {
8
9
  phase: 'prefill' | 'decode';
@@ -46,7 +47,16 @@ export function recordAttentionInputs(
46
47
  info: AttentionInputInfo | null | undefined
47
48
  ): void;
48
49
 
49
- export function resolveAttentionProjectionOutputDtype(attentionInputDtype: string): 'f16' | 'f32' | string;
50
// Declaration for the predicate that decides whether attention projections must be
// forced to f32 output because of the RoPE layout.
// Takes a single options object: attentionInputDtype and headDim are required,
// rotaryDim and interleaved are optional. Returns a boolean.
// NOTE(review): the decision rule itself lives in the JavaScript implementation, not in
// this declaration — keep the two in sync when either changes.
+ export function shouldForceF32AttentionProjectionForRoPE(options: {
51
+ attentionInputDtype: string;
52
+ headDim: number;
53
+ rotaryDim?: number;
54
+ interleaved?: boolean;
55
+ }): boolean;
56
// Declaration for the resolver of the attention projection matmul output dtype.
// attentionInputDtype is required; the second parameter is an optional options object
// whose only declared field is the optional boolean forceF32, so existing one-argument
// callers remain valid.
// The return type admits 'f16', 'f32', or any other string.
// NOTE(review): which dtype is produced for a given input is decided by the
// implementation and its rule table, not by this declaration — confirm there.
+ export function resolveAttentionProjectionOutputDtype(
57
+ attentionInputDtype: string,
58
+ options?: { forceF32?: boolean }
59
+ ): 'f16' | 'f32' | string;
50
60
  export function resolveProjectionSliceOffsetBytes(
51
61
  weightBuffer: WeightBuffer | Tensor | GPUBuffer | null | undefined,
52
62
  outputRows: number,
@@ -67,11 +77,13 @@ export interface ProjectAttentionQKVOptions {
67
77
  getWeightBuffer?: (weight: GPUBuffer | WeightBuffer | Float32Array | ArrayBuffer | CpuWeightBuffer, label: string) => GPUBuffer | WeightBuffer;
68
78
  lora?: LoRAAdapter | null;
69
79
  releaseTemporary: (buffer: GPUBuffer) => void;
80
+ matmulDebug?: MatmulDebugConfigSchema | null;
70
81
  onFusedQKV?: ((info: { qSize: number; kSize: number; vSize: number; totalSize: number }) => void) | null;
71
82
  }
72
83
 
73
84
  export interface ProjectAttentionQKVResult {
74
85
  qTensor: Tensor;
86
+ qGateTensor: Tensor | null;
75
87
  kTensor: Tensor;
76
88
  vTensor: Tensor;
77
89
  usedFusedQKV: boolean;
@@ -5,6 +5,8 @@ import {
5
5
  recordMatmul,
6
6
  runSplitQKV,
7
7
  recordSplitQKV,
8
+ runSplitQG,
9
+ recordSplitQG,
8
10
  runRMSNorm,
9
11
  recordRMSNorm,
10
12
  } from '../../../../gpu/kernel-selector.js';
@@ -28,6 +30,13 @@ function getSplitRunner(recorder) {
28
30
  return (qkvTensor, options) => recordSplitQKV(recorder, qkvTensor, options);
29
31
  }
30
32
 
33
// getSplitQGRunner(recorder)
// Returns a two-argument callback (qgTensor, options) for the Q/gate split step.
//   - Falsy recorder: the callback forwards to runSplitQG(qgTensor, options).
//   - Truthy recorder: the callback forwards to recordSplitQG(recorder, qgTensor, options),
//     i.e. the recorder is captured and passed as the first argument.
// The callback returns whatever the selected function returns; nothing is awaited here.
+ function getSplitQGRunner(recorder) {
34
+ if (!recorder) {
35
+ return (qgTensor, options) => runSplitQG(qgTensor, options);
36
+ }
37
+ return (qgTensor, options) => recordSplitQG(recorder, qgTensor, options);
38
+ }
39
+
31
40
  function getRmsNormRunner(recorder) {
32
41
  if (!recorder) {
33
42
  return (input, weight, eps, options) => runRMSNorm(input, weight, eps, options);
@@ -62,9 +71,10 @@ async function projectSingleQkvTensor({
62
71
  matmulOutputDtype,
63
72
  getWeightBuffer,
64
73
  lora,
74
+ matmulDebug,
65
75
  releaseTemporary,
66
76
  }) {
67
- const runMatmulForMode = getMatmulRunner(recorder);
77
+ const runMatmulForMode = getMatmulRunner(recorder);
68
78
  const layerWeight = layerWeights?.[weightKey];
69
79
  if (!layerWeight) {
70
80
  throw new Error(`Attention projection requires ${weightKey}.`);
@@ -82,6 +92,7 @@ async function projectSingleQkvTensor({
82
92
  layerIdx,
83
93
  kernelPath,
84
94
  outputDtype: matmulOutputDtype,
95
+ matmulDebug,
85
96
  });
86
97
  } finally {
87
98
  releaseOwnedWeightBuffer(layerWeight, projBuffer, releaseTemporary);
@@ -169,6 +180,7 @@ async function projectQueryWithOptionalGate({
169
180
  matmulOutputDtype,
170
181
  getWeightBuffer,
171
182
  lora,
183
+ matmulDebug,
172
184
  releaseTemporary,
173
185
  attentionOutputGate,
174
186
  }) {
@@ -196,34 +208,44 @@ async function projectQueryWithOptionalGate({
196
208
  matmulOutputDtype,
197
209
  getWeightBuffer,
198
210
  lora,
211
+ matmulDebug,
199
212
  releaseTemporary,
200
213
  });
201
214
  return { qTensor, qGateTensor: null };
202
215
  }
203
216
 
217
+ // q_proj weights are stored with interleaved head layout: for head h,
218
+ // rows [h*headDim*2 : h*headDim*2+headDim] = Q, rows [h*headDim*2+headDim : (h+1)*headDim*2] = gate.
219
+ // Compute the full 2*qSize matmul, then de-interleave into separate Q and gate tensors.
204
220
  const runMatmulForMode = getMatmulRunner(recorder);
221
+ const runSplitQGForMode = getSplitQGRunner(recorder);
205
222
  const qWeightBuffer = getWeightBuffer(qWeight, 'q_proj');
206
- const gateOffset = resolveProjectionSliceOffsetBytes(qWeightBuffer, qSize, hiddenSize);
223
+ let fullQGTensor = null;
207
224
  let qTensor = null;
208
225
  let qGateTensor = null;
209
226
  try {
210
- qTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize, hiddenSize, {
227
+ fullQGTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize * 2, hiddenSize, {
211
228
  transposeB: 'auto',
212
229
  role: 'q_proj',
213
230
  layerIdx,
214
231
  kernelPath,
215
232
  outputDtype: matmulOutputDtype,
233
+ matmulDebug,
216
234
  });
217
235
 
218
- qGateTensor = await runMatmulForMode(normed, qWeightBuffer, numTokens, qSize, hiddenSize, {
219
- transposeB: 'auto',
220
- role: 'q_proj_gate',
221
- layerIdx,
222
- kernelPath,
223
- bOffset: gateOffset,
224
- outputDtype: matmulOutputDtype,
236
+ const split = await runSplitQGForMode(fullQGTensor, {
237
+ numTokens,
238
+ numHeads,
239
+ headDim,
225
240
  });
241
+ releaseTemporary(fullQGTensor.buffer);
242
+ fullQGTensor = null;
243
+ qTensor = split.Q;
244
+ qGateTensor = split.G;
226
245
  } catch (error) {
246
+ if (fullQGTensor) {
247
+ releaseTemporary(fullQGTensor.buffer);
248
+ }
227
249
  if (qTensor) {
228
250
  releaseTemporary(qTensor.buffer);
229
251
  }
@@ -277,9 +299,22 @@ export function recordAttentionInputs(state, info) {
277
299
  state.stats.attentionInputs.push(info);
278
300
  }
279
301
 
280
- export function resolveAttentionProjectionOutputDtype(attentionInputDtype) {
302
// shouldForceF32AttentionProjectionForRoPE({ attentionInputDtype, headDim, rotaryDim, interleaved })
// Pure predicate; returns true only when ALL of the following hold:
//   1. attentionInputDtype is exactly 'f16';
//   2. headDim is a finite number (Number.isFinite, so no string coercion);
//   3. rotaryDim is a finite number;
//   4. rotaryDim differs from headDim, OR interleaved is strictly true.
// Defaults: rotaryDim falls back to headDim and interleaved to false, but only when the
// property is undefined. A null rotaryDim is NOT replaced by the default and fails the
// finiteness check, so the result is false in that case.
// NOTE(review): presumably guards partial-rotary / interleaved RoPE against f16 projection
// output precision — the motivation is not stated in the code; confirm.
+ export function shouldForceF32AttentionProjectionForRoPE({
303
+ attentionInputDtype,
304
+ headDim,
305
+ rotaryDim = headDim,
306
+ interleaved = false,
307
+ }) {
308
+ return attentionInputDtype === 'f16'
309
+ && Number.isFinite(headDim)
310
+ && Number.isFinite(rotaryDim)
311
+ && (rotaryDim !== headDim || interleaved === true);
312
+ }
313
+
314
+ export function resolveAttentionProjectionOutputDtype(attentionInputDtype, options = {}) {
281
315
  const useF16Activations = attentionInputDtype === 'f16';
282
- return selectRuleValue('shared', 'dtype', 'f16OrFallbackByFlag', {
316
+ return selectRuleValue('inference', 'dtype', 'attentionProjectionOutputDtype', {
317
+ forceF32: options.forceF32 === true,
283
318
  useF16: useF16Activations,
284
319
  fallback: attentionInputDtype,
285
320
  });
@@ -299,6 +334,7 @@ export async function projectAttentionQKV({
299
334
  matmulOutputDtype,
300
335
  getWeightBuffer,
301
336
  lora,
337
+ matmulDebug,
302
338
  releaseTemporary,
303
339
  onFusedQKV = null,
304
340
  attentionOutputGate = false,
@@ -309,7 +345,8 @@ export async function projectAttentionQKV({
309
345
  const hasLoRA = getLoRAModule(lora, layerIdx, 'q_proj')
310
346
  || getLoRAModule(lora, layerIdx, 'k_proj')
311
347
  || getLoRAModule(lora, layerIdx, 'v_proj');
312
- const useFusedQKV = selectRuleValue('inference', 'attention', 'useFusedQkv', {
348
+ const forceSplitQKV = Boolean(matmulDebug?.enabled) && matmulDebug?.forceSplitQKV === true;
349
+ const useFusedQKV = !forceSplitQKV && selectRuleValue('inference', 'attention', 'useFusedQkv', {
313
350
  hasQkvProj: Boolean(layerWeights.qkvProj),
314
351
  hasQkvSizes: Boolean(layerWeights.qkvSizes),
315
352
  hasLoRA: Boolean(hasLoRA),
@@ -326,6 +363,7 @@ export async function projectAttentionQKV({
326
363
  layerIdx,
327
364
  kernelPath,
328
365
  outputDtype: matmulOutputDtype,
366
+ matmulDebug,
329
367
  });
330
368
  const split = await runSplitForMode(qkvTensor, {
331
369
  numTokens,
@@ -364,6 +402,7 @@ export async function projectAttentionQKV({
364
402
  matmulOutputDtype,
365
403
  getWeightBuffer,
366
404
  lora,
405
+ matmulDebug,
367
406
  releaseTemporary,
368
407
  attentionOutputGate,
369
408
  }));
@@ -384,6 +423,7 @@ export async function projectAttentionQKV({
384
423
  matmulOutputDtype,
385
424
  getWeightBuffer,
386
425
  lora,
426
+ matmulDebug,
387
427
  releaseTemporary,
388
428
  });
389
429
 
@@ -403,6 +443,7 @@ export async function projectAttentionQKV({
403
443
  matmulOutputDtype,
404
444
  getWeightBuffer,
405
445
  lora,
446
+ matmulDebug,
406
447
  releaseTemporary,
407
448
  });
408
449
 
@@ -24,10 +24,12 @@ import { selectRuleValue } from '../../../../rules/rule-registry.js';
24
24
  import { SlidingWindowKVCache } from '../../../kv-cache.js';
25
25
  import {
26
26
  recordAttentionInputs,
27
+ shouldForceF32AttentionProjectionForRoPE,
27
28
  resolveAttentionProjectionOutputDtype,
28
29
  projectAttentionQKV,
29
30
  applyAttentionQKNorm,
30
31
  } from './projections.js';
32
+ import { prepareAttentionProjectionInput } from './output-projection.js';
31
33
 
32
34
  import { releaseOrTrack, shouldDebugLayer } from './types.js';
33
35
 
@@ -142,7 +144,14 @@ export async function recordLayerAttentionGPU(
142
144
  }
143
145
 
144
146
  // 2. Q/K/V projections
145
- const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype);
147
+ const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
148
+ forceF32: shouldForceF32AttentionProjectionForRoPE({
149
+ attentionInputDtype: desiredOutputDtype,
150
+ headDim,
151
+ rotaryDim: config.ropeRotaryDim,
152
+ interleaved: config.ropeInterleaved,
153
+ }),
154
+ });
146
155
  let usedFusedQKV = false;
147
156
  ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
148
157
  recorder,
@@ -158,6 +167,7 @@ export async function recordLayerAttentionGPU(
158
167
  matmulOutputDtype,
159
168
  getWeightBuffer,
160
169
  lora,
170
+ matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
161
171
  attentionOutputGate: config.attentionOutputGate === true,
162
172
  releaseTemporary: (buffer) => releaseOrTrack(recorder, buffer),
163
173
  onFusedQKV: layerIdx === 0 && isPrefill
@@ -535,14 +545,14 @@ export async function recordLayerAttentionGPU(
535
545
  let oProjInput = attnForProjection;
536
546
  oProjInputTemp = null;
537
547
  if (layerWeights.oProj && getWeightBuffer) {
548
+ ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
549
+ attnForProjection,
550
+ matmulOutputDtype,
551
+ (tensor) => recordCastF32ToF16(recorder, tensor)
552
+ ));
538
553
  const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
539
554
  const loraO = getLoRAModule(lora, layerIdx, 'o_proj');
540
555
 
541
- if (matmulOutputDtype === 'f16' && attnForProjection.dtype !== 'f16') {
542
- oProjInput = await recordCastF32ToF16(recorder, attnForProjection);
543
- oProjInputTemp = oProjInput;
544
- }
545
-
546
556
  // Use fused o_proj + residual for decode when possible
547
557
  // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
548
558
  const oProjDtype = getWeightDtype(oProjBuf);
@@ -28,10 +28,12 @@ import { runProbes } from '../probes.js';
28
28
  import { SlidingWindowKVCache } from '../../../kv-cache.js';
29
29
  import {
30
30
  recordAttentionInputs,
31
+ shouldForceF32AttentionProjectionForRoPE,
31
32
  resolveAttentionProjectionOutputDtype,
32
33
  projectAttentionQKV,
33
34
  applyAttentionQKNorm,
34
35
  } from './projections.js';
36
+ import { prepareAttentionProjectionInput } from './output-projection.js';
35
37
 
36
38
  import {
37
39
  shouldDebugLayer,
@@ -164,6 +166,14 @@ export async function runLayerAttentionGPU(
164
166
  dtype: normed.dtype,
165
167
  });
166
168
  }
169
+
170
+ await runProbes('post_input_norm', normed.buffer, {
171
+ layerIdx,
172
+ numTokens,
173
+ hiddenSize,
174
+ probes: state.debugProbes,
175
+ dtype: normed.dtype,
176
+ });
167
177
  }
168
178
 
169
179
  // Debug: Check normed input for L0 prefill
@@ -193,7 +203,14 @@ export async function runLayerAttentionGPU(
193
203
  }
194
204
 
195
205
  // 2. Q/K/V projections
196
- const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype);
206
+ const matmulOutputDtype = resolveAttentionProjectionOutputDtype(desiredOutputDtype, {
207
+ forceF32: shouldForceF32AttentionProjectionForRoPE({
208
+ attentionInputDtype: desiredOutputDtype,
209
+ headDim,
210
+ rotaryDim: config.ropeRotaryDim,
211
+ interleaved: config.ropeInterleaved,
212
+ }),
213
+ });
197
214
  let usedFusedQKV = false;
198
215
  ({ qTensor, qGateTensor, kTensor, vTensor, usedFusedQKV } = await projectAttentionQKV({
199
216
  recorder: null,
@@ -209,6 +226,7 @@ export async function runLayerAttentionGPU(
209
226
  matmulOutputDtype,
210
227
  getWeightBuffer,
211
228
  lora,
229
+ matmulDebug: state.runtimeConfig?.shared?.debug?.matmul ?? null,
212
230
  attentionOutputGate: config.attentionOutputGate === true,
213
231
  releaseTemporary: (buffer) => releaseBuffer(buffer),
214
232
  onFusedQKV: layerIdx === 0 && isPrefill
@@ -224,6 +242,27 @@ export async function runLayerAttentionGPU(
224
242
  await traceStep('matmul', `L${layerIdx}.k_proj`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
225
243
  await traceStep('matmul', `L${layerIdx}.v_proj`, layerIdx, vTensor.buffer, [numTokens, numKVHeads * headDim]);
226
244
  }
245
+ await runProbes('q_proj', qTensor.buffer, {
246
+ layerIdx,
247
+ numTokens,
248
+ hiddenSize: numHeads * headDim,
249
+ probes: state.debugProbes,
250
+ dtype: qTensor.dtype,
251
+ });
252
+ await runProbes('k_proj', kTensor.buffer, {
253
+ layerIdx,
254
+ numTokens,
255
+ hiddenSize: numKVHeads * headDim,
256
+ probes: state.debugProbes,
257
+ dtype: kTensor.dtype,
258
+ });
259
+ await runProbes('v_proj', vTensor.buffer, {
260
+ layerIdx,
261
+ numTokens,
262
+ hiddenSize: numKVHeads * headDim,
263
+ probes: state.debugProbes,
264
+ dtype: vTensor.dtype,
265
+ });
227
266
 
228
267
  // Kernel step debug: Q/K/V projections
229
268
  if (isKernelDebugEnabled(layerIdx)) {
@@ -331,6 +370,20 @@ export async function runLayerAttentionGPU(
331
370
  await traceStep('rope', `L${layerIdx}.k_rope`, layerIdx, kTensor.buffer, [numTokens, numKVHeads * headDim]);
332
371
  }
333
372
  }
373
+ await runProbes('q_rope', qTensor.buffer, {
374
+ layerIdx,
375
+ numTokens,
376
+ hiddenSize: numHeads * headDim,
377
+ probes: state.debugProbes,
378
+ dtype: qTensor.dtype,
379
+ });
380
+ await runProbes('k_rope', kTensor.buffer, {
381
+ layerIdx,
382
+ numTokens,
383
+ hiddenSize: numKVHeads * headDim,
384
+ probes: state.debugProbes,
385
+ dtype: kTensor.dtype,
386
+ });
334
387
  if (isKernelDebugEnabled(layerIdx)) {
335
388
  logKernelStep('rope', { layerIdx, label: `startPos=${currentSeqLen}` });
336
389
  await dumpTokenVector(qTensor.buffer, 'Q_rope', {
@@ -723,14 +776,14 @@ export async function runLayerAttentionGPU(
723
776
  let oProjInput = attnForProjection;
724
777
  oProjInputTemp = null;
725
778
  if (layerWeights.oProj && getWeightBuffer) {
779
+ ({ oProjInput, oProjInputTemp } = await prepareAttentionProjectionInput(
780
+ attnForProjection,
781
+ matmulOutputDtype,
782
+ castF32ToF16
783
+ ));
726
784
  const oProjBuf = getWeightBuffer(layerWeights.oProj, 'o_proj');
727
785
  const loraO = getLoRAModule(lora, layerIdx, 'o_proj');
728
786
 
729
- if (matmulOutputDtype === 'f16' && attnOutput.dtype !== 'f16') {
730
- oProjInput = await castF32ToF16(attnOutput);
731
- oProjInputTemp = oProjInput;
732
- }
733
-
734
787
  // Use fused o_proj + residual for decode when possible
735
788
  // Note: dtype from WeightBuffer metadata (buffer-dtypes WeakMap removed)
736
789
  const oProjDtype = getWeightDtype(oProjBuf);
@@ -150,6 +150,7 @@ export interface ParsedModelConfig {
150
150
  ropeLocalTheta: number | null;
151
151
  ropeRotaryDim: number;
152
152
  ropeInterleaved: boolean;
153
+ mropeInterleaved: boolean;
153
154
  mropeSection: number[] | null;
154
155
  partialRotaryFactor: number | null;
155
156
  ropeScale: number;
@@ -349,6 +349,24 @@ function normalizeLayerTypeTag(value) {
349
349
  return null;
350
350
  }
351
351
 
352
// resolveVisionConfig(rawConfig, manifest)
// Builds a camelCase vision-encoder config from a snake_case `vision_config` object.
// Lookup order for the source object: rawConfig.vision_config, then
// manifest.config.vision_config. Returns null when neither yields a truthy value of
// typeof 'object'.
// Each field uses `??`, so a default applies only when the source value is null/undefined:
//   depth 24, hiddenSize 1024, intermediateSize 4096, numHeads 16,
//   outHiddenSize -> out_hidden_size, else hidden_size, else 1024,
//   patchSize 16, spatialMergeSize 2, temporalPatchSize 2, eps 1e-6.
// deepstackVisualIndexes is passed through only when it is an array; otherwise [].
// imageTokenId comes from rawConfig.image_token_id, then manifest.image_token_id, else null.
// NOTE(review): the vision_config fallback reads manifest.config while the image_token_id
// fallback reads the manifest top level — confirm this asymmetry is intended.
// NOTE(review): an array-valued vision_config passes the typeof check and would yield an
// all-defaults result — confirm whether that input can occur.
+ function resolveVisionConfig(rawConfig, manifest) {
353
+ const vc = rawConfig?.vision_config ?? manifest?.config?.vision_config;
354
+ if (!vc || typeof vc !== 'object') return null;
355
+ return {
356
+ depth: vc.depth ?? 24,
357
+ hiddenSize: vc.hidden_size ?? 1024,
358
+ intermediateSize: vc.intermediate_size ?? 4096,
359
+ numHeads: vc.num_heads ?? 16,
360
+ outHiddenSize: vc.out_hidden_size ?? vc.hidden_size ?? 1024,
361
+ patchSize: vc.patch_size ?? 16,
362
+ spatialMergeSize: vc.spatial_merge_size ?? 2,
363
+ temporalPatchSize: vc.temporal_patch_size ?? 2,
364
+ eps: vc.eps ?? 1e-6,
365
+ deepstackVisualIndexes: Array.isArray(vc.deepstack_visual_indexes) ? vc.deepstack_visual_indexes : [],
366
+ imageTokenId: rawConfig?.image_token_id ?? manifest?.image_token_id ?? null,
367
+ };
368
+ }
369
+
352
370
  function parseCustomLayerTypes(layerTypes, numLayers, modelId) {
353
371
  if (!Array.isArray(layerTypes) || layerTypes.length === 0) {
354
372
  throw new Error(
@@ -482,6 +500,20 @@ export function toParsedConfigFromMerged(merged, manifest) {
482
500
  const queryPreAttnScalar = inf.attention.queryPreAttnScalar;
483
501
  const causalAttention = inf.attention.causal;
484
502
 
503
+ // Cross-field sanity: queryPreAttnScalar should typically equal headDim.
504
+ // A value of sqrt(headDim) indicates a known converter bug that produces
505
+ // attnScale = 1/sqrt(sqrt(headDim)) instead of the correct 1/sqrt(headDim).
506
+ if (queryPreAttnScalar != null && headDim != null
507
+ && queryPreAttnScalar !== headDim
508
+ && Math.abs(queryPreAttnScalar - Math.sqrt(headDim)) < 0.01) {
509
+ throw new Error(
510
+ `Model "${merged.modelId}": queryPreAttnScalar (${queryPreAttnScalar}) ` +
511
+ `equals sqrt(headDim) instead of headDim (${headDim}). ` +
512
+ `This is a known converter bug — the manifest must be regenerated ` +
513
+ `with the corrected converter.`
514
+ );
515
+ }
516
+
485
517
  // Get stop token IDs (cast to Manifest for compatibility)
486
518
  const stopTokenIds = getStopTokenIds(manifest);
487
519
 
@@ -498,10 +530,18 @@ export function toParsedConfigFromMerged(merged, manifest) {
498
530
  // RoPE scaling - use manifest inference as source of truth (not raw config)
499
531
  const ropeScale = inf.rope.ropeScalingFactor;
500
532
  const ropeScalingType = inf.rope.ropeScalingType;
501
- const ropeLocalScale = inf.rope.ropeLocalScalingFactor ?? ropeScale;
502
- const ropeLocalScalingType = inf.rope.ropeLocalScalingType ?? ropeScalingType;
533
+ const ropeLocalScale = inf.rope.ropeLocalScalingFactor;
534
+ const ropeLocalScalingType = inf.rope.ropeLocalScalingType;
503
535
  const partialRotaryFactor = inf.rope.partialRotaryFactor;
504
- const ropeInterleaved = inf.rope.mropeInterleaved === true;
536
+ const mropeInterleaved = inf.rope.mropeInterleaved === true;
537
+ const ropeInterleaved = false;
538
+
539
+ if (ropeLocalScale == null && (inf.rope.ropeLocalTheta != null || inf.rope.mropeSection != null)) {
540
+ throw new Error(
541
+ `Model "${merged.modelId}" uses hybrid/mRoPE but is missing rope.ropeLocalScalingFactor in manifest. ` +
542
+ `Re-convert the model using the latest converter or update the manifest to include an explicit scale.`
543
+ );
544
+ }
505
545
  const mropeSection = Array.isArray(inf.rope.mropeSection)
506
546
  ? inf.rope.mropeSection.map((entry) => Math.trunc(Number(entry)))
507
547
  : null;
@@ -511,7 +551,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
511
551
  `Manifest "${merged.modelId}" has invalid rope.mropeSection; expected positive integers.`
512
552
  );
513
553
  }
514
- if (ropeInterleaved && mropeSection) {
554
+ if (mropeInterleaved && mropeSection) {
515
555
  const doubledMropeDim = mropeSection.reduce((sum, entry) => sum + entry, 0) * 2;
516
556
  if (doubledMropeDim !== ropeRotaryDim) {
517
557
  throw new Error(
@@ -596,6 +636,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
596
636
  ropeLocalTheta: inf.rope.ropeLocalTheta,
597
637
  ropeRotaryDim,
598
638
  ropeInterleaved,
639
+ mropeInterleaved,
599
640
  mropeSection,
600
641
  partialRotaryFactor,
601
642
  ropeScale,
@@ -636,6 +677,7 @@ export function toParsedConfigFromMerged(merged, manifest) {
636
677
  chatTemplateType,
637
678
  chatTemplateEnabled,
638
679
  kernelPath: inf.defaultKernelPath,
680
+ visionConfig: resolveVisionConfig(config, manifest),
639
681
  };
640
682
  }
641
683
 
@@ -9,6 +9,7 @@ import { decodeReadback } from './debug-utils/index.js';
9
9
  import { createTensor } from '../../../gpu/tensor.js';
10
10
  import { castF32ToF16, recordCastF32ToF16 } from '../../../gpu/kernels/cast.js';
11
11
  import { isCpuWeightBuffer } from '../../../gpu/weight-buffer.js';
12
+ import { f16ToF32 } from '../../../loader/dtype-utils.js';
12
13
  import { selectRuleValue } from '../../../rules/rule-registry.js';
13
14
 
14
15
  const scaleShaderCode = `
@@ -202,11 +203,19 @@ export async function embed(tokenIds, embedBuffer, config) {
202
203
 
203
204
  const dtype = selectRuleValue('inference', 'dtype', 'f16OrF32', { useF16 });
204
205
 
205
- const cpuEmbeddings = isCpuWeightBuffer(embedBuffer)
206
- ? embedBuffer.data
207
- : embedBuffer instanceof Float32Array
208
- ? embedBuffer
209
- : null;
206
+ let cpuEmbeddings = null;
207
+ if (isCpuWeightBuffer(embedBuffer)) {
208
+ const bufDtype = embedBuffer.dtype;
209
+ if (bufDtype !== 'f32' && bufDtype !== 'f16') {
210
+ throw new Error(
211
+ `[Embed] CPU embedding buffer has unsupported dtype '${bufDtype}'; ` +
212
+ `only 'f32' and 'f16' are supported in the CPU gather path.`
213
+ );
214
+ }
215
+ cpuEmbeddings = embedBuffer.data;
216
+ } else if (embedBuffer instanceof Float32Array) {
217
+ cpuEmbeddings = embedBuffer;
218
+ }
210
219
 
211
220
  if (debug) {
212
221
  trace.embed(`tokens=${numTokens}, hidden=${hiddenSize}, vocab=${vocabSize}, scaleEmbeddings=${scaleEmbeddings}, transpose=${transpose}, indexOffset=${indexOffset}, activationDtype=${activationDtype}, useF16=${useF16}`);
@@ -226,18 +235,28 @@ export async function embed(tokenIds, embedBuffer, config) {
226
235
  }
227
236
 
228
237
  const output = new Float32Array(numTokens * hiddenSize);
238
+ // Check actual data type: loader's f16_to_f32 CPU path already decodes F16 into Float32Array,
239
+ // so dtype='f16' does not reliably indicate raw F16 bytes. Only Uint16Array needs per-element decoding.
240
+ const isF16Cpu = cpuEmbeddings instanceof Uint16Array;
229
241
  if (!transpose) {
230
242
  for (let t = 0; t < numTokens; t++) {
231
243
  const tokenId = (tokenIdArray)[t];
232
244
  const srcOffset = tokenId * hiddenSize;
233
- output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
245
+ if (isF16Cpu) {
246
+ for (let h = 0; h < hiddenSize; h++) {
247
+ output[t * hiddenSize + h] = f16ToF32(cpuEmbeddings[srcOffset + h]);
248
+ }
249
+ } else {
250
+ output.set(cpuEmbeddings.subarray(srcOffset, srcOffset + hiddenSize), t * hiddenSize);
251
+ }
234
252
  }
235
253
  } else {
236
254
  for (let t = 0; t < numTokens; t++) {
237
255
  const tokenId = (tokenIdArray)[t];
238
256
  const dstOffset = t * hiddenSize;
239
257
  for (let h = 0; h < hiddenSize; h++) {
240
- output[dstOffset + h] = cpuEmbeddings[h * vocabSize + tokenId];
258
+ const raw = cpuEmbeddings[h * vocabSize + tokenId];
259
+ output[dstOffset + h] = isF16Cpu ? f16ToF32(raw) : raw;
241
260
  }
242
261
  }
243
262
  }
@@ -58,10 +58,11 @@ function resolveFallbackActivationDtype(primaryActivationDtype) {
58
58
  function resolveFallbackKernelPath(primaryKernelPath) {
59
59
  const primaryKernelPathId = primaryKernelPath?.id ?? null;
60
60
  if (!primaryKernelPathId) {
61
- throw new Error(
62
- '[ExecutionPlan] F16 finiteness fallback requires a primary kernel path with a stable id. ' +
63
- 'Add a registered kernelPath id and a finiteness fallback rule.'
64
- );
61
+ return {
62
+ kernelPath: null,
63
+ kernelPathId: null,
64
+ kernelPathSource: 'none',
65
+ };
65
66
  }
66
67
 
67
68
  const explicitFallbackKernelPathId = typeof primaryKernelPath?.finitenessFallbackKernelPathId === 'string'
@@ -1,7 +1,7 @@
1
1
  import { selectRuleValue } from '../../../rules/rule-registry.js';
2
2
  import { cloneJson, isPhaseMatch, normalizeDtype, requireSessionActivationDtype, stepHasLayer } from './execution-v0-contract-helpers.js';
3
3
 
4
- const PIPELINE_COMPATIBLE_OPS = new Set([
4
+ export const PIPELINE_COMPATIBLE_OPS = new Set([
5
5
  'save',
6
6
  'load',
7
7
  'conv',
@@ -191,8 +191,15 @@ export function buildLayerPipelineFromExecution(steps) {
191
191
  if (layerSectionSteps.length === 0) {
192
192
  return null;
193
193
  }
194
- if (layerSectionSteps.some((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op))) {
195
- return null;
194
+ const incompatibleOps = [
195
+ ...new Set(
196
+ layerSectionSteps
197
+ .filter((step) => !PIPELINE_COMPATIBLE_OPS.has(step.op))
198
+ .map((step) => step.op)
199
+ ),
200
+ ];
201
+ if (incompatibleOps.length > 0) {
202
+ return { incompatibleOps };
196
203
  }
197
204
 
198
205
  const layerSteps = layerSectionSteps
@@ -31,6 +31,7 @@ import {
31
31
  buildModelRuntimeOverrides,
32
32
  buildSessionRuntimePatch,
33
33
  resolveFinitenessFallbackKernelPathId,
34
+ PIPELINE_COMPATIBLE_OPS,
34
35
  } from './execution-v0-runtime-builders.js';
35
36
 
36
37
  export function hasExecutionV0(manifestInference) {
@@ -152,7 +153,17 @@ export function compileExecutionV0(options = {}) {
152
153
  numLayers,
153
154
  finitenessFallbackKernelPathId
154
155
  );
155
- const layerPipeline = buildLayerPipelineFromExecution(resolvedSteps);
156
+ const layerPipelineResult = buildLayerPipelineFromExecution(resolvedSteps);
157
+ if (layerPipelineResult?.incompatibleOps && !kernelPath) {
158
+ throw new Error(
159
+ `[ExecutionV0] manifest.inference.execution.steps contains layer ops that are not ` +
160
+ `compatible with the JS layer pipeline and no inline kernelPath was built to cover execution. ` +
161
+ `Unsupported ops: ${layerPipelineResult.incompatibleOps.join(', ')}. ` +
162
+ `Either add explicit kernel references to each step (for inline-kernel execution) ` +
163
+ `or restrict layer ops to: ${[...PIPELINE_COMPATIBLE_OPS].join(', ')}.`
164
+ );
165
+ }
166
+ const layerPipeline = layerPipelineResult?.incompatibleOps ? null : layerPipelineResult;
156
167
  const sessionPatch = buildSessionRuntimePatch(resolvedSession);
157
168
  const modelOverrides = buildModelRuntimeOverrides(manifestInference);
158
169
  for (const [path, source] of sessionSourceByPath.entries()) {
@@ -111,6 +111,7 @@ export function buildLayerContext(state, recorder, isDecodeMode, debugLayers, de
111
111
  ropeLocalCos: state.ropeLocalCos,
112
112
  ropeLocalSin: state.ropeLocalSin,
113
113
  linearAttentionRuntime: state.linearAttentionRuntime,
114
+ convLayerStates: state.convLayerStates,
114
115
  weightConfig: getWeightBufferConfig(state),
115
116
  debugFlags: state.debugFlags,
116
117
  debugProbes: state.runtimeConfig.shared.debug.probes,