@simulatte/doppler 0.1.7 → 0.1.9

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (172):
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +25 -6
  3. package/package.json +25 -38
  4. package/src/browser/browser-converter.js +5 -0
  5. package/src/client/doppler-api.browser.js +6 -0
  6. package/src/client/doppler-api.d.ts +3 -0
  7. package/src/client/doppler-api.js +11 -2
  8. package/src/client/doppler-registry.js +3 -5
  9. package/src/client/doppler-registry.json +2 -2
  10. package/src/config/kernel-path-loader.d.ts +5 -0
  11. package/src/config/kernel-path-loader.js +13 -0
  12. package/src/config/kernels/kernel-ref-digests.js +23 -21
  13. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  14. package/src/config/kernels/registry.json +74 -0
  15. package/src/config/loader.js +9 -0
  16. package/src/config/merge-contract-check.js +7 -0
  17. package/src/config/platforms/loader.js +3 -1
  18. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  19. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  20. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  21. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  22. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  23. package/src/config/presets/kernel-paths/registry.json +21 -0
  24. package/src/config/presets/models/gemma2.json +2 -1
  25. package/src/config/presets/models/gemma3.json +4 -1
  26. package/src/config/presets/models/gemma4.json +61 -0
  27. package/src/config/presets/models/granite-docling.json +70 -0
  28. package/src/config/presets/models/lfm2.json +6 -1
  29. package/src/config/presets/models/qwen3.json +4 -3
  30. package/src/config/presets/models/qwen3_5.json +16 -0
  31. package/src/config/presets/models/qwen3_vl.json +40 -0
  32. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  33. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  34. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  35. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  36. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  37. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  38. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  39. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  40. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  41. package/src/config/runtime.js +3 -0
  42. package/src/config/schema/conversion.schema.d.ts +1 -0
  43. package/src/config/schema/debug.schema.d.ts +40 -0
  44. package/src/config/schema/debug.schema.js +28 -0
  45. package/src/config/schema/index.js +2 -0
  46. package/src/config/schema/inference-defaults.schema.js +1 -1
  47. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  48. package/src/config/schema/manifest.schema.d.ts +1 -1
  49. package/src/config/schema/manifest.schema.js +1 -1
  50. package/src/config/schema/memory-limits.schema.js +2 -2
  51. package/src/config/schema/storage.schema.js +2 -2
  52. package/src/converter/conversion-plan.js +11 -3
  53. package/src/converter/core.js +19 -8
  54. package/src/converter/manifest-inference.js +12 -22
  55. package/src/converter/parsers/transformer.js +4 -0
  56. package/src/converter/quantization-info.js +5 -1
  57. package/src/converter/quantizer.d.ts +5 -0
  58. package/src/converter/quantizer.js +34 -12
  59. package/src/converter/rope-config.js +8 -6
  60. package/src/converter/tokenizer-utils.d.ts +1 -0
  61. package/src/converter/tokenizer-utils.js +4 -1
  62. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  63. package/src/distribution/shard-delivery.js +40 -1
  64. package/src/formats/rdrr/classification.js +32 -0
  65. package/src/formats/rdrr/parsing.d.ts +4 -0
  66. package/src/formats/rdrr/parsing.js +14 -1
  67. package/src/gpu/kernel-runtime.js +4 -2
  68. package/src/gpu/kernels/attention.js +2 -1
  69. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  70. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  71. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  72. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  73. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  74. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  75. package/src/gpu/kernels/gated-short-conv.js +284 -0
  76. package/src/gpu/kernels/index.d.ts +8 -0
  77. package/src/gpu/kernels/index.js +6 -0
  78. package/src/gpu/kernels/linear-attention-core.js +37 -17
  79. package/src/gpu/kernels/matmul-selection.js +48 -4
  80. package/src/gpu/kernels/matmul.d.ts +5 -0
  81. package/src/gpu/kernels/matmul.js +71 -2
  82. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  83. package/src/gpu/kernels/rmsnorm.js +9 -2
  84. package/src/gpu/kernels/sample.js +1 -3
  85. package/src/gpu/kernels/sample.wgsl +39 -9
  86. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  87. package/src/gpu/kernels/shader-cache.js +9 -4
  88. package/src/gpu/kernels/split_qg.d.ts +50 -0
  89. package/src/gpu/kernels/split_qg.js +46 -0
  90. package/src/gpu/kernels/split_qg.wgsl +58 -0
  91. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  92. package/src/gpu/weight-buffer.d.ts +1 -1
  93. package/src/gpu/weight-buffer.js +1 -1
  94. package/src/inference/browser-harness.d.ts +2 -0
  95. package/src/inference/browser-harness.js +20 -1
  96. package/src/inference/kv-cache/base.js +3 -10
  97. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  98. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  99. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
  100. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  101. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  102. package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
  103. package/src/inference/pipelines/text/attention/projections.js +54 -13
  104. package/src/inference/pipelines/text/attention/record.js +16 -6
  105. package/src/inference/pipelines/text/attention/run.js +59 -6
  106. package/src/inference/pipelines/text/config.d.ts +1 -0
  107. package/src/inference/pipelines/text/config.js +46 -4
  108. package/src/inference/pipelines/text/embed.js +26 -7
  109. package/src/inference/pipelines/text/execution-plan.js +5 -4
  110. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  111. package/src/inference/pipelines/text/execution-v0.js +12 -1
  112. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  113. package/src/inference/pipelines/text/generator-runtime.js +19 -0
  114. package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
  115. package/src/inference/pipelines/text/generator-steps.js +71 -26
  116. package/src/inference/pipelines/text/generator.d.ts +5 -0
  117. package/src/inference/pipelines/text/generator.js +353 -166
  118. package/src/inference/pipelines/text/init.d.ts +15 -0
  119. package/src/inference/pipelines/text/init.js +35 -10
  120. package/src/inference/pipelines/text/layer.js +38 -8
  121. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  122. package/src/inference/pipelines/text/linear-attention.js +33 -3
  123. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  124. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  125. package/src/inference/pipelines/text/logits/index.js +3 -1
  126. package/src/inference/pipelines/text/model-load.js +3 -0
  127. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  128. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  129. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  130. package/src/inference/pipelines/text/ops.js +123 -53
  131. package/src/inference/pipelines/text/probes.js +1 -0
  132. package/src/inference/pipelines/text/sampling.js +52 -6
  133. package/src/inference/pipelines/text/state.js +2 -0
  134. package/src/inference/pipelines/text.d.ts +5 -0
  135. package/src/inference/pipelines/text.js +59 -1
  136. package/src/inference/pipelines/vision/encoder.js +386 -0
  137. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  138. package/src/inference/pipelines/vision/index.js +173 -0
  139. package/src/inference/pipelines/vision/ops.js +78 -0
  140. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  141. package/src/inference/test-harness.js +11 -9
  142. package/src/loader/doppler-loader.d.ts +3 -0
  143. package/src/loader/doppler-loader.js +20 -3
  144. package/src/loader/experts/expert-cache.js +6 -2
  145. package/src/loader/experts/expert-loader.js +6 -2
  146. package/src/loader/final-weights-loader.js +2 -0
  147. package/src/loader/layer-loader.js +42 -3
  148. package/src/loader/manifest-config.js +3 -1
  149. package/src/loader/shard-cache.js +3 -2
  150. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  151. package/src/loader/tensors/tensor-loader.js +130 -4
  152. package/src/rules/inference/dtype.rules.json +5 -0
  153. package/src/rules/inference/kernel-path.rules.json +2 -2
  154. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  155. package/src/rules/kernels/softmax.rules.json +2 -0
  156. package/src/rules/kernels/split-qg.rules.json +6 -0
  157. package/src/rules/rule-registry.d.ts +1 -0
  158. package/src/rules/rule-registry.js +4 -0
  159. package/src/storage/downloader.js +2 -1
  160. package/src/storage/quickstart-downloader.d.ts +3 -0
  161. package/src/storage/quickstart-downloader.js +27 -30
  162. package/src/storage/shard-manager.js +4 -3
  163. package/src/tooling/conversion-config-materializer.js +3 -5
  164. package/src/tooling/node-converter.js +28 -7
  165. package/src/tooling/node-source-runtime.js +65 -5
  166. package/src/tooling/node-webgpu.js +24 -7
  167. package/src/types/model.d.ts +5 -0
  168. package/src/utils/hf-resolve-url.d.ts +16 -0
  169. package/src/utils/hf-resolve-url.js +17 -0
  170. package/src/version.js +1 -1
  171. package/tools/doppler-cli.js +6 -1
  172. package/src/tooling/node-convert.d.ts +0 -54
@@ -28,6 +28,7 @@ import type {
28
28
  SpeculativeConfigSchema,
29
29
  KernelPathSchema,
30
30
  } from '../../../config/schema/index.js';
31
+ import type { LoaderDebugConfigSchema } from '../../../config/schema/debug.schema.js';
31
32
  import type { KernelPathSource } from '../../../config/kernel-path-loader.js';
32
33
 
33
34
  export interface PipelineStorageContext {
@@ -190,6 +191,12 @@ export interface WeightLoadResult {
190
191
  layerRouterWeights: Map<number, RouterWeights>;
191
192
  }
192
193
 
194
+ export interface ResolvedQ4KConfig {
195
+ useFusedQ4K: boolean;
196
+ q4kLayout: 'row' | 'col' | null;
197
+ keepF32Weights: boolean;
198
+ }
199
+
193
200
  /** Options for loadWeights */
194
201
  export interface LoadWeightsOptions {
195
202
  storageContext?: PipelineStorageContext;
@@ -200,6 +207,7 @@ export interface LoadWeightsOptions {
200
207
  resolvedKernelPath?: KernelPathSchema | null;
201
208
  kernelPathSource?: KernelPathSource;
202
209
  keepF32Weights?: boolean;
210
+ loaderDebug?: LoaderDebugConfigSchema | null;
203
211
  }
204
212
 
205
213
  /**
@@ -211,6 +219,13 @@ export function loadWeights(
211
219
  options?: LoadWeightsOptions
212
220
  ): Promise<WeightLoadResult>;
213
221
 
222
+ export function resolveQ4KConfig(
223
+ manifest: Manifest,
224
+ kernelPath?: KernelPathSchema | null,
225
+ kernelPathSource?: KernelPathSource,
226
+ keepF32Weights?: boolean
227
+ ): ResolvedQ4KConfig;
228
+
214
229
  /**
215
230
  * Apply Gemma chat template to a prompt.
216
231
  */
@@ -11,7 +11,7 @@ import { getDopplerLoader } from '../../../loader/doppler-loader.js';
11
11
  import { log, setGPUDevice, trace as debugTrace } from '../../../debug/index.js';
12
12
  import { getRuntimeConfig } from '../../../config/runtime.js';
13
13
  import { PAGED_LAYOUT_SEQ_LEN_THRESHOLD } from '../../../config/schema/index.js';
14
- import { isKernelPathFusedQ4K } from '../../../config/kernel-path-loader.js';
14
+ import { isKernelPathFusedQ4K, kernelPathRequiresF32MatmulWeights } from '../../../config/kernel-path-loader.js';
15
15
  import { createWeightBuffer, getWeightDtype, isWeightBuffer } from '../../../gpu/weight-buffer.js';
16
16
  import { selectRuleValue } from '../../../rules/rule-registry.js';
17
17
  import {
@@ -128,7 +128,7 @@ function createRemoteStorageContext(baseUrl, manifest) {
128
128
  }
129
129
 
130
130
 
131
- function resolveQ4KConfig(
131
+ export function resolveQ4KConfig(
132
132
  manifest,
133
133
  kernelPath,
134
134
  kernelPathSource = 'none',
@@ -150,18 +150,23 @@ function resolveQ4KConfig(
150
150
  );
151
151
  }
152
152
  let useFused = kernelPath ? isKernelPathFusedQ4K(kernelPath) : hasSubgroups;
153
+ const kernelPathKeepsF32Weights = kernelPathRequiresF32MatmulWeights(kernelPath);
153
154
  if (q4kLayout === 'col') {
154
155
  useFused = false;
155
156
  }
157
+ const resolvedKeepF32Weights = keepF32Weights || kernelPathKeepsF32Weights;
156
158
 
157
159
  const pathLabel = kernelPath?.id ?? 'auto';
158
160
  const layoutLabel = q4kLayout ?? 'none';
159
- debugTrace.loader(`Q4K config: fused=${useFused}, kernelPath=${pathLabel}, source=${kernelPathSource}, layout=${layoutLabel}, subgroups=${hasSubgroups}`);
161
+ debugTrace.loader(
162
+ `Q4K config: fused=${useFused}, kernelPath=${pathLabel}, source=${kernelPathSource}, ` +
163
+ `layout=${layoutLabel}, keepF32Weights=${resolvedKeepF32Weights}, subgroups=${hasSubgroups}`
164
+ );
160
165
 
161
166
  return {
162
167
  useFusedQ4K: useFused,
163
168
  q4kLayout,
164
- keepF32Weights,
169
+ keepF32Weights: resolvedKeepF32Weights,
165
170
  };
166
171
  }
167
172
 
@@ -304,13 +309,21 @@ export async function initRoPEFrequencies(config, useGPU) {
304
309
  if (!Number.isFinite(ropeScale) || ropeScale <= 0) {
305
310
  throw new Error(`RoPE scale must be a positive number; got "${ropeScale}".`);
306
311
  }
307
- const resolvedLocalScale = ropeLocalScale ?? ropeScale;
308
- if (!Number.isFinite(resolvedLocalScale) || resolvedLocalScale <= 0) {
312
+ const resolvedLocalScale = ropeLocalScale;
313
+ if (resolvedLocalScale != null && (!Number.isFinite(resolvedLocalScale) || resolvedLocalScale <= 0)) {
309
314
  throw new Error(`Local RoPE scale must be a positive number; got "${resolvedLocalScale}".`);
310
315
  }
311
316
  const resolvedLocalTheta = ropeLocalTheta ?? ropeTheta;
312
- const resolvedLocalScalingType = ropeLocalScalingType ?? ropeScalingType;
313
- const resolvedLocalScaling = ropeLocalScaling ?? ropeScaling;
317
+ const resolvedLocalScalingType = (
318
+ ropeLocalScalingType === undefined
319
+ ? ropeScalingType
320
+ : ropeLocalScalingType
321
+ );
322
+ const resolvedLocalScaling = (
323
+ ropeLocalScalingType === undefined
324
+ ? ropeScaling
325
+ : ropeLocalScaling
326
+ );
314
327
  const resolvedRotaryDim = resolveRotaryDim(headDim, rotaryDim, partialRotaryFactor);
315
328
  const halfDim = resolvedRotaryDim / 2;
316
329
  if (mropeInterleaved === true && Array.isArray(mropeSection)) {
@@ -502,6 +515,12 @@ export function createKVCache(modelConfig, useGPU, debug = false, runtimeConfig)
502
515
  cacheLayout = 'paged';
503
516
  layoutSource = 'threshold';
504
517
  }
518
+ if (forceContiguousKVCache && cacheLayout === 'paged') {
519
+ throw new Error(
520
+ 'Paged KV cache layout is not supported for models with full-attention layers. ' +
521
+ 'Set runtime.inference.kvcache.layout to "contiguous" instead.'
522
+ );
523
+ }
505
524
  if (debug && cacheLayout !== runtimeKV.layout) {
506
525
  log.debug('Pipeline', `KV cache layout override: ${runtimeKV.layout} -> ${cacheLayout} (${layoutSource})`);
507
526
  }
@@ -599,7 +618,7 @@ export function createKVCache(modelConfig, useGPU, debug = false, runtimeConfig)
599
618
 
600
619
  if (debug) {
601
620
  if (forceContiguousKVCache && modelConfig.layerTypes) {
602
- log.debug('Pipeline', 'Layer pattern includes full-attention layers; forcing contiguous KV cache.');
621
+ log.debug('Pipeline', 'Layer pattern includes full-attention layers; paged layout blocked, contiguous enforced.');
603
622
  }
604
623
  const isSliding = kvCache instanceof SlidingWindowKVCache;
605
624
  log.debug('Pipeline', `KV cache: type=${kvCache?.constructor?.name || 'unknown'}, kvDtype=${kvCache.kvDtype}, layout=${kvCache.layout}, maxSeqLen=${kvCache.maxSeqLen}, windowSize=${isSliding ? kvCache.windowSize : null}`);
@@ -635,7 +654,12 @@ export async function initTokenizer(manifest, options = {}) {
635
654
 
636
655
 
637
656
  export async function loadWeights(manifest, modelConfig, options = {}) {
638
- const { onProgress, loadingConfig, baseUrl } = options;
657
+ const {
658
+ onProgress,
659
+ loadingConfig,
660
+ baseUrl,
661
+ loaderDebug,
662
+ } = options;
639
663
  const runtimeStorageContext = options.storageContext
640
664
  ?? createRemoteStorageContext(baseUrl, manifest);
641
665
  const verifyHashes = (
@@ -657,6 +681,7 @@ export async function loadWeights(manifest, modelConfig, options = {}) {
657
681
  keepF32Weights
658
682
  )
659
683
  );
684
+ dopplerLoader.setLoaderDebugConfig(loaderDebug ?? null);
660
685
 
661
686
  const tensorsFile = isRDRRManifest(manifest) ? manifest.tensorsFile : null;
662
687
  if (baseUrl && tensorsFile) {
@@ -43,19 +43,16 @@ export function detectSandwichNorm(config) {
43
43
  }
44
44
 
45
45
 
46
- export function isMoELayer(layerIdx, config, layerWeights) {
46
+ export function isMoELayer(layerIdx, config) {
47
47
  if (!config.useMoE) return false;
48
48
 
49
- // Check if layer has router weights
50
- if (layerWeights?.routerWeight) return true;
51
-
52
- // Fall back to layer_types array if available
49
+ // Manifest-first: check layerTypes from config (derived from manifest.inference.layerPattern)
53
50
  const layerTypes = config.layerTypes;
54
51
  if (Array.isArray(layerTypes) && layerIdx < layerTypes.length) {
55
52
  return layerTypes[layerIdx] === 'moe';
56
53
  }
57
54
 
58
- // Default: assume all layers are MoE if model uses MoE
55
+ // No layerTypes available: assume all layers are MoE
59
56
  return true;
60
57
  }
61
58
 
@@ -87,6 +84,11 @@ function assertSupportedLayerRuntime(layerIdx, config) {
87
84
  }
88
85
  }
89
86
 
87
+ function getConvLayerState(convLayerStates, layerIdx) {
88
+ if (!convLayerStates) return {};
89
+ return convLayerStates.get(layerIdx) ?? {};
90
+ }
91
+
90
92
  function isSlidingLayerType(layerType) {
91
93
  const normalized = normalizeLayerType(layerType);
92
94
  return normalized === 'sliding_attention'
@@ -103,6 +105,14 @@ function isConvLayerType(layerType) {
103
105
  || normalized === 'liv_convolution';
104
106
  }
105
107
 
108
+ export function hasConvLayers(layerTypes) {
109
+ if (!Array.isArray(layerTypes)) return false;
110
+ for (let i = 0; i < layerTypes.length; i++) {
111
+ if (isConvLayerType(layerTypes[i])) return true;
112
+ }
113
+ return false;
114
+ }
115
+
106
116
  function isLinearLayerType(layerType) {
107
117
  const normalized = normalizeLayerType(layerType);
108
118
  return normalized === 'linear_attention'
@@ -201,8 +211,22 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
201
211
  );
202
212
  }
203
213
  const convKernel = layerWeights?.convKernel ?? null;
214
+ // Apply input norm (operator_norm) before conv mixer — matches HF Lfm2 forward pass
215
+ let normedTensor = inputTensor;
216
+ const inputNormWeight = layerWeights?.inputNorm ?? null;
217
+ if (inputNormWeight) {
218
+ const normWeightBuf = getNormWeightBuffer(inputNormWeight, `L${layerIdx}.conv_input_norm`);
219
+ normedTensor = await doRMSNorm(inputTensor, normWeightBuf, rmsNormEps, {
220
+ batchSize: numTokens,
221
+ hiddenSize,
222
+ rmsNormWeightOffset: config.rmsNormWeightOffset,
223
+ label: `L${layerIdx}.conv_input_norm`,
224
+ layerIdx,
225
+ }, recorder);
226
+ if (!(inputNormWeight instanceof GPUBuffer)) releaseOrTrack(recorder, normWeightBuf);
227
+ }
204
228
  attnOutput = await doConv(
205
- inputTensor,
229
+ normedTensor,
206
230
  getWeightBuffer(convInProj, `L${layerIdx}.conv_in_proj`),
207
231
  convKernel ? getWeightBuffer(convKernel, `L${layerIdx}.conv_kernel`) : null,
208
232
  getWeightBuffer(convOutProj, `L${layerIdx}.conv_out_proj`),
@@ -213,9 +237,13 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
213
237
  label: `L${layerIdx}.conv`,
214
238
  swigluLimit: config.swigluLimit,
215
239
  kernelPath: context.kernelPath ?? null,
240
+ convState: getConvLayerState(context.convLayerStates, layerIdx),
216
241
  },
217
242
  recorder
218
243
  );
244
+ if (normedTensor !== inputTensor) {
245
+ releaseOrTrack(recorder, normedTensor.buffer);
246
+ }
219
247
  } else if (isLinearLayer) {
220
248
  attnOutput = await runLinearAttentionLayer(inputTensor, layerWeights ?? null, {
221
249
  layerIdx,
@@ -276,6 +304,7 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
276
304
  : (ropeFreqsSin),
277
305
  kvCache: ((kvCache)),
278
306
  stats: context.stats,
307
+ debugProbes: context.debugProbes,
279
308
  linearRuntime: context.linearAttentionRuntime ?? null,
280
309
  };
281
310
 
@@ -720,6 +749,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
720
749
  label: `L${layerIdx}.plan_conv`,
721
750
  swigluLimit: config.swigluLimit,
722
751
  kernelPath: context.kernelPath ?? null,
752
+ convState: getConvLayerState(context.convLayerStates, layerIdx),
723
753
  },
724
754
  recorder
725
755
  );
@@ -781,7 +811,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
781
811
  let outputTensor;
782
812
  const { runMoEFFNGPU, runDenseFFNGPU } = await import('./ffn/index.js');
783
813
 
784
- const canAutoMoe = config.useMoE && isMoELayer(layerIdx, config, layerWeights);
814
+ const canAutoMoe = config.useMoE && isMoELayer(layerIdx, config);
785
815
  const useMoe = selectRuleValue(
786
816
  'inference',
787
817
  'layer',
@@ -84,6 +84,11 @@ export declare function inferLinearNormMode(
84
84
  }
85
85
  ): LinearNormMode | null;
86
86
 
87
+ export declare function applyLinearNormWeightOffset(
88
+ values: Float32Array,
89
+ rmsNormWeightOffset: boolean
90
+ ): Float32Array;
91
+
87
92
  export declare function resetLinearAttentionRuntime(
88
93
  runtime: LinearAttentionRuntime | null | undefined
89
94
  ): LinearAttentionRuntime;
@@ -5,6 +5,8 @@ import { log } from '../../../debug/index.js';
5
5
  import { decodeReadback } from './debug-utils/index.js';
6
6
  import { runLinearAttentionCoreGPU } from '../../../gpu/kernels/linear-attention-core.js';
7
7
  import { runProbes } from './probes.js';
8
+ import { QK_K, Q4K_BLOCK_BYTES } from '../../../config/schema/index.js';
9
+ import { dequantizeQ4KM } from '../../../converter/quantizer.js';
8
10
 
9
11
  const LINEAR_RUNTIME_SCHEMA_VERSION = 1;
10
12
  const QK_L2NORM_EPS = 1e-6;
@@ -34,6 +36,15 @@ function bytesFromDtype(dtype) {
34
36
  return 4;
35
37
  }
36
38
 
39
+ export function applyLinearNormWeightOffset(values, rmsNormWeightOffset) {
40
+ if (!(values instanceof Float32Array)) {
41
+ throw new Error('applyLinearNormWeightOffset requires Float32Array input.');
42
+ }
43
+ // Qwen linear-attention output norm uses direct weights even when surrounding
44
+ // transformer RMSNorm sites use the Gemma-style (1 + weight) formula.
45
+ return values;
46
+ }
47
+
37
48
  function cloneLayerRuntimeState(layerState) {
38
49
  return {
39
50
  layerIdx: layerState.layerIdx,
@@ -283,9 +294,27 @@ async function readWeightAsF32(weight, expectedElements, label) {
283
294
  if (!elementCount && isWeightBuffer(weight) && Array.isArray(weight.shape) && weight.shape.length > 0) {
284
295
  elementCount = weight.shape.reduce((total, dim) => total * Math.max(1, Math.trunc(Number(dim) || 0)), 1);
285
296
  }
297
+ const isQ4K = sourceDtype === 'q4k' || sourceDtype === 'q4_k_m' || sourceDtype === 'q4_k';
286
298
  if (!elementCount) {
287
- const inferredBytes = sourceDtype === 'f16' || sourceDtype === 'bf16' ? 2 : 4;
288
- elementCount = Math.trunc(sourceBuffer.size / inferredBytes);
299
+ if (isQ4K) {
300
+ elementCount = Math.trunc(sourceBuffer.size / Q4K_BLOCK_BYTES) * QK_K;
301
+ } else {
302
+ const inferredBytes = sourceDtype === 'f16' || sourceDtype === 'bf16' ? 2 : 4;
303
+ elementCount = Math.trunc(sourceBuffer.size / inferredBytes);
304
+ }
305
+ }
306
+
307
+ if (isQ4K) {
308
+ const numBlocks = Math.ceil(elementCount / QK_K);
309
+ const q4kBytes = numBlocks * Q4K_BLOCK_BYTES;
310
+ const raw = await readBuffer(sourceBuffer, q4kBytes);
311
+ const decoded = dequantizeQ4KM(new Uint8Array(raw), numBlocks, [elementCount]);
312
+ if (expectedElements != null && decoded.length !== expectedElements) {
313
+ throw new Error(
314
+ `Weight "${label}" Q4K decoded length ${decoded.length}, expected ${expectedElements}.`
315
+ );
316
+ }
317
+ return decoded;
289
318
  }
290
319
 
291
320
  if (!sourceDtype) {
@@ -454,6 +483,7 @@ async function createLayerRuntimeState(
454
483
  expectedNormElements,
455
484
  `L${layerIdx}.linear_attn.norm.weight`
456
485
  );
486
+ const runtimeNorm = applyLinearNormWeightOffset(norm, config.rmsNormWeightOffset === true);
457
487
 
458
488
  const aNegExp = new Float32Array(aLog.length);
459
489
  for (let i = 0; i < aLog.length; i++) {
@@ -490,7 +520,7 @@ async function createLayerRuntimeState(
490
520
  convWeight,
491
521
  dtBias,
492
522
  aNegExp,
493
- normWeight: norm,
523
+ normWeight: runtimeNorm,
494
524
  convState,
495
525
  recurrentState,
496
526
  convWeightGPU: null,
@@ -304,7 +304,7 @@ export async function computeLogitsGPU(
304
304
 
305
305
  const logitsTensor = await runMatmul(normedTensor, lmHeadBuffer, numTokens, matmulVocabSize, hiddenSize, {
306
306
  transposeB: 'auto',
307
- role: forceStableF32Logits ? undefined : 'lm_head',
307
+ role: 'lm_head',
308
308
  kernelPath: config.kernelPath ?? null,
309
309
  });
310
310
 
@@ -391,7 +391,7 @@ export async function recordLogitsGPU(
391
391
  // Record matmul (no submit)
392
392
  const logitsTensor = await recordMatmul(recorder, normedTensor, lmHeadBuffer, numTokens, matmulVocabSize, hiddenSize, {
393
393
  transposeB: 'auto',
394
- role: forceStableF32Logits ? undefined : 'lm_head',
394
+ role: 'lm_head',
395
395
  kernelPath: config.kernelPath ?? null,
396
396
  });
397
397
 
@@ -25,6 +25,10 @@ export { computeLogitsGPU, recordLogitsGPU, computeChunkedLogitsGPU, resolveCpuW
25
25
  // Re-export utilities
26
26
  export { extractLastPositionLogits, finalizeLogits } from './utils.js';
27
27
 
28
+ export interface ComputeLogitsOptions {
29
+ lastPositionOnly?: boolean;
30
+ }
31
+
28
32
  /**
29
33
  * Compute logits from hidden states.
30
34
  *
@@ -53,5 +57,6 @@ export function computeLogits(
53
57
  debugFlags?: LogitsDebugFlags,
54
58
  getNormWeightBuffer?: (weight: GPUBuffer | Float32Array | ArrayBuffer, label: string) => GPUBuffer,
55
59
  debugCheckBuffer?: (buffer: GPUBuffer, label: string, numTokens: number, expectedDim?: number) => Promise<void>,
56
- debugProbes?: ProbeConfigSchema[] | null
60
+ debugProbes?: ProbeConfigSchema[] | null,
61
+ options?: ComputeLogitsOptions
57
62
  ): Promise<Float32Array>;
@@ -253,6 +253,7 @@ export async function computeLogits(
253
253
 
254
254
  const lastPositionOnly = options?.lastPositionOnly === true && numTokens > 1;
255
255
  const matmulRows = lastPositionOnly ? 1 : numTokens;
256
+ const matmulPhaseOverride = lastPositionOnly ? 'prefill' : null;
256
257
  let matmulInputTensor = normedTensor;
257
258
  let matmulInputOwned = false;
258
259
  if (lastPositionOnly) {
@@ -270,7 +271,8 @@ export async function computeLogits(
270
271
  // HuggingFace models store lm_head as [vocabSize, hiddenSize], so transposeB=true
271
272
  const logitsTensor = await runMatmul(matmulInputTensor, lmHeadBuffer, matmulRows, matmulVocabSize, hiddenSize, {
272
273
  transposeB: 'auto',
273
- role: (forceStableF32Logits || lastPositionOnly) ? undefined : 'lm_head',
274
+ role: 'lm_head',
275
+ phaseOverride: matmulPhaseOverride,
274
276
  kernelPath: config.kernelPath ?? null,
275
277
  });
276
278
  await runProbes('logits', logitsTensor.buffer, {
@@ -234,6 +234,9 @@ function buildManifestDecodeLoopRuntimePatch(manifest) {
234
234
 
235
235
  export function applyModelBatchingRuntimeDefaults(runtimeConfig, manifest, modelConfig) {
236
236
  void modelConfig;
237
+ if (manifest?.inference?.schema === 'doppler.execution/v0') {
238
+ return runtimeConfig;
239
+ }
237
240
  const batching = runtimeConfig?.inference?.batching;
238
241
  const generation = runtimeConfig?.inference?.generation;
239
242
  const runtimeBatchingAtDefaults = isRuntimeBatchingAtGlobalDefaults(batching);
@@ -23,6 +23,7 @@ import {
23
23
  validateMoeShape,
24
24
  resolveMoeVendorProfile,
25
25
  resolveGptOssKernelPathProfile,
26
+ resolveMixtralKernelPathProfile,
26
27
  } from './moe-shape-validator.js';
27
28
 
28
29
  export async function moeFeedForwardGPU(
@@ -52,7 +53,10 @@ export async function moeFeedForwardGPU(
52
53
  if (topK == null) {
53
54
  throw new Error('MoE topK is required in config.');
54
55
  }
55
- const modelType = config.modelType ?? (expertFormat === 'gpt-oss' ? 'gpt-oss' : 'mixtral');
56
+ if (config.modelType == null) {
57
+ throw new Error('MoE config.modelType is required; got null/undefined.');
58
+ }
59
+ const modelType = config.modelType;
56
60
  validateMoeShape(
57
61
  { hiddenSize, intermediateSize, moeTopK: topK, numExperts, expertFormat },
58
62
  { modelType }
@@ -130,7 +134,13 @@ export async function moeFeedForwardGPU(
130
134
  trace.buffers(`MoE L${layerIdx} router_logits`, { min, max, nanCount, dtype: logitsDtype });
131
135
  }
132
136
 
137
+ // Profile resolution: routerTopK/dequantExpert are resolved for tracing and
138
+ // forward validation. Actual kernel dispatch uses the generic softmax.rules.json
139
+ // topkVariant rules (keyed by modelType) and format-specific dequant paths.
140
+ // GPT-OSS: dequantTileShape actively steers MXFP4 dequant; routerTopK is trace-only.
141
+ // Mixtral: expert weights are pre-loaded (no runtime dequant); both fields are trace-only.
133
142
  let gptOssKernelPathProfile = null;
143
+ let mixtralKernelPathProfile = null;
134
144
  if (modelType === 'gpt-oss') {
135
145
  gptOssKernelPathProfile = await resolveGptOssKernelPathProfile({
136
146
  hasF16: caps.hasF16,
@@ -141,6 +151,14 @@ export async function moeFeedForwardGPU(
141
151
  groupSize: 32,
142
152
  tileShape: vendorProfile.dequantTileShape,
143
153
  });
154
+ } else if (modelType === 'mixtral') {
155
+ mixtralKernelPathProfile = await resolveMixtralKernelPathProfile({
156
+ hasF16: caps.hasF16,
157
+ hasSubgroups: caps.hasSubgroups,
158
+ routerDtype: logitsDtype,
159
+ weightsDtype: activationDtype,
160
+ outputDtype: activationDtype,
161
+ });
144
162
  }
145
163
 
146
164
  stepStart = perfMark();
@@ -159,7 +177,7 @@ export async function moeFeedForwardGPU(
159
177
  perfLog(`MoE L${layerIdx} topk`, stepStart, {
160
178
  topK,
161
179
  modelType,
162
- routerTopKKernel: gptOssKernelPathProfile?.routerTopK ?? null,
180
+ routerTopKKernel: gptOssKernelPathProfile?.routerTopK ?? mixtralKernelPathProfile?.routerTopK ?? null,
163
181
  });
164
182
 
165
183
  if (isTraceEnabled('buffers')) {
@@ -211,7 +229,7 @@ export async function moeFeedForwardGPU(
211
229
  const bytesPerElement = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: activationDtype });
212
230
  const bytesPerToken = hiddenSize * bytesPerElement;
213
231
  let maxTokensPerExpert = resolveMaxTokensPerExpert(numTokens, numExperts, topK, hiddenSize, activationDtype);
214
- if (modelType === 'gpt-oss') {
232
+ if (vendorProfile.maxTokensPerExpertScale !== 1.0) {
215
233
  maxTokensPerExpert = Math.max(
216
234
  1,
217
235
  Math.round(maxTokensPerExpert * vendorProfile.maxTokensPerExpertScale)
@@ -29,3 +29,12 @@ export interface GptOssKernelPathProfile {
29
29
  export declare function resolveGptOssKernelPathProfile(
30
30
  context: Record<string, unknown>
31
31
  ): Promise<GptOssKernelPathProfile>;
32
+
33
+ export interface MixtralKernelPathProfile {
34
+ routerTopK: string;
35
+ dequantExpert: string;
36
+ }
37
+
38
+ export declare function resolveMixtralKernelPathProfile(
39
+ context: Record<string, unknown>
40
+ ): Promise<MixtralKernelPathProfile>;
@@ -7,17 +7,15 @@ function asVendorString(caps) {
7
7
  }
8
8
 
9
9
  export function resolveMoeVendorProfile(modelType) {
10
- if (modelType !== 'gpt-oss') {
11
- return {
12
- preferVec4Dequant: false,
13
- dequantTileShape: 'scalar',
14
- routerWorkgroupSize: 128,
15
- maxTokensPerExpertScale: 1.0,
16
- };
17
- }
18
10
  const caps = getKernelCapabilities();
19
11
  const vendor = asVendorString(caps);
20
- return selectRuleValue('kernels', 'moeGptoss', 'vendorQuirkProfile', { vendor });
12
+ if (modelType === 'gpt-oss') {
13
+ return selectRuleValue('kernels', 'moeGptoss', 'vendorQuirkProfile', { vendor });
14
+ }
15
+ if (modelType === 'mixtral') {
16
+ return selectRuleValue('kernels', 'moeMixtral', 'vendorQuirkProfile', { vendor });
17
+ }
18
+ throw new Error(`[MoE] Unknown modelType "${modelType}" for vendor profile resolution.`);
21
19
  }
22
20
 
23
21
  function resolveGptOssRuleContext(context) {
@@ -41,6 +39,25 @@ export async function resolveGptOssKernelPathProfile(context) {
41
39
  };
42
40
  }
43
41
 
42
+ function resolveMixtralRuleContext(context) {
43
+ return {
44
+ modelType: 'mixtral',
45
+ hasF16: context?.hasF16,
46
+ hasSubgroups: context?.hasSubgroups,
47
+ routerDtype: context?.routerDtype ?? 'f32',
48
+ weightsDtype: context?.weightsDtype,
49
+ outputDtype: context?.outputDtype ?? context?.weightsDtype,
50
+ };
51
+ }
52
+
53
+ export async function resolveMixtralKernelPathProfile(context) {
54
+ const ruleContext = resolveMixtralRuleContext(context);
55
+ return {
56
+ routerTopK: selectRuleValue('kernels', 'moeMixtral', 'routerTopKVariant', ruleContext),
57
+ dequantExpert: selectRuleValue('kernels', 'moeMixtral', 'dequantVariant', ruleContext),
58
+ };
59
+ }
60
+
44
61
  export function validateMoeShape(config, options = {}) {
45
62
  const {
46
63
  hiddenSize,
@@ -66,8 +83,11 @@ export function validateMoeShape(config, options = {}) {
66
83
 
67
84
  if (modelType === 'gpt-oss') {
68
85
  const policy = selectRuleValue('kernels', 'moeGptoss', 'shapePolicy', { modelType });
69
- const hiddenDivisor = policy.hiddenSizeDivisor ?? 32;
70
- const intermediateDivisor = policy.intermediateSizeDivisor ?? 32;
86
+ if (policy.hiddenSizeDivisor == null || policy.intermediateSizeDivisor == null) {
87
+ throw new Error('[MoE] GPT-OSS shapePolicy is missing hiddenSizeDivisor or intermediateSizeDivisor.');
88
+ }
89
+ const hiddenDivisor = policy.hiddenSizeDivisor;
90
+ const intermediateDivisor = policy.intermediateSizeDivisor;
71
91
  if (hiddenSize % hiddenDivisor !== 0 || intermediateSize % intermediateDivisor !== 0) {
72
92
  throw new Error(
73
93
  `[MoE] GPT-OSS shape policy violation: hiddenSize (${hiddenSize}) % ${hiddenDivisor} = ${hiddenSize % hiddenDivisor}, ` +