@simulatte/doppler 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. package/CHANGELOG.md +14 -1
  2. package/README.md +25 -6
  3. package/package.json +5 -3
  4. package/src/client/doppler-api.browser.js +6 -0
  5. package/src/client/doppler-api.d.ts +3 -0
  6. package/src/client/doppler-api.js +11 -2
  7. package/src/client/doppler-registry.js +3 -5
  8. package/src/client/doppler-registry.json +16 -0
  9. package/src/config/kernels/kernel-ref-digests.js +23 -21
  10. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  11. package/src/config/loader.js +6 -0
  12. package/src/config/platforms/loader.js +3 -1
  13. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  14. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  15. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  16. package/src/config/presets/kernel-paths/registry.json +7 -0
  17. package/src/config/presets/models/gemma3.json +2 -1
  18. package/src/config/presets/models/gemma4.json +61 -0
  19. package/src/config/presets/models/granite-docling.json +70 -0
  20. package/src/config/presets/models/lfm2.json +6 -1
  21. package/src/config/presets/models/qwen3_vl.json +40 -0
  22. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  23. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  24. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  25. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  26. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  27. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  28. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  29. package/src/config/runtime.js +3 -0
  30. package/src/config/schema/debug.schema.d.ts +40 -0
  31. package/src/config/schema/debug.schema.js +28 -0
  32. package/src/config/schema/index.js +2 -0
  33. package/src/config/schema/inference-defaults.schema.js +1 -1
  34. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  35. package/src/config/schema/memory-limits.schema.js +2 -2
  36. package/src/config/schema/storage.schema.js +1 -1
  37. package/src/converter/conversion-plan.js +1 -1
  38. package/src/converter/core.js +17 -8
  39. package/src/converter/quantizer.d.ts +5 -0
  40. package/src/converter/quantizer.js +15 -0
  41. package/src/distribution/shard-delivery.js +34 -0
  42. package/src/formats/rdrr/classification.js +32 -0
  43. package/src/gpu/kernel-runtime.js +4 -2
  44. package/src/gpu/kernels/attention.js +2 -1
  45. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  46. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  47. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  48. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  49. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  50. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  51. package/src/gpu/kernels/gated-short-conv.js +284 -0
  52. package/src/gpu/kernels/linear-attention-core.js +37 -17
  53. package/src/gpu/kernels/matmul-selection.js +1 -0
  54. package/src/gpu/kernels/matmul.d.ts +3 -0
  55. package/src/gpu/kernels/matmul.js +70 -1
  56. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  57. package/src/gpu/kernels/sample.js +1 -3
  58. package/src/gpu/kernels/sample.wgsl +39 -9
  59. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  60. package/src/gpu/kernels/shader-cache.js +9 -4
  61. package/src/inference/kv-cache/base.js +3 -10
  62. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  63. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
  64. package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
  65. package/src/inference/pipelines/text/attention/projections.js +13 -2
  66. package/src/inference/pipelines/text/attention/record.js +1 -0
  67. package/src/inference/pipelines/text/attention/run.js +9 -0
  68. package/src/inference/pipelines/text/config.d.ts +1 -0
  69. package/src/inference/pipelines/text/config.js +32 -4
  70. package/src/inference/pipelines/text/embed.js +26 -7
  71. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  72. package/src/inference/pipelines/text/execution-v0.js +12 -1
  73. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  74. package/src/inference/pipelines/text/generator-runtime.js +14 -0
  75. package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
  76. package/src/inference/pipelines/text/generator-steps.js +46 -29
  77. package/src/inference/pipelines/text/generator.d.ts +5 -0
  78. package/src/inference/pipelines/text/generator.js +320 -166
  79. package/src/inference/pipelines/text/init.d.ts +2 -0
  80. package/src/inference/pipelines/text/init.js +19 -5
  81. package/src/inference/pipelines/text/layer.js +37 -8
  82. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  83. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  84. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  85. package/src/inference/pipelines/text/ops.js +123 -53
  86. package/src/inference/pipelines/text/probes.js +1 -0
  87. package/src/inference/pipelines/text/state.js +2 -0
  88. package/src/inference/pipelines/text.d.ts +5 -0
  89. package/src/inference/pipelines/text.js +59 -1
  90. package/src/inference/pipelines/vision/encoder.js +386 -0
  91. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  92. package/src/inference/pipelines/vision/index.js +173 -0
  93. package/src/inference/pipelines/vision/ops.js +78 -0
  94. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  95. package/src/inference/test-harness.js +9 -7
  96. package/src/loader/doppler-loader.d.ts +3 -0
  97. package/src/loader/doppler-loader.js +20 -3
  98. package/src/loader/experts/expert-cache.js +6 -2
  99. package/src/loader/experts/expert-loader.js +6 -2
  100. package/src/loader/layer-loader.js +42 -3
  101. package/src/loader/manifest-config.js +3 -1
  102. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  103. package/src/loader/tensors/tensor-loader.js +124 -3
  104. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  105. package/src/rules/kernels/softmax.rules.json +2 -0
  106. package/src/rules/rule-registry.d.ts +1 -0
  107. package/src/rules/rule-registry.js +2 -0
  108. package/src/storage/quickstart-downloader.d.ts +3 -0
  109. package/src/storage/quickstart-downloader.js +27 -30
  110. package/src/tooling/node-converter.js +25 -7
  111. package/src/tooling/node-source-runtime.js +29 -5
  112. package/src/tooling/node-webgpu.js +24 -7
  113. package/src/utils/hf-resolve-url.d.ts +16 -0
  114. package/src/utils/hf-resolve-url.js +17 -0
  115. package/src/version.js +1 -1
  116. package/src/tooling/node-convert.d.ts +0 -54

package/src/inference/pipelines/text/init.d.ts
@@ -28,6 +28,7 @@ import type {
   SpeculativeConfigSchema,
   KernelPathSchema,
 } from '../../../config/schema/index.js';
+import type { LoaderDebugConfigSchema } from '../../../config/schema/debug.schema.js';
 import type { KernelPathSource } from '../../../config/kernel-path-loader.js';
 
 export interface PipelineStorageContext {
@@ -206,6 +207,7 @@ export interface LoadWeightsOptions {
   resolvedKernelPath?: KernelPathSchema | null;
   kernelPathSource?: KernelPathSource;
   keepF32Weights?: boolean;
+  loaderDebug?: LoaderDebugConfigSchema | null;
 }
 
 /**

package/src/inference/pipelines/text/init.js
@@ -309,13 +309,21 @@ export async function initRoPEFrequencies(config, useGPU) {
   if (!Number.isFinite(ropeScale) || ropeScale <= 0) {
     throw new Error(`RoPE scale must be a positive number; got "${ropeScale}".`);
   }
-  const resolvedLocalScale = ropeLocalScale ?? ropeScale;
-  if (!Number.isFinite(resolvedLocalScale) || resolvedLocalScale <= 0) {
+  const resolvedLocalScale = ropeLocalScale;
+  if (resolvedLocalScale != null && (!Number.isFinite(resolvedLocalScale) || resolvedLocalScale <= 0)) {
     throw new Error(`Local RoPE scale must be a positive number; got "${resolvedLocalScale}".`);
   }
   const resolvedLocalTheta = ropeLocalTheta ?? ropeTheta;
-  const resolvedLocalScalingType = ropeLocalScalingType ?? ropeScalingType;
-  const resolvedLocalScaling = ropeLocalScaling ?? ropeScaling;
+  const resolvedLocalScalingType = (
+    ropeLocalScalingType === undefined
+      ? ropeScalingType
+      : ropeLocalScalingType
+  );
+  const resolvedLocalScaling = (
+    ropeLocalScalingType === undefined
+      ? ropeScaling
+      : ropeLocalScaling
+  );
   const resolvedRotaryDim = resolveRotaryDim(headDim, rotaryDim, partialRotaryFactor);
   const halfDim = resolvedRotaryDim / 2;
   if (mropeInterleaved === true && Array.isArray(mropeSection)) {
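
The switch from `??` to explicit `undefined` checks matters because it distinguishes "field absent, inherit the global value" from an explicit null opt-out, which `??` would have collapsed back to the global setting. A minimal sketch of the new resolution semantics (the helper below is hypothetical, written only to illustrate the ternaries above):

    // Hypothetical helper mirroring the resolution logic above (not package API).
    function resolveLocalScaling({ ropeScalingType, ropeScaling, ropeLocalScalingType, ropeLocalScaling }) {
      const type = ropeLocalScalingType === undefined ? ropeScalingType : ropeLocalScalingType;
      const scaling = ropeLocalScalingType === undefined ? ropeScaling : ropeLocalScaling;
      return { type, scaling };
    }

    // Field absent: local attention inherits the global scaling config.
    resolveLocalScaling({ ropeScalingType: 'yarn', ropeScaling: { factor: 8 } });
    // -> { type: 'yarn', scaling: { factor: 8 } }

    // Explicit null: local attention opts out of scaling, a case `??` could not express.
    resolveLocalScaling({ ropeScalingType: 'yarn', ropeScaling: { factor: 8 }, ropeLocalScalingType: null });
    // -> { type: null, scaling: undefined }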

@@ -646,7 +654,12 @@ export async function initTokenizer(manifest, options = {}) {
 
 
 export async function loadWeights(manifest, modelConfig, options = {}) {
-  const { onProgress, loadingConfig, baseUrl } = options;
+  const {
+    onProgress,
+    loadingConfig,
+    baseUrl,
+    loaderDebug,
+  } = options;
   const runtimeStorageContext = options.storageContext
     ?? createRemoteStorageContext(baseUrl, manifest);
   const verifyHashes = (
@@ -668,6 +681,7 @@ export async function loadWeights(manifest, modelConfig, options = {}) {
       keepF32Weights
     )
   );
+  dopplerLoader.setLoaderDebugConfig(loaderDebug ?? null);
 
   const tensorsFile = isRDRRManifest(manifest) ? manifest.tensorsFile : null;
   if (baseUrl && tensorsFile) {
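
Together with the new `loaderDebug` field on `LoadWeightsOptions` and the pipeline change further down (which reads `runtimeConfig.shared.debug.loader`), this threads a loader debug config end to end. A hedged sketch of a direct caller; the fields inside `loaderDebug` are assumptions, since debug.schema.js is not shown in this diff:

    // Sketch only: `logShardFetches` and `verifyLayouts` are hypothetical field names.
    // The diff only shows that the object reaches dopplerLoader.setLoaderDebugConfig().
    const weights = await loadWeights(manifest, modelConfig, {
      baseUrl: 'https://example.com/models/lfm2/',
      onProgress: (info) => console.log(info.stage, Math.round(info.progress * 100)),
      loaderDebug: { logShardFetches: true, verifyLayouts: true },
    });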

package/src/inference/pipelines/text/layer.js
@@ -43,19 +43,16 @@ export function detectSandwichNorm(config) {
   }
 
 
-export function isMoELayer(layerIdx, config, layerWeights) {
+export function isMoELayer(layerIdx, config) {
   if (!config.useMoE) return false;
 
-  // Check if layer has router weights
-  if (layerWeights?.routerWeight) return true;
-
-  // Fall back to layer_types array if available
+  // Manifest-first: check layerTypes from config (derived from manifest.inference.layerPattern)
   const layerTypes = config.layerTypes;
   if (Array.isArray(layerTypes) && layerIdx < layerTypes.length) {
     return layerTypes[layerIdx] === 'moe';
   }
 
-  // Default: assume all layers are MoE if model uses MoE
+  // No layerTypes available: assume all layers are MoE
   return true;
 }
 
@@ -87,6 +84,11 @@ function assertSupportedLayerRuntime(layerIdx, config) {
   }
 }
 
+function getConvLayerState(convLayerStates, layerIdx) {
+  if (!convLayerStates) return {};
+  return convLayerStates.get(layerIdx) ?? {};
+}
+
 function isSlidingLayerType(layerType) {
   const normalized = normalizeLayerType(layerType);
   return normalized === 'sliding_attention'
@@ -103,6 +105,14 @@ function isConvLayerType(layerType) {
     || normalized === 'liv_convolution';
 }
 
+export function hasConvLayers(layerTypes) {
+  if (!Array.isArray(layerTypes)) return false;
+  for (let i = 0; i < layerTypes.length; i++) {
+    if (isConvLayerType(layerTypes[i])) return true;
+  }
+  return false;
+}
+
 function isLinearLayerType(layerType) {
   const normalized = normalizeLayerType(layerType);
   return normalized === 'linear_attention'
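
Layer kind is now resolved purely from the manifest-derived `config.layerTypes` instead of sniffing loaded weights. A small sketch of the dispatch this enables, with a hypothetical `layerTypes` array:

    // Hypothetical layerTypes for a hybrid conv/attention/MoE model.
    const config = {
      useMoE: true,
      layerTypes: ['conv', 'conv', 'full_attention', 'moe', 'moe'],
    };

    isMoELayer(3, config);            // true:  layerTypes[3] === 'moe'
    isMoELayer(0, config);            // false: a conv layer, no weight sniffing needed
    hasConvLayers(config.layerTypes); // true:  pipeline will init conv layer states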

@@ -201,8 +211,22 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
       );
     }
     const convKernel = layerWeights?.convKernel ?? null;
+    // Apply input norm (operator_norm) before conv mixer — matches HF Lfm2 forward pass
+    let normedTensor = inputTensor;
+    const inputNormWeight = layerWeights?.inputNorm ?? null;
+    if (inputNormWeight) {
+      const normWeightBuf = getNormWeightBuffer(inputNormWeight, `L${layerIdx}.conv_input_norm`);
+      normedTensor = await doRMSNorm(inputTensor, normWeightBuf, rmsNormEps, {
+        batchSize: numTokens,
+        hiddenSize,
+        rmsNormWeightOffset: config.rmsNormWeightOffset,
+        label: `L${layerIdx}.conv_input_norm`,
+        layerIdx,
+      }, recorder);
+      if (!(inputNormWeight instanceof GPUBuffer)) releaseOrTrack(recorder, normWeightBuf);
+    }
     attnOutput = await doConv(
-      inputTensor,
+      normedTensor,
       getWeightBuffer(convInProj, `L${layerIdx}.conv_in_proj`),
       convKernel ? getWeightBuffer(convKernel, `L${layerIdx}.conv_kernel`) : null,
       getWeightBuffer(convOutProj, `L${layerIdx}.conv_out_proj`),
@@ -213,9 +237,13 @@ export async function processLayerGPU(layerIdx, inputBuffer, numTokens, isPrefil
         label: `L${layerIdx}.conv`,
         swigluLimit: config.swigluLimit,
         kernelPath: context.kernelPath ?? null,
+        convState: getConvLayerState(context.convLayerStates, layerIdx),
       },
       recorder
     );
+    if (normedTensor !== inputTensor) {
+      releaseOrTrack(recorder, normedTensor.buffer);
+    }
   } else if (isLinearLayer) {
     attnOutput = await runLinearAttentionLayer(inputTensor, layerWeights ?? null, {
       layerIdx,
@@ -721,6 +749,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
         label: `L${layerIdx}.plan_conv`,
         swigluLimit: config.swigluLimit,
         kernelPath: context.kernelPath ?? null,
+        convState: getConvLayerState(context.convLayerStates, layerIdx),
       },
       recorder
     );
@@ -782,7 +811,7 @@ async function processLayerPlanGPU(layerIdx, inputBuffer, numTokens, isPrefill,
   let outputTensor;
   const { runMoEFFNGPU, runDenseFFNGPU } = await import('./ffn/index.js');
 
-  const canAutoMoe = config.useMoE && isMoELayer(layerIdx, config, layerWeights);
+  const canAutoMoe = config.useMoE && isMoELayer(layerIdx, config);
   const useMoe = selectRuleValue(
     'inference',
     'layer',

package/src/inference/pipelines/text/moe-gpu.js
@@ -23,6 +23,7 @@ import {
   validateMoeShape,
   resolveMoeVendorProfile,
   resolveGptOssKernelPathProfile,
+  resolveMixtralKernelPathProfile,
 } from './moe-shape-validator.js';
 
 export async function moeFeedForwardGPU(
@@ -52,7 +53,10 @@ export async function moeFeedForwardGPU(
   if (topK == null) {
     throw new Error('MoE topK is required in config.');
   }
-  const modelType = config.modelType ?? (expertFormat === 'gpt-oss' ? 'gpt-oss' : 'mixtral');
+  if (config.modelType == null) {
+    throw new Error('MoE config.modelType is required; got null/undefined.');
+  }
+  const modelType = config.modelType;
   validateMoeShape(
     { hiddenSize, intermediateSize, moeTopK: topK, numExperts, expertFormat },
     { modelType }
@@ -130,7 +134,13 @@ export async function moeFeedForwardGPU(
     trace.buffers(`MoE L${layerIdx} router_logits`, { min, max, nanCount, dtype: logitsDtype });
   }
 
+  // Profile resolution: routerTopK/dequantExpert are resolved for tracing and
+  // forward validation. Actual kernel dispatch uses the generic softmax.rules.json
+  // topkVariant rules (keyed by modelType) and format-specific dequant paths.
+  // GPT-OSS: dequantTileShape actively steers MXFP4 dequant; routerTopK is trace-only.
+  // Mixtral: expert weights are pre-loaded (no runtime dequant); both fields are trace-only.
   let gptOssKernelPathProfile = null;
+  let mixtralKernelPathProfile = null;
   if (modelType === 'gpt-oss') {
     gptOssKernelPathProfile = await resolveGptOssKernelPathProfile({
       hasF16: caps.hasF16,
@@ -141,6 +151,14 @@ export async function moeFeedForwardGPU(
       groupSize: 32,
       tileShape: vendorProfile.dequantTileShape,
     });
+  } else if (modelType === 'mixtral') {
+    mixtralKernelPathProfile = await resolveMixtralKernelPathProfile({
+      hasF16: caps.hasF16,
+      hasSubgroups: caps.hasSubgroups,
+      routerDtype: logitsDtype,
+      weightsDtype: activationDtype,
+      outputDtype: activationDtype,
+    });
   }
 
   stepStart = perfMark();
@@ -159,7 +177,7 @@ export async function moeFeedForwardGPU(
   perfLog(`MoE L${layerIdx} topk`, stepStart, {
     topK,
     modelType,
-    routerTopKKernel: gptOssKernelPathProfile?.routerTopK ?? null,
+    routerTopKKernel: gptOssKernelPathProfile?.routerTopK ?? mixtralKernelPathProfile?.routerTopK ?? null,
   });
 
   if (isTraceEnabled('buffers')) {
@@ -211,7 +229,7 @@ export async function moeFeedForwardGPU(
   const bytesPerElement = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: activationDtype });
   const bytesPerToken = hiddenSize * bytesPerElement;
   let maxTokensPerExpert = resolveMaxTokensPerExpert(numTokens, numExperts, topK, hiddenSize, activationDtype);
-  if (modelType === 'gpt-oss') {
+  if (vendorProfile.maxTokensPerExpertScale !== 1.0) {
     maxTokensPerExpert = Math.max(
       1,
       Math.round(maxTokensPerExpert * vendorProfile.maxTokensPerExpertScale)
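
The expert-capacity clamp is now driven by the vendor profile's `maxTokensPerExpertScale` rather than being hard-coded to GPT-OSS, so any model whose profile carries a non-unit scale (now including Mixtral via moe.rules.mixtral.json) gets it applied. A worked sketch with an illustrative scale value:

    // Illustrative numbers; the real baseline comes from resolveMaxTokensPerExpert().
    let maxTokensPerExpert = 128;
    const vendorProfile = { maxTokensPerExpertScale: 0.75 }; // hypothetical profile value
    if (vendorProfile.maxTokensPerExpertScale !== 1.0) {
      maxTokensPerExpert = Math.max(1, Math.round(maxTokensPerExpert * vendorProfile.maxTokensPerExpertScale));
    }
    // -> 96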

package/src/inference/pipelines/text/moe-shape-validator.d.ts
@@ -29,3 +29,12 @@ export interface GptOssKernelPathProfile {
 export declare function resolveGptOssKernelPathProfile(
   context: Record<string, unknown>
 ): Promise<GptOssKernelPathProfile>;
+
+export interface MixtralKernelPathProfile {
+  routerTopK: string;
+  dequantExpert: string;
+}
+
+export declare function resolveMixtralKernelPathProfile(
+  context: Record<string, unknown>
+): Promise<MixtralKernelPathProfile>;
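
A hedged usage sketch of the new Mixtral profile resolver; the capability inputs are illustrative, and the returned variant strings come from the new moe.rules.mixtral.json rules, whose exact values this diff does not show:

    const profile = await resolveMixtralKernelPathProfile({
      hasF16: true,
      hasSubgroups: false,
      routerDtype: 'f32',
      weightsDtype: 'f16',
      outputDtype: 'f16',
    });
    // profile.routerTopK / profile.dequantExpert are trace-only for Mixtral:
    // per the moe-gpu.js comment, expert weights are pre-loaded with no runtime dequant.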

package/src/inference/pipelines/text/moe-shape-validator.js
@@ -7,17 +7,15 @@ function asVendorString(caps) {
 }
 
 export function resolveMoeVendorProfile(modelType) {
-  if (modelType !== 'gpt-oss') {
-    return {
-      preferVec4Dequant: false,
-      dequantTileShape: 'scalar',
-      routerWorkgroupSize: 128,
-      maxTokensPerExpertScale: 1.0,
-    };
-  }
   const caps = getKernelCapabilities();
   const vendor = asVendorString(caps);
-  return selectRuleValue('kernels', 'moeGptoss', 'vendorQuirkProfile', { vendor });
+  if (modelType === 'gpt-oss') {
+    return selectRuleValue('kernels', 'moeGptoss', 'vendorQuirkProfile', { vendor });
+  }
+  if (modelType === 'mixtral') {
+    return selectRuleValue('kernels', 'moeMixtral', 'vendorQuirkProfile', { vendor });
+  }
+  throw new Error(`[MoE] Unknown modelType "${modelType}" for vendor profile resolution.`);
 }
 
 function resolveGptOssRuleContext(context) {
@@ -41,6 +39,25 @@ export async function resolveGptOssKernelPathProfile(context) {
   };
 }
 
+function resolveMixtralRuleContext(context) {
+  return {
+    modelType: 'mixtral',
+    hasF16: context?.hasF16,
+    hasSubgroups: context?.hasSubgroups,
+    routerDtype: context?.routerDtype ?? 'f32',
+    weightsDtype: context?.weightsDtype,
+    outputDtype: context?.outputDtype ?? context?.weightsDtype,
+  };
+}
+
+export async function resolveMixtralKernelPathProfile(context) {
+  const ruleContext = resolveMixtralRuleContext(context);
+  return {
+    routerTopK: selectRuleValue('kernels', 'moeMixtral', 'routerTopKVariant', ruleContext),
+    dequantExpert: selectRuleValue('kernels', 'moeMixtral', 'dequantVariant', ruleContext),
+  };
+}
+
 export function validateMoeShape(config, options = {}) {
   const {
     hiddenSize,
@@ -66,8 +83,11 @@ export function validateMoeShape(config, options = {}) {
 
   if (modelType === 'gpt-oss') {
     const policy = selectRuleValue('kernels', 'moeGptoss', 'shapePolicy', { modelType });
-    const hiddenDivisor = policy.hiddenSizeDivisor ?? 32;
-    const intermediateDivisor = policy.intermediateSizeDivisor ?? 32;
+    if (policy.hiddenSizeDivisor == null || policy.intermediateSizeDivisor == null) {
+      throw new Error('[MoE] GPT-OSS shapePolicy is missing hiddenSizeDivisor or intermediateSizeDivisor.');
+    }
+    const hiddenDivisor = policy.hiddenSizeDivisor;
+    const intermediateDivisor = policy.intermediateSizeDivisor;
    if (hiddenSize % hiddenDivisor !== 0 || intermediateSize % intermediateDivisor !== 0) {
      throw new Error(
        `[MoE] GPT-OSS shape policy violation: hiddenSize (${hiddenSize}) % ${hiddenDivisor} = ${hiddenSize % hiddenDivisor}, ` +

package/src/inference/pipelines/text/ops.js
@@ -14,13 +14,14 @@ import {
   recordCastF32ToF16,
 } from '../../../gpu/kernels/cast.js';
 import { createTensor } from '../../../gpu/tensor.js';
-import { releaseBuffer } from '../../../memory/buffer-pool.js';
+import { releaseBuffer, readBuffer, acquireBuffer, uploadData } from '../../../memory/buffer-pool.js';
 import { kernelTrace, traceStep } from './kernel-trace.js';
 import {
   runLayerAttentionGPU,
   recordLayerAttentionGPU,
 } from './attention/index.js';
 import { runLinearAttentionLayer } from './linear-attention.js';
+import { runGatedShortConvGPU } from '../../../gpu/kernels/gated-short-conv.js';
 
 
 export function isDecodeBuffer(decodeBuffers, buffer) {
@@ -174,17 +175,22 @@ export async function doConv(
     throw new Error('doConv requires hiddenSize > 0.');
   }
 
-  // Use the first 2x hidden projection channels as a gated conv-state projection.
+  // LFM2 gated short convolution (GPU-native):
+  // in_proj → 3×hidden → GPU kernel: split(B,C,x) + B*x + causal conv1d + C*conv_out → out_proj
   let inProj = null;
-  let activated = null;
-  let convInput = null;
+  let convOut = null;
   let outProj = null;
   try {
+    const convState = options.convState;
+    const hasConvState = Boolean(convState?.convWeightGPU && convState?.convStateGPU);
+    const projN = hasConvState ? hiddenSize * 3 : hiddenSize * 2;
+
+    // Project input
     inProj = await doMatmul(
       inputTensor,
       convInProj,
       numTokens,
-      hiddenSize * 2,
+      projN,
       hiddenSize,
       {
         transposeB: 'auto',
@@ -195,50 +201,32 @@ export async function doConv(
       },
       recorder
     );
-    activated = await doSiLURowSplit(inProj, {
-      numTokens,
-      dim: hiddenSize,
-      activation: 'silu',
-      swigluLimit: options.swigluLimit ?? null,
-      label: `${label}.activation`,
-      layerIdx,
-    }, recorder);
+
+    if (hasConvState) {
+      // GPU gated short conv kernel: B*x → conv1d → C*conv_out (all on GPU)
+      convOut = await runGatedShortConvGPU(inProj, convState, {
+        numTokens,
+        layerIdx,
+        recorder,
+      });
+    } else {
+      // SwiGLU gated activation fallback: silu(first_half) * second_half
+      convOut = await doSiLURowSplit(inProj, {
+        numTokens,
+        dim: hiddenSize,
+        activation: 'silu',
+        swigluLimit: options.swigluLimit ?? null,
+        label: `${label}.activation`,
+        layerIdx,
+      }, recorder);
+    }
 
     releaseOrTrack(recorder, inProj.buffer);
     inProj = null;
 
-    convInput = activated;
-    if (convKernel && options.conv2d && options.conv2d.enabled === true) {
-      const convTensorInput = createTensor(activated.buffer, activated.dtype, [
-        options.conv2d.inChannels,
-        options.conv2d.height,
-        options.conv2d.width,
-      ], `${label}.conv_input`);
-      const convOptions = {
-        inChannels: options.conv2d.inChannels,
-        outChannels: options.conv2d.outChannels,
-        height: options.conv2d.height,
-        width: options.conv2d.width,
-        kernelH: options.conv2d.kernelH,
-        kernelW: options.conv2d.kernelW,
-        stride: options.conv2d.stride ?? 1,
-        pad: options.conv2d.pad ?? 0,
-      };
-      const convResult = recorder
-        ? await recordConv2D(recorder, convTensorInput, convKernel, null, convOptions)
-        : await runConv2D(convTensorInput, convKernel, null, convOptions);
-      convInput = createTensor(
-        convResult.buffer,
-        convResult.dtype,
-        [numTokens, hiddenSize],
-        `${label}.conv_output`
-      );
-      releaseOrTrack(recorder, activated.buffer);
-      activated = null;
-    }
-
+    // Output projection
     outProj = await doMatmul(
-      convInput,
+      convOut,
       convOutProj,
       numTokens,
       hiddenSize,
@@ -253,13 +241,8 @@ export async function doConv(
       recorder
     );
 
-    if (convInput && (!activated || convInput.buffer !== activated.buffer)) {
-      releaseOrTrack(recorder, convInput.buffer);
-      convInput = null;
-    } else if (activated) {
-      releaseOrTrack(recorder, activated.buffer);
-      activated = null;
-    }
+    releaseOrTrack(recorder, convOut.buffer);
+    convOut = null;
 
     if (kernelTrace.enabled && !recorder) {
       await traceStep('conv', label, layerIdx, outProj.buffer, [numTokens, hiddenSize]);
@@ -267,13 +250,100 @@ export async function doConv(
     return outProj;
   } catch (error) {
     if (outProj) releaseOrTrack(recorder, outProj.buffer);
-    if (convInput && (!activated || convInput.buffer !== activated.buffer)) releaseOrTrack(recorder, convInput.buffer);
-    if (activated) releaseOrTrack(recorder, activated.buffer);
+    if (convOut) releaseOrTrack(recorder, convOut.buffer);
    if (inProj) releaseOrTrack(recorder, inProj.buffer);
    throw error;
  }
}

+export async function initConvLayerState(convState, convKernel, convInProj, hiddenSize, label, layerIdx) {
+  const { isWeightBuffer } = await import('../../../gpu/weight-buffer.js');
+  const isWB = typeof isWeightBuffer === 'function' && isWeightBuffer(convKernel);
+  const kernelBuf = isWB ? convKernel.buffer : (convKernel instanceof GPUBuffer ? convKernel : convKernel.buffer ?? convKernel);
+  const kernelDtype = isWB ? String(convKernel.dtype ?? '').toLowerCase() : null;
+
+  // Determine kernel size from weight shape
+  let kernelSize = 3;
+  if (isWB && Array.isArray(convKernel.shape)) {
+    kernelSize = Number(convKernel.shape[convKernel.shape.length - 1]) || 3;
+  }
+
+  // Dequantize conv kernel weights to F32
+  const totalElements = hiddenSize * kernelSize;
+  const { QK_K, Q4K_BLOCK_BYTES } = await import('../../../config/schema/index.js');
+  const { dequantizeQ4KM } = await import('../../../converter/quantizer.js');
+  const { getDevice } = await import('../../../gpu/device.js');
+  const device = getDevice();
+
+  const isQ4K = kernelDtype === 'q4k' || kernelDtype === 'q4_k_m' || kernelDtype === 'q4_k';
+  let weightF32;
+
+  if (isQ4K) {
+    const numBlocks = Math.ceil(totalElements / QK_K);
+    const q4kBytes = numBlocks * Q4K_BLOCK_BYTES;
+    // GPU readBuffer returns zeros for some Q4K weight buffers, so prefer
+    // CPU-side rawBytes from the WeightBuffer when available.
+    const hasRawBytes = isWB && convKernel.rawBytes;
+    if (hasRawBytes) {
+      weightF32 = dequantizeQ4KM(new Uint8Array(convKernel.rawBytes), numBlocks, [totalElements]);
+    } else {
+      if (device) await device.queue.onSubmittedWorkDone();
+      const raw = await readBuffer(kernelBuf, q4kBytes);
+      weightF32 = dequantizeQ4KM(new Uint8Array(raw), numBlocks, [totalElements]);
+    }
+  } else if (kernelDtype === 'f16' || kernelDtype === 'bf16') {
+    if (device) await device.queue.onSubmittedWorkDone();
+    const raw = await readBuffer(kernelBuf, totalElements * 2);
+    const { decodeReadback } = await import('./debug-utils/index.js');
+    weightF32 = decodeReadback(raw, 'f16');
+  } else {
+    if (device) await device.queue.onSubmittedWorkDone();
+    const raw = await readBuffer(kernelBuf, totalElements * 4);
+    weightF32 = new Float32Array(raw);
+  }
+
+  // Validate dequantized weights are non-degenerate
+  let maxAbs = 0;
+  for (let i = 0; i < weightF32.length; i++) {
+    const abs = Math.abs(weightF32[i]);
+    if (abs > maxAbs) maxAbs = abs;
+  }
+  if (maxAbs === 0) {
+    const { log } = await import('../../../debug/index.js');
+    log.error('Pipeline', `${label} conv kernel weights are all zeros after dequantization (dtype=${kernelDtype}, elements=${totalElements}). Conv layers will produce degenerate output.`);
+  }
+
+  // Upload dequantized weights to GPU
+  const weightGPU = acquireBuffer(weightF32.byteLength, undefined, `${label}.conv_weight_f32`);
+  uploadData(weightGPU, weightF32);
+
+  // Create zeroed conv state buffer
+  const stateSize = hiddenSize * (kernelSize - 1) * Float32Array.BYTES_PER_ELEMENT;
+  const stateGPU = acquireBuffer(stateSize, undefined, `${label}.conv_state`);
+  uploadData(stateGPU, new Float32Array(hiddenSize * (kernelSize - 1)));
+
+  convState.convWeightGPU = weightGPU;
+  convState.convStateGPU = stateGPU;
+  convState.hiddenSize = hiddenSize;
+  convState.kernelSize = kernelSize;
+
+  // Pre-dequantize in_proj weight to F32 via CPU dequantization of the raw Q4K buffer.
+  // GPU readBuffer returns zeros for some Q4K weight buffers, so we dequantize from the
+  // WeightBuffer's raw bytes instead.
+  if (isWB && isWeightBuffer(convInProj)) {
+    const inProjDtype = String(convInProj.dtype ?? '').toLowerCase();
+    const isInProjQ4K = inProjDtype === 'q4k' || inProjDtype === 'q4_k_m' || inProjDtype === 'q4_k';
+    if (isInProjQ4K && convInProj.rawBytes) {
+      const inProjElements = hiddenSize * 3 * hiddenSize;
+      const inProjBlocks = Math.ceil(inProjElements / QK_K);
+      const inProjF32 = dequantizeQ4KM(new Uint8Array(convInProj.rawBytes), inProjBlocks, [inProjElements]);
+      const inProjGPU = acquireBuffer(inProjF32.byteLength, undefined, `${label}.in_proj_f32`);
+      uploadData(inProjGPU, inProjF32);
+      convState.inProjF32GPU = inProjGPU;
+    }
+  }
+}
+
 export async function doCast(input, toDtype, recorder) {
   if (toDtype !== 'f16' && toDtype !== 'f32') {
     throw new Error(`Unsupported cast target dtype "${toDtype}"`);
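
For reference, the gated short convolution that `runGatedShortConvGPU` dispatches can be written out per token on the CPU: y = C * conv1d_causal(B * x), per channel. A minimal sketch, assuming the in_proj output is laid out as contiguous [B, C, x] blocks and the state window holds the last kernelSize - 1 gated inputs per channel; the actual WGSL layout and tap order are not shown in this diff:

    // CPU reference for one decode step (buffer layouts are assumptions, see above).
    // bcx:    Float32Array of length 3*hidden (in_proj output for one token)
    // weight: Float32Array [hidden * kernelSize]
    // state:  Float32Array [hidden * (kernelSize - 1)], mutated in place
    function gatedShortConvStep(bcx, weight, state, hidden, kernelSize) {
      const out = new Float32Array(hidden);
      for (let c = 0; c < hidden; c++) {
        const B = bcx[c];
        const C = bcx[hidden + c];
        const x = bcx[2 * hidden + c];
        const gated = B * x; // input gate before the causal conv
        // Causal conv over [state..., gated]; the newest sample takes the last tap.
        let acc = weight[c * kernelSize + (kernelSize - 1)] * gated;
        for (let k = 0; k < kernelSize - 1; k++) {
          acc += weight[c * kernelSize + k] * state[c * (kernelSize - 1) + k];
        }
        // Shift the per-channel state window and append the new gated input.
        for (let k = 0; k < kernelSize - 2; k++) {
          state[c * (kernelSize - 1) + k] = state[c * (kernelSize - 1) + k + 1];
        }
        state[c * (kernelSize - 1) + (kernelSize - 2)] = gated;
        out[c] = C * acc; // output gate, then fed to out_proj
      }
      return out;
    }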

package/src/inference/pipelines/text/probes.js
@@ -11,6 +11,7 @@ const STAGE_DEFAULT_CATEGORY = {
   embed_out: 'embed',
   // Attention stages (per-layer)
   attn_input: 'attn',
+  post_input_norm: 'attn',
   attn_normed: 'attn',
   linear_qkv_proj: 'attn',
   linear_z_proj: 'attn',

package/src/inference/pipelines/text/state.js
@@ -15,6 +15,8 @@ export class PipelineState {
       layers: new Map(),
     };
 
+    this.convLayerStates = new Map();
+
     this.moeRouter = null;
 
     this.speculativeDecoder = null;

package/src/inference/pipelines/text.d.ts
@@ -69,6 +69,11 @@ export declare class InferencePipeline extends PipelineState {
   // ==========================================================================
 
   generate(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<string, void, void>;
+  generateTokens(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<number, void, void>;
+  generateTokenIds(
+    prompt: PromptInput,
+    options?: GenerateOptions
+  ): Promise<{ tokenIds: number[]; stats: PipelineStats }>;
 
   decodeStepLogits(currentIds: number[], options?: GenerateOptions): Promise<LogitsStepResult>;

package/src/inference/pipelines/text.js
@@ -43,6 +43,7 @@ import {
 import { getDopplerLoader } from '../../loader/doppler-loader.js';
 import { registerPipeline, getPipelineFactory } from './registry.js';
 import { selectRuleValue } from '../../rules/rule-registry.js';
+import { initConvLayerState } from './text/ops.js';
 
 function destroyMoERouter(router) {
   if (router && typeof router.destroy === 'function') {
@@ -221,6 +222,9 @@ export class InferencePipeline extends PipelineState {
     // Initialize RoPE frequencies
     await this._initRoPE();
 
+    // Initialize conv layer states for gated short conv layers (LFM2)
+    await this._initConvLayerStates();
+
     this.isLoaded = true;
     log.info('Pipeline', 'Model loaded successfully');
   }
@@ -237,6 +241,7 @@ export class InferencePipeline extends PipelineState {
       resolvedKernelPath: this.resolvedKernelPath,
       kernelPathSource: this.kernelPathSource,
       keepF32Weights: this.runtimeConfig.inference.compute.keepF32Weights === true,
+      loaderDebug: this.runtimeConfig?.shared?.debug?.loader ?? null,
       onProgress: (info) => {
        if (info.stage !== 'layers' && info.stage !== 'shards') {
          log.verbose('Loader', `${info.stage}: ${Math.round(info.progress * 100)}%${info.message ? ` - ${info.message}` : ''}`);
@@ -310,7 +315,7 @@ export class InferencePipeline extends PipelineState {
       maxSeqLen,
       ropeTheta: config.ropeTheta,
       ropeLocalTheta: config.ropeLocalTheta,
-      mropeInterleaved: config.ropeInterleaved,
+      mropeInterleaved: config.mropeInterleaved,
       mropeSection: config.mropeSection,
       partialRotaryFactor: config.partialRotaryFactor,
       ropeScale: config.ropeScale,
@@ -327,6 +332,51 @@ export class InferencePipeline extends PipelineState {
   }
 
 
+  async _initConvLayerStates() {
+    const config = this.modelConfig;
+    if (!config?.layerTypes) return;
+    const { getDevice } = await import('../../gpu/device.js');
+    const device = getDevice();
+    if (!device) return;
+
+    const hiddenSize = config.hiddenSize;
+    const convStates = new Map();
+
+    for (let i = 0; i < config.layerTypes.length; i++) {
+      const lt = String(config.layerTypes[i] ?? '').toLowerCase();
+      if (lt !== 'conv' && lt !== 'convolution') continue;
+
+      const layerWeights = this.weights.get(`layer_${i}`);
+      if (!layerWeights) continue;
+      const convKernel = layerWeights?.convKernel;
+      if (!convKernel) continue;
+
+      const convState = {};
+      try {
+        await initConvLayerState(
+          convState,
+          convKernel,
+          layerWeights.convInProj ?? null,
+          hiddenSize,
+          `L${i}.conv`,
+          i
+        );
+        if (!convState.convWeightGPU || !convState.convStateGPU) {
+          continue;
+        }
+        convStates.set(i, convState);
+      } catch (e) {
+        log.warn('Pipeline', `Conv layer ${i} state init failed: ${e.message}`);
+      }
+    }
+
+    if (convStates.size > 0) {
+      this.convLayerStates = convStates;
+      log.info('Pipeline', `Initialized ${convStates.size} conv layer states (kernelSize=${convStates.values().next().value?.kernelSize})`);
+    }
+  }
+
+
   _resolveLayerPipeline() {
     if (!this.modelConfig) return;
     const runtimePlan = this.runtimeConfig.inference.pipeline ?? null;
@@ -349,6 +399,14 @@ export class InferencePipeline extends PipelineState {
     return this.generator.generate(prompt, options);
   }
 
+  generateTokens(prompt, options = {}) {
+    return this.generator.generateTokens(prompt, options);
+  }
+
+  generateTokenIds(prompt, options = {}) {
+    return this.generator.generateTokenIds(prompt, options);
+  }
+
   decodeStepLogits(currentIds, options = {}) {
     return this.generator.decodeStepLogits(currentIds, options);
   }