@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +25 -6
  3. package/package.json +25 -38
  4. package/src/browser/browser-converter.js +5 -0
  5. package/src/client/doppler-api.browser.js +6 -0
  6. package/src/client/doppler-api.d.ts +3 -0
  7. package/src/client/doppler-api.js +11 -2
  8. package/src/client/doppler-registry.js +3 -5
  9. package/src/client/doppler-registry.json +2 -2
  10. package/src/config/kernel-path-loader.d.ts +5 -0
  11. package/src/config/kernel-path-loader.js +13 -0
  12. package/src/config/kernels/kernel-ref-digests.js +23 -21
  13. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  14. package/src/config/kernels/registry.json +74 -0
  15. package/src/config/loader.js +9 -0
  16. package/src/config/merge-contract-check.js +7 -0
  17. package/src/config/platforms/loader.js +3 -1
  18. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  19. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  20. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  21. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  22. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  23. package/src/config/presets/kernel-paths/registry.json +21 -0
  24. package/src/config/presets/models/gemma2.json +2 -1
  25. package/src/config/presets/models/gemma3.json +4 -1
  26. package/src/config/presets/models/gemma4.json +61 -0
  27. package/src/config/presets/models/granite-docling.json +70 -0
  28. package/src/config/presets/models/lfm2.json +6 -1
  29. package/src/config/presets/models/qwen3.json +4 -3
  30. package/src/config/presets/models/qwen3_5.json +16 -0
  31. package/src/config/presets/models/qwen3_vl.json +40 -0
  32. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  33. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  34. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  35. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  36. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  37. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  38. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  39. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  40. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  41. package/src/config/runtime.js +3 -0
  42. package/src/config/schema/conversion.schema.d.ts +1 -0
  43. package/src/config/schema/debug.schema.d.ts +40 -0
  44. package/src/config/schema/debug.schema.js +28 -0
  45. package/src/config/schema/index.js +2 -0
  46. package/src/config/schema/inference-defaults.schema.js +1 -1
  47. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  48. package/src/config/schema/manifest.schema.d.ts +1 -1
  49. package/src/config/schema/manifest.schema.js +1 -1
  50. package/src/config/schema/memory-limits.schema.js +2 -2
  51. package/src/config/schema/storage.schema.js +2 -2
  52. package/src/converter/conversion-plan.js +11 -3
  53. package/src/converter/core.js +19 -8
  54. package/src/converter/manifest-inference.js +12 -22
  55. package/src/converter/parsers/transformer.js +4 -0
  56. package/src/converter/quantization-info.js +5 -1
  57. package/src/converter/quantizer.d.ts +5 -0
  58. package/src/converter/quantizer.js +34 -12
  59. package/src/converter/rope-config.js +8 -6
  60. package/src/converter/tokenizer-utils.d.ts +1 -0
  61. package/src/converter/tokenizer-utils.js +4 -1
  62. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  63. package/src/distribution/shard-delivery.js +40 -1
  64. package/src/formats/rdrr/classification.js +32 -0
  65. package/src/formats/rdrr/parsing.d.ts +4 -0
  66. package/src/formats/rdrr/parsing.js +14 -1
  67. package/src/gpu/kernel-runtime.js +4 -2
  68. package/src/gpu/kernels/attention.js +2 -1
  69. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  70. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  71. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  72. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  73. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  74. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  75. package/src/gpu/kernels/gated-short-conv.js +284 -0
  76. package/src/gpu/kernels/index.d.ts +8 -0
  77. package/src/gpu/kernels/index.js +6 -0
  78. package/src/gpu/kernels/linear-attention-core.js +37 -17
  79. package/src/gpu/kernels/matmul-selection.js +48 -4
  80. package/src/gpu/kernels/matmul.d.ts +5 -0
  81. package/src/gpu/kernels/matmul.js +71 -2
  82. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  83. package/src/gpu/kernels/rmsnorm.js +9 -2
  84. package/src/gpu/kernels/sample.js +1 -3
  85. package/src/gpu/kernels/sample.wgsl +39 -9
  86. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  87. package/src/gpu/kernels/shader-cache.js +9 -4
  88. package/src/gpu/kernels/split_qg.d.ts +50 -0
  89. package/src/gpu/kernels/split_qg.js +46 -0
  90. package/src/gpu/kernels/split_qg.wgsl +58 -0
  91. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  92. package/src/gpu/weight-buffer.d.ts +1 -1
  93. package/src/gpu/weight-buffer.js +1 -1
  94. package/src/inference/browser-harness.d.ts +2 -0
  95. package/src/inference/browser-harness.js +20 -1
  96. package/src/inference/kv-cache/base.js +3 -10
  97. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  98. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  99. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
  100. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  101. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  102. package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
  103. package/src/inference/pipelines/text/attention/projections.js +54 -13
  104. package/src/inference/pipelines/text/attention/record.js +16 -6
  105. package/src/inference/pipelines/text/attention/run.js +59 -6
  106. package/src/inference/pipelines/text/config.d.ts +1 -0
  107. package/src/inference/pipelines/text/config.js +46 -4
  108. package/src/inference/pipelines/text/embed.js +26 -7
  109. package/src/inference/pipelines/text/execution-plan.js +5 -4
  110. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  111. package/src/inference/pipelines/text/execution-v0.js +12 -1
  112. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  113. package/src/inference/pipelines/text/generator-runtime.js +19 -0
  114. package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
  115. package/src/inference/pipelines/text/generator-steps.js +71 -26
  116. package/src/inference/pipelines/text/generator.d.ts +5 -0
  117. package/src/inference/pipelines/text/generator.js +353 -166
  118. package/src/inference/pipelines/text/init.d.ts +15 -0
  119. package/src/inference/pipelines/text/init.js +35 -10
  120. package/src/inference/pipelines/text/layer.js +38 -8
  121. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  122. package/src/inference/pipelines/text/linear-attention.js +33 -3
  123. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  124. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  125. package/src/inference/pipelines/text/logits/index.js +3 -1
  126. package/src/inference/pipelines/text/model-load.js +3 -0
  127. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  128. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  129. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  130. package/src/inference/pipelines/text/ops.js +123 -53
  131. package/src/inference/pipelines/text/probes.js +1 -0
  132. package/src/inference/pipelines/text/sampling.js +52 -6
  133. package/src/inference/pipelines/text/state.js +2 -0
  134. package/src/inference/pipelines/text.d.ts +5 -0
  135. package/src/inference/pipelines/text.js +59 -1
  136. package/src/inference/pipelines/vision/encoder.js +386 -0
  137. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  138. package/src/inference/pipelines/vision/index.js +173 -0
  139. package/src/inference/pipelines/vision/ops.js +78 -0
  140. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  141. package/src/inference/test-harness.js +11 -9
  142. package/src/loader/doppler-loader.d.ts +3 -0
  143. package/src/loader/doppler-loader.js +20 -3
  144. package/src/loader/experts/expert-cache.js +6 -2
  145. package/src/loader/experts/expert-loader.js +6 -2
  146. package/src/loader/final-weights-loader.js +2 -0
  147. package/src/loader/layer-loader.js +42 -3
  148. package/src/loader/manifest-config.js +3 -1
  149. package/src/loader/shard-cache.js +3 -2
  150. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  151. package/src/loader/tensors/tensor-loader.js +130 -4
  152. package/src/rules/inference/dtype.rules.json +5 -0
  153. package/src/rules/inference/kernel-path.rules.json +2 -2
  154. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  155. package/src/rules/kernels/softmax.rules.json +2 -0
  156. package/src/rules/kernels/split-qg.rules.json +6 -0
  157. package/src/rules/rule-registry.d.ts +1 -0
  158. package/src/rules/rule-registry.js +4 -0
  159. package/src/storage/downloader.js +2 -1
  160. package/src/storage/quickstart-downloader.d.ts +3 -0
  161. package/src/storage/quickstart-downloader.js +27 -30
  162. package/src/storage/shard-manager.js +4 -3
  163. package/src/tooling/conversion-config-materializer.js +3 -5
  164. package/src/tooling/node-converter.js +28 -7
  165. package/src/tooling/node-source-runtime.js +65 -5
  166. package/src/tooling/node-webgpu.js +24 -7
  167. package/src/types/model.d.ts +5 -0
  168. package/src/utils/hf-resolve-url.d.ts +16 -0
  169. package/src/utils/hf-resolve-url.js +17 -0
  170. package/src/version.js +1 -1
  171. package/tools/doppler-cli.js +6 -1
  172. package/src/tooling/node-convert.d.ts +0 -54
@@ -14,13 +14,14 @@ import {
   recordCastF32ToF16,
 } from '../../../gpu/kernels/cast.js';
 import { createTensor } from '../../../gpu/tensor.js';
-import { releaseBuffer } from '../../../memory/buffer-pool.js';
+import { releaseBuffer, readBuffer, acquireBuffer, uploadData } from '../../../memory/buffer-pool.js';
 import { kernelTrace, traceStep } from './kernel-trace.js';
 import {
   runLayerAttentionGPU,
   recordLayerAttentionGPU,
 } from './attention/index.js';
 import { runLinearAttentionLayer } from './linear-attention.js';
+import { runGatedShortConvGPU } from '../../../gpu/kernels/gated-short-conv.js';
 
 
 export function isDecodeBuffer(decodeBuffers, buffer) {
@@ -174,17 +175,22 @@ export async function doConv(
     throw new Error('doConv requires hiddenSize > 0.');
   }
 
-  // Use the first 2x hidden projection channels as a gated conv-state projection.
+  // LFM2 gated short convolution (GPU-native):
+  // in_proj → 3×hidden → GPU kernel: split(B,C,x) + B*x + causal conv1d + C*conv_out → out_proj
   let inProj = null;
-  let activated = null;
-  let convInput = null;
+  let convOut = null;
   let outProj = null;
   try {
+    const convState = options.convState;
+    const hasConvState = Boolean(convState?.convWeightGPU && convState?.convStateGPU);
+    const projN = hasConvState ? hiddenSize * 3 : hiddenSize * 2;
+
+    // Project input
     inProj = await doMatmul(
       inputTensor,
       convInProj,
       numTokens,
-      hiddenSize * 2,
+      projN,
       hiddenSize,
       {
         transposeB: 'auto',
@@ -195,50 +201,32 @@ export async function doConv(
       },
       recorder
     );
-    activated = await doSiLURowSplit(inProj, {
-      numTokens,
-      dim: hiddenSize,
-      activation: 'silu',
-      swigluLimit: options.swigluLimit ?? null,
-      label: `${label}.activation`,
-      layerIdx,
-    }, recorder);
+
+    if (hasConvState) {
+      // GPU gated short conv kernel: B*x → conv1d → C*conv_out (all on GPU)
+      convOut = await runGatedShortConvGPU(inProj, convState, {
+        numTokens,
+        layerIdx,
+        recorder,
+      });
+    } else {
+      // SwiGLU gated activation fallback: silu(first_half) * second_half
+      convOut = await doSiLURowSplit(inProj, {
+        numTokens,
+        dim: hiddenSize,
+        activation: 'silu',
+        swigluLimit: options.swigluLimit ?? null,
+        label: `${label}.activation`,
+        layerIdx,
+      }, recorder);
+    }
 
     releaseOrTrack(recorder, inProj.buffer);
     inProj = null;
 
-    convInput = activated;
-    if (convKernel && options.conv2d && options.conv2d.enabled === true) {
-      const convTensorInput = createTensor(activated.buffer, activated.dtype, [
-        options.conv2d.inChannels,
-        options.conv2d.height,
-        options.conv2d.width,
-      ], `${label}.conv_input`);
-      const convOptions = {
-        inChannels: options.conv2d.inChannels,
-        outChannels: options.conv2d.outChannels,
-        height: options.conv2d.height,
-        width: options.conv2d.width,
-        kernelH: options.conv2d.kernelH,
-        kernelW: options.conv2d.kernelW,
-        stride: options.conv2d.stride ?? 1,
-        pad: options.conv2d.pad ?? 0,
-      };
-      const convResult = recorder
-        ? await recordConv2D(recorder, convTensorInput, convKernel, null, convOptions)
-        : await runConv2D(convTensorInput, convKernel, null, convOptions);
-      convInput = createTensor(
-        convResult.buffer,
-        convResult.dtype,
-        [numTokens, hiddenSize],
-        `${label}.conv_output`
-      );
-      releaseOrTrack(recorder, activated.buffer);
-      activated = null;
-    }
-
+    // Output projection
     outProj = await doMatmul(
-      convInput,
+      convOut,
       convOutProj,
       numTokens,
       hiddenSize,
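
For orientation, the comments in the hunk above describe the per-channel math that the new runGatedShortConvGPU kernel performs. The following CPU reference is a sketch only, not code from the package: it assumes each in_proj row is laid out as [B | C | x] (three hidden-sized segments, as the flow comment suggests) and that the carried state holds, per channel, the previous kernelSize − 1 gated inputs oldest-first.

// Hypothetical CPU reference for one channel-wise decode step.
function gatedShortConvReference(inProjRow, state, weight, hidden, kernelSize) {
  const out = new Float32Array(hidden);
  for (let c = 0; c < hidden; c++) {
    const bx = inProjRow[c] * inProjRow[2 * hidden + c];  // B * x (input gate)
    // Causal depthwise conv1d over [state..., bx] with this channel's taps.
    let acc = weight[c * kernelSize + kernelSize - 1] * bx;
    for (let k = 0; k < kernelSize - 1; k++) {
      acc += weight[c * kernelSize + k] * state[c * (kernelSize - 1) + k];
    }
    // Shift the carried state and append the newest gated input.
    for (let k = 0; k + 1 < kernelSize - 1; k++) {
      state[c * (kernelSize - 1) + k] = state[c * (kernelSize - 1) + k + 1];
    }
    state[c * (kernelSize - 1) + kernelSize - 2] = bx;
    out[c] = inProjRow[hidden + c] * acc;                 // C * conv_out (output gate)
  }
  return out;
}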
@@ -253,13 +241,8 @@ export async function doConv(
       recorder
     );
 
-    if (convInput && (!activated || convInput.buffer !== activated.buffer)) {
-      releaseOrTrack(recorder, convInput.buffer);
-      convInput = null;
-    } else if (activated) {
-      releaseOrTrack(recorder, activated.buffer);
-      activated = null;
-    }
+    releaseOrTrack(recorder, convOut.buffer);
+    convOut = null;
 
     if (kernelTrace.enabled && !recorder) {
       await traceStep('conv', label, layerIdx, outProj.buffer, [numTokens, hiddenSize]);
@@ -267,13 +250,100 @@ export async function doConv(
     return outProj;
   } catch (error) {
     if (outProj) releaseOrTrack(recorder, outProj.buffer);
-    if (convInput && (!activated || convInput.buffer !== activated.buffer)) releaseOrTrack(recorder, convInput.buffer);
-    if (activated) releaseOrTrack(recorder, activated.buffer);
+    if (convOut) releaseOrTrack(recorder, convOut.buffer);
     if (inProj) releaseOrTrack(recorder, inProj.buffer);
     throw error;
   }
 }
 
+export async function initConvLayerState(convState, convKernel, convInProj, hiddenSize, label, layerIdx) {
+  const { isWeightBuffer } = await import('../../../gpu/weight-buffer.js');
+  const isWB = typeof isWeightBuffer === 'function' && isWeightBuffer(convKernel);
+  const kernelBuf = isWB ? convKernel.buffer : (convKernel instanceof GPUBuffer ? convKernel : convKernel.buffer ?? convKernel);
+  const kernelDtype = isWB ? String(convKernel.dtype ?? '').toLowerCase() : null;
+
+  // Determine kernel size from weight shape
+  let kernelSize = 3;
+  if (isWB && Array.isArray(convKernel.shape)) {
+    kernelSize = Number(convKernel.shape[convKernel.shape.length - 1]) || 3;
+  }
+
+  // Dequantize conv kernel weights to F32
+  const totalElements = hiddenSize * kernelSize;
+  const { QK_K, Q4K_BLOCK_BYTES } = await import('../../../config/schema/index.js');
+  const { dequantizeQ4KM } = await import('../../../converter/quantizer.js');
+  const { getDevice } = await import('../../../gpu/device.js');
+  const device = getDevice();
+
+  const isQ4K = kernelDtype === 'q4k' || kernelDtype === 'q4_k_m' || kernelDtype === 'q4_k';
+  let weightF32;
+
+  if (isQ4K) {
+    const numBlocks = Math.ceil(totalElements / QK_K);
+    const q4kBytes = numBlocks * Q4K_BLOCK_BYTES;
+    // GPU readBuffer returns zeros for some Q4K weight buffers, so prefer
+    // CPU-side rawBytes from the WeightBuffer when available.
+    const hasRawBytes = isWB && convKernel.rawBytes;
+    if (hasRawBytes) {
+      weightF32 = dequantizeQ4KM(new Uint8Array(convKernel.rawBytes), numBlocks, [totalElements]);
+    } else {
+      if (device) await device.queue.onSubmittedWorkDone();
+      const raw = await readBuffer(kernelBuf, q4kBytes);
+      weightF32 = dequantizeQ4KM(new Uint8Array(raw), numBlocks, [totalElements]);
+    }
+  } else if (kernelDtype === 'f16' || kernelDtype === 'bf16') {
+    if (device) await device.queue.onSubmittedWorkDone();
+    const raw = await readBuffer(kernelBuf, totalElements * 2);
+    const { decodeReadback } = await import('./debug-utils/index.js');
+    weightF32 = decodeReadback(raw, 'f16');
+  } else {
+    if (device) await device.queue.onSubmittedWorkDone();
+    const raw = await readBuffer(kernelBuf, totalElements * 4);
+    weightF32 = new Float32Array(raw);
+  }
+
+  // Validate dequantized weights are non-degenerate
+  let maxAbs = 0;
+  for (let i = 0; i < weightF32.length; i++) {
+    const abs = Math.abs(weightF32[i]);
+    if (abs > maxAbs) maxAbs = abs;
+  }
+  if (maxAbs === 0) {
+    const { log } = await import('../../../debug/index.js');
+    log.error('Pipeline', `${label} conv kernel weights are all zeros after dequantization (dtype=${kernelDtype}, elements=${totalElements}). Conv layers will produce degenerate output.`);
+  }
+
+  // Upload dequantized weights to GPU
+  const weightGPU = acquireBuffer(weightF32.byteLength, undefined, `${label}.conv_weight_f32`);
+  uploadData(weightGPU, weightF32);
+
+  // Create zeroed conv state buffer
+  const stateSize = hiddenSize * (kernelSize - 1) * Float32Array.BYTES_PER_ELEMENT;
+  const stateGPU = acquireBuffer(stateSize, undefined, `${label}.conv_state`);
+  uploadData(stateGPU, new Float32Array(hiddenSize * (kernelSize - 1)));
+
+  convState.convWeightGPU = weightGPU;
+  convState.convStateGPU = stateGPU;
+  convState.hiddenSize = hiddenSize;
+  convState.kernelSize = kernelSize;
+
+  // Pre-dequantize in_proj weight to F32 via CPU dequantization of the raw Q4K buffer.
+  // GPU readBuffer returns zeros for some Q4K weight buffers, so we dequantize from the
+  // WeightBuffer's raw bytes instead.
+  if (isWB && isWeightBuffer(convInProj)) {
+    const inProjDtype = String(convInProj.dtype ?? '').toLowerCase();
+    const isInProjQ4K = inProjDtype === 'q4k' || inProjDtype === 'q4_k_m' || inProjDtype === 'q4_k';
+    if (isInProjQ4K && convInProj.rawBytes) {
+      const inProjElements = hiddenSize * 3 * hiddenSize;
+      const inProjBlocks = Math.ceil(inProjElements / QK_K);
+      const inProjF32 = dequantizeQ4KM(new Uint8Array(convInProj.rawBytes), inProjBlocks, [inProjElements]);
+      const inProjGPU = acquireBuffer(inProjF32.byteLength, undefined, `${label}.in_proj_f32`);
+      uploadData(inProjGPU, inProjF32);
+      convState.inProjF32GPU = inProjGPU;
+    }
+  }
+}
+
 export async function doCast(input, toDtype, recorder) {
   if (toDtype !== 'f16' && toDtype !== 'f32') {
    throw new Error(`Unsupported cast target dtype "${toDtype}"`);
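
For reference, the Q4K readback sizing in initConvLayerState follows GGML-style Q4_K super-blocks. The sketch below is illustrative only: it assumes the package's QK_K and Q4K_BLOCK_BYTES exports carry the standard GGML values, and the hidden size used in the example is made up.

// Assumed GGML-style constants: 256 weights per super-block, 144 bytes per
// block (f16 d + f16 dmin + 12 scale bytes + 128 bytes of packed 4-bit quants).
const QK_K = 256;
const Q4K_BLOCK_BYTES = 144;

function q4kBytesFor(totalElements) {
  return Math.ceil(totalElements / QK_K) * Q4K_BLOCK_BYTES;
}

// Example numbers only: a conv kernel with hiddenSize = 2048 and kernelSize = 3
// has 6144 elements → 24 blocks → 3456 quantized bytes, which dequantize into a
// 6144-element Float32Array (24576 bytes) before GPU upload.
q4kBytesFor(2048 * 3); // 3456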
@@ -11,6 +11,7 @@ const STAGE_DEFAULT_CATEGORY = {
   embed_out: 'embed',
   // Attention stages (per-layer)
   attn_input: 'attn',
+  post_input_norm: 'attn',
   attn_normed: 'attn',
   linear_qkv_proj: 'attn',
   linear_z_proj: 'attn',
@@ -58,6 +58,30 @@ export function softmax(logits) {
   return exps;
 }
 
+function countFiniteCandidates(logits, padTokenId) {
+  let finiteCandidateCount = 0;
+  for (let i = 0; i < logits.length; i++) {
+    if (padTokenId != null && i === padTokenId) {
+      continue;
+    }
+    if (Number.isFinite(logits[i])) {
+      finiteCandidateCount += 1;
+    }
+  }
+  return finiteCandidateCount;
+}
+
+function assertFiniteSamplingCandidates(logits, padTokenId, label) {
+  const finiteCandidateCount = countFiniteCandidates(logits, padTokenId);
+  if (finiteCandidateCount > 0) {
+    return;
+  }
+  throw new Error(
+    `[Sampling] ${label} has no finite candidate logits after masking the pad token. ` +
+    'Upstream decode likely produced NaN/Inf or an all-masked distribution.'
+  );
+}
+
 
 export function sample(logits, opts) {
   const { temperature, topP, topK, decode, debug = false, padTokenId, seed } = opts;
@@ -66,16 +90,28 @@ export function sample(logits, opts) {
     logits[padTokenId] = -Infinity;
   }
 
+  assertFiniteSamplingCandidates(logits, padTokenId, 'Logits');
+
   // Greedy (argmax) when temperature = 0
   if (temperature === 0) {
-    let maxIdx = 0;
-    let maxVal = logits[0];
-    for (let i = 1; i < logits.length; i++) {
-      if (logits[i] > maxVal) {
-        maxVal = logits[i];
+    let maxIdx = -1;
+    let maxVal = -Infinity;
+    for (let i = 0; i < logits.length; i++) {
+      const value = logits[i];
+      if (!Number.isFinite(value)) {
+        continue;
+      }
+      if (value > maxVal) {
+        maxVal = value;
         maxIdx = i;
       }
     }
+    if (maxIdx < 0) {
+      throw new Error(
+        '[Sampling] Greedy sampling could not find a finite candidate logit. ' +
+        'Upstream decode likely produced NaN/Inf.'
+      );
+    }
     if (debug) {
       const text = decode?.([maxIdx]) ?? '?';
       trace.sample(`Greedy: id=${maxIdx} "${text}" logit=${maxVal.toFixed(4)}`);
@@ -96,7 +132,17 @@ export function sample(logits, opts) {
 
   let candidates = [];
   for (let i = 0; i < probs.length; i++) {
-    candidates.push({ token: i, prob: probs[i] });
+    const probability = probs[i];
+    if (!Number.isFinite(probability) || probability <= 0) {
+      continue;
+    }
+    candidates.push({ token: i, prob: probability });
+  }
+  if (candidates.length === 0) {
+    throw new Error(
+      '[Sampling] Softmax produced no finite candidate probabilities. ' +
+      'Upstream decode likely produced NaN/Inf logits.'
+    );
   }
   candidates.sort((a, b) => b.prob - a.prob);
 
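The net effect of these three guards is that a degenerate logits vector now fails loudly at the sampler instead of silently emitting token 0 (greedy) or an arbitrary token (top-p/top-k). A minimal sketch of the new failure mode follows; the import path is an assumption, not copied from the package.

// Sketch only; assumes sample is exported from this sampling module.
import { sample } from './sampling.js';

const logits = new Float32Array(32000).fill(-Infinity);
logits[42] = NaN; // NaN is rejected by Number.isFinite, same as -Infinity

try {
  sample(logits, { temperature: 0, padTokenId: 0 });
} catch (err) {
  // "[Sampling] Logits has no finite candidate logits after masking the pad token. ..."
  console.error(err.message);
}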
@@ -15,6 +15,8 @@ export class PipelineState {
       layers: new Map(),
     };
 
+    this.convLayerStates = new Map();
+
     this.moeRouter = null;
 
     this.speculativeDecoder = null;
@@ -69,6 +69,11 @@ export declare class InferencePipeline extends PipelineState {
   // ==========================================================================
 
   generate(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<string, void, void>;
+  generateTokens(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<number, void, void>;
+  generateTokenIds(
+    prompt: PromptInput,
+    options?: GenerateOptions
+  ): Promise<{ tokenIds: number[]; stats: PipelineStats }>;
 
   decodeStepLogits(currentIds: number[], options?: GenerateOptions): Promise<LogitsStepResult>;
 
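A hedged usage sketch for the two new token-level entry points; pipeline construction is elided, and the maxTokens option is an assumption about GenerateOptions, not confirmed by this diff.

// `pipeline` is assumed to be a loaded InferencePipeline.
const ids = [];
for await (const tokenId of pipeline.generateTokens('Hello', { maxTokens: 16 })) {
  ids.push(tokenId); // raw token ids, not decoded text
}

// Or collect all ids plus generation stats in one call:
const { tokenIds, stats } = await pipeline.generateTokenIds('Hello', { maxTokens: 16 });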
@@ -43,6 +43,7 @@ import {
 import { getDopplerLoader } from '../../loader/doppler-loader.js';
 import { registerPipeline, getPipelineFactory } from './registry.js';
 import { selectRuleValue } from '../../rules/rule-registry.js';
+import { initConvLayerState } from './text/ops.js';
 
 function destroyMoERouter(router) {
   if (router && typeof router.destroy === 'function') {
@@ -221,6 +222,9 @@ export class InferencePipeline extends PipelineState {
     // Initialize RoPE frequencies
     await this._initRoPE();
 
+    // Initialize conv layer states for gated short conv layers (LFM2)
+    await this._initConvLayerStates();
+
     this.isLoaded = true;
     log.info('Pipeline', 'Model loaded successfully');
   }
@@ -237,6 +241,7 @@ export class InferencePipeline extends PipelineState {
       resolvedKernelPath: this.resolvedKernelPath,
       kernelPathSource: this.kernelPathSource,
       keepF32Weights: this.runtimeConfig.inference.compute.keepF32Weights === true,
+      loaderDebug: this.runtimeConfig?.shared?.debug?.loader ?? null,
       onProgress: (info) => {
         if (info.stage !== 'layers' && info.stage !== 'shards') {
           log.verbose('Loader', `${info.stage}: ${Math.round(info.progress * 100)}%${info.message ? ` - ${info.message}` : ''}`);
@@ -310,7 +315,7 @@ export class InferencePipeline extends PipelineState {
       maxSeqLen,
       ropeTheta: config.ropeTheta,
       ropeLocalTheta: config.ropeLocalTheta,
-      mropeInterleaved: config.ropeInterleaved,
+      mropeInterleaved: config.mropeInterleaved,
       mropeSection: config.mropeSection,
       partialRotaryFactor: config.partialRotaryFactor,
       ropeScale: config.ropeScale,
@@ -327,6 +332,51 @@ export class InferencePipeline extends PipelineState {
   }
 
 
+  async _initConvLayerStates() {
+    const config = this.modelConfig;
+    if (!config?.layerTypes) return;
+    const { getDevice } = await import('../../gpu/device.js');
+    const device = getDevice();
+    if (!device) return;
+
+    const hiddenSize = config.hiddenSize;
+    const convStates = new Map();
+
+    for (let i = 0; i < config.layerTypes.length; i++) {
+      const lt = String(config.layerTypes[i] ?? '').toLowerCase();
+      if (lt !== 'conv' && lt !== 'convolution') continue;
+
+      const layerWeights = this.weights.get(`layer_${i}`);
+      if (!layerWeights) continue;
+      const convKernel = layerWeights?.convKernel;
+      if (!convKernel) continue;
+
+      const convState = {};
+      try {
+        await initConvLayerState(
+          convState,
+          convKernel,
+          layerWeights.convInProj ?? null,
+          hiddenSize,
+          `L${i}.conv`,
+          i
+        );
+        if (!convState.convWeightGPU || !convState.convStateGPU) {
+          continue;
+        }
+        convStates.set(i, convState);
+      } catch (e) {
+        log.warn('Pipeline', `Conv layer ${i} state init failed: ${e.message}`);
+      }
+    }
+
+    if (convStates.size > 0) {
+      this.convLayerStates = convStates;
+      log.info('Pipeline', `Initialized ${convStates.size} conv layer states (kernelSize=${convStates.values().next().value?.kernelSize})`);
+    }
+  }
+
+
   _resolveLayerPipeline() {
     if (!this.modelConfig) return;
     const runtimePlan = this.runtimeConfig.inference.pipeline ?? null;
@@ -349,6 +399,14 @@ export class InferencePipeline extends PipelineState {
     return this.generator.generate(prompt, options);
   }
 
+  generateTokens(prompt, options = {}) {
+    return this.generator.generateTokens(prompt, options);
+  }
+
+  generateTokenIds(prompt, options = {}) {
+    return this.generator.generateTokenIds(prompt, options);
+  }
+
   decodeStepLogits(currentIds, options = {}) {
     return this.generator.decodeStepLogits(currentIds, options);
   }
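
Pieced together from the hunks above, the per-layer record that initConvLayerState fills in and _initConvLayerStates stores in this.convLayerStates (keyed by layer index) looks roughly like this. The typedef is reconstructed for orientation, not copied from the package.

/**
 * @typedef {Object} ConvLayerState  // reconstructed shape, not from the source
 * @property {GPUBuffer} convWeightGPU  dequantized F32 conv taps, hiddenSize × kernelSize
 * @property {GPUBuffer} convStateGPU   zero-initialized carry, hiddenSize × (kernelSize - 1) floats
 * @property {number} hiddenSize
 * @property {number} kernelSize
 * @property {GPUBuffer} [inProjF32GPU] pre-dequantized in_proj, only for Q4K sources with rawBytes
 */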