@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +25 -6
  3. package/package.json +25 -38
  4. package/src/browser/browser-converter.js +5 -0
  5. package/src/client/doppler-api.browser.js +6 -0
  6. package/src/client/doppler-api.d.ts +3 -0
  7. package/src/client/doppler-api.js +11 -2
  8. package/src/client/doppler-registry.js +3 -5
  9. package/src/client/doppler-registry.json +2 -2
  10. package/src/config/kernel-path-loader.d.ts +5 -0
  11. package/src/config/kernel-path-loader.js +13 -0
  12. package/src/config/kernels/kernel-ref-digests.js +23 -21
  13. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  14. package/src/config/kernels/registry.json +74 -0
  15. package/src/config/loader.js +9 -0
  16. package/src/config/merge-contract-check.js +7 -0
  17. package/src/config/platforms/loader.js +3 -1
  18. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  19. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  20. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  21. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  22. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  23. package/src/config/presets/kernel-paths/registry.json +21 -0
  24. package/src/config/presets/models/gemma2.json +2 -1
  25. package/src/config/presets/models/gemma3.json +4 -1
  26. package/src/config/presets/models/gemma4.json +61 -0
  27. package/src/config/presets/models/granite-docling.json +70 -0
  28. package/src/config/presets/models/lfm2.json +6 -1
  29. package/src/config/presets/models/qwen3.json +4 -3
  30. package/src/config/presets/models/qwen3_5.json +16 -0
  31. package/src/config/presets/models/qwen3_vl.json +40 -0
  32. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  33. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  34. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  35. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  36. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  37. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  38. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  39. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  40. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  41. package/src/config/runtime.js +3 -0
  42. package/src/config/schema/conversion.schema.d.ts +1 -0
  43. package/src/config/schema/debug.schema.d.ts +40 -0
  44. package/src/config/schema/debug.schema.js +28 -0
  45. package/src/config/schema/index.js +2 -0
  46. package/src/config/schema/inference-defaults.schema.js +1 -1
  47. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  48. package/src/config/schema/manifest.schema.d.ts +1 -1
  49. package/src/config/schema/manifest.schema.js +1 -1
  50. package/src/config/schema/memory-limits.schema.js +2 -2
  51. package/src/config/schema/storage.schema.js +2 -2
  52. package/src/converter/conversion-plan.js +11 -3
  53. package/src/converter/core.js +19 -8
  54. package/src/converter/manifest-inference.js +12 -22
  55. package/src/converter/parsers/transformer.js +4 -0
  56. package/src/converter/quantization-info.js +5 -1
  57. package/src/converter/quantizer.d.ts +5 -0
  58. package/src/converter/quantizer.js +34 -12
  59. package/src/converter/rope-config.js +8 -6
  60. package/src/converter/tokenizer-utils.d.ts +1 -0
  61. package/src/converter/tokenizer-utils.js +4 -1
  62. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  63. package/src/distribution/shard-delivery.js +40 -1
  64. package/src/formats/rdrr/classification.js +32 -0
  65. package/src/formats/rdrr/parsing.d.ts +4 -0
  66. package/src/formats/rdrr/parsing.js +14 -1
  67. package/src/gpu/kernel-runtime.js +4 -2
  68. package/src/gpu/kernels/attention.js +2 -1
  69. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  70. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  71. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  72. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  73. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  74. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  75. package/src/gpu/kernels/gated-short-conv.js +284 -0
  76. package/src/gpu/kernels/index.d.ts +8 -0
  77. package/src/gpu/kernels/index.js +6 -0
  78. package/src/gpu/kernels/linear-attention-core.js +37 -17
  79. package/src/gpu/kernels/matmul-selection.js +48 -4
  80. package/src/gpu/kernels/matmul.d.ts +5 -0
  81. package/src/gpu/kernels/matmul.js +71 -2
  82. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  83. package/src/gpu/kernels/rmsnorm.js +9 -2
  84. package/src/gpu/kernels/sample.js +1 -3
  85. package/src/gpu/kernels/sample.wgsl +39 -9
  86. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  87. package/src/gpu/kernels/shader-cache.js +9 -4
  88. package/src/gpu/kernels/split_qg.d.ts +50 -0
  89. package/src/gpu/kernels/split_qg.js +46 -0
  90. package/src/gpu/kernels/split_qg.wgsl +58 -0
  91. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  92. package/src/gpu/weight-buffer.d.ts +1 -1
  93. package/src/gpu/weight-buffer.js +1 -1
  94. package/src/inference/browser-harness.d.ts +2 -0
  95. package/src/inference/browser-harness.js +20 -1
  96. package/src/inference/kv-cache/base.js +3 -10
  97. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  98. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  99. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
  100. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  101. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  102. package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
  103. package/src/inference/pipelines/text/attention/projections.js +54 -13
  104. package/src/inference/pipelines/text/attention/record.js +16 -6
  105. package/src/inference/pipelines/text/attention/run.js +59 -6
  106. package/src/inference/pipelines/text/config.d.ts +1 -0
  107. package/src/inference/pipelines/text/config.js +46 -4
  108. package/src/inference/pipelines/text/embed.js +26 -7
  109. package/src/inference/pipelines/text/execution-plan.js +5 -4
  110. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  111. package/src/inference/pipelines/text/execution-v0.js +12 -1
  112. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  113. package/src/inference/pipelines/text/generator-runtime.js +19 -0
  114. package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
  115. package/src/inference/pipelines/text/generator-steps.js +71 -26
  116. package/src/inference/pipelines/text/generator.d.ts +5 -0
  117. package/src/inference/pipelines/text/generator.js +353 -166
  118. package/src/inference/pipelines/text/init.d.ts +15 -0
  119. package/src/inference/pipelines/text/init.js +35 -10
  120. package/src/inference/pipelines/text/layer.js +38 -8
  121. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  122. package/src/inference/pipelines/text/linear-attention.js +33 -3
  123. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  124. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  125. package/src/inference/pipelines/text/logits/index.js +3 -1
  126. package/src/inference/pipelines/text/model-load.js +3 -0
  127. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  128. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  129. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  130. package/src/inference/pipelines/text/ops.js +123 -53
  131. package/src/inference/pipelines/text/probes.js +1 -0
  132. package/src/inference/pipelines/text/sampling.js +52 -6
  133. package/src/inference/pipelines/text/state.js +2 -0
  134. package/src/inference/pipelines/text.d.ts +5 -0
  135. package/src/inference/pipelines/text.js +59 -1
  136. package/src/inference/pipelines/vision/encoder.js +386 -0
  137. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  138. package/src/inference/pipelines/vision/index.js +173 -0
  139. package/src/inference/pipelines/vision/ops.js +78 -0
  140. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  141. package/src/inference/test-harness.js +11 -9
  142. package/src/loader/doppler-loader.d.ts +3 -0
  143. package/src/loader/doppler-loader.js +20 -3
  144. package/src/loader/experts/expert-cache.js +6 -2
  145. package/src/loader/experts/expert-loader.js +6 -2
  146. package/src/loader/final-weights-loader.js +2 -0
  147. package/src/loader/layer-loader.js +42 -3
  148. package/src/loader/manifest-config.js +3 -1
  149. package/src/loader/shard-cache.js +3 -2
  150. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  151. package/src/loader/tensors/tensor-loader.js +130 -4
  152. package/src/rules/inference/dtype.rules.json +5 -0
  153. package/src/rules/inference/kernel-path.rules.json +2 -2
  154. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  155. package/src/rules/kernels/softmax.rules.json +2 -0
  156. package/src/rules/kernels/split-qg.rules.json +6 -0
  157. package/src/rules/rule-registry.d.ts +1 -0
  158. package/src/rules/rule-registry.js +4 -0
  159. package/src/storage/downloader.js +2 -1
  160. package/src/storage/quickstart-downloader.d.ts +3 -0
  161. package/src/storage/quickstart-downloader.js +27 -30
  162. package/src/storage/shard-manager.js +4 -3
  163. package/src/tooling/conversion-config-materializer.js +3 -5
  164. package/src/tooling/node-converter.js +28 -7
  165. package/src/tooling/node-source-runtime.js +65 -5
  166. package/src/tooling/node-webgpu.js +24 -7
  167. package/src/types/model.d.ts +5 -0
  168. package/src/utils/hf-resolve-url.d.ts +16 -0
  169. package/src/utils/hf-resolve-url.js +17 -0
  170. package/src/version.js +1 -1
  171. package/tools/doppler-cli.js +6 -1
  172. package/src/tooling/node-convert.d.ts +0 -54
@@ -139,6 +139,12 @@ export function resolveStepOptions(state, options = {}) {
139
139
  const executionPlan = resolveExecutionSessionPlan(state, options);
140
140
 
141
141
  return {
142
+ seed: resolveConfiguredValue(
143
+ options.seed,
144
+ undefined,
145
+ 'options.seed',
146
+ (value) => Number.isFinite(value) && value >= 0
147
+ ),
142
148
  temperature: resolveConfiguredValue(options.temperature, samplingDefaults.temperature, 'options.temperature'),
143
149
  topP: resolveConfiguredValue(options.topP, samplingDefaults.topP, 'options.topP'),
144
150
  topK: resolveConfiguredValue(options.topK, samplingDefaults.topK, 'options.topK'),
@@ -165,6 +171,12 @@ export function resolveGenerateOptions(state, options = {}) {
165
171
  const executionPlan = resolveExecutionSessionPlan(state, options);
166
172
 
167
173
  return {
174
+ seed: resolveConfiguredValue(
175
+ options.seed,
176
+ undefined,
177
+ 'options.seed',
178
+ (value) => Number.isFinite(value) && value >= 0
179
+ ),
168
180
  maxTokens: executionPlan.maxTokens,
169
181
  temperature: resolveConfiguredValue(options.temperature, samplingDefaults.temperature, 'options.temperature'),
170
182
  topP: resolveConfiguredValue(options.topP, samplingDefaults.topP, 'options.topP'),
@@ -191,6 +203,7 @@ export function resolveGenerateOptions(state, options = {}) {
191
203
  batchSize: executionPlan.batchSize,
192
204
  stopCheckMode: executionPlan.stopCheckMode,
193
205
  executionPlan,
206
+ images: options.images ?? null,
194
207
  };
195
208
  }
196
209
 
@@ -205,6 +218,7 @@ export function resolvePrefillOptions(state, options = {}) {
205
218
  disableCommandBatching: executionPlan.disableCommandBatching,
206
219
  disableMultiTokenDecode: executionPlan.disableMultiTokenDecode,
207
220
  executionPlan,
221
+ images: options.images ?? null,
208
222
  };
209
223
  }
210
224
 
@@ -213,6 +227,10 @@ export function resolvePrefillEmbeddingOptions(state, options = {}) {
213
227
  ? state.manifest.modelType.toLowerCase()
214
228
  : '';
215
229
  const generationDefaults = state.runtimeConfig.inference.generation;
230
+ // Embedding models default to 'mean' pooling — this is a model-category behavior,
231
+ // not a model-family identity check. Ideally embedding model presets would set
232
+ // generation.embeddingMode='mean' in their runtime config; the modelType fallback
233
+ // provides this default for manifests that predate runtime-preset embedding mode.
216
234
  const defaultEmbeddingMode = modelType === 'embedding'
217
235
  ? 'mean'
218
236
  : generationDefaults.embeddingMode;
@@ -226,6 +244,7 @@ export function resolveAdvanceEmbeddingMode(state, options = {}) {
226
244
  const modelType = typeof state.manifest?.modelType === 'string'
227
245
  ? state.manifest.modelType.toLowerCase()
228
246
  : '';
247
+ // See resolvePrefillEmbeddingOptions for embedding-model pooling rationale.
229
248
  const configuredMode = state.runtimeConfig.inference.generation.embeddingMode;
230
249
  return resolveConfiguredValue(
231
250
  options.embeddingMode,
@@ -12,6 +12,15 @@ export interface BatchDecodeSelectionConfig {
12
12
 
13
13
  export declare function shouldUseBatchDecode(config: BatchDecodeSelectionConfig): boolean;
14
14
 
15
+ export interface FusedDecodeSamplingConfig {
16
+ recorderEnabled: boolean;
17
+ gpuSamplingEnabled: boolean;
18
+ fusedDecodeDisabled: boolean;
19
+ layerTypes?: string[] | null;
20
+ }
21
+
22
+ export declare function shouldUseFusedDecodeSampling(config: FusedDecodeSamplingConfig): boolean;
23
+
15
24
  export declare function resolveBatchStop(
16
25
  tokens: number[],
17
26
  stopFlags: Uint32Array | null,
@@ -19,6 +28,12 @@ export declare function resolveBatchStop(
19
28
  eosTokenId: number | undefined | null
20
29
  ): number;
21
30
 
31
+ export declare function findInvalidGeneratedToken(
32
+ tokens: number[],
33
+ vocabSize: number,
34
+ padTokenId?: number | null
35
+ ): { index: number; tokenId: number } | null;
36
+
22
37
  export interface SampledTokenStagingBuffer {
23
38
  mapAsync(mode: number): Promise<void>;
24
39
  getMappedRange(): ArrayBufferLike;
@@ -1,6 +1,6 @@
1
1
  import { getDevice, setTrackSubmits } from '../../../gpu/device.js';
2
2
  import { releaseBuffer, readBuffer } from '../../../memory/buffer-pool.js';
3
- import { runArgmax, runGPUSample, recordArgmax, recordGPUSample, isGPUSamplingAvailable } from '../../../gpu/kernels/sample.js';
3
+ import { recordArgmax, recordGPUSample, isGPUSamplingAvailable } from '../../../gpu/kernels/sample.js';
4
4
  import { recordCheckStop } from '../../../gpu/kernels/check-stop.js';
5
5
  import { resetSubmitStats, logSubmitStats } from '../../../gpu/submit-tracker.js';
6
6
  import { createCommandRecorder, createProfilingRecorder, CommandRecorder } from '../../../gpu/command-recorder.js';
@@ -20,6 +20,7 @@ import { decodeReadback } from './debug-utils/index.js';
20
20
  import { getFinalNormWeights, extractEmbeddingFromHidden } from './generator-runtime.js';
21
21
  import { parseFinitenessStatusWords } from './finiteness-guard-status.js';
22
22
  import { hasLinearAttentionLayers } from './linear-attention.js';
23
+ import { hasConvLayers } from './layer.js';
23
24
 
24
25
  const UNKNOWN_TOKEN_TEXT = '<unknown>';
25
26
 
@@ -91,6 +92,13 @@ export function shouldUseBatchDecode(config) {
91
92
  return isBatchDecodeEnabled(config);
92
93
  }
93
94
 
95
+ export function shouldUseFusedDecodeSampling(config) {
96
+ return config.recorderEnabled === true
97
+ && config.gpuSamplingEnabled === true
98
+ && config.fusedDecodeDisabled !== true
99
+ && !hasConvLayers(config.layerTypes ?? []);
100
+ }
101
+
94
102
  export function resolveBatchStop(tokens, stopFlags, stopTokenIds, eosTokenId) {
95
103
  let actualCount = tokens.length;
96
104
  if (stopFlags) {
@@ -113,6 +121,20 @@ export function resolveBatchStop(tokens, stopFlags, stopTokenIds, eosTokenId) {
113
121
  return actualCount;
114
122
  }
115
123
 
124
+ export function findInvalidGeneratedToken(tokens, vocabSize, padTokenId = null) {
125
+ for (let i = 0; i < tokens.length; i++) {
126
+ const tokenId = tokens[i];
127
+ const isInvalid = !Number.isFinite(tokenId)
128
+ || tokenId < 0
129
+ || tokenId >= vocabSize
130
+ || (padTokenId != null ? tokenId === padTokenId : tokenId === 0);
131
+ if (isInvalid) {
132
+ return { index: i, tokenId };
133
+ }
134
+ }
135
+ return null;
136
+ }
137
+
116
138
  export async function readSampledTokenFromStagingBuffer(stagingBuffer, options = {}) {
117
139
  const ownsStagingBuffer = options.ownsStagingBuffer === true;
118
140
  const hasFinitenessBuffer = options.hasFinitenessBuffer === true;
@@ -240,11 +262,9 @@ async function runDecodeLayers(state, tokenId, opts, helpers) {
240
262
  throw new Error('Embed buffer not found or not a supported buffer type');
241
263
  }
242
264
  const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
243
- const embedDtype = isWeightBuffer(embedBufferRaw)
244
- ? getWeightDtype(embedBufferRaw)
245
- : isCpuWeightBuffer(embedBufferRaw)
246
- ? embedBufferRaw.dtype
247
- : null;
265
+ const embedDtype = isCpuWeightBuffer(embedBufferRaw)
266
+ ? embedBufferRaw.dtype
267
+ : getWeightDtype(embedBufferRaw);
248
268
  const activationDtype = getEffectiveActivationDtype(state, opts);
249
269
 
250
270
  const embedTensor = await embed([tokenId], embedBuffer, {
@@ -326,11 +346,9 @@ export async function decodeStep(state, currentIds, opts, helpers) {
326
346
  throw new Error('Embed buffer not found or not a supported buffer type');
327
347
  }
328
348
  const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
329
- const embedDtype = isWeightBuffer(embedBufferRaw)
330
- ? getWeightDtype(embedBufferRaw)
331
- : isCpuWeightBuffer(embedBufferRaw)
332
- ? embedBufferRaw.dtype
333
- : null;
349
+ const embedDtype = isCpuWeightBuffer(embedBufferRaw)
350
+ ? embedBufferRaw.dtype
351
+ : getWeightDtype(embedBufferRaw);
334
352
  const activationDtype = getEffectiveActivationDtype(state, opts);
335
353
  const activationBytes = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: activationDtype });
336
354
 
@@ -393,7 +411,12 @@ export async function decodeStep(state, currentIds, opts, helpers) {
393
411
  const padTokenId = state.tokenizer?.getSpecialTokens?.()?.pad ?? null;
394
412
  const lmHeadIsCpu = isCpuWeightBuffer(state.weights.get('lm_head'));
395
413
  const useGPUSampling = state.useGPU && isGPUSamplingAvailable() && !lmHeadIsCpu;
396
- const useFusedDecode = recorder && useGPUSampling && !state.disableFusedDecode;
414
+ const useFusedDecode = shouldUseFusedDecodeSampling({
415
+ recorderEnabled: Boolean(recorder),
416
+ gpuSamplingEnabled: useGPUSampling,
417
+ fusedDecodeDisabled: state.disableFusedDecode,
418
+ layerTypes: config.layerTypes,
419
+ });
397
420
 
398
421
  if (useFusedDecode) {
399
422
  const ring = state.decodeRing;
@@ -621,21 +644,30 @@ export async function decodeStep(state, currentIds, opts, helpers) {
621
644
  );
622
645
  if (logitsResult) {
623
646
  const { logitsBuffer, vocabSize, logitsDtype } = logitsResult;
647
+ const logitsBytes = selectRuleValue('shared', 'dtype', 'bytesFromDtype', { dtype: logitsDtype });
648
+ const logitsData = await readBuffer(logitsBuffer, numTokens * vocabSize * logitsBytes);
649
+ releaseBuffer(logitsBuffer);
624
650
 
625
- const nextToken = opts.temperature < samplingDefaults.greedyThreshold
626
- ? await runArgmax(logitsBuffer, vocabSize, { padTokenId, logitSoftcap, logitsDtype, outputIndex: 0 })
627
- : await runGPUSample(logitsBuffer, vocabSize, {
628
- temperature: opts.temperature,
629
- topK: opts.topK,
630
- padTokenId,
631
- logitSoftcap,
632
- logitsDtype,
633
- outputIndex: 0,
634
- greedyThreshold: samplingDefaults.greedyThreshold,
635
- randomSeed: opts.seed,
636
- });
651
+ const rawLogits = decodeReadback(logitsData, logitsDtype);
652
+ const finalizedLogits = await finalizeLogits(
653
+ rawLogits,
654
+ numTokens,
655
+ vocabSize,
656
+ config.vocabSize,
657
+ config,
658
+ state.runtimeConfig.shared.debug.probes
659
+ );
660
+ const sampledLogits = extractLastPositionLogits(finalizedLogits, numTokens, config.vocabSize);
661
+
662
+ applyRepetitionPenalty(sampledLogits, currentIds, opts.repetitionPenalty);
663
+ const nextToken = sample(sampledLogits, {
664
+ temperature: opts.temperature,
665
+ topP: opts.topP,
666
+ topK: opts.topK,
667
+ padTokenId,
668
+ seed: opts.seed,
669
+ });
637
670
 
638
- releaseBuffer(logitsBuffer);
639
671
  if (!context.decodeBuffers?.ownsBuffer(hiddenStates)) {
640
672
  releaseBuffer(hiddenStates);
641
673
  }
@@ -867,6 +899,11 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
867
899
  '[Pipeline] Batch decode path is disabled for linear_attention models; use single-token decode.'
868
900
  );
869
901
  }
902
+ if (hasConvLayers(config.layerTypes)) {
903
+ throw new Error(
904
+ '[Pipeline] Batch decode path is disabled for conv models; use single-token decode.'
905
+ );
906
+ }
870
907
  const samplingDefaults = state.runtimeConfig.inference.sampling;
871
908
  const executionPlan = opts.executionPlan;
872
909
  const batchSize = executionPlan?.batchSize ?? opts.batchSize ?? state.runtimeConfig.inference.batching.batchSize;
@@ -981,7 +1018,7 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
981
1018
  throw new Error('Embed buffer not found or not a GPUBuffer/WeightBuffer');
982
1019
  }
983
1020
  const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
984
- const embedDtype = isWeightBuffer(embedBufferRaw) ? getWeightDtype(embedBufferRaw) : null;
1021
+ const embedDtype = getWeightDtype(embedBufferRaw);
985
1022
  const activationDtype = getEffectiveActivationDtype(state, opts);
986
1023
 
987
1024
  for (let i = 0; i < N; i++) {
@@ -1125,10 +1162,18 @@ export async function generateNTokensGPU(state, startToken, N, currentIds, opts,
1125
1162
 
1126
1163
  const actualCount = resolveBatchStop(tokens, stopFlags, stopTokenIds, eosToken);
1127
1164
  const generatedTokens = tokens.slice(0, actualCount);
1165
+ const invalidToken = findInvalidGeneratedToken(generatedTokens, config.vocabSize, padTokenId);
1128
1166
 
1129
1167
  if (isInfinite) {
1130
1168
  throw new FinitenessError(`F16 bounds exceeded during batch generation${metadata}`);
1131
1169
  }
1170
+ if (invalidToken) {
1171
+ state.disableFusedDecode = true;
1172
+ throw new Error(
1173
+ `[Pipeline] Batch decode produced invalid token ${invalidToken.tokenId} ` +
1174
+ `at batch index ${invalidToken.index} (vocabSize=${config.vocabSize}, padTokenId=${padTokenId ?? 'none'}).`
1175
+ );
1176
+ }
1132
1177
 
1133
1178
  if (opts.profile && recorder.isProfilingEnabled()) {
1134
1179
  const timings = await recorder.resolveProfileTimings();
@@ -27,6 +27,11 @@ export declare class PipelineGenerator {
27
27
  * Batching and readback cadence are controlled by runtime.inference.batching.
28
28
  */
29
29
  generate(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<string, void, void>;
30
+ generateTokens(prompt: PromptInput, options?: GenerateOptions): AsyncGenerator<number, void, void>;
31
+ generateTokenIds(
32
+ prompt: PromptInput,
33
+ options?: GenerateOptions
34
+ ): Promise<{ tokenIds: number[]; stats: import('./types.js').PipelineStats }>;
30
35
  prefillKVOnly(prompt: PromptInput, options?: GenerateOptions): Promise<KVCacheSnapshot>;
31
36
  prefillWithEmbedding(prompt: PromptInput, options?: GenerateOptions): Promise<PrefillEmbeddingResult>;
32
37
  prefillWithLogits(prompt: PromptInput, options?: GenerateOptions): Promise<PrefillResult>;