@simulatte/doppler 0.1.7 → 0.1.9

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +25 -6
  3. package/package.json +25 -38
  4. package/src/browser/browser-converter.js +5 -0
  5. package/src/client/doppler-api.browser.js +6 -0
  6. package/src/client/doppler-api.d.ts +3 -0
  7. package/src/client/doppler-api.js +11 -2
  8. package/src/client/doppler-registry.js +3 -5
  9. package/src/client/doppler-registry.json +2 -2
  10. package/src/config/kernel-path-loader.d.ts +5 -0
  11. package/src/config/kernel-path-loader.js +13 -0
  12. package/src/config/kernels/kernel-ref-digests.js +23 -21
  13. package/src/config/kernels/moe/mixtral.paths.json +46 -0
  14. package/src/config/kernels/registry.json +74 -0
  15. package/src/config/loader.js +9 -0
  16. package/src/config/merge-contract-check.js +7 -0
  17. package/src/config/platforms/loader.js +3 -1
  18. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
  19. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
  20. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
  21. package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
  22. package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
  23. package/src/config/presets/kernel-paths/registry.json +21 -0
  24. package/src/config/presets/models/gemma2.json +2 -1
  25. package/src/config/presets/models/gemma3.json +4 -1
  26. package/src/config/presets/models/gemma4.json +61 -0
  27. package/src/config/presets/models/granite-docling.json +70 -0
  28. package/src/config/presets/models/lfm2.json +6 -1
  29. package/src/config/presets/models/qwen3.json +4 -3
  30. package/src/config/presets/models/qwen3_5.json +16 -0
  31. package/src/config/presets/models/qwen3_vl.json +40 -0
  32. package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
  33. package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
  34. package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
  35. package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
  36. package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
  37. package/src/config/presets/runtime/modes/trace-layers.json +1 -0
  38. package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
  39. package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
  40. package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
  41. package/src/config/runtime.js +3 -0
  42. package/src/config/schema/conversion.schema.d.ts +1 -0
  43. package/src/config/schema/debug.schema.d.ts +40 -0
  44. package/src/config/schema/debug.schema.js +28 -0
  45. package/src/config/schema/index.js +2 -0
  46. package/src/config/schema/inference-defaults.schema.js +1 -1
  47. package/src/config/schema/kernel-path.schema.d.ts +1 -0
  48. package/src/config/schema/manifest.schema.d.ts +1 -1
  49. package/src/config/schema/manifest.schema.js +1 -1
  50. package/src/config/schema/memory-limits.schema.js +2 -2
  51. package/src/config/schema/storage.schema.js +2 -2
  52. package/src/converter/conversion-plan.js +11 -3
  53. package/src/converter/core.js +19 -8
  54. package/src/converter/manifest-inference.js +12 -22
  55. package/src/converter/parsers/transformer.js +4 -0
  56. package/src/converter/quantization-info.js +5 -1
  57. package/src/converter/quantizer.d.ts +5 -0
  58. package/src/converter/quantizer.js +34 -12
  59. package/src/converter/rope-config.js +8 -6
  60. package/src/converter/tokenizer-utils.d.ts +1 -0
  61. package/src/converter/tokenizer-utils.js +4 -1
  62. package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
  63. package/src/distribution/shard-delivery.js +40 -1
  64. package/src/formats/rdrr/classification.js +32 -0
  65. package/src/formats/rdrr/parsing.d.ts +4 -0
  66. package/src/formats/rdrr/parsing.js +14 -1
  67. package/src/gpu/kernel-runtime.js +4 -2
  68. package/src/gpu/kernels/attention.js +2 -1
  69. package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
  70. package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
  71. package/src/gpu/kernels/dequant_shared.wgsl +4 -2
  72. package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
  73. package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
  74. package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
  75. package/src/gpu/kernels/gated-short-conv.js +284 -0
  76. package/src/gpu/kernels/index.d.ts +8 -0
  77. package/src/gpu/kernels/index.js +6 -0
  78. package/src/gpu/kernels/linear-attention-core.js +37 -17
  79. package/src/gpu/kernels/matmul-selection.js +48 -4
  80. package/src/gpu/kernels/matmul.d.ts +5 -0
  81. package/src/gpu/kernels/matmul.js +71 -2
  82. package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
  83. package/src/gpu/kernels/rmsnorm.js +9 -2
  84. package/src/gpu/kernels/sample.js +1 -3
  85. package/src/gpu/kernels/sample.wgsl +39 -9
  86. package/src/gpu/kernels/sample_f16.wgsl +38 -8
  87. package/src/gpu/kernels/shader-cache.js +9 -4
  88. package/src/gpu/kernels/split_qg.d.ts +50 -0
  89. package/src/gpu/kernels/split_qg.js +46 -0
  90. package/src/gpu/kernels/split_qg.wgsl +58 -0
  91. package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
  92. package/src/gpu/weight-buffer.d.ts +1 -1
  93. package/src/gpu/weight-buffer.js +1 -1
  94. package/src/inference/browser-harness.d.ts +2 -0
  95. package/src/inference/browser-harness.js +20 -1
  96. package/src/inference/kv-cache/base.js +3 -10
  97. package/src/inference/pipelines/diffusion/helpers.js +3 -0
  98. package/src/inference/pipelines/diffusion/pipeline.js +2 -1
  99. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
  100. package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
  101. package/src/inference/pipelines/text/attention/output-projection.js +8 -0
  102. package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
  103. package/src/inference/pipelines/text/attention/projections.js +54 -13
  104. package/src/inference/pipelines/text/attention/record.js +16 -6
  105. package/src/inference/pipelines/text/attention/run.js +59 -6
  106. package/src/inference/pipelines/text/config.d.ts +1 -0
  107. package/src/inference/pipelines/text/config.js +46 -4
  108. package/src/inference/pipelines/text/embed.js +26 -7
  109. package/src/inference/pipelines/text/execution-plan.js +5 -4
  110. package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
  111. package/src/inference/pipelines/text/execution-v0.js +12 -1
  112. package/src/inference/pipelines/text/generator-helpers.js +1 -0
  113. package/src/inference/pipelines/text/generator-runtime.js +19 -0
  114. package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
  115. package/src/inference/pipelines/text/generator-steps.js +71 -26
  116. package/src/inference/pipelines/text/generator.d.ts +5 -0
  117. package/src/inference/pipelines/text/generator.js +353 -166
  118. package/src/inference/pipelines/text/init.d.ts +15 -0
  119. package/src/inference/pipelines/text/init.js +35 -10
  120. package/src/inference/pipelines/text/layer.js +38 -8
  121. package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
  122. package/src/inference/pipelines/text/linear-attention.js +33 -3
  123. package/src/inference/pipelines/text/logits/gpu.js +2 -2
  124. package/src/inference/pipelines/text/logits/index.d.ts +6 -1
  125. package/src/inference/pipelines/text/logits/index.js +3 -1
  126. package/src/inference/pipelines/text/model-load.js +3 -0
  127. package/src/inference/pipelines/text/moe-gpu.js +21 -3
  128. package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
  129. package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
  130. package/src/inference/pipelines/text/ops.js +123 -53
  131. package/src/inference/pipelines/text/probes.js +1 -0
  132. package/src/inference/pipelines/text/sampling.js +52 -6
  133. package/src/inference/pipelines/text/state.js +2 -0
  134. package/src/inference/pipelines/text.d.ts +5 -0
  135. package/src/inference/pipelines/text.js +59 -1
  136. package/src/inference/pipelines/vision/encoder.js +386 -0
  137. package/src/inference/pipelines/vision/image-preprocess.js +151 -0
  138. package/src/inference/pipelines/vision/index.js +173 -0
  139. package/src/inference/pipelines/vision/ops.js +78 -0
  140. package/src/inference/pipelines/vision/patch-embed.js +151 -0
  141. package/src/inference/test-harness.js +11 -9
  142. package/src/loader/doppler-loader.d.ts +3 -0
  143. package/src/loader/doppler-loader.js +20 -3
  144. package/src/loader/experts/expert-cache.js +6 -2
  145. package/src/loader/experts/expert-loader.js +6 -2
  146. package/src/loader/final-weights-loader.js +2 -0
  147. package/src/loader/layer-loader.js +42 -3
  148. package/src/loader/manifest-config.js +3 -1
  149. package/src/loader/shard-cache.js +3 -2
  150. package/src/loader/tensors/tensor-loader.d.ts +3 -0
  151. package/src/loader/tensors/tensor-loader.js +130 -4
  152. package/src/rules/inference/dtype.rules.json +5 -0
  153. package/src/rules/inference/kernel-path.rules.json +2 -2
  154. package/src/rules/kernels/moe.rules.mixtral.json +75 -0
  155. package/src/rules/kernels/softmax.rules.json +2 -0
  156. package/src/rules/kernels/split-qg.rules.json +6 -0
  157. package/src/rules/rule-registry.d.ts +1 -0
  158. package/src/rules/rule-registry.js +4 -0
  159. package/src/storage/downloader.js +2 -1
  160. package/src/storage/quickstart-downloader.d.ts +3 -0
  161. package/src/storage/quickstart-downloader.js +27 -30
  162. package/src/storage/shard-manager.js +4 -3
  163. package/src/tooling/conversion-config-materializer.js +3 -5
  164. package/src/tooling/node-converter.js +28 -7
  165. package/src/tooling/node-source-runtime.js +65 -5
  166. package/src/tooling/node-webgpu.js +24 -7
  167. package/src/types/model.d.ts +5 -0
  168. package/src/utils/hf-resolve-url.d.ts +16 -0
  169. package/src/utils/hf-resolve-url.js +17 -0
  170. package/src/version.js +1 -1
  171. package/tools/doppler-cli.js +6 -1
  172. package/src/tooling/node-convert.d.ts +0 -54
@@ -1,7 +1,7 @@
1
1
 
2
2
 
3
3
  import { getDevice, setTrackSubmits } from '../../../gpu/device.js';
4
- import { releaseBuffer, readBuffer, readBufferSlice } from '../../../memory/buffer-pool.js';
4
+ import { releaseBuffer, readBuffer, readBufferSlice, uploadData } from '../../../memory/buffer-pool.js';
5
5
  import { isGPUSamplingAvailable } from '../../../gpu/kernels/sample.js';
6
6
  import { markWarmed as markKernelCacheWarmed } from '../../../gpu/kernel-selection-cache.js';
7
7
  import { resetSubmitStats, logSubmitStats } from '../../../gpu/submit-tracker.js';
@@ -122,6 +122,20 @@ function resolveTokenText(tokenizer, tokenIds, fallbackText = '?', renderTokenTe
122
122
  return fallbackText;
123
123
  }
124
124
 
125
/**
 * Decides whether a failure should be retried through the finiteness
 * fallback path (callers in this file log it as "Retrying with F32 precision").
 *
 * A failure qualifies when either:
 *  - its `name` is exactly `'FinitenessError'`, or
 *  - its message (an Error-like `.message` string, or the thrown value itself
 *    when a bare string was thrown) starts with `'[Sampling]'` and mentions one
 *    of the two known "no finite candidates" conditions.
 *
 * @param {unknown} error - The caught value; may be an Error, a string, or nullish.
 * @returns {boolean} `true` when the caller should retry with the fallback.
 */
export function shouldRetryWithFinitenessFallback(error) {
  if (error?.name === 'FinitenessError') {
    return true;
  }

  // Normalise the failure to a message string; anything else becomes ''.
  let message = '';
  if (typeof error?.message === 'string') {
    message = error.message;
  } else if (typeof error === 'string') {
    message = error;
  }

  // Only sampling-stage failures are eligible beyond this point.
  if (!message.startsWith('[Sampling]')) {
    return false;
  }

  return message.includes('no finite candidate logits after masking the pad token')
    || message.includes('Softmax produced no finite candidate probabilities');
}
138
+
125
139
  export class PipelineGenerator {
126
140
 
127
141
  #state;
@@ -196,6 +210,14 @@ export class PipelineGenerator {
196
210
  return resolveStepOptions(this.#state, options);
197
211
  }
198
212
 
213
+ _resetDecodeRuntimeState() {
214
+ this.#state.decodeStepCount = 0;
215
+ this.#state.disableRecordedLogits = false;
216
+ this.#state.disableFusedDecode = false;
217
+ resetActiveExecutionPlan(this.#state);
218
+ this.#state.decodeRing?.reset();
219
+ }
220
+
199
221
  _getDecodeHelpers(debugCheckBuffer) {
200
222
  return {
201
223
  buildLayerContext: (recorder, isDecodeMode, debugLayers, executionPlan) =>
@@ -221,6 +243,209 @@ export class PipelineGenerator {
221
243
  );
222
244
  }
223
245
 
246
+ _resolvePromptTokenIds(prompt, useChatTemplate, contextLabel) {
247
+ const processedPrompt = resolvePromptInput(this.#state, prompt, useChatTemplate, contextLabel);
248
+ const inputIds = this.#state.tokenizer.encode(processedPrompt);
249
+ this._assertTokenIdsInRange(inputIds, `${contextLabel}.encode`);
250
+ return inputIds;
251
+ }
252
+
253
+ _sampleNextTokenFromLogits(logits, generatedIds, opts) {
254
+ const sampledLogits = Float32Array.from(logits);
255
+ applyRepetitionPenalty(sampledLogits, generatedIds, opts.repetitionPenalty);
256
+ const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
257
+ return sample(sampledLogits, {
258
+ temperature: opts.temperature,
259
+ topP: opts.topP,
260
+ topK: opts.topK,
261
+ padTokenId,
262
+ seed: opts.seed,
263
+ });
264
+ }
265
+
266
+ async _prefillPromptToLogits(prompt, opts, contextLabel) {
267
+ const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, contextLabel);
268
+ if (opts.debug) {
269
+ log.debug('Pipeline', `${contextLabel}: ${inputIds.length} tokens`);
270
+ }
271
+
272
+ let logits;
273
+ try {
274
+ logits = await this._prefill(inputIds, opts);
275
+ } catch (error) {
276
+ if (!shouldRetryWithFinitenessFallback(error)) {
277
+ throw error;
278
+ }
279
+ log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during ${contextLabel}. Retrying with F32 precision.`);
280
+ logits = await this._retryWithFinitenessFallback(
281
+ opts,
282
+ contextLabel,
283
+ () => this._prefill(inputIds, opts)
284
+ );
285
+ }
286
+
287
+ return { inputIds, logits };
288
+ }
289
+
290
+ async _decodeStepToLogits(currentIds, opts) {
291
+ const debugCheckBuffer = this.#state.debug
292
+ ? (buffer, label, numTokens, expectedDim) =>
293
+ debugCheckBufferHelper(this.#state, buffer, label, numTokens, expectedDim)
294
+ : undefined;
295
+ return decodeStepLogits(this.#state, currentIds, opts, this._getDecodeHelpers(debugCheckBuffer));
296
+ }
297
+
298
+ async _decodeNextTokenViaLogits(currentIds, opts) {
299
+ const stepResult = await this._decodeStepToLogits(currentIds, opts);
300
+ return this._sampleNextTokenFromLogits(stepResult.logits, currentIds, opts);
301
+ }
302
+
303
+ async *_generateTokensInternal(prompt, options = {}, mode = 'text') {
304
+ if (!this.#state.isLoaded) throw new Error('Model not loaded');
305
+ if (this.#state.isGenerating) throw new Error('Generation already in progress');
306
+
307
+ validateCallTimeOptions(options);
308
+
309
+ this.#state.isGenerating = true;
310
+ this._resetDecodeRuntimeState();
311
+ this.#state.stats.gpuTimePrefillMs = undefined;
312
+ this.#state.stats.gpuTimeDecodeMs = undefined;
313
+ this.#state.stats.decodeRecordMs = 0;
314
+ this.#state.stats.decodeSubmitWaitMs = 0;
315
+ this.#state.stats.decodeReadbackWaitMs = 0;
316
+ this.#state.stats.ttftMs = 0;
317
+ const startTime = performance.now();
318
+
319
+ const opts = resolveGenerateOptions(this.#state, options);
320
+
321
+ if (opts.debug) {
322
+ log.debug('Pipeline', `ChatTemplate: options=${options.useChatTemplate}, final=${opts.useChatTemplate}`);
323
+ }
324
+
325
+ const emitToken = async function* (generator, tokenId, textDecoder) {
326
+ if (mode === 'token') {
327
+ yield tokenId;
328
+ if (options.onToken) options.onToken(tokenId, '');
329
+ return;
330
+ }
331
+ const tokenText = textDecoder(tokenId);
332
+ yield tokenText;
333
+ if (options.onToken) options.onToken(tokenId, tokenText);
334
+ };
335
+
336
+ try {
337
+ const prefillStart = performance.now();
338
+ const { inputIds, logits: initialPrefillLogits } = await this._prefillPromptToLogits(prompt, opts, 'generate');
339
+ let prefillLogits = initialPrefillLogits;
340
+ this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
341
+ this._assertTokenIdsInRange(inputIds, 'generate.prefillTokens');
342
+ const generatedIds = [...inputIds];
343
+ this.#state.stats.prefillTokens = inputIds.length;
344
+
345
+ if (opts.debug) {
346
+ log.debug('Pipeline', `Input: ${inputIds.length} tokens`);
347
+ }
348
+
349
+ const intentBundleConfig = this.#state.runtimeConfig.shared.intentBundle;
350
+ const intentBundle = intentBundleConfig?.bundle;
351
+ const expectedTopK = intentBundle?.payload?.expectedTopK
352
+ ?? intentBundle?.payload?.expected_top_k;
353
+ const maxDriftThreshold = intentBundle?.constraints?.maxDriftThreshold
354
+ ?? intentBundle?.constraints?.max_drift_threshold;
355
+
356
+ if (intentBundleConfig?.enabled && Array.isArray(expectedTopK) && expectedTopK.length > 0) {
357
+ const actualTopK = getTopK(
358
+ prefillLogits,
359
+ expectedTopK.length,
360
+ (tokens) => resolveTokenText(this.#state.tokenizer, tokens),
361
+ ).map((token) => token.token);
362
+ const driftResult = enforceLogitDrift(expectedTopK, actualTopK, maxDriftThreshold);
363
+ if (!driftResult.ok) {
364
+ throw new Error(`Intent bundle drift check failed: ${driftResult.reason}`);
365
+ }
366
+ }
367
+
368
+ if (opts.debug) {
369
+ const topAfterPenalty = getTopK(
370
+ Float32Array.from(prefillLogits),
371
+ 5,
372
+ (tokens) => resolveTokenText(this.#state.tokenizer, tokens)
373
+ );
374
+ log.debug('Pipeline', `After rep penalty top-5: ${topAfterPenalty.map(t => `"${t.text}"(${(t.prob * 100).toFixed(1)}%)`).join(', ')}`);
375
+ }
376
+
377
+ let firstToken;
378
+ try {
379
+ firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
380
+ } catch (error) {
381
+ if (!shouldRetryWithFinitenessFallback(error)) {
382
+ throw error;
383
+ }
384
+ log.warn('Pipeline', 'FinitenessGuard caught non-finite prefill logits at sampling. Retrying with F32 precision.');
385
+ prefillLogits = await this._retryWithFinitenessFallback(
386
+ opts,
387
+ 'prefill-sample',
388
+ () => this._prefill(inputIds, opts)
389
+ );
390
+ firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
391
+ }
392
+
393
+ if (opts.debug) {
394
+ const firstTokenText = resolveTokenText(this.#state.tokenizer, [firstToken], `[${firstToken}]`, (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false));
395
+ log.debug('Pipeline', `First token sampled: id=${firstToken} text="${firstTokenText}"`);
396
+ }
397
+
398
+ generatedIds.push(firstToken);
399
+ this.#state.stats.ttftMs = performance.now() - startTime;
400
+
401
+ const decodeToken = (tokenId) => resolveTokenText(
402
+ this.#state.tokenizer,
403
+ [tokenId],
404
+ `[${tokenId}]`,
405
+ (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false),
406
+ (tokens) => this.#state.tokenizer?.decode?.(tokens, false, false)
407
+ );
408
+
409
+ yield* emitToken(this, firstToken, decodeToken);
410
+
411
+ yield* this._runDecodeLoop(generatedIds, opts, options, {
412
+ stopTokenIds: this.#state.modelConfig.stopTokenIds,
413
+ eosToken: this.#state.tokenizer.getSpecialTokens?.()?.eos,
414
+ stopSequenceStart: inputIds.length,
415
+ decodeToken,
416
+ logBatchPath: opts.debug,
417
+ emitMode: mode,
418
+ });
419
+ const tokensGenerated = this.#state.stats.decodeTokens;
420
+ this.#state.stats.totalTimeMs = performance.now() - startTime;
421
+
422
+ if (opts.debug) {
423
+ log.debug('Pipeline', `Generated ${tokensGenerated} tokens in ${this.#state.stats.totalTimeMs.toFixed(0)}ms`);
424
+ }
425
+
426
+ const ttft = this.#state.stats.ttftMs ?? this.#state.stats.prefillTimeMs;
427
+ const decodeTokens = Math.max(0, tokensGenerated - 1);
428
+ const decodeSpeed = decodeTokens > 0 ? (decodeTokens / this.#state.stats.decodeTimeMs * 1000) : 0;
429
+ if (opts.benchmark) {
430
+ log.info('Benchmark', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
431
+ } else {
432
+ log.info('Perf', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
433
+ }
434
+ trace.perf('Decode summary', {
435
+ ttftMs: ttft,
436
+ prefillMs: this.#state.stats.prefillTimeMs,
437
+ decodeMs: this.#state.stats.decodeTimeMs,
438
+ decodeTokens,
439
+ decodeSpeed,
440
+ totalMs: this.#state.stats.totalTimeMs,
441
+ });
442
+ } finally {
443
+ this._closeFinitenessFallbackWindow(opts);
444
+ resetActiveExecutionPlan(this.#state);
445
+ this.#state.isGenerating = false;
446
+ }
447
+ }
448
+
224
449
  _beginFinitenessFallback(opts, reasonLabel) {
225
450
  const originalPlan = resolveActiveExecutionPlan(this.#state);
226
451
  const original = {
@@ -306,17 +531,21 @@ export class PipelineGenerator {
306
531
 
307
532
 
308
533
  async *generate(prompt, options = {}) {
534
+ yield* this._generateTokensInternal(prompt, options, 'text');
535
+ }
536
+
537
+ async *generateTokens(prompt, options = {}) {
538
+ yield* this._generateTokensInternal(prompt, options, 'token');
539
+ }
540
+
541
+ async generateTokenIds(prompt, options = {}) {
309
542
  if (!this.#state.isLoaded) throw new Error('Model not loaded');
310
543
  if (this.#state.isGenerating) throw new Error('Generation already in progress');
311
544
 
312
545
  validateCallTimeOptions(options);
313
546
 
314
547
  this.#state.isGenerating = true;
315
- this.#state.decodeStepCount = 0;
316
- this.#state.disableRecordedLogits = false;
317
- this.#state.disableFusedDecode = false;
318
- resetActiveExecutionPlan(this.#state);
319
- this.#state.decodeRing?.reset();
548
+ this._resetDecodeRuntimeState();
320
549
  this.#state.stats.gpuTimePrefillMs = undefined;
321
550
  this.#state.stats.gpuTimeDecodeMs = undefined;
322
551
  this.#state.stats.decodeRecordMs = 0;
@@ -324,135 +553,79 @@ export class PipelineGenerator {
324
553
  this.#state.stats.decodeReadbackWaitMs = 0;
325
554
  this.#state.stats.ttftMs = 0;
326
555
  const startTime = performance.now();
327
-
328
556
  const opts = resolveGenerateOptions(this.#state, options);
329
557
 
330
- if (opts.debug) {
331
- log.debug('Pipeline', `ChatTemplate: options=${options.useChatTemplate}, final=${opts.useChatTemplate}`);
332
- }
333
-
334
558
  try {
335
- const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'generate');
336
- if (opts.debug && opts.useChatTemplate) {
337
- log.debug('Pipeline', `Applied ${this.#state.modelConfig.chatTemplateType} chat template`);
338
- }
339
-
340
- const inputIds = this.#state.tokenizer.encode(processedPrompt);
341
- this._assertTokenIdsInRange(inputIds, 'generate.encode');
559
+ const prefillStart = performance.now();
560
+ const { inputIds, logits: initialPrefillLogits } = await this._prefillPromptToLogits(prompt, opts, 'generateTokenIds');
561
+ let prefillLogits = initialPrefillLogits;
562
+ this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
563
+ this._assertTokenIdsInRange(inputIds, 'generateTokenIds.prefillTokens');
342
564
  const generatedIds = [...inputIds];
343
565
  this.#state.stats.prefillTokens = inputIds.length;
344
566
 
345
- if (opts.debug) {
346
- log.debug('Pipeline', `Input: ${inputIds.length} tokens`);
347
- }
348
-
349
- const prefillStart = performance.now();
350
- let prefillLogits;
567
+ let firstToken;
351
568
  try {
352
- prefillLogits = await this._prefill(inputIds, opts);
569
+ firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
353
570
  } catch (error) {
354
- if (error.name === 'FinitenessError') {
355
- log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefill. Retrying with F32 precision.`);
356
- prefillLogits = await this._retryWithFinitenessFallback(
357
- opts,
358
- 'prefill',
359
- () => this._prefill(inputIds, opts)
360
- );
361
- } else {
571
+ if (!shouldRetryWithFinitenessFallback(error)) {
362
572
  throw error;
363
573
  }
364
- }
365
- this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
366
-
367
- const intentBundleConfig = this.#state.runtimeConfig.shared.intentBundle;
368
- const intentBundle = intentBundleConfig?.bundle;
369
- const expectedTopK = intentBundle?.payload?.expectedTopK
370
- ?? intentBundle?.payload?.expected_top_k;
371
- const maxDriftThreshold = intentBundle?.constraints?.maxDriftThreshold
372
- ?? intentBundle?.constraints?.max_drift_threshold;
373
-
374
- if (intentBundleConfig?.enabled && Array.isArray(expectedTopK) && expectedTopK.length > 0) {
375
- const actualTopK = getTopK(
376
- prefillLogits,
377
- expectedTopK.length,
378
- (tokens) => resolveTokenText(this.#state.tokenizer, tokens),
379
- ).map((token) => token.token);
380
- const driftResult = enforceLogitDrift(expectedTopK, actualTopK, maxDriftThreshold);
381
- if (!driftResult.ok) {
382
- throw new Error(`Intent bundle drift check failed: ${driftResult.reason}`);
383
- }
384
- }
385
-
386
- applyRepetitionPenalty(prefillLogits, generatedIds, opts.repetitionPenalty);
387
- const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
388
-
389
- if (opts.debug) {
390
- const topAfterPenalty = getTopK(
391
- prefillLogits,
392
- 5,
393
- (tokens) => resolveTokenText(this.#state.tokenizer, tokens)
574
+ prefillLogits = await this._retryWithFinitenessFallback(
575
+ opts,
576
+ 'prefill-sample',
577
+ () => this._prefill(inputIds, opts)
394
578
  );
395
- log.debug('Pipeline', `After rep penalty top-5: ${topAfterPenalty.map(t => `"${t.text}"(${(t.prob * 100).toFixed(1)}%)`).join(', ')}`);
396
- }
397
-
398
- const firstToken = sample(prefillLogits, {
399
- temperature: opts.temperature,
400
- topP: opts.topP,
401
- topK: opts.topK,
402
- padTokenId,
403
- seed: opts.seed,
404
- });
405
-
406
- if (opts.debug) {
407
- const firstTokenText = resolveTokenText(this.#state.tokenizer, [firstToken], `[${firstToken}]`, (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false));
408
- log.debug('Pipeline', `First token sampled: id=${firstToken} text="${firstTokenText}"`);
579
+ firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
409
580
  }
410
581
 
411
582
  generatedIds.push(firstToken);
583
+ const tokenIds = [firstToken];
412
584
  this.#state.stats.ttftMs = performance.now() - startTime;
413
585
 
414
- const decodeToken = (tokenId) => resolveTokenText(
415
- this.#state.tokenizer,
416
- [tokenId],
417
- `[${tokenId}]`,
418
- (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false),
419
- (tokens) => this.#state.tokenizer?.decode?.(tokens, false, false)
420
- );
586
+ const stopTokenIds = this.#state.modelConfig.stopTokenIds;
587
+ const eosToken = this.#state.tokenizer.getSpecialTokens?.()?.eos;
588
+ const stopSequenceStart = inputIds.length;
589
+ markKernelCacheWarmed();
590
+ const decodeStart = performance.now();
421
591
 
422
- const firstText = decodeToken(firstToken);
423
- yield firstText;
424
- if (options.onToken) options.onToken(firstToken, firstText);
592
+ while (tokenIds.length < opts.maxTokens) {
593
+ if (options.signal?.aborted) break;
594
+ let nextToken;
595
+ try {
596
+ nextToken = await this._decodeNextTokenViaLogits(generatedIds, opts);
597
+ } catch (error) {
598
+ if (shouldRetryWithFinitenessFallback(error)) {
599
+ nextToken = await this._retryDecodeStepWithFinitenessWindow(
600
+ generatedIds,
601
+ opts,
602
+ `decode-step-${tokenIds.length}`
603
+ );
604
+ } else {
605
+ throw error;
606
+ }
607
+ }
608
+ generatedIds.push(nextToken);
609
+ tokenIds.push(nextToken);
610
+ this._consumeFinitenessFallbackToken(opts);
611
+ if (isStopToken(nextToken, stopTokenIds, eosToken)) {
612
+ break;
613
+ }
614
+ if (opts.stopSequences.length > 0) {
615
+ const fullText = this.#state.tokenizer.decode(generatedIds.slice(stopSequenceStart), false);
616
+ if (opts.stopSequences.some((seq) => fullText.endsWith(seq))) break;
617
+ }
618
+ }
425
619
 
426
- yield* this._runDecodeLoop(generatedIds, opts, options, {
427
- stopTokenIds: this.#state.modelConfig.stopTokenIds,
428
- eosToken: this.#state.tokenizer.getSpecialTokens?.()?.eos,
429
- stopSequenceStart: inputIds.length,
430
- decodeToken,
431
- logBatchPath: opts.debug,
432
- });
433
- const tokensGenerated = this.#state.stats.decodeTokens;
620
+ this.#state.stats.decodeTimeMs = performance.now() - decodeStart;
621
+ this.#state.stats.tokensGenerated = tokenIds.length;
622
+ this.#state.stats.decodeTokens = tokenIds.length;
434
623
  this.#state.stats.totalTimeMs = performance.now() - startTime;
435
624
 
436
- if (opts.debug) {
437
- log.debug('Pipeline', `Generated ${tokensGenerated} tokens in ${this.#state.stats.totalTimeMs.toFixed(0)}ms`);
438
- }
439
-
440
- const ttft = this.#state.stats.ttftMs ?? this.#state.stats.prefillTimeMs;
441
- const decodeTokens = Math.max(0, tokensGenerated - 1);
442
- const decodeSpeed = decodeTokens > 0 ? (decodeTokens / this.#state.stats.decodeTimeMs * 1000) : 0;
443
- if (opts.benchmark) {
444
- log.info('Benchmark', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
445
- } else {
446
- log.info('Perf', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
447
- }
448
- trace.perf('Decode summary', {
449
- ttftMs: ttft,
450
- prefillMs: this.#state.stats.prefillTimeMs,
451
- decodeMs: this.#state.stats.decodeTimeMs,
452
- decodeTokens,
453
- decodeSpeed,
454
- totalMs: this.#state.stats.totalTimeMs,
455
- });
625
+ return {
626
+ tokenIds,
627
+ stats: this.#state.stats,
628
+ };
456
629
  } finally {
457
630
  this._closeFinitenessFallbackWindow(opts);
458
631
  resetActiveExecutionPlan(this.#state);
@@ -463,14 +636,13 @@ export class PipelineGenerator {
463
636
 
464
637
  async prefillKVOnly(prompt, options = {}) {
465
638
  if (!this.#state.isLoaded) throw new Error('Model not loaded');
466
- resetActiveExecutionPlan(this.#state);
639
+ if (this.#state.isGenerating && options.__internalGenerate !== true) {
640
+ throw new Error('Generation already in progress');
641
+ }
642
+ this._resetDecodeRuntimeState();
467
643
  this.#state.stats.gpuTimePrefillMs = undefined;
468
644
  const opts = resolvePrefillOptions(this.#state, options);
469
-
470
- const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillKVOnly');
471
-
472
- const inputIds = this.#state.tokenizer.encode(processedPrompt);
473
- this._assertTokenIdsInRange(inputIds, 'prefillKVOnly.encode');
645
+ const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, 'prefillKVOnly');
474
646
  if (opts.debug) {
475
647
  log.debug('Pipeline', `PrefillKVOnly: ${inputIds.length} tokens`);
476
648
  }
@@ -479,7 +651,7 @@ export class PipelineGenerator {
479
651
  try {
480
652
  prefillResult = await this._prefillToHidden(inputIds, opts);
481
653
  } catch (error) {
482
- if (error.name === 'FinitenessError') {
654
+ if (shouldRetryWithFinitenessFallback(error)) {
483
655
  log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefillKVOnly. Retrying with F32 precision.`);
484
656
  prefillResult = await this._retryWithFinitenessFallback(
485
657
  opts,
@@ -528,14 +700,13 @@ export class PipelineGenerator {
528
700
 
529
701
  async prefillWithEmbedding(prompt, options = {}) {
530
702
  if (!this.#state.isLoaded) throw new Error('Model not loaded');
531
- resetActiveExecutionPlan(this.#state);
703
+ if (this.#state.isGenerating && options.__internalGenerate !== true) {
704
+ throw new Error('Generation already in progress');
705
+ }
706
+ this._resetDecodeRuntimeState();
532
707
  this.#state.stats.gpuTimePrefillMs = undefined;
533
708
  const opts = resolvePrefillEmbeddingOptions(this.#state, options);
534
-
535
- const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillWithEmbedding');
536
-
537
- const inputIds = this.#state.tokenizer.encode(processedPrompt);
538
- this._assertTokenIdsInRange(inputIds, 'prefillWithEmbedding.encode');
709
+ const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, 'prefillWithEmbedding');
539
710
  if (opts.debug) {
540
711
  log.debug('Pipeline', `PrefillWithEmbedding: ${inputIds.length} tokens (mode=${opts.embeddingMode})`);
541
712
  }
@@ -544,7 +715,7 @@ export class PipelineGenerator {
544
715
  try {
545
716
  prefillResult = await this._prefillToHidden(inputIds, opts);
546
717
  } catch (error) {
547
- if (error.name === 'FinitenessError') {
718
+ if (shouldRetryWithFinitenessFallback(error)) {
548
719
  log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefillWithEmbedding. Retrying with F32 precision.`);
549
720
  prefillResult = await this._retryWithFinitenessFallback(
550
721
  opts,
@@ -623,19 +794,13 @@ export class PipelineGenerator {
623
794
 
624
795
  async prefillWithLogits(prompt, options = {}) {
625
796
  if (!this.#state.isLoaded) throw new Error('Model not loaded');
626
- resetActiveExecutionPlan(this.#state);
797
+ if (this.#state.isGenerating && options.__internalGenerate !== true) {
798
+ throw new Error('Generation already in progress');
799
+ }
800
+ this._resetDecodeRuntimeState();
627
801
  this.#state.stats.gpuTimePrefillMs = undefined;
628
802
  const opts = resolvePrefillOptions(this.#state, options);
629
-
630
- const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillWithLogits');
631
-
632
- const inputIds = this.#state.tokenizer.encode(processedPrompt);
633
- this._assertTokenIdsInRange(inputIds, 'prefillWithLogits.encode');
634
- if (opts.debug) {
635
- log.debug('Pipeline', `PrefillWithLogits: ${inputIds.length} tokens`);
636
- }
637
-
638
- const logits = await this._prefill(inputIds, opts);
803
+ const { inputIds, logits } = await this._prefillPromptToLogits(prompt, opts, 'prefillWithLogits');
639
804
 
640
805
  const snapshot = this.#state.kvCache?.clone();
641
806
  if (!snapshot) {
@@ -757,6 +922,7 @@ export class PipelineGenerator {
757
922
  stopSequenceStart,
758
923
  decodeToken,
759
924
  logBatchPath = false,
925
+ emitMode = 'text',
760
926
  } = runtime;
761
927
 
762
928
  let tokensGenerated = 1;
@@ -786,6 +952,9 @@ export class PipelineGenerator {
786
952
  }
787
953
  const readbackInterval = executionPlan.readbackInterval;
788
954
  const intervalBatches = readbackInterval == null ? 1 : readbackInterval;
955
+ const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
956
+
957
+ const decodeSingleTokenViaLogits = async () => this._decodeNextTokenViaLogits(generatedIds, opts);
789
958
 
790
959
  if (logBatchPath && useBatchPath) {
791
960
  log.debug(
@@ -811,10 +980,16 @@ export class PipelineGenerator {
811
980
  for (const tokenId of batchResult.tokens) {
812
981
  generatedIds.push(tokenId);
813
982
  tokensGenerated++;
814
- const tokenText = decodeToken(tokenId);
815
- yield tokenText;
816
- if (options.onToken) options.onToken(tokenId, tokenText);
817
- batchTokens.push({ id: tokenId, text: tokenText });
983
+ if (emitMode === 'token') {
984
+ yield tokenId;
985
+ if (options.onToken) options.onToken(tokenId, '');
986
+ batchTokens.push({ id: tokenId, text: '' });
987
+ } else {
988
+ const tokenText = decodeToken(tokenId);
989
+ yield tokenText;
990
+ if (options.onToken) options.onToken(tokenId, tokenText);
991
+ batchTokens.push({ id: tokenId, text: tokenText });
992
+ }
818
993
  if (batchTokens.length === executionPlan.batchSize) {
819
994
  if (options.onBatch) options.onBatch(batchTokens);
820
995
  batchTokens = [];
@@ -831,9 +1006,9 @@ export class PipelineGenerator {
831
1006
  useBatchPath = false;
832
1007
  let nextToken;
833
1008
  try {
834
- nextToken = await this._decodeStep(generatedIds, opts);
1009
+ nextToken = await decodeSingleTokenViaLogits();
835
1010
  } catch (singleTokenError) {
836
- if (singleTokenError.name === 'FinitenessError') {
1011
+ if (shouldRetryWithFinitenessFallback(singleTokenError)) {
837
1012
  log.warn('Pipeline', `FinitenessGuard caught NaN/Inf at batch step ${tokensGenerated}. Truncating KV cache and retrying token with F32 precision.`);
838
1013
  nextToken = await this._retryDecodeStepWithFinitenessWindow(
839
1014
  generatedIds,
@@ -846,9 +1021,14 @@ export class PipelineGenerator {
846
1021
  }
847
1022
  generatedIds.push(nextToken);
848
1023
  tokensGenerated++;
849
- const tokenText = decodeToken(nextToken);
850
- yield tokenText;
851
- if (options.onToken) options.onToken(nextToken, tokenText);
1024
+ if (emitMode === 'token') {
1025
+ yield nextToken;
1026
+ if (options.onToken) options.onToken(nextToken, '');
1027
+ } else {
1028
+ const tokenText = decodeToken(nextToken);
1029
+ yield tokenText;
1030
+ if (options.onToken) options.onToken(nextToken, tokenText);
1031
+ }
852
1032
  this._consumeFinitenessFallbackToken(opts);
853
1033
  if (isStopToken(nextToken, stopTokenIds, eosToken)) break;
854
1034
  }
@@ -856,9 +1036,9 @@ export class PipelineGenerator {
856
1036
  const tokenStart = performance.now();
857
1037
  let nextToken;
858
1038
  try {
859
- nextToken = await this._decodeStep(generatedIds, opts);
1039
+ nextToken = await decodeSingleTokenViaLogits();
860
1040
  } catch (error) {
861
- if (error.name === 'FinitenessError') {
1041
+ if (shouldRetryWithFinitenessFallback(error)) {
862
1042
  log.warn('Pipeline', `FinitenessGuard caught NaN/Inf at step ${tokensGenerated}. Truncating KV cache and retrying token with F32 precision.`);
863
1043
  nextToken = await this._retryDecodeStepWithFinitenessWindow(
864
1044
  generatedIds,
@@ -872,9 +1052,14 @@ export class PipelineGenerator {
872
1052
  const tokenTime = performance.now() - tokenStart;
873
1053
  generatedIds.push(nextToken);
874
1054
  tokensGenerated++;
875
- const tokenText = decodeToken(nextToken);
876
- yield tokenText;
877
- if (options.onToken) options.onToken(nextToken, tokenText);
1055
+ const tokenText = emitMode === 'token' ? '' : decodeToken(nextToken);
1056
+ if (emitMode === 'token') {
1057
+ yield nextToken;
1058
+ if (options.onToken) options.onToken(nextToken, '');
1059
+ } else {
1060
+ yield tokenText;
1061
+ if (options.onToken) options.onToken(nextToken, tokenText);
1062
+ }
878
1063
  this._consumeFinitenessFallbackToken(opts);
879
1064
 
880
1065
  if (opts.debug || opts.benchmark) {
@@ -912,17 +1097,22 @@ export class PipelineGenerator {
912
1097
  if (startPos === 0 && hasLinearAttentionLayers(config.layerTypes)) {
913
1098
  this.#state.linearAttentionRuntime = resetLinearAttentionRuntime(this.#state.linearAttentionRuntime);
914
1099
  }
1100
+ if (startPos === 0) {
1101
+ for (const [, convState] of this.#state.convLayerStates) {
1102
+ if (convState.convStateGPU && convState.hiddenSize && convState.kernelSize) {
1103
+ uploadData(convState.convStateGPU, new Float32Array(convState.hiddenSize * (convState.kernelSize - 1)));
1104
+ }
1105
+ }
1106
+ }
915
1107
 
916
1108
  const embedBufferRaw = this.#state.weights.get('embed');
917
1109
  if (!(embedBufferRaw instanceof GPUBuffer) && !isWeightBuffer(embedBufferRaw) && !isCpuWeightBuffer(embedBufferRaw) && !(embedBufferRaw instanceof Float32Array)) {
918
1110
  throw new Error('Embed buffer not found or not a supported buffer type');
919
1111
  }
920
1112
  const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
921
- const embedDtype = isWeightBuffer(embedBufferRaw)
922
- ? getWeightDtype(embedBufferRaw)
923
- : isCpuWeightBuffer(embedBufferRaw)
924
- ? embedBufferRaw.dtype
925
- : null;
1113
+ const embedDtype = isCpuWeightBuffer(embedBufferRaw)
1114
+ ? embedBufferRaw.dtype
1115
+ : getWeightDtype(embedBufferRaw);
926
1116
  if (opts.debug) {
927
1117
  const embedSize = embedBuffer instanceof GPUBuffer ? embedBuffer.size : 'N/A';
928
1118
  log.debug('Pipeline', `Embed buffer: type=${embedBuffer?.constructor?.name}, size=${embedSize}, dtype=${embedDtype}`);
@@ -1263,18 +1453,15 @@ export class PipelineGenerator {
1263
1453
 
1264
1454
  async decodeStepLogits(currentIds, options = {}) {
1265
1455
  if (!this.#state.isLoaded) throw new Error('Model not loaded');
1266
- if (this.#state.isGenerating) throw new Error('Generation already in progress');
1456
+ if (this.#state.isGenerating && options.__internalGenerate !== true) {
1457
+ throw new Error('Generation already in progress');
1458
+ }
1267
1459
  resetActiveExecutionPlan(this.#state);
1268
1460
 
1269
1461
  validateCallTimeOptions(options);
1270
1462
 
1271
1463
  const opts = this._resolveStepOptions(options);
1272
- const debugCheckBuffer = this.#state.debug
1273
- ? (buffer, label, numTokens, expectedDim) =>
1274
- debugCheckBufferHelper(this.#state, buffer, label, numTokens, expectedDim)
1275
- : undefined;
1276
-
1277
- return decodeStepLogits(this.#state, currentIds, opts, this._getDecodeHelpers(debugCheckBuffer));
1464
+ return this._decodeStepToLogits(currentIds, opts);
1278
1465
  }
1279
1466
 
1280
1467
  async advanceWithToken(tokenId, options = {}) {