npm - @simulatte/doppler - Versions diffs - 0.1.7 → 0.1.9 - Mend

@simulatte/doppler 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (172)

package/CHANGELOG.md +32 -0
package/README.md +25 -6
package/package.json +25 -38
package/src/browser/browser-converter.js +5 -0
package/src/client/doppler-api.browser.js +6 -0
package/src/client/doppler-api.d.ts +3 -0
package/src/client/doppler-api.js +11 -2
package/src/client/doppler-registry.js +3 -5
package/src/client/doppler-registry.json +2 -2
package/src/config/kernel-path-loader.d.ts +5 -0
package/src/config/kernel-path-loader.js +13 -0
package/src/config/kernels/kernel-ref-digests.js +23 -21
package/src/config/kernels/moe/mixtral.paths.json +46 -0
package/src/config/kernels/registry.json +74 -0
package/src/config/loader.js +9 -0
package/src/config/merge-contract-check.js +7 -0
package/src/config/platforms/loader.js +3 -1
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32w-f32a-online.json +56 -0
package/src/config/presets/kernel-paths/lfm2-q4k-dequant-f32a-nosubgroups.json +61 -0
package/src/config/presets/kernel-paths/registry.json +21 -0
package/src/config/presets/models/gemma2.json +2 -1
package/src/config/presets/models/gemma3.json +4 -1
package/src/config/presets/models/gemma4.json +61 -0
package/src/config/presets/models/granite-docling.json +70 -0
package/src/config/presets/models/lfm2.json +6 -1
package/src/config/presets/models/qwen3.json +4 -3
package/src/config/presets/models/qwen3_5.json +16 -0
package/src/config/presets/models/qwen3_vl.json +40 -0
package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
package/src/config/presets/runtime/model/qwen3-5-layer-probe.json +52 -0
package/src/config/presets/runtime/model/qwen3-5-linear-attn-debug.json +90 -0
package/src/config/presets/runtime/modes/trace-layers.json +1 -0
package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
package/src/config/runtime.js +3 -0
package/src/config/schema/conversion.schema.d.ts +1 -0
package/src/config/schema/debug.schema.d.ts +40 -0
package/src/config/schema/debug.schema.js +28 -0
package/src/config/schema/index.js +2 -0
package/src/config/schema/inference-defaults.schema.js +1 -1
package/src/config/schema/kernel-path.schema.d.ts +1 -0
package/src/config/schema/manifest.schema.d.ts +1 -1
package/src/config/schema/manifest.schema.js +1 -1
package/src/config/schema/memory-limits.schema.js +2 -2
package/src/config/schema/storage.schema.js +2 -2
package/src/converter/conversion-plan.js +11 -3
package/src/converter/core.js +19 -8
package/src/converter/manifest-inference.js +12 -22
package/src/converter/parsers/transformer.js +4 -0
package/src/converter/quantization-info.js +5 -1
package/src/converter/quantizer.d.ts +5 -0
package/src/converter/quantizer.js +34 -12
package/src/converter/rope-config.js +8 -6
package/src/converter/tokenizer-utils.d.ts +1 -0
package/src/converter/tokenizer-utils.js +4 -1
package/src/debug/reference/hf_qwen35_linear_attn_debug.py +268 -0
package/src/distribution/shard-delivery.js +40 -1
package/src/formats/rdrr/classification.js +32 -0
package/src/formats/rdrr/parsing.d.ts +4 -0
package/src/formats/rdrr/parsing.js +14 -1
package/src/gpu/kernel-runtime.js +4 -2
package/src/gpu/kernels/attention.js +2 -1
package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
package/src/gpu/kernels/dequant_shared.wgsl +4 -2
package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
package/src/gpu/kernels/gated-short-conv.js +284 -0
package/src/gpu/kernels/index.d.ts +8 -0
package/src/gpu/kernels/index.js +6 -0
package/src/gpu/kernels/linear-attention-core.js +37 -17
package/src/gpu/kernels/matmul-selection.js +48 -4
package/src/gpu/kernels/matmul.d.ts +5 -0
package/src/gpu/kernels/matmul.js +71 -2
package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
package/src/gpu/kernels/rmsnorm.js +9 -2
package/src/gpu/kernels/sample.js +1 -3
package/src/gpu/kernels/sample.wgsl +39 -9
package/src/gpu/kernels/sample_f16.wgsl +38 -8
package/src/gpu/kernels/shader-cache.js +9 -4
package/src/gpu/kernels/split_qg.d.ts +50 -0
package/src/gpu/kernels/split_qg.js +46 -0
package/src/gpu/kernels/split_qg.wgsl +58 -0
package/src/gpu/kernels/split_qg_f16.wgsl +62 -0
package/src/gpu/weight-buffer.d.ts +1 -1
package/src/gpu/weight-buffer.js +1 -1
package/src/inference/browser-harness.d.ts +2 -0
package/src/inference/browser-harness.js +20 -1
package/src/inference/kv-cache/base.js +3 -10
package/src/inference/pipelines/diffusion/helpers.js +3 -0
package/src/inference/pipelines/diffusion/pipeline.js +2 -1
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +10 -3
package/src/inference/pipelines/text/attention/output-projection.d.ts +12 -0
package/src/inference/pipelines/text/attention/output-projection.js +8 -0
package/src/inference/pipelines/text/attention/projections.d.ts +13 -1
package/src/inference/pipelines/text/attention/projections.js +54 -13
package/src/inference/pipelines/text/attention/record.js +16 -6
package/src/inference/pipelines/text/attention/run.js +59 -6
package/src/inference/pipelines/text/config.d.ts +1 -0
package/src/inference/pipelines/text/config.js +46 -4
package/src/inference/pipelines/text/embed.js +26 -7
package/src/inference/pipelines/text/execution-plan.js +5 -4
package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
package/src/inference/pipelines/text/execution-v0.js +12 -1
package/src/inference/pipelines/text/generator-helpers.js +1 -0
package/src/inference/pipelines/text/generator-runtime.js +19 -0
package/src/inference/pipelines/text/generator-steps.d.ts +15 -0
package/src/inference/pipelines/text/generator-steps.js +71 -26
package/src/inference/pipelines/text/generator.d.ts +5 -0
package/src/inference/pipelines/text/generator.js +353 -166
package/src/inference/pipelines/text/init.d.ts +15 -0
package/src/inference/pipelines/text/init.js +35 -10
package/src/inference/pipelines/text/layer.js +38 -8
package/src/inference/pipelines/text/linear-attention.d.ts +5 -0
package/src/inference/pipelines/text/linear-attention.js +33 -3
package/src/inference/pipelines/text/logits/gpu.js +2 -2
package/src/inference/pipelines/text/logits/index.d.ts +6 -1
package/src/inference/pipelines/text/logits/index.js +3 -1
package/src/inference/pipelines/text/model-load.js +3 -0
package/src/inference/pipelines/text/moe-gpu.js +21 -3
package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
package/src/inference/pipelines/text/ops.js +123 -53
package/src/inference/pipelines/text/probes.js +1 -0
package/src/inference/pipelines/text/sampling.js +52 -6
package/src/inference/pipelines/text/state.js +2 -0
package/src/inference/pipelines/text.d.ts +5 -0
package/src/inference/pipelines/text.js +59 -1
package/src/inference/pipelines/vision/encoder.js +386 -0
package/src/inference/pipelines/vision/image-preprocess.js +151 -0
package/src/inference/pipelines/vision/index.js +173 -0
package/src/inference/pipelines/vision/ops.js +78 -0
package/src/inference/pipelines/vision/patch-embed.js +151 -0
package/src/inference/test-harness.js +11 -9
package/src/loader/doppler-loader.d.ts +3 -0
package/src/loader/doppler-loader.js +20 -3
package/src/loader/experts/expert-cache.js +6 -2
package/src/loader/experts/expert-loader.js +6 -2
package/src/loader/final-weights-loader.js +2 -0
package/src/loader/layer-loader.js +42 -3
package/src/loader/manifest-config.js +3 -1
package/src/loader/shard-cache.js +3 -2
package/src/loader/tensors/tensor-loader.d.ts +3 -0
package/src/loader/tensors/tensor-loader.js +130 -4
package/src/rules/inference/dtype.rules.json +5 -0
package/src/rules/inference/kernel-path.rules.json +2 -2
package/src/rules/kernels/moe.rules.mixtral.json +75 -0
package/src/rules/kernels/softmax.rules.json +2 -0
package/src/rules/kernels/split-qg.rules.json +6 -0
package/src/rules/rule-registry.d.ts +1 -0
package/src/rules/rule-registry.js +4 -0
package/src/storage/downloader.js +2 -1
package/src/storage/quickstart-downloader.d.ts +3 -0
package/src/storage/quickstart-downloader.js +27 -30
package/src/storage/shard-manager.js +4 -3
package/src/tooling/conversion-config-materializer.js +3 -5
package/src/tooling/node-converter.js +28 -7
package/src/tooling/node-source-runtime.js +65 -5
package/src/tooling/node-webgpu.js +24 -7
package/src/types/model.d.ts +5 -0
package/src/utils/hf-resolve-url.d.ts +16 -0
package/src/utils/hf-resolve-url.js +17 -0
package/src/version.js +1 -1
package/tools/doppler-cli.js +6 -1
package/src/tooling/node-convert.d.ts +0 -54

package/src/inference/pipelines/text/generator.js CHANGED

@@ -1,7 +1,7 @@
 import { getDevice, setTrackSubmits } from '../../../gpu/device.js';
-import { releaseBuffer, readBuffer, readBufferSlice } from '../../../memory/buffer-pool.js';
+import { releaseBuffer, readBuffer, readBufferSlice, uploadData } from '../../../memory/buffer-pool.js';
 import { isGPUSamplingAvailable } from '../../../gpu/kernels/sample.js';
 import { markWarmed as markKernelCacheWarmed } from '../../../gpu/kernel-selection-cache.js';
 import { resetSubmitStats, logSubmitStats } from '../../../gpu/submit-tracker.js';
@@ -122,6 +122,20 @@ function resolveTokenText(tokenizer, tokenIds, fallbackText = '?', renderTokenTe
   return fallbackText;
 }
+export function shouldRetryWithFinitenessFallback(error) {
+  if (error?.name === 'FinitenessError') {
+    return true;
+  }
+  const message = typeof error?.message === 'string'
+    ? error.message
+    : (typeof error === 'string' ? error : '');
+  if (!message.startsWith('[Sampling]')) {
+    return false;
+  }
+  return message.includes('no finite candidate logits after masking the pad token')
+    || message.includes('Softmax produced no finite candidate probabilities');
+}
 export class PipelineGenerator {
   #state;
@@ -196,6 +210,14 @@ export class PipelineGenerator {
     return resolveStepOptions(this.#state, options);
   }
+  _resetDecodeRuntimeState() {
+    this.#state.decodeStepCount = 0;
+    this.#state.disableRecordedLogits = false;
+    this.#state.disableFusedDecode = false;
+    resetActiveExecutionPlan(this.#state);
+    this.#state.decodeRing?.reset();
+  }
   _getDecodeHelpers(debugCheckBuffer) {
     return {
       buildLayerContext: (recorder, isDecodeMode, debugLayers, executionPlan) =>
@@ -221,6 +243,209 @@ export class PipelineGenerator {
     );
   }
+  _resolvePromptTokenIds(prompt, useChatTemplate, contextLabel) {
+    const processedPrompt = resolvePromptInput(this.#state, prompt, useChatTemplate, contextLabel);
+    const inputIds = this.#state.tokenizer.encode(processedPrompt);
+    this._assertTokenIdsInRange(inputIds, `${contextLabel}.encode`);
+    return inputIds;
+  }
+  _sampleNextTokenFromLogits(logits, generatedIds, opts) {
+    const sampledLogits = Float32Array.from(logits);
+    applyRepetitionPenalty(sampledLogits, generatedIds, opts.repetitionPenalty);
+    const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
+    return sample(sampledLogits, {
+      temperature: opts.temperature,
+      topP: opts.topP,
+      topK: opts.topK,
+      padTokenId,
+      seed: opts.seed,
+    });
+  }
+  async _prefillPromptToLogits(prompt, opts, contextLabel) {
+    const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, contextLabel);
+    if (opts.debug) {
+      log.debug('Pipeline', `${contextLabel}: ${inputIds.length} tokens`);
+    }
+    let logits;
+    try {
+      logits = await this._prefill(inputIds, opts);
+    } catch (error) {
+      if (!shouldRetryWithFinitenessFallback(error)) {
+        throw error;
+      }
+      log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during ${contextLabel}. Retrying with F32 precision.`);
+      logits = await this._retryWithFinitenessFallback(
+        opts,
+        contextLabel,
+        () => this._prefill(inputIds, opts)
+      );
+    }
+    return { inputIds, logits };
+  }
+  async _decodeStepToLogits(currentIds, opts) {
+    const debugCheckBuffer = this.#state.debug
+      ? (buffer, label, numTokens, expectedDim) =>
+        debugCheckBufferHelper(this.#state, buffer, label, numTokens, expectedDim)
+      : undefined;
+    return decodeStepLogits(this.#state, currentIds, opts, this._getDecodeHelpers(debugCheckBuffer));
+  }
+  async _decodeNextTokenViaLogits(currentIds, opts) {
+    const stepResult = await this._decodeStepToLogits(currentIds, opts);
+    return this._sampleNextTokenFromLogits(stepResult.logits, currentIds, opts);
+  }
+  async *_generateTokensInternal(prompt, options = {}, mode = 'text') {
+    if (!this.#state.isLoaded) throw new Error('Model not loaded');
+    if (this.#state.isGenerating) throw new Error('Generation already in progress');
+    validateCallTimeOptions(options);
+    this.#state.isGenerating = true;
+    this._resetDecodeRuntimeState();
+    this.#state.stats.gpuTimePrefillMs = undefined;
+    this.#state.stats.gpuTimeDecodeMs = undefined;
+    this.#state.stats.decodeRecordMs = 0;
+    this.#state.stats.decodeSubmitWaitMs = 0;
+    this.#state.stats.decodeReadbackWaitMs = 0;
+    this.#state.stats.ttftMs = 0;
+    const startTime = performance.now();
+    const opts = resolveGenerateOptions(this.#state, options);
+    if (opts.debug) {
+      log.debug('Pipeline', `ChatTemplate: options=${options.useChatTemplate}, final=${opts.useChatTemplate}`);
+    }
+    const emitToken = async function* (generator, tokenId, textDecoder) {
+      if (mode === 'token') {
+        yield tokenId;
+        if (options.onToken) options.onToken(tokenId, '');
+        return;
+      }
+      const tokenText = textDecoder(tokenId);
+      yield tokenText;
+      if (options.onToken) options.onToken(tokenId, tokenText);
+    };
+    try {
+      const prefillStart = performance.now();
+      const { inputIds, logits: initialPrefillLogits } = await this._prefillPromptToLogits(prompt, opts, 'generate');
+      let prefillLogits = initialPrefillLogits;
+      this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
+      this._assertTokenIdsInRange(inputIds, 'generate.prefillTokens');
+      const generatedIds = [...inputIds];
+      this.#state.stats.prefillTokens = inputIds.length;
+      if (opts.debug) {
+        log.debug('Pipeline', `Input: ${inputIds.length} tokens`);
+      }
+      const intentBundleConfig = this.#state.runtimeConfig.shared.intentBundle;
+      const intentBundle = intentBundleConfig?.bundle;
+      const expectedTopK = intentBundle?.payload?.expectedTopK
+        ?? intentBundle?.payload?.expected_top_k;
+      const maxDriftThreshold = intentBundle?.constraints?.maxDriftThreshold
+        ?? intentBundle?.constraints?.max_drift_threshold;
+      if (intentBundleConfig?.enabled && Array.isArray(expectedTopK) && expectedTopK.length > 0) {
+        const actualTopK = getTopK(
+          prefillLogits,
+          expectedTopK.length,
+          (tokens) => resolveTokenText(this.#state.tokenizer, tokens),
+        ).map((token) => token.token);
+        const driftResult = enforceLogitDrift(expectedTopK, actualTopK, maxDriftThreshold);
+        if (!driftResult.ok) {
+          throw new Error(`Intent bundle drift check failed: ${driftResult.reason}`);
+        }
+      }
+      if (opts.debug) {
+        const topAfterPenalty = getTopK(
+          Float32Array.from(prefillLogits),
+          5,
+          (tokens) => resolveTokenText(this.#state.tokenizer, tokens)
+        );
+        log.debug('Pipeline', `After rep penalty top-5: ${topAfterPenalty.map(t => `"${t.text}"(${(t.prob * 100).toFixed(1)}%)`).join(', ')}`);
+      }
+      let firstToken;
+      try {
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
+      } catch (error) {
+        if (!shouldRetryWithFinitenessFallback(error)) {
+          throw error;
+        }
+        log.warn('Pipeline', 'FinitenessGuard caught non-finite prefill logits at sampling. Retrying with F32 precision.');
+        prefillLogits = await this._retryWithFinitenessFallback(
+          opts,
+          'prefill-sample',
+          () => this._prefill(inputIds, opts)
+        );
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
+      }
+      if (opts.debug) {
+        const firstTokenText = resolveTokenText(this.#state.tokenizer, [firstToken], `[${firstToken}]`, (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false));
+        log.debug('Pipeline', `First token sampled: id=${firstToken} text="${firstTokenText}"`);
+      }
+      generatedIds.push(firstToken);
+      this.#state.stats.ttftMs = performance.now() - startTime;
+      const decodeToken = (tokenId) => resolveTokenText(
+        this.#state.tokenizer,
+        [tokenId],
+        `[${tokenId}]`,
+        (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false),
+        (tokens) => this.#state.tokenizer?.decode?.(tokens, false, false)
+      );
+      yield* emitToken(this, firstToken, decodeToken);
+      yield* this._runDecodeLoop(generatedIds, opts, options, {
+        stopTokenIds: this.#state.modelConfig.stopTokenIds,
+        eosToken: this.#state.tokenizer.getSpecialTokens?.()?.eos,
+        stopSequenceStart: inputIds.length,
+        decodeToken,
+        logBatchPath: opts.debug,
+        emitMode: mode,
+      });
+      const tokensGenerated = this.#state.stats.decodeTokens;
+      this.#state.stats.totalTimeMs = performance.now() - startTime;
+      if (opts.debug) {
+        log.debug('Pipeline', `Generated ${tokensGenerated} tokens in ${this.#state.stats.totalTimeMs.toFixed(0)}ms`);
+      }
+      const ttft = this.#state.stats.ttftMs ?? this.#state.stats.prefillTimeMs;
+      const decodeTokens = Math.max(0, tokensGenerated - 1);
+      const decodeSpeed = decodeTokens > 0 ? (decodeTokens / this.#state.stats.decodeTimeMs * 1000) : 0;
+      if (opts.benchmark) {
+        log.info('Benchmark', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
+      } else {
+        log.info('Perf', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
+      }
+      trace.perf('Decode summary', {
+        ttftMs: ttft,
+        prefillMs: this.#state.stats.prefillTimeMs,
+        decodeMs: this.#state.stats.decodeTimeMs,
+        decodeTokens,
+        decodeSpeed,
+        totalMs: this.#state.stats.totalTimeMs,
+      });
+    } finally {
+      this._closeFinitenessFallbackWindow(opts);
+      resetActiveExecutionPlan(this.#state);
+      this.#state.isGenerating = false;
+    }
+  }
   _beginFinitenessFallback(opts, reasonLabel) {
     const originalPlan = resolveActiveExecutionPlan(this.#state);
     const original = {
@@ -306,17 +531,21 @@ export class PipelineGenerator {
   async *generate(prompt, options = {}) {
+    yield* this._generateTokensInternal(prompt, options, 'text');
+  }
+  async *generateTokens(prompt, options = {}) {
+    yield* this._generateTokensInternal(prompt, options, 'token');
+  }
+  async generateTokenIds(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
     if (this.#state.isGenerating) throw new Error('Generation already in progress');
     validateCallTimeOptions(options);
     this.#state.isGenerating = true;
-    this.#state.decodeStepCount = 0;
-    this.#state.disableRecordedLogits = false;
-    this.#state.disableFusedDecode = false;
-    resetActiveExecutionPlan(this.#state);
-    this.#state.decodeRing?.reset();
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     this.#state.stats.gpuTimeDecodeMs = undefined;
     this.#state.stats.decodeRecordMs = 0;
@@ -324,135 +553,79 @@ export class PipelineGenerator {
     this.#state.stats.decodeReadbackWaitMs = 0;
     this.#state.stats.ttftMs = 0;
     const startTime = performance.now();
     const opts = resolveGenerateOptions(this.#state, options);
-    if (opts.debug) {
-      log.debug('Pipeline', `ChatTemplate: options=${options.useChatTemplate}, final=${opts.useChatTemplate}`);
-    }
     try {
-      const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'generate');
-      if (opts.debug && opts.useChatTemplate) {
-        log.debug('Pipeline', `Applied ${this.#state.modelConfig.chatTemplateType} chat template`);
-      }
-      const inputIds = this.#state.tokenizer.encode(processedPrompt);
-      this._assertTokenIdsInRange(inputIds, 'generate.encode');
+      const prefillStart = performance.now();
+      const { inputIds, logits: initialPrefillLogits } = await this._prefillPromptToLogits(prompt, opts, 'generateTokenIds');
+      let prefillLogits = initialPrefillLogits;
+      this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
+      this._assertTokenIdsInRange(inputIds, 'generateTokenIds.prefillTokens');
       const generatedIds = [...inputIds];
       this.#state.stats.prefillTokens = inputIds.length;
-      if (opts.debug) {
-        log.debug('Pipeline', `Input: ${inputIds.length} tokens`);
-      }
-      const prefillStart = performance.now();
-      let prefillLogits;
+      let firstToken;
       try {
-        prefillLogits = await this._prefill(inputIds, opts);
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
       } catch (error) {
-        if (error.name === 'FinitenessError') {
-          log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefill. Retrying with F32 precision.`);
-          prefillLogits = await this._retryWithFinitenessFallback(
-            opts,
-            'prefill',
-            () => this._prefill(inputIds, opts)
-          );
-        } else {
+        if (!shouldRetryWithFinitenessFallback(error)) {
           throw error;
         }
-      }
-      this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
-      const intentBundleConfig = this.#state.runtimeConfig.shared.intentBundle;
-      const intentBundle = intentBundleConfig?.bundle;
-      const expectedTopK = intentBundle?.payload?.expectedTopK
-        ?? intentBundle?.payload?.expected_top_k;
-      const maxDriftThreshold = intentBundle?.constraints?.maxDriftThreshold
-        ?? intentBundle?.constraints?.max_drift_threshold;
-      if (intentBundleConfig?.enabled && Array.isArray(expectedTopK) && expectedTopK.length > 0) {
-        const actualTopK = getTopK(
-          prefillLogits,
-          expectedTopK.length,
-        (tokens) => resolveTokenText(this.#state.tokenizer, tokens),
-      ).map((token) => token.token);
-        const driftResult = enforceLogitDrift(expectedTopK, actualTopK, maxDriftThreshold);
-        if (!driftResult.ok) {
-          throw new Error(`Intent bundle drift check failed: ${driftResult.reason}`);
-        }
-      }
-      applyRepetitionPenalty(prefillLogits, generatedIds, opts.repetitionPenalty);
-      const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
-      if (opts.debug) {
-        const topAfterPenalty = getTopK(
-          prefillLogits,
-          5,
-          (tokens) => resolveTokenText(this.#state.tokenizer, tokens)
+        prefillLogits = await this._retryWithFinitenessFallback(
+          opts,
+          'prefill-sample',
+          () => this._prefill(inputIds, opts)
         );
-        log.debug('Pipeline', `After rep penalty top-5: ${topAfterPenalty.map(t => `"${t.text}"(${(t.prob * 100).toFixed(1)}%)`).join(', ')}`);
-      }
-      const firstToken = sample(prefillLogits, {
-        temperature: opts.temperature,
-        topP: opts.topP,
-        topK: opts.topK,
-        padTokenId,
-        seed: opts.seed,
-      });
-      if (opts.debug) {
-        const firstTokenText = resolveTokenText(this.#state.tokenizer, [firstToken], `[${firstToken}]`, (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false));
-        log.debug('Pipeline', `First token sampled: id=${firstToken} text="${firstTokenText}"`);
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
       }
       generatedIds.push(firstToken);
+      const tokenIds = [firstToken];
       this.#state.stats.ttftMs = performance.now() - startTime;
-      const decodeToken = (tokenId) => resolveTokenText(
-        this.#state.tokenizer,
-        [tokenId],
-        `[${tokenId}]`,
-        (tokens) => this.#state.tokenizer?.decode?.(tokens, true, false),
-        (tokens) => this.#state.tokenizer?.decode?.(tokens, false, false)
-      );
+      const stopTokenIds = this.#state.modelConfig.stopTokenIds;
+      const eosToken = this.#state.tokenizer.getSpecialTokens?.()?.eos;
+      const stopSequenceStart = inputIds.length;
+      markKernelCacheWarmed();
+      const decodeStart = performance.now();
-      const firstText = decodeToken(firstToken);
-      yield firstText;
-      if (options.onToken) options.onToken(firstToken, firstText);
+      while (tokenIds.length < opts.maxTokens) {
+        if (options.signal?.aborted) break;
+        let nextToken;
+        try {
+          nextToken = await this._decodeNextTokenViaLogits(generatedIds, opts);
+        } catch (error) {
+          if (shouldRetryWithFinitenessFallback(error)) {
+            nextToken = await this._retryDecodeStepWithFinitenessWindow(
+              generatedIds,
+              opts,
+              `decode-step-${tokenIds.length}`
+            );
+          } else {
+            throw error;
+          }
+        }
+        generatedIds.push(nextToken);
+        tokenIds.push(nextToken);
+        this._consumeFinitenessFallbackToken(opts);
+        if (isStopToken(nextToken, stopTokenIds, eosToken)) {
+          break;
+        }
+        if (opts.stopSequences.length > 0) {
+          const fullText = this.#state.tokenizer.decode(generatedIds.slice(stopSequenceStart), false);
+          if (opts.stopSequences.some((seq) => fullText.endsWith(seq))) break;
+        }
+      }
-      yield* this._runDecodeLoop(generatedIds, opts, options, {
-        stopTokenIds: this.#state.modelConfig.stopTokenIds,
-        eosToken: this.#state.tokenizer.getSpecialTokens?.()?.eos,
-        stopSequenceStart: inputIds.length,
-        decodeToken,
-        logBatchPath: opts.debug,
-      });
-      const tokensGenerated = this.#state.stats.decodeTokens;
+      this.#state.stats.decodeTimeMs = performance.now() - decodeStart;
+      this.#state.stats.tokensGenerated = tokenIds.length;
+      this.#state.stats.decodeTokens = tokenIds.length;
       this.#state.stats.totalTimeMs = performance.now() - startTime;
-      if (opts.debug) {
-        log.debug('Pipeline', `Generated ${tokensGenerated} tokens in ${this.#state.stats.totalTimeMs.toFixed(0)}ms`);
-      }
-      const ttft = this.#state.stats.ttftMs ?? this.#state.stats.prefillTimeMs;
-      const decodeTokens = Math.max(0, tokensGenerated - 1);
-      const decodeSpeed = decodeTokens > 0 ? (decodeTokens / this.#state.stats.decodeTimeMs * 1000) : 0;
-      if (opts.benchmark) {
-        log.info('Benchmark', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
-      } else {
-        log.info('Perf', `TTFT: ${ttft.toFixed(0)}ms | Prefill: ${this.#state.stats.prefillTimeMs.toFixed(0)}ms | Decode: ${this.#state.stats.decodeTimeMs.toFixed(0)}ms (${decodeTokens} tokens @ ${decodeSpeed.toFixed(1)} tok/s)`);
-      }
-      trace.perf('Decode summary', {
-        ttftMs: ttft,
-        prefillMs: this.#state.stats.prefillTimeMs,
-        decodeMs: this.#state.stats.decodeTimeMs,
-        decodeTokens,
-        decodeSpeed,
-        totalMs: this.#state.stats.totalTimeMs,
-      });
+      return {
+        tokenIds,
+        stats: this.#state.stats,
+      };
     } finally {
       this._closeFinitenessFallbackWindow(opts);
       resetActiveExecutionPlan(this.#state);
@@ -463,14 +636,13 @@ export class PipelineGenerator {
   async prefillKVOnly(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    resetActiveExecutionPlan(this.#state);
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     const opts = resolvePrefillOptions(this.#state, options);
-    const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillKVOnly');
-    const inputIds = this.#state.tokenizer.encode(processedPrompt);
-    this._assertTokenIdsInRange(inputIds, 'prefillKVOnly.encode');
+    const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, 'prefillKVOnly');
     if (opts.debug) {
       log.debug('Pipeline', `PrefillKVOnly: ${inputIds.length} tokens`);
     }
@@ -479,7 +651,7 @@ export class PipelineGenerator {
     try {
       prefillResult = await this._prefillToHidden(inputIds, opts);
     } catch (error) {
-      if (error.name === 'FinitenessError') {
+      if (shouldRetryWithFinitenessFallback(error)) {
         log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefillKVOnly. Retrying with F32 precision.`);
         prefillResult = await this._retryWithFinitenessFallback(
           opts,
@@ -528,14 +700,13 @@ export class PipelineGenerator {
   async prefillWithEmbedding(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    resetActiveExecutionPlan(this.#state);
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     const opts = resolvePrefillEmbeddingOptions(this.#state, options);
-    const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillWithEmbedding');
-    const inputIds = this.#state.tokenizer.encode(processedPrompt);
-    this._assertTokenIdsInRange(inputIds, 'prefillWithEmbedding.encode');
+    const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, 'prefillWithEmbedding');
     if (opts.debug) {
       log.debug('Pipeline', `PrefillWithEmbedding: ${inputIds.length} tokens (mode=${opts.embeddingMode})`);
     }
@@ -544,7 +715,7 @@ export class PipelineGenerator {
     try {
       prefillResult = await this._prefillToHidden(inputIds, opts);
     } catch (error) {
-      if (error.name === 'FinitenessError') {
+      if (shouldRetryWithFinitenessFallback(error)) {
         log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefillWithEmbedding. Retrying with F32 precision.`);
         prefillResult = await this._retryWithFinitenessFallback(
           opts,
@@ -623,19 +794,13 @@ export class PipelineGenerator {
   async prefillWithLogits(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    resetActiveExecutionPlan(this.#state);
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     const opts = resolvePrefillOptions(this.#state, options);
-    const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillWithLogits');
-    const inputIds = this.#state.tokenizer.encode(processedPrompt);
-    this._assertTokenIdsInRange(inputIds, 'prefillWithLogits.encode');
-    if (opts.debug) {
-      log.debug('Pipeline', `PrefillWithLogits: ${inputIds.length} tokens`);
-    }
-    const logits = await this._prefill(inputIds, opts);
+    const { inputIds, logits } = await this._prefillPromptToLogits(prompt, opts, 'prefillWithLogits');
     const snapshot = this.#state.kvCache?.clone();
     if (!snapshot) {
@@ -757,6 +922,7 @@ export class PipelineGenerator {
       stopSequenceStart,
       decodeToken,
       logBatchPath = false,
+      emitMode = 'text',
     } = runtime;
     let tokensGenerated = 1;
@@ -786,6 +952,9 @@ export class PipelineGenerator {
     }
     const readbackInterval = executionPlan.readbackInterval;
     const intervalBatches = readbackInterval == null ? 1 : readbackInterval;
+    const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
+    const decodeSingleTokenViaLogits = async () => this._decodeNextTokenViaLogits(generatedIds, opts);
     if (logBatchPath && useBatchPath) {
       log.debug(
@@ -811,10 +980,16 @@ export class PipelineGenerator {
           for (const tokenId of batchResult.tokens) {
             generatedIds.push(tokenId);
             tokensGenerated++;
-            const tokenText = decodeToken(tokenId);
-            yield tokenText;
-            if (options.onToken) options.onToken(tokenId, tokenText);
-            batchTokens.push({ id: tokenId, text: tokenText });
+            if (emitMode === 'token') {
+              yield tokenId;
+              if (options.onToken) options.onToken(tokenId, '');
+              batchTokens.push({ id: tokenId, text: '' });
+            } else {
+              const tokenText = decodeToken(tokenId);
+              yield tokenText;
+              if (options.onToken) options.onToken(tokenId, tokenText);
+              batchTokens.push({ id: tokenId, text: tokenText });
+            }
             if (batchTokens.length === executionPlan.batchSize) {
               if (options.onBatch) options.onBatch(batchTokens);
               batchTokens = [];
@@ -831,9 +1006,9 @@ export class PipelineGenerator {
           useBatchPath = false;
           let nextToken;
           try {
-            nextToken = await this._decodeStep(generatedIds, opts);
+            nextToken = await decodeSingleTokenViaLogits();
           } catch (singleTokenError) {
-            if (singleTokenError.name === 'FinitenessError') {
+            if (shouldRetryWithFinitenessFallback(singleTokenError)) {
               log.warn('Pipeline', `FinitenessGuard caught NaN/Inf at batch step ${tokensGenerated}. Truncating KV cache and retrying token with F32 precision.`);
               nextToken = await this._retryDecodeStepWithFinitenessWindow(
                 generatedIds,
@@ -846,9 +1021,14 @@ export class PipelineGenerator {
           }
           generatedIds.push(nextToken);
           tokensGenerated++;
-          const tokenText = decodeToken(nextToken);
-          yield tokenText;
-          if (options.onToken) options.onToken(nextToken, tokenText);
+          if (emitMode === 'token') {
+            yield nextToken;
+            if (options.onToken) options.onToken(nextToken, '');
+          } else {
+            const tokenText = decodeToken(nextToken);
+            yield tokenText;
+            if (options.onToken) options.onToken(nextToken, tokenText);
+          }
           this._consumeFinitenessFallbackToken(opts);
           if (isStopToken(nextToken, stopTokenIds, eosToken)) break;
         }
@@ -856,9 +1036,9 @@ export class PipelineGenerator {
         const tokenStart = performance.now();
         let nextToken;
         try {
-          nextToken = await this._decodeStep(generatedIds, opts);
+          nextToken = await decodeSingleTokenViaLogits();
         } catch (error) {
-          if (error.name === 'FinitenessError') {
+          if (shouldRetryWithFinitenessFallback(error)) {
             log.warn('Pipeline', `FinitenessGuard caught NaN/Inf at step ${tokensGenerated}. Truncating KV cache and retrying token with F32 precision.`);
             nextToken = await this._retryDecodeStepWithFinitenessWindow(
               generatedIds,
@@ -872,9 +1052,14 @@ export class PipelineGenerator {
         const tokenTime = performance.now() - tokenStart;
         generatedIds.push(nextToken);
         tokensGenerated++;
-        const tokenText = decodeToken(nextToken);
-        yield tokenText;
-        if (options.onToken) options.onToken(nextToken, tokenText);
+        const tokenText = emitMode === 'token' ? '' : decodeToken(nextToken);
+        if (emitMode === 'token') {
+          yield nextToken;
+          if (options.onToken) options.onToken(nextToken, '');
+        } else {
+          yield tokenText;
+          if (options.onToken) options.onToken(nextToken, tokenText);
+        }
         this._consumeFinitenessFallbackToken(opts);
         if (opts.debug || opts.benchmark) {
@@ -912,17 +1097,22 @@ export class PipelineGenerator {
     if (startPos === 0 && hasLinearAttentionLayers(config.layerTypes)) {
       this.#state.linearAttentionRuntime = resetLinearAttentionRuntime(this.#state.linearAttentionRuntime);
     }
+    if (startPos === 0) {
+      for (const [, convState] of this.#state.convLayerStates) {
+        if (convState.convStateGPU && convState.hiddenSize && convState.kernelSize) {
+          uploadData(convState.convStateGPU, new Float32Array(convState.hiddenSize * (convState.kernelSize - 1)));
+        }
+      }
+    }
     const embedBufferRaw = this.#state.weights.get('embed');
     if (!(embedBufferRaw instanceof GPUBuffer) && !isWeightBuffer(embedBufferRaw) && !isCpuWeightBuffer(embedBufferRaw) && !(embedBufferRaw instanceof Float32Array)) {
       throw new Error('Embed buffer not found or not a supported buffer type');
     }
     const embedBuffer = isWeightBuffer(embedBufferRaw) ? embedBufferRaw.buffer : embedBufferRaw;
-    const embedDtype = isWeightBuffer(embedBufferRaw)
-      ? getWeightDtype(embedBufferRaw)
-      : isCpuWeightBuffer(embedBufferRaw)
-        ? embedBufferRaw.dtype
-        : null;
+    const embedDtype = isCpuWeightBuffer(embedBufferRaw)
+      ? embedBufferRaw.dtype
+      : getWeightDtype(embedBufferRaw);
     if (opts.debug) {
       const embedSize = embedBuffer instanceof GPUBuffer ? embedBuffer.size : 'N/A';
       log.debug('Pipeline', `Embed buffer: type=${embedBuffer?.constructor?.name}, size=${embedSize}, dtype=${embedDtype}`);
@@ -1263,18 +1453,15 @@ export class PipelineGenerator {
   async decodeStepLogits(currentIds, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    if (this.#state.isGenerating) throw new Error('Generation already in progress');
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
     resetActiveExecutionPlan(this.#state);
     validateCallTimeOptions(options);
     const opts = this._resolveStepOptions(options);
-    const debugCheckBuffer = this.#state.debug
-      ? (buffer, label, numTokens, expectedDim) =>
-        debugCheckBufferHelper(this.#state, buffer, label, numTokens, expectedDim)
-      : undefined;
-    return decodeStepLogits(this.#state, currentIds, opts, this._getDecodeHelpers(debugCheckBuffer));
+    return this._decodeStepToLogits(currentIds, opts);
   }
   async advanceWithToken(tokenId, options = {}) {