npm - @simulatte/doppler - Versions diffs - 0.1.8 → 0.1.9 - Mend

@simulatte/doppler 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

package/CHANGELOG.md +14 -1
package/README.md +25 -6
package/package.json +5 -3
package/src/client/doppler-api.browser.js +6 -0
package/src/client/doppler-api.d.ts +3 -0
package/src/client/doppler-api.js +11 -2
package/src/client/doppler-registry.js +3 -5
package/src/client/doppler-registry.json +16 -0
package/src/config/kernels/kernel-ref-digests.js +23 -21
package/src/config/kernels/moe/mixtral.paths.json +46 -0
package/src/config/loader.js +6 -0
package/src/config/platforms/loader.js +3 -1
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-nosubgroups.json +16 -16
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-online.json +8 -8
package/src/config/presets/kernel-paths/gemma3-q4k-dequant-f32a-small-attn.json +61 -0
package/src/config/presets/kernel-paths/registry.json +7 -0
package/src/config/presets/models/gemma3.json +2 -1
package/src/config/presets/models/gemma4.json +61 -0
package/src/config/presets/models/granite-docling.json +70 -0
package/src/config/presets/models/lfm2.json +6 -1
package/src/config/presets/models/qwen3_vl.json +40 -0
package/src/config/presets/runtime/experiments/bench/gemma3-bench-q4k.json +2 -1
package/src/config/presets/runtime/experiments/verify/lfm2-verify.json +46 -0
package/src/config/presets/runtime/experiments/verify/translategemma-verify.json +39 -0
package/src/config/presets/runtime/modes/trace-layers.json +1 -0
package/src/config/presets/runtime/tiers/gemma4-16gb.json +69 -0
package/src/config/presets/runtime/tiers/gemma4-24gb.json +66 -0
package/src/config/presets/runtime/tiers/gemma4-32gb.json +66 -0
package/src/config/runtime.js +3 -0
package/src/config/schema/debug.schema.d.ts +40 -0
package/src/config/schema/debug.schema.js +28 -0
package/src/config/schema/index.js +2 -0
package/src/config/schema/inference-defaults.schema.js +1 -1
package/src/config/schema/kernel-path.schema.d.ts +1 -0
package/src/config/schema/memory-limits.schema.js +2 -2
package/src/config/schema/storage.schema.js +1 -1
package/src/converter/conversion-plan.js +1 -1
package/src/converter/core.js +17 -8
package/src/converter/quantizer.d.ts +5 -0
package/src/converter/quantizer.js +15 -0
package/src/distribution/shard-delivery.js +34 -0
package/src/formats/rdrr/classification.js +32 -0
package/src/gpu/kernel-runtime.js +4 -2
package/src/gpu/kernels/attention.js +2 -1
package/src/gpu/kernels/dequant_f16_out.wgsl +4 -2
package/src/gpu/kernels/dequant_f16_out_vec4.wgsl +5 -2
package/src/gpu/kernels/dequant_shared.wgsl +4 -2
package/src/gpu/kernels/dequant_shared_vec4.wgsl +4 -2
package/src/gpu/kernels/dequant_subgroup.wgsl +6 -2
package/src/gpu/kernels/gated-short-conv.d.ts +63 -0
package/src/gpu/kernels/gated-short-conv.js +284 -0
package/src/gpu/kernels/linear-attention-core.js +37 -17
package/src/gpu/kernels/matmul-selection.js +1 -0
package/src/gpu/kernels/matmul.d.ts +3 -0
package/src/gpu/kernels/matmul.js +70 -1
package/src/gpu/kernels/matmul_gemv_subgroup.wgsl +77 -79
package/src/gpu/kernels/sample.js +1 -3
package/src/gpu/kernels/sample.wgsl +39 -9
package/src/gpu/kernels/sample_f16.wgsl +38 -8
package/src/gpu/kernels/shader-cache.js +9 -4
package/src/inference/kv-cache/base.js +3 -10
package/src/inference/pipelines/diffusion/pipeline.js +2 -1
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +2 -1
package/src/inference/pipelines/text/attention/projections.d.ts +3 -0
package/src/inference/pipelines/text/attention/projections.js +13 -2
package/src/inference/pipelines/text/attention/record.js +1 -0
package/src/inference/pipelines/text/attention/run.js +9 -0
package/src/inference/pipelines/text/config.d.ts +1 -0
package/src/inference/pipelines/text/config.js +32 -4
package/src/inference/pipelines/text/embed.js +26 -7
package/src/inference/pipelines/text/execution-v0-runtime-builders.js +10 -3
package/src/inference/pipelines/text/execution-v0.js +12 -1
package/src/inference/pipelines/text/generator-helpers.js +1 -0
package/src/inference/pipelines/text/generator-runtime.js +14 -0
package/src/inference/pipelines/text/generator-steps.d.ts +9 -0
package/src/inference/pipelines/text/generator-steps.js +46 -29
package/src/inference/pipelines/text/generator.d.ts +5 -0
package/src/inference/pipelines/text/generator.js +320 -166
package/src/inference/pipelines/text/init.d.ts +2 -0
package/src/inference/pipelines/text/init.js +19 -5
package/src/inference/pipelines/text/layer.js +37 -8
package/src/inference/pipelines/text/moe-gpu.js +21 -3
package/src/inference/pipelines/text/moe-shape-validator.d.ts +9 -0
package/src/inference/pipelines/text/moe-shape-validator.js +31 -11
package/src/inference/pipelines/text/ops.js +123 -53
package/src/inference/pipelines/text/probes.js +1 -0
package/src/inference/pipelines/text/state.js +2 -0
package/src/inference/pipelines/text.d.ts +5 -0
package/src/inference/pipelines/text.js +59 -1
package/src/inference/pipelines/vision/encoder.js +386 -0
package/src/inference/pipelines/vision/image-preprocess.js +151 -0
package/src/inference/pipelines/vision/index.js +173 -0
package/src/inference/pipelines/vision/ops.js +78 -0
package/src/inference/pipelines/vision/patch-embed.js +151 -0
package/src/inference/test-harness.js +9 -7
package/src/loader/doppler-loader.d.ts +3 -0
package/src/loader/doppler-loader.js +20 -3
package/src/loader/experts/expert-cache.js +6 -2
package/src/loader/experts/expert-loader.js +6 -2
package/src/loader/layer-loader.js +42 -3
package/src/loader/manifest-config.js +3 -1
package/src/loader/tensors/tensor-loader.d.ts +3 -0
package/src/loader/tensors/tensor-loader.js +124 -3
package/src/rules/kernels/moe.rules.mixtral.json +75 -0
package/src/rules/kernels/softmax.rules.json +2 -0
package/src/rules/rule-registry.d.ts +1 -0
package/src/rules/rule-registry.js +2 -0
package/src/storage/quickstart-downloader.d.ts +3 -0
package/src/storage/quickstart-downloader.js +27 -30
package/src/tooling/node-converter.js +25 -7
package/src/tooling/node-source-runtime.js +29 -5
package/src/tooling/node-webgpu.js +24 -7
package/src/utils/hf-resolve-url.d.ts +16 -0
package/src/utils/hf-resolve-url.js +17 -0
package/src/version.js +1 -1
package/src/tooling/node-convert.d.ts +0 -54

package/src/inference/pipelines/text/generator.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import { getDevice, setTrackSubmits } from '../../../gpu/device.js';
-import { releaseBuffer, readBuffer, readBufferSlice } from '../../../memory/buffer-pool.js';
+import { releaseBuffer, readBuffer, readBufferSlice, uploadData } from '../../../memory/buffer-pool.js';
 import { isGPUSamplingAvailable } from '../../../gpu/kernels/sample.js';
 import { markWarmed as markKernelCacheWarmed } from '../../../gpu/kernel-selection-cache.js';
 import { resetSubmitStats, logSubmitStats } from '../../../gpu/submit-tracker.js';
@@ -210,6 +210,14 @@ export class PipelineGenerator {
     return resolveStepOptions(this.#state, options);
   }
+  _resetDecodeRuntimeState() {
+    this.#state.decodeStepCount = 0;
+    this.#state.disableRecordedLogits = false;
+    this.#state.disableFusedDecode = false;
+    resetActiveExecutionPlan(this.#state);
+    this.#state.decodeRing?.reset();
+  }
   _getDecodeHelpers(debugCheckBuffer) {
     return {
       buildLayerContext: (recorder, isDecodeMode, debugLayers, executionPlan) =>
@@ -235,102 +243,71 @@ export class PipelineGenerator {
     );
   }
-  _beginFinitenessFallback(opts, reasonLabel) {
-    const originalPlan = resolveActiveExecutionPlan(this.#state);
-    const original = {
-      activePlanId: this.#state.executionPlanState?.activePlanId ?? 'primary',
-      seed: opts.seed,
-    };
-    const fallbackPlan = activateFallbackExecutionPlan(this.#state);
-    if (!fallbackPlan) {
-      throw new Error('[Pipeline] Finiteness fallback plan is unavailable for this model/runtime configuration.');
-    }
-    log.warn(
-      'Pipeline',
-      `FinitenessGuard fallback (${reasonLabel}): ` +
-      `${originalPlan.kernelPathId ?? 'none'} -> ${fallbackPlan.kernelPathId ?? 'none'}`
-    );
-    this.#state.decodeBuffers?.ensureBuffers({
-      hiddenSize: this.#state.modelConfig.hiddenSize,
-      intermediateSize: this.#state.modelConfig.intermediateSize,
-      activationDtype: fallbackPlan.activationDtype,
-      enablePingPong: true,
-    });
-    if (opts.seed == null) {
-      const fallbackSeedBase = (this.#state.decodeStepCount + this.#state.currentSeqLen + 1) >>> 0;
-      opts.seed = (fallbackSeedBase * 2654435761) >>> 0;
-    }
-    opts.executionPlan = rebaseExecutionSessionPlan(this.#state, opts.executionPlan);
-    return original;
+  _resolvePromptTokenIds(prompt, useChatTemplate, contextLabel) {
+    const processedPrompt = resolvePromptInput(this.#state, prompt, useChatTemplate, contextLabel);
+    const inputIds = this.#state.tokenizer.encode(processedPrompt);
+    this._assertTokenIdsInRange(inputIds, `${contextLabel}.encode`);
+    return inputIds;
   }
-  _endFinitenessFallback(opts, original) {
-    opts.seed = original.seed;
-    setActiveExecutionPlan(this.#state, original.activePlanId);
-    opts.executionPlan = rebaseExecutionSessionPlan(this.#state, opts.executionPlan);
-    const nextActivationDtype = this._getEffectiveActivationDtype();
-    this.#state.decodeBuffers?.ensureBuffers({
-      hiddenSize: this.#state.modelConfig.hiddenSize,
-      intermediateSize: this.#state.modelConfig.intermediateSize,
-      activationDtype: nextActivationDtype,
-      enablePingPong: true,
+  _sampleNextTokenFromLogits(logits, generatedIds, opts) {
+    const sampledLogits = Float32Array.from(logits);
+    applyRepetitionPenalty(sampledLogits, generatedIds, opts.repetitionPenalty);
+    const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
+    return sample(sampledLogits, {
+      temperature: opts.temperature,
+      topP: opts.topP,
+      topK: opts.topK,
+      padTokenId,
+      seed: opts.seed,
     });
   }
-  async _retryWithFinitenessFallback(opts, reasonLabel, retryFn) {
-    if (this._hasFinitenessFallbackWindow()) {
-      return retryFn();
-    }
-    this.#state.kvCache?.truncate(this.#state.currentSeqLen);
-    const original = this._beginFinitenessFallback(opts, reasonLabel);
-    try {
-      return await retryFn();
-    } finally {
-      this._endFinitenessFallback(opts, original);
+  async _prefillPromptToLogits(prompt, opts, contextLabel) {
+    const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, contextLabel);
+    if (opts.debug) {
+      log.debug('Pipeline', `${contextLabel}: ${inputIds.length} tokens`);
     }
-  }
-  async _retryDecodeStepWithFinitenessWindow(generatedIds, opts, reasonLabel) {
-    const windowTokens = this._resolveDeferredRoundingWindowTokens();
-    if (windowTokens <= 1) {
-      return this._retryWithFinitenessFallback(
+    let logits;
+    try {
+      logits = await this._prefill(inputIds, opts);
+    } catch (error) {
+      if (!shouldRetryWithFinitenessFallback(error)) {
+        throw error;
+      }
+      log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during ${contextLabel}. Retrying with F32 precision.`);
+      logits = await this._retryWithFinitenessFallback(
         opts,
-        reasonLabel,
-        () => this._decodeStep(generatedIds, opts)
+        contextLabel,
+        () => this._prefill(inputIds, opts)
       );
     }
-    this.#state.kvCache?.truncate(this.#state.currentSeqLen);
-    this._openFinitenessFallbackWindow(opts, reasonLabel, windowTokens);
-    try {
-      return await this._decodeStep(generatedIds, opts);
-    } catch (error) {
-      this._closeFinitenessFallbackWindow(opts);
-      throw error;
-    }
+    return { inputIds, logits };
   }
-  // ==========================================================================
-  // Generation Public API
-  // ==========================================================================
+  async _decodeStepToLogits(currentIds, opts) {
+    const debugCheckBuffer = this.#state.debug
+      ? (buffer, label, numTokens, expectedDim) =>
+        debugCheckBufferHelper(this.#state, buffer, label, numTokens, expectedDim)
+      : undefined;
+    return decodeStepLogits(this.#state, currentIds, opts, this._getDecodeHelpers(debugCheckBuffer));
+  }
+  async _decodeNextTokenViaLogits(currentIds, opts) {
+    const stepResult = await this._decodeStepToLogits(currentIds, opts);
+    return this._sampleNextTokenFromLogits(stepResult.logits, currentIds, opts);
+  }
-  async *generate(prompt, options = {}) {
+  async *_generateTokensInternal(prompt, options = {}, mode = 'text') {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
     if (this.#state.isGenerating) throw new Error('Generation already in progress');
     validateCallTimeOptions(options);
     this.#state.isGenerating = true;
-    this.#state.decodeStepCount = 0;
-    this.#state.disableRecordedLogits = false;
-    this.#state.disableFusedDecode = false;
-    resetActiveExecutionPlan(this.#state);
-    this.#state.decodeRing?.reset();
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     this.#state.stats.gpuTimeDecodeMs = undefined;
     this.#state.stats.decodeRecordMs = 0;
@@ -345,14 +322,23 @@ export class PipelineGenerator {
       log.debug('Pipeline', `ChatTemplate: options=${options.useChatTemplate}, final=${opts.useChatTemplate}`);
     }
-    try {
-      const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'generate');
-      if (opts.debug && opts.useChatTemplate) {
-        log.debug('Pipeline', `Applied ${this.#state.modelConfig.chatTemplateType} chat template`);
+    const emitToken = async function* (generator, tokenId, textDecoder) {
+      if (mode === 'token') {
+        yield tokenId;
+        if (options.onToken) options.onToken(tokenId, '');
+        return;
       }
+      const tokenText = textDecoder(tokenId);
+      yield tokenText;
+      if (options.onToken) options.onToken(tokenId, tokenText);
+    };
-      const inputIds = this.#state.tokenizer.encode(processedPrompt);
-      this._assertTokenIdsInRange(inputIds, 'generate.encode');
+    try {
+      const prefillStart = performance.now();
+      const { inputIds, logits: initialPrefillLogits } = await this._prefillPromptToLogits(prompt, opts, 'generate');
+      let prefillLogits = initialPrefillLogits;
+      this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
+      this._assertTokenIdsInRange(inputIds, 'generate.prefillTokens');
       const generatedIds = [...inputIds];
       this.#state.stats.prefillTokens = inputIds.length;
@@ -360,24 +346,6 @@ export class PipelineGenerator {
         log.debug('Pipeline', `Input: ${inputIds.length} tokens`);
       }
-      const prefillStart = performance.now();
-      let prefillLogits;
-      try {
-        prefillLogits = await this._prefill(inputIds, opts);
-      } catch (error) {
-        if (shouldRetryWithFinitenessFallback(error)) {
-          log.warn('Pipeline', `FinitenessGuard caught NaN/Inf during prefill. Retrying with F32 precision.`);
-          prefillLogits = await this._retryWithFinitenessFallback(
-            opts,
-            'prefill',
-            () => this._prefill(inputIds, opts)
-          );
-        } else {
-          throw error;
-        }
-      }
-      this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
       const intentBundleConfig = this.#state.runtimeConfig.shared.intentBundle;
       const intentBundle = intentBundleConfig?.bundle;
       const expectedTopK = intentBundle?.payload?.expectedTopK
@@ -389,20 +357,17 @@ export class PipelineGenerator {
         const actualTopK = getTopK(
           prefillLogits,
           expectedTopK.length,
-        (tokens) => resolveTokenText(this.#state.tokenizer, tokens),
-      ).map((token) => token.token);
+          (tokens) => resolveTokenText(this.#state.tokenizer, tokens),
+        ).map((token) => token.token);
         const driftResult = enforceLogitDrift(expectedTopK, actualTopK, maxDriftThreshold);
         if (!driftResult.ok) {
           throw new Error(`Intent bundle drift check failed: ${driftResult.reason}`);
         }
       }
-      applyRepetitionPenalty(prefillLogits, generatedIds, opts.repetitionPenalty);
-      const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
       if (opts.debug) {
         const topAfterPenalty = getTopK(
-          prefillLogits,
+          Float32Array.from(prefillLogits),
           5,
           (tokens) => resolveTokenText(this.#state.tokenizer, tokens)
         );
@@ -411,13 +376,7 @@ export class PipelineGenerator {
       let firstToken;
       try {
-        firstToken = sample(prefillLogits, {
-          temperature: opts.temperature,
-          topP: opts.topP,
-          topK: opts.topK,
-          padTokenId,
-          seed: opts.seed,
-        });
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
       } catch (error) {
         if (!shouldRetryWithFinitenessFallback(error)) {
           throw error;
@@ -428,14 +387,7 @@ export class PipelineGenerator {
           'prefill-sample',
           () => this._prefill(inputIds, opts)
         );
-        applyRepetitionPenalty(prefillLogits, generatedIds, opts.repetitionPenalty);
-        firstToken = sample(prefillLogits, {
-          temperature: opts.temperature,
-          topP: opts.topP,
-          topK: opts.topK,
-          padTokenId,
-          seed: opts.seed,
-        });
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
       }
       if (opts.debug) {
@@ -454,9 +406,7 @@ export class PipelineGenerator {
         (tokens) => this.#state.tokenizer?.decode?.(tokens, false, false)
       );
-      const firstText = decodeToken(firstToken);
-      yield firstText;
-      if (options.onToken) options.onToken(firstToken, firstText);
+      yield* emitToken(this, firstToken, decodeToken);
       yield* this._runDecodeLoop(generatedIds, opts, options, {
         stopTokenIds: this.#state.modelConfig.stopTokenIds,
@@ -464,6 +414,7 @@ export class PipelineGenerator {
         stopSequenceStart: inputIds.length,
         decodeToken,
         logBatchPath: opts.debug,
+        emitMode: mode,
       });
       const tokensGenerated = this.#state.stats.decodeTokens;
       this.#state.stats.totalTimeMs = performance.now() - startTime;
@@ -495,17 +446,203 @@ export class PipelineGenerator {
     }
   }
+  _beginFinitenessFallback(opts, reasonLabel) {
+    const originalPlan = resolveActiveExecutionPlan(this.#state);
+    const original = {
+      activePlanId: this.#state.executionPlanState?.activePlanId ?? 'primary',
+      seed: opts.seed,
+    };
+    const fallbackPlan = activateFallbackExecutionPlan(this.#state);
+    if (!fallbackPlan) {
+      throw new Error('[Pipeline] Finiteness fallback plan is unavailable for this model/runtime configuration.');
+    }
+    log.warn(
+      'Pipeline',
+      `FinitenessGuard fallback (${reasonLabel}): ` +
+      `${originalPlan.kernelPathId ?? 'none'} -> ${fallbackPlan.kernelPathId ?? 'none'}`
+    );
-  async prefillKVOnly(prompt, options = {}) {
+    this.#state.decodeBuffers?.ensureBuffers({
+      hiddenSize: this.#state.modelConfig.hiddenSize,
+      intermediateSize: this.#state.modelConfig.intermediateSize,
+      activationDtype: fallbackPlan.activationDtype,
+      enablePingPong: true,
+    });
+    if (opts.seed == null) {
+      const fallbackSeedBase = (this.#state.decodeStepCount + this.#state.currentSeqLen + 1) >>> 0;
+      opts.seed = (fallbackSeedBase * 2654435761) >>> 0;
+    }
+    opts.executionPlan = rebaseExecutionSessionPlan(this.#state, opts.executionPlan);
+    return original;
+  }
+  _endFinitenessFallback(opts, original) {
+    opts.seed = original.seed;
+    setActiveExecutionPlan(this.#state, original.activePlanId);
+    opts.executionPlan = rebaseExecutionSessionPlan(this.#state, opts.executionPlan);
+    const nextActivationDtype = this._getEffectiveActivationDtype();
+    this.#state.decodeBuffers?.ensureBuffers({
+      hiddenSize: this.#state.modelConfig.hiddenSize,
+      intermediateSize: this.#state.modelConfig.intermediateSize,
+      activationDtype: nextActivationDtype,
+      enablePingPong: true,
+    });
+  }
+  async _retryWithFinitenessFallback(opts, reasonLabel, retryFn) {
+    if (this._hasFinitenessFallbackWindow()) {
+      return retryFn();
+    }
+    this.#state.kvCache?.truncate(this.#state.currentSeqLen);
+    const original = this._beginFinitenessFallback(opts, reasonLabel);
+    try {
+      return await retryFn();
+    } finally {
+      this._endFinitenessFallback(opts, original);
+    }
+  }
+  async _retryDecodeStepWithFinitenessWindow(generatedIds, opts, reasonLabel) {
+    const windowTokens = this._resolveDeferredRoundingWindowTokens();
+    if (windowTokens <= 1) {
+      return this._retryWithFinitenessFallback(
+        opts,
+        reasonLabel,
+        () => this._decodeStep(generatedIds, opts)
+      );
+    }
+    this.#state.kvCache?.truncate(this.#state.currentSeqLen);
+    this._openFinitenessFallbackWindow(opts, reasonLabel, windowTokens);
+    try {
+      return await this._decodeStep(generatedIds, opts);
+    } catch (error) {
+      this._closeFinitenessFallbackWindow(opts);
+      throw error;
+    }
+  }
+  // ==========================================================================
+  // Generation Public API
+  // ==========================================================================
+  async *generate(prompt, options = {}) {
+    yield* this._generateTokensInternal(prompt, options, 'text');
+  }
+  async *generateTokens(prompt, options = {}) {
+    yield* this._generateTokensInternal(prompt, options, 'token');
+  }
+  async generateTokenIds(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    resetActiveExecutionPlan(this.#state);
+    if (this.#state.isGenerating) throw new Error('Generation already in progress');
+    validateCallTimeOptions(options);
+    this.#state.isGenerating = true;
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
-    const opts = resolvePrefillOptions(this.#state, options);
+    this.#state.stats.gpuTimeDecodeMs = undefined;
+    this.#state.stats.decodeRecordMs = 0;
+    this.#state.stats.decodeSubmitWaitMs = 0;
+    this.#state.stats.decodeReadbackWaitMs = 0;
+    this.#state.stats.ttftMs = 0;
+    const startTime = performance.now();
+    const opts = resolveGenerateOptions(this.#state, options);
-    const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillKVOnly');
+    try {
+      const prefillStart = performance.now();
+      const { inputIds, logits: initialPrefillLogits } = await this._prefillPromptToLogits(prompt, opts, 'generateTokenIds');
+      let prefillLogits = initialPrefillLogits;
+      this.#state.stats.prefillTimeMs = performance.now() - prefillStart;
+      this._assertTokenIdsInRange(inputIds, 'generateTokenIds.prefillTokens');
+      const generatedIds = [...inputIds];
+      this.#state.stats.prefillTokens = inputIds.length;
-    const inputIds = this.#state.tokenizer.encode(processedPrompt);
-    this._assertTokenIdsInRange(inputIds, 'prefillKVOnly.encode');
+      let firstToken;
+      try {
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
+      } catch (error) {
+        if (!shouldRetryWithFinitenessFallback(error)) {
+          throw error;
+        }
+        prefillLogits = await this._retryWithFinitenessFallback(
+          opts,
+          'prefill-sample',
+          () => this._prefill(inputIds, opts)
+        );
+        firstToken = this._sampleNextTokenFromLogits(prefillLogits, generatedIds, opts);
+      }
+      generatedIds.push(firstToken);
+      const tokenIds = [firstToken];
+      this.#state.stats.ttftMs = performance.now() - startTime;
+      const stopTokenIds = this.#state.modelConfig.stopTokenIds;
+      const eosToken = this.#state.tokenizer.getSpecialTokens?.()?.eos;
+      const stopSequenceStart = inputIds.length;
+      markKernelCacheWarmed();
+      const decodeStart = performance.now();
+      while (tokenIds.length < opts.maxTokens) {
+        if (options.signal?.aborted) break;
+        let nextToken;
+        try {
+          nextToken = await this._decodeNextTokenViaLogits(generatedIds, opts);
+        } catch (error) {
+          if (shouldRetryWithFinitenessFallback(error)) {
+            nextToken = await this._retryDecodeStepWithFinitenessWindow(
+              generatedIds,
+              opts,
+              `decode-step-${tokenIds.length}`
+            );
+          } else {
+            throw error;
+          }
+        }
+        generatedIds.push(nextToken);
+        tokenIds.push(nextToken);
+        this._consumeFinitenessFallbackToken(opts);
+        if (isStopToken(nextToken, stopTokenIds, eosToken)) {
+          break;
+        }
+        if (opts.stopSequences.length > 0) {
+          const fullText = this.#state.tokenizer.decode(generatedIds.slice(stopSequenceStart), false);
+          if (opts.stopSequences.some((seq) => fullText.endsWith(seq))) break;
+        }
+      }
+      this.#state.stats.decodeTimeMs = performance.now() - decodeStart;
+      this.#state.stats.tokensGenerated = tokenIds.length;
+      this.#state.stats.decodeTokens = tokenIds.length;
+      this.#state.stats.totalTimeMs = performance.now() - startTime;
+      return {
+        tokenIds,
+        stats: this.#state.stats,
+      };
+    } finally {
+      this._closeFinitenessFallbackWindow(opts);
+      resetActiveExecutionPlan(this.#state);
+      this.#state.isGenerating = false;
+    }
+  }
+  async prefillKVOnly(prompt, options = {}) {
+    if (!this.#state.isLoaded) throw new Error('Model not loaded');
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
+    this._resetDecodeRuntimeState();
+    this.#state.stats.gpuTimePrefillMs = undefined;
+    const opts = resolvePrefillOptions(this.#state, options);
+    const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, 'prefillKVOnly');
     if (opts.debug) {
       log.debug('Pipeline', `PrefillKVOnly: ${inputIds.length} tokens`);
     }
@@ -563,14 +700,13 @@ export class PipelineGenerator {
   async prefillWithEmbedding(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    resetActiveExecutionPlan(this.#state);
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     const opts = resolvePrefillEmbeddingOptions(this.#state, options);
-    const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillWithEmbedding');
-    const inputIds = this.#state.tokenizer.encode(processedPrompt);
-    this._assertTokenIdsInRange(inputIds, 'prefillWithEmbedding.encode');
+    const inputIds = this._resolvePromptTokenIds(prompt, opts.useChatTemplate, 'prefillWithEmbedding');
     if (opts.debug) {
       log.debug('Pipeline', `PrefillWithEmbedding: ${inputIds.length} tokens (mode=${opts.embeddingMode})`);
     }
@@ -658,19 +794,13 @@ export class PipelineGenerator {
   async prefillWithLogits(prompt, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    resetActiveExecutionPlan(this.#state);
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
+    this._resetDecodeRuntimeState();
     this.#state.stats.gpuTimePrefillMs = undefined;
     const opts = resolvePrefillOptions(this.#state, options);
-    const processedPrompt = resolvePromptInput(this.#state, prompt, opts.useChatTemplate, 'prefillWithLogits');
-    const inputIds = this.#state.tokenizer.encode(processedPrompt);
-    this._assertTokenIdsInRange(inputIds, 'prefillWithLogits.encode');
-    if (opts.debug) {
-      log.debug('Pipeline', `PrefillWithLogits: ${inputIds.length} tokens`);
-    }
-    const logits = await this._prefill(inputIds, opts);
+    const { inputIds, logits } = await this._prefillPromptToLogits(prompt, opts, 'prefillWithLogits');
     const snapshot = this.#state.kvCache?.clone();
     if (!snapshot) {
@@ -792,6 +922,7 @@ export class PipelineGenerator {
       stopSequenceStart,
       decodeToken,
       logBatchPath = false,
+      emitMode = 'text',
     } = runtime;
     let tokensGenerated = 1;
@@ -821,6 +952,9 @@ export class PipelineGenerator {
     }
     const readbackInterval = executionPlan.readbackInterval;
     const intervalBatches = readbackInterval == null ? 1 : readbackInterval;
+    const padTokenId = this.#state.tokenizer?.getSpecialTokens?.()?.pad;
+    const decodeSingleTokenViaLogits = async () => this._decodeNextTokenViaLogits(generatedIds, opts);
     if (logBatchPath && useBatchPath) {
       log.debug(
@@ -846,10 +980,16 @@ export class PipelineGenerator {
           for (const tokenId of batchResult.tokens) {
             generatedIds.push(tokenId);
             tokensGenerated++;
-            const tokenText = decodeToken(tokenId);
-            yield tokenText;
-            if (options.onToken) options.onToken(tokenId, tokenText);
-            batchTokens.push({ id: tokenId, text: tokenText });
+            if (emitMode === 'token') {
+              yield tokenId;
+              if (options.onToken) options.onToken(tokenId, '');
+              batchTokens.push({ id: tokenId, text: '' });
+            } else {
+              const tokenText = decodeToken(tokenId);
+              yield tokenText;
+              if (options.onToken) options.onToken(tokenId, tokenText);
+              batchTokens.push({ id: tokenId, text: tokenText });
+            }
             if (batchTokens.length === executionPlan.batchSize) {
               if (options.onBatch) options.onBatch(batchTokens);
               batchTokens = [];
@@ -866,7 +1006,7 @@ export class PipelineGenerator {
           useBatchPath = false;
           let nextToken;
           try {
-            nextToken = await this._decodeStep(generatedIds, opts);
+            nextToken = await decodeSingleTokenViaLogits();
           } catch (singleTokenError) {
             if (shouldRetryWithFinitenessFallback(singleTokenError)) {
               log.warn('Pipeline', `FinitenessGuard caught NaN/Inf at batch step ${tokensGenerated}. Truncating KV cache and retrying token with F32 precision.`);
@@ -881,9 +1021,14 @@ export class PipelineGenerator {
           }
           generatedIds.push(nextToken);
           tokensGenerated++;
-          const tokenText = decodeToken(nextToken);
-          yield tokenText;
-          if (options.onToken) options.onToken(nextToken, tokenText);
+          if (emitMode === 'token') {
+            yield nextToken;
+            if (options.onToken) options.onToken(nextToken, '');
+          } else {
+            const tokenText = decodeToken(nextToken);
+            yield tokenText;
+            if (options.onToken) options.onToken(nextToken, tokenText);
+          }
           this._consumeFinitenessFallbackToken(opts);
           if (isStopToken(nextToken, stopTokenIds, eosToken)) break;
         }
@@ -891,7 +1036,7 @@ export class PipelineGenerator {
         const tokenStart = performance.now();
         let nextToken;
         try {
-          nextToken = await this._decodeStep(generatedIds, opts);
+          nextToken = await decodeSingleTokenViaLogits();
         } catch (error) {
           if (shouldRetryWithFinitenessFallback(error)) {
             log.warn('Pipeline', `FinitenessGuard caught NaN/Inf at step ${tokensGenerated}. Truncating KV cache and retrying token with F32 precision.`);
@@ -907,9 +1052,14 @@ export class PipelineGenerator {
         const tokenTime = performance.now() - tokenStart;
         generatedIds.push(nextToken);
         tokensGenerated++;
-        const tokenText = decodeToken(nextToken);
-        yield tokenText;
-        if (options.onToken) options.onToken(nextToken, tokenText);
+        const tokenText = emitMode === 'token' ? '' : decodeToken(nextToken);
+        if (emitMode === 'token') {
+          yield nextToken;
+          if (options.onToken) options.onToken(nextToken, '');
+        } else {
+          yield tokenText;
+          if (options.onToken) options.onToken(nextToken, tokenText);
+        }
         this._consumeFinitenessFallbackToken(opts);
         if (opts.debug || opts.benchmark) {
@@ -947,6 +1097,13 @@ export class PipelineGenerator {
     if (startPos === 0 && hasLinearAttentionLayers(config.layerTypes)) {
       this.#state.linearAttentionRuntime = resetLinearAttentionRuntime(this.#state.linearAttentionRuntime);
     }
+    if (startPos === 0) {
+      for (const [, convState] of this.#state.convLayerStates) {
+        if (convState.convStateGPU && convState.hiddenSize && convState.kernelSize) {
+          uploadData(convState.convStateGPU, new Float32Array(convState.hiddenSize * (convState.kernelSize - 1)));
+        }
+      }
+    }
     const embedBufferRaw = this.#state.weights.get('embed');
     if (!(embedBufferRaw instanceof GPUBuffer) && !isWeightBuffer(embedBufferRaw) && !isCpuWeightBuffer(embedBufferRaw) && !(embedBufferRaw instanceof Float32Array)) {
@@ -1296,18 +1453,15 @@ export class PipelineGenerator {
   async decodeStepLogits(currentIds, options = {}) {
     if (!this.#state.isLoaded) throw new Error('Model not loaded');
-    if (this.#state.isGenerating) throw new Error('Generation already in progress');
+    if (this.#state.isGenerating && options.__internalGenerate !== true) {
+      throw new Error('Generation already in progress');
+    }
     resetActiveExecutionPlan(this.#state);
     validateCallTimeOptions(options);
     const opts = this._resolveStepOptions(options);
-    const debugCheckBuffer = this.#state.debug
-      ? (buffer, label, numTokens, expectedDim) =>
-        debugCheckBufferHelper(this.#state, buffer, label, numTokens, expectedDim)
-      : undefined;
-    return decodeStepLogits(this.#state, currentIds, opts, this._getDecodeHelpers(debugCheckBuffer));
+    return this._decodeStepToLogits(currentIds, opts);
   }
   async advanceWithToken(tokenId, options = {}) {