npm - node-llama-cpp - Versions diffs - 3.3.2 → 3.4.0 - Mend

node-llama-cpp 3.3.2 → 3.4.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (194)

package/README.md +2 -1
package/dist/bindings/AddonTypes.d.ts +12 -4
package/dist/bindings/Llama.d.ts +9 -0
package/dist/bindings/Llama.js +52 -28
package/dist/bindings/Llama.js.map +1 -1
package/dist/bindings/getLlama.d.ts +2 -1
package/dist/bindings/getLlama.js +19 -9
package/dist/bindings/getLlama.js.map +1 -1
package/dist/bindings/utils/asyncSome.js +2 -0
package/dist/bindings/utils/asyncSome.js.map +1 -1
package/dist/bindings/utils/compileLLamaCpp.d.ts +1 -1
package/dist/bindings/utils/compileLLamaCpp.js +108 -34
package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +1 -0
package/dist/bindings/utils/detectAvailableComputeLayers.js +4 -4
package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -1
package/dist/bindings/utils/detectBuildTools.d.ts +14 -0
package/dist/bindings/utils/detectBuildTools.js +149 -0
package/dist/bindings/utils/detectBuildTools.js.map +1 -0
package/dist/bindings/utils/resolveActualBindingBinaryPath.d.ts +1 -0
package/dist/bindings/utils/resolveActualBindingBinaryPath.js +18 -0
package/dist/bindings/utils/resolveActualBindingBinaryPath.js.map +1 -0
package/dist/bindings/utils/testBindingBinary.d.ts +1 -1
package/dist/bindings/utils/testBindingBinary.js +58 -5
package/dist/bindings/utils/testBindingBinary.js.map +1 -1
package/dist/chatWrappers/AlpacaChatWrapper.d.ts +4 -0
package/dist/chatWrappers/AlpacaChatWrapper.js +4 -0
package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
package/dist/chatWrappers/FalconChatWrapper.d.ts +4 -0
package/dist/chatWrappers/FalconChatWrapper.js +4 -0
package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
package/dist/chatWrappers/GeneralChatWrapper.d.ts +4 -0
package/dist/chatWrappers/GeneralChatWrapper.js +4 -0
package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +2 -0
package/dist/chatWrappers/utils/resolveChatWrapper.js +8 -27
package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -1
package/dist/cli/commands/ChatCommand.d.ts +4 -0
package/dist/cli/commands/ChatCommand.js +155 -11
package/dist/cli/commands/ChatCommand.js.map +1 -1
package/dist/cli/commands/CompleteCommand.d.ts +4 -0
package/dist/cli/commands/CompleteCommand.js +143 -10
package/dist/cli/commands/CompleteCommand.js.map +1 -1
package/dist/cli/commands/DebugCommand.js +5 -5
package/dist/cli/commands/DebugCommand.js.map +1 -1
package/dist/cli/commands/InfillCommand.d.ts +4 -0
package/dist/cli/commands/InfillCommand.js +142 -10
package/dist/cli/commands/InfillCommand.js.map +1 -1
package/dist/cli/commands/OnPostInstallCommand.js +12 -2
package/dist/cli/commands/OnPostInstallCommand.js.map +1 -1
package/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts +1 -0
package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js +14 -7
package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map +1 -1
package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +13 -3
package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +20 -10
package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -1
package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +2 -0
package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +234 -77
package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
package/dist/cli/utils/ConsoleTable.d.ts +1 -0
package/dist/cli/utils/ConsoleTable.js +5 -1
package/dist/cli/utils/ConsoleTable.js.map +1 -1
package/dist/cli/utils/interactivelyAskForModel.d.ts +2 -1
package/dist/cli/utils/interactivelyAskForModel.js +16 -13
package/dist/cli/utils/interactivelyAskForModel.js.map +1 -1
package/dist/cli/utils/isRunningUnderRosetta.d.ts +1 -0
package/dist/cli/utils/isRunningUnderRosetta.js +20 -0
package/dist/cli/utils/isRunningUnderRosetta.js.map +1 -0
package/dist/cli/utils/printCommonInfoLines.d.ts +4 -2
package/dist/cli/utils/printCommonInfoLines.js +67 -5
package/dist/cli/utils/printCommonInfoLines.js.map +1 -1
package/dist/cli/utils/resolveCommandGgufPath.d.ts +3 -1
package/dist/cli/utils/resolveCommandGgufPath.js +6 -5
package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -1
package/dist/cli/utils/toBytes.d.ts +1 -0
package/dist/cli/utils/toBytes.js +5 -0
package/dist/cli/utils/toBytes.js.map +1 -0
package/dist/config.d.ts +3 -0
package/dist/config.js +3 -0
package/dist/config.js.map +1 -1
package/dist/evaluator/LlamaChat/LlamaChat.d.ts +12 -3
package/dist/evaluator/LlamaChat/LlamaChat.js +21 -7
package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +6 -2
package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +3 -0
package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
package/dist/evaluator/LlamaCompletion.d.ts +3 -0
package/dist/evaluator/LlamaCompletion.js +5 -0
package/dist/evaluator/LlamaCompletion.js.map +1 -1
package/dist/evaluator/LlamaContext/LlamaContext.d.ts +81 -38
package/dist/evaluator/LlamaContext/LlamaContext.js +678 -132
package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
package/dist/evaluator/LlamaContext/TokenPredictor.d.ts +55 -0
package/dist/evaluator/LlamaContext/TokenPredictor.js +20 -0
package/dist/evaluator/LlamaContext/TokenPredictor.js.map +1 -0
package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts +56 -0
package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js +266 -0
package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map +1 -0
package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts +58 -0
package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js +138 -0
package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map +1 -0
package/dist/evaluator/LlamaContext/types.d.ts +198 -5
package/dist/evaluator/LlamaEmbeddingContext.d.ts +3 -0
package/dist/evaluator/LlamaEmbeddingContext.js +3 -0
package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
package/dist/evaluator/LlamaGrammar.d.ts +7 -1
package/dist/evaluator/LlamaGrammar.js +6 -0
package/dist/evaluator/LlamaGrammar.js.map +1 -1
package/dist/evaluator/LlamaGrammarEvaluationState.d.ts +4 -4
package/dist/evaluator/LlamaGrammarEvaluationState.js +16 -8
package/dist/evaluator/LlamaGrammarEvaluationState.js.map +1 -1
package/dist/evaluator/LlamaJsonSchemaGrammar.d.ts +5 -0
package/dist/evaluator/LlamaJsonSchemaGrammar.js +7 -0
package/dist/evaluator/LlamaJsonSchemaGrammar.js.map +1 -1
package/dist/evaluator/LlamaModel/LlamaModel.d.ts +19 -11
package/dist/evaluator/LlamaModel/LlamaModel.js +23 -29
package/dist/evaluator/LlamaModel/LlamaModel.js.map +1 -1
package/dist/evaluator/LlamaRankingContext.d.ts +76 -0
package/dist/evaluator/LlamaRankingContext.js +159 -0
package/dist/evaluator/LlamaRankingContext.js.map +1 -0
package/dist/evaluator/TokenBias.d.ts +3 -0
package/dist/evaluator/TokenBias.js +3 -0
package/dist/evaluator/TokenBias.js.map +1 -1
package/dist/evaluator/utils/chunkDocument.d.ts +86 -0
package/dist/evaluator/utils/chunkDocument.js +212 -0
package/dist/evaluator/utils/chunkDocument.js.map +1 -0
package/dist/gguf/insights/GgufInsights.d.ts +3 -1
package/dist/gguf/insights/GgufInsights.js +114 -8
package/dist/gguf/insights/GgufInsights.js.map +1 -1
package/dist/gguf/insights/GgufInsightsConfigurationResolver.d.ts +6 -3
package/dist/gguf/insights/GgufInsightsConfigurationResolver.js +11 -7
package/dist/gguf/insights/GgufInsightsConfigurationResolver.js.map +1 -1
package/dist/gguf/insights/utils/resolveModelGpuLayersOption.d.ts +2 -1
package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js +13 -7
package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js.map +1 -1
package/dist/gguf/parser/GgufV2Parser.js +29 -8
package/dist/gguf/parser/GgufV2Parser.js.map +1 -1
package/dist/gguf/parser/parseGguf.js +11 -11
package/dist/gguf/parser/parseGguf.js.map +1 -1
package/dist/gguf/readGgufFileInfo.js +8 -3
package/dist/gguf/readGgufFileInfo.js.map +1 -1
package/dist/gguf/types/GgufFileInfoTypes.d.ts +1 -0
package/dist/gguf/types/GgufMetadataTypes.d.ts +9 -9
package/dist/gguf/types/GgufMetadataTypes.js +1 -1
package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
package/dist/gguf/types/GgufTensorInfoTypes.d.ts +13 -0
package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -1
package/dist/index.d.ts +7 -2
package/dist/index.js +6 -1
package/dist/index.js.map +1 -1
package/dist/tsconfig.tsbuildinfo +1 -1
package/dist/utils/LlamaText.d.ts +4 -1
package/dist/utils/LlamaText.js +4 -1
package/dist/utils/LlamaText.js.map +1 -1
package/dist/utils/cmake.js +23 -0
package/dist/utils/cmake.js.map +1 -1
package/dist/utils/pushAll.d.ts +1 -1
package/dist/utils/pushAll.js.map +1 -1
package/dist/utils/tokenizerUtils.js +1 -1
package/dist/utils/utilTypes.d.ts +5 -0
package/llama/CMakeLists.txt +25 -8
package/llama/addon/AddonContext.cpp +188 -16
package/llama/addon/AddonContext.h +1 -0
package/llama/addon/AddonGrammar.cpp +1 -4
package/llama/addon/AddonGrammarEvaluationState.cpp +16 -5
package/llama/addon/AddonModel.cpp +11 -15
package/llama/addon/AddonModel.h +0 -1
package/llama/addon/AddonSampler.cpp +1 -6
package/llama/addon/addon.cpp +26 -7
package/llama/addon/globals/getGpuInfo.cpp +30 -5
package/llama/addon/globals/getGpuInfo.h +6 -1
package/llama/addon/globals/getMemoryInfo.cpp +63 -0
package/llama/addon/globals/getMemoryInfo.h +4 -0
package/llama/binariesGithubRelease.json +1 -1
package/llama/cmake/win32.ensureNinjaPath.cmake +68 -0
package/llama/cmake/win32.ensureNodeLib.cmake +34 -0
package/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake +12 -0
package/llama/cmake/win32.llvmEnsureCmakeAr.cmake +37 -0
package/llama/cmake/win32.llvmUseGnuModeCompilers.cmake +87 -0
package/llama/cmake/win32.programFilesPaths.cmake +31 -0
package/llama/gitRelease.bundle +0 -0
package/llama/gpuInfo/vulkan-gpu-info.cpp +29 -2
package/llama/gpuInfo/vulkan-gpu-info.h +1 -0
package/llama/llama.cpp.info.json +1 -1
package/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake +14 -0
package/llama/profiles/llvm.win32.host-x64.target-arm64.cmake +14 -0
package/llama/profiles/llvm.win32.host-x64.target-x64.cmake +14 -0
package/llama/toolchains/llvm.win32.host-x64.target-x64.cmake +20 -0
package/llama/toolchains/win32.host-arm64.target-arm64.cmake +21 -0
package/llama/toolchains/win32.host-x64.target-arm64.cmake +14 -34
package/package.json +43 -43
package/templates/packed/electron-typescript-react.json +1 -1
package/templates/packed/node-typescript.json +1 -1

package/dist/evaluator/LlamaContext/LlamaContext.js CHANGED Viewed

@@ -1,9 +1,12 @@
-import { AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
+import { acquireLock, AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
 import { removeNullFields } from "../../utils/removeNullFields.js";
 import { compareTokens } from "../../utils/compareTokens.js";
 import { DisposeGuard } from "../../utils/DisposeGuard.js";
 import { TokenMeter } from "../TokenMeter.js";
 import { UnsupportedError } from "../../utils/UnsupportedError.js";
+import { pushAll } from "../../utils/pushAll.js";
+import { safeEventCallback } from "../../utils/safeEventCallback.js";
+import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
 import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
 import { LlamaSampler } from "./LlamaSampler.js";
 const defaultLoraScale = 1;
@@ -13,6 +16,7 @@ const defaultFailedCreationRemedy = {
     retries: 6,
     autoContextSizeShrink: 0.16
 };
+const defaultEvaluationPriority = 5;
 export class LlamaContext {
     /** @internal */ _llama;
     /** @internal */ _ctx;
@@ -43,7 +47,7 @@ export class LlamaContext {
     /** @internal */ _allocatedContextSize;
     /** @internal */ _disposed = false;
     onDispose = new EventRelay();
-    constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, performanceTracking = false, _embeddings }) {
+    constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextCycle", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, performanceTracking = false, _embeddings, _ranking }) {
         if (_model.disposed)
             throw new DisposedError();
         this._llama = _model._llama;
@@ -70,6 +74,7 @@ export class LlamaContext {
             flashAttention: this._flashAttention,
             threads: this._idealThreads,
             embeddings: _embeddings,
+            ranking: _ranking,
             performanceTracking: this._performanceTracking
         }));
         this._batchingOptions = {
@@ -163,7 +168,7 @@ export class LlamaContext {
      * When there are no sequences left, this method will throw an error.
      */
     getSequence(options = {}) {
-        const { contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, _tokenMeter } = options;
+        const { contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor, _tokenMeter } = options;
         this._ensureNotDisposed();
         const nextSequenceId = this._popSequenceId();
         if (nextSequenceId == null)
@@ -175,7 +180,8 @@ export class LlamaContext {
             contextShift: {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
-            }
+            },
+            tokenPredictor
         });
     }
     dispatchPendingBatch() {
@@ -189,6 +195,7 @@ export class LlamaContext {
             this._dispatchDecodeScheduled = false;
             this._batchDispatchPending = false;
             let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
+            const queuedDecodeToMappedLogits = new Map();
             const resolvePrioritizationStrategy = () => {
                 try {
                     this._ensureNotDisposed();
@@ -205,6 +212,7 @@ export class LlamaContext {
                 for (const queuedDecode of this._queuedDecodes) {
                     const batchItem = {
                         tokens: queuedDecode.tokens,
+                        logits: queuedDecode.logits,
                         evaluationPriority: queuedDecode.evaluationPriority
                     };
                     batchItemToQueuedDecodeMap.set(batchItem, queuedDecode);
@@ -262,15 +270,16 @@ export class LlamaContext {
                 if (currentBatchSize !== 0)
                     this._ctx.initBatch(currentBatchSize);
                 for (const { queuedDecode, processAmount } of batchItems) {
-                    let batchLogitIndex;
+                    let batchLogitIndexes;
+                    const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
+                    const tokenIndexesWithLogitsToProcess = queuedDecode.logits.slice(0, processAmount)
+                        .map((logit, index) => (logit ? index : undefined))
+                        .filter((index) => index != undefined);
+                    const numberOfOutputTokens = tokenIndexesWithLogitsToProcess.length;
+                    TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
+                    TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
                     try {
-                        const shouldGenerateLogitAtTheEnd = queuedDecode.generateLogitAtTheEnd &&
-                            processAmount === queuedDecode.tokens.length;
-                        const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
-                        const numberOfOutputTokens = shouldGenerateLogitAtTheEnd ? 1 : 0;
-                        TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
-                        TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
-                        batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), shouldGenerateLogitAtTheEnd);
+                        batchLogitIndexes = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), Uint32Array.from(tokenIndexesWithLogitsToProcess));
                     }
                     catch (err) {
                         this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
@@ -280,13 +289,23 @@ export class LlamaContext {
                     if (queuedDecode.tokens.length === processAmount) {
                         queuedDecodesToDelete.add(queuedDecode);
                         afterDecodeActions.push({
-                            batchLogitIndex,
-                            response: queuedDecode.response,
-                            onDone: queuedDecode.onDone
+                            queuedDecode,
+                            batchLogitIndexes,
+                            batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
+                            firstTokenIndex: queuedDecode.firstTokenSequenceIndex,
+                            returnResults: true
                         });
                     }
                     else {
+                        if (batchLogitIndexes.length > 0)
+                            afterDecodeActions.push({
+                                queuedDecode,
+                                batchLogitIndexes,
+                                batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
+                                firstTokenIndex: queuedDecode.firstTokenSequenceIndex
+                            });
                         queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
+                        queuedDecode.logits = queuedDecode.logits.slice(processAmount);
                         queuedDecode.firstTokenSequenceIndex += processAmount;
                     }
                 }
@@ -315,18 +334,50 @@ export class LlamaContext {
                         return;
                     }
                 }
-                for (const action of afterDecodeActions) {
-                    const [accept, reject] = action.response;
-                    if (action.onDone != null && action.batchLogitIndex != null) {
-                        try {
-                            accept(action.onDone(action.batchLogitIndex ?? null));
-                        }
-                        catch (err) {
-                            reject(err);
-                        }
+                function finishAfterDecodeAction(action, mappedLogitValues) {
+                    if (mappedLogitValues != null && mappedLogitValues.length > 0) {
+                        if (queuedDecodeToMappedLogits.has(action.queuedDecode))
+                            pushAll(queuedDecodeToMappedLogits.get(action.queuedDecode), mappedLogitValues);
+                        else
+                            queuedDecodeToMappedLogits.set(action.queuedDecode, mappedLogitValues);
+                    }
+                    if (action.returnResults != null) {
+                        const [accept] = action.queuedDecode.response;
+                        const mappedLogits = queuedDecodeToMappedLogits.get(action.queuedDecode) ?? [];
+                        queuedDecodeToMappedLogits.delete(action.queuedDecode);
+                        accept(mappedLogits);
                     }
-                    accept(undefined);
                 }
+                const afterDecodeActionResults = afterDecodeActions.map((action) => {
+                    if (action.batchLogitIndexes.length === 0) {
+                        finishAfterDecodeAction(action);
+                        return undefined;
+                    }
+                    const mappedLogitValues = [];
+                    let promiseChain = undefined;
+                    const batchLogitIndexes = action.batchLogitIndexes;
+                    const batchLogitTokenIndexes = action.batchLogitTokenIndexes;
+                    for (let i = 0; i < batchLogitIndexes.length; i++) {
+                        const tokenIndex = batchLogitTokenIndexes[i];
+                        const mappedValue = promiseChain != null
+                            ? promiseChain
+                                .then(() => action.queuedDecode.logitDataMapper(batchLogitIndexes[i], tokenIndex + action.firstTokenIndex))
+                            : action.queuedDecode.logitDataMapper(batchLogitIndexes[i], tokenIndex + action.firstTokenIndex);
+                        if (mappedValue instanceof Promise) {
+                            promiseChain = mappedValue;
+                            mappedLogitValues.push(mappedValue
+                                .then((value) => [tokenIndex + action.firstTokenIndex, value]));
+                        }
+                        else
+                            mappedLogitValues.push([tokenIndex + action.firstTokenIndex, mappedValue]);
+                    }
+                    if (promiseChain != null)
+                        return Promise.all(mappedLogitValues)
+                            .then((resolvedMappedLogitValues) => finishAfterDecodeAction(action, resolvedMappedLogitValues));
+                    finishAfterDecodeAction(action, mappedLogitValues);
+                    return undefined;
+                });
+                await Promise.all(afterDecodeActionResults);
             };
             const prioritizationStrategy = resolvePrioritizationStrategy();
             if (prioritizationStrategy == null)
@@ -376,17 +427,17 @@ export class LlamaContext {
         await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
     }
     /** @internal */
-    async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5, tokenMeter }, onDone) {
+    async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, logits, evaluationPriority = defaultEvaluationPriority, tokenMeter }, logitDataMapper) {
         return await new Promise((accept, reject) => {
             this._queuedDecodes.push({
                 sequenceId,
                 tokens,
+                logits,
                 firstTokenSequenceIndex,
-                generateLogitAtTheEnd,
                 evaluationPriority,
                 tokenMeter,
                 response: [accept, reject],
-                onDone
+                logitDataMapper
             });
             this._queuedDecodeSequenceIds.add(sequenceId);
             this._scheduleDecode();
@@ -429,10 +480,20 @@ export class LlamaContext {
         const dispatchSchedule = this._batchingOptions.dispatchSchedule;
         if (this._queuedDecodeSequenceIds.size === this._totalSequences)
             dispatch();
-        if (dispatchSchedule === "nextTick")
-            setTimeout(dispatch, 0);
-        else
+        if (dispatchSchedule === "nextCycle") {
+            if (typeof setImmediate === "function")
+                setImmediate(dispatch);
+            else
+                setTimeout(dispatch, 0);
+        }
+        else if (typeof dispatchSchedule === "function")
             dispatchSchedule(dispatch);
+        else {
+            if (typeof setImmediate === "function")
+                setImmediate(dispatch);
+            else
+                setTimeout(dispatch, 0);
+        }
     }
     /** @internal */
     _dispatchErrorForQueuedDecodesAndDequeue(queuedDecodes, err) {
@@ -620,17 +681,27 @@ export class LlamaContextSequence {
     /** @internal */ _gcRegistry;
     /** @internal */ _context;
     /** @internal */ _contextShift;
+    /** @internal */ _tokenPredictor;
     /** @internal */ _tokenMeter;
     /** @internal */ _disposeAggregator = new DisposeAggregator();
+    /** @internal */ _lock = {};
+    /** @internal */ _resetTokenPredictor = false;
+    /** @internal */ _tokenPredictorOwner = {};
     /** @internal */ _contextTokens = [];
     /** @internal */ _nextTokenIndex = 0;
+    /** @internal */ _loadedTokenPredictions = [];
+    /** @internal */ _usedTokenPredictions = 0;
+    /** @internal */ _unusedTokenPredictions = 0;
+    /** @internal */ _validatedTokenPredictions = 0;
+    /** @internal */ _refutedTokenPredictions = 0;
     /** @internal */ _disposed = false;
     onDispose = new EventRelay();
-    constructor({ sequenceId, context, tokenMeter, contextShift }) {
+    constructor({ sequenceId, context, tokenMeter, contextShift, tokenPredictor }) {
         this._sequenceId = sequenceId;
         this._context = context;
         this._tokenMeter = tokenMeter ?? new TokenMeter();
         this._contextShift = contextShift;
+        this._tokenPredictor = tokenPredictor;
         this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
         this._gcRegistry.register(this, sequenceId);
         this._disposeAggregator.add(() => this._gcRegistry.unregister(this));
@@ -639,6 +710,8 @@ export class LlamaContextSequence {
         this._disposeAggregator.add(() => {
             this._context._reclaimUnusedSequenceId(this._sequenceId);
         });
+        if (this._tokenPredictor != null)
+            this._disposeAggregator.add(this._tokenPredictor);
     }
     dispose() {
         if (this._disposed)
@@ -660,20 +733,51 @@ export class LlamaContextSequence {
     get model() {
         return this._context.model;
     }
+    /** The maximum number of tokens that the sequence state can hold */
+    get contextSize() {
+        return this._context.contextSize;
+    }
+    /** The index where the next evaluated token will be placed in the context */
     get nextTokenIndex() {
-        return this._nextTokenIndex;
+        return this._nextTokenIndex - this._loadedTokenPredictions.length;
     }
+    /** The current context state tokens */
     get contextTokens() {
-        return this._contextTokens.slice();
+        if (this._loadedTokenPredictions.length === 0)
+            return this._contextTokens.slice();
+        return this._contextTokens.slice(0, -this._loadedTokenPredictions.length);
     }
     get tokenMeter() {
         return this._tokenMeter;
     }
+    /**
+     * The token predictor used when creating this sequence.
+     */
+    get tokenPredictor() {
+        return this._tokenPredictor;
+    }
+    /**
+     * Statistics of token predictions using the sequence's `tokenPredictor`.
+     *
+     * The statistics change only when token prediction is used in this sequence.
+     *
+     * `validated` + `refuted` = total number of evaluated predictions.
+     *
+     * Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
+     */
+    get tokenPredictions() {
+        return {
+            used: this._usedTokenPredictions,
+            unused: this._unusedTokenPredictions,
+            validated: this._validatedTokenPredictions,
+            refuted: this._refutedTokenPredictions
+        };
+    }
     get isLoadedToMemory() {
         return !this._disposed;
     }
     compareContextTokens(tokens) {
-        for (let i = 0; i < this._contextTokens.length; i++) {
+        for (let i = 0; i < this._contextTokens.length - this._loadedTokenPredictions.length; i++) {
             if (compareTokens(this._contextTokens[i], tokens[i]))
                 continue;
             return {
@@ -681,7 +785,7 @@ export class LlamaContextSequence {
             };
         }
         return {
-            firstDifferentIndex: this._contextTokens.length
+            firstDifferentIndex: this._contextTokens.length - this._loadedTokenPredictions.length
         };
     }
     /**
@@ -695,10 +799,12 @@ export class LlamaContextSequence {
      * which incurs token evaluation of the shifted tokens.
      */
     async adaptStateToTokens(tokens, allowShift = true) {
-        if (this.model.fileInsights.isRecurrent || !allowShift) {
+        const modelSupportsShifting = !this.model.fileInsights.isRecurrent &&
+            this.model.fileInfo.metadata?.general?.architecture !== GgufArchitectureType.deepseek2;
+        if (!modelSupportsShifting || !allowShift) {
             const { firstDifferentIndex } = this.compareContextTokens(tokens);
-            if (firstDifferentIndex < this._nextTokenIndex)
-                await this.eraseContextTokenRanges([{
+            if (firstDifferentIndex < this.nextTokenIndex)
+                await this._eraseContextTokenRanges([{
                         start: firstDifferentIndex,
                         end: this._nextTokenIndex
                     }]);
@@ -707,7 +813,7 @@ export class LlamaContextSequence {
         const eraseRanges = [];
         let tokensIndex = 0;
         let differentTokenIndex = undefined;
-        for (let i = 0; i < this._contextTokens.length && tokensIndex < tokens.length; i++) {
+        for (let i = 0; i < this._contextTokens.length - this._loadedTokenPredictions.length && tokensIndex < tokens.length; i++) {
             if (compareTokens(this._contextTokens[i], tokens[tokensIndex])) {
                 if (differentTokenIndex != null) {
                     eraseRanges.push({
@@ -728,7 +834,7 @@ export class LlamaContextSequence {
                 end: this._nextTokenIndex
             });
         if (eraseRanges.length > 0)
-            await this.eraseContextTokenRanges(eraseRanges);
+            await this._eraseContextTokenRanges(eraseRanges);
     }
     /**
      * Clear the history of the sequence.
@@ -736,14 +842,18 @@ export class LlamaContextSequence {
      */
     async clearHistory() {
         this._ensureNotDisposed();
-        await this.eraseContextTokenRanges([{ start: 0, end: this._nextTokenIndex }]);
+        await this._eraseContextTokenRanges([{ start: 0, end: this._nextTokenIndex }]);
     }
     /**
      * Erase context tokens in the provided ranges to free up space for new tokens to be generated.
      * The start of each range is inclusive, and the end of each range is exclusive.
      * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
      */
-    async eraseContextTokenRanges(ranges) {
+    eraseContextTokenRanges(ranges) {
+        return this._eraseContextTokenRanges(ranges);
+    }
+    /** @internal */
+    async _eraseContextTokenRanges(ranges, { canResetTokenPredictor = true, canRemovePredictionTokens = true, skipLock = false } = {}) {
         this._ensureNotDisposed();
         await withLock(this._context, "context", async () => {
             this._ensureNotDisposed();
@@ -776,6 +886,19 @@ export class LlamaContextSequence {
                 ranges.push(range);
                 return ranges;
             }, []);
+            const tokenPredictionsToRemove = (resolvedRanges.length > 0 && canRemovePredictionTokens)
+                ? this._loadedTokenPredictions.length
+                : 0;
+            if (tokenPredictionsToRemove > 0) {
+                const startDeleteIndex = this._nextTokenIndex - this._loadedTokenPredictions.length;
+                const lastDeleteRange = resolvedRanges[resolvedRanges.length - 1];
+                if (lastDeleteRange.end >= startDeleteIndex)
+                    lastDeleteRange.end = this._nextTokenIndex;
+                else
+                    resolvedRanges.push({ start: startDeleteIndex, end: this._nextTokenIndex });
+                if (canResetTokenPredictor)
+                    await this._abortTokenPredictor(true);
+            }
             let removedTokens = 0;
             let lastDeleteRangeEndPos = null;
             for (const range of resolvedRanges) {
@@ -790,6 +913,8 @@ export class LlamaContextSequence {
                 removedTokens += range.end - range.start;
                 lastDeleteRangeEndPos = range.end;
             }
+            if (tokenPredictionsToRemove > 0)
+                this._loadedTokenPredictions.splice(0, tokenPredictionsToRemove);
             if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 &&
                 lastDeleteRangeEndPos !== this._nextTokenIndex) {
                 this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, this._nextTokenIndex, -removedTokens);
@@ -797,17 +922,62 @@ export class LlamaContextSequence {
                 this._tokenMeter.useTokens(shiftedTokens, "input");
             }
             this._nextTokenIndex -= removedTokens;
+            if (canResetTokenPredictor && removedTokens > 0)
+                await this._abortTokenPredictor(true);
             if (deletionSuccessful)
                 return;
             const newSequenceTokens = this._contextTokens.slice();
             this._nextTokenIndex = 0;
             this._context._ctx.disposeSequence(this._sequenceId);
-            await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens);
+            await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, { _skipLock: skipLock });
         });
     }
-    evaluate(tokens, options = {}) {
-        const { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
-        return this._evaluate(tokens, {
+    /**
+     * Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
+     *
+     * This method uses the token predictor (when provided) to generate new tokens faster.
+     *
+     * Thin wrapper over `evaluateWithMetadata(tokens, {}, options)` that yields only the `token`
+     * field of each result object. A value sent into this generator through `next(value)` is
+     * forwarded to the underlying iterator on its following step.
+     * @param tokens - tokens to evaluate
+     * @param [options] - passed through unchanged to `evaluateWithMetadata`
+     */
+    async *evaluate(tokens, options = {}) {
+        const iterator = this.evaluateWithMetadata(tokens, {}, options); // empty metadata object: neither probabilities nor confidence is requested
+        let iterateInput = undefined;
+        try {
+            while (true) {
+                const { value, done } = await iterator.next(iterateInput);
+                if (done)
+                    return;
+                iterateInput = yield value.token; // whatever the consumer sends back is relayed to the inner iterator on the next loop pass
+            }
+        }
+        finally {
+            await iterator.return(); // always finalize the inner generator, including when the consumer stops early or an error is thrown
+        }
+    }
+    /**
+     * Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
+     *
+     * Configure the additional metadata options to choose which metadata to include.
+     */
+    evaluateWithMetadata(tokens, metadata, options = {}) {
+        const { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
+        if (this._tokenPredictor != null && !_noSampling && tokens.length > 0)
+            return this._speculativeEvaluate(tokens, metadata, {
+                temperature,
+                minP,
+                topK,
+                topP,
+                seed,
+                grammarEvaluationState,
+                repeatPenalty,
+                tokenBias,
+                evaluationPriority,
+                contextShiftOptions: {
+                    size: contextShiftSize,
+                    strategy: contextShiftStrategy
+                },
+                yieldEogToken,
+                tokenPredictor: this._tokenPredictor
+            });
+        return this._evaluate(tokens, metadata, {
             temperature,
             minP,
             topK,
@@ -827,82 +997,205 @@ export class LlamaContextSequence {
     }
     /**
      * Evaluate the provided tokens into the context sequence without generating new tokens.
-     * @param tokens
-     * @param [options]
+     *
+     * Runs `_evaluate` with `generateNewTokens: false` and drains the resulting iterator.
+     * When a token predictor is present, its `reset(...)` is called before the iterator is drained
+     * and the returned value is awaited afterwards; the predictor is given the current context
+     * tokens followed by `tokens` as its `stateTokens`.
+     * @param tokens - tokens to evaluate into the sequence
+     * @param [options] - `evaluationPriority` (defaults to `defaultEvaluationPriority`),
+     * `contextShift` (`size` and `strategy` default to the values on `this._contextShift`),
+     * and the internal `_skipLock` flag (defaults to `false`) that is forwarded to `_evaluate`
      */
-    async evaluateWithoutGeneratingNewTokens(tokens, { evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {} } = {}) {
-        const iterator = this._evaluate(tokens, {
+    async evaluateWithoutGeneratingNewTokens(tokens, options = {}) {
+        const { evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, _skipLock = false } = options;
+        const iterator = this._evaluate(tokens, {}, {
             generateNewTokens: false,
             evaluationPriority,
             contextShiftOptions: {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
-            }
+            },
+            _skipLock
         });
+        const predictorAlignmentPromise = this.tokenPredictor == null // NOTE(review): the check reads `this.tokenPredictor` while the call below uses `this._tokenPredictor` - presumably the former is a getter over the latter; confirm
+            ? undefined
+            : this._tokenPredictor?.reset({
+                stateTokens: [...this._contextTokens, ...tokens],
+                evaluateOptions: {
+                    evaluationPriority,
+                    contextShift: {
+                        size: contextShiftSize,
+                        strategy: contextShiftStrategy
+                    }
+                },
+                targetSequence: this
+            });
+        if (predictorAlignmentPromise != null) {
+            this._tokenPredictorOwner = {}; // fresh owner object: `_speculativeEvaluate` compares owners by identity, so no running generator matches it
+            this._resetTokenPredictor = false; // NOTE(review): `_evaluate` calls `_abortTokenPredictor` once iteration starts, which sets this flag back to `true` - confirm this ordering is intended
+        }
         // eslint-disable-next-line @typescript-eslint/no-unused-vars
         for await (const token of iterator) {
             // Array.from doesn't work with async generators, so we have to iterate over the generator
         }
+        await iterator.return(); // explicitly finalize the generator after it has been drained
+        if (predictorAlignmentPromise != null)
+            await predictorAlignmentPromise;
+    }
+    /**
+     * Evaluate the provided tokens into the context sequence with custom options for each token.
+     *
+     * This method allows for more precise control of the generation process.
+     *
+     * A next token will be generated for a given token only if any of the `generateNext` options for it are used.
+     *
+     * To generate more tokens after this method finishes,
+     * use it again with token(s) you selected to add to the context from the previous evaluation.
+     *
+     * This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
+     * Use the `evaluate` method when you need to use token prediction.
+     * @param input - array whose items are either a token or a `[token, options]` tuple;
+     * the tuple options may carry a `generateNext` object with `token`, `probabilities`, `confidence`
+     * flags and an `options` object of sampling settings
+     * @param [options] - `evaluationPriority`, `contextShift` (`size`, `strategy`) and an optional
+     * `onTokenResult(tokenIndex, output)` callback invoked for every produced output
+     * @returns An array where for each token in the input array, there can be an output item at the same index in the output array.
+     * For indexes that have no output, there won't be any value at the corresponding index in the output array.
+     *
+     * It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
+     */
+    async controlledEvaluate(input, options) {
+        const { evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {} } = options ?? {};
+        const contextShiftOptions = {
+            size: contextShiftSize,
+            strategy: contextShiftStrategy
+        };
+        this._ensureNotDisposed();
+        if (input.length === 0)
+            return [];
+        await this._abortTokenPredictor(); // stops the predictor and erases any loaded predictions from the sequence state
+        const sampler = new LlamaSampler(this.model);
+        const onTokenResult = safeEventCallback(options?.onTokenResult);
+        const logitsArray = []; // sparse: `true` is set only at indexes whose item asks for a next-token output
+        const resolvedTokens = input.map((item, index) => {
+            if (item instanceof Array) {
+                const [token, options] = item;
+                const generateNext = options?.generateNext ?? {};
+                if (generateNext.probabilities === true || generateNext.confidence === true || generateNext.token === true)
+                    logitsArray[index] = true;
+                return token;
+            }
+            return item;
+        });
+        const evaluatorLock = await acquireLock(this._lock, "evaluate"); // released in the `finally` below
+        try {
+            return await this._decodeTokens(resolvedTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, async (batchLogitIndex, tokenIndex) => {
+                const inputToken = input[tokenIndex];
+                const inputOptions = inputToken instanceof Array
+                    ? (inputToken[1] ?? {})
+                    : {};
+                const generateNext = inputOptions.generateNext;
+                if (generateNext == null || ((generateNext.probabilities == null || !generateNext.probabilities) &&
+                    (generateNext.token == null || !generateNext.token) &&
+                    (generateNext.confidence == null || !generateNext.confidence)))
+                    return undefined; // nothing was requested for this index, so it gets no output
+                const sampleOptions = generateNext.options ?? {};
+                const samplerConfig = this._resolveSamplerConfig({
+                    temperature: sampleOptions.temperature,
+                    minP: sampleOptions.minP,
+                    topK: sampleOptions.topK,
+                    topP: sampleOptions.topP,
+                    seed: sampleOptions.seed,
+                    repeatPenalty: sampleOptions.repeatPenalty,
+                    tokenBias: sampleOptions.tokenBias
+                });
+                return await withLock(sampler, "sample", async () => {
+                    if (sampler.disposed)
+                        return undefined;
+                    sampler.applyConfig(samplerConfig);
+                    const [token, probabilities, confidence] = await this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, !!generateNext.probabilities, !!generateNext.confidence);
+                    const output = {
+                        next: {}
+                    };
+                    if (generateNext.token)
+                        output.next.token = token === -1 // a sampled `-1` (treated as a sampling failure in `_evaluate`) is reported here as `null`
+                            ? null
+                            : (token ?? null);
+                    if (confidence != null)
+                        output.next.confidence = confidence;
+                    if (probabilities != null)
+                        output.next.probabilities = reviveTokenProbabilities(probabilities);
+                    onTokenResult?.(tokenIndex, output);
+                    return output;
+                });
+            });
+        }
+        finally {
+            evaluatorLock.dispose();
+            void withLock(sampler, "sample", sampler.asyncDispose); // not awaited; NOTE(review): `asyncDispose` is passed unbound - presumably `withLock` supplies the receiver or the member is pre-bound; confirm
+        }
     }
     /** @internal */
-    async *_evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false }) {
+    async *_evaluate(tokens, metadata, { temperature, minP, topK, topP, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false, _skipLock = false }) {
         this._ensureNotDisposed();
         let evalTokens = tokens;
         if (evalTokens.length === 0)
             return;
+        await this._abortTokenPredictor(false, true);
+        const sampleProbabilities = metadata.probabilities === true;
+        const sampleConfidence = metadata.confidence === true;
         const sampler = new LlamaSampler(this.model);
         try {
             while (true) {
                 this._ensureNotDisposed();
-                // Evaluate to get the next token.
-                const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
-                    if (_noSampling)
-                        return null;
-                    const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
-                        ? repeatPenalty.punishTokens()
-                        : repeatPenalty?.punishTokens;
-                    const maxPunishTokens = Math.max(repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens, repeatPenaltyTokens?.length ?? 0);
-                    const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
-                        ? grammarEvaluationState()
-                        : grammarEvaluationState;
-                    if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
-                        throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
-                    const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
-                    sampler.applyConfig(removeNullFields({
-                        temperature,
-                        minP,
-                        topK,
-                        topP,
-                        seed: Math.max(0, Number.isFinite(seed)
-                            ? Math.floor(seed ?? (Date.now() / 1000))
-                            : Math.floor(Date.now() / 1000)),
-                        repeatPenalty: repeatPenalty?.penalty,
-                        repeatPenaltyMaxTokens: maxPunishTokens,
-                        repeatPenaltyTokens: repeatPenaltyTokens != null
-                            ? Uint32Array.from(repeatPenaltyTokens)
-                            : undefined,
-                        repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
-                        repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
-                        tokenBiasKeys,
-                        tokenBiasValues,
-                        grammarEvaluationState: resolvedGrammarEvaluationState?._state
-                    }));
-                    return withLock(sampler, "sample", async () => {
-                        if (sampler.disposed)
+                const evaluatorLock = _skipLock
+                    ? undefined
+                    : await acquireLock(this._lock, "evaluate");
+                let nextToken;
+                const yieldRes = {};
+                try {
+                    const logitsArray = [];
+                    if (generateNewTokens)
+                        logitsArray[evalTokens.length - 1] = true;
+                    // Evaluate to get the next token.
+                    const decodeResult = await this._decodeTokens(evalTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
+                        if (_noSampling)
                             return null;
-                        return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
+                        const samplerConfig = this._resolveSamplerConfig({
+                            temperature,
+                            minP,
+                            topK,
+                            topP,
+                            seed,
+                            grammarEvaluationState,
+                            repeatPenalty,
+                            tokenBias
+                        });
+                        return withLock(sampler, "sample", async () => {
+                            if (sampler.disposed)
+                                return null;
+                            sampler.applyConfig(samplerConfig);
+                            if (sampleProbabilities || sampleConfidence)
+                                return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, sampleProbabilities, sampleConfidence);
+                            else
+                                return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
+                        });
                     });
-                });
-                if (nextToken === -1)
-                    throw new Error("Failed to sample next token");
-                if (nextToken == null)
-                    return;
-                // the model finished generating text
-                if (!yieldEogToken && this._context.model.isEogToken(nextToken))
-                    break;
-                const replacementToken = (yield nextToken);
+                    const lastDecodeResult = decodeResult[evalTokens.length - 1];
+                    if (lastDecodeResult instanceof Array) {
+                        const [token, probabilities, confidence] = lastDecodeResult;
+                        nextToken = token;
+                        if (probabilities != null)
+                            yieldRes.probabilities = reviveTokenProbabilities(probabilities);
+                        if (confidence != null)
+                            yieldRes.confidence = confidence;
+                    }
+                    else
+                        nextToken = lastDecodeResult;
+                    if (nextToken === -1)
+                        throw new Error("Failed to sample next token");
+                    if (nextToken == null)
+                        return;
+                    // the model finished generating text
+                    if (!yieldEogToken && this._context.model.isEogToken(nextToken))
+                        break;
+                }
+                finally {
+                    evaluatorLock?.dispose();
+                }
+                yieldRes.token = nextToken;
+                const replacementToken = yield yieldRes;
                 // set the tokens for the next evaluation
-                if (replacementToken != null)
+                if (replacementToken instanceof Array)
+                    evalTokens = replacementToken.slice();
+                else if (replacementToken != null)
                     evalTokens = [replacementToken];
                 else
                     evalTokens = [nextToken];
@@ -913,39 +1206,280 @@ export class LlamaContextSequence {
         }
     }
     /**
+     * Generator that evaluates `tokens` and yields one result object per generated token
+     * (`token`, plus `probabilities` / `confidence` when requested through `metadata`).
+     *
+     * On each iteration it appends the tokens returned by `tokenPredictor.predictTokens()` to the
+     * batch and samples an output for each of them in the same decode. Predictions that match the
+     * preceding output are stored in `_loadedTokenPredictions` and served on later iterations
+     * without another decode; from the first mismatch on, the remaining batch tokens are erased
+     * from the context.
+     *
+     * The value sent back through `next(...)` selects the tokens for the following iteration:
+     * an array is copied, a single non-null value is wrapped in an array, otherwise the yielded
+     * token is used.
+     * @internal
+     */
-    async _decodeTokens(tokens, generateLogit, evaluationPriority, tokenMeter, contextShiftOptions, onDecodeDone) {
+    async *_speculativeEvaluate(tokens, metadata, { temperature, minP, topK, topP, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, contextShiftOptions, yieldEogToken = false, tokenPredictor }) {
         this._ensureNotDisposed();
-        const tokensLeftToDecode = tokens.slice();
-        return await withLock(this, "evaluate", async () => {
-            while (tokensLeftToDecode.length > 0) {
+        let evalTokens = tokens.slice();
+        if (evalTokens.length === 0)
+            return;
+        const tokenPredictorOwner = {}; // identity marker: this generator owns the predictor only while `_tokenPredictorOwner` is this exact object
+        this._tokenPredictorOwner = tokenPredictorOwner;
+        await this._abortTokenPredictor();
+        const sampleProbabilities = metadata.probabilities === true;
+        const sampleConfidence = metadata.confidence === true;
+        let logitsArray = [];
+        let logitsStartIndex = evalTokens.length - 1; // index of the last caller-supplied token; its output is the real next token
+        const validatedTokens = []; // NOTE(review): read further down but never appended to anywhere in this method
+        logitsArray[logitsStartIndex] = true;
+        const sampler = new LlamaSampler(this.model);
+        try {
+            while (true) {
                 this._ensureNotDisposed();
-                let freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
-                if (freeSpace <= 0) {
-                    await this._freeUpSpaceForTokens(contextShiftOptions);
-                    freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
-                    if (freeSpace <= 0)
-                        throw new Error("Failed to free up space for new tokens");
+                const evaluatorLock = await acquireLock(this._lock, "evaluate"); // released in the inner `finally`, before yielding
+                let nextToken;
+                const yieldRes = {};
+                try {
+                    if (this._tokenPredictorOwner === tokenPredictorOwner &&
+                        this._loadedTokenPredictions.length > 0 &&
+                        evalTokens.length === 1 &&
+                        evalTokens[0] === this._loadedTokenPredictions[0]?.[0]) { // the single token to evaluate is the input of the oldest loaded prediction
+                        const [token, probabilities, confidence] = this._loadedTokenPredictions.shift()[1]; // entries are `[inputToken, [outputToken, probabilities, confidence]]`
+                        nextToken = token;
+                        yieldRes.token = nextToken;
+                        if (probabilities != null)
+                            yieldRes.probabilities = reviveTokenProbabilities(probabilities);
+                        if (confidence != null)
+                            yieldRes.confidence = confidence;
+                        const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
+                            ? grammarEvaluationState()
+                            : grammarEvaluationState;
+                        if (resolvedGrammarEvaluationState != null)
+                            LlamaSampler._acceptTokenOnGrammarEvaluationState(this._context._llama, resolvedGrammarEvaluationState, nextToken); // presumably advances the grammar state past `nextToken`, since no sampling happens on this path - confirm
+                        this._unusedTokenPredictions--;
+                        this._usedTokenPredictions++;
+                    }
+                    else if (this._tokenPredictorOwner === tokenPredictorOwner && this._loadedTokenPredictions.length > 0) { // loaded predictions do not apply to these tokens
+                        const deleteStartIndex = Math.max(0, this._nextTokenIndex - this._loadedTokenPredictions.length);
+                        await this._eraseContextTokenRanges([{ start: deleteStartIndex, end: this._nextTokenIndex }], { canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock: true });
+                        this._loadedTokenPredictions.length = 0; // drop whatever is left of the loaded predictions
+                    }
+                    if (this._resetTokenPredictor) {
+                        await tokenPredictor.reset({
+                            stateTokens: [...this._contextTokens, ...evalTokens],
+                            evaluateOptions: {
+                                temperature,
+                                minP,
+                                topK,
+                                topP,
+                                seed,
+                                grammarEvaluationState: grammarEvaluationState instanceof Function
+                                    ? grammarEvaluationState()?.clone()
+                                    : grammarEvaluationState?.clone(),
+                                repeatPenalty,
+                                tokenBias,
+                                evaluationPriority,
+                                contextShift: contextShiftOptions,
+                                yieldEogToken: true
+                            },
+                            targetSequence: this
+                        });
+                        this._resetTokenPredictor = false;
+                        this._tokenPredictorOwner = tokenPredictorOwner; // take ownership of the predictor after resetting it
+                    }
+                    if (nextToken == null) { // no loaded prediction was consumed above, so a decode is required
+                        if (this._tokenPredictorOwner === tokenPredictorOwner &&
+                            // prevent incurring context shifts due to token prediction validations
+                            this._nextTokenIndex + evalTokens.length < this._context.contextSize) {
+                            const testGrammarClone = grammarEvaluationState instanceof Function
+                                ? grammarEvaluationState()?.clone()
+                                : grammarEvaluationState?.clone();
+                            for (const token of await tokenPredictor.predictTokens()) {
+                                if (testGrammarClone != null) {
+                                    const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(this.model._llama, testGrammarClone, token);
+                                    if (!canAddToken)
+                                        break; // stop appending predictions at the first one rejected by the grammar check
+                                }
+                                evalTokens.push(token);
+                                logitsArray[evalTokens.length - 1] = true; // request an output for every appended prediction
+                                // prevent incurring context shifts due to token prediction validations
+                                if (this._nextTokenIndex + evalTokens.length >= this._context.contextSize)
+                                    break;
+                            }
+                        }
+                        let resolvedGrammarEvaluationState = undefined;
+                        // Evaluate to get the next token.
+                        const decodeResult = await this._decodeTokens(evalTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex, tokenIndex) => {
+                            if (tokenIndex === logitsStartIndex)
+                                resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
+                                    ? grammarEvaluationState()
+                                    : grammarEvaluationState;
+                            else if (tokenIndex === logitsStartIndex + 1)
+                                resolvedGrammarEvaluationState = resolvedGrammarEvaluationState?.clone(); // from the first predicted token on, sampling uses a clone instead of the caller-provided grammar state
+                            const samplerConfig = this._resolveSamplerConfig({
+                                temperature,
+                                minP,
+                                topK,
+                                topP,
+                                seed,
+                                grammarEvaluationState: resolvedGrammarEvaluationState,
+                                repeatPenalty,
+                                tokenBias
+                            });
+                            return withLock(sampler, "sample", async () => {
+                                if (sampler.disposed)
+                                    return null;
+                                sampler.applyConfig(samplerConfig);
+                                if (sampleProbabilities || sampleConfidence)
+                                    return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, sampleProbabilities, sampleConfidence);
+                                else
+                                    return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
+                            });
+                        });
+                        for (let i = logitsStartIndex; i < evalTokens.length; i++) { // first index carries the real next token; later indexes validate the appended predictions
+                            const item = decodeResult[i];
+                            const [resultToken, probabilities, confidence] = item instanceof Array
+                                ? item
+                                : [item];
+                            if (i === logitsStartIndex) {
+                                if (resultToken === -1)
+                                    throw new Error("Failed to sample next token");
+                                if (resultToken == null)
+                                    return;
+                                nextToken = resultToken;
+                                yieldRes.token = nextToken;
+                                if (probabilities != null)
+                                    yieldRes.probabilities = reviveTokenProbabilities(probabilities);
+                                if (confidence != null)
+                                    yieldRes.confidence = confidence;
+                            }
+                            else {
+                                if (resultToken === -1 || resultToken == null)
+                                    break;
+                                const lastValidatedTokenOutput = i === logitsStartIndex + 1
+                                    ? nextToken
+                                    : validatedTokens.at(-1)?.[1]; // NOTE(review): `validatedTokens` stays empty in this method, so this is `undefined` for every later index - confirm whether that is intended
+                                if (lastValidatedTokenOutput != null && lastValidatedTokenOutput === evalTokens[i]) { // the predicted input equals the output that preceded it, so its own output can be kept
+                                    this._loadedTokenPredictions.push([evalTokens[i], [resultToken, probabilities, confidence]]);
+                                    this._validatedTokenPredictions++;
+                                    this._unusedTokenPredictions++;
+                                }
+                                else {
+                                    const deleteSize = Math.min(evalTokens.length - i, this.context.contextSize); // batch tokens from index `i` to the end, capped at the context size
+                                    this._refutedTokenPredictions += deleteSize;
+                                    const deleteStartIndex = this._nextTokenIndex - deleteSize;
+                                    tokenPredictor.stop(true);
+                                    await this._eraseContextTokenRanges([{
+                                            start: deleteStartIndex,
+                                            end: this._nextTokenIndex
+                                        }], { canResetTokenPredictor: false, canRemovePredictionTokens: false, skipLock: true });
+                                    break; // the assumption that this token will be generated was wrong
+                                }
+                            }
+                        }
+                    }
+                    if (nextToken == null)
+                        throw new Error("Failed to generated next token"); // NOTE(review): message reads "generated" where "generate" was presumably meant
+                    // the model finished generating text
+                    if (!yieldEogToken && this._context.model.isEogToken(nextToken))
+                        break;
                 }
-                const tokensToDecode = tokensLeftToDecode.splice(0, freeSpace);
-                const generateLogitAtTheEnd = generateLogit && tokensLeftToDecode.length === 0;
-                const nextToken = await this._context._decodeTokens({
-                    sequenceId: this._sequenceId,
-                    tokens: tokensToDecode,
-                    firstTokenSequenceIndex: this._nextTokenIndex,
-                    generateLogitAtTheEnd,
-                    evaluationPriority,
-                    tokenMeter
-                }, !generateLogitAtTheEnd
-                    ? undefined
-                    : onDecodeDone);
-                this._nextTokenIndex += tokensToDecode.length;
-                this._contextTokens = this._contextTokens.concat(tokensToDecode);
-                if (generateLogitAtTheEnd && nextToken != null)
-                    return nextToken;
+                finally {
+                    evaluatorLock.dispose();
+                }
+                const replacementToken = yield yieldRes;
+                // set the tokens for the next evaluation
+                if (replacementToken instanceof Array)
+                    evalTokens = replacementToken.slice();
+                else if (replacementToken != null)
+                    evalTokens = [replacementToken];
+                else
+                    evalTokens = [nextToken];
+                if (this._tokenPredictorOwner === tokenPredictorOwner)
+                    tokenPredictor.pushTokens(evalTokens); // only feed the predictor while this generator still owns it
+                logitsArray = [];
+                logitsStartIndex = evalTokens.length - 1;
+                logitsArray[logitsStartIndex] = true;
             }
-            return null;
+        }
+        finally {
+            void withLock(sampler, "sample", sampler.asyncDispose); // sampler disposal is queued under its "sample" lock and not awaited
+            if (this._tokenPredictorOwner === tokenPredictorOwner)
+                tokenPredictor.stop();
+        }
+    }
+    /**
+     * Stops the active token predictor (when there is one) and marks it as needing a reset.
+     *
+     * Unless `skipClearingPredictionsFromState` is set, the tokens of the currently loaded predictions
+     * (the last `_loadedTokenPredictions.length` tokens before `_nextTokenIndex`) are also erased
+     * from the context state.
+     * @param skipClearingPredictionsFromState - when `true`, only stop and flag the predictor; erase nothing
+     * @param skipLock - forwarded to `_eraseContextTokenRanges`
+     * @internal
+     */
+    async _abortTokenPredictor(skipClearingPredictionsFromState = false, skipLock = false) {
+        this._tokenPredictor?.stop();
+        this._resetTokenPredictor = true;
+        if (skipClearingPredictionsFromState)
+            return;
+        const loadedPredictionsCount = this._loadedTokenPredictions.length;
+        if (loadedPredictionsCount > 0) {
+            // the loaded predictions occupy the tail of the evaluated state, right before the next token index
+            const predictionsRange = {
+                start: this._nextTokenIndex - loadedPredictionsCount,
+                end: this._nextTokenIndex
+            };
+            const eraseOptions = { canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock };
+            await this._eraseContextTokenRanges([predictionsRange], eraseOptions);
+        }
+    }
+    /**
+     * Builds the sampler configuration object from the given sampling options.
+     *
+     * `repeatPenalty.punishTokens` and `grammarEvaluationState` may each be passed either as a value
+     * or as a function returning the value; functions are invoked here.
+     * Defaults: `temperature = 0`, `minP = 0`, `topK = 40`, `topP = 0.95`.
+     * The result is passed through `removeNullFields` (presumably dropping unset fields - confirm against the helper).
+     * @throws when the grammar evaluation state was created with a different Llama instance than this sequence's model
+     * @internal
+     */
+    _resolveSamplerConfig({ temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias }) {
+        const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
+            ? repeatPenalty.punishTokens()
+            : repeatPenalty?.punishTokens;
+        // never report a limit smaller than the number of tokens that are actually going to be punished
+        const maxPunishTokens = Math.max(repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens, repeatPenaltyTokens?.length ?? 0);
+        const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
+            ? grammarEvaluationState()
+            : grammarEvaluationState;
+        if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
+            throw new Error("The LlamaGrammar passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
+        const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
+        return removeNullFields({
+            temperature,
+            minP,
+            topK,
+            topP,
+            // a missing or non-finite seed falls back to the current Unix time in seconds;
+            // the value is floored to an integer and clamped to be non-negative
+            seed: Math.max(0, Math.floor(Number.isFinite(seed)
+                ? seed
+                : (Date.now() / 1000))),
+            repeatPenalty: repeatPenalty?.penalty,
+            repeatPenaltyMaxTokens: maxPunishTokens,
+            repeatPenaltyTokens: repeatPenaltyTokens != null
+                ? Uint32Array.from(repeatPenaltyTokens)
+                : undefined,
+            repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
+            repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
+            tokenBiasKeys,
+            tokenBiasValues,
+            grammarEvaluationState: resolvedGrammarEvaluationState?._state
         });
     }
+    /**
+     * Decodes `tokens` into this sequence's context state, splitting the work into several chunks
+     * when the context doesn't have enough free space to take all of the tokens at once.
+     *
+     * The caller of this function has to wrap it with a lock to ensure this function doesn't run concurrently.
+     * @param tokens - the tokens to decode; the passed array is not mutated
+     * @param logits - entries aligned by index with `tokens`, forwarded chunk by chunk to the context's decoder;
+     * the passed array is not mutated
+     * @param evaluationPriority - forwarded as-is to the context's decoder
+     * @param tokenMeter - forwarded as-is to the context's decoder
+     * @param contextShiftOptions - passed to `_freeUpSpaceForTokens` when there is no free space left
+     * @param logitDataMapper - called with a batch logit index and the index of the matching token within `tokens`
+     * @returns an array indexed by position within `tokens`, holding the values generated for those positions
+     * @throws when no space could be freed up for new tokens
+     * @internal
+     */
+    async _decodeTokens(tokens, logits, evaluationPriority, tokenMeter, contextShiftOptions, logitDataMapper) {
+        this._ensureNotDisposed();
+        // work on copies, since both queues are consumed chunk by chunk
+        const tokensLeftToDecode = tokens.slice();
+        const tokenLogitsLeftToDecode = logits.slice();
+        let currentTokenIndex = 0;
+        const res = [];
+        // translate an index in the context state into an index relative to the `tokens` parameter
+        const normalizedLogitDataMapper = (batchLogitIndex, contextStateTokenIndex) => {
+            return logitDataMapper(batchLogitIndex, currentTokenIndex + (contextStateTokenIndex - this._nextTokenIndex));
+        };
+        while (tokensLeftToDecode.length > 0) {
+            this._ensureNotDisposed();
+            let freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
+            if (freeSpace <= 0) {
+                await this._freeUpSpaceForTokens(contextShiftOptions);
+                freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
+                if (freeSpace <= 0)
+                    throw new Error("Failed to free up space for new tokens");
+            }
+            const tokensToDecode = tokensLeftToDecode.splice(0, freeSpace);
+            // consume the matching logits together with the tokens, so that every chunk
+            // is decoded with the logit entries that belong to its own tokens
+            const tokensLogits = tokenLogitsLeftToDecode.splice(0, tokensToDecode.length);
+            const generatedLogits = await this._context._decodeTokens({
+                sequenceId: this._sequenceId,
+                tokens: tokensToDecode,
+                firstTokenSequenceIndex: this._nextTokenIndex,
+                logits: tokensLogits,
+                evaluationPriority,
+                tokenMeter
+            }, normalizedLogitDataMapper);
+            // store every generated value under its position relative to the `tokens` parameter
+            for (const [index, value] of generatedLogits)
+                res[currentTokenIndex + (index - this._nextTokenIndex)] = value;
+            this._nextTokenIndex += tokensToDecode.length;
+            currentTokenIndex += tokensToDecode.length;
+            this._contextTokens = this._contextTokens.concat(tokensToDecode);
+        }
+        return res;
+    }
     /** @internal */
     async _freeUpSpaceForTokens(contextShiftOptions) {
         this._ensureNotDisposed();
@@ -957,7 +1491,7 @@ export class LlamaContextSequence {
             let eraseStartIndex = 0;
             if (this.model.tokens.bos != null && this._contextTokens[0] === this.model.tokens.bos)
                 eraseStartIndex = 1;
-            await this.eraseContextTokenRanges([{ start: eraseStartIndex, end: size + eraseStartIndex }]);
+            await this._eraseContextTokenRanges([{ start: eraseStartIndex, end: size + eraseStartIndex }], { skipLock: true });
         }
         else {
             const ranges = await contextShiftOptions.strategy({
@@ -966,9 +1500,9 @@ export class LlamaContextSequence {
             });
             if (ranges == null)
                 throw new Error("Invalid delete ranges");
-            await this.eraseContextTokenRanges(ranges);
-            if (this.nextTokenIndex >= this._context.contextSize - 1)
-                await this.eraseContextTokenRanges([{ start: 0, end: size }]);
+            await this._eraseContextTokenRanges(ranges, { skipLock: true });
+            if (this._nextTokenIndex >= this._context.contextSize - 1)
+                await this._eraseContextTokenRanges([{ start: 0, end: size }], { skipLock: true });
         }
     }
     /** @internal */
@@ -980,7 +1514,7 @@ export class LlamaContextSequence {
      * We need this to make it impossible to manually create instances of this class outside the code of this library
      * @internal
      */
-    static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
+    static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor }) {
         return new LlamaContextSequence({
             sequenceId,
             context,
@@ -988,7 +1522,8 @@ export class LlamaContextSequence {
             contextShift: {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
-            }
+            },
+            tokenPredictor
         });
     }
 }
@@ -1020,6 +1555,17 @@ function getTokenBiasesForAddon(tokenBias, currentModel) {
         tokenBiasValues: Float32Array.from(tokenBiasValues)
     };
 }
+/**
+ * Rebuilds a token-to-probability `Map` from a flat sequence laid out as
+ * `[token, probability, token, probability, ...]`.
+ * A trailing element that has no partner is ignored.
+ * @param probabilities - the flat sequence, or `null`/`undefined`
+ * @returns the revived `Map`, or `undefined` when `probabilities` is `null` or `undefined`
+ */
+function reviveTokenProbabilities(probabilities) {
+    if (probabilities == null)
+        return undefined;
+    const revived = new Map();
+    // walk the sequence one (token, probability) pair at a time
+    for (let tokenOffset = 0; tokenOffset + 1 < probabilities.length; tokenOffset += 2)
+        revived.set(probabilities[tokenOffset], probabilities[tokenOffset + 1]);
+    return revived;
+}
 function disposeContextIfReferenced(contextRef) {
     const context = contextRef.deref();
     if (context != null)