@realtimex/node-llama-cpp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +108 -0
- package/dist/ChatWrapper.d.ts +27 -0
- package/dist/ChatWrapper.js +233 -0
- package/dist/ChatWrapper.js.map +1 -0
- package/dist/apiDocsIndex.d.ts +1 -0
- package/dist/apiDocsIndex.js +7 -0
- package/dist/apiDocsIndex.js.map +1 -0
- package/dist/bindings/AddonTypes.d.ts +203 -0
- package/dist/bindings/AddonTypes.js +2 -0
- package/dist/bindings/AddonTypes.js.map +1 -0
- package/dist/bindings/Llama.d.ts +104 -0
- package/dist/bindings/Llama.js +570 -0
- package/dist/bindings/Llama.js.map +1 -0
- package/dist/bindings/consts.d.ts +2 -0
- package/dist/bindings/consts.js +13 -0
- package/dist/bindings/consts.js.map +1 -0
- package/dist/bindings/getLlama.d.ts +297 -0
- package/dist/bindings/getLlama.js +569 -0
- package/dist/bindings/getLlama.js.map +1 -0
- package/dist/bindings/types.d.ts +72 -0
- package/dist/bindings/types.js +105 -0
- package/dist/bindings/types.js.map +1 -0
- package/dist/bindings/utils/MemoryOrchestrator.d.ts +23 -0
- package/dist/bindings/utils/MemoryOrchestrator.js +50 -0
- package/dist/bindings/utils/MemoryOrchestrator.js.map +1 -0
- package/dist/bindings/utils/NoBinaryFoundError.d.ts +2 -0
- package/dist/bindings/utils/NoBinaryFoundError.js +7 -0
- package/dist/bindings/utils/NoBinaryFoundError.js.map +1 -0
- package/dist/bindings/utils/asyncEvery.d.ts +5 -0
- package/dist/bindings/utils/asyncEvery.js +15 -0
- package/dist/bindings/utils/asyncEvery.js.map +1 -0
- package/dist/bindings/utils/asyncSome.d.ts +5 -0
- package/dist/bindings/utils/asyncSome.js +29 -0
- package/dist/bindings/utils/asyncSome.js.map +1 -0
- package/dist/bindings/utils/binariesGithubRelease.d.ts +6 -0
- package/dist/bindings/utils/binariesGithubRelease.js +15 -0
- package/dist/bindings/utils/binariesGithubRelease.js.map +1 -0
- package/dist/bindings/utils/clearAllLocalBuilds.d.ts +1 -0
- package/dist/bindings/utils/clearAllLocalBuilds.js +47 -0
- package/dist/bindings/utils/clearAllLocalBuilds.js.map +1 -0
- package/dist/bindings/utils/cloneLlamaCppRepo.d.ts +11 -0
- package/dist/bindings/utils/cloneLlamaCppRepo.js +166 -0
- package/dist/bindings/utils/cloneLlamaCppRepo.js.map +1 -0
- package/dist/bindings/utils/compileLLamaCpp.d.ts +22 -0
- package/dist/bindings/utils/compileLLamaCpp.js +526 -0
- package/dist/bindings/utils/compileLLamaCpp.js.map +1 -0
- package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +18 -0
- package/dist/bindings/utils/detectAvailableComputeLayers.js +311 -0
- package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -0
- package/dist/bindings/utils/detectBuildTools.d.ts +14 -0
- package/dist/bindings/utils/detectBuildTools.js +149 -0
- package/dist/bindings/utils/detectBuildTools.js.map +1 -0
- package/dist/bindings/utils/detectGlibc.d.ts +4 -0
- package/dist/bindings/utils/detectGlibc.js +74 -0
- package/dist/bindings/utils/detectGlibc.js.map +1 -0
- package/dist/bindings/utils/disposeBeforeExit.d.ts +8 -0
- package/dist/bindings/utils/disposeBeforeExit.js +36 -0
- package/dist/bindings/utils/disposeBeforeExit.js.map +1 -0
- package/dist/bindings/utils/getBestComputeLayersAvailable.d.ts +9 -0
- package/dist/bindings/utils/getBestComputeLayersAvailable.js +29 -0
- package/dist/bindings/utils/getBestComputeLayersAvailable.js.map +1 -0
- package/dist/bindings/utils/getBuildFolderNameForBuildOptions.d.ts +6 -0
- package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js +105 -0
- package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js.map +1 -0
- package/dist/bindings/utils/getCanUsePrebuiltBinaries.d.ts +1 -0
- package/dist/bindings/utils/getCanUsePrebuiltBinaries.js +8 -0
- package/dist/bindings/utils/getCanUsePrebuiltBinaries.js.map +1 -0
- package/dist/bindings/utils/getExampleUsageCodeOfGetLlama.d.ts +2 -0
- package/dist/bindings/utils/getExampleUsageCodeOfGetLlama.js +21 -0
- package/dist/bindings/utils/getExampleUsageCodeOfGetLlama.js.map +1 -0
- package/dist/bindings/utils/getGpuTypesToUseForOption.d.ts +12 -0
- package/dist/bindings/utils/getGpuTypesToUseForOption.js +39 -0
- package/dist/bindings/utils/getGpuTypesToUseForOption.js.map +1 -0
- package/dist/bindings/utils/getLinuxDistroInfo.d.ts +9 -0
- package/dist/bindings/utils/getLinuxDistroInfo.js +46 -0
- package/dist/bindings/utils/getLinuxDistroInfo.js.map +1 -0
- package/dist/bindings/utils/getLlamaGpuTypes.d.ts +13 -0
- package/dist/bindings/utils/getLlamaGpuTypes.js +36 -0
- package/dist/bindings/utils/getLlamaGpuTypes.js.map +1 -0
- package/dist/bindings/utils/getLlamaWithoutBackend.d.ts +5 -0
- package/dist/bindings/utils/getLlamaWithoutBackend.js +40 -0
- package/dist/bindings/utils/getLlamaWithoutBackend.js.map +1 -0
- package/dist/bindings/utils/getPlatform.d.ts +2 -0
- package/dist/bindings/utils/getPlatform.js +15 -0
- package/dist/bindings/utils/getPlatform.js.map +1 -0
- package/dist/bindings/utils/getPlatformInfo.d.ts +5 -0
- package/dist/bindings/utils/getPlatformInfo.js +28 -0
- package/dist/bindings/utils/getPlatformInfo.js.map +1 -0
- package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.d.ts +3 -0
- package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.js +27 -0
- package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.js.map +1 -0
- package/dist/bindings/utils/hasFileInPath.d.ts +2 -0
- package/dist/bindings/utils/hasFileInPath.js +34 -0
- package/dist/bindings/utils/hasFileInPath.js.map +1 -0
- package/dist/bindings/utils/lastBuildInfo.d.ts +6 -0
- package/dist/bindings/utils/lastBuildInfo.js +17 -0
- package/dist/bindings/utils/lastBuildInfo.js.map +1 -0
- package/dist/bindings/utils/logBinaryUsageExampleToConsole.d.ts +2 -0
- package/dist/bindings/utils/logBinaryUsageExampleToConsole.js +22 -0
- package/dist/bindings/utils/logBinaryUsageExampleToConsole.js.map +1 -0
- package/dist/bindings/utils/logDistroInstallInstruction.d.ts +14 -0
- package/dist/bindings/utils/logDistroInstallInstruction.js +48 -0
- package/dist/bindings/utils/logDistroInstallInstruction.js.map +1 -0
- package/dist/bindings/utils/resolveActualBindingBinaryPath.d.ts +1 -0
- package/dist/bindings/utils/resolveActualBindingBinaryPath.js +18 -0
- package/dist/bindings/utils/resolveActualBindingBinaryPath.js.map +1 -0
- package/dist/bindings/utils/resolveCustomCmakeOptions.d.ts +1 -0
- package/dist/bindings/utils/resolveCustomCmakeOptions.js +43 -0
- package/dist/bindings/utils/resolveCustomCmakeOptions.js.map +1 -0
- package/dist/bindings/utils/testBindingBinary.d.ts +2 -0
- package/dist/bindings/utils/testBindingBinary.js +269 -0
- package/dist/bindings/utils/testBindingBinary.js.map +1 -0
- package/dist/bindings/utils/testCmakeBinary.d.ts +5 -0
- package/dist/bindings/utils/testCmakeBinary.js +32 -0
- package/dist/bindings/utils/testCmakeBinary.js.map +1 -0
- package/dist/chatWrappers/AlpacaChatWrapper.d.ts +17 -0
- package/dist/chatWrappers/AlpacaChatWrapper.js +33 -0
- package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -0
- package/dist/chatWrappers/ChatMLChatWrapper.d.ts +6 -0
- package/dist/chatWrappers/ChatMLChatWrapper.js +85 -0
- package/dist/chatWrappers/ChatMLChatWrapper.js.map +1 -0
- package/dist/chatWrappers/DeepSeekChatWrapper.d.ts +37 -0
- package/dist/chatWrappers/DeepSeekChatWrapper.js +294 -0
- package/dist/chatWrappers/DeepSeekChatWrapper.js.map +1 -0
- package/dist/chatWrappers/EmptyChatWrapper.d.ts +4 -0
- package/dist/chatWrappers/EmptyChatWrapper.js +5 -0
- package/dist/chatWrappers/EmptyChatWrapper.js.map +1 -0
- package/dist/chatWrappers/FalconChatWrapper.d.ts +19 -0
- package/dist/chatWrappers/FalconChatWrapper.js +126 -0
- package/dist/chatWrappers/FalconChatWrapper.js.map +1 -0
- package/dist/chatWrappers/FunctionaryChatWrapper.d.ts +17 -0
- package/dist/chatWrappers/FunctionaryChatWrapper.js +622 -0
- package/dist/chatWrappers/FunctionaryChatWrapper.js.map +1 -0
- package/dist/chatWrappers/GemmaChatWrapper.d.ts +7 -0
- package/dist/chatWrappers/GemmaChatWrapper.js +96 -0
- package/dist/chatWrappers/GemmaChatWrapper.js.map +1 -0
- package/dist/chatWrappers/GeneralChatWrapper.d.ts +19 -0
- package/dist/chatWrappers/GeneralChatWrapper.js +140 -0
- package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -0
- package/dist/chatWrappers/HarmonyChatWrapper.d.ts +78 -0
- package/dist/chatWrappers/HarmonyChatWrapper.js +539 -0
- package/dist/chatWrappers/HarmonyChatWrapper.js.map +1 -0
- package/dist/chatWrappers/Llama2ChatWrapper.d.ts +12 -0
- package/dist/chatWrappers/Llama2ChatWrapper.js +95 -0
- package/dist/chatWrappers/Llama2ChatWrapper.js.map +1 -0
- package/dist/chatWrappers/Llama3ChatWrapper.d.ts +16 -0
- package/dist/chatWrappers/Llama3ChatWrapper.js +173 -0
- package/dist/chatWrappers/Llama3ChatWrapper.js.map +1 -0
- package/dist/chatWrappers/Llama3_1ChatWrapper.d.ts +32 -0
- package/dist/chatWrappers/Llama3_1ChatWrapper.js +290 -0
- package/dist/chatWrappers/Llama3_1ChatWrapper.js.map +1 -0
- package/dist/chatWrappers/Llama3_2LightweightChatWrapper.d.ts +35 -0
- package/dist/chatWrappers/Llama3_2LightweightChatWrapper.js +264 -0
- package/dist/chatWrappers/Llama3_2LightweightChatWrapper.js.map +1 -0
- package/dist/chatWrappers/MistralChatWrapper.d.ts +16 -0
- package/dist/chatWrappers/MistralChatWrapper.js +180 -0
- package/dist/chatWrappers/MistralChatWrapper.js.map +1 -0
- package/dist/chatWrappers/QwenChatWrapper.d.ts +36 -0
- package/dist/chatWrappers/QwenChatWrapper.js +344 -0
- package/dist/chatWrappers/QwenChatWrapper.js.map +1 -0
- package/dist/chatWrappers/SeedChatWrapper.d.ts +25 -0
- package/dist/chatWrappers/SeedChatWrapper.js +183 -0
- package/dist/chatWrappers/SeedChatWrapper.js.map +1 -0
- package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.d.ts +138 -0
- package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js +665 -0
- package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js.map +1 -0
- package/dist/chatWrappers/generic/TemplateChatWrapper.d.ts +76 -0
- package/dist/chatWrappers/generic/TemplateChatWrapper.js +212 -0
- package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -0
- package/dist/chatWrappers/generic/utils/UniqueIdGenerator.d.ts +7 -0
- package/dist/chatWrappers/generic/utils/UniqueIdGenerator.js +30 -0
- package/dist/chatWrappers/generic/utils/UniqueIdGenerator.js.map +1 -0
- package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.d.ts +24 -0
- package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js +45 -0
- package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js.map +1 -0
- package/dist/chatWrappers/generic/utils/extractFunctionCallSettingsFromJinjaTemplate.d.ts +25 -0
- package/dist/chatWrappers/generic/utils/extractFunctionCallSettingsFromJinjaTemplate.js +690 -0
- package/dist/chatWrappers/generic/utils/extractFunctionCallSettingsFromJinjaTemplate.js.map +1 -0
- package/dist/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.d.ts +2 -0
- package/dist/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.js +66 -0
- package/dist/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.js.map +1 -0
- package/dist/chatWrappers/generic/utils/getFirstValidResult.d.ts +6 -0
- package/dist/chatWrappers/generic/utils/getFirstValidResult.js +19 -0
- package/dist/chatWrappers/generic/utils/getFirstValidResult.js.map +1 -0
- package/dist/chatWrappers/generic/utils/squashChatHistoryItems.d.ts +2 -0
- package/dist/chatWrappers/generic/utils/squashChatHistoryItems.js +35 -0
- package/dist/chatWrappers/generic/utils/squashChatHistoryItems.js.map +1 -0
- package/dist/chatWrappers/generic/utils/templateSegmentOptionsToChatWrapperSettings.d.ts +22 -0
- package/dist/chatWrappers/generic/utils/templateSegmentOptionsToChatWrapperSettings.js +28 -0
- package/dist/chatWrappers/generic/utils/templateSegmentOptionsToChatWrapperSettings.js.map +1 -0
- package/dist/chatWrappers/utils/ChatModelFunctionsDocumentationGenerator.d.ts +76 -0
- package/dist/chatWrappers/utils/ChatModelFunctionsDocumentationGenerator.js +177 -0
- package/dist/chatWrappers/utils/ChatModelFunctionsDocumentationGenerator.js.map +1 -0
- package/dist/chatWrappers/utils/chunkChatItems.d.ts +10 -0
- package/dist/chatWrappers/utils/chunkChatItems.js +44 -0
- package/dist/chatWrappers/utils/chunkChatItems.js.map +1 -0
- package/dist/chatWrappers/utils/getModelLinageNames.d.ts +2 -0
- package/dist/chatWrappers/utils/getModelLinageNames.js +18 -0
- package/dist/chatWrappers/utils/getModelLinageNames.js.map +1 -0
- package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.d.ts +4 -0
- package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +394 -0
- package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -0
- package/dist/chatWrappers/utils/isLlama3_2LightweightModel.d.ts +2 -0
- package/dist/chatWrappers/utils/isLlama3_2LightweightModel.js +9 -0
- package/dist/chatWrappers/utils/isLlama3_2LightweightModel.js.map +1 -0
- package/dist/chatWrappers/utils/jsonDumps.d.ts +7 -0
- package/dist/chatWrappers/utils/jsonDumps.js +18 -0
- package/dist/chatWrappers/utils/jsonDumps.js.map +1 -0
- package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +148 -0
- package/dist/chatWrappers/utils/resolveChatWrapper.js +325 -0
- package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -0
- package/dist/cli/cli.d.ts +2 -0
- package/dist/cli/cli.js +45 -0
- package/dist/cli/cli.js.map +1 -0
- package/dist/cli/commands/ChatCommand.d.ts +59 -0
- package/dist/cli/commands/ChatCommand.js +856 -0
- package/dist/cli/commands/ChatCommand.js.map +1 -0
- package/dist/cli/commands/CompleteCommand.d.ts +47 -0
- package/dist/cli/commands/CompleteCommand.js +658 -0
- package/dist/cli/commands/CompleteCommand.js.map +1 -0
- package/dist/cli/commands/DebugCommand.d.ts +7 -0
- package/dist/cli/commands/DebugCommand.js +55 -0
- package/dist/cli/commands/DebugCommand.js.map +1 -0
- package/dist/cli/commands/InfillCommand.d.ts +49 -0
- package/dist/cli/commands/InfillCommand.js +693 -0
- package/dist/cli/commands/InfillCommand.js.map +1 -0
- package/dist/cli/commands/InitCommand.d.ts +12 -0
- package/dist/cli/commands/InitCommand.js +230 -0
- package/dist/cli/commands/InitCommand.js.map +1 -0
- package/dist/cli/commands/OnPostInstallCommand.d.ts +4 -0
- package/dist/cli/commands/OnPostInstallCommand.js +62 -0
- package/dist/cli/commands/OnPostInstallCommand.js.map +1 -0
- package/dist/cli/commands/PullCommand.d.ts +13 -0
- package/dist/cli/commands/PullCommand.js +158 -0
- package/dist/cli/commands/PullCommand.js.map +1 -0
- package/dist/cli/commands/inspect/InspectCommand.d.ts +4 -0
- package/dist/cli/commands/inspect/InspectCommand.js +21 -0
- package/dist/cli/commands/inspect/InspectCommand.js.map +1 -0
- package/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts +17 -0
- package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js +275 -0
- package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map +1 -0
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts +13 -0
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +230 -0
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -0
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.d.ts +4 -0
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +296 -0
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -0
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +26 -0
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +869 -0
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -0
- package/dist/cli/commands/source/SourceCommand.d.ts +4 -0
- package/dist/cli/commands/source/SourceCommand.js +19 -0
- package/dist/cli/commands/source/SourceCommand.js.map +1 -0
- package/dist/cli/commands/source/commands/BuildCommand.d.ts +16 -0
- package/dist/cli/commands/source/commands/BuildCommand.js +148 -0
- package/dist/cli/commands/source/commands/BuildCommand.js.map +1 -0
- package/dist/cli/commands/source/commands/ClearCommand.d.ts +7 -0
- package/dist/cli/commands/source/commands/ClearCommand.js +54 -0
- package/dist/cli/commands/source/commands/ClearCommand.js.map +1 -0
- package/dist/cli/commands/source/commands/DownloadCommand.d.ts +16 -0
- package/dist/cli/commands/source/commands/DownloadCommand.js +219 -0
- package/dist/cli/commands/source/commands/DownloadCommand.js.map +1 -0
- package/dist/cli/projectTemplates.d.ts +7 -0
- package/dist/cli/projectTemplates.js +10 -0
- package/dist/cli/projectTemplates.js.map +1 -0
- package/dist/cli/recommendedModels.d.ts +2 -0
- package/dist/cli/recommendedModels.js +428 -0
- package/dist/cli/recommendedModels.js.map +1 -0
- package/dist/cli/startCreateCli.d.ts +2 -0
- package/dist/cli/startCreateCli.js +26 -0
- package/dist/cli/startCreateCli.js.map +1 -0
- package/dist/cli/utils/ConsoleInteraction.d.ts +22 -0
- package/dist/cli/utils/ConsoleInteraction.js +122 -0
- package/dist/cli/utils/ConsoleInteraction.js.map +1 -0
- package/dist/cli/utils/ConsoleTable.d.ts +24 -0
- package/dist/cli/utils/ConsoleTable.js +90 -0
- package/dist/cli/utils/ConsoleTable.js.map +1 -0
- package/dist/cli/utils/basicChooseFromListConsoleInteraction.d.ts +13 -0
- package/dist/cli/utils/basicChooseFromListConsoleInteraction.js +111 -0
- package/dist/cli/utils/basicChooseFromListConsoleInteraction.js.map +1 -0
- package/dist/cli/utils/consolePromptQuestion.d.ts +6 -0
- package/dist/cli/utils/consolePromptQuestion.js +81 -0
- package/dist/cli/utils/consolePromptQuestion.js.map +1 -0
- package/dist/cli/utils/getReadablePath.d.ts +1 -0
- package/dist/cli/utils/getReadablePath.js +14 -0
- package/dist/cli/utils/getReadablePath.js.map +1 -0
- package/dist/cli/utils/interactivelyAskForModel.d.ts +13 -0
- package/dist/cli/utils/interactivelyAskForModel.js +485 -0
- package/dist/cli/utils/interactivelyAskForModel.js.map +1 -0
- package/dist/cli/utils/isRunningUnderRosetta.d.ts +1 -0
- package/dist/cli/utils/isRunningUnderRosetta.js +20 -0
- package/dist/cli/utils/isRunningUnderRosetta.js.map +1 -0
- package/dist/cli/utils/logUsedGpuTypeOption.d.ts +2 -0
- package/dist/cli/utils/logUsedGpuTypeOption.js +9 -0
- package/dist/cli/utils/logUsedGpuTypeOption.js.map +1 -0
- package/dist/cli/utils/packageJsonConfig.d.ts +6 -0
- package/dist/cli/utils/packageJsonConfig.js +51 -0
- package/dist/cli/utils/packageJsonConfig.js.map +1 -0
- package/dist/cli/utils/packageManager.d.ts +1 -0
- package/dist/cli/utils/packageManager.js +15 -0
- package/dist/cli/utils/packageManager.js.map +1 -0
- package/dist/cli/utils/parseXtcArg.d.ts +5 -0
- package/dist/cli/utils/parseXtcArg.js +16 -0
- package/dist/cli/utils/parseXtcArg.js.map +1 -0
- package/dist/cli/utils/printCommonInfoLines.d.ts +12 -0
- package/dist/cli/utils/printCommonInfoLines.js +163 -0
- package/dist/cli/utils/printCommonInfoLines.js.map +1 -0
- package/dist/cli/utils/printInfoLine.d.ts +12 -0
- package/dist/cli/utils/printInfoLine.js +54 -0
- package/dist/cli/utils/printInfoLine.js.map +1 -0
- package/dist/cli/utils/printModelDestination.d.ts +2 -0
- package/dist/cli/utils/printModelDestination.js +11 -0
- package/dist/cli/utils/printModelDestination.js.map +1 -0
- package/dist/cli/utils/projectTemplates.d.ts +19 -0
- package/dist/cli/utils/projectTemplates.js +47 -0
- package/dist/cli/utils/projectTemplates.js.map +1 -0
- package/dist/cli/utils/renderModelCompatibilityPercentageWithColors.d.ts +6 -0
- package/dist/cli/utils/renderModelCompatibilityPercentageWithColors.js +14 -0
- package/dist/cli/utils/renderModelCompatibilityPercentageWithColors.js.map +1 -0
- package/dist/cli/utils/resolveCommandGgufPath.d.ts +19 -0
- package/dist/cli/utils/resolveCommandGgufPath.js +123 -0
- package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -0
- package/dist/cli/utils/resolveHeaderFlag.d.ts +1 -0
- package/dist/cli/utils/resolveHeaderFlag.js +21 -0
- package/dist/cli/utils/resolveHeaderFlag.js.map +1 -0
- package/dist/cli/utils/resolveModelRecommendationFileOptions.d.ts +14 -0
- package/dist/cli/utils/resolveModelRecommendationFileOptions.js +12 -0
- package/dist/cli/utils/resolveModelRecommendationFileOptions.js.map +1 -0
- package/dist/cli/utils/resolveNpmrcConfig.d.ts +18 -0
- package/dist/cli/utils/resolveNpmrcConfig.js +129 -0
- package/dist/cli/utils/resolveNpmrcConfig.js.map +1 -0
- package/dist/cli/utils/splitAnsiToLines.d.ts +1 -0
- package/dist/cli/utils/splitAnsiToLines.js +32 -0
- package/dist/cli/utils/splitAnsiToLines.js.map +1 -0
- package/dist/cli/utils/toBytes.d.ts +1 -0
- package/dist/cli/utils/toBytes.js +5 -0
- package/dist/cli/utils/toBytes.js.map +1 -0
- package/dist/cli/utils/withCliCommandDescriptionDocsUrl.d.ts +2 -0
- package/dist/cli/utils/withCliCommandDescriptionDocsUrl.js +23 -0
- package/dist/cli/utils/withCliCommandDescriptionDocsUrl.js.map +1 -0
- package/dist/commands.d.ts +6 -0
- package/dist/commands.js +9 -0
- package/dist/commands.js.map +1 -0
- package/dist/config.d.ts +79 -0
- package/dist/config.js +127 -0
- package/dist/config.js.map +1 -0
- package/dist/consts.d.ts +4 -0
- package/dist/consts.js +11 -0
- package/dist/consts.js.map +1 -0
- package/dist/evaluator/LlamaChat/LlamaChat.d.ts +500 -0
- package/dist/evaluator/LlamaChat/LlamaChat.js +2696 -0
- package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -0
- package/dist/evaluator/LlamaChat/utils/FunctionCallNameGrammar.d.ts +11 -0
- package/dist/evaluator/LlamaChat/utils/FunctionCallNameGrammar.js +55 -0
- package/dist/evaluator/LlamaChat/utils/FunctionCallNameGrammar.js.map +1 -0
- package/dist/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.d.ts +16 -0
- package/dist/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.js +45 -0
- package/dist/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.js.map +1 -0
- package/dist/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.d.ts +8 -0
- package/dist/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.js +12 -0
- package/dist/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.js.map +1 -0
- package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.d.ts +16 -0
- package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js +260 -0
- package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js.map +1 -0
- package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +520 -0
- package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +628 -0
- package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -0
- package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.d.ts +45 -0
- package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js +191 -0
- package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js.map +1 -0
- package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.d.ts +15 -0
- package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js +16 -0
- package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js.map +1 -0
- package/dist/evaluator/LlamaCompletion.d.ts +219 -0
- package/dist/evaluator/LlamaCompletion.js +498 -0
- package/dist/evaluator/LlamaCompletion.js.map +1 -0
- package/dist/evaluator/LlamaContext/LlamaContext.d.ts +336 -0
- package/dist/evaluator/LlamaContext/LlamaContext.js +1919 -0
- package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -0
- package/dist/evaluator/LlamaContext/LlamaContextSequenceCheckpoints.d.ts +27 -0
- package/dist/evaluator/LlamaContext/LlamaContextSequenceCheckpoints.js +130 -0
- package/dist/evaluator/LlamaContext/LlamaContextSequenceCheckpoints.js.map +1 -0
- package/dist/evaluator/LlamaContext/LlamaSampler.d.ts +1 -0
- package/dist/evaluator/LlamaContext/LlamaSampler.js +39 -0
- package/dist/evaluator/LlamaContext/LlamaSampler.js.map +1 -0
- package/dist/evaluator/LlamaContext/TokenPredictor.d.ts +55 -0
- package/dist/evaluator/LlamaContext/TokenPredictor.js +20 -0
- package/dist/evaluator/LlamaContext/TokenPredictor.js.map +1 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts +56 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js +266 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map +1 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts +58 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js +138 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map +1 -0
- package/dist/evaluator/LlamaContext/types.d.ts +602 -0
- package/dist/evaluator/LlamaContext/types.js +2 -0
- package/dist/evaluator/LlamaContext/types.js.map +1 -0
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.d.ts +5 -0
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js +16 -0
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js.map +1 -0
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.d.ts +5 -0
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js +42 -0
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map +1 -0
- package/dist/evaluator/LlamaContext/utils/padSafeContextSize.d.ts +1 -0
- package/dist/evaluator/LlamaContext/utils/padSafeContextSize.js +18 -0
- package/dist/evaluator/LlamaContext/utils/padSafeContextSize.js.map +1 -0
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.d.ts +2 -0
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js +13 -0
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map +1 -0
- package/dist/evaluator/LlamaEmbedding.d.ts +21 -0
- package/dist/evaluator/LlamaEmbedding.js +53 -0
- package/dist/evaluator/LlamaEmbedding.js.map +1 -0
- package/dist/evaluator/LlamaEmbeddingContext.d.ts +52 -0
- package/dist/evaluator/LlamaEmbeddingContext.js +86 -0
- package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -0
- package/dist/evaluator/LlamaGrammar.d.ts +39 -0
- package/dist/evaluator/LlamaGrammar.js +72 -0
- package/dist/evaluator/LlamaGrammar.js.map +1 -0
- package/dist/evaluator/LlamaGrammarEvaluationState.d.ts +19 -0
- package/dist/evaluator/LlamaGrammarEvaluationState.js +29 -0
- package/dist/evaluator/LlamaGrammarEvaluationState.js.map +1 -0
- package/dist/evaluator/LlamaJsonSchemaGrammar.d.ts +17 -0
- package/dist/evaluator/LlamaJsonSchemaGrammar.js +35 -0
- package/dist/evaluator/LlamaJsonSchemaGrammar.js.map +1 -0
- package/dist/evaluator/LlamaModel/LlamaModel.d.ts +344 -0
- package/dist/evaluator/LlamaModel/LlamaModel.js +853 -0
- package/dist/evaluator/LlamaModel/LlamaModel.js.map +1 -0
- package/dist/evaluator/LlamaModel/utils/TokenAttributes.d.ts +29 -0
- package/dist/evaluator/LlamaModel/utils/TokenAttributes.js +65 -0
- package/dist/evaluator/LlamaModel/utils/TokenAttributes.js.map +1 -0
- package/dist/evaluator/LlamaRankingContext.d.ts +91 -0
- package/dist/evaluator/LlamaRankingContext.js +178 -0
- package/dist/evaluator/LlamaRankingContext.js.map +1 -0
- package/dist/evaluator/TokenBias.d.ts +37 -0
- package/dist/evaluator/TokenBias.js +68 -0
- package/dist/evaluator/TokenBias.js.map +1 -0
- package/dist/evaluator/TokenMeter.d.ts +45 -0
- package/dist/evaluator/TokenMeter.js +74 -0
- package/dist/evaluator/TokenMeter.js.map +1 -0
- package/dist/evaluator/utils/chunkDocument.d.ts +86 -0
- package/dist/evaluator/utils/chunkDocument.js +212 -0
- package/dist/evaluator/utils/chunkDocument.js.map +1 -0
- package/dist/gguf/consts.d.ts +4 -0
- package/dist/gguf/consts.js +12 -0
- package/dist/gguf/consts.js.map +1 -0
- package/dist/gguf/errors/InvalidGgufMagicError.d.ts +3 -0
- package/dist/gguf/errors/InvalidGgufMagicError.js +6 -0
- package/dist/gguf/errors/InvalidGgufMagicError.js.map +1 -0
- package/dist/gguf/errors/UnsupportedGgufValueTypeError.d.ts +4 -0
- package/dist/gguf/errors/UnsupportedGgufValueTypeError.js +9 -0
- package/dist/gguf/errors/UnsupportedGgufValueTypeError.js.map +1 -0
- package/dist/gguf/fileReaders/GgufFileReader.d.ts +36 -0
- package/dist/gguf/fileReaders/GgufFileReader.js +106 -0
- package/dist/gguf/fileReaders/GgufFileReader.js.map +1 -0
- package/dist/gguf/fileReaders/GgufFsFileReader.d.ts +16 -0
- package/dist/gguf/fileReaders/GgufFsFileReader.js +62 -0
- package/dist/gguf/fileReaders/GgufFsFileReader.js.map +1 -0
- package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.d.ts +28 -0
- package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js +94 -0
- package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js.map +1 -0
- package/dist/gguf/insights/GgufInsights.d.ts +78 -0
- package/dist/gguf/insights/GgufInsights.js +854 -0
- package/dist/gguf/insights/GgufInsights.js.map +1 -0
- package/dist/gguf/insights/GgufInsightsConfigurationResolver.d.ts +203 -0
- package/dist/gguf/insights/GgufInsightsConfigurationResolver.js +284 -0
- package/dist/gguf/insights/GgufInsightsConfigurationResolver.js.map +1 -0
- package/dist/gguf/insights/GgufInsightsTokens.d.ts +5 -0
- package/dist/gguf/insights/GgufInsightsTokens.js +40 -0
- package/dist/gguf/insights/GgufInsightsTokens.js.map +1 -0
- package/dist/gguf/insights/utils/getRamUsageFromUnifiedVram.d.ts +5 -0
- package/dist/gguf/insights/utils/getRamUsageFromUnifiedVram.js +7 -0
- package/dist/gguf/insights/utils/getRamUsageFromUnifiedVram.js.map +1 -0
- package/dist/gguf/insights/utils/resolveContextContextSizeOption.d.ts +33 -0
- package/dist/gguf/insights/utils/resolveContextContextSizeOption.js +117 -0
- package/dist/gguf/insights/utils/resolveContextContextSizeOption.js.map +1 -0
- package/dist/gguf/insights/utils/resolveModelGpuLayersOption.d.ts +20 -0
- package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js +251 -0
- package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js.map +1 -0
- package/dist/gguf/insights/utils/scoreLevels.d.ts +5 -0
- package/dist/gguf/insights/utils/scoreLevels.js +16 -0
- package/dist/gguf/insights/utils/scoreLevels.js.map +1 -0
- package/dist/gguf/parser/GgufV2Parser.d.ts +20 -0
- package/dist/gguf/parser/GgufV2Parser.js +184 -0
- package/dist/gguf/parser/GgufV2Parser.js.map +1 -0
- package/dist/gguf/parser/GgufV3Parser.d.ts +3 -0
- package/dist/gguf/parser/GgufV3Parser.js +4 -0
- package/dist/gguf/parser/GgufV3Parser.js.map +1 -0
- package/dist/gguf/parser/parseGguf.d.ts +8 -0
- package/dist/gguf/parser/parseGguf.js +61 -0
- package/dist/gguf/parser/parseGguf.js.map +1 -0
- package/dist/gguf/readGgufFileInfo.d.ts +54 -0
- package/dist/gguf/readGgufFileInfo.js +82 -0
- package/dist/gguf/readGgufFileInfo.js.map +1 -0
- package/dist/gguf/types/GgufFileInfoTypes.d.ts +85 -0
- package/dist/gguf/types/GgufFileInfoTypes.js +18 -0
- package/dist/gguf/types/GgufFileInfoTypes.js.map +1 -0
- package/dist/gguf/types/GgufMetadataTypes.d.ts +480 -0
- package/dist/gguf/types/GgufMetadataTypes.js +194 -0
- package/dist/gguf/types/GgufMetadataTypes.js.map +1 -0
- package/dist/gguf/types/GgufTensorInfoTypes.d.ts +63 -0
- package/dist/gguf/types/GgufTensorInfoTypes.js +54 -0
- package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -0
- package/dist/gguf/utils/GgufReadOffset.d.ts +6 -0
- package/dist/gguf/utils/GgufReadOffset.js +18 -0
- package/dist/gguf/utils/GgufReadOffset.js.map +1 -0
- package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.d.ts +6 -0
- package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js +76 -0
- package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js.map +1 -0
- package/dist/gguf/utils/getGgufFileTypeName.d.ts +4 -0
- package/dist/gguf/utils/getGgufFileTypeName.js +13 -0
- package/dist/gguf/utils/getGgufFileTypeName.js.map +1 -0
- package/dist/gguf/utils/getGgufMetadataArchitectureData.d.ts +3 -0
- package/dist/gguf/utils/getGgufMetadataArchitectureData.js +4 -0
- package/dist/gguf/utils/getGgufMetadataArchitectureData.js.map +1 -0
- package/dist/gguf/utils/getGgufMetadataKeyValue.d.ts +1 -0
- package/dist/gguf/utils/getGgufMetadataKeyValue.js +27 -0
- package/dist/gguf/utils/getGgufMetadataKeyValue.js.map +1 -0
- package/dist/gguf/utils/ggufQuantNames.d.ts +2 -0
- package/dist/gguf/utils/ggufQuantNames.js +42 -0
- package/dist/gguf/utils/ggufQuantNames.js.map +1 -0
- package/dist/gguf/utils/normalizeGgufDownloadUrl.d.ts +2 -0
- package/dist/gguf/utils/normalizeGgufDownloadUrl.js +18 -0
- package/dist/gguf/utils/normalizeGgufDownloadUrl.js.map +1 -0
- package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.d.ts +2 -0
- package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js +38 -0
- package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js.map +1 -0
- package/dist/gguf/utils/resolveSplitGgufParts.d.ts +7 -0
- package/dist/gguf/utils/resolveSplitGgufParts.js +64 -0
- package/dist/gguf/utils/resolveSplitGgufParts.js.map +1 -0
- package/dist/index.d.ts +71 -0
- package/dist/index.js +64 -0
- package/dist/index.js.map +1 -0
- package/dist/state.d.ts +6 -0
- package/dist/state.js +22 -0
- package/dist/state.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/dist/types.d.ts +408 -0
- package/dist/types.js +13 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/DisposeGuard.d.ts +13 -0
- package/dist/utils/DisposeGuard.js +120 -0
- package/dist/utils/DisposeGuard.js.map +1 -0
- package/dist/utils/InsufficientMemoryError.d.ts +3 -0
- package/dist/utils/InsufficientMemoryError.js +6 -0
- package/dist/utils/InsufficientMemoryError.js.map +1 -0
- package/dist/utils/LlamaText.d.ts +92 -0
- package/dist/utils/LlamaText.js +527 -0
- package/dist/utils/LlamaText.js.map +1 -0
- package/dist/utils/LruCache.d.ts +12 -0
- package/dist/utils/LruCache.js +44 -0
- package/dist/utils/LruCache.js.map +1 -0
- package/dist/utils/OpenAIFormat.d.ts +177 -0
- package/dist/utils/OpenAIFormat.js +488 -0
- package/dist/utils/OpenAIFormat.js.map +1 -0
- package/dist/utils/OverridesObject.d.ts +7 -0
- package/dist/utils/OverridesObject.js +2 -0
- package/dist/utils/OverridesObject.js.map +1 -0
- package/dist/utils/ReplHistory.d.ts +9 -0
- package/dist/utils/ReplHistory.js +72 -0
- package/dist/utils/ReplHistory.js.map +1 -0
- package/dist/utils/StopGenerationDetector.d.ts +47 -0
- package/dist/utils/StopGenerationDetector.js +291 -0
- package/dist/utils/StopGenerationDetector.js.map +1 -0
- package/dist/utils/ThreadsSplitter.d.ts +32 -0
- package/dist/utils/ThreadsSplitter.js +177 -0
- package/dist/utils/ThreadsSplitter.js.map +1 -0
- package/dist/utils/TokenStreamRegulator.d.ts +38 -0
- package/dist/utils/TokenStreamRegulator.js +200 -0
- package/dist/utils/TokenStreamRegulator.js.map +1 -0
- package/dist/utils/UnsupportedError.d.ts +2 -0
- package/dist/utils/UnsupportedError.js +7 -0
- package/dist/utils/UnsupportedError.js.map +1 -0
- package/dist/utils/appendUserMessageToChatHistory.d.ts +6 -0
- package/dist/utils/appendUserMessageToChatHistory.js +22 -0
- package/dist/utils/appendUserMessageToChatHistory.js.map +1 -0
- package/dist/utils/clearTempFolder.d.ts +1 -0
- package/dist/utils/clearTempFolder.js +16 -0
- package/dist/utils/clearTempFolder.js.map +1 -0
- package/dist/utils/cmake.d.ts +10 -0
- package/dist/utils/cmake.js +146 -0
- package/dist/utils/cmake.js.map +1 -0
- package/dist/utils/compareTokens.d.ts +2 -0
- package/dist/utils/compareTokens.js +4 -0
- package/dist/utils/compareTokens.js.map +1 -0
- package/dist/utils/createModelDownloader.d.ts +262 -0
- package/dist/utils/createModelDownloader.js +486 -0
- package/dist/utils/createModelDownloader.js.map +1 -0
- package/dist/utils/findBestOption.d.ts +4 -0
- package/dist/utils/findBestOption.js +15 -0
- package/dist/utils/findBestOption.js.map +1 -0
- package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.d.ts +20 -0
- package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.js +85 -0
- package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.js.map +1 -0
- package/dist/utils/gbnfJson/GbnfGrammarGenerator.d.ts +19 -0
- package/dist/utils/gbnfJson/GbnfGrammarGenerator.js +60 -0
- package/dist/utils/gbnfJson/GbnfGrammarGenerator.js.map +1 -0
- package/dist/utils/gbnfJson/GbnfTerminal.d.ts +11 -0
- package/dist/utils/gbnfJson/GbnfTerminal.js +54 -0
- package/dist/utils/gbnfJson/GbnfTerminal.js.map +1 -0
- package/dist/utils/gbnfJson/getGbnfGrammarForGbnfJsonSchema.d.ts +5 -0
- package/dist/utils/gbnfJson/getGbnfGrammarForGbnfJsonSchema.js +11 -0
- package/dist/utils/gbnfJson/getGbnfGrammarForGbnfJsonSchema.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfAnyJson.d.ts +9 -0
- package/dist/utils/gbnfJson/terminals/GbnfAnyJson.js +53 -0
- package/dist/utils/gbnfJson/terminals/GbnfAnyJson.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfArray.d.ts +18 -0
- package/dist/utils/gbnfJson/terminals/GbnfArray.js +83 -0
- package/dist/utils/gbnfJson/terminals/GbnfArray.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfBoolean.d.ts +7 -0
- package/dist/utils/gbnfJson/terminals/GbnfBoolean.js +22 -0
- package/dist/utils/gbnfJson/terminals/GbnfBoolean.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfBooleanValue.d.ts +7 -0
- package/dist/utils/gbnfJson/terminals/GbnfBooleanValue.js +17 -0
- package/dist/utils/gbnfJson/terminals/GbnfBooleanValue.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfCommaWhitespace.d.ts +11 -0
- package/dist/utils/gbnfJson/terminals/GbnfCommaWhitespace.js +28 -0
- package/dist/utils/gbnfJson/terminals/GbnfCommaWhitespace.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfFormatString.d.ts +11 -0
- package/dist/utils/gbnfJson/terminals/GbnfFormatString.js +90 -0
- package/dist/utils/gbnfJson/terminals/GbnfFormatString.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfGrammar.d.ts +9 -0
- package/dist/utils/gbnfJson/terminals/GbnfGrammar.js +23 -0
- package/dist/utils/gbnfJson/terminals/GbnfGrammar.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfInsideStringChar.d.ts +5 -0
- package/dist/utils/gbnfJson/terminals/GbnfInsideStringChar.js +24 -0
- package/dist/utils/gbnfJson/terminals/GbnfInsideStringChar.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfNull.d.ts +5 -0
- package/dist/utils/gbnfJson/terminals/GbnfNull.js +11 -0
- package/dist/utils/gbnfJson/terminals/GbnfNull.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfNumber.d.ts +9 -0
- package/dist/utils/gbnfJson/terminals/GbnfNumber.js +22 -0
- package/dist/utils/gbnfJson/terminals/GbnfNumber.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfNumberValue.d.ts +9 -0
- package/dist/utils/gbnfJson/terminals/GbnfNumberValue.js +21 -0
- package/dist/utils/gbnfJson/terminals/GbnfNumberValue.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfObjectMap.d.ts +28 -0
- package/dist/utils/gbnfJson/terminals/GbnfObjectMap.js +88 -0
- package/dist/utils/gbnfJson/terminals/GbnfObjectMap.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfOr.d.ts +9 -0
- package/dist/utils/gbnfJson/terminals/GbnfOr.js +34 -0
- package/dist/utils/gbnfJson/terminals/GbnfOr.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfRef.d.ts +15 -0
- package/dist/utils/gbnfJson/terminals/GbnfRef.js +34 -0
- package/dist/utils/gbnfJson/terminals/GbnfRef.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfRepetition.d.ts +15 -0
- package/dist/utils/gbnfJson/terminals/GbnfRepetition.js +86 -0
- package/dist/utils/gbnfJson/terminals/GbnfRepetition.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfString.d.ts +12 -0
- package/dist/utils/gbnfJson/terminals/GbnfString.js +43 -0
- package/dist/utils/gbnfJson/terminals/GbnfString.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfStringValue.d.ts +8 -0
- package/dist/utils/gbnfJson/terminals/GbnfStringValue.js +26 -0
- package/dist/utils/gbnfJson/terminals/GbnfStringValue.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfVerbatimText.d.ts +6 -0
- package/dist/utils/gbnfJson/terminals/GbnfVerbatimText.js +21 -0
- package/dist/utils/gbnfJson/terminals/GbnfVerbatimText.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfWhitespace.d.ts +13 -0
- package/dist/utils/gbnfJson/terminals/GbnfWhitespace.js +67 -0
- package/dist/utils/gbnfJson/terminals/GbnfWhitespace.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/gbnfConsts.d.ts +30 -0
- package/dist/utils/gbnfJson/terminals/gbnfConsts.js +72 -0
- package/dist/utils/gbnfJson/terminals/gbnfConsts.js.map +1 -0
- package/dist/utils/gbnfJson/types.d.ts +213 -0
- package/dist/utils/gbnfJson/types.js +30 -0
- package/dist/utils/gbnfJson/types.js.map +1 -0
- package/dist/utils/gbnfJson/utils/GbnfJsonScopeState.d.ts +10 -0
- package/dist/utils/gbnfJson/utils/GbnfJsonScopeState.js +15 -0
- package/dist/utils/gbnfJson/utils/GbnfJsonScopeState.js.map +1 -0
- package/dist/utils/gbnfJson/utils/defsScope.d.ts +7 -0
- package/dist/utils/gbnfJson/utils/defsScope.js +17 -0
- package/dist/utils/gbnfJson/utils/defsScope.js.map +1 -0
- package/dist/utils/gbnfJson/utils/getGbnfJsonTerminalForGbnfJsonSchema.d.ts +5 -0
- package/dist/utils/gbnfJson/utils/getGbnfJsonTerminalForGbnfJsonSchema.js +143 -0
- package/dist/utils/gbnfJson/utils/getGbnfJsonTerminalForGbnfJsonSchema.js.map +1 -0
- package/dist/utils/gbnfJson/utils/getGbnfJsonTerminalForLiteral.d.ts +5 -0
- package/dist/utils/gbnfJson/utils/getGbnfJsonTerminalForLiteral.js +16 -0
- package/dist/utils/gbnfJson/utils/getGbnfJsonTerminalForLiteral.js.map +1 -0
- package/dist/utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.d.ts +8 -0
- package/dist/utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js +242 -0
- package/dist/utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js.map +1 -0
- package/dist/utils/getBuildDefaults.d.ts +5 -0
- package/dist/utils/getBuildDefaults.js +9 -0
- package/dist/utils/getBuildDefaults.js.map +1 -0
- package/dist/utils/getChatWrapperSegmentDefinition.d.ts +2 -0
- package/dist/utils/getChatWrapperSegmentDefinition.js +9 -0
- package/dist/utils/getChatWrapperSegmentDefinition.js.map +1 -0
- package/dist/utils/getConsoleLogPrefix.d.ts +1 -0
- package/dist/utils/getConsoleLogPrefix.js +10 -0
- package/dist/utils/getConsoleLogPrefix.js.map +1 -0
- package/dist/utils/getFirstWritableDir.d.ts +8 -0
- package/dist/utils/getFirstWritableDir.js +60 -0
- package/dist/utils/getFirstWritableDir.js.map +1 -0
- package/dist/utils/getGrammarsFolder.d.ts +2 -0
- package/dist/utils/getGrammarsFolder.js +19 -0
- package/dist/utils/getGrammarsFolder.js.map +1 -0
- package/dist/utils/getLlamaClasses.d.ts +9 -0
- package/dist/utils/getLlamaClasses.js +14 -0
- package/dist/utils/getLlamaClasses.js.map +1 -0
- package/dist/utils/getModuleVersion.d.ts +1 -0
- package/dist/utils/getModuleVersion.js +13 -0
- package/dist/utils/getModuleVersion.js.map +1 -0
- package/dist/utils/getQueuedTokensBeforeStopTrigger.d.ts +6 -0
- package/dist/utils/getQueuedTokensBeforeStopTrigger.js +22 -0
- package/dist/utils/getQueuedTokensBeforeStopTrigger.js.map +1 -0
- package/dist/utils/getReadableContextSize.d.ts +1 -0
- package/dist/utils/getReadableContextSize.js +7 -0
- package/dist/utils/getReadableContextSize.js.map +1 -0
- package/dist/utils/getTempDir.d.ts +10 -0
- package/dist/utils/getTempDir.js +121 -0
- package/dist/utils/getTempDir.js.map +1 -0
- package/dist/utils/getTypeScriptTypeStringForGbnfJsonSchema.d.ts +2 -0
- package/dist/utils/getTypeScriptTypeStringForGbnfJsonSchema.js +205 -0
- package/dist/utils/getTypeScriptTypeStringForGbnfJsonSchema.js.map +1 -0
- package/dist/utils/gitReleaseBundles.d.ts +2 -0
- package/dist/utils/gitReleaseBundles.js +132 -0
- package/dist/utils/gitReleaseBundles.js.map +1 -0
- package/dist/utils/hashString.d.ts +1 -0
- package/dist/utils/hashString.js +8 -0
- package/dist/utils/hashString.js.map +1 -0
- package/dist/utils/includesText.d.ts +1 -0
- package/dist/utils/includesText.js +12 -0
- package/dist/utils/includesText.js.map +1 -0
- package/dist/utils/isLockfileActive.d.ts +4 -0
- package/dist/utils/isLockfileActive.js +12 -0
- package/dist/utils/isLockfileActive.js.map +1 -0
- package/dist/utils/isToken.d.ts +2 -0
- package/dist/utils/isToken.js +4 -0
- package/dist/utils/isToken.js.map +1 -0
- package/dist/utils/isUrl.d.ts +1 -0
- package/dist/utils/isUrl.js +15 -0
- package/dist/utils/isUrl.js.map +1 -0
- package/dist/utils/mergeUnionTypes.d.ts +10 -0
- package/dist/utils/mergeUnionTypes.js +2 -0
- package/dist/utils/mergeUnionTypes.js.map +1 -0
- package/dist/utils/modelDownloadEndpoints.d.ts +13 -0
- package/dist/utils/modelDownloadEndpoints.js +27 -0
- package/dist/utils/modelDownloadEndpoints.js.map +1 -0
- package/dist/utils/modelFileAccessTokens.d.ts +5 -0
- package/dist/utils/modelFileAccessTokens.js +41 -0
- package/dist/utils/modelFileAccessTokens.js.map +1 -0
- package/dist/utils/optionsMatrix.d.ts +58 -0
- package/dist/utils/optionsMatrix.js +97 -0
- package/dist/utils/optionsMatrix.js.map +1 -0
- package/dist/utils/parseModelFileName.d.ts +15 -0
- package/dist/utils/parseModelFileName.js +132 -0
- package/dist/utils/parseModelFileName.js.map +1 -0
- package/dist/utils/parseModelUri.d.ts +40 -0
- package/dist/utils/parseModelUri.js +346 -0
- package/dist/utils/parseModelUri.js.map +1 -0
- package/dist/utils/parseTextTemplate.d.ts +66 -0
- package/dist/utils/parseTextTemplate.js +116 -0
- package/dist/utils/parseTextTemplate.js.map +1 -0
- package/dist/utils/prettyPrintObject.d.ts +10 -0
- package/dist/utils/prettyPrintObject.js +84 -0
- package/dist/utils/prettyPrintObject.js.map +1 -0
- package/dist/utils/pushAll.d.ts +6 -0
- package/dist/utils/pushAll.js +11 -0
- package/dist/utils/pushAll.js.map +1 -0
- package/dist/utils/removeNullFields.d.ts +2 -0
- package/dist/utils/removeNullFields.js +17 -0
- package/dist/utils/removeNullFields.js.map +1 -0
- package/dist/utils/resolveGithubRelease.d.ts +2 -0
- package/dist/utils/resolveGithubRelease.js +77 -0
- package/dist/utils/resolveGithubRelease.js.map +1 -0
- package/dist/utils/resolveLastTokens.d.ts +2 -0
- package/dist/utils/resolveLastTokens.js +12 -0
- package/dist/utils/resolveLastTokens.js.map +1 -0
- package/dist/utils/resolveModelDestination.d.ts +16 -0
- package/dist/utils/resolveModelDestination.js +54 -0
- package/dist/utils/resolveModelDestination.js.map +1 -0
- package/dist/utils/resolveModelFile.d.ts +142 -0
- package/dist/utils/resolveModelFile.js +201 -0
- package/dist/utils/resolveModelFile.js.map +1 -0
- package/dist/utils/runtime.d.ts +4 -0
- package/dist/utils/runtime.js +8 -0
- package/dist/utils/runtime.js.map +1 -0
- package/dist/utils/safeEventCallback.d.ts +6 -0
- package/dist/utils/safeEventCallback.js +29 -0
- package/dist/utils/safeEventCallback.js.map +1 -0
- package/dist/utils/signalSleep.d.ts +1 -0
- package/dist/utils/signalSleep.js +20 -0
- package/dist/utils/signalSleep.js.map +1 -0
- package/dist/utils/spawnCommand.d.ts +11 -0
- package/dist/utils/spawnCommand.js +89 -0
- package/dist/utils/spawnCommand.js.map +1 -0
- package/dist/utils/tokenizeInput.d.ts +3 -0
- package/dist/utils/tokenizeInput.js +14 -0
- package/dist/utils/tokenizeInput.js.map +1 -0
- package/dist/utils/tokenizerUtils.d.ts +12 -0
- package/dist/utils/tokenizerUtils.js +32 -0
- package/dist/utils/tokenizerUtils.js.map +1 -0
- package/dist/utils/transformPromisable.d.ts +54 -0
- package/dist/utils/transformPromisable.js +95 -0
- package/dist/utils/transformPromisable.js.map +1 -0
- package/dist/utils/truncateTextAndRoundToWords.d.ts +11 -0
- package/dist/utils/truncateTextAndRoundToWords.js +110 -0
- package/dist/utils/truncateTextAndRoundToWords.js.map +1 -0
- package/dist/utils/utilTypes.d.ts +18 -0
- package/dist/utils/utilTypes.js +2 -0
- package/dist/utils/utilTypes.js.map +1 -0
- package/dist/utils/waitForLockfileRelease.d.ts +5 -0
- package/dist/utils/waitForLockfileRelease.js +19 -0
- package/dist/utils/waitForLockfileRelease.js.map +1 -0
- package/dist/utils/withLockfile.d.ts +7 -0
- package/dist/utils/withLockfile.js +44 -0
- package/dist/utils/withLockfile.js.map +1 -0
- package/dist/utils/withOra.d.ts +7 -0
- package/dist/utils/withOra.js +37 -0
- package/dist/utils/withOra.js.map +1 -0
- package/dist/utils/withProgressLog.d.ts +22 -0
- package/dist/utils/withProgressLog.js +211 -0
- package/dist/utils/withProgressLog.js.map +1 -0
- package/dist/utils/withStatusLogs.d.ts +6 -0
- package/dist/utils/withStatusLogs.js +25 -0
- package/dist/utils/withStatusLogs.js.map +1 -0
- package/dist/utils/wrapAbortSignal.d.ts +1 -0
- package/dist/utils/wrapAbortSignal.js +14 -0
- package/dist/utils/wrapAbortSignal.js.map +1 -0
- package/llama/.clang-format +46 -0
- package/llama/CMakeLists.txt +141 -0
- package/llama/addon/AddonContext.cpp +1181 -0
- package/llama/addon/AddonContext.h +85 -0
- package/llama/addon/AddonGrammar.cpp +92 -0
- package/llama/addon/AddonGrammar.h +22 -0
- package/llama/addon/AddonGrammarEvaluationState.cpp +36 -0
- package/llama/addon/AddonGrammarEvaluationState.h +17 -0
- package/llama/addon/AddonModel.cpp +691 -0
- package/llama/addon/AddonModel.h +64 -0
- package/llama/addon/AddonModelData.cpp +25 -0
- package/llama/addon/AddonModelData.h +15 -0
- package/llama/addon/AddonModelLora.cpp +103 -0
- package/llama/addon/AddonModelLora.h +28 -0
- package/llama/addon/AddonSampler.cpp +669 -0
- package/llama/addon/AddonSampler.h +75 -0
- package/llama/addon/RingBuffer.h +109 -0
- package/llama/addon/addon.cpp +330 -0
- package/llama/addon/addonGlobals.cpp +22 -0
- package/llama/addon/addonGlobals.h +12 -0
- package/llama/addon/globals/addonLog.cpp +143 -0
- package/llama/addon/globals/addonLog.h +24 -0
- package/llama/addon/globals/addonProgress.cpp +15 -0
- package/llama/addon/globals/addonProgress.h +15 -0
- package/llama/addon/globals/getGpuInfo.cpp +146 -0
- package/llama/addon/globals/getGpuInfo.h +11 -0
- package/llama/addon/globals/getMemoryInfo.cpp +63 -0
- package/llama/addon/globals/getMemoryInfo.h +4 -0
- package/llama/addon/globals/getSwapInfo.cpp +69 -0
- package/llama/addon/globals/getSwapInfo.h +4 -0
- package/llama/binariesGithubRelease.json +3 -0
- package/llama/cmake/addVariantSuffix.cmake +21 -0
- package/llama/cmake/win32.ensureNinjaPath.cmake +68 -0
- package/llama/cmake/win32.ensureNodeLib.cmake +34 -0
- package/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake +12 -0
- package/llama/cmake/win32.llvmEnsureCmakeAr.cmake +37 -0
- package/llama/cmake/win32.llvmUseGnuModeCompilers.cmake +87 -0
- package/llama/cmake/win32.programFilesPaths.cmake +35 -0
- package/llama/gpuInfo/vulkan-gpu-info.cpp +207 -0
- package/llama/gpuInfo/vulkan-gpu-info.h +9 -0
- package/llama/package.json +5 -0
- package/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake +14 -0
- package/llama/profiles/llvm.win32.host-x64.target-arm64.cmake +14 -0
- package/llama/profiles/llvm.win32.host-x64.target-x64.cmake +14 -0
- package/llama/toolchains/darwin.host-x64.target-arm64.cmake +8 -0
- package/llama/toolchains/linux.host-arm64.target-x64.cmake +5 -0
- package/llama/toolchains/linux.host-x64.target-arm64.cmake +5 -0
- package/llama/toolchains/linux.host-x64.target-arm71.cmake +5 -0
- package/llama/toolchains/llvm.win32.host-x64.target-x64.cmake +20 -0
- package/llama/toolchains/win32.host-arm64.target-arm64.cmake +21 -0
- package/llama/toolchains/win32.host-x64.target-arm64.cmake +21 -0
- package/llama/xpack/package.json +10 -0
- package/package.json +241 -0
- package/templates/README.md +6 -0
- package/templates/package.json +10 -0
- package/templates/packed/electron-typescript-react.json +1 -0
- package/templates/packed/node-typescript.json +1 -0
|
@@ -0,0 +1,1919 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import { acquireLock, AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
|
|
3
|
+
import { removeNullFields } from "../../utils/removeNullFields.js";
|
|
4
|
+
import { compareTokens } from "../../utils/compareTokens.js";
|
|
5
|
+
import { DisposeGuard } from "../../utils/DisposeGuard.js";
|
|
6
|
+
import { TokenMeter } from "../TokenMeter.js";
|
|
7
|
+
import { UnsupportedError } from "../../utils/UnsupportedError.js";
|
|
8
|
+
import { pushAll } from "../../utils/pushAll.js";
|
|
9
|
+
import { safeEventCallback } from "../../utils/safeEventCallback.js";
|
|
10
|
+
import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
|
|
11
|
+
import { LlamaLogLevel } from "../../bindings/types.js";
|
|
12
|
+
import { resolveGgmlTypeOption } from "../../gguf/types/GgufTensorInfoTypes.js";
|
|
13
|
+
import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
|
|
14
|
+
import { LlamaSampler } from "./LlamaSampler.js";
|
|
15
|
+
import { padSafeContextSize } from "./utils/padSafeContextSize.js";
|
|
16
|
+
import { LlamaContextSequenceCheckpoints } from "./LlamaContextSequenceCheckpoints.js";
|
|
17
|
+
// Default scale applied to a LoRA adapter when no explicit scale is given.
const defaultLoraScale = 1;
// Floor for the context size when retrying a failed context creation with a shrunken size.
const shrinkRetriesMinContextSize = 4096;
// Default window of recent tokens considered for repeat-penalty punishment.
const defaultMaxPunishTokens = 64;
// Remedy used when context creation fails: retry up to `retries` times,
// presumably shrinking the requested context size by `autoContextSizeShrink`
// per attempt — TODO confirm against the retry logic that consumes this.
const defaultFailedCreationRemedy = {
    retries: 16,
    autoContextSizeShrink: 0.16
};
// Default priority assigned to queued evaluations when the caller doesn't specify one.
const defaultEvaluationPriority = 5;
// Default sequence breakers for the DRY repeat penalty.
// NOTE(review): the identifier spelling "Penality" is kept as-is — renaming would break references.
const defaultDryRepeatPenalitySequenceBreakers = ["\n", ":", '"', "*"];
// Default sequence checkpoint configuration (`maxMemory: null` means no memory cap — TODO confirm).
const defaultCheckpointOptions = {
    max: 32,
    interval: 8192,
    maxMemory: null
};
// Named checkpoint categories used internally by sequences; each caps how many
// checkpoints of that category may exist at once.
export const internalCheckpoints = {
    speculative: {
        name: "speculative",
        maxCheckpoints: 2
    },
    chatSequenceStart: {
        name: "sequenceStart",
        maxCheckpoints: 1
    },
    chatGrammarEnd: {
        name: "grammarEnd",
        maxCheckpoints: 1
    }
};
// Shared lock token(s) used to serialize decode calls as a backend workaround
// (the `vulkanLock` object is used as a lock key — see the decode path that acquires it).
const decodeSyncWorkaround = {
    vulkanLock: {}
};
|
|
48
|
+
export class LlamaContext {
|
|
49
|
+
    // Backend handles and configuration captured at construction time.
    /** @internal */ _llama;
    /** @internal */ _ctx;
    /** @internal */ _onReclaimUnusedSequenceId = new EventRelay();
    /** @internal */ _backendContextDisposeGuard;
    /** @internal */ _model;
    /** @internal */ _contextSize;
    /** @internal */ _batchSize;
    /** @internal */ _flashAttention;
    // Thread budget: ideal is the preferred count, min is the guaranteed floor.
    /** @internal */ _idealThreads;
    /** @internal */ _minThreads;
    /** @internal */ _performanceTracking;
    // Experimental KV-cache quantization types (undefined when not overridden).
    /** @internal */ _kvCacheKeyType;
    /** @internal */ _kvCacheValueType;
    // Sequence-id bookkeeping: ids freed by disposed sequences are recycled
    // through _unusedSequenceIds before new ids are generated.
    /** @internal */ _totalSequences;
    /** @internal */ _unusedSequenceIds = [];
    /** @internal */ _batchingOptions;
    /** @internal */ _swaFullCache = false;
    // Decode queue state shared by all sequences of this context.
    /** @internal */ _queuedDecodeSequenceIds = new Set();
    /** @internal */ _queuedDecodes = [];
    /** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
    /** @internal */ _modelPreventDisposalHandle;
    /** @internal */ _loraAdapters = new Set();
    /** @internal */ _nextGeneratedSequenceId = 0;
    // Batch-dispatch scheduling flags (see dispatchPendingBatch).
    /** @internal */ _dispatchDecodeScheduled = false;
    /** @internal */ _batchDispatchPending = false;
    /** @internal */ _threadSplitterConsumer;
    /** @internal */ _freeReservedThreadsTimeout;
    /** @internal */ _currentDispatchBatchHandle = {};
    // Lazily cached result of the backend's getContextSize() (see getAllocatedContextSize).
    /** @internal */ _allocatedContextSize;
    /** @internal */ _disposed = false;
    // Fired once when this context gets disposed.
    onDispose = new EventRelay();
|
|
80
|
+
    // Creates a context bound to `_model`. The second argument carries the user
    // options; `_embeddings`/`_ranking` are internal flags forwarded to the
    // native AddonContext. Statement order matters: dispose guards and the
    // prevent-disposal handle must exist before the native context is created.
    constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextCycle", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false, experimentalKvCacheKeyType, experimentalKvCacheValueType, _embeddings, _ranking }) {
        if (_model.disposed)
            throw new DisposedError();
        this._llama = _model._llama;
        this._model = _model;
        // Guard chained to the model's guard so the model cannot be disposed
        // out from under this context, and vice versa for orderly teardown.
        this._backendContextDisposeGuard = new DisposeGuard([this._model._backendModelDisposeGuard]);
        this._modelPreventDisposalHandle = this._model._backendModelDisposeGuard.createPreventDisposalHandle();
        this._totalSequences = sequences;
        this._contextSize = contextSize;
        // The batch must be able to hold at least one token per sequence.
        this._batchSize = Math.max(batchSize, this._totalSequences);
        this._flashAttention = flashAttention;
        // `threads` may be a plain number (ideal count, min of 1) or an
        // {ideal, min} object; both paths are normalized by the thread splitter.
        this._idealThreads = typeof threads === "number"
            ? this._llama._threadsSplitter.normalizeThreadsValue(threads)
            : this._llama._threadsSplitter.normalizeThreadsValue(threads?.ideal ?? (this._llama.maxThreads === 0
                ? this._llama.cpuMathCores
                : this._llama.maxThreads));
        this._minThreads = Math.max(1, typeof threads === "number"
            ? 1
            : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1));
        this._performanceTracking = !!performanceTracking;
        this._kvCacheKeyType = experimentalKvCacheKeyType;
        this._kvCacheValueType = experimentalKvCacheValueType;
        this._swaFullCache = !!swaFullCache;
        // Create the native context. `removeNullFields` strips null/undefined
        // options so the addon only sees explicitly-set values.
        this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({
            contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own <contextSize> of cells
            batchSize: this._batchSize + ((!this._swaFullCache && this.model.fileInsights.swaSize != null && this.model.fileInsights.swaSize > 0)
                ? 1 // +1 to handle edge cases with SWA KV cache
                : 0),
            sequences: this._totalSequences,
            flashAttention: this._flashAttention,
            threads: this._idealThreads,
            embeddings: _embeddings,
            ranking: _ranking,
            performanceTracking: this._performanceTracking,
            kvCacheKeyType: this._kvCacheKeyType,
            kvCacheValueType: this._kvCacheValueType,
            swaFullCache: this._swaFullCache
        }));
        this._batchingOptions = {
            dispatchSchedule: batchingDispatchSchedule,
            itemPrioritizationStrategy: batchingItemsPrioritizationStrategy
        };
        // Bind methods that are later passed around as bare callbacks.
        this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
        this._freeReservedThreads = this._freeReservedThreads.bind(this);
        // Teardown steps run in the order registered: mark disposed, release
        // event relays, then dispose the native context under its guard lock
        // before finally allowing the model to be disposed.
        this._disposeAggregator.add(() => {
            this._disposed = true;
        });
        this._disposeAggregator.add(this._onReclaimUnusedSequenceId);
        this._disposeAggregator.add(this.onDispose.dispatchEvent);
        // WeakRef so this listener doesn't keep the context alive on its own.
        this._disposeAggregator.add(this.model.onDispose.createListener(disposeContextIfReferenced.bind(null, new WeakRef(this))));
        this._disposeAggregator.add(async () => {
            await this._backendContextDisposeGuard.acquireDisposeLock();
            await this._ctx.dispose();
            this._modelPreventDisposalHandle.dispose();
        });
    }
|
|
136
|
+
async dispose() {
|
|
137
|
+
if (this._disposed)
|
|
138
|
+
return;
|
|
139
|
+
this._disposed = true;
|
|
140
|
+
await this._disposeAggregator.dispose();
|
|
141
|
+
}
|
|
142
|
+
/** @hidden */
|
|
143
|
+
[Symbol.asyncDispose]() {
|
|
144
|
+
return this.dispose();
|
|
145
|
+
}
|
|
146
|
+
    /** Whether this context has been disposed. */
    get disposed() {
        return this._disposed;
    }
    /** The model this context was created from. */
    get model() {
        return this._model;
    }
    /** The context size (in tokens) available to each sequence. */
    get contextSize() {
        return this._contextSize;
    }
    /** The effective batch size (at least one slot per sequence — see the constructor). */
    get batchSize() {
        return this._batchSize;
    }
    /** Whether flash attention was enabled for this context. */
    get flashAttention() {
        return this._flashAttention;
    }
    /** The experimental KV-cache key type, or `undefined` when not overridden. */
    get kvCacheKeyType() {
        return this._kvCacheKeyType;
    }
    /** The experimental KV-cache value type, or `undefined` when not overridden. */
    get kvCacheValueType() {
        return this._kvCacheValueType;
    }
|
|
167
|
+
    /**
     * The actual size of the state in the memory in bytes.
     * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
     * @throws {DisposedError} when the context has been disposed.
     */
    get stateSize() {
        this._ensureNotDisposed();
        return this._ctx.getStateSize();
    }
    /** The number of threads currently used to evaluate tokens */
    get currentThreads() {
        this._ensureNotDisposed();
        return this._ctx.getThreads();
    }
    /**
     * The number of threads that are preferred to be used to evaluate tokens.
     *
     * The actual number of threads used may be lower when other evaluations are running in parallel.
     */
    get idealThreads() {
        return this._idealThreads;
    }
|
|
188
|
+
getAllocatedContextSize() {
|
|
189
|
+
this._ensureNotDisposed();
|
|
190
|
+
if (this._allocatedContextSize == null)
|
|
191
|
+
this._allocatedContextSize = this._ctx.getContextSize();
|
|
192
|
+
return this._allocatedContextSize;
|
|
193
|
+
}
|
|
194
|
+
get totalSequences() {
|
|
195
|
+
return this._totalSequences;
|
|
196
|
+
}
|
|
197
|
+
get sequencesLeft() {
|
|
198
|
+
return this._totalSequences - this._nextGeneratedSequenceId + this._unusedSequenceIds.length;
|
|
199
|
+
}
|
|
200
|
+
/**
|
|
201
|
+
* Before calling this method, make sure to call `sequencesLeft` to check if there are any sequences left.
|
|
202
|
+
* When there are no sequences left, this method will throw an error.
|
|
203
|
+
*/
|
|
204
|
+
getSequence(options = {}) {
|
|
205
|
+
const { contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor, checkpoints, _tokenMeter } = options;
|
|
206
|
+
this._ensureNotDisposed();
|
|
207
|
+
const nextSequenceId = this._popSequenceId();
|
|
208
|
+
if (nextSequenceId == null)
|
|
209
|
+
throw new Error("No sequences left");
|
|
210
|
+
return LlamaContextSequence._create({
|
|
211
|
+
sequenceId: nextSequenceId,
|
|
212
|
+
context: this,
|
|
213
|
+
tokenMeter: _tokenMeter,
|
|
214
|
+
contextShift: {
|
|
215
|
+
size: contextShiftSize,
|
|
216
|
+
strategy: contextShiftStrategy
|
|
217
|
+
},
|
|
218
|
+
tokenPredictor,
|
|
219
|
+
checkpoints
|
|
220
|
+
});
|
|
221
|
+
}
|
|
222
|
+
/**
 * Dispatch all queued decodes as one or more native batches.
 *
 * Runs the whole dispatch under the `[this, "context"]` lock: prioritizes the
 * queued decodes, packs as many tokens as fit into a batch of `this._batchSize`,
 * evaluates the batch on the native context, maps the resulting logits through
 * each decode's `logitDataMapper`, and resolves/rejects the pending promises
 * stored on each queued decode's `response` tuple. Loops until the queue is empty.
 */
dispatchPendingBatch() {
    // invalidate any previously scheduled dispatch (see `_scheduleDecode`)
    this._currentDispatchBatchHandle = {};
    this._dispatchDecodeScheduled = false;
    if (this._batchDispatchPending)
        return;
    this._batchDispatchPending = true;
    // intentionally not awaited; errors are routed to each decode's `reject`
    void withLock([this, "context"], async () => {
        this._currentDispatchBatchHandle = {};
        this._dispatchDecodeScheduled = false;
        this._batchDispatchPending = false;
        let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
        // accumulates mapped logit values per queued decode across batch iterations
        const queuedDecodeToMappedLogits = new Map();
        // resolve the configured prioritization strategy; on failure, reject and dequeue everything
        const resolvePrioritizationStrategy = () => {
            try {
                this._ensureNotDisposed();
                return resolveBatchItemsPrioritizationStrategy(this._batchingOptions.itemPrioritizationStrategy);
            }
            catch (err) {
                this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
            }
            return null;
        };
        // run the (possibly user-provided) strategy and map its output back to queued decodes
        const getOrderedQueuedDecodes = (prioritizationStrategy) => {
            const batchItemToQueuedDecodeMap = new Map();
            const batchItemsList = [];
            for (const queuedDecode of this._queuedDecodes) {
                const batchItem = {
                    tokens: queuedDecode.tokens,
                    logits: queuedDecode.logits,
                    evaluationPriority: queuedDecode.evaluationPriority
                };
                batchItemToQueuedDecodeMap.set(batchItem, queuedDecode);
                batchItemsList.push(batchItem);
            }
            let prioritizedItems;
            try {
                prioritizedItems = prioritizationStrategy({
                    items: batchItemsList,
                    size: this._batchSize
                });
            }
            catch (err) {
                this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
                return null;
            }
            return prioritizedItems.map((prioritizedItem) => {
                // lookup relies on identity of the batch item object handed to the strategy
                const queuedDecode = batchItemToQueuedDecodeMap.get(prioritizedItem.item);
                if (queuedDecode == null)
                    throw new Error("Received invalid batch item. Make sure you keep the original object reference " +
                        "of the batch item on `item` on `PrioritizedBatchItem` in your custom prioritization strategy");
                return {
                    queuedDecode,
                    processAmount: prioritizedItem.processAmount
                };
            });
        };
        // pack prioritized decodes into a single batch, capping total tokens at `batchSize`
        const fitQueuedDecodesToABatch = (queuedDecodes, batchSize) => {
            const currentBatchItems = [];
            let currentBatchSize = 0;
            let batchTokenSlotsLeft = batchSize;
            for (const { queuedDecode, processAmount } of queuedDecodes) {
                // non-positive process amounts still get at least 1 token, if slots remain
                const resolvedProcessAmount = Math.min(processAmount <= 0 ? 1 : processAmount, queuedDecode.tokens.length, batchTokenSlotsLeft);
                if (resolvedProcessAmount <= 0) {
                    if (batchTokenSlotsLeft === 0)
                        break;
                    continue;
                }
                batchTokenSlotsLeft -= resolvedProcessAmount;
                currentBatchSize += resolvedProcessAmount;
                currentBatchItems.push({
                    queuedDecode,
                    processAmount: resolvedProcessAmount
                });
            }
            return {
                currentBatchItems,
                currentBatchSize
            };
        };
        // feed the packed items into the native batch, decode, and map the output logits
        const decodeTokenBatchItems = async (batchItems, currentBatchSize) => {
            const afterDecodeActions = [];
            const queuedDecodesToDelete = new Set();
            const currentQueuedDecodeItems = new Set();
            if (currentBatchSize !== 0)
                this._ctx.initBatch(currentBatchSize);
            for (const { queuedDecode, processAmount } of batchItems) {
                let batchLogitIndexes;
                const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
                // positions within this slice whose logits were requested
                const tokenIndexesWithLogitsToProcess = queuedDecode.logits.slice(0, processAmount)
                    .map((logit, index) => (logit ? index : undefined))
                    .filter((index) => index != undefined);
                const numberOfOutputTokens = tokenIndexesWithLogitsToProcess.length;
                // account tokens: ones with requested logits count as "output", the rest as "input"
                TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
                TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
                try {
                    batchLogitIndexes = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), Uint32Array.from(tokenIndexesWithLogitsToProcess));
                }
                catch (err) {
                    // only this decode failed; reject it and keep processing the others
                    this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
                    continue;
                }
                currentQueuedDecodeItems.add(queuedDecode);
                if (queuedDecode.tokens.length === processAmount) {
                    // the whole decode fits in this batch - it finishes after this decode pass
                    queuedDecodesToDelete.add(queuedDecode);
                    afterDecodeActions.push({
                        queuedDecode,
                        batchLogitIndexes,
                        batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
                        firstTokenIndex: queuedDecode.firstTokenSequenceIndex,
                        sequenceStateLength: queuedDecode.firstTokenSequenceIndex + processAmount + 1,
                        returnResults: true
                    });
                }
                else {
                    // partial: keep the remainder queued for the next loop iteration
                    if (batchLogitIndexes.length > 0 || queuedDecode.afterBatchAction != null)
                        afterDecodeActions.push({
                            queuedDecode,
                            batchLogitIndexes,
                            batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
                            firstTokenIndex: queuedDecode.firstTokenSequenceIndex,
                            sequenceStateLength: queuedDecode.firstTokenSequenceIndex + processAmount + 1
                        });
                    queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
                    queuedDecode.logits = queuedDecode.logits.slice(processAmount);
                    queuedDecode.firstTokenSequenceIndex += processAmount;
                }
            }
            // dequeue the decodes that fully fit in this batch
            for (let i = 0; i < this._queuedDecodes.length; i++) {
                const queuedDecode = this._queuedDecodes[i];
                if (queuedDecodesToDelete.has(queuedDecode)) {
                    this._queuedDecodes.splice(i, 1);
                    this._queuedDecodeSequenceIds.delete(queuedDecode.sequenceId);
                    i--;
                }
            }
            if (currentBatchSize !== 0) {
                // the thread-splitter allocation may be sync or async
                const allocationResult = this._threadSplitterConsumer?.getAllocationToConsume();
                const [threadsToUse, consumerHandle] = allocationResult instanceof Promise
                    ? await allocationResult ?? []
                    : allocationResult ?? [];
                try {
                    if (threadsToUse != null)
                        this._ctx.setThreads(threadsToUse);
                    await this._ctx.decodeBatch();
                    consumerHandle?.dispose();
                }
                catch (err) {
                    consumerHandle?.dispose();
                    // a failed decode poisons every item that was in this batch
                    this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
                    return;
                }
            }
            // stash the mapped logits; when the action is final, resolve the decode's promise
            function finishAfterDecodeAction(action, mappedLogitValues) {
                if (mappedLogitValues != null && mappedLogitValues.length > 0) {
                    if (queuedDecodeToMappedLogits.has(action.queuedDecode))
                        pushAll(queuedDecodeToMappedLogits.get(action.queuedDecode), mappedLogitValues);
                    else
                        queuedDecodeToMappedLogits.set(action.queuedDecode, mappedLogitValues);
                }
                if (action.returnResults != null) {
                    const [accept] = action.queuedDecode.response;
                    const mappedLogits = queuedDecodeToMappedLogits.get(action.queuedDecode) ?? [];
                    queuedDecodeToMappedLogits.delete(action.queuedDecode);
                    accept(mappedLogits);
                }
            }
            const afterDecodeActionResults = afterDecodeActions.map((action) => {
                if (action.batchLogitIndexes.length === 0) {
                    finishAfterDecodeAction(action);
                    return undefined;
                }
                const mappedLogitValues = [];
                // async mappers are chained so they run one at a time, in order
                let promiseChain = undefined;
                const batchLogitIndexes = action.batchLogitIndexes;
                const batchLogitTokenIndexes = action.batchLogitTokenIndexes;
                for (let i = 0; i < batchLogitIndexes.length; i++) {
                    const tokenIndex = batchLogitTokenIndexes[i];
                    const mappedValue = promiseChain != null
                        ? promiseChain
                            .then(() => action.queuedDecode.logitDataMapper(batchLogitIndexes[i], tokenIndex + action.firstTokenIndex))
                        : action.queuedDecode.logitDataMapper(batchLogitIndexes[i], tokenIndex + action.firstTokenIndex);
                    if (mappedValue instanceof Promise) {
                        promiseChain = mappedValue;
                        mappedLogitValues.push(mappedValue
                            .then((value) => [tokenIndex + action.firstTokenIndex, value]));
                    }
                    else
                        // each entry pairs the absolute token index with its mapped logit value
                        mappedLogitValues.push([tokenIndex + action.firstTokenIndex, mappedValue]);
                }
                if (promiseChain != null)
                    return Promise.all(mappedLogitValues)
                        .then((resolvedMappedLogitValues) => finishAfterDecodeAction(action, resolvedMappedLogitValues));
                finishAfterDecodeAction(action, mappedLogitValues);
                return undefined;
            });
            await Promise.all(afterDecodeActionResults);
            // e.g. interval checkpoints taken by sequences after each batch
            for (const action of afterDecodeActions) {
                const resPromise = action.queuedDecode.afterBatchAction?.(action.sequenceStateLength);
                if (resPromise instanceof Promise)
                    await resPromise;
            }
        };
        const prioritizationStrategy = resolvePrioritizationStrategy();
        if (prioritizationStrategy == null)
            return; // all queued items are rejected and dequeued when we get here
        this._reserveThreads();
        try {
            while (shouldHaveAnotherLoop) {
                const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
                if (orderedQueuedDecodes == null)
                    return; // all queued items are rejected and dequeued when we get here
                const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
                let preventDisposalHandle;
                try {
                    // keep the backend context alive for the duration of the decode
                    preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
                }
                catch (err) {
                    this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
                    return;
                }
                let decodeLock;
                // this is a workaround to prevent Vulkan from crashing the process when decoding on multiple contexts in parallel
                if (this._llama.gpu === "vulkan")
                    decodeLock = await acquireLock([decodeSyncWorkaround.vulkanLock, "decode"]);
                try {
                    await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
                    // new decodes may have been queued while the batch was decoding
                    shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
                }
                finally {
                    decodeLock?.dispose();
                    preventDisposalHandle.dispose();
                }
            }
        }
        finally {
            this._scheduleToFreeReservedThreads();
        }
    });
}
|
|
461
|
+
/**
|
|
462
|
+
* Print the timings of token evaluation since that last print for this context.
|
|
463
|
+
*
|
|
464
|
+
* Requires the `performanceTracking` option to be enabled.
|
|
465
|
+
*
|
|
466
|
+
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
|
|
467
|
+
* it won't print anything.
|
|
468
|
+
*/
|
|
469
|
+
async printTimings() {
|
|
470
|
+
this._ensureNotDisposed();
|
|
471
|
+
if (!this._performanceTracking)
|
|
472
|
+
throw new UnsupportedError("Performance tracking is not enabled");
|
|
473
|
+
this._ctx.printTimings();
|
|
474
|
+
await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
|
|
475
|
+
}
|
|
476
|
+
/** @internal */
|
|
477
|
+
async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, logits, evaluationPriority = defaultEvaluationPriority, tokenMeter, afterBatchAction }, logitDataMapper) {
|
|
478
|
+
return await new Promise((accept, reject) => {
|
|
479
|
+
this._queuedDecodes.push({
|
|
480
|
+
sequenceId,
|
|
481
|
+
tokens,
|
|
482
|
+
logits,
|
|
483
|
+
firstTokenSequenceIndex,
|
|
484
|
+
evaluationPriority,
|
|
485
|
+
tokenMeter,
|
|
486
|
+
response: [accept, reject],
|
|
487
|
+
logitDataMapper,
|
|
488
|
+
afterBatchAction
|
|
489
|
+
});
|
|
490
|
+
this._queuedDecodeSequenceIds.add(sequenceId);
|
|
491
|
+
this._scheduleDecode();
|
|
492
|
+
});
|
|
493
|
+
}
|
|
494
|
+
/** @internal */
|
|
495
|
+
_reclaimUnusedSequenceId(sequenceId) {
|
|
496
|
+
if (this._disposed)
|
|
497
|
+
return;
|
|
498
|
+
void withLock([this, "context"], async () => {
|
|
499
|
+
if (this._disposed)
|
|
500
|
+
return;
|
|
501
|
+
this._ctx.disposeSequence(sequenceId);
|
|
502
|
+
this._unusedSequenceIds.push(sequenceId);
|
|
503
|
+
this._onReclaimUnusedSequenceId.dispatchEvent();
|
|
504
|
+
});
|
|
505
|
+
}
|
|
506
|
+
/** @internal */
|
|
507
|
+
_popSequenceId() {
|
|
508
|
+
if (this._unusedSequenceIds.length > 0)
|
|
509
|
+
return this._unusedSequenceIds.shift();
|
|
510
|
+
if (this._nextGeneratedSequenceId < this._totalSequences) {
|
|
511
|
+
const sequenceId = this._nextGeneratedSequenceId;
|
|
512
|
+
this._nextGeneratedSequenceId++;
|
|
513
|
+
return sequenceId;
|
|
514
|
+
}
|
|
515
|
+
return null;
|
|
516
|
+
}
|
|
517
|
+
/** @internal */
|
|
518
|
+
_scheduleDecode() {
|
|
519
|
+
if (this._dispatchDecodeScheduled || this._batchDispatchPending)
|
|
520
|
+
return;
|
|
521
|
+
this._dispatchDecodeScheduled = true;
|
|
522
|
+
const currentPendingBatchHandle = this._currentDispatchBatchHandle;
|
|
523
|
+
const dispatch = () => {
|
|
524
|
+
if (this._currentDispatchBatchHandle !== currentPendingBatchHandle)
|
|
525
|
+
return;
|
|
526
|
+
this.dispatchPendingBatch();
|
|
527
|
+
};
|
|
528
|
+
const dispatchSchedule = this._batchingOptions.dispatchSchedule;
|
|
529
|
+
if (this._queuedDecodeSequenceIds.size === this._totalSequences)
|
|
530
|
+
dispatch();
|
|
531
|
+
if (dispatchSchedule === "nextCycle") {
|
|
532
|
+
if (typeof setImmediate === "function")
|
|
533
|
+
setImmediate(dispatch);
|
|
534
|
+
else
|
|
535
|
+
setTimeout(dispatch, 0);
|
|
536
|
+
}
|
|
537
|
+
else if (typeof dispatchSchedule === "function")
|
|
538
|
+
dispatchSchedule(dispatch);
|
|
539
|
+
else {
|
|
540
|
+
if (typeof setImmediate === "function")
|
|
541
|
+
setImmediate(dispatch);
|
|
542
|
+
else
|
|
543
|
+
setTimeout(dispatch, 0);
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
/** @internal */
|
|
547
|
+
_dispatchErrorForQueuedDecodesAndDequeue(queuedDecodes, err) {
|
|
548
|
+
for (const pendingDecode of queuedDecodes) {
|
|
549
|
+
const [, reject] = pendingDecode.response;
|
|
550
|
+
reject(err);
|
|
551
|
+
}
|
|
552
|
+
for (let i = 0; i < this._queuedDecodes.length; i++) {
|
|
553
|
+
const item = this._queuedDecodes[i];
|
|
554
|
+
if (queuedDecodes.has(item)) {
|
|
555
|
+
this._queuedDecodes.splice(i, 1);
|
|
556
|
+
this._queuedDecodeSequenceIds.delete(item.sequenceId);
|
|
557
|
+
i--;
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
/** @internal */
|
|
562
|
+
_ensureNotDisposed() {
|
|
563
|
+
if (this._disposed)
|
|
564
|
+
throw new DisposedError();
|
|
565
|
+
}
|
|
566
|
+
/** @internal */
|
|
567
|
+
async _setLoras(loras) {
|
|
568
|
+
const addonLoras = [];
|
|
569
|
+
const addonScales = [];
|
|
570
|
+
for (const { filePath, scale } of loras) {
|
|
571
|
+
const lora = await this._model._getOrLoadLora(filePath);
|
|
572
|
+
addonLoras.push(lora);
|
|
573
|
+
addonScales.push(scale ?? defaultLoraScale);
|
|
574
|
+
}
|
|
575
|
+
this._ctx.setLoras(addonLoras, addonScales);
|
|
576
|
+
for (const addonLora of addonLoras) {
|
|
577
|
+
if (!this._loraAdapters.has(addonLora)) {
|
|
578
|
+
this._loraAdapters.add(addonLora);
|
|
579
|
+
addonLora.usages++;
|
|
580
|
+
}
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
/** @internal */
|
|
584
|
+
_reserveThreads() {
|
|
585
|
+
clearTimeout(this._freeReservedThreadsTimeout);
|
|
586
|
+
delete this._freeReservedThreadsTimeout;
|
|
587
|
+
if (this._threadSplitterConsumer != null)
|
|
588
|
+
return;
|
|
589
|
+
this._threadSplitterConsumer = this._llama._threadsSplitter.createConsumer(this._idealThreads, this._minThreads);
|
|
590
|
+
}
|
|
591
|
+
/** @internal */
|
|
592
|
+
_freeReservedThreads() {
|
|
593
|
+
clearTimeout(this._freeReservedThreadsTimeout);
|
|
594
|
+
delete this._freeReservedThreadsTimeout;
|
|
595
|
+
if (this._threadSplitterConsumer == null)
|
|
596
|
+
return;
|
|
597
|
+
this._threadSplitterConsumer.dispose();
|
|
598
|
+
delete this._threadSplitterConsumer;
|
|
599
|
+
}
|
|
600
|
+
/** @internal */
|
|
601
|
+
_scheduleToFreeReservedThreads() {
|
|
602
|
+
if (this._threadSplitterConsumer == null)
|
|
603
|
+
return;
|
|
604
|
+
clearTimeout(this._freeReservedThreadsTimeout);
|
|
605
|
+
this._freeReservedThreadsTimeout = setTimeout(this._freeReservedThreads, 0);
|
|
606
|
+
}
|
|
607
|
+
/**
 * Create a `LlamaContext` for the given model, resolving defaults and retrying
 * with a smaller context size when creation fails (per `failedCreationRemedy`).
 * @internal
 */
static async _create(options, { _model }) {
    const kvUnified = false;
    const sequences = Math.max(1, Math.floor(options.sequences ?? getDefaultContextSequences()));
    const flashAttention = _model.flashAttentionSupported
        ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention)
        : false;
    // "currentQuant" means: reuse the model file's dominant tensor type for the KV cache
    const kvCacheKeyType = options.experimentalKvCacheKeyType === "currentQuant"
        ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheKeyType
        : resolveGgmlTypeOption(options.experimentalKvCacheKeyType) ?? _model.defaultContextKvCacheKeyType;
    const kvCacheValueType = options.experimentalKvCacheValueType === "currentQuant"
        ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheValueType
        : resolveGgmlTypeOption(options.experimentalKvCacheValueType) ?? _model.defaultContextKvCacheValueType;
    const swaFullCache = options.swaFullCache ?? _model.defaultContextSwaFullCache;
    // a bare string is shorthand for a single adapter at default scale
    const loraOptions = typeof options.lora === "string"
        ? { adapters: [{ filePath: options.lora }] }
        : options.lora;
    let failedCreationRetries = options.failedCreationRemedy === false
        ? 0
        : Math.max(0, options.failedCreationRemedy?.retries ?? defaultFailedCreationRemedy.retries);
    const failedCreationAutoContextSizeShrink = options.failedCreationRemedy === false
        ? 0
        : options.failedCreationRemedy?.autoContextSizeShrink ?? defaultFailedCreationRemedy.autoContextSizeShrink;
    let contextSize = await _model.fileInsights.configurationResolver.resolveContextContextSize(options.contextSize, {
        batchSize: options.batchSize,
        sequences: sequences,
        modelGpuLayers: _model.gpuLayers,
        modelTrainContextSize: _model.trainContextSize,
        flashAttention,
        kvCacheKeyType,
        kvCacheValueType,
        swaFullCache,
        getVramState: () => _model._llama._vramOrchestrator.getMemoryState(),
        llamaGpu: _model._llama.gpu,
        ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks,
        isEmbeddingContext: options._embeddings
    });
    // lower bound for the shrink-and-retry loop below
    const minContextSize = options.contextSize === "auto"
        ? shrinkRetriesMinContextSize
        : (typeof options.contextSize === "object" && typeof options.contextSize.min === "number")
            ? options.contextSize.min
            : typeof options.contextSize === "number"
                ? options.contextSize
                : shrinkRetriesMinContextSize;
    const { createSignal } = options;
    const paddedContextSize = kvUnified
        ? Math.floor(padSafeContextSize(Math.max(2, contextSize) * sequences, "up") / sequences)
        : padSafeContextSize(Math.max(2, contextSize), "up");
    // only adopt the padded size when it still fits under the model's train context size
    if (contextSize <= _model.trainContextSize && paddedContextSize < _model.trainContextSize)
        contextSize = paddedContextSize;
    // one creation attempt at a specific context size; throws on failure or abort
    async function createContext(contextSize) {
        const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
        const resourceRequirementsEstimation = _model.fileInsights.estimateContextResourceRequirements({
            contextSize,
            sequences,
            isEmbeddingContext: options._embeddings,
            modelGpuLayers: _model.gpuLayers,
            batchSize,
            flashAttention,
            kvCacheKeyType,
            kvCacheValueType,
            swaFullCache
        });
        const context = new LlamaContext({ _model }, {
            ...options,
            contextSize,
            batchSize,
            sequences,
            flashAttention,
            experimentalKvCacheKeyType: kvCacheKeyType,
            experimentalKvCacheValueType: kvCacheValueType,
            swaFullCache
        });
        // reserve the estimated memory up-front so parallel creations don't oversubscribe
        const contextCreationVramReservation = options.ignoreMemorySafetyChecks
            ? null
            : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram);
        // NOTE(review): the CPU RAM estimate is also reserved on `_vramOrchestrator` - verify this is intentional
        const contextCreationRamReservation = options.ignoreMemorySafetyChecks
            ? null
            : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.cpuRam);
        try {
            if (createSignal?.aborted)
                throw createSignal.reason;
            const contextLoaded = await context._ctx.init();
            if (createSignal?.aborted) {
                if (contextLoaded)
                    await context._ctx.dispose();
                throw createSignal.reason;
            }
            else if (!contextLoaded)
                throw new Error("Failed to create context");
            // release the reservations early once the context holds real memory;
            // the `finally` below disposes again - assumed to be idempotent
            contextCreationVramReservation?.dispose?.();
            contextCreationRamReservation?.dispose?.();
            if (loraOptions != null && loraOptions.adapters.length > 0) {
                try {
                    await context._setLoras(loraOptions.adapters);
                    try {
                        // a throwing user progress callback must not fail the creation
                        loraOptions.onLoadProgress?.(1);
                    }
                    catch (err) {
                        console.error(err);
                    }
                }
                catch (err) {
                    await context.dispose();
                    throw err;
                }
                if (createSignal?.aborted) {
                    await context.dispose();
                    throw createSignal.reason;
                }
            }
            else if (loraOptions?.onLoadProgress != null) {
                // no adapters to load - report completion immediately
                try {
                    loraOptions.onLoadProgress(1);
                }
                catch (err) {
                    console.error(err);
                }
            }
            return context;
        }
        finally {
            contextCreationVramReservation?.dispose?.();
            contextCreationRamReservation?.dispose?.();
        }
    }
    // shrink-and-retry loop: each failed attempt shrinks the context size until
    // it cannot shrink further (or the abort signal fired)
    while (failedCreationRetries >= 0) {
        try {
            return await createContext(contextSize);
        }
        catch (err) {
            if (failedCreationRetries === 0 || (createSignal?.aborted && err === createSignal.reason))
                throw err;
            failedCreationRetries--;
            let newContextSize = typeof failedCreationAutoContextSizeShrink === "number"
                ? Math.floor(contextSize * (1 - failedCreationAutoContextSizeShrink))
                : Math.floor(failedCreationAutoContextSizeShrink(contextSize));
            if (!Number.isFinite(newContextSize))
                throw err;
            if (newContextSize < minContextSize)
                newContextSize = minContextSize;
            // no progress possible - give up with the original error
            if (newContextSize >= contextSize)
                throw err;
            contextSize = newContextSize;
        }
    }
    throw new Error("Failed to create context");
}
|
|
755
|
+
}
|
|
756
|
+
/**
 * A single evaluation sequence within a `LlamaContext`.
 * Tracks its own context-state tokens, token-prediction statistics and checkpoints.
 */
export class LlamaContextSequence {
    /** @internal */ _sequenceId;
    /** @internal */ _gcRegistry;
    /** @internal */ _context;
    /** @internal */ _contextShift;
    /** @internal */ _tokenPredictor;
    /** @internal */ _checkpoints = new LlamaContextSequenceCheckpoints();
    /** @internal */ _checkpointOptions;
    /** @internal */ _tokenMeter;
    /** @internal */ _disposeAggregator = new DisposeAggregator();
    /** @internal */ _lock = {};
    /** @internal */ _resetTokenPredictor = false;
    /** @internal */ _tokenPredictorOwner = {};
    // tokens currently held in the sequence state (including loaded predictions at the tail)
    /** @internal */ _contextTokens = [];
    /** @internal */ _nextTokenIndex = 0;
    /** @internal */ _loadedTokenPredictions = [];
    // token-prediction counters surfaced via the `tokenPredictions` getter
    /** @internal */ _usedTokenPredictions = 0;
    /** @internal */ _unusedTokenPredictions = 0;
    /** @internal */ _validatedTokenPredictions = 0;
    /** @internal */ _refutedTokenPredictions = 0;
    /** @internal */ _disposed = false;
    onDispose = new EventRelay();
|
|
778
|
+
constructor({ sequenceId, context, tokenMeter, contextShift, tokenPredictor, checkpoints }) {
    this._sequenceId = sequenceId;
    this._context = context;
    this._tokenMeter = tokenMeter ?? new TokenMeter();
    this._contextShift = contextShift;
    this._tokenPredictor = tokenPredictor;
    this._checkpointOptions = {
        max: checkpoints?.max ?? defaultCheckpointOptions.max,
        interval: checkpoints?.interval ?? defaultCheckpointOptions.interval,
        maxMemory: checkpoints?.maxMemory ?? defaultCheckpointOptions.maxMemory
    };
    // reclaim the sequence id if this object is garbage-collected without being disposed.
    // NOTE(review): `_reclaimUnusedSequenceId` is passed unbound - verify it is bound to the
    // context instance elsewhere (e.g. in LlamaContext's constructor)
    this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
    this._gcRegistry.register(this, sequenceId);
    // on dispose: unregister from GC tracking, notify listeners, drop checkpoints, return the id
    this._disposeAggregator.add(() => this._gcRegistry.unregister(this));
    this._disposeAggregator.add(this.onDispose.dispatchEvent);
    // dispose this sequence when the model is disposed (held via WeakRef to avoid leaking it)
    this._disposeAggregator.add(this.model.onDispose.createListener(disposeContextSequenceIfReferenced.bind(null, new WeakRef(this))));
    this._disposeAggregator.add(() => {
        this._checkpoints.clearAllCheckpoints();
        this._context._reclaimUnusedSequenceId(this._sequenceId);
    });
    if (this._tokenPredictor != null)
        this._disposeAggregator.add(this._tokenPredictor);
    // bound once here because it is passed as an `afterBatchAction` callback
    this._takeIntervalCheckpointIfNeededAfterBatch = this._takeIntervalCheckpointIfNeededAfterBatch.bind(this);
}
|
|
802
|
+
dispose() {
|
|
803
|
+
if (this._disposed)
|
|
804
|
+
return;
|
|
805
|
+
this._disposeAggregator.dispose();
|
|
806
|
+
this._contextTokens.length = 0;
|
|
807
|
+
this._disposed = true;
|
|
808
|
+
}
|
|
809
|
+
/** @hidden */
|
|
810
|
+
[Symbol.dispose]() {
|
|
811
|
+
return this.dispose();
|
|
812
|
+
}
|
|
813
|
+
get disposed() {
|
|
814
|
+
return this._disposed;
|
|
815
|
+
}
|
|
816
|
+
get context() {
|
|
817
|
+
return this._context;
|
|
818
|
+
}
|
|
819
|
+
get model() {
|
|
820
|
+
return this._context.model;
|
|
821
|
+
}
|
|
822
|
+
/** The maximum number of tokens that the sequence state can hold */
|
|
823
|
+
get contextSize() {
|
|
824
|
+
return this._context.contextSize;
|
|
825
|
+
}
|
|
826
|
+
/** The index where the next evaluated token will be placed in the context */
|
|
827
|
+
get nextTokenIndex() {
|
|
828
|
+
return this._nextTokenIndex - this._loadedTokenPredictions.length;
|
|
829
|
+
}
|
|
830
|
+
/** The current context state tokens */
|
|
831
|
+
get contextTokens() {
|
|
832
|
+
if (this._loadedTokenPredictions.length === 0)
|
|
833
|
+
return this._contextTokens.slice();
|
|
834
|
+
return this._contextTokens.slice(0, -this._loadedTokenPredictions.length);
|
|
835
|
+
}
|
|
836
|
+
get tokenMeter() {
|
|
837
|
+
return this._tokenMeter;
|
|
838
|
+
}
|
|
839
|
+
/**
|
|
840
|
+
* The token predictor used when creating this sequence.
|
|
841
|
+
*/
|
|
842
|
+
get tokenPredictor() {
|
|
843
|
+
return this._tokenPredictor;
|
|
844
|
+
}
|
|
845
|
+
/**
|
|
846
|
+
* Get the index of the first token in the KV cache.
|
|
847
|
+
*
|
|
848
|
+
* If you remove any tokens from the state that come before this index,
|
|
849
|
+
* no cached prefix tokens evaluation state will be used for the next evaluation.
|
|
850
|
+
*
|
|
851
|
+
* For example, if `stateCellsStartIndex` is `10` and you remove the range `{start: 11, end: 16}`
|
|
852
|
+
* then the cached state for range `0-10` will be used in the next evaluation,
|
|
853
|
+
* but if you remove the range `{start: 10, end: 16}` (or `{start: 9, end: 16}`) then the cached state will not be used at all
|
|
854
|
+
* and will be re-evaluated in the next evaluation.
|
|
855
|
+
*
|
|
856
|
+
* This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models).
|
|
857
|
+
*
|
|
858
|
+
* When SWA is used, this index will usually be `Math.max(-1, .nextTokenIndex - .model.fileInsights.swaSize)` or larger.
|
|
859
|
+
*
|
|
860
|
+
* When the KV cache is empty, this index will be `-1`.
|
|
861
|
+
*
|
|
862
|
+
* You can disable SWA by setting the `swaFullCache` option to `true` when creating a context.
|
|
863
|
+
*/
|
|
864
|
+
get stateCellsStartIndex() {
|
|
865
|
+
this._ensureNotDisposed();
|
|
866
|
+
return this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId);
|
|
867
|
+
}
|
|
868
|
+
/**
|
|
869
|
+
* Statistics of token predictions using the sequence's `tokenPredictor`.
|
|
870
|
+
*
|
|
871
|
+
* The statistics change only when token prediction is used in this sequence.
|
|
872
|
+
*
|
|
873
|
+
* `validated` + `refuted` = total number of evaluated predictions.
|
|
874
|
+
*
|
|
875
|
+
* Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
|
|
876
|
+
*/
|
|
877
|
+
get tokenPredictions() {
|
|
878
|
+
return {
|
|
879
|
+
used: this._usedTokenPredictions,
|
|
880
|
+
unused: this._unusedTokenPredictions,
|
|
881
|
+
validated: this._validatedTokenPredictions,
|
|
882
|
+
refuted: this._refutedTokenPredictions
|
|
883
|
+
};
|
|
884
|
+
}
|
|
885
|
+
get isLoadedToMemory() {
|
|
886
|
+
return !this._disposed;
|
|
887
|
+
}
|
|
888
|
+
compareContextTokens(tokens) {
|
|
889
|
+
for (let i = 0; i < this._contextTokens.length - this._loadedTokenPredictions.length; i++) {
|
|
890
|
+
if (compareTokens(this._contextTokens[i], tokens[i]))
|
|
891
|
+
continue;
|
|
892
|
+
return {
|
|
893
|
+
firstDifferentIndex: i
|
|
894
|
+
};
|
|
895
|
+
}
|
|
896
|
+
return {
|
|
897
|
+
firstDifferentIndex: this._contextTokens.length - this._loadedTokenPredictions.length
|
|
898
|
+
};
|
|
899
|
+
}
|
|
900
|
+
/**
 * Erase parts of the context state to align it with the given tokens.
 *
 * If the given tokens do not align with the current context state, the context state will be erased to align with the given tokens.
 *
 * To find the first different token index between the context state and the given tokens, access the `nextTokenIndex` property.
 *
 * If `allowShift` is `true` (the default), shifting tokens may happen to align the context state with the given tokens,
 * which incurs token evaluation of the shifted tokens.
 */
async adaptStateToTokens(tokens, allowShift = true) {
    // recurrent models and deepseek2 cannot shift tokens within the state
    const modelSupportsShifting = !this.model.fileInsights.isRecurrent &&
        this.model.fileInfo.metadata?.general?.architecture !== GgufArchitectureType.deepseek2;
    if (!modelSupportsShifting || !allowShift) {
        // without shifting, everything from the first mismatch onward must be erased
        const { firstDifferentIndex } = this.compareContextTokens(tokens);
        if (firstDifferentIndex < this.nextTokenIndex)
            await this._eraseContextTokenRanges([{
                    start: firstDifferentIndex,
                    end: this._nextTokenIndex
                }]);
        return;
    }
    // with shifting allowed: walk both sequences and collect only the mismatching
    // ranges of the state, keeping matching tokens (even after a mismatch) in place
    const eraseRanges = [];
    let tokensIndex = 0;
    // start of the current run of state tokens that don't match `tokens`
    let differentTokenIndex = undefined;
    for (let i = 0; i < this._contextTokens.length - this._loadedTokenPredictions.length && tokensIndex < tokens.length; i++) {
        if (compareTokens(this._contextTokens[i], tokens[tokensIndex])) {
            if (differentTokenIndex != null) {
                // a match ends the current mismatching run
                eraseRanges.push({
                    start: differentTokenIndex,
                    end: i
                });
                differentTokenIndex = undefined;
            }
            tokensIndex++;
            continue;
        }
        if (differentTokenIndex == null)
            differentTokenIndex = i;
    }
    // an unterminated mismatching run extends to the end of the state
    if (differentTokenIndex != null)
        eraseRanges.push({
            start: differentTokenIndex,
            end: this._nextTokenIndex
        });
    if (eraseRanges.length > 0)
        await this._eraseContextTokenRanges(eraseRanges);
}
|
|
948
|
+
/**
|
|
949
|
+
* Clear the history of the sequence.
|
|
950
|
+
*/
|
|
951
|
+
async clearHistory() {
|
|
952
|
+
this._ensureNotDisposed();
|
|
953
|
+
await this._eraseContextTokenRanges([{ start: 0, end: this._nextTokenIndex }]);
|
|
954
|
+
}
|
|
955
|
+
/**
|
|
956
|
+
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
|
|
957
|
+
* The start of each range is inclusive, and the end of each range is exclusive.
|
|
958
|
+
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
|
|
959
|
+
*/
|
|
960
|
+
eraseContextTokenRanges(ranges) {
|
|
961
|
+
return this._eraseContextTokenRanges(ranges);
|
|
962
|
+
}
|
|
963
|
+
/**
 * Erase the given token ranges from the context state.
 *
 * Ranges are normalized (swapped if reversed, clamped to the evaluated state, merged when overlapping) before deletion.
 * When the native KV-cache deletion fails (or would touch positions below the minimum cached KV position),
 * the sequence state is rebuilt - either from the last usable checkpoint or from scratch by re-evaluating the remaining tokens.
 * @internal
 */
async _eraseContextTokenRanges(ranges, { canResetTokenPredictor = true, canRemovePredictionTokens = true, skipLock = false } = {}) {
    this._ensureNotDisposed();
    let awaitEvaluationPromise;
    await withLock([this._context, "context"], async () => {
        this._ensureNotDisposed();
        if (ranges.length === 0)
            return;
        // if the deletion fails, we'll have to dispose the sequence and fill it up again
        let deletionSuccessful = true;
        // normalize: drop empty ranges, fix reversed bounds, clamp to the evaluated state,
        // then sort and merge overlapping/adjacent-overlapping ranges
        const resolvedRanges = ranges
            .map(({ start, end }) => {
                if (start === end)
                    return null;
                if (start > end)
                    [start, end] = [end, start];
                if (end > this._nextTokenIndex)
                    end = this._nextTokenIndex;
                if (start >= this._nextTokenIndex)
                    return null;
                return { start, end };
            })
            .filter((range) => range != null)
            .sort((a, b) => a.start - b.start)
            .reduce((ranges, range) => {
                if (ranges.length === 0)
                    return [range];
                const lastRange = ranges[ranges.length - 1];
                if (lastRange.end >= range.start) {
                    lastRange.end = Math.max(lastRange.end, range.end);
                    return ranges;
                }
                ranges.push(range);
                return ranges;
            }, []);
        const minKvCachePosition = (this._contextTokens.length === 0 && this._loadedTokenPredictions.length === 0)
            ? 0
            : Math.max(0, this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId));
        if (resolvedRanges[0] != null && resolvedRanges[0].start <= minKvCachePosition)
            // we have to drop the cache and reevaluate the sequence due to missing KV cache
            deletionSuccessful = false;
        // any deletion invalidates loaded token predictions, so they get removed too (when allowed)
        const tokenPredictionsToRemove = (resolvedRanges.length > 0 && canRemovePredictionTokens)
            ? this._loadedTokenPredictions.length
            : 0;
        if (tokenPredictionsToRemove > 0) {
            const startDeleteIndex = this._nextTokenIndex - this._loadedTokenPredictions.length;
            const lastDeleteRange = resolvedRanges[resolvedRanges.length - 1];
            // extend the last range to cover the prediction tokens, or add a dedicated range for them
            if (lastDeleteRange.end >= startDeleteIndex)
                lastDeleteRange.end = this._nextTokenIndex;
            else
                resolvedRanges.push({ start: startDeleteIndex, end: this._nextTokenIndex });
            if (canResetTokenPredictor)
                await this._abortTokenPredictor(true);
        }
        let removedTokens = 0;
        let lastDeleteRangeEndPos = null;
        for (const range of resolvedRanges) {
            // `removedTokens` compensates for the indexes shifting as earlier ranges get spliced out
            this._contextTokens.splice(range.start - removedTokens, range.end - range.start);
            if (deletionSuccessful)
                deletionSuccessful &&= this._context._ctx.removeTokenCellsFromSequence(this._sequenceId, range.start, range.end);
            // shift the kept cells between the previous range and this one back by the number of tokens removed so far
            if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== range.start) {
                this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, range.start, -removedTokens);
                const shiftedTokens = range.start - lastDeleteRangeEndPos;
                this._tokenMeter.useTokens(shiftedTokens, "input");
            }
            removedTokens += range.end - range.start;
            lastDeleteRangeEndPos = range.end;
        }
        if (tokenPredictionsToRemove > 0)
            this._loadedTokenPredictions.splice(0, tokenPredictionsToRemove);
        // shift the tail after the final deleted range
        if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 &&
            lastDeleteRangeEndPos !== this._nextTokenIndex) {
            this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, this._nextTokenIndex, -removedTokens);
            const shiftedTokens = this._nextTokenIndex - lastDeleteRangeEndPos;
            this._tokenMeter.useTokens(shiftedTokens, "input");
        }
        this._nextTokenIndex -= removedTokens;
        if (canResetTokenPredictor && removedTokens > 0)
            await this._abortTokenPredictor(true);
        this._checkpoints.pruneFromEndToIndex(this._contextTokens.length - 1);
        if (deletionSuccessful)
            return;
        // native deletion failed - try restoring from the last usable checkpoint and re-evaluating only the suffix
        let restoreCheckpointIndex = this._contextTokens.length - 1;
        const existingCheckpoint = this._checkpoints.getLastCheckpoint(restoreCheckpointIndex, this.contextSize);
        if (existingCheckpoint != null &&
            restoreCheckpointIndex >= existingCheckpoint.minPos &&
            existingCheckpoint.maxPos <= this.contextSize) {
            restoreCheckpointIndex = Math.min(restoreCheckpointIndex, existingCheckpoint.maxPos);
            const restoredSuccessfully = await this._context._ctx.restoreCheckpoint(existingCheckpoint, restoreCheckpointIndex);
            if (restoredSuccessfully) {
                const tokensToEvaluate = this._contextTokens.slice(restoreCheckpointIndex + 1);
                this._contextTokens = this._contextTokens.slice(0, restoreCheckpointIndex + 1);
                this._nextTokenIndex = restoreCheckpointIndex + 1;
                // wait for the evaluation outside the "context" lock to avoid deadlocks
                if (tokensToEvaluate.length > 0)
                    awaitEvaluationPromise = this.evaluateWithoutGeneratingNewTokens(tokensToEvaluate, { _skipLock: skipLock });
                return;
            }
        }
        // no usable checkpoint - dispose the sequence and re-evaluate all remaining tokens from scratch
        const newSequenceTokens = this._contextTokens.slice();
        this._nextTokenIndex = 0;
        this._context._ctx.disposeSequence(this._sequenceId);
        this._contextTokens = [];
        // wait for the evaluation outside the "context" lock to avoid deadlocks
        if (newSequenceTokens.length > 0)
            awaitEvaluationPromise = this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, { _skipLock: skipLock });
    });
    if (awaitEvaluationPromise != null) {
        await awaitEvaluationPromise;
        if (this.needsCheckpoints && this._checkpoints.lastCheckpointIndex !== this._nextTokenIndex - 1)
            await this.takeCheckpoint();
    }
}
|
|
1076
|
+
/**
|
|
1077
|
+
* Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
|
|
1078
|
+
*
|
|
1079
|
+
* This method uses the token predictor (when provided) to generate new tokens faster.
|
|
1080
|
+
*/
|
|
1081
|
+
async *evaluate(tokens, options = {}) {
|
|
1082
|
+
const iterator = this.evaluateWithMetadata(tokens, {}, options);
|
|
1083
|
+
let iterateInput = undefined;
|
|
1084
|
+
try {
|
|
1085
|
+
while (true) {
|
|
1086
|
+
const { value, done } = await iterator.next(iterateInput);
|
|
1087
|
+
if (done)
|
|
1088
|
+
return;
|
|
1089
|
+
iterateInput = yield value.token;
|
|
1090
|
+
}
|
|
1091
|
+
}
|
|
1092
|
+
finally {
|
|
1093
|
+
await iterator.return();
|
|
1094
|
+
}
|
|
1095
|
+
}
|
|
1096
|
+
/**
|
|
1097
|
+
* Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
|
|
1098
|
+
*
|
|
1099
|
+
* Configure the additional metadata options to choose which metadata to include.
|
|
1100
|
+
*/
|
|
1101
|
+
evaluateWithMetadata(tokens, metadata, options = {}) {
|
|
1102
|
+
const { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, xtc, grammarEvaluationState, repeatPenalty, dryRepeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
|
|
1103
|
+
if (this._tokenPredictor != null && !_noSampling && tokens.length > 0)
|
|
1104
|
+
return this._speculativeEvaluate(tokens, metadata, {
|
|
1105
|
+
temperature,
|
|
1106
|
+
minP,
|
|
1107
|
+
topK,
|
|
1108
|
+
topP,
|
|
1109
|
+
seed,
|
|
1110
|
+
xtc,
|
|
1111
|
+
grammarEvaluationState,
|
|
1112
|
+
repeatPenalty,
|
|
1113
|
+
dryRepeatPenalty,
|
|
1114
|
+
tokenBias,
|
|
1115
|
+
evaluationPriority,
|
|
1116
|
+
contextShiftOptions: {
|
|
1117
|
+
size: contextShiftSize,
|
|
1118
|
+
strategy: contextShiftStrategy
|
|
1119
|
+
},
|
|
1120
|
+
yieldEogToken,
|
|
1121
|
+
tokenPredictor: this._tokenPredictor
|
|
1122
|
+
});
|
|
1123
|
+
return this._evaluate(tokens, metadata, {
|
|
1124
|
+
temperature,
|
|
1125
|
+
minP,
|
|
1126
|
+
topK,
|
|
1127
|
+
topP,
|
|
1128
|
+
seed,
|
|
1129
|
+
xtc,
|
|
1130
|
+
grammarEvaluationState,
|
|
1131
|
+
repeatPenalty,
|
|
1132
|
+
dryRepeatPenalty,
|
|
1133
|
+
tokenBias,
|
|
1134
|
+
evaluationPriority,
|
|
1135
|
+
contextShiftOptions: {
|
|
1136
|
+
size: contextShiftSize,
|
|
1137
|
+
strategy: contextShiftStrategy
|
|
1138
|
+
},
|
|
1139
|
+
yieldEogToken,
|
|
1140
|
+
_noSampling
|
|
1141
|
+
});
|
|
1142
|
+
}
|
|
1143
|
+
/**
|
|
1144
|
+
* Evaluate the provided tokens into the context sequence without generating new tokens.
|
|
1145
|
+
*/
|
|
1146
|
+
async evaluateWithoutGeneratingNewTokens(tokens, options = {}) {
|
|
1147
|
+
const { evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, _skipLock = false } = options;
|
|
1148
|
+
const iterator = this._evaluate(tokens, {}, {
|
|
1149
|
+
generateNewTokens: false,
|
|
1150
|
+
evaluationPriority,
|
|
1151
|
+
contextShiftOptions: {
|
|
1152
|
+
size: contextShiftSize,
|
|
1153
|
+
strategy: contextShiftStrategy
|
|
1154
|
+
},
|
|
1155
|
+
_skipLock
|
|
1156
|
+
});
|
|
1157
|
+
const predictorAlignmentPromise = this.tokenPredictor == null
|
|
1158
|
+
? undefined
|
|
1159
|
+
: this._tokenPredictor?.reset({
|
|
1160
|
+
stateTokens: [...this._contextTokens, ...tokens],
|
|
1161
|
+
evaluateOptions: {
|
|
1162
|
+
evaluationPriority,
|
|
1163
|
+
contextShift: {
|
|
1164
|
+
size: contextShiftSize,
|
|
1165
|
+
strategy: contextShiftStrategy
|
|
1166
|
+
}
|
|
1167
|
+
},
|
|
1168
|
+
targetSequence: this
|
|
1169
|
+
});
|
|
1170
|
+
if (predictorAlignmentPromise != null) {
|
|
1171
|
+
this._tokenPredictorOwner = {};
|
|
1172
|
+
this._resetTokenPredictor = false;
|
|
1173
|
+
}
|
|
1174
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
1175
|
+
for await (const token of iterator) {
|
|
1176
|
+
// Array.from doesn't work with async generators, so we have to iterate over the generator
|
|
1177
|
+
}
|
|
1178
|
+
await iterator.return();
|
|
1179
|
+
if (predictorAlignmentPromise != null)
|
|
1180
|
+
await predictorAlignmentPromise;
|
|
1181
|
+
}
|
|
1182
|
+
/**
 * Evaluate the provided tokens into the context sequence with custom options for each token.
 *
 * This method allows for more precise control of the generation process.
 *
 * A next token will be generated for a given token only if any of the `generateNext` options for it are used.
 *
 * To generate more tokens after this method finishes,
 * use it again with token(s) you selected to add to the context from the previous evaluation.
 *
 * This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
 * Use the `evaluate` method when you need to use token prediction.
 * @returns An array where for each token in the input array, there can be an output item at the same index in the output array.
 * For indexes that have no output, there won't be any value at the corresponding index in the output array.
 *
 * It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
 */
async controlledEvaluate(input, options) {
    const { evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {} } = options ?? {};
    const contextShiftOptions = {
        size: contextShiftSize,
        strategy: contextShiftStrategy
    };
    this._ensureNotDisposed();
    if (input.length === 0)
        return [];
    await this._abortTokenPredictor();
    const sampler = new LlamaSampler(this.model);
    const onTokenResult = safeEventCallback(options?.onTokenResult);
    // sparse array - `true` only at indexes that requested any `generateNext` output (so logits get computed for them)
    const logitsArray = [];
    const resolvedTokens = input.map((item, index) => {
        // an input item is either a plain token or a `[token, options]` pair
        if (item instanceof Array) {
            const [token, options] = item;
            const generateNext = options?.generateNext ?? {};
            if (generateNext.probabilities === true || generateNext.confidence === true || generateNext.token === true)
                logitsArray[index] = true;
            return token;
        }
        return item;
    });
    const evaluatorLock = await acquireLock([this._lock, "evaluate"]);
    try {
        return await this._decodeTokens(resolvedTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, async (batchLogitIndex, tokenIndex) => {
            const inputToken = input[tokenIndex];
            const inputOptions = inputToken instanceof Array
                ? (inputToken[1] ?? {})
                : {};
            const generateNext = inputOptions.generateNext;
            // skip sampling entirely when no output was requested for this token
            if (generateNext == null || ((generateNext.probabilities == null || !generateNext.probabilities) &&
                (generateNext.token == null || !generateNext.token) &&
                (generateNext.confidence == null || !generateNext.confidence)))
                return undefined;
            const sampleOptions = generateNext.options ?? {};
            const samplerConfig = this._resolveSamplerConfig({
                temperature: sampleOptions.temperature,
                minP: sampleOptions.minP,
                topK: sampleOptions.topK,
                topP: sampleOptions.topP,
                seed: sampleOptions.seed,
                xtc: sampleOptions.xtc,
                repeatPenalty: sampleOptions.repeatPenalty,
                dryRepeatPenalty: sampleOptions.dryRepeatPenalty,
                tokenBias: sampleOptions.tokenBias
            });
            // the sampler is shared across tokens, so its configuration and use are serialized under a lock
            return await withLock([sampler, "sample"], async () => {
                if (sampler.disposed)
                    return undefined;
                sampler.applyConfig(samplerConfig);
                const [token, probabilities, confidence] = await this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, !!generateNext.probabilities, !!generateNext.confidence);
                const output = {
                    next: {}
                };
                if (generateNext.token)
                    // `-1` from the native sampler is normalized to `null`
                    output.next.token = token === -1
                        ? null
                        : (token ?? null);
                if (confidence != null)
                    output.next.confidence = confidence;
                if (probabilities != null)
                    output.next.probabilities = reviveTokenProbabilities(probabilities);
                onTokenResult?.(tokenIndex, output);
                return output;
            });
        }, this._takeIntervalCheckpointIfNeededAfterBatch);
    }
    finally {
        evaluatorLock.dispose();
        // dispose the sampler asynchronously, after any in-flight sampling finishes
        void withLock([sampler, "sample"], sampler.asyncDispose);
    }
}
|
|
1272
|
+
/* eslint-disable @stylistic/max-len */
|
|
1273
|
+
/**
|
|
1274
|
+
* Save the current context sequence evaluation state to a file.
|
|
1275
|
+
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
|
|
1276
|
+
*/
|
|
1277
|
+
async saveStateToFile(filePath) {
|
|
1278
|
+
/* eslint-enable @stylistic/max-len */
|
|
1279
|
+
this._ensureNotDisposed();
|
|
1280
|
+
const resolvedPath = path.resolve(process.cwd(), filePath);
|
|
1281
|
+
const evaluatorLock = await acquireLock([this._lock, "evaluate"]);
|
|
1282
|
+
const contextLock = await acquireLock([this._context, "context"]);
|
|
1283
|
+
try {
|
|
1284
|
+
this._ensureNotDisposed();
|
|
1285
|
+
// TODO: save checkpoints to disk
|
|
1286
|
+
const fileSize = await this._context._ctx.saveSequenceStateToFile(resolvedPath, this._sequenceId, Uint32Array.from(this.contextTokens));
|
|
1287
|
+
return { fileSize };
|
|
1288
|
+
}
|
|
1289
|
+
finally {
|
|
1290
|
+
contextLock.dispose();
|
|
1291
|
+
evaluatorLock.dispose();
|
|
1292
|
+
}
|
|
1293
|
+
}
|
|
1294
|
+
/* eslint-disable @stylistic/max-len */
|
|
1295
|
+
/**
|
|
1296
|
+
* Load a context sequence evaluation state from a file.
|
|
1297
|
+
*
|
|
1298
|
+
* Trying to load a state file with a longer context size than the current sequence's context size will fail and throw an error.
|
|
1299
|
+
*
|
|
1300
|
+
* You must ensure that the file was created from the exact same model, otherwise, using this function may crash the process.
|
|
1301
|
+
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
|
|
1302
|
+
*/
|
|
1303
|
+
async loadStateFromFile(filePath, acceptRisk) {
|
|
1304
|
+
/* eslint-enable @stylistic/max-len */
|
|
1305
|
+
if (!acceptRisk.acceptRisk)
|
|
1306
|
+
throw new Error("The `acceptRisk` option must be set to `true` to use this feature");
|
|
1307
|
+
this._ensureNotDisposed();
|
|
1308
|
+
const resolvedPath = path.resolve(process.cwd(), filePath);
|
|
1309
|
+
const evaluatorLock = await acquireLock([this._lock, "evaluate"]);
|
|
1310
|
+
const contextLock = await acquireLock([this._context, "context"]);
|
|
1311
|
+
try {
|
|
1312
|
+
this._ensureNotDisposed();
|
|
1313
|
+
this._tokenPredictorOwner = {};
|
|
1314
|
+
await this._abortTokenPredictor(true);
|
|
1315
|
+
this._ensureNotDisposed();
|
|
1316
|
+
this._loadedTokenPredictions.length = 0;
|
|
1317
|
+
this._nextTokenIndex = 0;
|
|
1318
|
+
this._contextTokens = [];
|
|
1319
|
+
const tokens = Array.from(await this._context._ctx.loadSequenceStateFromFile(resolvedPath, this._sequenceId, this.contextSize));
|
|
1320
|
+
if (tokens.length > this.contextSize) {
|
|
1321
|
+
this._context._ctx.disposeSequence(this._sequenceId);
|
|
1322
|
+
throw new Error("The given state file is too large for the current context size");
|
|
1323
|
+
}
|
|
1324
|
+
this._contextTokens = tokens;
|
|
1325
|
+
this._nextTokenIndex = tokens.length;
|
|
1326
|
+
this._loadedTokenPredictions.length = 0;
|
|
1327
|
+
}
|
|
1328
|
+
finally {
|
|
1329
|
+
contextLock.dispose();
|
|
1330
|
+
evaluatorLock.dispose();
|
|
1331
|
+
}
|
|
1332
|
+
}
|
|
1333
|
+
/**
|
|
1334
|
+
* When reusing a prefix evaluation state is not possible for the current context sequence
|
|
1335
|
+
* (like in contexts from recurrent and hybrid models,
|
|
1336
|
+
* or with models that use SWA (Sliding Window Attention) when the `swaFullCache` option is not enabled on the context),
|
|
1337
|
+
* you can use this method to checkpoint the current context sequence state.
|
|
1338
|
+
* Those checkpoints will automatically be used when trying to erase parts of the context state that come after a checkpointed state,
|
|
1339
|
+
* and be freed from memory when no longer relevant.
|
|
1340
|
+
*
|
|
1341
|
+
* Those checkpoints are relatively lightweight compared to saving the entire state,
|
|
1342
|
+
* but taking too many checkpoints can increase memory usage.
|
|
1343
|
+
* Checkpoints are stored in the RAM (not VRAM).
|
|
1344
|
+
*
|
|
1345
|
+
* Calling this method on a context sequence from a model that natively supports prefix evaluation state reuse will have no effect.
|
|
1346
|
+
*
|
|
1347
|
+
* > **Note:** to check whether the current context sequence needs taking checkpoints,
|
|
1348
|
+
* > you can use the {@link needsCheckpoints `.needsCheckpoints`} property.
|
|
1349
|
+
*/
|
|
1350
|
+
async takeCheckpoint() {
|
|
1351
|
+
if (!this.needsCheckpoints)
|
|
1352
|
+
return;
|
|
1353
|
+
return await withLock([this._context, "context"], () => {
|
|
1354
|
+
return this._takeCheckpoint(undefined, this._checkpointOptions.max);
|
|
1355
|
+
});
|
|
1356
|
+
}
|
|
1357
|
+
/** @internal */
|
|
1358
|
+
async _takeNamedCheckpoint(name, maxNamedCheckpoints) {
|
|
1359
|
+
if (!this.needsCheckpoints)
|
|
1360
|
+
return;
|
|
1361
|
+
return await withLock([this._context, "context"], () => {
|
|
1362
|
+
return this._takeCheckpoint(name, maxNamedCheckpoints);
|
|
1363
|
+
});
|
|
1364
|
+
}
|
|
1365
|
+
/**
|
|
1366
|
+
* Whether the current context sequence needs taking checkpoints of the context state to be able to reuse
|
|
1367
|
+
* it as a prefix evaluation state in the future.
|
|
1368
|
+
*
|
|
1369
|
+
* See {@link takeCheckpoint `.takeCheckpoint()`} for more details.
|
|
1370
|
+
*/
|
|
1371
|
+
get needsCheckpoints() {
|
|
1372
|
+
if (this.model.fileInsights.isHybrid || this.model.fileInsights.isRecurrent)
|
|
1373
|
+
return true;
|
|
1374
|
+
else if (this.model.fileInsights.swaSize != null && !this._context._swaFullCache)
|
|
1375
|
+
return true;
|
|
1376
|
+
return false;
|
|
1377
|
+
}
|
|
1378
|
+
/**
|
|
1379
|
+
* The index of the last taken checkpoint that's available for prefix reuse
|
|
1380
|
+
*/
|
|
1381
|
+
get lastCheckpointIndex() {
|
|
1382
|
+
return Math.max(0, Math.min(this._checkpoints.lastCheckpointIndex, this.nextTokenIndex - 1));
|
|
1383
|
+
}
|
|
1384
|
+
/**
|
|
1385
|
+
* The total memory usage in bytes of all the checkpoints currently held for this context sequence
|
|
1386
|
+
*/
|
|
1387
|
+
get checkpointsMemoryUsage() {
|
|
1388
|
+
return this._checkpoints.memoryUsage;
|
|
1389
|
+
}
|
|
1390
|
+
/**
 * Take a checkpoint of the current sequence state via the native bindings and store it in the checkpoint collection.
 * Skipped when checkpoints aren't needed, the state is empty, or an equivalent checkpoint already exists.
 * @internal
 */
async _takeCheckpoint(name, maxNamedCheckpoints) {
    if (!this.needsCheckpoints || this._nextTokenIndex === 0 || this._checkpoints.hasCheckpoint(name, this._nextTokenIndex - 1))
        return;
    // free memory ahead of time so the new checkpoint fits under the configured limit
    if (this._checkpointOptions.maxMemory != null)
        this._checkpoints.prepareMemoryForIncomingCheckpoint(this._checkpointOptions.maxMemory);
    const checkpoint = new this.model._llama._bindings.AddonContextSequenceCheckpoint();
    await checkpoint.init(this._context._ctx, this._sequenceId);
    // sanity check: the native checkpoint should cover exactly up to the last evaluated token
    if (this._nextTokenIndex - 1 !== checkpoint.maxPos)
        this.model._llama._log(LlamaLogLevel.warn, `Checkpoint max position mismatch: expected ${this._nextTokenIndex - 1}, got ${checkpoint.maxPos}`);
    this._checkpoints.storeCheckpoint({
        name,
        maxNamedCheckpoints,
        checkpoint,
        currentMaxPos: checkpoint.maxPos
    });
    // enforce the memory limit again after storing, in case the new checkpoint pushed usage over it
    if (this._checkpointOptions.maxMemory != null)
        this._checkpoints.pruneToKeepUnderMemoryUsage(this._checkpointOptions.maxMemory);
}
|
|
1409
|
+
/** @internal */
|
|
1410
|
+
_takeIntervalCheckpointIfNeeded(currentIndex = this._nextTokenIndex - 1) {
|
|
1411
|
+
if (!this.needsCheckpoints)
|
|
1412
|
+
return;
|
|
1413
|
+
const lastCheckpointIndex = this._checkpoints.getLastNamedCheckpointIndex(undefined);
|
|
1414
|
+
if (this._checkpointOptions.interval === false || currentIndex - lastCheckpointIndex < this._checkpointOptions.interval)
|
|
1415
|
+
return;
|
|
1416
|
+
return this._takeCheckpoint(undefined, this._checkpointOptions.max);
|
|
1417
|
+
}
|
|
1418
|
+
/** @internal */
|
|
1419
|
+
_takeIntervalCheckpointIfNeededAfterBatch(sequenceStateLength) {
|
|
1420
|
+
if (sequenceStateLength === 0)
|
|
1421
|
+
return;
|
|
1422
|
+
return this._takeIntervalCheckpointIfNeeded(sequenceStateLength - 1);
|
|
1423
|
+
}
|
|
1424
|
+
/**
 * Core evaluation generator: decodes the given tokens, samples a next token (unless `_noSampling` or
 * `generateNewTokens` is disabled), and yields it along with optional probabilities/confidence metadata.
 * The consumer may pass a replacement token (or array of tokens) back into the generator to control
 * what gets evaluated next; otherwise the sampled token is evaluated.
 * @internal
 */
async *_evaluate(tokens, metadata, { temperature, minP, topK, topP, seed, xtc, grammarEvaluationState, repeatPenalty, dryRepeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false, _skipLock = false }) {
    this._ensureNotDisposed();
    let evalTokens = tokens;
    if (evalTokens.length === 0)
        return;
    await this._abortTokenPredictor(false, true);
    const sampleProbabilities = metadata.probabilities === true;
    const sampleConfidence = metadata.confidence === true;
    const sampler = new LlamaSampler(this.model);
    try {
        while (true) {
            this._ensureNotDisposed();
            // `_skipLock` is used by callers that already hold the evaluate lock
            const evaluatorLock = _skipLock
                ? undefined
                : await acquireLock([this._lock, "evaluate"]);
            let nextToken;
            const yieldRes = {};
            try {
                // request logits only for the last token of the batch, and only when generating
                const logitsArray = [];
                if (generateNewTokens)
                    logitsArray[evalTokens.length - 1] = true;
                // Evaluate to get the next token.
                const decodeResult = await this._decodeTokens(evalTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
                    if (_noSampling)
                        return null;
                    const samplerConfig = this._resolveSamplerConfig({
                        temperature,
                        minP,
                        topK,
                        topP,
                        seed,
                        xtc,
                        grammarEvaluationState,
                        repeatPenalty,
                        dryRepeatPenalty,
                        tokenBias
                    });
                    // sampling is serialized under a lock since the sampler is reused across iterations
                    return withLock([sampler, "sample"], async () => {
                        if (sampler.disposed)
                            return null;
                        sampler.applyConfig(samplerConfig);
                        if (sampleProbabilities || sampleConfidence)
                            return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, sampleProbabilities, sampleConfidence);
                        else
                            return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
                    });
                }, this._takeIntervalCheckpointIfNeededAfterBatch);
                // an array result carries the token together with its requested metadata
                const lastDecodeResult = decodeResult[evalTokens.length - 1];
                if (lastDecodeResult instanceof Array) {
                    const [token, probabilities, confidence] = lastDecodeResult;
                    nextToken = token;
                    if (probabilities != null)
                        yieldRes.probabilities = reviveTokenProbabilities(probabilities);
                    if (confidence != null)
                        yieldRes.confidence = confidence;
                }
                else
                    nextToken = lastDecodeResult;
                if (nextToken === -1)
                    throw new Error("Failed to sample next token");
                // no sampled token means there's nothing to generate (e.g. `generateNewTokens` is disabled)
                if (nextToken == null)
                    return;
                // the model finished generating text
                if (!yieldEogToken && this._context.model.isEogToken(nextToken))
                    break;
            }
            finally {
                evaluatorLock?.dispose();
            }
            yieldRes.token = nextToken;
            const replacementToken = yield yieldRes;
            // set the tokens for the next evaluation
            if (replacementToken instanceof Array)
                evalTokens = replacementToken.slice();
            else if (replacementToken != null)
                evalTokens = [replacementToken];
            else
                evalTokens = [nextToken];
        }
    }
    finally {
        // dispose the sampler after any in-flight sampling finishes
        void withLock([sampler, "sample"], sampler.asyncDispose);
    }
}
|
|
1509
|
+
/** @internal */
|
|
1510
|
+
async *_speculativeEvaluate(tokens, metadata, { temperature, minP, topK, topP, seed, xtc, grammarEvaluationState, repeatPenalty, dryRepeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, contextShiftOptions, yieldEogToken = false, tokenPredictor }) {
|
|
1511
|
+
this._ensureNotDisposed();
|
|
1512
|
+
let evalTokens = tokens.slice();
|
|
1513
|
+
if (evalTokens.length === 0)
|
|
1514
|
+
return;
|
|
1515
|
+
const tokenPredictorOwner = {};
|
|
1516
|
+
this._tokenPredictorOwner = tokenPredictorOwner;
|
|
1517
|
+
await this._abortTokenPredictor();
|
|
1518
|
+
const sampleProbabilities = metadata.probabilities === true;
|
|
1519
|
+
const sampleConfidence = metadata.confidence === true;
|
|
1520
|
+
let logitsArray = [];
|
|
1521
|
+
let logitsStartIndex = evalTokens.length - 1;
|
|
1522
|
+
const validatedTokens = [];
|
|
1523
|
+
logitsArray[logitsStartIndex] = true;
|
|
1524
|
+
const sampler = new LlamaSampler(this.model);
|
|
1525
|
+
try {
|
|
1526
|
+
while (true) {
|
|
1527
|
+
this._ensureNotDisposed();
|
|
1528
|
+
const evaluatorLock = await acquireLock([this._lock, "evaluate"]);
|
|
1529
|
+
let nextToken;
|
|
1530
|
+
const yieldRes = {};
|
|
1531
|
+
try {
|
|
1532
|
+
if (this._tokenPredictorOwner === tokenPredictorOwner &&
|
|
1533
|
+
this._loadedTokenPredictions.length > 0 &&
|
|
1534
|
+
evalTokens.length === 1 &&
|
|
1535
|
+
evalTokens[0] === this._loadedTokenPredictions[0]?.[0]) {
|
|
1536
|
+
const [token, probabilities, confidence] = this._loadedTokenPredictions.shift()[1];
|
|
1537
|
+
nextToken = token;
|
|
1538
|
+
yieldRes.token = nextToken;
|
|
1539
|
+
if (probabilities != null)
|
|
1540
|
+
yieldRes.probabilities = reviveTokenProbabilities(probabilities);
|
|
1541
|
+
if (confidence != null)
|
|
1542
|
+
yieldRes.confidence = confidence;
|
|
1543
|
+
const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
|
|
1544
|
+
? grammarEvaluationState()
|
|
1545
|
+
: grammarEvaluationState;
|
|
1546
|
+
if (resolvedGrammarEvaluationState != null)
|
|
1547
|
+
LlamaSampler._acceptTokenOnGrammarEvaluationState(this._context._llama, resolvedGrammarEvaluationState, nextToken);
|
|
1548
|
+
this._unusedTokenPredictions--;
|
|
1549
|
+
this._usedTokenPredictions++;
|
|
1550
|
+
}
|
|
1551
|
+
else if (this._tokenPredictorOwner === tokenPredictorOwner && this._loadedTokenPredictions.length > 0) {
|
|
1552
|
+
const deleteStartIndex = Math.max(0, this._nextTokenIndex - this._loadedTokenPredictions.length);
|
|
1553
|
+
await this._eraseContextTokenRanges([{ start: deleteStartIndex, end: this._nextTokenIndex }], { canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock: true });
|
|
1554
|
+
this._loadedTokenPredictions.length = 0;
|
|
1555
|
+
if (this.needsCheckpoints) {
|
|
1556
|
+
await this._takeCheckpoint(internalCheckpoints.speculative.name, internalCheckpoints.speculative.maxCheckpoints);
|
|
1557
|
+
await this._takeIntervalCheckpointIfNeeded();
|
|
1558
|
+
}
|
|
1559
|
+
}
|
|
1560
|
+
else if (this._tokenPredictorOwner === tokenPredictorOwner && this.needsCheckpoints) {
|
|
1561
|
+
await this._takeCheckpoint(internalCheckpoints.speculative.name, internalCheckpoints.speculative.maxCheckpoints);
|
|
1562
|
+
await this._takeIntervalCheckpointIfNeeded();
|
|
1563
|
+
}
|
|
1564
|
+
if (this._resetTokenPredictor) {
|
|
1565
|
+
await tokenPredictor.reset({
|
|
1566
|
+
stateTokens: [...this._contextTokens, ...evalTokens],
|
|
1567
|
+
evaluateOptions: {
|
|
1568
|
+
temperature,
|
|
1569
|
+
minP,
|
|
1570
|
+
topK,
|
|
1571
|
+
topP,
|
|
1572
|
+
seed,
|
|
1573
|
+
xtc,
|
|
1574
|
+
grammarEvaluationState: grammarEvaluationState instanceof Function
|
|
1575
|
+
? grammarEvaluationState()?.clone()
|
|
1576
|
+
: grammarEvaluationState?.clone(),
|
|
1577
|
+
repeatPenalty,
|
|
1578
|
+
dryRepeatPenalty,
|
|
1579
|
+
tokenBias,
|
|
1580
|
+
evaluationPriority,
|
|
1581
|
+
contextShift: contextShiftOptions,
|
|
1582
|
+
yieldEogToken: true
|
|
1583
|
+
},
|
|
1584
|
+
targetSequence: this
|
|
1585
|
+
});
|
|
1586
|
+
this._resetTokenPredictor = false;
|
|
1587
|
+
this._tokenPredictorOwner = tokenPredictorOwner;
|
|
1588
|
+
}
|
|
1589
|
+
if (nextToken == null) {
|
|
1590
|
+
if (this._tokenPredictorOwner === tokenPredictorOwner &&
|
|
1591
|
+
// prevent incurring context shifts due to token prediction validations
|
|
1592
|
+
this._nextTokenIndex + evalTokens.length < this._context.contextSize) {
|
|
1593
|
+
const testGrammarClone = grammarEvaluationState instanceof Function
|
|
1594
|
+
? grammarEvaluationState()?.clone()
|
|
1595
|
+
: grammarEvaluationState?.clone();
|
|
1596
|
+
for (const token of await tokenPredictor.predictTokens()) {
|
|
1597
|
+
if (testGrammarClone != null) {
|
|
1598
|
+
const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(this.model._llama, testGrammarClone, token);
|
|
1599
|
+
if (!canAddToken)
|
|
1600
|
+
break;
|
|
1601
|
+
}
|
|
1602
|
+
evalTokens.push(token);
|
|
1603
|
+
logitsArray[evalTokens.length - 1] = true;
|
|
1604
|
+
// prevent incurring context shifts due to token prediction validations
|
|
1605
|
+
if (this._nextTokenIndex + evalTokens.length >= this._context.contextSize)
|
|
1606
|
+
break;
|
|
1607
|
+
}
|
|
1608
|
+
}
|
|
1609
|
+
let resolvedGrammarEvaluationState = undefined;
|
|
1610
|
+
// Evaluate to get the next token.
|
|
1611
|
+
const decodeResult = await this._decodeTokens(evalTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex, tokenIndex) => {
|
|
1612
|
+
if (tokenIndex === logitsStartIndex)
|
|
1613
|
+
resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
|
|
1614
|
+
? grammarEvaluationState()
|
|
1615
|
+
: grammarEvaluationState;
|
|
1616
|
+
else if (tokenIndex === logitsStartIndex + 1)
|
|
1617
|
+
resolvedGrammarEvaluationState = resolvedGrammarEvaluationState?.clone();
|
|
1618
|
+
const samplerConfig = this._resolveSamplerConfig({
|
|
1619
|
+
temperature,
|
|
1620
|
+
minP,
|
|
1621
|
+
topK,
|
|
1622
|
+
topP,
|
|
1623
|
+
seed,
|
|
1624
|
+
xtc,
|
|
1625
|
+
grammarEvaluationState: resolvedGrammarEvaluationState,
|
|
1626
|
+
repeatPenalty,
|
|
1627
|
+
dryRepeatPenalty,
|
|
1628
|
+
tokenBias
|
|
1629
|
+
});
|
|
1630
|
+
return withLock([sampler, "sample"], async () => {
|
|
1631
|
+
if (sampler.disposed)
|
|
1632
|
+
return null;
|
|
1633
|
+
sampler.applyConfig(samplerConfig);
|
|
1634
|
+
if (sampleProbabilities || sampleConfidence)
|
|
1635
|
+
return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, sampleProbabilities, sampleConfidence);
|
|
1636
|
+
else
|
|
1637
|
+
return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
|
|
1638
|
+
});
|
|
1639
|
+
});
|
|
1640
|
+
for (let i = logitsStartIndex; i < evalTokens.length; i++) {
|
|
1641
|
+
const item = decodeResult[i];
|
|
1642
|
+
const [resultToken, probabilities, confidence] = item instanceof Array
|
|
1643
|
+
? item
|
|
1644
|
+
: [item];
|
|
1645
|
+
if (i === logitsStartIndex) {
|
|
1646
|
+
if (resultToken === -1)
|
|
1647
|
+
throw new Error("Failed to sample next token");
|
|
1648
|
+
if (resultToken == null)
|
|
1649
|
+
return;
|
|
1650
|
+
nextToken = resultToken;
|
|
1651
|
+
yieldRes.token = nextToken;
|
|
1652
|
+
if (probabilities != null)
|
|
1653
|
+
yieldRes.probabilities = reviveTokenProbabilities(probabilities);
|
|
1654
|
+
if (confidence != null)
|
|
1655
|
+
yieldRes.confidence = confidence;
|
|
1656
|
+
}
|
|
1657
|
+
else {
|
|
1658
|
+
if (resultToken === -1 || resultToken == null)
|
|
1659
|
+
break;
|
|
1660
|
+
const lastValidatedTokenOutput = i === logitsStartIndex + 1
|
|
1661
|
+
? nextToken
|
|
1662
|
+
: validatedTokens.at(-1)?.[1];
|
|
1663
|
+
if (lastValidatedTokenOutput != null && lastValidatedTokenOutput === evalTokens[i]) {
|
|
1664
|
+
this._loadedTokenPredictions.push([evalTokens[i], [resultToken, probabilities, confidence]]);
|
|
1665
|
+
this._validatedTokenPredictions++;
|
|
1666
|
+
this._unusedTokenPredictions++;
|
|
1667
|
+
}
|
|
1668
|
+
else {
|
|
1669
|
+
const deleteSize = Math.min(evalTokens.length - i, this.context.contextSize);
|
|
1670
|
+
this._refutedTokenPredictions += deleteSize;
|
|
1671
|
+
const deleteStartIndex = this._nextTokenIndex - deleteSize;
|
|
1672
|
+
tokenPredictor.stop(true);
|
|
1673
|
+
await this._eraseContextTokenRanges([{
|
|
1674
|
+
start: deleteStartIndex,
|
|
1675
|
+
end: this._nextTokenIndex
|
|
1676
|
+
}], { canResetTokenPredictor: false, canRemovePredictionTokens: false, skipLock: true });
|
|
1677
|
+
break; // the assumption that this token will be generated was wrong
|
|
1678
|
+
}
|
|
1679
|
+
}
|
|
1680
|
+
}
|
|
1681
|
+
}
|
|
1682
|
+
if (nextToken == null)
|
|
1683
|
+
throw new Error("Failed to generated next token");
|
|
1684
|
+
// the model finished generating text
|
|
1685
|
+
if (!yieldEogToken && this._context.model.isEogToken(nextToken))
|
|
1686
|
+
break;
|
|
1687
|
+
}
|
|
1688
|
+
finally {
|
|
1689
|
+
evaluatorLock.dispose();
|
|
1690
|
+
}
|
|
1691
|
+
const replacementToken = yield yieldRes;
|
|
1692
|
+
// set the tokens for the next evaluation
|
|
1693
|
+
if (replacementToken instanceof Array)
|
|
1694
|
+
evalTokens = replacementToken.slice();
|
|
1695
|
+
else if (replacementToken != null)
|
|
1696
|
+
evalTokens = [replacementToken];
|
|
1697
|
+
else
|
|
1698
|
+
evalTokens = [nextToken];
|
|
1699
|
+
if (this._tokenPredictorOwner === tokenPredictorOwner)
|
|
1700
|
+
tokenPredictor.pushTokens(evalTokens);
|
|
1701
|
+
logitsArray = [];
|
|
1702
|
+
logitsStartIndex = evalTokens.length - 1;
|
|
1703
|
+
logitsArray[logitsStartIndex] = true;
|
|
1704
|
+
}
|
|
1705
|
+
}
|
|
1706
|
+
finally {
|
|
1707
|
+
void withLock([sampler, "sample"], sampler.asyncDispose);
|
|
1708
|
+
if (this._tokenPredictorOwner === tokenPredictorOwner)
|
|
1709
|
+
tokenPredictor.stop();
|
|
1710
|
+
}
|
|
1711
|
+
}
|
|
1712
|
+
/** @internal */
|
|
1713
|
+
async _abortTokenPredictor(skipClearingPredictionsFromState = false, skipLock = false) {
|
|
1714
|
+
this._tokenPredictor?.stop();
|
|
1715
|
+
this._resetTokenPredictor = true;
|
|
1716
|
+
if (skipClearingPredictionsFromState)
|
|
1717
|
+
return;
|
|
1718
|
+
if (this._loadedTokenPredictions.length > 0)
|
|
1719
|
+
await this._eraseContextTokenRanges([{
|
|
1720
|
+
start: this._nextTokenIndex - this._loadedTokenPredictions.length,
|
|
1721
|
+
end: this._nextTokenIndex
|
|
1722
|
+
}], { canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock });
|
|
1723
|
+
}
|
|
1724
|
+
/** @internal */
|
|
1725
|
+
_resolveSamplerConfig({ temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, xtc, grammarEvaluationState, repeatPenalty, dryRepeatPenalty, tokenBias }) {
|
|
1726
|
+
const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
|
|
1727
|
+
? repeatPenalty.punishTokens()
|
|
1728
|
+
: repeatPenalty?.punishTokens;
|
|
1729
|
+
const maxPunishTokens = Math.max(repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens, repeatPenaltyTokens?.length ?? 0);
|
|
1730
|
+
const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
|
|
1731
|
+
? grammarEvaluationState()
|
|
1732
|
+
: grammarEvaluationState;
|
|
1733
|
+
if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
|
|
1734
|
+
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
|
|
1735
|
+
const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
|
|
1736
|
+
return removeNullFields({
|
|
1737
|
+
temperature,
|
|
1738
|
+
minP,
|
|
1739
|
+
topK,
|
|
1740
|
+
topP,
|
|
1741
|
+
seed: Math.max(0, Number.isFinite(seed)
|
|
1742
|
+
? Math.floor(seed ?? (Date.now() / 1000))
|
|
1743
|
+
: Math.floor(Date.now() / 1000)),
|
|
1744
|
+
xtcProbability: xtc == null
|
|
1745
|
+
? undefined
|
|
1746
|
+
: Math.min(1, Math.max(0, xtc.probability)),
|
|
1747
|
+
xtcThreshold: xtc == null
|
|
1748
|
+
? undefined
|
|
1749
|
+
: Math.min(1, Math.max(0, xtc.threshold)),
|
|
1750
|
+
repeatPenalty: repeatPenalty?.penalty,
|
|
1751
|
+
repeatPenaltyMaxTokens: maxPunishTokens,
|
|
1752
|
+
repeatPenaltyTokens: repeatPenaltyTokens != null
|
|
1753
|
+
? Uint32Array.from(repeatPenaltyTokens)
|
|
1754
|
+
: undefined,
|
|
1755
|
+
repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
|
|
1756
|
+
repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
|
|
1757
|
+
dryRepeatPenaltyStrength: (dryRepeatPenalty?.strength == null || dryRepeatPenalty?.strength === 0)
|
|
1758
|
+
? undefined
|
|
1759
|
+
: Math.max(0, dryRepeatPenalty?.strength),
|
|
1760
|
+
dryRepeatPenaltyBase: dryRepeatPenalty?.base,
|
|
1761
|
+
dryRepeatPenaltyAllowedLength: Math.max(1, dryRepeatPenalty?.allowedLength ?? 2),
|
|
1762
|
+
dryRepeatPenaltyLastTokens: dryRepeatPenalty?.lastTokens == null
|
|
1763
|
+
? -1
|
|
1764
|
+
: Math.max(0, dryRepeatPenalty?.lastTokens),
|
|
1765
|
+
dryRepeatPenaltySequenceBreakers: dryRepeatPenalty?.sequenceBreakers ?? defaultDryRepeatPenalitySequenceBreakers,
|
|
1766
|
+
tokenBiasKeys,
|
|
1767
|
+
tokenBiasValues,
|
|
1768
|
+
grammarEvaluationState: resolvedGrammarEvaluationState?._state
|
|
1769
|
+
});
|
|
1770
|
+
}
|
|
1771
|
+
/**
 * Decodes the given tokens into the context state, performing context shifts as needed,
 * and returns the values produced by `logitDataMapper` for each position that had its logits flag set.
 *
 * The caller of this function has to wrap it with a lock to ensure this function doesn't run concurrently.
 * @internal
 */
async _decodeTokens(tokens, logits, evaluationPriority, tokenMeter, contextShiftOptions, logitDataMapper, afterBatchAction) {
    this._ensureNotDisposed();
    // copy the inputs so consuming them below doesn't mutate the caller's arrays
    const tokensLeftToDecode = tokens.slice();
    const tokenLogitsLeftToDecode = logits.slice();
    let currentTokenIndex = 0;
    const res = [];
    // translate a context-state token index back into an index relative to the input `tokens` array
    const normalizedLogitDataMapper = (batchLogitIndex, contextStateTokenIndex) => {
        return logitDataMapper(batchLogitIndex, currentTokenIndex + (contextStateTokenIndex - this._nextTokenIndex));
    };
    while (tokensLeftToDecode.length > 0) {
        this._ensureNotDisposed();
        // one slot is always kept free below the full context size
        let freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
        if (freeSpace <= 0) {
            // the context is full - perform a context shift to free up room
            await this._freeUpSpaceForTokens(contextShiftOptions);
            freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
            if (freeSpace <= 0)
                throw new Error("Failed to free up space for new tokens");
        }
        const tokensToDecode = tokensLeftToDecode.splice(0, freeSpace);
        // NOTE(review): `tokenLogitsLeftToDecode` is only `slice`d here, never consumed, so if this
        // loop runs more than once the logits flags may not stay aligned with the remaining tokens
        // (tokens are `splice`d above) - confirm whether multi-chunk decodes rely on this
        const tokensLogits = tokenLogitsLeftToDecode.slice(0, tokensToDecode.length);
        const generatedLogits = await this._context._decodeTokens({
            sequenceId: this._sequenceId,
            tokens: tokensToDecode,
            firstTokenSequenceIndex: this._nextTokenIndex,
            logits: tokensLogits,
            evaluationPriority,
            tokenMeter,
            afterBatchAction
        }, normalizedLogitDataMapper);
        // place each generated value at its position relative to the input `tokens` array
        for (const [index, value] of generatedLogits)
            res[currentTokenIndex + (index - this._nextTokenIndex)] = value;
        this._nextTokenIndex += tokensToDecode.length;
        currentTokenIndex += tokensToDecode.length;
        this._contextTokens = this._contextTokens.concat(tokensToDecode);
    }
    return res;
}
|
|
1812
|
+
/** @internal */
|
|
1813
|
+
async _freeUpSpaceForTokens(contextShiftOptions) {
|
|
1814
|
+
this._ensureNotDisposed();
|
|
1815
|
+
const size = Math.min(this._nextTokenIndex, Math.max(1, contextShiftOptions.size instanceof Function
|
|
1816
|
+
? await contextShiftOptions.size(this)
|
|
1817
|
+
: contextShiftOptions.size));
|
|
1818
|
+
this._ensureNotDisposed();
|
|
1819
|
+
if (contextShiftOptions.strategy === "eraseBeginning") {
|
|
1820
|
+
let eraseStartIndex = 0;
|
|
1821
|
+
if (this.model.tokens.bos != null && this._contextTokens[0] === this.model.tokens.bos)
|
|
1822
|
+
eraseStartIndex = 1;
|
|
1823
|
+
await this._eraseContextTokenRanges([{ start: eraseStartIndex, end: size + eraseStartIndex }], { skipLock: true });
|
|
1824
|
+
}
|
|
1825
|
+
else {
|
|
1826
|
+
const ranges = await contextShiftOptions.strategy({
|
|
1827
|
+
sequence: this,
|
|
1828
|
+
size
|
|
1829
|
+
});
|
|
1830
|
+
if (ranges == null)
|
|
1831
|
+
throw new Error("Invalid delete ranges");
|
|
1832
|
+
await this._eraseContextTokenRanges(ranges, { skipLock: true });
|
|
1833
|
+
if (this._nextTokenIndex >= this._context.contextSize - 1)
|
|
1834
|
+
await this._eraseContextTokenRanges([{ start: 0, end: size }], { skipLock: true });
|
|
1835
|
+
}
|
|
1836
|
+
}
|
|
1837
|
+
/** @internal */
|
|
1838
|
+
_ensureNotDisposed() {
|
|
1839
|
+
if (this._disposed)
|
|
1840
|
+
throw new DisposedError();
|
|
1841
|
+
}
|
|
1842
|
+
/**
|
|
1843
|
+
* We need this to make it impossible to manually create instances of this class outside the code of this library
|
|
1844
|
+
* @internal
|
|
1845
|
+
*/
|
|
1846
|
+
static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor, checkpoints }) {
|
|
1847
|
+
return new LlamaContextSequence({
|
|
1848
|
+
sequenceId,
|
|
1849
|
+
context,
|
|
1850
|
+
tokenMeter,
|
|
1851
|
+
contextShift: {
|
|
1852
|
+
size: contextShiftSize,
|
|
1853
|
+
strategy: contextShiftStrategy
|
|
1854
|
+
},
|
|
1855
|
+
tokenPredictor,
|
|
1856
|
+
checkpoints
|
|
1857
|
+
});
|
|
1858
|
+
}
|
|
1859
|
+
}
|
|
1860
|
+
/**
 * Converts a `TokenBias` instance (or a factory returning one) into the typed-array
 * key/value pair format expected by the native addon.
 * Returns `undefined` keys/values when no bias is provided or the bias is empty.
 * Throws when the bias was created for a different model's tokenizer.
 */
function getTokenBiasesForAddon(tokenBias, currentModel) {
    if (tokenBias == null)
        return {
            tokenBiasKeys: undefined,
            tokenBiasValues: undefined
        };
    // the token bias may be provided lazily as a factory function
    const resolvedTokenBias = tokenBias instanceof Function
        ? tokenBias()
        : tokenBias;
    if (resolvedTokenBias._tokenizer !== currentModel.tokenizer)
        throw new Error("This TokenBias instance was created with a different model than the one used by this context. " +
            "Make sure you use the model instance of the context sequence for the TokenBias you use it with.");
    const tokenBiasKeys = [];
    const tokenBiasValues = [];
    for (const [token, bias] of resolvedTokenBias._biases) {
        tokenBiasKeys.push(token);
        tokenBiasValues.push(bias);
    }
    if (tokenBiasKeys.length === 0 || tokenBiasValues.length === 0)
        return {
            tokenBiasKeys: undefined,
            tokenBiasValues: undefined
        };
    return {
        tokenBiasKeys: Uint32Array.from(tokenBiasKeys),
        tokenBiasValues: Float32Array.from(tokenBiasValues)
    };
}
|
|
1888
|
+
/**
 * Rebuilds a token->probability `Map` from a flat `[token, probability, token, probability, ...]`
 * array as serialized by the native addon. Returns `undefined` when no probabilities were provided.
 */
function reviveTokenProbabilities(probabilities) {
    if (probabilities == null)
        return undefined;
    const res = new Map();
    // consume the flat array in [token, probability] pairs; a trailing unpaired element is ignored
    for (let pairStart = 0; pairStart + 1 < probabilities.length; pairStart += 2)
        res.set(probabilities[pairStart], probabilities[pairStart + 1]);
    return res;
}
|
|
1899
|
+
/**
 * Disposes the context held by the given weak reference, if it hasn't been garbage-collected yet.
 * `dispose()` returns a promise that is intentionally not awaited (fire-and-forget).
 */
function disposeContextIfReferenced(contextRef) {
    // deref() yields undefined once the context has been collected
    void contextRef.deref()?.dispose();
}
|
|
1904
|
+
/**
 * Disposes the context sequence held by the given weak reference,
 * if it hasn't been garbage-collected yet.
 */
function disposeContextSequenceIfReferenced(contextRef) {
    // deref() yields undefined once the sequence has been collected
    contextRef.deref()?.dispose();
}
|
|
1909
|
+
/**
 * Computes the default batch size for a context: the total number of tokens
 * across all sequences, capped at 512.
 */
export function getDefaultContextBatchSize({ contextSize, sequences }) {
    const maxBatchSize = 512;
    return Math.min(maxBatchSize, contextSize * sequences);
}
|
|
1912
|
+
/**
 * Returns the default number of sequences for a context.
 * A single sequence is used unless the caller asks for more.
 */
export function getDefaultContextSequences() {
    return 1;
}
|
|
1915
|
+
// used when the model's metadata doesn't declare a training context size
const defaultFallbackContextSize = 4096;
/**
 * Resolves the default context size for a model: its training context size
 * when known, otherwise a fallback of 4096.
 */
export function getDefaultModelContextSize({ trainContextSize }) {
    if (trainContextSize != null)
        return trainContextSize;
    return defaultFallbackContextSize;
}
|
|
1919
|
+
//# sourceMappingURL=LlamaContext.js.map
|