npm - node-llama-cpp - Versions diffs - 3.0.0-beta.13 → 3.0.0-beta.15 - Mend

node-llama-cpp 3.0.0-beta.13 → 3.0.0-beta.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (351) hide show

package/dist/evaluator/LlamaContext/LlamaContext.d.ts CHANGED Viewed

@@ -1,21 +1,26 @@
 import { EventRelay } from "lifecycle-utils";
 import { Token } from "../../types.js";
-import { LlamaModel } from "../LlamaModel.js";
 import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
+import { GgufInsights } from "../../gguf/GgufInsights.js";
+import { TokenMeter } from "../TokenMeter.js";
+import { BuildGpu } from "../../bindings/types.js";
 import { ContextShiftOptions, ContextTokensDeleteRange, EvaluationPriority, LlamaContextOptions, LlamaContextSequenceRepeatPenalty } from "./types.js";
+import type { LlamaModel } from "../LlamaModel.js";
 export declare class LlamaContext {
     readonly onDispose: EventRelay<void>;
-    /**
-     * @param options
-     */
-    constructor({ model, sequences, seed, contextSize, batchSize, threads, batching: { dispatchSchedule: batchingDispatchSchedule, itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy }, _embedding, _noSeed }: LlamaContextOptions);
-    dispose(): void;
+    private constructor();
+    dispose(): Promise<void>;
     /** @hidden */
-    [Symbol.dispose](): void;
+    [Symbol.asyncDispose](): Promise<void>;
     get disposed(): boolean;
     get model(): LlamaModel;
     get contextSize(): number;
     get batchSize(): number;
+    /**
+     * The actual size of the state in the memory in bytes.
+     * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
+     */
+    get stateSize(): number;
     getAllocatedContextSize(): number;
     get totalSequences(): number;
     get sequencesLeft(): number;
@@ -24,10 +29,15 @@ export declare class LlamaContext {
      * When there are no sequences left, this method will throw an error.
      * @param [options]
      */
-    getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy } }?: {
+    getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy }, _tokenMeter }?: {
         contextShift?: ContextShiftOptions;
     }): LlamaContextSequence;
     dispatchPendingBatch(): void;
+    /**
+     * Print the timings of token evaluation since that last print for this context.
+     * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
+     * it won't print anything.
+     */
     printTimings(): Promise<void>;
 }
 export declare class LlamaContextSequence {
@@ -41,6 +51,7 @@ export declare class LlamaContextSequence {
     get model(): LlamaModel;
     get nextTokenIndex(): number;
     get contextTokens(): Token[];
+    get tokenMeter(): TokenMeter;
     get isLoadedToMemory(): boolean;
     compareContextTokens(tokens: Token[]): {
         firstDifferentIndex: number;
@@ -52,7 +63,7 @@ export declare class LlamaContextSequence {
     clearHistory(): Promise<void>;
     /**
      * Erase context tokens in the provided ranges to free up space for new tokens to be generated.
-     * the start and end of each range are exclusive.
+     * The start of each range is inclusive, and the end of each range is exclusive.
      * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
      */
     eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
@@ -107,3 +118,26 @@ export declare class LlamaContextSequence {
         contextShift?: ContextShiftOptions;
     }): Promise<void>;
 }
+export declare function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, ignoreMemorySafetyChecks, isEmbeddingContext }: {
+    contextSize?: LlamaContextOptions["contextSize"];
+    batchSize?: LlamaContextOptions["batchSize"];
+    sequences: number;
+    modelFileInsights: GgufInsights;
+    modelGpuLayers: number;
+    modelTrainContextSize: number;
+    getVramState(): {
+        total: number;
+        free: number;
+    };
+    llamaGpu: BuildGpu;
+    ignoreMemorySafetyChecks?: boolean;
+    isEmbeddingContext?: boolean;
+}): number;
+export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
+    contextSize: number;
+    sequences: number;
+}): number;
+export declare function getDefaultContextSequences(): number;
+export declare function getDefaultModelContextSize({ trainContextSize }: {
+    trainContextSize?: number;
+}): number;

package/dist/evaluator/LlamaContext/LlamaContext.js CHANGED Viewed

@@ -1,11 +1,15 @@
-import { DisposeAggregator, EventRelay, withLock, DisposedError } from "lifecycle-utils";
+import { DisposeAggregator, EventRelay, withLock, DisposedError, AsyncDisposeAggregator } from "lifecycle-utils";
 import { removeNullFields } from "../../utils/removeNullFields.js";
 import { compareTokens } from "../../utils/compareTokens.js";
-import { resolveBatchItemsPrioritizingStrategy } from "./utils/resolveBatchItemsPrioritizingStrategy.js";
+import { DisposeGuard } from "../../utils/DisposeGuard.js";
+import { minAllowedContextSizeInCalculations } from "../../config.js";
+import { TokenMeter } from "../TokenMeter.js";
+import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
 export class LlamaContext {
     /** @internal */ _llama;
     /** @internal */ _ctx;
     /** @internal */ _onReclaimUnusedSequenceId = new EventRelay();
+    /** @internal */ _backendContextDisposeGuard;
     /** @internal */ _model;
     /** @internal */ _contextSize;
     /** @internal */ _batchSize;
@@ -14,7 +18,8 @@ export class LlamaContext {
     /** @internal */ _batchingOptions;
     /** @internal */ _queuedDecodeSequenceIds = new Set();
     /** @internal */ _queuedDecodes = [];
-    /** @internal */ _disposeAggregator = new DisposeAggregator();
+    /** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
+    /** @internal */ _modelPreventDisposalHandle;
     /** @internal */ _nextGeneratedSequenceId = 0;
     /** @internal */ _dispatchDecodeScheduled = false;
     /** @internal */ _batchDispatchPending = false;
@@ -22,14 +27,13 @@ export class LlamaContext {
     /** @internal */ _allocatedContextSize;
     /** @internal */ _disposed = false;
     onDispose = new EventRelay();
-    /**
-     * @param options
-     */
-    constructor({ model, sequences = 1, seed = null, contextSize = model.trainContextSize, batchSize = contextSize, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy = "maximumParallelism" } = {}, _embedding, _noSeed }) {
-        if (model.disposed)
+    constructor({ _model }, { sequences, seed = null, contextSize, batchSize, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
+        if (_model.disposed)
             throw new DisposedError();
-        this._llama = model._llama;
-        this._model = model;
+        this._llama = _model._llama;
+        this._model = _model;
+        this._backendContextDisposeGuard = new DisposeGuard([this._model._backendModelDisposeGuard]);
+        this._modelPreventDisposalHandle = this._model._backendModelDisposeGuard.createPreventDisposalHandle();
         this._totalSequences = Math.max(1, Math.floor(sequences));
         this._contextSize = Math.max(2, contextSize);
         this._batchSize = Math.max(batchSize, this._totalSequences);
@@ -37,30 +41,36 @@ export class LlamaContext {
             seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined,
             contextSize: this._contextSize * this._totalSequences,
             batchSize: this._batchSize,
+            sequences: this._totalSequences,
             threads: Math.max(0, Math.floor(threads)),
-            embedding: _embedding,
+            embeddings: _embeddings,
             noSeed: _noSeed
         }));
         this._batchingOptions = {
             dispatchSchedule: batchingDispatchSchedule,
-            itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy
+            itemPrioritizationStrategy: batchingItemsPrioritizationStrategy
         };
         this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
-        this._disposeAggregator.add(this._onReclaimUnusedSequenceId);
-        this._disposeAggregator.add(this.onDispose.dispatchEvent);
         this._disposeAggregator.add(() => {
-            this._ctx.dispose();
+            this._disposed = true;
         });
+        this._disposeAggregator.add(this._onReclaimUnusedSequenceId);
+        this._disposeAggregator.add(this.onDispose.dispatchEvent);
         this._disposeAggregator.add(this.model.onDispose.createListener(disposeContextIfReferenced.bind(null, new WeakRef(this))));
+        this._disposeAggregator.add(async () => {
+            await this._backendContextDisposeGuard.acquireDisposeLock();
+            await this._ctx.dispose();
+            this._modelPreventDisposalHandle.dispose();
+        });
     }
-    dispose() {
+    async dispose() {
         if (this._disposed)
             return;
         this._disposed = true;
-        this._disposeAggregator.dispose();
+        await this._disposeAggregator.dispose();
     }
     /** @hidden */
-    [Symbol.dispose]() {
+    [Symbol.asyncDispose]() {
         return this.dispose();
     }
     get disposed() {
@@ -75,6 +85,14 @@ export class LlamaContext {
     get batchSize() {
         return this._batchSize;
     }
+    /**
+     * The actual size of the state in the memory in bytes.
+     * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
+     */
+    get stateSize() {
+        this._ensureNotDisposed();
+        return this._ctx.getStateSize();
+    }
     getAllocatedContextSize() {
         this._ensureNotDisposed();
         if (this._allocatedContextSize == null)
@@ -92,7 +110,7 @@ export class LlamaContext {
      * When there are no sequences left, this method will throw an error.
      * @param [options]
      */
-    getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} } = {}) {
+    getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, _tokenMeter } = {}) {
         this._ensureNotDisposed();
         const nextSequenceId = this._popSequenceId();
         if (nextSequenceId == null)
@@ -100,6 +118,7 @@ export class LlamaContext {
         return LlamaContextSequence._create({
             sequenceId: nextSequenceId,
             context: this,
+            tokenMeter: _tokenMeter,
             contextShift: {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
@@ -116,17 +135,18 @@ export class LlamaContext {
             this._currentDispatchBatchHandle = {};
             this._dispatchDecodeScheduled = false;
             this._batchDispatchPending = false;
-            let prioritizeStrategy;
-            try {
-                this._ensureNotDisposed();
-                prioritizeStrategy = resolveBatchItemsPrioritizingStrategy(this._batchingOptions.itemsPrioritizingStrategy);
-            }
-            catch (err) {
-                this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
-                return;
-            }
-            let shouldHaveAnotherBatch = this._queuedDecodes.length > 0;
-            while (shouldHaveAnotherBatch) {
+            let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
+            const resolvePrioritizationStrategy = () => {
+                try {
+                    this._ensureNotDisposed();
+                    return resolveBatchItemsPrioritizationStrategy(this._batchingOptions.itemPrioritizationStrategy);
+                }
+                catch (err) {
+                    this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
+                }
+                return null;
+            };
+            const getOrderedQueuedDecodes = (prioritizationStrategy) => {
                 const batchItemToQueuedDecodeMap = new Map();
                 const batchItemsList = [];
                 for (const queuedDecode of this._queuedDecodes) {
@@ -139,42 +159,65 @@ export class LlamaContext {
                 }
                 let prioritizedItems;
                 try {
-                    prioritizedItems = prioritizeStrategy({
+                    prioritizedItems = prioritizationStrategy({
                         items: batchItemsList,
                         size: this._batchSize
                     });
                 }
                 catch (err) {
                     this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
-                    return;
+                    return null;
                 }
-                let batchTokenSlotsLeft = this._batchSize;
-                const afterDecodeActions = [];
-                const queuedDecodesToDelete = new Set();
-                const currentQueuedDecodeItems = new Set();
-                const currentBatchItems = [];
-                let currentBatchSize = 0;
-                for (const prioritizedItem of prioritizedItems) {
+                return prioritizedItems.map((prioritizedItem) => {
                     const queuedDecode = batchItemToQueuedDecodeMap.get(prioritizedItem.item);
                     if (queuedDecode == null)
                         throw new Error("Received invalid batch item. Make sure you keep the original object reference " +
                             "of the batch item on `item` on `PrioritizedBatchItem` in your custom prioritization strategy");
-                    const processAmount = Math.min(queuedDecode.tokens.length, prioritizedItem.processAmount, batchTokenSlotsLeft);
-                    if (processAmount <= 0)
+                    return {
+                        queuedDecode,
+                        processAmount: prioritizedItem.processAmount
+                    };
+                });
+            };
+            const fitQueuedDecodesToABatch = (queuedDecodes, batchSize) => {
+                const currentBatchItems = [];
+                let currentBatchSize = 0;
+                let batchTokenSlotsLeft = batchSize;
+                for (const { queuedDecode, processAmount } of queuedDecodes) {
+                    const resolvedProcessAmount = Math.min(processAmount <= 0 ? 1 : processAmount, queuedDecode.tokens.length, batchTokenSlotsLeft);
+                    if (resolvedProcessAmount <= 0) {
+                        if (batchTokenSlotsLeft === 0)
+                            break;
                         continue;
-                    batchTokenSlotsLeft -= processAmount;
+                    }
+                    batchTokenSlotsLeft -= resolvedProcessAmount;
+                    currentBatchSize += resolvedProcessAmount;
                     currentBatchItems.push({
                         queuedDecode,
-                        processAmount
+                        processAmount: resolvedProcessAmount
                     });
-                    currentBatchSize += processAmount;
                 }
+                return {
+                    currentBatchItems,
+                    currentBatchSize
+                };
+            };
+            const decodeTokenBatchItems = async (batchItems, currentBatchSize) => {
+                const afterDecodeActions = [];
+                const queuedDecodesToDelete = new Set();
+                const currentQueuedDecodeItems = new Set();
                 if (currentBatchSize !== 0)
                     this._ctx.initBatch(currentBatchSize);
-                for (const { queuedDecode, processAmount } of currentBatchItems) {
+                for (const { queuedDecode, processAmount } of batchItems) {
                     let batchLogitIndex;
                     try {
-                        batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(queuedDecode.tokens.slice(0, processAmount)), queuedDecode.generateLogitAtTheEnd && processAmount === queuedDecode.tokens.length);
+                        const shouldGenerateLogitAtTheEnd = queuedDecode.generateLogitAtTheEnd &&
+                            processAmount === queuedDecode.tokens.length;
+                        const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
+                        const numberOfOutputTokens = shouldGenerateLogitAtTheEnd ? 1 : 0;
+                        TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
+                        TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
+                        batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), shouldGenerateLogitAtTheEnd);
                     }
                     catch (err) {
                         this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
@@ -193,8 +236,6 @@ export class LlamaContext {
                         queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
                         queuedDecode.firstTokenSequenceIndex += processAmount;
                     }
-                    if (batchTokenSlotsLeft === 0)
-                        break;
                 }
                 for (let i = 0; i < this._queuedDecodes.length; i++) {
                     const queuedDecode = this._queuedDecodes[i];
@@ -204,7 +245,6 @@ export class LlamaContext {
                         i--;
                     }
                 }
-                shouldHaveAnotherBatch = this._queuedDecodes.length > 0;
                 try {
                     if (currentBatchSize !== 0)
                         await this._ctx.decodeBatch();
@@ -225,15 +265,45 @@ export class LlamaContext {
                     }
                     accept(undefined);
                 }
+            };
+            const prioritizationStrategy = resolvePrioritizationStrategy();
+            if (prioritizationStrategy == null)
+                return; // all queued items are rejected and dequeued when we get here
+            while (shouldHaveAnotherLoop) {
+                const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
+                if (orderedQueuedDecodes == null)
+                    return; // all queued items are rejected and dequeued when we get here
+                const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
+                let preventDisposalHandle;
+                try {
+                    preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
+                }
+                catch (err) {
+                    this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
+                    return;
+                }
+                try {
+                    await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
+                    shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
+                }
+                finally {
+                    preventDisposalHandle.dispose();
+                }
             }
         });
     }
+    /**
+     * Print the timings of token evaluation since that last print for this context.
+     * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
+     * it won't print anything.
+     */
     async printTimings() {
+        this._ensureNotDisposed();
         this._ctx.printTimings();
         await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
     }
     /** @internal */
-    async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5 }, onDone) {
+    async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5, tokenMeter }, onDone) {
         return await new Promise((accept, reject) => {
             this._queuedDecodes.push({
                 sequenceId,
@@ -241,6 +311,7 @@ export class LlamaContext {
                 firstTokenSequenceIndex,
                 generateLogitAtTheEnd,
                 evaluationPriority,
+                tokenMeter,
                 response: [accept, reject],
                 onDone
             });
@@ -253,6 +324,8 @@ export class LlamaContext {
         if (this._disposed)
             return;
         void withLock(this, "context", async () => {
+            if (this._disposed)
+                return;
             this._ctx.disposeSequence(sequenceId);
             this._unusedSequenceIds.push(sequenceId);
             this._onReclaimUnusedSequenceId.dispatchEvent();
@@ -312,20 +385,65 @@ export class LlamaContext {
         if (this._disposed)
             throw new DisposedError();
     }
+    /** @internal */
+    static async _create(options, { _model }) {
+        const sequences = options.sequences ?? getDefaultContextSequences();
+        const contextSize = resolveContextContextSizeOption({
+            contextSize: options.contextSize,
+            batchSize: options.batchSize,
+            sequences: sequences,
+            modelFileInsights: _model.fileInsights,
+            modelGpuLayers: _model.gpuLayers,
+            modelTrainContextSize: _model.trainContextSize,
+            getVramState: () => _model._llama._vramOrchestrator.getMemoryState(),
+            llamaGpu: _model._llama.gpu,
+            ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks,
+            isEmbeddingContext: options._embeddings
+        });
+        const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
+        const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
+            contextSize,
+            sequences,
+            isEmbeddingContext: options._embeddings,
+            modelGpuLayers: _model.gpuLayers,
+            batchSize
+        }).gpuVram;
+        const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences });
+        const { createSignal } = options;
+        const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
+            ? null
+            : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
+        try {
+            const contextLoaded = await context._ctx.init();
+            if (createSignal?.aborted) {
+                if (contextLoaded)
+                    await context._ctx.dispose();
+                throw createSignal.reason;
+            }
+            else if (!contextLoaded)
+                throw new Error("Failed to create context");
+            return context;
+        }
+        finally {
+            contextCreationMemoryReservation?.dispose?.();
+        }
+    }
 }
 export class LlamaContextSequence {
     /** @internal */ _sequenceId;
     /** @internal */ _gcRegistry;
     /** @internal */ _context;
     /** @internal */ _contextShift;
+    /** @internal */ _tokenMeter;
     /** @internal */ _disposeAggregator = new DisposeAggregator();
     /** @internal */ _contextTokens = [];
     /** @internal */ _nextTokenIndex = 0;
     /** @internal */ _disposed = false;
     onDispose = new EventRelay();
-    constructor({ sequenceId, context, contextShift }) {
+    constructor({ sequenceId, context, tokenMeter, contextShift }) {
         this._sequenceId = sequenceId;
         this._context = context;
+        this._tokenMeter = tokenMeter ?? new TokenMeter();
         this._contextShift = contextShift;
         this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
         this._gcRegistry.register(this, sequenceId);
@@ -362,6 +480,9 @@ export class LlamaContextSequence {
     get contextTokens() {
         return this._contextTokens.slice();
     }
+    get tokenMeter() {
+        return this._tokenMeter;
+    }
     get isLoadedToMemory() {
         return !this._disposed;
     }
@@ -387,7 +508,7 @@ export class LlamaContextSequence {
     }
     /**
      * Erase context tokens in the provided ranges to free up space for new tokens to be generated.
-     * the start and end of each range are exclusive.
+     * The start of each range is inclusive, and the end of each range is exclusive.
      * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
      */
     async eraseContextTokenRanges(ranges) {
@@ -396,6 +517,8 @@ export class LlamaContextSequence {
             this._ensureNotDisposed();
             if (ranges.length === 0)
                 return;
+            // if the deletion fails, we'll have to dispose the sequence and fill it up again
+            let deletionSuccessful = true;
             const resolvedRanges = ranges
                 .map(({ start, end }) => {
                 if (start === end)
@@ -425,15 +548,22 @@ export class LlamaContextSequence {
             let lastDeleteRangeEndPos = null;
             for (const range of resolvedRanges) {
                 this._contextTokens.splice(range.start - removedTokens, range.end - range.start);
-                this._context._ctx.removeTokenCellsFromSequence(this._sequenceId, range.start, range.end);
-                if (lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== range.start)
+                if (deletionSuccessful)
+                    deletionSuccessful &&= this._context._ctx.removeTokenCellsFromSequence(this._sequenceId, range.start, range.end);
+                if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== range.start)
                     this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, range.start, -removedTokens);
                 removedTokens += range.end - range.start;
                 lastDeleteRangeEndPos = range.end;
             }
-            if (lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== this._nextTokenIndex)
+            if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== this._nextTokenIndex)
                 this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, this._nextTokenIndex, -removedTokens);
             this._nextTokenIndex -= removedTokens;
+            if (deletionSuccessful)
+                return;
+            const newSequenceTokens = this._contextTokens.slice();
+            this._nextTokenIndex = 0;
+            this._context._ctx.disposeSequence(this._sequenceId);
+            await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens);
         });
     }
     /**
@@ -485,7 +615,7 @@ export class LlamaContextSequence {
         while (true) {
             this._ensureNotDisposed();
             // Evaluate to get the next token.
-            const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, contextShiftOptions, (batchLogitIndex) => {
+            const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
                 const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
                     ? repeatPenalty.punishTokens()
                     : repeatPenalty?.punishTokens;
@@ -519,7 +649,7 @@ export class LlamaContextSequence {
         }
     }
     /** @internal */
-    async _decodeTokens(tokens, generateLogit, evaluationPriority, contextShiftOptions, onDecodeDone) {
+    async _decodeTokens(tokens, generateLogit, evaluationPriority, tokenMeter, contextShiftOptions, onDecodeDone) {
         this._ensureNotDisposed();
         const tokensLeftToDecode = tokens.slice();
         return await withLock(this, "evaluate", async () => {
@@ -539,7 +669,8 @@ export class LlamaContextSequence {
                     tokens: tokensToDecode,
                     firstTokenSequenceIndex: this._nextTokenIndex,
                     generateLogitAtTheEnd,
-                    evaluationPriority
+                    evaluationPriority,
+                    tokenMeter
                 }, !generateLogitAtTheEnd
                     ? undefined
                     : onDecodeDone);
@@ -586,10 +717,11 @@ export class LlamaContextSequence {
      * We need this to make it impossible to manually create instances of this class outside the code of this library
      * @internal
      */
-    static _create({ sequenceId, context, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
+    static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
         return new LlamaContextSequence({
             sequenceId,
             context,
+            tokenMeter,
             contextShift: {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
@@ -600,11 +732,70 @@ export class LlamaContextSequence {
 function disposeContextIfReferenced(contextRef) {
     const context = contextRef.deref();
     if (context != null)
-        context.dispose();
+        void context.dispose();
 }
 function disposeContextSequenceIfReferenced(contextRef) {
     const context = contextRef.deref();
     if (context != null)
         context.dispose();
 }
+export function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, ignoreMemorySafetyChecks = false, isEmbeddingContext = false }) {
+    if (contextSize == null)
+        contextSize = "auto";
+    if (typeof contextSize === "number") {
+        const resolvedContextSize = Math.max(1, Math.floor(contextSize));
+        if (ignoreMemorySafetyChecks)
+            return resolvedContextSize;
+        const vramState = getVramState();
+        const contextVram = modelFileInsights.estimateContextResourceRequirements({
+            contextSize: resolvedContextSize,
+            batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: resolvedContextSize, sequences }),
+            modelGpuLayers: modelGpuLayers,
+            sequences,
+            isEmbeddingContext
+        }).gpuVram;
+        if (contextVram > vramState.free)
+            throw new Error(`The context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`);
+        return resolvedContextSize;
+    }
+    else if (contextSize === "auto" || typeof contextSize === "object") {
+        if (llamaGpu === false)
+            return modelTrainContextSize;
+        const vramState = getVramState();
+        if (vramState.total === 0)
+            return modelTrainContextSize;
+        const freeVram = vramState.free;
+        const maxContextSize = contextSize === "auto"
+            ? getDefaultModelContextSize({ trainContextSize: modelTrainContextSize })
+            : Math.min(contextSize.max ?? getDefaultModelContextSize({ trainContextSize: modelTrainContextSize }), getDefaultModelContextSize({ trainContextSize: modelTrainContextSize }));
+        const minContextSize = contextSize === "auto"
+            ? minAllowedContextSizeInCalculations
+            : Math.max(contextSize.min ?? minAllowedContextSizeInCalculations, minAllowedContextSizeInCalculations);
+        for (let testContextSize = maxContextSize; testContextSize >= minContextSize; testContextSize--) {
+            const contextVram = modelFileInsights.estimateContextResourceRequirements({
+                contextSize: testContextSize,
+                batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: testContextSize, sequences }),
+                modelGpuLayers: modelGpuLayers,
+                sequences,
+                isEmbeddingContext
+            }).gpuVram;
+            if (contextVram <= freeVram)
+                return testContextSize;
+        }
+        if (ignoreMemorySafetyChecks)
+            return minContextSize;
+        throw new Error(`The available VRAM is too small to fit the context size of ${maxContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""}`);
+    }
+    throw new Error(`Invalid context size: "${contextSize}"`);
+}
+export function getDefaultContextBatchSize({ contextSize, sequences }) {
+    return Math.min(contextSize * sequences, 512);
+}
+export function getDefaultContextSequences() {
+    return 1;
+}
+const defaultFallbackContextSize = 4096;
+export function getDefaultModelContextSize({ trainContextSize }) {
+    return trainContextSize ?? defaultFallbackContextSize;
+}
 //# sourceMappingURL=LlamaContext.js.map