npm - node-llama-cpp - Versions diffs - 3.17.1 → 3.18.1 - Mend

node-llama-cpp 3.17.1 → 3.18.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (121) hide show

package/dist/evaluator/LlamaContext/LlamaContext.js CHANGED Viewed

@@ -8,9 +8,12 @@ import { UnsupportedError } from "../../utils/UnsupportedError.js";
 import { pushAll } from "../../utils/pushAll.js";
 import { safeEventCallback } from "../../utils/safeEventCallback.js";
 import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
+import { LlamaLogLevel } from "../../bindings/types.js";
+import { resolveGgmlTypeOption } from "../../gguf/types/GgufTensorInfoTypes.js";
 import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
 import { LlamaSampler } from "./LlamaSampler.js";
 import { padSafeContextSize } from "./utils/padSafeContextSize.js";
+import { LlamaContextSequenceCheckpoints } from "./LlamaContextSequenceCheckpoints.js";
 const defaultLoraScale = 1;
 const shrinkRetriesMinContextSize = 4096;
 const defaultMaxPunishTokens = 64;
@@ -20,6 +23,25 @@ const defaultFailedCreationRemedy = {
 };
 const defaultEvaluationPriority = 5;
 const defaultDryRepeatPenalitySequenceBreakers = ["\n", ":", '"', "*"];
+const defaultCheckpointOptions = {
+    max: 32,
+    interval: 8192,
+    maxMemory: null
+};
+export const internalCheckpoints = {
+    speculative: {
+        name: "speculative",
+        maxCheckpoints: 2
+    },
+    chatSequenceStart: {
+        name: "sequenceStart",
+        maxCheckpoints: 1
+    },
+    chatGrammarEnd: {
+        name: "grammarEnd",
+        maxCheckpoints: 1
+    }
+};
 const decodeSyncWorkaround = {
     vulkanLock: {}
 };
@@ -35,6 +57,8 @@ export class LlamaContext {
     /** @internal */ _idealThreads;
     /** @internal */ _minThreads;
     /** @internal */ _performanceTracking;
+    /** @internal */ _kvCacheKeyType;
+    /** @internal */ _kvCacheValueType;
     /** @internal */ _totalSequences;
     /** @internal */ _unusedSequenceIds = [];
     /** @internal */ _batchingOptions;
@@ -53,7 +77,7 @@ export class LlamaContext {
     /** @internal */ _allocatedContextSize;
     /** @internal */ _disposed = false;
     onDispose = new EventRelay();
-    constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextCycle", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false, _embeddings, _ranking }) {
+    constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextCycle", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false, experimentalKvCacheKeyType, experimentalKvCacheValueType, _embeddings, _ranking }) {
         if (_model.disposed)
             throw new DisposedError();
         this._llama = _model._llama;
@@ -73,6 +97,8 @@ export class LlamaContext {
             ? 1
             : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1));
         this._performanceTracking = !!performanceTracking;
+        this._kvCacheKeyType = experimentalKvCacheKeyType;
+        this._kvCacheValueType = experimentalKvCacheValueType;
         this._swaFullCache = !!swaFullCache;
         this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({
             contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own <contextSize> of cells
@@ -85,6 +111,8 @@ export class LlamaContext {
             embeddings: _embeddings,
             ranking: _ranking,
             performanceTracking: this._performanceTracking,
+            kvCacheKeyType: this._kvCacheKeyType,
+            kvCacheValueType: this._kvCacheValueType,
             swaFullCache: this._swaFullCache
         }));
         this._batchingOptions = {
@@ -130,6 +158,12 @@ export class LlamaContext {
     get flashAttention() {
         return this._flashAttention;
     }
+    get kvCacheKeyType() {
+        return this._kvCacheKeyType;
+    }
+    get kvCacheValueType() {
+        return this._kvCacheValueType;
+    }
     /**
      * The actual size of the state in the memory in bytes.
      * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
@@ -168,7 +202,7 @@ export class LlamaContext {
      * When there are no sequences left, this method will throw an error.
      */
     getSequence(options = {}) {
-        const { contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor, _tokenMeter } = options;
+        const { contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor, checkpoints, _tokenMeter } = options;
         this._ensureNotDisposed();
         const nextSequenceId = this._popSequenceId();
         if (nextSequenceId == null)
@@ -181,7 +215,8 @@ export class LlamaContext {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
             },
-            tokenPredictor
+            tokenPredictor,
+            checkpoints
         });
     }
     dispatchPendingBatch() {
@@ -293,16 +328,18 @@ export class LlamaContext {
                             batchLogitIndexes,
                             batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
                             firstTokenIndex: queuedDecode.firstTokenSequenceIndex,
+                            sequenceStateLength: queuedDecode.firstTokenSequenceIndex + processAmount + 1,
                             returnResults: true
                         });
                     }
                     else {
-                        if (batchLogitIndexes.length > 0)
+                        if (batchLogitIndexes.length > 0 || queuedDecode.afterBatchAction != null)
                             afterDecodeActions.push({
                                 queuedDecode,
                                 batchLogitIndexes,
                                 batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
-                                firstTokenIndex: queuedDecode.firstTokenSequenceIndex
+                                firstTokenIndex: queuedDecode.firstTokenSequenceIndex,
+                                sequenceStateLength: queuedDecode.firstTokenSequenceIndex + processAmount + 1
                             });
                         queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
                         queuedDecode.logits = queuedDecode.logits.slice(processAmount);
@@ -378,6 +415,11 @@ export class LlamaContext {
                     return undefined;
                 });
                 await Promise.all(afterDecodeActionResults);
+                for (const action of afterDecodeActions) {
+                    const resPromise = action.queuedDecode.afterBatchAction?.(action.sequenceStateLength);
+                    if (resPromise instanceof Promise)
+                        await resPromise;
+                }
             };
             const prioritizationStrategy = resolvePrioritizationStrategy();
             if (prioritizationStrategy == null)
@@ -432,7 +474,7 @@ export class LlamaContext {
         await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
     }
     /** @internal */
-    async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, logits, evaluationPriority = defaultEvaluationPriority, tokenMeter }, logitDataMapper) {
+    async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, logits, evaluationPriority = defaultEvaluationPriority, tokenMeter, afterBatchAction }, logitDataMapper) {
         return await new Promise((accept, reject) => {
             this._queuedDecodes.push({
                 sequenceId,
@@ -442,7 +484,8 @@ export class LlamaContext {
                 evaluationPriority,
                 tokenMeter,
                 response: [accept, reject],
-                logitDataMapper
+                logitDataMapper,
+                afterBatchAction
             });
             this._queuedDecodeSequenceIds.add(sequenceId);
             this._scheduleDecode();
@@ -568,6 +611,12 @@ export class LlamaContext {
         const flashAttention = _model.flashAttentionSupported
             ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention)
             : false;
+        const kvCacheKeyType = options.experimentalKvCacheKeyType === "currentQuant"
+            ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheKeyType
+            : resolveGgmlTypeOption(options.experimentalKvCacheKeyType) ?? _model.defaultContextKvCacheKeyType;
+        const kvCacheValueType = options.experimentalKvCacheValueType === "currentQuant"
+            ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheValueType
+            : resolveGgmlTypeOption(options.experimentalKvCacheValueType) ?? _model.defaultContextKvCacheValueType;
         const swaFullCache = options.swaFullCache ?? _model.defaultContextSwaFullCache;
         const loraOptions = typeof options.lora === "string"
             ? { adapters: [{ filePath: options.lora }] }
@@ -584,6 +633,8 @@ export class LlamaContext {
             modelGpuLayers: _model.gpuLayers,
             modelTrainContextSize: _model.trainContextSize,
             flashAttention,
+            kvCacheKeyType,
+            kvCacheValueType,
             swaFullCache,
             getVramState: () => _model._llama._vramOrchestrator.getMemoryState(),
             llamaGpu: _model._llama.gpu,
@@ -612,9 +663,20 @@ export class LlamaContext {
                 modelGpuLayers: _model.gpuLayers,
                 batchSize,
                 flashAttention,
+                kvCacheKeyType,
+                kvCacheValueType,
+                swaFullCache
+            });
+            const context = new LlamaContext({ _model }, {
+                ...options,
+                contextSize,
+                batchSize,
+                sequences,
+                flashAttention,
+                experimentalKvCacheKeyType: kvCacheKeyType,
+                experimentalKvCacheValueType: kvCacheValueType,
                 swaFullCache
             });
-            const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences, flashAttention, swaFullCache });
             const contextCreationVramReservation = options.ignoreMemorySafetyChecks
                 ? null
                 : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram);
@@ -697,6 +759,8 @@ export class LlamaContextSequence {
     /** @internal */ _context;
     /** @internal */ _contextShift;
     /** @internal */ _tokenPredictor;
+    /** @internal */ _checkpoints = new LlamaContextSequenceCheckpoints();
+    /** @internal */ _checkpointOptions;
     /** @internal */ _tokenMeter;
     /** @internal */ _disposeAggregator = new DisposeAggregator();
     /** @internal */ _lock = {};
@@ -711,22 +775,29 @@ export class LlamaContextSequence {
     /** @internal */ _refutedTokenPredictions = 0;
     /** @internal */ _disposed = false;
     onDispose = new EventRelay();
-    constructor({ sequenceId, context, tokenMeter, contextShift, tokenPredictor }) {
+    constructor({ sequenceId, context, tokenMeter, contextShift, tokenPredictor, checkpoints }) {
         this._sequenceId = sequenceId;
         this._context = context;
         this._tokenMeter = tokenMeter ?? new TokenMeter();
         this._contextShift = contextShift;
         this._tokenPredictor = tokenPredictor;
+        this._checkpointOptions = {
+            max: checkpoints?.max ?? defaultCheckpointOptions.max,
+            interval: checkpoints?.interval ?? defaultCheckpointOptions.interval,
+            maxMemory: checkpoints?.maxMemory ?? defaultCheckpointOptions.maxMemory
+        };
         this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
         this._gcRegistry.register(this, sequenceId);
         this._disposeAggregator.add(() => this._gcRegistry.unregister(this));
         this._disposeAggregator.add(this.onDispose.dispatchEvent);
         this._disposeAggregator.add(this.model.onDispose.createListener(disposeContextSequenceIfReferenced.bind(null, new WeakRef(this))));
         this._disposeAggregator.add(() => {
+            this._checkpoints.clearAllCheckpoints();
             this._context._reclaimUnusedSequenceId(this._sequenceId);
         });
         if (this._tokenPredictor != null)
             this._disposeAggregator.add(this._tokenPredictor);
+        this._takeIntervalCheckpointIfNeededAfterBatch = this._takeIntervalCheckpointIfNeededAfterBatch.bind(this);
     }
     dispose() {
         if (this._disposed)
@@ -892,7 +963,7 @@ export class LlamaContextSequence {
     /** @internal */
     async _eraseContextTokenRanges(ranges, { canResetTokenPredictor = true, canRemovePredictionTokens = true, skipLock = false } = {}) {
         this._ensureNotDisposed();
-        let awaitPromise;
+        let awaitEvaluationPromise;
         await withLock([this._context, "context"], async () => {
             this._ensureNotDisposed();
             if (ranges.length === 0)
@@ -968,16 +1039,39 @@ export class LlamaContextSequence {
             this._nextTokenIndex -= removedTokens;
             if (canResetTokenPredictor && removedTokens > 0)
                 await this._abortTokenPredictor(true);
+            this._checkpoints.pruneFromEndToIndex(this._contextTokens.length - 1);
             if (deletionSuccessful)
                 return;
+            let restoreCheckpointIndex = this._contextTokens.length - 1;
+            const existingCheckpoint = this._checkpoints.getLastCheckpoint(restoreCheckpointIndex, this.contextSize);
+            if (existingCheckpoint != null &&
+                restoreCheckpointIndex >= existingCheckpoint.minPos &&
+                existingCheckpoint.maxPos <= this.contextSize) {
+                restoreCheckpointIndex = Math.min(restoreCheckpointIndex, existingCheckpoint.maxPos);
+                const restoredSuccessfully = await this._context._ctx.restoreCheckpoint(existingCheckpoint, restoreCheckpointIndex);
+                if (restoredSuccessfully) {
+                    const tokensToEvaluate = this._contextTokens.slice(restoreCheckpointIndex + 1);
+                    this._contextTokens = this._contextTokens.slice(0, restoreCheckpointIndex + 1);
+                    this._nextTokenIndex = restoreCheckpointIndex + 1;
+                    // wait for the evaluation outside the "context" lock to avoid deadlocks
+                    if (tokensToEvaluate.length > 0)
+                        awaitEvaluationPromise = this.evaluateWithoutGeneratingNewTokens(tokensToEvaluate, { _skipLock: skipLock });
+                    return;
+                }
+            }
             const newSequenceTokens = this._contextTokens.slice();
             this._nextTokenIndex = 0;
             this._context._ctx.disposeSequence(this._sequenceId);
+            this._contextTokens = [];
             // wait for the evaluation outside the "context" lock to avoid deadlocks
-            awaitPromise = this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, { _skipLock: skipLock });
+            if (newSequenceTokens.length > 0)
+                awaitEvaluationPromise = this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, { _skipLock: skipLock });
         });
-        if (awaitPromise != null)
-            await awaitPromise;
+        if (awaitEvaluationPromise != null) {
+            await awaitEvaluationPromise;
+            if (this.needsCheckpoints && this._checkpoints.lastCheckpointIndex !== this._nextTokenIndex - 1)
+                await this.takeCheckpoint();
+        }
     }
     /**
      * Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
@@ -1168,7 +1262,7 @@ export class LlamaContextSequence {
                     onTokenResult?.(tokenIndex, output);
                     return output;
                 });
-            });
+            }, this._takeIntervalCheckpointIfNeededAfterBatch);
         }
         finally {
             evaluatorLock.dispose();
@@ -1188,6 +1282,7 @@ export class LlamaContextSequence {
         const contextLock = await acquireLock([this._context, "context"]);
         try {
             this._ensureNotDisposed();
+            // TODO: save checkpoints to disk
             const fileSize = await this._context._ctx.saveSequenceStateToFile(resolvedPath, this._sequenceId, Uint32Array.from(this.contextTokens));
             return { fileSize };
         }
@@ -1235,6 +1330,97 @@ export class LlamaContextSequence {
             evaluatorLock.dispose();
         }
     }
+    /**
+     * When reusing a prefix evaluation state is not possible for the current context sequence
+     * (like in contexts from recurrent and hybrid models,
+     * or with models that use SWA (Sliding Window Attention) when the `swaFullCache` option is not enabled on the context),
+     * you can use this method to checkpoint the current context sequence state.
+     * Those checkpoints will automatically be used when trying to erase parts of the context state that come after a checkpointed state,
+     * and be freed from memory when no longer relevant.
+     *
+     * Those checkpoints are relatively lightweight compared to saving the entire state,
+     * but taking too many checkpoints can increase memory usage.
+     * Checkpoints are stored in RAM (not VRAM).
+     *
+     * Calling this method on a context sequence from a model that natively supports prefix evaluation state reuse will have no effect.
+     *
+     * > **Note:** to check whether the current context sequence requires checkpoints,
+     * > use the {@link needsCheckpoints `.needsCheckpoints`} property.
+     */
+    async takeCheckpoint() {
+        if (!this.needsCheckpoints)
+            return;
+        return await withLock([this._context, "context"], () => {
+            return this._takeCheckpoint(undefined, this._checkpointOptions.max);
+        });
+    }
+    /** @internal */
+    async _takeNamedCheckpoint(name, maxNamedCheckpoints) {
+        if (!this.needsCheckpoints)
+            return;
+        return await withLock([this._context, "context"], () => {
+            return this._takeCheckpoint(name, maxNamedCheckpoints);
+        });
+    }
+    /**
+     * Whether checkpoints of the context state need to be taken for the current context sequence
+     * so that the state can be reused as a prefix evaluation state in the future.
+     *
+     * See {@link takeCheckpoint `.takeCheckpoint()`} for more details.
+     */
+    get needsCheckpoints() {
+        if (this.model.fileInsights.isHybrid || this.model.fileInsights.isRecurrent)
+            return true;
+        else if (this.model.fileInsights.swaSize != null && !this._context._swaFullCache)
+            return true;
+        return false;
+    }
+    /**
+     * The index of the last taken checkpoint that's available for prefix reuse
+     */
+    get lastCheckpointIndex() {
+        return Math.max(0, Math.min(this._checkpoints.lastCheckpointIndex, this.nextTokenIndex - 1));
+    }
+    /**
+     * The total memory usage in bytes of all the checkpoints currently held for this context sequence
+     */
+    get checkpointsMemoryUsage() {
+        return this._checkpoints.memoryUsage;
+    }
+    /** @internal */
+    async _takeCheckpoint(name, maxNamedCheckpoints) {
+        if (!this.needsCheckpoints || this._nextTokenIndex === 0 || this._checkpoints.hasCheckpoint(name, this._nextTokenIndex - 1))
+            return;
+        if (this._checkpointOptions.maxMemory != null)
+            this._checkpoints.prepareMemoryForIncomingCheckpoint(this._checkpointOptions.maxMemory);
+        const checkpoint = new this.model._llama._bindings.AddonContextSequenceCheckpoint();
+        await checkpoint.init(this._context._ctx, this._sequenceId);
+        if (this._nextTokenIndex - 1 !== checkpoint.maxPos)
+            this.model._llama._log(LlamaLogLevel.warn, `Checkpoint max position mismatch: expected ${this._nextTokenIndex - 1}, got ${checkpoint.maxPos}`);
+        this._checkpoints.storeCheckpoint({
+            name,
+            maxNamedCheckpoints,
+            checkpoint,
+            currentMaxPos: checkpoint.maxPos
+        });
+        if (this._checkpointOptions.maxMemory != null)
+            this._checkpoints.pruneToKeepUnderMemoryUsage(this._checkpointOptions.maxMemory);
+    }
+    /** @internal */
+    _takeIntervalCheckpointIfNeeded(currentIndex = this._nextTokenIndex - 1) {
+        if (!this.needsCheckpoints)
+            return;
+        const lastCheckpointIndex = this._checkpoints.getLastNamedCheckpointIndex(undefined);
+        if (this._checkpointOptions.interval === false || currentIndex - lastCheckpointIndex < this._checkpointOptions.interval)
+            return;
+        return this._takeCheckpoint(undefined, this._checkpointOptions.max);
+    }
+    /** @internal */
+    _takeIntervalCheckpointIfNeededAfterBatch(sequenceStateLength) {
+        if (sequenceStateLength === 0)
+            return;
+        return this._takeIntervalCheckpointIfNeeded(sequenceStateLength - 1);
+    }
     /** @internal */
     async *_evaluate(tokens, metadata, { temperature, minP, topK, topP, seed, xtc, grammarEvaluationState, repeatPenalty, dryRepeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false, _skipLock = false }) {
         this._ensureNotDisposed();
@@ -1282,7 +1468,7 @@ export class LlamaContextSequence {
                             else
                                 return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
                         });
-                    });
+                    }, this._takeIntervalCheckpointIfNeededAfterBatch);
                     const lastDecodeResult = decodeResult[evalTokens.length - 1];
                     if (lastDecodeResult instanceof Array) {
                         const [token, probabilities, confidence] = lastDecodeResult;
@@ -1366,6 +1552,14 @@ export class LlamaContextSequence {
                         const deleteStartIndex = Math.max(0, this._nextTokenIndex - this._loadedTokenPredictions.length);
                         await this._eraseContextTokenRanges([{ start: deleteStartIndex, end: this._nextTokenIndex }], { canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock: true });
                         this._loadedTokenPredictions.length = 0;
+                        if (this.needsCheckpoints) {
+                            await this._takeCheckpoint(internalCheckpoints.speculative.name, internalCheckpoints.speculative.maxCheckpoints);
+                            await this._takeIntervalCheckpointIfNeeded();
+                        }
+                    }
+                    else if (this._tokenPredictorOwner === tokenPredictorOwner && this.needsCheckpoints) {
+                        await this._takeCheckpoint(internalCheckpoints.speculative.name, internalCheckpoints.speculative.maxCheckpoints);
+                        await this._takeIntervalCheckpointIfNeeded();
                     }
                     if (this._resetTokenPredictor) {
                         await tokenPredictor.reset({
@@ -1578,7 +1772,7 @@ export class LlamaContextSequence {
      * The caller of this function has to wrap it with a lock to ensure this function doesn't run concurrently.
      * @internal
      */
-    async _decodeTokens(tokens, logits, evaluationPriority, tokenMeter, contextShiftOptions, logitDataMapper) {
+    async _decodeTokens(tokens, logits, evaluationPriority, tokenMeter, contextShiftOptions, logitDataMapper, afterBatchAction) {
         this._ensureNotDisposed();
         const tokensLeftToDecode = tokens.slice();
         const tokenLogitsLeftToDecode = logits.slice();
@@ -1604,7 +1798,8 @@ export class LlamaContextSequence {
                 firstTokenSequenceIndex: this._nextTokenIndex,
                 logits: tokensLogits,
                 evaluationPriority,
-                tokenMeter
+                tokenMeter,
+                afterBatchAction
             }, normalizedLogitDataMapper);
             for (const [index, value] of generatedLogits)
                 res[currentTokenIndex + (index - this._nextTokenIndex)] = value;
@@ -1648,7 +1843,7 @@ export class LlamaContextSequence {
      * We need this to make it impossible to manually create instances of this class outside the code of this library
      * @internal
      */
-    static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor }) {
+    static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor, checkpoints }) {
         return new LlamaContextSequence({
             sequenceId,
             context,
@@ -1657,7 +1852,8 @@ export class LlamaContextSequence {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
             },
-            tokenPredictor
+            tokenPredictor,
+            checkpoints
         });
     }
 }