npm - node-llama-cpp - Versions diffs - 3.0.0-beta.44 → 3.0.0-beta.46 - Mend

node-llama-cpp 3.0.0-beta.44 → 3.0.0-beta.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (329)

package/dist/evaluator/LlamaContext/LlamaContext.js CHANGED

@@ -3,8 +3,16 @@ import { removeNullFields } from "../../utils/removeNullFields.js";
 import { compareTokens } from "../../utils/compareTokens.js";
 import { DisposeGuard } from "../../utils/DisposeGuard.js";
 import { TokenMeter } from "../TokenMeter.js";
+import { UnsupportedError } from "../../utils/UnsupportedError.js";
 import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
+import { LlamaSampler } from "./LlamaSampler.js";
 const defaultLoraScale = 1;
+const shrinkRetriesMinContextSize = 4096;
+const defaultMaxPunishTokens = 64;
+const defaultFailedCreationRemedy = {
+    retries: 6,
+    autoContextSizeShrink: 0.16
+};
 export class LlamaContext {
     /** @internal */ _llama;
     /** @internal */ _ctx;
@@ -14,6 +22,9 @@ export class LlamaContext {
     /** @internal */ _contextSize;
     /** @internal */ _batchSize;
     /** @internal */ _flashAttention;
+    /** @internal */ _idealThreads;
+    /** @internal */ _minThreads;
+    /** @internal */ _performanceTracking;
     /** @internal */ _totalSequences;
     /** @internal */ _unusedSequenceIds = [];
     /** @internal */ _batchingOptions;
@@ -26,11 +37,13 @@ export class LlamaContext {
     /** @internal */ _nextGeneratedSequenceId = 0;
     /** @internal */ _dispatchDecodeScheduled = false;
     /** @internal */ _batchDispatchPending = false;
+    /** @internal */ _threadSplitterConsumer;
+    /** @internal */ _freeReservedThreadsTimeout;
     /** @internal */ _currentDispatchBatchHandle = {};
     /** @internal */ _allocatedContextSize;
     /** @internal */ _disposed = false;
     onDispose = new EventRelay();
-    constructor({ _model }, { sequences, seed = null, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
+    constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, performanceTracking = false, _embeddings }) {
         if (_model.disposed)
             throw new DisposedError();
         this._llama = _model._llama;
@@ -41,15 +54,23 @@ export class LlamaContext {
         this._contextSize = Math.max(2, contextSize);
         this._batchSize = Math.max(batchSize, this._totalSequences);
         this._flashAttention = flashAttention;
+        this._idealThreads = typeof threads === "number"
+            ? this._llama._threadsSplitter.normalizeThreadsValue(threads)
+            : this._llama._threadsSplitter.normalizeThreadsValue(threads?.ideal ?? (this._llama.maxThreads === 0
+                ? this._llama.cpuMathCores
+                : this._llama.maxThreads));
+        this._minThreads = Math.max(1, typeof threads === "number"
+            ? 1
+            : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1));
+        this._performanceTracking = !!performanceTracking;
         this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({
-            seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined,
             contextSize: this._contextSize * this._totalSequences, // each sequence needs its own <contextSize> of cells
             batchSize: this._batchSize,
             sequences: this._totalSequences,
             flashAttention: this._flashAttention,
-            threads: Math.max(0, Math.floor(threads)),
+            threads: this._idealThreads,
             embeddings: _embeddings,
-            noSeed: _noSeed
+            performanceTracking: this._performanceTracking
         }));
         this._batchingOptions = {
             dispatchSchedule: batchingDispatchSchedule,
@@ -58,6 +79,7 @@ export class LlamaContext {
         this._gcRegistry = new FinalizationRegistry(this._model._removeLoraUsage);
         this._gcRegistry.register(this, this._loraAdapters);
         this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
+        this._freeReservedThreads = this._freeReservedThreads.bind(this);
         this._disposeAggregator.add(() => {
             this._disposed = true;
         });
@@ -111,6 +133,19 @@ export class LlamaContext {
         this._ensureNotDisposed();
         return this._ctx.getStateSize();
     }
+    /** The number of threads currently used to evaluate tokens */
+    get currentThreads() {
+        this._ensureNotDisposed();
+        return this._ctx.getThreads();
+    }
+    /**
+     * The number of threads that are preferred to be used to evaluate tokens.
+     *
+     * The actual number of threads used may be lower when other evaluations are running in parallel.
+     */
+    get idealThreads() {
+        return this._idealThreads;
+    }
     getAllocatedContextSize() {
         this._ensureNotDisposed();
         if (this._allocatedContextSize == null)
@@ -263,13 +298,22 @@ export class LlamaContext {
                         i--;
                     }
                 }
-                try {
-                    if (currentBatchSize !== 0)
+                if (currentBatchSize !== 0) {
+                    const allocationResult = this._threadSplitterConsumer?.getAllocationToConsume();
+                    const [threadsToUse, consumerHandle] = allocationResult instanceof Promise
+                        ? await allocationResult ?? []
+                        : allocationResult ?? [];
+                    try {
+                        if (threadsToUse != null)
+                            this._ctx.setThreads(threadsToUse);
                         await this._ctx.decodeBatch();
-                }
-                catch (err) {
-                    this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
-                    return;
+                        consumerHandle?.dispose();
+                    }
+                    catch (err) {
+                        consumerHandle?.dispose();
+                        this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
+                        return;
+                    }
                 }
                 for (const action of afterDecodeActions) {
                     const [accept, reject] = action.response;
@@ -287,36 +331,47 @@ export class LlamaContext {
             const prioritizationStrategy = resolvePrioritizationStrategy();
             if (prioritizationStrategy == null)
                 return; // all queued items are rejected and dequeued when we get here
-            while (shouldHaveAnotherLoop) {
-                const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
-                if (orderedQueuedDecodes == null)
-                    return; // all queued items are rejected and dequeued when we get here
-                const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
-                let preventDisposalHandle;
-                try {
-                    preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
-                }
-                catch (err) {
-                    this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
-                    return;
-                }
-                try {
-                    await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
-                    shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
-                }
-                finally {
-                    preventDisposalHandle.dispose();
+            this._reserveThreads();
+            try {
+                while (shouldHaveAnotherLoop) {
+                    const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
+                    if (orderedQueuedDecodes == null)
+                        return; // all queued items are rejected and dequeued when we get here
+                    const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
+                    let preventDisposalHandle;
+                    try {
+                        preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
+                    }
+                    catch (err) {
+                        this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
+                        return;
+                    }
+                    try {
+                        await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
+                        shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
+                    }
+                    finally {
+                        preventDisposalHandle.dispose();
+                    }
                 }
             }
+            finally {
+                this._scheduleToFreeReservedThreads();
+            }
         });
     }
     /**
      * Print the timings of token evaluation since that last print for this context.
+     *
+     * Requires the `performanceTracking` option to be enabled.
+     *
      * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
      * it won't print anything.
      */
     async printTimings() {
         this._ensureNotDisposed();
+        if (!this._performanceTracking)
+            throw new UnsupportedError("Performance tracking is not enabled");
         this._ctx.printTimings();
         await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
     }
@@ -350,14 +405,6 @@ export class LlamaContext {
         });
     }
     /** @internal */
-    _acceptTokenOnGrammarEvaluationState(grammarEvaluationState, token) {
-        this._ctx.acceptGrammarEvaluationStateToken(grammarEvaluationState._state, token);
-    }
-    /** @internal */
-    _canBeNextTokenForGrammarEvaluationState(grammarEvaluationState, token) {
-        return this._ctx.canBeNextTokenForGrammarEvaluationState(grammarEvaluationState._state, token);
-    }
-    /** @internal */
     _popSequenceId() {
         if (this._unusedSequenceIds.length > 0)
             return this._unusedSequenceIds.shift();
@@ -417,6 +464,30 @@ export class LlamaContext {
         }
     }
     /** @internal */
+    _reserveThreads() {
+        clearTimeout(this._freeReservedThreadsTimeout);
+        delete this._freeReservedThreadsTimeout;
+        if (this._threadSplitterConsumer != null)
+            return;
+        this._threadSplitterConsumer = this._llama._threadsSplitter.createConsumer(this._idealThreads, this._minThreads);
+    }
+    /** @internal */
+    _freeReservedThreads() {
+        clearTimeout(this._freeReservedThreadsTimeout);
+        delete this._freeReservedThreadsTimeout;
+        if (this._threadSplitterConsumer == null)
+            return;
+        this._threadSplitterConsumer.dispose();
+        delete this._threadSplitterConsumer;
+    }
+    /** @internal */
+    _scheduleToFreeReservedThreads() {
+        if (this._threadSplitterConsumer == null)
+            return;
+        clearTimeout(this._freeReservedThreadsTimeout);
+        this._freeReservedThreadsTimeout = setTimeout(this._freeReservedThreads, 0);
+    }
+    /** @internal */
     static async _create(options, { _model }) {
         const sequences = options.sequences ?? getDefaultContextSequences();
         const flashAttention = _model.flashAttentionSupported
@@ -425,7 +496,13 @@ export class LlamaContext {
         const loraOptions = typeof options.lora === "string"
             ? { adapters: [{ filePath: options.lora }] }
             : options.lora;
-        const contextSize = await _model.fileInsights.configurationResolver.resolveContextContextSize(options.contextSize, {
+        let failedCreationRetries = options.failedCreationRemedy === false
+            ? 0
+            : Math.max(0, options.failedCreationRemedy?.retries ?? defaultFailedCreationRemedy.retries);
+        const failedCreationAutoContextSizeShrink = options.failedCreationRemedy === false
+            ? 0
+            : options.failedCreationRemedy?.autoContextSizeShrink ?? defaultFailedCreationRemedy.autoContextSizeShrink;
+        let contextSize = await _model.fileInsights.configurationResolver.resolveContextContextSize(options.contextSize, {
             batchSize: options.batchSize,
             sequences: sequences,
             modelGpuLayers: _model.gpuLayers,
@@ -436,69 +513,101 @@ export class LlamaContext {
             ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks,
             isEmbeddingContext: options._embeddings
         });
-        const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
-        const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
-            contextSize,
-            sequences,
-            isEmbeddingContext: options._embeddings,
-            modelGpuLayers: _model.gpuLayers,
-            batchSize,
-            flashAttention
-        }).gpuVram;
-        const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences, flashAttention });
+        const minContextSize = options.contextSize === "auto"
+            ? shrinkRetriesMinContextSize
+            : (typeof options.contextSize === "object" && typeof options.contextSize.min === "number")
+                ? options.contextSize.min
+                : typeof options.contextSize === "number"
+                    ? options.contextSize
+                    : shrinkRetriesMinContextSize;
         const { createSignal } = options;
-        const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
-            ? null
-            : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
-        try {
-            const contextLoaded = await context._ctx.init();
-            if (createSignal?.aborted) {
-                if (contextLoaded)
-                    await context._ctx.dispose();
-                throw createSignal.reason;
-            }
-            else if (!contextLoaded)
-                throw new Error("Failed to create context");
-            contextCreationMemoryReservation?.dispose?.();
-            if (loraOptions != null && loraOptions.adapters.length > 0) {
-                let loadedAdapters = 0;
-                for (const adapter of loraOptions.adapters) {
-                    try {
-                        await context._setLora({
-                            filePath: adapter.filePath,
-                            scale: adapter.scale
-                        });
-                        loadedAdapters++;
+        async function createContext(contextSize) {
+            const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
+            const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
+                contextSize,
+                sequences,
+                isEmbeddingContext: options._embeddings,
+                modelGpuLayers: _model.gpuLayers,
+                batchSize,
+                flashAttention
+            }).gpuVram;
+            const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences, flashAttention });
+            const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
+                ? null
+                : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
+            try {
+                if (createSignal?.aborted)
+                    throw createSignal.reason;
+                const contextLoaded = await context._ctx.init();
+                if (createSignal?.aborted) {
+                    if (contextLoaded)
+                        await context._ctx.dispose();
+                    throw createSignal.reason;
+                }
+                else if (!contextLoaded)
+                    throw new Error("Failed to create context");
+                contextCreationMemoryReservation?.dispose?.();
+                if (loraOptions != null && loraOptions.adapters.length > 0) {
+                    let loadedAdapters = 0;
+                    for (const adapter of loraOptions.adapters) {
                         try {
-                            loraOptions.onLoadProgress?.(loadedAdapters / loraOptions.adapters.length);
+                            await context._setLora({
+                                filePath: adapter.filePath,
+                                scale: adapter.scale
+                            });
+                            loadedAdapters++;
+                            try {
+                                loraOptions.onLoadProgress?.(loadedAdapters / loraOptions.adapters.length);
+                            }
+                            catch (err) {
+                                console.error(err);
+                            }
                         }
                         catch (err) {
-                            console.error(err);
+                            await context.dispose();
+                            throw err;
+                        }
+                        if (createSignal?.aborted) {
+                            await context.dispose();
+                            throw createSignal.reason;
                         }
                     }
-                    catch (err) {
-                        await context.dispose();
-                        throw err;
+                }
+                else if (loraOptions?.onLoadProgress != null) {
+                    try {
+                        loraOptions.onLoadProgress(1);
                     }
-                    if (createSignal?.aborted) {
-                        await context.dispose();
-                        throw createSignal.reason;
+                    catch (err) {
+                        console.error(err);
                     }
                 }
+                return context;
             }
-            else if (loraOptions?.onLoadProgress != null) {
-                try {
-                    loraOptions.onLoadProgress(1);
-                }
-                catch (err) {
-                    console.error(err);
-                }
+            finally {
+                contextCreationMemoryReservation?.dispose?.();
             }
-            return context;
         }
-        finally {
-            contextCreationMemoryReservation?.dispose?.();
+        while (failedCreationRetries >= 0) {
+            try {
+                return await createContext(contextSize);
+            }
+            catch (err) {
+                if (failedCreationRetries === 0 || (createSignal?.aborted && err === createSignal.reason))
+                    throw err;
+                failedCreationRetries--;
+                let newContextSize = typeof failedCreationAutoContextSizeShrink === "number"
+                    ? Math.floor(contextSize * (1 - failedCreationAutoContextSizeShrink))
+                    : Math.floor(failedCreationAutoContextSizeShrink(contextSize));
+                if (!Number.isFinite(newContextSize))
+                    throw err;
+                if (newContextSize < minContextSize)
+                    newContextSize = minContextSize;
+                if (newContextSize >= contextSize)
+                    throw err;
+                contextSize = newContextSize;
+            }
         }
+        throw new Error("Failed to create context");
     }
 }
 export class LlamaContextSequence {
@@ -639,12 +748,13 @@ export class LlamaContextSequence {
         });
     }
     evaluate(tokens, options = {}) {
-        const { temperature = 0, minP = 0, topK = 40, topP = 0.95, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
+        const { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
         return this._evaluate(tokens, {
             temperature,
             minP,
             topK,
             topP,
+            seed,
             grammarEvaluationState,
             repeatPenalty,
             tokenBias,
@@ -677,53 +787,71 @@ export class LlamaContextSequence {
         }
     }
     /** @internal */
-    async *_evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false }) {
+    async *_evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false }) {
         this._ensureNotDisposed();
         let evalTokens = tokens;
         if (evalTokens.length === 0)
             return;
-        while (true) {
-            this._ensureNotDisposed();
-            // Evaluate to get the next token.
-            const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
-                if (_noSampling)
-                    return null;
-                const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
-                    ? repeatPenalty.punishTokens()
-                    : repeatPenalty?.punishTokens;
-                const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
-                    ? grammarEvaluationState()
-                    : grammarEvaluationState;
-                if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
-                    throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
-                const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
-                return this._context._ctx.sampleToken(batchLogitIndex, removeNullFields({
-                    temperature,
-                    minP,
-                    topK,
-                    topP,
-                    repeatPenalty: repeatPenalty?.penalty,
-                    repeatPenaltyTokens: repeatPenaltyTokens != null
-                        ? Uint32Array.from(repeatPenaltyTokens)
-                        : undefined,
-                    repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
-                    repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
-                    tokenBiasKeys,
-                    tokenBiasValues,
-                    grammarEvaluationState: resolvedGrammarEvaluationState?._state
-                }));
-            });
-            if (nextToken == null)
-                return;
-            // the model finished generating text
-            if (!yieldEogToken && this._context.model.isEogToken(nextToken))
-                break;
-            const replacementToken = (yield nextToken);
-            // set the tokens for the next evaluation
-            if (replacementToken != null)
-                evalTokens = [replacementToken];
-            else
-                evalTokens = [nextToken];
+        const sampler = new LlamaSampler(this.model);
+        try {
+            while (true) {
+                this._ensureNotDisposed();
+                // Evaluate to get the next token.
+                const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
+                    if (_noSampling)
+                        return null;
+                    const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
+                        ? repeatPenalty.punishTokens()
+                        : repeatPenalty?.punishTokens;
+                    const maxPunishTokens = Math.max(repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens, repeatPenaltyTokens?.length ?? 0);
+                    const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
+                        ? grammarEvaluationState()
+                        : grammarEvaluationState;
+                    if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
+                        throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
+                    const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
+                    sampler.applyConfig(removeNullFields({
+                        temperature,
+                        minP,
+                        topK,
+                        topP,
+                        seed: Math.max(0, Number.isFinite(seed)
+                            ? Math.floor(seed ?? (Date.now() / 1000))
+                            : Math.floor(Date.now() / 1000)),
+                        repeatPenalty: repeatPenalty?.penalty,
+                        repeatPenaltyMaxTokens: maxPunishTokens,
+                        repeatPenaltyTokens: repeatPenaltyTokens != null
+                            ? Uint32Array.from(repeatPenaltyTokens)
+                            : undefined,
+                        repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
+                        repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
+                        tokenBiasKeys,
+                        tokenBiasValues,
+                        grammarEvaluationState: resolvedGrammarEvaluationState?._state
+                    }));
+                    return withLock(sampler, "sample", async () => {
+                        if (sampler.disposed)
+                            return null;
+                        return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
+                    });
+                });
+                if (nextToken === -1)
+                    throw new Error("Failed to sample next token");
+                if (nextToken == null)
+                    return;
+                // the model finished generating text
+                if (!yieldEogToken && this._context.model.isEogToken(nextToken))
+                    break;
+                const replacementToken = (yield nextToken);
+                // set the tokens for the next evaluation
+                if (replacementToken != null)
+                    evalTokens = [replacementToken];
+                else
+                    evalTokens = [nextToken];
+            }
+        }
+        finally {
+            void withLock(sampler, "sample", sampler.asyncDispose);
         }
     }
     /** @internal */
@@ -814,7 +942,7 @@ function getTokenBiasesForAddon(tokenBias, currentModel) {
         };
     if (tokenBias instanceof Function)
         tokenBias = tokenBias();
-    if (tokenBias._model !== currentModel)
+    if (tokenBias._tokenizer !== currentModel.tokenizer)
         throw new Error("This TokenBias instance was created with a different model than the one used by this context. " +
             "Make sure you use the model instance of the context sequence for the TokenBias you use it with.");
     const tokenBiasKeys = [];