npm - node-llama-cpp - Versions diffs - 3.0.0-beta.14 → 3.0.0-beta.16 - Mend

node-llama-cpp 3.0.0-beta.14 → 3.0.0-beta.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (383)

package/dist/evaluator/LlamaContext/LlamaContext.d.ts CHANGED

@@ -1,6 +1,8 @@
 import { EventRelay } from "lifecycle-utils";
 import { Token } from "../../types.js";
 import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
+import { TokenMeter } from "../TokenMeter.js";
+import { TokenBias } from "../TokenBias.js";
 import { ContextShiftOptions, ContextTokensDeleteRange, EvaluationPriority, LlamaContextSequenceRepeatPenalty } from "./types.js";
 import type { LlamaModel } from "../LlamaModel.js";
 export declare class LlamaContext {
@@ -13,6 +15,11 @@ export declare class LlamaContext {
     get model(): LlamaModel;
     get contextSize(): number;
     get batchSize(): number;
+    /**
+     * The actual size of the state in the memory in bytes.
+     * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
+     */
+    get stateSize(): number;
     getAllocatedContextSize(): number;
     get totalSequences(): number;
     get sequencesLeft(): number;
@@ -21,10 +28,15 @@ export declare class LlamaContext {
      * When there are no sequences left, this method will throw an error.
      * @param [options]
      */
-    getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy } }?: {
+    getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy }, _tokenMeter }?: {
         contextShift?: ContextShiftOptions;
     }): LlamaContextSequence;
     dispatchPendingBatch(): void;
+    /**
+     * Print the timings of token evaluation since that last print for this context.
+     * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
+     * it won't print anything.
+     */
     printTimings(): Promise<void>;
 }
 export declare class LlamaContextSequence {
@@ -38,6 +50,7 @@ export declare class LlamaContextSequence {
     get model(): LlamaModel;
     get nextTokenIndex(): number;
     get contextTokens(): Token[];
+    get tokenMeter(): TokenMeter;
     get isLoadedToMemory(): boolean;
     compareContextTokens(tokens: Token[]): {
         firstDifferentIndex: number;
@@ -49,7 +62,7 @@ export declare class LlamaContextSequence {
     clearHistory(): Promise<void>;
     /**
      * Erase context tokens in the provided ranges to free up space for new tokens to be generated.
-     * the start and end of each range are exclusive.
+     * The start of each range is inclusive, and the end of each range is exclusive.
      * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
      */
     eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
@@ -57,13 +70,19 @@ export declare class LlamaContextSequence {
      * @param tokens
      * @param [options]
      */
-    evaluate(tokens: Token[], { temperature, minP, topK, topP, grammarEvaluationState, repeatPenalty, evaluationPriority, contextShift: { size: contextShiftSize, strategy: contextShiftStrategy }, yieldEosToken }?: {
+    evaluate(tokens: Token[], { temperature, minP, topK, topP, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority, contextShift: { size: contextShiftSize, strategy: contextShiftStrategy }, yieldEosToken }?: {
         temperature?: number;
         minP?: number;
         topK?: number;
         topP?: number;
         grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined);
         repeatPenalty?: LlamaContextSequenceRepeatPenalty;
+        /**
+         * Adjust the probability of tokens being generated.
+         * Can be used to bias the model to generate tokens that you want it to lean towards,
+         * or to avoid generating tokens that you want it to avoid.
+         */
+        tokenBias?: TokenBias | (() => TokenBias);
         /**
          * When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
          * evaluated based on the strategy chosen for the context.
@@ -104,3 +123,11 @@ export declare class LlamaContextSequence {
         contextShift?: ContextShiftOptions;
     }): Promise<void>;
 }
+export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
+    contextSize: number;
+    sequences: number;
+}): number;
+export declare function getDefaultContextSequences(): number;
+export declare function getDefaultModelContextSize({ trainContextSize }: {
+    trainContextSize?: number;
+}): number;

package/dist/evaluator/LlamaContext/LlamaContext.js CHANGED

@@ -2,7 +2,8 @@ import { DisposeAggregator, EventRelay, withLock, DisposedError, AsyncDisposeAgg
 import { removeNullFields } from "../../utils/removeNullFields.js";
 import { compareTokens } from "../../utils/compareTokens.js";
 import { DisposeGuard } from "../../utils/DisposeGuard.js";
-import { resolveBatchItemsPrioritizingStrategy } from "./utils/resolveBatchItemsPrioritizingStrategy.js";
+import { TokenMeter } from "../TokenMeter.js";
+import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
 export class LlamaContext {
     /** @internal */ _llama;
     /** @internal */ _ctx;
@@ -25,7 +26,7 @@ export class LlamaContext {
     /** @internal */ _allocatedContextSize;
     /** @internal */ _disposed = false;
     onDispose = new EventRelay();
-    constructor({ _model }, { sequences = 1, seed = null, contextSize = _model.trainContextSize, batchSize = Math.min(contextSize * sequences, 512), threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
+    constructor({ _model }, { sequences, seed = null, contextSize, batchSize, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
         if (_model.disposed)
             throw new DisposedError();
         this._llama = _model._llama;
@@ -39,13 +40,14 @@ export class LlamaContext {
             seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined,
             contextSize: this._contextSize * this._totalSequences,
             batchSize: this._batchSize,
+            sequences: this._totalSequences,
             threads: Math.max(0, Math.floor(threads)),
             embeddings: _embeddings,
             noSeed: _noSeed
         }));
         this._batchingOptions = {
             dispatchSchedule: batchingDispatchSchedule,
-            itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy
+            itemPrioritizationStrategy: batchingItemsPrioritizationStrategy
         };
         this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
         this._disposeAggregator.add(() => {
@@ -82,6 +84,14 @@ export class LlamaContext {
     get batchSize() {
         return this._batchSize;
     }
+    /**
+     * The actual size of the state in the memory in bytes.
+     * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
+     */
+    get stateSize() {
+        this._ensureNotDisposed();
+        return this._ctx.getStateSize();
+    }
     getAllocatedContextSize() {
         this._ensureNotDisposed();
         if (this._allocatedContextSize == null)
@@ -99,7 +109,7 @@ export class LlamaContext {
      * When there are no sequences left, this method will throw an error.
      * @param [options]
      */
-    getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} } = {}) {
+    getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, _tokenMeter } = {}) {
         this._ensureNotDisposed();
         const nextSequenceId = this._popSequenceId();
         if (nextSequenceId == null)
@@ -107,6 +117,7 @@ export class LlamaContext {
         return LlamaContextSequence._create({
             sequenceId: nextSequenceId,
             context: this,
+            tokenMeter: _tokenMeter,
             contextShift: {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
@@ -123,17 +134,18 @@ export class LlamaContext {
             this._currentDispatchBatchHandle = {};
             this._dispatchDecodeScheduled = false;
             this._batchDispatchPending = false;
-            let prioritizeStrategy;
-            try {
-                this._ensureNotDisposed();
-                prioritizeStrategy = resolveBatchItemsPrioritizingStrategy(this._batchingOptions.itemsPrioritizingStrategy);
-            }
-            catch (err) {
-                this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
-                return;
-            }
-            let shouldHaveAnotherBatch = this._queuedDecodes.length > 0;
-            while (shouldHaveAnotherBatch) {
+            let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
+            const resolvePrioritizationStrategy = () => {
+                try {
+                    this._ensureNotDisposed();
+                    return resolveBatchItemsPrioritizationStrategy(this._batchingOptions.itemPrioritizationStrategy);
+                }
+                catch (err) {
+                    this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
+                }
+                return null;
+            };
+            const getOrderedQueuedDecodes = (prioritizationStrategy) => {
                 const batchItemToQueuedDecodeMap = new Map();
                 const batchItemsList = [];
                 for (const queuedDecode of this._queuedDecodes) {
@@ -146,101 +158,132 @@ export class LlamaContext {
                 }
                 let prioritizedItems;
                 try {
-                    prioritizedItems = prioritizeStrategy({
+                    prioritizedItems = prioritizationStrategy({
                         items: batchItemsList,
                         size: this._batchSize
                     });
                 }
                 catch (err) {
                     this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
-                    return;
+                    return null;
                 }
-                let batchTokenSlotsLeft = this._batchSize;
-                const afterDecodeActions = [];
-                const queuedDecodesToDelete = new Set();
-                const currentQueuedDecodeItems = new Set();
-                const currentBatchItems = [];
-                let currentBatchSize = 0;
-                for (const prioritizedItem of prioritizedItems) {
+                return prioritizedItems.map((prioritizedItem) => {
                     const queuedDecode = batchItemToQueuedDecodeMap.get(prioritizedItem.item);
                     if (queuedDecode == null)
                         throw new Error("Received invalid batch item. Make sure you keep the original object reference " +
                             "of the batch item on `item` on `PrioritizedBatchItem` in your custom prioritization strategy");
-                    const processAmount = Math.min(queuedDecode.tokens.length, prioritizedItem.processAmount, batchTokenSlotsLeft);
-                    if (processAmount <= 0)
+                    return {
+                        queuedDecode,
+                        processAmount: prioritizedItem.processAmount
+                    };
+                });
+            };
+            const fitQueuedDecodesToABatch = (queuedDecodes, batchSize) => {
+                const currentBatchItems = [];
+                let currentBatchSize = 0;
+                let batchTokenSlotsLeft = batchSize;
+                for (const { queuedDecode, processAmount } of queuedDecodes) {
+                    const resolvedProcessAmount = Math.min(processAmount <= 0 ? 1 : processAmount, queuedDecode.tokens.length, batchTokenSlotsLeft);
+                    if (resolvedProcessAmount <= 0) {
+                        if (batchTokenSlotsLeft === 0)
+                            break;
                         continue;
-                    batchTokenSlotsLeft -= processAmount;
+                    }
+                    batchTokenSlotsLeft -= resolvedProcessAmount;
+                    currentBatchSize += resolvedProcessAmount;
                     currentBatchItems.push({
                         queuedDecode,
-                        processAmount
+                        processAmount: resolvedProcessAmount
                     });
-                    currentBatchSize += processAmount;
                 }
-                let preventDisposalHandle;
+                return {
+                    currentBatchItems,
+                    currentBatchSize
+                };
+            };
+            const decodeTokenBatchItems = async (batchItems, currentBatchSize) => {
+                const afterDecodeActions = [];
+                const queuedDecodesToDelete = new Set();
+                const currentQueuedDecodeItems = new Set();
+                if (currentBatchSize !== 0)
+                    this._ctx.initBatch(currentBatchSize);
+                for (const { queuedDecode, processAmount } of batchItems) {
+                    let batchLogitIndex;
+                    try {
+                        const shouldGenerateLogitAtTheEnd = queuedDecode.generateLogitAtTheEnd &&
+                            processAmount === queuedDecode.tokens.length;
+                        const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
+                        const numberOfOutputTokens = shouldGenerateLogitAtTheEnd ? 1 : 0;
+                        TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
+                        TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
+                        batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), shouldGenerateLogitAtTheEnd);
+                    }
+                    catch (err) {
+                        this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
+                        continue;
+                    }
+                    currentQueuedDecodeItems.add(queuedDecode);
+                    if (queuedDecode.tokens.length === processAmount) {
+                        queuedDecodesToDelete.add(queuedDecode);
+                        afterDecodeActions.push({
+                            batchLogitIndex,
+                            response: queuedDecode.response,
+                            onDone: queuedDecode.onDone
+                        });
+                    }
+                    else {
+                        queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
+                        queuedDecode.firstTokenSequenceIndex += processAmount;
+                    }
+                }
+                for (let i = 0; i < this._queuedDecodes.length; i++) {
+                    const queuedDecode = this._queuedDecodes[i];
+                    if (queuedDecodesToDelete.has(queuedDecode)) {
+                        this._queuedDecodes.splice(i, 1);
+                        this._queuedDecodeSequenceIds.delete(queuedDecode.sequenceId);
+                        i--;
+                    }
+                }
                 try {
-                    preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
+                    if (currentBatchSize !== 0)
+                        await this._ctx.decodeBatch();
                 }
                 catch (err) {
-                    this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
+                    this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
                     return;
                 }
-                try {
-                    if (currentBatchSize !== 0)
-                        this._ctx.initBatch(currentBatchSize);
-                    for (const { queuedDecode, processAmount } of currentBatchItems) {
-                        let batchLogitIndex;
+                for (const action of afterDecodeActions) {
+                    const [accept, reject] = action.response;
+                    if (action.onDone != null && action.batchLogitIndex != null) {
                         try {
-                            batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(queuedDecode.tokens.slice(0, processAmount)), queuedDecode.generateLogitAtTheEnd && processAmount === queuedDecode.tokens.length);
+                            accept(action.onDone(action.batchLogitIndex ?? null));
                         }
                         catch (err) {
-                            this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
-                            continue;
-                        }
-                        currentQueuedDecodeItems.add(queuedDecode);
-                        if (queuedDecode.tokens.length === processAmount) {
-                            queuedDecodesToDelete.add(queuedDecode);
-                            afterDecodeActions.push({
-                                batchLogitIndex,
-                                response: queuedDecode.response,
-                                onDone: queuedDecode.onDone
-                            });
-                        }
-                        else {
-                            queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
-                            queuedDecode.firstTokenSequenceIndex += processAmount;
-                        }
-                        if (batchTokenSlotsLeft === 0)
-                            break;
-                    }
-                    for (let i = 0; i < this._queuedDecodes.length; i++) {
-                        const queuedDecode = this._queuedDecodes[i];
-                        if (queuedDecodesToDelete.has(queuedDecode)) {
-                            this._queuedDecodes.splice(i, 1);
-                            this._queuedDecodeSequenceIds.delete(queuedDecode.sequenceId);
-                            i--;
+                            reject(err);
                         }
                     }
-                    shouldHaveAnotherBatch = this._queuedDecodes.length > 0;
-                    try {
-                        if (currentBatchSize !== 0)
-                            await this._ctx.decodeBatch();
-                    }
-                    catch (err) {
-                        this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
-                        return;
-                    }
-                    for (const action of afterDecodeActions) {
-                        const [accept, reject] = action.response;
-                        if (action.onDone != null && action.batchLogitIndex != null) {
-                            try {
-                                accept(action.onDone(action.batchLogitIndex ?? null));
-                            }
-                            catch (err) {
-                                reject(err);
-                            }
-                        }
-                        accept(undefined);
-                    }
+                    accept(undefined);
+                }
+            };
+            const prioritizationStrategy = resolvePrioritizationStrategy();
+            if (prioritizationStrategy == null)
+                return; // all queued items are rejected and dequeued when we get here
+            while (shouldHaveAnotherLoop) {
+                const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
+                if (orderedQueuedDecodes == null)
+                    return; // all queued items are rejected and dequeued when we get here
+                const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
+                let preventDisposalHandle;
+                try {
+                    preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
+                }
+                catch (err) {
+                    this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
+                    return;
+                }
+                try {
+                    await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
+                    shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
                 }
                 finally {
                     preventDisposalHandle.dispose();
@@ -248,13 +291,18 @@ export class LlamaContext {
             }
         });
     }
+    /**
+     * Print the timings of token evaluation since that last print for this context.
+     * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
+     * it won't print anything.
+     */
     async printTimings() {
         this._ensureNotDisposed();
         this._ctx.printTimings();
         await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
     }
     /** @internal */
-    async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5 }, onDone) {
+    async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5, tokenMeter }, onDone) {
         return await new Promise((accept, reject) => {
             this._queuedDecodes.push({
                 sequenceId,
@@ -262,6 +310,7 @@ export class LlamaContext {
                 firstTokenSequenceIndex,
                 generateLogitAtTheEnd,
                 evaluationPriority,
+                tokenMeter,
                 response: [accept, reject],
                 onDone
             });
@@ -337,17 +386,44 @@ export class LlamaContext {
     }
     /** @internal */
     static async _create(options, { _model }) {
-        const context = new LlamaContext({ _model }, options);
+        const sequences = options.sequences ?? getDefaultContextSequences();
+        const contextSize = _model.fileInsights.configurationResolver.resolveContextContextSize(options.contextSize, {
+            batchSize: options.batchSize,
+            sequences: sequences,
+            modelGpuLayers: _model.gpuLayers,
+            modelTrainContextSize: _model.trainContextSize,
+            getVramState: () => _model._llama._vramOrchestrator.getMemoryState(),
+            llamaGpu: _model._llama.gpu,
+            ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks,
+            isEmbeddingContext: options._embeddings
+        });
+        const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
+        const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
+            contextSize,
+            sequences,
+            isEmbeddingContext: options._embeddings,
+            modelGpuLayers: _model.gpuLayers,
+            batchSize
+        }).gpuVram;
+        const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences });
         const { createSignal } = options;
-        const contextLoaded = await context._ctx.init();
-        if (createSignal?.aborted) {
-            if (contextLoaded)
-                await context._ctx.dispose();
-            throw createSignal.reason;
+        const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
+            ? null
+            : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
+        try {
+            const contextLoaded = await context._ctx.init();
+            if (createSignal?.aborted) {
+                if (contextLoaded)
+                    await context._ctx.dispose();
+                throw createSignal.reason;
+            }
+            else if (!contextLoaded)
+                throw new Error("Failed to create context");
+            return context;
+        }
+        finally {
+            contextCreationMemoryReservation?.dispose?.();
         }
-        else if (!contextLoaded)
-            throw new Error("Failed to create context");
-        return context;
     }
 }
 export class LlamaContextSequence {
@@ -355,14 +431,16 @@ export class LlamaContextSequence {
     /** @internal */ _gcRegistry;
     /** @internal */ _context;
     /** @internal */ _contextShift;
+    /** @internal */ _tokenMeter;
     /** @internal */ _disposeAggregator = new DisposeAggregator();
     /** @internal */ _contextTokens = [];
     /** @internal */ _nextTokenIndex = 0;
     /** @internal */ _disposed = false;
     onDispose = new EventRelay();
-    constructor({ sequenceId, context, contextShift }) {
+    constructor({ sequenceId, context, tokenMeter, contextShift }) {
         this._sequenceId = sequenceId;
         this._context = context;
+        this._tokenMeter = tokenMeter ?? new TokenMeter();
         this._contextShift = contextShift;
         this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
         this._gcRegistry.register(this, sequenceId);
@@ -399,6 +477,9 @@ export class LlamaContextSequence {
     get contextTokens() {
         return this._contextTokens.slice();
     }
+    get tokenMeter() {
+        return this._tokenMeter;
+    }
     get isLoadedToMemory() {
         return !this._disposed;
     }
@@ -424,7 +505,7 @@ export class LlamaContextSequence {
     }
     /**
      * Erase context tokens in the provided ranges to free up space for new tokens to be generated.
-     * the start and end of each range are exclusive.
+     * The start of each range is inclusive, and the end of each range is exclusive.
      * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
      */
     async eraseContextTokenRanges(ranges) {
@@ -486,7 +567,7 @@ export class LlamaContextSequence {
      * @param tokens
      * @param [options]
      */
-    evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, grammarEvaluationState, repeatPenalty, evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEosToken = false } = {}) {
+    evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEosToken = false } = {}) {
         return this._evaluate(tokens, {
             temperature,
             minP,
@@ -494,6 +575,7 @@ export class LlamaContextSequence {
             topP,
             grammarEvaluationState,
             repeatPenalty,
+            tokenBias,
             evaluationPriority,
             contextShiftOptions: {
                 size: contextShiftSize,
@@ -522,7 +604,7 @@ export class LlamaContextSequence {
         }
     }
     /** @internal */
-    async *_evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, grammarEvaluationState, repeatPenalty, evaluationPriority = 5, generateNewTokens = true, contextShiftOptions, yieldEosToken = false }) {
+    async *_evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, generateNewTokens = true, contextShiftOptions, yieldEosToken = false }) {
         this._ensureNotDisposed();
         let evalTokens = tokens;
         if (evalTokens.length === 0)
@@ -531,7 +613,7 @@ export class LlamaContextSequence {
         while (true) {
             this._ensureNotDisposed();
             // Evaluate to get the next token.
-            const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, contextShiftOptions, (batchLogitIndex) => {
+            const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
                 const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
                     ? repeatPenalty.punishTokens()
                     : repeatPenalty?.punishTokens;
@@ -540,6 +622,7 @@ export class LlamaContextSequence {
                     : grammarEvaluationState;
                 if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
                     throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
+                const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
                 return this._context._ctx.sampleToken(batchLogitIndex, removeNullFields({
                     temperature,
                     minP,
@@ -551,6 +634,8 @@ export class LlamaContextSequence {
                         : undefined,
                     repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
                     repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
+                    tokenBiasKeys,
+                    tokenBiasValues,
                     grammarEvaluationState: resolvedGrammarEvaluationState?._state
                 }));
             });
@@ -565,7 +650,7 @@ export class LlamaContextSequence {
         }
     }
     /** @internal */
-    async _decodeTokens(tokens, generateLogit, evaluationPriority, contextShiftOptions, onDecodeDone) {
+    async _decodeTokens(tokens, generateLogit, evaluationPriority, tokenMeter, contextShiftOptions, onDecodeDone) {
         this._ensureNotDisposed();
         const tokensLeftToDecode = tokens.slice();
         return await withLock(this, "evaluate", async () => {
@@ -585,7 +670,8 @@ export class LlamaContextSequence {
                     tokens: tokensToDecode,
                     firstTokenSequenceIndex: this._nextTokenIndex,
                     generateLogitAtTheEnd,
-                    evaluationPriority
+                    evaluationPriority,
+                    tokenMeter
                 }, !generateLogitAtTheEnd
                     ? undefined
                     : onDecodeDone);
@@ -632,10 +718,11 @@ export class LlamaContextSequence {
      * We need this to make it impossible to manually create instances of this class outside the code of this library
      * @internal
      */
-    static _create({ sequenceId, context, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
+    static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
         return new LlamaContextSequence({
             sequenceId,
             context,
+            tokenMeter,
             contextShift: {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
@@ -643,6 +730,34 @@ export class LlamaContextSequence {
         });
     }
 }
+function getTokenBiasesForAddon(tokenBias, currentModel) {
+    if (tokenBias == null)
+        return {
+            tokenBiasKeys: undefined,
+            tokenBiasValues: undefined
+        };
+    if (tokenBias instanceof Function)
+        tokenBias = tokenBias();
+    if (tokenBias._model !== currentModel)
+        throw new Error("This TokenBias instance was created with a different model than the one used by this context. " +
+            "Make sure you use the model instance of the context sequence for the TokenBias you use it with.");
+    const tokenBiasKeys = [];
+    const tokenBiasValues = [];
+    for (const [token, bias] of tokenBias._biases) {
+        tokenBiasKeys.push(token);
+        tokenBiasValues.push(bias);
+    }
+    if (tokenBiasKeys.length === 0 || tokenBiasValues.length === 0) {
+        return {
+            tokenBiasKeys: undefined,
+            tokenBiasValues: undefined
+        };
+    }
+    return {
+        tokenBiasKeys: Uint32Array.from(tokenBiasKeys),
+        tokenBiasValues: Float32Array.from(tokenBiasValues)
+    };
+}
 function disposeContextIfReferenced(contextRef) {
     const context = contextRef.deref();
     if (context != null)
@@ -653,4 +768,14 @@ function disposeContextSequenceIfReferenced(contextRef) {
     if (context != null)
         context.dispose();
 }
+export function getDefaultContextBatchSize({ contextSize, sequences }) {
+    return Math.min(contextSize * sequences, 512);
+}
+export function getDefaultContextSequences() {
+    return 1;
+}
+const defaultFallbackContextSize = 4096;
+export function getDefaultModelContextSize({ trainContextSize }) {
+    return trainContextSize ?? defaultFallbackContextSize;
+}
 //# sourceMappingURL=LlamaContext.js.map