node-llama-cpp 3.0.0-beta.16 → 3.0.0-beta.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. package/README.md +8 -5
  2. package/dist/ChatWrapper.d.ts +1 -15
  3. package/dist/ChatWrapper.js +22 -32
  4. package/dist/ChatWrapper.js.map +1 -1
  5. package/dist/apiDocsOverrides.d.ts +1 -0
  6. package/dist/apiDocsOverrides.js +5 -0
  7. package/dist/apiDocsOverrides.js.map +1 -0
  8. package/dist/bindings/AddonTypes.d.ts +4 -1
  9. package/dist/bindings/getLlama.d.ts +5 -1
  10. package/dist/bindings/getLlama.js +11 -4
  11. package/dist/bindings/getLlama.js.map +1 -1
  12. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.d.ts +3 -0
  13. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.js +27 -0
  14. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.js.map +1 -0
  15. package/dist/chatWrappers/FunctionaryChatWrapper.js +39 -40
  16. package/dist/chatWrappers/FunctionaryChatWrapper.js.map +1 -1
  17. package/dist/chatWrappers/{LlamaChatWrapper.d.ts → Llama2ChatWrapper.d.ts} +1 -1
  18. package/dist/chatWrappers/{LlamaChatWrapper.js → Llama2ChatWrapper.js} +3 -3
  19. package/dist/chatWrappers/Llama2ChatWrapper.js.map +1 -0
  20. package/dist/chatWrappers/Llama3ChatWrapper.d.ts +31 -0
  21. package/dist/chatWrappers/Llama3ChatWrapper.js +129 -0
  22. package/dist/chatWrappers/Llama3ChatWrapper.js.map +1 -0
  23. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.d.ts +2 -2
  24. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js +6 -2
  25. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js.map +1 -1
  26. package/dist/chatWrappers/generic/TemplateChatWrapper.d.ts +3 -4
  27. package/dist/chatWrappers/generic/TemplateChatWrapper.js +1 -2
  28. package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -1
  29. package/dist/chatWrappers/utils/ChatModelFunctionsDocumentationGenerator.d.ts +42 -0
  30. package/dist/chatWrappers/utils/ChatModelFunctionsDocumentationGenerator.js +82 -0
  31. package/dist/chatWrappers/utils/ChatModelFunctionsDocumentationGenerator.js.map +1 -0
  32. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +3 -3
  33. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -1
  34. package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +6 -4
  35. package/dist/chatWrappers/utils/resolveChatWrapper.js +23 -17
  36. package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -1
  37. package/dist/cli/cli.js +5 -3
  38. package/dist/cli/cli.js.map +1 -1
  39. package/dist/cli/commands/ChatCommand.d.ts +2 -0
  40. package/dist/cli/commands/ChatCommand.js +26 -9
  41. package/dist/cli/commands/ChatCommand.js.map +1 -1
  42. package/dist/cli/commands/CompleteCommand.d.ts +2 -0
  43. package/dist/cli/commands/CompleteCommand.js +24 -7
  44. package/dist/cli/commands/CompleteCommand.js.map +1 -1
  45. package/dist/cli/commands/DebugCommand.js +3 -5
  46. package/dist/cli/commands/DebugCommand.js.map +1 -1
  47. package/dist/cli/commands/DownloadCommand.d.ts +1 -1
  48. package/dist/cli/commands/DownloadCommand.js +2 -1
  49. package/dist/cli/commands/DownloadCommand.js.map +1 -1
  50. package/dist/cli/commands/InfillCommand.d.ts +2 -0
  51. package/dist/cli/commands/InfillCommand.js +24 -7
  52. package/dist/cli/commands/InfillCommand.js.map +1 -1
  53. package/dist/cli/commands/PullCommand.d.ts +12 -0
  54. package/dist/cli/commands/PullCommand.js +109 -0
  55. package/dist/cli/commands/PullCommand.js.map +1 -0
  56. package/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts +1 -0
  57. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +23 -11
  58. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
  59. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +2 -0
  60. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +43 -11
  61. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
  62. package/dist/cli/recommendedModels.js +61 -0
  63. package/dist/cli/recommendedModels.js.map +1 -1
  64. package/dist/cli/utils/printCommonInfoLines.js +4 -3
  65. package/dist/cli/utils/printCommonInfoLines.js.map +1 -1
  66. package/dist/cli/utils/resolveCommandGgufPath.d.ts +3 -1
  67. package/dist/cli/utils/resolveCommandGgufPath.js +44 -39
  68. package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -1
  69. package/dist/cli/utils/resolveModelRecommendationFileOptions.d.ts +1 -1
  70. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +18 -2
  71. package/dist/evaluator/LlamaChat/LlamaChat.js +271 -186
  72. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  73. package/dist/evaluator/LlamaChat/utils/FunctionCallGrammar.js +3 -1
  74. package/dist/evaluator/LlamaChat/utils/FunctionCallGrammar.js.map +1 -1
  75. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +22 -3
  76. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +18 -7
  77. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  78. package/dist/evaluator/LlamaCompletion.d.ts +2 -2
  79. package/dist/evaluator/LlamaCompletion.js +11 -13
  80. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  81. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +6 -11
  82. package/dist/evaluator/LlamaContext/LlamaContext.js +23 -16
  83. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  84. package/dist/evaluator/LlamaEmbeddingContext.d.ts +2 -10
  85. package/dist/evaluator/LlamaEmbeddingContext.js +10 -24
  86. package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
  87. package/dist/evaluator/LlamaGrammar.d.ts +1 -1
  88. package/dist/evaluator/LlamaModel.d.ts +23 -3
  89. package/dist/evaluator/LlamaModel.js +32 -5
  90. package/dist/evaluator/LlamaModel.js.map +1 -1
  91. package/dist/evaluator/TokenBias.d.ts +1 -1
  92. package/dist/evaluator/TokenBias.js +3 -3
  93. package/dist/evaluator/TokenBias.js.map +1 -1
  94. package/dist/gguf/insights/GgufInsights.js +12 -12
  95. package/dist/gguf/insights/GgufInsights.js.map +1 -1
  96. package/dist/gguf/insights/utils/resolveContextContextSizeOption.js +27 -3
  97. package/dist/gguf/insights/utils/resolveContextContextSizeOption.js.map +1 -1
  98. package/dist/gguf/parser/parseGguf.js +5 -0
  99. package/dist/gguf/parser/parseGguf.js.map +1 -1
  100. package/dist/gguf/readGgufFileInfo.d.ts +5 -2
  101. package/dist/gguf/readGgufFileInfo.js +38 -10
  102. package/dist/gguf/readGgufFileInfo.js.map +1 -1
  103. package/dist/gguf/types/GgufFileInfoTypes.d.ts +32 -0
  104. package/dist/gguf/types/GgufFileInfoTypes.js.map +1 -1
  105. package/dist/gguf/types/GgufMetadataTypes.d.ts +4 -0
  106. package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
  107. package/dist/gguf/utils/getGgufMetadataArchitectureData.js +1 -1
  108. package/dist/gguf/utils/getGgufMetadataArchitectureData.js.map +1 -1
  109. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.d.ts +2 -0
  110. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js +39 -0
  111. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js.map +1 -0
  112. package/dist/gguf/utils/resolveSplitGgufParts.d.ts +7 -0
  113. package/dist/gguf/utils/resolveSplitGgufParts.js +55 -0
  114. package/dist/gguf/utils/resolveSplitGgufParts.js.map +1 -0
  115. package/dist/index.d.ts +9 -5
  116. package/dist/index.js +7 -3
  117. package/dist/index.js.map +1 -1
  118. package/dist/types.d.ts +21 -1
  119. package/dist/types.js.map +1 -1
  120. package/dist/utils/LlamaText.d.ts +31 -21
  121. package/dist/utils/LlamaText.js +253 -223
  122. package/dist/utils/LlamaText.js.map +1 -1
  123. package/dist/utils/StopGenerationDetector.d.ts +1 -1
  124. package/dist/utils/StopGenerationDetector.js +21 -18
  125. package/dist/utils/StopGenerationDetector.js.map +1 -1
  126. package/dist/utils/TokenStreamRegulator.d.ts +4 -2
  127. package/dist/utils/TokenStreamRegulator.js +22 -4
  128. package/dist/utils/TokenStreamRegulator.js.map +1 -1
  129. package/dist/utils/createModelDownloader.d.ts +99 -0
  130. package/dist/utils/createModelDownloader.js +226 -0
  131. package/dist/utils/createModelDownloader.js.map +1 -0
  132. package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.js +18 -8
  133. package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.js.map +1 -1
  134. package/dist/utils/getTypeScriptTypeStringForGbnfJsonSchema.js +15 -11
  135. package/dist/utils/getTypeScriptTypeStringForGbnfJsonSchema.js.map +1 -1
  136. package/dist/utils/parseTextTemplate.d.ts +2 -2
  137. package/dist/utils/parseTextTemplate.js +2 -2
  138. package/dist/utils/runtime.d.ts +4 -0
  139. package/dist/utils/runtime.js +8 -0
  140. package/dist/utils/runtime.js.map +1 -0
  141. package/dist/utils/utilTypes.d.ts +3 -0
  142. package/dist/utils/utilTypes.js +2 -0
  143. package/dist/utils/utilTypes.js.map +1 -0
  144. package/llama/addon.cpp +67 -14
  145. package/llama/binariesGithubRelease.json +1 -1
  146. package/llama/gitRelease.bundle +0 -0
  147. package/llama/grammars/README.md +1 -1
  148. package/llama/llama.cpp.info.json +1 -1
  149. package/llamaBins/linux-arm64/_nlcBuildMetadata.json +1 -1
  150. package/llamaBins/linux-arm64/llama-addon.node +0 -0
  151. package/llamaBins/linux-armv7l/_nlcBuildMetadata.json +1 -1
  152. package/llamaBins/linux-armv7l/llama-addon.node +0 -0
  153. package/llamaBins/linux-x64/_nlcBuildMetadata.json +1 -1
  154. package/llamaBins/linux-x64/llama-addon.node +0 -0
  155. package/llamaBins/linux-x64-cuda/_nlcBuildMetadata.json +1 -1
  156. package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
  157. package/llamaBins/linux-x64-vulkan/_nlcBuildMetadata.json +1 -1
  158. package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
  159. package/llamaBins/mac-arm64-metal/_nlcBuildMetadata.json +1 -1
  160. package/llamaBins/mac-arm64-metal/default.metallib +0 -0
  161. package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
  162. package/llamaBins/mac-x64/_nlcBuildMetadata.json +1 -1
  163. package/llamaBins/mac-x64/llama-addon.node +0 -0
  164. package/llamaBins/win-arm64/_nlcBuildMetadata.json +1 -1
  165. package/llamaBins/win-arm64/llama-addon.node +0 -0
  166. package/llamaBins/win-x64/_nlcBuildMetadata.json +1 -1
  167. package/llamaBins/win-x64/llama-addon.node +0 -0
  168. package/llamaBins/win-x64-cuda/_nlcBuildMetadata.json +1 -1
  169. package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
  170. package/llamaBins/win-x64-vulkan/_nlcBuildMetadata.json +1 -1
  171. package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
  172. package/package.json +9 -5
  173. package/dist/chatWrappers/LlamaChatWrapper.js.map +0 -1
@@ -7,6 +7,7 @@ import { UNKNOWN_UNICODE_CHAR } from "../../consts.js";
7
7
  import { getQueuedTokensBeforeStopTrigger } from "../../utils/getQueuedTokensBeforeStopTrigger.js";
8
8
  import { resolveChatWrapper } from "../../chatWrappers/utils/resolveChatWrapper.js";
9
9
  import { GeneralChatWrapper } from "../../chatWrappers/GeneralChatWrapper.js";
10
+ import { getConsoleLogPrefix } from "../../utils/getConsoleLogPrefix.js";
10
11
  import { eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy } from "./utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js";
11
12
  import { FunctionCallGrammar, LlamaFunctionCallValidationError } from "./utils/FunctionCallGrammar.js";
12
13
  const defaultContextShiftOptions = {
@@ -36,7 +37,7 @@ export class LlamaChat {
36
37
  bosString: contextSequence.model.tokens.bosString,
37
38
  filename: contextSequence.model.filename,
38
39
  fileInfo: contextSequence.model.fileInfo,
39
- tokenizer: contextSequence.model.tokenize
40
+ tokenizer: contextSequence.model.tokenizer
40
41
  }) ?? new GeneralChatWrapper())
41
42
  : chatWrapper;
42
43
  }
@@ -71,7 +72,7 @@ export class LlamaChat {
71
72
  get model() {
72
73
  return this.sequence.model;
73
74
  }
74
- async generateResponse(history, { onToken, signal, maxTokens, temperature, minP, topK, topP, grammar, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
75
+ async generateResponse(history, { onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, grammar, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
75
76
  const functionsEnabled = (functions != null && Object.keys(functions).length > 0);
76
77
  if (grammar != null && functionsEnabled)
77
78
  throw new Error("Using both grammar and functions is not supported yet");
@@ -89,7 +90,6 @@ export class LlamaChat {
89
90
  });
90
91
  const model = this._sequence.model;
91
92
  const context = this._sequence.context;
92
- const eosToken = model.tokens.eos;
93
93
  const resolvedContextShift = {
94
94
  ...defaultContextShiftOptions,
95
95
  ...removeNullFields(contextShift)
@@ -116,6 +116,7 @@ export class LlamaChat {
116
116
  : undefined;
117
117
  const streamRegulator = new TokenStreamRegulator();
118
118
  const stopGenerationDetector = new StopGenerationDetector();
119
+ const customStopGenerationTriggersDetector = new StopGenerationDetector();
119
120
  const functionSyntaxStartDetector = new StopGenerationDetector();
120
121
  const functionSyntaxEndDetector = new StopGenerationDetector();
121
122
  const disengageInitiallyEngagedFunctionMode = new StopGenerationDetector();
@@ -129,7 +130,7 @@ export class LlamaChat {
129
130
  let lastContextWindowHistory = resolvedHistory;
130
131
  let lastHistoryCompressionMetadata = resolvedContextShift.lastEvaluationMetadata;
131
132
  const ensureNotAborted = () => {
132
- if (signal?.aborted)
133
+ if (signal?.aborted && (!stopOnAbortSignal || res.length === 0))
133
134
  throw signal.reason;
134
135
  if (this._sequence == null)
135
136
  throw new DisposedError();
@@ -200,8 +201,11 @@ export class LlamaChat {
200
201
  }
201
202
  }
202
203
  };
204
+ if (customStopTriggers != null)
205
+ StopGenerationDetector.resolveStopTriggers(customStopTriggers, model.tokenizer)
206
+ .map((stopTrigger) => customStopGenerationTriggersDetector.addStopTrigger(stopTrigger));
203
207
  if (grammar != null)
204
- StopGenerationDetector.resolveStopTriggers(grammar.stopGenerationTriggers, model.tokenize)
208
+ StopGenerationDetector.resolveStopTriggers(grammar.stopGenerationTriggers, model.tokenizer)
205
209
  .map((stopTrigger) => stopGenerationDetector.addStopTrigger(stopTrigger));
206
210
  if (functions != null && Object.keys(functions).length > 0)
207
211
  functionSyntaxStartDetector.addStopTrigger([this._chatWrapper.settings.functions.call.prefix]);
@@ -214,7 +218,7 @@ export class LlamaChat {
214
218
  resolvedHistory: getResolvedHistoryWithCurrentModelResponse(),
215
219
  resolvedContextShift,
216
220
  lastHistoryCompressionMetadata,
217
- pendingTokensCount: pendingTokens.length + queuedChunkTokens.length,
221
+ pendingTokensCount: ignoredStartTextTokens.length + pendingTokens.length + queuedChunkTokens.length,
218
222
  isFirstEvaluation,
219
223
  chatWrapper: this._chatWrapper,
220
224
  lastEvaluationContextWindowHistory,
@@ -226,11 +230,11 @@ export class LlamaChat {
226
230
  });
227
231
  ensureNotAborted();
228
232
  if (generatedTokens === 0) {
229
- StopGenerationDetector.resolveStopTriggers(ignoreStartText, model.tokenize)
233
+ StopGenerationDetector.resolveStopTriggers(ignoreStartText, model.tokenizer)
230
234
  .map((stopTrigger) => ignoreStartTextDetector.addStopTrigger(stopTrigger));
231
235
  if (functionsEnabled) {
232
236
  initiallyEngagedFunctionMode = functionCallInitiallyEngaged;
233
- StopGenerationDetector.resolveStopTriggers(disengageInitiallyEngagedFunctionCall, model.tokenize)
237
+ StopGenerationDetector.resolveStopTriggers(disengageInitiallyEngagedFunctionCall, model.tokenizer)
234
238
  .map((stopTrigger) => disengageInitiallyEngagedFunctionMode.addStopTrigger(stopTrigger));
235
239
  if (initiallyEngagedFunctionMode) {
236
240
  inFunctionEvaluationMode = true;
@@ -247,10 +251,10 @@ export class LlamaChat {
247
251
  lastContextWindowHistory = contextWindowHistory;
248
252
  const contextWindowLastModelResponse = getLastTextModelResponseFromChatHistory(contextWindowHistory);
249
253
  const contextWindowsRes = [];
250
- StopGenerationDetector.resolveStopTriggers(stopGenerationTriggers, model.tokenize)
254
+ StopGenerationDetector.resolveStopTriggers(stopGenerationTriggers, model.tokenizer)
251
255
  .map((stopTrigger) => stopGenerationDetector.addStopTrigger(stopTrigger));
252
256
  if (functionsGrammar != null)
253
- StopGenerationDetector.resolveStopTriggers(functionsGrammar.stopGenerationTriggers, model.tokenize)
257
+ StopGenerationDetector.resolveStopTriggers(functionsGrammar.stopGenerationTriggers, model.tokenizer)
254
258
  .map((stopTrigger) => functionSyntaxEndDetector.addStopTrigger(stopTrigger));
255
259
  let { firstDifferentIndex } = this._sequence.compareContextTokens(tokens);
256
260
  // we need to decode at least one token to generate a response
@@ -279,193 +283,274 @@ export class LlamaChat {
279
283
  },
280
284
  tokenBias,
281
285
  evaluationPriority,
282
- yieldEosToken: true
286
+ yieldEogToken: true
283
287
  }));
284
- for await (const token of evaluationIterator) {
285
- ensureNotAborted();
286
- generatedTokens++;
287
- const tokens = [token];
288
- const text = model.detokenize([token]);
289
- const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
290
- if (initiallyEngagedFunctionMode)
291
- disengageInitiallyEngagedFunctionMode.recordGeneration({ text, tokens, startNewChecks: generatedTokens === 1 });
292
- if (text === UNKNOWN_UNICODE_CHAR || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "")) {
293
- locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
294
- }
295
- else {
296
- while (locksToReleaseOnValidGeneration.length > 0)
297
- locksToReleaseOnValidGeneration.shift().dispose();
298
- }
299
- functionSyntaxStartDetector.recordGeneration({ text, tokens, queuedTokenRelease });
300
- if (initiallyEngagedFunctionMode && disengageInitiallyEngagedFunctionMode.hasTriggeredStops) {
301
- initiallyEngagedFunctionMode = false;
302
- let shouldStopFunctionEvaluationMode = !functionSyntaxStartDetector.hasTriggeredStops;
303
- if (!shouldStopFunctionEvaluationMode && functionsEnabled && functionsGrammar != null) {
304
- const functionCallText = model.detokenize([...functionCallTokens, ...tokens]);
305
- try {
306
- const functionName = functionsGrammar.parseFunctionNameFromPartialCall(functionCallText, {
307
- enableInternalBuiltinFunctions: true,
308
- initialFunctionCallEngaged: true
309
- });
310
- const internalBuiltinFunctions = this._chatWrapper.getInternalBuiltinFunctions({ initialFunctionCallEngaged: true });
311
- if (internalBuiltinFunctions[functionName] != null) {
312
- shouldStopFunctionEvaluationMode = true;
288
+ try {
289
+ let currentIteration = await evaluationIterator.next();
290
+ while (currentIteration.done !== true) {
291
+ const token = currentIteration.value;
292
+ let replacementToken = undefined;
293
+ ensureNotAborted();
294
+ generatedTokens++;
295
+ const tokens = [token];
296
+ const text = model.detokenize([token]);
297
+ const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
298
+ if (initiallyEngagedFunctionMode)
299
+ disengageInitiallyEngagedFunctionMode.recordGeneration({ text, tokens, startNewChecks: generatedTokens === 1 });
300
+ if (text === UNKNOWN_UNICODE_CHAR || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "")) {
301
+ locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
302
+ }
303
+ else {
304
+ while (locksToReleaseOnValidGeneration.length > 0)
305
+ locksToReleaseOnValidGeneration.shift().dispose();
306
+ }
307
+ functionSyntaxStartDetector.recordGeneration({ text, tokens, queuedTokenRelease });
308
+ if (initiallyEngagedFunctionMode && disengageInitiallyEngagedFunctionMode.hasTriggeredStops) {
309
+ initiallyEngagedFunctionMode = false;
310
+ let shouldStopFunctionEvaluationMode = !functionSyntaxStartDetector.hasTriggeredStops;
311
+ if (!shouldStopFunctionEvaluationMode && functionsEnabled && functionsGrammar != null) {
312
+ const functionCallText = model.detokenize([...functionCallTokens, ...tokens]);
313
+ try {
314
+ const functionName = functionsGrammar.parseFunctionNameFromPartialCall(functionCallText, {
315
+ enableInternalBuiltinFunctions: true,
316
+ initialFunctionCallEngaged: true
317
+ });
318
+ const internalBuiltinFunctions = this._chatWrapper.getInternalBuiltinFunctions({ initialFunctionCallEngaged: true });
319
+ if (internalBuiltinFunctions[functionName] != null) {
320
+ shouldStopFunctionEvaluationMode = true;
321
+ }
322
+ }
323
+ catch (err) {
324
+ if (!(err instanceof LlamaFunctionCallValidationError))
325
+ throw err;
313
326
  }
314
327
  }
315
- catch (err) {
316
- if (!(err instanceof LlamaFunctionCallValidationError))
317
- throw err;
328
+ if (shouldStopFunctionEvaluationMode) {
329
+ inFunctionEvaluationMode = false;
330
+ functionsGrammar = new FunctionCallGrammar(model._llama, functions, this._chatWrapper, false);
331
+ functionsEvaluationState = new LlamaGrammarEvaluationState({
332
+ grammar: functionsGrammar
333
+ });
334
+ functionCallTokens.length = 0;
335
+ while (functionCallTokenSyntaxLocks.length > 0)
336
+ functionCallTokenSyntaxLocks.shift().dispose();
337
+ functionSyntaxStartDetector.clearInProgressStops();
338
+ functionSyntaxStartDetector.clearTriggeredStops();
339
+ functionSyntaxEndDetector.clearInProgressStops();
340
+ functionSyntaxEndDetector.clearTriggeredStops();
318
341
  }
319
342
  }
320
- if (shouldStopFunctionEvaluationMode) {
321
- inFunctionEvaluationMode = false;
322
- functionsGrammar = new FunctionCallGrammar(model._llama, functions, this._chatWrapper, false);
323
- functionsEvaluationState = new LlamaGrammarEvaluationState({
324
- grammar: functionsGrammar
325
- });
326
- functionCallTokens.length = 0;
327
- while (functionCallTokenSyntaxLocks.length > 0)
328
- functionCallTokenSyntaxLocks.shift().dispose();
329
- functionSyntaxStartDetector.clearInProgressStops();
330
- functionSyntaxStartDetector.clearTriggeredStops();
331
- functionSyntaxEndDetector.clearInProgressStops();
332
- functionSyntaxEndDetector.clearTriggeredStops();
343
+ if (!inFunctionEvaluationMode && functionsEnabled && functionsGrammar != null &&
344
+ functionSyntaxStartDetector.hasTriggeredStops && functionsEvaluationState != null) {
345
+ inFunctionEvaluationMode = true;
346
+ functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
347
+ stopGenerationDetector.clearTriggeredStops();
348
+ stopGenerationDetector.clearInProgressStops();
349
+ customStopGenerationTriggersDetector.clearTriggeredStops();
350
+ customStopGenerationTriggersDetector.clearInProgressStops();
351
+ pendingTokens.push(...streamRegulator.popFreeChunkTokens());
352
+ const triggeredStops = functionSyntaxStartDetector.getTriggeredStops();
353
+ const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
354
+ const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
355
+ pendingTokens.push(...queuedTokensBeforeStopTrigger);
356
+ const [firstRemainingGenerationAfterStop] = triggeredStops
357
+ .map((stopTrigger) => stopTrigger.remainingGenerations)
358
+ .filter((remainingGenerations) => remainingGenerations.length > 0)
359
+ .flat(1);
360
+ const remainingTextAfterStop = (firstRemainingGenerationAfterStop == null || firstRemainingGenerationAfterStop.length === 0)
361
+ ? ""
362
+ : typeof firstRemainingGenerationAfterStop === "string"
363
+ ? firstRemainingGenerationAfterStop
364
+ : model.detokenize(firstRemainingGenerationAfterStop);
365
+ functionCallTokens.push(...model.tokenize(this._chatWrapper.settings.functions.call.prefix, false, "trimLeadingSpace"));
366
+ for (const functionCallToken of functionCallTokens)
367
+ context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, functionCallToken);
368
+ // these tokens have to be verified that they match the function calling syntax grammar before they can be accepted,
369
+ // or the context state should be modified to not include the incompatible tokens
370
+ const remainingTextTokens = model.tokenize(remainingTextAfterStop, false, "trimLeadingSpace");
371
+ let unfitTokens = [];
372
+ for (let i = 0; i < remainingTextTokens.length; i++) {
373
+ const remainingToken = remainingTextTokens[i];
374
+ const canBeNextToken = context._canBeNextTokenForGrammarEvaluationState(functionsEvaluationState, remainingToken);
375
+ if (!canBeNextToken) {
376
+ unfitTokens = remainingTextTokens.slice(i);
377
+ break;
378
+ }
379
+ context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, remainingToken);
380
+ functionCallTokens.push(remainingToken);
381
+ }
382
+ if (unfitTokens.length > 0) {
383
+ const unfitTokensText = model.detokenize(unfitTokens); // the current token text must end with it
384
+ const currentTokenText = queuedTokenRelease.text;
385
+ let replacementTokens;
386
+ if (!currentTokenText.endsWith(unfitTokensText)) {
387
+ console.warn(getConsoleLogPrefix() + "The current token text does not end with the unfit function call syntax tokens text");
388
+ replacementTokens = remainingTextTokens.slice(0, -unfitTokens.length);
389
+ }
390
+ else {
391
+ const newCurrentTokensText = currentTokenText.slice(0, -unfitTokensText.length);
392
+ replacementTokens = model.tokenize(newCurrentTokensText, false, "trimLeadingSpace");
393
+ }
394
+ if (replacementTokens.length > 0) {
395
+ replacementToken = replacementTokens[0];
396
+ queuedTokenRelease.modifyTokensAndText(replacementTokens, model.detokenize([replacementToken]));
397
+ }
398
+ }
333
399
  }
334
- }
335
- if (!inFunctionEvaluationMode && functionsEnabled && functionsGrammar != null &&
336
- functionSyntaxStartDetector.hasTriggeredStops && functionsEvaluationState != null) {
337
- inFunctionEvaluationMode = true;
338
- functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
339
- stopGenerationDetector.clearTriggeredStops();
340
- stopGenerationDetector.clearInProgressStops();
341
- pendingTokens.push(...streamRegulator.popFreeChunkTokens());
342
- const triggeredStops = functionSyntaxStartDetector.getTriggeredStops();
343
- const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
344
- const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenize);
345
- pendingTokens.push(...queuedTokensBeforeStopTrigger);
346
- const [firstRemainingGenerationAfterStop] = triggeredStops
347
- .map((stopTrigger) => stopTrigger.remainingGenerations)
348
- .filter((remainingGenerations) => remainingGenerations.length > 0)
349
- .flat(1);
350
- const remainingTextAfterStop = (firstRemainingGenerationAfterStop == null || firstRemainingGenerationAfterStop.length === 0)
351
- ? ""
352
- : typeof firstRemainingGenerationAfterStop === "string"
353
- ? firstRemainingGenerationAfterStop
354
- : model.detokenize(firstRemainingGenerationAfterStop);
355
- functionCallTokens.push(...model.tokenize(this._chatWrapper.settings.functions.call.prefix + remainingTextAfterStop, false, "trimLeadingSpace"));
356
- for (const functionCallToken of functionCallTokens)
357
- context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, functionCallToken);
358
- }
359
- else if (inFunctionEvaluationMode) {
360
- functionCallTokens.push(...tokens);
361
- functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
362
- functionSyntaxEndDetector.recordGeneration({ text, tokens, queuedTokenRelease });
363
- }
364
- if (inFunctionEvaluationMode && functionSyntaxEndDetector.hasTriggeredStops && functionsGrammar != null) {
365
- const functionCallText = model.detokenize(functionCallTokens);
366
- const functionCall = functionsGrammar.parseFunctionCall(functionCallText);
367
- let modelResponse = model.detokenize(res);
368
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
369
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
370
- modelResponse = modelResponse.trimEnd();
371
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
400
+ else if (inFunctionEvaluationMode) {
401
+ functionCallTokens.push(...tokens);
402
+ functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
403
+ functionSyntaxEndDetector.recordGeneration({ text, tokens, queuedTokenRelease });
372
404
  }
373
- return {
374
- response: modelResponse,
375
- lastEvaluation: {
376
- contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
377
- cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
378
- contextShiftMetadata: lastHistoryCompressionMetadata
379
- },
380
- // prevent infinite TS type instantiation
381
- functionCall: functionCall,
382
- metadata: {
383
- stopReason: "functionCall"
405
+ if (inFunctionEvaluationMode && functionSyntaxEndDetector.hasTriggeredStops && functionsGrammar != null) {
406
+ const functionCallText = model.detokenize(functionCallTokens);
407
+ const functionCall = functionsGrammar.parseFunctionCall(functionCallText);
408
+ let modelResponse = model.detokenize(res);
409
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
410
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
411
+ modelResponse = modelResponse.trimEnd();
412
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
384
413
  }
385
- };
386
- }
387
- if (!inFunctionEvaluationMode)
388
- stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
389
- pendingTokens.push(...streamRegulator.popFreeChunkTokens());
390
- removeFoundStartIgnoreTextsFromPendingTokens();
391
- if (stopGenerationDetector.hasTriggeredStops || token === eosToken) {
392
- const triggeredStops = stopGenerationDetector.getTriggeredStops();
393
- const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
394
- const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenize);
395
- pendingTokens.push(...queuedTokensBeforeStopTrigger);
396
- const [firstRemainingGenerationAfterStop] = triggeredStops
397
- .map((stopTrigger) => stopTrigger.remainingGenerations)
398
- .filter((remainingGenerations) => remainingGenerations.length > 0)
399
- .flat(1);
400
- removeFoundStartIgnoreTextsFromPendingTokens();
401
- if (pendingTokens.length > 0)
402
- onToken?.(pendingTokens.slice());
403
- res.push(...pendingTokens);
404
- contextWindowsRes.push(...pendingTokens);
405
- pendingTokens.length = 0;
406
- let modelResponse = model.detokenize(res);
407
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
408
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
409
- modelResponse = modelResponse.trimEnd();
410
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
414
+ return {
415
+ response: modelResponse,
416
+ lastEvaluation: {
417
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
418
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
419
+ contextShiftMetadata: lastHistoryCompressionMetadata
420
+ },
421
+ // prevent infinite TS type instantiation
422
+ functionCall: functionCall,
423
+ metadata: {
424
+ stopReason: "functionCall"
425
+ }
426
+ };
411
427
  }
412
- return {
413
- response: modelResponse,
414
- lastEvaluation: {
415
- contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
416
- cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
417
- contextShiftMetadata: lastHistoryCompressionMetadata
418
- },
419
- metadata: {
420
- remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
421
- stopReason: token === eosToken
422
- ? "eosToken"
423
- : "stopGenerationTrigger"
424
- }
425
- };
426
- }
427
- const maxTokensTriggered = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
428
- if (res.length === 0) {
429
- ignoreStartTextDetector.clearInProgressStops();
430
- ignoreStartTextDetector.clearTriggeredStops();
431
- ignoreStartTextDetector.recordGeneration({
432
- text: model.detokenize(pendingTokens),
433
- tokens: pendingTokens
434
- });
435
- }
436
- if (pendingTokens.length > 0 && (maxTokensTriggered || !ignoreStartTextDetector.hasInProgressStops)) {
428
+ if (!inFunctionEvaluationMode) {
429
+ stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
430
+ customStopGenerationTriggersDetector.recordGeneration({ text, tokens, queuedTokenRelease });
431
+ }
432
+ pendingTokens.push(...streamRegulator.popFreeChunkTokens());
437
433
  removeFoundStartIgnoreTextsFromPendingTokens();
438
- if (pendingTokens.length > 0) {
439
- onToken?.(pendingTokens.slice());
434
+ if (stopGenerationDetector.hasTriggeredStops || customStopGenerationTriggersDetector.hasTriggeredStops ||
435
+ model.isEogToken(token)) {
436
+ const triggeredStops = stopGenerationDetector.hasTriggeredStops
437
+ ? stopGenerationDetector.getTriggeredStops()
438
+ : customStopGenerationTriggersDetector.getTriggeredStops();
439
+ const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
440
+ const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
441
+ pendingTokens.push(...queuedTokensBeforeStopTrigger);
442
+ const [firstRemainingGenerationAfterStop] = triggeredStops
443
+ .map((stopTrigger) => stopTrigger.remainingGenerations)
444
+ .filter((remainingGenerations) => remainingGenerations.length > 0)
445
+ .flat(1);
446
+ removeFoundStartIgnoreTextsFromPendingTokens();
447
+ if (pendingTokens.length > 0)
448
+ onToken?.(pendingTokens.slice());
440
449
  res.push(...pendingTokens);
441
450
  contextWindowsRes.push(...pendingTokens);
442
451
  pendingTokens.length = 0;
443
- }
444
- }
445
- if (maxTokensTriggered) {
446
- let modelResponse = model.detokenize(res);
447
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
448
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
449
- modelResponse = modelResponse.trimEnd();
450
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
451
- }
452
- return {
453
- response: modelResponse,
454
- lastEvaluation: {
452
+ let modelResponse = model.detokenize(res);
453
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
454
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
455
+ modelResponse = modelResponse.trimEnd();
456
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
457
+ }
458
+ const lastEvaluation = {
455
459
  contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
456
460
  cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
457
461
  contextShiftMetadata: lastHistoryCompressionMetadata
458
- },
459
- metadata: {
460
- stopReason: "maxTokens"
462
+ };
463
+ const isEogToken = model.isEogToken(token);
464
+ if (isEogToken || stopGenerationDetector.hasTriggeredStops) {
465
+ return {
466
+ response: modelResponse,
467
+ lastEvaluation,
468
+ metadata: {
469
+ remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
470
+ stopReason: isEogToken
471
+ ? "eogToken"
472
+ : "stopGenerationTrigger"
473
+ }
474
+ };
461
475
  }
462
- };
463
- }
464
- if (this._sequence.nextTokenIndex >= context.contextSize) {
465
- shouldContextShift = true;
466
- break;
476
+ return {
477
+ response: modelResponse,
478
+ lastEvaluation,
479
+ metadata: {
480
+ remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
481
+ stopReason: "customStopTrigger",
482
+ customStopTrigger: triggeredStops[0].stopTrigger
483
+ }
484
+ };
485
+ }
486
+ const maxTokensTriggered = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
487
+ if (res.length === 0) {
488
+ ignoreStartTextDetector.clearInProgressStops();
489
+ ignoreStartTextDetector.clearTriggeredStops();
490
+ ignoreStartTextDetector.recordGeneration({
491
+ text: model.detokenize(pendingTokens),
492
+ tokens: pendingTokens
493
+ });
494
+ }
495
+ if (pendingTokens.length > 0 && (maxTokensTriggered || !ignoreStartTextDetector.hasInProgressStops)) {
496
+ removeFoundStartIgnoreTextsFromPendingTokens();
497
+ if (pendingTokens.length > 0) {
498
+ onToken?.(pendingTokens.slice());
499
+ res.push(...pendingTokens);
500
+ contextWindowsRes.push(...pendingTokens);
501
+ pendingTokens.length = 0;
502
+ }
503
+ }
504
+ if (maxTokensTriggered) {
505
+ let modelResponse = model.detokenize(res);
506
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
507
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
508
+ modelResponse = modelResponse.trimEnd();
509
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
510
+ }
511
+ return {
512
+ response: modelResponse,
513
+ lastEvaluation: {
514
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
515
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
516
+ contextShiftMetadata: lastHistoryCompressionMetadata
517
+ },
518
+ metadata: {
519
+ stopReason: "maxTokens"
520
+ }
521
+ };
522
+ }
523
+ if (this._sequence.nextTokenIndex >= context.contextSize - 1) {
524
+ shouldContextShift = true;
525
+ break;
526
+ }
527
+ if (signal?.aborted && stopOnAbortSignal) {
528
+ if (res.length === 0)
529
+ throw signal.reason;
530
+ let modelResponse = model.detokenize(res);
531
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
532
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
533
+ modelResponse = modelResponse.trimEnd();
534
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
535
+ }
536
+ return {
537
+ response: modelResponse,
538
+ lastEvaluation: {
539
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
540
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
541
+ contextShiftMetadata: lastHistoryCompressionMetadata
542
+ },
543
+ metadata: {
544
+ stopReason: "abort"
545
+ }
546
+ };
547
+ }
548
+ currentIteration = await evaluationIterator.next(replacementToken);
467
549
  }
468
550
  }
551
+ finally {
552
+ await evaluationIterator.return();
553
+ }
469
554
  isFirstEvaluation = false;
470
555
  if (shouldContextShift)
471
556
  continue;
@@ -593,7 +678,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
593
678
  availableFunctions: functions,
594
679
  documentFunctionParams
595
680
  });
596
- const tokens = contextText.tokenize(model.tokenize);
681
+ const tokens = contextText.tokenize(model.tokenizer);
597
682
  if (tokens.length + pendingTokensCount + minFreeContextTokens < context.contextSize) {
598
683
  const { firstDifferentIndex } = sequence.compareContextTokens(tokens);
599
684
  const existingEvaluationPercentage = firstDifferentIndex / tokens.length;
@@ -619,11 +704,11 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
619
704
  : resolvedContextShift.size;
620
705
  const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
621
706
  history: resolvedHistory,
622
- contextShiftSize: Math.max(contextShiftSize, minFreeContextTokens) + pendingTokensCount,
707
+ contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
623
708
  contextShiftStrategy: resolvedContextShift.strategy,
624
709
  contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
625
710
  contextSize: context.contextSize,
626
- tokenizer: model.tokenize,
711
+ tokenizer: model.tokenizer,
627
712
  chatWrapper: chatWrapper,
628
713
  functions,
629
714
  documentFunctionParams
@@ -635,7 +720,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
635
720
  return {
636
721
  history: compressedHistory,
637
722
  stopGenerationTriggers,
638
- tokens: contextText.tokenize(model.tokenize),
723
+ tokens: contextText.tokenize(model.tokenizer),
639
724
  newResolvedHistory: resolvedHistory,
640
725
  newHistoryCompressionMetadata: metadata,
641
726
  ignoreStartText: ignoreStartText ?? [],
@@ -648,7 +733,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
648
733
  availableFunctions: functions,
649
734
  documentFunctionParams
650
735
  });
651
- const tokens = contextText.tokenize(model.tokenize);
736
+ const tokens = contextText.tokenize(model.tokenizer);
652
737
  if (tokens.length + pendingTokensCount + minFreeContextTokens < context.contextSize)
653
738
  return {
654
739
  history: resolvedHistory,
@@ -666,11 +751,11 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
666
751
  : resolvedContextShift.size)));
667
752
  const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
668
753
  history: resolvedHistory,
669
- contextShiftSize: Math.max(contextShiftSize, minFreeContextTokens) + pendingTokensCount,
754
+ contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
670
755
  contextShiftStrategy: resolvedContextShift.strategy,
671
756
  contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
672
757
  contextSize: context.contextSize,
673
- tokenizer: model.tokenize,
758
+ tokenizer: model.tokenizer,
674
759
  chatWrapper: chatWrapper,
675
760
  functions,
676
761
  documentFunctionParams
@@ -682,7 +767,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
682
767
  return {
683
768
  history: compressedHistory,
684
769
  stopGenerationTriggers,
685
- tokens: contextText.tokenize(model.tokenize),
770
+ tokens: contextText.tokenize(model.tokenizer),
686
771
  newResolvedHistory: resolvedHistory,
687
772
  newHistoryCompressionMetadata: metadata,
688
773
  ignoreStartText: ignoreStartText ?? [],