node-llama-cpp 3.0.0-beta.17 → 3.0.0-beta.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only, and reflects the changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. package/README.md +8 -5
  2. package/dist/ChatWrapper.js +3 -3
  3. package/dist/ChatWrapper.js.map +1 -1
  4. package/dist/apiDocsOverrides.d.ts +1 -0
  5. package/dist/apiDocsOverrides.js +5 -0
  6. package/dist/apiDocsOverrides.js.map +1 -0
  7. package/dist/bindings/AddonTypes.d.ts +1 -0
  8. package/dist/bindings/getLlama.d.ts +5 -1
  9. package/dist/bindings/getLlama.js +11 -4
  10. package/dist/bindings/getLlama.js.map +1 -1
  11. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.d.ts +3 -0
  12. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.js +27 -0
  13. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.js.map +1 -0
  14. package/dist/chatWrappers/generic/TemplateChatWrapper.d.ts +1 -2
  15. package/dist/chatWrappers/generic/TemplateChatWrapper.js +1 -2
  16. package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -1
  17. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +2 -2
  18. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -1
  19. package/dist/cli/cli.js +5 -3
  20. package/dist/cli/cli.js.map +1 -1
  21. package/dist/cli/commands/DebugCommand.js +3 -5
  22. package/dist/cli/commands/DebugCommand.js.map +1 -1
  23. package/dist/cli/commands/DownloadCommand.d.ts +1 -1
  24. package/dist/cli/commands/DownloadCommand.js +2 -1
  25. package/dist/cli/commands/DownloadCommand.js.map +1 -1
  26. package/dist/cli/commands/PullCommand.d.ts +12 -0
  27. package/dist/cli/commands/PullCommand.js +109 -0
  28. package/dist/cli/commands/PullCommand.js.map +1 -0
  29. package/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts +1 -0
  30. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +23 -11
  31. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
  32. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +2 -1
  33. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
  34. package/dist/cli/recommendedModels.js +12 -20
  35. package/dist/cli/recommendedModels.js.map +1 -1
  36. package/dist/cli/utils/resolveCommandGgufPath.d.ts +3 -1
  37. package/dist/cli/utils/resolveCommandGgufPath.js +41 -97
  38. package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -1
  39. package/dist/cli/utils/resolveModelRecommendationFileOptions.d.ts +2 -2
  40. package/dist/cli/utils/resolveModelRecommendationFileOptions.js +1 -4
  41. package/dist/cli/utils/resolveModelRecommendationFileOptions.js.map +1 -1
  42. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +18 -2
  43. package/dist/evaluator/LlamaChat/LlamaChat.js +255 -205
  44. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  45. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +22 -3
  46. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +18 -7
  47. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  48. package/dist/evaluator/LlamaCompletion.js +1 -1
  49. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  50. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +2 -7
  51. package/dist/evaluator/LlamaContext/LlamaContext.js +12 -12
  52. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  53. package/dist/evaluator/LlamaEmbeddingContext.d.ts +2 -10
  54. package/dist/evaluator/LlamaEmbeddingContext.js +9 -23
  55. package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
  56. package/dist/evaluator/LlamaGrammar.d.ts +1 -1
  57. package/dist/evaluator/LlamaModel.d.ts +9 -0
  58. package/dist/evaluator/LlamaModel.js +2 -1
  59. package/dist/evaluator/LlamaModel.js.map +1 -1
  60. package/dist/gguf/insights/GgufInsights.js +12 -12
  61. package/dist/gguf/insights/GgufInsights.js.map +1 -1
  62. package/dist/gguf/insights/utils/resolveContextContextSizeOption.js +27 -3
  63. package/dist/gguf/insights/utils/resolveContextContextSizeOption.js.map +1 -1
  64. package/dist/gguf/parser/parseGguf.js +5 -0
  65. package/dist/gguf/parser/parseGguf.js.map +1 -1
  66. package/dist/gguf/readGgufFileInfo.d.ts +5 -2
  67. package/dist/gguf/readGgufFileInfo.js +38 -10
  68. package/dist/gguf/readGgufFileInfo.js.map +1 -1
  69. package/dist/gguf/types/GgufFileInfoTypes.d.ts +32 -0
  70. package/dist/gguf/types/GgufFileInfoTypes.js.map +1 -1
  71. package/dist/gguf/utils/getGgufMetadataArchitectureData.js +1 -1
  72. package/dist/gguf/utils/getGgufMetadataArchitectureData.js.map +1 -1
  73. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.d.ts +2 -0
  74. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js +39 -0
  75. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js.map +1 -0
  76. package/dist/gguf/utils/resolveSplitGgufParts.d.ts +7 -0
  77. package/dist/gguf/utils/resolveSplitGgufParts.js +55 -0
  78. package/dist/gguf/utils/resolveSplitGgufParts.js.map +1 -0
  79. package/dist/index.d.ts +4 -2
  80. package/dist/index.js +4 -2
  81. package/dist/index.js.map +1 -1
  82. package/dist/utils/LlamaText.d.ts +29 -20
  83. package/dist/utils/LlamaText.js +253 -243
  84. package/dist/utils/LlamaText.js.map +1 -1
  85. package/dist/utils/StopGenerationDetector.d.ts +1 -1
  86. package/dist/utils/StopGenerationDetector.js +2 -0
  87. package/dist/utils/StopGenerationDetector.js.map +1 -1
  88. package/dist/utils/createModelDownloader.d.ts +99 -0
  89. package/dist/utils/createModelDownloader.js +226 -0
  90. package/dist/utils/createModelDownloader.js.map +1 -0
  91. package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.js +18 -8
  92. package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.js.map +1 -1
  93. package/dist/utils/parseTextTemplate.d.ts +2 -2
  94. package/dist/utils/parseTextTemplate.js +2 -2
  95. package/dist/utils/runtime.d.ts +4 -0
  96. package/dist/utils/runtime.js +8 -0
  97. package/dist/utils/runtime.js.map +1 -0
  98. package/llama/addon.cpp +18 -7
  99. package/llama/binariesGithubRelease.json +1 -1
  100. package/llama/gitRelease.bundle +0 -0
  101. package/llama/grammars/README.md +1 -1
  102. package/llama/llama.cpp.info.json +1 -1
  103. package/llamaBins/linux-arm64/_nlcBuildMetadata.json +1 -1
  104. package/llamaBins/linux-arm64/llama-addon.node +0 -0
  105. package/llamaBins/linux-armv7l/_nlcBuildMetadata.json +1 -1
  106. package/llamaBins/linux-armv7l/llama-addon.node +0 -0
  107. package/llamaBins/linux-x64/_nlcBuildMetadata.json +1 -1
  108. package/llamaBins/linux-x64/llama-addon.node +0 -0
  109. package/llamaBins/linux-x64-cuda/_nlcBuildMetadata.json +1 -1
  110. package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
  111. package/llamaBins/linux-x64-vulkan/_nlcBuildMetadata.json +1 -1
  112. package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
  113. package/llamaBins/mac-arm64-metal/_nlcBuildMetadata.json +1 -1
  114. package/llamaBins/mac-arm64-metal/default.metallib +0 -0
  115. package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
  116. package/llamaBins/mac-x64/_nlcBuildMetadata.json +1 -1
  117. package/llamaBins/mac-x64/llama-addon.node +0 -0
  118. package/llamaBins/win-arm64/_nlcBuildMetadata.json +1 -1
  119. package/llamaBins/win-arm64/llama-addon.node +0 -0
  120. package/llamaBins/win-x64/_nlcBuildMetadata.json +1 -1
  121. package/llamaBins/win-x64/llama-addon.node +0 -0
  122. package/llamaBins/win-x64-cuda/_nlcBuildMetadata.json +1 -1
  123. package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
  124. package/llamaBins/win-x64-vulkan/_nlcBuildMetadata.json +1 -1
  125. package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
  126. package/package.json +8 -5
@@ -72,7 +72,7 @@ export class LlamaChat {
72
72
  get model() {
73
73
  return this.sequence.model;
74
74
  }
75
- async generateResponse(history, { onToken, signal, maxTokens, temperature, minP, topK, topP, grammar, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
75
+ async generateResponse(history, { onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, grammar, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
76
76
  const functionsEnabled = (functions != null && Object.keys(functions).length > 0);
77
77
  if (grammar != null && functionsEnabled)
78
78
  throw new Error("Using both grammar and functions is not supported yet");
@@ -116,6 +116,7 @@ export class LlamaChat {
116
116
  : undefined;
117
117
  const streamRegulator = new TokenStreamRegulator();
118
118
  const stopGenerationDetector = new StopGenerationDetector();
119
+ const customStopGenerationTriggersDetector = new StopGenerationDetector();
119
120
  const functionSyntaxStartDetector = new StopGenerationDetector();
120
121
  const functionSyntaxEndDetector = new StopGenerationDetector();
121
122
  const disengageInitiallyEngagedFunctionMode = new StopGenerationDetector();
@@ -129,7 +130,7 @@ export class LlamaChat {
129
130
  let lastContextWindowHistory = resolvedHistory;
130
131
  let lastHistoryCompressionMetadata = resolvedContextShift.lastEvaluationMetadata;
131
132
  const ensureNotAborted = () => {
132
- if (signal?.aborted)
133
+ if (signal?.aborted && (!stopOnAbortSignal || res.length === 0))
133
134
  throw signal.reason;
134
135
  if (this._sequence == null)
135
136
  throw new DisposedError();
@@ -200,6 +201,9 @@ export class LlamaChat {
200
201
  }
201
202
  }
202
203
  };
204
+ if (customStopTriggers != null)
205
+ StopGenerationDetector.resolveStopTriggers(customStopTriggers, model.tokenizer)
206
+ .map((stopTrigger) => customStopGenerationTriggersDetector.addStopTrigger(stopTrigger));
203
207
  if (grammar != null)
204
208
  StopGenerationDetector.resolveStopTriggers(grammar.stopGenerationTriggers, model.tokenizer)
205
209
  .map((stopTrigger) => stopGenerationDetector.addStopTrigger(stopTrigger));
@@ -214,7 +218,7 @@ export class LlamaChat {
214
218
  resolvedHistory: getResolvedHistoryWithCurrentModelResponse(),
215
219
  resolvedContextShift,
216
220
  lastHistoryCompressionMetadata,
217
- pendingTokensCount: pendingTokens.length + queuedChunkTokens.length,
221
+ pendingTokensCount: ignoredStartTextTokens.length + pendingTokens.length + queuedChunkTokens.length,
218
222
  isFirstEvaluation,
219
223
  chatWrapper: this._chatWrapper,
220
224
  lastEvaluationContextWindowHistory,
@@ -281,225 +285,271 @@ export class LlamaChat {
281
285
  evaluationPriority,
282
286
  yieldEogToken: true
283
287
  }));
284
- let currentIteration = await evaluationIterator.next();
285
- while (currentIteration.done !== true) {
286
- const token = currentIteration.value;
287
- let replacementToken = undefined;
288
- ensureNotAborted();
289
- generatedTokens++;
290
- const tokens = [token];
291
- const text = model.detokenize([token]);
292
- const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
293
- if (initiallyEngagedFunctionMode)
294
- disengageInitiallyEngagedFunctionMode.recordGeneration({ text, tokens, startNewChecks: generatedTokens === 1 });
295
- if (text === UNKNOWN_UNICODE_CHAR || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "")) {
296
- locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
297
- }
298
- else {
299
- while (locksToReleaseOnValidGeneration.length > 0)
300
- locksToReleaseOnValidGeneration.shift().dispose();
301
- }
302
- functionSyntaxStartDetector.recordGeneration({ text, tokens, queuedTokenRelease });
303
- if (initiallyEngagedFunctionMode && disengageInitiallyEngagedFunctionMode.hasTriggeredStops) {
304
- initiallyEngagedFunctionMode = false;
305
- let shouldStopFunctionEvaluationMode = !functionSyntaxStartDetector.hasTriggeredStops;
306
- if (!shouldStopFunctionEvaluationMode && functionsEnabled && functionsGrammar != null) {
307
- const functionCallText = model.detokenize([...functionCallTokens, ...tokens]);
308
- try {
309
- const functionName = functionsGrammar.parseFunctionNameFromPartialCall(functionCallText, {
310
- enableInternalBuiltinFunctions: true,
311
- initialFunctionCallEngaged: true
312
- });
313
- const internalBuiltinFunctions = this._chatWrapper.getInternalBuiltinFunctions({ initialFunctionCallEngaged: true });
314
- if (internalBuiltinFunctions[functionName] != null) {
315
- shouldStopFunctionEvaluationMode = true;
316
- }
317
- }
318
- catch (err) {
319
- if (!(err instanceof LlamaFunctionCallValidationError))
320
- throw err;
321
- }
288
+ try {
289
+ let currentIteration = await evaluationIterator.next();
290
+ while (currentIteration.done !== true) {
291
+ const token = currentIteration.value;
292
+ let replacementToken = undefined;
293
+ ensureNotAborted();
294
+ generatedTokens++;
295
+ const tokens = [token];
296
+ const text = model.detokenize([token]);
297
+ const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
298
+ if (initiallyEngagedFunctionMode)
299
+ disengageInitiallyEngagedFunctionMode.recordGeneration({ text, tokens, startNewChecks: generatedTokens === 1 });
300
+ if (text === UNKNOWN_UNICODE_CHAR || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "")) {
301
+ locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
322
302
  }
323
- if (shouldStopFunctionEvaluationMode) {
324
- inFunctionEvaluationMode = false;
325
- functionsGrammar = new FunctionCallGrammar(model._llama, functions, this._chatWrapper, false);
326
- functionsEvaluationState = new LlamaGrammarEvaluationState({
327
- grammar: functionsGrammar
328
- });
329
- functionCallTokens.length = 0;
330
- while (functionCallTokenSyntaxLocks.length > 0)
331
- functionCallTokenSyntaxLocks.shift().dispose();
332
- functionSyntaxStartDetector.clearInProgressStops();
333
- functionSyntaxStartDetector.clearTriggeredStops();
334
- functionSyntaxEndDetector.clearInProgressStops();
335
- functionSyntaxEndDetector.clearTriggeredStops();
303
+ else {
304
+ while (locksToReleaseOnValidGeneration.length > 0)
305
+ locksToReleaseOnValidGeneration.shift().dispose();
336
306
  }
337
- }
338
- if (!inFunctionEvaluationMode && functionsEnabled && functionsGrammar != null &&
339
- functionSyntaxStartDetector.hasTriggeredStops && functionsEvaluationState != null) {
340
- inFunctionEvaluationMode = true;
341
- functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
342
- stopGenerationDetector.clearTriggeredStops();
343
- stopGenerationDetector.clearInProgressStops();
344
- pendingTokens.push(...streamRegulator.popFreeChunkTokens());
345
- const triggeredStops = functionSyntaxStartDetector.getTriggeredStops();
346
- const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
347
- const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
348
- pendingTokens.push(...queuedTokensBeforeStopTrigger);
349
- const [firstRemainingGenerationAfterStop] = triggeredStops
350
- .map((stopTrigger) => stopTrigger.remainingGenerations)
351
- .filter((remainingGenerations) => remainingGenerations.length > 0)
352
- .flat(1);
353
- const remainingTextAfterStop = (firstRemainingGenerationAfterStop == null || firstRemainingGenerationAfterStop.length === 0)
354
- ? ""
355
- : typeof firstRemainingGenerationAfterStop === "string"
356
- ? firstRemainingGenerationAfterStop
357
- : model.detokenize(firstRemainingGenerationAfterStop);
358
- functionCallTokens.push(...model.tokenize(this._chatWrapper.settings.functions.call.prefix, false, "trimLeadingSpace"));
359
- for (const functionCallToken of functionCallTokens)
360
- context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, functionCallToken);
361
- // these tokens have to be verified that they match the function calling syntax grammar before they can be accepted,
362
- // or the context state should be modified to not include the incompatible tokens
363
- const remainingTextTokens = model.tokenize(remainingTextAfterStop, false, "trimLeadingSpace");
364
- let unfitTokens = [];
365
- for (let i = 0; i < remainingTextTokens.length; i++) {
366
- const remainingToken = remainingTextTokens[i];
367
- const canBeNextToken = context._canBeNextTokenForGrammarEvaluationState(functionsEvaluationState, remainingToken);
368
- if (!canBeNextToken) {
369
- unfitTokens = remainingTextTokens.slice(i);
370
- break;
307
+ functionSyntaxStartDetector.recordGeneration({ text, tokens, queuedTokenRelease });
308
+ if (initiallyEngagedFunctionMode && disengageInitiallyEngagedFunctionMode.hasTriggeredStops) {
309
+ initiallyEngagedFunctionMode = false;
310
+ let shouldStopFunctionEvaluationMode = !functionSyntaxStartDetector.hasTriggeredStops;
311
+ if (!shouldStopFunctionEvaluationMode && functionsEnabled && functionsGrammar != null) {
312
+ const functionCallText = model.detokenize([...functionCallTokens, ...tokens]);
313
+ try {
314
+ const functionName = functionsGrammar.parseFunctionNameFromPartialCall(functionCallText, {
315
+ enableInternalBuiltinFunctions: true,
316
+ initialFunctionCallEngaged: true
317
+ });
318
+ const internalBuiltinFunctions = this._chatWrapper.getInternalBuiltinFunctions({ initialFunctionCallEngaged: true });
319
+ if (internalBuiltinFunctions[functionName] != null) {
320
+ shouldStopFunctionEvaluationMode = true;
321
+ }
322
+ }
323
+ catch (err) {
324
+ if (!(err instanceof LlamaFunctionCallValidationError))
325
+ throw err;
326
+ }
371
327
  }
372
- context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, remainingToken);
373
- functionCallTokens.push(remainingToken);
374
- }
375
- if (unfitTokens.length > 0) {
376
- const unfitTokensText = model.detokenize(unfitTokens); // the current token text must end with it
377
- const currentTokenText = queuedTokenRelease.text;
378
- let replacementTokens;
379
- if (!currentTokenText.endsWith(unfitTokensText)) {
380
- console.warn(getConsoleLogPrefix() + "The current token text does not end with the unfit function call syntax tokens text");
381
- replacementTokens = remainingTextTokens.slice(0, -unfitTokens.length);
328
+ if (shouldStopFunctionEvaluationMode) {
329
+ inFunctionEvaluationMode = false;
330
+ functionsGrammar = new FunctionCallGrammar(model._llama, functions, this._chatWrapper, false);
331
+ functionsEvaluationState = new LlamaGrammarEvaluationState({
332
+ grammar: functionsGrammar
333
+ });
334
+ functionCallTokens.length = 0;
335
+ while (functionCallTokenSyntaxLocks.length > 0)
336
+ functionCallTokenSyntaxLocks.shift().dispose();
337
+ functionSyntaxStartDetector.clearInProgressStops();
338
+ functionSyntaxStartDetector.clearTriggeredStops();
339
+ functionSyntaxEndDetector.clearInProgressStops();
340
+ functionSyntaxEndDetector.clearTriggeredStops();
382
341
  }
383
- else {
384
- const newCurrentTokensText = currentTokenText.slice(0, -unfitTokensText.length);
385
- replacementTokens = model.tokenize(newCurrentTokensText, false, "trimLeadingSpace");
342
+ }
343
+ if (!inFunctionEvaluationMode && functionsEnabled && functionsGrammar != null &&
344
+ functionSyntaxStartDetector.hasTriggeredStops && functionsEvaluationState != null) {
345
+ inFunctionEvaluationMode = true;
346
+ functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
347
+ stopGenerationDetector.clearTriggeredStops();
348
+ stopGenerationDetector.clearInProgressStops();
349
+ customStopGenerationTriggersDetector.clearTriggeredStops();
350
+ customStopGenerationTriggersDetector.clearInProgressStops();
351
+ pendingTokens.push(...streamRegulator.popFreeChunkTokens());
352
+ const triggeredStops = functionSyntaxStartDetector.getTriggeredStops();
353
+ const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
354
+ const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
355
+ pendingTokens.push(...queuedTokensBeforeStopTrigger);
356
+ const [firstRemainingGenerationAfterStop] = triggeredStops
357
+ .map((stopTrigger) => stopTrigger.remainingGenerations)
358
+ .filter((remainingGenerations) => remainingGenerations.length > 0)
359
+ .flat(1);
360
+ const remainingTextAfterStop = (firstRemainingGenerationAfterStop == null || firstRemainingGenerationAfterStop.length === 0)
361
+ ? ""
362
+ : typeof firstRemainingGenerationAfterStop === "string"
363
+ ? firstRemainingGenerationAfterStop
364
+ : model.detokenize(firstRemainingGenerationAfterStop);
365
+ functionCallTokens.push(...model.tokenize(this._chatWrapper.settings.functions.call.prefix, false, "trimLeadingSpace"));
366
+ for (const functionCallToken of functionCallTokens)
367
+ context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, functionCallToken);
368
+ // these tokens have to be verified that they match the function calling syntax grammar before they can be accepted,
369
+ // or the context state should be modified to not include the incompatible tokens
370
+ const remainingTextTokens = model.tokenize(remainingTextAfterStop, false, "trimLeadingSpace");
371
+ let unfitTokens = [];
372
+ for (let i = 0; i < remainingTextTokens.length; i++) {
373
+ const remainingToken = remainingTextTokens[i];
374
+ const canBeNextToken = context._canBeNextTokenForGrammarEvaluationState(functionsEvaluationState, remainingToken);
375
+ if (!canBeNextToken) {
376
+ unfitTokens = remainingTextTokens.slice(i);
377
+ break;
378
+ }
379
+ context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, remainingToken);
380
+ functionCallTokens.push(remainingToken);
386
381
  }
387
- if (replacementTokens.length > 0) {
388
- replacementToken = replacementTokens[0];
389
- queuedTokenRelease.modifyTokensAndText(replacementTokens, model.detokenize([replacementToken]));
382
+ if (unfitTokens.length > 0) {
383
+ const unfitTokensText = model.detokenize(unfitTokens); // the current token text must end with it
384
+ const currentTokenText = queuedTokenRelease.text;
385
+ let replacementTokens;
386
+ if (!currentTokenText.endsWith(unfitTokensText)) {
387
+ console.warn(getConsoleLogPrefix() + "The current token text does not end with the unfit function call syntax tokens text");
388
+ replacementTokens = remainingTextTokens.slice(0, -unfitTokens.length);
389
+ }
390
+ else {
391
+ const newCurrentTokensText = currentTokenText.slice(0, -unfitTokensText.length);
392
+ replacementTokens = model.tokenize(newCurrentTokensText, false, "trimLeadingSpace");
393
+ }
394
+ if (replacementTokens.length > 0) {
395
+ replacementToken = replacementTokens[0];
396
+ queuedTokenRelease.modifyTokensAndText(replacementTokens, model.detokenize([replacementToken]));
397
+ }
390
398
  }
391
399
  }
392
- }
393
- else if (inFunctionEvaluationMode) {
394
- functionCallTokens.push(...tokens);
395
- functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
396
- functionSyntaxEndDetector.recordGeneration({ text, tokens, queuedTokenRelease });
397
- }
398
- if (inFunctionEvaluationMode && functionSyntaxEndDetector.hasTriggeredStops && functionsGrammar != null) {
399
- const functionCallText = model.detokenize(functionCallTokens);
400
- const functionCall = functionsGrammar.parseFunctionCall(functionCallText);
401
- let modelResponse = model.detokenize(res);
402
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
403
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
404
- modelResponse = modelResponse.trimEnd();
405
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
400
+ else if (inFunctionEvaluationMode) {
401
+ functionCallTokens.push(...tokens);
402
+ functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
403
+ functionSyntaxEndDetector.recordGeneration({ text, tokens, queuedTokenRelease });
406
404
  }
407
- return {
408
- response: modelResponse,
409
- lastEvaluation: {
410
- contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
411
- cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
412
- contextShiftMetadata: lastHistoryCompressionMetadata
413
- },
414
- // prevent infinite TS type instantiation
415
- functionCall: functionCall,
416
- metadata: {
417
- stopReason: "functionCall"
405
+ if (inFunctionEvaluationMode && functionSyntaxEndDetector.hasTriggeredStops && functionsGrammar != null) {
406
+ const functionCallText = model.detokenize(functionCallTokens);
407
+ const functionCall = functionsGrammar.parseFunctionCall(functionCallText);
408
+ let modelResponse = model.detokenize(res);
409
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
410
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
411
+ modelResponse = modelResponse.trimEnd();
412
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
418
413
  }
419
- };
420
- }
421
- if (!inFunctionEvaluationMode)
422
- stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
423
- pendingTokens.push(...streamRegulator.popFreeChunkTokens());
424
- removeFoundStartIgnoreTextsFromPendingTokens();
425
- if (stopGenerationDetector.hasTriggeredStops || model.isEogToken(token)) {
426
- const triggeredStops = stopGenerationDetector.getTriggeredStops();
427
- const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
428
- const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
429
- pendingTokens.push(...queuedTokensBeforeStopTrigger);
430
- const [firstRemainingGenerationAfterStop] = triggeredStops
431
- .map((stopTrigger) => stopTrigger.remainingGenerations)
432
- .filter((remainingGenerations) => remainingGenerations.length > 0)
433
- .flat(1);
434
- removeFoundStartIgnoreTextsFromPendingTokens();
435
- if (pendingTokens.length > 0)
436
- onToken?.(pendingTokens.slice());
437
- res.push(...pendingTokens);
438
- contextWindowsRes.push(...pendingTokens);
439
- pendingTokens.length = 0;
440
- let modelResponse = model.detokenize(res);
441
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
442
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
443
- modelResponse = modelResponse.trimEnd();
444
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
414
+ return {
415
+ response: modelResponse,
416
+ lastEvaluation: {
417
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
418
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
419
+ contextShiftMetadata: lastHistoryCompressionMetadata
420
+ },
421
+ // prevent infinite TS type instantiation
422
+ functionCall: functionCall,
423
+ metadata: {
424
+ stopReason: "functionCall"
425
+ }
426
+ };
445
427
  }
446
- return {
447
- response: modelResponse,
448
- lastEvaluation: {
449
- contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
450
- cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
451
- contextShiftMetadata: lastHistoryCompressionMetadata
452
- },
453
- metadata: {
454
- remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
455
- stopReason: model.isEogToken(token)
456
- ? "eogToken"
457
- : "stopGenerationTrigger"
458
- }
459
- };
460
- }
461
- const maxTokensTriggered = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
462
- if (res.length === 0) {
463
- ignoreStartTextDetector.clearInProgressStops();
464
- ignoreStartTextDetector.clearTriggeredStops();
465
- ignoreStartTextDetector.recordGeneration({
466
- text: model.detokenize(pendingTokens),
467
- tokens: pendingTokens
468
- });
469
- }
470
- if (pendingTokens.length > 0 && (maxTokensTriggered || !ignoreStartTextDetector.hasInProgressStops)) {
428
+ if (!inFunctionEvaluationMode) {
429
+ stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
430
+ customStopGenerationTriggersDetector.recordGeneration({ text, tokens, queuedTokenRelease });
431
+ }
432
+ pendingTokens.push(...streamRegulator.popFreeChunkTokens());
471
433
  removeFoundStartIgnoreTextsFromPendingTokens();
472
- if (pendingTokens.length > 0) {
473
- onToken?.(pendingTokens.slice());
434
+ if (stopGenerationDetector.hasTriggeredStops || customStopGenerationTriggersDetector.hasTriggeredStops ||
435
+ model.isEogToken(token)) {
436
+ const triggeredStops = stopGenerationDetector.hasTriggeredStops
437
+ ? stopGenerationDetector.getTriggeredStops()
438
+ : customStopGenerationTriggersDetector.getTriggeredStops();
439
+ const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
440
+ const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
441
+ pendingTokens.push(...queuedTokensBeforeStopTrigger);
442
+ const [firstRemainingGenerationAfterStop] = triggeredStops
443
+ .map((stopTrigger) => stopTrigger.remainingGenerations)
444
+ .filter((remainingGenerations) => remainingGenerations.length > 0)
445
+ .flat(1);
446
+ removeFoundStartIgnoreTextsFromPendingTokens();
447
+ if (pendingTokens.length > 0)
448
+ onToken?.(pendingTokens.slice());
474
449
  res.push(...pendingTokens);
475
450
  contextWindowsRes.push(...pendingTokens);
476
451
  pendingTokens.length = 0;
477
- }
478
- }
479
- if (maxTokensTriggered) {
480
- let modelResponse = model.detokenize(res);
481
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
482
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
483
- modelResponse = modelResponse.trimEnd();
484
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
485
- }
486
- return {
487
- response: modelResponse,
488
- lastEvaluation: {
452
+ let modelResponse = model.detokenize(res);
453
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
454
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
455
+ modelResponse = modelResponse.trimEnd();
456
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
457
+ }
458
+ const lastEvaluation = {
489
459
  contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
490
460
  cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
491
461
  contextShiftMetadata: lastHistoryCompressionMetadata
492
- },
493
- metadata: {
494
- stopReason: "maxTokens"
462
+ };
463
+ const isEogToken = model.isEogToken(token);
464
+ if (isEogToken || stopGenerationDetector.hasTriggeredStops) {
465
+ return {
466
+ response: modelResponse,
467
+ lastEvaluation,
468
+ metadata: {
469
+ remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
470
+ stopReason: isEogToken
471
+ ? "eogToken"
472
+ : "stopGenerationTrigger"
473
+ }
474
+ };
495
475
  }
496
- };
497
- }
498
- if (this._sequence.nextTokenIndex >= context.contextSize) {
499
- shouldContextShift = true;
500
- break;
476
+ return {
477
+ response: modelResponse,
478
+ lastEvaluation,
479
+ metadata: {
480
+ remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
481
+ stopReason: "customStopTrigger",
482
+ customStopTrigger: triggeredStops[0].stopTrigger
483
+ }
484
+ };
485
+ }
486
+ const maxTokensTriggered = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
487
+ if (res.length === 0) {
488
+ ignoreStartTextDetector.clearInProgressStops();
489
+ ignoreStartTextDetector.clearTriggeredStops();
490
+ ignoreStartTextDetector.recordGeneration({
491
+ text: model.detokenize(pendingTokens),
492
+ tokens: pendingTokens
493
+ });
494
+ }
495
+ if (pendingTokens.length > 0 && (maxTokensTriggered || !ignoreStartTextDetector.hasInProgressStops)) {
496
+ removeFoundStartIgnoreTextsFromPendingTokens();
497
+ if (pendingTokens.length > 0) {
498
+ onToken?.(pendingTokens.slice());
499
+ res.push(...pendingTokens);
500
+ contextWindowsRes.push(...pendingTokens);
501
+ pendingTokens.length = 0;
502
+ }
503
+ }
504
+ if (maxTokensTriggered) {
505
+ let modelResponse = model.detokenize(res);
506
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
507
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
508
+ modelResponse = modelResponse.trimEnd();
509
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
510
+ }
511
+ return {
512
+ response: modelResponse,
513
+ lastEvaluation: {
514
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
515
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
516
+ contextShiftMetadata: lastHistoryCompressionMetadata
517
+ },
518
+ metadata: {
519
+ stopReason: "maxTokens"
520
+ }
521
+ };
522
+ }
523
+ if (this._sequence.nextTokenIndex >= context.contextSize - 1) {
524
+ shouldContextShift = true;
525
+ break;
526
+ }
527
+ if (signal?.aborted && stopOnAbortSignal) {
528
+ if (res.length === 0)
529
+ throw signal.reason;
530
+ let modelResponse = model.detokenize(res);
531
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
532
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
533
+ modelResponse = modelResponse.trimEnd();
534
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
535
+ }
536
+ return {
537
+ response: modelResponse,
538
+ lastEvaluation: {
539
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
540
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
541
+ contextShiftMetadata: lastHistoryCompressionMetadata
542
+ },
543
+ metadata: {
544
+ stopReason: "abort"
545
+ }
546
+ };
547
+ }
548
+ currentIteration = await evaluationIterator.next(replacementToken);
501
549
  }
502
- currentIteration = await evaluationIterator.next(replacementToken);
550
+ }
551
+ finally {
552
+ await evaluationIterator.return();
503
553
  }
504
554
  isFirstEvaluation = false;
505
555
  if (shouldContextShift)
@@ -654,7 +704,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
654
704
  : resolvedContextShift.size;
655
705
  const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
656
706
  history: resolvedHistory,
657
- contextShiftSize: Math.max(contextShiftSize, minFreeContextTokens) + pendingTokensCount,
707
+ contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
658
708
  contextShiftStrategy: resolvedContextShift.strategy,
659
709
  contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
660
710
  contextSize: context.contextSize,
@@ -701,7 +751,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
701
751
  : resolvedContextShift.size)));
702
752
  const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
703
753
  history: resolvedHistory,
704
- contextShiftSize: Math.max(contextShiftSize, minFreeContextTokens) + pendingTokensCount,
754
+ contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
705
755
  contextShiftStrategy: resolvedContextShift.strategy,
706
756
  contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
707
757
  contextSize: context.contextSize,