npm - node-llama-cpp - Versions diffs - 3.11.0 → 3.12.1 - Mend

node-llama-cpp 3.11.0 → 3.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

package/README.md +1 -1
package/dist/bindings/Llama.d.ts +5 -1
package/dist/bindings/Llama.js +11 -1
package/dist/bindings/Llama.js.map +1 -1
package/dist/bindings/types.d.ts +5 -2
package/dist/bindings/types.js +16 -1
package/dist/bindings/types.js.map +1 -1
package/dist/chatWrappers/HarmonyChatWrapper.d.ts +78 -0
package/dist/chatWrappers/HarmonyChatWrapper.js +539 -0
package/dist/chatWrappers/HarmonyChatWrapper.js.map +1 -0
package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +8 -2
package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -1
package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +4 -2
package/dist/chatWrappers/utils/resolveChatWrapper.js +21 -6
package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -1
package/dist/cli/commands/ChatCommand.d.ts +2 -1
package/dist/cli/commands/ChatCommand.js +21 -7
package/dist/cli/commands/ChatCommand.js.map +1 -1
package/dist/cli/commands/CompleteCommand.d.ts +2 -1
package/dist/cli/commands/CompleteCommand.js +21 -7
package/dist/cli/commands/CompleteCommand.js.map +1 -1
package/dist/cli/commands/InfillCommand.d.ts +2 -1
package/dist/cli/commands/InfillCommand.js +21 -7
package/dist/cli/commands/InfillCommand.js.map +1 -1
package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +1 -0
package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +16 -5
package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
package/dist/cli/recommendedModels.js +22 -0
package/dist/cli/recommendedModels.js.map +1 -1
package/dist/evaluator/LlamaChat/LlamaChat.d.ts +14 -0
package/dist/evaluator/LlamaChat/LlamaChat.js +369 -48
package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +52 -2
package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +162 -47
package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.d.ts +1 -0
package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js.map +1 -1
package/dist/gguf/insights/GgufInsights.js +22 -3
package/dist/gguf/insights/GgufInsights.js.map +1 -1
package/dist/gguf/types/GgufMetadataTypes.d.ts +19 -2
package/dist/gguf/types/GgufMetadataTypes.js +17 -0
package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
package/dist/gguf/types/GgufTensorInfoTypes.d.ts +2 -1
package/dist/gguf/types/GgufTensorInfoTypes.js +1 -0
package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -1
package/dist/gguf/utils/getGgufFileTypeName.d.ts +1 -1
package/dist/gguf/utils/ggufQuantNames.js +1 -0
package/dist/gguf/utils/ggufQuantNames.js.map +1 -1
package/dist/index.d.ts +3 -2
package/dist/index.js +2 -1
package/dist/index.js.map +1 -1
package/dist/tsconfig.tsbuildinfo +1 -1
package/dist/types.d.ts +150 -3
package/dist/types.js +2 -1
package/dist/types.js.map +1 -1
package/dist/utils/gbnfJson/types.d.ts +1 -1
package/dist/utils/gbnfJson/types.js.map +1 -1
package/dist/utils/getChatWrapperSegmentDefinition.js +2 -0
package/dist/utils/getChatWrapperSegmentDefinition.js.map +1 -1
package/llama/binariesGithubRelease.json +1 -1
package/llama/gitRelease.bundle +0 -0
package/llama/llama.cpp.info.json +1 -1
package/package.json +18 -16
package/templates/packed/electron-typescript-react.json +1 -1

package/dist/evaluator/LlamaChat/LlamaChat.js CHANGED

@@ -80,7 +80,7 @@ export class LlamaChat {
         return this.sequence.model;
     }
     async generateResponse(history, options = {}) {
-        const { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = options;
+        const { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, abortOnNonText = false, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = options;
         this.sequence.tokenPredictor?.updateInputTokens?.(this.model.tokenize(findLastUserMessageInChatHistory(history)?.text ?? ""));
         const generateResponseState = new GenerateResponseState(this, this._chatWrapper, history, {
             onTextChunk,
@@ -107,12 +107,13 @@ export class LlamaChat {
             maxParallelFunctionCalls,
             contextShift,
             customStopTriggers,
+            abortOnNonText,
             lastEvaluationContextWindow: {
                 history: lastEvaluationContextWindowHistory,
                 minimumOverlapPercentageToPreventContextShift
             }
         });
-        if (generateResponseState.grammar != null && generateResponseState.functionsEnabled)
+        if (generateResponseState.grammar != null && generateResponseState.functionsEnabled && !abortOnNonText)
             throw new Error("Using both grammar and functions is not supported yet");
         return await withLock([this._chatLock, "evaluate"], signal, async () => {
             try {
@@ -122,11 +123,13 @@ export class LlamaChat {
                     await generateResponseState.loadContextWindow(generateResponseState.getResolvedHistoryWithCurrentModelResponse(), generateResponseState.getContextWindowsHistoryWithCurrentModelResponse(), false, avoidReloadingHistory);
                 };
                 const loadContextWindowForFunctionCallingLoop = async () => loadContextWindow(true);
-                const loadContextWindowForBudgetTriggers = async () => loadContextWindow(false);
                 while (true) {
                     generateResponseState.startTokenLoop();
+                    generateResponseState.handleRerender();
+                    const shouldHandlePrefixTriggers = generateResponseState.isRerender;
                     generateResponseState.canAvoidReloadingHistory = false;
                     await loadContextWindow();
+                    generateResponseState.isRerender = false;
                     generateResponseState.addStopGenerationTriggersFromChatWrapper();
                     if (generateResponseState.generatedTokens === 0) {
                         generateResponseState.addIgnoreStartTextTriggersFromChatWrapper();
@@ -134,7 +137,15 @@ export class LlamaChat {
                             generateResponseState.initFunctions();
                         }
                     }
-                    if (generateResponseState.functionEvaluationMode !== false) {
+                    const abortRes = generateResponseState.handleAbortTrigger("model");
+                    if (abortRes != null)
+                        return abortRes;
+                    if (shouldHandlePrefixTriggers) {
+                        const handlePrefixTriggersRes = await generateResponseState.handlePrefixTriggers(loadContextWindowForFunctionCallingLoop);
+                        if (handlePrefixTriggersRes != null)
+                            return handlePrefixTriggersRes;
+                    }
+                    if (generateResponseState.functionEvaluationMode !== false && !generateResponseState.abortOnNonText) {
                         const functionsCallsRes = await generateResponseState.enterFunctionCallingLoop(loadContextWindowForFunctionCallingLoop);
                         if (functionsCallsRes != null)
                             return functionsCallsRes;
@@ -165,21 +176,21 @@ export class LlamaChat {
                         const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger("model");
                         if (maxTokensTriggerRes != null)
                             return maxTokensTriggerRes;
-                        if (generateResponseState.updateShouldContextShift())
+                        if (generateResponseState.handleShouldRerender() || generateResponseState.updateShouldContextShift())
                             break;
                         if (await generateResponseState.handleBudgetTriggers()) {
-                            await loadContextWindowForBudgetTriggers();
-                            await generateResponseState.alignCurrentSequenceStateWithCurrentTokens();
-                            await generateResponseState.createNewEvaluationIterator();
+                            generateResponseState.shouldRerender = true;
+                            generateResponseState.skipClosingResponseItemOnRerender = true;
+                            break;
                         }
-                        if (generateResponseState.updateShouldContextShift())
+                        if (generateResponseState.handleShouldRerender() || generateResponseState.updateShouldContextShift())
                             break;
                         const abortRes = generateResponseState.handleAbortTrigger("model");
                         if (abortRes != null)
                             return abortRes;
                     }
                     generateResponseState.isFirstEvaluation = false;
-                    if (generateResponseState.shouldContextShift)
+                    if (generateResponseState.shouldRerender || generateResponseState.shouldContextShift)
                         continue;
                     break;
                 }
@@ -236,10 +247,12 @@ export class LlamaChat {
                 while (true) {
                     generateResponseState.startTokenLoop();
                     const { userTextSuffix } = await generateResponseState.loadContextWindow(mergeGeneratedResultWithChatHistory("user", generateResponseState.resolvedHistory, generateResponseState.segmentHandler.getModelResponseSegments()), mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()), true);
+                    generateResponseState.isRerender = false;
                     generateResponseState.functionEvaluationMode = false;
                     generateResponseState.addStopGenerationTriggersFromChatWrapper();
                     if (userTextSuffix != null && userTextSuffix.values.length > 0)
                         generateResponseState.stopGenerationDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(userTextSuffix, this.model.tokenizer));
+                    generateResponseState.rerenderTriggers.forEach((trigger) => (generateResponseState.stopGenerationDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(trigger, this.model.tokenizer))));
                     allSegmentTypes
                         .map((segmentType) => getChatWrapperSegmentDefinition(this._chatWrapper.settings, segmentType))
                         .filter((segmentDefinition) => segmentDefinition != null)
@@ -545,13 +558,13 @@ function generateContextTextThatEndsWithUserText(chatWrapper, options) {
         `There might be an issue with the chat wrapper "${chatWrapper.wrapperName}" ` +
         "where not all user messages are properly added to the the result LlamaText");
 }
-async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHistoryCompressionMetadata, pendingTokensCount = 0, isFirstEvaluation, chatWrapper, lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift, sequence, minFreeContextTokens = 1, functions, documentFunctionParams, endWithUserText }) {
+async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHistoryCompressionMetadata, pendingTokensCount = 0, isFirstEvaluation, isRerender, chatWrapper, lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift, sequence, minFreeContextTokens = 1, functions, documentFunctionParams, endWithUserText }) {
     if (sequence == null)
         throw new DisposedError();
     const model = sequence.model;
     const context = sequence.context;
     let removeRawFromHistory = false;
-    if (isFirstEvaluation && lastEvaluationContextWindowHistory != null && sequence.isLoadedToMemory) {
+    if ((isFirstEvaluation || isRerender) && lastEvaluationContextWindowHistory != null && sequence.isLoadedToMemory) {
         const newContextWindow = lastEvaluationContextWindowHistory.slice();
         if (endWithUserText) {
             if (newContextWindow.length === 0 || newContextWindow[newContextWindow.length - 1].type !== "user")
@@ -565,7 +578,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
                 type: "model",
                 response: []
             });
-        const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
+        const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix, prefixTriggers, noPrefixTrigger, rerender, detectFunctionCalls } = generateContextText(endWithUserText, chatWrapper, {
             chatHistory: newContextWindow,
             availableFunctions: functions,
             documentFunctionParams
@@ -574,7 +587,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
         if (tokens.length + pendingTokensCount + minFreeContextTokens < context.contextSize) {
             const { firstDifferentIndex } = sequence.compareContextTokens(tokens);
             const existingEvaluationPercentage = firstDifferentIndex / tokens.length;
-            if (existingEvaluationPercentage >= minimumOverlapPercentageToPreventContextShift)
+            if (isRerender || existingEvaluationPercentage >= minimumOverlapPercentageToPreventContextShift)
                 return {
                     history: newContextWindow,
                     stopGenerationTriggers,
@@ -584,7 +597,11 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
                     ignoreStartText: ignoreStartText ?? [],
                     functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
                     disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
-                    userTextSuffix
+                    userTextSuffix,
+                    prefixTriggers,
+                    noPrefixTrigger,
+                    rerender,
+                    detectFunctionCalls
                 };
         }
     }
@@ -607,7 +624,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
             functions,
             documentFunctionParams
         });
-        const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
+        const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix, prefixTriggers, noPrefixTrigger, rerender, detectFunctionCalls } = generateContextText(endWithUserText, chatWrapper, {
             chatHistory: compressedHistory,
             availableFunctions: functions,
             documentFunctionParams
@@ -621,11 +638,15 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
             ignoreStartText: ignoreStartText ?? [],
             functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
             disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
-            userTextSuffix
+            userTextSuffix,
+            prefixTriggers,
+            noPrefixTrigger,
+            rerender,
+            detectFunctionCalls
         };
     }
     {
-        const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
+        const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix, prefixTriggers, noPrefixTrigger, rerender, detectFunctionCalls } = generateContextText(endWithUserText, chatWrapper, {
             chatHistory: resolvedHistory,
             availableFunctions: functions,
             documentFunctionParams
@@ -641,7 +662,11 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
                 ignoreStartText: ignoreStartText ?? [],
                 functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
                 disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
-                userTextSuffix
+                userTextSuffix,
+                prefixTriggers,
+                noPrefixTrigger,
+                rerender,
+                detectFunctionCalls
             };
     }
     const contextShiftSize = Math.min(context.contextSize, Math.max(1, Math.floor(resolvedContextShift.size instanceof Function
@@ -658,7 +683,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
         functions,
         documentFunctionParams
     });
-    const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
+    const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix, prefixTriggers, noPrefixTrigger, rerender, detectFunctionCalls } = generateContextText(endWithUserText, chatWrapper, {
         chatHistory: compressedHistory,
         availableFunctions: functions,
         documentFunctionParams
@@ -672,7 +697,11 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
         ignoreStartText: ignoreStartText ?? [],
         functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
         disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
-        userTextSuffix
+        userTextSuffix,
+        prefixTriggers,
+        noPrefixTrigger,
+        rerender,
+        detectFunctionCalls
     };
 }
 class GenerateResponseState {
@@ -702,6 +731,7 @@ class GenerateResponseState {
     maxParallelFunctionCalls;
     contextShift;
     customStopTriggers;
+    abortOnNonText;
     minimumOverlapPercentageToPreventContextShift;
     functionsEnabled;
     repeatPenaltyEnabled;
@@ -711,6 +741,7 @@ class GenerateResponseState {
     functionNameGrammar;
     functionsGrammar;
     functionsEvaluationState;
+    functionSyntaxStartDetectorEnabled = true;
     streamRegulator = new TokenStreamRegulator();
     stopGenerationDetector = new StopGenerationDetector();
     customStopGenerationTriggersDetector = new StopGenerationDetector();
@@ -723,6 +754,7 @@ class GenerateResponseState {
     res = [];
     pendingTokens = [];
     ignoredStartTextTokens = [];
+    prefixTriggerTokens = [];
     resFunctionCalls = [];
     segmentHandler;
     pendingPartialTokens = [];
@@ -735,12 +767,16 @@ class GenerateResponseState {
     releasedPartiallyFreeTokensBeforeFunctionCallStartSyntax = false;
     generatedTokens = 0;
     isFirstEvaluation = true;
+    isRerender = true; // first render is a rerender
     initiallyEngagedFunctionMode = false;
     lastContextWindowHistory;
     lastHistoryCompressionMetadata;
     restartEvaluationIterator = false;
     // context shift loop
     shouldContextShift = false;
+    shouldRerender = false;
+    skipClosingResponseItemOnRerender = false;
+    shouldAbortBecauseOfNonText = false;
     canAvoidReloadingHistory = false;
     contextWindowTokens = [];
     stopGenerationTriggers = [];
@@ -748,6 +784,11 @@ class GenerateResponseState {
     functionCallInitiallyEngaged = false;
     disengageInitiallyEngagedFunctionCall = [];
     userTextSuffix = undefined;
+    prefixTriggerDetectors = new Map();
+    noPrefixTrigger = undefined;
+    rerenderTriggers = [];
+    rerenderTriggerDetector = new StopGenerationDetector();
+    rerenderActions = undefined;
     tokens = [];
     // token evaluation loop
     evaluationIterator;
@@ -757,7 +798,7 @@ class GenerateResponseState {
     currentTokens = [];
     currentText = "";
     currentQueuedTokenRelease;
-    constructor(llamaChat, chatWrapper, history, { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
+    constructor(llamaChat, chatWrapper, history, { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, abortOnNonText, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
         this.llamaChat = llamaChat;
         this.chatWrapper = chatWrapper;
         this.history = history;
@@ -784,6 +825,7 @@ class GenerateResponseState {
         this.maxParallelFunctionCalls = maxParallelFunctionCalls;
         this.contextShift = contextShift;
         this.customStopTriggers = customStopTriggers;
+        this.abortOnNonText = abortOnNonText ?? false;
         this.minimumOverlapPercentageToPreventContextShift = minimumOverlapPercentageToPreventContextShift;
         this.functionsEnabled = (this.functions != null && Object.keys(this.functions).length > 0);
         if (this.signal?.aborted)
@@ -821,7 +863,7 @@ class GenerateResponseState {
         if (this.grammar != null)
             StopGenerationDetector.resolveStopTriggers(this.grammar.stopGenerationTriggers, this.llamaChat.model.tokenizer)
                 .map((stopTrigger) => this.stopGenerationDetector.addStopTrigger(stopTrigger));
-        if (this.functions != null && Object.keys(this.functions).length > 0)
+        if (this.functions != null && Object.keys(this.functions).length > 0 && !this.abortOnNonText)
             this.functionSyntaxStartDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText([
                 this.chatWrapper.settings.functions?.parallelism?.call?.sectionPrefix ?? "",
                 this.chatWrapper.settings.functions.call.prefix
@@ -846,6 +888,17 @@ class GenerateResponseState {
                 ? new Map()
                 : SegmentHandler.getSegmentTokenCounts(lastModelMessageFullResponse, this.llamaChat.model.tokenizer)
         });
+        if (this.abortOnNonText) {
+            this.stopGenerationDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText([
+                this.chatWrapper.settings.functions?.parallelism?.call?.sectionPrefix ?? "",
+                this.chatWrapper.settings.functions.call.prefix
+            ]), this.llamaChat.model.tokenizer));
+            for (const segmentType of allSegmentTypes) {
+                const segmentDefinition = getChatWrapperSegmentDefinition(this.chatWrapper.settings, segmentType);
+                if (segmentDefinition != null)
+                    this.stopGenerationDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText(segmentDefinition.prefix), this.llamaChat.model.tokenizer));
+            }
+        }
         this.getPenaltyTokens = this.getPenaltyTokens.bind(this);
     }
     async dispose() {
@@ -894,7 +947,10 @@ class GenerateResponseState {
         });
         if (!hadThoughtSegments)
             return;
-        this.segmentHandler.openSegment("thought");
+        if (this.abortOnNonText)
+            this.shouldAbortBecauseOfNonText = true;
+        else
+            this.segmentHandler.openSegment("thought");
     }
     ensureNotAborted() {
         if (this.signal?.aborted && (!this.stopOnAbortSignal || this.res.length === 0))
@@ -930,7 +986,8 @@ class GenerateResponseState {
             let mostExhaustiveTriggeredStopsLeftoverTokens = [];
             const lastTokensForDetokenizer = resolveLastTokens([
                 this.contextWindowTokens,
-                this.ignoredStartTextTokens
+                this.ignoredStartTextTokens,
+                this.prefixTriggerTokens
             ]);
             const pendingPartialTokens = [];
             for (let i = 0; i < this.pendingTokens.length; i++) {
@@ -993,6 +1050,18 @@ class GenerateResponseState {
         this.ensureNotAborted();
         this.shouldContextShift = false;
     }
+    handleRerender() {
+        if (this.shouldRerender) {
+            this.isRerender = true;
+            this.streamRegulator.reset();
+            if (this.rerenderActions === "closeResponseItem" && this.segmentHandler.topOpenSegmentType != null &&
+                !this.skipClosingResponseItemOnRerender) {
+                this.segmentHandler.closeSegment(this.segmentHandler.topOpenSegmentType);
+                this.shouldRerender = false;
+            }
+            this.skipClosingResponseItemOnRerender = false;
+        }
+    }
     getContextWindowFunctionCallsTokens() {
         if (this.functionEvaluationMode === false)
             return [];
@@ -1019,14 +1088,15 @@ class GenerateResponseState {
     async loadContextWindow(resolvedHistory, resolvedContextWindowsHistory, endWithUserText = false, avoidReloadingHistory = false) {
         const queuedChunkTokens = this.streamRegulator.getAllQueuedChunkTokens();
         const functionCallsTokens = this.getContextWindowFunctionCallsTokens();
-        if (!avoidReloadingHistory || !this.canAvoidReloadingHistory || !this.llamaChat.sequence.isLoadedToMemory) {
-            const { history: contextWindowHistory, stopGenerationTriggers, tokens: contextWindowTokens, removeRawFromHistory, newHistoryCompressionMetadata, ignoreStartText, functionCallInitiallyEngaged, disengageInitiallyEngagedFunctionCall, userTextSuffix } = await getContextWindow({
+        if (!avoidReloadingHistory || !this.canAvoidReloadingHistory || this.isRerender || !this.llamaChat.sequence.isLoadedToMemory) {
+            const { history: contextWindowHistory, stopGenerationTriggers, tokens: contextWindowTokens, removeRawFromHistory, newHistoryCompressionMetadata, ignoreStartText, functionCallInitiallyEngaged, disengageInitiallyEngagedFunctionCall, userTextSuffix, prefixTriggers, noPrefixTrigger, rerender, detectFunctionCalls } = await getContextWindow({
                 resolvedHistory: resolvedHistory,
                 resolvedContextShift: this.resolvedContextShift,
                 lastHistoryCompressionMetadata: this.lastHistoryCompressionMetadata,
-                pendingTokensCount: this.pendingTokens.length + queuedChunkTokens.length + functionCallsTokens.length +
-                    this.pendingPartialTokens.length,
+                pendingTokensCount: this.prefixTriggerTokens.length + this.pendingTokens.length + queuedChunkTokens.length +
+                    functionCallsTokens.length + this.pendingPartialTokens.length,
                 isFirstEvaluation: this.isFirstEvaluation,
+                isRerender: this.isRerender,
                 chatWrapper: this.chatWrapper,
                 lastEvaluationContextWindowHistory: resolvedContextWindowsHistory,
                 minimumOverlapPercentageToPreventContextShift: this.minimumOverlapPercentageToPreventContextShift,
@@ -1043,6 +1113,61 @@ class GenerateResponseState {
             this.functionCallInitiallyEngaged = functionCallInitiallyEngaged;
             this.disengageInitiallyEngagedFunctionCall = disengageInitiallyEngagedFunctionCall;
             this.userTextSuffix = userTextSuffix;
+            if (this.isRerender) {
+                this.prefixTriggerTokens.length = 0;
+                for (const prefixDetector of this.prefixTriggerDetectors.keys()) {
+                    prefixDetector.clearInProgressStops();
+                    prefixDetector.clearTriggeredStops();
+                }
+                this.prefixTriggerDetectors.clear();
+                for (const trigger of prefixTriggers ?? []) {
+                    const segmentBudget = trigger.type === "segment"
+                        ? this.getSegmentBudget(trigger.segmentType)
+                        : null;
+                    if (trigger.type === "functionCall" && !this.functionsEnabled)
+                        continue;
+                    else if (trigger.type === "segment" &&
+                        segmentBudget != null &&
+                        !this.segmentHandler.isSegmentTypeOpen(trigger.segmentType) &&
+                        this.segmentHandler.getSegmentTokensCount(trigger.segmentType) >= segmentBudget)
+                        continue;
+                    const prefixDetector = new StopGenerationDetector();
+                    StopGenerationDetector.resolveStopTriggers(trigger.triggers, this.llamaChat.model.tokenizer)
+                        .forEach((stopTrigger) => prefixDetector.addStopTrigger(stopTrigger));
+                    this.prefixTriggerDetectors.set(prefixDetector, { inject: trigger.inject, trigger });
+                    const inject = trigger.inject;
+                    if (inject != null && inject.values.length > 0) {
+                        const fullPrefixDetector = new StopGenerationDetector();
+                        StopGenerationDetector
+                            .resolveStopTriggers(trigger.triggers.map((trigger) => LlamaText([trigger, inject])), this.llamaChat.model.tokenizer)
+                            .forEach((stopTrigger) => fullPrefixDetector.addStopTrigger(stopTrigger));
+                        this.prefixTriggerDetectors.set(fullPrefixDetector, { trigger });
+                    }
+                }
+                this.noPrefixTrigger = noPrefixTrigger;
+                const noPrefixTriggerSegmentBudget = noPrefixTrigger?.type === "segment"
+                    ? this.getSegmentBudget(noPrefixTrigger.segmentType)
+                    : null;
+                if (this.noPrefixTrigger?.type === "functionCall" && !this.functionsEnabled)
+                    this.noPrefixTrigger = undefined;
+                else if (noPrefixTrigger?.type === "segment" &&
+                    noPrefixTriggerSegmentBudget != null &&
+                    !this.segmentHandler.isSegmentTypeOpen(noPrefixTrigger.segmentType) &&
+                    this.segmentHandler.getSegmentTokensCount(noPrefixTrigger.segmentType) >= noPrefixTriggerSegmentBudget)
+                    this.noPrefixTrigger = undefined;
+                this.rerenderTriggers = rerender?.triggers ?? [];
+                this.rerenderTriggerDetector.clearInProgressStops();
+                this.rerenderTriggerDetector.clearTriggeredStops();
+                this.rerenderTriggerDetector = new StopGenerationDetector();
+                this.rerenderActions = rerender?.action;
+                this.functionSyntaxStartDetectorEnabled = detectFunctionCalls ?? true;
+                if (!this.functionSyntaxStartDetectorEnabled)
+                    this.functionSyntaxStartDetector.clearInProgressStops();
+                if (rerender?.triggers != null) {
+                    StopGenerationDetector.resolveStopTriggers(rerender.triggers, this.llamaChat.model.tokenizer)
+                        .map((stopTrigger) => this.rerenderTriggerDetector.addStopTrigger(stopTrigger));
+                }
+            }
             this.lastHistoryCompressionMetadata = newHistoryCompressionMetadata;
             this.lastContextWindowHistory = contextWindowHistory;
             this.segmentHandler.resetContextWindow();
@@ -1055,6 +1180,7 @@ class GenerateResponseState {
         this.tokens = [
             ...this.contextWindowTokens,
             ...this.ignoredStartTextTokens,
+            ...this.prefixTriggerTokens,
             ...this.pendingTokens,
             ...queuedChunkTokens,
             ...functionCallsTokens,
@@ -1076,6 +1202,10 @@ class GenerateResponseState {
     }
     initFunctions() {
         this.initiallyEngagedFunctionMode = this.functionCallInitiallyEngaged;
+        if (this.initiallyEngagedFunctionMode && this.abortOnNonText) {
+            this.shouldAbortBecauseOfNonText = true;
+            return;
+        }
         if (this.initiallyEngagedFunctionMode) {
             StopGenerationDetector.resolveStopTriggers(this.disengageInitiallyEngagedFunctionCall, this.llamaChat.model.tokenizer)
                 .map((stopTrigger) => this.disengageInitiallyEngagedFunctionMode.addStopTrigger(stopTrigger));
@@ -1090,6 +1220,140 @@ class GenerateResponseState {
             this.restartEvaluationIterator = true;
         }
     }
+    async handlePrefixTriggers(loadContextWindow) {
+        const reloadTokens = async () => {
+            this.startTokenLoop();
+            await loadContextWindow();
+        };
+        const injectTokens = async (text, alignStateTokens = false) => {
+            if (text == null)
+                return;
+            const tokens = text.tokenize(this.llamaChat.model.tokenizer, "trimLeadingSpace");
+            if (tokens.length === 0)
+                return;
+            pushAll(this.prefixTriggerTokens, tokens);
+            if (alignStateTokens)
+                await reloadTokens();
+        };
+        if (this.prefixTriggerDetectors.size === 0) {
+            if (this.abortOnNonText && this.noPrefixTrigger != null && this.noPrefixTrigger.type !== "response") {
+                this.shouldAbortBecauseOfNonText = true;
+                const stopRes = this.handleAbortTrigger("model");
+                if (stopRes != null)
+                    return stopRes;
+                return undefined;
+            }
+            if (this.noPrefixTrigger?.type === "functionCall" && this.chatWrapper.settings.functions != null) {
+                await injectTokens(this.noPrefixTrigger.inject, true);
+                this.functionEvaluationMode = "functionName";
+            }
+            else if (this.noPrefixTrigger?.type === "segment") {
+                await injectTokens(this.noPrefixTrigger.inject, true);
+                this.segmentHandler.openSegment(this.noPrefixTrigger.segmentType);
+            }
+            else if (this.noPrefixTrigger?.type === "response")
+                await injectTokens(this.noPrefixTrigger.inject, true);
+            return undefined;
+        }
+        const generatedTokens = [];
+        let isFirstToken = true;
+        let continueGeneration = true;
+        for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
+            pushAll(generatedTokens, tokens);
+            for (const [triggerDetector, { trigger, inject }] of [...this.prefixTriggerDetectors.entries()]) {
+                triggerDetector.recordGeneration({
+                    text: this.currentText,
+                    tokens: this.currentTokens,
+                    startNewChecks: isFirstToken,
+                    triggerMustStartWithGeneration: true
+                });
+                if (triggerDetector.hasTriggeredStops) {
+                    const { firstRemainingGenerationAfterStop, stopTrigger } = StopGenerationDetector.getFirstRemainingGenerationAfterStop(triggerDetector.getTriggeredStops());
+                    const remainingTokens = typeof firstRemainingGenerationAfterStop === "string"
+                        ? firstRemainingGenerationAfterStop === ""
+                            ? []
+                            : this.llamaChat.model.tokenize(firstRemainingGenerationAfterStop, false, "trimLeadingSpace")
+                        : (firstRemainingGenerationAfterStop ?? []);
+                    const triggerTokens = (stopTrigger == null || remainingTokens.length === 0)
+                        ? generatedTokens
+                        : stopTrigger.flatMap((item) => {
+                            if (typeof item === "string")
+                                return this.llamaChat.model.tokenize(item, false, "trimLeadingSpace");
+                            return [item];
+                        });
+                    if (this.abortOnNonText && trigger.type !== "response") {
+                        this.shouldAbortBecauseOfNonText = true;
+                        const stopRes = this.handleAbortTrigger("model");
+                        if (stopRes != null)
+                            return stopRes;
+                        return undefined;
+                    }
+                    this.streamRegulator.reset();
+                    if (trigger.type === "segment") {
+                        pushAll(this.prefixTriggerTokens, triggerTokens);
+                        if (inject != null)
+                            await injectTokens(inject);
+                        await reloadTokens();
+                        this.segmentHandler.openSegment(trigger.segmentType);
+                    }
+                    else if (trigger.type === "response") {
+                        pushAll(this.prefixTriggerTokens, triggerTokens);
+                        if (inject != null)
+                            await injectTokens(inject);
+                        await reloadTokens();
+                    }
+                    else if (trigger.type === "functionCall") {
+                        if (trigger.replaceTrigger === false)
+                            pushAll(this.prefixTriggerTokens, triggerTokens);
+                        if (inject != null)
+                            await injectTokens(inject);
+                        await reloadTokens();
+                        this.functionEvaluationMode = "functionName";
+                    }
+                    else
+                        void trigger;
+                    this.prefixTriggerDetectors.clear();
+                    continueGeneration = false;
+                    break;
+                }
+                else if (!triggerDetector.hasInProgressStops)
+                    this.prefixTriggerDetectors.delete(triggerDetector);
+            }
+            if (this.prefixTriggerDetectors.size === 0 && continueGeneration) {
+                if (this.abortOnNonText && this.noPrefixTrigger != null && this.noPrefixTrigger.type !== "response") {
+                    this.shouldAbortBecauseOfNonText = true;
+                    const stopRes = this.handleAbortTrigger("model");
+                    if (stopRes != null)
+                        return stopRes;
+                    return undefined;
+                }
+                this.streamRegulator.reset();
+                continueGeneration = false;
+                if (this.noPrefixTrigger?.type === "functionCall" && this.chatWrapper.settings.functions != null) {
+                    await injectTokens(this.noPrefixTrigger.inject, true);
+                    this.functionEvaluationMode = "functionName";
+                }
+                else if (this.noPrefixTrigger?.type === "segment") {
+                    await injectTokens(this.noPrefixTrigger.inject, true);
+                    this.segmentHandler.openSegment(this.noPrefixTrigger.segmentType);
+                }
+                else if (this.noPrefixTrigger?.type === "response")
+                    await injectTokens(this.noPrefixTrigger.inject, true);
+                else
+                    this.streamRegulator.addChunk({
+                        tokens: generatedTokens,
+                        text: this.llamaChat.model.detokenize(generatedTokens, false, this.getLastTokens())
+                    });
+            }
+            isFirstToken = false;
+            if (!continueGeneration)
+                break;
+            const stopRes = this.handleAbortTrigger("model") ?? this.handleMaxTokensTrigger("model");
+            if (stopRes != null)
+                return stopRes;
+        }
+        return undefined;
+    }
     async enterFunctionCallingLoop(loadContextWindow) {
         if (!this.functionsEnabled) {
             this.functionEvaluationMode = false;
@@ -1568,6 +1832,8 @@ class GenerateResponseState {
         }
     }
     detectAndHandleFunctionStartSyntax() {
+        if (!this.functionSyntaxStartDetectorEnabled)
+            return;
         this.functionSyntaxStartDetector.recordGeneration({
             text: this.currentText,
             tokens: this.currentTokens,
@@ -1575,6 +1841,10 @@ class GenerateResponseState {
         });
         if (this.currentQueuedTokenRelease != null && this.functionEvaluationMode === false && this.functionsEnabled &&
             this.functionSyntaxStartDetector.hasTriggeredStops) {
+            if (this.abortOnNonText) {
+                this.shouldAbortBecauseOfNonText = true;
+                return;
+            }
             this.functionEvaluationMode = "functionName";
             this.currentQueuedTokenRelease.createTextIndexLock(0);
             this.stopGenerationDetector.clearTriggeredStops();
@@ -1592,6 +1862,11 @@ class GenerateResponseState {
         }
     }
     recordStopGenerationEvaluation() {
+        this.rerenderTriggerDetector.recordGeneration({
+            text: this.currentText,
+            tokens: this.currentTokens,
+            queuedTokenRelease: this.currentQueuedTokenRelease
+        });
         this.stopGenerationDetector.recordGeneration({
             text: this.currentText,
             tokens: this.currentTokens,
@@ -1609,8 +1884,10 @@ class GenerateResponseState {
         pushAll(this.pendingTokens, this.streamRegulator.popFreeChunkTokens());
     }
     handleStopGenerationTrigger(lastHistoryItemType, forceStopReason) {
-        if (this.stopGenerationDetector.hasTriggeredStops || this.customStopGenerationTriggersDetector.hasTriggeredStops ||
-            this.llamaChat.model.isEogToken(this.currentToken) || forceStopReason != null) {
+        const detectedStopGenerationTrigger = this.stopGenerationDetector.hasTriggeredStops ||
+            this.customStopGenerationTriggersDetector.hasTriggeredStops ||
+            this.llamaChat.model.isEogToken(this.currentToken);
+        if ((detectedStopGenerationTrigger && !this.rerenderTriggerDetector.hasTriggeredStops) || forceStopReason != null) {
             this.stopGenerationDetector.clearInProgressStops();
             this.customStopGenerationTriggersDetector.clearInProgressStops();
             pushAll(this.pendingTokens, this.streamRegulator.popFreeChunkTokens());
@@ -1709,25 +1986,45 @@ class GenerateResponseState {
     }
     async handleBudgetTriggers() {
         let shouldReloadEvaluationState = false;
-        const hasBudget = (budget) => budget != null && budget !== Infinity;
-        const hasBudgetTriggers = this.budgets != null && hasBudget(this.budgets.thoughtTokens);
-        if (!hasBudgetTriggers)
+        if (this.budgets == null)
             return shouldReloadEvaluationState;
-        if (hasBudget(this.budgets.thoughtTokens) && this.segmentHandler.isSegmentTypeOpen("thought")) {
-            const usedThoughtTokens = this.segmentHandler.getSegmentTokensCount("thought");
-            if (usedThoughtTokens >= this.budgets.thoughtTokens) {
-                this.segmentHandler.closeSegment("thought");
+        for (const segmentType of this.segmentHandler.getOpenSegmentStack().reverse()) {
+            const budget = this.getSegmentBudget(segmentType);
+            if (budget == null)
+                continue;
+            const usedSegmentTokens = this.segmentHandler.getSegmentTokensCount(segmentType);
+            if (usedSegmentTokens >= budget) {
+                this.segmentHandler.closeSegment(segmentType);
                 shouldReloadEvaluationState = true;
             }
         }
         return shouldReloadEvaluationState;
     }
+    getSegmentBudget(segmentType) {
+        const getBudget = (budget) => ((budget == null || budget === Infinity)
+            ? null
+            : budget);
+        if (this.budgets == null)
+            return null;
+        if (segmentType === "thought")
+            return getBudget(this.budgets.thoughtTokens);
+        else if (segmentType === "comment")
+            return getBudget(this.budgets.commentTokens);
+        void segmentType;
+        return null;
+    }
+    handleShouldRerender() {
+        this.shouldRerender = this.rerenderTriggerDetector.hasTriggeredStops;
+        if (this.abortOnNonText && this.shouldRerender)
+            this.shouldAbortBecauseOfNonText = true;
+        return this.shouldRerender;
+    }
     updateShouldContextShift() {
         this.shouldContextShift = this.llamaChat.sequence.nextTokenIndex >= this.llamaChat.context.contextSize - 1;
         return this.shouldContextShift;
     }
     get shouldAbort() {
-        return !!(this.signal?.aborted && this.stopOnAbortSignal);
+        return !!(this.signal?.aborted && this.stopOnAbortSignal) || this.shouldAbortBecauseOfNonText;
     }
     handleAbortTrigger(lastHistoryItemType) {
         if (this.shouldAbort && this.signal?.aborted && this.stopOnAbortSignal) {
@@ -1747,7 +2044,9 @@ class GenerateResponseState {
                     contextShiftMetadata: this.lastHistoryCompressionMetadata
                 },
                 metadata: {
-                    stopReason: "abort"
+                    stopReason: this.shouldAbortBecauseOfNonText
+                        ? "eogToken"
+                        : "abort"
                 }
             };
         }
@@ -1867,6 +2166,29 @@ class SegmentHandler {
     isSegmentTypeOpen(type) {
         return this._segmentsStackSet.has(type);
     }
+    get topOpenSegmentType() {
+        return this._segmentsStack.at(-1);
+    }
+    /**
+     * First segment in the stack is the top most that'll close last.
+     * ```
+     * <segment1>
+     *     some text here
+     *     <segment2>
+     *        some text here
+     *         <segment3>
+     *             some text here
+     *         </segment3>
+     * ```
+     * In that example, the top most segment is `segment1`, and the last open segment is `segment2` (which is the next one to close).
+     * So in that example, this function will return:
+     * ```
+     * ["segment1", "segment2"]
+     * ```
+     */
+    getOpenSegmentStack() {
+        return this._segmentsStack.slice(this._ownedSegmentsStackLength);
+    }
     _processTokens(tokens, text) {
         const queuedTokenRelease = this._streamRegulator.addChunk({
             tokens,
@@ -2065,17 +2387,16 @@ class SegmentHandler {
                 this.onResponseChunk?.({ type: undefined, segmentType: undefined, tokens: tokens.slice(), text });
             }
             else {
-                if (lastSegment instanceof Array) {
-                    const text = (this.onResponseChunk != null || this.onTextChunk != null)
-                        ? this.model.detokenize(tokens, false, this._getTokenTrailFromResult())
-                        : "";
+                const text = (this.onResponseChunk != null || this.onTextChunk != null)
+                    ? this.model.detokenize(tokens, false, this._getTokenTrailFromResult())
+                    : "";
+                if (lastSegment instanceof Array)
                     pushAll(lastSegment, tokens);
-                    this.onToken?.(tokens);
-                    this.onTextChunk?.(text);
-                    this.onResponseChunk?.({ type: undefined, segmentType: undefined, tokens, text });
-                }
                 else
                     this._segments.push(tokens);
+                this.onToken?.(tokens.slice());
+                this.onTextChunk?.(text);
+                this.onResponseChunk?.({ type: undefined, segmentType: undefined, tokens: tokens.slice(), text });
             }
             if (lastContextWindowSegment == null)
                 this._contextWindowSegments.push(tokens.slice());