node-llama-cpp 3.12.0 → 3.12.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/dist/bindings/Llama.js +16 -6
  2. package/dist/bindings/Llama.js.map +1 -1
  3. package/dist/bindings/getLlama.js +6 -2
  4. package/dist/bindings/getLlama.js.map +1 -1
  5. package/dist/bindings/types.d.ts +11 -2
  6. package/dist/bindings/types.js +11 -2
  7. package/dist/bindings/types.js.map +1 -1
  8. package/dist/bindings/utils/clearAllLocalBuilds.js +1 -1
  9. package/dist/bindings/utils/clearAllLocalBuilds.js.map +1 -1
  10. package/dist/bindings/utils/compileLLamaCpp.d.ts +1 -0
  11. package/dist/bindings/utils/compileLLamaCpp.js +49 -12
  12. package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
  13. package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +4 -1
  14. package/dist/bindings/utils/detectAvailableComputeLayers.js +12 -6
  15. package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -1
  16. package/dist/bindings/utils/getBuildFolderNameForBuildOptions.d.ts +1 -0
  17. package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js +19 -7
  18. package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js.map +1 -1
  19. package/dist/bindings/utils/testBindingBinary.d.ts +1 -1
  20. package/dist/bindings/utils/testBindingBinary.js +13 -5
  21. package/dist/bindings/utils/testBindingBinary.js.map +1 -1
  22. package/dist/chatWrappers/HarmonyChatWrapper.js +27 -15
  23. package/dist/chatWrappers/HarmonyChatWrapper.js.map +1 -1
  24. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +8 -2
  25. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -1
  26. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +8 -4
  27. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -1
  28. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +14 -0
  29. package/dist/evaluator/LlamaChat/LlamaChat.js +126 -20
  30. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  31. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +53 -2
  32. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +162 -47
  33. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  34. package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.d.ts +1 -0
  35. package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js.map +1 -1
  36. package/dist/evaluator/LlamaContext/LlamaContext.js +1 -1
  37. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  38. package/dist/tsconfig.tsbuildinfo +1 -1
  39. package/dist/types.d.ts +6 -1
  40. package/dist/types.js +2 -1
  41. package/dist/types.js.map +1 -1
  42. package/llama/CMakeLists.txt +20 -0
  43. package/llama/addon/globals/getGpuInfo.cpp +8 -4
  44. package/llama/binariesGithubRelease.json +1 -1
  45. package/llama/cmake/addVariantSuffix.cmake +21 -0
  46. package/llama/gitRelease.bundle +0 -0
  47. package/llama/llama.cpp.info.json +1 -1
  48. package/package.json +24 -23
  49. package/templates/packed/electron-typescript-react.json +1 -1
@@ -252,7 +252,21 @@ export type LLamaChatGenerateResponseOptions<Functions extends ChatModelFunction
252
252
  * Defaults to `Infinity`.
253
253
  */
254
254
  thoughtTokens?: number;
255
+ /**
256
+ * Budget for comment tokens.
257
+ *
258
+ * Defaults to `Infinity`.
259
+ */
260
+ commentTokens?: number;
255
261
  };
262
+ /**
263
+ * Stop the generation when the model tries to generate a non-textual segment or call a function.
264
+ *
265
+ * Useful for generating completions in a form of a model response.
266
+ *
267
+ * Defaults to `false`.
268
+ */
269
+ abortOnNonText?: boolean;
256
270
  } & ({
257
271
  grammar?: LlamaGrammar;
258
272
  functions?: never;
@@ -80,7 +80,7 @@ export class LlamaChat {
80
80
  return this.sequence.model;
81
81
  }
82
82
  async generateResponse(history, options = {}) {
83
- const { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = options;
83
+ const { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, abortOnNonText = false, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = options;
84
84
  this.sequence.tokenPredictor?.updateInputTokens?.(this.model.tokenize(findLastUserMessageInChatHistory(history)?.text ?? ""));
85
85
  const generateResponseState = new GenerateResponseState(this, this._chatWrapper, history, {
86
86
  onTextChunk,
@@ -107,12 +107,13 @@ export class LlamaChat {
107
107
  maxParallelFunctionCalls,
108
108
  contextShift,
109
109
  customStopTriggers,
110
+ abortOnNonText,
110
111
  lastEvaluationContextWindow: {
111
112
  history: lastEvaluationContextWindowHistory,
112
113
  minimumOverlapPercentageToPreventContextShift
113
114
  }
114
115
  });
115
- if (generateResponseState.grammar != null && generateResponseState.functionsEnabled)
116
+ if (generateResponseState.grammar != null && generateResponseState.functionsEnabled && !abortOnNonText)
116
117
  throw new Error("Using both grammar and functions is not supported yet");
117
118
  return await withLock([this._chatLock, "evaluate"], signal, async () => {
118
119
  try {
@@ -122,7 +123,6 @@ export class LlamaChat {
122
123
  await generateResponseState.loadContextWindow(generateResponseState.getResolvedHistoryWithCurrentModelResponse(), generateResponseState.getContextWindowsHistoryWithCurrentModelResponse(), false, avoidReloadingHistory);
123
124
  };
124
125
  const loadContextWindowForFunctionCallingLoop = async () => loadContextWindow(true);
125
- const loadContextWindowForBudgetTriggers = async () => loadContextWindow(false);
126
126
  while (true) {
127
127
  generateResponseState.startTokenLoop();
128
128
  generateResponseState.handleRerender();
@@ -137,12 +137,15 @@ export class LlamaChat {
137
137
  generateResponseState.initFunctions();
138
138
  }
139
139
  }
140
+ const abortRes = generateResponseState.handleAbortTrigger("model");
141
+ if (abortRes != null)
142
+ return abortRes;
140
143
  if (shouldHandlePrefixTriggers) {
141
144
  const handlePrefixTriggersRes = await generateResponseState.handlePrefixTriggers(loadContextWindowForFunctionCallingLoop);
142
145
  if (handlePrefixTriggersRes != null)
143
146
  return handlePrefixTriggersRes;
144
147
  }
145
- if (generateResponseState.functionEvaluationMode !== false) {
148
+ if (generateResponseState.functionEvaluationMode !== false && !generateResponseState.abortOnNonText) {
146
149
  const functionsCallsRes = await generateResponseState.enterFunctionCallingLoop(loadContextWindowForFunctionCallingLoop);
147
150
  if (functionsCallsRes != null)
148
151
  return functionsCallsRes;
@@ -176,9 +179,9 @@ export class LlamaChat {
176
179
  if (generateResponseState.handleShouldRerender() || generateResponseState.updateShouldContextShift())
177
180
  break;
178
181
  if (await generateResponseState.handleBudgetTriggers()) {
179
- await loadContextWindowForBudgetTriggers();
180
- await generateResponseState.alignCurrentSequenceStateWithCurrentTokens();
181
- await generateResponseState.createNewEvaluationIterator();
182
+ generateResponseState.shouldRerender = true;
183
+ generateResponseState.skipClosingResponseItemOnRerender = true;
184
+ break;
182
185
  }
183
186
  if (generateResponseState.handleShouldRerender() || generateResponseState.updateShouldContextShift())
184
187
  break;
@@ -728,6 +731,7 @@ class GenerateResponseState {
728
731
  maxParallelFunctionCalls;
729
732
  contextShift;
730
733
  customStopTriggers;
734
+ abortOnNonText;
731
735
  minimumOverlapPercentageToPreventContextShift;
732
736
  functionsEnabled;
733
737
  repeatPenaltyEnabled;
@@ -771,6 +775,8 @@ class GenerateResponseState {
771
775
  // context shift loop
772
776
  shouldContextShift = false;
773
777
  shouldRerender = false;
778
+ skipClosingResponseItemOnRerender = false;
779
+ shouldAbortBecauseOfNonText = false;
774
780
  canAvoidReloadingHistory = false;
775
781
  contextWindowTokens = [];
776
782
  stopGenerationTriggers = [];
@@ -792,7 +798,7 @@ class GenerateResponseState {
792
798
  currentTokens = [];
793
799
  currentText = "";
794
800
  currentQueuedTokenRelease;
795
- constructor(llamaChat, chatWrapper, history, { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
801
+ constructor(llamaChat, chatWrapper, history, { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, abortOnNonText, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
796
802
  this.llamaChat = llamaChat;
797
803
  this.chatWrapper = chatWrapper;
798
804
  this.history = history;
@@ -819,6 +825,7 @@ class GenerateResponseState {
819
825
  this.maxParallelFunctionCalls = maxParallelFunctionCalls;
820
826
  this.contextShift = contextShift;
821
827
  this.customStopTriggers = customStopTriggers;
828
+ this.abortOnNonText = abortOnNonText ?? false;
822
829
  this.minimumOverlapPercentageToPreventContextShift = minimumOverlapPercentageToPreventContextShift;
823
830
  this.functionsEnabled = (this.functions != null && Object.keys(this.functions).length > 0);
824
831
  if (this.signal?.aborted)
@@ -856,7 +863,7 @@ class GenerateResponseState {
856
863
  if (this.grammar != null)
857
864
  StopGenerationDetector.resolveStopTriggers(this.grammar.stopGenerationTriggers, this.llamaChat.model.tokenizer)
858
865
  .map((stopTrigger) => this.stopGenerationDetector.addStopTrigger(stopTrigger));
859
- if (this.functions != null && Object.keys(this.functions).length > 0)
866
+ if (this.functions != null && Object.keys(this.functions).length > 0 && !this.abortOnNonText)
860
867
  this.functionSyntaxStartDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText([
861
868
  this.chatWrapper.settings.functions?.parallelism?.call?.sectionPrefix ?? "",
862
869
  this.chatWrapper.settings.functions.call.prefix
@@ -881,6 +888,17 @@ class GenerateResponseState {
881
888
  ? new Map()
882
889
  : SegmentHandler.getSegmentTokenCounts(lastModelMessageFullResponse, this.llamaChat.model.tokenizer)
883
890
  });
891
+ if (this.abortOnNonText) {
892
+ this.stopGenerationDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText([
893
+ this.chatWrapper.settings.functions?.parallelism?.call?.sectionPrefix ?? "",
894
+ this.chatWrapper.settings.functions.call.prefix
895
+ ]), this.llamaChat.model.tokenizer));
896
+ for (const segmentType of allSegmentTypes) {
897
+ const segmentDefinition = getChatWrapperSegmentDefinition(this.chatWrapper.settings, segmentType);
898
+ if (segmentDefinition != null)
899
+ this.stopGenerationDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText(segmentDefinition.prefix), this.llamaChat.model.tokenizer));
900
+ }
901
+ }
884
902
  this.getPenaltyTokens = this.getPenaltyTokens.bind(this);
885
903
  }
886
904
  async dispose() {
@@ -929,7 +947,10 @@ class GenerateResponseState {
929
947
  });
930
948
  if (!hadThoughtSegments)
931
949
  return;
932
- this.segmentHandler.openSegment("thought");
950
+ if (this.abortOnNonText)
951
+ this.shouldAbortBecauseOfNonText = true;
952
+ else
953
+ this.segmentHandler.openSegment("thought");
933
954
  }
934
955
  ensureNotAborted() {
935
956
  if (this.signal?.aborted && (!this.stopOnAbortSignal || this.res.length === 0))
@@ -1033,10 +1054,12 @@ class GenerateResponseState {
1033
1054
  if (this.shouldRerender) {
1034
1055
  this.isRerender = true;
1035
1056
  this.streamRegulator.reset();
1036
- if (this.rerenderActions === "closeResponseItem" && this.segmentHandler.topOpenSegmentType != null) {
1057
+ if (this.rerenderActions === "closeResponseItem" && this.segmentHandler.topOpenSegmentType != null &&
1058
+ !this.skipClosingResponseItemOnRerender) {
1037
1059
  this.segmentHandler.closeSegment(this.segmentHandler.topOpenSegmentType);
1038
1060
  this.shouldRerender = false;
1039
1061
  }
1062
+ this.skipClosingResponseItemOnRerender = false;
1040
1063
  }
1041
1064
  }
1042
1065
  getContextWindowFunctionCallsTokens() {
@@ -1098,8 +1121,16 @@ class GenerateResponseState {
1098
1121
  }
1099
1122
  this.prefixTriggerDetectors.clear();
1100
1123
  for (const trigger of prefixTriggers ?? []) {
1124
+ const segmentBudget = trigger.type === "segment"
1125
+ ? this.getSegmentBudget(trigger.segmentType)
1126
+ : null;
1101
1127
  if (trigger.type === "functionCall" && !this.functionsEnabled)
1102
1128
  continue;
1129
+ else if (trigger.type === "segment" &&
1130
+ segmentBudget != null &&
1131
+ !this.segmentHandler.isSegmentTypeOpen(trigger.segmentType) &&
1132
+ this.segmentHandler.getSegmentTokensCount(trigger.segmentType) >= segmentBudget)
1133
+ continue;
1103
1134
  const prefixDetector = new StopGenerationDetector();
1104
1135
  StopGenerationDetector.resolveStopTriggers(trigger.triggers, this.llamaChat.model.tokenizer)
1105
1136
  .forEach((stopTrigger) => prefixDetector.addStopTrigger(stopTrigger));
@@ -1114,8 +1145,16 @@ class GenerateResponseState {
1114
1145
  }
1115
1146
  }
1116
1147
  this.noPrefixTrigger = noPrefixTrigger;
1148
+ const noPrefixTriggerSegmentBudget = noPrefixTrigger?.type === "segment"
1149
+ ? this.getSegmentBudget(noPrefixTrigger.segmentType)
1150
+ : null;
1117
1151
  if (this.noPrefixTrigger?.type === "functionCall" && !this.functionsEnabled)
1118
1152
  this.noPrefixTrigger = undefined;
1153
+ else if (noPrefixTrigger?.type === "segment" &&
1154
+ noPrefixTriggerSegmentBudget != null &&
1155
+ !this.segmentHandler.isSegmentTypeOpen(noPrefixTrigger.segmentType) &&
1156
+ this.segmentHandler.getSegmentTokensCount(noPrefixTrigger.segmentType) >= noPrefixTriggerSegmentBudget)
1157
+ this.noPrefixTrigger = undefined;
1119
1158
  this.rerenderTriggers = rerender?.triggers ?? [];
1120
1159
  this.rerenderTriggerDetector.clearInProgressStops();
1121
1160
  this.rerenderTriggerDetector.clearTriggeredStops();
@@ -1163,6 +1202,10 @@ class GenerateResponseState {
1163
1202
  }
1164
1203
  initFunctions() {
1165
1204
  this.initiallyEngagedFunctionMode = this.functionCallInitiallyEngaged;
1205
+ if (this.initiallyEngagedFunctionMode && this.abortOnNonText) {
1206
+ this.shouldAbortBecauseOfNonText = true;
1207
+ return;
1208
+ }
1166
1209
  if (this.initiallyEngagedFunctionMode) {
1167
1210
  StopGenerationDetector.resolveStopTriggers(this.disengageInitiallyEngagedFunctionCall, this.llamaChat.model.tokenizer)
1168
1211
  .map((stopTrigger) => this.disengageInitiallyEngagedFunctionMode.addStopTrigger(stopTrigger));
@@ -1193,6 +1236,13 @@ class GenerateResponseState {
1193
1236
  await reloadTokens();
1194
1237
  };
1195
1238
  if (this.prefixTriggerDetectors.size === 0) {
1239
+ if (this.abortOnNonText && this.noPrefixTrigger != null && this.noPrefixTrigger.type !== "response") {
1240
+ this.shouldAbortBecauseOfNonText = true;
1241
+ const stopRes = this.handleAbortTrigger("model");
1242
+ if (stopRes != null)
1243
+ return stopRes;
1244
+ return undefined;
1245
+ }
1196
1246
  if (this.noPrefixTrigger?.type === "functionCall" && this.chatWrapper.settings.functions != null) {
1197
1247
  await injectTokens(this.noPrefixTrigger.inject, true);
1198
1248
  this.functionEvaluationMode = "functionName";
@@ -1231,6 +1281,13 @@ class GenerateResponseState {
1231
1281
  return this.llamaChat.model.tokenize(item, false, "trimLeadingSpace");
1232
1282
  return [item];
1233
1283
  });
1284
+ if (this.abortOnNonText && trigger.type !== "response") {
1285
+ this.shouldAbortBecauseOfNonText = true;
1286
+ const stopRes = this.handleAbortTrigger("model");
1287
+ if (stopRes != null)
1288
+ return stopRes;
1289
+ return undefined;
1290
+ }
1234
1291
  this.streamRegulator.reset();
1235
1292
  if (trigger.type === "segment") {
1236
1293
  pushAll(this.prefixTriggerTokens, triggerTokens);
@@ -1263,6 +1320,13 @@ class GenerateResponseState {
1263
1320
  this.prefixTriggerDetectors.delete(triggerDetector);
1264
1321
  }
1265
1322
  if (this.prefixTriggerDetectors.size === 0 && continueGeneration) {
1323
+ if (this.abortOnNonText && this.noPrefixTrigger != null && this.noPrefixTrigger.type !== "response") {
1324
+ this.shouldAbortBecauseOfNonText = true;
1325
+ const stopRes = this.handleAbortTrigger("model");
1326
+ if (stopRes != null)
1327
+ return stopRes;
1328
+ return undefined;
1329
+ }
1266
1330
  this.streamRegulator.reset();
1267
1331
  continueGeneration = false;
1268
1332
  if (this.noPrefixTrigger?.type === "functionCall" && this.chatWrapper.settings.functions != null) {
@@ -1777,6 +1841,10 @@ class GenerateResponseState {
1777
1841
  });
1778
1842
  if (this.currentQueuedTokenRelease != null && this.functionEvaluationMode === false && this.functionsEnabled &&
1779
1843
  this.functionSyntaxStartDetector.hasTriggeredStops) {
1844
+ if (this.abortOnNonText) {
1845
+ this.shouldAbortBecauseOfNonText = true;
1846
+ return;
1847
+ }
1780
1848
  this.functionEvaluationMode = "functionName";
1781
1849
  this.currentQueuedTokenRelease.createTextIndexLock(0);
1782
1850
  this.stopGenerationDetector.clearTriggeredStops();
@@ -1918,21 +1986,37 @@ class GenerateResponseState {
1918
1986
  }
1919
1987
  async handleBudgetTriggers() {
1920
1988
  let shouldReloadEvaluationState = false;
1921
- const hasBudget = (budget) => budget != null && budget !== Infinity;
1922
- const hasBudgetTriggers = this.budgets != null && hasBudget(this.budgets.thoughtTokens);
1923
- if (!hasBudgetTriggers)
1989
+ if (this.budgets == null)
1924
1990
  return shouldReloadEvaluationState;
1925
- if (hasBudget(this.budgets.thoughtTokens) && this.segmentHandler.isSegmentTypeOpen("thought")) {
1926
- const usedThoughtTokens = this.segmentHandler.getSegmentTokensCount("thought");
1927
- if (usedThoughtTokens >= this.budgets.thoughtTokens) {
1928
- this.segmentHandler.closeSegment("thought");
1991
+ for (const segmentType of this.segmentHandler.getOpenSegmentStack().reverse()) {
1992
+ const budget = this.getSegmentBudget(segmentType);
1993
+ if (budget == null)
1994
+ continue;
1995
+ const usedSegmentTokens = this.segmentHandler.getSegmentTokensCount(segmentType);
1996
+ if (usedSegmentTokens >= budget) {
1997
+ this.segmentHandler.closeSegment(segmentType);
1929
1998
  shouldReloadEvaluationState = true;
1930
1999
  }
1931
2000
  }
1932
2001
  return shouldReloadEvaluationState;
1933
2002
  }
2003
+ getSegmentBudget(segmentType) {
2004
+ const getBudget = (budget) => ((budget == null || budget === Infinity)
2005
+ ? null
2006
+ : budget);
2007
+ if (this.budgets == null)
2008
+ return null;
2009
+ if (segmentType === "thought")
2010
+ return getBudget(this.budgets.thoughtTokens);
2011
+ else if (segmentType === "comment")
2012
+ return getBudget(this.budgets.commentTokens);
2013
+ void segmentType;
2014
+ return null;
2015
+ }
1934
2016
  handleShouldRerender() {
1935
2017
  this.shouldRerender = this.rerenderTriggerDetector.hasTriggeredStops;
2018
+ if (this.abortOnNonText && this.shouldRerender)
2019
+ this.shouldAbortBecauseOfNonText = true;
1936
2020
  return this.shouldRerender;
1937
2021
  }
1938
2022
  updateShouldContextShift() {
@@ -1940,7 +2024,7 @@ class GenerateResponseState {
1940
2024
  return this.shouldContextShift;
1941
2025
  }
1942
2026
  get shouldAbort() {
1943
- return !!(this.signal?.aborted && this.stopOnAbortSignal);
2027
+ return !!(this.signal?.aborted && this.stopOnAbortSignal) || this.shouldAbortBecauseOfNonText;
1944
2028
  }
1945
2029
  handleAbortTrigger(lastHistoryItemType) {
1946
2030
  if (this.shouldAbort && this.signal?.aborted && this.stopOnAbortSignal) {
@@ -1960,7 +2044,9 @@ class GenerateResponseState {
1960
2044
  contextShiftMetadata: this.lastHistoryCompressionMetadata
1961
2045
  },
1962
2046
  metadata: {
1963
- stopReason: "abort"
2047
+ stopReason: this.shouldAbortBecauseOfNonText
2048
+ ? "eogToken"
2049
+ : "abort"
1964
2050
  }
1965
2051
  };
1966
2052
  }
@@ -2083,6 +2169,26 @@ class SegmentHandler {
2083
2169
  get topOpenSegmentType() {
2084
2170
  return this._segmentsStack.at(-1);
2085
2171
  }
2172
+ /**
2173
+ * First segment in the stack is the top most that'll close last.
2174
+ * ```
2175
+ * <segment1>
2176
+ * some text here
2177
+ * <segment2>
2178
+ * some text here
2179
+ * <segment3>
2180
+ * some text here
2181
+ * </segment3>
2182
+ * ```
2183
+ * In that example, the top most segment is `segment1`, and the last open segment is `segment2` (which is the next one to close).
2184
+ * So in that example, this function will return:
2185
+ * ```
2186
+ * ["segment1", "segment2"]
2187
+ * ```
2188
+ */
2189
+ getOpenSegmentStack() {
2190
+ return this._segmentsStack.slice(this._ownedSegmentsStackLength);
2191
+ }
2086
2192
  _processTokens(tokens, text) {
2087
2193
  const queuedTokenRelease = this._streamRegulator.addChunk({
2088
2194
  tokens,