npm - node-llama-cpp - Versions diffs - 3.17.1 → 3.18.0 - Mend

node-llama-cpp 3.17.1 → 3.18.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (66)

package/dist/evaluator/LlamaChat/LlamaChat.js (changed)

@@ -1,4 +1,5 @@
 import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
+import { internalCheckpoints } from "../LlamaContext/LlamaContext.js";
 import { isChatModelResponseFunctionCall, isChatModelResponseSegment, allSegmentTypes } from "../../types.js";
 import { removeNullFields } from "../../utils/removeNullFields.js";
 import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
@@ -15,6 +16,7 @@ import { LlamaSampler } from "../LlamaContext/LlamaSampler.js";
 import { getChatWrapperSegmentDefinition } from "../../utils/getChatWrapperSegmentDefinition.js";
 import { jsonDumps } from "../../chatWrappers/utils/jsonDumps.js";
 import { defaultMaxPreloadTokens } from "../LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js";
+import { LlamaLogLevel } from "../../bindings/types.js";
 import { eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy } from "./utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js";
 import { FunctionCallNameGrammar } from "./utils/FunctionCallNameGrammar.js";
 import { FunctionCallParamsGrammar } from "./utils/FunctionCallParamsGrammar.js";
@@ -26,6 +28,9 @@ const defaultContextShiftOptions = {
 const defaultRepeatPenaltyLastTokens = 64;
 const defaultTrimWhitespaceSuffix = false;
 const defaultEvaluationPriority = 5;
+const defaultSegmentBudgetSize = (contextSize) => (contextSize < 8192
+    ? contextSize * 0.5
+    : contextSize * 0.75);
 export class LlamaChat {
     /** @internal */ _chatWrapper;
     /** @internal */ _disposeAggregator = new DisposeAggregator();
@@ -118,7 +123,9 @@ export class LlamaChat {
         if (generateResponseState.grammar != null && generateResponseState.functionsEnabled && !abortOnNonText)
             throw new Error("Using both grammar and functions is not supported yet");
         return await withLock([this._chatLock, "evaluate"], signal, async () => {
+            let hadError = false;
             try {
+                let tookInitialCheckpoint = false;
                 generateResponseState.ensureLastHistoryItemIsModel();
                 generateResponseState.ensureReopenedThoughtSegmentAfterFunctionCallsIfNeeded();
                 const loadContextWindow = async (avoidReloadingHistory = false) => {
@@ -156,6 +163,10 @@ export class LlamaChat {
                     await generateResponseState.alignCurrentSequenceStateWithCurrentTokens();
                     await generateResponseState.createNewEvaluationIterator();
                     while (await generateResponseState.iterateEvaluation()) {
+                        if (!tookInitialCheckpoint && this.sequence.needsCheckpoints) {
+                            await this.sequence.takeCheckpoint();
+                            tookInitialCheckpoint = true;
+                        }
                         if (!generateResponseState.holdPartialTokensForNextEvaluation()) {
                             generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
                             generateResponseState.detectAndHandleFunctionStartSyntax();
@@ -166,7 +177,11 @@ export class LlamaChat {
                                 if (functionsCallsRes != null)
                                     return functionsCallsRes;
                             }
-                            generateResponseState.recordStopGenerationEvaluation();
+                            {
+                                const resPromise = generateResponseState.recordStopGenerationEvaluation();
+                                if (resPromise instanceof Promise)
+                                    await resPromise;
+                            }
                             generateResponseState.popStreamRegulatorFreeTokens();
                             generateResponseState.removeFoundStartIgnoreTextsFromPendingTokens();
                             const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("model");
@@ -198,8 +213,14 @@ export class LlamaChat {
                 }
                 throw new Error("The context size is too small to generate a response");
             }
+            catch (err) {
+                hadError = true;
+                throw err;
+            }
             finally {
                 await generateResponseState.dispose();
+                if (!hadError && this.sequence.needsCheckpoints)
+                    void this.sequence.takeCheckpoint();
             }
         });
     }
@@ -247,6 +268,7 @@ export class LlamaChat {
         });
         return await withLock([this._chatLock, "evaluate"], signal, async () => {
             try {
+                let tookInitialCheckpoint = false;
                 generateResponseState.ensureLastHistoryItemIsUser();
                 while (true) {
                     generateResponseState.startTokenLoop();
@@ -279,9 +301,17 @@ export class LlamaChat {
                     }
                     await generateResponseState.createNewEvaluationIterator();
                     while (await generateResponseState.iterateEvaluation()) {
+                        if (!tookInitialCheckpoint && this.sequence.needsCheckpoints) {
+                            await this.sequence.takeCheckpoint();
+                            tookInitialCheckpoint = true;
+                        }
                         if (!generateResponseState.holdPartialTokensForNextEvaluation()) {
                             generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
-                            generateResponseState.recordStopGenerationEvaluation();
+                            {
+                                const resPromise = generateResponseState.recordStopGenerationEvaluation();
+                                if (resPromise instanceof Promise)
+                                    await resPromise;
+                            }
                             generateResponseState.popStreamRegulatorFreeTokens();
                             const someOfCurrentTokensAreSpecial = generateResponseState.currentTokens.some((token) => (this.model.isSpecialToken(token)));
                             const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("user", someOfCurrentTokensAreSpecial
@@ -792,6 +822,7 @@ class GenerateResponseState {
     userTextSuffix = undefined;
     prefixTriggerDetectors = new Map();
     noPrefixTrigger = undefined;
+    responsePrefix = undefined;
     rerenderTriggers = [];
     rerenderTriggerDetector = new StopGenerationDetector();
     rerenderActions = undefined;
@@ -1148,6 +1179,9 @@ class GenerateResponseState {
                         !this.segmentHandler.isSegmentTypeOpen(trigger.segmentType) &&
                         this.segmentHandler.getSegmentTokensCount(trigger.segmentType) >= segmentBudget)
                         continue;
+                    if (this.responsePrefix == null && trigger.type === "response" && trigger.triggers.length > 0 &&
+                        (trigger.triggers[0]?.values?.length ?? 0) > 0)
+                        this.responsePrefix = LlamaText([trigger.triggers[0] ?? "", trigger.inject ?? ""]);
                     const prefixDetector = new StopGenerationDetector();
                     StopGenerationDetector.resolveStopTriggers(trigger.triggers, this.llamaChat.model.tokenizer)
                         .forEach((stopTrigger) => prefixDetector.addStopTrigger(stopTrigger));
@@ -1172,6 +1206,8 @@ class GenerateResponseState {
                     !this.segmentHandler.isSegmentTypeOpen(noPrefixTrigger.segmentType) &&
                     this.segmentHandler.getSegmentTokensCount(noPrefixTrigger.segmentType) >= noPrefixTriggerSegmentBudget)
                     this.noPrefixTrigger = undefined;
+                else if (noPrefixTrigger?.type === "response")
+                    this.responsePrefix = noPrefixTrigger.inject;
                 this.rerenderTriggers = rerender?.triggers ?? [];
                 this.rerenderTriggerDetector.clearInProgressStops();
                 this.rerenderTriggerDetector.clearTriggeredStops();
@@ -1252,6 +1288,11 @@ class GenerateResponseState {
             if (alignStateTokens)
                 await reloadTokens();
         };
+        if (this.grammar != null) {
+            if (this.responsePrefix != null)
+                await injectTokens(this.responsePrefix, true);
+            return undefined;
+        }
         if (this.prefixTriggerDetectors.size === 0) {
             if (this.abortOnNonText && this.noPrefixTrigger != null && this.noPrefixTrigger.type !== "response") {
                 this.shouldAbortBecauseOfNonText = true;
@@ -1275,7 +1316,12 @@ class GenerateResponseState {
         const generatedTokens = [];
         let isFirstToken = true;
         let continueGeneration = true;
+        let tookInitialCheckpoint = false;
         for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
+            if (!tookInitialCheckpoint && this.llamaChat.sequence.needsCheckpoints) {
+                await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatSequenceStart.name, internalCheckpoints.chatSequenceStart.maxCheckpoints);
+                tookInitialCheckpoint = true;
+            }
             pushAll(generatedTokens, tokens);
             for (const [triggerDetector, { trigger, inject }] of [...this.prefixTriggerDetectors.entries()]) {
                 triggerDetector.recordGeneration({
@@ -1420,7 +1466,12 @@ class GenerateResponseState {
                         pushAll(prefixDetectorRecordedTokens, tokens);
                     }
                 }
+                let tookInitialCheckpoint = false;
                 for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
+                    if (!tookInitialCheckpoint && this.llamaChat.sequence.needsCheckpoints) {
+                        await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatSequenceStart.name, internalCheckpoints.chatSequenceStart.maxCheckpoints);
+                        tookInitialCheckpoint = true;
+                    }
                     const stopGenerationTriggerRes = this.handleStopGenerationTrigger("model");
                     if (stopGenerationTriggerRes != null)
                         return stopGenerationTriggerRes;
@@ -1463,7 +1514,11 @@ class GenerateResponseState {
                             tokens: this.currentTokens,
                             text: this.currentText
                         });
-                        this.recordStopGenerationEvaluation();
+                        {
+                            const resPromise = this.recordStopGenerationEvaluation();
+                            if (resPromise instanceof Promise)
+                                await resPromise;
+                        }
                     }
                     this.currentFunctionCallCurrentPartTokens.length = 0;
                     this.functionEvaluationMode = false;
@@ -1515,7 +1570,12 @@ class GenerateResponseState {
                         }
                     }
                 }
+                let tookInitialCheckpoint = false;
                 for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
+                    if (!tookInitialCheckpoint && this.llamaChat.sequence.needsCheckpoints) {
+                        await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatSequenceStart.name, internalCheckpoints.chatSequenceStart.maxCheckpoints);
+                        tookInitialCheckpoint = true;
+                    }
                     pushAll(this.currentFunctionCallCurrentPartTokens, tokens);
                     functionNameGenerationDoneDetector.recordGeneration({
                         text: this.currentText,
@@ -1578,11 +1638,20 @@ class GenerateResponseState {
                             paramsChunk: this.llamaChat.model.detokenize(this.currentFunctionCallCurrentPartTokens, false, lastPartTokens),
                             done: false
                         });
+                    let tookInitialCheckpoint = false;
                     for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
+                        if (!tookInitialCheckpoint && this.llamaChat.sequence.needsCheckpoints) {
+                            await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatSequenceStart.name, internalCheckpoints.chatSequenceStart.maxCheckpoints);
+                            tookInitialCheckpoint = true;
+                        }
+                        const hadInProgressTriggers = functionParamsGenerationDoneDetector.hasInProgressStops;
                         functionParamsGenerationDoneDetector.recordGeneration({
                             text: this.currentText,
                             tokens: this.currentTokens
                         });
+                        if (!hadInProgressTriggers && functionParamsGenerationDoneDetector.hasInProgressStops &&
+                            this.llamaChat.sequence.needsCheckpoints)
+                            await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatGrammarEnd.name, internalCheckpoints.chatGrammarEnd.maxCheckpoints);
                         this.onFunctionCallParamsChunk?.({
                             callIndex: this.resFunctionCalls.length,
                             functionName: this.functionEvaluationFunctionName,
@@ -1646,7 +1715,12 @@ class GenerateResponseState {
                     LlamaText(new SpecialToken("EOT"))
                 ], this.llamaChat.model.tokenizer)
                     .map((stopTrigger) => sectionSuffixDetector.addStopTrigger(stopTrigger));
+                let tookInitialCheckpoint = false;
                 for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
+                    if (!tookInitialCheckpoint && this.llamaChat.sequence.needsCheckpoints) {
+                        await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatSequenceStart.name, internalCheckpoints.chatSequenceStart.maxCheckpoints);
+                        tookInitialCheckpoint = true;
+                    }
                     pushAll(this.currentFunctionCallCurrentPartTokens, tokens);
                     sectionSuffixDetector.recordGeneration({
                         text: this.currentText,
@@ -1772,6 +1846,19 @@ class GenerateResponseState {
         }));
     }
     async createNewEvaluationIterator() {
+        if (this.tokens.length === 0) {
+            if (this.evaluationIterator != null)
+                return;
+            const token = this.llamaChat.sequence.contextTokens.at(-1);
+            if (token == null)
+                throw new Error("No tokens to evaluate");
+            this.llamaChat.sequence.model._llama._log(LlamaLogLevel.warn, "Attempted to evaluate with no input, reevaluating the last context sequence token");
+            await this.llamaChat.sequence.eraseContextTokenRanges([{
+                    start: this.llamaChat.sequence.contextTokens.length - 1,
+                    end: this.llamaChat.sequence.contextTokens.length
+                }]);
+            this.tokens = [token];
+        }
         if (this.evaluationIterator != null)
             await this.evaluationIterator.return();
         this.currentIterationReplacementToken = undefined;
@@ -1881,6 +1968,7 @@ class GenerateResponseState {
         }
     }
     recordStopGenerationEvaluation() {
+        const hadInProgressStopTrigger = this.stopGenerationDetector.hasInProgressStops;
         this.rerenderTriggerDetector.recordGeneration({
             text: this.currentText,
             tokens: this.currentTokens,
@@ -1898,6 +1986,9 @@ class GenerateResponseState {
         });
         if (this.llamaChat.model.isEogToken(this.currentToken))
             this.currentQueuedTokenRelease?.createTokenIndexLock(0);
+        if (this.grammar != null && !hadInProgressStopTrigger && this.stopGenerationDetector.hasInProgressStops &&
+            this.llamaChat.sequence.needsCheckpoints)
+            return this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatGrammarEnd.name, internalCheckpoints.chatGrammarEnd.maxCheckpoints);
     }
     popStreamRegulatorFreeTokens() {
         pushAll(this.pendingTokens, this.streamRegulator.popFreeChunkTokens());
@@ -2020,9 +2111,11 @@ class GenerateResponseState {
         return shouldReloadEvaluationState;
     }
     getSegmentBudget(segmentType) {
-        const getBudget = (budget) => ((budget == null || budget === Infinity)
-            ? null
-            : budget);
+        const getBudget = (budget) => (budget == null
+            ? Math.ceil(defaultSegmentBudgetSize(this.llamaChat.sequence.contextSize))
+            : budget === Infinity
+                ? null
+                : budget);
         if (this.budgets == null)
             return null;
         if (segmentType === "thought")