node-llama-cpp 3.0.0-beta.17 → 3.0.0-beta.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only, and reflects the changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. package/README.md +8 -5
  2. package/dist/ChatWrapper.js +3 -3
  3. package/dist/ChatWrapper.js.map +1 -1
  4. package/dist/apiDocsOverrides.d.ts +1 -0
  5. package/dist/apiDocsOverrides.js +5 -0
  6. package/dist/apiDocsOverrides.js.map +1 -0
  7. package/dist/bindings/AddonTypes.d.ts +1 -0
  8. package/dist/bindings/getLlama.d.ts +5 -1
  9. package/dist/bindings/getLlama.js +11 -4
  10. package/dist/bindings/getLlama.js.map +1 -1
  11. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.d.ts +3 -0
  12. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.js +27 -0
  13. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.js.map +1 -0
  14. package/dist/chatWrappers/generic/TemplateChatWrapper.d.ts +1 -2
  15. package/dist/chatWrappers/generic/TemplateChatWrapper.js +1 -2
  16. package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -1
  17. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +2 -2
  18. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -1
  19. package/dist/cli/cli.js +5 -3
  20. package/dist/cli/cli.js.map +1 -1
  21. package/dist/cli/commands/DebugCommand.js +3 -5
  22. package/dist/cli/commands/DebugCommand.js.map +1 -1
  23. package/dist/cli/commands/DownloadCommand.d.ts +1 -1
  24. package/dist/cli/commands/DownloadCommand.js +2 -1
  25. package/dist/cli/commands/DownloadCommand.js.map +1 -1
  26. package/dist/cli/commands/PullCommand.d.ts +12 -0
  27. package/dist/cli/commands/PullCommand.js +109 -0
  28. package/dist/cli/commands/PullCommand.js.map +1 -0
  29. package/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts +1 -0
  30. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +23 -11
  31. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
  32. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +2 -1
  33. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
  34. package/dist/cli/recommendedModels.js +12 -20
  35. package/dist/cli/recommendedModels.js.map +1 -1
  36. package/dist/cli/utils/resolveCommandGgufPath.d.ts +3 -1
  37. package/dist/cli/utils/resolveCommandGgufPath.js +41 -97
  38. package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -1
  39. package/dist/cli/utils/resolveModelRecommendationFileOptions.d.ts +2 -2
  40. package/dist/cli/utils/resolveModelRecommendationFileOptions.js +1 -4
  41. package/dist/cli/utils/resolveModelRecommendationFileOptions.js.map +1 -1
  42. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +18 -2
  43. package/dist/evaluator/LlamaChat/LlamaChat.js +255 -205
  44. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  45. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +22 -3
  46. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +18 -7
  47. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  48. package/dist/evaluator/LlamaCompletion.js +1 -1
  49. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  50. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +2 -7
  51. package/dist/evaluator/LlamaContext/LlamaContext.js +12 -12
  52. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  53. package/dist/evaluator/LlamaEmbeddingContext.d.ts +2 -10
  54. package/dist/evaluator/LlamaEmbeddingContext.js +9 -23
  55. package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
  56. package/dist/evaluator/LlamaGrammar.d.ts +1 -1
  57. package/dist/evaluator/LlamaModel.d.ts +9 -0
  58. package/dist/evaluator/LlamaModel.js +2 -1
  59. package/dist/evaluator/LlamaModel.js.map +1 -1
  60. package/dist/gguf/insights/GgufInsights.js +12 -12
  61. package/dist/gguf/insights/GgufInsights.js.map +1 -1
  62. package/dist/gguf/insights/utils/resolveContextContextSizeOption.js +27 -3
  63. package/dist/gguf/insights/utils/resolveContextContextSizeOption.js.map +1 -1
  64. package/dist/gguf/parser/parseGguf.js +5 -0
  65. package/dist/gguf/parser/parseGguf.js.map +1 -1
  66. package/dist/gguf/readGgufFileInfo.d.ts +5 -2
  67. package/dist/gguf/readGgufFileInfo.js +38 -10
  68. package/dist/gguf/readGgufFileInfo.js.map +1 -1
  69. package/dist/gguf/types/GgufFileInfoTypes.d.ts +32 -0
  70. package/dist/gguf/types/GgufFileInfoTypes.js.map +1 -1
  71. package/dist/gguf/utils/getGgufMetadataArchitectureData.js +1 -1
  72. package/dist/gguf/utils/getGgufMetadataArchitectureData.js.map +1 -1
  73. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.d.ts +2 -0
  74. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js +39 -0
  75. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js.map +1 -0
  76. package/dist/gguf/utils/resolveSplitGgufParts.d.ts +7 -0
  77. package/dist/gguf/utils/resolveSplitGgufParts.js +55 -0
  78. package/dist/gguf/utils/resolveSplitGgufParts.js.map +1 -0
  79. package/dist/index.d.ts +4 -2
  80. package/dist/index.js +4 -2
  81. package/dist/index.js.map +1 -1
  82. package/dist/utils/LlamaText.d.ts +29 -20
  83. package/dist/utils/LlamaText.js +253 -243
  84. package/dist/utils/LlamaText.js.map +1 -1
  85. package/dist/utils/StopGenerationDetector.d.ts +1 -1
  86. package/dist/utils/StopGenerationDetector.js +2 -0
  87. package/dist/utils/StopGenerationDetector.js.map +1 -1
  88. package/dist/utils/createModelDownloader.d.ts +99 -0
  89. package/dist/utils/createModelDownloader.js +226 -0
  90. package/dist/utils/createModelDownloader.js.map +1 -0
  91. package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.js +18 -8
  92. package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.js.map +1 -1
  93. package/dist/utils/parseTextTemplate.d.ts +2 -2
  94. package/dist/utils/parseTextTemplate.js +2 -2
  95. package/dist/utils/runtime.d.ts +4 -0
  96. package/dist/utils/runtime.js +8 -0
  97. package/dist/utils/runtime.js.map +1 -0
  98. package/llama/addon.cpp +18 -7
  99. package/llama/binariesGithubRelease.json +1 -1
  100. package/llama/gitRelease.bundle +0 -0
  101. package/llama/grammars/README.md +1 -1
  102. package/llama/llama.cpp.info.json +1 -1
  103. package/llamaBins/linux-arm64/_nlcBuildMetadata.json +1 -1
  104. package/llamaBins/linux-arm64/llama-addon.node +0 -0
  105. package/llamaBins/linux-armv7l/_nlcBuildMetadata.json +1 -1
  106. package/llamaBins/linux-armv7l/llama-addon.node +0 -0
  107. package/llamaBins/linux-x64/_nlcBuildMetadata.json +1 -1
  108. package/llamaBins/linux-x64/llama-addon.node +0 -0
  109. package/llamaBins/linux-x64-cuda/_nlcBuildMetadata.json +1 -1
  110. package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
  111. package/llamaBins/linux-x64-vulkan/_nlcBuildMetadata.json +1 -1
  112. package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
  113. package/llamaBins/mac-arm64-metal/_nlcBuildMetadata.json +1 -1
  114. package/llamaBins/mac-arm64-metal/default.metallib +0 -0
  115. package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
  116. package/llamaBins/mac-x64/_nlcBuildMetadata.json +1 -1
  117. package/llamaBins/mac-x64/llama-addon.node +0 -0
  118. package/llamaBins/win-arm64/_nlcBuildMetadata.json +1 -1
  119. package/llamaBins/win-arm64/llama-addon.node +0 -0
  120. package/llamaBins/win-x64/_nlcBuildMetadata.json +1 -1
  121. package/llamaBins/win-x64/llama-addon.node +0 -0
  122. package/llamaBins/win-x64-cuda/_nlcBuildMetadata.json +1 -1
  123. package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
  124. package/llamaBins/win-x64-vulkan/_nlcBuildMetadata.json +1 -1
  125. package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
  126. package/package.json +8 -5
@@ -72,7 +72,7 @@ export class LlamaChat {
72
72
  get model() {
73
73
  return this.sequence.model;
74
74
  }
75
- async generateResponse(history, { onToken, signal, maxTokens, temperature, minP, topK, topP, grammar, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
75
+ async generateResponse(history, { onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, grammar, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
76
76
  const functionsEnabled = (functions != null && Object.keys(functions).length > 0);
77
77
  if (grammar != null && functionsEnabled)
78
78
  throw new Error("Using both grammar and functions is not supported yet");
@@ -116,6 +116,7 @@ export class LlamaChat {
116
116
  : undefined;
117
117
  const streamRegulator = new TokenStreamRegulator();
118
118
  const stopGenerationDetector = new StopGenerationDetector();
119
+ const customStopGenerationTriggersDetector = new StopGenerationDetector();
119
120
  const functionSyntaxStartDetector = new StopGenerationDetector();
120
121
  const functionSyntaxEndDetector = new StopGenerationDetector();
121
122
  const disengageInitiallyEngagedFunctionMode = new StopGenerationDetector();
@@ -129,7 +130,7 @@ export class LlamaChat {
129
130
  let lastContextWindowHistory = resolvedHistory;
130
131
  let lastHistoryCompressionMetadata = resolvedContextShift.lastEvaluationMetadata;
131
132
  const ensureNotAborted = () => {
132
- if (signal?.aborted)
133
+ if (signal?.aborted && (!stopOnAbortSignal || res.length === 0))
133
134
  throw signal.reason;
134
135
  if (this._sequence == null)
135
136
  throw new DisposedError();
@@ -200,6 +201,9 @@ export class LlamaChat {
200
201
  }
201
202
  }
202
203
  };
204
+ if (customStopTriggers != null)
205
+ StopGenerationDetector.resolveStopTriggers(customStopTriggers, model.tokenizer)
206
+ .map((stopTrigger) => customStopGenerationTriggersDetector.addStopTrigger(stopTrigger));
203
207
  if (grammar != null)
204
208
  StopGenerationDetector.resolveStopTriggers(grammar.stopGenerationTriggers, model.tokenizer)
205
209
  .map((stopTrigger) => stopGenerationDetector.addStopTrigger(stopTrigger));
@@ -214,7 +218,7 @@ export class LlamaChat {
214
218
  resolvedHistory: getResolvedHistoryWithCurrentModelResponse(),
215
219
  resolvedContextShift,
216
220
  lastHistoryCompressionMetadata,
217
- pendingTokensCount: pendingTokens.length + queuedChunkTokens.length,
221
+ pendingTokensCount: ignoredStartTextTokens.length + pendingTokens.length + queuedChunkTokens.length,
218
222
  isFirstEvaluation,
219
223
  chatWrapper: this._chatWrapper,
220
224
  lastEvaluationContextWindowHistory,
@@ -281,225 +285,271 @@ export class LlamaChat {
281
285
  evaluationPriority,
282
286
  yieldEogToken: true
283
287
  }));
284
- let currentIteration = await evaluationIterator.next();
285
- while (currentIteration.done !== true) {
286
- const token = currentIteration.value;
287
- let replacementToken = undefined;
288
- ensureNotAborted();
289
- generatedTokens++;
290
- const tokens = [token];
291
- const text = model.detokenize([token]);
292
- const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
293
- if (initiallyEngagedFunctionMode)
294
- disengageInitiallyEngagedFunctionMode.recordGeneration({ text, tokens, startNewChecks: generatedTokens === 1 });
295
- if (text === UNKNOWN_UNICODE_CHAR || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "")) {
296
- locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
297
- }
298
- else {
299
- while (locksToReleaseOnValidGeneration.length > 0)
300
- locksToReleaseOnValidGeneration.shift().dispose();
301
- }
302
- functionSyntaxStartDetector.recordGeneration({ text, tokens, queuedTokenRelease });
303
- if (initiallyEngagedFunctionMode && disengageInitiallyEngagedFunctionMode.hasTriggeredStops) {
304
- initiallyEngagedFunctionMode = false;
305
- let shouldStopFunctionEvaluationMode = !functionSyntaxStartDetector.hasTriggeredStops;
306
- if (!shouldStopFunctionEvaluationMode && functionsEnabled && functionsGrammar != null) {
307
- const functionCallText = model.detokenize([...functionCallTokens, ...tokens]);
308
- try {
309
- const functionName = functionsGrammar.parseFunctionNameFromPartialCall(functionCallText, {
310
- enableInternalBuiltinFunctions: true,
311
- initialFunctionCallEngaged: true
312
- });
313
- const internalBuiltinFunctions = this._chatWrapper.getInternalBuiltinFunctions({ initialFunctionCallEngaged: true });
314
- if (internalBuiltinFunctions[functionName] != null) {
315
- shouldStopFunctionEvaluationMode = true;
316
- }
317
- }
318
- catch (err) {
319
- if (!(err instanceof LlamaFunctionCallValidationError))
320
- throw err;
321
- }
288
+ try {
289
+ let currentIteration = await evaluationIterator.next();
290
+ while (currentIteration.done !== true) {
291
+ const token = currentIteration.value;
292
+ let replacementToken = undefined;
293
+ ensureNotAborted();
294
+ generatedTokens++;
295
+ const tokens = [token];
296
+ const text = model.detokenize([token]);
297
+ const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
298
+ if (initiallyEngagedFunctionMode)
299
+ disengageInitiallyEngagedFunctionMode.recordGeneration({ text, tokens, startNewChecks: generatedTokens === 1 });
300
+ if (text === UNKNOWN_UNICODE_CHAR || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "")) {
301
+ locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
322
302
  }
323
- if (shouldStopFunctionEvaluationMode) {
324
- inFunctionEvaluationMode = false;
325
- functionsGrammar = new FunctionCallGrammar(model._llama, functions, this._chatWrapper, false);
326
- functionsEvaluationState = new LlamaGrammarEvaluationState({
327
- grammar: functionsGrammar
328
- });
329
- functionCallTokens.length = 0;
330
- while (functionCallTokenSyntaxLocks.length > 0)
331
- functionCallTokenSyntaxLocks.shift().dispose();
332
- functionSyntaxStartDetector.clearInProgressStops();
333
- functionSyntaxStartDetector.clearTriggeredStops();
334
- functionSyntaxEndDetector.clearInProgressStops();
335
- functionSyntaxEndDetector.clearTriggeredStops();
303
+ else {
304
+ while (locksToReleaseOnValidGeneration.length > 0)
305
+ locksToReleaseOnValidGeneration.shift().dispose();
336
306
  }
337
- }
338
- if (!inFunctionEvaluationMode && functionsEnabled && functionsGrammar != null &&
339
- functionSyntaxStartDetector.hasTriggeredStops && functionsEvaluationState != null) {
340
- inFunctionEvaluationMode = true;
341
- functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
342
- stopGenerationDetector.clearTriggeredStops();
343
- stopGenerationDetector.clearInProgressStops();
344
- pendingTokens.push(...streamRegulator.popFreeChunkTokens());
345
- const triggeredStops = functionSyntaxStartDetector.getTriggeredStops();
346
- const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
347
- const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
348
- pendingTokens.push(...queuedTokensBeforeStopTrigger);
349
- const [firstRemainingGenerationAfterStop] = triggeredStops
350
- .map((stopTrigger) => stopTrigger.remainingGenerations)
351
- .filter((remainingGenerations) => remainingGenerations.length > 0)
352
- .flat(1);
353
- const remainingTextAfterStop = (firstRemainingGenerationAfterStop == null || firstRemainingGenerationAfterStop.length === 0)
354
- ? ""
355
- : typeof firstRemainingGenerationAfterStop === "string"
356
- ? firstRemainingGenerationAfterStop
357
- : model.detokenize(firstRemainingGenerationAfterStop);
358
- functionCallTokens.push(...model.tokenize(this._chatWrapper.settings.functions.call.prefix, false, "trimLeadingSpace"));
359
- for (const functionCallToken of functionCallTokens)
360
- context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, functionCallToken);
361
- // these tokens have to be verified that they match the function calling syntax grammar before they can be accepted,
362
- // or the context state should be modified to not include the incompatible tokens
363
- const remainingTextTokens = model.tokenize(remainingTextAfterStop, false, "trimLeadingSpace");
364
- let unfitTokens = [];
365
- for (let i = 0; i < remainingTextTokens.length; i++) {
366
- const remainingToken = remainingTextTokens[i];
367
- const canBeNextToken = context._canBeNextTokenForGrammarEvaluationState(functionsEvaluationState, remainingToken);
368
- if (!canBeNextToken) {
369
- unfitTokens = remainingTextTokens.slice(i);
370
- break;
307
+ functionSyntaxStartDetector.recordGeneration({ text, tokens, queuedTokenRelease });
308
+ if (initiallyEngagedFunctionMode && disengageInitiallyEngagedFunctionMode.hasTriggeredStops) {
309
+ initiallyEngagedFunctionMode = false;
310
+ let shouldStopFunctionEvaluationMode = !functionSyntaxStartDetector.hasTriggeredStops;
311
+ if (!shouldStopFunctionEvaluationMode && functionsEnabled && functionsGrammar != null) {
312
+ const functionCallText = model.detokenize([...functionCallTokens, ...tokens]);
313
+ try {
314
+ const functionName = functionsGrammar.parseFunctionNameFromPartialCall(functionCallText, {
315
+ enableInternalBuiltinFunctions: true,
316
+ initialFunctionCallEngaged: true
317
+ });
318
+ const internalBuiltinFunctions = this._chatWrapper.getInternalBuiltinFunctions({ initialFunctionCallEngaged: true });
319
+ if (internalBuiltinFunctions[functionName] != null) {
320
+ shouldStopFunctionEvaluationMode = true;
321
+ }
322
+ }
323
+ catch (err) {
324
+ if (!(err instanceof LlamaFunctionCallValidationError))
325
+ throw err;
326
+ }
371
327
  }
372
- context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, remainingToken);
373
- functionCallTokens.push(remainingToken);
374
- }
375
- if (unfitTokens.length > 0) {
376
- const unfitTokensText = model.detokenize(unfitTokens); // the current token text must end with it
377
- const currentTokenText = queuedTokenRelease.text;
378
- let replacementTokens;
379
- if (!currentTokenText.endsWith(unfitTokensText)) {
380
- console.warn(getConsoleLogPrefix() + "The current token text does not end with the unfit function call syntax tokens text");
381
- replacementTokens = remainingTextTokens.slice(0, -unfitTokens.length);
328
+ if (shouldStopFunctionEvaluationMode) {
329
+ inFunctionEvaluationMode = false;
330
+ functionsGrammar = new FunctionCallGrammar(model._llama, functions, this._chatWrapper, false);
331
+ functionsEvaluationState = new LlamaGrammarEvaluationState({
332
+ grammar: functionsGrammar
333
+ });
334
+ functionCallTokens.length = 0;
335
+ while (functionCallTokenSyntaxLocks.length > 0)
336
+ functionCallTokenSyntaxLocks.shift().dispose();
337
+ functionSyntaxStartDetector.clearInProgressStops();
338
+ functionSyntaxStartDetector.clearTriggeredStops();
339
+ functionSyntaxEndDetector.clearInProgressStops();
340
+ functionSyntaxEndDetector.clearTriggeredStops();
382
341
  }
383
- else {
384
- const newCurrentTokensText = currentTokenText.slice(0, -unfitTokensText.length);
385
- replacementTokens = model.tokenize(newCurrentTokensText, false, "trimLeadingSpace");
342
+ }
343
+ if (!inFunctionEvaluationMode && functionsEnabled && functionsGrammar != null &&
344
+ functionSyntaxStartDetector.hasTriggeredStops && functionsEvaluationState != null) {
345
+ inFunctionEvaluationMode = true;
346
+ functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
347
+ stopGenerationDetector.clearTriggeredStops();
348
+ stopGenerationDetector.clearInProgressStops();
349
+ customStopGenerationTriggersDetector.clearTriggeredStops();
350
+ customStopGenerationTriggersDetector.clearInProgressStops();
351
+ pendingTokens.push(...streamRegulator.popFreeChunkTokens());
352
+ const triggeredStops = functionSyntaxStartDetector.getTriggeredStops();
353
+ const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
354
+ const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
355
+ pendingTokens.push(...queuedTokensBeforeStopTrigger);
356
+ const [firstRemainingGenerationAfterStop] = triggeredStops
357
+ .map((stopTrigger) => stopTrigger.remainingGenerations)
358
+ .filter((remainingGenerations) => remainingGenerations.length > 0)
359
+ .flat(1);
360
+ const remainingTextAfterStop = (firstRemainingGenerationAfterStop == null || firstRemainingGenerationAfterStop.length === 0)
361
+ ? ""
362
+ : typeof firstRemainingGenerationAfterStop === "string"
363
+ ? firstRemainingGenerationAfterStop
364
+ : model.detokenize(firstRemainingGenerationAfterStop);
365
+ functionCallTokens.push(...model.tokenize(this._chatWrapper.settings.functions.call.prefix, false, "trimLeadingSpace"));
366
+ for (const functionCallToken of functionCallTokens)
367
+ context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, functionCallToken);
368
+ // these tokens have to be verified that they match the function calling syntax grammar before they can be accepted,
369
+ // or the context state should be modified to not include the incompatible tokens
370
+ const remainingTextTokens = model.tokenize(remainingTextAfterStop, false, "trimLeadingSpace");
371
+ let unfitTokens = [];
372
+ for (let i = 0; i < remainingTextTokens.length; i++) {
373
+ const remainingToken = remainingTextTokens[i];
374
+ const canBeNextToken = context._canBeNextTokenForGrammarEvaluationState(functionsEvaluationState, remainingToken);
375
+ if (!canBeNextToken) {
376
+ unfitTokens = remainingTextTokens.slice(i);
377
+ break;
378
+ }
379
+ context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, remainingToken);
380
+ functionCallTokens.push(remainingToken);
386
381
  }
387
- if (replacementTokens.length > 0) {
388
- replacementToken = replacementTokens[0];
389
- queuedTokenRelease.modifyTokensAndText(replacementTokens, model.detokenize([replacementToken]));
382
+ if (unfitTokens.length > 0) {
383
+ const unfitTokensText = model.detokenize(unfitTokens); // the current token text must end with it
384
+ const currentTokenText = queuedTokenRelease.text;
385
+ let replacementTokens;
386
+ if (!currentTokenText.endsWith(unfitTokensText)) {
387
+ console.warn(getConsoleLogPrefix() + "The current token text does not end with the unfit function call syntax tokens text");
388
+ replacementTokens = remainingTextTokens.slice(0, -unfitTokens.length);
389
+ }
390
+ else {
391
+ const newCurrentTokensText = currentTokenText.slice(0, -unfitTokensText.length);
392
+ replacementTokens = model.tokenize(newCurrentTokensText, false, "trimLeadingSpace");
393
+ }
394
+ if (replacementTokens.length > 0) {
395
+ replacementToken = replacementTokens[0];
396
+ queuedTokenRelease.modifyTokensAndText(replacementTokens, model.detokenize([replacementToken]));
397
+ }
390
398
  }
391
399
  }
392
- }
393
- else if (inFunctionEvaluationMode) {
394
- functionCallTokens.push(...tokens);
395
- functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
396
- functionSyntaxEndDetector.recordGeneration({ text, tokens, queuedTokenRelease });
397
- }
398
- if (inFunctionEvaluationMode && functionSyntaxEndDetector.hasTriggeredStops && functionsGrammar != null) {
399
- const functionCallText = model.detokenize(functionCallTokens);
400
- const functionCall = functionsGrammar.parseFunctionCall(functionCallText);
401
- let modelResponse = model.detokenize(res);
402
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
403
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
404
- modelResponse = modelResponse.trimEnd();
405
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
400
+ else if (inFunctionEvaluationMode) {
401
+ functionCallTokens.push(...tokens);
402
+ functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
403
+ functionSyntaxEndDetector.recordGeneration({ text, tokens, queuedTokenRelease });
406
404
  }
407
- return {
408
- response: modelResponse,
409
- lastEvaluation: {
410
- contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
411
- cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
412
- contextShiftMetadata: lastHistoryCompressionMetadata
413
- },
414
- // prevent infinite TS type instantiation
415
- functionCall: functionCall,
416
- metadata: {
417
- stopReason: "functionCall"
405
+ if (inFunctionEvaluationMode && functionSyntaxEndDetector.hasTriggeredStops && functionsGrammar != null) {
406
+ const functionCallText = model.detokenize(functionCallTokens);
407
+ const functionCall = functionsGrammar.parseFunctionCall(functionCallText);
408
+ let modelResponse = model.detokenize(res);
409
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
410
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
411
+ modelResponse = modelResponse.trimEnd();
412
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
418
413
  }
419
- };
420
- }
421
- if (!inFunctionEvaluationMode)
422
- stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
423
- pendingTokens.push(...streamRegulator.popFreeChunkTokens());
424
- removeFoundStartIgnoreTextsFromPendingTokens();
425
- if (stopGenerationDetector.hasTriggeredStops || model.isEogToken(token)) {
426
- const triggeredStops = stopGenerationDetector.getTriggeredStops();
427
- const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
428
- const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
429
- pendingTokens.push(...queuedTokensBeforeStopTrigger);
430
- const [firstRemainingGenerationAfterStop] = triggeredStops
431
- .map((stopTrigger) => stopTrigger.remainingGenerations)
432
- .filter((remainingGenerations) => remainingGenerations.length > 0)
433
- .flat(1);
434
- removeFoundStartIgnoreTextsFromPendingTokens();
435
- if (pendingTokens.length > 0)
436
- onToken?.(pendingTokens.slice());
437
- res.push(...pendingTokens);
438
- contextWindowsRes.push(...pendingTokens);
439
- pendingTokens.length = 0;
440
- let modelResponse = model.detokenize(res);
441
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
442
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
443
- modelResponse = modelResponse.trimEnd();
444
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
414
+ return {
415
+ response: modelResponse,
416
+ lastEvaluation: {
417
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
418
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
419
+ contextShiftMetadata: lastHistoryCompressionMetadata
420
+ },
421
+ // prevent infinite TS type instantiation
422
+ functionCall: functionCall,
423
+ metadata: {
424
+ stopReason: "functionCall"
425
+ }
426
+ };
445
427
  }
446
- return {
447
- response: modelResponse,
448
- lastEvaluation: {
449
- contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
450
- cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
451
- contextShiftMetadata: lastHistoryCompressionMetadata
452
- },
453
- metadata: {
454
- remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
455
- stopReason: model.isEogToken(token)
456
- ? "eogToken"
457
- : "stopGenerationTrigger"
458
- }
459
- };
460
- }
461
- const maxTokensTriggered = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
462
- if (res.length === 0) {
463
- ignoreStartTextDetector.clearInProgressStops();
464
- ignoreStartTextDetector.clearTriggeredStops();
465
- ignoreStartTextDetector.recordGeneration({
466
- text: model.detokenize(pendingTokens),
467
- tokens: pendingTokens
468
- });
469
- }
470
- if (pendingTokens.length > 0 && (maxTokensTriggered || !ignoreStartTextDetector.hasInProgressStops)) {
428
+ if (!inFunctionEvaluationMode) {
429
+ stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
430
+ customStopGenerationTriggersDetector.recordGeneration({ text, tokens, queuedTokenRelease });
431
+ }
432
+ pendingTokens.push(...streamRegulator.popFreeChunkTokens());
471
433
  removeFoundStartIgnoreTextsFromPendingTokens();
472
- if (pendingTokens.length > 0) {
473
- onToken?.(pendingTokens.slice());
434
+ if (stopGenerationDetector.hasTriggeredStops || customStopGenerationTriggersDetector.hasTriggeredStops ||
435
+ model.isEogToken(token)) {
436
+ const triggeredStops = stopGenerationDetector.hasTriggeredStops
437
+ ? stopGenerationDetector.getTriggeredStops()
438
+ : customStopGenerationTriggersDetector.getTriggeredStops();
439
+ const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
440
+ const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
441
+ pendingTokens.push(...queuedTokensBeforeStopTrigger);
442
+ const [firstRemainingGenerationAfterStop] = triggeredStops
443
+ .map((stopTrigger) => stopTrigger.remainingGenerations)
444
+ .filter((remainingGenerations) => remainingGenerations.length > 0)
445
+ .flat(1);
446
+ removeFoundStartIgnoreTextsFromPendingTokens();
447
+ if (pendingTokens.length > 0)
448
+ onToken?.(pendingTokens.slice());
474
449
  res.push(...pendingTokens);
475
450
  contextWindowsRes.push(...pendingTokens);
476
451
  pendingTokens.length = 0;
477
- }
478
- }
479
- if (maxTokensTriggered) {
480
- let modelResponse = model.detokenize(res);
481
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
482
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
483
- modelResponse = modelResponse.trimEnd();
484
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
485
- }
486
- return {
487
- response: modelResponse,
488
- lastEvaluation: {
452
+ let modelResponse = model.detokenize(res);
453
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
454
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
455
+ modelResponse = modelResponse.trimEnd();
456
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
457
+ }
458
+ const lastEvaluation = {
489
459
  contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
490
460
  cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
491
461
  contextShiftMetadata: lastHistoryCompressionMetadata
492
- },
493
- metadata: {
494
- stopReason: "maxTokens"
462
+ };
463
+ const isEogToken = model.isEogToken(token);
464
+ if (isEogToken || stopGenerationDetector.hasTriggeredStops) {
465
+ return {
466
+ response: modelResponse,
467
+ lastEvaluation,
468
+ metadata: {
469
+ remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
470
+ stopReason: isEogToken
471
+ ? "eogToken"
472
+ : "stopGenerationTrigger"
473
+ }
474
+ };
495
475
  }
496
- };
497
- }
498
- if (this._sequence.nextTokenIndex >= context.contextSize) {
499
- shouldContextShift = true;
500
- break;
476
+ return {
477
+ response: modelResponse,
478
+ lastEvaluation,
479
+ metadata: {
480
+ remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
481
+ stopReason: "customStopTrigger",
482
+ customStopTrigger: triggeredStops[0].stopTrigger
483
+ }
484
+ };
485
+ }
486
+ const maxTokensTriggered = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
487
+ if (res.length === 0) {
488
+ ignoreStartTextDetector.clearInProgressStops();
489
+ ignoreStartTextDetector.clearTriggeredStops();
490
+ ignoreStartTextDetector.recordGeneration({
491
+ text: model.detokenize(pendingTokens),
492
+ tokens: pendingTokens
493
+ });
494
+ }
495
+ if (pendingTokens.length > 0 && (maxTokensTriggered || !ignoreStartTextDetector.hasInProgressStops)) {
496
+ removeFoundStartIgnoreTextsFromPendingTokens();
497
+ if (pendingTokens.length > 0) {
498
+ onToken?.(pendingTokens.slice());
499
+ res.push(...pendingTokens);
500
+ contextWindowsRes.push(...pendingTokens);
501
+ pendingTokens.length = 0;
502
+ }
503
+ }
504
+ if (maxTokensTriggered) {
505
+ let modelResponse = model.detokenize(res);
506
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
507
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
508
+ modelResponse = modelResponse.trimEnd();
509
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
510
+ }
511
+ return {
512
+ response: modelResponse,
513
+ lastEvaluation: {
514
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
515
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
516
+ contextShiftMetadata: lastHistoryCompressionMetadata
517
+ },
518
+ metadata: {
519
+ stopReason: "maxTokens"
520
+ }
521
+ };
522
+ }
523
+ if (this._sequence.nextTokenIndex >= context.contextSize - 1) {
524
+ shouldContextShift = true;
525
+ break;
526
+ }
527
+ if (signal?.aborted && stopOnAbortSignal) {
528
+ if (res.length === 0)
529
+ throw signal.reason;
530
+ let modelResponse = model.detokenize(res);
531
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
532
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
533
+ modelResponse = modelResponse.trimEnd();
534
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
535
+ }
536
+ return {
537
+ response: modelResponse,
538
+ lastEvaluation: {
539
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
540
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
541
+ contextShiftMetadata: lastHistoryCompressionMetadata
542
+ },
543
+ metadata: {
544
+ stopReason: "abort"
545
+ }
546
+ };
547
+ }
548
+ currentIteration = await evaluationIterator.next(replacementToken);
501
549
  }
502
- currentIteration = await evaluationIterator.next(replacementToken);
550
+ }
551
+ finally {
552
+ await evaluationIterator.return();
503
553
  }
504
554
  isFirstEvaluation = false;
505
555
  if (shouldContextShift)
@@ -654,7 +704,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
654
704
  : resolvedContextShift.size;
655
705
  const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
656
706
  history: resolvedHistory,
657
- contextShiftSize: Math.max(contextShiftSize, minFreeContextTokens) + pendingTokensCount,
707
+ contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
658
708
  contextShiftStrategy: resolvedContextShift.strategy,
659
709
  contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
660
710
  contextSize: context.contextSize,
@@ -701,7 +751,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
701
751
  : resolvedContextShift.size)));
702
752
  const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
703
753
  history: resolvedHistory,
704
- contextShiftSize: Math.max(contextShiftSize, minFreeContextTokens) + pendingTokensCount,
754
+ contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
705
755
  contextShiftStrategy: resolvedContextShift.strategy,
706
756
  contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
707
757
  contextSize: context.contextSize,