node-llama-cpp 3.0.0-beta.17 → 3.0.0-beta.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167)
  1. package/README.md +8 -5
  2. package/dist/ChatWrapper.js +3 -3
  3. package/dist/ChatWrapper.js.map +1 -1
  4. package/dist/apiDocsOverrides.d.ts +1 -0
  5. package/dist/apiDocsOverrides.js +5 -0
  6. package/dist/apiDocsOverrides.js.map +1 -0
  7. package/dist/bindings/AddonTypes.d.ts +1 -0
  8. package/dist/bindings/Llama.d.ts +6 -0
  9. package/dist/bindings/Llama.js +11 -0
  10. package/dist/bindings/Llama.js.map +1 -1
  11. package/dist/bindings/getLlama.d.ts +6 -2
  12. package/dist/bindings/getLlama.js +11 -4
  13. package/dist/bindings/getLlama.js.map +1 -1
  14. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.d.ts +3 -0
  15. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.js +27 -0
  16. package/dist/bindings/utils/hasBuildingFromSourceDependenciesInstalled.js.map +1 -0
  17. package/dist/chatWrappers/generic/TemplateChatWrapper.d.ts +1 -2
  18. package/dist/chatWrappers/generic/TemplateChatWrapper.js +1 -2
  19. package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -1
  20. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +2 -2
  21. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -1
  22. package/dist/cli/cli.js +5 -3
  23. package/dist/cli/cli.js.map +1 -1
  24. package/dist/cli/commands/DebugCommand.js +3 -5
  25. package/dist/cli/commands/DebugCommand.js.map +1 -1
  26. package/dist/cli/commands/DownloadCommand.d.ts +1 -1
  27. package/dist/cli/commands/DownloadCommand.js +2 -1
  28. package/dist/cli/commands/DownloadCommand.js.map +1 -1
  29. package/dist/cli/commands/PullCommand.d.ts +12 -0
  30. package/dist/cli/commands/PullCommand.js +109 -0
  31. package/dist/cli/commands/PullCommand.js.map +1 -0
  32. package/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts +1 -0
  33. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +23 -11
  34. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
  35. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +2 -1
  36. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
  37. package/dist/cli/recommendedModels.js +12 -20
  38. package/dist/cli/recommendedModels.js.map +1 -1
  39. package/dist/cli/utils/resolveCommandGgufPath.d.ts +3 -1
  40. package/dist/cli/utils/resolveCommandGgufPath.js +41 -97
  41. package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -1
  42. package/dist/cli/utils/resolveModelRecommendationFileOptions.d.ts +2 -2
  43. package/dist/cli/utils/resolveModelRecommendationFileOptions.js +1 -4
  44. package/dist/cli/utils/resolveModelRecommendationFileOptions.js.map +1 -1
  45. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +18 -2
  46. package/dist/evaluator/LlamaChat/LlamaChat.js +258 -205
  47. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  48. package/dist/evaluator/LlamaChat/utils/FunctionCallGrammar.js +1 -2
  49. package/dist/evaluator/LlamaChat/utils/FunctionCallGrammar.js.map +1 -1
  50. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +22 -3
  51. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +18 -7
  52. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  53. package/dist/evaluator/LlamaCompletion.js +2 -2
  54. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  55. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +2 -7
  56. package/dist/evaluator/LlamaContext/LlamaContext.js +12 -12
  57. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  58. package/dist/evaluator/LlamaEmbeddingContext.d.ts +2 -10
  59. package/dist/evaluator/LlamaEmbeddingContext.js +9 -23
  60. package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
  61. package/dist/evaluator/LlamaGrammar.d.ts +3 -3
  62. package/dist/evaluator/LlamaGrammar.js +3 -3
  63. package/dist/evaluator/LlamaGrammar.js.map +1 -1
  64. package/dist/evaluator/LlamaJsonSchemaGrammar.js +2 -3
  65. package/dist/evaluator/LlamaJsonSchemaGrammar.js.map +1 -1
  66. package/dist/evaluator/LlamaModel.d.ts +9 -0
  67. package/dist/evaluator/LlamaModel.js +2 -1
  68. package/dist/evaluator/LlamaModel.js.map +1 -1
  69. package/dist/gguf/insights/GgufInsights.js +12 -12
  70. package/dist/gguf/insights/GgufInsights.js.map +1 -1
  71. package/dist/gguf/insights/utils/resolveContextContextSizeOption.js +27 -3
  72. package/dist/gguf/insights/utils/resolveContextContextSizeOption.js.map +1 -1
  73. package/dist/gguf/parser/parseGguf.js +5 -0
  74. package/dist/gguf/parser/parseGguf.js.map +1 -1
  75. package/dist/gguf/readGgufFileInfo.d.ts +5 -2
  76. package/dist/gguf/readGgufFileInfo.js +38 -10
  77. package/dist/gguf/readGgufFileInfo.js.map +1 -1
  78. package/dist/gguf/types/GgufFileInfoTypes.d.ts +32 -0
  79. package/dist/gguf/types/GgufFileInfoTypes.js.map +1 -1
  80. package/dist/gguf/utils/getGgufMetadataArchitectureData.js +1 -1
  81. package/dist/gguf/utils/getGgufMetadataArchitectureData.js.map +1 -1
  82. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.d.ts +2 -0
  83. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js +39 -0
  84. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js.map +1 -0
  85. package/dist/gguf/utils/resolveSplitGgufParts.d.ts +7 -0
  86. package/dist/gguf/utils/resolveSplitGgufParts.js +55 -0
  87. package/dist/gguf/utils/resolveSplitGgufParts.js.map +1 -0
  88. package/dist/index.d.ts +6 -4
  89. package/dist/index.js +4 -2
  90. package/dist/index.js.map +1 -1
  91. package/dist/utils/LlamaText.d.ts +29 -20
  92. package/dist/utils/LlamaText.js +253 -243
  93. package/dist/utils/LlamaText.js.map +1 -1
  94. package/dist/utils/StopGenerationDetector.d.ts +1 -1
  95. package/dist/utils/StopGenerationDetector.js +2 -0
  96. package/dist/utils/StopGenerationDetector.js.map +1 -1
  97. package/dist/utils/TokenStreamRegulator.d.ts +4 -2
  98. package/dist/utils/TokenStreamRegulator.js +56 -4
  99. package/dist/utils/TokenStreamRegulator.js.map +1 -1
  100. package/dist/utils/createModelDownloader.d.ts +99 -0
  101. package/dist/utils/createModelDownloader.js +226 -0
  102. package/dist/utils/createModelDownloader.js.map +1 -0
  103. package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.js +18 -8
  104. package/dist/utils/findCharacterRemovalCountToFitChatHistoryInContext.js.map +1 -1
  105. package/dist/utils/gbnfJson/getGbnfGrammarForGbnfJsonSchema.d.ts +5 -0
  106. package/dist/utils/gbnfJson/getGbnfGrammarForGbnfJsonSchema.js +11 -0
  107. package/dist/utils/gbnfJson/getGbnfGrammarForGbnfJsonSchema.js.map +1 -0
  108. package/dist/utils/gbnfJson/terminals/GbnfArray.d.ts +3 -1
  109. package/dist/utils/gbnfJson/terminals/GbnfArray.js +10 -5
  110. package/dist/utils/gbnfJson/terminals/GbnfArray.js.map +1 -1
  111. package/dist/utils/gbnfJson/terminals/GbnfObjectMap.d.ts +3 -1
  112. package/dist/utils/gbnfJson/terminals/GbnfObjectMap.js +9 -4
  113. package/dist/utils/gbnfJson/terminals/GbnfObjectMap.js.map +1 -1
  114. package/dist/utils/gbnfJson/terminals/GbnfRepetition.d.ts +9 -0
  115. package/dist/utils/gbnfJson/terminals/GbnfRepetition.js +37 -0
  116. package/dist/utils/gbnfJson/terminals/GbnfRepetition.js.map +1 -0
  117. package/dist/utils/gbnfJson/terminals/GbnfString.js +23 -5
  118. package/dist/utils/gbnfJson/terminals/GbnfString.js.map +1 -1
  119. package/dist/utils/gbnfJson/terminals/GbnfWhitespace.d.ts +6 -3
  120. package/dist/utils/gbnfJson/terminals/GbnfWhitespace.js +37 -9
  121. package/dist/utils/gbnfJson/terminals/GbnfWhitespace.js.map +1 -1
  122. package/dist/utils/gbnfJson/terminals/gbnfConsts.d.ts +5 -4
  123. package/dist/utils/gbnfJson/terminals/gbnfConsts.js +14 -3
  124. package/dist/utils/gbnfJson/terminals/gbnfConsts.js.map +1 -1
  125. package/dist/utils/gbnfJson/utils/GbnfJsonScopeState.d.ts +10 -0
  126. package/dist/utils/gbnfJson/utils/GbnfJsonScopeState.js +15 -0
  127. package/dist/utils/gbnfJson/utils/GbnfJsonScopeState.js.map +1 -0
  128. package/dist/utils/gbnfJson/utils/getGbnfJsonTerminalForGbnfJsonSchema.d.ts +2 -1
  129. package/dist/utils/gbnfJson/utils/getGbnfJsonTerminalForGbnfJsonSchema.js +6 -5
  130. package/dist/utils/gbnfJson/utils/getGbnfJsonTerminalForGbnfJsonSchema.js.map +1 -1
  131. package/dist/utils/parseTextTemplate.d.ts +2 -2
  132. package/dist/utils/parseTextTemplate.js +2 -2
  133. package/dist/utils/runtime.d.ts +4 -0
  134. package/dist/utils/runtime.js +8 -0
  135. package/dist/utils/runtime.js.map +1 -0
  136. package/llama/addon.cpp +18 -7
  137. package/llama/binariesGithubRelease.json +1 -1
  138. package/llama/gitRelease.bundle +0 -0
  139. package/llama/grammars/README.md +1 -1
  140. package/llama/llama.cpp.info.json +1 -1
  141. package/llamaBins/linux-arm64/_nlcBuildMetadata.json +1 -1
  142. package/llamaBins/linux-arm64/llama-addon.node +0 -0
  143. package/llamaBins/linux-armv7l/_nlcBuildMetadata.json +1 -1
  144. package/llamaBins/linux-armv7l/llama-addon.node +0 -0
  145. package/llamaBins/linux-x64/_nlcBuildMetadata.json +1 -1
  146. package/llamaBins/linux-x64/llama-addon.node +0 -0
  147. package/llamaBins/linux-x64-cuda/_nlcBuildMetadata.json +1 -1
  148. package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
  149. package/llamaBins/linux-x64-vulkan/_nlcBuildMetadata.json +1 -1
  150. package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
  151. package/llamaBins/mac-arm64-metal/_nlcBuildMetadata.json +1 -1
  152. package/llamaBins/mac-arm64-metal/default.metallib +0 -0
  153. package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
  154. package/llamaBins/mac-x64/_nlcBuildMetadata.json +1 -1
  155. package/llamaBins/mac-x64/llama-addon.node +0 -0
  156. package/llamaBins/win-arm64/_nlcBuildMetadata.json +1 -1
  157. package/llamaBins/win-arm64/llama-addon.node +0 -0
  158. package/llamaBins/win-x64/_nlcBuildMetadata.json +1 -1
  159. package/llamaBins/win-x64/llama-addon.node +0 -0
  160. package/llamaBins/win-x64-cuda/_nlcBuildMetadata.json +1 -1
  161. package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
  162. package/llamaBins/win-x64-vulkan/_nlcBuildMetadata.json +1 -1
  163. package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
  164. package/package.json +11 -8
  165. package/dist/utils/getGbnfGrammarForGbnfJsonSchema.d.ts +0 -2
  166. package/dist/utils/getGbnfGrammarForGbnfJsonSchema.js +0 -9
  167. package/dist/utils/getGbnfGrammarForGbnfJsonSchema.js.map +0 -1
@@ -72,7 +72,7 @@ export class LlamaChat {
72
72
  get model() {
73
73
  return this.sequence.model;
74
74
  }
75
- async generateResponse(history, { onToken, signal, maxTokens, temperature, minP, topK, topP, grammar, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
75
+ async generateResponse(history, { onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, grammar, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
76
76
  const functionsEnabled = (functions != null && Object.keys(functions).length > 0);
77
77
  if (grammar != null && functionsEnabled)
78
78
  throw new Error("Using both grammar and functions is not supported yet");
@@ -116,6 +116,7 @@ export class LlamaChat {
116
116
  : undefined;
117
117
  const streamRegulator = new TokenStreamRegulator();
118
118
  const stopGenerationDetector = new StopGenerationDetector();
119
+ const customStopGenerationTriggersDetector = new StopGenerationDetector();
119
120
  const functionSyntaxStartDetector = new StopGenerationDetector();
120
121
  const functionSyntaxEndDetector = new StopGenerationDetector();
121
122
  const disengageInitiallyEngagedFunctionMode = new StopGenerationDetector();
@@ -129,7 +130,7 @@ export class LlamaChat {
129
130
  let lastContextWindowHistory = resolvedHistory;
130
131
  let lastHistoryCompressionMetadata = resolvedContextShift.lastEvaluationMetadata;
131
132
  const ensureNotAborted = () => {
132
- if (signal?.aborted)
133
+ if (signal?.aborted && (!stopOnAbortSignal || res.length === 0))
133
134
  throw signal.reason;
134
135
  if (this._sequence == null)
135
136
  throw new DisposedError();
@@ -200,6 +201,9 @@ export class LlamaChat {
200
201
  }
201
202
  }
202
203
  };
204
+ if (customStopTriggers != null)
205
+ StopGenerationDetector.resolveStopTriggers(customStopTriggers, model.tokenizer)
206
+ .map((stopTrigger) => customStopGenerationTriggersDetector.addStopTrigger(stopTrigger));
203
207
  if (grammar != null)
204
208
  StopGenerationDetector.resolveStopTriggers(grammar.stopGenerationTriggers, model.tokenizer)
205
209
  .map((stopTrigger) => stopGenerationDetector.addStopTrigger(stopTrigger));
@@ -214,7 +218,7 @@ export class LlamaChat {
214
218
  resolvedHistory: getResolvedHistoryWithCurrentModelResponse(),
215
219
  resolvedContextShift,
216
220
  lastHistoryCompressionMetadata,
217
- pendingTokensCount: pendingTokens.length + queuedChunkTokens.length,
221
+ pendingTokensCount: ignoredStartTextTokens.length + pendingTokens.length + queuedChunkTokens.length,
218
222
  isFirstEvaluation,
219
223
  chatWrapper: this._chatWrapper,
220
224
  lastEvaluationContextWindowHistory,
@@ -281,225 +285,274 @@ export class LlamaChat {
281
285
  evaluationPriority,
282
286
  yieldEogToken: true
283
287
  }));
284
- let currentIteration = await evaluationIterator.next();
285
- while (currentIteration.done !== true) {
286
- const token = currentIteration.value;
287
- let replacementToken = undefined;
288
- ensureNotAborted();
289
- generatedTokens++;
290
- const tokens = [token];
291
- const text = model.detokenize([token]);
292
- const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
293
- if (initiallyEngagedFunctionMode)
294
- disengageInitiallyEngagedFunctionMode.recordGeneration({ text, tokens, startNewChecks: generatedTokens === 1 });
295
- if (text === UNKNOWN_UNICODE_CHAR || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "")) {
296
- locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
297
- }
298
- else {
299
- while (locksToReleaseOnValidGeneration.length > 0)
300
- locksToReleaseOnValidGeneration.shift().dispose();
301
- }
302
- functionSyntaxStartDetector.recordGeneration({ text, tokens, queuedTokenRelease });
303
- if (initiallyEngagedFunctionMode && disengageInitiallyEngagedFunctionMode.hasTriggeredStops) {
304
- initiallyEngagedFunctionMode = false;
305
- let shouldStopFunctionEvaluationMode = !functionSyntaxStartDetector.hasTriggeredStops;
306
- if (!shouldStopFunctionEvaluationMode && functionsEnabled && functionsGrammar != null) {
307
- const functionCallText = model.detokenize([...functionCallTokens, ...tokens]);
308
- try {
309
- const functionName = functionsGrammar.parseFunctionNameFromPartialCall(functionCallText, {
310
- enableInternalBuiltinFunctions: true,
311
- initialFunctionCallEngaged: true
312
- });
313
- const internalBuiltinFunctions = this._chatWrapper.getInternalBuiltinFunctions({ initialFunctionCallEngaged: true });
314
- if (internalBuiltinFunctions[functionName] != null) {
315
- shouldStopFunctionEvaluationMode = true;
316
- }
317
- }
318
- catch (err) {
319
- if (!(err instanceof LlamaFunctionCallValidationError))
320
- throw err;
321
- }
288
+ try {
289
+ let currentIteration = await evaluationIterator.next();
290
+ while (currentIteration.done !== true) {
291
+ const token = currentIteration.value;
292
+ let replacementToken = undefined;
293
+ ensureNotAborted();
294
+ generatedTokens++;
295
+ const tokens = [token];
296
+ const text = model.detokenize([token]);
297
+ const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
298
+ if (initiallyEngagedFunctionMode)
299
+ disengageInitiallyEngagedFunctionMode.recordGeneration({ text, tokens, startNewChecks: generatedTokens === 1 });
300
+ if (text === UNKNOWN_UNICODE_CHAR || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "")) {
301
+ locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
322
302
  }
323
- if (shouldStopFunctionEvaluationMode) {
324
- inFunctionEvaluationMode = false;
325
- functionsGrammar = new FunctionCallGrammar(model._llama, functions, this._chatWrapper, false);
326
- functionsEvaluationState = new LlamaGrammarEvaluationState({
327
- grammar: functionsGrammar
328
- });
329
- functionCallTokens.length = 0;
330
- while (functionCallTokenSyntaxLocks.length > 0)
331
- functionCallTokenSyntaxLocks.shift().dispose();
332
- functionSyntaxStartDetector.clearInProgressStops();
333
- functionSyntaxStartDetector.clearTriggeredStops();
334
- functionSyntaxEndDetector.clearInProgressStops();
335
- functionSyntaxEndDetector.clearTriggeredStops();
303
+ else {
304
+ while (locksToReleaseOnValidGeneration.length > 0)
305
+ locksToReleaseOnValidGeneration.shift().dispose();
336
306
  }
337
- }
338
- if (!inFunctionEvaluationMode && functionsEnabled && functionsGrammar != null &&
339
- functionSyntaxStartDetector.hasTriggeredStops && functionsEvaluationState != null) {
340
- inFunctionEvaluationMode = true;
341
- functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
342
- stopGenerationDetector.clearTriggeredStops();
343
- stopGenerationDetector.clearInProgressStops();
344
- pendingTokens.push(...streamRegulator.popFreeChunkTokens());
345
- const triggeredStops = functionSyntaxStartDetector.getTriggeredStops();
346
- const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
347
- const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
348
- pendingTokens.push(...queuedTokensBeforeStopTrigger);
349
- const [firstRemainingGenerationAfterStop] = triggeredStops
350
- .map((stopTrigger) => stopTrigger.remainingGenerations)
351
- .filter((remainingGenerations) => remainingGenerations.length > 0)
352
- .flat(1);
353
- const remainingTextAfterStop = (firstRemainingGenerationAfterStop == null || firstRemainingGenerationAfterStop.length === 0)
354
- ? ""
355
- : typeof firstRemainingGenerationAfterStop === "string"
356
- ? firstRemainingGenerationAfterStop
357
- : model.detokenize(firstRemainingGenerationAfterStop);
358
- functionCallTokens.push(...model.tokenize(this._chatWrapper.settings.functions.call.prefix, false, "trimLeadingSpace"));
359
- for (const functionCallToken of functionCallTokens)
360
- context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, functionCallToken);
361
- // these tokens have to be verified that they match the function calling syntax grammar before they can be accepted,
362
- // or the context state should be modified to not include the incompatible tokens
363
- const remainingTextTokens = model.tokenize(remainingTextAfterStop, false, "trimLeadingSpace");
364
- let unfitTokens = [];
365
- for (let i = 0; i < remainingTextTokens.length; i++) {
366
- const remainingToken = remainingTextTokens[i];
367
- const canBeNextToken = context._canBeNextTokenForGrammarEvaluationState(functionsEvaluationState, remainingToken);
368
- if (!canBeNextToken) {
369
- unfitTokens = remainingTextTokens.slice(i);
370
- break;
307
+ functionSyntaxStartDetector.recordGeneration({ text, tokens, queuedTokenRelease });
308
+ if (initiallyEngagedFunctionMode && disengageInitiallyEngagedFunctionMode.hasTriggeredStops) {
309
+ initiallyEngagedFunctionMode = false;
310
+ let shouldStopFunctionEvaluationMode = !functionSyntaxStartDetector.hasTriggeredStops;
311
+ if (!shouldStopFunctionEvaluationMode && functionsEnabled && functionsGrammar != null) {
312
+ const functionCallText = model.detokenize([...functionCallTokens, ...tokens]);
313
+ try {
314
+ const functionName = functionsGrammar.parseFunctionNameFromPartialCall(functionCallText, {
315
+ enableInternalBuiltinFunctions: true,
316
+ initialFunctionCallEngaged: true
317
+ });
318
+ const internalBuiltinFunctions = this._chatWrapper.getInternalBuiltinFunctions({ initialFunctionCallEngaged: true });
319
+ if (internalBuiltinFunctions[functionName] != null) {
320
+ shouldStopFunctionEvaluationMode = true;
321
+ }
322
+ }
323
+ catch (err) {
324
+ if (!(err instanceof LlamaFunctionCallValidationError))
325
+ throw err;
326
+ }
371
327
  }
372
- context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, remainingToken);
373
- functionCallTokens.push(remainingToken);
374
- }
375
- if (unfitTokens.length > 0) {
376
- const unfitTokensText = model.detokenize(unfitTokens); // the current token text must end with it
377
- const currentTokenText = queuedTokenRelease.text;
378
- let replacementTokens;
379
- if (!currentTokenText.endsWith(unfitTokensText)) {
380
- console.warn(getConsoleLogPrefix() + "The current token text does not end with the unfit function call syntax tokens text");
381
- replacementTokens = remainingTextTokens.slice(0, -unfitTokens.length);
328
+ if (shouldStopFunctionEvaluationMode) {
329
+ inFunctionEvaluationMode = false;
330
+ functionsGrammar = new FunctionCallGrammar(model._llama, functions, this._chatWrapper, false);
331
+ functionsEvaluationState = new LlamaGrammarEvaluationState({
332
+ grammar: functionsGrammar
333
+ });
334
+ functionCallTokens.length = 0;
335
+ while (functionCallTokenSyntaxLocks.length > 0)
336
+ functionCallTokenSyntaxLocks.shift().dispose();
337
+ functionSyntaxStartDetector.clearInProgressStops();
338
+ functionSyntaxStartDetector.clearTriggeredStops();
339
+ functionSyntaxEndDetector.clearInProgressStops();
340
+ functionSyntaxEndDetector.clearTriggeredStops();
382
341
  }
383
- else {
384
- const newCurrentTokensText = currentTokenText.slice(0, -unfitTokensText.length);
385
- replacementTokens = model.tokenize(newCurrentTokensText, false, "trimLeadingSpace");
342
+ }
343
+ if (!inFunctionEvaluationMode && functionsEnabled && functionsGrammar != null &&
344
+ functionSyntaxStartDetector.hasTriggeredStops && functionsEvaluationState != null) {
345
+ inFunctionEvaluationMode = true;
346
+ functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
347
+ stopGenerationDetector.clearTriggeredStops();
348
+ stopGenerationDetector.clearInProgressStops();
349
+ customStopGenerationTriggersDetector.clearTriggeredStops();
350
+ customStopGenerationTriggersDetector.clearInProgressStops();
351
+ pendingTokens.push(...streamRegulator.popFreeChunkTokens());
352
+ const triggeredStops = functionSyntaxStartDetector.getTriggeredStops();
353
+ const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk(model.tokenizer);
354
+ const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
355
+ pendingTokens.push(...queuedTokensBeforeStopTrigger);
356
+ const [firstRemainingGenerationAfterStop] = triggeredStops
357
+ .map((stopTrigger) => stopTrigger.remainingGenerations)
358
+ .filter((remainingGenerations) => remainingGenerations.length > 0)
359
+ .flat(1);
360
+ const remainingTextAfterStop = (firstRemainingGenerationAfterStop == null || firstRemainingGenerationAfterStop.length === 0)
361
+ ? ""
362
+ : typeof firstRemainingGenerationAfterStop === "string"
363
+ ? firstRemainingGenerationAfterStop
364
+ : model.detokenize(firstRemainingGenerationAfterStop);
365
+ functionCallTokens.push(...model.tokenize(this._chatWrapper.settings.functions.call.prefix, false, "trimLeadingSpace"));
366
+ for (const functionCallToken of functionCallTokens)
367
+ context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, functionCallToken);
368
+ // these tokens have to be verified that they match the function calling syntax grammar before they can be accepted,
369
+ // or the context state should be modified to not include the incompatible tokens
370
+ const remainingTextTokens = model.tokenize(remainingTextAfterStop, false, "trimLeadingSpace");
371
+ let unfitTokens = [];
372
+ for (let i = 0; i < remainingTextTokens.length; i++) {
373
+ const remainingToken = remainingTextTokens[i];
374
+ const canBeNextToken = context._canBeNextTokenForGrammarEvaluationState(functionsEvaluationState, remainingToken);
375
+ if (!canBeNextToken) {
376
+ unfitTokens = remainingTextTokens.slice(i);
377
+ break;
378
+ }
379
+ context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, remainingToken);
380
+ functionCallTokens.push(remainingToken);
386
381
  }
387
- if (replacementTokens.length > 0) {
388
- replacementToken = replacementTokens[0];
389
- queuedTokenRelease.modifyTokensAndText(replacementTokens, model.detokenize([replacementToken]));
382
+ if (unfitTokens.length > 0) {
383
+ const unfitTokensText = model.detokenize(unfitTokens); // the current token text must end with it
384
+ const currentTokenText = queuedTokenRelease.text;
385
+ let replacementTokens;
386
+ if (!currentTokenText.endsWith(unfitTokensText)) {
387
+ console.warn(getConsoleLogPrefix() + "The current token text does not end with the unfit function call syntax tokens text");
388
+ replacementTokens = remainingTextTokens.slice(0, -unfitTokens.length);
389
+ }
390
+ else {
391
+ const newCurrentTokensText = currentTokenText.slice(0, -unfitTokensText.length);
392
+ replacementTokens = model.tokenize(newCurrentTokensText, false, "trimLeadingSpace");
393
+ }
394
+ if (replacementTokens.length > 0) {
395
+ replacementToken = replacementTokens[0];
396
+ queuedTokenRelease.modifyTokensAndText(replacementTokens, model.detokenize([replacementToken]));
397
+ }
390
398
  }
391
399
  }
392
- }
393
- else if (inFunctionEvaluationMode) {
394
- functionCallTokens.push(...tokens);
395
- functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
396
- functionSyntaxEndDetector.recordGeneration({ text, tokens, queuedTokenRelease });
397
- }
398
- if (inFunctionEvaluationMode && functionSyntaxEndDetector.hasTriggeredStops && functionsGrammar != null) {
399
- const functionCallText = model.detokenize(functionCallTokens);
400
- const functionCall = functionsGrammar.parseFunctionCall(functionCallText);
401
- let modelResponse = model.detokenize(res);
402
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
403
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
404
- modelResponse = modelResponse.trimEnd();
405
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
400
+ else if (inFunctionEvaluationMode) {
401
+ functionCallTokens.push(...tokens);
402
+ functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0));
403
+ functionSyntaxEndDetector.recordGeneration({ text, tokens, queuedTokenRelease });
406
404
  }
407
- return {
408
- response: modelResponse,
409
- lastEvaluation: {
410
- contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
411
- cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
412
- contextShiftMetadata: lastHistoryCompressionMetadata
413
- },
414
- // prevent infinite TS type instantiation
415
- functionCall: functionCall,
416
- metadata: {
417
- stopReason: "functionCall"
405
+ if (inFunctionEvaluationMode && functionSyntaxEndDetector.hasTriggeredStops && functionsGrammar != null) {
406
+ const functionCallText = model.detokenize(functionCallTokens);
407
+ const functionCall = functionsGrammar.parseFunctionCall(functionCallText);
408
+ let modelResponse = model.detokenize(res);
409
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
410
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
411
+ modelResponse = modelResponse.trimEnd();
412
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
418
413
  }
419
- };
420
- }
421
- if (!inFunctionEvaluationMode)
422
- stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
423
- pendingTokens.push(...streamRegulator.popFreeChunkTokens());
424
- removeFoundStartIgnoreTextsFromPendingTokens();
425
- if (stopGenerationDetector.hasTriggeredStops || model.isEogToken(token)) {
426
- const triggeredStops = stopGenerationDetector.getTriggeredStops();
427
- const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk();
428
- const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
429
- pendingTokens.push(...queuedTokensBeforeStopTrigger);
430
- const [firstRemainingGenerationAfterStop] = triggeredStops
431
- .map((stopTrigger) => stopTrigger.remainingGenerations)
432
- .filter((remainingGenerations) => remainingGenerations.length > 0)
433
- .flat(1);
434
- removeFoundStartIgnoreTextsFromPendingTokens();
435
- if (pendingTokens.length > 0)
436
- onToken?.(pendingTokens.slice());
437
- res.push(...pendingTokens);
438
- contextWindowsRes.push(...pendingTokens);
439
- pendingTokens.length = 0;
440
- let modelResponse = model.detokenize(res);
441
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
442
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
443
- modelResponse = modelResponse.trimEnd();
444
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
414
+ return {
415
+ response: modelResponse,
416
+ lastEvaluation: {
417
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
418
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
419
+ contextShiftMetadata: lastHistoryCompressionMetadata
420
+ },
421
+ // prevent infinite TS type instantiation
422
+ functionCall: functionCall,
423
+ metadata: {
424
+ stopReason: "functionCall"
425
+ }
426
+ };
445
427
  }
446
- return {
447
- response: modelResponse,
448
- lastEvaluation: {
449
- contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
450
- cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
451
- contextShiftMetadata: lastHistoryCompressionMetadata
452
- },
453
- metadata: {
454
- remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
455
- stopReason: model.isEogToken(token)
456
- ? "eogToken"
457
- : "stopGenerationTrigger"
458
- }
459
- };
460
- }
461
- const maxTokensTriggered = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
462
- if (res.length === 0) {
463
- ignoreStartTextDetector.clearInProgressStops();
464
- ignoreStartTextDetector.clearTriggeredStops();
465
- ignoreStartTextDetector.recordGeneration({
466
- text: model.detokenize(pendingTokens),
467
- tokens: pendingTokens
468
- });
469
- }
470
- if (pendingTokens.length > 0 && (maxTokensTriggered || !ignoreStartTextDetector.hasInProgressStops)) {
428
+ if (!inFunctionEvaluationMode) {
429
+ stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
430
+ customStopGenerationTriggersDetector.recordGeneration({ text, tokens, queuedTokenRelease });
431
+ }
432
+ pendingTokens.push(...streamRegulator.popFreeChunkTokens());
471
433
  removeFoundStartIgnoreTextsFromPendingTokens();
472
- if (pendingTokens.length > 0) {
473
- onToken?.(pendingTokens.slice());
434
+ if (stopGenerationDetector.hasTriggeredStops || customStopGenerationTriggersDetector.hasTriggeredStops ||
435
+ model.isEogToken(token)) {
436
+ stopGenerationDetector.clearInProgressStops();
437
+ customStopGenerationTriggersDetector.clearInProgressStops();
438
+ pendingTokens.push(...streamRegulator.popFreeChunkTokens());
439
+ const triggeredStops = stopGenerationDetector.hasTriggeredStops
440
+ ? stopGenerationDetector.getTriggeredStops()
441
+ : customStopGenerationTriggersDetector.getTriggeredStops();
442
+ const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk(model.tokenizer);
443
+ const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
444
+ pendingTokens.push(...queuedTokensBeforeStopTrigger);
445
+ const [firstRemainingGenerationAfterStop] = triggeredStops
446
+ .map((stopTrigger) => stopTrigger.remainingGenerations)
447
+ .filter((remainingGenerations) => remainingGenerations.length > 0)
448
+ .flat(1);
449
+ removeFoundStartIgnoreTextsFromPendingTokens();
450
+ if (pendingTokens.length > 0)
451
+ onToken?.(pendingTokens.slice());
474
452
  res.push(...pendingTokens);
475
453
  contextWindowsRes.push(...pendingTokens);
476
454
  pendingTokens.length = 0;
477
- }
478
- }
479
- if (maxTokensTriggered) {
480
- let modelResponse = model.detokenize(res);
481
- let contextWindowModelResponse = model.detokenize(contextWindowsRes);
482
- if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
483
- modelResponse = modelResponse.trimEnd();
484
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
485
- }
486
- return {
487
- response: modelResponse,
488
- lastEvaluation: {
455
+ let modelResponse = model.detokenize(res);
456
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
457
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
458
+ modelResponse = modelResponse.trimEnd();
459
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
460
+ }
461
+ const lastEvaluation = {
489
462
  contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
490
463
  cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
491
464
  contextShiftMetadata: lastHistoryCompressionMetadata
492
- },
493
- metadata: {
494
- stopReason: "maxTokens"
465
+ };
466
+ const isEogToken = model.isEogToken(token);
467
+ if (isEogToken || stopGenerationDetector.hasTriggeredStops) {
468
+ return {
469
+ response: modelResponse,
470
+ lastEvaluation,
471
+ metadata: {
472
+ remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
473
+ stopReason: isEogToken
474
+ ? "eogToken"
475
+ : "stopGenerationTrigger"
476
+ }
477
+ };
495
478
  }
496
- };
497
- }
498
- if (this._sequence.nextTokenIndex >= context.contextSize) {
499
- shouldContextShift = true;
500
- break;
479
+ return {
480
+ response: modelResponse,
481
+ lastEvaluation,
482
+ metadata: {
483
+ remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
484
+ stopReason: "customStopTrigger",
485
+ customStopTrigger: triggeredStops[0].stopTrigger
486
+ }
487
+ };
488
+ }
489
+ const maxTokensTriggered = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
490
+ if (res.length === 0) {
491
+ ignoreStartTextDetector.clearInProgressStops();
492
+ ignoreStartTextDetector.clearTriggeredStops();
493
+ ignoreStartTextDetector.recordGeneration({
494
+ text: model.detokenize(pendingTokens),
495
+ tokens: pendingTokens
496
+ });
497
+ }
498
+ if (pendingTokens.length > 0 && (maxTokensTriggered || !ignoreStartTextDetector.hasInProgressStops)) {
499
+ removeFoundStartIgnoreTextsFromPendingTokens();
500
+ if (pendingTokens.length > 0) {
501
+ onToken?.(pendingTokens.slice());
502
+ res.push(...pendingTokens);
503
+ contextWindowsRes.push(...pendingTokens);
504
+ pendingTokens.length = 0;
505
+ }
506
+ }
507
+ if (maxTokensTriggered) {
508
+ let modelResponse = model.detokenize(res);
509
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
510
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
511
+ modelResponse = modelResponse.trimEnd();
512
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
513
+ }
514
+ return {
515
+ response: modelResponse,
516
+ lastEvaluation: {
517
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
518
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
519
+ contextShiftMetadata: lastHistoryCompressionMetadata
520
+ },
521
+ metadata: {
522
+ stopReason: "maxTokens"
523
+ }
524
+ };
525
+ }
526
+ if (this._sequence.nextTokenIndex >= context.contextSize - 1) {
527
+ shouldContextShift = true;
528
+ break;
529
+ }
530
+ if (signal?.aborted && stopOnAbortSignal) {
531
+ if (res.length === 0)
532
+ throw signal.reason;
533
+ let modelResponse = model.detokenize(res);
534
+ let contextWindowModelResponse = model.detokenize(contextWindowsRes);
535
+ if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) {
536
+ modelResponse = modelResponse.trimEnd();
537
+ contextWindowModelResponse = contextWindowModelResponse.trimEnd();
538
+ }
539
+ return {
540
+ response: modelResponse,
541
+ lastEvaluation: {
542
+ contextWindow: setLastModelTextResponseInChatHistory(lastContextWindowHistory, contextWindowLastModelResponse + contextWindowModelResponse),
543
+ cleanHistory: setLastModelTextResponseInChatHistory(resolvedHistory, lastModelResponse + modelResponse),
544
+ contextShiftMetadata: lastHistoryCompressionMetadata
545
+ },
546
+ metadata: {
547
+ stopReason: "abort"
548
+ }
549
+ };
550
+ }
551
+ currentIteration = await evaluationIterator.next(replacementToken);
501
552
  }
502
- currentIteration = await evaluationIterator.next(replacementToken);
553
+ }
554
+ finally {
555
+ await evaluationIterator.return();
503
556
  }
504
557
  isFirstEvaluation = false;
505
558
  if (shouldContextShift)
@@ -654,7 +707,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
654
707
  : resolvedContextShift.size;
655
708
  const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
656
709
  history: resolvedHistory,
657
- contextShiftSize: Math.max(contextShiftSize, minFreeContextTokens) + pendingTokensCount,
710
+ contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
658
711
  contextShiftStrategy: resolvedContextShift.strategy,
659
712
  contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
660
713
  contextSize: context.contextSize,
@@ -701,7 +754,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
701
754
  : resolvedContextShift.size)));
702
755
  const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
703
756
  history: resolvedHistory,
704
- contextShiftSize: Math.max(contextShiftSize, minFreeContextTokens) + pendingTokensCount,
757
+ contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
705
758
  contextShiftStrategy: resolvedContextShift.strategy,
706
759
  contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
707
760
  contextSize: context.contextSize,