node-llama-cpp 3.15.1 → 3.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/dist/bindings/AddonTypes.d.ts +8 -1
  2. package/dist/bindings/getLlama.d.ts +1 -1
  3. package/dist/bindings/getLlama.js +1 -1
  4. package/dist/bindings/getLlama.js.map +1 -1
  5. package/dist/chatWrappers/generic/utils/extractFunctionCallSettingsFromJinjaTemplate.js +67 -8
  6. package/dist/chatWrappers/generic/utils/extractFunctionCallSettingsFromJinjaTemplate.js.map +1 -1
  7. package/dist/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.js +2 -1
  8. package/dist/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.js.map +1 -1
  9. package/dist/cli/commands/ChatCommand.d.ts +6 -0
  10. package/dist/cli/commands/ChatCommand.js +66 -3
  11. package/dist/cli/commands/ChatCommand.js.map +1 -1
  12. package/dist/cli/commands/CompleteCommand.d.ts +6 -0
  13. package/dist/cli/commands/CompleteCommand.js +66 -4
  14. package/dist/cli/commands/CompleteCommand.js.map +1 -1
  15. package/dist/cli/commands/InfillCommand.d.ts +6 -0
  16. package/dist/cli/commands/InfillCommand.js +66 -4
  17. package/dist/cli/commands/InfillCommand.js.map +1 -1
  18. package/dist/cli/utils/parseXtcArg.d.ts +5 -0
  19. package/dist/cli/utils/parseXtcArg.js +16 -0
  20. package/dist/cli/utils/parseXtcArg.js.map +1 -0
  21. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +36 -1
  22. package/dist/evaluator/LlamaChat/LlamaChat.js +29 -10
  23. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  24. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +83 -2
  25. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +11 -5
  26. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  27. package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.d.ts +2 -0
  28. package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js.map +1 -1
  29. package/dist/evaluator/LlamaCompletion.d.ts +36 -3
  30. package/dist/evaluator/LlamaCompletion.js +7 -4
  31. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  32. package/dist/evaluator/LlamaContext/LlamaContext.js +67 -35
  33. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  34. package/dist/evaluator/LlamaContext/LlamaSampler.js +8 -0
  35. package/dist/evaluator/LlamaContext/LlamaSampler.js.map +1 -1
  36. package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts +1 -1
  37. package/dist/evaluator/LlamaContext/types.d.ts +113 -0
  38. package/dist/evaluator/LlamaModel/LlamaModel.d.ts +2 -2
  39. package/dist/evaluator/LlamaModel/LlamaModel.js +1 -1
  40. package/dist/evaluator/LlamaModel/LlamaModel.js.map +1 -1
  41. package/dist/gguf/insights/GgufInsights.js +4 -0
  42. package/dist/gguf/insights/GgufInsights.js.map +1 -1
  43. package/dist/gguf/types/GgufMetadataTypes.d.ts +5 -0
  44. package/dist/gguf/types/GgufMetadataTypes.js +5 -0
  45. package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
  46. package/dist/tsconfig.tsbuildinfo +1 -1
  47. package/dist/types.d.ts +51 -0
  48. package/dist/types.js.map +1 -1
  49. package/dist/utils/cmake.js +6 -3
  50. package/dist/utils/cmake.js.map +1 -1
  51. package/llama/addon/AddonContext.cpp +19 -5
  52. package/llama/addon/AddonContext.h +1 -1
  53. package/llama/addon/AddonSampler.cpp +158 -0
  54. package/llama/addon/AddonSampler.h +13 -1
  55. package/llama/addon/globals/getGpuInfo.cpp +1 -1
  56. package/llama/binariesGithubRelease.json +1 -1
  57. package/llama/gitRelease.bundle +0 -0
  58. package/llama/gpuInfo/vulkan-gpu-info.cpp +12 -5
  59. package/llama/llama.cpp.info.json +1 -1
  60. package/package.json +63 -62
  61. package/templates/packed/electron-typescript-react.json +1 -1
  62. package/templates/packed/node-typescript.json +1 -1
package/dist/cli/utils/parseXtcArg.d.ts
@@ -0,0 +1,5 @@
+ export declare function parseXtcArg(xtcString?: string): ParsedXtcArg | undefined;
+ export type ParsedXtcArg = {
+     probability: number;
+     threshold: number;
+ };
package/dist/cli/utils/parseXtcArg.js
@@ -0,0 +1,16 @@
+ const xtcArgFormat = /^(\d+|\d*\.\d+),(\d*|\d?\.\d+)$/;
+ export function parseXtcArg(xtcString) {
+     if (xtcString == null || xtcString === "")
+         return undefined;
+     const match = xtcString.match(xtcArgFormat);
+     if (match != null && match[1] != null && match[2] != null) {
+         const probability = parseFloat(match[1]);
+         const threshold = parseFloat(match[2]);
+         if (probability >= 0 && probability <= 1 && threshold >= 0 && threshold <= 1) {
+             return { probability, threshold };
+         }
+     }
+     throw new Error(`Invalid xtc argument: ${xtcString}. ` +
+         'Expected format: "probability,threshold" where probability and threshold are numbers between 0 and 1.');
+ }
+ //# sourceMappingURL=parseXtcArg.js.map
package/dist/cli/utils/parseXtcArg.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"parseXtcArg.js","sourceRoot":"","sources":["../../../src/cli/utils/parseXtcArg.ts"],"names":[],"mappings":"AAAA,MAAM,YAAY,GAAG,iCAAiC,CAAC;AAEvD,MAAM,UAAU,WAAW,CAAC,SAAkB;IAC1C,IAAI,SAAS,IAAI,IAAI,IAAI,SAAS,KAAK,EAAE;QACrC,OAAO,SAAS,CAAC;IAErB,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;IAC5C,IAAI,KAAK,IAAI,IAAI,IAAI,KAAK,CAAC,CAAC,CAAC,IAAI,IAAI,IAAI,KAAK,CAAC,CAAC,CAAC,IAAI,IAAI,EAAE,CAAC;QACxD,MAAM,WAAW,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,SAAS,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAEvC,IAAI,WAAW,IAAI,CAAC,IAAI,WAAW,IAAI,CAAC,IAAI,SAAS,IAAI,CAAC,IAAI,SAAS,IAAI,CAAC,EAAE,CAAC;YAC3E,OAAO,EAAC,WAAW,EAAE,SAAS,EAAC,CAAC;QACpC,CAAC;IACL,CAAC;IAED,MAAM,IAAI,KAAK,CACX,yBAAyB,SAAS,IAAI;QACtC,uGAAuG,CAC1G,CAAC;AACN,CAAC"}
package/dist/evaluator/LlamaChat/LlamaChat.d.ts
@@ -1,7 +1,7 @@
  import { EventRelay } from "lifecycle-utils";
  import { ChatWrapper } from "../../ChatWrapper.js";
  import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
- import { ChatHistoryItem, ChatModelFunctions, ChatModelSegmentType, LLamaContextualRepeatPenalty, Token, Tokenizer } from "../../types.js";
+ import { ChatHistoryItem, ChatModelFunctions, ChatModelSegmentType, LLamaContextualRepeatPenalty, Token, Tokenizer, LLamaContextualDryRepeatPenalty } from "../../types.js";
  import { GbnfJsonSchemaToType } from "../../utils/gbnfJson/types.js";
  import { LlamaGrammar } from "../LlamaGrammar.js";
  import { LlamaText, LlamaTextJSON } from "../../utils/LlamaText.js";
@@ -191,6 +191,28 @@ export type LLamaChatGenerateResponseOptions<Functions extends ChatModelFunction
       * Only relevant when using `temperature`.
       */
      seed?: number;
+     /**
+      * Exclude Top Choices (XTC) removes the top tokens from consideration and avoids more obvious and repetitive generations.
+      * Using it leads to more creative responses, but also to increased hallucinations.
+      *
+      * The `probability` value controls the chance that the top tokens will be removed in the next token generation step.
+      * The `threshold` value controls the minimum probability of a token for it to be removed.
+      *
+      * Start with `{probability: 0.5, threshold: 0.1}` and adjust from there.
+      *
+      * Disabled by default.
+      */
+     xtc?: {
+         /**
+          * A number between `0` and `1` representing the probability of applying Exclude Top Choices (XTC) at each token generation step.
+          */
+         probability: number;
+         /**
+          * A number between `0` and `1` representing the minimum probability
+          * of a token for it to be removed when applying Exclude Top Choices (XTC).
+          */
+         threshold: number;
+     };
      /**
       * Trim whitespace from the end of the generated text
       *
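The doc comment above describes XTC behaviorally. As a rough sketch of the sampling step it refers to (an illustration of the general XTC technique, not code from this package): with chance `probability`, every candidate token whose probability is at or above `threshold` is dropped except the least likely of them, so sampling falls through to less obvious choices.

```ts
// Illustrative sketch of an Exclude Top Choices (XTC) step; not this package's implementation.
// `candidates` is assumed to be sorted by probability, highest first.
function applyXtc(
    candidates: Array<{token: number, prob: number}>,
    probability: number,
    threshold: number,
    random: () => number = Math.random
) {
    if (random() >= probability)
        return candidates; // XTC is skipped for this generation step

    // the "top choices" are all tokens at or above the threshold
    const topChoices = candidates.filter((candidate) => candidate.prob >= threshold);
    if (topChoices.length < 2)
        return candidates; // nothing to exclude without emptying the distribution

    // drop every top choice except the least likely one
    const keptTopChoice = topChoices[topChoices.length - 1]!;
    return candidates.filter((candidate) => candidate.prob < threshold || candidate === keptTopChoice);
}
```

This is why raising `threshold` makes XTC trigger less often (fewer tokens qualify as "top choices"), while `probability` controls how often a qualifying step is actually modified.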
@@ -198,6 +220,17 @@ export type LLamaChatGenerateResponseOptions<Functions extends ChatModelFunction
       */
      trimWhitespaceSuffix?: boolean;
      repeatPenalty?: false | LLamaContextualRepeatPenalty;
+     /**
+      * DRY (Don't Repeat Yourself) penalty is a technique to reduce repetitions in the generated text
+      * by penalizing tokens based on recent token usage patterns.
+      *
+      * With the right choice of parameters, it makes it impossible for the model to
+      * repeat itself verbatim with the same tokens in the same order (the model can still repeat itself by
+      * using different tokens or by paraphrasing, but that is far less of an issue than broken-record looping).
+      *
+      * Disabled by default.
+      */
+     dryRepeatPenalty?: LLamaContextualDryRepeatPenalty;
      /**
       * Adjust the probability of tokens being generated.
       * Can be used to bias the model to generate tokens that you want it to lean towards,
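For context on what the option does: DRY, as popularized in the wider llama.cpp ecosystem, tracks how long a suffix of the generated text has already occurred earlier in the context and penalizes the token that would extend that repetition, with the penalty growing exponentially in the repetition length. A sketch of that commonly cited formula follows; the parameter names are illustrative and are not necessarily the fields of `LLamaContextualDryRepeatPenalty`, which this diff does not show:

```ts
// Illustrative DRY penalty formula (parameter names are assumptions, not this package's API).
function dryPenalty(
    matchLength: number,   // length of the repeated suffix the candidate token would extend
    multiplier: number,    // overall penalty strength, e.g. 0.8
    base: number,          // exponential growth factor, e.g. 1.75
    allowedLength: number  // repetitions up to this length go unpenalized, e.g. 2
): number {
    if (matchLength < allowedLength)
        return 0;

    // subtracted from the candidate token's logit before sampling;
    // longer verbatim repetitions are penalized exponentially harder
    return multiplier * Math.pow(base, matchLength - allowedLength);
}
```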
@@ -321,8 +354,10 @@ export type LLamaChatLoadAndCompleteUserMessageOptions<Functions extends ChatMod
      topK?: LLamaChatGenerateResponseOptions<Functions>["topK"];
      topP?: LLamaChatGenerateResponseOptions<Functions>["topP"];
      seed?: LLamaChatGenerateResponseOptions<Functions>["seed"];
+     xtc?: LLamaChatGenerateResponseOptions<Functions>["xtc"];
      trimWhitespaceSuffix?: LLamaChatGenerateResponseOptions<Functions>["trimWhitespaceSuffix"];
      repeatPenalty?: LLamaChatGenerateResponseOptions<Functions>["repeatPenalty"];
+     dryRepeatPenalty?: LLamaChatGenerateResponseOptions<Functions>["dryRepeatPenalty"];
      tokenBias?: LLamaChatGenerateResponseOptions<Functions>["tokenBias"];
      evaluationPriority?: LLamaChatGenerateResponseOptions<Functions>["evaluationPriority"];
      contextShift?: LLamaChatGenerateResponseOptions<Functions>["contextShift"];
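Putting the new options together, here is a minimal usage sketch, assuming they are surfaced on `LlamaChatSession.prompt()` the same way they are typed here (the `LlamaChatSession.d.ts` changes in the file list suggest so); the model path and parameter values are placeholders:

```ts
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

const answer = await session.prompt("Write a short story about a lighthouse", {
    temperature: 0.9,
    // the doc comment above suggests starting with these XTC values and tuning from there
    xtc: {probability: 0.5, threshold: 0.1}
    // `dryRepeatPenalty` is also accepted alongside `repeatPenalty`; its fields come from
    // LLamaContextualDryRepeatPenalty, which is not shown in this diff
});
console.log(answer);
```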
package/dist/evaluator/LlamaChat/LlamaChat.js
@@ -80,7 +80,7 @@ export class LlamaChat {
          return this.sequence.model;
      }
      async generateResponse(history, options = {}) {
-         const { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, abortOnNonText = false, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = options;
+         const { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, xtc, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, dryRepeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, abortOnNonText = false, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = options;
          this.sequence.tokenPredictor?.updateInputTokens?.(this.model.tokenize(findLastUserMessageInChatHistory(history)?.text ?? ""));
          const generateResponseState = new GenerateResponseState(this, this._chatWrapper, history, {
              onTextChunk,
@@ -96,9 +96,11 @@
              topK,
              topP,
              seed,
+             xtc,
              grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
              trimWhitespaceSuffix,
              repeatPenalty,
+             dryRepeatPenalty,
              tokenBias,
              evaluationPriority,
              functions,
@@ -202,7 +204,7 @@
          });
      }
      async loadChatAndCompleteUserMessage(history, options = {}) {
-         const { initialUserPrompt = "", stopOnAbortSignal = false, onTextChunk, onToken, signal, maxTokens = defaultMaxPreloadTokens(this.sequence), temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.8 } = {} } = options;
+         const { initialUserPrompt = "", stopOnAbortSignal = false, onTextChunk, onToken, signal, maxTokens = defaultMaxPreloadTokens(this.sequence), temperature, minP, topK, topP, seed, xtc, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, dryRepeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.8 } = {} } = options;
          this.sequence.tokenPredictor?.updateInputTokens?.(this.model.tokenize((findLastModelMessageInChatHistory(history)?.response ?? [])
              .map((item) => {
              if (typeof item === "string")
@@ -227,9 +229,11 @@
              topK,
              topP,
              seed,
+             xtc,
              grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
              trimWhitespaceSuffix,
              repeatPenalty,
+             dryRepeatPenalty,
              tokenBias,
              evaluationPriority,
              functions,
@@ -721,6 +725,7 @@ class GenerateResponseState
      topK;
      topP;
      seed;
+     xtc;
      grammar;
      trimWhitespaceSuffix;
      tokenBias;
@@ -737,6 +742,7 @@
      repeatPenaltyEnabled;
      resolvedContextShift;
      resolvedRepeatPenalty;
+     dryRepeatPenalty;
      grammarEvaluationState;
      functionNameGrammar;
      functionsGrammar;
@@ -798,7 +804,7 @@
      currentTokens = [];
      currentText = "";
      currentQueuedTokenRelease;
-     constructor(llamaChat, chatWrapper, history, { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, abortOnNonText, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
+     constructor(llamaChat, chatWrapper, history, { onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, xtc, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, dryRepeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, abortOnNonText, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
          this.llamaChat = llamaChat;
          this.chatWrapper = chatWrapper;
          this.history = history;
@@ -815,6 +821,7 @@
          this.topK = topK;
          this.topP = topP;
          this.seed = seed;
+         this.xtc = xtc;
          this.grammar = grammar;
          this.trimWhitespaceSuffix = trimWhitespaceSuffix;
          this.tokenBias = tokenBias;
@@ -847,6 +854,7 @@
              lastTokens: repeatPenalty?.lastTokens ?? defaultRepeatPenaltyLastTokens
          };
          this.repeatPenaltyEnabled = this.resolvedRepeatPenalty.lastTokens > 0;
+         this.dryRepeatPenalty = dryRepeatPenalty;
          this.grammarEvaluationState = this.grammar != null
              ? new LlamaGrammarEvaluationState({ model: this.llamaChat.model, grammar: this.grammar })
              : undefined;
@@ -863,11 +871,16 @@
          if (this.grammar != null)
              StopGenerationDetector.resolveStopTriggers(this.grammar.stopGenerationTriggers, this.llamaChat.model.tokenizer)
                  .map((stopTrigger) => this.stopGenerationDetector.addStopTrigger(stopTrigger));
-         if (this.functions != null && Object.keys(this.functions).length > 0 && !this.abortOnNonText)
-             this.functionSyntaxStartDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText([
+         if (this.functions != null && Object.keys(this.functions).length > 0 && !this.abortOnNonText) {
+             for (const sectionPrefix of [
                  this.chatWrapper.settings.functions?.parallelism?.call?.sectionPrefix ?? "",
-                 this.chatWrapper.settings.functions.call.prefix
-             ]), this.llamaChat.model.tokenizer));
+                 ...(this.chatWrapper.settings.functions?.parallelism?.call.sectionPrefixAlternateMatches ?? [])
+             ])
+                 this.functionSyntaxStartDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText([
+                     sectionPrefix,
+                     this.chatWrapper.settings.functions.call.prefix
+                 ]), this.llamaChat.model.tokenizer));
+         }
          const segmentDefinitions = new Map();
          for (const segmentType of allSegmentTypes) {
              const segmentDefinition = getChatWrapperSegmentDefinition(this.chatWrapper.settings, segmentType);
@@ -889,10 +902,14 @@
                  : SegmentHandler.getSegmentTokenCounts(lastModelMessageFullResponse, this.llamaChat.model.tokenizer)
          });
          if (this.abortOnNonText) {
-             this.stopGenerationDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText([
+             for (const sectionPrefix of [
                  this.chatWrapper.settings.functions?.parallelism?.call?.sectionPrefix ?? "",
-                 this.chatWrapper.settings.functions.call.prefix
-             ]), this.llamaChat.model.tokenizer));
+                 ...(this.chatWrapper.settings.functions?.parallelism?.call.sectionPrefixAlternateMatches ?? [])
+             ])
+                 this.stopGenerationDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText([
+                     sectionPrefix,
+                     this.chatWrapper.settings.functions.call.prefix
+                 ]), this.llamaChat.model.tokenizer));
              for (const segmentType of allSegmentTypes) {
                  const segmentDefinition = getChatWrapperSegmentDefinition(this.chatWrapper.settings, segmentType);
                  if (segmentDefinition != null)
@@ -1765,6 +1782,7 @@
              topK: this.topK,
              topP: this.topP,
              seed: this.seed,
+             xtc: this.xtc,
              grammarEvaluationState: () => {
                  if (this.functionEvaluationMode !== false)
                      return this.functionsEvaluationState;
@@ -1777,6 +1795,7 @@
                  frequencyPenalty: this.resolvedRepeatPenalty.frequencyPenalty,
                  presencePenalty: this.resolvedRepeatPenalty.presencePenalty
              },
+             dryRepeatPenalty: this.dryRepeatPenalty,
              tokenBias: this.tokenBias,
              evaluationPriority: this.evaluationPriority,
              yieldEogToken: true