node-llama-cpp 3.17.0 → 3.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/dist/bindings/AddonTypes.d.ts +11 -0
  2. package/dist/bindings/Llama.js +20 -2
  3. package/dist/bindings/Llama.js.map +1 -1
  4. package/dist/bindings/getLlama.d.ts +1 -1
  5. package/dist/bindings/getLlama.js +19 -8
  6. package/dist/bindings/getLlama.js.map +1 -1
  7. package/dist/bindings/utils/compileLLamaCpp.d.ts +2 -1
  8. package/dist/bindings/utils/compileLLamaCpp.js +8 -0
  9. package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
  10. package/dist/bindings/utils/getLlamaGpuTypes.js +2 -0
  11. package/dist/bindings/utils/getLlamaGpuTypes.js.map +1 -1
  12. package/dist/chatWrappers/QwenChatWrapper.d.ts +7 -0
  13. package/dist/chatWrappers/QwenChatWrapper.js +176 -56
  14. package/dist/chatWrappers/QwenChatWrapper.js.map +1 -1
  15. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js +127 -88
  16. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js.map +1 -1
  17. package/dist/chatWrappers/generic/utils/extractFunctionCallSettingsFromJinjaTemplate.d.ts +16 -10
  18. package/dist/chatWrappers/generic/utils/extractFunctionCallSettingsFromJinjaTemplate.js +115 -5
  19. package/dist/chatWrappers/generic/utils/extractFunctionCallSettingsFromJinjaTemplate.js.map +1 -1
  20. package/dist/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.js +1 -0
  21. package/dist/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.js.map +1 -1
  22. package/dist/cli/commands/ChatCommand.js +1 -1
  23. package/dist/cli/commands/ChatCommand.js.map +1 -1
  24. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +51 -4
  25. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -1
  26. package/dist/cli/utils/resolveNpmrcConfig.d.ts +18 -0
  27. package/dist/cli/utils/resolveNpmrcConfig.js +129 -0
  28. package/dist/cli/utils/resolveNpmrcConfig.js.map +1 -0
  29. package/dist/config.d.ts +3 -0
  30. package/dist/config.js +4 -1
  31. package/dist/config.js.map +1 -1
  32. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +8 -2
  33. package/dist/evaluator/LlamaChat/LlamaChat.js +99 -6
  34. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  35. package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js +8 -2
  36. package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js.map +1 -1
  37. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +8 -2
  38. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  39. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +88 -0
  40. package/dist/evaluator/LlamaContext/LlamaContext.js +181 -17
  41. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  42. package/dist/evaluator/LlamaContext/LlamaContextSequenceCheckpoints.d.ts +27 -0
  43. package/dist/evaluator/LlamaContext/LlamaContextSequenceCheckpoints.js +130 -0
  44. package/dist/evaluator/LlamaContext/LlamaContextSequenceCheckpoints.js.map +1 -0
  45. package/dist/gguf/insights/GgufInsights.d.ts +3 -0
  46. package/dist/gguf/insights/GgufInsights.js +221 -43
  47. package/dist/gguf/insights/GgufInsights.js.map +1 -1
  48. package/dist/gguf/types/GgufMetadataTypes.d.ts +15 -1
  49. package/dist/gguf/types/GgufMetadataTypes.js +4 -0
  50. package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
  51. package/dist/tsconfig.tsbuildinfo +1 -1
  52. package/dist/utils/getFirstWritableDir.d.ts +8 -0
  53. package/dist/utils/getFirstWritableDir.js +60 -0
  54. package/dist/utils/getFirstWritableDir.js.map +1 -0
  55. package/dist/utils/getTempDir.d.ts +10 -0
  56. package/dist/utils/getTempDir.js +121 -0
  57. package/dist/utils/getTempDir.js.map +1 -0
  58. package/dist/utils/resolveModelFile.js +19 -8
  59. package/dist/utils/resolveModelFile.js.map +1 -1
  60. package/llama/addon/AddonContext.cpp +168 -0
  61. package/llama/addon/AddonContext.h +27 -0
  62. package/llama/addon/addon.cpp +1 -0
  63. package/llama/binariesGithubRelease.json +1 -1
  64. package/llama/gitRelease.bundle +0 -0
  65. package/llama/llama.cpp.info.json +1 -1
  66. package/package.json +24 -24
  67. package/templates/packed/electron-typescript-react.json +1 -1
@@ -1,4 +1,5 @@
1
1
  import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
2
+ import { internalCheckpoints } from "../LlamaContext/LlamaContext.js";
2
3
  import { isChatModelResponseFunctionCall, isChatModelResponseSegment, allSegmentTypes } from "../../types.js";
3
4
  import { removeNullFields } from "../../utils/removeNullFields.js";
4
5
  import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
@@ -15,6 +16,7 @@ import { LlamaSampler } from "../LlamaContext/LlamaSampler.js";
15
16
  import { getChatWrapperSegmentDefinition } from "../../utils/getChatWrapperSegmentDefinition.js";
16
17
  import { jsonDumps } from "../../chatWrappers/utils/jsonDumps.js";
17
18
  import { defaultMaxPreloadTokens } from "../LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js";
19
+ import { LlamaLogLevel } from "../../bindings/types.js";
18
20
  import { eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy } from "./utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js";
19
21
  import { FunctionCallNameGrammar } from "./utils/FunctionCallNameGrammar.js";
20
22
  import { FunctionCallParamsGrammar } from "./utils/FunctionCallParamsGrammar.js";
@@ -26,6 +28,9 @@ const defaultContextShiftOptions = {
26
28
  const defaultRepeatPenaltyLastTokens = 64;
27
29
  const defaultTrimWhitespaceSuffix = false;
28
30
  const defaultEvaluationPriority = 5;
31
+ const defaultSegmentBudgetSize = (contextSize) => (contextSize < 8192
32
+ ? contextSize * 0.5
33
+ : contextSize * 0.75);
29
34
  export class LlamaChat {
30
35
  /** @internal */ _chatWrapper;
31
36
  /** @internal */ _disposeAggregator = new DisposeAggregator();
@@ -118,7 +123,9 @@ export class LlamaChat {
118
123
  if (generateResponseState.grammar != null && generateResponseState.functionsEnabled && !abortOnNonText)
119
124
  throw new Error("Using both grammar and functions is not supported yet");
120
125
  return await withLock([this._chatLock, "evaluate"], signal, async () => {
126
+ let hadError = false;
121
127
  try {
128
+ let tookInitialCheckpoint = false;
122
129
  generateResponseState.ensureLastHistoryItemIsModel();
123
130
  generateResponseState.ensureReopenedThoughtSegmentAfterFunctionCallsIfNeeded();
124
131
  const loadContextWindow = async (avoidReloadingHistory = false) => {
@@ -156,6 +163,10 @@ export class LlamaChat {
156
163
  await generateResponseState.alignCurrentSequenceStateWithCurrentTokens();
157
164
  await generateResponseState.createNewEvaluationIterator();
158
165
  while (await generateResponseState.iterateEvaluation()) {
166
+ if (!tookInitialCheckpoint && this.sequence.needsCheckpoints) {
167
+ await this.sequence.takeCheckpoint();
168
+ tookInitialCheckpoint = true;
169
+ }
159
170
  if (!generateResponseState.holdPartialTokensForNextEvaluation()) {
160
171
  generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
161
172
  generateResponseState.detectAndHandleFunctionStartSyntax();
@@ -166,7 +177,11 @@ export class LlamaChat {
166
177
  if (functionsCallsRes != null)
167
178
  return functionsCallsRes;
168
179
  }
169
- generateResponseState.recordStopGenerationEvaluation();
180
+ {
181
+ const resPromise = generateResponseState.recordStopGenerationEvaluation();
182
+ if (resPromise instanceof Promise)
183
+ await resPromise;
184
+ }
170
185
  generateResponseState.popStreamRegulatorFreeTokens();
171
186
  generateResponseState.removeFoundStartIgnoreTextsFromPendingTokens();
172
187
  const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("model");
@@ -198,8 +213,14 @@ export class LlamaChat {
198
213
  }
199
214
  throw new Error("The context size is too small to generate a response");
200
215
  }
216
+ catch (err) {
217
+ hadError = true;
218
+ throw err;
219
+ }
201
220
  finally {
202
221
  await generateResponseState.dispose();
222
+ if (!hadError && this.sequence.needsCheckpoints)
223
+ void this.sequence.takeCheckpoint();
203
224
  }
204
225
  });
205
226
  }
@@ -247,6 +268,7 @@ export class LlamaChat {
247
268
  });
248
269
  return await withLock([this._chatLock, "evaluate"], signal, async () => {
249
270
  try {
271
+ let tookInitialCheckpoint = false;
250
272
  generateResponseState.ensureLastHistoryItemIsUser();
251
273
  while (true) {
252
274
  generateResponseState.startTokenLoop();
@@ -279,9 +301,17 @@ export class LlamaChat {
279
301
  }
280
302
  await generateResponseState.createNewEvaluationIterator();
281
303
  while (await generateResponseState.iterateEvaluation()) {
304
+ if (!tookInitialCheckpoint && this.sequence.needsCheckpoints) {
305
+ await this.sequence.takeCheckpoint();
306
+ tookInitialCheckpoint = true;
307
+ }
282
308
  if (!generateResponseState.holdPartialTokensForNextEvaluation()) {
283
309
  generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
284
- generateResponseState.recordStopGenerationEvaluation();
310
+ {
311
+ const resPromise = generateResponseState.recordStopGenerationEvaluation();
312
+ if (resPromise instanceof Promise)
313
+ await resPromise;
314
+ }
285
315
  generateResponseState.popStreamRegulatorFreeTokens();
286
316
  const someOfCurrentTokensAreSpecial = generateResponseState.currentTokens.some((token) => (this.model.isSpecialToken(token)));
287
317
  const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("user", someOfCurrentTokensAreSpecial
@@ -792,6 +822,7 @@ class GenerateResponseState {
792
822
  userTextSuffix = undefined;
793
823
  prefixTriggerDetectors = new Map();
794
824
  noPrefixTrigger = undefined;
825
+ responsePrefix = undefined;
795
826
  rerenderTriggers = [];
796
827
  rerenderTriggerDetector = new StopGenerationDetector();
797
828
  rerenderActions = undefined;
@@ -1148,6 +1179,9 @@ class GenerateResponseState {
1148
1179
  !this.segmentHandler.isSegmentTypeOpen(trigger.segmentType) &&
1149
1180
  this.segmentHandler.getSegmentTokensCount(trigger.segmentType) >= segmentBudget)
1150
1181
  continue;
1182
+ if (this.responsePrefix == null && trigger.type === "response" && trigger.triggers.length > 0 &&
1183
+ (trigger.triggers[0]?.values?.length ?? 0) > 0)
1184
+ this.responsePrefix = LlamaText([trigger.triggers[0] ?? "", trigger.inject ?? ""]);
1151
1185
  const prefixDetector = new StopGenerationDetector();
1152
1186
  StopGenerationDetector.resolveStopTriggers(trigger.triggers, this.llamaChat.model.tokenizer)
1153
1187
  .forEach((stopTrigger) => prefixDetector.addStopTrigger(stopTrigger));
@@ -1172,6 +1206,8 @@ class GenerateResponseState {
1172
1206
  !this.segmentHandler.isSegmentTypeOpen(noPrefixTrigger.segmentType) &&
1173
1207
  this.segmentHandler.getSegmentTokensCount(noPrefixTrigger.segmentType) >= noPrefixTriggerSegmentBudget)
1174
1208
  this.noPrefixTrigger = undefined;
1209
+ else if (noPrefixTrigger?.type === "response")
1210
+ this.responsePrefix = noPrefixTrigger.inject;
1175
1211
  this.rerenderTriggers = rerender?.triggers ?? [];
1176
1212
  this.rerenderTriggerDetector.clearInProgressStops();
1177
1213
  this.rerenderTriggerDetector.clearTriggeredStops();
@@ -1252,6 +1288,11 @@ class GenerateResponseState {
1252
1288
  if (alignStateTokens)
1253
1289
  await reloadTokens();
1254
1290
  };
1291
+ if (this.grammar != null) {
1292
+ if (this.responsePrefix != null)
1293
+ await injectTokens(this.responsePrefix, true);
1294
+ return undefined;
1295
+ }
1255
1296
  if (this.prefixTriggerDetectors.size === 0) {
1256
1297
  if (this.abortOnNonText && this.noPrefixTrigger != null && this.noPrefixTrigger.type !== "response") {
1257
1298
  this.shouldAbortBecauseOfNonText = true;
@@ -1275,7 +1316,12 @@ class GenerateResponseState {
1275
1316
  const generatedTokens = [];
1276
1317
  let isFirstToken = true;
1277
1318
  let continueGeneration = true;
1319
+ let tookInitialCheckpoint = false;
1278
1320
  for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
1321
+ if (!tookInitialCheckpoint && this.llamaChat.sequence.needsCheckpoints) {
1322
+ await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatSequenceStart.name, internalCheckpoints.chatSequenceStart.maxCheckpoints);
1323
+ tookInitialCheckpoint = true;
1324
+ }
1279
1325
  pushAll(generatedTokens, tokens);
1280
1326
  for (const [triggerDetector, { trigger, inject }] of [...this.prefixTriggerDetectors.entries()]) {
1281
1327
  triggerDetector.recordGeneration({
@@ -1420,7 +1466,12 @@ class GenerateResponseState {
1420
1466
  pushAll(prefixDetectorRecordedTokens, tokens);
1421
1467
  }
1422
1468
  }
1469
+ let tookInitialCheckpoint = false;
1423
1470
  for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
1471
+ if (!tookInitialCheckpoint && this.llamaChat.sequence.needsCheckpoints) {
1472
+ await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatSequenceStart.name, internalCheckpoints.chatSequenceStart.maxCheckpoints);
1473
+ tookInitialCheckpoint = true;
1474
+ }
1424
1475
  const stopGenerationTriggerRes = this.handleStopGenerationTrigger("model");
1425
1476
  if (stopGenerationTriggerRes != null)
1426
1477
  return stopGenerationTriggerRes;
@@ -1463,7 +1514,11 @@ class GenerateResponseState {
1463
1514
  tokens: this.currentTokens,
1464
1515
  text: this.currentText
1465
1516
  });
1466
- this.recordStopGenerationEvaluation();
1517
+ {
1518
+ const resPromise = this.recordStopGenerationEvaluation();
1519
+ if (resPromise instanceof Promise)
1520
+ await resPromise;
1521
+ }
1467
1522
  }
1468
1523
  this.currentFunctionCallCurrentPartTokens.length = 0;
1469
1524
  this.functionEvaluationMode = false;
@@ -1515,7 +1570,12 @@ class GenerateResponseState {
1515
1570
  }
1516
1571
  }
1517
1572
  }
1573
+ let tookInitialCheckpoint = false;
1518
1574
  for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
1575
+ if (!tookInitialCheckpoint && this.llamaChat.sequence.needsCheckpoints) {
1576
+ await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatSequenceStart.name, internalCheckpoints.chatSequenceStart.maxCheckpoints);
1577
+ tookInitialCheckpoint = true;
1578
+ }
1519
1579
  pushAll(this.currentFunctionCallCurrentPartTokens, tokens);
1520
1580
  functionNameGenerationDoneDetector.recordGeneration({
1521
1581
  text: this.currentText,
@@ -1578,11 +1638,20 @@ class GenerateResponseState {
1578
1638
  paramsChunk: this.llamaChat.model.detokenize(this.currentFunctionCallCurrentPartTokens, false, lastPartTokens),
1579
1639
  done: false
1580
1640
  });
1641
+ let tookInitialCheckpoint = false;
1581
1642
  for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
1643
+ if (!tookInitialCheckpoint && this.llamaChat.sequence.needsCheckpoints) {
1644
+ await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatSequenceStart.name, internalCheckpoints.chatSequenceStart.maxCheckpoints);
1645
+ tookInitialCheckpoint = true;
1646
+ }
1647
+ const hadInProgressTriggers = functionParamsGenerationDoneDetector.hasInProgressStops;
1582
1648
  functionParamsGenerationDoneDetector.recordGeneration({
1583
1649
  text: this.currentText,
1584
1650
  tokens: this.currentTokens
1585
1651
  });
1652
+ if (!hadInProgressTriggers && functionParamsGenerationDoneDetector.hasInProgressStops &&
1653
+ this.llamaChat.sequence.needsCheckpoints)
1654
+ await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatGrammarEnd.name, internalCheckpoints.chatGrammarEnd.maxCheckpoints);
1586
1655
  this.onFunctionCallParamsChunk?.({
1587
1656
  callIndex: this.resFunctionCalls.length,
1588
1657
  functionName: this.functionEvaluationFunctionName,
@@ -1646,7 +1715,12 @@ class GenerateResponseState {
1646
1715
  LlamaText(new SpecialToken("EOT"))
1647
1716
  ], this.llamaChat.model.tokenizer)
1648
1717
  .map((stopTrigger) => sectionSuffixDetector.addStopTrigger(stopTrigger));
1718
+ let tookInitialCheckpoint = false;
1649
1719
  for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
1720
+ if (!tookInitialCheckpoint && this.llamaChat.sequence.needsCheckpoints) {
1721
+ await this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatSequenceStart.name, internalCheckpoints.chatSequenceStart.maxCheckpoints);
1722
+ tookInitialCheckpoint = true;
1723
+ }
1650
1724
  pushAll(this.currentFunctionCallCurrentPartTokens, tokens);
1651
1725
  sectionSuffixDetector.recordGeneration({
1652
1726
  text: this.currentText,
@@ -1772,6 +1846,19 @@ class GenerateResponseState {
1772
1846
  }));
1773
1847
  }
1774
1848
  async createNewEvaluationIterator() {
1849
+ if (this.tokens.length === 0) {
1850
+ if (this.evaluationIterator != null)
1851
+ return;
1852
+ const token = this.llamaChat.sequence.contextTokens.at(-1);
1853
+ if (token == null)
1854
+ throw new Error("No tokens to evaluate");
1855
+ this.llamaChat.sequence.model._llama._log(LlamaLogLevel.warn, "Attempted to evaluate with no input, reevaluating the last context sequence token");
1856
+ await this.llamaChat.sequence.eraseContextTokenRanges([{
1857
+ start: this.llamaChat.sequence.contextTokens.length - 1,
1858
+ end: this.llamaChat.sequence.contextTokens.length
1859
+ }]);
1860
+ this.tokens = [token];
1861
+ }
1775
1862
  if (this.evaluationIterator != null)
1776
1863
  await this.evaluationIterator.return();
1777
1864
  this.currentIterationReplacementToken = undefined;
@@ -1881,6 +1968,7 @@ class GenerateResponseState {
1881
1968
  }
1882
1969
  }
1883
1970
  recordStopGenerationEvaluation() {
1971
+ const hadInProgressStopTrigger = this.stopGenerationDetector.hasInProgressStops;
1884
1972
  this.rerenderTriggerDetector.recordGeneration({
1885
1973
  text: this.currentText,
1886
1974
  tokens: this.currentTokens,
@@ -1898,6 +1986,9 @@ class GenerateResponseState {
1898
1986
  });
1899
1987
  if (this.llamaChat.model.isEogToken(this.currentToken))
1900
1988
  this.currentQueuedTokenRelease?.createTokenIndexLock(0);
1989
+ if (this.grammar != null && !hadInProgressStopTrigger && this.stopGenerationDetector.hasInProgressStops &&
1990
+ this.llamaChat.sequence.needsCheckpoints)
1991
+ return this.llamaChat.sequence._takeNamedCheckpoint(internalCheckpoints.chatGrammarEnd.name, internalCheckpoints.chatGrammarEnd.maxCheckpoints);
1901
1992
  }
1902
1993
  popStreamRegulatorFreeTokens() {
1903
1994
  pushAll(this.pendingTokens, this.streamRegulator.popFreeChunkTokens());
@@ -2020,9 +2111,11 @@ class GenerateResponseState {
2020
2111
  return shouldReloadEvaluationState;
2021
2112
  }
2022
2113
  getSegmentBudget(segmentType) {
2023
- const getBudget = (budget) => ((budget == null || budget === Infinity)
2024
- ? null
2025
- : budget);
2114
+ const getBudget = (budget) => (budget == null
2115
+ ? Math.ceil(defaultSegmentBudgetSize(this.llamaChat.sequence.contextSize))
2116
+ : budget === Infinity
2117
+ ? null
2118
+ : budget);
2026
2119
  if (this.budgets == null)
2027
2120
  return null;
2028
2121
  if (segmentType === "thought")