node-llama-cpp 3.4.3 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/README.md +2 -2
  2. package/dist/ChatWrapper.js +45 -0
  3. package/dist/ChatWrapper.js.map +1 -1
  4. package/dist/bindings/utils/compileLLamaCpp.js +2 -0
  5. package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
  6. package/dist/chatWrappers/DeepSeekChatWrapper.d.ts +37 -0
  7. package/dist/chatWrappers/DeepSeekChatWrapper.js +294 -0
  8. package/dist/chatWrappers/DeepSeekChatWrapper.js.map +1 -0
  9. package/dist/chatWrappers/FunctionaryChatWrapper.js +39 -13
  10. package/dist/chatWrappers/FunctionaryChatWrapper.js.map +1 -1
  11. package/dist/chatWrappers/Llama3_1ChatWrapper.d.ts +0 -3
  12. package/dist/chatWrappers/Llama3_1ChatWrapper.js +5 -4
  13. package/dist/chatWrappers/Llama3_1ChatWrapper.js.map +1 -1
  14. package/dist/chatWrappers/Llama3_2LightweightChatWrapper.js +1 -1
  15. package/dist/chatWrappers/Llama3_2LightweightChatWrapper.js.map +1 -1
  16. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.d.ts +10 -1
  17. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js +8 -2
  18. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js.map +1 -1
  19. package/dist/chatWrappers/generic/TemplateChatWrapper.d.ts +17 -1
  20. package/dist/chatWrappers/generic/TemplateChatWrapper.js +10 -2
  21. package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -1
  22. package/dist/chatWrappers/generic/utils/templateSegmentOptionsToChatWrapperSettings.d.ts +22 -0
  23. package/dist/chatWrappers/generic/utils/templateSegmentOptionsToChatWrapperSettings.js +28 -0
  24. package/dist/chatWrappers/generic/utils/templateSegmentOptionsToChatWrapperSettings.js.map +1 -0
  25. package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +46 -3
  26. package/dist/chatWrappers/utils/resolveChatWrapper.js +6 -2
  27. package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -1
  28. package/dist/cli/commands/ChatCommand.js +38 -7
  29. package/dist/cli/commands/ChatCommand.js.map +1 -1
  30. package/dist/cli/commands/PullCommand.js +2 -1
  31. package/dist/cli/commands/PullCommand.js.map +1 -1
  32. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js +18 -5
  33. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map +1 -1
  34. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +16 -5
  35. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
  36. package/dist/cli/recommendedModels.js +137 -67
  37. package/dist/cli/recommendedModels.js.map +1 -1
  38. package/dist/cli/utils/resolveCommandGgufPath.d.ts +8 -0
  39. package/dist/cli/utils/resolveCommandGgufPath.js +45 -1
  40. package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -1
  41. package/dist/cli/utils/resolveModelRecommendationFileOptions.d.ts +2 -2
  42. package/dist/cli/utils/resolveModelRecommendationFileOptions.js.map +1 -1
  43. package/dist/config.d.ts +1 -1
  44. package/dist/config.js +1 -1
  45. package/dist/config.js.map +1 -1
  46. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +87 -5
  47. package/dist/evaluator/LlamaChat/LlamaChat.js +770 -194
  48. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  49. package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js +55 -1
  50. package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js.map +1 -1
  51. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +22 -7
  52. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +28 -8
  53. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  54. package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js +1 -1
  55. package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js.map +1 -1
  56. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.d.ts +9 -2
  57. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js.map +1 -1
  58. package/dist/evaluator/LlamaCompletion.js +61 -48
  59. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  60. package/dist/evaluator/LlamaGrammar.d.ts +2 -2
  61. package/dist/evaluator/LlamaGrammar.js +2 -2
  62. package/dist/evaluator/LlamaModel/LlamaModel.d.ts +1 -1
  63. package/dist/evaluator/LlamaModel/LlamaModel.js +1 -1
  64. package/dist/gguf/readGgufFileInfo.js +7 -4
  65. package/dist/gguf/readGgufFileInfo.js.map +1 -1
  66. package/dist/gguf/types/GgufMetadataTypes.d.ts +2 -2
  67. package/dist/gguf/types/GgufMetadataTypes.js +2 -2
  68. package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
  69. package/dist/gguf/utils/getGgufFileTypeName.d.ts +1 -1
  70. package/dist/gguf/utils/ggufQuantNames.d.ts +2 -0
  71. package/dist/gguf/utils/ggufQuantNames.js +40 -0
  72. package/dist/gguf/utils/ggufQuantNames.js.map +1 -0
  73. package/dist/gguf/utils/normalizeGgufDownloadUrl.js +1 -1
  74. package/dist/gguf/utils/normalizeGgufDownloadUrl.js.map +1 -1
  75. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js +1 -2
  76. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js.map +1 -1
  77. package/dist/index.d.ts +7 -5
  78. package/dist/index.js +3 -2
  79. package/dist/index.js.map +1 -1
  80. package/dist/tsconfig.tsbuildinfo +1 -1
  81. package/dist/types.d.ts +32 -2
  82. package/dist/types.js +7 -1
  83. package/dist/types.js.map +1 -1
  84. package/dist/utils/LlamaText.js +8 -9
  85. package/dist/utils/LlamaText.js.map +1 -1
  86. package/dist/utils/TokenStreamRegulator.d.ts +2 -0
  87. package/dist/utils/TokenStreamRegulator.js +12 -0
  88. package/dist/utils/TokenStreamRegulator.js.map +1 -1
  89. package/dist/utils/createModelDownloader.d.ts +21 -4
  90. package/dist/utils/createModelDownloader.js +83 -36
  91. package/dist/utils/createModelDownloader.js.map +1 -1
  92. package/dist/utils/getChatWrapperSegmentDefinition.d.ts +2 -0
  93. package/dist/utils/getChatWrapperSegmentDefinition.js +7 -0
  94. package/dist/utils/getChatWrapperSegmentDefinition.js.map +1 -0
  95. package/dist/utils/modelFileAccesTokens.js +1 -1
  96. package/dist/utils/modelFileAccesTokens.js.map +1 -1
  97. package/dist/utils/parseModelFileName.d.ts +5 -0
  98. package/dist/utils/parseModelFileName.js +63 -4
  99. package/dist/utils/parseModelFileName.js.map +1 -1
  100. package/dist/utils/parseModelUri.d.ts +30 -2
  101. package/dist/utils/parseModelUri.js +199 -24
  102. package/dist/utils/parseModelUri.js.map +1 -1
  103. package/dist/utils/resolveModelDestination.d.ts +4 -3
  104. package/dist/utils/resolveModelDestination.js +25 -2
  105. package/dist/utils/resolveModelDestination.js.map +1 -1
  106. package/dist/utils/resolveModelFile.d.ts +1 -1
  107. package/dist/utils/resolveModelFile.js +61 -20
  108. package/dist/utils/resolveModelFile.js.map +1 -1
  109. package/llama/binariesGithubRelease.json +1 -1
  110. package/llama/gitRelease.bundle +0 -0
  111. package/llama/grammars/README.md +4 -4
  112. package/llama/llama.cpp.info.json +2 -2
  113. package/package.json +36 -35
  114. package/templates/packed/electron-typescript-react.json +1 -1
  115. package/templates/packed/node-typescript.json +1 -1
@@ -1,4 +1,5 @@
1
1
  import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
2
+ import { isChatModelResponseFunctionCall, isChatModelResponseSegment, allSegmentTypes } from "../../types.js";
2
3
  import { removeNullFields } from "../../utils/removeNullFields.js";
3
4
  import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
4
5
  import { LlamaText, SpecialToken } from "../../utils/LlamaText.js";
@@ -11,6 +12,7 @@ import { safeEventCallback } from "../../utils/safeEventCallback.js";
11
12
  import { pushAll } from "../../utils/pushAll.js";
12
13
  import { resolveLastTokens } from "../../utils/resolveLastTokens.js";
13
14
  import { LlamaSampler } from "../LlamaContext/LlamaSampler.js";
15
+ import { getChatWrapperSegmentDefinition } from "../../utils/getChatWrapperSegmentDefinition.js";
14
16
  import { eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy } from "./utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js";
15
17
  import { FunctionCallNameGrammar } from "./utils/FunctionCallNameGrammar.js";
16
18
  import { FunctionCallParamsGrammar } from "./utils/FunctionCallParamsGrammar.js";
@@ -76,11 +78,12 @@ export class LlamaChat {
76
78
  return this.sequence.model;
77
79
  }
78
80
  async generateResponse(history, options = {}) {
79
- const { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = options;
81
+ const { onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = options;
80
82
  this.sequence.tokenPredictor?.updateInputTokens?.(this.model.tokenize(findLastUserMessageInChatHistory(history)?.text ?? ""));
81
83
  const generateResponseState = new GenerateResponseState(this, this._chatWrapper, history, {
82
84
  onTextChunk,
83
85
  onToken,
86
+ onResponseChunk,
84
87
  signal,
85
88
  stopOnAbortSignal,
86
89
  maxTokens,
@@ -110,6 +113,7 @@ export class LlamaChat {
110
113
  return await withLock(this._chatLock, "evaluate", signal, async () => {
111
114
  try {
112
115
  generateResponseState.ensureLastHistoryItemIsModel();
116
+ generateResponseState.ensureReopenedThoughtSegmentAfterFunctionCallsIfNeeded();
113
117
  const loadContextWindow = async (avoidReloadingHistory = false) => {
114
118
  await generateResponseState.loadContextWindow(generateResponseState.getResolvedHistoryWithCurrentModelResponse(), generateResponseState.getContextWindowsHistoryWithCurrentModelResponse(), false, avoidReloadingHistory);
115
119
  };
@@ -134,23 +138,25 @@ export class LlamaChat {
134
138
  await generateResponseState.alignCurrentSequenceStateWithCurrentTokens();
135
139
  await generateResponseState.createNewEvaluationIterator();
136
140
  while (await generateResponseState.iterateEvaluation()) {
137
- generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
138
- generateResponseState.detectAndHandleFunctionStartSyntax();
139
- if (generateResponseState.functionEvaluationMode !== false) {
140
- generateResponseState.canAvoidReloadingHistory = false;
141
- generateResponseState.releasePartiallyFreeTokensBeforeFunctionCallStart();
142
- const functionsCallsRes = await generateResponseState.enterFunctionCallingLoop(loadContextWindowForFunctionCallingLoop);
143
- if (functionsCallsRes != null)
144
- return functionsCallsRes;
141
+ if (!generateResponseState.holdPartialTokensForNextEvaluation()) {
142
+ generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
143
+ generateResponseState.detectAndHandleFunctionStartSyntax();
144
+ if (generateResponseState.functionEvaluationMode !== false) {
145
+ generateResponseState.canAvoidReloadingHistory = false;
146
+ generateResponseState.releasePartiallyFreeTokensBeforeFunctionCallStart();
147
+ const functionsCallsRes = await generateResponseState.enterFunctionCallingLoop(loadContextWindowForFunctionCallingLoop);
148
+ if (functionsCallsRes != null)
149
+ return functionsCallsRes;
150
+ }
151
+ generateResponseState.recordStopGenerationEvaluation();
152
+ generateResponseState.popStreamRegulatorFreeTokens();
153
+ generateResponseState.removeFoundStartIgnoreTextsFromPendingTokens();
154
+ const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("model");
155
+ if (stopGenerationTriggerRes != null)
156
+ return stopGenerationTriggerRes;
157
+ generateResponseState.spliceIgnoreStartTextDetectedTokens();
158
+ generateResponseState.moveFreePendingTokensToRes();
145
159
  }
146
- generateResponseState.recordStopGenerationEvaluation();
147
- generateResponseState.popStreamRegulatorFreeTokens();
148
- generateResponseState.removeFoundStartIgnoreTextsFromPendingTokens();
149
- const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("model");
150
- if (stopGenerationTriggerRes != null)
151
- return stopGenerationTriggerRes;
152
- generateResponseState.spliceIgnoreStartTextDetectedTokens();
153
- generateResponseState.moveFreePendingTokensToRes();
154
160
  const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger("model");
155
161
  if (maxTokensTriggerRes != null)
156
162
  return maxTokensTriggerRes;
@@ -174,16 +180,20 @@ export class LlamaChat {
174
180
  }
175
181
  async loadChatAndCompleteUserMessage(history, options = {}) {
176
182
  const { initialUserPrompt = "", stopOnAbortSignal = false, onTextChunk, onToken, signal, maxTokens = Math.min(256, Math.ceil(this.context.contextSize / 2)), temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.8 } = {} } = options;
177
- const lastEvaluationContextWindowHistoryItem = lastEvaluationContextWindowHistory == null
178
- ? null
179
- : lastEvaluationContextWindowHistory[lastEvaluationContextWindowHistory.length - 1];
180
- const lastEvaluationContextWindowUserMessage = lastEvaluationContextWindowHistoryItem?.type === "user"
181
- ? lastEvaluationContextWindowHistoryItem.text
182
- : "";
183
183
  this.sequence.tokenPredictor?.updateInputTokens?.(this.model.tokenize((findLastModelMessageInChatHistory(history)?.response ?? [])
184
- .filter((item) => typeof item === "string")
184
+ .map((item) => {
185
+ if (typeof item === "string")
186
+ return item;
187
+ else if (isChatModelResponseFunctionCall(item))
188
+ return null;
189
+ else if (isChatModelResponseSegment(item))
190
+ return item.text;
191
+ void item;
192
+ return null;
193
+ })
194
+ .filter((item) => item != null)
185
195
  .join(" ")));
186
- const generateResponseState = new GenerateResponseState(this, this._chatWrapper, history, {
196
+ const generateResponseState = new GenerateResponseState(this, this._chatWrapper, mergeGeneratedResultWithChatHistory("user", history, [initialUserPrompt]), {
187
197
  onTextChunk,
188
198
  onToken,
189
199
  signal,
@@ -204,27 +214,16 @@ export class LlamaChat {
204
214
  contextShift,
205
215
  customStopTriggers,
206
216
  lastEvaluationContextWindow: {
207
- history: lastEvaluationContextWindowHistory == null
208
- ? undefined
209
- : setLastUserTextInChatHistory(lastEvaluationContextWindowHistory, lastEvaluationContextWindowUserMessage + initialUserPrompt),
217
+ history: mergeGeneratedResultWithChatHistory("user", lastEvaluationContextWindowHistory ?? history, [initialUserPrompt]),
210
218
  minimumOverlapPercentageToPreventContextShift
211
219
  }
212
220
  });
213
221
  return await withLock(this._chatLock, "evaluate", signal, async () => {
214
222
  try {
215
223
  generateResponseState.ensureLastHistoryItemIsUser();
216
- const getInitialUserMessage = (history) => {
217
- const lastResolvedHistoryItem = history[history.length - 1];
218
- if (lastResolvedHistoryItem?.type === "user")
219
- return lastResolvedHistoryItem.text;
220
- return "";
221
- };
222
- const initialUserMessage = getInitialUserMessage(generateResponseState.resolvedHistory);
223
- const contextWindowInitialUserMessage = getInitialUserMessage(generateResponseState.lastContextWindowHistory);
224
224
  while (true) {
225
225
  generateResponseState.startTokenLoop();
226
- const { userTextSuffix } = await generateResponseState.loadContextWindow(setLastUserTextInChatHistory(generateResponseState.resolvedHistory, initialUserMessage + initialUserPrompt + this.model.detokenize(generateResponseState.res)), setLastUserTextInChatHistory(generateResponseState.lastContextWindowHistory, contextWindowInitialUserMessage + initialUserPrompt +
227
- this.model.detokenize(generateResponseState.contextWindowsRes)), true);
226
+ const { userTextSuffix } = await generateResponseState.loadContextWindow(mergeGeneratedResultWithChatHistory("user", generateResponseState.resolvedHistory, generateResponseState.segmentHandler.getModelResponseSegments()), mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()), true);
228
227
  generateResponseState.functionEvaluationMode = false;
229
228
  generateResponseState.addStopGenerationTriggersFromChatWrapper();
230
229
  if (userTextSuffix != null && userTextSuffix.values.length > 0)
@@ -235,7 +234,7 @@ export class LlamaChat {
235
234
  return {
236
235
  completion: "",
237
236
  lastEvaluation: {
238
- contextWindow: setLastUserTextInChatHistory(generateResponseState.lastContextWindowHistory, initialUserMessage),
237
+ contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
239
238
  contextShiftMetadata: generateResponseState.lastHistoryCompressionMetadata
240
239
  },
241
240
  metadata: {
@@ -245,28 +244,30 @@ export class LlamaChat {
245
244
  }
246
245
  await generateResponseState.createNewEvaluationIterator();
247
246
  while (await generateResponseState.iterateEvaluation()) {
248
- generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
249
- generateResponseState.recordStopGenerationEvaluation();
250
- generateResponseState.popStreamRegulatorFreeTokens();
251
- const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("user");
252
- if (stopGenerationTriggerRes != null)
253
- return {
254
- completion: stopGenerationTriggerRes.response,
255
- lastEvaluation: {
256
- contextWindow: setLastUserTextInChatHistory(generateResponseState.lastContextWindowHistory, initialUserMessage),
257
- contextShiftMetadata: stopGenerationTriggerRes.lastEvaluation.contextShiftMetadata
258
- },
259
- metadata: stopGenerationTriggerRes.metadata.stopReason === "customStopTrigger"
260
- ? stopGenerationTriggerRes.metadata
261
- : stopGenerationTriggerRes.metadata
262
- };
263
- generateResponseState.moveFreePendingTokensToRes(false);
247
+ if (!generateResponseState.holdPartialTokensForNextEvaluation()) {
248
+ generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
249
+ generateResponseState.recordStopGenerationEvaluation();
250
+ generateResponseState.popStreamRegulatorFreeTokens();
251
+ const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("user");
252
+ if (stopGenerationTriggerRes != null)
253
+ return {
254
+ completion: stopGenerationTriggerRes.response,
255
+ lastEvaluation: {
256
+ contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
257
+ contextShiftMetadata: stopGenerationTriggerRes.lastEvaluation.contextShiftMetadata
258
+ },
259
+ metadata: stopGenerationTriggerRes.metadata.stopReason === "customStopTrigger"
260
+ ? stopGenerationTriggerRes.metadata
261
+ : stopGenerationTriggerRes.metadata
262
+ };
263
+ generateResponseState.moveFreePendingTokensToRes(false);
264
+ }
264
265
  const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger("user");
265
266
  if (maxTokensTriggerRes != null)
266
267
  return {
267
268
  completion: maxTokensTriggerRes.response,
268
269
  lastEvaluation: {
269
- contextWindow: setLastUserTextInChatHistory(generateResponseState.lastContextWindowHistory, initialUserMessage),
270
+ contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
270
271
  contextShiftMetadata: maxTokensTriggerRes.lastEvaluation.contextShiftMetadata
271
272
  },
272
273
  metadata: maxTokensTriggerRes.metadata
@@ -278,7 +279,7 @@ export class LlamaChat {
278
279
  return {
279
280
  completion: abortRes.response,
280
281
  lastEvaluation: {
281
- contextWindow: setLastUserTextInChatHistory(generateResponseState.lastContextWindowHistory, initialUserMessage),
282
+ contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
282
283
  contextShiftMetadata: abortRes.lastEvaluation.contextShiftMetadata
283
284
  },
284
285
  metadata: abortRes.metadata
@@ -303,11 +304,18 @@ function removeRawFromHistoryItem(historyItem) {
303
304
  newHistoryItem.response = newHistoryItem.response.map((item) => {
304
305
  if (typeof item === "string")
305
306
  return item;
306
- else
307
+ else if (isChatModelResponseFunctionCall(item))
307
308
  return {
308
309
  ...item,
309
310
  rawCall: undefined
310
311
  };
312
+ else if (isChatModelResponseSegment(item))
313
+ return {
314
+ ...item,
315
+ raw: undefined
316
+ };
317
+ void item;
318
+ return item;
311
319
  });
312
320
  return newHistoryItem;
313
321
  }
@@ -370,42 +378,17 @@ async function compressHistoryToFitContextSize({ history, contextShiftSize, cont
370
378
  metadata
371
379
  };
372
380
  }
373
- function getLastTextModelResponseFromChatHistory(chatHistory) {
374
- if (chatHistory.length === 0 || chatHistory[chatHistory.length - 1].type !== "model")
375
- return "";
376
- const lastModelResponseItem = chatHistory[chatHistory.length - 1];
377
- const modelResponse = lastModelResponseItem.response;
378
- if (modelResponse.length > 0 && typeof modelResponse[modelResponse.length - 1] === "string")
379
- return modelResponse[modelResponse.length - 1];
380
- return "";
381
+ function getLastModelMessageFullResponseFromChatHistory(chatHistory) {
382
+ const lastModelResponseItem = chatHistory.at(-1);
383
+ if (lastModelResponseItem == null || lastModelResponseItem.type !== "model")
384
+ return [];
385
+ return lastModelResponseItem.response;
381
386
  }
382
387
  function getLastUserTextFromChatHistory(chatHistory) {
383
388
  if (chatHistory.length === 0 || chatHistory[chatHistory.length - 1].type !== "user")
384
389
  return "";
385
390
  return chatHistory[chatHistory.length - 1].text;
386
391
  }
387
- function setLastModelTextResponseInChatHistory(chatHistory, textResponse) {
388
- const newChatHistory = chatHistory.slice();
389
- if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "model")
390
- newChatHistory.push({
391
- type: "model",
392
- response: []
393
- });
394
- const lastModelResponseItem = newChatHistory[newChatHistory.length - 1];
395
- const newLastModelResponseItem = { ...lastModelResponseItem };
396
- newChatHistory[newChatHistory.length - 1] = newLastModelResponseItem;
397
- const modelResponse = newLastModelResponseItem.response.slice();
398
- newLastModelResponseItem.response = modelResponse;
399
- if (modelResponse.length > 0 && typeof modelResponse[modelResponse.length - 1] === "string") {
400
- if (textResponse === "")
401
- modelResponse.pop();
402
- else
403
- modelResponse[modelResponse.length - 1] = textResponse;
404
- }
405
- else if (textResponse !== "")
406
- modelResponse.push(textResponse);
407
- return newChatHistory;
408
- }
409
392
  function setLastUserTextInChatHistory(chatHistory, userText) {
410
393
  const newChatHistory = chatHistory.slice();
411
394
  if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "user")
@@ -419,11 +402,73 @@ function setLastUserTextInChatHistory(chatHistory, userText) {
419
402
  newLastUserItem.text = userText;
420
403
  return newChatHistory;
421
404
  }
422
- function setLastTextInChatHistory(itemType, chatHistory, text) {
423
- if (itemType === "user")
424
- return setLastUserTextInChatHistory(chatHistory, text);
425
- else
426
- return setLastModelTextResponseInChatHistory(chatHistory, text);
405
+ function mergeGeneratedResultWithChatHistory(itemType, chatHistory, generatedResult) {
406
+ if (generatedResult.length === 0 || (generatedResult.length === 1 && generatedResult[0] === ""))
407
+ return chatHistory;
408
+ const newChatHistory = chatHistory.slice();
409
+ if (itemType === "user") {
410
+ let lastUserItem = newChatHistory.at(-1);
411
+ if (lastUserItem?.type !== "user") {
412
+ lastUserItem = {
413
+ type: "user",
414
+ text: ""
415
+ };
416
+ newChatHistory.push(lastUserItem);
417
+ }
418
+ const newLastUserItem = { ...lastUserItem };
419
+ newChatHistory[newChatHistory.length - 1] = newLastUserItem;
420
+ newLastUserItem.text += generatedResult
421
+ .map((item) => {
422
+ if (typeof item === "string")
423
+ return item;
424
+ return item.text;
425
+ })
426
+ .join("");
427
+ return newChatHistory;
428
+ }
429
+ else {
430
+ let lastModelItem = newChatHistory.at(-1);
431
+ if (lastModelItem?.type !== "model") {
432
+ lastModelItem = {
433
+ type: "model",
434
+ response: []
435
+ };
436
+ newChatHistory.push(lastModelItem);
437
+ }
438
+ const newLastModelItem = { ...lastModelItem };
439
+ newChatHistory[newChatHistory.length - 1] = newLastModelItem;
440
+ const modelResponse = newLastModelItem.response.slice();
441
+ newLastModelItem.response = modelResponse;
442
+ const firstGeneratedResultItem = generatedResult[0];
443
+ if (firstGeneratedResultItem == null)
444
+ return newChatHistory;
445
+ const lastModelResponseItem = modelResponse.at(-1);
446
+ if (typeof firstGeneratedResultItem === "string" && typeof lastModelResponseItem === "string") {
447
+ modelResponse[modelResponse.length - 1] = lastModelResponseItem + firstGeneratedResultItem;
448
+ }
449
+ else if (typeof firstGeneratedResultItem !== "string" && isChatModelResponseSegment(firstGeneratedResultItem) &&
450
+ typeof lastModelResponseItem !== "string" && isChatModelResponseSegment(lastModelResponseItem) &&
451
+ !lastModelResponseItem.ended && lastModelResponseItem.segmentType === firstGeneratedResultItem.segmentType) {
452
+ modelResponse[modelResponse.length - 1] = {
453
+ ...lastModelResponseItem,
454
+ ...firstGeneratedResultItem,
455
+ text: lastModelResponseItem.text + firstGeneratedResultItem.text,
456
+ ended: firstGeneratedResultItem.ended,
457
+ raw: (lastModelResponseItem.raw != null && firstGeneratedResultItem.raw != null)
458
+ ? LlamaText([
459
+ LlamaText.fromJSON(lastModelResponseItem.raw),
460
+ LlamaText.fromJSON(firstGeneratedResultItem.raw)
461
+ ]).toJSON()
462
+ : undefined,
463
+ startTime: lastModelResponseItem.startTime,
464
+ endTime: firstGeneratedResultItem.endTime
465
+ };
466
+ }
467
+ else
468
+ modelResponse.push(firstGeneratedResultItem);
469
+ pushAll(modelResponse, generatedResult.slice(1));
470
+ return newChatHistory;
471
+ }
427
472
  }
428
473
  function findLastUserMessageInChatHistory(chatHistory) {
429
474
  for (let i = chatHistory.length - 1; i >= 0; i--) {
@@ -486,6 +531,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
486
531
  throw new DisposedError();
487
532
  const model = sequence.model;
488
533
  const context = sequence.context;
534
+ let removeRawFromHistory = false;
489
535
  if (isFirstEvaluation && lastEvaluationContextWindowHistory != null && sequence.isLoadedToMemory) {
490
536
  const newContextWindow = lastEvaluationContextWindowHistory.slice();
491
537
  if (endWithUserText) {
@@ -514,7 +560,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
514
560
  history: newContextWindow,
515
561
  stopGenerationTriggers,
516
562
  tokens,
517
- newResolvedHistory: resolvedHistory,
563
+ removeRawFromHistory,
518
564
  newHistoryCompressionMetadata: lastHistoryCompressionMetadata,
519
565
  ignoreStartText: ignoreStartText ?? [],
520
566
  functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
@@ -523,9 +569,10 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
523
569
  };
524
570
  }
525
571
  }
526
- resolvedHistory = sequence.isLoadedToMemory
527
- ? resolvedHistory.slice()
528
- : resolvedHistory.map(removeRawFromHistoryItem);
572
+ removeRawFromHistory = !sequence.isLoadedToMemory;
573
+ resolvedHistory = removeRawFromHistory
574
+ ? resolvedHistory.map(removeRawFromHistoryItem)
575
+ : resolvedHistory.slice();
529
576
  if (resolvedContextShift.lastEvaluationMetadata != null) {
530
577
  const contextShiftSize = resolvedContextShift.size instanceof Function
531
578
  ? await resolvedContextShift.size(sequence)
@@ -550,7 +597,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
550
597
  history: compressedHistory,
551
598
  stopGenerationTriggers,
552
599
  tokens: contextText.tokenize(model.tokenizer),
553
- newResolvedHistory: resolvedHistory,
600
+ removeRawFromHistory,
554
601
  newHistoryCompressionMetadata: metadata,
555
602
  ignoreStartText: ignoreStartText ?? [],
556
603
  functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
@@ -570,7 +617,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
570
617
  history: resolvedHistory,
571
618
  stopGenerationTriggers,
572
619
  tokens,
573
- newResolvedHistory: resolvedHistory,
620
+ removeRawFromHistory,
574
621
  newHistoryCompressionMetadata: lastHistoryCompressionMetadata,
575
622
  ignoreStartText: ignoreStartText ?? [],
576
623
  functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
@@ -601,7 +648,7 @@ async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHis
601
648
  history: compressedHistory,
602
649
  stopGenerationTriggers,
603
650
  tokens: contextText.tokenize(model.tokenizer),
604
- newResolvedHistory: resolvedHistory,
651
+ removeRawFromHistory,
605
652
  newHistoryCompressionMetadata: metadata,
606
653
  ignoreStartText: ignoreStartText ?? [],
607
654
  functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
@@ -615,6 +662,7 @@ class GenerateResponseState {
615
662
  history;
616
663
  onTextChunk;
617
664
  onToken;
665
+ onResponseChunk;
618
666
  signal;
619
667
  stopOnAbortSignal;
620
668
  maxTokens;
@@ -638,7 +686,6 @@ class GenerateResponseState {
638
686
  repeatPenaltyEnabled;
639
687
  resolvedContextShift;
640
688
  resolvedRepeatPenalty;
641
- lastModelResponse;
642
689
  grammarEvaluationState;
643
690
  functionNameGrammar;
644
691
  functionsGrammar;
@@ -651,10 +698,13 @@ class GenerateResponseState {
651
698
  ignoreStartTextDetector = new StopGenerationDetector();
652
699
  locksToReleaseOnValidGeneration = [];
653
700
  resolvedHistory;
701
+ noRawInResolvedHistory;
654
702
  res = [];
655
703
  pendingTokens = [];
656
704
  ignoredStartTextTokens = [];
657
705
  resFunctionCalls = [];
706
+ segmentHandler;
707
+ pendingPartialTokens = [];
658
708
  functionEvaluationMode = false;
659
709
  currentFunctionCallPreviousText = LlamaText([]);
660
710
  currentFunctionCallCurrentPartTokens = [];
@@ -678,8 +728,6 @@ class GenerateResponseState {
678
728
  disengageInitiallyEngagedFunctionCall = [];
679
729
  userTextSuffix = undefined;
680
730
  tokens = [];
681
- contextWindowLastModelResponse = "";
682
- contextWindowsRes = [];
683
731
  // token evaluation loop
684
732
  evaluationIterator;
685
733
  currentIteration;
@@ -688,12 +736,13 @@ class GenerateResponseState {
688
736
  currentTokens = [];
689
737
  currentText = "";
690
738
  currentQueuedTokenRelease;
691
- constructor(llamaChat, chatWrapper, history, { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
739
+ constructor(llamaChat, chatWrapper, history, { onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
692
740
  this.llamaChat = llamaChat;
693
741
  this.chatWrapper = chatWrapper;
694
742
  this.history = history;
695
743
  this.onTextChunk = safeEventCallback(onTextChunk);
696
744
  this.onToken = safeEventCallback(onToken);
745
+ this.onResponseChunk = safeEventCallback(onResponseChunk);
697
746
  this.signal = signal;
698
747
  this.stopOnAbortSignal = stopOnAbortSignal;
699
748
  this.maxTokens = maxTokens;
@@ -718,9 +767,10 @@ class GenerateResponseState {
718
767
  throw this.signal.reason;
719
768
  if (this.llamaChat.disposed)
720
769
  throw new DisposedError();
721
- this.resolvedHistory = this.llamaChat.sequence.isLoadedToMemory
722
- ? this.history.slice()
723
- : this.history.map(removeRawFromHistoryItem);
770
+ this.noRawInResolvedHistory = !this.llamaChat.sequence.isLoadedToMemory;
771
+ this.resolvedHistory = this.noRawInResolvedHistory
772
+ ? this.history.map(removeRawFromHistoryItem)
773
+ : this.history.slice();
724
774
  this.resolvedContextShift = {
725
775
  ...defaultContextShiftOptions,
726
776
  ...removeNullFields(this.contextShift)
@@ -731,7 +781,6 @@ class GenerateResponseState {
731
781
  ...(repeatPenalty ?? {}),
732
782
  lastTokens: repeatPenalty?.lastTokens ?? defaultRepeatPenaltyLastTokens
733
783
  };
734
- this.lastModelResponse = getLastTextModelResponseFromChatHistory(this.resolvedHistory);
735
784
  this.repeatPenaltyEnabled = this.resolvedRepeatPenalty.lastTokens > 0;
736
785
  this.grammarEvaluationState = this.grammar != null
737
786
  ? new LlamaGrammarEvaluationState({ model: this.llamaChat.model, grammar: this.grammar })
@@ -742,7 +791,7 @@ class GenerateResponseState {
742
791
  this.functionsGrammar = undefined;
743
792
  this.functionsEvaluationState = undefined;
744
793
  this.lastContextWindowHistory = lastEvaluationContextWindowHistory ?? this.resolvedHistory;
745
- this.lastHistoryCompressionMetadata = this.resolvedContextShift;
794
+ this.lastHistoryCompressionMetadata = this.resolvedContextShift.lastEvaluationMetadata;
746
795
  if (this.customStopTriggers != null)
747
796
  StopGenerationDetector.resolveStopTriggers(this.customStopTriggers, this.llamaChat.model.tokenizer)
748
797
  .map((stopTrigger) => this.customStopGenerationTriggersDetector.addStopTrigger(stopTrigger));
@@ -754,6 +803,22 @@ class GenerateResponseState {
754
803
  this.chatWrapper.settings.functions?.parallelism?.call?.sectionPrefix ?? "",
755
804
  this.chatWrapper.settings.functions.call.prefix
756
805
  ]), this.llamaChat.model.tokenizer));
806
+ const segmentDefinitions = new Map();
807
+ for (const segmentType of allSegmentTypes) {
808
+ const segmentDefinition = getChatWrapperSegmentDefinition(this.chatWrapper.settings, segmentType);
809
+ if (segmentDefinition != null)
810
+ segmentDefinitions.set(segmentType, segmentDefinition);
811
+ }
812
+ this.segmentHandler = new SegmentHandler({
813
+ model: this.llamaChat.model,
814
+ onTextChunk: this.onTextChunk,
815
+ onToken: this.onToken,
816
+ onResponseChunk: this.onResponseChunk,
817
+ previousTokens: this.getLastTokens(),
818
+ closeAllSegments: this.chatWrapper.settings.segments?.closeAllSegments,
819
+ segmentDefinitions,
820
+ initialSegmentStack: SegmentHandler.getStackFromModelResponse(getLastModelMessageFullResponseFromChatHistory(this.resolvedHistory))
821
+ });
757
822
  this.getPenaltyTokens = this.getPenaltyTokens.bind(this);
758
823
  }
759
824
  async dispose() {
@@ -763,19 +828,47 @@ class GenerateResponseState {
763
828
  await this.dispose();
764
829
  }
765
830
  ensureLastHistoryItemIsModel() {
766
- if (this.resolvedHistory.length === 0 || this.resolvedHistory[this.resolvedHistory.length - 1].type !== "model")
831
+ if (this.resolvedHistory.at(-1)?.type !== "model")
767
832
  this.resolvedHistory.push({
768
833
  type: "model",
769
834
  response: []
770
835
  });
771
836
  }
772
837
  ensureLastHistoryItemIsUser() {
773
- if (this.resolvedHistory.length === 0 || this.resolvedHistory[this.resolvedHistory.length - 1].type !== "user")
838
+ if (this.resolvedHistory.at(-1)?.type !== "user")
774
839
  this.resolvedHistory.push({
775
840
  type: "user",
776
841
  text: ""
777
842
  });
778
843
  }
844
+ ensureReopenedThoughtSegmentAfterFunctionCallsIfNeeded() {
845
+ if (this.chatWrapper.settings.segments?.thought?.reopenAfterFunctionCalls !== true)
846
+ return;
847
+ const lastModelResponseItem = this.resolvedHistory.at(-1);
848
+ if (lastModelResponseItem == null || lastModelResponseItem.type !== "model")
849
+ return;
850
+ const lastResponse = lastModelResponseItem.response.at(-1);
851
+ if (lastResponse == null)
852
+ return;
853
+ const lastResponseIsFunctionCall = typeof lastResponse !== "string" && lastResponse.type === "functionCall";
854
+ if (!lastResponseIsFunctionCall)
855
+ return;
856
+ const currentResponseSegmentsStack = SegmentHandler.getStackFromModelResponse(lastModelResponseItem.response);
857
+ if (currentResponseSegmentsStack.includes("thought"))
858
+ return;
859
+ const hadThoughtSegments = this.resolvedHistory.some((chatItem) => {
860
+ if (chatItem.type !== "model")
861
+ return false;
862
+ return chatItem.response.some((responseItem) => {
863
+ if (typeof responseItem === "string")
864
+ return false;
865
+ return responseItem.type === "segment" && responseItem.segmentType === "thought";
866
+ });
867
+ });
868
+ if (!hadThoughtSegments)
869
+ return;
870
+ this.segmentHandler.openSegment("thought");
871
+ }
779
872
  ensureNotAborted() {
780
873
  if (this.signal?.aborted && (!this.stopOnAbortSignal || this.res.length === 0))
781
874
  throw this.signal.reason;
@@ -784,7 +877,7 @@ class GenerateResponseState {
784
877
  }
785
878
  getPenaltyTokens() {
786
879
  if (this.llamaChat.disposed)
787
- throw new DisposedError();
880
+ return [];
788
881
  let punishTokens = this.res.slice(-this.resolvedRepeatPenalty.lastTokens);
789
882
  if (this.resolvedRepeatPenalty.punishTokensFilter != null)
790
883
  punishTokens = this.resolvedRepeatPenalty.punishTokensFilter(punishTokens);
@@ -796,24 +889,10 @@ class GenerateResponseState {
796
889
  return punishTokens;
797
890
  }
798
891
  getResolvedHistoryWithCurrentModelResponse() {
799
- if (this.res.length === 0)
800
- return this.resolvedHistory;
801
- let modelResponse = this.llamaChat.model.detokenize(this.res);
802
- if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix)
803
- modelResponse = modelResponse.trimEnd();
804
- if (modelResponse === "")
805
- return this.resolvedHistory;
806
- return setLastModelTextResponseInChatHistory(this.resolvedHistory, this.lastModelResponse + modelResponse);
892
+ return mergeGeneratedResultWithChatHistory("model", this.resolvedHistory, this.segmentHandler.getModelResponseSegments());
807
893
  }
808
894
  getContextWindowsHistoryWithCurrentModelResponse() {
809
- if (this.contextWindowsRes.length === 0)
810
- return this.lastContextWindowHistory;
811
- let modelResponse = this.llamaChat.model.detokenize(this.contextWindowsRes);
812
- if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix)
813
- modelResponse = modelResponse.trimEnd();
814
- if (modelResponse === "")
815
- return this.lastContextWindowHistory;
816
- return setLastModelTextResponseInChatHistory(this.lastContextWindowHistory, this.contextWindowLastModelResponse + modelResponse);
895
+ return mergeGeneratedResultWithChatHistory("model", this.lastContextWindowHistory, this.segmentHandler.getContextWindowModelResponseSegments());
817
896
  }
818
897
  removeFoundStartIgnoreTextsFromPendingTokens(forceRemove = false) {
819
898
  if (!this.removedStartTextToIgnore && this.res.length === 0 && this.pendingTokens.length > 0 &&
@@ -826,14 +905,26 @@ class GenerateResponseState {
826
905
  this.contextWindowTokens,
827
906
  this.ignoredStartTextTokens
828
907
  ]);
908
+ const pendingPartialTokens = [];
829
909
  for (let i = 0; i < this.pendingTokens.length; i++) {
910
+ const currentToken = this.pendingTokens[i];
911
+ const tokens = [...pendingPartialTokens, currentToken];
912
+ const text = this.llamaChat.model.detokenize(tokens, false, lastTokensForDetokenizer);
913
+ if (pendingPartialTokens.length === 0 &&
914
+ text.endsWith(UNKNOWN_UNICODE_CHAR) &&
915
+ !this.llamaChat.model.isSpecialToken(currentToken) &&
916
+ !this.llamaChat.model.isEogToken(currentToken)) {
917
+ pendingPartialTokens.length = 0;
918
+ pushAll(pendingPartialTokens, tokens);
919
+ continue;
920
+ }
830
921
  this.ignoreStartTextDetector.recordGeneration({
831
- text: this.llamaChat.model.detokenize([this.pendingTokens[i]], false, lastTokensForDetokenizer),
832
- tokens: [this.pendingTokens[i]],
922
+ text: this.llamaChat.model.detokenize(tokens, false, lastTokensForDetokenizer),
923
+ tokens,
833
924
  startNewChecks: i === 0,
834
925
  triggerMustStartWithGeneration: true
835
926
  });
836
- lastTokensForDetokenizer.push(this.pendingTokens[i]);
927
+ pushAll(lastTokensForDetokenizer, tokens);
837
928
  if (this.ignoreStartTextDetector.hasTriggeredStops) {
838
929
  mostExhaustiveTriggeredStops = this.ignoreStartTextDetector.getTriggeredStops();
839
930
  this.ignoreStartTextDetector.clearTriggeredStops();
@@ -902,11 +993,12 @@ class GenerateResponseState {
902
993
  const queuedChunkTokens = this.streamRegulator.getAllQueuedChunkTokens();
903
994
  const functionCallsTokens = this.getContextWindowFunctionCallsTokens();
904
995
  if (!avoidReloadingHistory || !this.canAvoidReloadingHistory || !this.llamaChat.sequence.isLoadedToMemory) {
905
- const { history: contextWindowHistory, stopGenerationTriggers, tokens: contextWindowTokens, newResolvedHistory, newHistoryCompressionMetadata, ignoreStartText, functionCallInitiallyEngaged, disengageInitiallyEngagedFunctionCall, userTextSuffix } = await getContextWindow({
996
+ const { history: contextWindowHistory, stopGenerationTriggers, tokens: contextWindowTokens, removeRawFromHistory, newHistoryCompressionMetadata, ignoreStartText, functionCallInitiallyEngaged, disengageInitiallyEngagedFunctionCall, userTextSuffix } = await getContextWindow({
906
997
  resolvedHistory: resolvedHistory,
907
998
  resolvedContextShift: this.resolvedContextShift,
908
999
  lastHistoryCompressionMetadata: this.lastHistoryCompressionMetadata,
909
- pendingTokensCount: this.pendingTokens.length + queuedChunkTokens.length + functionCallsTokens.length,
1000
+ pendingTokensCount: this.pendingTokens.length + queuedChunkTokens.length + functionCallsTokens.length +
1001
+ this.pendingPartialTokens.length,
910
1002
  isFirstEvaluation: this.isFirstEvaluation,
911
1003
  chatWrapper: this.chatWrapper,
912
1004
  lastEvaluationContextWindowHistory: resolvedContextWindowsHistory,
@@ -924,19 +1016,22 @@ class GenerateResponseState {
924
1016
  this.functionCallInitiallyEngaged = functionCallInitiallyEngaged;
925
1017
  this.disengageInitiallyEngagedFunctionCall = disengageInitiallyEngagedFunctionCall;
926
1018
  this.userTextSuffix = userTextSuffix;
927
- this.resolvedHistory = newResolvedHistory;
928
1019
  this.lastHistoryCompressionMetadata = newHistoryCompressionMetadata;
929
1020
  this.lastContextWindowHistory = contextWindowHistory;
930
- this.contextWindowLastModelResponse = getLastTextModelResponseFromChatHistory(contextWindowHistory);
931
- this.contextWindowsRes = [];
1021
+ this.segmentHandler.resetContextWindow();
932
1022
  this.canAvoidReloadingHistory = true;
1023
+ if (removeRawFromHistory && !this.noRawInResolvedHistory) {
1024
+ this.noRawInResolvedHistory = true;
1025
+ this.resolvedHistory = this.resolvedHistory.map(removeRawFromHistoryItem);
1026
+ }
933
1027
  }
934
1028
  this.tokens = [
935
1029
  ...this.contextWindowTokens,
936
1030
  ...this.ignoredStartTextTokens,
937
1031
  ...this.pendingTokens,
938
1032
  ...queuedChunkTokens,
939
- ...functionCallsTokens
1033
+ ...functionCallsTokens,
1034
+ ...this.pendingPartialTokens
940
1035
  ];
941
1036
  if (avoidReloadingHistory && this.tokens.length >= this.llamaChat.sequence.context.contextSize - 1)
942
1037
  return await this.loadContextWindow(resolvedHistory, resolvedContextWindowsHistory, endWithUserText, false);
@@ -1017,24 +1112,24 @@ class GenerateResponseState {
1017
1112
  pushAll(prefixDetectorRecordedTokens, tokens);
1018
1113
  }
1019
1114
  }
1020
- for await (const token of this.evaluateWithContextShift(loadContextWindow)) {
1115
+ for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
1021
1116
  const stopGenerationTriggerRes = this.handleStopGenerationTrigger("model");
1022
1117
  if (stopGenerationTriggerRes != null)
1023
1118
  return stopGenerationTriggerRes;
1024
- this.currentFunctionCallCurrentPartTokens.push(token);
1119
+ pushAll(this.currentFunctionCallCurrentPartTokens, tokens);
1025
1120
  this.disengageInitiallyEngagedFunctionMode.recordGeneration({
1026
1121
  text: this.currentText,
1027
1122
  tokens: this.currentTokens,
1028
- startNewChecks: this.currentFunctionCallCurrentPartTokens.length === 1,
1123
+ startNewChecks: this.currentFunctionCallCurrentPartTokens.length === tokens.length,
1029
1124
  triggerMustStartWithGeneration: true
1030
1125
  });
1031
1126
  if (prefixDetector.hasTriggeredStops)
1032
- afterPrefixLeftoverTokens.push(token);
1127
+ pushAll(afterPrefixLeftoverTokens, tokens);
1033
1128
  else {
1034
1129
  prefixDetector.recordGeneration({
1035
1130
  text: this.currentText,
1036
1131
  tokens: this.currentTokens,
1037
- startNewChecks: this.currentFunctionCallCurrentPartTokens.length === 1,
1132
+ startNewChecks: this.currentFunctionCallCurrentPartTokens.length === tokens.length,
1038
1133
  triggerMustStartWithGeneration: true
1039
1134
  });
1040
1135
  pushAll(prefixDetectorRecordedTokens, this.currentTokens);
@@ -1109,8 +1204,8 @@ class GenerateResponseState {
1109
1204
  }
1110
1205
  }
1111
1206
  }
1112
- for await (const token of this.evaluateWithContextShift(loadContextWindow)) {
1113
- this.currentFunctionCallCurrentPartTokens.push(token);
1207
+ for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
1208
+ pushAll(this.currentFunctionCallCurrentPartTokens, tokens);
1114
1209
  functionNameGenerationDoneDetector.recordGeneration({
1115
1210
  text: this.currentText,
1116
1211
  tokens: this.currentTokens
@@ -1154,8 +1249,8 @@ class GenerateResponseState {
1154
1249
  });
1155
1250
  StopGenerationDetector.resolveStopTriggers(this.functionsGrammar.stopGenerationTriggers, this.llamaChat.model.tokenizer)
1156
1251
  .map((stopTrigger) => functionParamsGenerationDoneDetector.addStopTrigger(stopTrigger));
1157
- for await (const token of this.evaluateWithContextShift(loadContextWindow)) {
1158
- this.currentFunctionCallCurrentPartTokens.push(token);
1252
+ for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
1253
+ pushAll(this.currentFunctionCallCurrentPartTokens, tokens);
1159
1254
  functionParamsGenerationDoneDetector.recordGeneration({
1160
1255
  text: this.currentText,
1161
1256
  tokens: this.currentTokens
@@ -1213,8 +1308,8 @@ class GenerateResponseState {
1213
1308
  LlamaText(new SpecialToken("EOT"))
1214
1309
  ], this.llamaChat.model.tokenizer)
1215
1310
  .map((stopTrigger) => sectionSuffixDetector.addStopTrigger(stopTrigger));
1216
- for await (const token of this.evaluateWithContextShift(loadContextWindow)) {
1217
- this.currentFunctionCallCurrentPartTokens.push(token);
1311
+ for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
1312
+ pushAll(this.currentFunctionCallCurrentPartTokens, tokens);
1218
1313
  sectionSuffixDetector.recordGeneration({
1219
1314
  text: this.currentText,
1220
1315
  tokens: this.currentTokens,
@@ -1258,17 +1353,17 @@ class GenerateResponseState {
1258
1353
  returnFunctionCallResults() {
1259
1354
  if (this.resFunctionCalls.length > 0) {
1260
1355
  this.releasePartiallyFreeTokensBeforeFunctionCallStart();
1261
- let modelResponse = this.llamaChat.model.detokenize(this.res);
1262
- let contextWindowModelResponse = this.llamaChat.model.detokenize(this.contextWindowsRes);
1263
- if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) {
1264
- modelResponse = modelResponse.trimEnd();
1265
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
1266
- }
1356
+ this.segmentHandler.onFinishedGeneration();
1357
+ const trimWhitespaceSuffix = this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix;
1358
+ const responseSegments = this.segmentHandler.getModelResponseSegments(trimWhitespaceSuffix);
1267
1359
  return {
1268
- response: modelResponse,
1360
+ response: responseSegments
1361
+ .filter((segment) => typeof segment === "string")
1362
+ .join(""),
1363
+ fullResponse: responseSegments,
1269
1364
  lastEvaluation: {
1270
- contextWindow: setLastTextInChatHistory("model", this.lastContextWindowHistory, this.contextWindowLastModelResponse + contextWindowModelResponse),
1271
- cleanHistory: setLastTextInChatHistory("model", this.resolvedHistory, this.lastModelResponse + modelResponse),
1365
+ contextWindow: mergeGeneratedResultWithChatHistory("model", this.lastContextWindowHistory, this.segmentHandler.getContextWindowModelResponseSegments(trimWhitespaceSuffix)),
1366
+ cleanHistory: mergeGeneratedResultWithChatHistory("model", this.resolvedHistory, responseSegments),
1272
1367
  contextShiftMetadata: this.lastHistoryCompressionMetadata
1273
1368
  },
1274
1369
  functionCalls: this.resFunctionCalls.map((functionCall) => {
@@ -1292,9 +1387,10 @@ class GenerateResponseState {
1292
1387
  await this.alignCurrentSequenceStateWithCurrentTokens();
1293
1388
  await this.createNewEvaluationIterator();
1294
1389
  while (await this.iterateEvaluation()) {
1295
- if (this.currentToken == null)
1390
+ if (this.currentTokens.length === 0)
1296
1391
  break;
1297
- yield this.currentToken;
1392
+ if (!this.holdPartialTokensForNextEvaluation())
1393
+ yield this.currentTokens;
1298
1394
  if (this.shouldAbort)
1299
1395
  return;
1300
1396
  if (this.updateShouldContextShift())
@@ -1367,9 +1463,14 @@ class GenerateResponseState {
1367
1463
  this.currentIterationReplacementToken = undefined;
1368
1464
  this.ensureNotAborted();
1369
1465
  this.generatedTokens++;
1370
- if (this.currentIteration != null && this.currentIteration?.done !== true) {
1371
- this.currentToken = this.currentIteration.value;
1372
- this.currentTokens = [this.currentToken];
1466
+ if ((this.currentIteration != null && this.currentIteration?.done !== true) || this.pendingPartialTokens.length !== 0) {
1467
+ this.currentToken = this.currentIteration?.value ?? undefined;
1468
+ this.currentTokens = this.currentToken != null
1469
+ ? this.pendingPartialTokens.length === 0
1470
+ ? [this.currentToken]
1471
+ : [...this.pendingPartialTokens, this.currentToken]
1472
+ : [...this.pendingPartialTokens];
1473
+ this.pendingPartialTokens.length = 0;
1373
1474
  this.currentText = this.llamaChat.model.detokenize(this.currentTokens, false, this.getLastTokens());
1374
1475
  if (this.functionEvaluationMode === false)
1375
1476
  this.currentQueuedTokenRelease = this.streamRegulator.addChunk({
@@ -1382,6 +1483,19 @@ class GenerateResponseState {
1382
1483
  }
1383
1484
  return false;
1384
1485
  }
1486
+ holdPartialTokensForNextEvaluation() {
1487
+ if (this.pendingPartialTokens.length === 0 &&
1488
+ this.currentText.endsWith(UNKNOWN_UNICODE_CHAR) &&
1489
+ this.currentToken != null &&
1490
+ !this.llamaChat.model.isSpecialToken(this.currentToken) &&
1491
+ !this.llamaChat.model.isEogToken(this.currentToken)) {
1492
+ this.pendingPartialTokens.length = 0;
1493
+ pushAll(this.pendingPartialTokens, this.currentTokens);
1494
+ this.streamRegulator.removeChunkIfLast(this.currentQueuedTokenRelease);
1495
+ return true;
1496
+ }
1497
+ return false;
1498
+ }
1385
1499
  waitOnPartialCharactersOrWhiteSpaceTokens() {
1386
1500
  if (this.currentText.endsWith(UNKNOWN_UNICODE_CHAR) || ((this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) && this.currentText?.trim() === "") || (this.currentText === "" && this.locksToReleaseOnValidGeneration.length > 0 &&
1387
1501
  !this.llamaChat.model.isSpecialToken(this.currentToken))) {
@@ -1449,21 +1563,22 @@ class GenerateResponseState {
1449
1563
  const { firstRemainingGenerationAfterStop } = StopGenerationDetector.getFirstRemainingGenerationAfterStop(triggeredStops);
1450
1564
  this.removeFoundStartIgnoreTextsFromPendingTokens(true);
1451
1565
  this.pushPendingTokensAndCallOnToken();
1452
- let modelResponse = this.llamaChat.model.detokenize(this.res);
1453
- let contextWindowModelResponse = this.llamaChat.model.detokenize(this.contextWindowsRes);
1454
- if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) {
1455
- modelResponse = modelResponse.trimEnd();
1456
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
1457
- }
1566
+ this.segmentHandler.onFinishedGeneration();
1567
+ const trimWhitespaceSuffix = this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix;
1568
+ const responseSegments = this.segmentHandler.getModelResponseSegments(trimWhitespaceSuffix);
1569
+ const response = responseSegments
1570
+ .filter((segment) => typeof segment === "string")
1571
+ .join("");
1458
1572
  const lastEvaluation = {
1459
- contextWindow: setLastTextInChatHistory(lastHistoryItemType, this.lastContextWindowHistory, this.contextWindowLastModelResponse + contextWindowModelResponse),
1460
- cleanHistory: setLastTextInChatHistory(lastHistoryItemType, this.resolvedHistory, this.lastModelResponse + modelResponse),
1573
+ contextWindow: mergeGeneratedResultWithChatHistory(lastHistoryItemType, this.lastContextWindowHistory, this.segmentHandler.getContextWindowModelResponseSegments(trimWhitespaceSuffix)),
1574
+ cleanHistory: mergeGeneratedResultWithChatHistory(lastHistoryItemType, this.resolvedHistory, responseSegments),
1461
1575
  contextShiftMetadata: this.lastHistoryCompressionMetadata
1462
1576
  };
1463
1577
  const isEogToken = this.llamaChat.model.isEogToken(this.currentToken);
1464
1578
  if (isEogToken || this.stopGenerationDetector.hasTriggeredStops) {
1465
1579
  return {
1466
- response: modelResponse,
1580
+ response,
1581
+ fullResponse: responseSegments,
1467
1582
  lastEvaluation,
1468
1583
  metadata: {
1469
1584
  remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
@@ -1474,7 +1589,8 @@ class GenerateResponseState {
1474
1589
  };
1475
1590
  }
1476
1591
  return {
1477
- response: modelResponse,
1592
+ response,
1593
+ fullResponse: responseSegments,
1478
1594
  lastEvaluation,
1479
1595
  metadata: {
1480
1596
  remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
@@ -1511,17 +1627,17 @@ class GenerateResponseState {
1511
1627
  }
1512
1628
  handleMaxTokensTrigger(lastHistoryItemType) {
1513
1629
  if (this.isMaxTokensTriggered()) {
1514
- let modelResponse = this.llamaChat.model.detokenize(this.res);
1515
- let contextWindowModelResponse = this.llamaChat.model.detokenize(this.contextWindowsRes);
1516
- if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) {
1517
- modelResponse = modelResponse.trimEnd();
1518
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
1519
- }
1630
+ this.segmentHandler.onFinishedGeneration();
1631
+ const trimWhitespaceSuffix = this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix;
1632
+ const responseSegments = this.segmentHandler.getModelResponseSegments(trimWhitespaceSuffix);
1520
1633
  return {
1521
- response: modelResponse,
1634
+ response: responseSegments
1635
+ .filter((segment) => typeof segment === "string")
1636
+ .join(""),
1637
+ fullResponse: responseSegments,
1522
1638
  lastEvaluation: {
1523
- contextWindow: setLastTextInChatHistory(lastHistoryItemType, this.lastContextWindowHistory, this.contextWindowLastModelResponse + contextWindowModelResponse),
1524
- cleanHistory: setLastTextInChatHistory(lastHistoryItemType, this.resolvedHistory, this.lastModelResponse + modelResponse),
1639
+ contextWindow: mergeGeneratedResultWithChatHistory(lastHistoryItemType, this.lastContextWindowHistory, this.segmentHandler.getContextWindowModelResponseSegments(trimWhitespaceSuffix)),
1640
+ cleanHistory: mergeGeneratedResultWithChatHistory(lastHistoryItemType, this.resolvedHistory, responseSegments),
1525
1641
  contextShiftMetadata: this.lastHistoryCompressionMetadata
1526
1642
  },
1527
1643
  metadata: {
@@ -1542,17 +1658,17 @@ class GenerateResponseState {
1542
1658
  if (this.shouldAbort && this.signal?.aborted && this.stopOnAbortSignal) {
1543
1659
  if (this.res.length === 0)
1544
1660
  throw this.signal.reason;
1545
- let modelResponse = this.llamaChat.model.detokenize(this.res);
1546
- let contextWindowModelResponse = this.llamaChat.model.detokenize(this.contextWindowsRes);
1547
- if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) {
1548
- modelResponse = modelResponse.trimEnd();
1549
- contextWindowModelResponse = contextWindowModelResponse.trimEnd();
1550
- }
1661
+ this.segmentHandler.onFinishedGeneration();
1662
+ const trimWhitespaceSuffix = this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix;
1663
+ const responseSegments = this.segmentHandler.getModelResponseSegments(trimWhitespaceSuffix);
1551
1664
  return {
1552
- response: modelResponse,
1665
+ response: responseSegments
1666
+ .filter((segment) => typeof segment === "string")
1667
+ .join(""),
1668
+ fullResponse: responseSegments,
1553
1669
  lastEvaluation: {
1554
- contextWindow: setLastTextInChatHistory(lastHistoryItemType, this.lastContextWindowHistory, this.contextWindowLastModelResponse + contextWindowModelResponse),
1555
- cleanHistory: setLastTextInChatHistory(lastHistoryItemType, this.resolvedHistory, this.lastModelResponse + modelResponse),
1670
+ contextWindow: mergeGeneratedResultWithChatHistory(lastHistoryItemType, this.lastContextWindowHistory, this.segmentHandler.getContextWindowModelResponseSegments(trimWhitespaceSuffix)),
1671
+ cleanHistory: mergeGeneratedResultWithChatHistory(lastHistoryItemType, this.resolvedHistory, responseSegments),
1556
1672
  contextShiftMetadata: this.lastHistoryCompressionMetadata
1557
1673
  },
1558
1674
  metadata: {
@@ -1565,10 +1681,8 @@ class GenerateResponseState {
1565
1681
  pushPendingTokensAndCallOnToken() {
1566
1682
  if (this.pendingTokens.length === 0)
1567
1683
  return;
1568
- this.onToken?.(this.pendingTokens.slice());
1569
- this.onTextChunk?.(this.llamaChat.model.detokenize(this.pendingTokens, false, this.res));
1684
+ this.segmentHandler.processTokens(this.pendingTokens);
1570
1685
  pushAll(this.res, this.pendingTokens);
1571
- pushAll(this.contextWindowsRes, this.pendingTokens);
1572
1686
  this.pendingTokens.length = 0;
1573
1687
  }
1574
1688
  getLastTokens(maxTokens = maxRecentDetokenizerTokens) {
@@ -1577,8 +1691,470 @@ class GenerateResponseState {
1577
1691
  this.ignoredStartTextTokens,
1578
1692
  this.pendingTokens,
1579
1693
  this.streamRegulator.getLastQueuedChunkTokens(maxTokens),
1580
- this.getContextWindowFunctionCallsTokens()
1694
+ this.getContextWindowFunctionCallsTokens(),
1695
+ this.pendingPartialTokens
1581
1696
  ], maxTokens);
1582
1697
  }
1583
1698
  }
1699
/**
 * Routes generated tokens either into the main response or into typed segments
 * that are delimited by a configured prefix (opens a segment) and an optional
 * suffix (closes it).
 *
 * Two parallel records are maintained:
 * - `_segments`: everything generated since this handler was created
 * - `_contextWindowSegments`: everything generated since the last `resetContextWindow()` call
 *
 * Each record entry is either an array of tokens (main response) or an object of the shape
 * `{type, tokens, ended, start, startTime?, endTime?}` (segment).
 */
class SegmentHandler {
    model;
    onToken;
    onTextChunk;
    onResponseChunk;
    _closeAllSegmentsDetector;
    _segmentDetectors;
    _segmentsStack = [];
    _segmentsStackSet = new Set();
    _ownedSegmentsStackLength = 0;
    _segments = [];
    _segmentsStartTokenTrail = [];
    _contextWindowSegments = [];
    _contextWindowStartTokenTrail = [];
    _initialTokensTrail;
    _tokensTrail;
    _streamRegulator = new TokenStreamRegulator();
    _segmentDefinitions;
    /**
     * @param options
     * @param options.model - used for `tokenize`, `detokenize` and its `tokenizer`
     * @param [options.onTextChunk] - called with the text of each main-response chunk
     * @param [options.onToken] - called with the tokens of each main-response chunk
     * @param [options.onResponseChunk] - called for every chunk (main response and segments)
     * and for segment start/end events
     * @param options.segmentDefinitions - `Map` of segment type to `{prefix, suffix?}`
     * @param [options.closeAllSegments] - text that closes every open segment when generated
     * @param options.initialSegmentStack - segment types that are already open.
     * NOTE(review): the array is stored as-is and is mutated by this handler
     * @param options.previousTokens - tokens preceding the generation, used as detokenizer context
     */
    constructor({ model, onTextChunk, onToken, onResponseChunk, segmentDefinitions, closeAllSegments, initialSegmentStack, previousTokens }) {
        this.model = model;
        this.onTextChunk = onTextChunk;
        this.onToken = onToken;
        this.onResponseChunk = onResponseChunk;
        this._initialTokensTrail = previousTokens.slice(-maxRecentDetokenizerTokens);
        this._segmentsStartTokenTrail = previousTokens.slice(-maxRecentDetokenizerTokens);
        this._tokensTrail = previousTokens.slice(-maxRecentDetokenizerTokens);
        this._closeAllSegmentsDetector = closeAllSegments != null
            ? new StopGenerationDetector()
                .addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText(closeAllSegments), this.model.tokenizer))
            : undefined;
        this._segmentDetectors = new Map();
        this._segmentsStack = initialSegmentStack;
        this._segmentsStackSet = new Set(initialSegmentStack);
        this._ownedSegmentsStackLength = initialSegmentStack.length;
        this._segmentDefinitions = segmentDefinitions;
        for (const [segment, { prefix, suffix }] of segmentDefinitions.entries()) {
            this._segmentDetectors.set(segment, {
                prefix: new StopGenerationDetector()
                    .addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText(prefix), this.model.tokenizer)),
                suffix: suffix != null
                    ? new StopGenerationDetector()
                        .addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText(suffix), this.model.tokenizer))
                    : undefined
            });
        }
    }
    /**
     * Feed newly generated tokens into the handler.
     * Tokens are grouped until their detokenized text no longer ends with `UNKNOWN_UNICODE_CHAR`,
     * and each such group is then processed as a single chunk.
     *
     * NOTE(review): tokens that still decode to a trailing `UNKNOWN_UNICODE_CHAR` when the loop ends
     * are not forwarded by this call - presumably the caller only passes complete sequences; confirm.
     * @param tokens
     */
    processTokens(tokens) {
        if (tokens.length === 0)
            return;
        let pendingTokens = [];
        for (const token of tokens) {
            pendingTokens.push(token);
            const currentText = this.model.detokenize(pendingTokens, false, this._tokensTrail);
            if (currentText.endsWith(UNKNOWN_UNICODE_CHAR))
                continue;
            pushAll(this._tokensTrail, pendingTokens);
            this._processTokens(pendingTokens, currentText);
            pendingTokens = [];
        }
    }
    /** Clear the state of all the detectors and push the tokens the stream regulator can free */
    onFinishedGeneration() {
        this._clearDetectors();
        this._pushCurrentTokens(this._streamRegulator.popFreeChunkTokens());
    }
    /** Start a new context window record; the current result tail becomes its detokenizer context */
    resetContextWindow() {
        this._contextWindowSegments.length = 0;
        this._contextWindowStartTokenTrail.length = 0;
        pushAll(this._contextWindowStartTokenTrail, this._getTokenTrailFromResult());
    }
    /**
     * Push a segment type on the stack, add an empty started segment entry to both records
     * and report the segment start through `onResponseChunk`.
     * @param type
     */
    openSegment(type) {
        const now = Date.now();
        this._segmentsStack.push(type);
        this._segmentsStackSet.add(type);
        this._segments.push({ type, tokens: [], ended: false, start: true, startTime: now });
        this._contextWindowSegments.push({ type, tokens: [], ended: false, start: true, startTime: now });
        this.onResponseChunk?.({
            type: "segment",
            segmentType: type,
            tokens: [],
            text: "",
            segmentStartTime: new Date(now)
        });
    }
    /**
     * Process a single chunk: run it through the relevant detectors, apply the stack action
     * of the first detector that triggers, and otherwise push the tokens the regulator frees.
     * @param tokens
     * @param text - the detokenized text of `tokens`
     */
    _processTokens(tokens, text) {
        const queuedTokenRelease = this._streamRegulator.addChunk({
            tokens,
            text
        });
        const currentType = this._segmentsStack.at(-1);
        // records the chunk on the given detector; returns `true` when it triggered and was handled
        const handleDetector = (stopDetector, action, type) => {
            if (stopDetector == null)
                return false;
            stopDetector.recordGeneration({
                text,
                tokens,
                queuedTokenRelease
            });
            if (!stopDetector.hasTriggeredStops)
                return false;
            const [leftTokens, leftText] = this._handleTriggeredStopDetector(stopDetector);
            if (action === "pop")
                this._closeSegment(type);
            else if (action === "push")
                this.openSegment(type);
            else if (action === "reset")
                this._closeAllSegments();
            // whatever was generated after the trigger is processed as a new chunk
            if (leftTokens.length > 0)
                this._processTokens(leftTokens, leftText);
            return true;
        };
        if (currentType != null) {
            if (handleDetector(this._closeAllSegmentsDetector, "reset", currentType))
                return;
            if (handleDetector(this._segmentDetectors.get(currentType)?.suffix, "pop", currentType))
                return;
        }
        else
            this._closeAllSegmentsDetector?.clearInProgressStops();
        for (const [type, { prefix, suffix }] of this._segmentDetectors.entries()) {
            const isOpen = this._segmentsStackSet.has(type);
            if (!isOpen) {
                if (handleDetector(prefix, "push", type))
                    return;
            }
            else
                prefix.clearInProgressStops();
            if (isOpen) {
                // `currentType` suffix is already handled above, so recording the chunk on it again
                // here would feed it the same generation twice; only check the other open segments
                if (type !== currentType && handleDetector(suffix, "pop", type))
                    return;
            }
            else
                suffix?.clearInProgressStops();
        }
        this._pushCurrentTokens(this._streamRegulator.popFreeChunkTokens());
    }
    /**
     * Push the tokens that were generated before the trigger, reset the stream regulator,
     * and return what was generated after the trigger.
     * @param stopDetector - a detector that has triggered stops
     * @returns `[remainingTokens, remainingText]`
     */
    _handleTriggeredStopDetector(stopDetector) {
        this._clearDetectors(stopDetector);
        stopDetector.clearInProgressStops();
        const triggeredStops = stopDetector.getTriggeredStops();
        const freeTokens = this._streamRegulator.popFreeChunkTokens();
        const partiallyFreeTokens = this._streamRegulator.getPartiallyFreeChunk(this.model.tokenizer);
        const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, this.model.tokenizer);
        const { firstRemainingGenerationAfterStop } = StopGenerationDetector.getFirstRemainingGenerationAfterStop(triggeredStops);
        const remainingTokens = typeof firstRemainingGenerationAfterStop === "string"
            ? firstRemainingGenerationAfterStop === ""
                ? []
                : this.model.tokenize(firstRemainingGenerationAfterStop, false)
            : (firstRemainingGenerationAfterStop ?? []);
        const remainingText = typeof firstRemainingGenerationAfterStop === "string"
            ? firstRemainingGenerationAfterStop
            : this.model.detokenize(remainingTokens, false, queuedTokensBeforeStopTrigger.length === 0
                ? this._getTokenTrailFromResult()
                : queuedTokensBeforeStopTrigger);
        this._pushCurrentTokens([...freeTokens, ...queuedTokensBeforeStopTrigger]);
        stopDetector.clearTriggeredStops();
        this._streamRegulator.reset();
        return [remainingTokens, remainingText];
    }
    /** Report the end of a segment through `onResponseChunk` */
    _emitSegmentEnd(segmentType, now) {
        this.onResponseChunk?.({
            type: "segment",
            segmentType: segmentType,
            tokens: [],
            text: "",
            segmentStartTime: undefined,
            segmentEndTime: new Date(now)
        });
    }
    /** Close every segment on the stack, from the top down, and reset the initial-stack length to 0 */
    _closeAllSegments() {
        const now = Date.now();
        while (this._segmentsStack.length > 0) {
            const segmentType = this._segmentsStack.pop();
            this._segmentsStackSet.delete(segmentType);
            const lastSegment = this._segments.at(-1);
            if (lastSegment != null && !(lastSegment instanceof Array) && lastSegment.type === segmentType) {
                lastSegment.ended = true;
                lastSegment.endTime = now;
            }
            else
                this._segments.push({ type: segmentType, tokens: [], ended: true, start: false, endTime: now });
            this._emitSegmentEnd(segmentType, now);
            const lastContextWindowSegment = this._contextWindowSegments.at(-1);
            if (lastContextWindowSegment != null && !(lastContextWindowSegment instanceof Array) &&
                lastContextWindowSegment.type === segmentType) {
                lastContextWindowSegment.ended = true;
                // keep the end time consistent with the main record and with `_closeSegment`
                lastContextWindowSegment.endTime = now;
            }
            else
                this._contextWindowSegments.push({ type: segmentType, tokens: [], ended: true, start: false, endTime: now });
        }
        this._ownedSegmentsStackLength = 0;
    }
    /**
     * Close the segment of the given type.
     * When it is at the top of the stack and is the last recorded entry, that entry is marked as ended.
     * Otherwise, the stack is unwound down to (and including) the last occurrence of the type,
     * and an empty ended entry is recorded for every segment that is popped.
     * @param [type]
     */
    _closeSegment(type) {
        if (type == null)
            return;
        const lastSegment = this._segments.at(-1);
        const now = Date.now();
        if (lastSegment != null && !(lastSegment instanceof Array) && lastSegment.type === type && this._segmentsStack.at(-1) === type) {
            if (lastSegment.ended !== true) {
                lastSegment.ended = true;
                lastSegment.endTime = now;
                this._emitSegmentEnd(type, now);
            }
            this._segmentsStackSet.delete(this._segmentsStack.pop());
            if (this._segmentsStack.length < this._ownedSegmentsStackLength)
                this._ownedSegmentsStackLength = this._segmentsStack.length;
            const lastContextWindowSegment = this._contextWindowSegments.at(-1);
            // the stack was already popped above, so only the recorded entry's type is compared here
            // (comparing against the top of the stack at this point could never match)
            if (lastContextWindowSegment != null && !(lastContextWindowSegment instanceof Array) &&
                lastContextWindowSegment.type === type) {
                if (lastContextWindowSegment.ended !== true) {
                    lastContextWindowSegment.ended = true;
                    lastContextWindowSegment.endTime = now;
                }
            }
            else
                this._contextWindowSegments.push({ type, tokens: [], ended: true, start: false, endTime: now });
            return;
        }
        const typeIndex = this._segmentsStack.lastIndexOf(type);
        if (typeIndex < 0)
            return;
        for (let i = this._segmentsStack.length - 1; i >= typeIndex; i--) {
            const segmentType = this._segmentsStack.pop();
            this._segmentsStackSet.delete(segmentType);
            if (this._segmentsStack.length < this._ownedSegmentsStackLength)
                this._ownedSegmentsStackLength = this._segmentsStack.length;
            this._segments.push({ type: segmentType, tokens: [], ended: true, start: false, endTime: now });
            this._contextWindowSegments.push({ type: segmentType, tokens: [], ended: true, start: false, endTime: now });
            this._emitSegmentEnd(segmentType, now);
        }
    }
    /**
     * Clear the in-progress and triggered stops of all the detectors.
     * @param [skipDetector] - a detector to leave untouched
     */
    _clearDetectors(skipDetector) {
        if (this._closeAllSegmentsDetector !== skipDetector) {
            this._closeAllSegmentsDetector?.clearInProgressStops();
            this._closeAllSegmentsDetector?.clearTriggeredStops();
        }
        for (const { prefix, suffix } of this._segmentDetectors.values()) {
            if (prefix !== skipDetector) {
                prefix.clearInProgressStops();
                prefix.clearTriggeredStops();
            }
            if (suffix !== skipDetector) {
                suffix?.clearInProgressStops();
                suffix?.clearTriggeredStops();
            }
        }
    }
    /**
     * Append tokens to both records under the segment type at the top of the stack
     * (or to the main response when the stack is empty) and notify the callbacks.
     * `onToken` and `onTextChunk` are only called for main-response tokens.
     * @param tokens
     */
    _pushCurrentTokens(tokens) {
        const lastSegment = this._segments.at(-1);
        const lastContextWindowSegment = this._contextWindowSegments.at(-1);
        const type = this._segmentsStack.at(-1);
        // segments opened by this handler are marked with `start: true`;
        // ones that were part of the initial stack are continuations
        const isStart = this._segmentsStack.length > this._ownedSegmentsStackLength;
        if (type == null) {
            // detokenize before recording the tokens, so the trail doesn't include them
            const text = (this.onResponseChunk != null || this.onTextChunk != null)
                ? this.model.detokenize(tokens, false, this._getTokenTrailFromResult())
                : "";
            if (lastSegment instanceof Array)
                pushAll(lastSegment, tokens);
            else
                this._segments.push(tokens);
            // notify on every path, including the first main-response chunk that follows a segment.
            // copies are passed since `tokens` may now be part of the record
            this.onToken?.(tokens.slice());
            this.onTextChunk?.(text);
            this.onResponseChunk?.({ type: undefined, segmentType: undefined, tokens: tokens.slice(), text });
            if (lastContextWindowSegment instanceof Array)
                pushAll(lastContextWindowSegment, tokens);
            else
                this._contextWindowSegments.push(tokens.slice());
            return;
        }
        const now = Date.now();
        const text = this.onResponseChunk != null
            ? this.model.detokenize(tokens, false, this._getTokenTrailFromResult())
            : "";
        const continuesLastSegment = lastSegment != null && !(lastSegment instanceof Array) && lastSegment.type === type;
        if (continuesLastSegment)
            pushAll(lastSegment.tokens, tokens);
        else
            this._segments.push({
                type,
                tokens,
                ended: false,
                start: isStart,
                startTime: now
            });
        this.onResponseChunk?.({
            type: "segment",
            segmentType: type,
            tokens: tokens.slice(),
            text,
            segmentStartTime: continuesLastSegment
                ? undefined
                : new Date(now)
        });
        const continuesLastContextWindowSegment = lastContextWindowSegment != null &&
            !(lastContextWindowSegment instanceof Array) && lastContextWindowSegment.type === type;
        if (continuesLastContextWindowSegment)
            pushAll(lastContextWindowSegment.tokens, tokens);
        else
            this._contextWindowSegments.push({
                type,
                tokens: tokens.slice(),
                ended: false,
                start: isStart,
                startTime: now
            });
    }
    /**
     * @returns the last tokens of the recorded result (falling back to the initial tokens trail),
     * in generation order; collection stops once `maxRecentDetokenizerTokens` tokens were gathered
     */
    _getTokenTrailFromResult() {
        const reversedTrail = [];
        // collects tokens from the end; returns `true` once enough tokens were gathered
        const collectFromEnd = (tokens) => {
            for (let i = tokens.length - 1; i >= 0; i--) {
                reversedTrail.push(tokens[i]);
                if (reversedTrail.length >= maxRecentDetokenizerTokens)
                    return true;
            }
            return false;
        };
        for (let i = this._segments.length - 1; i >= 0; i--) {
            const segment = this._segments[i];
            const segmentTokens = segment instanceof Array
                ? segment
                : segment.tokens;
            if (collectFromEnd(segmentTokens))
                return reversedTrail.reverse();
        }
        collectFromEnd(this._initialTokensTrail);
        return reversedTrail.reverse();
    }
    /**
     * @param [trimWhitespaceSuffix] - trim the trailing whitespace of the last item
     * @returns the response generated since this handler was created, as strings and segment objects
     */
    getModelResponseSegments(trimWhitespaceSuffix = false) {
        return this._getModelResponseForSegments(this._segments, this._segmentsStartTokenTrail, trimWhitespaceSuffix);
    }
    /**
     * @param [trimWhitespaceSuffix] - trim the trailing whitespace of the last item
     * @returns the response generated since the last context window reset, as strings and segment objects
     */
    getContextWindowModelResponseSegments(trimWhitespaceSuffix = false) {
        return this._getModelResponseForSegments(this._contextWindowSegments, this._contextWindowStartTokenTrail, trimWhitespaceSuffix);
    }
    /**
     * Detokenize recorded entries into response items.
     * Token arrays become strings; segment entries become `{type: "segment", ...}` objects
     * whose `raw` text includes the segment prefix/suffix when the entry started/ended here.
     * A last segment entry is only trimmed when it has ended.
     * @param rawSegments
     * @param recentTokens - detokenizer context that precedes the first entry
     * @param trimWhitespaceSuffix
     */
    _getModelResponseForSegments(rawSegments, recentTokens, trimWhitespaceSuffix) {
        let tokenTrail = resolveLastTokens([recentTokens]);
        return rawSegments.map((rawSegment, index) => {
            const isLast = index === rawSegments.length - 1;
            if (rawSegment instanceof Array) {
                let text = this.model.detokenize(rawSegment, false, tokenTrail);
                if (isLast && trimWhitespaceSuffix)
                    text = text.trimEnd();
                tokenTrail = resolveLastTokens([tokenTrail, rawSegment]);
                return text;
            }
            let text = this.model.detokenize(rawSegment.tokens, false, tokenTrail);
            if (isLast && rawSegment.ended && trimWhitespaceSuffix)
                text = text.trimEnd();
            tokenTrail = resolveLastTokens([tokenTrail, rawSegment.tokens]);
            const segmentDefinition = this._segmentDefinitions.get(rawSegment.type);
            return {
                type: "segment",
                segmentType: rawSegment.type,
                text,
                ended: rawSegment.ended,
                raw: segmentDefinition == null
                    ? LlamaText([text]).toJSON()
                    : LlamaText([
                        rawSegment.start
                            ? segmentDefinition.prefix
                            : "",
                        text,
                        rawSegment.ended
                            ? (segmentDefinition.suffix ?? "")
                            : ""
                    ]).toJSON(),
                startTime: rawSegment.startTime != null
                    ? new Date(rawSegment.startTime).toISOString()
                    : undefined,
                endTime: rawSegment.endTime != null
                    ? new Date(rawSegment.endTime).toISOString()
                    : undefined
            };
        });
    }
    /**
     * Rebuild the stack of segment types that are still open at the end of a model response.
     * Strings and function calls are skipped; an ended segment pops its type when it is at
     * the top of the stack, and a non-ended segment pushes its type when it isn't on the stack yet.
     * @param modelResponse
     * @returns the open segment types, outermost first
     */
    static getStackFromModelResponse(modelResponse) {
        const stack = [];
        const stackSet = new Set();
        for (const item of modelResponse) {
            if (typeof item === "string" || isChatModelResponseFunctionCall(item))
                continue;
            if (item.ended && stack.at(-1) === item.segmentType) {
                stack.pop();
                stackSet.delete(item.segmentType);
            }
            else if (!item.ended && !stackSet.has(item.segmentType)) {
                stack.push(item.segmentType);
                stackSet.add(item.segmentType);
            }
        }
        return stack;
    }
}
1584
2160
  //# sourceMappingURL=LlamaChat.js.map