@librechat/agents 2.4.13 → 2.4.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/graphs/Graph.cjs +17 -11
- package/dist/cjs/graphs/Graph.cjs.map +1 -1
- package/dist/cjs/messages/prune.cjs +65 -35
- package/dist/cjs/messages/prune.cjs.map +1 -1
- package/dist/esm/graphs/Graph.mjs +17 -11
- package/dist/esm/graphs/Graph.mjs.map +1 -1
- package/dist/esm/messages/prune.mjs +65 -35
- package/dist/esm/messages/prune.mjs.map +1 -1
- package/dist/types/graphs/Graph.d.ts +1 -1
- package/dist/types/messages/prune.d.ts +2 -2
- package/package.json +1 -1
- package/src/graphs/Graph.ts +19 -12
- package/src/messages/prune.ts +158 -61
- package/src/specs/prune.test.ts +93 -76
- package/src/specs/token-distribution-edge-case.test.ts +73 -52
package/src/messages/prune.ts
CHANGED
@@ -1,5 +1,13 @@
-import { AIMessage, BaseMessage, UsageMetadata } from '@langchain/core/messages';
-import type { ThinkingContentText, MessageContentComplex, ReasoningContentText } from '@/types/stream';
+import {
+  AIMessage,
+  BaseMessage,
+  UsageMetadata,
+} from '@langchain/core/messages';
+import type {
+  ThinkingContentText,
+  MessageContentComplex,
+  ReasoningContentText,
+} from '@/types/stream';
 import type { TokenCounter } from '@/types/run';
 import { ContentTypes, Providers } from '@/common';
 
@@ -8,27 +16,36 @@ export type PruneMessagesFactoryParams = {
   maxTokens: number;
   startIndex: number;
   tokenCounter: TokenCounter;
-  indexTokenCountMap: Record<string, number>;
+  indexTokenCountMap: Record<string, number | undefined>;
   thinkingEnabled?: boolean;
 };
 export type PruneMessagesParams = {
   messages: BaseMessage[];
   usageMetadata?: Partial<UsageMetadata>;
   startType?: ReturnType<BaseMessage['getType']>;
-}
+};
 
-function isIndexInContext(arrayA: unknown[], arrayB: unknown[], targetIndex: number): boolean {
+function isIndexInContext(
+  arrayA: unknown[],
+  arrayB: unknown[],
+  targetIndex: number
+): boolean {
   const startingIndexInA = arrayA.length - arrayB.length;
   return targetIndex >= startingIndexInA;
 }
 
-function addThinkingBlock(message: AIMessage, thinkingBlock: ThinkingContentText | ReasoningContentText): AIMessage {
+function addThinkingBlock(
+  message: AIMessage,
+  thinkingBlock: ThinkingContentText | ReasoningContentText
+): AIMessage {
   const content: MessageContentComplex[] = Array.isArray(message.content)
-    ? message.content as MessageContentComplex[]
-    : [{
-        type: ContentTypes.TEXT,
-        text: message.content,
-      }];
+    ? (message.content as MessageContentComplex[])
+    : [
+        {
+          type: ContentTypes.TEXT,
+          text: message.content,
+        },
+      ];
   /** Edge case, the message already has the thinking block */
   if (content[0].type === thinkingBlock.type) {
     return message;
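The widened `indexTokenCountMap` type (`number | undefined` values) is the change the rest of this file reacts to: every read of the map now has to default missing entries. A minimal sketch of the pattern, assuming nothing beyond the type itself (the `sumCounted` helper is illustrative, not part of the package):

type IndexTokenCountMap = Record<string, number | undefined>;

// Illustrative helper: reads must default with `?? 0`, since an index
// may not have been counted yet.
function sumCounted(map: IndexTokenCountMap, indices: number[]): number {
  let total = 0;
  for (const i of indices) {
    total += map[i] ?? 0;
  }
  return total;
}

// sumCounted({ 0: 120, 1: undefined, 2: 45 }, [0, 1, 2]) === 165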
@@ -36,7 +53,7 @@ function addThinkingBlock(message: AIMessage, thinkingBlock: ThinkingContentText
   content.unshift(thinkingBlock);
   return new AIMessage({
     ...message,
-    content
+    content,
   });
 }
 
@@ -46,7 +63,9 @@
  * @param usage The usage metadata object containing token information
  * @returns An object containing the total input and output tokens
  */
-export function calculateTotalTokens(usage: Partial<UsageMetadata>): UsageMetadata {
+export function calculateTotalTokens(
+  usage: Partial<UsageMetadata>
+): UsageMetadata {
   const baseInputTokens = Number(usage.input_tokens) || 0;
   const cacheCreation = Number(usage.input_token_details?.cache_creation) || 0;
   const cacheRead = Number(usage.input_token_details?.cache_read) || 0;
@@ -57,7 +76,7 @@ export function calculateTotalTokens(usage: Partial<UsageMetadata>): UsageMetadata {
   return {
     input_tokens: totalInputTokens,
     output_tokens: totalOutputTokens,
-    total_tokens: totalInputTokens + totalOutputTokens
+    total_tokens: totalInputTokens + totalOutputTokens,
   };
 }
 
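Pieced together from the two hunks above, `calculateTotalTokens` folds cache-related input details into the totals. A self-contained sketch of the same arithmetic, assuming only the `UsageMetadata` fields visible in the diff:

import type { UsageMetadata } from '@langchain/core/messages';

// Sketch of the computation shown above: cache-creation and cache-read
// tokens count toward the input total before summing.
function totalFromUsage(usage: Partial<UsageMetadata>): number {
  const baseInput = Number(usage.input_tokens) || 0;
  const cacheCreation = Number(usage.input_token_details?.cache_creation) || 0;
  const cacheRead = Number(usage.input_token_details?.cache_read) || 0;
  const output = Number(usage.output_tokens) || 0;
  return baseInput + cacheCreation + cacheRead + output;
}

// totalFromUsage({ input_tokens: 100, output_tokens: 50,
//   input_token_details: { cache_read: 400 } }) === 550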
@@ -97,8 +116,10 @@ export function getMessagesWithinTokenLimit({
   // Every reply is primed with <|start|>assistant<|message|>, so we
   // start with 3 tokens for the label after all messages have been counted.
   let currentTokenCount = 3;
-  const instructions = _messages[0]?.getType() === 'system' ? _messages[0] : undefined;
-  const instructionsTokenCount = instructions != null ? (indexTokenCountMap[0] ?? 0) : 0;
+  const instructions =
+    _messages[0]?.getType() === 'system' ? _messages[0] : undefined;
+  const instructionsTokenCount =
+    instructions != null ? (indexTokenCountMap[0] ?? 0) : 0;
   const initialContextTokens = maxContextTokens - instructionsTokenCount;
   let remainingContextTokens = initialContextTokens;
   let startType = _startType;
@@ -120,13 +141,19 @@ export function getMessagesWithinTokenLimit({
   if (_thinkingStartIndex > -1) {
     const thinkingMessageContent = messages[_thinkingStartIndex]?.content;
     if (Array.isArray(thinkingMessageContent)) {
-      thinkingBlock = thinkingMessageContent.find((content) => content.type === reasoningType) as ThinkingContentText | undefined;
+      thinkingBlock = thinkingMessageContent.find(
+        (content) => content.type === reasoningType
+      ) as ThinkingContentText | undefined;
     }
   }
 
   if (currentTokenCount < remainingContextTokens) {
     let currentIndex = messages.length;
-    while (messages.length > 0 && currentTokenCount < remainingContextTokens && currentIndex > endIndex) {
+    while (
+      messages.length > 0 &&
+      currentTokenCount < remainingContextTokens &&
+      currentIndex > endIndex
+    ) {
       currentIndex--;
       if (messages.length === 1 && instructions) {
         break;
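The reformatted loop above is the core pruning pass: walk the messages from newest to oldest and keep whatever fits the budget. A simplified sketch of that shape, assuming plain token counts rather than the package's message objects:

// Back-to-front pruning sketch (simplified; not the package's API).
function keepNewestWithinBudget(tokenCounts: number[], budget: number): number[] {
  const keptIndices: number[] = [];
  let used = 3; // reply priming tokens, per the comment in the earlier hunk
  for (let i = tokenCounts.length - 1; i >= 0; i--) {
    if (used + tokenCounts[i] > budget) break;
    keptIndices.push(i);
    used += tokenCounts[i];
  }
  return keptIndices.reverse(); // restore chronological order
}

// keepNewestWithinBudget([500, 200, 100, 50], 300) → [2, 3]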
@@ -134,25 +161,42 @@ export function getMessagesWithinTokenLimit({
       const poppedMessage = messages.pop();
       if (!poppedMessage) continue;
       const messageType = poppedMessage.getType();
-      if (thinkingEnabled === true && thinkingEndIndex === -1 && currentIndex === originalLength - 1 && (messageType === 'ai' || messageType === 'tool')) {
+      if (
+        thinkingEnabled === true &&
+        thinkingEndIndex === -1 &&
+        currentIndex === originalLength - 1 &&
+        (messageType === 'ai' || messageType === 'tool')
+      ) {
         thinkingEndIndex = currentIndex;
       }
-      if (thinkingEndIndex > -1 && !thinkingBlock && thinkingStartIndex < 0 && messageType === 'ai' && Array.isArray(poppedMessage.content)) {
-        thinkingBlock = poppedMessage.content.find((content) => content.type === reasoningType) as ThinkingContentText | undefined;
+      if (
+        thinkingEndIndex > -1 &&
+        !thinkingBlock &&
+        thinkingStartIndex < 0 &&
+        messageType === 'ai' &&
+        Array.isArray(poppedMessage.content)
+      ) {
+        thinkingBlock = poppedMessage.content.find(
+          (content) => content.type === reasoningType
+        ) as ThinkingContentText | undefined;
         thinkingStartIndex = thinkingBlock != null ? currentIndex : -1;
       }
       /** False start, the latest message was not part of a multi-assistant/tool sequence of messages */
       if (
-        thinkingEndIndex > -1 &&
-        currentIndex === thinkingEndIndex - 1 &&
-        (messageType !== 'ai' && messageType !== 'tool')
+        thinkingEndIndex > -1 &&
+        currentIndex === thinkingEndIndex - 1 &&
+        messageType !== 'ai' &&
+        messageType !== 'tool'
       ) {
         thinkingEndIndex = -1;
       }
 
       const tokenCount = indexTokenCountMap[currentIndex] ?? 0;
 
-      if (prunedMemory.length === 0 && currentTokenCount + tokenCount <= remainingContextTokens) {
+      if (
+        prunedMemory.length === 0 &&
+        currentTokenCount + tokenCount <= remainingContextTokens
+      ) {
        context.push(poppedMessage);
        currentTokenCount += tokenCount;
       } else {
@@ -174,7 +218,11 @@ export function getMessagesWithinTokenLimit({
     let totalTokens = 0;
     for (let i = context.length - 1; i >= 0; i--) {
       const currentType = context[i]?.getType() ?? '';
-      if (Array.isArray(startType) ? startType.includes(currentType) : currentType === startType) {
+      if (
+        Array.isArray(startType)
+          ? startType.includes(currentType)
+          : currentType === startType
+      ) {
         requiredTypeIndex = i + 1;
         break;
       }
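The reshaped condition makes explicit that `startType` can be a single message type or a list of acceptable types. The predicate in isolation (an illustrative standalone helper, assuming string message types):

// Matches the condition above: accept one required start type or a list.
function matchesStartType(
  currentType: string,
  startType: string | string[]
): boolean {
  return Array.isArray(startType)
    ? startType.includes(currentType)
    : currentType === startType;
}

// matchesStartType('human', ['human', 'tool']) === true
// matchesStartType('ai', 'human') === false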
@@ -205,18 +253,27 @@ export function getMessagesWithinTokenLimit({
     result.thinkingStartIndex = thinkingStartIndex;
   }
 
-  if (prunedMemory.length === 0 || thinkingEndIndex < 0 || (thinkingStartIndex > -1 && isIndexInContext(_messages, context, thinkingStartIndex))) {
+  if (
+    prunedMemory.length === 0 ||
+    thinkingEndIndex < 0 ||
+    (thinkingStartIndex > -1 &&
+      isIndexInContext(_messages, context, thinkingStartIndex))
+  ) {
     // we reverse at this step to ensure the context is in the correct order for the model, and we need to work backwards
     result.context = context.reverse() as BaseMessage[];
     return result;
   }
 
   if (thinkingEndIndex > -1 && thinkingStartIndex < 0) {
-    throw new Error('The payload is malformed. There is a thinking sequence but no "AI" messages with thinking blocks.');
+    throw new Error(
+      'The payload is malformed. There is a thinking sequence but no "AI" messages with thinking blocks.'
+    );
   }
 
   if (!thinkingBlock) {
-    throw new Error('The payload is malformed. There is a thinking sequence but no thinking block found.');
+    throw new Error(
+      'The payload is malformed. There is a thinking sequence but no thinking block found.'
+    );
   }
 
   // Since we have a thinking sequence, we need to find the last assistant message
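These guards protect the step that follows: once pruning has cut into a thinking sequence, the preserved thinking block must be re-attached as the first content part of an assistant message still in context. A rough sketch of that re-attachment (not the package's code; the block shape is Anthropic-style):

import { AIMessage } from '@langchain/core/messages';

// Re-attach a preserved thinking block ahead of the kept message's content.
const keptMessage = new AIMessage({
  content: [{ type: 'text', text: 'Final answer' }],
});
const thinkingBlock = { type: 'thinking', thinking: 'step-by-step…' };
const contentParts = Array.isArray(keptMessage.content)
  ? keptMessage.content
  : [{ type: 'text', text: keptMessage.content }];
const restored = new AIMessage({
  content: [thinkingBlock, ...contentParts],
});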
@@ -235,13 +292,20 @@ export function getMessagesWithinTokenLimit({
   }
 
   if (assistantIndex === -1) {
-    throw new Error('The payload is malformed. There is a thinking sequence but no "AI" messages to append thinking blocks to.');
+    throw new Error(
+      'The payload is malformed. There is a thinking sequence but no "AI" messages to append thinking blocks to.'
+    );
   }
 
   thinkingStartIndex = originalLength - 1 - assistantIndex;
-  const thinkingTokenCount = tokenCounter(new AIMessage({ content: [thinkingBlock] }));
+  const thinkingTokenCount = tokenCounter(
+    new AIMessage({ content: [thinkingBlock] })
+  );
   const newRemainingCount = remainingContextTokens - thinkingTokenCount;
-  const newMessage = addThinkingBlock(context[assistantIndex] as AIMessage, thinkingBlock);
+  const newMessage = addThinkingBlock(
+    context[assistantIndex] as AIMessage,
+    thinkingBlock
+  );
   context[assistantIndex] = newMessage;
   if (newRemainingCount > 0) {
     result.context = context.reverse() as BaseMessage[];
@@ -250,18 +314,23 @@ export function getMessagesWithinTokenLimit({
 
   const thinkingMessage: AIMessage = context[assistantIndex] as AIMessage;
   // now we need to an additional round of pruning but making the thinking block fit
-  const newThinkingMessageTokenCount = (indexTokenCountMap[thinkingStartIndex] ?? 0) + thinkingTokenCount;
+  const newThinkingMessageTokenCount =
+    (indexTokenCountMap[thinkingStartIndex] ?? 0) + thinkingTokenCount;
   remainingContextTokens = initialContextTokens - newThinkingMessageTokenCount;
   currentTokenCount = 3;
   let newContext: BaseMessage[] = [];
   const secondRoundMessages = [..._messages];
   let currentIndex = secondRoundMessages.length;
-  while (secondRoundMessages.length > 0 && currentTokenCount < remainingContextTokens && currentIndex > thinkingStartIndex) {
+  while (
+    secondRoundMessages.length > 0 &&
+    currentTokenCount < remainingContextTokens &&
+    currentIndex > thinkingStartIndex
+  ) {
     currentIndex--;
     const poppedMessage = secondRoundMessages.pop();
     if (!poppedMessage) continue;
     const tokenCount = indexTokenCountMap[currentIndex] ?? 0;
-    if (
+    if (currentTokenCount + tokenCount <= remainingContextTokens) {
       newContext.push(poppedMessage);
       currentTokenCount += tokenCount;
     } else {
@@ -282,7 +351,11 @@ export function getMessagesWithinTokenLimit({
     let totalTokens = 0;
     for (let i = newContext.length - 1; i >= 0; i--) {
       const currentType = newContext[i]?.getType() ?? '';
-      if (Array.isArray(startType) ? startType.includes(currentType) : currentType === startType) {
+      if (
+        Array.isArray(startType)
+          ? startType.includes(currentType)
+          : currentType === startType
+      ) {
         requiredTypeIndex = i + 1;
         break;
       }
@@ -320,23 +393,28 @@ export function createPruneMessages(factoryParams: PruneMessagesFactoryParams) {
   const indexTokenCountMap = { ...factoryParams.indexTokenCountMap };
   let lastTurnStartIndex = factoryParams.startIndex;
   let lastCutOffIndex = 0;
-  let totalTokens = Object.values(indexTokenCountMap).reduce((a, b) => a + b, 0);
+  let totalTokens = Object.values(indexTokenCountMap).reduce(
+    (a = 0, b = 0) => a + b,
+    0
+  ) as number;
   let runThinkingStartIndex = -1;
   return function pruneMessages(params: PruneMessagesParams): {
     context: BaseMessage[];
-    indexTokenCountMap: Record<string, number>;
+    indexTokenCountMap: Record<string, number | undefined>;
   } {
     let currentUsage: UsageMetadata | undefined;
-    if (
-      params.usageMetadata &&
-      (checkValidNumber(params.usageMetadata.input_tokens)
-        || (
-          checkValidNumber(params.usageMetadata.input_token_details)
-          && (checkValidNumber(params.usageMetadata.input_token_details.cache_creation)
-            || checkValidNumber(params.usageMetadata.input_token_details.cache_read))
-        )) &&
-      checkValidNumber(params.usageMetadata.output_tokens)
-    ) {
+    if (
+      params.usageMetadata &&
+      (checkValidNumber(params.usageMetadata.input_tokens) ||
+        (checkValidNumber(params.usageMetadata.input_token_details) &&
+          (checkValidNumber(
+            params.usageMetadata.input_token_details.cache_creation
+          ) ||
+            checkValidNumber(
+              params.usageMetadata.input_token_details.cache_read
+            )))) &&
+      checkValidNumber(params.usageMetadata.output_tokens)
+    ) {
       currentUsage = calculateTotalTokens(params.usageMetadata);
       totalTokens = currentUsage.total_tokens;
     }
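The defaulted reduce parameters are the notable bit here: with `number | undefined` values, a plain `(a, b) => a + b` turns the running total into `NaN` as soon as one uncounted entry appears. A small demonstration:

const counts: Record<string, number | undefined> = { 0: 10, 1: undefined, 2: 5 };

// Defaulted parameters substitute 0 whenever an entry is undefined,
// so the sum stays a real number.
const safeTotal = Object.values(counts).reduce(
  (a = 0, b = 0) => a + b,
  0
) as number; // 15

// Without the defaults, 10 + undefined → NaN, and NaN + 5 → NaN.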
@@ -344,16 +422,18 @@ export function createPruneMessages(factoryParams: PruneMessagesFactoryParams) {
     const newOutputs = new Set<number>();
     for (let i = lastTurnStartIndex; i < params.messages.length; i++) {
       const message = params.messages[i];
-      if (i === lastTurnStartIndex && indexTokenCountMap[i] === undefined
-        && currentUsage) {
+      if (
+        i === lastTurnStartIndex &&
+        indexTokenCountMap[i] === undefined &&
+        currentUsage
+      ) {
         indexTokenCountMap[i] = currentUsage.output_tokens;
-      // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
       } else if (indexTokenCountMap[i] === undefined) {
         indexTokenCountMap[i] = factoryParams.tokenCounter(message);
         if (currentUsage) {
           newOutputs.add(i);
         }
-        totalTokens += indexTokenCountMap[i];
+        totalTokens += indexTokenCountMap[i] ?? 0;
       }
     }
 
@@ -364,7 +444,7 @@ export function createPruneMessages(factoryParams: PruneMessagesFactoryParams) {
     if (currentUsage) {
       let totalIndexTokens = 0;
       if (params.messages[0].getType() === 'system') {
-        totalIndexTokens += indexTokenCountMap[0];
+        totalIndexTokens += indexTokenCountMap[0] ?? 0;
       }
       for (let i = lastCutOffIndex; i < params.messages.length; i++) {
         if (i === 0 && params.messages[0].getType() === 'system') {
@@ -373,24 +453,31 @@ export function createPruneMessages(factoryParams: PruneMessagesFactoryParams) {
         if (newOutputs.has(i)) {
           continue;
         }
-        totalIndexTokens += indexTokenCountMap[i];
+        totalIndexTokens += indexTokenCountMap[i] ?? 0;
       }
 
       // Calculate ratio based only on messages that remain in the context
       const ratio = currentUsage.total_tokens / totalIndexTokens;
-      const isRatioSafe = ratio >= 1/3 && ratio <= 2.5;
+      const isRatioSafe = ratio >= 1 / 3 && ratio <= 2.5;
 
       // Apply the ratio adjustment only to messages at or after lastCutOffIndex, and only if the ratio is safe
       if (isRatioSafe) {
-        if (params.messages[0].getType() === 'system' && lastCutOffIndex !== 0) {
-          indexTokenCountMap[0] = Math.round(indexTokenCountMap[0] * ratio);
+        if (
+          params.messages[0].getType() === 'system' &&
+          lastCutOffIndex !== 0
+        ) {
+          indexTokenCountMap[0] = Math.round(
+            (indexTokenCountMap[0] ?? 0) * ratio
+          );
         }
 
         for (let i = lastCutOffIndex; i < params.messages.length; i++) {
           if (newOutputs.has(i)) {
            continue;
           }
-          indexTokenCountMap[i] = Math.round(indexTokenCountMap[i] * ratio);
+          indexTokenCountMap[i] = Math.round(
+            (indexTokenCountMap[i] ?? 0) * ratio
+          );
         }
       }
     }
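Taken together, this hunk is a recalibration step: estimated per-message counts are rescaled toward the provider-reported total, but only when the ratio between them is plausible. A compact sketch of the same idea, using the widened map type from earlier (helper name is illustrative):

// Rescale estimates toward a reported total, skipping implausible ratios
// (the same 1/3 to 2.5 band checked above).
function recalibrate(
  estimates: Record<string, number | undefined>,
  reportedTotal: number
): void {
  const estimatedTotal = Object.values(estimates).reduce(
    (a = 0, b = 0) => a + b,
    0
  ) as number;
  const ratio = reportedTotal / estimatedTotal;
  if (ratio < 1 / 3 || ratio > 2.5) return; // keep the original estimates
  for (const key of Object.keys(estimates)) {
    estimates[key] = Math.round((estimates[key] ?? 0) * ratio);
  }
}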
@@ -407,12 +494,22 @@ export function createPruneMessages(factoryParams: PruneMessagesFactoryParams) {
       startType: params.startType,
       thinkingEnabled: factoryParams.thinkingEnabled,
       tokenCounter: factoryParams.tokenCounter,
-      reasoningType: factoryParams.provider === Providers.BEDROCK ? ContentTypes.REASONING_CONTENT : ContentTypes.THINKING,
-      thinkingStartIndex: factoryParams.thinkingEnabled === true ? runThinkingStartIndex : undefined,
+      reasoningType:
+        factoryParams.provider === Providers.BEDROCK
+          ? ContentTypes.REASONING_CONTENT
+          : ContentTypes.THINKING,
+      thinkingStartIndex:
+        factoryParams.thinkingEnabled === true
+          ? runThinkingStartIndex
+          : undefined,
     });
     runThinkingStartIndex = thinkingStartIndex ?? -1;
     /** The index is the first value of `context`, index relative to `params.messages` */
-    lastCutOffIndex = Math.max(params.messages.length - (context.length - (context[0]?.getType() === 'system' ? 1 : 0)), 0);
+    lastCutOffIndex = Math.max(
+      params.messages.length -
+        (context.length - (context[0]?.getType() === 'system' ? 1 : 0)),
+      0
+    );
 
     return { context, indexTokenCountMap };
   };
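The new `reasoningType` wiring picks the content type to search for by provider: Bedrock reasoning arrives as `REASONING_CONTENT` parts, while Anthropic-style thinking arrives as `THINKING` parts. A hedged sketch of the selection, assuming the `ContentTypes` and `Providers` enums imported at the top of the file:

import { ContentTypes, Providers } from '@/common';

// Provider-dependent selection mirrored from the hunk above.
function selectReasoningType(provider?: Providers): ContentTypes {
  return provider === Providers.BEDROCK
    ? ContentTypes.REASONING_CONTENT
    : ContentTypes.THINKING;
}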