@link-assistant/agent 0.21.0 → 0.22.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/src/cli/continuous-mode.js +6 -2
- package/src/cli/defaults.ts +6 -1
- package/src/cli/run-options.js +5 -0
- package/src/index.js +14 -6
- package/src/provider/provider.ts +45 -1
- package/src/session/compaction.ts +30 -4
- package/src/session/message-v2.ts +1 -0
- package/src/session/processor.ts +83 -19
- package/src/session/prompt.ts +81 -10
- package/src/util/sse-usage-extractor.ts +144 -0
- package/src/util/token.ts +90 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@link-assistant/agent",
|
|
3
|
-
"version": "0.21.0",
|
|
3
|
+
"version": "0.22.1",
|
|
4
4
|
"description": "A minimal, public domain AI CLI agent compatible with OpenCode's JSON interface. Bun-only runtime.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"type": "module",
|
|
@@ -90,6 +90,7 @@
|
|
|
90
90
|
"diff": "^8.0.2",
|
|
91
91
|
"fuzzysort": "^3.1.0",
|
|
92
92
|
"glob": "^10.0.0",
|
|
93
|
+
"gpt-tokenizer": "^3.4.0",
|
|
93
94
|
"gray-matter": "^4.0.3",
|
|
94
95
|
"hono": "^4.10.6",
|
|
95
96
|
"hono-openapi": "^1.1.1",
|
|
package/src/cli/continuous-mode.js
CHANGED
|
@@ -194,7 +194,8 @@ export async function runContinuousServerMode(
|
|
|
194
194
|
systemMessage,
|
|
195
195
|
appendSystemMessage,
|
|
196
196
|
jsonStandard,
|
|
197
|
-
compactionModel
|
|
197
|
+
compactionModel,
|
|
198
|
+
temperature
|
|
198
199
|
) {
|
|
199
200
|
// Check both CLI flag and environment variable for compact JSON mode
|
|
200
201
|
const compactJson = argv['compact-json'] === true || config.compactJson;
|
|
@@ -290,6 +291,7 @@ export async function runContinuousServerMode(
|
|
|
290
291
|
compactionModel,
|
|
291
292
|
system: systemMessage,
|
|
292
293
|
appendSystem: appendSystemMessage,
|
|
294
|
+
temperature,
|
|
293
295
|
}),
|
|
294
296
|
}
|
|
295
297
|
).catch((error) => {
|
|
@@ -446,7 +448,8 @@ export async function runContinuousDirectMode(
|
|
|
446
448
|
systemMessage,
|
|
447
449
|
appendSystemMessage,
|
|
448
450
|
jsonStandard,
|
|
449
|
-
compactionModel
|
|
451
|
+
compactionModel,
|
|
452
|
+
temperature
|
|
450
453
|
) {
|
|
451
454
|
// Check both CLI flag and environment variable for compact JSON mode
|
|
452
455
|
const compactJson = argv['compact-json'] === true || config.compactJson;
|
|
@@ -523,6 +526,7 @@ export async function runContinuousDirectMode(
|
|
|
523
526
|
compactionModel,
|
|
524
527
|
system: systemMessage,
|
|
525
528
|
appendSystem: appendSystemMessage,
|
|
529
|
+
temperature,
|
|
526
530
|
}).catch((error) => {
|
|
527
531
|
hasError = true;
|
|
528
532
|
eventHandler.output({
|
package/src/cli/defaults.ts
CHANGED
|
@@ -52,6 +52,11 @@ export const DEFAULT_COMPACTION_MODELS =
|
|
|
52
52
|
* Applied only when the compaction model has a context window equal to or smaller
|
|
53
53
|
* than the base model. When the compaction model has a larger context, the margin
|
|
54
54
|
* is automatically set to 0 (allowing 100% context usage).
|
|
55
|
+
*
|
|
56
|
+
* Increased from 15% to 25% to reduce probability of context overflow errors,
|
|
57
|
+
* especially when providers return inaccurate or zero token counts.
|
|
58
|
+
* Matches OpenCode upstream's 75% threshold (25% margin).
|
|
55
59
|
* @see https://github.com/link-assistant/agent/issues/219
|
|
60
|
+
* @see https://github.com/link-assistant/agent/issues/249
|
|
56
61
|
*/
|
|
57
|
-
export const DEFAULT_COMPACTION_SAFETY_MARGIN_PERCENT = 15;
|
|
62
|
+
export const DEFAULT_COMPACTION_SAFETY_MARGIN_PERCENT = 25;
|
package/src/cli/run-options.js
CHANGED
|
@@ -168,5 +168,10 @@ export function buildRunOptions(yargs) {
|
|
|
168
168
|
description:
|
|
169
169
|
'Safety margin (%) of usable context window before triggering compaction. Only applies when the compaction model has equal or smaller context than the base model. Default: 15.',
|
|
170
170
|
default: DEFAULT_COMPACTION_SAFETY_MARGIN_PERCENT,
|
|
171
|
+
})
|
|
172
|
+
.option('temperature', {
|
|
173
|
+
type: 'number',
|
|
174
|
+
description:
|
|
175
|
+
'Override the temperature for model completions. When not set, the default per-model temperature is used.',
|
|
171
176
|
});
|
|
172
177
|
}
|
package/src/index.js
CHANGED
|
@@ -313,7 +313,8 @@ async function runAgentMode(argv, request) {
|
|
|
313
313
|
systemMessage,
|
|
314
314
|
appendSystemMessage,
|
|
315
315
|
jsonStandard,
|
|
316
|
-
compactionModel
|
|
316
|
+
compactionModel,
|
|
317
|
+
argv.temperature
|
|
317
318
|
);
|
|
318
319
|
} else {
|
|
319
320
|
// DIRECT MODE: Run everything in single process
|
|
@@ -325,7 +326,8 @@ async function runAgentMode(argv, request) {
|
|
|
325
326
|
systemMessage,
|
|
326
327
|
appendSystemMessage,
|
|
327
328
|
jsonStandard,
|
|
328
|
-
compactionModel
|
|
329
|
+
compactionModel,
|
|
330
|
+
argv.temperature
|
|
329
331
|
);
|
|
330
332
|
}
|
|
331
333
|
},
|
|
@@ -399,7 +401,8 @@ async function runContinuousAgentMode(argv) {
|
|
|
399
401
|
systemMessage,
|
|
400
402
|
appendSystemMessage,
|
|
401
403
|
jsonStandard,
|
|
402
|
-
compactionModel
|
|
404
|
+
compactionModel,
|
|
405
|
+
argv.temperature
|
|
403
406
|
);
|
|
404
407
|
} else {
|
|
405
408
|
// DIRECT MODE: Run everything in single process
|
|
@@ -410,7 +413,8 @@ async function runContinuousAgentMode(argv) {
|
|
|
410
413
|
systemMessage,
|
|
411
414
|
appendSystemMessage,
|
|
412
415
|
jsonStandard,
|
|
413
|
-
compactionModel
|
|
416
|
+
compactionModel,
|
|
417
|
+
argv.temperature
|
|
414
418
|
);
|
|
415
419
|
}
|
|
416
420
|
},
|
|
@@ -433,7 +437,8 @@ async function runServerMode(
|
|
|
433
437
|
systemMessage,
|
|
434
438
|
appendSystemMessage,
|
|
435
439
|
jsonStandard,
|
|
436
|
-
compactionModel
|
|
440
|
+
compactionModel,
|
|
441
|
+
temperature
|
|
437
442
|
) {
|
|
438
443
|
const compactJson = argv['compact-json'] === true;
|
|
439
444
|
|
|
@@ -502,6 +507,7 @@ async function runServerMode(
|
|
|
502
507
|
compactionModel,
|
|
503
508
|
system: systemMessage,
|
|
504
509
|
appendSystem: appendSystemMessage,
|
|
510
|
+
temperature,
|
|
505
511
|
}),
|
|
506
512
|
}
|
|
507
513
|
).catch((error) => {
|
|
@@ -534,7 +540,8 @@ async function runDirectMode(
|
|
|
534
540
|
systemMessage,
|
|
535
541
|
appendSystemMessage,
|
|
536
542
|
jsonStandard,
|
|
537
|
-
compactionModel
|
|
543
|
+
compactionModel,
|
|
544
|
+
temperature
|
|
538
545
|
) {
|
|
539
546
|
const compactJson = argv['compact-json'] === true;
|
|
540
547
|
|
|
@@ -587,6 +594,7 @@ async function runDirectMode(
|
|
|
587
594
|
compactionModel,
|
|
588
595
|
system: systemMessage,
|
|
589
596
|
appendSystem: appendSystemMessage,
|
|
597
|
+
temperature,
|
|
590
598
|
}).catch((error) => {
|
|
591
599
|
hasError = true;
|
|
592
600
|
eventHandler.output({
|
package/src/provider/provider.ts
CHANGED
|
@@ -17,6 +17,7 @@ import { iife } from '../util/iife';
|
|
|
17
17
|
import { createEchoModel } from './echo';
|
|
18
18
|
import { createCacheModel } from './cache';
|
|
19
19
|
import { RetryFetch } from './retry-fetch';
|
|
20
|
+
import { SSEUsageExtractor } from '../util/sse-usage-extractor';
|
|
20
21
|
|
|
21
22
|
// Direct imports for bundled providers - these are pre-installed to avoid runtime installation hangs
|
|
22
23
|
// @see https://github.com/link-assistant/agent/issues/173
|
|
@@ -1232,8 +1233,41 @@ export namespace Provider {
|
|
|
1232
1233
|
// flag state loss in subprocess/module-reload scenarios.
|
|
1233
1234
|
// See: https://github.com/link-assistant/agent/issues/206
|
|
1234
1235
|
// See: https://github.com/link-assistant/agent/issues/227
|
|
1236
|
+
// Even when verbose mode is off, intercept streaming responses
|
|
1237
|
+
// to extract usage tokens from raw SSE data. This is critical for
|
|
1238
|
+
// recovering usage when the AI SDK drops it from finish-step events.
|
|
1239
|
+
// @see https://github.com/link-assistant/agent/issues/249
|
|
1235
1240
|
if (!isVerbose()) {
|
|
1236
|
-
|
|
1241
|
+
const response = await innerFetch(input, init);
|
|
1242
|
+
const ct = response.headers.get('content-type') ?? '';
|
|
1243
|
+
const isSSE =
|
|
1244
|
+
ct.includes('event-stream') || ct.includes('octet-stream');
|
|
1245
|
+
if (isSSE && response.body) {
|
|
1246
|
+
const [sdkStream, usageStream] = response.body.tee();
|
|
1247
|
+
const sseReqId = SSEUsageExtractor.nextRequestId();
|
|
1248
|
+
(async () => {
|
|
1249
|
+
try {
|
|
1250
|
+
const reader = usageStream.getReader();
|
|
1251
|
+
const decoder = new TextDecoder();
|
|
1252
|
+
let body = '';
|
|
1253
|
+
while (true) {
|
|
1254
|
+
const { done, value } = await reader.read();
|
|
1255
|
+
if (done) break;
|
|
1256
|
+
body += decoder.decode(value, { stream: true });
|
|
1257
|
+
if (body.length > 50000) break;
|
|
1258
|
+
}
|
|
1259
|
+
SSEUsageExtractor.processStreamForUsage(sseReqId, body);
|
|
1260
|
+
} catch {
|
|
1261
|
+
// Never break the SDK stream
|
|
1262
|
+
}
|
|
1263
|
+
})();
|
|
1264
|
+
return new Response(sdkStream, {
|
|
1265
|
+
status: response.status,
|
|
1266
|
+
statusText: response.statusText,
|
|
1267
|
+
headers: response.headers,
|
|
1268
|
+
});
|
|
1269
|
+
}
|
|
1270
|
+
return response;
|
|
1237
1271
|
}
|
|
1238
1272
|
|
|
1239
1273
|
httpCallCount++;
|
|
@@ -1374,6 +1408,10 @@ export namespace Provider {
|
|
|
1374
1408
|
const [sdkStream, logStream] = response.body.tee();
|
|
1375
1409
|
|
|
1376
1410
|
// Consume log stream asynchronously (does not block SDK)
|
|
1411
|
+
// Also extract usage tokens from raw SSE data as fallback
|
|
1412
|
+
// for when the AI SDK drops usage from its finish-step event.
|
|
1413
|
+
// @see https://github.com/link-assistant/agent/issues/249
|
|
1414
|
+
const sseRequestId = SSEUsageExtractor.nextRequestId();
|
|
1377
1415
|
(async () => {
|
|
1378
1416
|
try {
|
|
1379
1417
|
const reader = logStream.getReader();
|
|
@@ -1395,6 +1433,11 @@ export namespace Provider {
|
|
|
1395
1433
|
}
|
|
1396
1434
|
}
|
|
1397
1435
|
}
|
|
1436
|
+
// Extract usage from raw SSE stream as AI SDK fallback
|
|
1437
|
+
SSEUsageExtractor.processStreamForUsage(
|
|
1438
|
+
sseRequestId,
|
|
1439
|
+
bodyPreview
|
|
1440
|
+
);
|
|
1398
1441
|
// Use direct (non-lazy) logging for stream body
|
|
1399
1442
|
// See: https://github.com/link-assistant/agent/issues/211
|
|
1400
1443
|
log.info('HTTP response body (stream)', {
|
|
@@ -1402,6 +1445,7 @@ export namespace Provider {
|
|
|
1402
1445
|
providerID: provider.id,
|
|
1403
1446
|
callNum,
|
|
1404
1447
|
url,
|
|
1448
|
+
sseRequestId,
|
|
1405
1449
|
bodyPreview: truncated
|
|
1406
1450
|
? bodyPreview + `... [truncated]`
|
|
1407
1451
|
: bodyPreview,
|
|
package/src/session/compaction.ts
CHANGED
|
@@ -30,11 +30,19 @@ export namespace SessionCompaction {
|
|
|
30
30
|
|
|
31
31
|
/**
|
|
32
32
|
* Default safety margin ratio for compaction trigger.
|
|
33
|
-
* We trigger compaction at 85% of usable context to avoid hitting hard limits.
|
|
34
|
-
* This means we stop 15% before (context - output) tokens.
|
|
33
|
+
* We trigger compaction at 75% of usable context to avoid hitting hard limits.
|
|
34
|
+
* This means we stop 25% before (context - output) tokens.
|
|
35
|
+
*
|
|
36
|
+
* Lowered from 0.85 to 0.75 (matching OpenCode upstream) because:
|
|
37
|
+
* - When providers return 0 token counts, the system relies on estimated tokens
|
|
38
|
+
* which can be inaccurate, so a larger safety buffer is needed.
|
|
39
|
+
* - Gemini CLI uses 50%, OpenCode upstream uses 75%, Claude Code uses ~83.5%.
|
|
40
|
+
* - A 75% threshold provides a good balance between context utilization and
|
|
41
|
+
* preventing context overflow errors.
|
|
35
42
|
* @see https://github.com/link-assistant/agent/issues/217
|
|
43
|
+
* @see https://github.com/link-assistant/agent/issues/249
|
|
36
44
|
*/
|
|
37
|
-
export const OVERFLOW_SAFETY_MARGIN = 0.85;
|
|
45
|
+
export const OVERFLOW_SAFETY_MARGIN = 0.75;
|
|
38
46
|
|
|
39
47
|
/**
|
|
40
48
|
* A single compaction model entry in the cascade.
|
|
@@ -117,12 +125,26 @@ export namespace SessionCompaction {
|
|
|
117
125
|
model: ModelsDev.Model;
|
|
118
126
|
compactionModel?: CompactionModelConfig;
|
|
119
127
|
compactionModelContextLimit?: number;
|
|
128
|
+
/**
|
|
129
|
+
* Optional estimated input tokens from message content.
|
|
130
|
+
* Used as fallback when provider returns 0 for all token counts.
|
|
131
|
+
* This prevents the system from never triggering compaction when
|
|
132
|
+
* providers don't report token usage.
|
|
133
|
+
* @see https://github.com/link-assistant/agent/issues/249
|
|
134
|
+
*/
|
|
135
|
+
estimatedInputTokens?: number;
|
|
120
136
|
}) {
|
|
121
137
|
if (config.disableAutocompact) return false;
|
|
122
138
|
const baseModelContextLimit = input.model.limit.context;
|
|
123
139
|
if (baseModelContextLimit === 0) return false;
|
|
124
|
-
const count =
|
|
140
|
+
const providerCount =
|
|
125
141
|
input.tokens.input + input.tokens.cache.read + input.tokens.output;
|
|
142
|
+
// When provider returns 0 for all token counts, use the estimated input tokens
|
|
143
|
+
// as a fallback. This prevents the system from never triggering compaction
|
|
144
|
+
// when providers (e.g., OpenCode with Nvidia/nemotron) don't report token usage.
|
|
145
|
+
// @see https://github.com/link-assistant/agent/issues/249
|
|
146
|
+
const count =
|
|
147
|
+
providerCount > 0 ? providerCount : (input.estimatedInputTokens ?? 0);
|
|
126
148
|
const outputTokenLimit =
|
|
127
149
|
Math.min(input.model.limit.output, SessionPrompt.OUTPUT_TOKEN_MAX) ||
|
|
128
150
|
SessionPrompt.OUTPUT_TOKEN_MAX;
|
|
@@ -145,6 +167,10 @@ export namespace SessionCompaction {
|
|
|
145
167
|
compactionModelID: input.compactionModel?.modelID,
|
|
146
168
|
compactionModelContextLimit: input.compactionModelContextLimit,
|
|
147
169
|
currentTokens: count,
|
|
170
|
+
providerTokens: providerCount,
|
|
171
|
+
estimatedInputTokens: input.estimatedInputTokens ?? 0,
|
|
172
|
+
usingEstimate:
|
|
173
|
+
providerCount === 0 && (input.estimatedInputTokens ?? 0) > 0,
|
|
148
174
|
tokensBreakdown: {
|
|
149
175
|
input: input.tokens.input,
|
|
150
176
|
cacheRead: input.tokens.cache.read,
|
|
package/src/session/message-v2.ts
CHANGED
|
@@ -411,6 +411,7 @@ export namespace MessageV2 {
|
|
|
411
411
|
.optional(),
|
|
412
412
|
system: z.string().optional(),
|
|
413
413
|
appendSystem: z.string().optional(),
|
|
414
|
+
temperature: z.number().optional(),
|
|
414
415
|
tools: z.record(z.string(), z.boolean()).optional(),
|
|
415
416
|
}).meta({
|
|
416
417
|
ref: 'UserMessage',
|
package/src/session/processor.ts
CHANGED
|
@@ -18,6 +18,7 @@ import { SessionRetry } from './retry';
|
|
|
18
18
|
import { SessionStatus } from './status';
|
|
19
19
|
import { config, isVerbose } from '../config/config';
|
|
20
20
|
import { SessionCompaction } from './compaction';
|
|
21
|
+
import { SSEUsageExtractor } from '../util/sse-usage-extractor';
|
|
21
22
|
|
|
22
23
|
export namespace SessionProcessor {
|
|
23
24
|
const DOOM_LOOP_THRESHOLD = 3;
|
|
@@ -327,32 +328,95 @@ export namespace SessionProcessor {
|
|
|
327
328
|
input.assistantMessage.cost += usage.cost;
|
|
328
329
|
input.assistantMessage.tokens = usage.tokens;
|
|
329
330
|
|
|
330
|
-
// Log
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
message:
|
|
339
|
-
'provider returned zero tokens with unknown finish reason at step level',
|
|
331
|
+
// Log raw usage data at step level for debugging token parsing issues.
|
|
332
|
+
// The AI SDK may drop token data between the raw HTTP response and the
|
|
333
|
+
// finish-step event (e.g., @ai-sdk/openai-compatible may not propagate
|
|
334
|
+
// usage from SSE stream chunks). This log helps detect such mismatches.
|
|
335
|
+
// @see https://github.com/link-assistant/agent/issues/249
|
|
336
|
+
if (isVerbose()) {
|
|
337
|
+
log.debug(() => ({
|
|
338
|
+
message: 'step-finish raw usage diagnostics',
|
|
340
339
|
providerID: input.providerID,
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
(value as any).response?.modelId ?? 'none',
|
|
344
|
-
rawFinishReason: String(
|
|
345
|
-
value.finishReason ?? 'undefined'
|
|
346
|
-
),
|
|
340
|
+
modelID: input.model.id,
|
|
341
|
+
parsedTokens: usage.tokens,
|
|
347
342
|
rawUsage: JSON.stringify(value.usage ?? null),
|
|
348
|
-
|
|
343
|
+
rawProviderMetadata: JSON.stringify(
|
|
349
344
|
value.providerMetadata ?? null
|
|
350
345
|
),
|
|
351
|
-
|
|
352
|
-
|
|
346
|
+
rawFinishReason: String(
|
|
347
|
+
value.finishReason ?? 'undefined'
|
|
348
|
+
),
|
|
349
|
+
respondedModelID:
|
|
350
|
+
(value as any).response?.modelId ?? 'none',
|
|
353
351
|
}));
|
|
354
352
|
}
|
|
355
353
|
|
|
354
|
+
// When AI SDK returns zero tokens, try to recover usage from
|
|
355
|
+
// raw SSE stream data captured by the fetch interceptor.
|
|
356
|
+
// The AI SDK may drop token data between the raw HTTP response
|
|
357
|
+
// and the finish-step event (known bug in @ai-sdk/openai-compatible).
|
|
358
|
+
// @see https://github.com/link-assistant/agent/issues/249
|
|
359
|
+
if (
|
|
360
|
+
usage.tokens.input === 0 &&
|
|
361
|
+
usage.tokens.output === 0 &&
|
|
362
|
+
usage.tokens.reasoning === 0
|
|
363
|
+
) {
|
|
364
|
+
const sseUsage = SSEUsageExtractor.consumeLatestUsage();
|
|
365
|
+
if (sseUsage) {
|
|
366
|
+
const recoveredUsage = Session.getUsage({
|
|
367
|
+
model: input.model,
|
|
368
|
+
usage: {
|
|
369
|
+
inputTokens: sseUsage.promptTokens,
|
|
370
|
+
outputTokens: sseUsage.completionTokens,
|
|
371
|
+
totalTokens: sseUsage.totalTokens,
|
|
372
|
+
reasoningTokens: sseUsage.reasoningTokens ?? 0,
|
|
373
|
+
cachedInputTokens: sseUsage.cachedTokens ?? 0,
|
|
374
|
+
},
|
|
375
|
+
metadata: value.providerMetadata,
|
|
376
|
+
});
|
|
377
|
+
input.assistantMessage.cost =
|
|
378
|
+
input.assistantMessage.cost -
|
|
379
|
+
usage.cost +
|
|
380
|
+
recoveredUsage.cost;
|
|
381
|
+
input.assistantMessage.tokens = recoveredUsage.tokens;
|
|
382
|
+
log.warn(() => ({
|
|
383
|
+
message:
|
|
384
|
+
'recovered usage from raw SSE stream — AI SDK dropped token data',
|
|
385
|
+
providerID: input.providerID,
|
|
386
|
+
requestedModelID: input.model.id,
|
|
387
|
+
recoveredTokens: recoveredUsage.tokens,
|
|
388
|
+
recoveredCost: recoveredUsage.cost,
|
|
389
|
+
ssePromptTokens: sseUsage.promptTokens,
|
|
390
|
+
sseCompletionTokens: sseUsage.completionTokens,
|
|
391
|
+
issue:
|
|
392
|
+
'https://github.com/link-assistant/agent/issues/249',
|
|
393
|
+
}));
|
|
394
|
+
// Update the step-finish part with recovered data
|
|
395
|
+
usage.tokens = recoveredUsage.tokens;
|
|
396
|
+
usage.cost = recoveredUsage.cost;
|
|
397
|
+
} else {
|
|
398
|
+
log.warn(() => ({
|
|
399
|
+
message:
|
|
400
|
+
'provider returned zero tokens at step level — AI SDK may not be propagating usage from raw HTTP response',
|
|
401
|
+
providerID: input.providerID,
|
|
402
|
+
requestedModelID: input.model.id,
|
|
403
|
+
respondedModelID:
|
|
404
|
+
(value as any).response?.modelId ?? 'none',
|
|
405
|
+
finishReason,
|
|
406
|
+
rawFinishReason: String(
|
|
407
|
+
value.finishReason ?? 'undefined'
|
|
408
|
+
),
|
|
409
|
+
rawUsage: JSON.stringify(value.usage ?? null),
|
|
410
|
+
providerMetadata: JSON.stringify(
|
|
411
|
+
value.providerMetadata ?? null
|
|
412
|
+
),
|
|
413
|
+
hint: 'No raw SSE usage found either. The token estimation fallback in isOverflow() handles this case.',
|
|
414
|
+
issue:
|
|
415
|
+
'https://github.com/link-assistant/agent/issues/249',
|
|
416
|
+
}));
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
|
|
356
420
|
// Build model info if --output-response-model flag is enabled
|
|
357
421
|
// @see https://github.com/link-assistant/agent/issues/179
|
|
358
422
|
const modelInfo: MessageV2.ModelInfo | undefined =
|
package/src/session/prompt.ts
CHANGED
|
@@ -54,6 +54,45 @@ export namespace SessionPrompt {
|
|
|
54
54
|
const log = Log.create({ service: 'session.prompt' });
|
|
55
55
|
export const OUTPUT_TOKEN_MAX = 32_000;
|
|
56
56
|
|
|
57
|
+
/**
|
|
58
|
+
* Cap maxOutputTokens so that estimated input + output never exceeds
|
|
59
|
+
* the model's context limit. This prevents "context length exceeded" errors
|
|
60
|
+
* when the conversation has grown close to the model's limit.
|
|
61
|
+
*
|
|
62
|
+
* Returns at least 1024 tokens to avoid degenerate cases.
|
|
63
|
+
* Returns baseMaxOutput unchanged if contextLimit is 0 (unknown).
|
|
64
|
+
* @see https://github.com/link-assistant/agent/issues/249
|
|
65
|
+
*/
|
|
66
|
+
function capOutputTokensToContext(input: {
|
|
67
|
+
baseMaxOutput: number;
|
|
68
|
+
contextLimit: number;
|
|
69
|
+
estimatedInputTokens: number;
|
|
70
|
+
}): number {
|
|
71
|
+
if (input.contextLimit <= 0) return input.baseMaxOutput;
|
|
72
|
+
const available = input.contextLimit - input.estimatedInputTokens;
|
|
73
|
+
if (available < 1024) {
|
|
74
|
+
log.warn(() => ({
|
|
75
|
+
message:
|
|
76
|
+
'estimated input tokens near or exceeding context limit — capping output to 1024',
|
|
77
|
+
contextLimit: input.contextLimit,
|
|
78
|
+
estimatedInputTokens: input.estimatedInputTokens,
|
|
79
|
+
available,
|
|
80
|
+
}));
|
|
81
|
+
return 1024;
|
|
82
|
+
}
|
|
83
|
+
const capped = Math.min(input.baseMaxOutput, available);
|
|
84
|
+
if (capped < input.baseMaxOutput) {
|
|
85
|
+
log.info(() => ({
|
|
86
|
+
message: 'capped maxOutputTokens to fit within context limit',
|
|
87
|
+
baseMaxOutput: input.baseMaxOutput,
|
|
88
|
+
cappedMaxOutput: capped,
|
|
89
|
+
contextLimit: input.contextLimit,
|
|
90
|
+
estimatedInputTokens: input.estimatedInputTokens,
|
|
91
|
+
}));
|
|
92
|
+
}
|
|
93
|
+
return capped;
|
|
94
|
+
}
|
|
95
|
+
|
|
57
96
|
const state = Instance.state(
|
|
58
97
|
() => {
|
|
59
98
|
const data: Record<
|
|
@@ -110,6 +149,7 @@ export namespace SessionPrompt {
|
|
|
110
149
|
noReply: z.boolean().optional(),
|
|
111
150
|
system: z.string().optional(),
|
|
112
151
|
appendSystem: z.string().optional(),
|
|
152
|
+
temperature: z.number().optional(),
|
|
113
153
|
tools: z.record(z.string(), z.boolean()).optional(),
|
|
114
154
|
parts: z.array(
|
|
115
155
|
z.discriminatedUnion('type', [
|
|
@@ -666,6 +706,29 @@ export namespace SessionPrompt {
|
|
|
666
706
|
}
|
|
667
707
|
|
|
668
708
|
// context overflow, needs compaction
|
|
709
|
+
// Count input tokens from message content as fallback for providers
|
|
710
|
+
// that return 0 token counts (e.g., Nvidia/nemotron via OpenCode).
|
|
711
|
+
// Uses real BPE tokenization (gpt-tokenizer) when available, falls back
|
|
712
|
+
// to character-based heuristic (~4 chars/token) for unknown tokenizers.
|
|
713
|
+
// @see https://github.com/link-assistant/agent/issues/249
|
|
714
|
+
const messageContent = msgs
|
|
715
|
+
.map((m) =>
|
|
716
|
+
m.parts
|
|
717
|
+
.map((p) => {
|
|
718
|
+
if (p.type === 'text') return p.text;
|
|
719
|
+
if (
|
|
720
|
+
p.type === 'tool' &&
|
|
721
|
+
p.state.status === 'completed' &&
|
|
722
|
+
!p.state.time.compacted
|
|
723
|
+
)
|
|
724
|
+
return p.state.output;
|
|
725
|
+
return '';
|
|
726
|
+
})
|
|
727
|
+
.join('')
|
|
728
|
+
)
|
|
729
|
+
.join('');
|
|
730
|
+
const tokenResult = Token.countTokens(messageContent);
|
|
731
|
+
const estimatedInputTokens = tokenResult.count;
|
|
669
732
|
if (
|
|
670
733
|
lastFinished &&
|
|
671
734
|
lastFinished.summary !== true &&
|
|
@@ -674,6 +737,7 @@ export namespace SessionPrompt {
|
|
|
674
737
|
model: model.info ?? { id: model.modelID },
|
|
675
738
|
compactionModel: lastUser.compactionModel,
|
|
676
739
|
compactionModelContextLimit,
|
|
740
|
+
estimatedInputTokens,
|
|
677
741
|
})
|
|
678
742
|
) {
|
|
679
743
|
await SessionCompaction.create({
|
|
@@ -734,10 +798,12 @@ export namespace SessionPrompt {
|
|
|
734
798
|
});
|
|
735
799
|
const params = {
|
|
736
800
|
temperature:
|
|
737
|
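-
(model.info?.temperature ?? false)
|
|
738
|
-
? (agent.temperature ??
|
|
739
|
-
ProviderTransform.temperature(model.providerID, model.modelID))
|
|
740
|
-
: undefined,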
|
|
801
|
+
lastUser.temperature != null
|
|
802
|
+
? lastUser.temperature
|
|
803
|
+
: (model.info?.temperature ?? false)
|
|
804
|
+
? (agent.temperature ??
|
|
805
|
+
ProviderTransform.temperature(model.providerID, model.modelID))
|
|
806
|
+
: undefined,
|
|
741
807
|
topP:
|
|
742
808
|
agent.topP ?? ProviderTransform.topP(model.providerID, model.modelID),
|
|
743
809
|
options: {
|
|
@@ -905,12 +971,16 @@ export namespace SessionPrompt {
|
|
|
905
971
|
// set to 0, we handle loop
|
|
906
972
|
maxRetries: 0,
|
|
907
973
|
activeTools: Object.keys(tools).filter((x) => x !== 'invalid'),
|
|
908
|
-
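maxOutputTokens: ProviderTransform.maxOutputTokens(
|
|
909
|
-
model.providerID,
|
|
910
|
-
params.options,
|
|
911
|
-
model.info?.limit?.output ?? 100000,
|
|
912
|
-
OUTPUT_TOKEN_MAX
|
|
913
|
-
),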
|
|
974
|
+
maxOutputTokens: capOutputTokensToContext({
|
|
975
|
+
baseMaxOutput: ProviderTransform.maxOutputTokens(
|
|
976
|
+
model.providerID,
|
|
977
|
+
params.options,
|
|
978
|
+
model.info?.limit?.output ?? 100000,
|
|
979
|
+
OUTPUT_TOKEN_MAX
|
|
980
|
+
),
|
|
981
|
+
contextLimit: model.info?.limit?.context ?? 0,
|
|
982
|
+
estimatedInputTokens,
|
|
983
|
+
}),
|
|
914
984
|
abortSignal: abort,
|
|
915
985
|
providerOptions: ProviderTransform.providerOptions(
|
|
916
986
|
model.npm,
|
|
@@ -1189,6 +1259,7 @@ export namespace SessionPrompt {
|
|
|
1189
1259
|
tools: input.tools,
|
|
1190
1260
|
system: input.system,
|
|
1191
1261
|
appendSystem: input.appendSystem,
|
|
1262
|
+
temperature: input.temperature,
|
|
1192
1263
|
agent: agent.name,
|
|
1193
1264
|
model: await resolveModel({
|
|
1194
1265
|
model: input.model,
|
|
package/src/util/sse-usage-extractor.ts
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import { Log } from './log';
|
|
2
|
+
import { isVerbose } from '../config/config';
|
|
3
|
+
|
|
4
|
+
const log = Log.create({ service: 'sse-usage' });
|
|
5
|
+
|
|
6
|
+
export interface SSEUsageData {
|
|
7
|
+
promptTokens: number;
|
|
8
|
+
completionTokens: number;
|
|
9
|
+
totalTokens: number;
|
|
10
|
+
cachedTokens?: number;
|
|
11
|
+
reasoningTokens?: number;
|
|
12
|
+
timestamp: number;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
const pendingUsage = new Map<string, SSEUsageData>();
|
|
16
|
+
let requestCounter = 0;
|
|
17
|
+
|
|
18
|
+
export namespace SSEUsageExtractor {
|
|
19
|
+
export function nextRequestId(): string {
|
|
20
|
+
return `sse-req-${++requestCounter}`;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
export function extractUsageFromSSEChunk(
|
|
24
|
+
chunk: string
|
|
25
|
+
): SSEUsageData | undefined {
|
|
26
|
+
const lines = chunk.split('\n');
|
|
27
|
+
let lastUsage: SSEUsageData | undefined;
|
|
28
|
+
|
|
29
|
+
for (const line of lines) {
|
|
30
|
+
if (!line.startsWith('data: ')) continue;
|
|
31
|
+
const data = line.slice(6).trim();
|
|
32
|
+
if (data === '[DONE]') continue;
|
|
33
|
+
|
|
34
|
+
try {
|
|
35
|
+
const parsed = JSON.parse(data);
|
|
36
|
+
const usage =
|
|
37
|
+
parsed.usage ?? parsed.x_groq?.usage ?? parsed.choices?.[0]?.usage;
|
|
38
|
+
|
|
39
|
+
if (usage && typeof usage === 'object') {
|
|
40
|
+
const prompt =
|
|
41
|
+
usage.prompt_tokens ?? usage.input_tokens ?? usage.promptTokens;
|
|
42
|
+
const completion =
|
|
43
|
+
usage.completion_tokens ??
|
|
44
|
+
usage.output_tokens ??
|
|
45
|
+
usage.completionTokens;
|
|
46
|
+
const total =
|
|
47
|
+
usage.total_tokens ?? usage.totalTokens ?? prompt + completion;
|
|
48
|
+
|
|
49
|
+
if (
|
|
50
|
+
typeof prompt === 'number' &&
|
|
51
|
+
typeof completion === 'number' &&
|
|
52
|
+
(prompt > 0 || completion > 0)
|
|
53
|
+
) {
|
|
54
|
+
lastUsage = {
|
|
55
|
+
promptTokens: prompt,
|
|
56
|
+
completionTokens: completion,
|
|
57
|
+
totalTokens:
|
|
58
|
+
typeof total === 'number' ? total : prompt + completion,
|
|
59
|
+
cachedTokens:
|
|
60
|
+
usage.prompt_tokens_details?.cached_tokens ??
|
|
61
|
+
usage.cache_read_input_tokens ??
|
|
62
|
+
usage.cachedTokens ??
|
|
63
|
+
undefined,
|
|
64
|
+
reasoningTokens:
|
|
65
|
+
usage.completion_tokens_details?.reasoning_tokens ??
|
|
66
|
+
usage.reasoning_tokens ??
|
|
67
|
+
undefined,
|
|
68
|
+
timestamp: Date.now(),
|
|
69
|
+
};
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
} catch {
|
|
73
|
+
// Not valid JSON — skip
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
return lastUsage;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
export function processStreamForUsage(
|
|
81
|
+
requestId: string,
|
|
82
|
+
streamBody: string
|
|
83
|
+
): void {
|
|
84
|
+
const usage = extractUsageFromSSEChunk(streamBody);
|
|
85
|
+
if (usage) {
|
|
86
|
+
pendingUsage.set(requestId, usage);
|
|
87
|
+
if (isVerbose()) {
|
|
88
|
+
log.info('raw SSE usage extracted', {
|
|
89
|
+
requestId,
|
|
90
|
+
promptTokens: usage.promptTokens,
|
|
91
|
+
completionTokens: usage.completionTokens,
|
|
92
|
+
totalTokens: usage.totalTokens,
|
|
93
|
+
cachedTokens: usage.cachedTokens,
|
|
94
|
+
reasoningTokens: usage.reasoningTokens,
|
|
95
|
+
});
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
export function getUsage(requestId: string): SSEUsageData | undefined {
|
|
101
|
+
return pendingUsage.get(requestId);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
export function consumeUsage(requestId: string): SSEUsageData | undefined {
|
|
105
|
+
const usage = pendingUsage.get(requestId);
|
|
106
|
+
if (usage) {
|
|
107
|
+
pendingUsage.delete(requestId);
|
|
108
|
+
}
|
|
109
|
+
return usage;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
export function getLatestUsage(): SSEUsageData | undefined {
|
|
113
|
+
let latest: SSEUsageData | undefined;
|
|
114
|
+
for (const usage of pendingUsage.values()) {
|
|
115
|
+
if (!latest || usage.timestamp > latest.timestamp) {
|
|
116
|
+
latest = usage;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
return latest;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
export function consumeLatestUsage(): SSEUsageData | undefined {
|
|
123
|
+
let latestKey: string | undefined;
|
|
124
|
+
let latestUsage: SSEUsageData | undefined;
|
|
125
|
+
for (const [key, usage] of pendingUsage.entries()) {
|
|
126
|
+
if (!latestUsage || usage.timestamp > latestUsage.timestamp) {
|
|
127
|
+
latestKey = key;
|
|
128
|
+
latestUsage = usage;
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
if (latestKey) {
|
|
132
|
+
pendingUsage.delete(latestKey);
|
|
133
|
+
}
|
|
134
|
+
return latestUsage;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
export function clear(): void {
|
|
138
|
+
pendingUsage.clear();
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
export function size(): number {
|
|
142
|
+
return pendingUsage.size;
|
|
143
|
+
}
|
|
144
|
+
}
|
package/src/util/token.ts
CHANGED
|
@@ -1,7 +1,97 @@
|
|
|
1
|
+
import { Log } from './log';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Token estimation utilities.
|
|
5
|
+
*
|
|
6
|
+
* Provides two levels of accuracy:
|
|
7
|
+
*
|
|
8
|
+
* 1. **Real BPE tokenization** via `gpt-tokenizer` (o200k_base encoding) —
|
|
9
|
+
* accurate for OpenAI-compatible models (GPT-4o, GPT-4.1, GPT-5, etc.).
|
|
10
|
+
* Used by `countTokens()` when available.
|
|
11
|
+
*
|
|
12
|
+
* 2. **Character-based heuristic** (≈4 chars per token for English text) —
|
|
13
|
+
* fallback for models with unknown tokenizers (Nvidia Nemotron, Google Gemini,
|
|
14
|
+
* Meta Llama, etc.). Their tokenizers use custom SentencePiece BPE vocabularies
|
|
15
|
+
* that are not available as JS libraries.
|
|
16
|
+
*
|
|
17
|
+
* For compaction/overflow decisions, the heuristic is sufficient because:
|
|
18
|
+
* - The 75% safety margin (25% buffer) absorbs estimation inaccuracy
|
|
19
|
+
* - The `capOutputTokensToContext` function caps output tokens as a last defense
|
|
20
|
+
* - Even real tokenizers would be wrong for non-OpenAI models
|
|
21
|
+
*
|
|
22
|
+
* @see https://github.com/link-assistant/agent/issues/249
|
|
23
|
+
*/
|
|
1
24
|
export namespace Token {
|
|
25
|
+
const log = Log.create({ service: 'token' });
|
|
26
|
+
|
|
27
|
+
/** Default characters-per-token ratio for the heuristic estimator. */
|
|
2
28
|
const CHARS_PER_TOKEN = 4;
|
|
3
29
|
|
|
30
|
+
/**
|
|
31
|
+
* Heuristic token estimation based on character count.
|
|
32
|
+
* Returns an approximate token count using the ~4 chars/token rule of thumb.
|
|
33
|
+
* This is accurate to within ±20% for typical English text across most LLM
|
|
34
|
+
* tokenizers (OpenAI, Nemotron, Llama, Gemini all average 3.5–4.5 chars/token
|
|
35
|
+
* for English).
|
|
36
|
+
*/
|
|
4
37
|
export function estimate(input: string) {
|
|
5
38
|
return Math.max(0, Math.round((input || '').length / CHARS_PER_TOKEN));
|
|
6
39
|
}
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Lazy-loaded BPE encoder instance. Uses o200k_base encoding (GPT-4o/GPT-4.1/GPT-5).
|
|
43
|
+
* Loaded on first call to `countTokens()`. Returns `null` if gpt-tokenizer is
|
|
44
|
+
* not available.
|
|
45
|
+
*/
|
|
46
|
+
let _encoder: { encode: (text: string) => number[] } | null | undefined;
|
|
47
|
+
|
|
48
|
+
function getEncoder(): { encode: (text: string) => number[] } | null {
|
|
49
|
+
if (_encoder !== undefined) return _encoder;
|
|
50
|
+
try {
|
|
51
|
+
// Dynamic import to keep gpt-tokenizer optional.
|
|
52
|
+
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
|
53
|
+
const mod = require('gpt-tokenizer/encoding/o200k_base');
|
|
54
|
+
_encoder = mod;
|
|
55
|
+
log.info(() => ({ message: 'loaded gpt-tokenizer (o200k_base)' }));
|
|
56
|
+
return _encoder;
|
|
57
|
+
} catch {
|
|
58
|
+
_encoder = null;
|
|
59
|
+
log.info(() => ({
|
|
60
|
+
message:
|
|
61
|
+
'gpt-tokenizer not available, using character-based estimation',
|
|
62
|
+
}));
|
|
63
|
+
return null;
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* Count tokens using real BPE tokenization when available, falling back to
|
|
69
|
+
* the character-based heuristic.
|
|
70
|
+
*
|
|
71
|
+
* Use this for critical paths where accuracy matters (overflow detection,
|
|
72
|
+
* output token capping). For logging or non-critical estimation, prefer
|
|
73
|
+
* the cheaper `estimate()`.
|
|
74
|
+
*
|
|
75
|
+
* @returns An object with the token count and whether real BPE was used.
|
|
76
|
+
*/
|
|
77
|
+
export function countTokens(input: string): {
|
|
78
|
+
count: number;
|
|
79
|
+
precise: boolean;
|
|
80
|
+
} {
|
|
81
|
+
if (!input) return { count: 0, precise: true };
|
|
82
|
+
const encoder = getEncoder();
|
|
83
|
+
if (encoder) {
|
|
84
|
+
try {
|
|
85
|
+
const tokens = encoder.encode(input);
|
|
86
|
+
return { count: tokens.length, precise: true };
|
|
87
|
+
} catch (e) {
|
|
88
|
+
log.warn(() => ({
|
|
89
|
+
message: 'BPE encoding failed, falling back to estimate',
|
|
90
|
+
error: String(e),
|
|
91
|
+
inputLength: input.length,
|
|
92
|
+
}));
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
return { count: estimate(input), precise: false };
|
|
96
|
+
}
|
|
7
97
|
}
|