@link-assistant/agent 0.22.0 → 0.22.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@link-assistant/agent",
3
- "version": "0.22.0",
3
+ "version": "0.22.1",
4
4
  "description": "A minimal, public domain AI CLI agent compatible with OpenCode's JSON interface. Bun-only runtime.",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -90,6 +90,7 @@
90
90
  "diff": "^8.0.2",
91
91
  "fuzzysort": "^3.1.0",
92
92
  "glob": "^10.0.0",
93
+ "gpt-tokenizer": "^3.4.0",
93
94
  "gray-matter": "^4.0.3",
94
95
  "hono": "^4.10.6",
95
96
  "hono-openapi": "^1.1.1",
@@ -52,6 +52,11 @@ export const DEFAULT_COMPACTION_MODELS =
52
52
  * Applied only when the compaction model has a context window equal to or smaller
53
53
  * than the base model. When the compaction model has a larger context, the margin
54
54
  * is automatically set to 0 (allowing 100% context usage).
55
+ *
56
+ * Increased from 15% to 25% to reduce probability of context overflow errors,
57
+ * especially when providers return inaccurate or zero token counts.
58
+ * Matches OpenCode upstream's 75% threshold (25% margin).
55
59
  * @see https://github.com/link-assistant/agent/issues/219
60
+ * @see https://github.com/link-assistant/agent/issues/249
56
61
  */
57
- export const DEFAULT_COMPACTION_SAFETY_MARGIN_PERCENT = 15;
62
+ export const DEFAULT_COMPACTION_SAFETY_MARGIN_PERCENT = 25;
@@ -17,6 +17,7 @@ import { iife } from '../util/iife';
17
17
  import { createEchoModel } from './echo';
18
18
  import { createCacheModel } from './cache';
19
19
  import { RetryFetch } from './retry-fetch';
20
+ import { SSEUsageExtractor } from '../util/sse-usage-extractor';
20
21
 
21
22
  // Direct imports for bundled providers - these are pre-installed to avoid runtime installation hangs
22
23
  // @see https://github.com/link-assistant/agent/issues/173
@@ -1232,8 +1233,41 @@ export namespace Provider {
1232
1233
  // flag state loss in subprocess/module-reload scenarios.
1233
1234
  // See: https://github.com/link-assistant/agent/issues/206
1234
1235
  // See: https://github.com/link-assistant/agent/issues/227
1236
+ // Even when verbose mode is off, intercept streaming responses
1237
+ // to extract usage tokens from raw SSE data. This is critical for
1238
+ // recovering usage when the AI SDK drops it from finish-step events.
1239
+ // @see https://github.com/link-assistant/agent/issues/249
1235
1240
  if (!isVerbose()) {
1236
- return innerFetch(input, init);
1241
+ const response = await innerFetch(input, init);
1242
+ const ct = response.headers.get('content-type') ?? '';
1243
+ const isSSE =
1244
+ ct.includes('event-stream') || ct.includes('octet-stream');
1245
+ if (isSSE && response.body) {
1246
+ const [sdkStream, usageStream] = response.body.tee();
1247
+ const sseReqId = SSEUsageExtractor.nextRequestId();
1248
+ (async () => {
1249
+ try {
1250
+ const reader = usageStream.getReader();
1251
+ const decoder = new TextDecoder();
1252
+ let body = '';
1253
+ while (true) {
1254
+ const { done, value } = await reader.read();
1255
+ if (done) break;
1256
+ body += decoder.decode(value, { stream: true });
1257
+ if (body.length > 50000) break;
1258
+ }
1259
+ SSEUsageExtractor.processStreamForUsage(sseReqId, body);
1260
+ } catch {
1261
+ // Never break the SDK stream
1262
+ }
1263
+ })();
1264
+ return new Response(sdkStream, {
1265
+ status: response.status,
1266
+ statusText: response.statusText,
1267
+ headers: response.headers,
1268
+ });
1269
+ }
1270
+ return response;
1237
1271
  }
1238
1272
 
1239
1273
  httpCallCount++;
@@ -1374,6 +1408,10 @@ export namespace Provider {
1374
1408
  const [sdkStream, logStream] = response.body.tee();
1375
1409
 
1376
1410
  // Consume log stream asynchronously (does not block SDK)
1411
+ // Also extract usage tokens from raw SSE data as fallback
1412
+ // for when the AI SDK drops usage from its finish-step event.
1413
+ // @see https://github.com/link-assistant/agent/issues/249
1414
+ const sseRequestId = SSEUsageExtractor.nextRequestId();
1377
1415
  (async () => {
1378
1416
  try {
1379
1417
  const reader = logStream.getReader();
@@ -1395,6 +1433,11 @@ export namespace Provider {
1395
1433
  }
1396
1434
  }
1397
1435
  }
1436
+ // Extract usage from raw SSE stream as AI SDK fallback
1437
+ SSEUsageExtractor.processStreamForUsage(
1438
+ sseRequestId,
1439
+ bodyPreview
1440
+ );
1398
1441
  // Use direct (non-lazy) logging for stream body
1399
1442
  // See: https://github.com/link-assistant/agent/issues/211
1400
1443
  log.info('HTTP response body (stream)', {
@@ -1402,6 +1445,7 @@ export namespace Provider {
1402
1445
  providerID: provider.id,
1403
1446
  callNum,
1404
1447
  url,
1448
+ sseRequestId,
1405
1449
  bodyPreview: truncated
1406
1450
  ? bodyPreview + `... [truncated]`
1407
1451
  : bodyPreview,
@@ -30,11 +30,19 @@ export namespace SessionCompaction {
30
30
 
31
31
  /**
32
32
  * Default safety margin ratio for compaction trigger.
33
- * We trigger compaction at 85% of usable context to avoid hitting hard limits.
34
- * This means we stop 15% before (context - output) tokens.
33
+ * We trigger compaction at 75% of usable context to avoid hitting hard limits.
34
+ * This means we stop 25% before (context - output) tokens.
35
+ *
36
+ * Lowered from 0.85 to 0.75 (matching OpenCode upstream) because:
37
+ * - When providers return 0 token counts, the system relies on estimated tokens
38
+ * which can be inaccurate, so a larger safety buffer is needed.
39
+ * - Gemini CLI uses 50%, OpenCode upstream uses 75%, Claude Code uses ~83.5%.
40
+ * - A 75% threshold provides a good balance between context utilization and
41
+ * preventing context overflow errors.
35
42
  * @see https://github.com/link-assistant/agent/issues/217
43
+ * @see https://github.com/link-assistant/agent/issues/249
36
44
  */
37
- export const OVERFLOW_SAFETY_MARGIN = 0.85;
45
+ export const OVERFLOW_SAFETY_MARGIN = 0.75;
38
46
 
39
47
  /**
40
48
  * A single compaction model entry in the cascade.
@@ -117,12 +125,26 @@ export namespace SessionCompaction {
117
125
  model: ModelsDev.Model;
118
126
  compactionModel?: CompactionModelConfig;
119
127
  compactionModelContextLimit?: number;
128
+ /**
129
+ * Optional estimated input tokens from message content.
130
+ * Used as fallback when provider returns 0 for all token counts.
131
+ * This prevents the system from never triggering compaction when
132
+ * providers don't report token usage.
133
+ * @see https://github.com/link-assistant/agent/issues/249
134
+ */
135
+ estimatedInputTokens?: number;
120
136
  }) {
121
137
  if (config.disableAutocompact) return false;
122
138
  const baseModelContextLimit = input.model.limit.context;
123
139
  if (baseModelContextLimit === 0) return false;
124
- const count =
140
+ const providerCount =
125
141
  input.tokens.input + input.tokens.cache.read + input.tokens.output;
142
+ // When provider returns 0 for all token counts, use the estimated input tokens
143
+ // as a fallback. This prevents the system from never triggering compaction
144
+ // when providers (e.g., OpenCode with Nvidia/nemotron) don't report token usage.
145
+ // @see https://github.com/link-assistant/agent/issues/249
146
+ const count =
147
+ providerCount > 0 ? providerCount : (input.estimatedInputTokens ?? 0);
126
148
  const outputTokenLimit =
127
149
  Math.min(input.model.limit.output, SessionPrompt.OUTPUT_TOKEN_MAX) ||
128
150
  SessionPrompt.OUTPUT_TOKEN_MAX;
@@ -145,6 +167,10 @@ export namespace SessionCompaction {
145
167
  compactionModelID: input.compactionModel?.modelID,
146
168
  compactionModelContextLimit: input.compactionModelContextLimit,
147
169
  currentTokens: count,
170
+ providerTokens: providerCount,
171
+ estimatedInputTokens: input.estimatedInputTokens ?? 0,
172
+ usingEstimate:
173
+ providerCount === 0 && (input.estimatedInputTokens ?? 0) > 0,
148
174
  tokensBreakdown: {
149
175
  input: input.tokens.input,
150
176
  cacheRead: input.tokens.cache.read,
@@ -18,6 +18,7 @@ import { SessionRetry } from './retry';
18
18
  import { SessionStatus } from './status';
19
19
  import { config, isVerbose } from '../config/config';
20
20
  import { SessionCompaction } from './compaction';
21
+ import { SSEUsageExtractor } from '../util/sse-usage-extractor';
21
22
 
22
23
  export namespace SessionProcessor {
23
24
  const DOOM_LOOP_THRESHOLD = 3;
@@ -327,32 +328,95 @@ export namespace SessionProcessor {
327
328
  input.assistantMessage.cost += usage.cost;
328
329
  input.assistantMessage.tokens = usage.tokens;
329
330
 
330
- // Log warning when provider returns zero tokens (#198)
331
- if (
332
- usage.tokens.input === 0 &&
333
- usage.tokens.output === 0 &&
334
- usage.tokens.reasoning === 0 &&
335
- finishReason === 'unknown'
336
- ) {
337
- log.warn(() => ({
338
- message:
339
- 'provider returned zero tokens with unknown finish reason at step level',
331
+ // Log raw usage data at step level for debugging token parsing issues.
332
+ // The AI SDK may drop token data between the raw HTTP response and the
333
+ // finish-step event (e.g., @ai-sdk/openai-compatible may not propagate
334
+ // usage from SSE stream chunks). This log helps detect such mismatches.
335
+ // @see https://github.com/link-assistant/agent/issues/249
336
+ if (isVerbose()) {
337
+ log.debug(() => ({
338
+ message: 'step-finish raw usage diagnostics',
340
339
  providerID: input.providerID,
341
- requestedModelID: input.model.id,
342
- respondedModelID:
343
- (value as any).response?.modelId ?? 'none',
344
- rawFinishReason: String(
345
- value.finishReason ?? 'undefined'
346
- ),
340
+ modelID: input.model.id,
341
+ parsedTokens: usage.tokens,
347
342
  rawUsage: JSON.stringify(value.usage ?? null),
348
- providerMetadata: JSON.stringify(
343
+ rawProviderMetadata: JSON.stringify(
349
344
  value.providerMetadata ?? null
350
345
  ),
351
- issue:
352
- 'https://github.com/link-assistant/agent/issues/198',
346
+ rawFinishReason: String(
347
+ value.finishReason ?? 'undefined'
348
+ ),
349
+ respondedModelID:
350
+ (value as any).response?.modelId ?? 'none',
353
351
  }));
354
352
  }
355
353
 
354
+ // When AI SDK returns zero tokens, try to recover usage from
355
+ // raw SSE stream data captured by the fetch interceptor.
356
+ // The AI SDK may drop token data between the raw HTTP response
357
+ // and the finish-step event (known bug in @ai-sdk/openai-compatible).
358
+ // @see https://github.com/link-assistant/agent/issues/249
359
+ if (
360
+ usage.tokens.input === 0 &&
361
+ usage.tokens.output === 0 &&
362
+ usage.tokens.reasoning === 0
363
+ ) {
364
+ const sseUsage = SSEUsageExtractor.consumeLatestUsage();
365
+ if (sseUsage) {
366
+ const recoveredUsage = Session.getUsage({
367
+ model: input.model,
368
+ usage: {
369
+ inputTokens: sseUsage.promptTokens,
370
+ outputTokens: sseUsage.completionTokens,
371
+ totalTokens: sseUsage.totalTokens,
372
+ reasoningTokens: sseUsage.reasoningTokens ?? 0,
373
+ cachedInputTokens: sseUsage.cachedTokens ?? 0,
374
+ },
375
+ metadata: value.providerMetadata,
376
+ });
377
+ input.assistantMessage.cost =
378
+ input.assistantMessage.cost -
379
+ usage.cost +
380
+ recoveredUsage.cost;
381
+ input.assistantMessage.tokens = recoveredUsage.tokens;
382
+ log.warn(() => ({
383
+ message:
384
+ 'recovered usage from raw SSE stream — AI SDK dropped token data',
385
+ providerID: input.providerID,
386
+ requestedModelID: input.model.id,
387
+ recoveredTokens: recoveredUsage.tokens,
388
+ recoveredCost: recoveredUsage.cost,
389
+ ssePromptTokens: sseUsage.promptTokens,
390
+ sseCompletionTokens: sseUsage.completionTokens,
391
+ issue:
392
+ 'https://github.com/link-assistant/agent/issues/249',
393
+ }));
394
+ // Update the step-finish part with recovered data
395
+ usage.tokens = recoveredUsage.tokens;
396
+ usage.cost = recoveredUsage.cost;
397
+ } else {
398
+ log.warn(() => ({
399
+ message:
400
+ 'provider returned zero tokens at step level — AI SDK may not be propagating usage from raw HTTP response',
401
+ providerID: input.providerID,
402
+ requestedModelID: input.model.id,
403
+ respondedModelID:
404
+ (value as any).response?.modelId ?? 'none',
405
+ finishReason,
406
+ rawFinishReason: String(
407
+ value.finishReason ?? 'undefined'
408
+ ),
409
+ rawUsage: JSON.stringify(value.usage ?? null),
410
+ providerMetadata: JSON.stringify(
411
+ value.providerMetadata ?? null
412
+ ),
413
+ hint: 'No raw SSE usage found either. The token estimation fallback in isOverflow() handles this case.',
414
+ issue:
415
+ 'https://github.com/link-assistant/agent/issues/249',
416
+ }));
417
+ }
418
+ }
419
+
356
420
  // Build model info if --output-response-model flag is enabled
357
421
  // @see https://github.com/link-assistant/agent/issues/179
358
422
  const modelInfo: MessageV2.ModelInfo | undefined =
@@ -54,6 +54,45 @@ export namespace SessionPrompt {
54
54
  const log = Log.create({ service: 'session.prompt' });
55
55
  export const OUTPUT_TOKEN_MAX = 32_000;
56
56
 
57
/**
 * Cap maxOutputTokens so that estimated input + output does not exceed the
 * model's context limit. This prevents "context length exceeded" errors when
 * the conversation has grown close to the model's limit.
 *
 * Behavior:
 * - `contextLimit <= 0` (unknown) or a non-finite estimate: `baseMaxOutput`
 *   is returned unchanged, because there is nothing reliable to cap against.
 * - Fewer than 1024 tokens of headroom: a floor of 1024 tokens is returned to
 *   avoid degenerate tiny outputs, but never more than `baseMaxOutput` — a
 *   cap must not raise the limit the caller computed.
 * - Otherwise: the smaller of `baseMaxOutput` and the remaining headroom.
 *
 * @param input.baseMaxOutput Output-token limit computed before capping.
 * @param input.contextLimit Context window of the model in tokens (0 = unknown).
 * @param input.estimatedInputTokens Estimated tokens already used by the input.
 * @returns The output-token limit to send with the request.
 * @see https://github.com/link-assistant/agent/issues/249
 */
function capOutputTokensToContext(input: {
  baseMaxOutput: number;
  contextLimit: number;
  estimatedInputTokens: number;
}): number {
  const MIN_OUTPUT_TOKENS = 1024;
  const { baseMaxOutput, contextLimit, estimatedInputTokens } = input;

  const available = contextLimit - estimatedInputTokens;
  // Unknown context window, or an unusable estimate (NaN/Infinity would
  // otherwise leak through Math.min and become the request's limit).
  if (contextLimit <= 0 || !Number.isFinite(available)) return baseMaxOutput;

  if (available < MIN_OUTPUT_TOKENS) {
    // Keep the historical 1024 floor, but do not exceed a smaller base limit.
    const floor =
      baseMaxOutput > 0
        ? Math.min(baseMaxOutput, MIN_OUTPUT_TOKENS)
        : MIN_OUTPUT_TOKENS;
    log.warn(() => ({
      message:
        'estimated input tokens near or exceeding context limit — capping output to 1024',
      contextLimit,
      estimatedInputTokens,
      available,
      cappedMaxOutput: floor,
    }));
    return floor;
  }

  const capped = Math.min(baseMaxOutput, available);
  if (capped < baseMaxOutput) {
    log.info(() => ({
      message: 'capped maxOutputTokens to fit within context limit',
      baseMaxOutput,
      cappedMaxOutput: capped,
      contextLimit,
      estimatedInputTokens,
    }));
  }
  return capped;
}
95
+
57
96
  const state = Instance.state(
58
97
  () => {
59
98
  const data: Record<
@@ -667,6 +706,29 @@ export namespace SessionPrompt {
667
706
  }
668
707
 
669
708
  // context overflow, needs compaction
709
+ // Count input tokens from message content as fallback for providers
710
+ // that return 0 token counts (e.g., Nvidia/nemotron via OpenCode).
711
+ // Uses real BPE tokenization (gpt-tokenizer) when available, falls back
712
+ // to character-based heuristic (~4 chars/token) for unknown tokenizers.
713
+ // @see https://github.com/link-assistant/agent/issues/249
714
+ const messageContent = msgs
715
+ .map((m) =>
716
+ m.parts
717
+ .map((p) => {
718
+ if (p.type === 'text') return p.text;
719
+ if (
720
+ p.type === 'tool' &&
721
+ p.state.status === 'completed' &&
722
+ !p.state.time.compacted
723
+ )
724
+ return p.state.output;
725
+ return '';
726
+ })
727
+ .join('')
728
+ )
729
+ .join('');
730
+ const tokenResult = Token.countTokens(messageContent);
731
+ const estimatedInputTokens = tokenResult.count;
670
732
  if (
671
733
  lastFinished &&
672
734
  lastFinished.summary !== true &&
@@ -675,6 +737,7 @@ export namespace SessionPrompt {
675
737
  model: model.info ?? { id: model.modelID },
676
738
  compactionModel: lastUser.compactionModel,
677
739
  compactionModelContextLimit,
740
+ estimatedInputTokens,
678
741
  })
679
742
  ) {
680
743
  await SessionCompaction.create({
@@ -908,12 +971,16 @@ export namespace SessionPrompt {
908
971
  // set to 0, we handle loop
909
972
  maxRetries: 0,
910
973
  activeTools: Object.keys(tools).filter((x) => x !== 'invalid'),
911
- maxOutputTokens: ProviderTransform.maxOutputTokens(
912
- model.providerID,
913
- params.options,
914
- model.info?.limit?.output ?? 100000,
915
- OUTPUT_TOKEN_MAX
916
- ),
974
+ maxOutputTokens: capOutputTokensToContext({
975
+ baseMaxOutput: ProviderTransform.maxOutputTokens(
976
+ model.providerID,
977
+ params.options,
978
+ model.info?.limit?.output ?? 100000,
979
+ OUTPUT_TOKEN_MAX
980
+ ),
981
+ contextLimit: model.info?.limit?.context ?? 0,
982
+ estimatedInputTokens,
983
+ }),
917
984
  abortSignal: abort,
918
985
  providerOptions: ProviderTransform.providerOptions(
919
986
  model.npm,
@@ -0,0 +1,144 @@
1
+ import { Log } from './log';
2
+ import { isVerbose } from '../config/config';
3
+
4
+ const log = Log.create({ service: 'sse-usage' });
5
+
6
/**
 * Token usage figures recovered from a raw SSE response body.
 */
export interface SSEUsageData {
  /** Prompt (input) tokens reported in the stream. */
  promptTokens: number;
  /** Completion (output) tokens reported in the stream. */
  completionTokens: number;
  /** Reported total, or prompt + completion when the stream omits it. */
  totalTokens: number;
  /** Cached prompt tokens, when the stream reports them. */
  cachedTokens?: number;
  /** Reasoning tokens, when the stream reports them. */
  reasoningTokens?: number;
  /** `Date.now()` value captured when the usage record was extracted. */
  timestamp: number;
}

/** Extracted usage records keyed by request id, held until consumed or cleared. */
const pendingUsage: Map<string, SSEUsageData> = new Map();

/** Counter backing the sequential request ids (`sse-req-N`). */
let requestCounter = 0;
17
+
18
/**
 * Recovers token usage from raw SSE response bodies and holds it until the
 * session processor asks for it.
 *
 * Records are stored per request id in the module-level `pendingUsage` map.
 * The map is bounded: callers only consume a record when they need the
 * fallback, so without a bound every streamed request would leave an entry
 * behind and stale entries could later be handed to an unrelated step.
 */
export namespace SSEUsageExtractor {
  /** Maximum number of unconsumed usage records kept; oldest are evicted first. */
  const MAX_PENDING_USAGE = 64;

  /** Narrow an unknown JSON value to a plain indexable object. */
  function asRecord(value: unknown): Record<string, unknown> | undefined {
    return typeof value === 'object' && value !== null
      ? (value as Record<string, unknown>)
      : undefined;
  }

  /** Return the first candidate that is a finite number, if any. */
  function firstNumber(...candidates: unknown[]): number | undefined {
    for (const candidate of candidates) {
      if (typeof candidate === 'number' && Number.isFinite(candidate)) {
        return candidate;
      }
    }
    return undefined;
  }

  /** Build a usage record from one parsed SSE JSON payload, if it carries usage. */
  function readUsage(parsed: unknown): SSEUsageData | undefined {
    const root = asRecord(parsed);
    if (!root) return undefined;

    const choices = root.choices;
    const firstChoice = Array.isArray(choices) ? asRecord(choices[0]) : undefined;
    // Same lookup order as before: top-level, Groq extension, first choice.
    const usage = asRecord(
      root.usage ?? asRecord(root.x_groq)?.usage ?? firstChoice?.usage
    );
    if (!usage) return undefined;

    const prompt = firstNumber(
      usage.prompt_tokens,
      usage.input_tokens,
      usage.promptTokens
    );
    const completion = firstNumber(
      usage.completion_tokens,
      usage.output_tokens,
      usage.completionTokens
    );
    if (prompt === undefined || completion === undefined) return undefined;
    // An all-zero usage object carries no information worth recovering.
    if (!(prompt > 0 || completion > 0)) return undefined;

    return {
      promptTokens: prompt,
      completionTokens: completion,
      totalTokens:
        firstNumber(usage.total_tokens, usage.totalTokens) ??
        prompt + completion,
      cachedTokens: firstNumber(
        asRecord(usage.prompt_tokens_details)?.cached_tokens,
        usage.cache_read_input_tokens,
        usage.cachedTokens
      ),
      reasoningTokens: firstNumber(
        asRecord(usage.completion_tokens_details)?.reasoning_tokens,
        usage.reasoning_tokens
      ),
      timestamp: Date.now(),
    };
  }

  /** Allocate the next sequential request id (`sse-req-1`, `sse-req-2`, ...). */
  export function nextRequestId(): string {
    requestCounter += 1;
    return `sse-req-${requestCounter}`;
  }

  /**
   * Scan SSE text for `data:` lines containing JSON with a usage object and
   * return the last usable one.
   *
   * The space after `data:` is optional in the event-stream format, so both
   * `data: {...}` and `data:{...}` are accepted. `[DONE]` markers, non-JSON
   * payloads and all-zero usage objects are skipped.
   *
   * @param chunk Raw SSE text (one chunk or a whole accumulated body).
   * @returns The last usage record found, or `undefined` when there is none.
   */
  export function extractUsageFromSSEChunk(
    chunk: string
  ): SSEUsageData | undefined {
    let lastUsage: SSEUsageData | undefined;

    for (const line of chunk.split(/\r\n|\r|\n/)) {
      if (!line.startsWith('data:')) continue;
      const payload = line.slice('data:'.length).trim();
      if (payload === '' || payload === '[DONE]') continue;

      let parsed: unknown;
      try {
        parsed = JSON.parse(payload);
      } catch {
        // Not valid JSON — skip
        continue;
      }

      const usage = readUsage(parsed);
      if (usage) lastUsage = usage;
    }

    return lastUsage;
  }

  /**
   * Extract usage from a stream body and remember it under `requestId`.
   * Does nothing when the body carries no usable usage data.
   *
   * @param requestId Id obtained from `nextRequestId()`.
   * @param streamBody Accumulated raw SSE text for that request.
   */
  export function processStreamForUsage(
    requestId: string,
    streamBody: string
  ): void {
    const usage = extractUsageFromSSEChunk(streamBody);
    if (!usage) return;

    // Delete first so a re-used id moves to the end of the insertion order.
    pendingUsage.delete(requestId);
    pendingUsage.set(requestId, usage);

    // Evict the oldest records so unconsumed entries cannot accumulate.
    while (pendingUsage.size > MAX_PENDING_USAGE) {
      const oldest = pendingUsage.keys().next();
      if (oldest.done) break;
      pendingUsage.delete(oldest.value);
    }

    if (isVerbose()) {
      log.info('raw SSE usage extracted', {
        requestId,
        promptTokens: usage.promptTokens,
        completionTokens: usage.completionTokens,
        totalTokens: usage.totalTokens,
        cachedTokens: usage.cachedTokens,
        reasoningTokens: usage.reasoningTokens,
      });
    }
  }

  /** Look up the record stored for `requestId` without removing it. */
  export function getUsage(requestId: string): SSEUsageData | undefined {
    return pendingUsage.get(requestId);
  }

  /** Return and remove the record stored for `requestId`, if any. */
  export function consumeUsage(requestId: string): SSEUsageData | undefined {
    const usage = pendingUsage.get(requestId);
    if (usage) {
      pendingUsage.delete(requestId);
    }
    return usage;
  }

  /**
   * Return the most recent record without removing it.
   * On equal timestamps the later-inserted record wins.
   */
  export function getLatestUsage(): SSEUsageData | undefined {
    let latest: SSEUsageData | undefined;
    for (const usage of pendingUsage.values()) {
      if (!latest || usage.timestamp >= latest.timestamp) {
        latest = usage;
      }
    }
    return latest;
  }

  /**
   * Return and remove the most recent record.
   * On equal timestamps the later-inserted record wins.
   */
  export function consumeLatestUsage(): SSEUsageData | undefined {
    let latestKey: string | undefined;
    let latestUsage: SSEUsageData | undefined;
    for (const [key, usage] of pendingUsage.entries()) {
      if (!latestUsage || usage.timestamp >= latestUsage.timestamp) {
        latestKey = key;
        latestUsage = usage;
      }
    }
    if (latestKey !== undefined) {
      pendingUsage.delete(latestKey);
    }
    return latestUsage;
  }

  /** Drop every stored record. */
  export function clear(): void {
    pendingUsage.clear();
  }

  /** Number of records currently stored. */
  export function size(): number {
    return pendingUsage.size;
  }
}
package/src/util/token.ts CHANGED
@@ -1,7 +1,97 @@
1
+ import { Log } from './log';
2
+
3
/**
 * Token estimation utilities.
 *
 * Provides two levels of accuracy:
 *
 * 1. **Real BPE tokenization** via `gpt-tokenizer` (o200k_base encoding) —
 *    accurate for OpenAI-compatible models (GPT-4o, GPT-4.1, GPT-5, etc.).
 *    Used by `countTokens()` when available.
 *
 * 2. **Character-based heuristic** (≈4 chars per token for English text) —
 *    fallback for models with unknown tokenizers (Nvidia Nemotron, Google Gemini,
 *    Meta Llama, etc.). Their tokenizers use custom SentencePiece BPE vocabularies
 *    that are not available as JS libraries.
 *
 * For compaction/overflow decisions, the heuristic is sufficient because:
 * - The 75% safety margin (25% buffer) absorbs estimation inaccuracy
 * - The `capOutputTokensToContext` function caps output tokens as a last defense
 * - Even real tokenizers would be wrong for non-OpenAI models
 *
 * @see https://github.com/link-assistant/agent/issues/249
 */
export namespace Token {
  const log = Log.create({ service: 'token' });

  /** Default characters-per-token ratio for the heuristic estimator. */
  const CHARS_PER_TOKEN = 4;

  /** Minimal shape required from the tokenizer module. */
  type Encoder = { encode: (text: string) => number[] };

  /**
   * Heuristic token estimation based on character count.
   * Returns `round(length / 4)`, never negative; empty or missing input
   * yields 0.
   */
  export function estimate(input: string) {
    return Math.max(0, Math.round((input || '').length / CHARS_PER_TOKEN));
  }

  /**
   * Cached encoder. `undefined` = not attempted yet, `null` = unavailable,
   * otherwise the loaded encoder. Populated on the first `countTokens()` call.
   */
  let _encoder: Encoder | null | undefined;

  /** True when `candidate` exposes a callable `encode`. */
  function hasEncode(candidate: unknown): candidate is Encoder {
    return (
      typeof candidate === 'object' &&
      candidate !== null &&
      typeof (candidate as { encode?: unknown }).encode === 'function'
    );
  }

  /**
   * Pick a usable encoder from a loaded module: the module itself when it
   * exports `encode`, otherwise its `default` export. Returns `null` when
   * neither exposes `encode`, so a wrong module shape is detected once
   * instead of throwing on every `countTokens()` call.
   */
  function resolveEncoder(mod: unknown): Encoder | null {
    if (hasEncode(mod)) return mod;
    const fallback =
      typeof mod === 'object' && mod !== null
        ? (mod as { default?: unknown }).default
        : undefined;
    return hasEncode(fallback) ? fallback : null;
  }

  /**
   * Load (once) and return the BPE encoder, or `null` if gpt-tokenizer is
   * not available or does not expose `encode`.
   */
  function getEncoder(): Encoder | null {
    if (_encoder !== undefined) return _encoder;
    try {
      // Synchronous require inside try/catch keeps gpt-tokenizer optional:
      // any load failure falls through to the heuristic.
      // eslint-disable-next-line @typescript-eslint/no-var-requires
      const mod: unknown = require('gpt-tokenizer/encoding/o200k_base');
      _encoder = resolveEncoder(mod);
    } catch {
      _encoder = null;
    }
    if (_encoder) {
      log.info(() => ({ message: 'loaded gpt-tokenizer (o200k_base)' }));
    } else {
      log.info(() => ({
        message:
          'gpt-tokenizer not available, using character-based estimation',
      }));
    }
    return _encoder;
  }

  /**
   * Count tokens using real BPE tokenization when available, falling back to
   * the character-based heuristic.
   *
   * Use this for critical paths where accuracy matters (overflow detection,
   * output token capping). For logging or non-critical estimation, prefer
   * the cheaper `estimate()`.
   *
   * @param input Text to count; empty input yields `{ count: 0, precise: true }`.
   * @returns An object with the token count and whether real BPE was used.
   */
  export function countTokens(input: string): {
    count: number;
    precise: boolean;
  } {
    if (!input) return { count: 0, precise: true };
    const encoder = getEncoder();
    if (encoder) {
      try {
        const tokens = encoder.encode(input);
        return { count: tokens.length, precise: true };
      } catch (e) {
        // Encoding failure is not fatal: report it and use the heuristic.
        log.warn(() => ({
          message: 'BPE encoding failed, falling back to estimate',
          error: String(e),
          inputLength: input.length,
        }));
      }
    }
    return { count: estimate(input), precise: false };
  }
}