npm - @link-assistant/agent - Versions diffs - 0.21.0 → 0.22.1 - Mend

@link-assistant/agent 0.21.0 → 0.22.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/package.json +2 -1
package/src/cli/continuous-mode.js +6 -2
package/src/cli/defaults.ts +6 -1
package/src/cli/run-options.js +5 -0
package/src/index.js +14 -6
package/src/provider/provider.ts +45 -1
package/src/session/compaction.ts +30 -4
package/src/session/message-v2.ts +1 -0
package/src/session/processor.ts +83 -19
package/src/session/prompt.ts +81 -10
package/src/util/sse-usage-extractor.ts +144 -0
package/src/util/token.ts +90 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@link-assistant/agent",
-  "version": "0.21.0",
+  "version": "0.22.1",
   "description": "A minimal, public domain AI CLI agent compatible with OpenCode's JSON interface. Bun-only runtime.",
   "main": "src/index.js",
   "type": "module",
@@ -90,6 +90,7 @@
     "diff": "^8.0.2",
     "fuzzysort": "^3.1.0",
     "glob": "^10.0.0",
+    "gpt-tokenizer": "^3.4.0",
     "gray-matter": "^4.0.3",
     "hono": "^4.10.6",
     "hono-openapi": "^1.1.1",

package/src/cli/continuous-mode.js CHANGED Viewed

@@ -194,7 +194,8 @@ export async function runContinuousServerMode(
   systemMessage,
   appendSystemMessage,
   jsonStandard,
-  compactionModel
+  compactionModel,
+  temperature
 ) {
   // Check both CLI flag and environment variable for compact JSON mode
   const compactJson = argv['compact-json'] === true || config.compactJson;
@@ -290,6 +291,7 @@ export async function runContinuousServerMode(
             compactionModel,
             system: systemMessage,
             appendSystem: appendSystemMessage,
+            temperature,
           }),
         }
       ).catch((error) => {
@@ -446,7 +448,8 @@ export async function runContinuousDirectMode(
   systemMessage,
   appendSystemMessage,
   jsonStandard,
-  compactionModel
+  compactionModel,
+  temperature
 ) {
   // Check both CLI flag and environment variable for compact JSON mode
   const compactJson = argv['compact-json'] === true || config.compactJson;
@@ -523,6 +526,7 @@ export async function runContinuousDirectMode(
         compactionModel,
         system: systemMessage,
         appendSystem: appendSystemMessage,
+        temperature,
       }).catch((error) => {
         hasError = true;
         eventHandler.output({

package/src/cli/defaults.ts CHANGED Viewed

@@ -52,6 +52,11 @@ export const DEFAULT_COMPACTION_MODELS =
  * Applied only when the compaction model has a context window equal to or smaller
  * than the base model. When the compaction model has a larger context, the margin
  * is automatically set to 0 (allowing 100% context usage).
+ *
+ * Increased from 15% to 25% to reduce probability of context overflow errors,
+ * especially when providers return inaccurate or zero token counts.
+ * Matches OpenCode upstream's 75% threshold (25% margin).
  * @see https://github.com/link-assistant/agent/issues/219
+ * @see https://github.com/link-assistant/agent/issues/249
  */
-export const DEFAULT_COMPACTION_SAFETY_MARGIN_PERCENT = 15;
+export const DEFAULT_COMPACTION_SAFETY_MARGIN_PERCENT = 25;

package/src/cli/run-options.js CHANGED Viewed

@@ -168,5 +168,10 @@ export function buildRunOptions(yargs) {
       description:
         'Safety margin (%) of usable context window before triggering compaction. Only applies when the compaction model has equal or smaller context than the base model. Default: 15.',
       default: DEFAULT_COMPACTION_SAFETY_MARGIN_PERCENT,
+    })
+    .option('temperature', {
+      type: 'number',
+      description:
+        'Override the temperature for model completions. When not set, the default per-model temperature is used.',
     });
 }

package/src/index.js CHANGED Viewed

@@ -313,7 +313,8 @@ async function runAgentMode(argv, request) {
           systemMessage,
           appendSystemMessage,
           jsonStandard,
-          compactionModel
+          compactionModel,
+          argv.temperature
         );
       } else {
         // DIRECT MODE: Run everything in single process
@@ -325,7 +326,8 @@ async function runAgentMode(argv, request) {
           systemMessage,
           appendSystemMessage,
           jsonStandard,
-          compactionModel
+          compactionModel,
+          argv.temperature
         );
       }
     },
@@ -399,7 +401,8 @@ async function runContinuousAgentMode(argv) {
           systemMessage,
           appendSystemMessage,
           jsonStandard,
-          compactionModel
+          compactionModel,
+          argv.temperature
         );
       } else {
         // DIRECT MODE: Run everything in single process
@@ -410,7 +413,8 @@ async function runContinuousAgentMode(argv) {
           systemMessage,
           appendSystemMessage,
           jsonStandard,
-          compactionModel
+          compactionModel,
+          argv.temperature
         );
       }
     },
@@ -433,7 +437,8 @@ async function runServerMode(
   systemMessage,
   appendSystemMessage,
   jsonStandard,
-  compactionModel
+  compactionModel,
+  temperature
 ) {
   const compactJson = argv['compact-json'] === true;
@@ -502,6 +507,7 @@ async function runServerMode(
           compactionModel,
           system: systemMessage,
           appendSystem: appendSystemMessage,
+          temperature,
         }),
       }
     ).catch((error) => {
@@ -534,7 +540,8 @@ async function runDirectMode(
   systemMessage,
   appendSystemMessage,
   jsonStandard,
-  compactionModel
+  compactionModel,
+  temperature
 ) {
   const compactJson = argv['compact-json'] === true;
@@ -587,6 +594,7 @@ async function runDirectMode(
       compactionModel,
       system: systemMessage,
       appendSystem: appendSystemMessage,
+      temperature,
     }).catch((error) => {
       hasError = true;
       eventHandler.output({

package/src/provider/provider.ts CHANGED Viewed

@@ -17,6 +17,7 @@ import { iife } from '../util/iife';
 import { createEchoModel } from './echo';
 import { createCacheModel } from './cache';
 import { RetryFetch } from './retry-fetch';
+import { SSEUsageExtractor } from '../util/sse-usage-extractor';
 // Direct imports for bundled providers - these are pre-installed to avoid runtime installation hangs
 // @see https://github.com/link-assistant/agent/issues/173
@@ -1232,8 +1233,41 @@ export namespace Provider {
           // flag state loss in subprocess/module-reload scenarios.
           // See: https://github.com/link-assistant/agent/issues/206
           // See: https://github.com/link-assistant/agent/issues/227
+          // Even when verbose mode is off, intercept streaming responses
+          // to extract usage tokens from raw SSE data. This is critical for
+          // recovering usage when the AI SDK drops it from finish-step events.
+          // @see https://github.com/link-assistant/agent/issues/249
           if (!isVerbose()) {
-            return innerFetch(input, init);
+            const response = await innerFetch(input, init);
+            const ct = response.headers.get('content-type') ?? '';
+            const isSSE =
+              ct.includes('event-stream') || ct.includes('octet-stream');
+            if (isSSE && response.body) {
+              // tee() gives the SDK its own branch; the second branch is read
+              // here purely to look for a usage object in the raw SSE data.
+              const [sdkStream, usageStream] = response.body.tee();
+              const sseReqId = SSEUsageExtractor.nextRequestId();
+              // The extractor keeps the LAST usage-bearing `data:` line it sees,
+              // i.e. the one nearest the end of the stream. Keep a bounded
+              // rolling tail instead of the first N characters, otherwise any
+              // response longer than the limit loses its usage data.
+              const MAX_TAIL_CHARS = 50000;
+              (async () => {
+                try {
+                  const reader = usageStream.getReader();
+                  const decoder = new TextDecoder();
+                  let tail = '';
+                  // Always read to completion: an abandoned tee branch makes
+                  // the runtime buffer every remaining chunk in memory.
+                  while (true) {
+                    const { done, value } = await reader.read();
+                    if (done) break;
+                    tail += decoder.decode(value, { stream: true });
+                    if (tail.length > MAX_TAIL_CHARS) {
+                      // A line cut in half at the front of the tail is simply
+                      // skipped by the extractor (prefix/JSON check fails).
+                      tail = tail.slice(-MAX_TAIL_CHARS);
+                    }
+                  }
+                  // Flush bytes of a multi-byte character still held by the decoder.
+                  tail += decoder.decode();
+                  SSEUsageExtractor.processStreamForUsage(sseReqId, tail);
+                } catch {
+                  // Never break the SDK stream
+                }
+              })();
+              // Re-wrap the SDK branch so callers still see the original
+              // status line and headers.
+              return new Response(sdkStream, {
+                status: response.status,
+                statusText: response.statusText,
+                headers: response.headers,
+              });
+            }
+            return response;
           }
           httpCallCount++;
@@ -1374,6 +1408,10 @@ export namespace Provider {
                 const [sdkStream, logStream] = response.body.tee();
                 // Consume log stream asynchronously (does not block SDK)
+                // Also extract usage tokens from raw SSE data as fallback
+                // for when the AI SDK drops usage from its finish-step event.
+                // @see https://github.com/link-assistant/agent/issues/249
+                const sseRequestId = SSEUsageExtractor.nextRequestId();
                 (async () => {
                   try {
                     const reader = logStream.getReader();
@@ -1395,6 +1433,11 @@ export namespace Provider {
                         }
                       }
                     }
+                    // Extract usage from raw SSE stream as AI SDK fallback
+                    SSEUsageExtractor.processStreamForUsage(
+                      sseRequestId,
+                      bodyPreview
+                    );
                     // Use direct (non-lazy) logging for stream body
                     // See: https://github.com/link-assistant/agent/issues/211
                     log.info('HTTP response body (stream)', {
@@ -1402,6 +1445,7 @@ export namespace Provider {
                       providerID: provider.id,
                       callNum,
                       url,
+                      sseRequestId,
                       bodyPreview: truncated
                         ? bodyPreview + `... [truncated]`
                         : bodyPreview,

package/src/session/compaction.ts CHANGED Viewed

@@ -30,11 +30,19 @@ export namespace SessionCompaction {
   /**
    * Default safety margin ratio for compaction trigger.
-   * We trigger compaction at 85% of usable context to avoid hitting hard limits.
-   * This means we stop 15% before (context - output) tokens.
+   * We trigger compaction at 75% of usable context to avoid hitting hard limits.
+   * This means we stop 25% before (context - output) tokens.
+   *
+   * Lowered from 0.85 to 0.75 (matching OpenCode upstream) because:
+   * - When providers return 0 token counts, the system relies on estimated tokens
+   *   which can be inaccurate, so a larger safety buffer is needed.
+   * - Gemini CLI uses 50%, OpenCode upstream uses 75%, Claude Code uses ~83.5%.
+   * - A 75% threshold provides a good balance between context utilization and
+   *   preventing context overflow errors.
    * @see https://github.com/link-assistant/agent/issues/217
+   * @see https://github.com/link-assistant/agent/issues/249
    */
-  export const OVERFLOW_SAFETY_MARGIN = 0.85;
+  export const OVERFLOW_SAFETY_MARGIN = 0.75;
   /**
    * A single compaction model entry in the cascade.
@@ -117,12 +125,26 @@ export namespace SessionCompaction {
     model: ModelsDev.Model;
     compactionModel?: CompactionModelConfig;
     compactionModelContextLimit?: number;
+    /**
+     * Optional estimated input tokens from message content.
+     * Used as fallback when provider returns 0 for all token counts.
+     * This prevents the system from never triggering compaction when
+     * providers don't report token usage.
+     * @see https://github.com/link-assistant/agent/issues/249
+     */
+    estimatedInputTokens?: number;
   }) {
     if (config.disableAutocompact) return false;
     const baseModelContextLimit = input.model.limit.context;
     if (baseModelContextLimit === 0) return false;
-    const count =
+    const providerCount =
       input.tokens.input + input.tokens.cache.read + input.tokens.output;
+    // When provider returns 0 for all token counts, use the estimated input tokens
+    // as a fallback. This prevents the system from never triggering compaction
+    // when providers (e.g., OpenCode with Nvidia/nemotron) don't report token usage.
+    // @see https://github.com/link-assistant/agent/issues/249
+    const count =
+      providerCount > 0 ? providerCount : (input.estimatedInputTokens ?? 0);
     const outputTokenLimit =
       Math.min(input.model.limit.output, SessionPrompt.OUTPUT_TOKEN_MAX) ||
       SessionPrompt.OUTPUT_TOKEN_MAX;
@@ -145,6 +167,10 @@ export namespace SessionCompaction {
       compactionModelID: input.compactionModel?.modelID,
       compactionModelContextLimit: input.compactionModelContextLimit,
       currentTokens: count,
+      providerTokens: providerCount,
+      estimatedInputTokens: input.estimatedInputTokens ?? 0,
+      usingEstimate:
+        providerCount === 0 && (input.estimatedInputTokens ?? 0) > 0,
       tokensBreakdown: {
         input: input.tokens.input,
         cacheRead: input.tokens.cache.read,

package/src/session/message-v2.ts CHANGED Viewed

@@ -411,6 +411,7 @@ export namespace MessageV2 {
       .optional(),
     system: z.string().optional(),
     appendSystem: z.string().optional(),
+    temperature: z.number().optional(),
     tools: z.record(z.string(), z.boolean()).optional(),
   }).meta({
     ref: 'UserMessage',

package/src/session/processor.ts CHANGED Viewed

@@ -18,6 +18,7 @@ import { SessionRetry } from './retry';
 import { SessionStatus } from './status';
 import { config, isVerbose } from '../config/config';
 import { SessionCompaction } from './compaction';
+import { SSEUsageExtractor } from '../util/sse-usage-extractor';
 export namespace SessionProcessor {
   const DOOM_LOOP_THRESHOLD = 3;
@@ -327,32 +328,95 @@ export namespace SessionProcessor {
                   input.assistantMessage.cost += usage.cost;
                   input.assistantMessage.tokens = usage.tokens;
-                  // Log warning when provider returns zero tokens (#198)
-                  if (
-                    usage.tokens.input === 0 &&
-                    usage.tokens.output === 0 &&
-                    usage.tokens.reasoning === 0 &&
-                    finishReason === 'unknown'
-                  ) {
-                    log.warn(() => ({
-                      message:
-                        'provider returned zero tokens with unknown finish reason at step level',
+                  // Log raw usage data at step level for debugging token parsing issues.
+                  // The AI SDK may drop token data between the raw HTTP response and the
+                  // finish-step event (e.g., @ai-sdk/openai-compatible may not propagate
+                  // usage from SSE stream chunks). This log helps detect such mismatches.
+                  // @see https://github.com/link-assistant/agent/issues/249
+                  if (isVerbose()) {
+                    log.debug(() => ({
+                      message: 'step-finish raw usage diagnostics',
                       providerID: input.providerID,
-                      requestedModelID: input.model.id,
-                      respondedModelID:
-                        (value as any).response?.modelId ?? 'none',
-                      rawFinishReason: String(
-                        value.finishReason ?? 'undefined'
-                      ),
+                      modelID: input.model.id,
+                      parsedTokens: usage.tokens,
                       rawUsage: JSON.stringify(value.usage ?? null),
-                      providerMetadata: JSON.stringify(
+                      rawProviderMetadata: JSON.stringify(
                         value.providerMetadata ?? null
                       ),
-                      issue:
-                        'https://github.com/link-assistant/agent/issues/198',
+                      rawFinishReason: String(
+                        value.finishReason ?? 'undefined'
+                      ),
+                      respondedModelID:
+                        (value as any).response?.modelId ?? 'none',
                     }));
                   }
+                  // When AI SDK returns zero tokens, try to recover usage from
+                  // raw SSE stream data captured by the fetch interceptor.
+                  // The AI SDK may drop token data between the raw HTTP response
+                  // and the finish-step event (known bug in @ai-sdk/openai-compatible).
+                  // @see https://github.com/link-assistant/agent/issues/249
+                  if (
+                    usage.tokens.input === 0 &&
+                    usage.tokens.output === 0 &&
+                    usage.tokens.reasoning === 0
+                  ) {
+                    const sseUsage = SSEUsageExtractor.consumeLatestUsage();
+                    if (sseUsage) {
+                      const recoveredUsage = Session.getUsage({
+                        model: input.model,
+                        usage: {
+                          inputTokens: sseUsage.promptTokens,
+                          outputTokens: sseUsage.completionTokens,
+                          totalTokens: sseUsage.totalTokens,
+                          reasoningTokens: sseUsage.reasoningTokens ?? 0,
+                          cachedInputTokens: sseUsage.cachedTokens ?? 0,
+                        },
+                        metadata: value.providerMetadata,
+                      });
+                      input.assistantMessage.cost =
+                        input.assistantMessage.cost -
+                        usage.cost +
+                        recoveredUsage.cost;
+                      input.assistantMessage.tokens = recoveredUsage.tokens;
+                      log.warn(() => ({
+                        message:
+                          'recovered usage from raw SSE stream — AI SDK dropped token data',
+                        providerID: input.providerID,
+                        requestedModelID: input.model.id,
+                        recoveredTokens: recoveredUsage.tokens,
+                        recoveredCost: recoveredUsage.cost,
+                        ssePromptTokens: sseUsage.promptTokens,
+                        sseCompletionTokens: sseUsage.completionTokens,
+                        issue:
+                          'https://github.com/link-assistant/agent/issues/249',
+                      }));
+                      // Update the step-finish part with recovered data
+                      usage.tokens = recoveredUsage.tokens;
+                      usage.cost = recoveredUsage.cost;
+                    } else {
+                      log.warn(() => ({
+                        message:
+                          'provider returned zero tokens at step level — AI SDK may not be propagating usage from raw HTTP response',
+                        providerID: input.providerID,
+                        requestedModelID: input.model.id,
+                        respondedModelID:
+                          (value as any).response?.modelId ?? 'none',
+                        finishReason,
+                        rawFinishReason: String(
+                          value.finishReason ?? 'undefined'
+                        ),
+                        rawUsage: JSON.stringify(value.usage ?? null),
+                        providerMetadata: JSON.stringify(
+                          value.providerMetadata ?? null
+                        ),
+                        hint: 'No raw SSE usage found either. The token estimation fallback in isOverflow() handles this case.',
+                        issue:
+                          'https://github.com/link-assistant/agent/issues/249',
+                      }));
+                    }
+                  }
                   // Build model info if --output-response-model flag is enabled
                   // @see https://github.com/link-assistant/agent/issues/179
                   const modelInfo: MessageV2.ModelInfo | undefined =

package/src/session/prompt.ts CHANGED Viewed

@@ -54,6 +54,45 @@ export namespace SessionPrompt {
   const log = Log.create({ service: 'session.prompt' });
   export const OUTPUT_TOKEN_MAX = 32_000;
+  /**
+   * Cap maxOutputTokens so that estimated input + output never exceeds
+   * the model's context limit. This prevents "context length exceeded" errors
+   * when the conversation has grown close to the model's limit.
+   *
+   * Behavior:
+   * - `contextLimit <= 0` (unknown): returns `baseMaxOutput` unchanged.
+   * - Less than 1024 tokens of headroom: returns a 1024-token floor to avoid
+   *   degenerate (zero or negative) budgets, but never more than a positive
+   *   `baseMaxOutput` — this function must only ever lower the budget.
+   * - Otherwise: returns `min(baseMaxOutput, contextLimit - estimatedInputTokens)`.
+   *
+   * @param input.baseMaxOutput Output budget before context capping.
+   * @param input.contextLimit Model context window in tokens; 0 means unknown.
+   * @param input.estimatedInputTokens Estimated tokens already used by the input.
+   * @returns The output token budget to send with the request.
+   * @see https://github.com/link-assistant/agent/issues/249
+   */
+  function capOutputTokensToContext(input: {
+    baseMaxOutput: number;
+    contextLimit: number;
+    estimatedInputTokens: number;
+  }): number {
+    const MIN_OUTPUT_TOKENS = 1024;
+    if (input.contextLimit <= 0) return input.baseMaxOutput;
+    const available = input.contextLimit - input.estimatedInputTokens;
+    if (available < MIN_OUTPUT_TOKENS) {
+      // The floor must not exceed the caller's own cap: returning 1024 when
+      // baseMaxOutput is e.g. 512 would raise the budget instead of capping it.
+      const floored =
+        input.baseMaxOutput > 0
+          ? Math.min(input.baseMaxOutput, MIN_OUTPUT_TOKENS)
+          : MIN_OUTPUT_TOKENS;
+      log.warn(() => ({
+        message:
+          'estimated input tokens near or exceeding context limit — capping output to 1024',
+        contextLimit: input.contextLimit,
+        estimatedInputTokens: input.estimatedInputTokens,
+        available,
+        cappedMaxOutput: floored,
+      }));
+      return floored;
+    }
+    const capped = Math.min(input.baseMaxOutput, available);
+    if (capped < input.baseMaxOutput) {
+      log.info(() => ({
+        message: 'capped maxOutputTokens to fit within context limit',
+        baseMaxOutput: input.baseMaxOutput,
+        cappedMaxOutput: capped,
+        contextLimit: input.contextLimit,
+        estimatedInputTokens: input.estimatedInputTokens,
+      }));
+    }
+    return capped;
+  }
   const state = Instance.state(
     () => {
       const data: Record<
@@ -110,6 +149,7 @@ export namespace SessionPrompt {
     noReply: z.boolean().optional(),
     system: z.string().optional(),
     appendSystem: z.string().optional(),
+    temperature: z.number().optional(),
     tools: z.record(z.string(), z.boolean()).optional(),
     parts: z.array(
       z.discriminatedUnion('type', [
@@ -666,6 +706,29 @@ export namespace SessionPrompt {
       }
       // context overflow, needs compaction
+      // Count input tokens from message content as fallback for providers
+      // that return 0 token counts (e.g., Nvidia/nemotron via OpenCode).
+      // Uses real BPE tokenization (gpt-tokenizer) when available, falls back
+      // to character-based heuristic (~4 chars/token) for unknown tokenizers.
+      // @see https://github.com/link-assistant/agent/issues/249
+      const messageContent = msgs
+        .map((m) =>
+          m.parts
+            .map((p) => {
+              if (p.type === 'text') return p.text;
+              if (
+                p.type === 'tool' &&
+                p.state.status === 'completed' &&
+                !p.state.time.compacted
+              )
+                return p.state.output;
+              return '';
+            })
+            .join('')
+        )
+        .join('');
+      const tokenResult = Token.countTokens(messageContent);
+      const estimatedInputTokens = tokenResult.count;
       if (
         lastFinished &&
         lastFinished.summary !== true &&
@@ -674,6 +737,7 @@ export namespace SessionPrompt {
           model: model.info ?? { id: model.modelID },
           compactionModel: lastUser.compactionModel,
           compactionModelContextLimit,
+          estimatedInputTokens,
         })
       ) {
         await SessionCompaction.create({
@@ -734,10 +798,12 @@ export namespace SessionPrompt {
       });
       const params = {
         temperature:
-          (model.info?.temperature ?? false)
-            ? (agent.temperature ??
-              ProviderTransform.temperature(model.providerID, model.modelID))
-            : undefined,
+          lastUser.temperature != null
+            ? lastUser.temperature
+            : (model.info?.temperature ?? false)
+              ? (agent.temperature ??
+                ProviderTransform.temperature(model.providerID, model.modelID))
+              : undefined,
         topP:
           agent.topP ?? ProviderTransform.topP(model.providerID, model.modelID),
         options: {
@@ -905,12 +971,16 @@ export namespace SessionPrompt {
           // set to 0, we handle loop
           maxRetries: 0,
           activeTools: Object.keys(tools).filter((x) => x !== 'invalid'),
-          maxOutputTokens: ProviderTransform.maxOutputTokens(
-            model.providerID,
-            params.options,
-            model.info?.limit?.output ?? 100000,
-            OUTPUT_TOKEN_MAX
-          ),
+          maxOutputTokens: capOutputTokensToContext({
+            baseMaxOutput: ProviderTransform.maxOutputTokens(
+              model.providerID,
+              params.options,
+              model.info?.limit?.output ?? 100000,
+              OUTPUT_TOKEN_MAX
+            ),
+            contextLimit: model.info?.limit?.context ?? 0,
+            estimatedInputTokens,
+          }),
           abortSignal: abort,
           providerOptions: ProviderTransform.providerOptions(
             model.npm,
@@ -1189,6 +1259,7 @@ export namespace SessionPrompt {
       tools: input.tools,
       system: input.system,
       appendSystem: input.appendSystem,
+      temperature: input.temperature,
       agent: agent.name,
       model: await resolveModel({
         model: input.model,

package/src/util/sse-usage-extractor.ts ADDED Viewed

@@ -0,0 +1,144 @@
+import { Log } from './log';
+import { isVerbose } from '../config/config';
+const log = Log.create({ service: 'sse-usage' });
+/**
+ * Token usage parsed from the raw SSE body of one provider response.
+ * Field names are normalized across the snake_case / camelCase variants the
+ * extractor recognizes. Optional fields are set only when the provider
+ * reported them as numbers.
+ */
+export interface SSEUsageData {
+  promptTokens: number;
+  completionTokens: number;
+  /** Provider-reported total, or promptTokens + completionTokens when absent. */
+  totalTokens: number;
+  cachedTokens?: number;
+  reasoningTokens?: number;
+  /** `Date.now()` at extraction time; used to find the most recent record. */
+  timestamp: number;
+}
+/** Extracted usage awaiting consumption, keyed by request id (insertion-ordered). */
+const pendingUsage = new Map<string, SSEUsageData>();
+/**
+ * Maximum number of unconsumed records kept in `pendingUsage`. A record is
+ * stored for every stream that reports usage but removed only when a caller
+ * consumes it, so without a bound the map grows for the process lifetime.
+ */
+const MAX_PENDING_USAGE = 64;
+let requestCounter = 0;
+/** Returns the first candidate that is a finite number, or undefined. */
+function firstNumber(...candidates: unknown[]): number | undefined {
+  for (const candidate of candidates) {
+    if (typeof candidate === 'number' && Number.isFinite(candidate)) {
+      return candidate;
+    }
+  }
+  return undefined;
+}
+export namespace SSEUsageExtractor {
+  /** Returns a new process-unique request id of the form `sse-req-<n>`. */
+  export function nextRequestId(): string {
+    return `sse-req-${++requestCounter}`;
+  }
+  /**
+   * Scans SSE text for `data:` lines carrying a usage object and returns the
+   * last one found.
+   *
+   * Usage is looked up at `usage`, `x_groq.usage` and `choices[0].usage`.
+   * A line counts only when both prompt and completion counts are numbers and
+   * at least one of them is positive. Lines that are not valid JSON (including
+   * `[DONE]` and lines truncated by the caller) are skipped.
+   *
+   * @param chunk Raw SSE text; may contain any number of events.
+   * @returns The last usage found, or undefined when none qualifies.
+   */
+  export function extractUsageFromSSEChunk(
+    chunk: string
+  ): SSEUsageData | undefined {
+    let lastUsage: SSEUsageData | undefined;
+    for (const line of chunk.split('\n')) {
+      // The space after "data:" is optional in the SSE format, so match the
+      // bare field name; trim() also drops a trailing "\r" from CRLF streams.
+      if (!line.startsWith('data:')) continue;
+      const data = line.slice(5).trim();
+      if (data === '' || data === '[DONE]') continue;
+      try {
+        const parsed = JSON.parse(data);
+        const usage =
+          parsed.usage ?? parsed.x_groq?.usage ?? parsed.choices?.[0]?.usage;
+        if (!usage || typeof usage !== 'object') continue;
+        const prompt = firstNumber(
+          usage.prompt_tokens,
+          usage.input_tokens,
+          usage.promptTokens
+        );
+        const completion = firstNumber(
+          usage.completion_tokens,
+          usage.output_tokens,
+          usage.completionTokens
+        );
+        if (prompt === undefined || completion === undefined) continue;
+        // All-zero usage carries no information; keep looking.
+        if (prompt <= 0 && completion <= 0) continue;
+        lastUsage = {
+          promptTokens: prompt,
+          completionTokens: completion,
+          totalTokens:
+            firstNumber(usage.total_tokens, usage.totalTokens) ??
+            prompt + completion,
+          cachedTokens: firstNumber(
+            usage.prompt_tokens_details?.cached_tokens,
+            usage.cache_read_input_tokens,
+            usage.cachedTokens
+          ),
+          reasoningTokens: firstNumber(
+            usage.completion_tokens_details?.reasoning_tokens,
+            usage.reasoning_tokens
+          ),
+          timestamp: Date.now(),
+        };
+      } catch {
+        // Not valid JSON (or a JSON null) — skip
+      }
+    }
+    return lastUsage;
+  }
+  /**
+   * Extracts usage from a complete (or partial) stream body and stores it
+   * under `requestId`. Does nothing when the body contains no usage.
+   * The oldest records are evicted once more than MAX_PENDING_USAGE are held.
+   */
+  export function processStreamForUsage(
+    requestId: string,
+    streamBody: string
+  ): void {
+    const usage = extractUsageFromSSEChunk(streamBody);
+    if (!usage) return;
+    // Delete first so a re-used id moves to the newest insertion position.
+    pendingUsage.delete(requestId);
+    pendingUsage.set(requestId, usage);
+    // Map iterates in insertion order, so the first key is the oldest record.
+    while (pendingUsage.size > MAX_PENDING_USAGE) {
+      const oldest = pendingUsage.keys().next().value;
+      if (oldest === undefined) break;
+      pendingUsage.delete(oldest);
+    }
+    if (isVerbose()) {
+      log.info('raw SSE usage extracted', {
+        requestId,
+        promptTokens: usage.promptTokens,
+        completionTokens: usage.completionTokens,
+        totalTokens: usage.totalTokens,
+        cachedTokens: usage.cachedTokens,
+        reasoningTokens: usage.reasoningTokens,
+      });
+    }
+  }
+  /** Returns the record stored for `requestId` without removing it. */
+  export function getUsage(requestId: string): SSEUsageData | undefined {
+    return pendingUsage.get(requestId);
+  }
+  /** Returns the record stored for `requestId` and removes it. */
+  export function consumeUsage(requestId: string): SSEUsageData | undefined {
+    const usage = pendingUsage.get(requestId);
+    if (usage) {
+      pendingUsage.delete(requestId);
+    }
+    return usage;
+  }
+  /**
+   * Finds the pending record with the highest timestamp. `>=` makes the most
+   * recently inserted record win when several share the same millisecond.
+   */
+  function latestEntry(): [string, SSEUsageData] | undefined {
+    let latest: [string, SSEUsageData] | undefined;
+    for (const entry of pendingUsage.entries()) {
+      if (!latest || entry[1].timestamp >= latest[1].timestamp) {
+        latest = entry;
+      }
+    }
+    return latest;
+  }
+  /** Returns the most recent pending record without removing it. */
+  export function getLatestUsage(): SSEUsageData | undefined {
+    return latestEntry()?.[1];
+  }
+  /**
+   * Returns the most recent pending record and removes it.
+   *
+   * NOTE: the record is chosen by recency only, not by request. If an earlier
+   * response stored usage that was never consumed, or several requests run
+   * concurrently, the returned record may belong to a different response.
+   * Prefer `consumeUsage(requestId)` when the request id is known.
+   */
+  export function consumeLatestUsage(): SSEUsageData | undefined {
+    const latest = latestEntry();
+    if (!latest) return undefined;
+    pendingUsage.delete(latest[0]);
+    return latest[1];
+  }
+  /** Removes all pending records. */
+  export function clear(): void {
+    pendingUsage.clear();
+  }
+  /** Returns the number of pending records. */
+  export function size(): number {
+    return pendingUsage.size;
+  }
+}

package/src/util/token.ts CHANGED Viewed

@@ -1,7 +1,97 @@
+import { Log } from './log';
+/**
+ * Token estimation utilities.
+ *
+ * Provides two levels of accuracy:
+ *
+ * 1. **Real BPE tokenization** via `gpt-tokenizer` (o200k_base encoding) —
+ *    accurate for OpenAI-compatible models (GPT-4o, GPT-4.1, GPT-5, etc.).
+ *    Used by `countTokens()` when available.
+ *
+ * 2. **Character-based heuristic** (≈4 chars per token for English text) —
+ *    fallback for models with unknown tokenizers (Nvidia Nemotron, Google Gemini,
+ *    Meta Llama, etc.). Their tokenizers use custom SentencePiece BPE vocabularies
+ *    that are not available as JS libraries.
+ *
+ * For compaction/overflow decisions, the heuristic is sufficient because:
+ * - The 75% safety margin (25% buffer) absorbs estimation inaccuracy
+ * - The `capOutputTokensToContext` function caps output tokens as a last defense
+ * - Even real tokenizers would be wrong for non-OpenAI models
+ *
+ * @see https://github.com/link-assistant/agent/issues/249
+ */
 export namespace Token {
+  const log = Log.create({ service: 'token' });
+  /** Default characters-per-token ratio for the heuristic estimator. */
   const CHARS_PER_TOKEN = 4;
+  /**
+   * Heuristic token estimation based on character count.
+   * Returns an approximate token count using the ~4 chars/token rule of thumb.
+   * This is accurate to within ±20% for typical English text across most LLM
+   * tokenizers (OpenAI, Nemotron, Llama, Gemini all average 3.5–4.5 chars/token
+   * for English).
+   */
   export function estimate(input: string) {
     return Math.max(0, Math.round((input || '').length / CHARS_PER_TOKEN));
   }
+  /**
+   * Lazy-loaded BPE encoder instance. Uses o200k_base encoding (GPT-4o/GPT-4.1/GPT-5).
+   * Loaded on first call to `countTokens()`. `undefined` means "not tried yet",
+   * `null` means gpt-tokenizer is not available or does not expose `encode()`;
+   * either result is cached so the module is resolved at most once.
+   */
+  let _encoder: { encode: (text: string) => number[] } | null | undefined;
+  function getEncoder(): { encode: (text: string) => number[] } | null {
+    if (_encoder !== undefined) return _encoder;
+    try {
+      // Synchronous require inside try/catch keeps gpt-tokenizer optional:
+      // a missing package lands in the catch below instead of failing at load.
+      // eslint-disable-next-line @typescript-eslint/no-var-requires
+      const mod = require('gpt-tokenizer/encoding/o200k_base');
+      // Validate the shape once here; otherwise every countTokens() call would
+      // throw inside encode(), log a warning, and only then fall back.
+      if (typeof mod?.encode !== 'function') {
+        throw new TypeError('gpt-tokenizer module does not export encode()');
+      }
+      _encoder = mod;
+      log.info(() => ({ message: 'loaded gpt-tokenizer (o200k_base)' }));
+      return _encoder;
+    } catch {
+      _encoder = null;
+      log.info(() => ({
+        message:
+          'gpt-tokenizer not available, using character-based estimation',
+      }));
+      return null;
+    }
+  }
+  /**
+   * Count tokens using real BPE tokenization when available, falling back to
+   * the character-based heuristic.
+   *
+   * Use this for critical paths where accuracy matters (overflow detection,
+   * output token capping). For logging or non-critical estimation, prefer
+   * the cheaper `estimate()`.
+   *
+   * @returns An object with the token count and whether real BPE was used.
+   *   Empty input yields `{ count: 0, precise: true }` without loading the encoder.
+   */
+  export function countTokens(input: string): {
+    count: number;
+    precise: boolean;
+  } {
+    if (!input) return { count: 0, precise: true };
+    const encoder = getEncoder();
+    if (encoder) {
+      try {
+        const tokens = encoder.encode(input);
+        return { count: tokens.length, precise: true };
+      } catch (e) {
+        log.warn(() => ({
+          message: 'BPE encoding failed, falling back to estimate',
+          error: String(e),
+          inputLength: input.length,
+        }));
+      }
+    }
+    return { count: estimate(input), precise: false };
+  }
 }