npm - @almadar/llm - Versions diffs - 2.20.0 → 2.22.0 - Mend

@almadar/llm 2.20.0 → 2.22.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (20)

package/dist/{chunk-DGW3YFPS.js → chunk-G2OE5TBE.js} +28 -9
package/dist/chunk-G2OE5TBE.js.map +1 -0
package/dist/{chunk-NO7P6EDT.js → chunk-IDXSWM57.js} +7 -3
package/dist/{chunk-NO7P6EDT.js.map → chunk-IDXSWM57.js.map} +1 -1
package/dist/{chunk-TGHGQB5I.js → chunk-SXSP6M24.js} +61 -12
package/dist/chunk-SXSP6M24.js.map +1 -0
package/dist/{client-BIq-gHZo.d.ts → client-Bfd-fT35.d.ts} +8 -2
package/dist/client.d.ts +2 -2
package/dist/client.js +2 -2
package/dist/index.d.ts +3 -3
package/dist/index.js +3 -3
package/dist/{rate-limiter-BqWOhaXY.d.ts → rate-limiter-7-oOHrtX.d.ts} +22 -1
package/dist/structured-output.d.ts +1 -1
package/dist/structured-output.js +2 -2
package/package.json +1 -1
package/src/client.ts +52 -4
package/src/structured-output.ts +7 -1
package/src/token-tracker.ts +110 -15
package/dist/chunk-DGW3YFPS.js.map +0 -1
package/dist/chunk-TGHGQB5I.js.map +0 -1

package/dist/{client-BIq-gHZo.d.ts → client-Bfd-fT35.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { a as RateLimiterOptions, b as TokenUsage } from './rate-limiter-BqWOhaXY.js';
+import { R as RateLimiterOptions, T as TokenUsage } from './rate-limiter-7-oOHrtX.js';
 import { ChatOpenAI } from '@langchain/openai';
 import { ChatAnthropic } from '@langchain/anthropic';
 import { z } from 'zod';
@@ -90,6 +90,12 @@ interface LLMClientOptions {
      * the call consuming the entire 300s).
      */
     rawTimeoutMs?: number;
+    /**
+     * Explicit provider credentials. When set, used instead of reading the
+     * provider's env vars — lets callers (e.g. the CLI's account store) supply
+     * `apiKey`/`baseUrl`/`defaultModel` directly. Absent ⇒ env fallback.
+     */
+    providerConfig?: ProviderConfig;
 }
 interface LLMCallOptions<T = unknown> {
     systemPrompt: string;
@@ -400,4 +406,4 @@ declare function createOpenRouterClient(options?: Partial<Omit<LLMClientOptions,
  */
 declare function createZhipuClient(options?: Partial<Omit<LLMClientOptions, 'provider'>>): LLMClient;
-export { ANTHROPIC_MODELS as A, getAvailableProvider as B, type CacheAwareLLMCallOptions as C, DEEPSEEK_MODELS as D, getSharedLLMClient as E, isProviderAvailable as F, parseChatCompletionResponse as G, resetSharedLLMClient as H, KIMI_MODELS as K, type LLMCallOptions as L, OPENAI_MODELS as O, type ProviderConfig as P, type CacheableBlock as a, type ChatCompletionChoice as b, type ChatCompletionMessage as c, type ChatCompletionResponse as d, type ChatCompletionRole as e, type ChatCompletionToolCall as f, type ChatCompletionToolDef as g, type ChatCompletionUsage as h, LLMClient as i, type LLMClientOptions as j, type LLMFinishReason as k, type LLMProvider as l, type LLMResponse as m, type LLMStreamChunk as n, type LLMStreamOptions as o, type LLMUsage as p, OPENROUTER_MODELS as q, createAnthropicClient as r, createCreativeClient as s, createDeepSeekClient as t, createFixClient as u, createKimiClient as v, createOpenAIClient as w, createOpenRouterClient as x, createRequirementsClient as y, createZhipuClient as z };
+export { ANTHROPIC_MODELS as A, getAvailableProvider as B, type CacheAwareLLMCallOptions as C, DEEPSEEK_MODELS as D, getSharedLLMClient as E, isProviderAvailable as F, parseChatCompletionResponse as G, resetSharedLLMClient as H, KIMI_MODELS as K, type LLMFinishReason as L, OPENAI_MODELS as O, type ProviderConfig as P, LLMClient as a, type CacheableBlock as b, type ChatCompletionChoice as c, type ChatCompletionMessage as d, type ChatCompletionResponse as e, type ChatCompletionRole as f, type ChatCompletionToolCall as g, type ChatCompletionToolDef as h, type ChatCompletionUsage as i, type LLMCallOptions as j, type LLMClientOptions as k, type LLMProvider as l, type LLMResponse as m, type LLMStreamChunk as n, type LLMStreamOptions as o, type LLMUsage as p, OPENROUTER_MODELS as q, createAnthropicClient as r, createCreativeClient as s, createDeepSeekClient as t, createFixClient as u, createKimiClient as v, createOpenAIClient as w, createOpenRouterClient as x, createRequirementsClient as y, createZhipuClient as z };

package/dist/client.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
-import './rate-limiter-BqWOhaXY.js';
+import './rate-limiter-7-oOHrtX.js';
 import '@langchain/openai';
 import '@langchain/anthropic';
 import 'zod';
-export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, a as CacheableBlock, D as DEEPSEEK_MODELS, K as KIMI_MODELS, L as LLMCallOptions, i as LLMClient, j as LLMClientOptions, k as LLMFinishReason, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, H as resetSharedLLMClient } from './client-BIq-gHZo.js';
+export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, a as LLMClient, k as LLMClientOptions, L as LLMFinishReason, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, H as resetSharedLLMClient } from './client-Bfd-fT35.js';
 import '@almadar/core';

package/dist/client.js CHANGED Viewed

@@ -18,9 +18,9 @@ import {
   getSharedLLMClient,
   isProviderAvailable,
   resetSharedLLMClient
-} from "./chunk-DGW3YFPS.js";
+} from "./chunk-G2OE5TBE.js";
 import "./chunk-P4VCT25B.js";
-import "./chunk-TGHGQB5I.js";
+import "./chunk-SXSP6M24.js";
 export {
   ANTHROPIC_MODELS,
   DEEPSEEK_MODELS,

package/dist/index.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
-import { k as LLMFinishReason, i as LLMClient } from './client-BIq-gHZo.js';
-export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, a as CacheableBlock, b as ChatCompletionChoice, c as ChatCompletionMessage, d as ChatCompletionResponse, e as ChatCompletionRole, f as ChatCompletionToolCall, g as ChatCompletionToolDef, h as ChatCompletionUsage, D as DEEPSEEK_MODELS, K as KIMI_MODELS, L as LLMCallOptions, j as LLMClientOptions, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, G as parseChatCompletionResponse, H as resetSharedLLMClient } from './client-BIq-gHZo.js';
-export { R as RateLimiter, a as RateLimiterOptions, T as TokenTracker, b as TokenUsage, g as getGlobalRateLimiter, c as getGlobalTokenTracker, r as resetGlobalRateLimiter, d as resetGlobalTokenTracker } from './rate-limiter-BqWOhaXY.js';
+import { L as LLMFinishReason, a as LLMClient } from './client-Bfd-fT35.js';
+export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, c as ChatCompletionChoice, d as ChatCompletionMessage, e as ChatCompletionResponse, f as ChatCompletionRole, g as ChatCompletionToolCall, h as ChatCompletionToolDef, i as ChatCompletionUsage, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, k as LLMClientOptions, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, G as parseChatCompletionResponse, H as resetSharedLLMClient } from './client-Bfd-fT35.js';
+export { a as RateLimiter, R as RateLimiterOptions, b as TokenTracker, T as TokenUsage, g as getGlobalRateLimiter, c as getGlobalTokenTracker, r as resetGlobalRateLimiter, d as resetGlobalTokenTracker } from './rate-limiter-7-oOHrtX.js';
 export { autoCloseJson, extractJsonFromText, isValidJson, parseJsonResponse, safeParseJson } from './json-parser.js';
 import { z } from 'zod';
 export { JsonSchema, STRUCTURED_OUTPUT_MODELS, StructuredGenerationOptions, StructuredGenerationResult, StructuredOutputClient, StructuredOutputOptions, getStructuredOutputClient, isStructuredOutputAvailable, resetStructuredOutputClient } from './structured-output.js';

package/dist/index.js CHANGED Viewed

@@ -19,7 +19,7 @@ import {
   isProviderAvailable,
   parseChatCompletionResponse,
   resetSharedLLMClient
-} from "./chunk-DGW3YFPS.js";
+} from "./chunk-G2OE5TBE.js";
 import {
   autoCloseJson,
   extractJsonFromText,
@@ -33,7 +33,7 @@ import {
   getStructuredOutputClient,
   isStructuredOutputAvailable,
   resetStructuredOutputClient
-} from "./chunk-NO7P6EDT.js";
+} from "./chunk-IDXSWM57.js";
 import {
   RateLimiter,
   TokenTracker,
@@ -41,7 +41,7 @@ import {
   getGlobalTokenTracker,
   resetGlobalRateLimiter,
   resetGlobalTokenTracker
-} from "./chunk-TGHGQB5I.js";
+} from "./chunk-SXSP6M24.js";
 import {
   MasarError,
   MasarProvider,

package/dist/{rate-limiter-BqWOhaXY.d.ts → rate-limiter-7-oOHrtX.d.ts} RENAMED Viewed

@@ -14,14 +14,35 @@ interface TokenUsage {
     completionTokens: number;
     totalTokens: number;
     callCount: number;
+    /** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
+    cachedPromptTokens: number;
+    /** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
+    cacheWriteTokens: number;
 }
 declare class TokenTracker {
     private model;
     private usage;
+    /** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
+    private authoritativeCostUSD;
+    /** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
+    private computed;
     constructor(model?: string);
+    /** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
+    private costFor;
+    /**
+     * Record one LLM call's usage. `promptTokens` is the TOTAL input count
+     * (cache reads + cache writes + uncached); `cachedPromptTokens` and
+     * `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
+     * pricier) rates. Providers that don't report cache detail pass 0, which
+     * reduces to the previous flat-rate behaviour.
+     */
     addUsage(promptTokens: number, completionTokens: number, options?: {
         provider?: string;
         durationMs?: number;
+        cachedPromptTokens?: number;
+        cacheWriteTokens?: number;
+        /** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
+        costUSD?: number;
     }): void;
     getSummary(): TokenUsage;
     getEstimatedCost(): number;
@@ -99,4 +120,4 @@ declare class RateLimiter {
 declare function getGlobalRateLimiter(options?: RateLimiterOptions): RateLimiter;
 declare function resetGlobalRateLimiter(): void;
-export { RateLimiter as R, TokenTracker as T, type RateLimiterOptions as a, type TokenUsage as b, getGlobalTokenTracker as c, resetGlobalTokenTracker as d, getGlobalRateLimiter as g, resetGlobalRateLimiter as r };
+export { type RateLimiterOptions as R, type TokenUsage as T, RateLimiter as a, TokenTracker as b, getGlobalTokenTracker as c, resetGlobalTokenTracker as d, getGlobalRateLimiter as g, resetGlobalRateLimiter as r };

package/dist/structured-output.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { a as RateLimiterOptions, b as TokenUsage } from './rate-limiter-BqWOhaXY.js';
+import { R as RateLimiterOptions, T as TokenUsage } from './rate-limiter-7-oOHrtX.js';
 import { z } from 'zod';
 /**

package/dist/structured-output.js CHANGED Viewed

@@ -4,8 +4,8 @@ import {
   getStructuredOutputClient,
   isStructuredOutputAvailable,
   resetStructuredOutputClient
-} from "./chunk-NO7P6EDT.js";
-import "./chunk-TGHGQB5I.js";
+} from "./chunk-IDXSWM57.js";
+import "./chunk-SXSP6M24.js";
 export {
   STRUCTURED_OUTPUT_MODELS,
   StructuredOutputClient,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@almadar/llm",
-  "version": "2.20.0",
+  "version": "2.22.0",
   "description": "Multi-provider LLM client with rate limiting, token tracking, structured outputs, and continuation handling",
   "type": "module",
   "main": "./dist/index.js",

package/src/client.ts CHANGED Viewed

@@ -55,6 +55,28 @@ interface ModelKwargs {
   max_completion_tokens?: number;
   thinking?: { type: string };
   tool_choice?: string;
+  /** OpenRouter: ask for detailed usage accounting (returns cached-token breakdown). */
+  usage?: { include: boolean };
+}
+/**
+ * Pull the cache-read / cache-write token split out of a LangChain
+ * `usage_metadata` object. Every provider LangChain supports normalises
+ * its cache-hit tokens into `input_token_details` (OpenAI/OpenRouter/DeepSeek
+ * via `cached_tokens`→`cache_read`; Anthropic via `cache_read`/`cache_creation`),
+ * so one extractor covers them all. Returns zeros when the provider reports
+ * no cache detail — which prices identically to the old flat-rate path.
+ */
+function cacheTokensFromUsageMetadata(usageMeta: {
+  input_tokens?: number;
+  output_tokens?: number;
+  input_token_details?: { cache_read?: number; cache_creation?: number };
+}): { cachedPromptTokens: number; cacheWriteTokens: number } {
+  const details = usageMeta.input_token_details ?? {};
+  return {
+    cachedPromptTokens: details.cache_read ?? 0,
+    cacheWriteTokens: details.cache_creation ?? 0,
+  };
 }
 /**
@@ -126,6 +148,12 @@ export interface LLMClientOptions {
    * the call consuming the entire 300s).
    */
   rawTimeoutMs?: number;
+  /**
+   * Explicit provider credentials. When set, used instead of reading the
+   * provider's env vars — lets callers (e.g. the CLI's account store) supply
+   * `apiKey`/`baseUrl`/`defaultModel` directly. Absent ⇒ env fallback.
+   */
+  providerConfig?: ProviderConfig;
 }
 export interface LLMCallOptions<T = unknown> {
@@ -369,7 +397,7 @@ export class LLMClient {
       (this.provider === 'kimi' ? 0.6 : DEFAULT_TEMPERATURE);
     this.streaming = options.streaming ?? false;
-    this.providerConfig = PROVIDER_CONFIGS[this.provider]();
+    this.providerConfig = options.providerConfig ?? PROVIDER_CONFIGS[this.provider]();
     this.modelName = options.model || this.providerConfig.defaultModel;
     this.rawTimeoutMs = options.rawTimeoutMs ?? LLMClient.DEFAULT_RAW_TIMEOUT_MS;
@@ -492,6 +520,8 @@ export class LLMClient {
     // OpenRouter (Qwen): explicit tool_choice so the model doesn't ignore tool definitions
     if (this.provider === 'openrouter') {
       modelKwargs.tool_choice = 'auto';
+      // Return the cached-token breakdown so cost is priced cache-aware.
+      modelKwargs.usage = { include: true };
     }
     return new ChatOpenAI({
@@ -637,7 +667,7 @@ export class LLMClient {
               this.tokenTracker.addUsage(
                 usage.promptTokens,
                 usage.completionTokens,
-                { provider: this.provider },
+                { provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
               );
             }
           }
@@ -843,6 +873,7 @@ export class LLMClient {
           this.tokenTracker.addUsage(
             usage.promptTokens,
             usage.completionTokens,
+            { provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
           );
         }
       }
@@ -912,6 +943,7 @@ export class LLMClient {
           this.tokenTracker.addUsage(
             usage.promptTokens,
             usage.completionTokens,
+            { provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
           );
         }
       }
@@ -989,6 +1021,8 @@ export class LLMClient {
       // consumers see one canonical field name.
       if (this.provider === 'openrouter') {
         body['reasoning'] = { enabled: true };
+        // Return authoritative usage accounting (real cost + cached-token split).
+        body['usage'] = { include: true };
       }
       const startedAt = Date.now();
@@ -1064,7 +1098,17 @@ export class LLMClient {
             totalTokens: parsed.usage.total_tokens,
           };
           if (this.tokenTracker) {
-            this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens);
+            const rawUsage = parsed.usage as {
+              prompt_tokens_details?: { cached_tokens?: number };
+              cost?: number;
+            };
+            const cachedTokens = rawUsage.prompt_tokens_details?.cached_tokens ?? 0;
+            this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
+              provider: this.provider,
+              cachedPromptTokens: cachedTokens,
+              // OpenRouter returns the real, routing+cache-adjusted charge here.
+              ...(typeof rawUsage.cost === 'number' ? { costUSD: rawUsage.cost } : {}),
+            });
           }
         }
@@ -1249,9 +1293,13 @@ export class LLMClient {
           };
           if (this.tokenTracker) {
+            // Anthropic reports input_tokens as the UNCACHED count; cache reads
+            // and writes are separate. Pass the true total so the tracker prices
+            // each bucket at its own rate.
             this.tokenTracker.addUsage(
-              usage.promptTokens,
+              apiUsage.input_tokens + cacheRead + cacheCreation,
               usage.completionTokens,
+              { provider: this.provider, cachedPromptTokens: cacheRead, cacheWriteTokens: cacheCreation },
             );
           }

package/src/structured-output.ts CHANGED Viewed

@@ -268,7 +268,13 @@ export class StructuredOutputClient {
     };
     if (this.tokenTracker) {
-      this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, { provider: 'structured-output' });
+      const cachedTokens =
+        (response.usage as { prompt_tokens_details?: { cached_tokens?: number } } | undefined)
+          ?.prompt_tokens_details?.cached_tokens ?? 0;
+      this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
+        provider: 'structured-output',
+        cachedPromptTokens: cachedTokens,
+      });
     }
     console.log(

package/src/token-tracker.ts CHANGED Viewed

@@ -18,11 +18,19 @@ export interface TokenUsage {
   completionTokens: number;
   totalTokens: number;
   callCount: number;
+  /** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
+  cachedPromptTokens: number;
+  /** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
+  cacheWriteTokens: number;
 }
 export interface TokenCost {
   promptCostPer1K: number;
   completionCostPer1K: number;
+  /** Per-1K rate for cache-read (cache-hit) prompt tokens. Falls back to prompt rate when absent. */
+  cacheReadCostPer1K?: number;
+  /** Per-1K rate for cache-write (cache-creation) prompt tokens. Falls back to prompt rate when absent. */
+  cacheWriteCostPer1K?: number;
 }
 export interface CallLogEntry {
@@ -32,6 +40,10 @@ export interface CallLogEntry {
   promptTokens: number;
   completionTokens: number;
   totalTokens: number;
+  /** Cache-read (cache-hit) subset of promptTokens, billed at the discounted rate. */
+  cachedPromptTokens?: number;
+  /** Cache-write subset of promptTokens (Anthropic). */
+  cacheWriteTokens?: number;
   estimatedCost: number;
   durationMs?: number;
   source: 'local-log';
@@ -45,6 +57,9 @@ const ALMADAR_ROOT = process.env['ALMADAR_ROOT'] ?? process.cwd();
 const PRICING_CACHE_PATH = join(ALMADAR_ROOT, '.llm-pricing-cache.json');
 const CALL_LOG_PATH = join(ALMADAR_ROOT, '.llm-call-log.jsonl');
 const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours
+// Bump when the cached TokenCost shape changes so stale on-disk caches are
+// invalidated on upgrade. v2 added cacheReadCostPer1K / cacheWriteCostPer1K.
+const PRICING_CACHE_VERSION = 2;
 /** Map from our local model name to OpenRouter model ID */
 const MODEL_ID_MAP: Record<string, string> = {
@@ -67,6 +82,7 @@ const MODEL_ID_MAP: Record<string, string> = {
 const FALLBACK_COSTS: Record<string, TokenCost> = {};
 interface PricingCache {
+  version?: number;
   fetchedAt: number;
   models: Record<string, TokenCost>;
 }
@@ -77,7 +93,7 @@ function loadCachedPricing(): PricingCache | null {
   try {
     const raw = readFileSync(PRICING_CACHE_PATH, 'utf-8');
     const parsed = JSON.parse(raw) as PricingCache;
-    if (Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
+    if (parsed.version === PRICING_CACHE_VERSION && Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
       return parsed;
     }
   } catch {
@@ -89,15 +105,30 @@ function loadCachedPricing(): PricingCache | null {
 async function fetchPricingFromOpenRouter(): Promise<Record<string, TokenCost>> {
   const res = await fetch('https://openrouter.ai/api/v1/models');
   if (!res.ok) throw new Error(`OpenRouter models API: HTTP ${res.status}`);
-  const json = await res.json() as { data?: Array<{ id: string; pricing?: { prompt?: string; completion?: string } }> };
+  const json = await res.json() as {
+    data?: Array<{
+      id: string;
+      pricing?: {
+        prompt?: string;
+        completion?: string;
+        input_cache_read?: string;
+        input_cache_write?: string;
+      };
+    }>;
+  };
   const models: Record<string, TokenCost> = {};
   for (const m of json.data ?? []) {
     const promptPerToken = parseFloat(m.pricing?.prompt ?? '0');
     const completionPerToken = parseFloat(m.pricing?.completion ?? '0');
+    const cacheReadPerToken = parseFloat(m.pricing?.input_cache_read ?? '0');
+    const cacheWritePerToken = parseFloat(m.pricing?.input_cache_write ?? '0');
     if (promptPerToken > 0 || completionPerToken > 0) {
       models[m.id] = {
         promptCostPer1K: promptPerToken * 1000,
         completionCostPer1K: completionPerToken * 1000,
+        // 0 (field absent) → leave undefined so cost math falls back to the prompt rate.
+        ...(cacheReadPerToken > 0 ? { cacheReadCostPer1K: cacheReadPerToken * 1000 } : {}),
+        ...(cacheWritePerToken > 0 ? { cacheWriteCostPer1K: cacheWritePerToken * 1000 } : {}),
       };
     }
   }
@@ -125,7 +156,7 @@ function getPricing(): Record<string, TokenCost> {
 function refreshPricingCache(): void {
   fetchPricingFromOpenRouter()
     .then((models) => {
-      pricingCache = { fetchedAt: Date.now(), models };
+      pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
       try {
         mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
         writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
@@ -164,22 +195,77 @@ export class TokenTracker {
     completionTokens: 0,
     totalTokens: 0,
     callCount: 0,
+    cachedPromptTokens: 0,
+    cacheWriteTokens: 0,
   };
+  /** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
+  private authoritativeCostUSD = 0;
+  /** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
+  private computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
   constructor(model: string = 'claude-sonnet-4-5-20250929') {
     this.model = model;
   }
-  addUsage(promptTokens: number, completionTokens: number, options?: { provider?: string; durationMs?: number }): void {
+  /** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
+  private costFor(promptTokens: number, completionTokens: number, cached: number, written: number): number {
+    const costs = getCostForModel(this.model);
+    const cacheReadRate = costs.cacheReadCostPer1K ?? costs.promptCostPer1K;
+    const cacheWriteRate = costs.cacheWriteCostPer1K ?? costs.promptCostPer1K;
+    const uncached = Math.max(0, promptTokens - cached - written);
+    return (
+      (uncached / 1000) * costs.promptCostPer1K +
+      (cached / 1000) * cacheReadRate +
+      (written / 1000) * cacheWriteRate +
+      (completionTokens / 1000) * costs.completionCostPer1K
+    );
+  }
+  /**
+   * Record one LLM call's usage. `promptTokens` is the TOTAL input count
+   * (cache reads + cache writes + uncached); `cachedPromptTokens` and
+   * `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
+   * pricier) rates. Providers that don't report cache detail pass 0, which
+   * reduces to the previous flat-rate behaviour.
+   */
+  addUsage(
+    promptTokens: number,
+    completionTokens: number,
+    options?: {
+      provider?: string;
+      durationMs?: number;
+      cachedPromptTokens?: number;
+      cacheWriteTokens?: number;
+      /** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
+      costUSD?: number;
+    },
+  ): void {
+    const cached = Math.min(promptTokens, Math.max(0, options?.cachedPromptTokens ?? 0));
+    const written = Math.min(promptTokens - cached, Math.max(0, options?.cacheWriteTokens ?? 0));
     this.usage.promptTokens += promptTokens;
     this.usage.completionTokens += completionTokens;
     this.usage.totalTokens += promptTokens + completionTokens;
+    this.usage.cachedPromptTokens += cached;
+    this.usage.cacheWriteTokens += written;
     this.usage.callCount++;
-    const costs = getCostForModel(this.model);
-    const estimatedCost =
-      (promptTokens / 1000) * costs.promptCostPer1K +
-      (completionTokens / 1000) * costs.completionCostPer1K;
+    // Prefer the provider's authoritative cost (already cache- and routing-
+    // adjusted). Otherwise bucket the tokens and price them cache-aware so
+    // late-arriving pricing still applies retroactively to the estimate.
+    const authoritative = options?.costUSD;
+    let estimatedCost: number;
+    if (authoritative != null && Number.isFinite(authoritative)) {
+      this.authoritativeCostUSD += authoritative;
+      estimatedCost = authoritative;
+    } else {
+      this.computed.promptTokens += promptTokens;
+      this.computed.completionTokens += completionTokens;
+      this.computed.cachedPromptTokens += cached;
+      this.computed.cacheWriteTokens += written;
+      estimatedCost = this.costFor(promptTokens, completionTokens, cached, written);
+    }
     const entry: CallLogEntry = {
       timestamp: new Date().toISOString(),
@@ -188,6 +274,8 @@ export class TokenTracker {
       promptTokens,
       completionTokens,
       totalTokens: promptTokens + completionTokens,
+      cachedPromptTokens: cached,
+      cacheWriteTokens: written,
       estimatedCost,
       durationMs: options?.durationMs,
       source: 'local-log',
@@ -206,12 +294,15 @@ export class TokenTracker {
   }
   getEstimatedCost(): number {
-    const costs = getCostForModel(this.model);
-    const promptCost =
-      (this.usage.promptTokens / 1000) * costs.promptCostPer1K;
-    const completionCost =
-      (this.usage.completionTokens / 1000) * costs.completionCostPer1K;
-    return promptCost + completionCost;
+    return (
+      this.authoritativeCostUSD +
+      this.costFor(
+        this.computed.promptTokens,
+        this.computed.completionTokens,
+        this.computed.cachedPromptTokens,
+        this.computed.cacheWriteTokens,
+      )
+    );
   }
   getFormattedCost(): string {
@@ -239,7 +330,11 @@ export class TokenTracker {
       completionTokens: 0,
       totalTokens: 0,
       callCount: 0,
+      cachedPromptTokens: 0,
+      cacheWriteTokens: 0,
     };
+    this.authoritativeCostUSD = 0;
+    this.computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
   }
   setModel(model: string): void {
@@ -270,7 +365,7 @@ export function getCallLogPath(): string {
 /** Force-refresh the pricing cache from OpenRouter. */
 export async function refreshPricing(): Promise<void> {
   const models = await fetchPricingFromOpenRouter();
-  pricingCache = { fetchedAt: Date.now(), models };
+  pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
   mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
   writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
 }