@almadar/llm 2.20.0 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/client.d.ts CHANGED
@@ -1,6 +1,6 @@
1
- import './rate-limiter-BqWOhaXY.js';
1
+ import './rate-limiter-7-oOHrtX.js';
2
2
  import '@langchain/openai';
3
3
  import '@langchain/anthropic';
4
4
  import 'zod';
5
- export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, a as CacheableBlock, D as DEEPSEEK_MODELS, K as KIMI_MODELS, L as LLMCallOptions, i as LLMClient, j as LLMClientOptions, k as LLMFinishReason, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, H as resetSharedLLMClient } from './client-BIq-gHZo.js';
5
+ export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, a as LLMClient, k as LLMClientOptions, L as LLMFinishReason, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, H as resetSharedLLMClient } from './client-BTqaooer.js';
6
6
  import '@almadar/core';
package/dist/client.js CHANGED
@@ -18,9 +18,9 @@ import {
18
18
  getSharedLLMClient,
19
19
  isProviderAvailable,
20
20
  resetSharedLLMClient
21
- } from "./chunk-DGW3YFPS.js";
21
+ } from "./chunk-44QBGRL3.js";
22
22
  import "./chunk-P4VCT25B.js";
23
- import "./chunk-TGHGQB5I.js";
23
+ import "./chunk-SXSP6M24.js";
24
24
  export {
25
25
  ANTHROPIC_MODELS,
26
26
  DEEPSEEK_MODELS,
package/dist/index.d.ts CHANGED
@@ -1,6 +1,6 @@
1
- import { k as LLMFinishReason, i as LLMClient } from './client-BIq-gHZo.js';
2
- export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, a as CacheableBlock, b as ChatCompletionChoice, c as ChatCompletionMessage, d as ChatCompletionResponse, e as ChatCompletionRole, f as ChatCompletionToolCall, g as ChatCompletionToolDef, h as ChatCompletionUsage, D as DEEPSEEK_MODELS, K as KIMI_MODELS, L as LLMCallOptions, j as LLMClientOptions, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, G as parseChatCompletionResponse, H as resetSharedLLMClient } from './client-BIq-gHZo.js';
3
- export { R as RateLimiter, a as RateLimiterOptions, T as TokenTracker, b as TokenUsage, g as getGlobalRateLimiter, c as getGlobalTokenTracker, r as resetGlobalRateLimiter, d as resetGlobalTokenTracker } from './rate-limiter-BqWOhaXY.js';
1
+ import { L as LLMFinishReason, a as LLMClient } from './client-BTqaooer.js';
2
+ export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, c as ChatCompletionChoice, d as ChatCompletionMessage, e as ChatCompletionResponse, f as ChatCompletionRole, g as ChatCompletionToolCall, h as ChatCompletionToolDef, i as ChatCompletionUsage, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, k as LLMClientOptions, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, G as parseChatCompletionResponse, H as resetSharedLLMClient } from './client-BTqaooer.js';
3
+ export { a as RateLimiter, R as RateLimiterOptions, b as TokenTracker, T as TokenUsage, g as getGlobalRateLimiter, c as getGlobalTokenTracker, r as resetGlobalRateLimiter, d as resetGlobalTokenTracker } from './rate-limiter-7-oOHrtX.js';
4
4
  export { autoCloseJson, extractJsonFromText, isValidJson, parseJsonResponse, safeParseJson } from './json-parser.js';
5
5
  import { z } from 'zod';
6
6
  export { JsonSchema, STRUCTURED_OUTPUT_MODELS, StructuredGenerationOptions, StructuredGenerationResult, StructuredOutputClient, StructuredOutputOptions, getStructuredOutputClient, isStructuredOutputAvailable, resetStructuredOutputClient } from './structured-output.js';
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  isProviderAvailable,
20
20
  parseChatCompletionResponse,
21
21
  resetSharedLLMClient
22
- } from "./chunk-DGW3YFPS.js";
22
+ } from "./chunk-44QBGRL3.js";
23
23
  import {
24
24
  autoCloseJson,
25
25
  extractJsonFromText,
@@ -33,7 +33,7 @@ import {
33
33
  getStructuredOutputClient,
34
34
  isStructuredOutputAvailable,
35
35
  resetStructuredOutputClient
36
- } from "./chunk-NO7P6EDT.js";
36
+ } from "./chunk-IDXSWM57.js";
37
37
  import {
38
38
  RateLimiter,
39
39
  TokenTracker,
@@ -41,7 +41,7 @@ import {
41
41
  getGlobalTokenTracker,
42
42
  resetGlobalRateLimiter,
43
43
  resetGlobalTokenTracker
44
- } from "./chunk-TGHGQB5I.js";
44
+ } from "./chunk-SXSP6M24.js";
45
45
  import {
46
46
  MasarError,
47
47
  MasarProvider,
@@ -14,14 +14,35 @@ interface TokenUsage {
14
14
  completionTokens: number;
15
15
  totalTokens: number;
16
16
  callCount: number;
17
+ /** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
18
+ cachedPromptTokens: number;
19
+ /** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
20
+ cacheWriteTokens: number;
17
21
  }
18
22
  declare class TokenTracker {
19
23
  private model;
20
24
  private usage;
25
+ /** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
26
+ private authoritativeCostUSD;
27
+ /** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
28
+ private computed;
21
29
  constructor(model?: string);
30
+ /** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
31
+ private costFor;
32
+ /**
33
+ * Record one LLM call's usage. `promptTokens` is the TOTAL input count
34
+ * (cache reads + cache writes + uncached); `cachedPromptTokens` and
35
+ * `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
36
+ * pricier) rates. Providers that don't report cache detail pass 0, which
37
+ * reduces to the previous flat-rate behaviour.
38
+ */
22
39
  addUsage(promptTokens: number, completionTokens: number, options?: {
23
40
  provider?: string;
24
41
  durationMs?: number;
42
+ cachedPromptTokens?: number;
43
+ cacheWriteTokens?: number;
44
+ /** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
45
+ costUSD?: number;
25
46
  }): void;
26
47
  getSummary(): TokenUsage;
27
48
  getEstimatedCost(): number;
@@ -99,4 +120,4 @@ declare class RateLimiter {
99
120
  declare function getGlobalRateLimiter(options?: RateLimiterOptions): RateLimiter;
100
121
  declare function resetGlobalRateLimiter(): void;
101
122
 
102
- export { RateLimiter as R, TokenTracker as T, type RateLimiterOptions as a, type TokenUsage as b, getGlobalTokenTracker as c, resetGlobalTokenTracker as d, getGlobalRateLimiter as g, resetGlobalRateLimiter as r };
123
+ export { type RateLimiterOptions as R, type TokenUsage as T, RateLimiter as a, TokenTracker as b, getGlobalTokenTracker as c, resetGlobalTokenTracker as d, getGlobalRateLimiter as g, resetGlobalRateLimiter as r };
@@ -1,4 +1,4 @@
1
- import { a as RateLimiterOptions, b as TokenUsage } from './rate-limiter-BqWOhaXY.js';
1
+ import { R as RateLimiterOptions, T as TokenUsage } from './rate-limiter-7-oOHrtX.js';
2
2
  import { z } from 'zod';
3
3
 
4
4
  /**
@@ -4,8 +4,8 @@ import {
4
4
  getStructuredOutputClient,
5
5
  isStructuredOutputAvailable,
6
6
  resetStructuredOutputClient
7
- } from "./chunk-NO7P6EDT.js";
8
- import "./chunk-TGHGQB5I.js";
7
+ } from "./chunk-IDXSWM57.js";
8
+ import "./chunk-SXSP6M24.js";
9
9
  export {
10
10
  STRUCTURED_OUTPUT_MODELS,
11
11
  StructuredOutputClient,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@almadar/llm",
3
- "version": "2.20.0",
3
+ "version": "2.21.0",
4
4
  "description": "Multi-provider LLM client with rate limiting, token tracking, structured outputs, and continuation handling",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
package/src/client.ts CHANGED
@@ -55,6 +55,28 @@ interface ModelKwargs {
55
55
  max_completion_tokens?: number;
56
56
  thinking?: { type: string };
57
57
  tool_choice?: string;
58
+ /** OpenRouter: ask for detailed usage accounting (returns cached-token breakdown). */
59
+ usage?: { include: boolean };
60
+ }
61
+
62
+ /**
63
+ * Pull the cache-read / cache-write token split out of a LangChain
64
+ * `usage_metadata` object. Every provider LangChain supports normalises
65
+ * its cache-hit tokens into `input_token_details` (OpenAI/OpenRouter/DeepSeek
66
+ * via `cached_tokens`→`cache_read`; Anthropic via `cache_read`/`cache_creation`),
67
+ * so one extractor covers them all. Returns zeros when the provider reports
68
+ * no cache detail — which prices identically to the old flat-rate path.
69
+ */
70
+ function cacheTokensFromUsageMetadata(usageMeta: {
71
+ input_tokens?: number;
72
+ output_tokens?: number;
73
+ input_token_details?: { cache_read?: number; cache_creation?: number };
74
+ }): { cachedPromptTokens: number; cacheWriteTokens: number } {
75
+ const details = usageMeta.input_token_details ?? {};
76
+ return {
77
+ cachedPromptTokens: details.cache_read ?? 0,
78
+ cacheWriteTokens: details.cache_creation ?? 0,
79
+ };
58
80
  }
59
81
 
60
82
  /**
@@ -492,6 +514,8 @@ export class LLMClient {
492
514
  // OpenRouter (Qwen): explicit tool_choice so the model doesn't ignore tool definitions
493
515
  if (this.provider === 'openrouter') {
494
516
  modelKwargs.tool_choice = 'auto';
517
+ // Return the cached-token breakdown so cost is priced cache-aware.
518
+ modelKwargs.usage = { include: true };
495
519
  }
496
520
 
497
521
  return new ChatOpenAI({
@@ -637,7 +661,7 @@ export class LLMClient {
637
661
  this.tokenTracker.addUsage(
638
662
  usage.promptTokens,
639
663
  usage.completionTokens,
640
- { provider: this.provider },
664
+ { provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
641
665
  );
642
666
  }
643
667
  }
@@ -843,6 +867,7 @@ export class LLMClient {
843
867
  this.tokenTracker.addUsage(
844
868
  usage.promptTokens,
845
869
  usage.completionTokens,
870
+ { provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
846
871
  );
847
872
  }
848
873
  }
@@ -912,6 +937,7 @@ export class LLMClient {
912
937
  this.tokenTracker.addUsage(
913
938
  usage.promptTokens,
914
939
  usage.completionTokens,
940
+ { provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
915
941
  );
916
942
  }
917
943
  }
@@ -989,6 +1015,8 @@ export class LLMClient {
989
1015
  // consumers see one canonical field name.
990
1016
  if (this.provider === 'openrouter') {
991
1017
  body['reasoning'] = { enabled: true };
1018
+ // Return authoritative usage accounting (real cost + cached-token split).
1019
+ body['usage'] = { include: true };
992
1020
  }
993
1021
 
994
1022
  const startedAt = Date.now();
@@ -1064,7 +1092,17 @@ export class LLMClient {
1064
1092
  totalTokens: parsed.usage.total_tokens,
1065
1093
  };
1066
1094
  if (this.tokenTracker) {
1067
- this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens);
1095
+ const rawUsage = parsed.usage as {
1096
+ prompt_tokens_details?: { cached_tokens?: number };
1097
+ cost?: number;
1098
+ };
1099
+ const cachedTokens = rawUsage.prompt_tokens_details?.cached_tokens ?? 0;
1100
+ this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
1101
+ provider: this.provider,
1102
+ cachedPromptTokens: cachedTokens,
1103
+ // OpenRouter returns the real, routing+cache-adjusted charge here.
1104
+ ...(typeof rawUsage.cost === 'number' ? { costUSD: rawUsage.cost } : {}),
1105
+ });
1068
1106
  }
1069
1107
  }
1070
1108
 
@@ -1249,9 +1287,13 @@ export class LLMClient {
1249
1287
  };
1250
1288
 
1251
1289
  if (this.tokenTracker) {
1290
+ // Anthropic reports input_tokens as the UNCACHED count; cache reads
1291
+ // and writes are separate. Pass the true total so the tracker prices
1292
+ // each bucket at its own rate.
1252
1293
  this.tokenTracker.addUsage(
1253
- usage.promptTokens,
1294
+ apiUsage.input_tokens + cacheRead + cacheCreation,
1254
1295
  usage.completionTokens,
1296
+ { provider: this.provider, cachedPromptTokens: cacheRead, cacheWriteTokens: cacheCreation },
1255
1297
  );
1256
1298
  }
1257
1299
 
@@ -268,7 +268,13 @@ export class StructuredOutputClient {
268
268
  };
269
269
 
270
270
  if (this.tokenTracker) {
271
- this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, { provider: 'structured-output' });
271
+ const cachedTokens =
272
+ (response.usage as { prompt_tokens_details?: { cached_tokens?: number } } | undefined)
273
+ ?.prompt_tokens_details?.cached_tokens ?? 0;
274
+ this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
275
+ provider: 'structured-output',
276
+ cachedPromptTokens: cachedTokens,
277
+ });
272
278
  }
273
279
 
274
280
  console.log(
@@ -18,11 +18,19 @@ export interface TokenUsage {
18
18
  completionTokens: number;
19
19
  totalTokens: number;
20
20
  callCount: number;
21
+ /** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
22
+ cachedPromptTokens: number;
23
+ /** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
24
+ cacheWriteTokens: number;
21
25
  }
22
26
 
23
27
  export interface TokenCost {
24
28
  promptCostPer1K: number;
25
29
  completionCostPer1K: number;
30
+ /** Per-1K rate for cache-read (cache-hit) prompt tokens. Falls back to prompt rate when absent. */
31
+ cacheReadCostPer1K?: number;
32
+ /** Per-1K rate for cache-write (cache-creation) prompt tokens. Falls back to prompt rate when absent. */
33
+ cacheWriteCostPer1K?: number;
26
34
  }
27
35
 
28
36
  export interface CallLogEntry {
@@ -32,6 +40,10 @@ export interface CallLogEntry {
32
40
  promptTokens: number;
33
41
  completionTokens: number;
34
42
  totalTokens: number;
43
+ /** Cache-read (cache-hit) subset of promptTokens, billed at the discounted rate. */
44
+ cachedPromptTokens?: number;
45
+ /** Cache-write subset of promptTokens (Anthropic). */
46
+ cacheWriteTokens?: number;
35
47
  estimatedCost: number;
36
48
  durationMs?: number;
37
49
  source: 'local-log';
@@ -45,6 +57,9 @@ const ALMADAR_ROOT = process.env['ALMADAR_ROOT'] ?? process.cwd();
45
57
  const PRICING_CACHE_PATH = join(ALMADAR_ROOT, '.llm-pricing-cache.json');
46
58
  const CALL_LOG_PATH = join(ALMADAR_ROOT, '.llm-call-log.jsonl');
47
59
  const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours
60
+ // Bump when the cached TokenCost shape changes so stale on-disk caches are
61
+ // invalidated on upgrade. v2 added cacheReadCostPer1K / cacheWriteCostPer1K.
62
+ const PRICING_CACHE_VERSION = 2;
48
63
 
49
64
  /** Map from our local model name to OpenRouter model ID */
50
65
  const MODEL_ID_MAP: Record<string, string> = {
@@ -67,6 +82,7 @@ const MODEL_ID_MAP: Record<string, string> = {
67
82
  const FALLBACK_COSTS: Record<string, TokenCost> = {};
68
83
 
69
84
  interface PricingCache {
85
+ version?: number;
70
86
  fetchedAt: number;
71
87
  models: Record<string, TokenCost>;
72
88
  }
@@ -77,7 +93,7 @@ function loadCachedPricing(): PricingCache | null {
77
93
  try {
78
94
  const raw = readFileSync(PRICING_CACHE_PATH, 'utf-8');
79
95
  const parsed = JSON.parse(raw) as PricingCache;
80
- if (Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
96
+ if (parsed.version === PRICING_CACHE_VERSION && Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
81
97
  return parsed;
82
98
  }
83
99
  } catch {
@@ -89,15 +105,30 @@ function loadCachedPricing(): PricingCache | null {
89
105
  async function fetchPricingFromOpenRouter(): Promise<Record<string, TokenCost>> {
90
106
  const res = await fetch('https://openrouter.ai/api/v1/models');
91
107
  if (!res.ok) throw new Error(`OpenRouter models API: HTTP ${res.status}`);
92
- const json = await res.json() as { data?: Array<{ id: string; pricing?: { prompt?: string; completion?: string } }> };
108
+ const json = await res.json() as {
109
+ data?: Array<{
110
+ id: string;
111
+ pricing?: {
112
+ prompt?: string;
113
+ completion?: string;
114
+ input_cache_read?: string;
115
+ input_cache_write?: string;
116
+ };
117
+ }>;
118
+ };
93
119
  const models: Record<string, TokenCost> = {};
94
120
  for (const m of json.data ?? []) {
95
121
  const promptPerToken = parseFloat(m.pricing?.prompt ?? '0');
96
122
  const completionPerToken = parseFloat(m.pricing?.completion ?? '0');
123
+ const cacheReadPerToken = parseFloat(m.pricing?.input_cache_read ?? '0');
124
+ const cacheWritePerToken = parseFloat(m.pricing?.input_cache_write ?? '0');
97
125
  if (promptPerToken > 0 || completionPerToken > 0) {
98
126
  models[m.id] = {
99
127
  promptCostPer1K: promptPerToken * 1000,
100
128
  completionCostPer1K: completionPerToken * 1000,
129
+ // 0 (field absent) → leave undefined so cost math falls back to the prompt rate.
130
+ ...(cacheReadPerToken > 0 ? { cacheReadCostPer1K: cacheReadPerToken * 1000 } : {}),
131
+ ...(cacheWritePerToken > 0 ? { cacheWriteCostPer1K: cacheWritePerToken * 1000 } : {}),
101
132
  };
102
133
  }
103
134
  }
@@ -125,7 +156,7 @@ function getPricing(): Record<string, TokenCost> {
125
156
  function refreshPricingCache(): void {
126
157
  fetchPricingFromOpenRouter()
127
158
  .then((models) => {
128
- pricingCache = { fetchedAt: Date.now(), models };
159
+ pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
129
160
  try {
130
161
  mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
131
162
  writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
@@ -164,22 +195,77 @@ export class TokenTracker {
164
195
  completionTokens: 0,
165
196
  totalTokens: 0,
166
197
  callCount: 0,
198
+ cachedPromptTokens: 0,
199
+ cacheWriteTokens: 0,
167
200
  };
168
201
 
202
+ /** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
203
+ private authoritativeCostUSD = 0;
204
+ /** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
205
+ private computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
206
+
169
207
  constructor(model: string = 'claude-sonnet-4-5-20250929') {
170
208
  this.model = model;
171
209
  }
172
210
 
173
- addUsage(promptTokens: number, completionTokens: number, options?: { provider?: string; durationMs?: number }): void {
211
+ /** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
212
+ private costFor(promptTokens: number, completionTokens: number, cached: number, written: number): number {
213
+ const costs = getCostForModel(this.model);
214
+ const cacheReadRate = costs.cacheReadCostPer1K ?? costs.promptCostPer1K;
215
+ const cacheWriteRate = costs.cacheWriteCostPer1K ?? costs.promptCostPer1K;
216
+ const uncached = Math.max(0, promptTokens - cached - written);
217
+ return (
218
+ (uncached / 1000) * costs.promptCostPer1K +
219
+ (cached / 1000) * cacheReadRate +
220
+ (written / 1000) * cacheWriteRate +
221
+ (completionTokens / 1000) * costs.completionCostPer1K
222
+ );
223
+ }
224
+
225
+ /**
226
+ * Record one LLM call's usage. `promptTokens` is the TOTAL input count
227
+ * (cache reads + cache writes + uncached); `cachedPromptTokens` and
228
+ * `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
229
+ * pricier) rates. Providers that don't report cache detail pass 0, which
230
+ * reduces to the previous flat-rate behaviour.
231
+ */
232
+ addUsage(
233
+ promptTokens: number,
234
+ completionTokens: number,
235
+ options?: {
236
+ provider?: string;
237
+ durationMs?: number;
238
+ cachedPromptTokens?: number;
239
+ cacheWriteTokens?: number;
240
+ /** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
241
+ costUSD?: number;
242
+ },
243
+ ): void {
244
+ const cached = Math.min(promptTokens, Math.max(0, options?.cachedPromptTokens ?? 0));
245
+ const written = Math.min(promptTokens - cached, Math.max(0, options?.cacheWriteTokens ?? 0));
246
+
174
247
  this.usage.promptTokens += promptTokens;
175
248
  this.usage.completionTokens += completionTokens;
176
249
  this.usage.totalTokens += promptTokens + completionTokens;
250
+ this.usage.cachedPromptTokens += cached;
251
+ this.usage.cacheWriteTokens += written;
177
252
  this.usage.callCount++;
178
253
 
179
- const costs = getCostForModel(this.model);
180
- const estimatedCost =
181
- (promptTokens / 1000) * costs.promptCostPer1K +
182
- (completionTokens / 1000) * costs.completionCostPer1K;
254
+ // Prefer the provider's authoritative cost (already cache- and routing-
255
+ // adjusted). Otherwise bucket the tokens and price them cache-aware so
256
+ // late-arriving pricing still applies retroactively to the estimate.
257
+ const authoritative = options?.costUSD;
258
+ let estimatedCost: number;
259
+ if (authoritative != null && Number.isFinite(authoritative)) {
260
+ this.authoritativeCostUSD += authoritative;
261
+ estimatedCost = authoritative;
262
+ } else {
263
+ this.computed.promptTokens += promptTokens;
264
+ this.computed.completionTokens += completionTokens;
265
+ this.computed.cachedPromptTokens += cached;
266
+ this.computed.cacheWriteTokens += written;
267
+ estimatedCost = this.costFor(promptTokens, completionTokens, cached, written);
268
+ }
183
269
 
184
270
  const entry: CallLogEntry = {
185
271
  timestamp: new Date().toISOString(),
@@ -188,6 +274,8 @@ export class TokenTracker {
188
274
  promptTokens,
189
275
  completionTokens,
190
276
  totalTokens: promptTokens + completionTokens,
277
+ cachedPromptTokens: cached,
278
+ cacheWriteTokens: written,
191
279
  estimatedCost,
192
280
  durationMs: options?.durationMs,
193
281
  source: 'local-log',
@@ -206,12 +294,15 @@ export class TokenTracker {
206
294
  }
207
295
 
208
296
  getEstimatedCost(): number {
209
- const costs = getCostForModel(this.model);
210
- const promptCost =
211
- (this.usage.promptTokens / 1000) * costs.promptCostPer1K;
212
- const completionCost =
213
- (this.usage.completionTokens / 1000) * costs.completionCostPer1K;
214
- return promptCost + completionCost;
297
+ return (
298
+ this.authoritativeCostUSD +
299
+ this.costFor(
300
+ this.computed.promptTokens,
301
+ this.computed.completionTokens,
302
+ this.computed.cachedPromptTokens,
303
+ this.computed.cacheWriteTokens,
304
+ )
305
+ );
215
306
  }
216
307
 
217
308
  getFormattedCost(): string {
@@ -239,7 +330,11 @@ export class TokenTracker {
239
330
  completionTokens: 0,
240
331
  totalTokens: 0,
241
332
  callCount: 0,
333
+ cachedPromptTokens: 0,
334
+ cacheWriteTokens: 0,
242
335
  };
336
+ this.authoritativeCostUSD = 0;
337
+ this.computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
243
338
  }
244
339
 
245
340
  setModel(model: string): void {
@@ -270,7 +365,7 @@ export function getCallLogPath(): string {
270
365
  /** Force-refresh the pricing cache from OpenRouter. */
271
366
  export async function refreshPricing(): Promise<void> {
272
367
  const models = await fetchPricingFromOpenRouter();
273
- pricingCache = { fetchedAt: Date.now(), models };
368
+ pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
274
369
  mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
275
370
  writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
276
371
  }