@almadar/llm 2.19.0 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { a as RateLimiterOptions, b as TokenUsage } from './rate-limiter-BqWOhaXY.js';
1
+ import { R as RateLimiterOptions, T as TokenUsage } from './rate-limiter-7-oOHrtX.js';
2
2
  import { ChatOpenAI } from '@langchain/openai';
3
3
  import { ChatAnthropic } from '@langchain/anthropic';
4
4
  import { z } from 'zod';
@@ -67,7 +67,7 @@ interface ChatCompletionResponse {
67
67
  declare function parseChatCompletionResponse(raw: string): ChatCompletionResponse;
68
68
 
69
69
  type ChatModel = ChatOpenAI | ChatAnthropic;
70
- type LLMProvider = 'openai' | 'deepseek' | 'anthropic' | 'kimi' | 'openrouter' | 'orbgen';
70
+ type LLMProvider = 'openai' | 'deepseek' | 'anthropic' | 'kimi' | 'openrouter' | 'orbgen' | 'masar';
71
71
  interface ProviderConfig {
72
72
  apiKey: string;
73
73
  baseUrl?: string;
@@ -400,4 +400,4 @@ declare function createOpenRouterClient(options?: Partial<Omit<LLMClientOptions,
400
400
  */
401
401
  declare function createZhipuClient(options?: Partial<Omit<LLMClientOptions, 'provider'>>): LLMClient;
402
402
 
403
- export { ANTHROPIC_MODELS as A, getAvailableProvider as B, type CacheAwareLLMCallOptions as C, DEEPSEEK_MODELS as D, getSharedLLMClient as E, isProviderAvailable as F, parseChatCompletionResponse as G, resetSharedLLMClient as H, KIMI_MODELS as K, type LLMCallOptions as L, OPENAI_MODELS as O, type ProviderConfig as P, type CacheableBlock as a, type ChatCompletionChoice as b, type ChatCompletionMessage as c, type ChatCompletionResponse as d, type ChatCompletionRole as e, type ChatCompletionToolCall as f, type ChatCompletionToolDef as g, type ChatCompletionUsage as h, LLMClient as i, type LLMClientOptions as j, type LLMFinishReason as k, type LLMProvider as l, type LLMResponse as m, type LLMStreamChunk as n, type LLMStreamOptions as o, type LLMUsage as p, OPENROUTER_MODELS as q, createAnthropicClient as r, createCreativeClient as s, createDeepSeekClient as t, createFixClient as u, createKimiClient as v, createOpenAIClient as w, createOpenRouterClient as x, createRequirementsClient as y, createZhipuClient as z };
403
+ export { ANTHROPIC_MODELS as A, getAvailableProvider as B, type CacheAwareLLMCallOptions as C, DEEPSEEK_MODELS as D, getSharedLLMClient as E, isProviderAvailable as F, parseChatCompletionResponse as G, resetSharedLLMClient as H, KIMI_MODELS as K, type LLMFinishReason as L, OPENAI_MODELS as O, type ProviderConfig as P, LLMClient as a, type CacheableBlock as b, type ChatCompletionChoice as c, type ChatCompletionMessage as d, type ChatCompletionResponse as e, type ChatCompletionRole as f, type ChatCompletionToolCall as g, type ChatCompletionToolDef as h, type ChatCompletionUsage as i, type LLMCallOptions as j, type LLMClientOptions as k, type LLMProvider as l, type LLMResponse as m, type LLMStreamChunk as n, type LLMStreamOptions as o, type LLMUsage as p, OPENROUTER_MODELS as q, createAnthropicClient as r, createCreativeClient as s, createDeepSeekClient as t, createFixClient as u, createKimiClient as v, createOpenAIClient as w, createOpenRouterClient as x, createRequirementsClient as y, createZhipuClient as z };
package/dist/client.d.ts CHANGED
@@ -1,6 +1,6 @@
1
- import './rate-limiter-BqWOhaXY.js';
1
+ import './rate-limiter-7-oOHrtX.js';
2
2
  import '@langchain/openai';
3
3
  import '@langchain/anthropic';
4
4
  import 'zod';
5
- export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, a as CacheableBlock, D as DEEPSEEK_MODELS, K as KIMI_MODELS, L as LLMCallOptions, i as LLMClient, j as LLMClientOptions, k as LLMFinishReason, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, H as resetSharedLLMClient } from './client-C478gp_H.js';
5
+ export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, a as LLMClient, k as LLMClientOptions, L as LLMFinishReason, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, H as resetSharedLLMClient } from './client-BTqaooer.js';
6
6
  import '@almadar/core';
package/dist/client.js CHANGED
@@ -18,9 +18,9 @@ import {
18
18
  getSharedLLMClient,
19
19
  isProviderAvailable,
20
20
  resetSharedLLMClient
21
- } from "./chunk-4V7BB3ZQ.js";
21
+ } from "./chunk-44QBGRL3.js";
22
22
  import "./chunk-P4VCT25B.js";
23
- import "./chunk-TGHGQB5I.js";
23
+ import "./chunk-SXSP6M24.js";
24
24
  export {
25
25
  ANTHROPIC_MODELS,
26
26
  DEEPSEEK_MODELS,
package/dist/index.d.ts CHANGED
@@ -1,6 +1,6 @@
1
- import { k as LLMFinishReason, i as LLMClient } from './client-C478gp_H.js';
2
- export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, a as CacheableBlock, b as ChatCompletionChoice, c as ChatCompletionMessage, d as ChatCompletionResponse, e as ChatCompletionRole, f as ChatCompletionToolCall, g as ChatCompletionToolDef, h as ChatCompletionUsage, D as DEEPSEEK_MODELS, K as KIMI_MODELS, L as LLMCallOptions, j as LLMClientOptions, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, G as parseChatCompletionResponse, H as resetSharedLLMClient } from './client-C478gp_H.js';
3
- export { R as RateLimiter, a as RateLimiterOptions, T as TokenTracker, b as TokenUsage, g as getGlobalRateLimiter, c as getGlobalTokenTracker, r as resetGlobalRateLimiter, d as resetGlobalTokenTracker } from './rate-limiter-BqWOhaXY.js';
1
+ import { L as LLMFinishReason, a as LLMClient } from './client-BTqaooer.js';
2
+ export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, c as ChatCompletionChoice, d as ChatCompletionMessage, e as ChatCompletionResponse, f as ChatCompletionRole, g as ChatCompletionToolCall, h as ChatCompletionToolDef, i as ChatCompletionUsage, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, k as LLMClientOptions, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, G as parseChatCompletionResponse, H as resetSharedLLMClient } from './client-BTqaooer.js';
3
+ export { a as RateLimiter, R as RateLimiterOptions, b as TokenTracker, T as TokenUsage, g as getGlobalRateLimiter, c as getGlobalTokenTracker, r as resetGlobalRateLimiter, d as resetGlobalTokenTracker } from './rate-limiter-7-oOHrtX.js';
4
4
  export { autoCloseJson, extractJsonFromText, isValidJson, parseJsonResponse, safeParseJson } from './json-parser.js';
5
5
  import { z } from 'zod';
6
6
  export { JsonSchema, STRUCTURED_OUTPUT_MODELS, StructuredGenerationOptions, StructuredGenerationResult, StructuredOutputClient, StructuredOutputOptions, getStructuredOutputClient, isStructuredOutputAvailable, resetStructuredOutputClient } from './structured-output.js';
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  isProviderAvailable,
20
20
  parseChatCompletionResponse,
21
21
  resetSharedLLMClient
22
- } from "./chunk-4V7BB3ZQ.js";
22
+ } from "./chunk-44QBGRL3.js";
23
23
  import {
24
24
  autoCloseJson,
25
25
  extractJsonFromText,
@@ -33,7 +33,7 @@ import {
33
33
  getStructuredOutputClient,
34
34
  isStructuredOutputAvailable,
35
35
  resetStructuredOutputClient
36
- } from "./chunk-NO7P6EDT.js";
36
+ } from "./chunk-IDXSWM57.js";
37
37
  import {
38
38
  RateLimiter,
39
39
  TokenTracker,
@@ -41,7 +41,7 @@ import {
41
41
  getGlobalTokenTracker,
42
42
  resetGlobalRateLimiter,
43
43
  resetGlobalTokenTracker
44
- } from "./chunk-TGHGQB5I.js";
44
+ } from "./chunk-SXSP6M24.js";
45
45
  import {
46
46
  MasarError,
47
47
  MasarProvider,
@@ -14,14 +14,35 @@ interface TokenUsage {
14
14
  completionTokens: number;
15
15
  totalTokens: number;
16
16
  callCount: number;
17
+ /** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
18
+ cachedPromptTokens: number;
19
+ /** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
20
+ cacheWriteTokens: number;
17
21
  }
18
22
  declare class TokenTracker {
19
23
  private model;
20
24
  private usage;
25
+ /** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
26
+ private authoritativeCostUSD;
27
+ /** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
28
+ private computed;
21
29
  constructor(model?: string);
30
+ /** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
31
+ private costFor;
32
+ /**
33
+ * Record one LLM call's usage. `promptTokens` is the TOTAL input count
34
+ * (cache reads + cache writes + uncached); `cachedPromptTokens` and
35
+ * `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
36
+ * pricier) rates. Providers that don't report cache detail pass 0, which
37
+ * reduces to the previous flat-rate behaviour.
38
+ */
22
39
  addUsage(promptTokens: number, completionTokens: number, options?: {
23
40
  provider?: string;
24
41
  durationMs?: number;
42
+ cachedPromptTokens?: number;
43
+ cacheWriteTokens?: number;
44
+ /** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
45
+ costUSD?: number;
25
46
  }): void;
26
47
  getSummary(): TokenUsage;
27
48
  getEstimatedCost(): number;
@@ -99,4 +120,4 @@ declare class RateLimiter {
99
120
  declare function getGlobalRateLimiter(options?: RateLimiterOptions): RateLimiter;
100
121
  declare function resetGlobalRateLimiter(): void;
101
122
 
102
- export { RateLimiter as R, TokenTracker as T, type RateLimiterOptions as a, type TokenUsage as b, getGlobalTokenTracker as c, resetGlobalTokenTracker as d, getGlobalRateLimiter as g, resetGlobalRateLimiter as r };
123
+ export { type RateLimiterOptions as R, type TokenUsage as T, RateLimiter as a, TokenTracker as b, getGlobalTokenTracker as c, resetGlobalTokenTracker as d, getGlobalRateLimiter as g, resetGlobalRateLimiter as r };
@@ -1,4 +1,4 @@
1
- import { a as RateLimiterOptions, b as TokenUsage } from './rate-limiter-BqWOhaXY.js';
1
+ import { R as RateLimiterOptions, T as TokenUsage } from './rate-limiter-7-oOHrtX.js';
2
2
  import { z } from 'zod';
3
3
 
4
4
  /**
@@ -4,8 +4,8 @@ import {
4
4
  getStructuredOutputClient,
5
5
  isStructuredOutputAvailable,
6
6
  resetStructuredOutputClient
7
- } from "./chunk-NO7P6EDT.js";
8
- import "./chunk-TGHGQB5I.js";
7
+ } from "./chunk-IDXSWM57.js";
8
+ import "./chunk-SXSP6M24.js";
9
9
  export {
10
10
  STRUCTURED_OUTPUT_MODELS,
11
11
  StructuredOutputClient,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@almadar/llm",
3
- "version": "2.19.0",
3
+ "version": "2.21.0",
4
4
  "description": "Multi-provider LLM client with rate limiting, token tracking, structured outputs, and continuation handling",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
package/src/client.ts CHANGED
@@ -55,6 +55,28 @@ interface ModelKwargs {
55
55
  max_completion_tokens?: number;
56
56
  thinking?: { type: string };
57
57
  tool_choice?: string;
58
+ /** OpenRouter: ask for detailed usage accounting (returns cached-token breakdown). */
59
+ usage?: { include: boolean };
60
+ }
61
+
62
+ /**
63
+ * Pull the cache-read / cache-write token split out of a LangChain
64
+ * `usage_metadata` object. Every provider LangChain supports normalises
65
+ * its cache-hit tokens into `input_token_details` (OpenAI/OpenRouter/DeepSeek
66
+ * via `cached_tokens`→`cache_read`; Anthropic via `cache_read`/`cache_creation`),
67
+ * so one extractor covers them all. Returns zeros when the provider reports
68
+ * no cache detail — which prices identically to the old flat-rate path.
69
+ */
70
+ function cacheTokensFromUsageMetadata(usageMeta: {
71
+ input_tokens?: number;
72
+ output_tokens?: number;
73
+ input_token_details?: { cache_read?: number; cache_creation?: number };
74
+ }): { cachedPromptTokens: number; cacheWriteTokens: number } {
75
+ const details = usageMeta.input_token_details ?? {};
76
+ return {
77
+ cachedPromptTokens: details.cache_read ?? 0,
78
+ cacheWriteTokens: details.cache_creation ?? 0,
79
+ };
58
80
  }
59
81
 
60
82
  /**
@@ -101,7 +123,7 @@ type ChatModel = ChatOpenAI | ChatAnthropic;
101
123
  // Types
102
124
  // ============================================================================
103
125
 
104
- export type LLMProvider = 'openai' | 'deepseek' | 'anthropic' | 'kimi' | 'openrouter' | 'orbgen';
126
+ export type LLMProvider = 'openai' | 'deepseek' | 'anthropic' | 'kimi' | 'openrouter' | 'orbgen' | 'masar';
105
127
 
106
128
  export interface ProviderConfig {
107
129
  apiKey: string;
@@ -269,6 +291,21 @@ const PROVIDER_CONFIGS: Record<LLMProvider, () => ProviderConfig> = {
269
291
  defaultModel: 'orbgen-v2',
270
292
  };
271
293
  },
294
+ masar: () => {
295
+ // Fine-tuned masar subagent served OpenAI-compatibly (vLLM on Cloud Run GPU).
296
+ const baseUrl = process.env.MASAR_SUBAGENT_URL;
297
+ if (!baseUrl) {
298
+ throw new Error(
299
+ 'MASAR_SUBAGENT_URL environment variable is not set. ' +
300
+ 'Set it to the masar subagent endpoint (e.g., https://masar-subagent-xxx.run.app)',
301
+ );
302
+ }
303
+ return {
304
+ apiKey: process.env.MASAR_SUBAGENT_API_KEY ?? 'not-needed',
305
+ baseUrl: `${baseUrl}/v1`,
306
+ defaultModel: process.env.MASAR_SUBAGENT_MODEL ?? 'subagent',
307
+ };
308
+ },
272
309
  };
273
310
 
274
311
  export const DEEPSEEK_MODELS = {
@@ -477,6 +514,8 @@ export class LLMClient {
477
514
  // OpenRouter (Qwen): explicit tool_choice so the model doesn't ignore tool definitions
478
515
  if (this.provider === 'openrouter') {
479
516
  modelKwargs.tool_choice = 'auto';
517
+ // Return the cached-token breakdown so cost is priced cache-aware.
518
+ modelKwargs.usage = { include: true };
480
519
  }
481
520
 
482
521
  return new ChatOpenAI({
@@ -622,7 +661,7 @@ export class LLMClient {
622
661
  this.tokenTracker.addUsage(
623
662
  usage.promptTokens,
624
663
  usage.completionTokens,
625
- { provider: this.provider },
664
+ { provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
626
665
  );
627
666
  }
628
667
  }
@@ -828,6 +867,7 @@ export class LLMClient {
828
867
  this.tokenTracker.addUsage(
829
868
  usage.promptTokens,
830
869
  usage.completionTokens,
870
+ { provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
831
871
  );
832
872
  }
833
873
  }
@@ -897,6 +937,7 @@ export class LLMClient {
897
937
  this.tokenTracker.addUsage(
898
938
  usage.promptTokens,
899
939
  usage.completionTokens,
940
+ { provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
900
941
  );
901
942
  }
902
943
  }
@@ -974,6 +1015,8 @@ export class LLMClient {
974
1015
  // consumers see one canonical field name.
975
1016
  if (this.provider === 'openrouter') {
976
1017
  body['reasoning'] = { enabled: true };
1018
+ // Return authoritative usage accounting (real cost + cached-token split).
1019
+ body['usage'] = { include: true };
977
1020
  }
978
1021
 
979
1022
  const startedAt = Date.now();
@@ -1049,7 +1092,17 @@ export class LLMClient {
1049
1092
  totalTokens: parsed.usage.total_tokens,
1050
1093
  };
1051
1094
  if (this.tokenTracker) {
1052
- this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens);
1095
+ const rawUsage = parsed.usage as {
1096
+ prompt_tokens_details?: { cached_tokens?: number };
1097
+ cost?: number;
1098
+ };
1099
+ const cachedTokens = rawUsage.prompt_tokens_details?.cached_tokens ?? 0;
1100
+ this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
1101
+ provider: this.provider,
1102
+ cachedPromptTokens: cachedTokens,
1103
+ // OpenRouter returns the real, routing+cache-adjusted charge here.
1104
+ ...(typeof rawUsage.cost === 'number' ? { costUSD: rawUsage.cost } : {}),
1105
+ });
1053
1106
  }
1054
1107
  }
1055
1108
 
@@ -1234,9 +1287,13 @@ export class LLMClient {
1234
1287
  };
1235
1288
 
1236
1289
  if (this.tokenTracker) {
1290
+ // Anthropic reports input_tokens as the UNCACHED count; cache reads
1291
+ // and writes are separate. Pass the true total so the tracker prices
1292
+ // each bucket at its own rate.
1237
1293
  this.tokenTracker.addUsage(
1238
- usage.promptTokens,
1294
+ apiUsage.input_tokens + cacheRead + cacheCreation,
1239
1295
  usage.completionTokens,
1296
+ { provider: this.provider, cachedPromptTokens: cacheRead, cacheWriteTokens: cacheCreation },
1240
1297
  );
1241
1298
  }
1242
1299
 
@@ -1343,6 +1400,8 @@ export function isProviderAvailable(provider: LLMProvider): boolean {
1343
1400
  return !!process.env.OPEN_ROUTER_API_KEY;
1344
1401
  case 'orbgen':
1345
1402
  return !!process.env.ORBGEN_URL;
1403
+ case 'masar':
1404
+ return !!process.env.MASAR_SUBAGENT_URL;
1346
1405
  default:
1347
1406
  return false;
1348
1407
  }
@@ -268,7 +268,13 @@ export class StructuredOutputClient {
268
268
  };
269
269
 
270
270
  if (this.tokenTracker) {
271
- this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, { provider: 'structured-output' });
271
+ const cachedTokens =
272
+ (response.usage as { prompt_tokens_details?: { cached_tokens?: number } } | undefined)
273
+ ?.prompt_tokens_details?.cached_tokens ?? 0;
274
+ this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
275
+ provider: 'structured-output',
276
+ cachedPromptTokens: cachedTokens,
277
+ });
272
278
  }
273
279
 
274
280
  console.log(
@@ -18,11 +18,19 @@ export interface TokenUsage {
18
18
  completionTokens: number;
19
19
  totalTokens: number;
20
20
  callCount: number;
21
+ /** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
22
+ cachedPromptTokens: number;
23
+ /** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
24
+ cacheWriteTokens: number;
21
25
  }
22
26
 
23
27
  export interface TokenCost {
24
28
  promptCostPer1K: number;
25
29
  completionCostPer1K: number;
30
+ /** Per-1K rate for cache-read (cache-hit) prompt tokens. Falls back to prompt rate when absent. */
31
+ cacheReadCostPer1K?: number;
32
+ /** Per-1K rate for cache-write (cache-creation) prompt tokens. Falls back to prompt rate when absent. */
33
+ cacheWriteCostPer1K?: number;
26
34
  }
27
35
 
28
36
  export interface CallLogEntry {
@@ -32,6 +40,10 @@ export interface CallLogEntry {
32
40
  promptTokens: number;
33
41
  completionTokens: number;
34
42
  totalTokens: number;
43
+ /** Cache-read (cache-hit) subset of promptTokens, billed at the discounted rate. */
44
+ cachedPromptTokens?: number;
45
+ /** Cache-write subset of promptTokens (Anthropic). */
46
+ cacheWriteTokens?: number;
35
47
  estimatedCost: number;
36
48
  durationMs?: number;
37
49
  source: 'local-log';
@@ -45,6 +57,9 @@ const ALMADAR_ROOT = process.env['ALMADAR_ROOT'] ?? process.cwd();
45
57
  const PRICING_CACHE_PATH = join(ALMADAR_ROOT, '.llm-pricing-cache.json');
46
58
  const CALL_LOG_PATH = join(ALMADAR_ROOT, '.llm-call-log.jsonl');
47
59
  const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours
60
+ // Bump when the cached TokenCost shape changes so stale on-disk caches are
61
+ // invalidated on upgrade. v2 added cacheReadCostPer1K / cacheWriteCostPer1K.
62
+ const PRICING_CACHE_VERSION = 2;
48
63
 
49
64
  /** Map from our local model name to OpenRouter model ID */
50
65
  const MODEL_ID_MAP: Record<string, string> = {
@@ -67,6 +82,7 @@ const MODEL_ID_MAP: Record<string, string> = {
67
82
  const FALLBACK_COSTS: Record<string, TokenCost> = {};
68
83
 
69
84
  interface PricingCache {
85
+ version?: number;
70
86
  fetchedAt: number;
71
87
  models: Record<string, TokenCost>;
72
88
  }
@@ -77,7 +93,7 @@ function loadCachedPricing(): PricingCache | null {
77
93
  try {
78
94
  const raw = readFileSync(PRICING_CACHE_PATH, 'utf-8');
79
95
  const parsed = JSON.parse(raw) as PricingCache;
80
- if (Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
96
+ if (parsed.version === PRICING_CACHE_VERSION && Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
81
97
  return parsed;
82
98
  }
83
99
  } catch {
@@ -89,15 +105,30 @@ function loadCachedPricing(): PricingCache | null {
89
105
  async function fetchPricingFromOpenRouter(): Promise<Record<string, TokenCost>> {
90
106
  const res = await fetch('https://openrouter.ai/api/v1/models');
91
107
  if (!res.ok) throw new Error(`OpenRouter models API: HTTP ${res.status}`);
92
- const json = await res.json() as { data?: Array<{ id: string; pricing?: { prompt?: string; completion?: string } }> };
108
+ const json = await res.json() as {
109
+ data?: Array<{
110
+ id: string;
111
+ pricing?: {
112
+ prompt?: string;
113
+ completion?: string;
114
+ input_cache_read?: string;
115
+ input_cache_write?: string;
116
+ };
117
+ }>;
118
+ };
93
119
  const models: Record<string, TokenCost> = {};
94
120
  for (const m of json.data ?? []) {
95
121
  const promptPerToken = parseFloat(m.pricing?.prompt ?? '0');
96
122
  const completionPerToken = parseFloat(m.pricing?.completion ?? '0');
123
+ const cacheReadPerToken = parseFloat(m.pricing?.input_cache_read ?? '0');
124
+ const cacheWritePerToken = parseFloat(m.pricing?.input_cache_write ?? '0');
97
125
  if (promptPerToken > 0 || completionPerToken > 0) {
98
126
  models[m.id] = {
99
127
  promptCostPer1K: promptPerToken * 1000,
100
128
  completionCostPer1K: completionPerToken * 1000,
129
+ // 0 (field absent) → leave undefined so cost math falls back to the prompt rate.
130
+ ...(cacheReadPerToken > 0 ? { cacheReadCostPer1K: cacheReadPerToken * 1000 } : {}),
131
+ ...(cacheWritePerToken > 0 ? { cacheWriteCostPer1K: cacheWritePerToken * 1000 } : {}),
101
132
  };
102
133
  }
103
134
  }
@@ -125,7 +156,7 @@ function getPricing(): Record<string, TokenCost> {
125
156
  function refreshPricingCache(): void {
126
157
  fetchPricingFromOpenRouter()
127
158
  .then((models) => {
128
- pricingCache = { fetchedAt: Date.now(), models };
159
+ pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
129
160
  try {
130
161
  mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
131
162
  writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
@@ -164,22 +195,77 @@ export class TokenTracker {
164
195
  completionTokens: 0,
165
196
  totalTokens: 0,
166
197
  callCount: 0,
198
+ cachedPromptTokens: 0,
199
+ cacheWriteTokens: 0,
167
200
  };
168
201
 
202
+ /** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
203
+ private authoritativeCostUSD = 0;
204
+ /** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
205
+ private computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
206
+
169
207
  constructor(model: string = 'claude-sonnet-4-5-20250929') {
170
208
  this.model = model;
171
209
  }
172
210
 
173
- addUsage(promptTokens: number, completionTokens: number, options?: { provider?: string; durationMs?: number }): void {
211
+ /** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
212
+ private costFor(promptTokens: number, completionTokens: number, cached: number, written: number): number {
213
+ const costs = getCostForModel(this.model);
214
+ const cacheReadRate = costs.cacheReadCostPer1K ?? costs.promptCostPer1K;
215
+ const cacheWriteRate = costs.cacheWriteCostPer1K ?? costs.promptCostPer1K;
216
+ const uncached = Math.max(0, promptTokens - cached - written);
217
+ return (
218
+ (uncached / 1000) * costs.promptCostPer1K +
219
+ (cached / 1000) * cacheReadRate +
220
+ (written / 1000) * cacheWriteRate +
221
+ (completionTokens / 1000) * costs.completionCostPer1K
222
+ );
223
+ }
224
+
225
+ /**
226
+ * Record one LLM call's usage. `promptTokens` is the TOTAL input count
227
+ * (cache reads + cache writes + uncached); `cachedPromptTokens` and
228
+ * `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
229
+ * pricier) rates. Providers that don't report cache detail pass 0, which
230
+ * reduces to the previous flat-rate behaviour.
231
+ */
232
+ addUsage(
233
+ promptTokens: number,
234
+ completionTokens: number,
235
+ options?: {
236
+ provider?: string;
237
+ durationMs?: number;
238
+ cachedPromptTokens?: number;
239
+ cacheWriteTokens?: number;
240
+ /** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
241
+ costUSD?: number;
242
+ },
243
+ ): void {
244
+ const cached = Math.min(promptTokens, Math.max(0, options?.cachedPromptTokens ?? 0));
245
+ const written = Math.min(promptTokens - cached, Math.max(0, options?.cacheWriteTokens ?? 0));
246
+
174
247
  this.usage.promptTokens += promptTokens;
175
248
  this.usage.completionTokens += completionTokens;
176
249
  this.usage.totalTokens += promptTokens + completionTokens;
250
+ this.usage.cachedPromptTokens += cached;
251
+ this.usage.cacheWriteTokens += written;
177
252
  this.usage.callCount++;
178
253
 
179
- const costs = getCostForModel(this.model);
180
- const estimatedCost =
181
- (promptTokens / 1000) * costs.promptCostPer1K +
182
- (completionTokens / 1000) * costs.completionCostPer1K;
254
+ // Prefer the provider's authoritative cost (already cache- and routing-
255
+ // adjusted). Otherwise bucket the tokens and price them cache-aware so
256
+ // late-arriving pricing still applies retroactively to the estimate.
257
+ const authoritative = options?.costUSD;
258
+ let estimatedCost: number;
259
+ if (authoritative != null && Number.isFinite(authoritative)) {
260
+ this.authoritativeCostUSD += authoritative;
261
+ estimatedCost = authoritative;
262
+ } else {
263
+ this.computed.promptTokens += promptTokens;
264
+ this.computed.completionTokens += completionTokens;
265
+ this.computed.cachedPromptTokens += cached;
266
+ this.computed.cacheWriteTokens += written;
267
+ estimatedCost = this.costFor(promptTokens, completionTokens, cached, written);
268
+ }
183
269
 
184
270
  const entry: CallLogEntry = {
185
271
  timestamp: new Date().toISOString(),
@@ -188,6 +274,8 @@ export class TokenTracker {
188
274
  promptTokens,
189
275
  completionTokens,
190
276
  totalTokens: promptTokens + completionTokens,
277
+ cachedPromptTokens: cached,
278
+ cacheWriteTokens: written,
191
279
  estimatedCost,
192
280
  durationMs: options?.durationMs,
193
281
  source: 'local-log',
@@ -206,12 +294,15 @@ export class TokenTracker {
206
294
  }
207
295
 
208
296
  getEstimatedCost(): number {
209
- const costs = getCostForModel(this.model);
210
- const promptCost =
211
- (this.usage.promptTokens / 1000) * costs.promptCostPer1K;
212
- const completionCost =
213
- (this.usage.completionTokens / 1000) * costs.completionCostPer1K;
214
- return promptCost + completionCost;
297
+ return (
298
+ this.authoritativeCostUSD +
299
+ this.costFor(
300
+ this.computed.promptTokens,
301
+ this.computed.completionTokens,
302
+ this.computed.cachedPromptTokens,
303
+ this.computed.cacheWriteTokens,
304
+ )
305
+ );
215
306
  }
216
307
 
217
308
  getFormattedCost(): string {
@@ -239,7 +330,11 @@ export class TokenTracker {
239
330
  completionTokens: 0,
240
331
  totalTokens: 0,
241
332
  callCount: 0,
333
+ cachedPromptTokens: 0,
334
+ cacheWriteTokens: 0,
242
335
  };
336
+ this.authoritativeCostUSD = 0;
337
+ this.computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
243
338
  }
244
339
 
245
340
  setModel(model: string): void {
@@ -270,7 +365,7 @@ export function getCallLogPath(): string {
270
365
  /** Force-refresh the pricing cache from OpenRouter. */
271
366
  export async function refreshPricing(): Promise<void> {
272
367
  const models = await fetchPricingFromOpenRouter();
273
- pricingCache = { fetchedAt: Date.now(), models };
368
+ pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
274
369
  mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
275
370
  writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
276
371
  }