@almadar/llm 2.20.0 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-DGW3YFPS.js → chunk-44QBGRL3.js} +27 -8
- package/dist/chunk-44QBGRL3.js.map +1 -0
- package/dist/{chunk-NO7P6EDT.js → chunk-IDXSWM57.js} +7 -3
- package/dist/{chunk-NO7P6EDT.js.map → chunk-IDXSWM57.js.map} +1 -1
- package/dist/{chunk-TGHGQB5I.js → chunk-SXSP6M24.js} +61 -12
- package/dist/chunk-SXSP6M24.js.map +1 -0
- package/dist/{client-BIq-gHZo.d.ts → client-BTqaooer.d.ts} +2 -2
- package/dist/client.d.ts +2 -2
- package/dist/client.js +2 -2
- package/dist/index.d.ts +3 -3
- package/dist/index.js +3 -3
- package/dist/{rate-limiter-BqWOhaXY.d.ts → rate-limiter-7-oOHrtX.d.ts} +22 -1
- package/dist/structured-output.d.ts +1 -1
- package/dist/structured-output.js +2 -2
- package/package.json +1 -1
- package/src/client.ts +45 -3
- package/src/structured-output.ts +7 -1
- package/src/token-tracker.ts +110 -15
- package/dist/chunk-DGW3YFPS.js.map +0 -1
- package/dist/chunk-TGHGQB5I.js.map +0 -1
package/dist/client.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import './rate-limiter-BqWOhaXY.js';
|
|
1
|
+
import './rate-limiter-7-oOHrtX.js';
|
|
2
2
|
import '@langchain/openai';
|
|
3
3
|
import '@langchain/anthropic';
|
|
4
4
|
import 'zod';
|
|
5
|
-
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions,
|
|
5
|
+
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, a as LLMClient, k as LLMClientOptions, L as LLMFinishReason, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, H as resetSharedLLMClient } from './client-BTqaooer.js';
|
|
6
6
|
import '@almadar/core';
|
package/dist/client.js
CHANGED
|
@@ -18,9 +18,9 @@ import {
|
|
|
18
18
|
getSharedLLMClient,
|
|
19
19
|
isProviderAvailable,
|
|
20
20
|
resetSharedLLMClient
|
|
21
|
-
} from "./chunk-DGW3YFPS.js";
|
|
21
|
+
} from "./chunk-44QBGRL3.js";
|
|
22
22
|
import "./chunk-P4VCT25B.js";
|
|
23
|
-
import "./chunk-TGHGQB5I.js";
|
|
23
|
+
import "./chunk-SXSP6M24.js";
|
|
24
24
|
export {
|
|
25
25
|
ANTHROPIC_MODELS,
|
|
26
26
|
DEEPSEEK_MODELS,
|
package/dist/index.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions,
|
|
3
|
-
export {
|
|
1
|
+
import { L as LLMFinishReason, a as LLMClient } from './client-BTqaooer.js';
|
|
2
|
+
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, c as ChatCompletionChoice, d as ChatCompletionMessage, e as ChatCompletionResponse, f as ChatCompletionRole, g as ChatCompletionToolCall, h as ChatCompletionToolDef, i as ChatCompletionUsage, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, k as LLMClientOptions, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, G as parseChatCompletionResponse, H as resetSharedLLMClient } from './client-BTqaooer.js';
|
|
3
|
+
export { a as RateLimiter, R as RateLimiterOptions, b as TokenTracker, T as TokenUsage, g as getGlobalRateLimiter, c as getGlobalTokenTracker, r as resetGlobalRateLimiter, d as resetGlobalTokenTracker } from './rate-limiter-7-oOHrtX.js';
|
|
4
4
|
export { autoCloseJson, extractJsonFromText, isValidJson, parseJsonResponse, safeParseJson } from './json-parser.js';
|
|
5
5
|
import { z } from 'zod';
|
|
6
6
|
export { JsonSchema, STRUCTURED_OUTPUT_MODELS, StructuredGenerationOptions, StructuredGenerationResult, StructuredOutputClient, StructuredOutputOptions, getStructuredOutputClient, isStructuredOutputAvailable, resetStructuredOutputClient } from './structured-output.js';
|
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
isProviderAvailable,
|
|
20
20
|
parseChatCompletionResponse,
|
|
21
21
|
resetSharedLLMClient
|
|
22
|
-
} from "./chunk-DGW3YFPS.js";
|
|
22
|
+
} from "./chunk-44QBGRL3.js";
|
|
23
23
|
import {
|
|
24
24
|
autoCloseJson,
|
|
25
25
|
extractJsonFromText,
|
|
@@ -33,7 +33,7 @@ import {
|
|
|
33
33
|
getStructuredOutputClient,
|
|
34
34
|
isStructuredOutputAvailable,
|
|
35
35
|
resetStructuredOutputClient
|
|
36
|
-
} from "./chunk-NO7P6EDT.js";
|
|
36
|
+
} from "./chunk-IDXSWM57.js";
|
|
37
37
|
import {
|
|
38
38
|
RateLimiter,
|
|
39
39
|
TokenTracker,
|
|
@@ -41,7 +41,7 @@ import {
|
|
|
41
41
|
getGlobalTokenTracker,
|
|
42
42
|
resetGlobalRateLimiter,
|
|
43
43
|
resetGlobalTokenTracker
|
|
44
|
-
} from "./chunk-TGHGQB5I.js";
|
|
44
|
+
} from "./chunk-SXSP6M24.js";
|
|
45
45
|
import {
|
|
46
46
|
MasarError,
|
|
47
47
|
MasarProvider,
|
|
package/dist/{rate-limiter-BqWOhaXY.d.ts → rate-limiter-7-oOHrtX.d.ts}
CHANGED
|
@@ -14,14 +14,35 @@ interface TokenUsage {
|
|
|
14
14
|
completionTokens: number;
|
|
15
15
|
totalTokens: number;
|
|
16
16
|
callCount: number;
|
|
17
|
+
/** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
|
|
18
|
+
cachedPromptTokens: number;
|
|
19
|
+
/** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
|
|
20
|
+
cacheWriteTokens: number;
|
|
17
21
|
}
|
|
18
22
|
declare class TokenTracker {
|
|
19
23
|
private model;
|
|
20
24
|
private usage;
|
|
25
|
+
/** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
|
|
26
|
+
private authoritativeCostUSD;
|
|
27
|
+
/** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
|
|
28
|
+
private computed;
|
|
21
29
|
constructor(model?: string);
|
|
30
|
+
/** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
|
|
31
|
+
private costFor;
|
|
32
|
+
/**
|
|
33
|
+
* Record one LLM call's usage. `promptTokens` is the TOTAL input count
|
|
34
|
+
* (cache reads + cache writes + uncached); `cachedPromptTokens` and
|
|
35
|
+
* `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
|
|
36
|
+
* pricier) rates. Providers that don't report cache detail pass 0, which
|
|
37
|
+
* reduces to the previous flat-rate behaviour.
|
|
38
|
+
*/
|
|
22
39
|
addUsage(promptTokens: number, completionTokens: number, options?: {
|
|
23
40
|
provider?: string;
|
|
24
41
|
durationMs?: number;
|
|
42
|
+
cachedPromptTokens?: number;
|
|
43
|
+
cacheWriteTokens?: number;
|
|
44
|
+
/** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
|
|
45
|
+
costUSD?: number;
|
|
25
46
|
}): void;
|
|
26
47
|
getSummary(): TokenUsage;
|
|
27
48
|
getEstimatedCost(): number;
|
|
@@ -99,4 +120,4 @@ declare class RateLimiter {
|
|
|
99
120
|
declare function getGlobalRateLimiter(options?: RateLimiterOptions): RateLimiter;
|
|
100
121
|
declare function resetGlobalRateLimiter(): void;
|
|
101
122
|
|
|
102
|
-
export {
|
|
123
|
+
export { type RateLimiterOptions as R, type TokenUsage as T, RateLimiter as a, TokenTracker as b, getGlobalTokenTracker as c, resetGlobalTokenTracker as d, getGlobalRateLimiter as g, resetGlobalRateLimiter as r };
|
|
package/dist/structured-output.js
CHANGED
|
@@ -4,8 +4,8 @@ import {
|
|
|
4
4
|
getStructuredOutputClient,
|
|
5
5
|
isStructuredOutputAvailable,
|
|
6
6
|
resetStructuredOutputClient
|
|
7
|
-
} from "./chunk-NO7P6EDT.js";
|
|
8
|
-
import "./chunk-TGHGQB5I.js";
|
|
7
|
+
} from "./chunk-IDXSWM57.js";
|
|
8
|
+
import "./chunk-SXSP6M24.js";
|
|
9
9
|
export {
|
|
10
10
|
STRUCTURED_OUTPUT_MODELS,
|
|
11
11
|
StructuredOutputClient,
|
package/package.json
CHANGED
package/src/client.ts
CHANGED
|
@@ -55,6 +55,28 @@ interface ModelKwargs {
|
|
|
55
55
|
max_completion_tokens?: number;
|
|
56
56
|
thinking?: { type: string };
|
|
57
57
|
tool_choice?: string;
|
|
58
|
+
/** OpenRouter: ask for detailed usage accounting (returns cached-token breakdown). */
|
|
59
|
+
usage?: { include: boolean };
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Pull the cache-read / cache-write token split out of a LangChain
|
|
64
|
+
* `usage_metadata` object. Every provider LangChain supports normalises
|
|
65
|
+
* its cache-hit tokens into `input_token_details` (OpenAI/OpenRouter/DeepSeek
|
|
66
|
+
* via `cached_tokens`→`cache_read`; Anthropic via `cache_read`/`cache_creation`),
|
|
67
|
+
* so one extractor covers them all. Returns zeros when the provider reports
|
|
68
|
+
* no cache detail — which prices identically to the old flat-rate path.
|
|
69
|
+
*/
|
|
70
|
+
function cacheTokensFromUsageMetadata(usageMeta: {
|
|
71
|
+
input_tokens?: number;
|
|
72
|
+
output_tokens?: number;
|
|
73
|
+
input_token_details?: { cache_read?: number; cache_creation?: number };
|
|
74
|
+
}): { cachedPromptTokens: number; cacheWriteTokens: number } {
|
|
75
|
+
const details = usageMeta.input_token_details ?? {};
|
|
76
|
+
return {
|
|
77
|
+
cachedPromptTokens: details.cache_read ?? 0,
|
|
78
|
+
cacheWriteTokens: details.cache_creation ?? 0,
|
|
79
|
+
};
|
|
58
80
|
}
|
|
59
81
|
|
|
60
82
|
/**
|
|
@@ -492,6 +514,8 @@ export class LLMClient {
|
|
|
492
514
|
// OpenRouter (Qwen): explicit tool_choice so the model doesn't ignore tool definitions
|
|
493
515
|
if (this.provider === 'openrouter') {
|
|
494
516
|
modelKwargs.tool_choice = 'auto';
|
|
517
|
+
// Return the cached-token breakdown so cost is priced cache-aware.
|
|
518
|
+
modelKwargs.usage = { include: true };
|
|
495
519
|
}
|
|
496
520
|
|
|
497
521
|
return new ChatOpenAI({
|
|
@@ -637,7 +661,7 @@ export class LLMClient {
|
|
|
637
661
|
this.tokenTracker.addUsage(
|
|
638
662
|
usage.promptTokens,
|
|
639
663
|
usage.completionTokens,
|
|
640
|
-
{ provider: this.provider },
|
|
664
|
+
{ provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
|
|
641
665
|
);
|
|
642
666
|
}
|
|
643
667
|
}
|
|
@@ -843,6 +867,7 @@ export class LLMClient {
|
|
|
843
867
|
this.tokenTracker.addUsage(
|
|
844
868
|
usage.promptTokens,
|
|
845
869
|
usage.completionTokens,
|
|
870
|
+
{ provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
|
|
846
871
|
);
|
|
847
872
|
}
|
|
848
873
|
}
|
|
@@ -912,6 +937,7 @@ export class LLMClient {
|
|
|
912
937
|
this.tokenTracker.addUsage(
|
|
913
938
|
usage.promptTokens,
|
|
914
939
|
usage.completionTokens,
|
|
940
|
+
{ provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
|
|
915
941
|
);
|
|
916
942
|
}
|
|
917
943
|
}
|
|
@@ -989,6 +1015,8 @@ export class LLMClient {
|
|
|
989
1015
|
// consumers see one canonical field name.
|
|
990
1016
|
if (this.provider === 'openrouter') {
|
|
991
1017
|
body['reasoning'] = { enabled: true };
|
|
1018
|
+
// Return authoritative usage accounting (real cost + cached-token split).
|
|
1019
|
+
body['usage'] = { include: true };
|
|
992
1020
|
}
|
|
993
1021
|
|
|
994
1022
|
const startedAt = Date.now();
|
|
@@ -1064,7 +1092,17 @@ export class LLMClient {
|
|
|
1064
1092
|
totalTokens: parsed.usage.total_tokens,
|
|
1065
1093
|
};
|
|
1066
1094
|
if (this.tokenTracker) {
|
|
1067
|
-
|
|
1095
|
+
const rawUsage = parsed.usage as {
|
|
1096
|
+
prompt_tokens_details?: { cached_tokens?: number };
|
|
1097
|
+
cost?: number;
|
|
1098
|
+
};
|
|
1099
|
+
const cachedTokens = rawUsage.prompt_tokens_details?.cached_tokens ?? 0;
|
|
1100
|
+
this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
|
|
1101
|
+
provider: this.provider,
|
|
1102
|
+
cachedPromptTokens: cachedTokens,
|
|
1103
|
+
// OpenRouter returns the real, routing+cache-adjusted charge here.
|
|
1104
|
+
...(typeof rawUsage.cost === 'number' ? { costUSD: rawUsage.cost } : {}),
|
|
1105
|
+
});
|
|
1068
1106
|
}
|
|
1069
1107
|
}
|
|
1070
1108
|
|
|
@@ -1249,9 +1287,13 @@ export class LLMClient {
|
|
|
1249
1287
|
};
|
|
1250
1288
|
|
|
1251
1289
|
if (this.tokenTracker) {
|
|
1290
|
+
// Anthropic reports input_tokens as the UNCACHED count; cache reads
|
|
1291
|
+
// and writes are separate. Pass the true total so the tracker prices
|
|
1292
|
+
// each bucket at its own rate.
|
|
1252
1293
|
this.tokenTracker.addUsage(
|
|
1253
|
-
|
|
1294
|
+
apiUsage.input_tokens + cacheRead + cacheCreation,
|
|
1254
1295
|
usage.completionTokens,
|
|
1296
|
+
{ provider: this.provider, cachedPromptTokens: cacheRead, cacheWriteTokens: cacheCreation },
|
|
1255
1297
|
);
|
|
1256
1298
|
}
|
|
1257
1299
|
|
package/src/structured-output.ts
CHANGED
|
@@ -268,7 +268,13 @@ export class StructuredOutputClient {
|
|
|
268
268
|
};
|
|
269
269
|
|
|
270
270
|
if (this.tokenTracker) {
|
|
271
|
-
|
|
271
|
+
const cachedTokens =
|
|
272
|
+
(response.usage as { prompt_tokens_details?: { cached_tokens?: number } } | undefined)
|
|
273
|
+
?.prompt_tokens_details?.cached_tokens ?? 0;
|
|
274
|
+
this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
|
|
275
|
+
provider: 'structured-output',
|
|
276
|
+
cachedPromptTokens: cachedTokens,
|
|
277
|
+
});
|
|
272
278
|
}
|
|
273
279
|
|
|
274
280
|
console.log(
|
package/src/token-tracker.ts
CHANGED
|
@@ -18,11 +18,19 @@ export interface TokenUsage {
|
|
|
18
18
|
completionTokens: number;
|
|
19
19
|
totalTokens: number;
|
|
20
20
|
callCount: number;
|
|
21
|
+
/** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
|
|
22
|
+
cachedPromptTokens: number;
|
|
23
|
+
/** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
|
|
24
|
+
cacheWriteTokens: number;
|
|
21
25
|
}
|
|
22
26
|
|
|
23
27
|
export interface TokenCost {
|
|
24
28
|
promptCostPer1K: number;
|
|
25
29
|
completionCostPer1K: number;
|
|
30
|
+
/** Per-1K rate for cache-read (cache-hit) prompt tokens. Falls back to prompt rate when absent. */
|
|
31
|
+
cacheReadCostPer1K?: number;
|
|
32
|
+
/** Per-1K rate for cache-write (cache-creation) prompt tokens. Falls back to prompt rate when absent. */
|
|
33
|
+
cacheWriteCostPer1K?: number;
|
|
26
34
|
}
|
|
27
35
|
|
|
28
36
|
export interface CallLogEntry {
|
|
@@ -32,6 +40,10 @@ export interface CallLogEntry {
|
|
|
32
40
|
promptTokens: number;
|
|
33
41
|
completionTokens: number;
|
|
34
42
|
totalTokens: number;
|
|
43
|
+
/** Cache-read (cache-hit) subset of promptTokens, billed at the discounted rate. */
|
|
44
|
+
cachedPromptTokens?: number;
|
|
45
|
+
/** Cache-write subset of promptTokens (Anthropic). */
|
|
46
|
+
cacheWriteTokens?: number;
|
|
35
47
|
estimatedCost: number;
|
|
36
48
|
durationMs?: number;
|
|
37
49
|
source: 'local-log';
|
|
@@ -45,6 +57,9 @@ const ALMADAR_ROOT = process.env['ALMADAR_ROOT'] ?? process.cwd();
|
|
|
45
57
|
const PRICING_CACHE_PATH = join(ALMADAR_ROOT, '.llm-pricing-cache.json');
|
|
46
58
|
const CALL_LOG_PATH = join(ALMADAR_ROOT, '.llm-call-log.jsonl');
|
|
47
59
|
const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours
|
|
60
|
+
// Bump when the cached TokenCost shape changes so stale on-disk caches are
|
|
61
|
+
// invalidated on upgrade. v2 added cacheReadCostPer1K / cacheWriteCostPer1K.
|
|
62
|
+
const PRICING_CACHE_VERSION = 2;
|
|
48
63
|
|
|
49
64
|
/** Map from our local model name to OpenRouter model ID */
|
|
50
65
|
const MODEL_ID_MAP: Record<string, string> = {
|
|
@@ -67,6 +82,7 @@ const MODEL_ID_MAP: Record<string, string> = {
|
|
|
67
82
|
const FALLBACK_COSTS: Record<string, TokenCost> = {};
|
|
68
83
|
|
|
69
84
|
interface PricingCache {
|
|
85
|
+
version?: number;
|
|
70
86
|
fetchedAt: number;
|
|
71
87
|
models: Record<string, TokenCost>;
|
|
72
88
|
}
|
|
@@ -77,7 +93,7 @@ function loadCachedPricing(): PricingCache | null {
|
|
|
77
93
|
try {
|
|
78
94
|
const raw = readFileSync(PRICING_CACHE_PATH, 'utf-8');
|
|
79
95
|
const parsed = JSON.parse(raw) as PricingCache;
|
|
80
|
-
if (Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
|
|
96
|
+
if (parsed.version === PRICING_CACHE_VERSION && Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
|
|
81
97
|
return parsed;
|
|
82
98
|
}
|
|
83
99
|
} catch {
|
|
@@ -89,15 +105,30 @@ function loadCachedPricing(): PricingCache | null {
|
|
|
89
105
|
async function fetchPricingFromOpenRouter(): Promise<Record<string, TokenCost>> {
|
|
90
106
|
const res = await fetch('https://openrouter.ai/api/v1/models');
|
|
91
107
|
if (!res.ok) throw new Error(`OpenRouter models API: HTTP ${res.status}`);
|
|
92
|
-
const json = await res.json() as {
|
|
108
|
+
const json = await res.json() as {
|
|
109
|
+
data?: Array<{
|
|
110
|
+
id: string;
|
|
111
|
+
pricing?: {
|
|
112
|
+
prompt?: string;
|
|
113
|
+
completion?: string;
|
|
114
|
+
input_cache_read?: string;
|
|
115
|
+
input_cache_write?: string;
|
|
116
|
+
};
|
|
117
|
+
}>;
|
|
118
|
+
};
|
|
93
119
|
const models: Record<string, TokenCost> = {};
|
|
94
120
|
for (const m of json.data ?? []) {
|
|
95
121
|
const promptPerToken = parseFloat(m.pricing?.prompt ?? '0');
|
|
96
122
|
const completionPerToken = parseFloat(m.pricing?.completion ?? '0');
|
|
123
|
+
const cacheReadPerToken = parseFloat(m.pricing?.input_cache_read ?? '0');
|
|
124
|
+
const cacheWritePerToken = parseFloat(m.pricing?.input_cache_write ?? '0');
|
|
97
125
|
if (promptPerToken > 0 || completionPerToken > 0) {
|
|
98
126
|
models[m.id] = {
|
|
99
127
|
promptCostPer1K: promptPerToken * 1000,
|
|
100
128
|
completionCostPer1K: completionPerToken * 1000,
|
|
129
|
+
// 0 (field absent) → leave undefined so cost math falls back to the prompt rate.
|
|
130
|
+
...(cacheReadPerToken > 0 ? { cacheReadCostPer1K: cacheReadPerToken * 1000 } : {}),
|
|
131
|
+
...(cacheWritePerToken > 0 ? { cacheWriteCostPer1K: cacheWritePerToken * 1000 } : {}),
|
|
101
132
|
};
|
|
102
133
|
}
|
|
103
134
|
}
|
|
@@ -125,7 +156,7 @@ function getPricing(): Record<string, TokenCost> {
|
|
|
125
156
|
function refreshPricingCache(): void {
|
|
126
157
|
fetchPricingFromOpenRouter()
|
|
127
158
|
.then((models) => {
|
|
128
|
-
pricingCache = { fetchedAt: Date.now(), models };
|
|
159
|
+
pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
|
|
129
160
|
try {
|
|
130
161
|
mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
|
|
131
162
|
writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
|
|
@@ -164,22 +195,77 @@ export class TokenTracker {
|
|
|
164
195
|
completionTokens: 0,
|
|
165
196
|
totalTokens: 0,
|
|
166
197
|
callCount: 0,
|
|
198
|
+
cachedPromptTokens: 0,
|
|
199
|
+
cacheWriteTokens: 0,
|
|
167
200
|
};
|
|
168
201
|
|
|
202
|
+
/** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
|
|
203
|
+
private authoritativeCostUSD = 0;
|
|
204
|
+
/** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
|
|
205
|
+
private computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
|
|
206
|
+
|
|
169
207
|
constructor(model: string = 'claude-sonnet-4-5-20250929') {
|
|
170
208
|
this.model = model;
|
|
171
209
|
}
|
|
172
210
|
|
|
173
|
-
|
|
211
|
+
/** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
|
|
212
|
+
private costFor(promptTokens: number, completionTokens: number, cached: number, written: number): number {
|
|
213
|
+
const costs = getCostForModel(this.model);
|
|
214
|
+
const cacheReadRate = costs.cacheReadCostPer1K ?? costs.promptCostPer1K;
|
|
215
|
+
const cacheWriteRate = costs.cacheWriteCostPer1K ?? costs.promptCostPer1K;
|
|
216
|
+
const uncached = Math.max(0, promptTokens - cached - written);
|
|
217
|
+
return (
|
|
218
|
+
(uncached / 1000) * costs.promptCostPer1K +
|
|
219
|
+
(cached / 1000) * cacheReadRate +
|
|
220
|
+
(written / 1000) * cacheWriteRate +
|
|
221
|
+
(completionTokens / 1000) * costs.completionCostPer1K
|
|
222
|
+
);
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
/**
|
|
226
|
+
* Record one LLM call's usage. `promptTokens` is the TOTAL input count
|
|
227
|
+
* (cache reads + cache writes + uncached); `cachedPromptTokens` and
|
|
228
|
+
* `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
|
|
229
|
+
* pricier) rates. Providers that don't report cache detail pass 0, which
|
|
230
|
+
* reduces to the previous flat-rate behaviour.
|
|
231
|
+
*/
|
|
232
|
+
addUsage(
|
|
233
|
+
promptTokens: number,
|
|
234
|
+
completionTokens: number,
|
|
235
|
+
options?: {
|
|
236
|
+
provider?: string;
|
|
237
|
+
durationMs?: number;
|
|
238
|
+
cachedPromptTokens?: number;
|
|
239
|
+
cacheWriteTokens?: number;
|
|
240
|
+
/** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
|
|
241
|
+
costUSD?: number;
|
|
242
|
+
},
|
|
243
|
+
): void {
|
|
244
|
+
const cached = Math.min(promptTokens, Math.max(0, options?.cachedPromptTokens ?? 0));
|
|
245
|
+
const written = Math.min(promptTokens - cached, Math.max(0, options?.cacheWriteTokens ?? 0));
|
|
246
|
+
|
|
174
247
|
this.usage.promptTokens += promptTokens;
|
|
175
248
|
this.usage.completionTokens += completionTokens;
|
|
176
249
|
this.usage.totalTokens += promptTokens + completionTokens;
|
|
250
|
+
this.usage.cachedPromptTokens += cached;
|
|
251
|
+
this.usage.cacheWriteTokens += written;
|
|
177
252
|
this.usage.callCount++;
|
|
178
253
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
254
|
+
// Prefer the provider's authoritative cost (already cache- and routing-
|
|
255
|
+
// adjusted). Otherwise bucket the tokens and price them cache-aware so
|
|
256
|
+
// late-arriving pricing still applies retroactively to the estimate.
|
|
257
|
+
const authoritative = options?.costUSD;
|
|
258
|
+
let estimatedCost: number;
|
|
259
|
+
if (authoritative != null && Number.isFinite(authoritative)) {
|
|
260
|
+
this.authoritativeCostUSD += authoritative;
|
|
261
|
+
estimatedCost = authoritative;
|
|
262
|
+
} else {
|
|
263
|
+
this.computed.promptTokens += promptTokens;
|
|
264
|
+
this.computed.completionTokens += completionTokens;
|
|
265
|
+
this.computed.cachedPromptTokens += cached;
|
|
266
|
+
this.computed.cacheWriteTokens += written;
|
|
267
|
+
estimatedCost = this.costFor(promptTokens, completionTokens, cached, written);
|
|
268
|
+
}
|
|
183
269
|
|
|
184
270
|
const entry: CallLogEntry = {
|
|
185
271
|
timestamp: new Date().toISOString(),
|
|
@@ -188,6 +274,8 @@ export class TokenTracker {
|
|
|
188
274
|
promptTokens,
|
|
189
275
|
completionTokens,
|
|
190
276
|
totalTokens: promptTokens + completionTokens,
|
|
277
|
+
cachedPromptTokens: cached,
|
|
278
|
+
cacheWriteTokens: written,
|
|
191
279
|
estimatedCost,
|
|
192
280
|
durationMs: options?.durationMs,
|
|
193
281
|
source: 'local-log',
|
|
@@ -206,12 +294,15 @@ export class TokenTracker {
|
|
|
206
294
|
}
|
|
207
295
|
|
|
208
296
|
getEstimatedCost(): number {
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
297
|
+
return (
|
|
298
|
+
this.authoritativeCostUSD +
|
|
299
|
+
this.costFor(
|
|
300
|
+
this.computed.promptTokens,
|
|
301
|
+
this.computed.completionTokens,
|
|
302
|
+
this.computed.cachedPromptTokens,
|
|
303
|
+
this.computed.cacheWriteTokens,
|
|
304
|
+
)
|
|
305
|
+
);
|
|
215
306
|
}
|
|
216
307
|
|
|
217
308
|
getFormattedCost(): string {
|
|
@@ -239,7 +330,11 @@ export class TokenTracker {
|
|
|
239
330
|
completionTokens: 0,
|
|
240
331
|
totalTokens: 0,
|
|
241
332
|
callCount: 0,
|
|
333
|
+
cachedPromptTokens: 0,
|
|
334
|
+
cacheWriteTokens: 0,
|
|
242
335
|
};
|
|
336
|
+
this.authoritativeCostUSD = 0;
|
|
337
|
+
this.computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
|
|
243
338
|
}
|
|
244
339
|
|
|
245
340
|
setModel(model: string): void {
|
|
@@ -270,7 +365,7 @@ export function getCallLogPath(): string {
|
|
|
270
365
|
/** Force-refresh the pricing cache from OpenRouter. */
|
|
271
366
|
export async function refreshPricing(): Promise<void> {
|
|
272
367
|
const models = await fetchPricingFromOpenRouter();
|
|
273
|
-
pricingCache = { fetchedAt: Date.now(), models };
|
|
368
|
+
pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
|
|
274
369
|
mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
|
|
275
370
|
writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
|
|
276
371
|
}
|