@almadar/llm 2.20.0 → 2.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-DGW3YFPS.js → chunk-G2OE5TBE.js} +28 -9
- package/dist/chunk-G2OE5TBE.js.map +1 -0
- package/dist/{chunk-NO7P6EDT.js → chunk-IDXSWM57.js} +7 -3
- package/dist/{chunk-NO7P6EDT.js.map → chunk-IDXSWM57.js.map} +1 -1
- package/dist/{chunk-TGHGQB5I.js → chunk-SXSP6M24.js} +61 -12
- package/dist/chunk-SXSP6M24.js.map +1 -0
- package/dist/{client-BIq-gHZo.d.ts → client-Bfd-fT35.d.ts} +8 -2
- package/dist/client.d.ts +2 -2
- package/dist/client.js +2 -2
- package/dist/index.d.ts +3 -3
- package/dist/index.js +3 -3
- package/dist/{rate-limiter-BqWOhaXY.d.ts → rate-limiter-7-oOHrtX.d.ts} +22 -1
- package/dist/structured-output.d.ts +1 -1
- package/dist/structured-output.js +2 -2
- package/package.json +1 -1
- package/src/client.ts +52 -4
- package/src/structured-output.ts +7 -1
- package/src/token-tracker.ts +110 -15
- package/dist/chunk-DGW3YFPS.js.map +0 -1
- package/dist/chunk-TGHGQB5I.js.map +0 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { R as RateLimiterOptions, T as TokenUsage } from './rate-limiter-7-oOHrtX.js';
|
|
2
2
|
import { ChatOpenAI } from '@langchain/openai';
|
|
3
3
|
import { ChatAnthropic } from '@langchain/anthropic';
|
|
4
4
|
import { z } from 'zod';
|
|
@@ -90,6 +90,12 @@ interface LLMClientOptions {
|
|
|
90
90
|
* the call consuming the entire 300s).
|
|
91
91
|
*/
|
|
92
92
|
rawTimeoutMs?: number;
|
|
93
|
+
/**
|
|
94
|
+
* Explicit provider credentials. When set, used instead of reading the
|
|
95
|
+
* provider's env vars — lets callers (e.g. the CLI's account store) supply
|
|
96
|
+
* `apiKey`/`baseUrl`/`defaultModel` directly. Absent ⇒ env fallback.
|
|
97
|
+
*/
|
|
98
|
+
providerConfig?: ProviderConfig;
|
|
93
99
|
}
|
|
94
100
|
interface LLMCallOptions<T = unknown> {
|
|
95
101
|
systemPrompt: string;
|
|
@@ -400,4 +406,4 @@ declare function createOpenRouterClient(options?: Partial<Omit<LLMClientOptions,
|
|
|
400
406
|
*/
|
|
401
407
|
declare function createZhipuClient(options?: Partial<Omit<LLMClientOptions, 'provider'>>): LLMClient;
|
|
402
408
|
|
|
403
|
-
export { ANTHROPIC_MODELS as A, getAvailableProvider as B, type CacheAwareLLMCallOptions as C, DEEPSEEK_MODELS as D, getSharedLLMClient as E, isProviderAvailable as F, parseChatCompletionResponse as G, resetSharedLLMClient as H, KIMI_MODELS as K, type
|
|
409
|
+
export { ANTHROPIC_MODELS as A, getAvailableProvider as B, type CacheAwareLLMCallOptions as C, DEEPSEEK_MODELS as D, getSharedLLMClient as E, isProviderAvailable as F, parseChatCompletionResponse as G, resetSharedLLMClient as H, KIMI_MODELS as K, type LLMFinishReason as L, OPENAI_MODELS as O, type ProviderConfig as P, LLMClient as a, type CacheableBlock as b, type ChatCompletionChoice as c, type ChatCompletionMessage as d, type ChatCompletionResponse as e, type ChatCompletionRole as f, type ChatCompletionToolCall as g, type ChatCompletionToolDef as h, type ChatCompletionUsage as i, type LLMCallOptions as j, type LLMClientOptions as k, type LLMProvider as l, type LLMResponse as m, type LLMStreamChunk as n, type LLMStreamOptions as o, type LLMUsage as p, OPENROUTER_MODELS as q, createAnthropicClient as r, createCreativeClient as s, createDeepSeekClient as t, createFixClient as u, createKimiClient as v, createOpenAIClient as w, createOpenRouterClient as x, createRequirementsClient as y, createZhipuClient as z };
|
package/dist/client.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import './rate-limiter-
|
|
1
|
+
import './rate-limiter-7-oOHrtX.js';
|
|
2
2
|
import '@langchain/openai';
|
|
3
3
|
import '@langchain/anthropic';
|
|
4
4
|
import 'zod';
|
|
5
|
-
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions,
|
|
5
|
+
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, a as LLMClient, k as LLMClientOptions, L as LLMFinishReason, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, H as resetSharedLLMClient } from './client-Bfd-fT35.js';
|
|
6
6
|
import '@almadar/core';
|
package/dist/client.js
CHANGED
|
@@ -18,9 +18,9 @@ import {
|
|
|
18
18
|
getSharedLLMClient,
|
|
19
19
|
isProviderAvailable,
|
|
20
20
|
resetSharedLLMClient
|
|
21
|
-
} from "./chunk-
|
|
21
|
+
} from "./chunk-G2OE5TBE.js";
|
|
22
22
|
import "./chunk-P4VCT25B.js";
|
|
23
|
-
import "./chunk-
|
|
23
|
+
import "./chunk-SXSP6M24.js";
|
|
24
24
|
export {
|
|
25
25
|
ANTHROPIC_MODELS,
|
|
26
26
|
DEEPSEEK_MODELS,
|
package/dist/index.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions,
|
|
3
|
-
export {
|
|
1
|
+
import { L as LLMFinishReason, a as LLMClient } from './client-Bfd-fT35.js';
|
|
2
|
+
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, c as ChatCompletionChoice, d as ChatCompletionMessage, e as ChatCompletionResponse, f as ChatCompletionRole, g as ChatCompletionToolCall, h as ChatCompletionToolDef, i as ChatCompletionUsage, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, k as LLMClientOptions, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, G as parseChatCompletionResponse, H as resetSharedLLMClient } from './client-Bfd-fT35.js';
|
|
3
|
+
export { a as RateLimiter, R as RateLimiterOptions, b as TokenTracker, T as TokenUsage, g as getGlobalRateLimiter, c as getGlobalTokenTracker, r as resetGlobalRateLimiter, d as resetGlobalTokenTracker } from './rate-limiter-7-oOHrtX.js';
|
|
4
4
|
export { autoCloseJson, extractJsonFromText, isValidJson, parseJsonResponse, safeParseJson } from './json-parser.js';
|
|
5
5
|
import { z } from 'zod';
|
|
6
6
|
export { JsonSchema, STRUCTURED_OUTPUT_MODELS, StructuredGenerationOptions, StructuredGenerationResult, StructuredOutputClient, StructuredOutputOptions, getStructuredOutputClient, isStructuredOutputAvailable, resetStructuredOutputClient } from './structured-output.js';
|
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
isProviderAvailable,
|
|
20
20
|
parseChatCompletionResponse,
|
|
21
21
|
resetSharedLLMClient
|
|
22
|
-
} from "./chunk-
|
|
22
|
+
} from "./chunk-G2OE5TBE.js";
|
|
23
23
|
import {
|
|
24
24
|
autoCloseJson,
|
|
25
25
|
extractJsonFromText,
|
|
@@ -33,7 +33,7 @@ import {
|
|
|
33
33
|
getStructuredOutputClient,
|
|
34
34
|
isStructuredOutputAvailable,
|
|
35
35
|
resetStructuredOutputClient
|
|
36
|
-
} from "./chunk-
|
|
36
|
+
} from "./chunk-IDXSWM57.js";
|
|
37
37
|
import {
|
|
38
38
|
RateLimiter,
|
|
39
39
|
TokenTracker,
|
|
@@ -41,7 +41,7 @@ import {
|
|
|
41
41
|
getGlobalTokenTracker,
|
|
42
42
|
resetGlobalRateLimiter,
|
|
43
43
|
resetGlobalTokenTracker
|
|
44
|
-
} from "./chunk-
|
|
44
|
+
} from "./chunk-SXSP6M24.js";
|
|
45
45
|
import {
|
|
46
46
|
MasarError,
|
|
47
47
|
MasarProvider,
|
|
@@ -14,14 +14,35 @@ interface TokenUsage {
|
|
|
14
14
|
completionTokens: number;
|
|
15
15
|
totalTokens: number;
|
|
16
16
|
callCount: number;
|
|
17
|
+
/** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
|
|
18
|
+
cachedPromptTokens: number;
|
|
19
|
+
/** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
|
|
20
|
+
cacheWriteTokens: number;
|
|
17
21
|
}
|
|
18
22
|
declare class TokenTracker {
|
|
19
23
|
private model;
|
|
20
24
|
private usage;
|
|
25
|
+
/** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
|
|
26
|
+
private authoritativeCostUSD;
|
|
27
|
+
/** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
|
|
28
|
+
private computed;
|
|
21
29
|
constructor(model?: string);
|
|
30
|
+
/** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
|
|
31
|
+
private costFor;
|
|
32
|
+
/**
|
|
33
|
+
* Record one LLM call's usage. `promptTokens` is the TOTAL input count
|
|
34
|
+
* (cache reads + cache writes + uncached); `cachedPromptTokens` and
|
|
35
|
+
* `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
|
|
36
|
+
* pricier) rates. Providers that don't report cache detail pass 0, which
|
|
37
|
+
* reduces to the previous flat-rate behaviour.
|
|
38
|
+
*/
|
|
22
39
|
addUsage(promptTokens: number, completionTokens: number, options?: {
|
|
23
40
|
provider?: string;
|
|
24
41
|
durationMs?: number;
|
|
42
|
+
cachedPromptTokens?: number;
|
|
43
|
+
cacheWriteTokens?: number;
|
|
44
|
+
/** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
|
|
45
|
+
costUSD?: number;
|
|
25
46
|
}): void;
|
|
26
47
|
getSummary(): TokenUsage;
|
|
27
48
|
getEstimatedCost(): number;
|
|
@@ -99,4 +120,4 @@ declare class RateLimiter {
|
|
|
99
120
|
declare function getGlobalRateLimiter(options?: RateLimiterOptions): RateLimiter;
|
|
100
121
|
declare function resetGlobalRateLimiter(): void;
|
|
101
122
|
|
|
102
|
-
export {
|
|
123
|
+
export { type RateLimiterOptions as R, type TokenUsage as T, RateLimiter as a, TokenTracker as b, getGlobalTokenTracker as c, resetGlobalTokenTracker as d, getGlobalRateLimiter as g, resetGlobalRateLimiter as r };
|
|
@@ -4,8 +4,8 @@ import {
|
|
|
4
4
|
getStructuredOutputClient,
|
|
5
5
|
isStructuredOutputAvailable,
|
|
6
6
|
resetStructuredOutputClient
|
|
7
|
-
} from "./chunk-
|
|
8
|
-
import "./chunk-
|
|
7
|
+
} from "./chunk-IDXSWM57.js";
|
|
8
|
+
import "./chunk-SXSP6M24.js";
|
|
9
9
|
export {
|
|
10
10
|
STRUCTURED_OUTPUT_MODELS,
|
|
11
11
|
StructuredOutputClient,
|
package/package.json
CHANGED
package/src/client.ts
CHANGED
|
@@ -55,6 +55,28 @@ interface ModelKwargs {
|
|
|
55
55
|
max_completion_tokens?: number;
|
|
56
56
|
thinking?: { type: string };
|
|
57
57
|
tool_choice?: string;
|
|
58
|
+
/** OpenRouter: ask for detailed usage accounting (returns cached-token breakdown). */
|
|
59
|
+
usage?: { include: boolean };
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Pull the cache-read / cache-write token split out of a LangChain
|
|
64
|
+
* `usage_metadata` object. Every provider LangChain supports normalises
|
|
65
|
+
* its cache-hit tokens into `input_token_details` (OpenAI/OpenRouter/DeepSeek
|
|
66
|
+
* via `cached_tokens`→`cache_read`; Anthropic via `cache_read`/`cache_creation`),
|
|
67
|
+
* so one extractor covers them all. Returns zeros when the provider reports
|
|
68
|
+
* no cache detail — which prices identically to the old flat-rate path.
|
|
69
|
+
*/
|
|
70
|
+
function cacheTokensFromUsageMetadata(usageMeta: {
|
|
71
|
+
input_tokens?: number;
|
|
72
|
+
output_tokens?: number;
|
|
73
|
+
input_token_details?: { cache_read?: number; cache_creation?: number };
|
|
74
|
+
}): { cachedPromptTokens: number; cacheWriteTokens: number } {
|
|
75
|
+
const details = usageMeta.input_token_details ?? {};
|
|
76
|
+
return {
|
|
77
|
+
cachedPromptTokens: details.cache_read ?? 0,
|
|
78
|
+
cacheWriteTokens: details.cache_creation ?? 0,
|
|
79
|
+
};
|
|
58
80
|
}
|
|
59
81
|
|
|
60
82
|
/**
|
|
@@ -126,6 +148,12 @@ export interface LLMClientOptions {
|
|
|
126
148
|
* the call consuming the entire 300s).
|
|
127
149
|
*/
|
|
128
150
|
rawTimeoutMs?: number;
|
|
151
|
+
/**
|
|
152
|
+
* Explicit provider credentials. When set, used instead of reading the
|
|
153
|
+
* provider's env vars — lets callers (e.g. the CLI's account store) supply
|
|
154
|
+
* `apiKey`/`baseUrl`/`defaultModel` directly. Absent ⇒ env fallback.
|
|
155
|
+
*/
|
|
156
|
+
providerConfig?: ProviderConfig;
|
|
129
157
|
}
|
|
130
158
|
|
|
131
159
|
export interface LLMCallOptions<T = unknown> {
|
|
@@ -369,7 +397,7 @@ export class LLMClient {
|
|
|
369
397
|
(this.provider === 'kimi' ? 0.6 : DEFAULT_TEMPERATURE);
|
|
370
398
|
this.streaming = options.streaming ?? false;
|
|
371
399
|
|
|
372
|
-
this.providerConfig = PROVIDER_CONFIGS[this.provider]();
|
|
400
|
+
this.providerConfig = options.providerConfig ?? PROVIDER_CONFIGS[this.provider]();
|
|
373
401
|
this.modelName = options.model || this.providerConfig.defaultModel;
|
|
374
402
|
this.rawTimeoutMs = options.rawTimeoutMs ?? LLMClient.DEFAULT_RAW_TIMEOUT_MS;
|
|
375
403
|
|
|
@@ -492,6 +520,8 @@ export class LLMClient {
|
|
|
492
520
|
// OpenRouter (Qwen): explicit tool_choice so the model doesn't ignore tool definitions
|
|
493
521
|
if (this.provider === 'openrouter') {
|
|
494
522
|
modelKwargs.tool_choice = 'auto';
|
|
523
|
+
// Return the cached-token breakdown so cost is priced cache-aware.
|
|
524
|
+
modelKwargs.usage = { include: true };
|
|
495
525
|
}
|
|
496
526
|
|
|
497
527
|
return new ChatOpenAI({
|
|
@@ -637,7 +667,7 @@ export class LLMClient {
|
|
|
637
667
|
this.tokenTracker.addUsage(
|
|
638
668
|
usage.promptTokens,
|
|
639
669
|
usage.completionTokens,
|
|
640
|
-
{ provider: this.provider },
|
|
670
|
+
{ provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
|
|
641
671
|
);
|
|
642
672
|
}
|
|
643
673
|
}
|
|
@@ -843,6 +873,7 @@ export class LLMClient {
|
|
|
843
873
|
this.tokenTracker.addUsage(
|
|
844
874
|
usage.promptTokens,
|
|
845
875
|
usage.completionTokens,
|
|
876
|
+
{ provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
|
|
846
877
|
);
|
|
847
878
|
}
|
|
848
879
|
}
|
|
@@ -912,6 +943,7 @@ export class LLMClient {
|
|
|
912
943
|
this.tokenTracker.addUsage(
|
|
913
944
|
usage.promptTokens,
|
|
914
945
|
usage.completionTokens,
|
|
946
|
+
{ provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
|
|
915
947
|
);
|
|
916
948
|
}
|
|
917
949
|
}
|
|
@@ -989,6 +1021,8 @@ export class LLMClient {
|
|
|
989
1021
|
// consumers see one canonical field name.
|
|
990
1022
|
if (this.provider === 'openrouter') {
|
|
991
1023
|
body['reasoning'] = { enabled: true };
|
|
1024
|
+
// Return authoritative usage accounting (real cost + cached-token split).
|
|
1025
|
+
body['usage'] = { include: true };
|
|
992
1026
|
}
|
|
993
1027
|
|
|
994
1028
|
const startedAt = Date.now();
|
|
@@ -1064,7 +1098,17 @@ export class LLMClient {
|
|
|
1064
1098
|
totalTokens: parsed.usage.total_tokens,
|
|
1065
1099
|
};
|
|
1066
1100
|
if (this.tokenTracker) {
|
|
1067
|
-
|
|
1101
|
+
const rawUsage = parsed.usage as {
|
|
1102
|
+
prompt_tokens_details?: { cached_tokens?: number };
|
|
1103
|
+
cost?: number;
|
|
1104
|
+
};
|
|
1105
|
+
const cachedTokens = rawUsage.prompt_tokens_details?.cached_tokens ?? 0;
|
|
1106
|
+
this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
|
|
1107
|
+
provider: this.provider,
|
|
1108
|
+
cachedPromptTokens: cachedTokens,
|
|
1109
|
+
// OpenRouter returns the real, routing+cache-adjusted charge here.
|
|
1110
|
+
...(typeof rawUsage.cost === 'number' ? { costUSD: rawUsage.cost } : {}),
|
|
1111
|
+
});
|
|
1068
1112
|
}
|
|
1069
1113
|
}
|
|
1070
1114
|
|
|
@@ -1249,9 +1293,13 @@ export class LLMClient {
|
|
|
1249
1293
|
};
|
|
1250
1294
|
|
|
1251
1295
|
if (this.tokenTracker) {
|
|
1296
|
+
// Anthropic reports input_tokens as the UNCACHED count; cache reads
|
|
1297
|
+
// and writes are separate. Pass the true total so the tracker prices
|
|
1298
|
+
// each bucket at its own rate.
|
|
1252
1299
|
this.tokenTracker.addUsage(
|
|
1253
|
-
|
|
1300
|
+
apiUsage.input_tokens + cacheRead + cacheCreation,
|
|
1254
1301
|
usage.completionTokens,
|
|
1302
|
+
{ provider: this.provider, cachedPromptTokens: cacheRead, cacheWriteTokens: cacheCreation },
|
|
1255
1303
|
);
|
|
1256
1304
|
}
|
|
1257
1305
|
|
package/src/structured-output.ts
CHANGED
|
@@ -268,7 +268,13 @@ export class StructuredOutputClient {
|
|
|
268
268
|
};
|
|
269
269
|
|
|
270
270
|
if (this.tokenTracker) {
|
|
271
|
-
|
|
271
|
+
const cachedTokens =
|
|
272
|
+
(response.usage as { prompt_tokens_details?: { cached_tokens?: number } } | undefined)
|
|
273
|
+
?.prompt_tokens_details?.cached_tokens ?? 0;
|
|
274
|
+
this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
|
|
275
|
+
provider: 'structured-output',
|
|
276
|
+
cachedPromptTokens: cachedTokens,
|
|
277
|
+
});
|
|
272
278
|
}
|
|
273
279
|
|
|
274
280
|
console.log(
|
package/src/token-tracker.ts
CHANGED
|
@@ -18,11 +18,19 @@ export interface TokenUsage {
|
|
|
18
18
|
completionTokens: number;
|
|
19
19
|
totalTokens: number;
|
|
20
20
|
callCount: number;
|
|
21
|
+
/** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
|
|
22
|
+
cachedPromptTokens: number;
|
|
23
|
+
/** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
|
|
24
|
+
cacheWriteTokens: number;
|
|
21
25
|
}
|
|
22
26
|
|
|
23
27
|
export interface TokenCost {
|
|
24
28
|
promptCostPer1K: number;
|
|
25
29
|
completionCostPer1K: number;
|
|
30
|
+
/** Per-1K rate for cache-read (cache-hit) prompt tokens. Falls back to prompt rate when absent. */
|
|
31
|
+
cacheReadCostPer1K?: number;
|
|
32
|
+
/** Per-1K rate for cache-write (cache-creation) prompt tokens. Falls back to prompt rate when absent. */
|
|
33
|
+
cacheWriteCostPer1K?: number;
|
|
26
34
|
}
|
|
27
35
|
|
|
28
36
|
export interface CallLogEntry {
|
|
@@ -32,6 +40,10 @@ export interface CallLogEntry {
|
|
|
32
40
|
promptTokens: number;
|
|
33
41
|
completionTokens: number;
|
|
34
42
|
totalTokens: number;
|
|
43
|
+
/** Cache-read (cache-hit) subset of promptTokens, billed at the discounted rate. */
|
|
44
|
+
cachedPromptTokens?: number;
|
|
45
|
+
/** Cache-write subset of promptTokens (Anthropic). */
|
|
46
|
+
cacheWriteTokens?: number;
|
|
35
47
|
estimatedCost: number;
|
|
36
48
|
durationMs?: number;
|
|
37
49
|
source: 'local-log';
|
|
@@ -45,6 +57,9 @@ const ALMADAR_ROOT = process.env['ALMADAR_ROOT'] ?? process.cwd();
|
|
|
45
57
|
const PRICING_CACHE_PATH = join(ALMADAR_ROOT, '.llm-pricing-cache.json');
|
|
46
58
|
const CALL_LOG_PATH = join(ALMADAR_ROOT, '.llm-call-log.jsonl');
|
|
47
59
|
const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours
|
|
60
|
+
// Bump when the cached TokenCost shape changes so stale on-disk caches are
|
|
61
|
+
// invalidated on upgrade. v2 added cacheReadCostPer1K / cacheWriteCostPer1K.
|
|
62
|
+
const PRICING_CACHE_VERSION = 2;
|
|
48
63
|
|
|
49
64
|
/** Map from our local model name to OpenRouter model ID */
|
|
50
65
|
const MODEL_ID_MAP: Record<string, string> = {
|
|
@@ -67,6 +82,7 @@ const MODEL_ID_MAP: Record<string, string> = {
|
|
|
67
82
|
const FALLBACK_COSTS: Record<string, TokenCost> = {};
|
|
68
83
|
|
|
69
84
|
interface PricingCache {
|
|
85
|
+
version?: number;
|
|
70
86
|
fetchedAt: number;
|
|
71
87
|
models: Record<string, TokenCost>;
|
|
72
88
|
}
|
|
@@ -77,7 +93,7 @@ function loadCachedPricing(): PricingCache | null {
|
|
|
77
93
|
try {
|
|
78
94
|
const raw = readFileSync(PRICING_CACHE_PATH, 'utf-8');
|
|
79
95
|
const parsed = JSON.parse(raw) as PricingCache;
|
|
80
|
-
if (Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
|
|
96
|
+
if (parsed.version === PRICING_CACHE_VERSION && Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
|
|
81
97
|
return parsed;
|
|
82
98
|
}
|
|
83
99
|
} catch {
|
|
@@ -89,15 +105,30 @@ function loadCachedPricing(): PricingCache | null {
|
|
|
89
105
|
async function fetchPricingFromOpenRouter(): Promise<Record<string, TokenCost>> {
|
|
90
106
|
const res = await fetch('https://openrouter.ai/api/v1/models');
|
|
91
107
|
if (!res.ok) throw new Error(`OpenRouter models API: HTTP ${res.status}`);
|
|
92
|
-
const json = await res.json() as {
|
|
108
|
+
const json = await res.json() as {
|
|
109
|
+
data?: Array<{
|
|
110
|
+
id: string;
|
|
111
|
+
pricing?: {
|
|
112
|
+
prompt?: string;
|
|
113
|
+
completion?: string;
|
|
114
|
+
input_cache_read?: string;
|
|
115
|
+
input_cache_write?: string;
|
|
116
|
+
};
|
|
117
|
+
}>;
|
|
118
|
+
};
|
|
93
119
|
const models: Record<string, TokenCost> = {};
|
|
94
120
|
for (const m of json.data ?? []) {
|
|
95
121
|
const promptPerToken = parseFloat(m.pricing?.prompt ?? '0');
|
|
96
122
|
const completionPerToken = parseFloat(m.pricing?.completion ?? '0');
|
|
123
|
+
const cacheReadPerToken = parseFloat(m.pricing?.input_cache_read ?? '0');
|
|
124
|
+
const cacheWritePerToken = parseFloat(m.pricing?.input_cache_write ?? '0');
|
|
97
125
|
if (promptPerToken > 0 || completionPerToken > 0) {
|
|
98
126
|
models[m.id] = {
|
|
99
127
|
promptCostPer1K: promptPerToken * 1000,
|
|
100
128
|
completionCostPer1K: completionPerToken * 1000,
|
|
129
|
+
// 0 (field absent) → leave undefined so cost math falls back to the prompt rate.
|
|
130
|
+
...(cacheReadPerToken > 0 ? { cacheReadCostPer1K: cacheReadPerToken * 1000 } : {}),
|
|
131
|
+
...(cacheWritePerToken > 0 ? { cacheWriteCostPer1K: cacheWritePerToken * 1000 } : {}),
|
|
101
132
|
};
|
|
102
133
|
}
|
|
103
134
|
}
|
|
@@ -125,7 +156,7 @@ function getPricing(): Record<string, TokenCost> {
|
|
|
125
156
|
function refreshPricingCache(): void {
|
|
126
157
|
fetchPricingFromOpenRouter()
|
|
127
158
|
.then((models) => {
|
|
128
|
-
pricingCache = { fetchedAt: Date.now(), models };
|
|
159
|
+
pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
|
|
129
160
|
try {
|
|
130
161
|
mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
|
|
131
162
|
writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
|
|
@@ -164,22 +195,77 @@ export class TokenTracker {
|
|
|
164
195
|
completionTokens: 0,
|
|
165
196
|
totalTokens: 0,
|
|
166
197
|
callCount: 0,
|
|
198
|
+
cachedPromptTokens: 0,
|
|
199
|
+
cacheWriteTokens: 0,
|
|
167
200
|
};
|
|
168
201
|
|
|
202
|
+
/** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
|
|
203
|
+
private authoritativeCostUSD = 0;
|
|
204
|
+
/** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
|
|
205
|
+
private computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
|
|
206
|
+
|
|
169
207
|
constructor(model: string = 'claude-sonnet-4-5-20250929') {
|
|
170
208
|
this.model = model;
|
|
171
209
|
}
|
|
172
210
|
|
|
173
|
-
|
|
211
|
+
/** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
|
|
212
|
+
private costFor(promptTokens: number, completionTokens: number, cached: number, written: number): number {
|
|
213
|
+
const costs = getCostForModel(this.model);
|
|
214
|
+
const cacheReadRate = costs.cacheReadCostPer1K ?? costs.promptCostPer1K;
|
|
215
|
+
const cacheWriteRate = costs.cacheWriteCostPer1K ?? costs.promptCostPer1K;
|
|
216
|
+
const uncached = Math.max(0, promptTokens - cached - written);
|
|
217
|
+
return (
|
|
218
|
+
(uncached / 1000) * costs.promptCostPer1K +
|
|
219
|
+
(cached / 1000) * cacheReadRate +
|
|
220
|
+
(written / 1000) * cacheWriteRate +
|
|
221
|
+
(completionTokens / 1000) * costs.completionCostPer1K
|
|
222
|
+
);
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
/**
|
|
226
|
+
* Record one LLM call's usage. `promptTokens` is the TOTAL input count
|
|
227
|
+
* (cache reads + cache writes + uncached); `cachedPromptTokens` and
|
|
228
|
+
* `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
|
|
229
|
+
* pricier) rates. Providers that don't report cache detail pass 0, which
|
|
230
|
+
* reduces to the previous flat-rate behaviour.
|
|
231
|
+
*/
|
|
232
|
+
addUsage(
|
|
233
|
+
promptTokens: number,
|
|
234
|
+
completionTokens: number,
|
|
235
|
+
options?: {
|
|
236
|
+
provider?: string;
|
|
237
|
+
durationMs?: number;
|
|
238
|
+
cachedPromptTokens?: number;
|
|
239
|
+
cacheWriteTokens?: number;
|
|
240
|
+
/** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
|
|
241
|
+
costUSD?: number;
|
|
242
|
+
},
|
|
243
|
+
): void {
|
|
244
|
+
const cached = Math.min(promptTokens, Math.max(0, options?.cachedPromptTokens ?? 0));
|
|
245
|
+
const written = Math.min(promptTokens - cached, Math.max(0, options?.cacheWriteTokens ?? 0));
|
|
246
|
+
|
|
174
247
|
this.usage.promptTokens += promptTokens;
|
|
175
248
|
this.usage.completionTokens += completionTokens;
|
|
176
249
|
this.usage.totalTokens += promptTokens + completionTokens;
|
|
250
|
+
this.usage.cachedPromptTokens += cached;
|
|
251
|
+
this.usage.cacheWriteTokens += written;
|
|
177
252
|
this.usage.callCount++;
|
|
178
253
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
254
|
+
// Prefer the provider's authoritative cost (already cache- and routing-
|
|
255
|
+
// adjusted). Otherwise bucket the tokens and price them cache-aware so
|
|
256
|
+
// late-arriving pricing still applies retroactively to the estimate.
|
|
257
|
+
const authoritative = options?.costUSD;
|
|
258
|
+
let estimatedCost: number;
|
|
259
|
+
if (authoritative != null && Number.isFinite(authoritative)) {
|
|
260
|
+
this.authoritativeCostUSD += authoritative;
|
|
261
|
+
estimatedCost = authoritative;
|
|
262
|
+
} else {
|
|
263
|
+
this.computed.promptTokens += promptTokens;
|
|
264
|
+
this.computed.completionTokens += completionTokens;
|
|
265
|
+
this.computed.cachedPromptTokens += cached;
|
|
266
|
+
this.computed.cacheWriteTokens += written;
|
|
267
|
+
estimatedCost = this.costFor(promptTokens, completionTokens, cached, written);
|
|
268
|
+
}
|
|
183
269
|
|
|
184
270
|
const entry: CallLogEntry = {
|
|
185
271
|
timestamp: new Date().toISOString(),
|
|
@@ -188,6 +274,8 @@ export class TokenTracker {
|
|
|
188
274
|
promptTokens,
|
|
189
275
|
completionTokens,
|
|
190
276
|
totalTokens: promptTokens + completionTokens,
|
|
277
|
+
cachedPromptTokens: cached,
|
|
278
|
+
cacheWriteTokens: written,
|
|
191
279
|
estimatedCost,
|
|
192
280
|
durationMs: options?.durationMs,
|
|
193
281
|
source: 'local-log',
|
|
@@ -206,12 +294,15 @@ export class TokenTracker {
|
|
|
206
294
|
}
|
|
207
295
|
|
|
208
296
|
getEstimatedCost(): number {
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
297
|
+
return (
|
|
298
|
+
this.authoritativeCostUSD +
|
|
299
|
+
this.costFor(
|
|
300
|
+
this.computed.promptTokens,
|
|
301
|
+
this.computed.completionTokens,
|
|
302
|
+
this.computed.cachedPromptTokens,
|
|
303
|
+
this.computed.cacheWriteTokens,
|
|
304
|
+
)
|
|
305
|
+
);
|
|
215
306
|
}
|
|
216
307
|
|
|
217
308
|
getFormattedCost(): string {
|
|
@@ -239,7 +330,11 @@ export class TokenTracker {
|
|
|
239
330
|
completionTokens: 0,
|
|
240
331
|
totalTokens: 0,
|
|
241
332
|
callCount: 0,
|
|
333
|
+
cachedPromptTokens: 0,
|
|
334
|
+
cacheWriteTokens: 0,
|
|
242
335
|
};
|
|
336
|
+
this.authoritativeCostUSD = 0;
|
|
337
|
+
this.computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
|
|
243
338
|
}
|
|
244
339
|
|
|
245
340
|
setModel(model: string): void {
|
|
@@ -270,7 +365,7 @@ export function getCallLogPath(): string {
|
|
|
270
365
|
/** Force-refresh the pricing cache from OpenRouter. */
|
|
271
366
|
export async function refreshPricing(): Promise<void> {
|
|
272
367
|
const models = await fetchPricingFromOpenRouter();
|
|
273
|
-
pricingCache = { fetchedAt: Date.now(), models };
|
|
368
|
+
pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
|
|
274
369
|
mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
|
|
275
370
|
writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
|
|
276
371
|
}
|