@almadar/llm 2.19.0 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-4V7BB3ZQ.js → chunk-44QBGRL3.js} +42 -8
- package/dist/chunk-44QBGRL3.js.map +1 -0
- package/dist/{chunk-NO7P6EDT.js → chunk-IDXSWM57.js} +7 -3
- package/dist/{chunk-NO7P6EDT.js.map → chunk-IDXSWM57.js.map} +1 -1
- package/dist/{chunk-TGHGQB5I.js → chunk-SXSP6M24.js} +61 -12
- package/dist/chunk-SXSP6M24.js.map +1 -0
- package/dist/{client-C478gp_H.d.ts → client-BTqaooer.d.ts} +3 -3
- package/dist/client.d.ts +2 -2
- package/dist/client.js +2 -2
- package/dist/index.d.ts +3 -3
- package/dist/index.js +3 -3
- package/dist/{rate-limiter-BqWOhaXY.d.ts → rate-limiter-7-oOHrtX.d.ts} +22 -1
- package/dist/structured-output.d.ts +1 -1
- package/dist/structured-output.js +2 -2
- package/package.json +1 -1
- package/src/client.ts +63 -4
- package/src/structured-output.ts +7 -1
- package/src/token-tracker.ts +110 -15
- package/dist/chunk-4V7BB3ZQ.js.map +0 -1
- package/dist/chunk-TGHGQB5I.js.map +0 -1
|
package/dist/{client-C478gp_H.d.ts → client-BTqaooer.d.ts}
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { R as RateLimiterOptions, T as TokenUsage } from './rate-limiter-7-oOHrtX.js';
|
|
2
2
|
import { ChatOpenAI } from '@langchain/openai';
|
|
3
3
|
import { ChatAnthropic } from '@langchain/anthropic';
|
|
4
4
|
import { z } from 'zod';
|
|
@@ -67,7 +67,7 @@ interface ChatCompletionResponse {
|
|
|
67
67
|
declare function parseChatCompletionResponse(raw: string): ChatCompletionResponse;
|
|
68
68
|
|
|
69
69
|
type ChatModel = ChatOpenAI | ChatAnthropic;
|
|
70
|
-
type LLMProvider = 'openai' | 'deepseek' | 'anthropic' | 'kimi' | 'openrouter' | 'orbgen';
|
|
70
|
+
type LLMProvider = 'openai' | 'deepseek' | 'anthropic' | 'kimi' | 'openrouter' | 'orbgen' | 'masar';
|
|
71
71
|
interface ProviderConfig {
|
|
72
72
|
apiKey: string;
|
|
73
73
|
baseUrl?: string;
|
|
@@ -400,4 +400,4 @@ declare function createOpenRouterClient(options?: Partial<Omit<LLMClientOptions,
|
|
|
400
400
|
*/
|
|
401
401
|
declare function createZhipuClient(options?: Partial<Omit<LLMClientOptions, 'provider'>>): LLMClient;
|
|
402
402
|
|
|
403
|
-
export { ANTHROPIC_MODELS as A, getAvailableProvider as B, type CacheAwareLLMCallOptions as C, DEEPSEEK_MODELS as D, getSharedLLMClient as E, isProviderAvailable as F, parseChatCompletionResponse as G, resetSharedLLMClient as H, KIMI_MODELS as K, type
|
|
403
|
+
export { ANTHROPIC_MODELS as A, getAvailableProvider as B, type CacheAwareLLMCallOptions as C, DEEPSEEK_MODELS as D, getSharedLLMClient as E, isProviderAvailable as F, parseChatCompletionResponse as G, resetSharedLLMClient as H, KIMI_MODELS as K, type LLMFinishReason as L, OPENAI_MODELS as O, type ProviderConfig as P, LLMClient as a, type CacheableBlock as b, type ChatCompletionChoice as c, type ChatCompletionMessage as d, type ChatCompletionResponse as e, type ChatCompletionRole as f, type ChatCompletionToolCall as g, type ChatCompletionToolDef as h, type ChatCompletionUsage as i, type LLMCallOptions as j, type LLMClientOptions as k, type LLMProvider as l, type LLMResponse as m, type LLMStreamChunk as n, type LLMStreamOptions as o, type LLMUsage as p, OPENROUTER_MODELS as q, createAnthropicClient as r, createCreativeClient as s, createDeepSeekClient as t, createFixClient as u, createKimiClient as v, createOpenAIClient as w, createOpenRouterClient as x, createRequirementsClient as y, createZhipuClient as z };
|
package/dist/client.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import './rate-limiter-BqWOhaXY.js';
|
|
1
|
+
import './rate-limiter-7-oOHrtX.js';
|
|
2
2
|
import '@langchain/openai';
|
|
3
3
|
import '@langchain/anthropic';
|
|
4
4
|
import 'zod';
|
|
5
|
-
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions,
|
|
5
|
+
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, a as LLMClient, k as LLMClientOptions, L as LLMFinishReason, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, H as resetSharedLLMClient } from './client-BTqaooer.js';
|
|
6
6
|
import '@almadar/core';
|
package/dist/client.js
CHANGED
|
@@ -18,9 +18,9 @@ import {
|
|
|
18
18
|
getSharedLLMClient,
|
|
19
19
|
isProviderAvailable,
|
|
20
20
|
resetSharedLLMClient
|
|
21
|
-
} from "./chunk-4V7BB3ZQ.js";
|
|
21
|
+
} from "./chunk-44QBGRL3.js";
|
|
22
22
|
import "./chunk-P4VCT25B.js";
|
|
23
|
-
import "./chunk-TGHGQB5I.js";
|
|
23
|
+
import "./chunk-SXSP6M24.js";
|
|
24
24
|
export {
|
|
25
25
|
ANTHROPIC_MODELS,
|
|
26
26
|
DEEPSEEK_MODELS,
|
package/dist/index.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions,
|
|
3
|
-
export {
|
|
1
|
+
import { L as LLMFinishReason, a as LLMClient } from './client-BTqaooer.js';
|
|
2
|
+
export { A as ANTHROPIC_MODELS, C as CacheAwareLLMCallOptions, b as CacheableBlock, c as ChatCompletionChoice, d as ChatCompletionMessage, e as ChatCompletionResponse, f as ChatCompletionRole, g as ChatCompletionToolCall, h as ChatCompletionToolDef, i as ChatCompletionUsage, D as DEEPSEEK_MODELS, K as KIMI_MODELS, j as LLMCallOptions, k as LLMClientOptions, l as LLMProvider, m as LLMResponse, n as LLMStreamChunk, o as LLMStreamOptions, p as LLMUsage, O as OPENAI_MODELS, q as OPENROUTER_MODELS, P as ProviderConfig, r as createAnthropicClient, s as createCreativeClient, t as createDeepSeekClient, u as createFixClient, v as createKimiClient, w as createOpenAIClient, x as createOpenRouterClient, y as createRequirementsClient, z as createZhipuClient, B as getAvailableProvider, E as getSharedLLMClient, F as isProviderAvailable, G as parseChatCompletionResponse, H as resetSharedLLMClient } from './client-BTqaooer.js';
|
|
3
|
+
export { a as RateLimiter, R as RateLimiterOptions, b as TokenTracker, T as TokenUsage, g as getGlobalRateLimiter, c as getGlobalTokenTracker, r as resetGlobalRateLimiter, d as resetGlobalTokenTracker } from './rate-limiter-7-oOHrtX.js';
|
|
4
4
|
export { autoCloseJson, extractJsonFromText, isValidJson, parseJsonResponse, safeParseJson } from './json-parser.js';
|
|
5
5
|
import { z } from 'zod';
|
|
6
6
|
export { JsonSchema, STRUCTURED_OUTPUT_MODELS, StructuredGenerationOptions, StructuredGenerationResult, StructuredOutputClient, StructuredOutputOptions, getStructuredOutputClient, isStructuredOutputAvailable, resetStructuredOutputClient } from './structured-output.js';
|
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
isProviderAvailable,
|
|
20
20
|
parseChatCompletionResponse,
|
|
21
21
|
resetSharedLLMClient
|
|
22
|
-
} from "./chunk-4V7BB3ZQ.js";
|
|
22
|
+
} from "./chunk-44QBGRL3.js";
|
|
23
23
|
import {
|
|
24
24
|
autoCloseJson,
|
|
25
25
|
extractJsonFromText,
|
|
@@ -33,7 +33,7 @@ import {
|
|
|
33
33
|
getStructuredOutputClient,
|
|
34
34
|
isStructuredOutputAvailable,
|
|
35
35
|
resetStructuredOutputClient
|
|
36
|
-
} from "./chunk-NO7P6EDT.js";
|
|
36
|
+
} from "./chunk-IDXSWM57.js";
|
|
37
37
|
import {
|
|
38
38
|
RateLimiter,
|
|
39
39
|
TokenTracker,
|
|
@@ -41,7 +41,7 @@ import {
|
|
|
41
41
|
getGlobalTokenTracker,
|
|
42
42
|
resetGlobalRateLimiter,
|
|
43
43
|
resetGlobalTokenTracker
|
|
44
|
-
} from "./chunk-TGHGQB5I.js";
|
|
44
|
+
} from "./chunk-SXSP6M24.js";
|
|
45
45
|
import {
|
|
46
46
|
MasarError,
|
|
47
47
|
MasarProvider,
|
|
package/dist/{rate-limiter-BqWOhaXY.d.ts → rate-limiter-7-oOHrtX.d.ts}
CHANGED
|
@@ -14,14 +14,35 @@ interface TokenUsage {
|
|
|
14
14
|
completionTokens: number;
|
|
15
15
|
totalTokens: number;
|
|
16
16
|
callCount: number;
|
|
17
|
+
/** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
|
|
18
|
+
cachedPromptTokens: number;
|
|
19
|
+
/** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
|
|
20
|
+
cacheWriteTokens: number;
|
|
17
21
|
}
|
|
18
22
|
declare class TokenTracker {
|
|
19
23
|
private model;
|
|
20
24
|
private usage;
|
|
25
|
+
/** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
|
|
26
|
+
private authoritativeCostUSD;
|
|
27
|
+
/** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
|
|
28
|
+
private computed;
|
|
21
29
|
constructor(model?: string);
|
|
30
|
+
/** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
|
|
31
|
+
private costFor;
|
|
32
|
+
/**
|
|
33
|
+
* Record one LLM call's usage. `promptTokens` is the TOTAL input count
|
|
34
|
+
* (cache reads + cache writes + uncached); `cachedPromptTokens` and
|
|
35
|
+
* `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
|
|
36
|
+
* pricier) rates. Providers that don't report cache detail pass 0, which
|
|
37
|
+
* reduces to the previous flat-rate behaviour.
|
|
38
|
+
*/
|
|
22
39
|
addUsage(promptTokens: number, completionTokens: number, options?: {
|
|
23
40
|
provider?: string;
|
|
24
41
|
durationMs?: number;
|
|
42
|
+
cachedPromptTokens?: number;
|
|
43
|
+
cacheWriteTokens?: number;
|
|
44
|
+
/** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
|
|
45
|
+
costUSD?: number;
|
|
25
46
|
}): void;
|
|
26
47
|
getSummary(): TokenUsage;
|
|
27
48
|
getEstimatedCost(): number;
|
|
@@ -99,4 +120,4 @@ declare class RateLimiter {
|
|
|
99
120
|
declare function getGlobalRateLimiter(options?: RateLimiterOptions): RateLimiter;
|
|
100
121
|
declare function resetGlobalRateLimiter(): void;
|
|
101
122
|
|
|
102
|
-
export {
|
|
123
|
+
export { type RateLimiterOptions as R, type TokenUsage as T, RateLimiter as a, TokenTracker as b, getGlobalTokenTracker as c, resetGlobalTokenTracker as d, getGlobalRateLimiter as g, resetGlobalRateLimiter as r };
|
|
package/dist/structured-output.js
CHANGED
|
@@ -4,8 +4,8 @@ import {
|
|
|
4
4
|
getStructuredOutputClient,
|
|
5
5
|
isStructuredOutputAvailable,
|
|
6
6
|
resetStructuredOutputClient
|
|
7
|
-
} from "./chunk-NO7P6EDT.js";
|
|
8
|
-
import "./chunk-TGHGQB5I.js";
|
|
7
|
+
} from "./chunk-IDXSWM57.js";
|
|
8
|
+
import "./chunk-SXSP6M24.js";
|
|
9
9
|
export {
|
|
10
10
|
STRUCTURED_OUTPUT_MODELS,
|
|
11
11
|
StructuredOutputClient,
|
package/package.json
CHANGED
package/src/client.ts
CHANGED
|
@@ -55,6 +55,28 @@ interface ModelKwargs {
|
|
|
55
55
|
max_completion_tokens?: number;
|
|
56
56
|
thinking?: { type: string };
|
|
57
57
|
tool_choice?: string;
|
|
58
|
+
/** OpenRouter: ask for detailed usage accounting (returns cached-token breakdown). */
|
|
59
|
+
usage?: { include: boolean };
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Pull the cache-read / cache-write token split out of a LangChain
|
|
64
|
+
* `usage_metadata` object. Every provider LangChain supports normalises
|
|
65
|
+
* its cache-hit tokens into `input_token_details` (OpenAI/OpenRouter/DeepSeek
|
|
66
|
+
* via `cached_tokens`→`cache_read`; Anthropic via `cache_read`/`cache_creation`),
|
|
67
|
+
* so one extractor covers them all. Returns zeros when the provider reports
|
|
68
|
+
* no cache detail — which prices identically to the old flat-rate path.
|
|
69
|
+
*/
|
|
70
|
+
function cacheTokensFromUsageMetadata(usageMeta: {
|
|
71
|
+
input_tokens?: number;
|
|
72
|
+
output_tokens?: number;
|
|
73
|
+
input_token_details?: { cache_read?: number; cache_creation?: number };
|
|
74
|
+
}): { cachedPromptTokens: number; cacheWriteTokens: number } {
|
|
75
|
+
const details = usageMeta.input_token_details ?? {};
|
|
76
|
+
return {
|
|
77
|
+
cachedPromptTokens: details.cache_read ?? 0,
|
|
78
|
+
cacheWriteTokens: details.cache_creation ?? 0,
|
|
79
|
+
};
|
|
58
80
|
}
|
|
59
81
|
|
|
60
82
|
/**
|
|
@@ -101,7 +123,7 @@ type ChatModel = ChatOpenAI | ChatAnthropic;
|
|
|
101
123
|
// Types
|
|
102
124
|
// ============================================================================
|
|
103
125
|
|
|
104
|
-
export type LLMProvider = 'openai' | 'deepseek' | 'anthropic' | 'kimi' | 'openrouter' | 'orbgen';
|
|
126
|
+
export type LLMProvider = 'openai' | 'deepseek' | 'anthropic' | 'kimi' | 'openrouter' | 'orbgen' | 'masar';
|
|
105
127
|
|
|
106
128
|
export interface ProviderConfig {
|
|
107
129
|
apiKey: string;
|
|
@@ -269,6 +291,21 @@ const PROVIDER_CONFIGS: Record<LLMProvider, () => ProviderConfig> = {
|
|
|
269
291
|
defaultModel: 'orbgen-v2',
|
|
270
292
|
};
|
|
271
293
|
},
|
|
294
|
+
masar: () => {
|
|
295
|
+
// Fine-tuned masar subagent served OpenAI-compatibly (vLLM on Cloud Run GPU).
|
|
296
|
+
const baseUrl = process.env.MASAR_SUBAGENT_URL;
|
|
297
|
+
if (!baseUrl) {
|
|
298
|
+
throw new Error(
|
|
299
|
+
'MASAR_SUBAGENT_URL environment variable is not set. ' +
|
|
300
|
+
'Set it to the masar subagent endpoint (e.g., https://masar-subagent-xxx.run.app)',
|
|
301
|
+
);
|
|
302
|
+
}
|
|
303
|
+
return {
|
|
304
|
+
apiKey: process.env.MASAR_SUBAGENT_API_KEY ?? 'not-needed',
|
|
305
|
+
baseUrl: `${baseUrl}/v1`,
|
|
306
|
+
defaultModel: process.env.MASAR_SUBAGENT_MODEL ?? 'subagent',
|
|
307
|
+
};
|
|
308
|
+
},
|
|
272
309
|
};
|
|
273
310
|
|
|
274
311
|
export const DEEPSEEK_MODELS = {
|
|
@@ -477,6 +514,8 @@ export class LLMClient {
|
|
|
477
514
|
// OpenRouter (Qwen): explicit tool_choice so the model doesn't ignore tool definitions
|
|
478
515
|
if (this.provider === 'openrouter') {
|
|
479
516
|
modelKwargs.tool_choice = 'auto';
|
|
517
|
+
// Return the cached-token breakdown so cost is priced cache-aware.
|
|
518
|
+
modelKwargs.usage = { include: true };
|
|
480
519
|
}
|
|
481
520
|
|
|
482
521
|
return new ChatOpenAI({
|
|
@@ -622,7 +661,7 @@ export class LLMClient {
|
|
|
622
661
|
this.tokenTracker.addUsage(
|
|
623
662
|
usage.promptTokens,
|
|
624
663
|
usage.completionTokens,
|
|
625
|
-
{ provider: this.provider },
|
|
664
|
+
{ provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
|
|
626
665
|
);
|
|
627
666
|
}
|
|
628
667
|
}
|
|
@@ -828,6 +867,7 @@ export class LLMClient {
|
|
|
828
867
|
this.tokenTracker.addUsage(
|
|
829
868
|
usage.promptTokens,
|
|
830
869
|
usage.completionTokens,
|
|
870
|
+
{ provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
|
|
831
871
|
);
|
|
832
872
|
}
|
|
833
873
|
}
|
|
@@ -897,6 +937,7 @@ export class LLMClient {
|
|
|
897
937
|
this.tokenTracker.addUsage(
|
|
898
938
|
usage.promptTokens,
|
|
899
939
|
usage.completionTokens,
|
|
940
|
+
{ provider: this.provider, ...cacheTokensFromUsageMetadata(usageMeta) },
|
|
900
941
|
);
|
|
901
942
|
}
|
|
902
943
|
}
|
|
@@ -974,6 +1015,8 @@ export class LLMClient {
|
|
|
974
1015
|
// consumers see one canonical field name.
|
|
975
1016
|
if (this.provider === 'openrouter') {
|
|
976
1017
|
body['reasoning'] = { enabled: true };
|
|
1018
|
+
// Return authoritative usage accounting (real cost + cached-token split).
|
|
1019
|
+
body['usage'] = { include: true };
|
|
977
1020
|
}
|
|
978
1021
|
|
|
979
1022
|
const startedAt = Date.now();
|
|
@@ -1049,7 +1092,17 @@ export class LLMClient {
|
|
|
1049
1092
|
totalTokens: parsed.usage.total_tokens,
|
|
1050
1093
|
};
|
|
1051
1094
|
if (this.tokenTracker) {
|
|
1052
|
-
|
|
1095
|
+
const rawUsage = parsed.usage as {
|
|
1096
|
+
prompt_tokens_details?: { cached_tokens?: number };
|
|
1097
|
+
cost?: number;
|
|
1098
|
+
};
|
|
1099
|
+
const cachedTokens = rawUsage.prompt_tokens_details?.cached_tokens ?? 0;
|
|
1100
|
+
this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
|
|
1101
|
+
provider: this.provider,
|
|
1102
|
+
cachedPromptTokens: cachedTokens,
|
|
1103
|
+
// OpenRouter returns the real, routing+cache-adjusted charge here.
|
|
1104
|
+
...(typeof rawUsage.cost === 'number' ? { costUSD: rawUsage.cost } : {}),
|
|
1105
|
+
});
|
|
1053
1106
|
}
|
|
1054
1107
|
}
|
|
1055
1108
|
|
|
@@ -1234,9 +1287,13 @@ export class LLMClient {
|
|
|
1234
1287
|
};
|
|
1235
1288
|
|
|
1236
1289
|
if (this.tokenTracker) {
|
|
1290
|
+
// Anthropic reports input_tokens as the UNCACHED count; cache reads
|
|
1291
|
+
// and writes are separate. Pass the true total so the tracker prices
|
|
1292
|
+
// each bucket at its own rate.
|
|
1237
1293
|
this.tokenTracker.addUsage(
|
|
1238
|
-
|
|
1294
|
+
apiUsage.input_tokens + cacheRead + cacheCreation,
|
|
1239
1295
|
usage.completionTokens,
|
|
1296
|
+
{ provider: this.provider, cachedPromptTokens: cacheRead, cacheWriteTokens: cacheCreation },
|
|
1240
1297
|
);
|
|
1241
1298
|
}
|
|
1242
1299
|
|
|
@@ -1343,6 +1400,8 @@ export function isProviderAvailable(provider: LLMProvider): boolean {
|
|
|
1343
1400
|
return !!process.env.OPEN_ROUTER_API_KEY;
|
|
1344
1401
|
case 'orbgen':
|
|
1345
1402
|
return !!process.env.ORBGEN_URL;
|
|
1403
|
+
case 'masar':
|
|
1404
|
+
return !!process.env.MASAR_SUBAGENT_URL;
|
|
1346
1405
|
default:
|
|
1347
1406
|
return false;
|
|
1348
1407
|
}
|
package/src/structured-output.ts
CHANGED
|
@@ -268,7 +268,13 @@ export class StructuredOutputClient {
|
|
|
268
268
|
};
|
|
269
269
|
|
|
270
270
|
if (this.tokenTracker) {
|
|
271
|
-
|
|
271
|
+
const cachedTokens =
|
|
272
|
+
(response.usage as { prompt_tokens_details?: { cached_tokens?: number } } | undefined)
|
|
273
|
+
?.prompt_tokens_details?.cached_tokens ?? 0;
|
|
274
|
+
this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens, {
|
|
275
|
+
provider: 'structured-output',
|
|
276
|
+
cachedPromptTokens: cachedTokens,
|
|
277
|
+
});
|
|
272
278
|
}
|
|
273
279
|
|
|
274
280
|
console.log(
|
package/src/token-tracker.ts
CHANGED
|
@@ -18,11 +18,19 @@ export interface TokenUsage {
|
|
|
18
18
|
completionTokens: number;
|
|
19
19
|
totalTokens: number;
|
|
20
20
|
callCount: number;
|
|
21
|
+
/** Subset of `promptTokens` that were cache reads (billed at the cache-read rate). */
|
|
22
|
+
cachedPromptTokens: number;
|
|
23
|
+
/** Subset of `promptTokens` written to cache (Anthropic; billed at the cache-write rate). */
|
|
24
|
+
cacheWriteTokens: number;
|
|
21
25
|
}
|
|
22
26
|
|
|
23
27
|
export interface TokenCost {
|
|
24
28
|
promptCostPer1K: number;
|
|
25
29
|
completionCostPer1K: number;
|
|
30
|
+
/** Per-1K rate for cache-read (cache-hit) prompt tokens. Falls back to prompt rate when absent. */
|
|
31
|
+
cacheReadCostPer1K?: number;
|
|
32
|
+
/** Per-1K rate for cache-write (cache-creation) prompt tokens. Falls back to prompt rate when absent. */
|
|
33
|
+
cacheWriteCostPer1K?: number;
|
|
26
34
|
}
|
|
27
35
|
|
|
28
36
|
export interface CallLogEntry {
|
|
@@ -32,6 +40,10 @@ export interface CallLogEntry {
|
|
|
32
40
|
promptTokens: number;
|
|
33
41
|
completionTokens: number;
|
|
34
42
|
totalTokens: number;
|
|
43
|
+
/** Cache-read (cache-hit) subset of promptTokens, billed at the discounted rate. */
|
|
44
|
+
cachedPromptTokens?: number;
|
|
45
|
+
/** Cache-write subset of promptTokens (Anthropic). */
|
|
46
|
+
cacheWriteTokens?: number;
|
|
35
47
|
estimatedCost: number;
|
|
36
48
|
durationMs?: number;
|
|
37
49
|
source: 'local-log';
|
|
@@ -45,6 +57,9 @@ const ALMADAR_ROOT = process.env['ALMADAR_ROOT'] ?? process.cwd();
|
|
|
45
57
|
const PRICING_CACHE_PATH = join(ALMADAR_ROOT, '.llm-pricing-cache.json');
|
|
46
58
|
const CALL_LOG_PATH = join(ALMADAR_ROOT, '.llm-call-log.jsonl');
|
|
47
59
|
const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours
|
|
60
|
+
// Bump when the cached TokenCost shape changes so stale on-disk caches are
|
|
61
|
+
// invalidated on upgrade. v2 added cacheReadCostPer1K / cacheWriteCostPer1K.
|
|
62
|
+
const PRICING_CACHE_VERSION = 2;
|
|
48
63
|
|
|
49
64
|
/** Map from our local model name to OpenRouter model ID */
|
|
50
65
|
const MODEL_ID_MAP: Record<string, string> = {
|
|
@@ -67,6 +82,7 @@ const MODEL_ID_MAP: Record<string, string> = {
|
|
|
67
82
|
const FALLBACK_COSTS: Record<string, TokenCost> = {};
|
|
68
83
|
|
|
69
84
|
interface PricingCache {
|
|
85
|
+
version?: number;
|
|
70
86
|
fetchedAt: number;
|
|
71
87
|
models: Record<string, TokenCost>;
|
|
72
88
|
}
|
|
@@ -77,7 +93,7 @@ function loadCachedPricing(): PricingCache | null {
|
|
|
77
93
|
try {
|
|
78
94
|
const raw = readFileSync(PRICING_CACHE_PATH, 'utf-8');
|
|
79
95
|
const parsed = JSON.parse(raw) as PricingCache;
|
|
80
|
-
if (Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
|
|
96
|
+
if (parsed.version === PRICING_CACHE_VERSION && Date.now() - parsed.fetchedAt < CACHE_TTL_MS) {
|
|
81
97
|
return parsed;
|
|
82
98
|
}
|
|
83
99
|
} catch {
|
|
@@ -89,15 +105,30 @@ function loadCachedPricing(): PricingCache | null {
|
|
|
89
105
|
async function fetchPricingFromOpenRouter(): Promise<Record<string, TokenCost>> {
|
|
90
106
|
const res = await fetch('https://openrouter.ai/api/v1/models');
|
|
91
107
|
if (!res.ok) throw new Error(`OpenRouter models API: HTTP ${res.status}`);
|
|
92
|
-
const json = await res.json() as {
|
|
108
|
+
const json = await res.json() as {
|
|
109
|
+
data?: Array<{
|
|
110
|
+
id: string;
|
|
111
|
+
pricing?: {
|
|
112
|
+
prompt?: string;
|
|
113
|
+
completion?: string;
|
|
114
|
+
input_cache_read?: string;
|
|
115
|
+
input_cache_write?: string;
|
|
116
|
+
};
|
|
117
|
+
}>;
|
|
118
|
+
};
|
|
93
119
|
const models: Record<string, TokenCost> = {};
|
|
94
120
|
for (const m of json.data ?? []) {
|
|
95
121
|
const promptPerToken = parseFloat(m.pricing?.prompt ?? '0');
|
|
96
122
|
const completionPerToken = parseFloat(m.pricing?.completion ?? '0');
|
|
123
|
+
const cacheReadPerToken = parseFloat(m.pricing?.input_cache_read ?? '0');
|
|
124
|
+
const cacheWritePerToken = parseFloat(m.pricing?.input_cache_write ?? '0');
|
|
97
125
|
if (promptPerToken > 0 || completionPerToken > 0) {
|
|
98
126
|
models[m.id] = {
|
|
99
127
|
promptCostPer1K: promptPerToken * 1000,
|
|
100
128
|
completionCostPer1K: completionPerToken * 1000,
|
|
129
|
+
// 0 (field absent) → leave undefined so cost math falls back to the prompt rate.
|
|
130
|
+
...(cacheReadPerToken > 0 ? { cacheReadCostPer1K: cacheReadPerToken * 1000 } : {}),
|
|
131
|
+
...(cacheWritePerToken > 0 ? { cacheWriteCostPer1K: cacheWritePerToken * 1000 } : {}),
|
|
101
132
|
};
|
|
102
133
|
}
|
|
103
134
|
}
|
|
@@ -125,7 +156,7 @@ function getPricing(): Record<string, TokenCost> {
|
|
|
125
156
|
function refreshPricingCache(): void {
|
|
126
157
|
fetchPricingFromOpenRouter()
|
|
127
158
|
.then((models) => {
|
|
128
|
-
pricingCache = { fetchedAt: Date.now(), models };
|
|
159
|
+
pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
|
|
129
160
|
try {
|
|
130
161
|
mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
|
|
131
162
|
writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
|
|
@@ -164,22 +195,77 @@ export class TokenTracker {
|
|
|
164
195
|
completionTokens: 0,
|
|
165
196
|
totalTokens: 0,
|
|
166
197
|
callCount: 0,
|
|
198
|
+
cachedPromptTokens: 0,
|
|
199
|
+
cacheWriteTokens: 0,
|
|
167
200
|
};
|
|
168
201
|
|
|
202
|
+
/** Sum of provider-reported authoritative costs (e.g. OpenRouter `usage.cost`). */
|
|
203
|
+
private authoritativeCostUSD = 0;
|
|
204
|
+
/** Token buckets for calls WITHOUT an authoritative cost — priced cache-aware. */
|
|
205
|
+
private computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
|
|
206
|
+
|
|
169
207
|
constructor(model: string = 'claude-sonnet-4-5-20250929') {
|
|
170
208
|
this.model = model;
|
|
171
209
|
}
|
|
172
210
|
|
|
173
|
-
|
|
211
|
+
/** Cache-aware cost for one (or an aggregate of) call(s), in USD. */
|
|
212
|
+
private costFor(promptTokens: number, completionTokens: number, cached: number, written: number): number {
|
|
213
|
+
const costs = getCostForModel(this.model);
|
|
214
|
+
const cacheReadRate = costs.cacheReadCostPer1K ?? costs.promptCostPer1K;
|
|
215
|
+
const cacheWriteRate = costs.cacheWriteCostPer1K ?? costs.promptCostPer1K;
|
|
216
|
+
const uncached = Math.max(0, promptTokens - cached - written);
|
|
217
|
+
return (
|
|
218
|
+
(uncached / 1000) * costs.promptCostPer1K +
|
|
219
|
+
(cached / 1000) * cacheReadRate +
|
|
220
|
+
(written / 1000) * cacheWriteRate +
|
|
221
|
+
(completionTokens / 1000) * costs.completionCostPer1K
|
|
222
|
+
);
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
/**
|
|
226
|
+
* Record one LLM call's usage. `promptTokens` is the TOTAL input count
|
|
227
|
+
* (cache reads + cache writes + uncached); `cachedPromptTokens` and
|
|
228
|
+
* `cacheWriteTokens` are subsets of it, priced at their own (cheaper /
|
|
229
|
+
* pricier) rates. Providers that don't report cache detail pass 0, which
|
|
230
|
+
* reduces to the previous flat-rate behaviour.
|
|
231
|
+
*/
|
|
232
|
+
addUsage(
|
|
233
|
+
promptTokens: number,
|
|
234
|
+
completionTokens: number,
|
|
235
|
+
options?: {
|
|
236
|
+
provider?: string;
|
|
237
|
+
durationMs?: number;
|
|
238
|
+
cachedPromptTokens?: number;
|
|
239
|
+
cacheWriteTokens?: number;
|
|
240
|
+
/** Provider-reported authoritative cost (e.g. OpenRouter `usage.cost`). When set, used verbatim. */
|
|
241
|
+
costUSD?: number;
|
|
242
|
+
},
|
|
243
|
+
): void {
|
|
244
|
+
const cached = Math.min(promptTokens, Math.max(0, options?.cachedPromptTokens ?? 0));
|
|
245
|
+
const written = Math.min(promptTokens - cached, Math.max(0, options?.cacheWriteTokens ?? 0));
|
|
246
|
+
|
|
174
247
|
this.usage.promptTokens += promptTokens;
|
|
175
248
|
this.usage.completionTokens += completionTokens;
|
|
176
249
|
this.usage.totalTokens += promptTokens + completionTokens;
|
|
250
|
+
this.usage.cachedPromptTokens += cached;
|
|
251
|
+
this.usage.cacheWriteTokens += written;
|
|
177
252
|
this.usage.callCount++;
|
|
178
253
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
254
|
+
// Prefer the provider's authoritative cost (already cache- and routing-
|
|
255
|
+
// adjusted). Otherwise bucket the tokens and price them cache-aware so
|
|
256
|
+
// late-arriving pricing still applies retroactively to the estimate.
|
|
257
|
+
const authoritative = options?.costUSD;
|
|
258
|
+
let estimatedCost: number;
|
|
259
|
+
if (authoritative != null && Number.isFinite(authoritative)) {
|
|
260
|
+
this.authoritativeCostUSD += authoritative;
|
|
261
|
+
estimatedCost = authoritative;
|
|
262
|
+
} else {
|
|
263
|
+
this.computed.promptTokens += promptTokens;
|
|
264
|
+
this.computed.completionTokens += completionTokens;
|
|
265
|
+
this.computed.cachedPromptTokens += cached;
|
|
266
|
+
this.computed.cacheWriteTokens += written;
|
|
267
|
+
estimatedCost = this.costFor(promptTokens, completionTokens, cached, written);
|
|
268
|
+
}
|
|
183
269
|
|
|
184
270
|
const entry: CallLogEntry = {
|
|
185
271
|
timestamp: new Date().toISOString(),
|
|
@@ -188,6 +274,8 @@ export class TokenTracker {
|
|
|
188
274
|
promptTokens,
|
|
189
275
|
completionTokens,
|
|
190
276
|
totalTokens: promptTokens + completionTokens,
|
|
277
|
+
cachedPromptTokens: cached,
|
|
278
|
+
cacheWriteTokens: written,
|
|
191
279
|
estimatedCost,
|
|
192
280
|
durationMs: options?.durationMs,
|
|
193
281
|
source: 'local-log',
|
|
@@ -206,12 +294,15 @@ export class TokenTracker {
|
|
|
206
294
|
}
|
|
207
295
|
|
|
208
296
|
getEstimatedCost(): number {
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
297
|
+
return (
|
|
298
|
+
this.authoritativeCostUSD +
|
|
299
|
+
this.costFor(
|
|
300
|
+
this.computed.promptTokens,
|
|
301
|
+
this.computed.completionTokens,
|
|
302
|
+
this.computed.cachedPromptTokens,
|
|
303
|
+
this.computed.cacheWriteTokens,
|
|
304
|
+
)
|
|
305
|
+
);
|
|
215
306
|
}
|
|
216
307
|
|
|
217
308
|
getFormattedCost(): string {
|
|
@@ -239,7 +330,11 @@ export class TokenTracker {
|
|
|
239
330
|
completionTokens: 0,
|
|
240
331
|
totalTokens: 0,
|
|
241
332
|
callCount: 0,
|
|
333
|
+
cachedPromptTokens: 0,
|
|
334
|
+
cacheWriteTokens: 0,
|
|
242
335
|
};
|
|
336
|
+
this.authoritativeCostUSD = 0;
|
|
337
|
+
this.computed = { promptTokens: 0, completionTokens: 0, cachedPromptTokens: 0, cacheWriteTokens: 0 };
|
|
243
338
|
}
|
|
244
339
|
|
|
245
340
|
setModel(model: string): void {
|
|
@@ -270,7 +365,7 @@ export function getCallLogPath(): string {
|
|
|
270
365
|
/** Force-refresh the pricing cache from OpenRouter. */
|
|
271
366
|
export async function refreshPricing(): Promise<void> {
|
|
272
367
|
const models = await fetchPricingFromOpenRouter();
|
|
273
|
-
pricingCache = { fetchedAt: Date.now(), models };
|
|
368
|
+
pricingCache = { version: PRICING_CACHE_VERSION, fetchedAt: Date.now(), models };
|
|
274
369
|
mkdirSync(dirname(PRICING_CACHE_PATH), { recursive: true });
|
|
275
370
|
writeFileSync(PRICING_CACHE_PATH, JSON.stringify(pricingCache));
|
|
276
371
|
}
|