llm-cli-gateway 1.5.35 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
1
+ /**
2
+ * Per-model pricing for cache-savings estimation.
3
+ *
4
+ * `priced_as_of` is the date these numbers were last refreshed. The
5
+ * gateway's doctor surfaces this so operators can see when the table is
6
+ * stale — pricing is an ESTIMATE, not a billing number.
7
+ *
8
+ * Pricing units: USD per 1M tokens.
9
+ *
10
+ * Anthropic source: <https://platform.claude.com/docs/en/about-claude/pricing>
11
+ * - Sonnet 4.x / Sonnet 3.5: $3 input / $15 output.
12
+ * - Opus 4.5+ / Mythos Preview: $15 input / $75 output.
13
+ * - Opus 4 / 4.1 (deprecated): same as 4.5+.
14
+ * - Haiku 4.5: $1 input / $5 output.
15
+ * - Haiku 3.5 (Vertex-only): $0.80 input / $4 output.
16
+ *
17
+ * Cache pricing multipliers (Anthropic):
18
+ * - cache write 5-min TTL: 1.25× base input.
19
+ * - cache write 1-hour TTL: 2× base input.
20
+ * - cache read: 0.10× base input (90% savings).
21
+ *
22
+ * Codex / OpenAI: GPT-5.4 input ~$1.25 / output $10 per 1M (approx; OpenAI
23
+ * does not publish a stable per-CLI table). Cached input ~50% of base.
24
+ *
25
+ * Gemini, Grok, Mistral: pricing varies by model and is not surfaced in
26
+ * the gateway today; lookups for these CLIs return zeroed pricing.
27
+ */
28
+ export interface PricePerMillion {
29
+ inputUsd: number;
30
+ outputUsd: number;
31
+ /** Multiplier on inputUsd for a cache HIT (read). Anthropic: 0.10. */
32
+ cacheReadMultiplier: number;
33
+ }
34
+ export declare const PRICING_AS_OF = "2026-05-26";
35
+ /**
36
+ * Look up pricing by (cli, model) name. Best-effort; unknown models return
37
+ * ZEROED pricing so estimated_savings_usd in aggregates falls back to 0
38
+ * rather than throwing OR over-reporting savings on an unpriced model.
39
+ *
40
+ * Recognised model families:
41
+ * - claude: model name contains "sonnet" | "opus" | "haiku".
42
+ * - codex: model name contains "gpt-5" or "o3" (current OpenAI families).
43
+ *
44
+ * Anything outside these explicit matches returns ZERO. This is a
45
+ * deliberate conservative choice — we'd rather under-report savings on
46
+ * an unrecognised model than over-report on one whose actual pricing we
47
+ * don't know. Update this table when a new model family ships.
48
+ */
49
+ export declare function getPricing(cli: "claude" | "codex" | "gemini" | "grok" | "mistral", model: string): PricePerMillion;
50
+ /**
51
+ * Estimate USD saved by `cacheReadTokens` being served from cache instead
52
+ * of fresh input. Returns 0 for zero cache reads or unknown pricing.
53
+ */
54
+ export declare function estimateCacheSavingsUsd(cli: "claude" | "codex" | "gemini" | "grok" | "mistral", model: string, cacheReadTokens: number): number;
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Per-model pricing for cache-savings estimation.
3
+ *
4
+ * `priced_as_of` is the date these numbers were last refreshed. The
5
+ * gateway's doctor surfaces this so operators can see when the table is
6
+ * stale — pricing is an ESTIMATE, not a billing number.
7
+ *
8
+ * Pricing units: USD per 1M tokens.
9
+ *
10
+ * Anthropic source: <https://platform.claude.com/docs/en/about-claude/pricing>
11
+ * - Sonnet 4.x / Sonnet 3.5: $3 input / $15 output.
12
+ * - Opus 4.5+ / Mythos Preview: $15 input / $75 output.
13
+ * - Opus 4 / 4.1 (deprecated): same as 4.5+.
14
+ * - Haiku 4.5: $1 input / $5 output.
15
+ * - Haiku 3.5 (Vertex-only): $0.80 input / $4 output.
16
+ *
17
+ * Cache pricing multipliers (Anthropic):
18
+ * - cache write 5-min TTL: 1.25× base input.
19
+ * - cache write 1-hour TTL: 2× base input.
20
+ * - cache read: 0.10× base input (90% savings).
21
+ *
22
+ * Codex / OpenAI: GPT-5.4 input ~$1.25 / output $10 per 1M (approx; OpenAI
23
+ * does not publish a stable per-CLI table). Cached input ~50% of base.
24
+ *
25
+ * Gemini, Grok, Mistral: pricing varies by model and is not surfaced in
26
+ * the gateway today; lookups for these CLIs return zeroed pricing.
27
+ */
28
+ export const PRICING_AS_OF = "2026-05-26";
29
+ const ANTHROPIC_SONNET = {
30
+ inputUsd: 3,
31
+ outputUsd: 15,
32
+ cacheReadMultiplier: 0.1,
33
+ };
34
+ const ANTHROPIC_OPUS = {
35
+ inputUsd: 15,
36
+ outputUsd: 75,
37
+ cacheReadMultiplier: 0.1,
38
+ };
39
+ const ANTHROPIC_HAIKU = {
40
+ inputUsd: 1,
41
+ outputUsd: 5,
42
+ cacheReadMultiplier: 0.1,
43
+ };
44
+ const OPENAI_GPT5 = {
45
+ inputUsd: 1.25,
46
+ outputUsd: 10,
47
+ // OpenAI prompt-caching: cached input tokens billed at 50% of base.
48
+ cacheReadMultiplier: 0.5,
49
+ };
50
+ const ZERO = {
51
+ inputUsd: 0,
52
+ outputUsd: 0,
53
+ cacheReadMultiplier: 0,
54
+ };
55
+ /**
56
+ * Look up pricing by (cli, model) name. Best-effort; unknown models return
57
+ * ZEROED pricing so estimated_savings_usd in aggregates falls back to 0
58
+ * rather than throwing OR over-reporting savings on an unpriced model.
59
+ *
60
+ * Recognised model families:
61
+ * - claude: model name contains "sonnet" | "opus" | "haiku".
62
+ * - codex: model name contains "gpt-5" or "o3" (current OpenAI families).
63
+ *
64
+ * Anything outside these explicit matches returns ZERO. This is a
65
+ * deliberate conservative choice — we'd rather under-report savings on
66
+ * an unrecognised model than over-report on one whose actual pricing we
67
+ * don't know. Update this table when a new model family ships.
68
+ */
69
+ export function getPricing(cli, model) {
70
+ const lower = model.toLowerCase();
71
+ if (cli === "claude") {
72
+ if (lower.includes("sonnet"))
73
+ return ANTHROPIC_SONNET;
74
+ if (lower.includes("opus"))
75
+ return ANTHROPIC_OPUS;
76
+ if (lower.includes("haiku"))
77
+ return ANTHROPIC_HAIKU;
78
+ return ZERO;
79
+ }
80
+ if (cli === "codex") {
81
+ if (lower.includes("gpt-5") || lower.includes("o3"))
82
+ return OPENAI_GPT5;
83
+ return ZERO;
84
+ }
85
+ return ZERO;
86
+ }
87
/**
 * Estimate USD saved by `cacheReadTokens` being served from cache instead
 * of being billed as fresh input.
 *
 * Returns 0 when there are no cache reads, when the token count is not a
 * finite positive number (NaN / Infinity would otherwise poison every
 * aggregate this value is summed into), or when the model has no known
 * pricing.
 *
 * @param cli             Which CLI produced the request.
 * @param model           Model name as reported by that CLI.
 * @param cacheReadTokens Number of input tokens served from cache.
 * @returns Estimated savings in USD (an estimate, not a billing number).
 */
export function estimateCacheSavingsUsd(cli, model, cacheReadTokens) {
    const tokens = Number(cacheReadTokens);
    if (!Number.isFinite(tokens) || tokens <= 0)
        return 0;
    const p = getPricing(cli, model);
    if (p.inputUsd === 0)
        return 0;
    // Savings = (fresh-input-cost) - (cache-read-cost) = inputUsd × (1 - mult)
    const savedPerToken = (p.inputUsd * (1 - p.cacheReadMultiplier)) / 1_000_000;
    return tokens * savedPerToken;
}
@@ -0,0 +1,38 @@
1
+ import { z } from "zod";
2
+ export interface PromptParts {
3
+ system?: string;
4
+ tools?: string;
5
+ context?: string;
6
+ task: string;
7
+ }
8
+ export declare const PromptPartsSchema: z.ZodObject<{
9
+ system: z.ZodOptional<z.ZodString>;
10
+ tools: z.ZodOptional<z.ZodString>;
11
+ context: z.ZodOptional<z.ZodString>;
12
+ task: z.ZodString;
13
+ }, "strip", z.ZodTypeAny, {
14
+ task: string;
15
+ system?: string | undefined;
16
+ tools?: string | undefined;
17
+ context?: string | undefined;
18
+ }, {
19
+ task: string;
20
+ system?: string | undefined;
21
+ tools?: string | undefined;
22
+ context?: string | undefined;
23
+ }>;
24
+ export interface AssembleResult {
25
+ text: string;
26
+ stableByteEnd: number;
27
+ }
28
+ export declare function assemble(parts: PromptParts): AssembleResult;
29
+ export interface ResolvedPromptInput {
30
+ assembledPrompt: string;
31
+ stablePrefixHash: string | null;
32
+ stablePrefixTokens: number | null;
33
+ }
34
+ export interface ResolvePromptInputArgs {
35
+ prompt?: string;
36
+ promptParts?: PromptParts;
37
+ }
38
+ export declare function resolvePromptInput(input: ResolvePromptInputArgs): ResolvedPromptInput;
@@ -0,0 +1,42 @@
1
+ import { createHash } from "crypto";
2
+ import { z } from "zod";
3
+ export const PromptPartsSchema = z.object({
4
+ system: z.string().optional(),
5
+ tools: z.string().optional(),
6
+ context: z.string().optional(),
7
+ task: z.string().min(1),
8
+ });
9
// Blank line placed between consecutive prompt segments.
const SEPARATOR = "\n\n";
/**
 * Join structured prompt parts into a single prompt string.
 *
 * The stable segments (`system`, `tools`, `context`, in that order) come
 * first and the `task` goes last. Missing or empty stable segments are
 * skipped, so they never contribute a separator.
 *
 * @param parts Prompt parts; `task` is always appended.
 * @returns `text` is the assembled prompt. `stableByteEnd` is the UTF-8 byte
 *   length of the joined stable segments, i.e. the exclusive end offset of
 *   the stable prefix within the UTF-8 encoding of `text`. It excludes the
 *   separator before the task and is 0 when there are no stable segments.
 */
export function assemble(parts) {
    const stableText = [parts.system, parts.tools, parts.context]
        .filter((segment) => Boolean(segment) && segment.length > 0)
        .join(SEPARATOR);
    // Measured in bytes, not UTF-16 code units, so the offset can be used
    // to slice the encoded prompt.
    const stableByteEnd = Buffer.byteLength(stableText, "utf8");
    const text = stableText.length > 0
        ? `${stableText}${SEPARATOR}${parts.task}`
        : parts.task;
    return { text, stableByteEnd };
}
23
/**
 * Resolve a request's prompt input into the final prompt text plus
 * stable-prefix metadata.
 *
 * When `promptParts` is supplied it takes precedence over `prompt`: the
 * parts are assembled, the stable prefix (everything before the task) is
 * hashed with SHA-256, and its size is estimated at roughly 4 bytes per
 * token. With no stable segments the hash is the SHA-256 of zero bytes and
 * the token estimate is 0.
 *
 * When `promptParts` is absent (undefined or null) the raw `prompt` is
 * passed through (empty string if missing) and both metadata fields are
 * null, because a plain prompt has no known stable prefix.
 *
 * @param input Either a raw `prompt` or structured `promptParts`.
 * @returns The assembled prompt and stable-prefix hash / token estimate.
 */
export function resolvePromptInput(input) {
    if (input.promptParts == null) {
        return {
            assembledPrompt: input.prompt ?? "",
            stablePrefixHash: null,
            stablePrefixTokens: null,
        };
    }
    const assembled = assemble(input.promptParts);
    // Slice by BYTE offset: stableByteEnd is a UTF-8 length, not a string index.
    const stableBytes = Buffer.from(assembled.text, "utf8").subarray(0, assembled.stableByteEnd);
    // An empty slice hashes to the SHA-256 of the empty input, so no special
    // case is needed when there is no stable prefix.
    const hash = createHash("sha256").update(stableBytes).digest("hex");
    return {
        assembledPrompt: assembled.text,
        stablePrefixHash: hash,
        stablePrefixTokens: Math.ceil(assembled.stableByteEnd / 4),
    };
}
@@ -1,5 +1,8 @@
1
1
  import { ISessionManager } from "./session-manager.js";
2
2
  import { PerformanceMetrics } from "./metrics.js";
3
+ import { FlightRecorderQuery } from "./flight-recorder.js";
4
+ import { type GlobalCacheStats, type PrefixCacheStats, type SessionCacheStats } from "./cache-stats.js";
5
+ import type { CacheAwarenessConfig } from "./config.js";
3
6
  export interface ResourceDefinition {
4
7
  uri: string;
5
8
  name: string;
@@ -20,7 +23,35 @@ export interface ResourceContents {
20
23
  export declare class ResourceProvider {
21
24
  private sessionManager;
22
25
  private performanceMetrics;
23
- constructor(sessionManager: ISessionManager, performanceMetrics: PerformanceMetrics);
26
+ private flightRecorder;
27
+ private cacheAwareness;
28
+ constructor(sessionManager: ISessionManager, performanceMetrics: PerformanceMetrics, flightRecorder?: FlightRecorderQuery, cacheAwareness?: CacheAwarenessConfig | null);
29
+ /** Read-only flight-recorder accessor for cache-state resource readers. */
30
+ getFlightRecorderQuery(): FlightRecorderQuery;
31
+ /**
32
+ * cache_state://global — aggregates across the entire flight recorder.
33
+ * Optionally restrict to a recent window via `lastNHours`. Returns
34
+ * tokens/hashes/aggregates ONLY — no prompt text fields. The redaction is
35
+ * structural: the response shape (GlobalCacheStats) has no `prompt`,
36
+ * `response`, `system`, or `task` field by construction.
37
+ */
38
+ readCacheStateGlobal(opts?: {
39
+ lastNHours?: number;
40
+ }): GlobalCacheStats;
41
+ /**
42
+ * cache_state://session/{sessionId} — per-session aggregates. Returns
43
+ * empty defaults when the session has no rows. Token/hash fields only.
44
+ *
45
+ * Slice 3: populates `ttlRemainingMs` by applying the configured TTL
46
+ * policy (default: 5-min / 300 s Anthropic TTL when no cache-awareness
47
+ * config is loaded). Null for non-claude sessions.
48
+ */
49
+ readCacheStateSession(sessionId: string): SessionCacheStats;
50
+ /**
51
+ * cache_state://prefix/{hash} — per-stable-prefix-hash aggregates.
52
+ * Returns empty defaults for unknown hashes. Token/hash fields only.
53
+ */
54
+ readCacheStateForPrefix(stablePrefixHash: string): PrefixCacheStats;
24
55
  listResources(): ResourceDefinition[];
25
56
  readResource(uri: string): Promise<ResourceContents | null>;
26
57
  }
package/dist/resources.js CHANGED
@@ -1,10 +1,61 @@
1
1
  import { getAvailableCliInfo } from "./model-registry.js";
2
+ import { computeGlobalCacheStats, computePrefixCacheStats, computeSessionCacheStats, computeTtlRemaining, } from "./cache-stats.js";
2
3
  export class ResourceProvider {
3
4
  sessionManager;
4
5
  performanceMetrics;
5
- constructor(sessionManager, performanceMetrics) {
6
+ flightRecorder;
7
+ cacheAwareness;
8
+ constructor(sessionManager, performanceMetrics,
9
+ // Optional read access to the flight recorder. Used by cache-state
10
+ // resources (slice 2). Falls back to a stub returning [] when not
11
+ // injected so existing call sites continue to work without changes.
12
+ flightRecorder = { queryRequests: () => [] },
13
+ // Slice 3: optional cache-awareness config. When present, drives the
14
+ // TTL policy applied to ttlRemainingMs on session-scoped reads.
15
+ // When absent, the default Anthropic 5-min TTL applies (matches the
16
+ // 1.x default of `[cache_awareness].anthropic_ttl_seconds = 300`).
17
+ cacheAwareness = null) {
6
18
  this.sessionManager = sessionManager;
7
19
  this.performanceMetrics = performanceMetrics;
20
+ this.flightRecorder = flightRecorder;
21
+ this.cacheAwareness = cacheAwareness;
22
+ }
23
+ /** Read-only flight-recorder accessor for cache-state resource readers. */
24
+ getFlightRecorderQuery() {
25
+ return this.flightRecorder;
26
+ }
27
+ /**
28
+ * cache_state://global — aggregates across the entire flight recorder.
29
+ * Optionally restrict to a recent window via `lastNHours`. Returns
30
+ * tokens/hashes/aggregates ONLY — no prompt text fields. The redaction is
31
+ * structural: the response shape (GlobalCacheStats) has no `prompt`,
32
+ * `response`, `system`, or `task` field by construction.
33
+ */
34
+ readCacheStateGlobal(opts = {}) {
35
+ return computeGlobalCacheStats(this.flightRecorder, opts);
36
+ }
37
+ /**
38
+ * cache_state://session/{sessionId} — per-session aggregates. Returns
39
+ * empty defaults when the session has no rows. Token/hash fields only.
40
+ *
41
+ * Slice 3: populates `ttlRemainingMs` by applying the configured TTL
42
+ * policy (default: 5-min / 300 s Anthropic TTL when no cache-awareness
43
+ * config is loaded). Null for non-claude sessions.
44
+ */
45
+ readCacheStateSession(sessionId) {
46
+ const stats = computeSessionCacheStats(this.flightRecorder, sessionId);
47
+ const ttlSeconds = this.cacheAwareness?.anthropicTtlSeconds ?? 300;
48
+ stats.ttlRemainingMs = computeTtlRemaining(stats, stats.cli, {
49
+ anthropicTtlSeconds: ttlSeconds,
50
+ });
51
+ return stats;
52
+ }
53
+ /**
54
+ * cache_state://prefix/{hash} — per-stable-prefix-hash aggregates.
55
+ * Returns empty defaults for unknown hashes. Token/hash fields only.
56
+ */
57
+ readCacheStateForPrefix(stablePrefixHash) {
58
+ return computePrefixCacheStats(this.flightRecorder, stablePrefixHash);
8
59
  }
9
60
  // List all available resources
10
61
  listResources() {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "llm-cli-gateway",
3
- "version": "1.5.35",
3
+ "version": "1.6.0",
4
4
  "mcpName": "io.github.verivus-oss/llm-cli-gateway",
5
5
  "description": "MCP server providing unified access to Claude Code, Codex, Gemini, Grok, and Mistral Vibe CLIs with session management, retry logic, async job orchestration, durable job results, and cross-LLM validation.",
6
6
  "license": "MIT",
@@ -109,6 +109,7 @@
109
109
  "@vitest/coverage-v8": "^4.1.2",
110
110
  "eslint": "^8.57.1",
111
111
  "eslint-config-prettier": "^9.0.0",
112
+ "eslint-plugin-security": "^3.0.1",
112
113
  "ioredis": "5.9.2",
113
114
  "pg": "^8.12.0",
114
115
  "prettier": "^3.0.0",
@@ -14,6 +14,7 @@
14
14
  "providers",
15
15
  "endpoint_exposure",
16
16
  "client_config",
17
+ "cache_awareness",
17
18
  "next_actions"
18
19
  ],
19
20
  "properties": {
@@ -263,6 +264,44 @@
263
264
  },
264
265
  "additionalProperties": false
265
266
  },
267
+ "cache_awareness": {
268
+ "type": "object",
269
+ "required": ["enabled_features", "last_24h", "per_cli"],
270
+ "properties": {
271
+ "enabled_features": {
272
+ "type": "array",
273
+ "items": {
274
+ "type": "string",
275
+ "enum": ["anthropic_cache_control", "ttl_warnings"]
276
+ }
277
+ },
278
+ "last_24h": {
279
+ "type": "object",
280
+ "required": ["hit_rate", "total_hits", "total_requests", "estimated_savings_usd"],
281
+ "properties": {
282
+ "hit_rate": { "type": "number" },
283
+ "total_hits": { "type": "integer" },
284
+ "total_requests": { "type": "integer" },
285
+ "estimated_savings_usd": { "type": "number" }
286
+ },
287
+ "additionalProperties": false
288
+ },
289
+ "per_cli": {
290
+ "type": "object",
291
+ "additionalProperties": {
292
+ "type": "object",
293
+ "required": ["hit_rate", "total_hits", "total_cache_read_tokens"],
294
+ "properties": {
295
+ "hit_rate": { "type": "number" },
296
+ "total_hits": { "type": "integer" },
297
+ "total_cache_read_tokens": { "type": "integer" }
298
+ },
299
+ "additionalProperties": false
300
+ }
301
+ }
302
+ },
303
+ "additionalProperties": false
304
+ },
266
305
  "next_actions": {
267
306
  "type": "array",
268
307
  "items": { "type": "string" }
package/socket.yml CHANGED
@@ -33,6 +33,16 @@ version: 2
33
33
  # gateway does not call db.pragma() from production code; SQLite setup
34
34
  # uses fixed literal db.exec("PRAGMA ...") statements, and the release
35
35
  # security audit fails future production `.pragma()` calls.
36
+ #
37
+ # ioredis obfuscated code / base64 strings
38
+ # Socket may flag ioredis@5.10.1 built/constants/TLSProfiles.js because it
39
+ # contains base64-looking strings. This is a reviewed false positive: the
40
+ # strings are PEM-encoded Redis Cloud TLS CA certificates. The file exports
41
+ # static TLS profile data only; it contains no decoder loop, dynamic eval,
42
+ # network call, or hidden execution path. The same file is byte-for-byte
43
+ # identical in ioredis@5.9.2. ioredis is not installed by the default
44
+ # production dependency tree; it is an optional peer for PostgreSQL/Redis
45
+ # session storage and a pinned dev dependency for tests.
36
46
 
37
47
  issueRules:
38
48
  # Defaults from Socket. Listed explicitly so future contributors see what