llm-cli-gateway 1.5.35 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
1
+ /**
2
+ * Per-model pricing for cache-savings estimation.
3
+ *
4
+ * `priced_as_of` is the date these numbers were last refreshed. The
5
+ * gateway's doctor surfaces this so operators can see when the table is
6
+ * stale — pricing is an ESTIMATE, not a billing number.
7
+ *
8
+ * Pricing units: USD per 1M tokens.
9
+ *
10
+ * Anthropic source: <https://platform.claude.com/docs/en/about-claude/pricing>
11
+ * - Sonnet 4.x / Sonnet 3.5: $3 input / $15 output.
12
+ * - Opus 4.5+ / Mythos Preview: $15 input / $75 output.
13
+ * - Opus 4 / 4.1 (deprecated): same as 4.5+.
14
+ * - Haiku 4.5: $1 input / $5 output.
15
+ * - Haiku 3.5 (Vertex-only): $0.80 input / $4 output.
16
+ *
17
+ * Cache pricing multipliers (Anthropic):
18
+ * - cache write 5-min TTL: 1.25× base input.
19
+ * - cache write 1-hour TTL: 2× base input.
20
+ * - cache read: 0.10× base input (90% savings).
21
+ *
22
+ * Codex / OpenAI: GPT-5.4 input ~$1.25 / output $10 per 1M (approx; OpenAI
23
+ * does not publish a stable per-CLI table). Cached input ~50% of base.
24
+ *
25
+ * Gemini, Grok, Mistral: pricing varies by model and is not surfaced in
26
+ * the gateway today; `getPricing` returns zeroed pricing for these CLIs.
27
+ */
28
+ export interface PricePerMillion {
29
+ inputUsd: number;
30
+ outputUsd: number;
31
+ /** Multiplier on inputUsd for a cache HIT (read). Anthropic: 0.10. */
32
+ cacheReadMultiplier: number;
33
+ }
34
+ export declare const PRICING_AS_OF = "2026-05-26";
35
+ /**
36
+ * Look up pricing by (cli, model) name. Best-effort; unknown models return
37
+ * ZEROED pricing so estimated_savings_usd in aggregates falls back to 0
38
+ * rather than throwing OR over-reporting savings on an unpriced model.
39
+ *
40
+ * Recognised model families:
41
+ * - claude: model name contains "sonnet" | "opus" | "haiku".
42
+ * - codex: model name contains "gpt-5" or "o3" (current OpenAI families).
43
+ *
44
+ * Anything outside these explicit matches returns ZERO. This is a
45
+ * deliberate conservative choice — we'd rather under-report savings on
46
+ * an unrecognised model than over-report on one whose actual pricing we
47
+ * don't know. Update this table when a new model family ships.
48
+ */
49
+ export declare function getPricing(cli: "claude" | "codex" | "gemini" | "grok" | "mistral", model: string): PricePerMillion;
50
+ /**
51
+ * Estimate USD saved by `cacheReadTokens` being served from cache instead
52
+ * of fresh input. Returns 0 for zero cache reads or unknown pricing.
53
+ */
54
+ export declare function estimateCacheSavingsUsd(cli: "claude" | "codex" | "gemini" | "grok" | "mistral", model: string, cacheReadTokens: number): number;
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Per-model pricing for cache-savings estimation.
3
+ *
4
+ * `priced_as_of` is the date these numbers were last refreshed. The
5
+ * gateway's doctor surfaces this so operators can see when the table is
6
+ * stale — pricing is an ESTIMATE, not a billing number.
7
+ *
8
+ * Pricing units: USD per 1M tokens.
9
+ *
10
+ * Anthropic source: <https://platform.claude.com/docs/en/about-claude/pricing>
11
+ * - Sonnet 4.x / Sonnet 3.5: $3 input / $15 output.
12
+ * - Opus 4.5+ / Mythos Preview: $15 input / $75 output.
13
+ * - Opus 4 / 4.1 (deprecated): same as 4.5+.
14
+ * - Haiku 4.5: $1 input / $5 output.
15
+ * - Haiku 3.5 (Vertex-only): $0.80 input / $4 output.
16
+ *
17
+ * Cache pricing multipliers (Anthropic):
18
+ * - cache write 5-min TTL: 1.25× base input.
19
+ * - cache write 1-hour TTL: 2× base input.
20
+ * - cache read: 0.10× base input (90% savings).
21
+ *
22
+ * Codex / OpenAI: GPT-5.4 input ~$1.25 / output $10 per 1M (approx; OpenAI
23
+ * does not publish a stable per-CLI table). Cached input ~50% of base.
24
+ *
25
+ * Gemini, Grok, Mistral: pricing varies by model and is not surfaced in
26
+ * the gateway today; `getPricing` returns zeroed pricing for these CLIs.
27
+ */
28
+ export const PRICING_AS_OF = "2026-05-26";
29
/** Sonnet 4.x / Sonnet 3.5: $3 input / $15 output per 1M tokens. */
const ANTHROPIC_SONNET = {
    inputUsd: 3,
    outputUsd: 15,
    cacheReadMultiplier: 0.1,
};
/** Opus family: $15 input / $75 output per 1M tokens. */
const ANTHROPIC_OPUS = {
    inputUsd: 15,
    outputUsd: 75,
    cacheReadMultiplier: 0.1,
};
/** Haiku 4.5 (and any Haiku not identified as 3.5): $1 input / $5 output. */
const ANTHROPIC_HAIKU = {
    inputUsd: 1,
    outputUsd: 5,
    cacheReadMultiplier: 0.1,
};
/** Haiku 3.5: $0.80 input / $4 output, per the pricing table in this file's header. */
const ANTHROPIC_HAIKU_3_5 = {
    inputUsd: 0.8,
    outputUsd: 4,
    cacheReadMultiplier: 0.1,
};
const OPENAI_GPT5 = {
    inputUsd: 1.25,
    outputUsd: 10,
    // OpenAI prompt-caching: cached input tokens billed at 50% of base.
    cacheReadMultiplier: 0.5,
};
/** Sentinel for "no known pricing": every savings estimate derived from it is 0. */
const ZERO = {
    inputUsd: 0,
    outputUsd: 0,
    cacheReadMultiplier: 0,
};
// Matches a standalone "3-5" / "3.5" / "3_5" version marker in a model name
// (e.g. "claude-3-5-haiku-20241022", "claude-3.5-haiku"). The digit guards
// keep it from matching inside longer numbers such as date suffixes.
const HAIKU_3_5_VERSION = /(?:^|[^0-9])3[-._]5(?:[^0-9]|$)/;
/**
 * Look up pricing by (cli, model) name. Best-effort; unknown models return
 * ZEROED pricing so estimated_savings_usd in aggregates falls back to 0
 * rather than throwing OR over-reporting savings on an unpriced model.
 *
 * Recognised model families:
 *   - claude: model name contains "sonnet" | "opus" | "haiku". A Haiku name
 *     that also carries a 3.5 version marker gets the cheaper Haiku 3.5
 *     rates so savings on it are not over-reported.
 *   - codex: model name contains "gpt-5" or "o3" (current OpenAI families).
 *
 * Anything outside these explicit matches returns ZERO. This is a
 * deliberate conservative choice — we'd rather under-report savings on
 * an unrecognised model than over-report on one whose actual pricing we
 * don't know. Update this table when a new model family ships.
 *
 * @param cli   Which CLI produced the request.
 * @param model Model name as reported by the CLI; matched case-insensitively.
 *              A missing or non-string value is treated as an unknown model.
 * @returns The shared pricing record for the matched family, or ZERO.
 */
export function getPricing(cli, model) {
    // A missing or non-string model must degrade to "unknown", not throw:
    // the contract above promises zeroed pricing for anything unrecognised.
    const lower = typeof model === "string" ? model.toLowerCase() : "";
    if (cli === "claude") {
        if (lower.includes("sonnet"))
            return ANTHROPIC_SONNET;
        if (lower.includes("opus"))
            return ANTHROPIC_OPUS;
        if (lower.includes("haiku"))
            return HAIKU_3_5_VERSION.test(lower) ? ANTHROPIC_HAIKU_3_5 : ANTHROPIC_HAIKU;
        return ZERO;
    }
    if (cli === "codex") {
        if (lower.includes("gpt-5") || lower.includes("o3"))
            return OPENAI_GPT5;
        return ZERO;
    }
    return ZERO;
}
87
/**
 * Estimate USD saved by `cacheReadTokens` being served from cache instead
 * of fresh input.
 *
 * @param cli             Which CLI produced the request.
 * @param model           Model name, forwarded to `getPricing`.
 * @param cacheReadTokens Number of input tokens served from cache.
 * @returns Estimated savings in USD. Returns 0 for zero, negative, missing
 *          or non-finite token counts, and for models with unknown pricing,
 *          so one bad row cannot turn an aggregate sum into NaN.
 */
export function estimateCacheSavingsUsd(cli, model, cacheReadTokens) {
    // Coerce first so numeric strings keep working, then reject NaN and
    // ±Infinity: a plain `<= 0` check lets NaN/undefined through.
    const tokens = Number(cacheReadTokens);
    if (!Number.isFinite(tokens) || tokens <= 0)
        return 0;
    const p = getPricing(cli, model);
    if (p.inputUsd === 0)
        return 0;
    // Savings = (fresh-input-cost) - (cache-read-cost) = inputUsd × (1 - mult)
    const savedPerToken = (p.inputUsd * (1 - p.cacheReadMultiplier)) / 1_000_000;
    return tokens * savedPerToken;
}
@@ -0,0 +1,38 @@
1
+ import { z } from "zod";
2
+ export interface PromptParts {
3
+ system?: string;
4
+ tools?: string;
5
+ context?: string;
6
+ task: string;
7
+ }
8
+ export declare const PromptPartsSchema: z.ZodObject<{
9
+ system: z.ZodOptional<z.ZodString>;
10
+ tools: z.ZodOptional<z.ZodString>;
11
+ context: z.ZodOptional<z.ZodString>;
12
+ task: z.ZodString;
13
+ }, "strip", z.ZodTypeAny, {
14
+ task: string;
15
+ system?: string | undefined;
16
+ tools?: string | undefined;
17
+ context?: string | undefined;
18
+ }, {
19
+ task: string;
20
+ system?: string | undefined;
21
+ tools?: string | undefined;
22
+ context?: string | undefined;
23
+ }>;
24
+ export interface AssembleResult {
25
+ text: string;
26
+ stableByteEnd: number;
27
+ }
28
+ export declare function assemble(parts: PromptParts): AssembleResult;
29
+ export interface ResolvedPromptInput {
30
+ assembledPrompt: string;
31
+ stablePrefixHash: string | null;
32
+ stablePrefixTokens: number | null;
33
+ }
34
+ export interface ResolvePromptInputArgs {
35
+ prompt?: string;
36
+ promptParts?: PromptParts;
37
+ }
38
+ export declare function resolvePromptInput(input: ResolvePromptInputArgs): ResolvedPromptInput;
@@ -0,0 +1,42 @@
1
+ import { createHash } from "crypto";
2
+ import { z } from "zod";
3
+ export const PromptPartsSchema = z.object({
4
+ system: z.string().optional(),
5
+ tools: z.string().optional(),
6
+ context: z.string().optional(),
7
+ task: z.string().min(1),
8
+ });
9
const SEPARATOR = "\n\n";
/**
 * Build the full prompt text from its parts.
 *
 * The non-empty `system`, `tools` and `context` segments (in that order)
 * form the stable prefix, joined by a blank line. The `task` is appended
 * after one more separator, or used on its own when there is no prefix.
 *
 * @param parts Prompt parts; only `task` is required.
 * @returns `text` is the assembled prompt; `stableByteEnd` is the UTF-8 byte
 *          length of the stable prefix (0 when there is none), not counting
 *          the separator that precedes the task.
 */
export function assemble(parts) {
    const stableText = [parts.system, parts.tools, parts.context]
        .filter((segment) => segment && segment.length > 0)
        .join(SEPARATOR);
    const text = stableText.length === 0
        ? parts.task
        : `${stableText}${SEPARATOR}${parts.task}`;
    return { text, stableByteEnd: Buffer.byteLength(stableText, "utf8") };
}
23
/**
 * Turn either a raw `prompt` or structured `promptParts` into the final
 * prompt text plus stable-prefix metadata.
 *
 * When `promptParts` is provided it takes precedence over `prompt`: the
 * parts are assembled, the stable prefix bytes are hashed with SHA-256, and
 * the prefix token count is estimated as ceil(bytes / 4). An empty prefix
 * yields the SHA-256 of empty input and 0 tokens.
 *
 * When only `prompt` (or nothing) is given, the prompt is passed through
 * (defaulting to "") and both prefix fields are null.
 */
export function resolvePromptInput(input) {
    const parts = input.promptParts;
    if (parts === undefined) {
        return {
            assembledPrompt: input.prompt ?? "",
            stablePrefixHash: null,
            stablePrefixTokens: null,
        };
    }
    const { text, stableByteEnd } = assemble(parts);
    // The stable prefix is a leading substring of `text`, so the first
    // `stableByteEnd` bytes of its UTF-8 encoding are exactly the prefix.
    const prefixBytes = Buffer.from(text, "utf8").subarray(0, stableByteEnd);
    const digest = createHash("sha256").update(prefixBytes).digest("hex");
    return {
        assembledPrompt: text,
        stablePrefixHash: digest,
        stablePrefixTokens: Math.ceil(stableByteEnd / 4),
    };
}
@@ -1,5 +1,8 @@
1
1
  import { ISessionManager } from "./session-manager.js";
2
2
  import { PerformanceMetrics } from "./metrics.js";
3
+ import { FlightRecorderQuery } from "./flight-recorder.js";
4
+ import { type GlobalCacheStats, type PrefixCacheStats, type SessionCacheStats } from "./cache-stats.js";
5
+ import type { CacheAwarenessConfig } from "./config.js";
3
6
  export interface ResourceDefinition {
4
7
  uri: string;
5
8
  name: string;
@@ -20,7 +23,35 @@ export interface ResourceContents {
20
23
  export declare class ResourceProvider {
21
24
  private sessionManager;
22
25
  private performanceMetrics;
23
- constructor(sessionManager: ISessionManager, performanceMetrics: PerformanceMetrics);
26
+ private flightRecorder;
27
+ private cacheAwareness;
28
+ constructor(sessionManager: ISessionManager, performanceMetrics: PerformanceMetrics, flightRecorder?: FlightRecorderQuery, cacheAwareness?: CacheAwarenessConfig | null);
29
+ /** Read-only flight-recorder accessor for cache-state resource readers. */
30
+ getFlightRecorderQuery(): FlightRecorderQuery;
31
+ /**
32
+ * cache_state://global — aggregates across the entire flight recorder.
33
+ * Optionally restrict to a recent window via `lastNHours`. Returns
34
+ * tokens/hashes/aggregates ONLY — no prompt text fields. The redaction is
35
+ * structural: the response shape (GlobalCacheStats) has no `prompt`,
36
+ * `response`, `system`, or `task` field by construction.
37
+ */
38
+ readCacheStateGlobal(opts?: {
39
+ lastNHours?: number;
40
+ }): GlobalCacheStats;
41
+ /**
42
+ * cache_state://session/{sessionId} — per-session aggregates. Returns
43
+ * empty defaults when the session has no rows. Token/hash fields only.
44
+ *
45
+ * Slice 3: populates `ttlRemainingMs` by applying the configured TTL
46
+ * policy. Null for non-claude sessions. When the gateway has no
47
+ * cache-awareness config loaded, the default 5-min (300 s) TTL applies.
48
+ */
49
+ readCacheStateSession(sessionId: string): SessionCacheStats;
50
+ /**
51
+ * cache_state://prefix/{hash} — per-stable-prefix-hash aggregates.
52
+ * Returns empty defaults for unknown hashes. Token/hash fields only.
53
+ */
54
+ readCacheStateForPrefix(stablePrefixHash: string): PrefixCacheStats;
24
55
  listResources(): ResourceDefinition[];
25
56
  readResource(uri: string): Promise<ResourceContents | null>;
26
57
  }
package/dist/resources.js CHANGED
@@ -1,10 +1,61 @@
1
1
  import { getAvailableCliInfo } from "./model-registry.js";
2
+ import { computeGlobalCacheStats, computePrefixCacheStats, computeSessionCacheStats, computeTtlRemaining, } from "./cache-stats.js";
2
3
  export class ResourceProvider {
3
4
  sessionManager;
4
5
  performanceMetrics;
5
- constructor(sessionManager, performanceMetrics) {
6
+ flightRecorder;
7
+ cacheAwareness;
8
+ constructor(sessionManager, performanceMetrics,
9
+ // Optional read access to the flight recorder. Used by cache-state
10
+ // resources (slice 2). Falls back to a stub returning [] when not
11
+ // injected so existing call sites continue to work without changes.
12
+ flightRecorder = { queryRequests: () => [] },
13
+ // Slice 3: optional cache-awareness config. When present, drives the
14
+ // TTL policy applied to ttlRemainingMs on session-scoped reads.
15
+ // When absent, the default Anthropic 5-min TTL applies (matches the
16
+ // 1.x default of `[cache_awareness].anthropic_ttl_seconds = 300`).
17
+ cacheAwareness = null) {
6
18
  this.sessionManager = sessionManager;
7
19
  this.performanceMetrics = performanceMetrics;
20
+ this.flightRecorder = flightRecorder;
21
+ this.cacheAwareness = cacheAwareness;
22
+ }
23
+ /** Read-only flight-recorder accessor for cache-state resource readers. */
24
+ getFlightRecorderQuery() {
25
+ return this.flightRecorder;
26
+ }
27
+ /**
28
+ * cache_state://global — aggregates across the entire flight recorder.
29
+ * Optionally restrict to a recent window via `lastNHours`. Returns
30
+ * tokens/hashes/aggregates ONLY — no prompt text fields. The redaction is
31
+ * structural: the response shape (GlobalCacheStats) has no `prompt`,
32
+ * `response`, `system`, or `task` field by construction.
33
+ */
34
+ readCacheStateGlobal(opts = {}) {
35
+ return computeGlobalCacheStats(this.flightRecorder, opts);
36
+ }
37
+ /**
38
+ * cache_state://session/{sessionId} — per-session aggregates. Returns
39
+ * empty defaults when the session has no rows. Token/hash fields only.
40
+ *
41
+ * Slice 3: populates `ttlRemainingMs` by applying the configured TTL
42
+ * policy. Null for non-claude sessions. When the gateway has no
43
+ * cache-awareness config loaded, the default 5-min (300 s) TTL applies.
44
+ */
45
+ readCacheStateSession(sessionId) {
46
+ const stats = computeSessionCacheStats(this.flightRecorder, sessionId);
47
+ const ttlSeconds = this.cacheAwareness?.anthropicTtlSeconds ?? 300;
48
+ stats.ttlRemainingMs = computeTtlRemaining(stats, stats.cli, {
49
+ anthropicTtlSeconds: ttlSeconds,
50
+ });
51
+ return stats;
52
+ }
53
+ /**
54
+ * cache_state://prefix/{hash} — per-stable-prefix-hash aggregates.
55
+ * Returns empty defaults for unknown hashes. Token/hash fields only.
56
+ */
57
+ readCacheStateForPrefix(stablePrefixHash) {
58
+ return computePrefixCacheStats(this.flightRecorder, stablePrefixHash);
8
59
  }
9
60
  // List all available resources
10
61
  listResources() {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "llm-cli-gateway",
3
- "version": "1.5.35",
3
+ "version": "1.6.1",
4
4
  "mcpName": "io.github.verivus-oss/llm-cli-gateway",
5
5
  "description": "MCP server providing unified access to Claude Code, Codex, Gemini, Grok, and Mistral Vibe CLIs with session management, retry logic, async job orchestration, durable job results, and cross-LLM validation.",
6
6
  "license": "MIT",
@@ -109,6 +109,7 @@
109
109
  "@vitest/coverage-v8": "^4.1.2",
110
110
  "eslint": "^8.57.1",
111
111
  "eslint-config-prettier": "^9.0.0",
112
+ "eslint-plugin-security": "^3.0.1",
112
113
  "ioredis": "5.9.2",
113
114
  "pg": "^8.12.0",
114
115
  "prettier": "^3.0.0",
@@ -14,6 +14,7 @@
14
14
  "providers",
15
15
  "endpoint_exposure",
16
16
  "client_config",
17
+ "cache_awareness",
17
18
  "next_actions"
18
19
  ],
19
20
  "properties": {
@@ -263,6 +264,44 @@
263
264
  },
264
265
  "additionalProperties": false
265
266
  },
267
+ "cache_awareness": {
268
+ "type": "object",
269
+ "required": ["enabled_features", "last_24h", "per_cli"],
270
+ "properties": {
271
+ "enabled_features": {
272
+ "type": "array",
273
+ "items": {
274
+ "type": "string",
275
+ "enum": ["anthropic_cache_control", "ttl_warnings"]
276
+ }
277
+ },
278
+ "last_24h": {
279
+ "type": "object",
280
+ "required": ["hit_rate", "total_hits", "total_requests", "estimated_savings_usd"],
281
+ "properties": {
282
+ "hit_rate": { "type": "number" },
283
+ "total_hits": { "type": "integer" },
284
+ "total_requests": { "type": "integer" },
285
+ "estimated_savings_usd": { "type": "number" }
286
+ },
287
+ "additionalProperties": false
288
+ },
289
+ "per_cli": {
290
+ "type": "object",
291
+ "additionalProperties": {
292
+ "type": "object",
293
+ "required": ["hit_rate", "total_hits", "total_cache_read_tokens"],
294
+ "properties": {
295
+ "hit_rate": { "type": "number" },
296
+ "total_hits": { "type": "integer" },
297
+ "total_cache_read_tokens": { "type": "integer" }
298
+ },
299
+ "additionalProperties": false
300
+ }
301
+ }
302
+ },
303
+ "additionalProperties": false
304
+ },
266
305
  "next_actions": {
267
306
  "type": "array",
268
307
  "items": { "type": "string" }
package/socket.yml CHANGED
@@ -14,6 +14,25 @@ version: 2
14
14
  # src/endpoint-exposure.ts also issues a HEAD probe when verifying
15
15
  # tunnel reachability — opt-in via the start:http entry point only.
16
16
  #
17
+ # Additionally, Socket may flag `dist/index.js` and `dist/job-store.js`
18
+ # against the `globalThis["fetch"]` rule. This is a substring-match
19
+ # false positive (verified for v1.6.0 by sub-agent investigation on
20
+ # 2026-05-26; same matches exist in v1.5.35). Neither file contains
21
+ # any `fetch(`, `globalThis.fetch`, polyfill import, or any other
22
+ # network-call construct. The matches are:
23
+ # - dist/index.js — the English word "fetch" inside an async-defer
24
+ # error message ("Poll with llm_job_status, fetch with
25
+ # llm_job_result.") AND the JSON field name `fetchWith:
26
+ # "llm_job_result"` (part of the deferred-job response contract).
27
+ # - dist/job-store.js — the word "fetch" inside a code comment on
28
+ # markOrphanedOnStartup() describing how callers retrieve partial
29
+ # output from SQLite.
30
+ # Verify with: `grep -rEn "\bfetch\(|globalThis\.fetch|globalThis\[" dist/`
31
+ # — returns empty. Production code does not import undici / node-fetch
32
+ # / axios / got. The cache-awareness slice (v1.6.0) introduced zero
33
+ # new network surfaces; all I/O is filesystem (SQLite, sessions.json)
34
+ # or in-process.
35
+ #
17
36
  # shellAccess
18
37
  # src/executor.ts uses child_process.spawn(cmd, args, { ... }) with a
19
38
  # fixed allow-list of CLI binaries (claude / codex / gemini / grok /
@@ -33,6 +52,16 @@ version: 2
33
52
  # gateway does not call db.pragma() from production code; SQLite setup
34
53
  # uses fixed literal db.exec("PRAGMA ...") statements, and the release
35
54
  # security audit fails future production `.pragma()` calls.
55
+ #
56
+ # ioredis obfuscated code / base64 strings
57
+ # Socket may flag ioredis@5.10.1 built/constants/TLSProfiles.js because it
58
+ # contains base64-looking strings. This is a reviewed false positive: the
59
+ # strings are PEM-encoded Redis Cloud TLS CA certificates. The file exports
60
+ # static TLS profile data only; it contains no decoder loop, dynamic eval,
61
+ # network call, or hidden execution path. The same file is byte-for-byte
62
+ # identical in ioredis@5.9.2. ioredis is not installed by the default
63
+ # production dependency tree; it is an optional peer for PostgreSQL/Redis
64
+ # session storage and a pinned dev dependency for tests.
36
65
 
37
66
  issueRules:
38
67
  # Defaults from Socket. Listed explicitly so future contributors see what