llm-cli-gateway 1.13.1 → 1.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +199 -0
- package/dist/async-job-manager.d.ts +15 -1
- package/dist/async-job-manager.js +25 -6
- package/dist/cache-stats.d.ts +26 -0
- package/dist/cache-stats.js +45 -2
- package/dist/executor.d.ts +8 -0
- package/dist/executor.js +7 -2
- package/dist/flight-recorder.d.ts +7 -0
- package/dist/flight-recorder.js +27 -2
- package/dist/index.d.ts +25 -1
- package/dist/index.js +172 -22
- package/dist/prompt-parts.d.ts +74 -0
- package/dist/prompt-parts.js +47 -0
- package/dist/upstream-contracts.d.ts +8 -1
- package/dist/upstream-contracts.js +58 -1
- package/package.json +2 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,205 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to the llm-cli-gateway project.
|
|
4
4
|
|
|
5
|
+
## [1.14.0] - 2026-05-28 — Phase 4 slice κ (Claude explicit `cache_control` via `--input-format stream-json`)
|
|
6
|
+
|
|
7
|
+
Ships the ninth Phase 4 slice. Callers can now opt their stable
|
|
8
|
+
`promptParts` blocks into Anthropic's explicit `cache_control`
|
|
9
|
+
breakpoints — the gateway switches from positional `-p <prompt>` to
|
|
10
|
+
`claude -p --input-format stream-json` and pipes a JSON content-blocks
|
|
11
|
+
payload via stdin. Smoke-test against a live 1-hour-cache-enabled
|
|
12
|
+
account observed a **15,511-token shift from `cache_creation` to
|
|
13
|
+
`cache_read` on the second call, 82 % cost drop, 36 % latency drop**.
|
|
14
|
+
|
|
15
|
+
Seven recommendation commits land alongside the feature (default
|
|
16
|
+
`outputFormat`, auto-emit-from-config, observability split, warning,
|
|
17
|
+
schema mutex, smoke-script gate, tool description) plus three
|
|
18
|
+
falsifiability-tightening commits driven by the multi-LLM review gate.
|
|
19
|
+
|
|
20
|
+
### Added — slice κ feature
|
|
21
|
+
|
|
22
|
+
- **`PromptParts.cacheControl`** (`src/prompt-parts.ts`): per-block
|
|
23
|
+
boolean opt-in (`system?`/`tools?`/`context?`) with strict Zod
|
|
24
|
+
schema. The `task` field is intentionally never markable — it's the
|
|
25
|
+
volatile tail. Setting any flag activates the κ emission path.
|
|
26
|
+
- **`assembleClaudeCacheBlocks(parts)`** helper (`src/prompt-parts.ts`):
|
|
27
|
+
builds the `{type:"user",message:{role:"user",content:[…]}}` payload
|
|
28
|
+
in `system → tools → context → task` order. Each marked non-empty
|
|
29
|
+
block gets `cache_control: {type:"ephemeral", ttl:"1h"}`. Empty
|
|
30
|
+
parts are silently skipped; markers on empty parts are a no-op.
|
|
31
|
+
- **`prepareClaudeRequest` κ branch** (`src/index.ts`): when the
|
|
32
|
+
caller marks any block AND requests `outputFormat: "stream-json"`,
|
|
33
|
+
argv switches to `-p --input-format stream-json --output-format
|
|
34
|
+
stream-json --include-partial-messages --verbose` with NO positional
|
|
35
|
+
prompt; the prep result carries `stdinPayload` + `cacheControlBlocks`.
|
|
36
|
+
Mixing `cacheControl` with `text`/`json` output returns an
|
|
37
|
+
actionable error instead of silently coercing.
|
|
38
|
+
- **`-p` arity widened** to a new `"optional"` (`src/upstream-contracts.ts`):
|
|
39
|
+
consumes the next token as a value iff it does not start with `-`.
|
|
40
|
+
Preserves the legacy `-p <prompt>` positional form AND validates the
|
|
41
|
+
κ `-p` standalone form. New `--input-format` flag registered with
|
|
42
|
+
`values: ["text","stream-json"]`. New conformance fixture
|
|
43
|
+
`claude-input-format-stream-json` pins the exact κ argv combo.
|
|
44
|
+
- **Executor + AsyncJobManager stdin** (`src/executor.ts`,
|
|
45
|
+
`src/async-job-manager.ts`): both gain `stdin?: string` options.
|
|
46
|
+
When set, stdio[0] switches from `"ignore"` to `"pipe"` and the
|
|
47
|
+
payload is written. The stdin payload participates in the
|
|
48
|
+
AsyncJobManager dedup key — two requests with identical argv but
|
|
49
|
+
different cache_control payloads cannot collide.
|
|
50
|
+
- **Flight recorder migration v4** (`src/flight-recorder.ts`):
|
|
51
|
+
`cache_control_blocks INTEGER` column added idempotently;
|
|
52
|
+
`FlightLogStart.cacheControlBlocks?` persists the per-request
|
|
53
|
+
marker count for cache_state aggregates.
|
|
54
|
+
|
|
55
|
+
### Added — seven recommendations (rec #1..#7)
|
|
56
|
+
|
|
57
|
+
- **Rec #1** — `claude_request` + `claude_request_async` default
|
|
58
|
+
`outputFormat` changes from `"text"` to `"stream-json"`. The gateway
|
|
59
|
+
already parses NDJSON usage events; the prior default routed every
|
|
60
|
+
call through unparseable text, leaving 1,078 historic FR rows with
|
|
61
|
+
NULL tokens. Override to `"text"` still works for callers that
|
|
62
|
+
truly want raw stdout (loses observability).
|
|
63
|
+
- **Rec #2** — `[cache_awareness].emit_anthropic_cache_control`
|
|
64
|
+
config flag is now wired. When enabled AND the caller passes a
|
|
65
|
+
`promptParts` whose stable prefix exceeds the per-model threshold
|
|
66
|
+
(`minStableTokensForModel`), the gateway auto-marks the rightmost
|
|
67
|
+
non-empty stable block (context → tools → system priority) with
|
|
68
|
+
`ttl: "1h"`. Skipped when `optimizePrompt: true` (rec #5 desync
|
|
69
|
+
risk) or `outputFormat !== "stream-json"`.
|
|
70
|
+
- **Rec #3** — `GlobalCacheStats` (`src/cache-stats.ts`) gains five
|
|
71
|
+
derived metrics that distinguish κ-explicit hits from Claude Code's
|
|
72
|
+
baseline cache reads in the same flight-recorder window:
|
|
73
|
+
`explicitCacheControlRows`, `explicitCacheControlHits`,
|
|
74
|
+
`explicitCacheControlHitRate`, `stablePrefixReuseCount`,
|
|
75
|
+
`avgCacheCreationAfterFirstCall` (averaged over rows AFTER the
|
|
76
|
+
first-by-datetime in each stable-prefix reuse group).
|
|
77
|
+
- **Rec #4** — new structured warning `cacheable_prefix_uncached`
|
|
78
|
+
(`src/index.ts`): fires when `promptParts`' stable prefix is above
|
|
79
|
+
the per-model threshold but no `cache_control` breakpoint will be
|
|
80
|
+
emitted (caller didn't set it AND auto-emit also didn't fire). The
|
|
81
|
+
warning includes the measured `stablePrefixTokens`, `threshold`,
|
|
82
|
+
and `reason` (outputFormat-not-streamjson / config-off /
|
|
83
|
+
no-eligible-block). Threaded through both Claude handlers.
|
|
84
|
+
- **Rec #5** — `prepareClaudeRequest` refuses `optimizePrompt: true`
|
|
85
|
+
combined with `promptParts.cacheControl` (`src/index.ts:1455`)
|
|
86
|
+
before optimization runs. Without this mutex the FR `prompt` column
|
|
87
|
+
would log optimized text while Claude actually received raw
|
|
88
|
+
promptParts blocks via stdin, breaking prefix-cache reuse on the
|
|
89
|
+
next call. Actionable error message points the caller at the
|
|
90
|
+
combination to drop.
|
|
91
|
+
- **Rec #6** — new `npm run smoke:cache-control` script
|
|
92
|
+
(`package.json`). Runs `docs/plans/slice-kappa-smoke-test.mjs`,
|
|
93
|
+
which gates on `SMOKE_CACHE_CONTROL=1` env var with a "BILLABLE
|
|
94
|
+
TEST" banner so accidental invocation in CI does not burn live
|
|
95
|
+
Anthropic credit (~$0.08 per run).
|
|
96
|
+
- **Rec #7** — both Claude tools' `promptParts` descriptions now
|
|
97
|
+
explicitly document the `cacheControl` opt-in, the
|
|
98
|
+
`outputFormat: "stream-json"` requirement, the `ttl='1h'`
|
|
99
|
+
hard-code, and the "task is the volatile tail" convention.
|
|
100
|
+
|
|
101
|
+
### Tests + multi-LLM review gate
|
|
102
|
+
|
|
103
|
+
`886 → 940` tests pass. 54 new tests: 40 across the `Kα/Kβ/Kγ/Kδ/Kε/Kζ`
|
|
104
|
+
regression sets + 13 falsifiability-gap closures + 1 SQL-drop
|
|
105
|
+
falsifier strengthening. Every new test is mutation-probe-verified:
|
|
106
|
+
the targeted regression goes red on the predicted mutation.
|
|
107
|
+
|
|
108
|
+
The branch passed a strict-evidence multi-LLM review gate per the
|
|
109
|
+
project's standing protocol (`feedback_multi_llm_review_gate.md` and
|
|
110
|
+
`feedback_test_veracity_audit_protocol.md`). Round 3 was sequential
|
|
111
|
+
to avoid concurrent gateway contention; all four reviewers — Codex
|
|
112
|
+
(`gpt-5.4`), Grok (`grok-build`), Mistral (`mistral-medium-3.5`),
|
|
113
|
+
Claude (`sonnet-4-6`) — issued **UNCONDITIONAL APPROVE** against the
|
|
114
|
+
head with file:line citations and executed mutation probes. The
|
|
115
|
+
iteration trail (Codex round-3 REJECT → fix → recheck APPROVE; Grok
|
|
116
|
+
round-3 REJECT → fix → recheck APPROVE; Mistral + Claude first-pass
|
|
117
|
+
APPROVE) is preserved in commit history (`bea1aee` and `bbc3b5f`).
|
|
118
|
+
|
|
119
|
+
### Caller-honest framing
|
|
120
|
+
|
|
121
|
+
- κ adds caller-side reuse ON TOP of the irreducible ~10–12K
|
|
122
|
+
`cache_creation` token floor that every fresh `claude -p` session
|
|
123
|
+
rebuilds (Claude Code's session-wrap content). The *added* benefit
|
|
124
|
+
scales with the caller's stable block size, not the total prompt.
|
|
125
|
+
- The `ttl='1h'` hard-code is mandatory because Anthropic rejects a
|
|
126
|
+
`5m` block after Claude Code's own 1h-marked session blocks; the
|
|
127
|
+
gateway warns if `[cache_awareness].anthropic_ttl_seconds` says 300.
|
|
128
|
+
- Recommended migration: callers running batch / orchestration /
|
|
129
|
+
repeated similar prompts should opt in; callers running one-shot
|
|
130
|
+
ad-hoc prompts won't see benefit.
|
|
131
|
+
|
|
132
|
+
### Files
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
src/prompt-parts.ts — PromptParts.cacheControl + assembleClaudeCacheBlocks
|
|
136
|
+
src/index.ts — prepareClaudeRequest κ branch + rec #1/#2/#4/#5/#7 + handler threading
|
|
137
|
+
src/upstream-contracts.ts — arity "optional", --input-format, claude-input-format-stream-json fixture
|
|
138
|
+
src/executor.ts — ExecuteOptions.stdin? threading
|
|
139
|
+
src/async-job-manager.ts — stdin? + dedup-key + cacheControlBlocks plumbing
|
|
140
|
+
src/flight-recorder.ts — migration v4 + cache_control_blocks column
|
|
141
|
+
src/cache-stats.ts — GlobalCacheStats 5 new derived metrics
|
|
142
|
+
package.json — smoke:cache-control script
|
|
143
|
+
docs/plans/slice-kappa.spec.md — audit spec
|
|
144
|
+
docs/plans/slice-kappa-final-review.spec.md — round-3 review spec
|
|
145
|
+
docs/plans/slice-kappa-captures/ — live smoke evidence
|
|
146
|
+
docs/plans/slice-kappa-smoke-test.mjs — billable smoke script (SMOKE_CACHE_CONTROL gated)
|
|
147
|
+
src/__tests__/test-veracity-regressions-slice-kappa.test.ts — 40 κ regressions (Kα/Kβ/Kγ/Kδ/Kε/Kζ)
|
|
148
|
+
src/__tests__/cache-stats.test.ts — +7 rec #3 + SQL-drop falsifier tests
|
|
149
|
+
src/__tests__/prompt-parts-tool-wiring.test.ts — +5 B1/B2/D1/D2 schema falsifiers
|
|
150
|
+
src/__tests__/smoke-script-gate.test.ts — 2 I2 subprocess tests
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## [1.13.2] - 2026-05-27 — Claude stream-json regression fix (--verbose now required)
|
|
154
|
+
|
|
155
|
+
Patch release. Single user-facing fix to `claude_request` /
|
|
156
|
+
`claude_request_async` when called with `outputFormat: "stream-json"`.
|
|
157
|
+
|
|
158
|
+
### Fixed
|
|
159
|
+
|
|
160
|
+
- Claude CLI 2.x rejects `--print --output-format=stream-json` without
|
|
161
|
+
`--verbose` ("When using --print, --output-format=stream-json requires
|
|
162
|
+
--verbose"). The gateway was emitting `--output-format stream-json
|
|
163
|
+
--include-partial-messages` without `--verbose`, so every claude
|
|
164
|
+
request configured for stream-json (sync or async) was exiting 1.
|
|
165
|
+
- `prepareClaudeRequest` now pushes `--verbose` as part of the
|
|
166
|
+
stream-json arg group. `--verbose` only affects what claude writes to
|
|
167
|
+
stderr; the stream-json stdout payload is unchanged, so the existing
|
|
168
|
+
NDJSON parser in `src/stream-json-parser.ts` needs no changes.
|
|
169
|
+
- This was the practical reason the flight recorder's
|
|
170
|
+
`cache_read_tokens` / `cache_creation_tokens` columns stayed NULL for
|
|
171
|
+
claude rows — token capture is gated on a successful stream-json run.
|
|
172
|
+
With this fix, callers who opt into `outputFormat: "stream-json"` get
|
|
173
|
+
Anthropic cache_read_input_tokens / cache_creation_input_tokens
|
|
174
|
+
recorded in the FR for the first time since the CLI started enforcing
|
|
175
|
+
`--verbose`.
|
|
176
|
+
- Direct CLI verification: `claude -p ... --output-format stream-json
|
|
177
|
+
--verbose --include-partial-messages` returned a clean NDJSON stream
|
|
178
|
+
with `cache_read_input_tokens: 17978` and
|
|
179
|
+
`cache_creation_input_tokens: 17435` on a 1-hour-cache-enabled
|
|
180
|
+
account. The parser path is correct; only the missing flag was
|
|
181
|
+
blocking it.
|
|
182
|
+
|
|
183
|
+
### Tests
|
|
184
|
+
|
|
185
|
+
- New regression: `prepareClaudeRequest` emits `--verbose` when
|
|
186
|
+
`outputFormat: "stream-json"` and does NOT emit it for `text` / `json`
|
|
187
|
+
(src/__tests__/claude-handler.test.ts).
|
|
188
|
+
- Updated `upstream-contracts.test.ts` "accepts a valid Claude argv
|
|
189
|
+
emitted by the gateway" to pin the three-flag combo so a future
|
|
190
|
+
removal of `--verbose` fails at the contract gate.
|
|
191
|
+
- New conformance fixture `claude-stream-json-requires-verbose` in
|
|
192
|
+
`src/upstream-contracts.ts` registering `--verbose` and asserting the
|
|
193
|
+
combo is accepted.
|
|
194
|
+
- 886 tests pass (884 prior + 2 new). Build clean.
|
|
195
|
+
|
|
196
|
+
### Why a patch release
|
|
197
|
+
|
|
198
|
+
The regression silently broke a documented MCP API surface; users
|
|
199
|
+
explicitly opting into stream-json (for token observability or
|
|
200
|
+
upcoming cache_control work in slice κ) were getting exit-1 errors
|
|
201
|
+
with no obvious gateway-side cause. Same shape as v1.13.1 (single
|
|
202
|
+
focused fix, no behaviour change for callers using `text` / `json`).
|
|
203
|
+
|
|
5
204
|
## [1.13.1] - 2026-05-27 — Installer Windows build fix (no code changes)
|
|
6
205
|
|
|
7
206
|
Patch release. **No changes to the gateway, MCP tools, or any provider
|
|
package/dist/async-job-manager.d.ts
CHANGED
|
@@ -16,6 +16,13 @@ export interface AsyncJobFlightRecorderEntry {
|
|
|
16
16
|
sessionId?: string;
|
|
17
17
|
stablePrefixHash?: string;
|
|
18
18
|
stablePrefixTokens?: number;
|
|
19
|
+
/**
|
|
20
|
+
* Slice κ: count of caller-supplied prompt-parts content blocks the
|
|
21
|
+
* gateway emitted with explicit Anthropic `cache_control` markers
|
|
22
|
+
* (ttl='1h'). Only set for Claude requests that opt into κ; left
|
|
23
|
+
* undefined elsewhere so legacy rows stay NULL.
|
|
24
|
+
*/
|
|
25
|
+
cacheControlBlocks?: number;
|
|
19
26
|
}
|
|
20
27
|
/**
|
|
21
28
|
* Slice 1.5 usage-extraction callback. Closures MUST be constructed from
|
|
@@ -66,6 +73,13 @@ export interface StartJobOptions {
|
|
|
66
73
|
* therefore do NOT collide on dedup.
|
|
67
74
|
*/
|
|
68
75
|
env?: Record<string, string>;
|
|
76
|
+
/**
|
|
77
|
+
* Slice κ: optional UTF-8 payload to pipe into the child's stdin.
|
|
78
|
+
* Participates in the dedup key — two requests with identical argv
|
|
79
|
+
* but different stdin do NOT collide. When set, stdio[0] is "pipe";
|
|
80
|
+
* when unset, stdio[0] stays "ignore" (regression-protected).
|
|
81
|
+
*/
|
|
82
|
+
stdin?: string;
|
|
69
83
|
/**
|
|
70
84
|
* Optional hook fired exactly once when the job reaches a terminal state.
|
|
71
85
|
* Used by callers that own per-request resources (outputSchema temp files,
|
|
@@ -168,7 +182,7 @@ export declare class AsyncJobManager {
|
|
|
168
182
|
* Existing callers keep working unchanged; forceRefresh is exposed as a trailing
|
|
169
183
|
* optional param for the dedup-aware path.
|
|
170
184
|
*/
|
|
171
|
-
startJob(cli: LlmCli, args: string[], correlationId: string, cwd?: string, idleTimeoutMs?: number, outputFormat?: string, forceRefresh?: boolean, env?: Record<string, string>, onComplete?: () => void, flightRecorderEntry?: AsyncJobFlightRecorderEntry, extractUsage?: AsyncJobUsageExtractor, writeFlightStart?: boolean): AsyncJobSnapshot;
|
|
185
|
+
startJob(cli: LlmCli, args: string[], correlationId: string, cwd?: string, idleTimeoutMs?: number, outputFormat?: string, forceRefresh?: boolean, env?: Record<string, string>, onComplete?: () => void, flightRecorderEntry?: AsyncJobFlightRecorderEntry, extractUsage?: AsyncJobUsageExtractor, writeFlightStart?: boolean, stdin?: string): AsyncJobSnapshot;
|
|
172
186
|
/**
|
|
173
187
|
* Start a job, with optional dedup against recent identical requests.
|
|
174
188
|
* Returns `{ snapshot, deduped }` so callers can log/report the short-circuit.
|
|
package/dist/async-job-manager.js
CHANGED
|
@@ -207,8 +207,16 @@ export class AsyncJobManager {
|
|
|
207
207
|
* (sorted keys → JSON-stringified). This prevents two Mistral requests with the
|
|
208
208
|
* same argv but different `VIBE_ACTIVE_MODEL` from deduping onto each other.
|
|
209
209
|
*/
|
|
210
|
-
buildRequestKey(cli, args, env) {
|
|
211
|
-
|
|
210
|
+
buildRequestKey(cli, args, env, stdin) {
|
|
211
|
+
// Slice κ: stdin participates in the dedup key. Two Claude requests
|
|
212
|
+
// with identical argv but different cache_control content blocks
|
|
213
|
+
// would otherwise collide on dedup and the second caller would get
|
|
214
|
+
// the wrong response. The legacy "no stdin" code path passes
|
|
215
|
+
// stdin=undefined, which serialises to the same empty marker the
|
|
216
|
+
// previous version emitted — non-κ dedup is unchanged.
|
|
217
|
+
const extraEnv = canonicaliseEnvForKey(env);
|
|
218
|
+
const extra = stdin === undefined ? extraEnv : `${extraEnv}|stdin:${stdin}`;
|
|
219
|
+
return computeRequestKey(cli, args, extra);
|
|
212
220
|
}
|
|
213
221
|
fireOnComplete(job) {
|
|
214
222
|
if (job.onCompleteFired)
|
|
@@ -417,13 +425,14 @@ export class AsyncJobManager {
|
|
|
417
425
|
* Existing callers keep working unchanged; forceRefresh is exposed as a trailing
|
|
418
426
|
* optional param for the dedup-aware path.
|
|
419
427
|
*/
|
|
420
|
-
startJob(cli, args, correlationId, cwd, idleTimeoutMs, outputFormat, forceRefresh, env, onComplete, flightRecorderEntry, extractUsage, writeFlightStart) {
|
|
428
|
+
startJob(cli, args, correlationId, cwd, idleTimeoutMs, outputFormat, forceRefresh, env, onComplete, flightRecorderEntry, extractUsage, writeFlightStart, stdin) {
|
|
421
429
|
return this.startJobWithDedup(cli, args, correlationId, {
|
|
422
430
|
cwd,
|
|
423
431
|
idleTimeoutMs,
|
|
424
432
|
outputFormat,
|
|
425
433
|
forceRefresh,
|
|
426
434
|
env,
|
|
435
|
+
stdin,
|
|
427
436
|
onComplete,
|
|
428
437
|
flightRecorderEntry,
|
|
429
438
|
extractUsage,
|
|
@@ -439,8 +448,8 @@ export class AsyncJobManager {
|
|
|
439
448
|
* is returned without spawning a new process. forceRefresh skips dedup entirely.
|
|
440
449
|
*/
|
|
441
450
|
startJobWithDedup(cli, args, correlationId, opts = {}) {
|
|
442
|
-
const { cwd, idleTimeoutMs, outputFormat, forceRefresh, env: extraEnv, onComplete, flightRecorderEntry, extractUsage, writeFlightStart, } = opts;
|
|
443
|
-
const requestKey = this.buildRequestKey(cli, args, extraEnv);
|
|
451
|
+
const { cwd, idleTimeoutMs, outputFormat, forceRefresh, env: extraEnv, stdin, onComplete, flightRecorderEntry, extractUsage, writeFlightStart, } = opts;
|
|
452
|
+
const requestKey = this.buildRequestKey(cli, args, extraEnv, stdin);
|
|
444
453
|
if (!forceRefresh && this.store) {
|
|
445
454
|
try {
|
|
446
455
|
const existing = this.store.findByRequestKey(requestKey);
|
|
@@ -489,9 +498,18 @@ export class AsyncJobManager {
|
|
|
489
498
|
const baseEnv = envWithExtendedPath(process.env, getExtendedPath());
|
|
490
499
|
const child = spawnCliProcess(command, args, {
|
|
491
500
|
cwd,
|
|
492
|
-
stdio: ["ignore", "pipe", "pipe"],
|
|
501
|
+
stdio: stdin === undefined ? ["ignore", "pipe", "pipe"] : ["pipe", "pipe", "pipe"],
|
|
493
502
|
env: { ...baseEnv, ...(extraEnv ?? {}) },
|
|
494
503
|
});
|
|
504
|
+
if (stdin !== undefined && child.stdin) {
|
|
505
|
+
try {
|
|
506
|
+
child.stdin.write(stdin);
|
|
507
|
+
}
|
|
508
|
+
catch (err) {
|
|
509
|
+
this.logger.error(`Job ${id} failed to write stdin payload`, err);
|
|
510
|
+
}
|
|
511
|
+
child.stdin.end();
|
|
512
|
+
}
|
|
495
513
|
// Single cleanup flag to prevent double-unregister
|
|
496
514
|
let groupCleaned = false;
|
|
497
515
|
const cleanupGroup = () => {
|
|
@@ -560,6 +578,7 @@ export class AsyncJobManager {
|
|
|
560
578
|
asyncJobId: id,
|
|
561
579
|
stablePrefixHash: flightRecorderEntry.stablePrefixHash,
|
|
562
580
|
stablePrefixTokens: flightRecorderEntry.stablePrefixTokens,
|
|
581
|
+
cacheControlBlocks: flightRecorderEntry.cacheControlBlocks,
|
|
563
582
|
});
|
|
564
583
|
}
|
|
565
584
|
catch (err) {
|
package/dist/cache-stats.d.ts
CHANGED
|
@@ -76,6 +76,32 @@ export interface GlobalCacheStats {
|
|
|
76
76
|
estimatedSavingsUsd: number;
|
|
77
77
|
}>;
|
|
78
78
|
estimatedSavingsUsd: number;
|
|
79
|
+
/**
|
|
80
|
+
* Rec #3 (slice κ): derived metrics that distinguish gateway-driven
|
|
81
|
+
* κ-explicit `cache_control` breakpoints from Claude Code's
|
|
82
|
+
* own baseline cache reads.
|
|
83
|
+
*
|
|
84
|
+
* - explicitCacheControlRows: rows where the gateway emitted at
|
|
85
|
+
* least one `cache_control` marker (`cache_control_blocks > 0`).
|
|
86
|
+
* - explicitCacheControlHits: those rows whose `cache_read_tokens
|
|
87
|
+
* > 0` — closest signal we have to "the caller's marked block
|
|
88
|
+
* actually hit Anthropic's cache" (still includes Claude Code's
|
|
89
|
+
* baseline cache reads on top, which is unavoidable without
|
|
90
|
+
* per-block token accounting from Anthropic).
|
|
91
|
+
* - explicitCacheControlHitRate: ratio explicit hits / explicit rows.
|
|
92
|
+
* - stablePrefixReuseCount: distinct `stable_prefix_hash` values
|
|
93
|
+
* that appear in >1 row in-window (i.e. real reuse opportunities).
|
|
94
|
+
* - avgCacheCreationAfterFirstCall: averaged across stable-prefix
|
|
95
|
+
* reuse groups, the cache_creation_tokens on rows AFTER the
|
|
96
|
+
* first-by-datetime in each group. Drops sharply when caller
|
|
97
|
+
* blocks are reused; stays high when Claude Code's session-wrap
|
|
98
|
+
* floor dominates.
|
|
99
|
+
*/
|
|
100
|
+
explicitCacheControlRows: number;
|
|
101
|
+
explicitCacheControlHits: number;
|
|
102
|
+
explicitCacheControlHitRate: number;
|
|
103
|
+
stablePrefixReuseCount: number;
|
|
104
|
+
avgCacheCreationAfterFirstCall: number | null;
|
|
79
105
|
}
|
|
80
106
|
export declare function computeSessionCacheStats(db: FlightRecorderQuery, sessionId: string): SessionCacheStats;
|
|
81
107
|
export interface TtlPolicy {
|
package/dist/cache-stats.js
CHANGED
|
@@ -159,14 +159,16 @@ export function computeGlobalCacheStats(db, opts = {}) {
|
|
|
159
159
|
COALESCE(cache_read_tokens, 0) AS cache_read_tokens,
|
|
160
160
|
COALESCE(cache_creation_tokens, 0) AS cache_creation_tokens,
|
|
161
161
|
stable_prefix_hash,
|
|
162
|
-
datetime_utc
|
|
162
|
+
datetime_utc,
|
|
163
|
+
cache_control_blocks
|
|
163
164
|
FROM requests
|
|
164
165
|
WHERE datetime_utc >= ?`
|
|
165
166
|
: `SELECT cli, model,
|
|
166
167
|
COALESCE(cache_read_tokens, 0) AS cache_read_tokens,
|
|
167
168
|
COALESCE(cache_creation_tokens, 0) AS cache_creation_tokens,
|
|
168
169
|
stable_prefix_hash,
|
|
169
|
-
datetime_utc
|
|
170
|
+
datetime_utc,
|
|
171
|
+
cache_control_blocks
|
|
170
172
|
FROM requests`;
|
|
171
173
|
const rows = sinceIso ? db.queryRequests(sql, sinceIso) : db.queryRequests(sql);
|
|
172
174
|
const perCliMap = new Map();
|
|
@@ -175,6 +177,17 @@ export function computeGlobalCacheStats(db, opts = {}) {
|
|
|
175
177
|
let totalRead = 0;
|
|
176
178
|
let totalCreation = 0;
|
|
177
179
|
let totalSavings = 0;
|
|
180
|
+
// Rec #3: κ-explicit metrics. A row is "κ-explicit" iff it has
|
|
181
|
+
// `cache_control_blocks > 0` — i.e. the gateway emitted at least one
|
|
182
|
+
// caller-supplied `cache_control` marker. Rows with NULL or 0 are
|
|
183
|
+
// either pre-v4 or non-κ Claude / non-Claude requests.
|
|
184
|
+
let explicitRows = 0;
|
|
185
|
+
let explicitHits = 0;
|
|
186
|
+
// Per-prefix reuse tracking: collect cache_creation_tokens for every
|
|
187
|
+
// row keyed by stable_prefix_hash, ordered ascending by datetime_utc.
|
|
188
|
+
// For each group with >1 row, drop the first (the cache-write call)
|
|
189
|
+
// and average the rest (the cache-read calls).
|
|
190
|
+
const perPrefix = new Map();
|
|
178
191
|
for (const row of rows) {
|
|
179
192
|
totalRequests += 1;
|
|
180
193
|
const reads = safeNum(row.cache_read_tokens);
|
|
@@ -183,6 +196,17 @@ export function computeGlobalCacheStats(db, opts = {}) {
|
|
|
183
196
|
totalCreation += creation;
|
|
184
197
|
if (reads > 0)
|
|
185
198
|
totalHits += 1;
|
|
199
|
+
const ccBlocks = safeNum(row.cache_control_blocks);
|
|
200
|
+
if (ccBlocks > 0) {
|
|
201
|
+
explicitRows += 1;
|
|
202
|
+
if (reads > 0)
|
|
203
|
+
explicitHits += 1;
|
|
204
|
+
}
|
|
205
|
+
if (row.stable_prefix_hash) {
|
|
206
|
+
const arr = perPrefix.get(row.stable_prefix_hash) ?? [];
|
|
207
|
+
arr.push({ datetime_utc: row.datetime_utc, cache_creation_tokens: creation });
|
|
208
|
+
perPrefix.set(row.stable_prefix_hash, arr);
|
|
209
|
+
}
|
|
186
210
|
if (!isCacheStatsCli(row.cli))
|
|
187
211
|
continue;
|
|
188
212
|
const cli = row.cli;
|
|
@@ -203,6 +227,20 @@ export function computeGlobalCacheStats(db, opts = {}) {
|
|
|
203
227
|
agg.estimatedSavingsUsd += savings;
|
|
204
228
|
perCliMap.set(cli, agg);
|
|
205
229
|
}
|
|
230
|
+
let stablePrefixReuseCount = 0;
|
|
231
|
+
let creationAfterFirstSum = 0;
|
|
232
|
+
let creationAfterFirstCount = 0;
|
|
233
|
+
for (const arr of perPrefix.values()) {
|
|
234
|
+
if (arr.length <= 1)
|
|
235
|
+
continue;
|
|
236
|
+
stablePrefixReuseCount += 1;
|
|
237
|
+
arr.sort((a, b) => a.datetime_utc < b.datetime_utc ? -1 : a.datetime_utc > b.datetime_utc ? 1 : 0);
|
|
238
|
+
for (let i = 1; i < arr.length; i++) {
|
|
239
|
+
creationAfterFirstSum += arr[i].cache_creation_tokens;
|
|
240
|
+
creationAfterFirstCount += 1;
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
const avgCacheCreationAfterFirstCall = creationAfterFirstCount > 0 ? creationAfterFirstSum / creationAfterFirstCount : null;
|
|
206
244
|
const perCli = Array.from(perCliMap.entries()).map(([cli, agg]) => ({
|
|
207
245
|
cli,
|
|
208
246
|
requestCount: agg.requestCount,
|
|
@@ -221,5 +259,10 @@ export function computeGlobalCacheStats(db, opts = {}) {
|
|
|
221
259
|
totalCacheCreationTokens: totalCreation,
|
|
222
260
|
perCli,
|
|
223
261
|
estimatedSavingsUsd: totalSavings,
|
|
262
|
+
explicitCacheControlRows: explicitRows,
|
|
263
|
+
explicitCacheControlHits: explicitHits,
|
|
264
|
+
explicitCacheControlHitRate: explicitRows > 0 ? explicitHits / explicitRows : 0,
|
|
265
|
+
stablePrefixReuseCount,
|
|
266
|
+
avgCacheCreationAfterFirstCall,
|
|
224
267
|
};
|
|
225
268
|
}
|
package/dist/executor.d.ts
CHANGED
|
@@ -7,6 +7,14 @@ export interface ExecuteOptions {
|
|
|
7
7
|
logger?: Logger;
|
|
8
8
|
/** Extra environment variables to inject; merged after PATH. */
|
|
9
9
|
env?: NodeJS.ProcessEnv;
|
|
10
|
+
/**
|
|
11
|
+
* Slice κ: optional UTF-8 payload to write to the child's stdin
|
|
12
|
+
* immediately after spawn. When provided, stdio for stdin switches
|
|
13
|
+
* from "ignore" to "pipe" so the CLI can read the payload (used by
|
|
14
|
+
* `claude --input-format stream-json`). Undefined preserves the
|
|
15
|
+
* legacy stdio:["ignore","pipe","pipe"] shape.
|
|
16
|
+
*/
|
|
17
|
+
stdin?: string;
|
|
10
18
|
}
|
|
11
19
|
export interface ExecuteResult {
|
|
12
20
|
stdout: string;
|
package/dist/executor.js
CHANGED
|
@@ -296,16 +296,21 @@ export function spawnCliProcess(command, args, options) {
|
|
|
296
296
|
return proc;
|
|
297
297
|
}
|
|
298
298
|
export async function executeCli(command, args, options = {}) {
|
|
299
|
-
const { timeout, idleTimeout, cwd, env: extraEnv } = options;
|
|
299
|
+
const { timeout, idleTimeout, cwd, env: extraEnv, stdin } = options;
|
|
300
300
|
const extendedPath = getExtendedPath();
|
|
301
301
|
const baseEnv = envWithExtendedPath(process.env, extendedPath);
|
|
302
302
|
const circuitBreaker = getCircuitBreaker(command);
|
|
303
303
|
const runOnce = () => new Promise((resolve, reject) => {
|
|
304
|
+
const stdio = stdin === undefined ? ["ignore", "pipe", "pipe"] : ["pipe", "pipe", "pipe"];
|
|
304
305
|
const proc = spawnCliProcess(command, args, {
|
|
305
306
|
cwd,
|
|
306
|
-
stdio
|
|
307
|
+
stdio,
|
|
307
308
|
env: { ...baseEnv, ...(extraEnv ?? {}) },
|
|
308
309
|
});
|
|
310
|
+
if (stdin !== undefined && proc.stdin) {
|
|
311
|
+
proc.stdin.write(stdin);
|
|
312
|
+
proc.stdin.end();
|
|
313
|
+
}
|
|
309
314
|
let stdout = "";
|
|
310
315
|
let stderr = "";
|
|
311
316
|
let timedOut = false;
|
|
package/dist/flight-recorder.d.ts
CHANGED
|
@@ -8,6 +8,13 @@ export interface FlightLogStart {
|
|
|
8
8
|
asyncJobId?: string;
|
|
9
9
|
stablePrefixHash?: string;
|
|
10
10
|
stablePrefixTokens?: number;
|
|
11
|
+
/**
|
|
12
|
+
* Slice κ: number of caller-supplied prompt-parts content blocks
|
|
13
|
+
* that the gateway emitted with an explicit `cache_control`
|
|
14
|
+
* breakpoint on this request. Undefined for non-κ requests (stored as NULL),
|
|
15
|
+
* including pre-κ rows after a v4 migration of a legacy DB.
|
|
16
|
+
*/
|
|
17
|
+
cacheControlBlocks?: number;
|
|
11
18
|
}
|
|
12
19
|
export interface FlightLogResult {
|
|
13
20
|
response: string;
|
package/dist/flight-recorder.js
CHANGED
|
@@ -55,6 +55,20 @@ function ensureStablePrefixColumns(db) {
|
|
|
55
55
|
}
|
|
56
56
|
db.exec("CREATE INDEX IF NOT EXISTS idx_requests_stable_hash ON requests(stable_prefix_hash)");
|
|
57
57
|
}
|
|
58
|
+
/**
|
|
59
|
+
* Idempotent v4 migration (slice κ): add `cache_control_blocks` column
|
|
60
|
+
* to the `requests` table. Counts the caller-supplied content blocks
|
|
61
|
+
* the gateway emitted with an explicit Anthropic `cache_control`
|
|
62
|
+
* marker. Pre-κ rows keep NULL; only κ-opt-in callers ever set the
|
|
63
|
+
* column to a non-NULL integer.
|
|
64
|
+
*/
|
|
65
|
+
function ensureCacheControlBlocksColumn(db) {
|
|
66
|
+
const rows = db.prepare("PRAGMA table_info(requests)").all?.() ?? [];
|
|
67
|
+
const names = new Set(rows.map((row) => (row && typeof row.name === "string" ? row.name : "")));
|
|
68
|
+
if (!names.has("cache_control_blocks")) {
|
|
69
|
+
db.exec("ALTER TABLE requests ADD COLUMN cache_control_blocks INTEGER");
|
|
70
|
+
}
|
|
71
|
+
}
|
|
58
72
|
export function resolveFlightRecorderDbPath() {
|
|
59
73
|
const configured = process.env.LLM_GATEWAY_LOGS_DB;
|
|
60
74
|
if (configured !== undefined) {
|
|
@@ -176,6 +190,14 @@ export class FlightRecorder {
|
|
|
176
190
|
this.db
|
|
177
191
|
.prepare("INSERT OR IGNORE INTO _migrations(version, applied_at) VALUES(3, ?)")
|
|
178
192
|
.run(new Date().toISOString());
|
|
193
|
+
// Migration v4: cache_control_blocks (slice κ). Pre-κ rows keep NULL;
|
|
194
|
+
// only κ-opt-in writes populate this. Aggregates in cache-stats /
|
|
195
|
+
// MCP resources can use this to separate explicit κ hits from
|
|
196
|
+
// implicit prefix-cache hits.
|
|
197
|
+
ensureCacheControlBlocksColumn(this.db);
|
|
198
|
+
this.db
|
|
199
|
+
.prepare("INSERT OR IGNORE INTO _migrations(version, applied_at) VALUES(4, ?)")
|
|
200
|
+
.run(new Date().toISOString());
|
|
179
201
|
if (process.platform !== "win32") {
|
|
180
202
|
try {
|
|
181
203
|
chmodSync(dbPath, 0o600);
|
|
@@ -186,9 +208,11 @@ export class FlightRecorder {
|
|
|
186
208
|
}
|
|
187
209
|
const insertRequest = this.db.prepare(`
|
|
188
210
|
INSERT INTO requests (id, cli, model, prompt, system, session_id, datetime_utc,
|
|
189
|
-
stable_prefix_hash, stable_prefix_tokens
|
|
211
|
+
stable_prefix_hash, stable_prefix_tokens,
|
|
212
|
+
cache_control_blocks)
|
|
190
213
|
VALUES (@id, @cli, @model, @prompt, @system, @session_id, @datetime_utc,
|
|
191
|
-
@stable_prefix_hash, @stable_prefix_tokens
|
|
214
|
+
@stable_prefix_hash, @stable_prefix_tokens,
|
|
215
|
+
@cache_control_blocks)
|
|
192
216
|
`);
|
|
193
217
|
const insertMetadata = this.db.prepare(`
|
|
194
218
|
INSERT INTO gateway_metadata (request_id, async_job_id, status)
|
|
@@ -205,6 +229,7 @@ export class FlightRecorder {
|
|
|
205
229
|
datetime_utc: new Date().toISOString(),
|
|
206
230
|
stable_prefix_hash: entry.stablePrefixHash ?? null,
|
|
207
231
|
stable_prefix_tokens: entry.stablePrefixTokens ?? null,
|
|
232
|
+
cache_control_blocks: entry.cacheControlBlocks ?? null,
|
|
208
233
|
});
|
|
209
234
|
insertMetadata.run({
|
|
210
235
|
request_id: entry.correlationId,
|
package/dist/index.d.ts
CHANGED
|
@@ -82,7 +82,7 @@ export interface GatewayServerDeps {
|
|
|
82
82
|
persistence?: PersistenceConfig;
|
|
83
83
|
cacheAwareness?: CacheAwarenessConfig;
|
|
84
84
|
}
|
|
85
|
-
interface GatewayServerRuntime {
|
|
85
|
+
export interface GatewayServerRuntime {
|
|
86
86
|
sessionManager: ISessionManager;
|
|
87
87
|
resourceProvider: ResourceProvider;
|
|
88
88
|
db: DatabaseConnection | null;
|
|
@@ -94,6 +94,9 @@ interface GatewayServerRuntime {
|
|
|
94
94
|
persistence: PersistenceConfig;
|
|
95
95
|
cacheAwareness: CacheAwarenessConfig;
|
|
96
96
|
}
|
|
97
|
+
export declare function resolveGatewayServerRuntime(deps?: GatewayServerDeps, options?: {
|
|
98
|
+
isolateState?: boolean;
|
|
99
|
+
}): GatewayServerRuntime;
|
|
97
100
|
export declare function extractUsageAndCost(cli: "claude" | "codex" | "gemini" | "grok" | "mistral", output: string, outputFormat?: string,
|
|
98
101
|
/**
|
|
99
102
|
* Optional context for off-stdout telemetry sources. Today only Mistral
|
|
@@ -129,6 +132,27 @@ interface CliRequestPrep {
|
|
|
129
132
|
stablePrefixHash: string | null;
|
|
130
133
|
/** Heuristic token count (bytes/4) of the same stable prefix. */
|
|
131
134
|
stablePrefixTokens: number | null;
|
|
135
|
+
/**
|
|
136
|
+
* Slice κ (Claude only): JSON stream-json payload to feed on stdin
|
|
137
|
+
* when the gateway emits `-p --input-format stream-json`. Undefined
|
|
138
|
+
* when the caller did not opt into Anthropic `cache_control`
|
|
139
|
+
* breakpoints. Non-κ providers always leave this undefined.
|
|
140
|
+
*/
|
|
141
|
+
stdinPayload?: string;
|
|
142
|
+
/**
|
|
143
|
+
* Slice κ (Claude only): number of caller-supplied content blocks
|
|
144
|
+
* that carry an explicit `cache_control` marker. Threaded into the
|
|
145
|
+
* flight recorder so `cache_state` aggregates can distinguish
|
|
146
|
+
* κ-explicit breakpoints from implicit prefix-cache hits.
|
|
147
|
+
*/
|
|
148
|
+
cacheControlBlocks?: number;
|
|
149
|
+
/**
|
|
150
|
+
* Rec #4: structured warnings produced during prep (e.g. cacheable
|
|
151
|
+
* stable prefix without cacheControl). Handlers merge these with any
|
|
152
|
+
* other warnings (cache_ttl_expiring_soon, etc.) before returning to
|
|
153
|
+
* the caller.
|
|
154
|
+
*/
|
|
155
|
+
warnings?: WarningEntry[];
|
|
132
156
|
}
|
|
133
157
|
export declare function prepareClaudeRequest(params: {
|
|
134
158
|
prompt?: string;
|
package/dist/index.js
CHANGED
|
@@ -16,7 +16,7 @@ import { createSessionManager } from "./session-manager.js";
|
|
|
16
16
|
import { ResourceProvider } from "./resources.js";
|
|
17
17
|
import { PerformanceMetrics } from "./metrics.js";
|
|
18
18
|
import { estimateTokens, optimizePrompt as optimizePromptText, optimizeResponse as optimizeResponseText, } from "./optimizer.js";
|
|
19
|
-
import { loadConfig, loadPersistenceConfig, loadCacheAwarenessConfig, } from "./config.js";
|
|
19
|
+
import { loadConfig, loadPersistenceConfig, loadCacheAwarenessConfig, minStableTokensForModel, } from "./config.js";
|
|
20
20
|
import { checkHealth } from "./health.js";
|
|
21
21
|
import { clearModelRegistryCache, getAvailableCliInfo, getCliInfo, resolveModelAlias, } from "./model-registry.js";
|
|
22
22
|
import { AsyncJobManager, } from "./async-job-manager.js";
|
|
@@ -26,7 +26,7 @@ import { checkReviewIntegrity } from "./review-integrity.js";
|
|
|
26
26
|
import { buildClaudeMcpConfig, CLAUDE_MCP_SERVER_NAMES, } from "./claude-mcp-config.js";
|
|
27
27
|
import { resolveGrokSessionArgs, resolveMistralSessionArgs, resolveCodexSessionArgs, sanitizeCliArgValues, prepareMistralRequest as buildMistralCliInvocation, MISTRAL_AGENT_MODES, GATEWAY_SESSION_PREFIX, resolveClaudePermissionFlags, resolveCodexSandboxFlags, CLAUDE_PERMISSION_MODES, GEMINI_APPROVAL_MODES, CODEX_SANDBOX_MODES, CODEX_ASK_FOR_APPROVAL_MODES, CLAUDE_EFFORT_LEVELS, prepareClaudeHighImpactFlags, validateClaudeAgentsMap, prepareCodexHighImpactFlags, prepareCodexForkRequest, CODEX_CONFIG_OVERRIDES_SCHEMA, prepareGeminiHighImpactFlags, prependGeminiAttachments, resolveGeminiSessionPlan, GEMINI_HIGH_IMPACT_PARAMS_SCHEMA, } from "./request-helpers.js";
|
|
28
28
|
import { createFlightRecorder } from "./flight-recorder.js";
|
|
29
|
-
import { resolvePromptInput, PromptPartsSchema } from "./prompt-parts.js";
|
|
29
|
+
import { resolvePromptInput, PromptPartsSchema, assembleClaudeCacheBlocks, } from "./prompt-parts.js";
|
|
30
30
|
import { computeSessionCacheStats, computeTtlRemaining } from "./cache-stats.js";
|
|
31
31
|
import { getCliVersions, runCliUpgrade } from "./cli-updater.js";
|
|
32
32
|
import { startHttpGateway } from "./http-transport.js";
|
|
@@ -253,7 +253,7 @@ export const SESSION_PROVIDER_VALUES = ["claude", "codex", "gemini", "grok", "mi
|
|
|
253
253
|
export const SESSION_PROVIDER_ENUM = z.enum(SESSION_PROVIDER_VALUES);
|
|
254
254
|
let activeServer = null;
|
|
255
255
|
let activeHttpGateway = null;
|
|
256
|
-
function resolveGatewayServerRuntime(deps = {}, options = {}) {
|
|
256
|
+
export function resolveGatewayServerRuntime(deps = {}, options = {}) {
|
|
257
257
|
const runtimeLogger = deps.logger ?? logger;
|
|
258
258
|
const runtimeSessionManager = deps.sessionManager ?? sessionManager;
|
|
259
259
|
const runtimePerformanceMetrics = deps.performanceMetrics ??
|
|
@@ -316,7 +316,14 @@ async function awaitJobOrDefer(cli, args, corrId, idleTimeoutMs, outputFormat, f
|
|
|
316
316
|
* `writeFlightStart` is NEVER true on this path: the sync handler is
|
|
317
317
|
* always the upstream logStart writer.
|
|
318
318
|
*/
|
|
319
|
-
flightRecorderEntry, extractUsage
|
|
319
|
+
flightRecorderEntry, extractUsage,
|
|
320
|
+
/**
|
|
321
|
+
* Slice κ: optional stdin payload piped to the child CLI. Currently
|
|
322
|
+
* only Claude's `--input-format stream-json` path sets this. Threaded
|
|
323
|
+
* through both the direct-execute fallback (SYNC_DEADLINE_MS===0) and
|
|
324
|
+
* the AsyncJobManager spawn path, and participates in the dedup key.
|
|
325
|
+
*/
|
|
326
|
+
stdin) {
|
|
320
327
|
// U26 fix: ownership of onComplete is a contract. Once this function returns
|
|
321
328
|
// OR throws, the caller MUST consider onComplete consumed — i.e. it has
|
|
322
329
|
// either been run, or the AsyncJobManager has taken ownership of it. The
|
|
@@ -350,6 +357,7 @@ flightRecorderEntry, extractUsage) {
|
|
|
350
357
|
idleTimeout: idleTimeoutMs,
|
|
351
358
|
logger: runtime.logger,
|
|
352
359
|
env: env ? { ...process.env, ...env } : undefined,
|
|
360
|
+
stdin,
|
|
353
361
|
});
|
|
354
362
|
}
|
|
355
363
|
finally {
|
|
@@ -365,6 +373,7 @@ flightRecorderEntry, extractUsage) {
|
|
|
365
373
|
outputFormat,
|
|
366
374
|
forceRefresh,
|
|
367
375
|
env,
|
|
376
|
+
stdin,
|
|
368
377
|
onComplete,
|
|
369
378
|
// Sync-deferred path: the upstream sync handler already wrote
|
|
370
379
|
// logStart for this corrId, so writeFlightStart stays false. The
|
|
@@ -575,6 +584,7 @@ function buildAsyncFlightRecorderHandoff(cliName, prep, sessionId, outputFormat)
|
|
|
575
584
|
sessionId,
|
|
576
585
|
stablePrefixHash: prep.stablePrefixHash ?? undefined,
|
|
577
586
|
stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
|
|
587
|
+
cacheControlBlocks: prep.cacheControlBlocks,
|
|
578
588
|
},
|
|
579
589
|
extractUsage: (stdout) => extractUsageAndCost(cli, stdout, fmt, { sessionId: sid, home }),
|
|
580
590
|
};
|
|
@@ -919,6 +929,19 @@ export function prepareClaudeRequest(params, runtime = resolveGatewayServerRunti
|
|
|
919
929
|
score: reviewIntegrity.totalScore,
|
|
920
930
|
});
|
|
921
931
|
}
|
|
932
|
+
// Rec #5 (slice κ): refuse the optimizePrompt + cacheControl combo
|
|
933
|
+
// before running optimization. Optimization rewrites the assembled
|
|
934
|
+
// prompt text the flight-recorder logs, but the κ stdin payload is
|
|
935
|
+
// built from raw `promptParts` content blocks — letting both run
|
|
936
|
+
// produces an FR row whose `prompt` no longer matches what Claude
|
|
937
|
+
// actually received, AND any optimization-driven text change would
|
|
938
|
+
// silently break Anthropic prefix-cache reuse on the next call.
|
|
939
|
+
const ccEarly = params.promptParts?.cacheControl;
|
|
940
|
+
const cacheControlRequestedEarly = !!(ccEarly &&
|
|
941
|
+
(ccEarly.system || ccEarly.tools || ccEarly.context));
|
|
942
|
+
if (params.optimizePrompt && cacheControlRequestedEarly) {
|
|
943
|
+
return createErrorResponse(params.operation, 1, "", corrId, new Error("optimizePrompt is incompatible with promptParts.cacheControl (slice κ): optimization rewrites the assembled prompt text the flight recorder logs, while the cache_control payload is built from raw promptParts; the two would desync and break Anthropic prefix-cache reuse. Disable optimizePrompt when opting into cacheControl."));
|
|
944
|
+
}
|
|
922
945
|
let effectivePrompt = assembledPrompt;
|
|
923
946
|
if (params.optimizePrompt) {
|
|
924
947
|
const optimized = optimizePromptText(effectivePrompt);
|
|
@@ -950,14 +973,127 @@ export function prepareClaudeRequest(params, runtime = resolveGatewayServerRunti
|
|
|
950
973
|
return createApprovalDeniedResponse(params.operation, approvalDecision);
|
|
951
974
|
}
|
|
952
975
|
}
|
|
953
|
-
|
|
976
|
+
// Rec #2 (slice κ): auto-emit `cache_control` when the caller passes
|
|
977
|
+
// `promptParts` whose stable prefix exceeds the per-model minimum,
|
|
978
|
+
// the caller has NOT explicitly set `cacheControl`, the gateway
|
|
979
|
+
// config has opted in (`[cache_awareness].emit_anthropic_cache_control`),
|
|
980
|
+
// and outputFormat is stream-json. Auto-emit marks the LAST non-empty
|
|
981
|
+
// stable block (context → tools → system priority — the rightmost
|
|
982
|
+
// stable block covers the widest prefix). Skipped when optimizePrompt
|
|
983
|
+
// is on (same rec #5 desync risk).
|
|
984
|
+
//
|
|
985
|
+
// The 1h ttl is forced regardless of `anthropic_ttl_seconds`: 5m
|
|
986
|
+
// breakpoints from caller content are rejected by Anthropic once
|
|
987
|
+
// Claude Code's own 1h-marked session-wrap blocks land ahead of them.
|
|
988
|
+
let autoEmittedCacheControlBlock = null;
|
|
989
|
+
if (!cacheControlRequestedEarly &&
|
|
990
|
+
runtime.cacheAwareness.emitAnthropicCacheControl &&
|
|
991
|
+
!params.optimizePrompt &&
|
|
992
|
+
params.outputFormat === "stream-json" &&
|
|
993
|
+
params.promptParts &&
|
|
994
|
+
stablePrefixTokens !== null) {
|
|
995
|
+
const threshold = minStableTokensForModel(runtime.cacheAwareness, resolvedModel ?? "default");
|
|
996
|
+
if (stablePrefixTokens >= threshold) {
|
|
997
|
+
const pp = params.promptParts;
|
|
998
|
+
// Rightmost non-empty stable block — its cache_control breakpoint
|
|
999
|
+
// covers everything above it in the message (the API matches
|
|
1000
|
+
// breakpoints in order).
|
|
1001
|
+
if (pp.context && pp.context.length > 0)
|
|
1002
|
+
autoEmittedCacheControlBlock = "context";
|
|
1003
|
+
else if (pp.tools && pp.tools.length > 0)
|
|
1004
|
+
autoEmittedCacheControlBlock = "tools";
|
|
1005
|
+
else if (pp.system && pp.system.length > 0)
|
|
1006
|
+
autoEmittedCacheControlBlock = "system";
|
|
1007
|
+
if (autoEmittedCacheControlBlock !== null) {
|
|
1008
|
+
runtime.logger.info(`[${corrId}] auto-emitting cache_control on '${autoEmittedCacheControlBlock}' (stablePrefixTokens=${stablePrefixTokens} >= ${threshold} for model='${resolvedModel ?? "default"}')`);
|
|
1009
|
+
if (runtime.cacheAwareness.anthropicTtlSeconds !== 3600) {
|
|
1010
|
+
runtime.logger.warn(`[${corrId}] [cache_awareness].anthropic_ttl_seconds=${runtime.cacheAwareness.anthropicTtlSeconds} ignored for Claude CLI path — Anthropic rejects 5m blocks after Claude Code's 1h-marked session-wrap content; using ttl='1h'.`);
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
}
|
|
1015
|
+
// Rec #4: warn when promptParts has a cacheable stable prefix but no
|
|
1016
|
+
// cache_control breakpoint is being emitted (neither explicit nor
|
|
1017
|
+
// auto). Either the caller forgot to set `cacheControl` or
|
|
1018
|
+
// `[cache_awareness].emit_anthropic_cache_control` is off — both
|
|
1019
|
+
// leave the stable prefix bytes unreused across calls, defeating the
|
|
1020
|
+
// point of using `promptParts`.
|
|
1021
|
+
const warnings = [];
|
|
1022
|
+
if (!cacheControlRequestedEarly &&
|
|
1023
|
+
autoEmittedCacheControlBlock === null &&
|
|
1024
|
+
params.promptParts &&
|
|
1025
|
+
stablePrefixTokens !== null) {
|
|
1026
|
+
const threshold = minStableTokensForModel(runtime.cacheAwareness, resolvedModel ?? "default");
|
|
1027
|
+
if (stablePrefixTokens >= threshold) {
|
|
1028
|
+
const reason = params.outputFormat !== "stream-json"
|
|
1029
|
+
? "outputFormat is not 'stream-json'"
|
|
1030
|
+
: !runtime.cacheAwareness.emitAnthropicCacheControl
|
|
1031
|
+
? "[cache_awareness].emit_anthropic_cache_control is false"
|
|
1032
|
+
: "no eligible non-empty stable block";
|
|
1033
|
+
warnings.push({
|
|
1034
|
+
code: "cacheable_prefix_uncached",
|
|
1035
|
+
message: `Stable prefix is cacheable (${stablePrefixTokens} tokens >= ${threshold} for model='${resolvedModel ?? "default"}') but no cache_control breakpoint will be emitted (${reason}). Set promptParts.cacheControl explicitly, switch outputFormat to 'stream-json', or enable [cache_awareness].emit_anthropic_cache_control.`,
|
|
1036
|
+
stablePrefixTokens,
|
|
1037
|
+
threshold,
|
|
1038
|
+
reason,
|
|
1039
|
+
});
|
|
1040
|
+
}
|
|
1041
|
+
}
|
|
1042
|
+
// Slice κ: switch from the legacy positional `-p <prompt>` emission
|
|
1043
|
+
// to `claude -p --input-format stream-json` and feed a JSON
|
|
1044
|
+
// content-blocks payload via stdin. Non-κ callers (no cacheControl,
|
|
1045
|
+
// or cacheControl with all flags false) take the existing positional
|
|
1046
|
+
// path bit-for-bit. The κ path activates on EITHER an explicit caller
|
|
1047
|
+
// opt-in (`cacheControlRequestedEarly`) OR a gateway-driven auto-emit
|
|
1048
|
+
// (`autoEmittedCacheControlBlock`).
|
|
1049
|
+
const cacheControlRequested = cacheControlRequestedEarly || autoEmittedCacheControlBlock !== null;
|
|
1050
|
+
let stdinPayload;
|
|
1051
|
+
let cacheControlBlocks;
|
|
1052
|
+
if (cacheControlRequested) {
|
|
1053
|
+
if (params.outputFormat !== "stream-json") {
|
|
1054
|
+
return createErrorResponse(params.operation, 1, "", corrId, new Error("promptParts.cacheControl requires outputFormat: 'stream-json' (slice κ pipes the cache_control blocks over --input-format stream-json; text/json output formats cannot carry the required NDJSON usage events)."));
|
|
1055
|
+
}
|
|
1056
|
+
// promptParts is non-null whenever cacheControlRequested is true
|
|
1057
|
+
// (explicit opt-in lives in PromptParts; auto-emit guard requires
|
|
1058
|
+
// promptParts to be defined).
|
|
1059
|
+
const effectiveParts = autoEmittedCacheControlBlock !== null
|
|
1060
|
+
? {
|
|
1061
|
+
...params.promptParts,
|
|
1062
|
+
cacheControl: {
|
|
1063
|
+
...(params.promptParts.cacheControl ?? {}),
|
|
1064
|
+
[autoEmittedCacheControlBlock]: true,
|
|
1065
|
+
},
|
|
1066
|
+
}
|
|
1067
|
+
: params.promptParts;
|
|
1068
|
+
const built = assembleClaudeCacheBlocks(effectiveParts);
|
|
1069
|
+
stdinPayload = `${JSON.stringify(built.payload)}\n`;
|
|
1070
|
+
cacheControlBlocks = built.markedBlockCount;
|
|
1071
|
+
}
|
|
1072
|
+
const args = cacheControlRequested
|
|
1073
|
+
? [
|
|
1074
|
+
"-p",
|
|
1075
|
+
"--input-format",
|
|
1076
|
+
"stream-json",
|
|
1077
|
+
"--output-format",
|
|
1078
|
+
"stream-json",
|
|
1079
|
+
"--include-partial-messages",
|
|
1080
|
+
"--verbose",
|
|
1081
|
+
]
|
|
1082
|
+
: ["-p", effectivePrompt];
|
|
954
1083
|
if (resolvedModel)
|
|
955
1084
|
args.push("--model", resolvedModel);
|
|
956
|
-
if (
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
1085
|
+
if (!cacheControlRequested) {
|
|
1086
|
+
if (params.outputFormat === "json") {
|
|
1087
|
+
args.push("--output-format", "json");
|
|
1088
|
+
}
|
|
1089
|
+
else if (params.outputFormat === "stream-json") {
|
|
1090
|
+
// Claude CLI 2.x rejects `--print --output-format stream-json` without
|
|
1091
|
+
// `--verbose`: "When using --print, --output-format=stream-json requires
|
|
1092
|
+
// --verbose". --verbose only affects what claude logs to stderr; the
|
|
1093
|
+
// stream-json stdout payload is unchanged, so the gateway's NDJSON
|
|
1094
|
+
// parser is unaffected.
|
|
1095
|
+
args.push("--output-format", "stream-json", "--include-partial-messages", "--verbose");
|
|
1096
|
+
}
|
|
961
1097
|
}
|
|
962
1098
|
if (params.allowedTools && params.allowedTools.length > 0) {
|
|
963
1099
|
sanitizeCliArgValues(params.allowedTools, "allowedTools");
|
|
@@ -1020,6 +1156,9 @@ export function prepareClaudeRequest(params, runtime = resolveGatewayServerRunti
|
|
|
1020
1156
|
args,
|
|
1021
1157
|
stablePrefixHash,
|
|
1022
1158
|
stablePrefixTokens,
|
|
1159
|
+
stdinPayload,
|
|
1160
|
+
cacheControlBlocks,
|
|
1161
|
+
warnings: warnings.length > 0 ? warnings : undefined,
|
|
1023
1162
|
};
|
|
1024
1163
|
}
|
|
1025
1164
|
export function prepareCodexRequest(params, runtime = resolveGatewayServerRuntime()) {
|
|
@@ -2476,15 +2615,15 @@ export function createGatewayServer(deps = {}) {
|
|
|
2476
2615
|
.max(100000, "Prompt too long (max 100k chars)")
|
|
2477
2616
|
.optional()
|
|
2478
2617
|
.describe("Prompt text for Claude (mutually exclusive with promptParts)"),
|
|
2479
|
-
promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt
|
|
2618
|
+
promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task, cacheControl? }. Use for repeated calls that share a stable prefix — `system`/`tools`/`context` are the stable head; `task` is the volatile tail (never marked). Set `cacheControl: { system?: boolean, tools?: boolean, context?: boolean }` to opt into explicit Anthropic prefix caching via `--input-format stream-json` (slice κ). Requires `outputFormat: 'stream-json'` and hard-codes `ttl='1h'` (Anthropic rejects 5m blocks after Claude Code's 1h-marked session-wrap content). Mutually exclusive with `prompt`. The stable prefix hash is logged to the flight recorder for cache_state aggregates."),
|
|
2480
2619
|
model: z
|
|
2481
2620
|
.string()
|
|
2482
2621
|
.optional()
|
|
2483
2622
|
.describe("Model name or alias (e.g. sonnet, claude-sonnet-4-5-20250929, latest)"),
|
|
2484
2623
|
outputFormat: z
|
|
2485
2624
|
.enum(["text", "json", "stream-json"])
|
|
2486
|
-
.default("
|
|
2487
|
-
.describe("Output format (text|json|stream-json). stream-json
|
|
2625
|
+
.default("stream-json")
|
|
2626
|
+
.describe("Output format (text|json|stream-json). DEFAULT: stream-json — the gateway parses NDJSON usage events to extract input/output/cache_read/cache_creation tokens + cost + model, persists them to the flight recorder for cache_state aggregates, and still returns the assistant text. Override to 'text' only when you truly want unparsed stdout (loses observability)."),
|
|
2488
2627
|
sessionId: z.string().optional().describe("Session ID (uses active if omitted)"),
|
|
2489
2628
|
continueSession: z.boolean().default(false).describe("Continue active session"),
|
|
2490
2629
|
createNewSession: z.boolean().default(false).describe("Force new session"),
|
|
@@ -2660,7 +2799,11 @@ export function createGatewayServer(deps = {}) {
|
|
|
2660
2799
|
sessionId: effectiveSessionId,
|
|
2661
2800
|
cli: "claude",
|
|
2662
2801
|
});
|
|
2663
|
-
|
|
2802
|
+
// Rec #4: include any prep-time warnings (e.g. cacheable_prefix_uncached).
|
|
2803
|
+
const warnings = [
|
|
2804
|
+
...(ttlWarning ? [ttlWarning] : []),
|
|
2805
|
+
...(prep.warnings ?? []),
|
|
2806
|
+
];
|
|
2664
2807
|
safeFlightStart({
|
|
2665
2808
|
correlationId: corrId,
|
|
2666
2809
|
cli: "claude",
|
|
@@ -2669,8 +2812,9 @@ export function createGatewayServer(deps = {}) {
|
|
|
2669
2812
|
sessionId: effectiveSessionId,
|
|
2670
2813
|
stablePrefixHash: prep.stablePrefixHash ?? undefined,
|
|
2671
2814
|
stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
|
|
2815
|
+
cacheControlBlocks: prep.cacheControlBlocks,
|
|
2672
2816
|
}, runtime);
|
|
2673
|
-
logger.info(`[${corrId}] claude_request invoked with model=${prep.resolvedModel || "default"}, outputFormat=${outputFormat}, prompt length=${prep.effectivePrompt.length}, sessionId=${effectiveSessionId}`);
|
|
2817
|
+
logger.info(`[${corrId}] claude_request invoked with model=${prep.resolvedModel || "default"}, outputFormat=${outputFormat}, prompt length=${prep.effectivePrompt.length}, sessionId=${effectiveSessionId}, cacheControlBlocks=${prep.cacheControlBlocks ?? 0}`);
|
|
2674
2818
|
try {
|
|
2675
2819
|
if (useContinue) {
|
|
2676
2820
|
args.push("--continue");
|
|
@@ -2682,7 +2826,7 @@ export function createGatewayServer(deps = {}) {
|
|
|
2682
2826
|
// Idle timeout only for stream-json (text/json produce no output until done)
|
|
2683
2827
|
const effectiveIdleTimeout = outputFormat === "stream-json" ? resolveIdleTimeout("claude", idleTimeoutMs) : undefined;
|
|
2684
2828
|
const claudeSyncFrHandoff = buildAsyncFlightRecorderHandoff("claude", prep, effectiveSessionId, outputFormat);
|
|
2685
|
-
const result = await awaitJobOrDefer("claude", args, corrId, effectiveIdleTimeout, outputFormat, forceRefresh, runtime, undefined, undefined, claudeSyncFrHandoff.flightRecorderEntry, claudeSyncFrHandoff.extractUsage);
|
|
2829
|
+
const result = await awaitJobOrDefer("claude", args, corrId, effectiveIdleTimeout, outputFormat, forceRefresh, runtime, undefined, undefined, claudeSyncFrHandoff.flightRecorderEntry, claudeSyncFrHandoff.extractUsage, prep.stdinPayload);
|
|
2686
2830
|
// Deferred — job still running, return async reference
|
|
2687
2831
|
if (isDeferredResponse(result)) {
|
|
2688
2832
|
return buildDeferredToolResponse(result, effectiveSessionId);
|
|
@@ -3481,15 +3625,15 @@ export function createGatewayServer(deps = {}) {
|
|
|
3481
3625
|
.max(100000, "Prompt too long (max 100k chars)")
|
|
3482
3626
|
.optional()
|
|
3483
3627
|
.describe("Prompt text for Claude (mutually exclusive with promptParts)"),
|
|
3484
|
-
promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt
|
|
3628
|
+
promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task, cacheControl? }. Same semantics as claude_request: stable head (system/tools/context) + volatile tail (task). Set `cacheControl: { system?, tools?, context?: boolean }` to opt into explicit Anthropic prefix caching via `--input-format stream-json` (slice κ); requires `outputFormat: 'stream-json'` and hard-codes `ttl='1h'`. Mutually exclusive with `prompt`. Stable prefix hash logged to flight recorder."),
|
|
3485
3629
|
model: z
|
|
3486
3630
|
.string()
|
|
3487
3631
|
.optional()
|
|
3488
3632
|
.describe("Model name or alias (e.g. sonnet, claude-sonnet-4-5-20250929, latest)"),
|
|
3489
3633
|
outputFormat: z
|
|
3490
3634
|
.enum(["text", "json", "stream-json"])
|
|
3491
|
-
.default("
|
|
3492
|
-
.describe("Output format (text|json|stream-json). stream-json:
|
|
3635
|
+
.default("stream-json")
|
|
3636
|
+
.describe("Output format (text|json|stream-json). DEFAULT: stream-json — same rationale as claude_request: keeps usage/cache/cost observable for cache_state aggregates. Override to 'text' only when raw stdout is required (loses observability)."),
|
|
3493
3637
|
sessionId: z.string().optional().describe("Session ID (uses active if omitted)"),
|
|
3494
3638
|
continueSession: z.boolean().default(false).describe("Continue active session"),
|
|
3495
3639
|
createNewSession: z.boolean().default(false).describe("Force new session"),
|
|
@@ -3664,7 +3808,7 @@ export function createGatewayServer(deps = {}) {
|
|
|
3664
3808
|
assertUpstreamCliArgs("claude", args);
|
|
3665
3809
|
assertUpstreamCliEnv("claude", undefined);
|
|
3666
3810
|
const claudeAsyncFrHandoff = buildAsyncFlightRecorderHandoff("claude", prep, effectiveSessionId, outputFormat);
|
|
3667
|
-
const job = asyncJobManager.startJob("claude", args, corrId, undefined, effectiveIdleTimeout, outputFormat, forceRefresh, undefined, undefined, claudeAsyncFrHandoff.flightRecorderEntry, claudeAsyncFrHandoff.extractUsage, true);
|
|
3811
|
+
const job = asyncJobManager.startJob("claude", args, corrId, undefined, effectiveIdleTimeout, outputFormat, forceRefresh, undefined, undefined, claudeAsyncFrHandoff.flightRecorderEntry, claudeAsyncFrHandoff.extractUsage, true, prep.stdinPayload);
|
|
3668
3812
|
logger.info(`[${corrId}] claude_request_async started job ${job.id}, outputFormat=${outputFormat}`);
|
|
3669
3813
|
const asyncResponse = {
|
|
3670
3814
|
success: true,
|
|
@@ -3680,8 +3824,14 @@ export function createGatewayServer(deps = {}) {
|
|
|
3680
3824
|
if (prep.reviewIntegrity && prep.reviewIntegrity.violations.length > 0) {
|
|
3681
3825
|
asyncResponse.reviewIntegrity = prep.reviewIntegrity;
|
|
3682
3826
|
}
|
|
3683
|
-
|
|
3684
|
-
|
|
3827
|
+
// Rec #4: include any prep-time warnings (e.g.
|
|
3828
|
+
// cacheable_prefix_uncached) alongside ttlWarning.
|
|
3829
|
+
const mergedWarnings = [
|
|
3830
|
+
...(ttlWarning ? [ttlWarning] : []),
|
|
3831
|
+
...(prep.warnings ?? []),
|
|
3832
|
+
];
|
|
3833
|
+
if (mergedWarnings.length > 0) {
|
|
3834
|
+
asyncResponse.warnings = mergedWarnings;
|
|
3685
3835
|
}
|
|
3686
3836
|
return {
|
|
3687
3837
|
content: [
|
package/dist/prompt-parts.d.ts
CHANGED
|
@@ -1,25 +1,68 @@
|
|
|
1
1
|
import { z } from "zod";
|
|
2
|
+
export interface PromptPartsCacheControl {
|
|
3
|
+
system?: boolean;
|
|
4
|
+
tools?: boolean;
|
|
5
|
+
context?: boolean;
|
|
6
|
+
}
|
|
2
7
|
export interface PromptParts {
|
|
3
8
|
system?: string;
|
|
4
9
|
tools?: string;
|
|
5
10
|
context?: string;
|
|
6
11
|
task: string;
|
|
12
|
+
/**
|
|
13
|
+
* Slice κ (Claude only): per-block opt-in to Anthropic `cache_control`
|
|
14
|
+
* breakpoints. Setting `system: true` (or tools/context) marks that
|
|
15
|
+
* block with `cache_control: {type:"ephemeral", ttl:"1h"}` in the
|
|
16
|
+
* stream-json payload the gateway pipes to `claude --input-format
|
|
17
|
+
* stream-json`. The `task` block is NEVER marked (it's the volatile
|
|
18
|
+
* tail). Empty parts are silently skipped even if their flag is true.
|
|
19
|
+
*
|
|
20
|
+
* Constraint: callers MUST also pass `outputFormat:"stream-json"` —
|
|
21
|
+
* mixing cacheControl with text/json output returns an error response.
|
|
22
|
+
* `ttl` is hard-coded to `"1h"` because Claude Code injects its own
|
|
23
|
+
* 1h-marked system blocks ahead of caller content and Anthropic
|
|
24
|
+
* rejects a 1h block after a 5m block.
|
|
25
|
+
*/
|
|
26
|
+
cacheControl?: PromptPartsCacheControl;
|
|
7
27
|
}
|
|
8
28
|
export declare const PromptPartsSchema: z.ZodObject<{
|
|
9
29
|
system: z.ZodOptional<z.ZodString>;
|
|
10
30
|
tools: z.ZodOptional<z.ZodString>;
|
|
11
31
|
context: z.ZodOptional<z.ZodString>;
|
|
12
32
|
task: z.ZodString;
|
|
33
|
+
cacheControl: z.ZodOptional<z.ZodObject<{
|
|
34
|
+
system: z.ZodOptional<z.ZodBoolean>;
|
|
35
|
+
tools: z.ZodOptional<z.ZodBoolean>;
|
|
36
|
+
context: z.ZodOptional<z.ZodBoolean>;
|
|
37
|
+
}, "strict", z.ZodTypeAny, {
|
|
38
|
+
system?: boolean | undefined;
|
|
39
|
+
tools?: boolean | undefined;
|
|
40
|
+
context?: boolean | undefined;
|
|
41
|
+
}, {
|
|
42
|
+
system?: boolean | undefined;
|
|
43
|
+
tools?: boolean | undefined;
|
|
44
|
+
context?: boolean | undefined;
|
|
45
|
+
}>>;
|
|
13
46
|
}, "strip", z.ZodTypeAny, {
|
|
14
47
|
task: string;
|
|
15
48
|
system?: string | undefined;
|
|
16
49
|
tools?: string | undefined;
|
|
17
50
|
context?: string | undefined;
|
|
51
|
+
cacheControl?: {
|
|
52
|
+
system?: boolean | undefined;
|
|
53
|
+
tools?: boolean | undefined;
|
|
54
|
+
context?: boolean | undefined;
|
|
55
|
+
} | undefined;
|
|
18
56
|
}, {
|
|
19
57
|
task: string;
|
|
20
58
|
system?: string | undefined;
|
|
21
59
|
tools?: string | undefined;
|
|
22
60
|
context?: string | undefined;
|
|
61
|
+
cacheControl?: {
|
|
62
|
+
system?: boolean | undefined;
|
|
63
|
+
tools?: boolean | undefined;
|
|
64
|
+
context?: boolean | undefined;
|
|
65
|
+
} | undefined;
|
|
23
66
|
}>;
|
|
24
67
|
export interface AssembleResult {
|
|
25
68
|
text: string;
|
|
@@ -36,3 +79,34 @@ export interface ResolvePromptInputArgs {
|
|
|
36
79
|
promptParts?: PromptParts;
|
|
37
80
|
}
|
|
38
81
|
export declare function resolvePromptInput(input: ResolvePromptInputArgs): ResolvedPromptInput;
|
|
82
|
+
export interface ClaudeContentBlock {
|
|
83
|
+
type: "text";
|
|
84
|
+
text: string;
|
|
85
|
+
cache_control?: {
|
|
86
|
+
type: "ephemeral";
|
|
87
|
+
ttl: "1h";
|
|
88
|
+
};
|
|
89
|
+
}
|
|
90
|
+
export interface ClaudeStreamJsonUserMessage {
|
|
91
|
+
type: "user";
|
|
92
|
+
message: {
|
|
93
|
+
role: "user";
|
|
94
|
+
content: ClaudeContentBlock[];
|
|
95
|
+
};
|
|
96
|
+
}
|
|
97
|
+
export interface AssembleClaudeCacheBlocksResult {
|
|
98
|
+
payload: ClaudeStreamJsonUserMessage;
|
|
99
|
+
markedBlockCount: number;
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Slice κ: build the Claude `--input-format stream-json` payload from
|
|
103
|
+
* a `PromptParts`. Each non-empty part becomes one content block in
|
|
104
|
+
* `system → tools → context → task` order; parts flagged `true`
|
|
105
|
+
* in `cacheControl` get `cache_control: {type:"ephemeral", ttl:"1h"}`.
|
|
106
|
+
*
|
|
107
|
+
* Empty parts are skipped (no zero-byte blocks) — a true flag on an
|
|
108
|
+
* empty part is silently a no-op and not counted in `markedBlockCount`.
|
|
109
|
+
* The `task` block is never marked, even if a caller accidentally
|
|
110
|
+
* tries (the schema doesn't expose `task` in `cacheControl`).
|
|
111
|
+
*/
|
|
112
|
+
export declare function assembleClaudeCacheBlocks(parts: PromptParts): AssembleClaudeCacheBlocksResult;
|
package/dist/prompt-parts.js
CHANGED
|
@@ -1,10 +1,18 @@
|
|
|
1
1
|
import { createHash } from "crypto";
|
|
2
2
|
import { z } from "zod";
|
|
3
|
+
const CacheControlSchema = z
|
|
4
|
+
.object({
|
|
5
|
+
system: z.boolean().optional(),
|
|
6
|
+
tools: z.boolean().optional(),
|
|
7
|
+
context: z.boolean().optional(),
|
|
8
|
+
})
|
|
9
|
+
.strict();
|
|
3
10
|
export const PromptPartsSchema = z.object({
|
|
4
11
|
system: z.string().optional(),
|
|
5
12
|
tools: z.string().optional(),
|
|
6
13
|
context: z.string().optional(),
|
|
7
14
|
task: z.string().min(1),
|
|
15
|
+
cacheControl: CacheControlSchema.optional(),
|
|
8
16
|
});
|
|
9
17
|
const SEPARATOR = "\n\n";
|
|
10
18
|
export function assemble(parts) {
|
|
@@ -40,3 +48,42 @@ export function resolvePromptInput(input) {
|
|
|
40
48
|
stablePrefixTokens: null,
|
|
41
49
|
};
|
|
42
50
|
}
|
|
51
|
+
/**
 * Slice κ: build the Claude `--input-format stream-json` payload from
 * a `PromptParts`. Each non-empty part becomes one text content block in
 * `system → tools → context → task` order; parts whose name is `true`
 * in `cacheControl` get `cache_control: {type:"ephemeral", ttl:"1h"}`.
 *
 * Absent parts (`undefined` or `null`) and empty strings are skipped, so
 * no zero-byte blocks are emitted. A true flag on a skipped part is a
 * silent no-op and is not counted in `markedBlockCount`.
 *
 * The `task` block is always appended last and is never marked: only
 * `system`, `tools` and `context` are consulted in `cacheControl`.
 *
 * @param {object} parts - Prompt parts: optional `system`, `tools`,
 *   `context` strings, a `task` string, and an optional `cacheControl`
 *   object of per-part boolean flags.
 * @returns {{payload: {type: "user", message: {role: "user", content: object[]}}, markedBlockCount: number}}
 *   The stream-json user message and the number of blocks that carry
 *   a `cache_control` marker.
 */
export function assembleClaudeCacheBlocks(parts) {
    const cacheFlags = parts.cacheControl ?? {};
    const content = [];
    let markedBlockCount = 0;
    // Fixed emission order for the cacheable (stable) parts.
    for (const name of ["system", "tools", "context"]) {
        const text = parts[name];
        // `== null` matches both undefined and null, mirroring how a
        // null `cacheControl` is already tolerated via `??` above.
        if (text == null || text.length === 0) {
            continue;
        }
        const block = { type: "text", text };
        if (cacheFlags[name]) {
            // Fresh object per block so callers can mutate one marker
            // without affecting the others.
            block.cache_control = { type: "ephemeral", ttl: "1h" };
            markedBlockCount += 1;
        }
        content.push(block);
    }
    // The volatile task text always goes last and is never marked.
    content.push({ type: "text", text: parts.task });
    return {
        payload: {
            type: "user",
            message: { role: "user", content },
        },
        markedBlockCount,
    };
}
|
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
import type { CliType } from "./session-manager.js";
|
|
2
|
-
|
|
2
|
+
/**
|
|
3
|
+
* `optional` (slice κ): consumes the next token as the flag's value
|
|
4
|
+
* ONLY if that token does not start with `-`. Used for Claude's
|
|
5
|
+
* `-p`/`--print`, which is a no-arg switch in claude-code 2.x but
|
|
6
|
+
* also doubles as the legacy `-p <prompt>` positional shorthand that
|
|
7
|
+
* the gateway has emitted since v0.x.
|
|
8
|
+
*/
|
|
9
|
+
export type CliFlagArity = "none" | "one" | "optional" | "variadic";
|
|
3
10
|
export interface CliFlagContract {
|
|
4
11
|
arity: CliFlagArity;
|
|
5
12
|
values?: readonly string[];
|
|
@@ -46,8 +46,16 @@ export const UPSTREAM_CLI_CONTRACTS = {
|
|
|
46
46
|
"strictMcpConfig",
|
|
47
47
|
],
|
|
48
48
|
flags: {
|
|
49
|
-
"-p": {
|
|
49
|
+
"-p": {
|
|
50
|
+
arity: "optional",
|
|
51
|
+
description: "Print/non-interactive mode. Legacy gateway emission used `-p <prompt>` (consumed as positional in claude's grammar); slice κ emits `-p` standalone followed by `--input-format stream-json` so the prompt flows in on stdin.",
|
|
52
|
+
},
|
|
50
53
|
"--model": { arity: "one", description: "Model selector" },
|
|
54
|
+
"--input-format": {
|
|
55
|
+
arity: "one",
|
|
56
|
+
values: ["text", "stream-json"],
|
|
57
|
+
description: "Slice κ: realtime JSON stdin payload. `stream-json` enables Anthropic cache_control breakpoints from caller-supplied content blocks.",
|
|
58
|
+
},
|
|
51
59
|
"--output-format": {
|
|
52
60
|
arity: "one",
|
|
53
61
|
values: ["json", "stream-json"],
|
|
@@ -57,6 +65,10 @@ export const UPSTREAM_CLI_CONTRACTS = {
|
|
|
57
65
|
arity: "none",
|
|
58
66
|
description: "Include partial messages in stream-json output",
|
|
59
67
|
},
|
|
68
|
+
"--verbose": {
|
|
69
|
+
arity: "none",
|
|
70
|
+
description: "Claude CLI 2.x: required alongside --print + --output-format=stream-json; affects stderr only, stream-json stdout shape unchanged",
|
|
71
|
+
},
|
|
60
72
|
"--allowed-tools": { arity: "variadic", description: "Allowed tool names/patterns" },
|
|
61
73
|
"--disallowed-tools": { arity: "variadic", description: "Disallowed tool names/patterns" },
|
|
62
74
|
"--permission-mode": {
|
|
@@ -142,6 +154,43 @@ export const UPSTREAM_CLI_CONTRACTS = {
|
|
|
142
154
|
args: ["-p", "hello", "--add-dir", "/tmp/a", "--add-dir", "/tmp/b"],
|
|
143
155
|
expect: "pass",
|
|
144
156
|
},
|
|
157
|
+
{
|
|
158
|
+
// Claude CLI 2.x: stream-json requires --verbose alongside --print.
|
|
159
|
+
// The gateway emits all three together; this fixture pins the combo
|
|
160
|
+
// so a future removal of --verbose breaks loudly here instead of
|
|
161
|
+
// silently at runtime against the upstream CLI.
|
|
162
|
+
id: "claude-stream-json-requires-verbose",
|
|
163
|
+
description: "Claude CLI 2.x: --output-format stream-json + --include-partial-messages + --verbose accepted together",
|
|
164
|
+
args: [
|
|
165
|
+
"-p",
|
|
166
|
+
"hello",
|
|
167
|
+
"--output-format",
|
|
168
|
+
"stream-json",
|
|
169
|
+
"--include-partial-messages",
|
|
170
|
+
"--verbose",
|
|
171
|
+
],
|
|
172
|
+
expect: "pass",
|
|
173
|
+
},
|
|
174
|
+
{
|
|
175
|
+
// Slice κ: when caller marks promptParts with cache_control, the
|
|
176
|
+
// gateway emits `-p` as a standalone flag and pipes the JSON
|
|
177
|
+
// content-blocks payload over stdin via `--input-format
|
|
178
|
+
// stream-json`. The fixture pins the exact argv combination so
|
|
179
|
+
// a future regression (re-emitting a positional prompt, dropping
|
|
180
|
+
// `--input-format`, etc.) trips loudly here.
|
|
181
|
+
id: "claude-input-format-stream-json",
|
|
182
|
+
description: "Slice κ: `-p` standalone + --input-format stream-json + --output-format stream-json + --include-partial-messages + --verbose",
|
|
183
|
+
args: [
|
|
184
|
+
"-p",
|
|
185
|
+
"--input-format",
|
|
186
|
+
"stream-json",
|
|
187
|
+
"--output-format",
|
|
188
|
+
"stream-json",
|
|
189
|
+
"--include-partial-messages",
|
|
190
|
+
"--verbose",
|
|
191
|
+
],
|
|
192
|
+
expect: "pass",
|
|
193
|
+
},
|
|
145
194
|
],
|
|
146
195
|
},
|
|
147
196
|
codex: {
|
|
@@ -743,6 +792,14 @@ export function validateUpstreamCliArgs(cli, args) {
|
|
|
743
792
|
i += 1;
|
|
744
793
|
continue;
|
|
745
794
|
}
|
|
795
|
+
if (flag.arity === "optional") {
|
|
796
|
+
const value = args[i + 1];
|
|
797
|
+
if (value !== undefined && !value.startsWith("-")) {
|
|
798
|
+
validateFlagValue(cli, arg, flag, value, i + 1, violations);
|
|
799
|
+
i += 1;
|
|
800
|
+
}
|
|
801
|
+
continue;
|
|
802
|
+
}
|
|
746
803
|
let consumed = 0;
|
|
747
804
|
while (i + 1 < args.length && !args[i + 1].startsWith("-")) {
|
|
748
805
|
validateFlagValue(cli, arg, flag, args[i + 1], i + 1, violations);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "llm-cli-gateway",
|
|
3
|
-
"version": "1.13.1",
|
|
3
|
+
"version": "1.14.0",
|
|
4
4
|
"mcpName": "io.github.verivus-oss/llm-cli-gateway",
|
|
5
5
|
"description": "MCP server providing unified access to Claude Code, Codex, Gemini, Grok, and Mistral Vibe CLIs with session management, retry logic, async job orchestration, durable job results, and cross-LLM validation.",
|
|
6
6
|
"license": "MIT",
|
|
@@ -70,6 +70,7 @@
|
|
|
70
70
|
"test:integration": "INTEGRATION_TESTS=1 vitest run src/__tests__/integration.test.ts",
|
|
71
71
|
"test:pg": "bash ./scripts/test-pg.sh",
|
|
72
72
|
"test:all": "npm run test && npm run test:pg",
|
|
73
|
+
"smoke:cache-control": "node docs/plans/slice-kappa-smoke-test.mjs",
|
|
73
74
|
"lint": "eslint src/**/*.ts",
|
|
74
75
|
"lint:fix": "eslint src/**/*.ts --fix",
|
|
75
76
|
"format": "prettier --write 'src/**/*.ts'",
|