llm-cli-gateway 1.5.34 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
2
+ import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
3
3
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
4
4
  import { randomUUID } from "crypto";
5
5
  import { existsSync, readFileSync, readdirSync, renameSync, unlinkSync } from "fs";
@@ -14,7 +14,7 @@ import { createSessionManager } from "./session-manager.js";
14
14
  import { ResourceProvider } from "./resources.js";
15
15
  import { PerformanceMetrics } from "./metrics.js";
16
16
  import { estimateTokens, optimizePrompt as optimizePromptText, optimizeResponse as optimizeResponseText, } from "./optimizer.js";
17
- import { loadConfig, loadPersistenceConfig } from "./config.js";
17
+ import { loadConfig, loadPersistenceConfig, loadCacheAwarenessConfig, } from "./config.js";
18
18
  import { checkHealth } from "./health.js";
19
19
  import { clearModelRegistryCache, getAvailableCliInfo, getCliInfo, resolveModelAlias, } from "./model-registry.js";
20
20
  import { AsyncJobManager } from "./async-job-manager.js";
@@ -24,6 +24,8 @@ import { checkReviewIntegrity } from "./review-integrity.js";
24
24
  import { buildClaudeMcpConfig, CLAUDE_MCP_SERVER_NAMES, } from "./claude-mcp-config.js";
25
25
  import { resolveGrokSessionArgs, resolveMistralSessionArgs, resolveCodexSessionArgs, sanitizeCliArgValues, prepareMistralRequest as buildMistralCliInvocation, MISTRAL_AGENT_MODES, GATEWAY_SESSION_PREFIX, resolveClaudePermissionFlags, resolveCodexSandboxFlags, CLAUDE_PERMISSION_MODES, GEMINI_APPROVAL_MODES, CODEX_SANDBOX_MODES, CODEX_ASK_FOR_APPROVAL_MODES, CLAUDE_EFFORT_LEVELS, prepareClaudeHighImpactFlags, validateClaudeAgentsMap, prepareCodexHighImpactFlags, prepareCodexForkRequest, CODEX_CONFIG_OVERRIDES_SCHEMA, prepareGeminiHighImpactFlags, prependGeminiAttachments, resolveGeminiSessionPlan, GEMINI_HIGH_IMPACT_PARAMS_SCHEMA, } from "./request-helpers.js";
26
26
  import { createFlightRecorder } from "./flight-recorder.js";
27
+ import { resolvePromptInput, PromptPartsSchema } from "./prompt-parts.js";
28
+ import { computeSessionCacheStats, computeTtlRemaining } from "./cache-stats.js";
27
29
  import { getCliVersions, runCliUpgrade } from "./cli-updater.js";
28
30
  import { startHttpGateway } from "./http-transport.js";
29
31
  import { printDoctorJson } from "./doctor.js";
@@ -108,6 +110,22 @@ const SYNC_DEADLINE_MS = (() => {
108
110
  const __filename = fileURLToPath(import.meta.url);
109
111
  const __dirname = dirname(__filename);
110
112
  const SKILLS_DIR = join(__dirname, "..", ".agents", "skills");
113
+ function packageVersion() {
114
+ const candidates = [
115
+ join(__dirname, "..", "package.json"),
116
+ join(__dirname, "..", "..", "package.json"),
117
+ ];
118
+ for (const candidate of candidates) {
119
+ try {
120
+ const parsed = JSON.parse(readFileSync(candidate, "utf8"));
121
+ return parsed.version || "unknown";
122
+ }
123
+ catch {
124
+ // Try next candidate.
125
+ }
126
+ }
127
+ return "unknown";
128
+ }
111
129
  function loadSkills() {
112
130
  const skills = [];
113
131
  try {
@@ -158,29 +176,56 @@ let sessionManager;
158
176
  let db = null;
159
177
  const performanceMetrics = new PerformanceMetrics();
160
178
  let resourceProvider;
161
- const flightRecorder = createFlightRecorder(logger);
179
+ let flightRecorder = null;
162
180
  // Resolved persistence config — single source of truth for the async-job backend.
163
181
  // Driven by ~/.llm-cli-gateway/config.toml (+ deprecated env-var overrides).
164
182
  // When backend = "none", the JobStore is null AND *_request_async tools are not
165
183
  // registered (see createGatewayServer), making silent in-memory loss
166
184
  // structurally impossible.
167
- const persistenceConfig = loadPersistenceConfig(logger);
168
- const jobStore = (() => {
185
+ let persistenceConfig = null;
186
+ let cacheAwarenessConfig = null;
187
+ let jobStore = null;
188
+ let jobStoreInitialized = false;
189
+ let asyncJobManager = null;
190
+ let approvalManager = null;
191
+ function getFlightRecorder(runtimeLogger = logger) {
192
+ flightRecorder ??= createFlightRecorder(runtimeLogger);
193
+ return flightRecorder;
194
+ }
195
+ function getPersistenceConfig(runtimeLogger = logger) {
196
+ persistenceConfig ??= loadPersistenceConfig(runtimeLogger);
197
+ return persistenceConfig;
198
+ }
199
+ function getCacheAwarenessConfig(runtimeLogger = logger) {
200
+ cacheAwarenessConfig ??= loadCacheAwarenessConfig(runtimeLogger);
201
+ return cacheAwarenessConfig;
202
+ }
203
+ function getJobStore(runtimeLogger = logger) {
204
+ if (jobStoreInitialized)
205
+ return jobStore;
206
+ jobStoreInitialized = true;
169
207
  try {
170
- return createJobStore(persistenceConfig, logger);
208
+ jobStore = createJobStore(getPersistenceConfig(runtimeLogger), runtimeLogger);
171
209
  }
172
210
  catch (err) {
173
- logger.error("Failed to open durable job store; async tools will be unavailable", err);
174
- return null;
211
+ runtimeLogger.error("Failed to open durable job store; async tools will be unavailable", err);
212
+ jobStore = null;
175
213
  }
176
- })();
177
- function newAsyncJobManager(metrics, runtimeLogger, store = jobStore) {
214
+ return jobStore;
215
+ }
216
+ function newAsyncJobManager(metrics, runtimeLogger, store = getJobStore(runtimeLogger)) {
178
217
  return new AsyncJobManager(runtimeLogger, (cli, durationMs, success) => {
179
218
  metrics.recordRequest(cli, durationMs, success);
180
219
  }, store);
181
220
  }
182
- const asyncJobManager = newAsyncJobManager(performanceMetrics, logger);
183
- const approvalManager = new ApprovalManager(undefined, logger);
221
+ function getAsyncJobManager(runtimeLogger = logger) {
222
+ asyncJobManager ??= newAsyncJobManager(performanceMetrics, runtimeLogger);
223
+ return asyncJobManager;
224
+ }
225
+ function getApprovalManager(runtimeLogger = logger) {
226
+ approvalManager ??= new ApprovalManager(undefined, runtimeLogger);
227
+ return approvalManager;
228
+ }
184
229
  const MCP_SERVER_ENUM = z.enum(CLAUDE_MCP_SERVER_NAMES);
185
230
  // U22: Session-provider enum extended to five providers. The storage layer's
186
231
  // CLI_TYPES already includes "mistral"; the MCP-tool layer mirrors that here so
@@ -199,22 +244,26 @@ function resolveGatewayServerRuntime(deps = {}, options = {}) {
199
244
  ? // Factory-created test/HTTP session servers must not mark another instance's
200
245
  // durable jobs orphaned. Stdio startup injects the process-global manager.
201
246
  newAsyncJobManager(runtimePerformanceMetrics, runtimeLogger, null)
202
- : asyncJobManager);
247
+ : getAsyncJobManager(runtimeLogger));
203
248
  const runtimeApprovalManager = deps.approvalManager ??
204
- (options.isolateState ? new ApprovalManager(undefined, runtimeLogger) : approvalManager);
249
+ (options.isolateState
250
+ ? new ApprovalManager(undefined, runtimeLogger)
251
+ : getApprovalManager(runtimeLogger));
252
+ const runtimeFlightRecorder = deps.flightRecorder ?? getFlightRecorder(runtimeLogger);
205
253
  return {
206
254
  sessionManager: runtimeSessionManager,
207
255
  resourceProvider: deps.resourceProvider ??
208
256
  (options.isolateState
209
- ? new ResourceProvider(runtimeSessionManager, runtimePerformanceMetrics)
257
+ ? new ResourceProvider(runtimeSessionManager, runtimePerformanceMetrics, runtimeFlightRecorder, deps.cacheAwareness ?? getCacheAwarenessConfig(runtimeLogger))
210
258
  : resourceProvider),
211
259
  db: "db" in deps ? (deps.db ?? null) : db,
212
260
  performanceMetrics: runtimePerformanceMetrics,
213
261
  asyncJobManager: runtimeAsyncJobManager,
214
262
  approvalManager: runtimeApprovalManager,
215
- flightRecorder: deps.flightRecorder ?? flightRecorder,
263
+ flightRecorder: runtimeFlightRecorder,
216
264
  logger: runtimeLogger,
217
- persistence: deps.persistence ?? persistenceConfig,
265
+ persistence: deps.persistence ?? getPersistenceConfig(runtimeLogger),
266
+ cacheAwareness: deps.cacheAwareness ?? getCacheAwarenessConfig(runtimeLogger),
218
267
  };
219
268
  }
220
269
  // Per-CLI idle timeouts: kill process if no stdout/stderr activity for this duration.
@@ -664,14 +713,118 @@ function registerBaseResources(server, runtime) {
664
713
  const contents = await runtime.resourceProvider.readResource(uri.href);
665
714
  return { contents: contents ? [contents] : [] };
666
715
  });
716
+ // Cache-state resources (slice 2). Static URI for global, templated for
717
+ // session/{id} and prefix/{hash}. All three return tokens/hashes/aggregates
718
+ // ONLY — never raw prompt or response text. The structural guarantee is in
719
+ // the SessionCacheStats / PrefixCacheStats / GlobalCacheStats types
720
+ // themselves: those shapes have no prompt/response/system/task fields.
721
+ server.registerResource("cache-state-global", "cache_state://global", {
722
+ title: "💾 Cache State (Global)",
723
+ description: "Aggregate cache hit/miss/savings across all CLIs in the flight recorder. Tokens/hashes only — no prompt text.",
724
+ mimeType: "application/json",
725
+ }, async (uri) => {
726
+ runtime.logger.debug("Reading cache_state://global resource");
727
+ const stats = runtime.resourceProvider.readCacheStateGlobal({
728
+ lastNHours: 24,
729
+ });
730
+ return {
731
+ contents: [
732
+ {
733
+ uri: uri.href,
734
+ mimeType: "application/json",
735
+ text: JSON.stringify(stats, null, 2),
736
+ },
737
+ ],
738
+ };
739
+ });
740
+ server.registerResource("cache-state-session", new ResourceTemplate("cache_state://session/{sessionId}", { list: undefined }), {
741
+ title: "💾 Cache State (Session)",
742
+ description: "Per-session cache hit/miss/savings. Tokens/hashes only — no prompt text.",
743
+ mimeType: "application/json",
744
+ }, async (uri, variables) => {
745
+ const sessionId = Array.isArray(variables.sessionId)
746
+ ? variables.sessionId[0]
747
+ : variables.sessionId;
748
+ runtime.logger.debug(`Reading cache_state://session/${sessionId}`);
749
+ const stats = runtime.resourceProvider.readCacheStateSession(String(sessionId));
750
+ return {
751
+ contents: [
752
+ {
753
+ uri: uri.href,
754
+ mimeType: "application/json",
755
+ text: JSON.stringify(stats, null, 2),
756
+ },
757
+ ],
758
+ };
759
+ });
760
+ server.registerResource("cache-state-prefix", new ResourceTemplate("cache_state://prefix/{hash}", { list: undefined }), {
761
+ title: "💾 Cache State (Prefix)",
762
+ description: "Per-stable-prefix-hash cache hit/miss/savings, with CLI breakdown. Tokens/hashes only — no prompt text.",
763
+ mimeType: "application/json",
764
+ }, async (uri, variables) => {
765
+ const hash = Array.isArray(variables.hash) ? variables.hash[0] : variables.hash;
766
+ runtime.logger.debug(`Reading cache_state://prefix/${hash}`);
767
+ const stats = runtime.resourceProvider.readCacheStateForPrefix(String(hash));
768
+ return {
769
+ contents: [
770
+ {
771
+ uri: uri.href,
772
+ mimeType: "application/json",
773
+ text: JSON.stringify(stats, null, 2),
774
+ },
775
+ ],
776
+ };
777
+ });
778
+ }
779
+ /**
780
+ * Slice 1: validate the prompt / promptParts mutex at the prep boundary and
781
+ * return either an error response or the resolved input. The exact error
782
+ * messages are part of the public contract — tests assert them verbatim.
783
+ */
784
+ function resolvePromptOrPartsForPrep(args) {
785
+ const hasPrompt = typeof args.prompt === "string" && args.prompt.length > 0;
786
+ const hasParts = args.promptParts !== undefined;
787
+ if (hasPrompt && hasParts) {
788
+ return {
789
+ ok: false,
790
+ error: createErrorResponse(args.operation, 1, "", args.correlationId, new Error("provide exactly one of `prompt` or `promptParts`")),
791
+ };
792
+ }
793
+ if (!hasPrompt && !hasParts) {
794
+ return {
795
+ ok: false,
796
+ error: createErrorResponse(args.operation, 1, "", args.correlationId, new Error("one of `prompt` or `promptParts` is required")),
797
+ };
798
+ }
799
+ const resolved = resolvePromptInput({
800
+ prompt: args.prompt,
801
+ promptParts: args.promptParts,
802
+ });
803
+ return {
804
+ ok: true,
805
+ assembledPrompt: resolved.assembledPrompt,
806
+ stablePrefixHash: resolved.stablePrefixHash,
807
+ stablePrefixTokens: resolved.stablePrefixTokens,
808
+ };
667
809
  }
668
810
  export function prepareClaudeRequest(params, runtime = resolveGatewayServerRuntime()) {
669
811
  const corrId = params.correlationId || randomUUID();
670
812
  const cliInfo = getCliInfo();
671
813
  const resolvedModel = resolveModelAlias("claude", params.model, cliInfo);
814
+ const inputResolution = resolvePromptOrPartsForPrep({
815
+ prompt: params.prompt,
816
+ promptParts: params.promptParts,
817
+ operation: params.operation,
818
+ correlationId: corrId,
819
+ });
820
+ if (!inputResolution.ok)
821
+ return inputResolution.error;
822
+ const assembledPrompt = inputResolution.assembledPrompt;
823
+ const stablePrefixHash = inputResolution.stablePrefixHash;
824
+ const stablePrefixTokens = inputResolution.stablePrefixTokens;
672
825
  // Review integrity check on raw prompt (before optimization)
673
826
  const reviewIntegrity = checkReviewIntegrity({
674
- prompt: params.prompt,
827
+ prompt: assembledPrompt,
675
828
  allowedTools: params.allowedTools,
676
829
  disallowedTools: params.disallowedTools,
677
830
  });
@@ -682,7 +835,7 @@ export function prepareClaudeRequest(params, runtime = resolveGatewayServerRunti
682
835
  score: reviewIntegrity.totalScore,
683
836
  });
684
837
  }
685
- let effectivePrompt = params.prompt;
838
+ let effectivePrompt = assembledPrompt;
686
839
  if (params.optimizePrompt) {
687
840
  const optimized = optimizePromptText(effectivePrompt);
688
841
  logOptimizationTokens("prompt", corrId, effectivePrompt, optimized);
@@ -699,7 +852,7 @@ export function prepareClaudeRequest(params, runtime = resolveGatewayServerRunti
699
852
  approvalDecision = runtime.approvalManager.decide({
700
853
  cli: "claude",
701
854
  operation: params.operation,
702
- prompt: params.prompt, // Use raw prompt for review-context detection, not optimized
855
+ prompt: assembledPrompt, // Use raw assembled prompt for review-context detection, not optimized
703
856
  bypassRequested: params.dangerouslySkipPermissions,
704
857
  fullAuto: false,
705
858
  requestedMcpServers,
@@ -778,14 +931,27 @@ export function prepareClaudeRequest(params, runtime = resolveGatewayServerRunti
778
931
  approvalDecision,
779
932
  reviewIntegrity,
780
933
  args,
934
+ stablePrefixHash,
935
+ stablePrefixTokens,
781
936
  };
782
937
  }
783
938
  export function prepareCodexRequest(params, runtime = resolveGatewayServerRuntime()) {
784
939
  const corrId = params.correlationId || randomUUID();
785
940
  const cliInfo = getCliInfo();
786
941
  const resolvedModel = resolveModelAlias("codex", params.model, cliInfo);
942
+ const inputResolution = resolvePromptOrPartsForPrep({
943
+ prompt: params.prompt,
944
+ promptParts: params.promptParts,
945
+ operation: params.operation,
946
+ correlationId: corrId,
947
+ });
948
+ if (!inputResolution.ok)
949
+ return inputResolution.error;
950
+ const assembledPrompt = inputResolution.assembledPrompt;
951
+ const stablePrefixHash = inputResolution.stablePrefixHash;
952
+ const stablePrefixTokens = inputResolution.stablePrefixTokens;
787
953
  // Review integrity check on raw prompt (before optimization)
788
- const reviewIntegrity = checkReviewIntegrity({ prompt: params.prompt });
954
+ const reviewIntegrity = checkReviewIntegrity({ prompt: assembledPrompt });
789
955
  if (reviewIntegrity.violations.length > 0) {
790
956
  runtime.logger.info(`[${corrId}] Review integrity violations detected: ${reviewIntegrity.violations.map(v => v.type).join(", ")}`, {
791
957
  cli: "codex",
@@ -793,7 +959,7 @@ export function prepareCodexRequest(params, runtime = resolveGatewayServerRuntim
793
959
  score: reviewIntegrity.totalScore,
794
960
  });
795
961
  }
796
- let effectivePrompt = params.prompt;
962
+ let effectivePrompt = assembledPrompt;
797
963
  if (params.optimizePrompt) {
798
964
  const optimized = optimizePromptText(effectivePrompt);
799
965
  logOptimizationTokens("prompt", corrId, effectivePrompt, optimized);
@@ -805,7 +971,7 @@ export function prepareCodexRequest(params, runtime = resolveGatewayServerRuntim
805
971
  approvalDecision = runtime.approvalManager.decide({
806
972
  cli: "codex",
807
973
  operation: params.operation,
808
- prompt: params.prompt, // Use raw prompt for review-context detection, not optimized
974
+ prompt: assembledPrompt, // Use raw assembled prompt for review-context detection, not optimized
809
975
  bypassRequested: params.dangerouslyBypassApprovalsAndSandbox,
810
976
  fullAuto: params.fullAuto,
811
977
  requestedMcpServers,
@@ -920,15 +1086,28 @@ export function prepareCodexRequest(params, runtime = resolveGatewayServerRuntim
920
1086
  reviewIntegrity,
921
1087
  args,
922
1088
  cleanup: highImpactCleanup,
1089
+ stablePrefixHash,
1090
+ stablePrefixTokens,
923
1091
  };
924
1092
  }
925
1093
  export function prepareGeminiRequest(params, runtime = resolveGatewayServerRuntime()) {
926
1094
  const corrId = params.correlationId || randomUUID();
927
1095
  const cliInfo = getCliInfo();
928
1096
  const resolvedModel = resolveModelAlias("gemini", params.model, cliInfo);
1097
+ const inputResolution = resolvePromptOrPartsForPrep({
1098
+ prompt: params.prompt,
1099
+ promptParts: params.promptParts,
1100
+ operation: params.operation,
1101
+ correlationId: corrId,
1102
+ });
1103
+ if (!inputResolution.ok)
1104
+ return inputResolution.error;
1105
+ const assembledPrompt = inputResolution.assembledPrompt;
1106
+ const stablePrefixHash = inputResolution.stablePrefixHash;
1107
+ const stablePrefixTokens = inputResolution.stablePrefixTokens;
929
1108
  // Review integrity check on raw prompt (before optimization)
930
1109
  const reviewIntegrity = checkReviewIntegrity({
931
- prompt: params.prompt,
1110
+ prompt: assembledPrompt,
932
1111
  allowedTools: params.allowedTools,
933
1112
  });
934
1113
  if (reviewIntegrity.violations.length > 0) {
@@ -938,7 +1117,7 @@ export function prepareGeminiRequest(params, runtime = resolveGatewayServerRunti
938
1117
  score: reviewIntegrity.totalScore,
939
1118
  });
940
1119
  }
941
- let effectivePrompt = params.prompt;
1120
+ let effectivePrompt = assembledPrompt;
942
1121
  if (params.optimizePrompt) {
943
1122
  const optimized = optimizePromptText(effectivePrompt);
944
1123
  logOptimizationTokens("prompt", corrId, effectivePrompt, optimized);
@@ -950,7 +1129,7 @@ export function prepareGeminiRequest(params, runtime = resolveGatewayServerRunti
950
1129
  approvalDecision = runtime.approvalManager.decide({
951
1130
  cli: "gemini",
952
1131
  operation: params.operation,
953
- prompt: params.prompt, // Use raw prompt for review-context detection, not optimized
1132
+ prompt: assembledPrompt, // Use raw assembled prompt for review-context detection, not optimized
954
1133
  bypassRequested: params.approvalMode === "yolo",
955
1134
  fullAuto: false,
956
1135
  requestedMcpServers,
@@ -1020,15 +1199,28 @@ export function prepareGeminiRequest(params, runtime = resolveGatewayServerRunti
1020
1199
  approvalDecision,
1021
1200
  reviewIntegrity,
1022
1201
  args,
1202
+ stablePrefixHash,
1203
+ stablePrefixTokens,
1023
1204
  };
1024
1205
  }
1025
1206
  function prepareGrokRequest(params, runtime = resolveGatewayServerRuntime()) {
1026
1207
  const corrId = params.correlationId || randomUUID();
1027
1208
  const cliInfo = getCliInfo();
1028
1209
  const resolvedModel = resolveModelAlias("grok", params.model, cliInfo);
1210
+ const inputResolution = resolvePromptOrPartsForPrep({
1211
+ prompt: params.prompt,
1212
+ promptParts: params.promptParts,
1213
+ operation: params.operation,
1214
+ correlationId: corrId,
1215
+ });
1216
+ if (!inputResolution.ok)
1217
+ return inputResolution.error;
1218
+ const assembledPrompt = inputResolution.assembledPrompt;
1219
+ const stablePrefixHash = inputResolution.stablePrefixHash;
1220
+ const stablePrefixTokens = inputResolution.stablePrefixTokens;
1029
1221
  // Review integrity check on raw prompt (before optimization)
1030
1222
  const reviewIntegrity = checkReviewIntegrity({
1031
- prompt: params.prompt,
1223
+ prompt: assembledPrompt,
1032
1224
  allowedTools: params.allowedTools,
1033
1225
  disallowedTools: params.disallowedTools,
1034
1226
  });
@@ -1039,7 +1231,7 @@ function prepareGrokRequest(params, runtime = resolveGatewayServerRuntime()) {
1039
1231
  score: reviewIntegrity.totalScore,
1040
1232
  });
1041
1233
  }
1042
- let effectivePrompt = params.prompt;
1234
+ let effectivePrompt = assembledPrompt;
1043
1235
  if (params.optimizePrompt) {
1044
1236
  const optimized = optimizePromptText(effectivePrompt);
1045
1237
  logOptimizationTokens("prompt", corrId, effectivePrompt, optimized);
@@ -1051,7 +1243,7 @@ function prepareGrokRequest(params, runtime = resolveGatewayServerRuntime()) {
1051
1243
  approvalDecision = runtime.approvalManager.decide({
1052
1244
  cli: "grok",
1053
1245
  operation: params.operation,
1054
- prompt: params.prompt, // Use raw prompt for review-context detection, not optimized
1246
+ prompt: assembledPrompt, // Use raw assembled prompt for review-context detection, not optimized
1055
1247
  bypassRequested: Boolean(params.alwaysApprove) || params.permissionMode === "bypassPermissions",
1056
1248
  fullAuto: false,
1057
1249
  requestedMcpServers,
@@ -1095,14 +1287,27 @@ function prepareGrokRequest(params, runtime = resolveGatewayServerRuntime()) {
1095
1287
  approvalDecision,
1096
1288
  reviewIntegrity,
1097
1289
  args,
1290
+ stablePrefixHash,
1291
+ stablePrefixTokens,
1098
1292
  };
1099
1293
  }
1100
1294
  export function prepareMistralRequest(params, runtime = resolveGatewayServerRuntime()) {
1101
1295
  const corrId = params.correlationId || randomUUID();
1102
1296
  const cliInfo = getCliInfo();
1103
1297
  const resolvedModel = resolveModelAlias("mistral", params.model, cliInfo);
1104
- const reviewIntegrity = checkReviewIntegrity({
1298
+ const inputResolution = resolvePromptOrPartsForPrep({
1105
1299
  prompt: params.prompt,
1300
+ promptParts: params.promptParts,
1301
+ operation: params.operation,
1302
+ correlationId: corrId,
1303
+ });
1304
+ if (!inputResolution.ok)
1305
+ return inputResolution.error;
1306
+ const assembledPrompt = inputResolution.assembledPrompt;
1307
+ const stablePrefixHash = inputResolution.stablePrefixHash;
1308
+ const stablePrefixTokens = inputResolution.stablePrefixTokens;
1309
+ const reviewIntegrity = checkReviewIntegrity({
1310
+ prompt: assembledPrompt,
1106
1311
  allowedTools: params.allowedTools,
1107
1312
  disallowedTools: params.disallowedTools,
1108
1313
  });
@@ -1113,7 +1318,7 @@ export function prepareMistralRequest(params, runtime = resolveGatewayServerRunt
1113
1318
  score: reviewIntegrity.totalScore,
1114
1319
  });
1115
1320
  }
1116
- let effectivePrompt = params.prompt;
1321
+ let effectivePrompt = assembledPrompt;
1117
1322
  if (params.optimizePrompt) {
1118
1323
  const optimized = optimizePromptText(effectivePrompt);
1119
1324
  logOptimizationTokens("prompt", corrId, effectivePrompt, optimized);
@@ -1125,7 +1330,7 @@ export function prepareMistralRequest(params, runtime = resolveGatewayServerRunt
1125
1330
  approvalDecision = runtime.approvalManager.decide({
1126
1331
  cli: "mistral",
1127
1332
  operation: params.operation,
1128
- prompt: params.prompt,
1333
+ prompt: assembledPrompt,
1129
1334
  bypassRequested: params.permissionMode === "auto-approve",
1130
1335
  fullAuto: false,
1131
1336
  requestedMcpServers,
@@ -1170,6 +1375,8 @@ export function prepareMistralRequest(params, runtime = resolveGatewayServerRunt
1170
1375
  reviewIntegrity,
1171
1376
  args: prep.args,
1172
1377
  mistralEnv: prep.env,
1378
+ stablePrefixHash,
1379
+ stablePrefixTokens,
1173
1380
  };
1174
1381
  }
1175
1382
  function isMistralModelSelectionFailure(stderr) {
@@ -1185,7 +1392,7 @@ function selectMistralRecoveryModel(failedModel) {
1185
1392
  ].filter((model) => Boolean(model && model !== failedModel));
1186
1393
  return candidates.find(model => model !== "local");
1187
1394
  }
1188
- function buildCliResponse(cli, stdout, optimizeResponse, corrId, sessionId, prep, durationMs, resumable, outputFormat) {
1395
+ function buildCliResponse(cli, stdout, optimizeResponse, corrId, sessionId, prep, durationMs, resumable, outputFormat, warnings) {
1189
1396
  let finalStdout = stdout;
1190
1397
  // Skip response optimization for JSON output to prevent corrupting structured data
1191
1398
  if (optimizeResponse && outputFormat !== "json") {
@@ -1234,8 +1441,41 @@ function buildCliResponse(cli, stdout, optimizeResponse, corrId, sessionId, prep
1234
1441
  if (prep.reviewIntegrity && prep.reviewIntegrity.violations.length > 0) {
1235
1442
  response.reviewIntegrity = prep.reviewIntegrity;
1236
1443
  }
1444
+ if (warnings && warnings.length > 0) {
1445
+ response.warnings = warnings;
1446
+ }
1237
1447
  return response;
1238
1448
  }
1449
+ /**
1450
+ * Slice 3 helper: compute the cache_ttl_expiring_soon warning for a
1451
+ * claude session, if the feature is enabled, the session has prior cache
1452
+ * writes, and ttlRemainingMs is below the threshold (30s by default).
1453
+ * Returns null when no warning applies.
1454
+ */
1455
+ function maybeBuildCacheTtlWarning(args) {
1456
+ if (args.cli !== "claude")
1457
+ return null;
1458
+ if (!args.sessionId)
1459
+ return null;
1460
+ if (!args.runtime.cacheAwareness?.warnOnTtlExpiry)
1461
+ return null;
1462
+ const stats = computeSessionCacheStats(args.runtime.flightRecorder, args.sessionId);
1463
+ if (stats.requestCount === 0 || !stats.lastRequestAt)
1464
+ return null;
1465
+ const ttl = computeTtlRemaining(stats, args.cli, {
1466
+ anthropicTtlSeconds: args.runtime.cacheAwareness.anthropicTtlSeconds,
1467
+ });
1468
+ if (ttl === null)
1469
+ return null;
1470
+ const threshold = args.thresholdMs ?? 30_000;
1471
+ if (ttl >= threshold)
1472
+ return null;
1473
+ return {
1474
+ code: "cache_ttl_expiring_soon",
1475
+ ttlRemainingMs: ttl,
1476
+ message: `Anthropic cache breakpoint for session ${args.sessionId} expires in ${ttl}ms (< ${threshold}ms). Subsequent requests may miss the cache.`,
1477
+ };
1478
+ }
1239
1479
  function resolveHandlerRuntime(deps) {
1240
1480
  if (deps.runtime)
1241
1481
  return deps.runtime;
@@ -1259,6 +1499,7 @@ export async function handleGeminiRequest(deps, params) {
1259
1499
  const startTime = Date.now();
1260
1500
  const prep = prepareGeminiRequest({
1261
1501
  prompt: params.prompt,
1502
+ promptParts: params.promptParts,
1262
1503
  model: params.model,
1263
1504
  approvalMode: params.approvalMode,
1264
1505
  approvalStrategy: params.approvalStrategy,
@@ -1284,10 +1525,12 @@ export async function handleGeminiRequest(deps, params) {
1284
1525
  correlationId: corrId,
1285
1526
  cli: "gemini",
1286
1527
  model: prep.resolvedModel || "default",
1287
- prompt: params.prompt,
1528
+ prompt: prep.effectivePrompt,
1288
1529
  sessionId: params.sessionId,
1530
+ stablePrefixHash: prep.stablePrefixHash ?? undefined,
1531
+ stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
1289
1532
  }, runtime);
1290
- deps.logger.info(`[${corrId}] gemini_request invoked with model=${prep.resolvedModel || "default"}, approvalMode=${params.approvalMode}, prompt length=${params.prompt.length}`);
1533
+ deps.logger.info(`[${corrId}] gemini_request invoked with model=${prep.resolvedModel || "default"}, approvalMode=${params.approvalMode}, prompt length=${prep.effectivePrompt.length}`);
1291
1534
  try {
1292
1535
  // Gemini CLI 0.43 supports `--resume`, but not a supported fresh
1293
1536
  // `--session-id` flag. Fresh sessions emit no session flag.
@@ -1383,6 +1626,7 @@ export async function handleGeminiRequestAsync(deps, params) {
1383
1626
  const runtime = resolveHandlerRuntime(deps);
1384
1627
  const prep = prepareGeminiRequest({
1385
1628
  prompt: params.prompt,
1629
+ promptParts: params.promptParts,
1386
1630
  model: params.model,
1387
1631
  approvalMode: params.approvalMode,
1388
1632
  approvalStrategy: params.approvalStrategy,
@@ -1462,6 +1706,7 @@ export async function handleGrokRequest(deps, params) {
1462
1706
  const startTime = Date.now();
1463
1707
  const prep = prepareGrokRequest({
1464
1708
  prompt: params.prompt,
1709
+ promptParts: params.promptParts,
1465
1710
  model: params.model,
1466
1711
  outputFormat: params.outputFormat,
1467
1712
  alwaysApprove: params.alwaysApprove,
@@ -1486,10 +1731,12 @@ export async function handleGrokRequest(deps, params) {
1486
1731
  correlationId: corrId,
1487
1732
  cli: "grok",
1488
1733
  model: prep.resolvedModel || "default",
1489
- prompt: params.prompt,
1734
+ prompt: prep.effectivePrompt,
1490
1735
  sessionId: params.sessionId,
1736
+ stablePrefixHash: prep.stablePrefixHash ?? undefined,
1737
+ stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
1491
1738
  }, runtime);
1492
- deps.logger.info(`[${corrId}] grok_request invoked with model=${prep.resolvedModel || "default"}, permissionMode=${params.permissionMode}, prompt length=${params.prompt.length}`);
1739
+ deps.logger.info(`[${corrId}] grok_request invoked with model=${prep.resolvedModel || "default"}, permissionMode=${params.permissionMode}, prompt length=${prep.effectivePrompt.length}`);
1493
1740
  try {
1494
1741
  // Session arg planning (pure, no I/O)
1495
1742
  const sessionResult = resolveGrokSessionArgs({
@@ -1578,6 +1825,7 @@ export async function handleGrokRequestAsync(deps, params) {
1578
1825
  const runtime = resolveHandlerRuntime(deps);
1579
1826
  const prep = prepareGrokRequest({
1580
1827
  prompt: params.prompt,
1828
+ promptParts: params.promptParts,
1581
1829
  model: params.model,
1582
1830
  outputFormat: params.outputFormat,
1583
1831
  alwaysApprove: params.alwaysApprove,
@@ -1658,6 +1906,7 @@ export async function handleMistralRequest(deps, params) {
1658
1906
  const startTime = Date.now();
1659
1907
  const prep = prepareMistralRequest({
1660
1908
  prompt: params.prompt,
1909
+ promptParts: params.promptParts,
1661
1910
  model: params.model,
1662
1911
  outputFormat: params.outputFormat,
1663
1912
  permissionMode: params.permissionMode,
@@ -1681,10 +1930,12 @@ export async function handleMistralRequest(deps, params) {
1681
1930
  correlationId: corrId,
1682
1931
  cli: "mistral",
1683
1932
  model: prep.resolvedModel || "default",
1684
- prompt: params.prompt,
1933
+ prompt: prep.effectivePrompt,
1685
1934
  sessionId: params.sessionId,
1935
+ stablePrefixHash: prep.stablePrefixHash ?? undefined,
1936
+ stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
1686
1937
  }, runtime);
1687
- deps.logger.info(`[${corrId}] mistral_request invoked with model=${prep.resolvedModel || "default"}, permissionMode=${params.permissionMode || "auto-approve"}, prompt length=${params.prompt.length}`);
1938
+ deps.logger.info(`[${corrId}] mistral_request invoked with model=${prep.resolvedModel || "default"}, permissionMode=${params.permissionMode || "auto-approve"}, prompt length=${prep.effectivePrompt.length}`);
1688
1939
  try {
1689
1940
  const sessionResult = resolveMistralSessionArgs({
1690
1941
  sessionId: params.sessionId,
@@ -1795,6 +2046,7 @@ export async function handleMistralRequestAsync(deps, params) {
1795
2046
  const runtime = resolveHandlerRuntime(deps);
1796
2047
  const prep = prepareMistralRequest({
1797
2048
  prompt: params.prompt,
2049
+ promptParts: params.promptParts,
1798
2050
  model: params.model,
1799
2051
  outputFormat: params.outputFormat,
1800
2052
  permissionMode: params.permissionMode,
@@ -1870,6 +2122,7 @@ export async function handleCodexRequestAsync(deps, params) {
1870
2122
  const runtime = resolveHandlerRuntime(deps);
1871
2123
  const prep = prepareCodexRequest({
1872
2124
  prompt: params.prompt,
2125
+ promptParts: params.promptParts,
1873
2126
  model: params.model,
1874
2127
  fullAuto: params.fullAuto,
1875
2128
  sandboxMode: params.sandboxMode,
@@ -1986,7 +2239,14 @@ export async function handleCodexRequestAsync(deps, params) {
1986
2239
  //──────────────────────────────────────────────────────────────────────────────
1987
2240
  export function createGatewayServer(deps = {}) {
1988
2241
  const runtime = resolveGatewayServerRuntime(deps, { isolateState: true });
1989
- const { sessionManager, asyncJobManager, approvalManager, performanceMetrics, logger, persistence, } = runtime;
2242
+ const { sessionManager, asyncJobManager, approvalManager, performanceMetrics, logger, persistence, flightRecorder, cacheAwareness, } = runtime;
2243
+ // `flightRecorder` is destructured into closure scope so the session_get
2244
+ // handler (see ~line 5590) has the FlightRecorderQuery read capability
2245
+ // available without re-resolving runtime. Slice 2 populates the
2246
+ // `cacheState` field of session_get's response from this read surface.
2247
+ // `cacheAwareness` is the loaded [cache_awareness] block (config.ts).
2248
+ void flightRecorder;
2249
+ void cacheAwareness;
1990
2250
  // Structural invariant: tools register iff ALL THREE conditions hold:
1991
2251
  // (1) persistence.backend !== "none" — the operator/config has not
1992
2252
  // explicitly disabled durable persistence;
@@ -2012,7 +2272,9 @@ export function createGatewayServer(deps = {}) {
2012
2272
  .string()
2013
2273
  .min(1, "Prompt cannot be empty")
2014
2274
  .max(100000, "Prompt too long (max 100k chars)")
2015
- .describe("Prompt text for Claude"),
2275
+ .optional()
2276
+ .describe("Prompt text for Claude (mutually exclusive with promptParts)"),
2277
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2016
2278
  model: z
2017
2279
  .string()
2018
2280
  .optional()
@@ -2107,13 +2369,14 @@ export function createGatewayServer(deps = {}) {
2107
2369
  .boolean()
2108
2370
  .default(false)
2109
2371
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
2110
- }, async ({ prompt, model, outputFormat, sessionId, continueSession, createNewSession, allowedTools, disallowedTools, dangerouslySkipPermissions, permissionMode, agent, agents, forkSession, systemPrompt, appendSystemPrompt, maxBudgetUsd, maxTurns, effort, excludeDynamicSystemPromptSections, approvalStrategy, approvalPolicy, mcpServers, strictMcpConfig, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
2372
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, continueSession, createNewSession, allowedTools, disallowedTools, dangerouslySkipPermissions, permissionMode, agent, agents, forkSession, systemPrompt, appendSystemPrompt, maxBudgetUsd, maxTurns, effort, excludeDynamicSystemPromptSections, approvalStrategy, approvalPolicy, mcpServers, strictMcpConfig, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
2111
2373
  const startTime = Date.now();
2112
2374
  if (systemPrompt !== undefined && appendSystemPrompt !== undefined) {
2113
2375
  return createErrorResponse("claude", 1, "", correlationId, new Error("systemPrompt and appendSystemPrompt are mutually exclusive; use one or the other (not both)."));
2114
2376
  }
2115
2377
  const prep = prepareClaudeRequest({
2116
2378
  prompt,
2379
+ promptParts,
2117
2380
  model,
2118
2381
  outputFormat,
2119
2382
  allowedTools,
@@ -2142,26 +2405,53 @@ export function createGatewayServer(deps = {}) {
2142
2405
  const { corrId, args } = prep;
2143
2406
  let durationMs = 0;
2144
2407
  let wasSuccessful = false;
2408
+ // Session resolution happens BEFORE safeFlightStart so that:
2409
+ // (1) the TTL warning reads the PRIOR session's lastWriteAt
2410
+ // rather than the row about to be inserted (codex-r1/F1).
2411
+ // (2) the flight-recorder row is tagged with effectiveSessionId
2412
+ // (the session the CLI will actually resume), not the raw
2413
+ // user-provided sessionId.
2414
+ let effectiveSessionId = sessionId;
2415
+ let useContinue = continueSession;
2416
+ // Guard the active-session lookup: in some test harnesses the
2417
+ // sessionManager is undefined; the original try-catch wrapped this
2418
+ // block, so we replicate that tolerance here. Failure leaves
2419
+ // effectiveSessionId as the user-provided sessionId.
2420
+ let activeSession = null;
2421
+ try {
2422
+ activeSession = await sessionManager.getActiveSession("claude");
2423
+ }
2424
+ catch (err) {
2425
+ logger.warn(`[${corrId}] sessionManager.getActiveSession failed (non-fatal): ${err.message}`);
2426
+ }
2427
+ if (!createNewSession && !continueSession && !sessionId && activeSession) {
2428
+ effectiveSessionId = activeSession.id;
2429
+ useContinue = true;
2430
+ }
2431
+ if (!useContinue && effectiveSessionId && activeSession?.id === effectiveSessionId) {
2432
+ useContinue = true;
2433
+ }
2434
+ // Slice 3: if the resolved session has a near-expiry Anthropic
2435
+ // cache breakpoint, attach a structured warning (NOT a hard error)
2436
+ // to the response. Computed BEFORE safeFlightStart so the current
2437
+ // row does not skew lastRequestAt.
2438
+ const ttlWarning = maybeBuildCacheTtlWarning({
2439
+ runtime,
2440
+ sessionId: effectiveSessionId,
2441
+ cli: "claude",
2442
+ });
2443
+ const warnings = ttlWarning ? [ttlWarning] : [];
2145
2444
  safeFlightStart({
2146
2445
  correlationId: corrId,
2147
2446
  cli: "claude",
2148
2447
  model: prep.resolvedModel || "default",
2149
- prompt,
2150
- sessionId,
2448
+ prompt: prep.effectivePrompt,
2449
+ sessionId: effectiveSessionId,
2450
+ stablePrefixHash: prep.stablePrefixHash ?? undefined,
2451
+ stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
2151
2452
  }, runtime);
2152
- logger.info(`[${corrId}] claude_request invoked with model=${prep.resolvedModel || "default"}, outputFormat=${outputFormat}, prompt length=${prompt.length}, sessionId=${sessionId}`);
2453
+ logger.info(`[${corrId}] claude_request invoked with model=${prep.resolvedModel || "default"}, outputFormat=${outputFormat}, prompt length=${prep.effectivePrompt.length}, sessionId=${effectiveSessionId}`);
2153
2454
  try {
2154
- // Session management
2155
- let effectiveSessionId = sessionId;
2156
- let useContinue = continueSession;
2157
- const activeSession = await sessionManager.getActiveSession("claude");
2158
- if (!createNewSession && !continueSession && !sessionId && activeSession) {
2159
- effectiveSessionId = activeSession.id;
2160
- useContinue = true;
2161
- }
2162
- if (!useContinue && effectiveSessionId && activeSession?.id === effectiveSessionId) {
2163
- useContinue = true;
2164
- }
2165
2455
  if (useContinue) {
2166
2456
  args.push("--continue");
2167
2457
  }
@@ -2190,7 +2480,14 @@ export function createGatewayServer(deps = {}) {
2190
2480
  errorMessage: stderr || `Exit code ${code}`,
2191
2481
  status: "failed",
2192
2482
  }, runtime);
2193
- return createErrorResponse("claude", code, stderr, corrId);
2483
+ // Slice 3: attach any computed warnings to the error response so
2484
+ // the caller still sees cache_ttl_expiring_soon when the CLI
2485
+ // happens to fail for an unrelated reason.
2486
+ const errResp = createErrorResponse("claude", code, stderr, corrId);
2487
+ if (warnings.length > 0) {
2488
+ errResp.warnings = warnings;
2489
+ }
2490
+ return errResp;
2194
2491
  }
2195
2492
  wasSuccessful = true;
2196
2493
  // If we used a session ID and it's not tracked yet, create a session record
@@ -2221,7 +2518,7 @@ export function createGatewayServer(deps = {}) {
2221
2518
  exitCode: 0,
2222
2519
  status: "completed",
2223
2520
  }, runtime);
2224
- return buildCliResponse("claude", parsed.text, optimizeResponse, corrId, effectiveSessionId, prep, durationMs, undefined, outputFormat);
2521
+ return buildCliResponse("claude", parsed.text, optimizeResponse, corrId, effectiveSessionId, prep, durationMs, undefined, outputFormat, warnings);
2225
2522
  }
2226
2523
  safeFlightComplete(corrId, {
2227
2524
  response: stdout,
@@ -2232,7 +2529,7 @@ export function createGatewayServer(deps = {}) {
2232
2529
  exitCode: 0,
2233
2530
  status: "completed",
2234
2531
  }, runtime);
2235
- return buildCliResponse("claude", stdout, optimizeResponse, corrId, effectiveSessionId, prep, durationMs, undefined, outputFormat);
2532
+ return buildCliResponse("claude", stdout, optimizeResponse, corrId, effectiveSessionId, prep, durationMs, undefined, outputFormat, warnings);
2236
2533
  }
2237
2534
  catch (error) {
2238
2535
  const elapsedMs = Math.max(0, Date.now() - startTime);
@@ -2262,7 +2559,9 @@ export function createGatewayServer(deps = {}) {
2262
2559
  .string()
2263
2560
  .min(1, "Prompt cannot be empty")
2264
2561
  .max(100000, "Prompt too long (max 100k chars)")
2265
- .describe("Prompt text for Codex"),
2562
+ .optional()
2563
+ .describe("Prompt text for Codex (mutually exclusive with promptParts)"),
2564
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2266
2565
  model: z.string().optional().describe("Model name or alias (e.g. gpt-5.4, latest)"),
2267
2566
  fullAuto: z
2268
2567
  .boolean()
@@ -2353,10 +2652,11 @@ export function createGatewayServer(deps = {}) {
2353
2652
  .boolean()
2354
2653
  .optional()
2355
2654
  .describe("Codex --ignore-rules: skip project rule files for this run."),
2356
- }, async ({ prompt, model, fullAuto, sandboxMode, askForApproval, useLegacyFullAutoFlag, dangerouslyBypassApprovalsAndSandbox, approvalStrategy, approvalPolicy, mcpServers, sessionId, resumeLatest, createNewSession, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, outputFormat, outputSchema, search, profile, configOverrides, ephemeral, images, ignoreUserConfig, ignoreRules, }) => {
2655
+ }, async ({ prompt, promptParts, model, fullAuto, sandboxMode, askForApproval, useLegacyFullAutoFlag, dangerouslyBypassApprovalsAndSandbox, approvalStrategy, approvalPolicy, mcpServers, sessionId, resumeLatest, createNewSession, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, outputFormat, outputSchema, search, profile, configOverrides, ephemeral, images, ignoreUserConfig, ignoreRules, }) => {
2357
2656
  const startTime = Date.now();
2358
2657
  const prep = prepareCodexRequest({
2359
2658
  prompt,
2659
+ promptParts,
2360
2660
  model,
2361
2661
  fullAuto,
2362
2662
  sandboxMode,
@@ -2391,10 +2691,12 @@ export function createGatewayServer(deps = {}) {
2391
2691
  correlationId: corrId,
2392
2692
  cli: "codex",
2393
2693
  model: prep.resolvedModel || "default",
2394
- prompt,
2694
+ prompt: prep.effectivePrompt,
2395
2695
  sessionId,
2696
+ stablePrefixHash: prep.stablePrefixHash ?? undefined,
2697
+ stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
2396
2698
  }, runtime);
2397
- logger.info(`[${corrId}] codex_request invoked with model=${prep.resolvedModel || "default"}, fullAuto=${fullAuto}, prompt length=${prompt.length}`);
2699
+ logger.info(`[${corrId}] codex_request invoked with model=${prep.resolvedModel || "default"}, fullAuto=${fullAuto}, prompt length=${prep.effectivePrompt.length}`);
2398
2700
  // U26 fix: pass the outputSchema cleanup to awaitJobOrDefer, which
2399
2701
  // guarantees the cleanup runs exactly once — inline for direct
2400
2702
  // execution, on terminal status for the job-backed path (sync
@@ -2587,7 +2889,9 @@ export function createGatewayServer(deps = {}) {
2587
2889
  .string()
2588
2890
  .min(1, "Prompt cannot be empty")
2589
2891
  .max(100000, "Prompt too long (max 100k chars)")
2590
- .describe("Prompt text for Gemini"),
2892
+ .optional()
2893
+ .describe("Prompt text for Gemini (mutually exclusive with promptParts)"),
2894
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2591
2895
  model: z
2592
2896
  .string()
2593
2897
  .optional()
@@ -2640,9 +2944,10 @@ export function createGatewayServer(deps = {}) {
2640
2944
  policyFiles: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.policyFiles.describe("Policy file paths (--policy <path>, one per file). Paths must exist."),
2641
2945
  adminPolicyFiles: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.adminPolicyFiles.describe("Admin policy file paths (--admin-policy <path>, one per file). Paths must exist."),
2642
2946
  attachments: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.attachments.describe("Absolute file paths prepended as @<path> tokens to the prompt"),
2643
- }, async ({ prompt, model, sessionId, resumeLatest, createNewSession, approvalMode, approvalStrategy, approvalPolicy, mcpServers, allowedTools, includeDirs, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, outputFormat, sandbox, policyFiles, adminPolicyFiles, attachments, }) => {
2947
+ }, async ({ prompt, promptParts, model, sessionId, resumeLatest, createNewSession, approvalMode, approvalStrategy, approvalPolicy, mcpServers, allowedTools, includeDirs, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, outputFormat, sandbox, policyFiles, adminPolicyFiles, attachments, }) => {
2644
2948
  return handleGeminiRequest({ sessionManager, logger, runtime }, {
2645
2949
  prompt,
2950
+ promptParts,
2646
2951
  model,
2647
2952
  sessionId,
2648
2953
  resumeLatest,
@@ -2673,7 +2978,9 @@ export function createGatewayServer(deps = {}) {
2673
2978
  .string()
2674
2979
  .min(1, "Prompt cannot be empty")
2675
2980
  .max(100000, "Prompt too long (max 100k chars)")
2676
- .describe("Prompt text for Grok"),
2981
+ .optional()
2982
+ .describe("Prompt text for Grok (mutually exclusive with promptParts)"),
2983
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2677
2984
  model: z.string().optional().describe("Model name or alias (e.g. grok-build, latest)"),
2678
2985
  outputFormat: z
2679
2986
  .enum(["plain", "json", "streaming-json"])
@@ -2735,9 +3042,10 @@ export function createGatewayServer(deps = {}) {
2735
3042
  .boolean()
2736
3043
  .default(false)
2737
3044
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
2738
- }, async ({ prompt, model, outputFormat, sessionId, resumeLatest, createNewSession, alwaysApprove, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
3045
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, resumeLatest, createNewSession, alwaysApprove, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
2739
3046
  return handleGrokRequest({ sessionManager, logger, runtime }, {
2740
3047
  prompt,
3048
+ promptParts,
2741
3049
  model,
2742
3050
  outputFormat,
2743
3051
  sessionId,
@@ -2767,7 +3075,9 @@ export function createGatewayServer(deps = {}) {
2767
3075
  .string()
2768
3076
  .min(1, "Prompt cannot be empty")
2769
3077
  .max(100000, "Prompt too long (max 100k chars)")
2770
- .describe("Prompt text for Mistral Vibe"),
3078
+ .optional()
3079
+ .describe("Prompt text for Mistral Vibe (mutually exclusive with promptParts)"),
3080
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2771
3081
  model: z
2772
3082
  .string()
2773
3083
  .optional()
@@ -2828,9 +3138,10 @@ export function createGatewayServer(deps = {}) {
2828
3138
  .boolean()
2829
3139
  .default(false)
2830
3140
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
2831
- }, async ({ prompt, model, outputFormat, sessionId, resumeLatest, createNewSession, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
3141
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, resumeLatest, createNewSession, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
2832
3142
  return handleMistralRequest({ sessionManager, logger, runtime }, {
2833
3143
  prompt,
3144
+ promptParts,
2834
3145
  model,
2835
3146
  outputFormat,
2836
3147
  sessionId,
@@ -2867,7 +3178,9 @@ export function createGatewayServer(deps = {}) {
2867
3178
  .string()
2868
3179
  .min(1, "Prompt cannot be empty")
2869
3180
  .max(100000, "Prompt too long (max 100k chars)")
2870
- .describe("Prompt text for Claude"),
3181
+ .optional()
3182
+ .describe("Prompt text for Claude (mutually exclusive with promptParts)"),
3183
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2871
3184
  model: z
2872
3185
  .string()
2873
3186
  .optional()
@@ -2961,12 +3274,13 @@ export function createGatewayServer(deps = {}) {
2961
3274
  .boolean()
2962
3275
  .default(false)
2963
3276
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
2964
- }, async ({ prompt, model, outputFormat, sessionId, continueSession, createNewSession, allowedTools, disallowedTools, dangerouslySkipPermissions, permissionMode, agent, agents, forkSession, systemPrompt, appendSystemPrompt, maxBudgetUsd, maxTurns, effort, excludeDynamicSystemPromptSections, approvalStrategy, approvalPolicy, mcpServers, strictMcpConfig, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3277
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, continueSession, createNewSession, allowedTools, disallowedTools, dangerouslySkipPermissions, permissionMode, agent, agents, forkSession, systemPrompt, appendSystemPrompt, maxBudgetUsd, maxTurns, effort, excludeDynamicSystemPromptSections, approvalStrategy, approvalPolicy, mcpServers, strictMcpConfig, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
2965
3278
  if (systemPrompt !== undefined && appendSystemPrompt !== undefined) {
2966
3279
  return createErrorResponse("claude", 1, "", correlationId, new Error("systemPrompt and appendSystemPrompt are mutually exclusive; use one or the other (not both)."));
2967
3280
  }
2968
3281
  const prep = prepareClaudeRequest({
2969
3282
  prompt,
3283
+ promptParts,
2970
3284
  model,
2971
3285
  outputFormat,
2972
3286
  allowedTools,
@@ -3018,6 +3332,12 @@ export function createGatewayServer(deps = {}) {
3018
3332
  await sessionManager.createSession("claude", "Claude Session", effectiveSessionId);
3019
3333
  }
3020
3334
  }
3335
+ // Slice 3: TTL warning on resume (async path too).
3336
+ const ttlWarning = maybeBuildCacheTtlWarning({
3337
+ runtime,
3338
+ sessionId: effectiveSessionId,
3339
+ cli: "claude",
3340
+ });
3021
3341
  // Idle timeout only for stream-json (text/json produce no output until done)
3022
3342
  const effectiveIdleTimeout = outputFormat === "stream-json"
3023
3343
  ? resolveIdleTimeout("claude", idleTimeoutMs)
@@ -3040,6 +3360,9 @@ export function createGatewayServer(deps = {}) {
3040
3360
  if (prep.reviewIntegrity && prep.reviewIntegrity.violations.length > 0) {
3041
3361
  asyncResponse.reviewIntegrity = prep.reviewIntegrity;
3042
3362
  }
3363
+ if (ttlWarning) {
3364
+ asyncResponse.warnings = [ttlWarning];
3365
+ }
3043
3366
  return {
3044
3367
  content: [
3045
3368
  {
@@ -3058,7 +3381,9 @@ export function createGatewayServer(deps = {}) {
3058
3381
  .string()
3059
3382
  .min(1, "Prompt cannot be empty")
3060
3383
  .max(100000, "Prompt too long (max 100k chars)")
3061
- .describe("Prompt text for Codex"),
3384
+ .optional()
3385
+ .describe("Prompt text for Codex (mutually exclusive with promptParts)"),
3386
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
3062
3387
  model: z.string().optional().describe("Model name or alias (e.g. gpt-5.4, latest)"),
3063
3388
  fullAuto: z
3064
3389
  .boolean()
@@ -3131,9 +3456,10 @@ export function createGatewayServer(deps = {}) {
3131
3456
  images: z.array(z.string()).optional().describe("Codex -i <path>: image attachments."),
3132
3457
  ignoreUserConfig: z.boolean().optional().describe("Codex --ignore-user-config."),
3133
3458
  ignoreRules: z.boolean().optional().describe("Codex --ignore-rules."),
3134
- }, async ({ prompt, model, fullAuto, sandboxMode, askForApproval, useLegacyFullAutoFlag, dangerouslyBypassApprovalsAndSandbox, approvalStrategy, approvalPolicy, mcpServers, sessionId, resumeLatest, createNewSession, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, outputFormat, outputSchema, search, profile, configOverrides, ephemeral, images, ignoreUserConfig, ignoreRules, }) => {
3459
+ }, async ({ prompt, promptParts, model, fullAuto, sandboxMode, askForApproval, useLegacyFullAutoFlag, dangerouslyBypassApprovalsAndSandbox, approvalStrategy, approvalPolicy, mcpServers, sessionId, resumeLatest, createNewSession, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, outputFormat, outputSchema, search, profile, configOverrides, ephemeral, images, ignoreUserConfig, ignoreRules, }) => {
3135
3460
  return handleCodexRequestAsync({ sessionManager, asyncJobManager, logger, runtime }, {
3136
3461
  prompt,
3462
+ promptParts,
3137
3463
  model,
3138
3464
  fullAuto,
3139
3465
  sandboxMode,
@@ -3166,7 +3492,9 @@ export function createGatewayServer(deps = {}) {
3166
3492
  .string()
3167
3493
  .min(1, "Prompt cannot be empty")
3168
3494
  .max(100000, "Prompt too long (max 100k chars)")
3169
- .describe("Prompt text for Gemini"),
3495
+ .optional()
3496
+ .describe("Prompt text for Gemini (mutually exclusive with promptParts)"),
3497
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
3170
3498
  model: z
3171
3499
  .string()
3172
3500
  .optional()
@@ -3221,9 +3549,10 @@ export function createGatewayServer(deps = {}) {
3221
3549
  policyFiles: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.policyFiles.describe("Policy file paths (--policy <path>, one per file). Paths must exist."),
3222
3550
  adminPolicyFiles: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.adminPolicyFiles.describe("Admin policy file paths (--admin-policy <path>, one per file). Paths must exist."),
3223
3551
  attachments: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.attachments.describe("Absolute file paths prepended as @<path> tokens to the prompt"),
3224
- }, async ({ prompt, model, sessionId, resumeLatest, createNewSession, approvalMode, approvalStrategy, approvalPolicy, mcpServers, allowedTools, includeDirs, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, outputFormat, sandbox, policyFiles, adminPolicyFiles, attachments, }) => {
3552
+ }, async ({ prompt, promptParts, model, sessionId, resumeLatest, createNewSession, approvalMode, approvalStrategy, approvalPolicy, mcpServers, allowedTools, includeDirs, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, outputFormat, sandbox, policyFiles, adminPolicyFiles, attachments, }) => {
3225
3553
  return handleGeminiRequestAsync({ sessionManager, asyncJobManager, logger, runtime }, {
3226
3554
  prompt,
3555
+ promptParts,
3227
3556
  model,
3228
3557
  sessionId,
3229
3558
  resumeLatest,
@@ -3250,7 +3579,9 @@ export function createGatewayServer(deps = {}) {
3250
3579
  .string()
3251
3580
  .min(1, "Prompt cannot be empty")
3252
3581
  .max(100000, "Prompt too long (max 100k chars)")
3253
- .describe("Prompt text for Grok"),
3582
+ .optional()
3583
+ .describe("Prompt text for Grok (mutually exclusive with promptParts)"),
3584
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
3254
3585
  model: z.string().optional().describe("Model name or alias (e.g. grok-build, latest)"),
3255
3586
  outputFormat: z
3256
3587
  .enum(["plain", "json", "streaming-json"])
@@ -3311,9 +3642,10 @@ export function createGatewayServer(deps = {}) {
3311
3642
  .boolean()
3312
3643
  .default(false)
3313
3644
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
3314
- }, async ({ prompt, model, outputFormat, sessionId, resumeLatest, createNewSession, alwaysApprove, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3645
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, resumeLatest, createNewSession, alwaysApprove, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3315
3646
  return handleGrokRequestAsync({ sessionManager, asyncJobManager, logger, runtime }, {
3316
3647
  prompt,
3648
+ promptParts,
3317
3649
  model,
3318
3650
  outputFormat,
3319
3651
  sessionId,
@@ -3339,7 +3671,9 @@ export function createGatewayServer(deps = {}) {
3339
3671
  .string()
3340
3672
  .min(1, "Prompt cannot be empty")
3341
3673
  .max(100000, "Prompt too long (max 100k chars)")
3342
- .describe("Prompt text for Mistral Vibe"),
3674
+ .optional()
3675
+ .describe("Prompt text for Mistral Vibe (mutually exclusive with promptParts)"),
3676
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
3343
3677
  model: z
3344
3678
  .string()
3345
3679
  .optional()
@@ -3399,9 +3733,10 @@ export function createGatewayServer(deps = {}) {
3399
3733
  .boolean()
3400
3734
  .default(false)
3401
3735
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
3402
- }, async ({ prompt, model, outputFormat, sessionId, resumeLatest, createNewSession, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3736
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, resumeLatest, createNewSession, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3403
3737
  return handleMistralRequestAsync({ sessionManager, asyncJobManager, logger, runtime }, {
3404
3738
  prompt,
3739
+ promptParts,
3405
3740
  model,
3406
3741
  outputFormat,
3407
3742
  sessionId,
@@ -3852,6 +4187,38 @@ export function createGatewayServer(deps = {}) {
3852
4187
  };
3853
4188
  }
3854
4189
  const activeSession = await sessionManager.getActiveSession(session.cli);
4190
+ // Slice 2: project a compact cacheState view from the flight
4191
+ // recorder at read time. NOT persisted on the Session interface
4192
+ // (sessions.json stays content-free per the project invariant).
4193
+ // The field is OMITTED entirely (not null, not empty object) when
4194
+ // the session has zero rows in the flight recorder so the response
4195
+ // stays compact for fresh sessions.
4196
+ //
4197
+ // Slice 3: include ttlRemainingMs derived from the gateway's
4198
+ // configured TTL policy. Null for non-claude sessions.
4199
+ let cacheState;
4200
+ try {
4201
+ const stats = computeSessionCacheStats(flightRecorder, session.id);
4202
+ if (stats.requestCount > 0) {
4203
+ const ttlRemainingMs = computeTtlRemaining(stats, stats.cli, {
4204
+ anthropicTtlSeconds: cacheAwareness?.anthropicTtlSeconds ?? 300,
4205
+ });
4206
+ cacheState = {
4207
+ cli: stats.cli,
4208
+ prefixDistinct: stats.distinctPrefixCount,
4209
+ totalCacheReadTokens: stats.totalCacheReadTokens,
4210
+ totalCacheCreationTokens: stats.totalCacheCreationTokens,
4211
+ requestCount: stats.requestCount,
4212
+ hitCount: stats.hitCount,
4213
+ hitRate: stats.hitRate,
4214
+ estimatedSavingsUsd: stats.estimatedSavingsUsd,
4215
+ ttlRemainingMs,
4216
+ };
4217
+ }
4218
+ }
4219
+ catch (err) {
4220
+ logger.warn?.(`[session_get] cache-stats lookup failed (non-fatal)`, err);
4221
+ }
3855
4222
  return {
3856
4223
  content: [
3857
4224
  {
@@ -3861,6 +4228,7 @@ export function createGatewayServer(deps = {}) {
3861
4228
  session: {
3862
4229
  ...session,
3863
4230
  isActive: activeSession?.id === session.id,
4231
+ ...(cacheState ? { cacheState } : {}),
3864
4232
  },
3865
4233
  }, null, 2),
3866
4234
  },
@@ -3913,7 +4281,7 @@ async function initializeSessionManager() {
3913
4281
  sessionManager = await createSessionManager(config, undefined, logger);
3914
4282
  logger.info("File-based session manager initialized");
3915
4283
  }
3916
- resourceProvider = new ResourceProvider(sessionManager, performanceMetrics);
4284
+ resourceProvider = new ResourceProvider(sessionManager, performanceMetrics, getFlightRecorder(logger), getCacheAwarenessConfig(logger));
3917
4285
  }
3918
4286
  //──────────────────────────────────────────────────────────────────────────────
3919
4287
  // Health Check Resource (only if using PostgreSQL)
@@ -3944,7 +4312,7 @@ function registerHealthResource(server) {
3944
4312
  description: "Async job health (CPU, memory, zombie detection)",
3945
4313
  mimeType: "application/json",
3946
4314
  }, async (uri) => {
3947
- const health = asyncJobManager.getJobHealth();
4315
+ const health = getAsyncJobManager().getJobHealth();
3948
4316
  return {
3949
4317
  contents: [
3950
4318
  {
@@ -3980,8 +4348,10 @@ async function shutdown(signal) {
3980
4348
  await db.disconnect();
3981
4349
  logger.info("Database connections closed");
3982
4350
  }
3983
- flightRecorder.close();
3984
- logger.info("Flight recorder closed");
4351
+ if (flightRecorder) {
4352
+ flightRecorder.close();
4353
+ logger.info("Flight recorder closed");
4354
+ }
3985
4355
  process.exit(0);
3986
4356
  }
3987
4357
  catch (error) {
@@ -3997,6 +4367,20 @@ process.on("SIGINT", () => shutdown("SIGINT"));
3997
4367
  async function main() {
3998
4368
  startWindowsBootstrapperSelfHeal();
3999
4369
  const args = process.argv.slice(2);
4370
+ if (args[0] === "--version" || args[0] === "-version" || args[0] === "version") {
4371
+ process.stdout.write(`${packageVersion()}\n`);
4372
+ return;
4373
+ }
4374
+ if (args[0] === "--help" || args[0] === "-help" || args[0] === "/?" || args[0] === "help") {
4375
+ process.stdout.write([
4376
+ "llm-cli-gateway MCP server",
4377
+ "",
4378
+ "Usage:",
4379
+ " llm-cli-gateway [doctor --json|contracts --json|--transport=http|--version]",
4380
+ "",
4381
+ ].join("\n"));
4382
+ return;
4383
+ }
4000
4384
  if (args[0] === "doctor") {
4001
4385
  if (args.includes("--json")) {
4002
4386
  printDoctorJson();
@@ -4035,9 +4419,9 @@ async function main() {
4035
4419
  resourceProvider,
4036
4420
  db,
4037
4421
  performanceMetrics,
4038
- asyncJobManager,
4039
- approvalManager,
4040
- flightRecorder,
4422
+ asyncJobManager: getAsyncJobManager(logger),
4423
+ approvalManager: getApprovalManager(logger),
4424
+ flightRecorder: getFlightRecorder(logger),
4041
4425
  logger,
4042
4426
  };
4043
4427
  if (transportMode === "http") {