llm-cli-gateway 1.5.35 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
2
+ import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
3
3
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
4
4
  import { randomUUID } from "crypto";
5
5
  import { existsSync, readFileSync, readdirSync, renameSync, unlinkSync } from "fs";
@@ -14,7 +14,7 @@ import { createSessionManager } from "./session-manager.js";
14
14
  import { ResourceProvider } from "./resources.js";
15
15
  import { PerformanceMetrics } from "./metrics.js";
16
16
  import { estimateTokens, optimizePrompt as optimizePromptText, optimizeResponse as optimizeResponseText, } from "./optimizer.js";
17
- import { loadConfig, loadPersistenceConfig } from "./config.js";
17
+ import { loadConfig, loadPersistenceConfig, loadCacheAwarenessConfig, } from "./config.js";
18
18
  import { checkHealth } from "./health.js";
19
19
  import { clearModelRegistryCache, getAvailableCliInfo, getCliInfo, resolveModelAlias, } from "./model-registry.js";
20
20
  import { AsyncJobManager } from "./async-job-manager.js";
@@ -24,6 +24,8 @@ import { checkReviewIntegrity } from "./review-integrity.js";
24
24
  import { buildClaudeMcpConfig, CLAUDE_MCP_SERVER_NAMES, } from "./claude-mcp-config.js";
25
25
  import { resolveGrokSessionArgs, resolveMistralSessionArgs, resolveCodexSessionArgs, sanitizeCliArgValues, prepareMistralRequest as buildMistralCliInvocation, MISTRAL_AGENT_MODES, GATEWAY_SESSION_PREFIX, resolveClaudePermissionFlags, resolveCodexSandboxFlags, CLAUDE_PERMISSION_MODES, GEMINI_APPROVAL_MODES, CODEX_SANDBOX_MODES, CODEX_ASK_FOR_APPROVAL_MODES, CLAUDE_EFFORT_LEVELS, prepareClaudeHighImpactFlags, validateClaudeAgentsMap, prepareCodexHighImpactFlags, prepareCodexForkRequest, CODEX_CONFIG_OVERRIDES_SCHEMA, prepareGeminiHighImpactFlags, prependGeminiAttachments, resolveGeminiSessionPlan, GEMINI_HIGH_IMPACT_PARAMS_SCHEMA, } from "./request-helpers.js";
26
26
  import { createFlightRecorder } from "./flight-recorder.js";
27
+ import { resolvePromptInput, PromptPartsSchema } from "./prompt-parts.js";
28
+ import { computeSessionCacheStats, computeTtlRemaining } from "./cache-stats.js";
27
29
  import { getCliVersions, runCliUpgrade } from "./cli-updater.js";
28
30
  import { startHttpGateway } from "./http-transport.js";
29
31
  import { printDoctorJson } from "./doctor.js";
@@ -181,6 +183,7 @@ let flightRecorder = null;
181
183
  // registered (see createGatewayServer), making silent in-memory loss
182
184
  // structurally impossible.
183
185
  let persistenceConfig = null;
186
+ let cacheAwarenessConfig = null;
184
187
  let jobStore = null;
185
188
  let jobStoreInitialized = false;
186
189
  let asyncJobManager = null;
@@ -193,6 +196,10 @@ function getPersistenceConfig(runtimeLogger = logger) {
193
196
  persistenceConfig ??= loadPersistenceConfig(runtimeLogger);
194
197
  return persistenceConfig;
195
198
  }
199
/**
 * Returns the cache-awareness configuration, loading it on first use.
 *
 * The loaded value is stored in the module-level `cacheAwarenessConfig` slot,
 * so `loadCacheAwarenessConfig` is only invoked while that slot is still
 * null or undefined.
 *
 * @param runtimeLogger Logger handed to the loader (defaults to `logger`).
 * @returns The memoized cache-awareness configuration.
 */
function getCacheAwarenessConfig(runtimeLogger = logger) {
    if (cacheAwarenessConfig === null || cacheAwarenessConfig === undefined) {
        cacheAwarenessConfig = loadCacheAwarenessConfig(runtimeLogger);
    }
    return cacheAwarenessConfig;
}
196
203
  function getJobStore(runtimeLogger = logger) {
197
204
  if (jobStoreInitialized)
198
205
  return jobStore;
@@ -242,19 +249,21 @@ function resolveGatewayServerRuntime(deps = {}, options = {}) {
242
249
  (options.isolateState
243
250
  ? new ApprovalManager(undefined, runtimeLogger)
244
251
  : getApprovalManager(runtimeLogger));
252
+ const runtimeFlightRecorder = deps.flightRecorder ?? getFlightRecorder(runtimeLogger);
245
253
  return {
246
254
  sessionManager: runtimeSessionManager,
247
255
  resourceProvider: deps.resourceProvider ??
248
256
  (options.isolateState
249
- ? new ResourceProvider(runtimeSessionManager, runtimePerformanceMetrics)
257
+ ? new ResourceProvider(runtimeSessionManager, runtimePerformanceMetrics, runtimeFlightRecorder, deps.cacheAwareness ?? getCacheAwarenessConfig(runtimeLogger))
250
258
  : resourceProvider),
251
259
  db: "db" in deps ? (deps.db ?? null) : db,
252
260
  performanceMetrics: runtimePerformanceMetrics,
253
261
  asyncJobManager: runtimeAsyncJobManager,
254
262
  approvalManager: runtimeApprovalManager,
255
- flightRecorder: deps.flightRecorder ?? getFlightRecorder(runtimeLogger),
263
+ flightRecorder: runtimeFlightRecorder,
256
264
  logger: runtimeLogger,
257
265
  persistence: deps.persistence ?? getPersistenceConfig(runtimeLogger),
266
+ cacheAwareness: deps.cacheAwareness ?? getCacheAwarenessConfig(runtimeLogger),
258
267
  };
259
268
  }
260
269
  // Per-CLI idle timeouts: kill process if no stdout/stderr activity for this duration.
@@ -704,14 +713,118 @@ function registerBaseResources(server, runtime) {
704
713
  const contents = await runtime.resourceProvider.readResource(uri.href);
705
714
  return { contents: contents ? [contents] : [] };
706
715
  });
716
+ // Cache-state resources (slice 2). Static URI for global, templated for
717
+ // session/{id} and prefix/{hash}. All three return tokens/hashes/aggregates
718
+ // ONLY — never raw prompt or response text. The structural guarantee is in
719
+ // the SessionCacheStats / PrefixCacheStats / GlobalCacheStats types
720
+ // themselves: those shapes have no prompt/response/system/task fields.
721
+ server.registerResource("cache-state-global", "cache_state://global", {
722
+ title: "💾 Cache State (Global)",
723
+ description: "Aggregate cache hit/miss/savings across all CLIs in the flight recorder. Tokens/hashes only — no prompt text.",
724
+ mimeType: "application/json",
725
+ }, async (uri) => {
726
+ runtime.logger.debug("Reading cache_state://global resource");
727
+ const stats = runtime.resourceProvider.readCacheStateGlobal({
728
+ lastNHours: 24,
729
+ });
730
+ return {
731
+ contents: [
732
+ {
733
+ uri: uri.href,
734
+ mimeType: "application/json",
735
+ text: JSON.stringify(stats, null, 2),
736
+ },
737
+ ],
738
+ };
739
+ });
740
+ server.registerResource("cache-state-session", new ResourceTemplate("cache_state://session/{sessionId}", { list: undefined }), {
741
+ title: "💾 Cache State (Session)",
742
+ description: "Per-session cache hit/miss/savings. Tokens/hashes only — no prompt text.",
743
+ mimeType: "application/json",
744
+ }, async (uri, variables) => {
745
+ const sessionId = Array.isArray(variables.sessionId)
746
+ ? variables.sessionId[0]
747
+ : variables.sessionId;
748
+ runtime.logger.debug(`Reading cache_state://session/${sessionId}`);
749
+ const stats = runtime.resourceProvider.readCacheStateSession(String(sessionId));
750
+ return {
751
+ contents: [
752
+ {
753
+ uri: uri.href,
754
+ mimeType: "application/json",
755
+ text: JSON.stringify(stats, null, 2),
756
+ },
757
+ ],
758
+ };
759
+ });
760
+ server.registerResource("cache-state-prefix", new ResourceTemplate("cache_state://prefix/{hash}", { list: undefined }), {
761
+ title: "💾 Cache State (Prefix)",
762
+ description: "Per-stable-prefix-hash cache hit/miss/savings, with CLI breakdown. Tokens/hashes only — no prompt text.",
763
+ mimeType: "application/json",
764
+ }, async (uri, variables) => {
765
+ const hash = Array.isArray(variables.hash) ? variables.hash[0] : variables.hash;
766
+ runtime.logger.debug(`Reading cache_state://prefix/${hash}`);
767
+ const stats = runtime.resourceProvider.readCacheStateForPrefix(String(hash));
768
+ return {
769
+ contents: [
770
+ {
771
+ uri: uri.href,
772
+ mimeType: "application/json",
773
+ text: JSON.stringify(stats, null, 2),
774
+ },
775
+ ],
776
+ };
777
+ });
778
+ }
779
/**
 * Slice 1: validate the prompt / promptParts mutex at the prep boundary and
 * return either an error response or the resolved input. The exact error
 * messages are part of the public contract — tests assert them verbatim.
 *
 * A prompt counts as supplied only when it is a non-empty string; promptParts
 * counts as supplied only when it is neither undefined nor null. Whichever
 * input was judged absent is forwarded to `resolvePromptInput` as undefined,
 * so the resolver only ever sees the single input that passed validation.
 *
 * @param args.prompt Raw prompt text, if any.
 * @param args.promptParts Structured prompt parts, if any.
 * @param args.operation Operation name passed through to the error response.
 * @param args.correlationId Correlation id passed through to the error response.
 * @returns `{ ok: false, error }` when zero or both inputs were supplied,
 *   otherwise `{ ok: true, assembledPrompt, stablePrefixHash, stablePrefixTokens }`
 *   copied from the `resolvePromptInput` result.
 */
function resolvePromptOrPartsForPrep(args) {
    const hasPrompt = typeof args.prompt === "string" && args.prompt.length > 0;
    // `!= null`-style check: an explicit null means "no parts", same as omitting the field.
    const hasParts = args.promptParts !== undefined && args.promptParts !== null;
    if (hasPrompt && hasParts) {
        return {
            ok: false,
            error: createErrorResponse(args.operation, 1, "", args.correlationId, new Error("provide exactly one of `prompt` or `promptParts`")),
        };
    }
    if (!hasPrompt && !hasParts) {
        return {
            ok: false,
            error: createErrorResponse(args.operation, 1, "", args.correlationId, new Error("one of `prompt` or `promptParts` is required")),
        };
    }
    // Forward only the input that passed validation; an empty-string prompt or
    // a null promptParts must not reach the resolver next to the real input.
    const resolved = resolvePromptInput({
        prompt: hasPrompt ? args.prompt : undefined,
        promptParts: hasParts ? args.promptParts : undefined,
    });
    return {
        ok: true,
        assembledPrompt: resolved.assembledPrompt,
        stablePrefixHash: resolved.stablePrefixHash,
        stablePrefixTokens: resolved.stablePrefixTokens,
    };
}
708
810
  export function prepareClaudeRequest(params, runtime = resolveGatewayServerRuntime()) {
709
811
  const corrId = params.correlationId || randomUUID();
710
812
  const cliInfo = getCliInfo();
711
813
  const resolvedModel = resolveModelAlias("claude", params.model, cliInfo);
814
+ const inputResolution = resolvePromptOrPartsForPrep({
815
+ prompt: params.prompt,
816
+ promptParts: params.promptParts,
817
+ operation: params.operation,
818
+ correlationId: corrId,
819
+ });
820
+ if (!inputResolution.ok)
821
+ return inputResolution.error;
822
+ const assembledPrompt = inputResolution.assembledPrompt;
823
+ const stablePrefixHash = inputResolution.stablePrefixHash;
824
+ const stablePrefixTokens = inputResolution.stablePrefixTokens;
712
825
  // Review integrity check on raw prompt (before optimization)
713
826
  const reviewIntegrity = checkReviewIntegrity({
714
- prompt: params.prompt,
827
+ prompt: assembledPrompt,
715
828
  allowedTools: params.allowedTools,
716
829
  disallowedTools: params.disallowedTools,
717
830
  });
@@ -722,7 +835,7 @@ export function prepareClaudeRequest(params, runtime = resolveGatewayServerRunti
722
835
  score: reviewIntegrity.totalScore,
723
836
  });
724
837
  }
725
- let effectivePrompt = params.prompt;
838
+ let effectivePrompt = assembledPrompt;
726
839
  if (params.optimizePrompt) {
727
840
  const optimized = optimizePromptText(effectivePrompt);
728
841
  logOptimizationTokens("prompt", corrId, effectivePrompt, optimized);
@@ -739,7 +852,7 @@ export function prepareClaudeRequest(params, runtime = resolveGatewayServerRunti
739
852
  approvalDecision = runtime.approvalManager.decide({
740
853
  cli: "claude",
741
854
  operation: params.operation,
742
- prompt: params.prompt, // Use raw prompt for review-context detection, not optimized
855
+ prompt: assembledPrompt, // Use raw assembled prompt for review-context detection, not optimized
743
856
  bypassRequested: params.dangerouslySkipPermissions,
744
857
  fullAuto: false,
745
858
  requestedMcpServers,
@@ -818,14 +931,27 @@ export function prepareClaudeRequest(params, runtime = resolveGatewayServerRunti
818
931
  approvalDecision,
819
932
  reviewIntegrity,
820
933
  args,
934
+ stablePrefixHash,
935
+ stablePrefixTokens,
821
936
  };
822
937
  }
823
938
  export function prepareCodexRequest(params, runtime = resolveGatewayServerRuntime()) {
824
939
  const corrId = params.correlationId || randomUUID();
825
940
  const cliInfo = getCliInfo();
826
941
  const resolvedModel = resolveModelAlias("codex", params.model, cliInfo);
942
+ const inputResolution = resolvePromptOrPartsForPrep({
943
+ prompt: params.prompt,
944
+ promptParts: params.promptParts,
945
+ operation: params.operation,
946
+ correlationId: corrId,
947
+ });
948
+ if (!inputResolution.ok)
949
+ return inputResolution.error;
950
+ const assembledPrompt = inputResolution.assembledPrompt;
951
+ const stablePrefixHash = inputResolution.stablePrefixHash;
952
+ const stablePrefixTokens = inputResolution.stablePrefixTokens;
827
953
  // Review integrity check on raw prompt (before optimization)
828
- const reviewIntegrity = checkReviewIntegrity({ prompt: params.prompt });
954
+ const reviewIntegrity = checkReviewIntegrity({ prompt: assembledPrompt });
829
955
  if (reviewIntegrity.violations.length > 0) {
830
956
  runtime.logger.info(`[${corrId}] Review integrity violations detected: ${reviewIntegrity.violations.map(v => v.type).join(", ")}`, {
831
957
  cli: "codex",
@@ -833,7 +959,7 @@ export function prepareCodexRequest(params, runtime = resolveGatewayServerRuntim
833
959
  score: reviewIntegrity.totalScore,
834
960
  });
835
961
  }
836
- let effectivePrompt = params.prompt;
962
+ let effectivePrompt = assembledPrompt;
837
963
  if (params.optimizePrompt) {
838
964
  const optimized = optimizePromptText(effectivePrompt);
839
965
  logOptimizationTokens("prompt", corrId, effectivePrompt, optimized);
@@ -845,7 +971,7 @@ export function prepareCodexRequest(params, runtime = resolveGatewayServerRuntim
845
971
  approvalDecision = runtime.approvalManager.decide({
846
972
  cli: "codex",
847
973
  operation: params.operation,
848
- prompt: params.prompt, // Use raw prompt for review-context detection, not optimized
974
+ prompt: assembledPrompt, // Use raw assembled prompt for review-context detection, not optimized
849
975
  bypassRequested: params.dangerouslyBypassApprovalsAndSandbox,
850
976
  fullAuto: params.fullAuto,
851
977
  requestedMcpServers,
@@ -960,15 +1086,28 @@ export function prepareCodexRequest(params, runtime = resolveGatewayServerRuntim
960
1086
  reviewIntegrity,
961
1087
  args,
962
1088
  cleanup: highImpactCleanup,
1089
+ stablePrefixHash,
1090
+ stablePrefixTokens,
963
1091
  };
964
1092
  }
965
1093
  export function prepareGeminiRequest(params, runtime = resolveGatewayServerRuntime()) {
966
1094
  const corrId = params.correlationId || randomUUID();
967
1095
  const cliInfo = getCliInfo();
968
1096
  const resolvedModel = resolveModelAlias("gemini", params.model, cliInfo);
1097
+ const inputResolution = resolvePromptOrPartsForPrep({
1098
+ prompt: params.prompt,
1099
+ promptParts: params.promptParts,
1100
+ operation: params.operation,
1101
+ correlationId: corrId,
1102
+ });
1103
+ if (!inputResolution.ok)
1104
+ return inputResolution.error;
1105
+ const assembledPrompt = inputResolution.assembledPrompt;
1106
+ const stablePrefixHash = inputResolution.stablePrefixHash;
1107
+ const stablePrefixTokens = inputResolution.stablePrefixTokens;
969
1108
  // Review integrity check on raw prompt (before optimization)
970
1109
  const reviewIntegrity = checkReviewIntegrity({
971
- prompt: params.prompt,
1110
+ prompt: assembledPrompt,
972
1111
  allowedTools: params.allowedTools,
973
1112
  });
974
1113
  if (reviewIntegrity.violations.length > 0) {
@@ -978,7 +1117,7 @@ export function prepareGeminiRequest(params, runtime = resolveGatewayServerRunti
978
1117
  score: reviewIntegrity.totalScore,
979
1118
  });
980
1119
  }
981
- let effectivePrompt = params.prompt;
1120
+ let effectivePrompt = assembledPrompt;
982
1121
  if (params.optimizePrompt) {
983
1122
  const optimized = optimizePromptText(effectivePrompt);
984
1123
  logOptimizationTokens("prompt", corrId, effectivePrompt, optimized);
@@ -990,7 +1129,7 @@ export function prepareGeminiRequest(params, runtime = resolveGatewayServerRunti
990
1129
  approvalDecision = runtime.approvalManager.decide({
991
1130
  cli: "gemini",
992
1131
  operation: params.operation,
993
- prompt: params.prompt, // Use raw prompt for review-context detection, not optimized
1132
+ prompt: assembledPrompt, // Use raw assembled prompt for review-context detection, not optimized
994
1133
  bypassRequested: params.approvalMode === "yolo",
995
1134
  fullAuto: false,
996
1135
  requestedMcpServers,
@@ -1060,15 +1199,28 @@ export function prepareGeminiRequest(params, runtime = resolveGatewayServerRunti
1060
1199
  approvalDecision,
1061
1200
  reviewIntegrity,
1062
1201
  args,
1202
+ stablePrefixHash,
1203
+ stablePrefixTokens,
1063
1204
  };
1064
1205
  }
1065
1206
  function prepareGrokRequest(params, runtime = resolveGatewayServerRuntime()) {
1066
1207
  const corrId = params.correlationId || randomUUID();
1067
1208
  const cliInfo = getCliInfo();
1068
1209
  const resolvedModel = resolveModelAlias("grok", params.model, cliInfo);
1210
+ const inputResolution = resolvePromptOrPartsForPrep({
1211
+ prompt: params.prompt,
1212
+ promptParts: params.promptParts,
1213
+ operation: params.operation,
1214
+ correlationId: corrId,
1215
+ });
1216
+ if (!inputResolution.ok)
1217
+ return inputResolution.error;
1218
+ const assembledPrompt = inputResolution.assembledPrompt;
1219
+ const stablePrefixHash = inputResolution.stablePrefixHash;
1220
+ const stablePrefixTokens = inputResolution.stablePrefixTokens;
1069
1221
  // Review integrity check on raw prompt (before optimization)
1070
1222
  const reviewIntegrity = checkReviewIntegrity({
1071
- prompt: params.prompt,
1223
+ prompt: assembledPrompt,
1072
1224
  allowedTools: params.allowedTools,
1073
1225
  disallowedTools: params.disallowedTools,
1074
1226
  });
@@ -1079,7 +1231,7 @@ function prepareGrokRequest(params, runtime = resolveGatewayServerRuntime()) {
1079
1231
  score: reviewIntegrity.totalScore,
1080
1232
  });
1081
1233
  }
1082
- let effectivePrompt = params.prompt;
1234
+ let effectivePrompt = assembledPrompt;
1083
1235
  if (params.optimizePrompt) {
1084
1236
  const optimized = optimizePromptText(effectivePrompt);
1085
1237
  logOptimizationTokens("prompt", corrId, effectivePrompt, optimized);
@@ -1091,7 +1243,7 @@ function prepareGrokRequest(params, runtime = resolveGatewayServerRuntime()) {
1091
1243
  approvalDecision = runtime.approvalManager.decide({
1092
1244
  cli: "grok",
1093
1245
  operation: params.operation,
1094
- prompt: params.prompt, // Use raw prompt for review-context detection, not optimized
1246
+ prompt: assembledPrompt, // Use raw assembled prompt for review-context detection, not optimized
1095
1247
  bypassRequested: Boolean(params.alwaysApprove) || params.permissionMode === "bypassPermissions",
1096
1248
  fullAuto: false,
1097
1249
  requestedMcpServers,
@@ -1135,14 +1287,27 @@ function prepareGrokRequest(params, runtime = resolveGatewayServerRuntime()) {
1135
1287
  approvalDecision,
1136
1288
  reviewIntegrity,
1137
1289
  args,
1290
+ stablePrefixHash,
1291
+ stablePrefixTokens,
1138
1292
  };
1139
1293
  }
1140
1294
  export function prepareMistralRequest(params, runtime = resolveGatewayServerRuntime()) {
1141
1295
  const corrId = params.correlationId || randomUUID();
1142
1296
  const cliInfo = getCliInfo();
1143
1297
  const resolvedModel = resolveModelAlias("mistral", params.model, cliInfo);
1144
- const reviewIntegrity = checkReviewIntegrity({
1298
+ const inputResolution = resolvePromptOrPartsForPrep({
1145
1299
  prompt: params.prompt,
1300
+ promptParts: params.promptParts,
1301
+ operation: params.operation,
1302
+ correlationId: corrId,
1303
+ });
1304
+ if (!inputResolution.ok)
1305
+ return inputResolution.error;
1306
+ const assembledPrompt = inputResolution.assembledPrompt;
1307
+ const stablePrefixHash = inputResolution.stablePrefixHash;
1308
+ const stablePrefixTokens = inputResolution.stablePrefixTokens;
1309
+ const reviewIntegrity = checkReviewIntegrity({
1310
+ prompt: assembledPrompt,
1146
1311
  allowedTools: params.allowedTools,
1147
1312
  disallowedTools: params.disallowedTools,
1148
1313
  });
@@ -1153,7 +1318,7 @@ export function prepareMistralRequest(params, runtime = resolveGatewayServerRunt
1153
1318
  score: reviewIntegrity.totalScore,
1154
1319
  });
1155
1320
  }
1156
- let effectivePrompt = params.prompt;
1321
+ let effectivePrompt = assembledPrompt;
1157
1322
  if (params.optimizePrompt) {
1158
1323
  const optimized = optimizePromptText(effectivePrompt);
1159
1324
  logOptimizationTokens("prompt", corrId, effectivePrompt, optimized);
@@ -1165,7 +1330,7 @@ export function prepareMistralRequest(params, runtime = resolveGatewayServerRunt
1165
1330
  approvalDecision = runtime.approvalManager.decide({
1166
1331
  cli: "mistral",
1167
1332
  operation: params.operation,
1168
- prompt: params.prompt,
1333
+ prompt: assembledPrompt,
1169
1334
  bypassRequested: params.permissionMode === "auto-approve",
1170
1335
  fullAuto: false,
1171
1336
  requestedMcpServers,
@@ -1210,6 +1375,8 @@ export function prepareMistralRequest(params, runtime = resolveGatewayServerRunt
1210
1375
  reviewIntegrity,
1211
1376
  args: prep.args,
1212
1377
  mistralEnv: prep.env,
1378
+ stablePrefixHash,
1379
+ stablePrefixTokens,
1213
1380
  };
1214
1381
  }
1215
1382
  function isMistralModelSelectionFailure(stderr) {
@@ -1225,7 +1392,7 @@ function selectMistralRecoveryModel(failedModel) {
1225
1392
  ].filter((model) => Boolean(model && model !== failedModel));
1226
1393
  return candidates.find(model => model !== "local");
1227
1394
  }
1228
- function buildCliResponse(cli, stdout, optimizeResponse, corrId, sessionId, prep, durationMs, resumable, outputFormat) {
1395
+ function buildCliResponse(cli, stdout, optimizeResponse, corrId, sessionId, prep, durationMs, resumable, outputFormat, warnings) {
1229
1396
  let finalStdout = stdout;
1230
1397
  // Skip response optimization for JSON output to prevent corrupting structured data
1231
1398
  if (optimizeResponse && outputFormat !== "json") {
@@ -1274,8 +1441,41 @@ function buildCliResponse(cli, stdout, optimizeResponse, corrId, sessionId, prep
1274
1441
  if (prep.reviewIntegrity && prep.reviewIntegrity.violations.length > 0) {
1275
1442
  response.reviewIntegrity = prep.reviewIntegrity;
1276
1443
  }
1444
+ if (warnings && warnings.length > 0) {
1445
+ response.warnings = warnings;
1446
+ }
1277
1447
  return response;
1278
1448
  }
1449
/**
 * Slice 3 helper: compute the cache_ttl_expiring_soon warning for a
 * claude session, if the feature is enabled, the session has prior
 * recorded requests, and ttlRemainingMs is below the threshold (30s by
 * default). Returns null when no warning applies.
 *
 * @param args.cli CLI name; only "claude" can produce a warning.
 * @param args.sessionId Session whose cache stats are inspected.
 * @param args.runtime Runtime providing `flightRecorder` and the
 *   `cacheAwareness` config (`warnOnTtlExpiry`, `anthropicTtlSeconds`).
 * @param args.thresholdMs Optional override for the warning threshold in ms.
 * @returns `{ code, ttlRemainingMs, message }` or null.
 */
function maybeBuildCacheTtlWarning(args) {
    if (args.cli !== "claude")
        return null;
    if (!args.sessionId)
        return null;
    if (!args.runtime.cacheAwareness?.warnOnTtlExpiry)
        return null;
    const stats = computeSessionCacheStats(args.runtime.flightRecorder, args.sessionId);
    if (stats.requestCount === 0 || !stats.lastRequestAt)
        return null;
    const ttl = computeTtlRemaining(stats, args.cli, {
        anthropicTtlSeconds: args.runtime.cacheAwareness.anthropicTtlSeconds,
    });
    // Reject null, undefined and NaN alike: a non-finite TTL compares false
    // against the threshold and would otherwise surface as "expires in NaNms".
    if (typeof ttl !== "number" || !Number.isFinite(ttl))
        return null;
    const threshold = args.thresholdMs ?? 30_000;
    if (ttl >= threshold)
        return null;
    return {
        code: "cache_ttl_expiring_soon",
        ttlRemainingMs: ttl,
        message: `Anthropic cache breakpoint for session ${args.sessionId} expires in ${ttl}ms (< ${threshold}ms). Subsequent requests may miss the cache.`,
    };
}
1279
1479
  function resolveHandlerRuntime(deps) {
1280
1480
  if (deps.runtime)
1281
1481
  return deps.runtime;
@@ -1299,6 +1499,7 @@ export async function handleGeminiRequest(deps, params) {
1299
1499
  const startTime = Date.now();
1300
1500
  const prep = prepareGeminiRequest({
1301
1501
  prompt: params.prompt,
1502
+ promptParts: params.promptParts,
1302
1503
  model: params.model,
1303
1504
  approvalMode: params.approvalMode,
1304
1505
  approvalStrategy: params.approvalStrategy,
@@ -1324,10 +1525,12 @@ export async function handleGeminiRequest(deps, params) {
1324
1525
  correlationId: corrId,
1325
1526
  cli: "gemini",
1326
1527
  model: prep.resolvedModel || "default",
1327
- prompt: params.prompt,
1528
+ prompt: prep.effectivePrompt,
1328
1529
  sessionId: params.sessionId,
1530
+ stablePrefixHash: prep.stablePrefixHash ?? undefined,
1531
+ stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
1329
1532
  }, runtime);
1330
- deps.logger.info(`[${corrId}] gemini_request invoked with model=${prep.resolvedModel || "default"}, approvalMode=${params.approvalMode}, prompt length=${params.prompt.length}`);
1533
+ deps.logger.info(`[${corrId}] gemini_request invoked with model=${prep.resolvedModel || "default"}, approvalMode=${params.approvalMode}, prompt length=${prep.effectivePrompt.length}`);
1331
1534
  try {
1332
1535
  // Gemini CLI 0.43 supports `--resume`, but not a supported fresh
1333
1536
  // `--session-id` flag. Fresh sessions emit no session flag.
@@ -1423,6 +1626,7 @@ export async function handleGeminiRequestAsync(deps, params) {
1423
1626
  const runtime = resolveHandlerRuntime(deps);
1424
1627
  const prep = prepareGeminiRequest({
1425
1628
  prompt: params.prompt,
1629
+ promptParts: params.promptParts,
1426
1630
  model: params.model,
1427
1631
  approvalMode: params.approvalMode,
1428
1632
  approvalStrategy: params.approvalStrategy,
@@ -1502,6 +1706,7 @@ export async function handleGrokRequest(deps, params) {
1502
1706
  const startTime = Date.now();
1503
1707
  const prep = prepareGrokRequest({
1504
1708
  prompt: params.prompt,
1709
+ promptParts: params.promptParts,
1505
1710
  model: params.model,
1506
1711
  outputFormat: params.outputFormat,
1507
1712
  alwaysApprove: params.alwaysApprove,
@@ -1526,10 +1731,12 @@ export async function handleGrokRequest(deps, params) {
1526
1731
  correlationId: corrId,
1527
1732
  cli: "grok",
1528
1733
  model: prep.resolvedModel || "default",
1529
- prompt: params.prompt,
1734
+ prompt: prep.effectivePrompt,
1530
1735
  sessionId: params.sessionId,
1736
+ stablePrefixHash: prep.stablePrefixHash ?? undefined,
1737
+ stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
1531
1738
  }, runtime);
1532
- deps.logger.info(`[${corrId}] grok_request invoked with model=${prep.resolvedModel || "default"}, permissionMode=${params.permissionMode}, prompt length=${params.prompt.length}`);
1739
+ deps.logger.info(`[${corrId}] grok_request invoked with model=${prep.resolvedModel || "default"}, permissionMode=${params.permissionMode}, prompt length=${prep.effectivePrompt.length}`);
1533
1740
  try {
1534
1741
  // Session arg planning (pure, no I/O)
1535
1742
  const sessionResult = resolveGrokSessionArgs({
@@ -1618,6 +1825,7 @@ export async function handleGrokRequestAsync(deps, params) {
1618
1825
  const runtime = resolveHandlerRuntime(deps);
1619
1826
  const prep = prepareGrokRequest({
1620
1827
  prompt: params.prompt,
1828
+ promptParts: params.promptParts,
1621
1829
  model: params.model,
1622
1830
  outputFormat: params.outputFormat,
1623
1831
  alwaysApprove: params.alwaysApprove,
@@ -1698,6 +1906,7 @@ export async function handleMistralRequest(deps, params) {
1698
1906
  const startTime = Date.now();
1699
1907
  const prep = prepareMistralRequest({
1700
1908
  prompt: params.prompt,
1909
+ promptParts: params.promptParts,
1701
1910
  model: params.model,
1702
1911
  outputFormat: params.outputFormat,
1703
1912
  permissionMode: params.permissionMode,
@@ -1721,10 +1930,12 @@ export async function handleMistralRequest(deps, params) {
1721
1930
  correlationId: corrId,
1722
1931
  cli: "mistral",
1723
1932
  model: prep.resolvedModel || "default",
1724
- prompt: params.prompt,
1933
+ prompt: prep.effectivePrompt,
1725
1934
  sessionId: params.sessionId,
1935
+ stablePrefixHash: prep.stablePrefixHash ?? undefined,
1936
+ stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
1726
1937
  }, runtime);
1727
- deps.logger.info(`[${corrId}] mistral_request invoked with model=${prep.resolvedModel || "default"}, permissionMode=${params.permissionMode || "auto-approve"}, prompt length=${params.prompt.length}`);
1938
+ deps.logger.info(`[${corrId}] mistral_request invoked with model=${prep.resolvedModel || "default"}, permissionMode=${params.permissionMode || "auto-approve"}, prompt length=${prep.effectivePrompt.length}`);
1728
1939
  try {
1729
1940
  const sessionResult = resolveMistralSessionArgs({
1730
1941
  sessionId: params.sessionId,
@@ -1835,6 +2046,7 @@ export async function handleMistralRequestAsync(deps, params) {
1835
2046
  const runtime = resolveHandlerRuntime(deps);
1836
2047
  const prep = prepareMistralRequest({
1837
2048
  prompt: params.prompt,
2049
+ promptParts: params.promptParts,
1838
2050
  model: params.model,
1839
2051
  outputFormat: params.outputFormat,
1840
2052
  permissionMode: params.permissionMode,
@@ -1910,6 +2122,7 @@ export async function handleCodexRequestAsync(deps, params) {
1910
2122
  const runtime = resolveHandlerRuntime(deps);
1911
2123
  const prep = prepareCodexRequest({
1912
2124
  prompt: params.prompt,
2125
+ promptParts: params.promptParts,
1913
2126
  model: params.model,
1914
2127
  fullAuto: params.fullAuto,
1915
2128
  sandboxMode: params.sandboxMode,
@@ -2026,7 +2239,14 @@ export async function handleCodexRequestAsync(deps, params) {
2026
2239
  //──────────────────────────────────────────────────────────────────────────────
2027
2240
  export function createGatewayServer(deps = {}) {
2028
2241
  const runtime = resolveGatewayServerRuntime(deps, { isolateState: true });
2029
- const { sessionManager, asyncJobManager, approvalManager, performanceMetrics, logger, persistence, } = runtime;
2242
+ const { sessionManager, asyncJobManager, approvalManager, performanceMetrics, logger, persistence, flightRecorder, cacheAwareness, } = runtime;
2243
+ // `flightRecorder` is destructured into closure scope so the session_get
2244
+ // handler (see ~line 5590) has the FlightRecorderQuery read capability
2245
+ // available without re-resolving runtime. Slice 2 will populate the
2246
+ // `cacheState` field of session_get's response from this read surface.
2247
+ // `cacheAwareness` is the loaded [cache_awareness] block (config.ts).
2248
+ void flightRecorder;
2249
+ void cacheAwareness;
2030
2250
  // Structural invariant: tools register iff ALL THREE conditions hold:
2031
2251
  // (1) persistence.backend !== "none" — the operator/config has not
2032
2252
  // explicitly disabled durable persistence;
@@ -2052,7 +2272,9 @@ export function createGatewayServer(deps = {}) {
2052
2272
  .string()
2053
2273
  .min(1, "Prompt cannot be empty")
2054
2274
  .max(100000, "Prompt too long (max 100k chars)")
2055
- .describe("Prompt text for Claude"),
2275
+ .optional()
2276
+ .describe("Prompt text for Claude (mutually exclusive with promptParts)"),
2277
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2056
2278
  model: z
2057
2279
  .string()
2058
2280
  .optional()
@@ -2147,13 +2369,14 @@ export function createGatewayServer(deps = {}) {
2147
2369
  .boolean()
2148
2370
  .default(false)
2149
2371
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
2150
- }, async ({ prompt, model, outputFormat, sessionId, continueSession, createNewSession, allowedTools, disallowedTools, dangerouslySkipPermissions, permissionMode, agent, agents, forkSession, systemPrompt, appendSystemPrompt, maxBudgetUsd, maxTurns, effort, excludeDynamicSystemPromptSections, approvalStrategy, approvalPolicy, mcpServers, strictMcpConfig, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
2372
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, continueSession, createNewSession, allowedTools, disallowedTools, dangerouslySkipPermissions, permissionMode, agent, agents, forkSession, systemPrompt, appendSystemPrompt, maxBudgetUsd, maxTurns, effort, excludeDynamicSystemPromptSections, approvalStrategy, approvalPolicy, mcpServers, strictMcpConfig, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
2151
2373
  const startTime = Date.now();
2152
2374
  if (systemPrompt !== undefined && appendSystemPrompt !== undefined) {
2153
2375
  return createErrorResponse("claude", 1, "", correlationId, new Error("systemPrompt and appendSystemPrompt are mutually exclusive; use one or the other (not both)."));
2154
2376
  }
2155
2377
  const prep = prepareClaudeRequest({
2156
2378
  prompt,
2379
+ promptParts,
2157
2380
  model,
2158
2381
  outputFormat,
2159
2382
  allowedTools,
@@ -2182,26 +2405,53 @@ export function createGatewayServer(deps = {}) {
2182
2405
  const { corrId, args } = prep;
2183
2406
  let durationMs = 0;
2184
2407
  let wasSuccessful = false;
2408
+ // Session resolution happens BEFORE safeFlightStart so that:
2409
+ // (1) the TTL warning reads the PRIOR session's lastWriteAt
2410
+ // rather than the row about to be inserted (codex-r1/F1).
2411
+ // (2) the flight-recorder row is tagged with effectiveSessionId
2412
+ // (the session the CLI will actually resume), not the raw
2413
+ // user-provided sessionId.
2414
+ let effectiveSessionId = sessionId;
2415
+ let useContinue = continueSession;
2416
+ // Guard the active-session lookup: in some test harnesses the
2417
+ // sessionManager is undefined; the original try-catch wrapped this
2418
+ // block, so we replicate that tolerance here. Failure leaves
2419
+ // effectiveSessionId as the user-provided sessionId.
2420
+ let activeSession = null;
2421
+ try {
2422
+ activeSession = await sessionManager.getActiveSession("claude");
2423
+ }
2424
+ catch (err) {
2425
+ logger.warn(`[${corrId}] sessionManager.getActiveSession failed (non-fatal): ${err.message}`);
2426
+ }
2427
+ if (!createNewSession && !continueSession && !sessionId && activeSession) {
2428
+ effectiveSessionId = activeSession.id;
2429
+ useContinue = true;
2430
+ }
2431
+ if (!useContinue && effectiveSessionId && activeSession?.id === effectiveSessionId) {
2432
+ useContinue = true;
2433
+ }
2434
+ // Slice 3: if the resolved session has a near-expiry Anthropic
2435
+ // cache breakpoint, attach a structured warning (NOT a hard error)
2436
+ // to the response. Computed BEFORE safeFlightStart so the current
2437
+ // row does not skew lastRequestAt.
2438
+ const ttlWarning = maybeBuildCacheTtlWarning({
2439
+ runtime,
2440
+ sessionId: effectiveSessionId,
2441
+ cli: "claude",
2442
+ });
2443
+ const warnings = ttlWarning ? [ttlWarning] : [];
2185
2444
  safeFlightStart({
2186
2445
  correlationId: corrId,
2187
2446
  cli: "claude",
2188
2447
  model: prep.resolvedModel || "default",
2189
- prompt,
2190
- sessionId,
2448
+ prompt: prep.effectivePrompt,
2449
+ sessionId: effectiveSessionId,
2450
+ stablePrefixHash: prep.stablePrefixHash ?? undefined,
2451
+ stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
2191
2452
  }, runtime);
2192
- logger.info(`[${corrId}] claude_request invoked with model=${prep.resolvedModel || "default"}, outputFormat=${outputFormat}, prompt length=${prompt.length}, sessionId=${sessionId}`);
2453
+ logger.info(`[${corrId}] claude_request invoked with model=${prep.resolvedModel || "default"}, outputFormat=${outputFormat}, prompt length=${prep.effectivePrompt.length}, sessionId=${effectiveSessionId}`);
2193
2454
  try {
2194
- // Session management
2195
- let effectiveSessionId = sessionId;
2196
- let useContinue = continueSession;
2197
- const activeSession = await sessionManager.getActiveSession("claude");
2198
- if (!createNewSession && !continueSession && !sessionId && activeSession) {
2199
- effectiveSessionId = activeSession.id;
2200
- useContinue = true;
2201
- }
2202
- if (!useContinue && effectiveSessionId && activeSession?.id === effectiveSessionId) {
2203
- useContinue = true;
2204
- }
2205
2455
  if (useContinue) {
2206
2456
  args.push("--continue");
2207
2457
  }
@@ -2230,7 +2480,14 @@ export function createGatewayServer(deps = {}) {
2230
2480
  errorMessage: stderr || `Exit code ${code}`,
2231
2481
  status: "failed",
2232
2482
  }, runtime);
2233
- return createErrorResponse("claude", code, stderr, corrId);
2483
+ // Slice 3: attach any computed warnings to the error response so
2484
+ // the caller still sees cache_ttl_expiring_soon when the CLI
2485
+ // happens to fail for an unrelated reason.
2486
+ const errResp = createErrorResponse("claude", code, stderr, corrId);
2487
+ if (warnings.length > 0) {
2488
+ errResp.warnings = warnings;
2489
+ }
2490
+ return errResp;
2234
2491
  }
2235
2492
  wasSuccessful = true;
2236
2493
  // If we used a session ID and it's not tracked yet, create a session record
@@ -2261,7 +2518,7 @@ export function createGatewayServer(deps = {}) {
2261
2518
  exitCode: 0,
2262
2519
  status: "completed",
2263
2520
  }, runtime);
2264
- return buildCliResponse("claude", parsed.text, optimizeResponse, corrId, effectiveSessionId, prep, durationMs, undefined, outputFormat);
2521
+ return buildCliResponse("claude", parsed.text, optimizeResponse, corrId, effectiveSessionId, prep, durationMs, undefined, outputFormat, warnings);
2265
2522
  }
2266
2523
  safeFlightComplete(corrId, {
2267
2524
  response: stdout,
@@ -2272,7 +2529,7 @@ export function createGatewayServer(deps = {}) {
2272
2529
  exitCode: 0,
2273
2530
  status: "completed",
2274
2531
  }, runtime);
2275
- return buildCliResponse("claude", stdout, optimizeResponse, corrId, effectiveSessionId, prep, durationMs, undefined, outputFormat);
2532
+ return buildCliResponse("claude", stdout, optimizeResponse, corrId, effectiveSessionId, prep, durationMs, undefined, outputFormat, warnings);
2276
2533
  }
2277
2534
  catch (error) {
2278
2535
  const elapsedMs = Math.max(0, Date.now() - startTime);
@@ -2302,7 +2559,9 @@ export function createGatewayServer(deps = {}) {
2302
2559
  .string()
2303
2560
  .min(1, "Prompt cannot be empty")
2304
2561
  .max(100000, "Prompt too long (max 100k chars)")
2305
- .describe("Prompt text for Codex"),
2562
+ .optional()
2563
+ .describe("Prompt text for Codex (mutually exclusive with promptParts)"),
2564
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2306
2565
  model: z.string().optional().describe("Model name or alias (e.g. gpt-5.4, latest)"),
2307
2566
  fullAuto: z
2308
2567
  .boolean()
@@ -2393,10 +2652,11 @@ export function createGatewayServer(deps = {}) {
2393
2652
  .boolean()
2394
2653
  .optional()
2395
2654
  .describe("Codex --ignore-rules: skip project rule files for this run."),
2396
- }, async ({ prompt, model, fullAuto, sandboxMode, askForApproval, useLegacyFullAutoFlag, dangerouslyBypassApprovalsAndSandbox, approvalStrategy, approvalPolicy, mcpServers, sessionId, resumeLatest, createNewSession, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, outputFormat, outputSchema, search, profile, configOverrides, ephemeral, images, ignoreUserConfig, ignoreRules, }) => {
2655
+ }, async ({ prompt, promptParts, model, fullAuto, sandboxMode, askForApproval, useLegacyFullAutoFlag, dangerouslyBypassApprovalsAndSandbox, approvalStrategy, approvalPolicy, mcpServers, sessionId, resumeLatest, createNewSession, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, outputFormat, outputSchema, search, profile, configOverrides, ephemeral, images, ignoreUserConfig, ignoreRules, }) => {
2397
2656
  const startTime = Date.now();
2398
2657
  const prep = prepareCodexRequest({
2399
2658
  prompt,
2659
+ promptParts,
2400
2660
  model,
2401
2661
  fullAuto,
2402
2662
  sandboxMode,
@@ -2431,10 +2691,12 @@ export function createGatewayServer(deps = {}) {
2431
2691
  correlationId: corrId,
2432
2692
  cli: "codex",
2433
2693
  model: prep.resolvedModel || "default",
2434
- prompt,
2694
+ prompt: prep.effectivePrompt,
2435
2695
  sessionId,
2696
+ stablePrefixHash: prep.stablePrefixHash ?? undefined,
2697
+ stablePrefixTokens: prep.stablePrefixTokens ?? undefined,
2436
2698
  }, runtime);
2437
- logger.info(`[${corrId}] codex_request invoked with model=${prep.resolvedModel || "default"}, fullAuto=${fullAuto}, prompt length=${prompt.length}`);
2699
+ logger.info(`[${corrId}] codex_request invoked with model=${prep.resolvedModel || "default"}, fullAuto=${fullAuto}, prompt length=${prep.effectivePrompt.length}`);
2438
2700
  // U26 fix: pass the outputSchema cleanup to awaitJobOrDefer, which
2439
2701
  // guarantees the cleanup runs exactly once — inline for direct
2440
2702
  // execution, on terminal status for the job-backed path (sync
@@ -2627,7 +2889,9 @@ export function createGatewayServer(deps = {}) {
2627
2889
  .string()
2628
2890
  .min(1, "Prompt cannot be empty")
2629
2891
  .max(100000, "Prompt too long (max 100k chars)")
2630
- .describe("Prompt text for Gemini"),
2892
+ .optional()
2893
+ .describe("Prompt text for Gemini (mutually exclusive with promptParts)"),
2894
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2631
2895
  model: z
2632
2896
  .string()
2633
2897
  .optional()
@@ -2680,9 +2944,10 @@ export function createGatewayServer(deps = {}) {
2680
2944
  policyFiles: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.policyFiles.describe("Policy file paths (--policy <path>, one per file). Paths must exist."),
2681
2945
  adminPolicyFiles: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.adminPolicyFiles.describe("Admin policy file paths (--admin-policy <path>, one per file). Paths must exist."),
2682
2946
  attachments: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.attachments.describe("Absolute file paths prepended as @<path> tokens to the prompt"),
2683
- }, async ({ prompt, model, sessionId, resumeLatest, createNewSession, approvalMode, approvalStrategy, approvalPolicy, mcpServers, allowedTools, includeDirs, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, outputFormat, sandbox, policyFiles, adminPolicyFiles, attachments, }) => {
2947
+ }, async ({ prompt, promptParts, model, sessionId, resumeLatest, createNewSession, approvalMode, approvalStrategy, approvalPolicy, mcpServers, allowedTools, includeDirs, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, outputFormat, sandbox, policyFiles, adminPolicyFiles, attachments, }) => {
2684
2948
  return handleGeminiRequest({ sessionManager, logger, runtime }, {
2685
2949
  prompt,
2950
+ promptParts,
2686
2951
  model,
2687
2952
  sessionId,
2688
2953
  resumeLatest,
@@ -2713,7 +2978,9 @@ export function createGatewayServer(deps = {}) {
2713
2978
  .string()
2714
2979
  .min(1, "Prompt cannot be empty")
2715
2980
  .max(100000, "Prompt too long (max 100k chars)")
2716
- .describe("Prompt text for Grok"),
2981
+ .optional()
2982
+ .describe("Prompt text for Grok (mutually exclusive with promptParts)"),
2983
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2717
2984
  model: z.string().optional().describe("Model name or alias (e.g. grok-build, latest)"),
2718
2985
  outputFormat: z
2719
2986
  .enum(["plain", "json", "streaming-json"])
@@ -2775,9 +3042,10 @@ export function createGatewayServer(deps = {}) {
2775
3042
  .boolean()
2776
3043
  .default(false)
2777
3044
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
2778
- }, async ({ prompt, model, outputFormat, sessionId, resumeLatest, createNewSession, alwaysApprove, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
3045
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, resumeLatest, createNewSession, alwaysApprove, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
2779
3046
  return handleGrokRequest({ sessionManager, logger, runtime }, {
2780
3047
  prompt,
3048
+ promptParts,
2781
3049
  model,
2782
3050
  outputFormat,
2783
3051
  sessionId,
@@ -2807,7 +3075,9 @@ export function createGatewayServer(deps = {}) {
2807
3075
  .string()
2808
3076
  .min(1, "Prompt cannot be empty")
2809
3077
  .max(100000, "Prompt too long (max 100k chars)")
2810
- .describe("Prompt text for Mistral Vibe"),
3078
+ .optional()
3079
+ .describe("Prompt text for Mistral Vibe (mutually exclusive with promptParts)"),
3080
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2811
3081
  model: z
2812
3082
  .string()
2813
3083
  .optional()
@@ -2868,9 +3138,10 @@ export function createGatewayServer(deps = {}) {
2868
3138
  .boolean()
2869
3139
  .default(false)
2870
3140
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
2871
- }, async ({ prompt, model, outputFormat, sessionId, resumeLatest, createNewSession, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
3141
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, resumeLatest, createNewSession, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, optimizeResponse, idleTimeoutMs, forceRefresh, }) => {
2872
3142
  return handleMistralRequest({ sessionManager, logger, runtime }, {
2873
3143
  prompt,
3144
+ promptParts,
2874
3145
  model,
2875
3146
  outputFormat,
2876
3147
  sessionId,
@@ -2907,7 +3178,9 @@ export function createGatewayServer(deps = {}) {
2907
3178
  .string()
2908
3179
  .min(1, "Prompt cannot be empty")
2909
3180
  .max(100000, "Prompt too long (max 100k chars)")
2910
- .describe("Prompt text for Claude"),
3181
+ .optional()
3182
+ .describe("Prompt text for Claude (mutually exclusive with promptParts)"),
3183
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
2911
3184
  model: z
2912
3185
  .string()
2913
3186
  .optional()
@@ -3001,12 +3274,13 @@ export function createGatewayServer(deps = {}) {
3001
3274
  .boolean()
3002
3275
  .default(false)
3003
3276
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
3004
- }, async ({ prompt, model, outputFormat, sessionId, continueSession, createNewSession, allowedTools, disallowedTools, dangerouslySkipPermissions, permissionMode, agent, agents, forkSession, systemPrompt, appendSystemPrompt, maxBudgetUsd, maxTurns, effort, excludeDynamicSystemPromptSections, approvalStrategy, approvalPolicy, mcpServers, strictMcpConfig, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3277
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, continueSession, createNewSession, allowedTools, disallowedTools, dangerouslySkipPermissions, permissionMode, agent, agents, forkSession, systemPrompt, appendSystemPrompt, maxBudgetUsd, maxTurns, effort, excludeDynamicSystemPromptSections, approvalStrategy, approvalPolicy, mcpServers, strictMcpConfig, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3005
3278
  if (systemPrompt !== undefined && appendSystemPrompt !== undefined) {
3006
3279
  return createErrorResponse("claude", 1, "", correlationId, new Error("systemPrompt and appendSystemPrompt are mutually exclusive; use one or the other (not both)."));
3007
3280
  }
3008
3281
  const prep = prepareClaudeRequest({
3009
3282
  prompt,
3283
+ promptParts,
3010
3284
  model,
3011
3285
  outputFormat,
3012
3286
  allowedTools,
@@ -3058,6 +3332,12 @@ export function createGatewayServer(deps = {}) {
3058
3332
  await sessionManager.createSession("claude", "Claude Session", effectiveSessionId);
3059
3333
  }
3060
3334
  }
3335
+ // Slice 3: TTL warning on resume (async path too).
3336
+ const ttlWarning = maybeBuildCacheTtlWarning({
3337
+ runtime,
3338
+ sessionId: effectiveSessionId,
3339
+ cli: "claude",
3340
+ });
3061
3341
  // Idle timeout only for stream-json (text/json produce no output until done)
3062
3342
  const effectiveIdleTimeout = outputFormat === "stream-json"
3063
3343
  ? resolveIdleTimeout("claude", idleTimeoutMs)
@@ -3080,6 +3360,9 @@ export function createGatewayServer(deps = {}) {
3080
3360
  if (prep.reviewIntegrity && prep.reviewIntegrity.violations.length > 0) {
3081
3361
  asyncResponse.reviewIntegrity = prep.reviewIntegrity;
3082
3362
  }
3363
+ if (ttlWarning) {
3364
+ asyncResponse.warnings = [ttlWarning];
3365
+ }
3083
3366
  return {
3084
3367
  content: [
3085
3368
  {
@@ -3098,7 +3381,9 @@ export function createGatewayServer(deps = {}) {
3098
3381
  .string()
3099
3382
  .min(1, "Prompt cannot be empty")
3100
3383
  .max(100000, "Prompt too long (max 100k chars)")
3101
- .describe("Prompt text for Codex"),
3384
+ .optional()
3385
+ .describe("Prompt text for Codex (mutually exclusive with promptParts)"),
3386
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
3102
3387
  model: z.string().optional().describe("Model name or alias (e.g. gpt-5.4, latest)"),
3103
3388
  fullAuto: z
3104
3389
  .boolean()
@@ -3171,9 +3456,10 @@ export function createGatewayServer(deps = {}) {
3171
3456
  images: z.array(z.string()).optional().describe("Codex -i <path>: image attachments."),
3172
3457
  ignoreUserConfig: z.boolean().optional().describe("Codex --ignore-user-config."),
3173
3458
  ignoreRules: z.boolean().optional().describe("Codex --ignore-rules."),
3174
- }, async ({ prompt, model, fullAuto, sandboxMode, askForApproval, useLegacyFullAutoFlag, dangerouslyBypassApprovalsAndSandbox, approvalStrategy, approvalPolicy, mcpServers, sessionId, resumeLatest, createNewSession, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, outputFormat, outputSchema, search, profile, configOverrides, ephemeral, images, ignoreUserConfig, ignoreRules, }) => {
3459
+ }, async ({ prompt, promptParts, model, fullAuto, sandboxMode, askForApproval, useLegacyFullAutoFlag, dangerouslyBypassApprovalsAndSandbox, approvalStrategy, approvalPolicy, mcpServers, sessionId, resumeLatest, createNewSession, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, outputFormat, outputSchema, search, profile, configOverrides, ephemeral, images, ignoreUserConfig, ignoreRules, }) => {
3175
3460
  return handleCodexRequestAsync({ sessionManager, asyncJobManager, logger, runtime }, {
3176
3461
  prompt,
3462
+ promptParts,
3177
3463
  model,
3178
3464
  fullAuto,
3179
3465
  sandboxMode,
@@ -3206,7 +3492,9 @@ export function createGatewayServer(deps = {}) {
3206
3492
  .string()
3207
3493
  .min(1, "Prompt cannot be empty")
3208
3494
  .max(100000, "Prompt too long (max 100k chars)")
3209
- .describe("Prompt text for Gemini"),
3495
+ .optional()
3496
+ .describe("Prompt text for Gemini (mutually exclusive with promptParts)"),
3497
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
3210
3498
  model: z
3211
3499
  .string()
3212
3500
  .optional()
@@ -3261,9 +3549,10 @@ export function createGatewayServer(deps = {}) {
3261
3549
  policyFiles: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.policyFiles.describe("Policy file paths (--policy <path>, one per file). Paths must exist."),
3262
3550
  adminPolicyFiles: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.adminPolicyFiles.describe("Admin policy file paths (--admin-policy <path>, one per file). Paths must exist."),
3263
3551
  attachments: GEMINI_HIGH_IMPACT_PARAMS_SCHEMA.shape.attachments.describe("Absolute file paths prepended as @<path> tokens to the prompt"),
3264
- }, async ({ prompt, model, sessionId, resumeLatest, createNewSession, approvalMode, approvalStrategy, approvalPolicy, mcpServers, allowedTools, includeDirs, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, outputFormat, sandbox, policyFiles, adminPolicyFiles, attachments, }) => {
3552
+ }, async ({ prompt, promptParts, model, sessionId, resumeLatest, createNewSession, approvalMode, approvalStrategy, approvalPolicy, mcpServers, allowedTools, includeDirs, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, outputFormat, sandbox, policyFiles, adminPolicyFiles, attachments, }) => {
3265
3553
  return handleGeminiRequestAsync({ sessionManager, asyncJobManager, logger, runtime }, {
3266
3554
  prompt,
3555
+ promptParts,
3267
3556
  model,
3268
3557
  sessionId,
3269
3558
  resumeLatest,
@@ -3290,7 +3579,9 @@ export function createGatewayServer(deps = {}) {
3290
3579
  .string()
3291
3580
  .min(1, "Prompt cannot be empty")
3292
3581
  .max(100000, "Prompt too long (max 100k chars)")
3293
- .describe("Prompt text for Grok"),
3582
+ .optional()
3583
+ .describe("Prompt text for Grok (mutually exclusive with promptParts)"),
3584
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
3294
3585
  model: z.string().optional().describe("Model name or alias (e.g. grok-build, latest)"),
3295
3586
  outputFormat: z
3296
3587
  .enum(["plain", "json", "streaming-json"])
@@ -3351,9 +3642,10 @@ export function createGatewayServer(deps = {}) {
3351
3642
  .boolean()
3352
3643
  .default(false)
3353
3644
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
3354
- }, async ({ prompt, model, outputFormat, sessionId, resumeLatest, createNewSession, alwaysApprove, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3645
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, resumeLatest, createNewSession, alwaysApprove, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3355
3646
  return handleGrokRequestAsync({ sessionManager, asyncJobManager, logger, runtime }, {
3356
3647
  prompt,
3648
+ promptParts,
3357
3649
  model,
3358
3650
  outputFormat,
3359
3651
  sessionId,
@@ -3379,7 +3671,9 @@ export function createGatewayServer(deps = {}) {
3379
3671
  .string()
3380
3672
  .min(1, "Prompt cannot be empty")
3381
3673
  .max(100000, "Prompt too long (max 100k chars)")
3382
- .describe("Prompt text for Mistral Vibe"),
3674
+ .optional()
3675
+ .describe("Prompt text for Mistral Vibe (mutually exclusive with promptParts)"),
3676
+ promptParts: PromptPartsSchema.optional().describe("Cache-aware structured prompt: { system?, tools?, context?, task }. Mutually exclusive with prompt. Stable parts hash into cache_state for prefix-discipline tracking."),
3383
3677
  model: z
3384
3678
  .string()
3385
3679
  .optional()
@@ -3439,9 +3733,10 @@ export function createGatewayServer(deps = {}) {
3439
3733
  .boolean()
3440
3734
  .default(false)
3441
3735
  .describe("Bypass dedup and force a fresh CLI run even if a recent identical request exists"),
3442
- }, async ({ prompt, model, outputFormat, sessionId, resumeLatest, createNewSession, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3736
+ }, async ({ prompt, promptParts, model, outputFormat, sessionId, resumeLatest, createNewSession, permissionMode, effort, reasoningEffort, approvalStrategy, approvalPolicy, mcpServers, allowedTools, disallowedTools, correlationId, optimizePrompt, idleTimeoutMs, forceRefresh, }) => {
3443
3737
  return handleMistralRequestAsync({ sessionManager, asyncJobManager, logger, runtime }, {
3444
3738
  prompt,
3739
+ promptParts,
3445
3740
  model,
3446
3741
  outputFormat,
3447
3742
  sessionId,
@@ -3892,6 +4187,38 @@ export function createGatewayServer(deps = {}) {
3892
4187
  };
3893
4188
  }
3894
4189
  const activeSession = await sessionManager.getActiveSession(session.cli);
4190
+ // Slice 2: project a compact cacheState view from the flight
4191
+ // recorder at read time. NOT persisted on the Session interface
4192
+ // (sessions.json stays content-free per the project invariant).
4193
+ // The field is OMITTED entirely (not null, not empty object) when
4194
+ // the session has zero rows in the flight recorder so the response
4195
+ // stays compact for fresh sessions.
4196
+ //
4197
+ // Slice 3: include ttlRemainingMs derived from the gateway's
4198
+ // configured TTL policy. Null for non-claude sessions.
4199
+ let cacheState;
4200
+ try {
4201
+ const stats = computeSessionCacheStats(flightRecorder, session.id);
4202
+ if (stats.requestCount > 0) {
4203
+ const ttlRemainingMs = computeTtlRemaining(stats, stats.cli, {
4204
+ anthropicTtlSeconds: cacheAwareness?.anthropicTtlSeconds ?? 300,
4205
+ });
4206
+ cacheState = {
4207
+ cli: stats.cli,
4208
+ prefixDistinct: stats.distinctPrefixCount,
4209
+ totalCacheReadTokens: stats.totalCacheReadTokens,
4210
+ totalCacheCreationTokens: stats.totalCacheCreationTokens,
4211
+ requestCount: stats.requestCount,
4212
+ hitCount: stats.hitCount,
4213
+ hitRate: stats.hitRate,
4214
+ estimatedSavingsUsd: stats.estimatedSavingsUsd,
4215
+ ttlRemainingMs,
4216
+ };
4217
+ }
4218
+ }
4219
+ catch (err) {
4220
+ logger.warn?.(`[session_get] cache-stats lookup failed (non-fatal)`, err);
4221
+ }
3895
4222
  return {
3896
4223
  content: [
3897
4224
  {
@@ -3901,6 +4228,7 @@ export function createGatewayServer(deps = {}) {
3901
4228
  session: {
3902
4229
  ...session,
3903
4230
  isActive: activeSession?.id === session.id,
4231
+ ...(cacheState ? { cacheState } : {}),
3904
4232
  },
3905
4233
  }, null, 2),
3906
4234
  },
@@ -3953,7 +4281,7 @@ async function initializeSessionManager() {
3953
4281
  sessionManager = await createSessionManager(config, undefined, logger);
3954
4282
  logger.info("File-based session manager initialized");
3955
4283
  }
3956
- resourceProvider = new ResourceProvider(sessionManager, performanceMetrics);
4284
+ resourceProvider = new ResourceProvider(sessionManager, performanceMetrics, getFlightRecorder(logger), getCacheAwarenessConfig(logger));
3957
4285
  }
3958
4286
  //──────────────────────────────────────────────────────────────────────────────
3959
4287
  // Health Check Resource (only if using PostgreSQL)