@loreai/gateway 0.13.3 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/pipeline.ts CHANGED
@@ -16,7 +16,6 @@ import {
16
16
  load,
17
17
  config as loreConfig,
18
18
  ensureProject,
19
- isFirstRun,
20
19
  temporal,
21
20
  ltm,
22
21
  distillation,
@@ -92,6 +91,7 @@ import {
92
91
  import type { UpstreamInterceptor } from "./recorder";
93
92
  import { startIdleScheduler, buildIdleWorkHandler } from "./idle";
94
93
  import { getWorkerModel, resetWorkerModelState } from "./worker-model";
94
+ import { analyzeCacheTurn } from "./cache-analytics";
95
95
  import {
96
96
  RECALL_GATEWAY_TOOL,
97
97
  RECALL_TOOL_NAME,
@@ -100,10 +100,12 @@ import {
100
100
  hasRecallToolUse,
101
101
  hasOtherToolUse,
102
102
  clientHasRecallTool,
103
- isPendingRecallValid,
104
- injectPendingRecall,
105
103
  buildRecallFollowUp,
106
- stripRecallFromResponse,
104
+ buildRecallMarker,
105
+ recallStoreKey,
106
+ expandRecallMarkers,
107
+ cleanupRecallStore,
108
+ replaceRecallWithMarker,
107
109
  } from "./recall";
108
110
 
109
111
  // ---------------------------------------------------------------------------
@@ -143,6 +145,7 @@ export async function resetPipelineState(): Promise<void> {
143
145
  cachedProjectPath = null;
144
146
  sessions.clear();
145
147
  ltmSessionCache.clear();
148
+ ltmPinnedText.clear();
146
149
  // Shut down batch queue gracefully before clearing the client
147
150
  if (llmClient && "shutdown" in llmClient) {
148
151
  await (llmClient as LLMClient & { shutdown: () => Promise<void> }).shutdown();
@@ -175,6 +178,46 @@ const ltmSessionCache = new Map<
175
178
  { formatted: string; tokenCount: number }
176
179
  >();
177
180
 
181
+ /**
182
+ * Pinned LTM text per session (keyed by session ID) — the text currently
183
+ * injected as the LTM system block. On each turn the session's cached LTM text
184
+ * is compared against the pin via textDiffRatio; the pin is replaced only when
185
+ * the ratio is >= 0.05, to avoid cache busts from minor BM25 re-ranking changes.
186
+ */
187
+ const ltmPinnedText = new Map<
188
+ string,
189
+ { formatted: string; tokenCount: number }
190
+ >();
191
+
192
+ /**
193
+ * Estimate how much two strings differ, as a ratio in the range 0..1.
194
+ * Counts the shared prefix plus the shared (non-overlapping) suffix and returns
195
+ * 1 - matched / maxLen — not a full diff. Equal strings → 0; either empty → 1.
196
+ */
197
+ function textDiffRatio(a: string, b: string): number {
198
+ if (a === b) return 0;
199
+ if (!a || !b) return 1;
200
+
201
+ // Common prefix length
202
+ const minLen = Math.min(a.length, b.length);
203
+ const maxLen = Math.max(a.length, b.length);
204
+ let common = 0;
205
+ for (let i = 0; i < minLen; i++) {
206
+ if (a[i] === b[i]) common++;
207
+ else break;
208
+ }
209
+
210
+ // Common suffix length, capped at minLen - common so it never overlaps the prefix
211
+ let suffix = 0;
212
+ for (let i = 0; i < minLen - common; i++) {
213
+ if (a[a.length - 1 - i] === b[b.length - 1 - i]) suffix++;
214
+ else break;
215
+ }
216
+
217
+ const matched = common + suffix;
218
+ return 1 - matched / maxLen;
219
+ }
220
+
178
221
  /** Cached LLM client for background workers. */
179
222
  let llmClient: LLMClient | null = null;
180
223
 
@@ -242,8 +285,6 @@ async function initIfNeeded(projectPath: string, config?: GatewayConfig): Promis
242
285
  config.upstreamAnthropic,
243
286
  () => resolveAuth(),
244
287
  sessionModelID,
245
- // onLtmInvalidated: clear the LTM session cache
246
- () => ltmSessionCache.clear(),
247
288
  );
248
289
  stopIdleScheduler = startIdleScheduler(config, sessions, idleHandler);
249
290
  }
@@ -298,17 +339,23 @@ function getOrCreateSession(
298
339
  lastRequestTime: Date.now(),
299
340
  messageCount: 0,
300
341
  turnsSinceCuration: 0,
342
+ recallStore: new Map(),
343
+ cacheAnalytics: {
344
+ lastRequestBody: null,
345
+ lastRequestBodyLength: 0,
346
+ lastCacheRead: 0,
347
+ lastCacheCreation: 0,
348
+ turnCount: 0,
349
+ bustCount: 0,
350
+ },
301
351
  };
302
352
  sessions.set(sessionID, state);
303
353
  }
304
354
  state.lastRequestTime = Date.now();
305
355
 
306
- // Lazy cleanup: discard expired pending recall on access
307
- if (state.pendingRecall && !isPendingRecallValid(state.pendingRecall)) {
308
- log.warn(
309
- `lazy cleanup: discarding expired pending recall for session ${sessionID.slice(0, 16)}`,
310
- );
311
- state.pendingRecall = undefined;
356
+ // Ensure recallStore exists (upgrade from older session state)
357
+ if (!state.recallStore) {
358
+ state.recallStore = new Map();
312
359
  }
313
360
 
314
361
  return state;
@@ -369,6 +416,13 @@ async function identifySession(
369
416
  // Upstream forwarding
370
417
  // ---------------------------------------------------------------------------
371
418
 
419
+ /** Return value of forwardToUpstream: the upstream response plus the request body string that was sent (kept for cache analytics). */
420
+ type UpstreamResult = {
421
+ response: Response;
422
+ /** JSON.stringify of the upstream request body — the same string passed as the fetch `body`. */
423
+ serializedBody: string;
424
+ };
425
+
372
426
  /**
373
427
  * Forward a request to the upstream provider (Anthropic or OpenAI).
374
428
  *
@@ -376,14 +430,15 @@ async function identifySession(
376
430
  * interceptor is called instead of `fetch` directly. This enables recording
377
431
  * and replay without modifying individual call sites.
378
432
  *
379
- * Returns the raw fetch Response (may be streaming or non-streaming).
433
+ * Returns the raw fetch Response alongside the serialized request body
434
+ * (for cache analytics prefix comparison).
380
435
  */
381
436
  async function forwardToUpstream(
382
437
  req: GatewayRequest,
383
438
  config: GatewayConfig,
384
439
  interceptor?: UpstreamInterceptor,
385
440
  cache?: AnthropicCacheOptions,
386
- ): Promise<Response> {
441
+ ): Promise<UpstreamResult> {
387
442
  let url: string;
388
443
  let headers: Record<string, string>;
389
444
  let body: unknown;
@@ -405,10 +460,11 @@ async function forwardToUpstream(
405
460
  body = result.body;
406
461
  }
407
462
 
463
+ const serializedBody = JSON.stringify(body);
408
464
  const effectiveInterceptor = interceptor ?? activeInterceptor;
409
465
 
410
466
  if (effectiveInterceptor) {
411
- return effectiveInterceptor(
467
+ const response = await effectiveInterceptor(
412
468
  body,
413
469
  req.model,
414
470
  req.stream,
@@ -416,16 +472,18 @@ async function forwardToUpstream(
416
472
  fetch(url, {
417
473
  method: "POST",
418
474
  headers,
419
- body: JSON.stringify(body),
475
+ body: serializedBody,
420
476
  }),
421
477
  );
478
+ return { response, serializedBody };
422
479
  }
423
480
 
424
- return fetch(url, {
481
+ const response = await fetch(url, {
425
482
  method: "POST",
426
483
  headers,
427
- body: JSON.stringify(body),
484
+ body: serializedBody,
428
485
  });
486
+ return { response, serializedBody };
429
487
  }
430
488
 
431
489
  // ---------------------------------------------------------------------------
@@ -483,44 +541,46 @@ function buildStreamingResponse(
483
541
  recallContext.sessionState.sessionID,
484
542
  );
485
543
 
544
+ const scope = input.scope ?? "all";
545
+
546
+ // Store recall result for marker round-trip expansion
547
+ const storeKey = recallStoreKey(input.query, scope);
548
+ const position = resp.content.indexOf(recallBlock);
549
+ recallContext.sessionState.recallStore.set(storeKey, {
550
+ toolUseId: recallBlock.id,
551
+ input,
552
+ position,
553
+ result,
554
+ });
555
+
556
+ // Emit marker text block in place of the suppressed recall block
557
+ const markerText = buildRecallMarker(input.query, scope);
558
+ const markerIdx = recallAccum.clientBlockCount();
559
+ const syntheticMarker = [
560
+ formatSSEEvent("content_block_start", JSON.stringify({
561
+ type: "content_block_start",
562
+ index: markerIdx,
563
+ content_block: { type: "text", text: "" },
564
+ })),
565
+ formatSSEEvent("content_block_delta", JSON.stringify({
566
+ type: "content_block_delta",
567
+ index: markerIdx,
568
+ delta: { type: "text_delta", text: markerText },
569
+ })),
570
+ formatSSEEvent("content_block_stop", JSON.stringify({
571
+ type: "content_block_stop",
572
+ index: markerIdx,
573
+ })),
574
+ ].join("");
575
+ controller.enqueue(encoder.encode(syntheticMarker));
576
+
486
577
  if (recallAccum.hasOtherTools()) {
487
- // Case 2: mixed tools — store pending, forward held-back events
488
- const position = resp.content.indexOf(recallBlock);
489
- recallContext.sessionState.pendingRecall = {
490
- toolUseId: recallBlock.id,
491
- input,
492
- position,
493
- result,
494
- timestamp: Date.now(),
495
- };
578
+ // Forward held-back events, close stream
496
579
  log.info(
497
- `recall (stream, mixed): stored pending result for session ` +
580
+ `recall (stream, mixed): stored result for session ` +
498
581
  `${recallContext.sessionState.sessionID.slice(0, 16)}`,
499
582
  );
500
583
 
501
- // Emit a synthetic "[Searching memory...]" text block after all
502
- // other tool blocks. The accumulator already re-indexed other
503
- // tools to fill the gap, so this goes at clientBlockCount.
504
- const searchingIdx = recallAccum.clientBlockCount();
505
- const syntheticCase2 = [
506
- formatSSEEvent("content_block_start", JSON.stringify({
507
- type: "content_block_start",
508
- index: searchingIdx,
509
- content_block: { type: "text", text: "" },
510
- })),
511
- formatSSEEvent("content_block_delta", JSON.stringify({
512
- type: "content_block_delta",
513
- index: searchingIdx,
514
- delta: { type: "text_delta", text: "\n\n[Searching memory...]" },
515
- })),
516
- formatSSEEvent("content_block_stop", JSON.stringify({
517
- type: "content_block_stop",
518
- index: searchingIdx,
519
- })),
520
- ].join("");
521
- controller.enqueue(encoder.encode(syntheticCase2));
522
-
523
- // Forward the held-back message_delta + message_stop
524
584
  const heldBack = recallAccum.heldBackEvents();
525
585
  if (heldBack) {
526
586
  controller.enqueue(encoder.encode(heldBack));
@@ -528,51 +588,50 @@ function buildStreamingResponse(
528
588
 
529
589
  controller.close();
530
590
 
531
- // Post-stream: use stripped response for temporal storage
532
- const cleanResp = stripRecallFromResponse(resp);
533
- onComplete(cleanResp);
591
+ // Post-stream: store response with marker text (not raw tool_use)
592
+ const markerResp = replaceRecallWithMarker(resp);
593
+ onComplete(markerResp);
534
594
  return;
535
595
  }
536
596
 
537
- // Case 1: recall-only — send follow-up, pipe continuation
597
+ // Recall-only — send follow-up, pipe continuation
538
598
  log.info(
539
599
  `recall (stream, only): executing follow-up for session ` +
540
600
  `${recallContext.sessionState.sessionID.slice(0, 16)}`,
541
601
  );
542
602
 
543
- // Emit a synthetic "[Searching memory...]" text block at the
544
- // suppressed recall index so the client sees a natural indicator
545
- // during the pause while the recall executes.
546
- const searchingIndex = recallAccum.clientBlockCount();
547
- const syntheticBlock = [
548
- formatSSEEvent("content_block_start", JSON.stringify({
549
- type: "content_block_start",
550
- index: searchingIndex,
551
- content_block: { type: "text", text: "" },
552
- })),
553
- formatSSEEvent("content_block_delta", JSON.stringify({
554
- type: "content_block_delta",
555
- index: searchingIndex,
556
- delta: { type: "text_delta", text: "\n\n[Searching memory...]" },
557
- })),
558
- formatSSEEvent("content_block_stop", JSON.stringify({
559
- type: "content_block_stop",
560
- index: searchingIndex,
561
- })),
562
- ].join("");
563
- controller.enqueue(encoder.encode(syntheticBlock));
564
-
565
603
  const followUp = buildRecallFollowUp(
566
604
  recallContext.modifiedReq,
567
605
  resp,
568
606
  result,
569
607
  recallBlock,
570
608
  );
571
- const followUpResponse = await forwardToUpstream(
572
- followUp,
573
- recallContext.config,
574
- undefined,
575
- recallContext.cacheOptions,
609
+ let followUpResponse: Response;
610
+ try {
611
+ ({ response: followUpResponse } = await forwardToUpstream(
612
+ followUp,
613
+ recallContext.config,
614
+ undefined,
615
+ recallContext.cacheOptions,
616
+ ));
617
+ } catch (fetchErr) {
618
+ log.error(
619
+ `recall follow-up fetch error for session ${recallContext.sessionState.sessionID.slice(0, 16)}:`,
620
+ fetchErr,
621
+ );
622
+ const heldBack = recallAccum.heldBackEvents();
623
+ if (heldBack) {
624
+ controller.enqueue(encoder.encode(heldBack));
625
+ }
626
+ controller.close();
627
+ const markerResp = replaceRecallWithMarker(resp);
628
+ onComplete(markerResp);
629
+ return;
630
+ }
631
+
632
+ log.info(
633
+ `recall follow-up response: status=${followUpResponse.status} ` +
634
+ `hasBody=${!!followUpResponse.body} session=${recallContext.sessionState.sessionID.slice(0, 16)}`,
576
635
  );
577
636
 
578
637
  if (!followUpResponse.ok) {
@@ -586,22 +645,21 @@ function buildStreamingResponse(
586
645
  controller.enqueue(encoder.encode(heldBack));
587
646
  }
588
647
  controller.close();
589
- const cleanResp = stripRecallFromResponse(resp);
590
- onComplete(cleanResp);
648
+ const markerResp = replaceRecallWithMarker(resp);
649
+ onComplete(markerResp);
591
650
  return;
592
651
  }
593
652
 
594
653
  // Pipe the continuation stream into the same HTTP response.
595
654
  // Suppress message_start (client already has one) and re-index
596
655
  // content blocks to continue from where the client left off.
597
- // +1 accounts for the synthetic "[Searching memory...]" block.
598
- // Use clientBlockCount (not recallBlockIndex) — this is the number
599
- // of blocks the client has already seen, so continuation blocks
600
- // start at clientBlockCount + 1 (for the synthetic block).
656
+ // +1 accounts for the synthetic marker block.
601
657
  const blockOffset = recallAccum.clientBlockCount() + 1;
602
658
  const contReader = followUpResponse.body!.getReader();
659
+ let contEventCount = 0;
603
660
 
604
661
  for await (const { event: contEvent, data: contData } of parseSSEStream(contReader)) {
662
+ contEventCount++;
605
663
  if (contEvent === "message_start") {
606
664
  // Suppress — client already received one
607
665
  continue;
@@ -634,19 +692,18 @@ function buildStreamingResponse(
634
692
  controller.enqueue(encoder.encode(forwarded));
635
693
  }
636
694
 
695
+ log.info(
696
+ `recall follow-up stream complete: ${contEventCount} events piped, ` +
697
+ `session=${recallContext.sessionState.sessionID.slice(0, 16)}`,
698
+ );
699
+
637
700
  controller.close();
638
701
 
639
- // Post-stream: accumulate the continuation for temporal storage.
640
- // We use resp (original) + continuation for a complete picture,
641
- // but for simplicity just store the continuation response since
642
- // it's what the model actually produced for the client.
643
- // The continuation accumulator was not wired — use the original
644
- // response's pre-recall content + continuation's content.
645
- // For now, call onComplete with the original response so at least
646
- // the pre-recall content is stored. The continuation's text is
647
- // visible to the client but not separately stored — acceptable
648
- // since temporal storage captures the full conversation on next turn.
649
- onComplete(resp);
702
+ // Post-stream: store response with marker text for temporal storage.
703
+ // The marker replaces the raw tool_use, so future turns can
704
+ // round-trip the marker tool_use/tool_result correctly.
705
+ const markerResp = replaceRecallWithMarker(resp);
706
+ onComplete(markerResp);
650
707
  return;
651
708
  }
652
709
  }
@@ -795,6 +852,8 @@ function postResponse(
795
852
  resp: GatewayResponse,
796
853
  sessionState: SessionState,
797
854
  config: GatewayConfig,
855
+ /** Serialized JSON body sent upstream — for cache prefix comparison. */
856
+ requestBody?: string,
798
857
  ): void {
799
858
  const { sessionID, projectPath } = sessionState;
800
859
 
@@ -810,6 +869,11 @@ function postResponse(
810
869
  getLastTransformedCount(sessionID),
811
870
  );
812
871
 
872
+ // --- Cache analytics ---
873
+ if (requestBody) {
874
+ analyzeCacheTurn(sessionState.cacheAnalytics, requestBody, resp.usage, sessionID);
875
+ }
876
+
813
877
  // --- Temporal storage ---
814
878
  // Store all messages (user + assistant) from this turn.
815
879
  // Convert gateway messages to Lore format.
@@ -1010,7 +1074,7 @@ async function handlePassthrough(
1010
1074
  req: GatewayRequest,
1011
1075
  config: GatewayConfig,
1012
1076
  ): Promise<Response> {
1013
- const upstreamResponse = await forwardToUpstream(req, config);
1077
+ const { response: upstreamResponse } = await forwardToUpstream(req, config);
1014
1078
 
1015
1079
  // For streaming, pipe through unchanged
1016
1080
  if (req.stream && upstreamResponse.body) {
@@ -1079,25 +1143,18 @@ async function handleConversationTurn(
1079
1143
  // Track session model for worker model discovery
1080
1144
  lastSeenSessionModel = req.model;
1081
1145
 
1082
- // --- Inject pending recall from previous turn (Case 2: mixed tools) ---
1083
- if (sessionState.pendingRecall) {
1084
- if (isPendingRecallValid(sessionState.pendingRecall)) {
1085
- const injected = injectPendingRecall(req, sessionState.pendingRecall);
1086
- if (injected) {
1087
- log.info(
1088
- `injected pending recall result into request for session ${sessionID.slice(0, 16)}`,
1089
- );
1090
- } else {
1091
- log.warn(
1092
- `failed to inject pending recall — conversation structure mismatch`,
1093
- );
1094
- }
1095
- } else {
1096
- log.warn(
1097
- `discarding expired pending recall for session ${sessionID.slice(0, 16)}`,
1146
+ // --- Expand recall markers from previous turns ---
1147
+ // Scan all assistant messages for marker text blocks and restore them
1148
+ // to tool_use + tool_result pairs before forwarding upstream.
1149
+ if (sessionState.recallStore.size > 0) {
1150
+ const expanded = expandRecallMarkers(req, sessionState.recallStore);
1151
+ if (expanded) {
1152
+ log.info(
1153
+ `expanded recall markers for session ${sessionID.slice(0, 16)}`,
1098
1154
  );
1099
1155
  }
1100
- sessionState.pendingRecall = undefined;
1156
+ // Clean up orphaned store entries (markers evicted by gradient)
1157
+ cleanupRecallStore(req, sessionState.recallStore);
1101
1158
  }
1102
1159
 
1103
1160
  log.info(
@@ -1130,8 +1187,8 @@ async function handleConversationTurn(
1130
1187
  );
1131
1188
  }
1132
1189
 
1133
- // --- 6. LTM injection into system prompt ---
1134
- let modifiedSystem = req.system;
1190
+ // --- 6. LTM injection (kept separate from host system prompt for caching) ---
1191
+ let ltmText: string | undefined;
1135
1192
  if (cfg.knowledge.enabled) {
1136
1193
  try {
1137
1194
  let cached = ltmSessionCache.get(sessionID);
@@ -1159,8 +1216,21 @@ async function handleConversationTurn(
1159
1216
  }
1160
1217
 
1161
1218
  if (cached) {
1162
- setLtmTokens(cached.tokenCount, sessionID);
1163
- modifiedSystem = `${req.system}\n\n${cached.formatted}`;
1219
+ // Content-diff pinning: only update the injected LTM text if the
1220
+ // new content differs by >=5% from what's currently pinned. This
1221
+ // prevents cache busts from minor BM25 re-ranking after background
1222
+ // curation/consolidation invalidates the LTM cache.
1223
+ const pinned = ltmPinnedText.get(sessionID);
1224
+ if (pinned && textDiffRatio(pinned.formatted, cached.formatted) < 0.05) {
1225
+ // Near-identical — keep the pinned text to preserve cache prefix
1226
+ ltmText = pinned.formatted;
1227
+ setLtmTokens(pinned.tokenCount, sessionID);
1228
+ } else {
1229
+ // Substantially different or first injection — pin the new text
1230
+ ltmPinnedText.set(sessionID, cached);
1231
+ ltmText = cached.formatted;
1232
+ setLtmTokens(cached.tokenCount, sessionID);
1233
+ }
1164
1234
  } else {
1165
1235
  setLtmTokens(0, sessionID);
1166
1236
  }
@@ -1175,25 +1245,6 @@ async function handleConversationTurn(
1175
1245
  consumeCameOutOfIdle(sessionID);
1176
1246
  }
1177
1247
 
1178
- // First-run greeting
1179
- if (isFirstRun()) {
1180
- modifiedSystem +=
1181
- "\n\n[Lore plugin] This is the first time Lore has been activated. " +
1182
- "Briefly let the user know that Lore is now active and their " +
1183
- "coding agent will get progressively smarter on this codebase " +
1184
- "over time as knowledge accumulates across sessions.";
1185
- }
1186
-
1187
- // Lore knowledge file commit reminder
1188
- if (cfg.knowledge.enabled) {
1189
- const filesToTrack = [".lore.md"];
1190
- if (cfg.agentsFile.enabled) filesToTrack.push(cfg.agentsFile.path);
1191
- modifiedSystem +=
1192
- `\n\nWhen making git commits, always check if ${filesToTrack.join(" and ")} ` +
1193
- `have unstaged changes and include them in the commit. These files contain ` +
1194
- `shared project knowledge managed by lore and must be version-controlled.`;
1195
- }
1196
-
1197
1248
  // --- 7. Gradient transform on messages ---
1198
1249
  const loreMessages = gatewayMessagesToLore(req.messages, sessionID);
1199
1250
  resolveToolResults(loreMessages);
@@ -1225,34 +1276,54 @@ async function handleConversationTurn(
1225
1276
 
1226
1277
  const modifiedReq: GatewayRequest = {
1227
1278
  ...req,
1228
- system: modifiedSystem,
1279
+ // Host system prompt is passed through unmodified — LTM is injected
1280
+ // as a separate system block via cache options for prefix stability.
1229
1281
  messages: transformedMessages,
1230
1282
  };
1231
1283
 
1232
- // --- 8b. Inject recall tool ---
1284
+ // --- 8b. Inject recall tool (with git reminder appended to description) ---
1233
1285
  // Only inject if the client doesn't already have a recall tool (e.g. from
1234
1286
  // a host plugin like OpenCode) and the request has other tools (so it's a
1235
1287
  // coding agent, not a bare chat).
1236
1288
  if (modifiedReq.tools.length > 0 && !clientHasRecallTool(modifiedReq.tools)) {
1237
- modifiedReq.tools = [...modifiedReq.tools, RECALL_GATEWAY_TOOL];
1289
+ // Build the recall tool with git reminder baked into its description.
1290
+ // This keeps the reminder in the stable tools prefix (1h cache) rather
1291
+ // than the volatile system prompt.
1292
+ const recallTool = cfg.knowledge.enabled
1293
+ ? {
1294
+ ...RECALL_GATEWAY_TOOL,
1295
+ description:
1296
+ RECALL_GATEWAY_TOOL.description +
1297
+ "\n\nWhen making git commits, always check if .lore.md " +
1298
+ "has unstaged changes and include it in the commit. " +
1299
+ "This file contains shared project knowledge managed " +
1300
+ "by lore and must be version-controlled.",
1301
+ }
1302
+ : RECALL_GATEWAY_TOOL;
1303
+ modifiedReq.tools = [...modifiedReq.tools, recallTool];
1238
1304
  }
1239
1305
 
1240
1306
  // --- 9. Forward to upstream ---
1241
- // Enable prompt caching for conversation turns:
1242
- // - System prompt: explicit breakpoint with 5m TTL (frequent turns)
1243
- // - Conversation: breakpoint on last block so Anthropic caches the prefix
1307
+ // Enable prompt caching for conversation turns with layered breakpoints:
1308
+ // - System prompt: 1h TTL (host prompt is very stable within a session)
1309
+ // - LTM: separate system block (no breakpoint, benefits from prefix)
1310
+ // - Tools: 1h TTL on last tool (recall + git reminder are static)
1311
+ // - Conversation: 5m TTL on last message block
1244
1312
  // Title/summary passthrough (handlePassthrough) never reaches here — it
1245
1313
  // forwards the raw request without buildAnthropicRequest, so no caching.
1246
1314
  const cacheOptions: AnthropicCacheOptions = {
1247
- systemTTL: "5m",
1315
+ systemTTL: "1h",
1316
+ ltmSystem: ltmText,
1317
+ cacheTools: true,
1248
1318
  cacheConversation: true,
1249
1319
  };
1250
- const upstreamResponse = await forwardToUpstream(
1251
- modifiedReq,
1252
- config,
1253
- undefined,
1254
- cacheOptions,
1255
- );
1320
+ const { response: upstreamResponse, serializedBody: requestBody } =
1321
+ await forwardToUpstream(
1322
+ modifiedReq,
1323
+ config,
1324
+ undefined,
1325
+ cacheOptions,
1326
+ );
1256
1327
 
1257
1328
  if (!upstreamResponse.ok) {
1258
1329
  const errorBody = await upstreamResponse.text();
@@ -1273,7 +1344,7 @@ async function handleConversationTurn(
1273
1344
  );
1274
1345
  return buildStreamingResponse(
1275
1346
  upstreamResponse,
1276
- (resp) => postResponse(req, resp, sessionState, config),
1347
+ (resp) => postResponse(req, resp, sessionState, config, requestBody),
1277
1348
  hasRecallTool
1278
1349
  ? { modifiedReq, config, sessionState, cacheOptions }
1279
1350
  : undefined,
@@ -1292,46 +1363,49 @@ async function handleConversationTurn(
1292
1363
  sessionState.sessionID,
1293
1364
  );
1294
1365
 
1366
+ // Store recall result for marker round-trip expansion
1367
+ const storeKey = recallStoreKey(input.query, input.scope ?? "all");
1368
+ const position = resp.content.indexOf(recallBlock);
1369
+ sessionState.recallStore.set(storeKey, {
1370
+ toolUseId: recallBlock.id,
1371
+ input,
1372
+ position,
1373
+ result,
1374
+ });
1375
+
1376
+ // Replace recall tool_use with marker text in the response
1377
+ const markerResp = replaceRecallWithMarker(resp);
1378
+
1295
1379
  if (hasOtherToolUse(resp)) {
1296
- // Case 2: recall + other tools store pending, strip recall from response
1297
- const position = resp.content.indexOf(recallBlock);
1298
- sessionState.pendingRecall = {
1299
- toolUseId: recallBlock.id,
1300
- input,
1301
- position,
1302
- result,
1303
- timestamp: Date.now(),
1304
- };
1380
+ // Mixed tools — return response with marker replacing recall tool_use
1305
1381
  log.info(
1306
- `recall (non-stream, mixed): stored pending result for session ${sessionState.sessionID.slice(0, 16)}`,
1382
+ `recall (non-stream, mixed): stored result for session ${sessionState.sessionID.slice(0, 16)}`,
1307
1383
  );
1308
- const cleanResp = stripRecallFromResponse(resp);
1309
- postResponse(req, cleanResp, sessionState, config);
1310
- return nonStreamHttpResponse(cleanResp);
1384
+ postResponse(req, markerResp, sessionState, config, requestBody);
1385
+ return nonStreamHttpResponse(markerResp);
1311
1386
  }
1312
1387
 
1313
- // Case 1: recall-only — send follow-up request
1388
+ // Recall-only — send follow-up request for seamless UX
1314
1389
  log.info(
1315
1390
  `recall (non-stream, only): executing follow-up for session ${sessionState.sessionID.slice(0, 16)}`,
1316
1391
  );
1317
1392
  const followUp = buildRecallFollowUp(modifiedReq, resp, result, recallBlock);
1318
- // Strip recall from the follow-up tools (already done by buildRecallFollowUp)
1319
- const followUpResponse = await forwardToUpstream(
1393
+ let followUpResponse: Response;
1394
+ ({ response: followUpResponse } = await forwardToUpstream(
1320
1395
  followUp,
1321
1396
  config,
1322
1397
  undefined,
1323
1398
  cacheOptions,
1324
- );
1399
+ ));
1325
1400
 
1326
1401
  if (!followUpResponse.ok) {
1327
1402
  const errorBody = await followUpResponse.text();
1328
1403
  log.error(
1329
1404
  `recall follow-up upstream error: ${followUpResponse.status} ${errorBody.slice(0, 500)}`,
1330
1405
  );
1331
- // Fall back to the original response without recall
1332
- const cleanResp = stripRecallFromResponse(resp);
1333
- postResponse(req, cleanResp, sessionState, config);
1334
- return nonStreamHttpResponse(cleanResp);
1406
+ // Fall back to response with marker (no continuation)
1407
+ postResponse(req, markerResp, sessionState, config, requestBody);
1408
+ return nonStreamHttpResponse(markerResp);
1335
1409
  }
1336
1410
 
1337
1411
  const continuationResp = await accumulateNonStreamResponse(followUpResponse);
@@ -1350,11 +1424,11 @@ async function handleConversationTurn(
1350
1424
  resp.usage.cacheCreationInputTokens;
1351
1425
  }
1352
1426
 
1353
- postResponse(req, continuationResp, sessionState, config);
1427
+ postResponse(req, continuationResp, sessionState, config, requestBody);
1354
1428
  return nonStreamHttpResponse(continuationResp);
1355
1429
  }
1356
1430
 
1357
- postResponse(req, resp, sessionState, config);
1431
+ postResponse(req, resp, sessionState, config, requestBody);
1358
1432
  return nonStreamHttpResponse(resp);
1359
1433
  }
1360
1434
 
@@ -1417,6 +1491,9 @@ export function loreMessagesToGateway(
1417
1491
  content.push({
1418
1492
  type: "thinking",
1419
1493
  thinking: (part as { text: string }).text ?? "",
1494
+ ...((part as { signature?: string }).signature != null
1495
+ ? { signature: (part as { signature?: string }).signature }
1496
+ : undefined),
1420
1497
  });
1421
1498
  break;
1422
1499
  case "tool": {