@loreai/gateway 0.13.3 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/recall.ts CHANGED
@@ -1,15 +1,19 @@
1
1
  /**
2
2
  * Gateway recall interception — transparent memory search for any client.
3
3
  *
4
- * Injects a `recall` tool into upstream requests and handles the response
5
- * transparently. Two strategies based on whether recall is the only tool:
4
+ * Uses a unified "Marker and Expand" strategy:
6
5
  *
7
- * - **Case 1 (recall-only)**: "Pause and Continue" — pause client stream,
8
- * execute recall, send follow-up request, resume streaming in the same
9
- * HTTP response.
10
- * - **Case 2 (mixed tools)**: "Strip and Inject" — suppress recall blocks
11
- * from the client stream, execute recall in background, inject the result
12
- * into the next request from the client.
6
+ * 1. **On response (to client):** The recall `tool_use` block is replaced
7
+ * with a human-readable marker text block
8
+ * (`📚 Searching <scope> for "<query>"…`). The recall is executed
9
+ * internally and the result is stored in session state.
10
+ *
11
+ * 2. **On request (from client):** Marker text blocks in the conversation
12
+ * are expanded back into the original `tool_use` + `tool_result` pairs
13
+ * before forwarding upstream.
14
+ *
15
+ * For recall-only responses, a follow-up call is still made internally
16
+ * so the model can continue in the same HTTP response (seamless UX).
13
17
  *
14
18
  * All recall execution delegates to `runRecall()` from `@loreai/core`.
15
19
  */
@@ -28,7 +32,7 @@ import type {
28
32
  GatewayResponse,
29
33
  GatewayToolUseBlock,
30
34
  GatewayMessage,
31
- PendingRecall,
35
+ RecallStore,
32
36
  } from "./translate/types";
33
37
 
34
38
  // ---------------------------------------------------------------------------
@@ -59,15 +63,205 @@ export const RECALL_GATEWAY_TOOL: GatewayTool = {
59
63
  export const RECALL_TOOL_NAME = "recall";
60
64
 
61
65
  // ---------------------------------------------------------------------------
62
- // Pending recall state (cross-request, Case 2)
66
+ // Marker utilities — human-readable text ↔ recall tool round-trip
63
67
  // ---------------------------------------------------------------------------
64
68
 
65
- /** TTL for pending recall results — discard after 60 seconds. */
66
- const PENDING_RECALL_TTL_MS = 60_000;
69
+ /** Scope human-readable label for marker text. */
70
+ const SCOPE_LABELS: Record<string, string> = {
71
+ all: "all archives",
72
+ session: "session history",
73
+ project: "project archives",
74
+ knowledge: "knowledge base",
75
+ };
76
+
77
+ /** Reverse: label → scope enum. */
78
+ const LABEL_TO_SCOPE: Record<string, RecallScope> = Object.fromEntries(
79
+ Object.entries(SCOPE_LABELS).map(([k, v]) => [v, k as RecallScope]),
80
+ );
81
+
82
+ /** Map a recall scope to a human-readable label. */
83
+ export function scopeToLabel(scope: string = "all"): string {
84
+ return SCOPE_LABELS[scope] ?? SCOPE_LABELS.all;
85
+ }
67
86
 
68
- /** Check whether a pending recall is still valid (within TTL). */
69
- export function isPendingRecallValid(pending: PendingRecall): boolean {
70
- return Date.now() - pending.timestamp < PENDING_RECALL_TTL_MS;
87
+ /** Map a human-readable label back to a scope enum value. */
88
+ export function labelToScope(label: string): RecallScope {
89
+ return LABEL_TO_SCOPE[label] ?? "all";
90
+ }
91
+
92
+ /**
93
+ * Build a marker text string for a recall tool call.
94
+ *
95
+ * Format: `📚 Searching <scope-label> for "<query>"…`
96
+ */
97
+ export function buildRecallMarker(query: string, scope: string = "all"): string {
98
+ return `📚 Searching ${scopeToLabel(scope)} for "${query}"…`;
99
+ }
100
+
101
+ /** Regex to parse a recall marker back into query + scope. */
102
+ const MARKER_REGEX = /📚 Searching (.+?) for "(.+?)"…/;
103
+
104
+ /**
105
+ * Parse a recall marker text block, returning query and scope if valid.
106
+ * Returns null if the text doesn't match the marker format.
107
+ */
108
+ export function parseRecallMarker(
109
+ text: string,
110
+ ): { query: string; scope: RecallScope } | null {
111
+ const match = MARKER_REGEX.exec(text);
112
+ if (!match) return null;
113
+ return {
114
+ query: match[2],
115
+ scope: labelToScope(match[1]),
116
+ };
117
+ }
118
+
119
+ /** Derive a store key from query + scope. */
120
+ export function recallStoreKey(query: string, scope: string = "all"): string {
121
+ return `${scope}:${query}`;
122
+ }
123
+
124
+ // ---------------------------------------------------------------------------
125
+ // Marker expansion — restore tool_use + tool_result from markers on inbound
126
+ // ---------------------------------------------------------------------------
127
+
128
+ /**
129
+ * Find recall marker text blocks in the conversation and expand them
130
+ * back into tool_use + tool_result pairs for the upstream API.
131
+ *
132
+ * Scans ALL assistant messages (not just the last one) since markers
133
+ * persist across turns until gradient evicts the message.
134
+ *
135
+ * Mutates the request in-place. Returns true if any expansion was performed.
136
+ */
137
+ export function expandRecallMarkers(
138
+ req: GatewayRequest,
139
+ store: RecallStore,
140
+ ): boolean {
141
+ let expanded = false;
142
+
143
+ // Iterate forward; when we splice messages the index is adjusted.
144
+ for (let i = 0; i < req.messages.length; i++) {
145
+ const msg = req.messages[i];
146
+ if (msg.role !== "assistant") continue;
147
+
148
+ // Find the first (should be only) recall marker in this message.
149
+ // We process one marker per assistant message. Note: the outer loop does
150
+ // NOT revisit — a later marker lands in the skipped continuation message.
151
+ let markerIdx = -1;
152
+ let parsed: { query: string; scope: RecallScope } | null = null;
153
+ for (let j = 0; j < msg.content.length; j++) {
154
+ const block = msg.content[j];
155
+ if (block.type !== "text") continue;
156
+ parsed = parseRecallMarker(block.text);
157
+ if (parsed) {
158
+ markerIdx = j;
159
+ break;
160
+ }
161
+ }
162
+
163
+ if (markerIdx < 0 || !parsed) continue;
164
+
165
+ const key = recallStoreKey(parsed.query, parsed.scope);
166
+ const stored = store.get(key);
167
+ if (!stored) continue; // No stored result — leave marker as-is
168
+
169
+ // Check if there's non-tool content AFTER the marker in this message.
170
+ // This happens when recall-only follow-up piped continuation content
171
+ // (text blocks) into the same assistant message. Tool_use blocks after
172
+ // the marker are from the same turn (mixed tools) and stay together.
173
+ const afterMarker = msg.content.slice(markerIdx + 1);
174
+ const hasContinuationAfter = afterMarker.length > 0 &&
175
+ afterMarker.some((b) => b.type !== "tool_use");
176
+
177
+ // Replace marker with tool_use
178
+ msg.content[markerIdx] = {
179
+ type: "tool_use",
180
+ id: stored.toolUseId,
181
+ name: RECALL_TOOL_NAME,
182
+ input: stored.input,
183
+ };
184
+
185
+ // Truncate assistant message at the tool_use (remove continuation)
186
+ if (hasContinuationAfter) {
187
+ msg.content.length = markerIdx + 1;
188
+ }
189
+
190
+ // Build synthetic tool_result user message
191
+ const toolResultMsg: GatewayMessage = {
192
+ role: "user",
193
+ content: [
194
+ {
195
+ type: "tool_result",
196
+ toolUseId: stored.toolUseId,
197
+ content: stored.result,
198
+ },
199
+ ],
200
+ };
201
+
202
+ if (hasContinuationAfter) {
203
+ // Split: insert tool_result user message + continuation assistant
204
+ // message after the current assistant message.
205
+ const continuationMsg: GatewayMessage = {
206
+ role: "assistant",
207
+ content: afterMarker,
208
+ };
209
+ req.messages.splice(i + 1, 0, toolResultMsg, continuationMsg);
210
+ // Skip past the two newly inserted messages
211
+ i += 2;
212
+ } else {
213
+ // No split needed — insert tool_result into the following user message.
214
+ // Prepend (unshift) so the recall result appears before existing
215
+ // tool_results — matching the tool_use order in the assistant message.
216
+ const nextMsg = req.messages[i + 1];
217
+ if (nextMsg?.role === "user") {
218
+ nextMsg.content.unshift({
219
+ type: "tool_result",
220
+ toolUseId: stored.toolUseId,
221
+ content: stored.result,
222
+ });
223
+ } else {
224
+ // No following user message — insert a synthetic one
225
+ req.messages.splice(i + 1, 0, toolResultMsg);
226
+ i += 1;
227
+ }
228
+ }
229
+
230
+ expanded = true;
231
+ }
232
+
233
+ return expanded;
234
+ }
235
+
236
+ /**
237
+ * Clean up orphaned recall store entries whose markers no longer
238
+ * appear in the conversation (e.g. gradient evicted the turn).
239
+ */
240
+ export function cleanupRecallStore(
241
+ req: GatewayRequest,
242
+ store: RecallStore,
243
+ ): void {
244
+ if (store.size === 0) return;
245
+
246
+ // Collect all marker keys still present in assistant messages
247
+ const activeKeys = new Set<string>();
248
+ for (const msg of req.messages) {
249
+ if (msg.role !== "assistant") continue;
250
+ for (const block of msg.content) {
251
+ if (block.type !== "text") continue;
252
+ const parsed = parseRecallMarker(block.text);
253
+ if (parsed) {
254
+ activeKeys.add(recallStoreKey(parsed.query, parsed.scope));
255
+ }
256
+ }
257
+ }
258
+
259
+ // Remove entries not referenced by any current marker
260
+ for (const key of store.keys()) {
261
+ if (!activeKeys.has(key)) {
262
+ store.delete(key);
263
+ }
264
+ }
71
265
  }
72
266
 
73
267
  // ---------------------------------------------------------------------------
@@ -212,90 +406,28 @@ export function buildRecallFollowUp(
212
406
  }
213
407
 
214
408
  // ---------------------------------------------------------------------------
215
- // Pending recall injection (Case 2: next request enrichment)
216
- // ---------------------------------------------------------------------------
217
-
218
- /**
219
- * Inject a pending recall result into the current request.
220
- *
221
- * Finds the last assistant message in `req.messages`, inserts the recall
222
- * tool_use block at the recorded position, and inserts a tool_result block
223
- * into the following user message.
224
- *
225
- * Mutates the request in-place for efficiency. Returns true if injection
226
- * was performed, false if the conversation structure didn't match
227
- * (e.g., no trailing assistant→user pair).
228
- */
229
- export function injectPendingRecall(
230
- req: GatewayRequest,
231
- pending: PendingRecall,
232
- ): boolean {
233
- const messages = req.messages;
234
- if (messages.length < 2) return false;
235
-
236
- // Find the last assistant message followed by a user message.
237
- // The pending recall was from the previous turn's assistant response.
238
- let assistantIdx = -1;
239
- for (let i = messages.length - 2; i >= 0; i--) {
240
- if (
241
- messages[i].role === "assistant" &&
242
- messages[i + 1]?.role === "user"
243
- ) {
244
- assistantIdx = i;
245
- break;
246
- }
247
- }
248
-
249
- if (assistantIdx < 0) {
250
- log.warn("injectPendingRecall: no assistant→user pair found");
251
- return false;
252
- }
253
-
254
- const assistantMsg = messages[assistantIdx];
255
- const userMsg = messages[assistantIdx + 1];
256
-
257
- // Insert recall tool_use into assistant message at the recorded position.
258
- // Clamp to content length in case the message was modified by gradient.
259
- const insertPos = Math.min(pending.position, assistantMsg.content.length);
260
- const recallToolUse: GatewayToolUseBlock = {
261
- type: "tool_use",
262
- id: pending.toolUseId,
263
- name: RECALL_TOOL_NAME,
264
- input: pending.input,
265
- };
266
- assistantMsg.content.splice(insertPos, 0, recallToolUse);
267
-
268
- // Insert recall tool_result into the user message.
269
- // Add it at the beginning alongside any other tool_results.
270
- userMsg.content.unshift({
271
- type: "tool_result",
272
- toolUseId: pending.toolUseId,
273
- content: pending.result,
274
- });
275
-
276
- // Strip recall from tools list for this request
277
- req.tools = req.tools.filter((t) => t.name !== RECALL_TOOL_NAME);
278
-
279
- return true;
280
- }
281
-
282
- // ---------------------------------------------------------------------------
283
- // Response content stripping (Case 2: remove recall from response)
409
+ // Response content rewriting — replace recall tool_use with marker text
284
410
  // ---------------------------------------------------------------------------
285
411
 
286
412
  /**
287
- * Build a GatewayResponse with recall tool_use blocks removed.
413
+ * Build a GatewayResponse with recall tool_use blocks replaced by marker text.
288
414
  *
289
- * Used for Case 2 to produce a clean response for `postResponse` storage
290
- * that excludes the gateway-internal recall blocks.
415
+ * Used for both recall-only and mixed-tools cases to produce a response
416
+ * where the client sees human-readable markers instead of tool call mechanics.
291
417
  */
292
- export function stripRecallFromResponse(
418
+ export function replaceRecallWithMarker(
293
419
  resp: GatewayResponse,
294
420
  ): GatewayResponse {
295
421
  return {
296
422
  ...resp,
297
- content: resp.content.filter(
298
- (b) => !(b.type === "tool_use" && b.name === RECALL_TOOL_NAME),
299
- ),
423
+ content: resp.content.map((b) => {
424
+ if (b.type === "tool_use" && b.name === RECALL_TOOL_NAME) {
425
+ const input = b.input as Record<string, unknown>;
426
+ const query = typeof input.query === "string" ? input.query : "";
427
+ const scope = (input.scope as string) ?? "all";
428
+ return { type: "text" as const, text: buildRecallMarker(query, scope) };
429
+ }
430
+ return b;
431
+ }),
300
432
  };
301
433
  }
@@ -97,6 +97,9 @@ function contentBlockToPart(
97
97
  messageID,
98
98
  type: "reasoning",
99
99
  text: block.thinking,
100
+ ...(block.signature != null
101
+ ? { signature: block.signature }
102
+ : undefined),
100
103
  } satisfies LoreReasoningPart;
101
104
 
102
105
  case "tool_use":
@@ -265,6 +265,24 @@ export type AnthropicCacheOptions = {
265
265
  */
266
266
  systemTTL?: "5m" | "1h" | false;
267
267
 
268
+ /**
269
+ * LTM knowledge text to inject as a separate system block after the host
270
+ * prompt. Keeping it in a separate block means the host prompt gets its
271
+ * own cache breakpoint (1h) and LTM changes don't bust the host prefix.
272
+ *
273
+ * When provided AND systemTTL is set, the system becomes a 2-block array:
274
+ * system[0]: host prompt — cache_control with systemTTL
275
+ * system[1]: LTM content — no cache_control (benefits from prefix)
276
+ */
277
+ ltmSystem?: string;
278
+
279
+ /**
280
+ * Cache the last tool definition with an explicit 1h breakpoint.
281
+ * Tool definitions (including our injected recall tool) are stable
282
+ * across turns — caching them avoids re-processing on every request.
283
+ */
284
+ cacheTools?: boolean;
285
+
268
286
  /**
269
287
  * Place an explicit `cache_control` breakpoint on the last block of the
270
288
  * last message, enabling Anthropic to cache the conversation prefix.
@@ -329,19 +347,33 @@ export function buildAnthropicRequest(
329
347
  // System — only include if non-empty
330
348
  if (req.system) {
331
349
  const systemTTL = cache?.systemTTL;
350
+ const ltmText = cache?.ltmSystem;
351
+
332
352
  if (systemTTL) {
333
- // Send as block array with explicit cache_control breakpoint.
334
- // This creates a stable cache slot for the system prompt — it changes
335
- // only when LTM entries are added/removed or AGENTS.md is updated.
353
+ // Send as block array with explicit cache_control breakpoint on the
354
+ // host prompt. The host prompt is the most stable part (changes only
355
+ // when the host mutates AGENTS.md, memory, etc.) so it gets a 1h TTL.
336
356
  const cacheControl: Record<string, string> =
337
357
  systemTTL === "1h"
338
358
  ? { type: "ephemeral", ttl: "1h" }
339
359
  : { type: "ephemeral" };
340
- body.system = [
360
+
361
+ const blocks: Record<string, unknown>[] = [
341
362
  { type: "text", text: req.system, cache_control: cacheControl },
342
363
  ];
364
+
365
+ // LTM knowledge as a separate block — no cache_control of its own,
366
+ // but benefits from the host prompt prefix cache. When LTM changes,
367
+ // only this block and everything after it is re-processed; the host
368
+ // prompt prefix is still a cache read.
369
+ if (ltmText) {
370
+ blocks.push({ type: "text", text: ltmText });
371
+ }
372
+
373
+ body.system = blocks;
343
374
  } else {
344
- body.system = req.system;
375
+ // No caching — concatenate LTM into a single string.
376
+ body.system = ltmText ? `${req.system}\n\n${ltmText}` : req.system;
345
377
  }
346
378
  }
347
379
 
@@ -368,11 +400,23 @@ export function buildAnthropicRequest(
368
400
 
369
401
  // Tools — only include if present
370
402
  if (req.tools.length > 0) {
371
- body.tools = req.tools.map((t) => ({
403
+ const tools = req.tools.map((t) => ({
372
404
  name: t.name,
373
405
  description: t.description,
374
406
  input_schema: t.inputSchema,
375
407
  }));
408
+
409
+ // Tool caching: place a 1h breakpoint on the last tool definition.
410
+ // Tool definitions (including our recall tool) are stable across turns.
411
+ if (cache?.cacheTools && tools.length > 0) {
412
+ const lastTool = tools[tools.length - 1]!;
413
+ (lastTool as Record<string, unknown>).cache_control = {
414
+ type: "ephemeral",
415
+ ttl: "1h",
416
+ };
417
+ }
418
+
419
+ body.tools = tools;
376
420
  }
377
421
 
378
422
  // Restore all metadata params (temperature, top_p, stop_sequences, etc.)
@@ -139,27 +139,70 @@ export type GatewayResponse = {
139
139
  };
140
140
 
141
141
  // ---------------------------------------------------------------------------
142
- // Pending recall state (cross-request, gateway recall interception)
142
+ // Recall store (cross-request, gateway recall interception)
143
143
  // ---------------------------------------------------------------------------
144
144
 
145
- /** Pending recall result stored between requests (Case 2: mixed tools). */
146
- export type PendingRecall = {
147
- /** tool_use ID from the suppressed block. */
145
+ /** Stored recall result for marker-based round-trip expansion. */
146
+ export type StoredRecall = {
147
+ /** The tool_use ID to reconstruct in the upstream request. */
148
148
  toolUseId: string;
149
- /** The original recall input (for conversation history reconstruction). */
149
+ /** Original recall input (query + scope). */
150
150
  input: { query: string; scope?: string };
151
151
  /** Position (content block index) in the original assistant message. */
152
152
  position: number;
153
153
  /** Executed recall result (formatted markdown). */
154
154
  result: string;
155
- /** Timestamp for TTL-based cleanup. */
156
- timestamp: number;
157
155
  };
158
156
 
157
+ /** Map from marker key (`${scope}:${query}`) → stored recall data. */
158
+ export type RecallStore = Map<string, StoredRecall>;
159
+
159
160
  // ---------------------------------------------------------------------------
160
161
  // Session state — per-session tracking for Lore pipeline integration
161
162
  // ---------------------------------------------------------------------------
162
163
 
164
+ /** Per-turn cache analysis emitted as structured log data. */
165
+ export type CacheTurnAnalysis = {
166
+ /** Turn number within this session. */
167
+ turn: number;
168
+
169
+ // --- Ground truth from API response ---
170
+ /** Tokens served from prompt cache (hit). */
171
+ cacheRead: number;
172
+ /** Tokens written to prompt cache (miss / new). */
173
+ cacheCreation: number;
174
+ /** Uncached input tokens. */
175
+ inputTokens: number;
176
+ /** cacheRead / total input — 0..1. */
177
+ cacheHitRate: number;
178
+
179
+ // --- Request body prefix comparison ---
180
+ /** Bytes matching from start of serialized request body vs previous turn. */
181
+ prefixMatchBytes: number;
182
+ /** prefixMatchBytes / min(prev, current) body length — 0..1. */
183
+ prefixMatchPercent: number;
184
+ /** Semantic location of the first divergence (e.g. "messages[3].content[1]"). */
185
+ divergencePoint: string;
186
+ /** Human-readable reason (e.g. "system prompt changed", "new message appended"). */
187
+ divergenceReason: string;
188
+ };
189
+
190
+ /** Per-session cache analytics state. */
191
+ export type CacheAnalytics = {
192
+ /** Deflate-compressed serialized request body from the last turn. */
193
+ lastRequestBody: Uint8Array | null;
194
+ /** Uncompressed byte length of lastRequestBody (for prefix match %). */
195
+ lastRequestBodyLength: number;
196
+ /** cache_read_input_tokens from last API response. */
197
+ lastCacheRead: number;
198
+ /** cache_creation_input_tokens from last API response. */
199
+ lastCacheCreation: number;
200
+ /** Total turns observed. */
201
+ turnCount: number;
202
+ /** Confirmed busts (API returned cacheRead=0 with cacheCreation>0). */
203
+ bustCount: number;
204
+ };
205
+
163
206
  /** Per-session state tracked by the gateway for Lore pipeline decisions. */
164
207
  export type SessionState = {
165
208
  sessionID: string;
@@ -172,6 +215,8 @@ export type SessionState = {
172
215
  messageCount: number;
173
216
  /** Turns since last curation run — triggers background curation. */
174
217
  turnsSinceCuration: number;
175
- /** Pending recall result from previous turn (Case 2: mixed tool interception). */
176
- pendingRecall?: PendingRecall;
218
+ /** Stored recall results for marker-based round-trip expansion. */
219
+ recallStore: RecallStore;
220
+ /** Cache analytics — request body prefix comparison + API cache fields. */
221
+ cacheAnalytics: CacheAnalytics;
177
222
  };