@loreai/gateway 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1604 @@
1
+ /**
2
+ * Core request processing pipeline for the Lore gateway.
3
+ *
4
+ * Orchestrates the full flow for every request:
5
+ * session identification → LTM injection → gradient transform →
6
+ * upstream forwarding → response accumulation → calibration →
7
+ * temporal storage → background work scheduling.
8
+ *
9
+ * Three request classes are handled:
10
+ * 1. Compaction requests → intercepted, never forwarded upstream.
11
+ * 2. Title/summary requests → forwarded transparently, no Lore processing.
12
+ * 3. Normal conversation turns → full pipeline.
13
+ */
14
+ import type { LoreMessageWithParts, LLMClient } from "@loreai/core";
15
+ import {
16
+ load,
17
+ config as loreConfig,
18
+ ensureProject,
19
+ isFirstRun,
20
+ temporal,
21
+ ltm,
22
+ distillation,
23
+ curator,
24
+ log,
25
+ transform,
26
+ setModelLimits,
27
+ setLtmTokens,
28
+ getLtmBudget,
29
+ setMaxLayer0Tokens,
30
+ computeLayer0Cap,
31
+ calibrate,
32
+ getLastTransformedCount,
33
+ onIdleResume,
34
+ consumeCameOutOfIdle,
35
+ needsUrgentDistillation,
36
+ formatKnowledge,
37
+ buildCompactPrompt,
38
+ } from "@loreai/core";
39
+
40
+ import type {
41
+ GatewayRequest,
42
+ GatewayResponse,
43
+ GatewayContentBlock,
44
+ GatewayToolUseBlock,
45
+ GatewayToolResultBlock,
46
+ SessionState,
47
+ } from "./translate/types";
48
+ import type { GatewayConfig } from "./config";
49
+ import { getProjectPath, resolveUpstreamRoute } from "./config";
50
+ import {
51
+ generateSessionID,
52
+ fingerprintMessages,
53
+ MESSAGE_COUNT_PROXIMITY_THRESHOLD,
54
+ } from "./session";
55
+ import {
56
+ isCompactionRequest,
57
+ isTitleOrSummaryRequest,
58
+ extractPreviousSummary,
59
+ buildCompactionResponse,
60
+ } from "./compaction";
61
+ import {
62
+ buildAnthropicRequest,
63
+ buildAnthropicNonStreamResponse,
64
+ type AnthropicCacheOptions,
65
+ } from "./translate/anthropic";
66
+ import {
67
+ buildOpenAIUpstreamRequest,
68
+ buildOpenAIResponse,
69
+ } from "./translate/openai";
70
+ import {
71
+ createStreamAccumulator,
72
+ createRecallAwareAccumulator,
73
+ parseSSEStream,
74
+ buildSSETextResponse,
75
+ formatSSEEvent,
76
+ type StreamAccumulator,
77
+ } from "./stream/anthropic";
78
+ import {
79
+ gatewayMessagesToLore,
80
+ updateAssistantMessageTokens,
81
+ resolveToolResults,
82
+ } from "./temporal-adapter";
83
+ import { createGatewayLLMClient } from "./llm-adapter";
84
+ import { createBatchLLMClient } from "./batch-queue";
85
+ import {
86
+ extractAuth,
87
+ authFingerprint,
88
+ setLastSeenAuth,
89
+ setSessionAuth,
90
+ resolveAuth,
91
+ } from "./auth";
92
+ import type { UpstreamInterceptor } from "./recorder";
93
+ import { startIdleScheduler, buildIdleWorkHandler } from "./idle";
94
+ import { getWorkerModel, resetWorkerModelState } from "./worker-model";
95
+ import {
96
+ RECALL_GATEWAY_TOOL,
97
+ RECALL_TOOL_NAME,
98
+ executeRecall,
99
+ findRecallToolUse,
100
+ hasRecallToolUse,
101
+ hasOtherToolUse,
102
+ clientHasRecallTool,
103
+ isPendingRecallValid,
104
+ injectPendingRecall,
105
+ buildRecallFollowUp,
106
+ stripRecallFromResponse,
107
+ } from "./recall";
108
+
109
+ // ---------------------------------------------------------------------------
110
+ // Module state
111
+ // ---------------------------------------------------------------------------
112
+
113
/** Set once `initIfNeeded` has completed its one-time setup; cleared by `resetPipelineState`. */
let initialized = false;

/**
 * Interceptor that upstream calls are routed through for recording/replay.
 * `undefined` when none is installed.
 */
let activeInterceptor: UpstreamInterceptor | undefined;
118
+
119
/**
 * Install or remove the module-wide upstream interceptor.
 *
 * While one is installed, `forwardToUpstream` hands each upstream call to it
 * (along with a thunk that performs the real `fetch`) unless the caller passes
 * an interceptor of its own. Recording and replay scripts rely on this to
 * capture or substitute upstream traffic without editing call sites.
 *
 * @param interceptor - Interceptor to install, or `undefined` to remove the current one.
 */
export function setUpstreamInterceptor(
  interceptor: UpstreamInterceptor | undefined,
): void {
  activeInterceptor = interceptor;
}
132
+
133
/**
 * Reset every module-level singleton held by the pipeline.
 *
 * Meant for test harnesses: it lets several independent gateway instances run
 * one after another in the same process without sharing sessions, the
 * initialization flag, the cached project path, the LTM cache, the worker LLM
 * client, the upstream interceptor, or the idle scheduler.
 *
 * If the LLM client exposes a `shutdown` method it is awaited first. The
 * remaining state is cleared in a `finally` block, so a failing shutdown can
 * no longer leave a half-reset module behind; the rejection itself still
 * propagates to the caller.
 *
 * @returns A promise that settles once shutdown (if any) has finished.
 */
export async function resetPipelineState(): Promise<void> {
  initialized = false;
  cachedProjectPath = null;
  sessions.clear();
  ltmSessionCache.clear();

  try {
    // Shut down the batch queue gracefully before dropping the client
    if (llmClient && "shutdown" in llmClient) {
      await (llmClient as LLMClient & { shutdown: () => Promise<void> }).shutdown();
    }
  } finally {
    // Runs even when shutdown rejects, so the next suite starts from a clean slate
    llmClient = null;
    activeInterceptor = undefined;
    if (stopIdleScheduler) {
      stopIdleScheduler();
      stopIdleScheduler = null;
    }
    lastSeenSessionModel = null;
    resetWorkerModelState();
  }
}
159
+
160
/** Project path recorded by `initIfNeeded`; `null` before initialization and after a reset. */
let cachedProjectPath: string | null = null;

/** Live session state, keyed by session ID and carried across requests. */
const sessions = new Map<string, SessionState>();

/**
 * Formatted LTM text (and its token count) remembered per session.
 *
 * The point is byte-stability: if `ltm.forSession()` were re-run every turn it
 * would re-score entries against the evolving session context, the formatted
 * text would differ, the system prompt would change from byte 0, and the
 * upstream prompt cache would be invalidated on every turn.
 */
const ltmSessionCache = new Map<
  string,
  { formatted: string; tokenCount: number }
>();

/** Lazily created LLM client shared by background workers; `null` until first use. */
let llmClient: LLMClient | null = null;

/** Stops the idle scheduler's timer; `null` while no scheduler is running. */
let stopIdleScheduler: (() => void) | null = null;

/** Most recently seen session model ID, used as context for worker model discovery. */
let lastSeenSessionModel: string | null = null;
186
+
187
+ // ---------------------------------------------------------------------------
188
+ // Model limits — hardcoded for known models, fallback for unknown
189
+ // ---------------------------------------------------------------------------
190
+
191
/** Token limits and optional pricing for one model family. */
type ModelSpec = {
  /** Context window size in tokens. */
  context: number;
  /** Maximum output tokens. */
  output: number;
  /** Cache-read cost per token in USD (Anthropic: 10% of input price). */
  cacheReadCost?: number;
};

const MODEL_SPECS: Record<string, ModelSpec> = {
  // Pricing: https://docs.anthropic.com/en/docs/about-claude/models
  // Cache-read = input_price / 1_000_000 * 0.1 (10% of input for Anthropic)
  "claude-opus-4": { context: 200_000, output: 32_000, cacheReadCost: 15 / 1_000_000 * 0.1 },
  "claude-sonnet-4": { context: 200_000, output: 16_000, cacheReadCost: 3 / 1_000_000 * 0.1 },
  "claude-sonnet-3-5": { context: 200_000, output: 8_192, cacheReadCost: 3 / 1_000_000 * 0.1 },
  "claude-haiku-3-5": { context: 200_000, output: 8_192, cacheReadCost: 0.80 / 1_000_000 * 0.1 },
  // Anthropic's published IDs for the 3.5 family put the version before the
  // tier ("claude-3-5-sonnet-20241022"), so the two keys above never match them.
  "claude-3-5-sonnet": { context: 200_000, output: 8_192, cacheReadCost: 3 / 1_000_000 * 0.1 },
  "claude-3-5-haiku": { context: 200_000, output: 8_192, cacheReadCost: 0.80 / 1_000_000 * 0.1 },
};

/** Limits used for models with no entry in `MODEL_SPECS`; carries no pricing. */
const DEFAULT_MODEL_SPEC: ModelSpec = { context: 200_000, output: 8_192 };

/**
 * Look up the spec for a model ID by prefix.
 *
 * Dated or suffixed IDs resolve to their family entry, e.g.
 * "claude-opus-4-20250514" → "claude-opus-4". When several prefixes match, the
 * longest one wins, so the result does not depend on the declaration order of
 * `MODEL_SPECS`.
 *
 * @param model - Model ID as sent by the client.
 * @returns The matching family spec, or `DEFAULT_MODEL_SPEC` when nothing matches.
 */
function getModelSpec(model: string): ModelSpec {
  let best: { prefix: string; spec: ModelSpec } | undefined;
  for (const [prefix, spec] of Object.entries(MODEL_SPECS)) {
    if (!model.startsWith(prefix)) continue;
    if (!best || prefix.length > best.prefix.length) {
      best = { prefix, spec };
    }
  }
  return best?.spec ?? DEFAULT_MODEL_SPEC;
}
216
+
217
+ // ---------------------------------------------------------------------------
218
+ // Initialization
219
+ // ---------------------------------------------------------------------------
220
+
221
/**
 * Perform one-time pipeline setup; later calls return immediately.
 *
 * Loads the Lore config for `projectPath`, makes sure the project exists in
 * the DB, records the path as the cached project path, and — when a gateway
 * config is supplied and no scheduler is running yet — starts the idle
 * scheduler that drives background work.
 *
 * @param projectPath - Project root the gateway is serving.
 * @param config - Gateway config; without it the idle scheduler is not started.
 */
async function initIfNeeded(projectPath: string, config?: GatewayConfig): Promise<void> {
  if (initialized) return;

  await load(projectPath);
  ensureProject(projectPath);
  initialized = true;
  cachedProjectPath = projectPath;

  // Background work (distillation, curation, pruning, AGENTS.md export) runs
  // from the idle scheduler: it polls every 30s and fires for any session
  // whose lastRequestTime is older than the idle timeout.
  if (config && !stopIdleScheduler) {
    const workerClient = getLLMClient(config);
    const workerContextModel =
      lastSeenSessionModel ?? loreConfig().model?.modelID ?? "claude-sonnet-4-20250514";
    const currentAuth = () => resolveAuth();
    // Invoked by the handler when LTM changed: drop every cached LTM rendering
    const dropLtmCache = () => ltmSessionCache.clear();

    const onIdle = buildIdleWorkHandler(
      projectPath,
      workerClient,
      config.upstreamAnthropic,
      currentAuth,
      workerContextModel,
      dropLtmCache,
    );
    stopIdleScheduler = startIdleScheduler(config, sessions, onIdle);
  }

  log.info(`gateway pipeline initialized: ${projectPath}`);
}
253
+
254
/**
 * Return the shared worker LLM client, building it on first use.
 *
 * The client talks to the Anthropic upstream with the configured Lore model
 * (falling back to anthropic / claude-sonnet-4-20250514). Unless
 * `LORE_BATCH_DISABLED=1` is set, it is wrapped in the batch queue, which
 * gives 50% cost savings on non-urgent worker calls.
 *
 * @param config - Gateway config supplying the Anthropic upstream base URL.
 * @returns The cached client instance.
 */
function getLLMClient(config: GatewayConfig): LLMClient {
  if (llmClient) return llmClient;

  const workerModel = loreConfig().model ?? {
    providerID: "anthropic",
    modelID: "claude-sonnet-4-20250514",
  };
  const direct = createGatewayLLMClient(
    config.upstreamAnthropic,
    resolveAuth,
    workerModel,
  );

  // Batching is on by default; the env var is the opt-out.
  const client: LLMClient =
    process.env.LORE_BATCH_DISABLED === "1"
      ? direct
      : createBatchLLMClient(
          direct,
          config.upstreamAnthropic,
          resolveAuth,
          workerModel,
        );

  llmClient = client;
  return client;
}
283
+
284
+ // ---------------------------------------------------------------------------
285
+ // Session management helpers
286
+ // ---------------------------------------------------------------------------
287
+
288
/**
 * Fetch the state for `sessionID`, registering a fresh entry when none exists.
 *
 * Every call stamps `lastRequestTime` with the current time, and drops a
 * pending recall that `isPendingRecallValid` no longer accepts.
 *
 * @param sessionID - Session identifier.
 * @param projectPath - Project path stored on a newly created session.
 * @returns The (possibly new) mutable session state.
 */
function getOrCreateSession(
  sessionID: string,
  projectPath: string,
): SessionState {
  const known = sessions.get(sessionID);
  const state: SessionState = known ?? {
    sessionID,
    projectPath,
    fingerprint: "",
    lastRequestTime: Date.now(),
    messageCount: 0,
    turnsSinceCuration: 0,
  };
  if (!known) {
    sessions.set(sessionID, state);
  }
  state.lastRequestTime = Date.now();

  // Expired pending recalls are cleaned up lazily, on access
  const pending = state.pendingRecall;
  if (pending && !isPendingRecallValid(pending)) {
    log.warn(
      `lazy cleanup: discarding expired pending recall for session ${sessionID.slice(0, 16)}`,
    );
    state.pendingRecall = undefined;
  }

  return state;
}
316
+
317
/**
 * Map an incoming request onto a known session, or mint a new session ID.
 *
 * The request is fingerprinted from its messages, model, and (when
 * credentials are present) an auth fingerprint. Among sessions with the same
 * fingerprint, the one whose message count is closest to the request's wins.
 * A candidate is skipped when the request has more than
 * `MESSAGE_COUNT_PROXIMITY_THRESHOLD` fewer messages than it: a normal session
 * grows by 2–6 messages per turn, while a fork of the same conversation shows
 * up as a large drop (parent at 600, fork at 300).
 *
 * This function does not register anything in `sessions`.
 *
 * @param req - Incoming gateway request.
 * @param _projectPath - Unused.
 * @returns The session ID and whether it was newly generated.
 */
async function identifySession(
  req: GatewayRequest,
  _projectPath: string,
): Promise<{ sessionID: string; isNew: boolean }> {
  const rawMessages = req.messages.map(({ role, content }) => ({ role, content }));
  const cred = extractAuth(req.rawHeaders);
  const fingerprint = await fingerprintMessages(rawMessages, {
    model: req.model,
    authSuffix: cred ? authFingerprint(cred) : "",
  });
  const incomingCount = req.messages.length;

  let nearest: { sid: string; gap: number } | undefined;
  for (const [sid, state] of sessions) {
    if (state.fingerprint !== fingerprint) continue;

    const delta = incomingCount - state.messageCount;
    // A large drop in message count means this is probably a fork
    if (delta < -MESSAGE_COUNT_PROXIMITY_THRESHOLD) continue;

    const gap = Math.abs(delta);
    if (nearest === undefined || gap < nearest.gap) {
      nearest = { sid, gap };
    }
  }

  if (nearest !== undefined) {
    return { sessionID: nearest.sid, isNew: false };
  }
  return { sessionID: generateSessionID(), isNew: true };
}
367
+
368
+ // ---------------------------------------------------------------------------
369
+ // Upstream forwarding
370
+ // ---------------------------------------------------------------------------
371
+
372
/**
 * Send a request to the upstream provider (Anthropic or OpenAI).
 *
 * The upstream is inferred from the model name via `resolveUpstreamRoute`;
 * when that yields nothing, the request's own protocol and the configured
 * base URLs are used. If an interceptor is passed, or a module-level one is
 * active, it is invoked with the body, model, stream flag, and a thunk that
 * performs the real `fetch`, which is how recording and replay hook in.
 *
 * @param req - Gateway request to forward.
 * @param config - Gateway config holding the default upstream base URLs.
 * @param interceptor - Per-call interceptor; takes precedence over the module-level one.
 * @param cache - Cache options handed to the Anthropic request builder.
 * @returns The raw upstream `Response` (streaming or non-streaming).
 */
async function forwardToUpstream(
  req: GatewayRequest,
  config: GatewayConfig,
  interceptor?: UpstreamInterceptor,
  cache?: AnthropicCacheOptions,
): Promise<Response> {
  const route = resolveUpstreamRoute(req.model);
  const protocol = route?.protocol ?? req.protocol;
  const useOpenAI = protocol === "openai";
  const upstreamBase =
    route?.url ?? (useOpenAI ? config.upstreamOpenAI : config.upstreamAnthropic);

  let target: { url: string; headers: Record<string, string>; body: unknown };
  if (useOpenAI) {
    const built = buildOpenAIUpstreamRequest(req, upstreamBase);
    target = { url: built.url, headers: built.headers, body: built.body };
  } else {
    // The Anthropic builder returns a path; prefix it with the base URL
    const built = buildAnthropicRequest(req, cache);
    target = {
      url: `${upstreamBase}${built.url}`,
      headers: built.headers,
      body: built.body,
    };
  }

  const send = (): Promise<Response> =>
    fetch(target.url, {
      method: "POST",
      headers: target.headers,
      body: JSON.stringify(target.body),
    });

  const hook = interceptor ?? activeInterceptor;
  if (!hook) {
    return send();
  }
  return hook(target.body, req.model, req.stream, send);
}
430
+
431
+ // ---------------------------------------------------------------------------
432
+ // Response builders
433
+ // ---------------------------------------------------------------------------
434
+
435
/**
 * Wrap an upstream SSE response in a client-facing SSE response while
 * accumulating the complete message on the side.
 *
 * Without `recallContext`, events are forwarded as the accumulator returns
 * them and `onComplete` receives the accumulated response once the upstream
 * stream ends.
 *
 * With `recallContext`, a recall-aware accumulator is used. If the finished
 * response contains a recall tool_use block, the recall is executed and then:
 * - **Case 1 (recall-only)**: a synthetic "[Searching memory...]" text block
 *   is emitted, a follow-up request carrying the recall result is sent
 *   upstream, and its stream is piped into the same HTTP response with
 *   `message_start` suppressed and content-block indices shifted.
 *   `onComplete` receives the original (pre-follow-up) response. If the
 *   follow-up fails, the held-back events are flushed and `onComplete`
 *   receives the response with the recall block stripped.
 * - **Case 2 (mixed tools)**: the recall result is stored on the session as
 *   `pendingRecall` for injection into the next request, the synthetic block
 *   and the held-back events are emitted, and `onComplete` receives the
 *   response with the recall block stripped.
 *
 * Any failure is logged, upstream readers opened so far are cancelled, and
 * the client stream is errored.
 *
 * @param upstreamResponse - Streaming upstream response to relay.
 * @param onComplete - Called once with the response to persist.
 * @param recallContext - Enables recall interception when provided.
 * @returns An HTTP 200 `text/event-stream` response.
 */
function buildStreamingResponse(
  upstreamResponse: Response,
  onComplete: (response: GatewayResponse) => void,
  recallContext?: {
    modifiedReq: GatewayRequest;
    config: GatewayConfig;
    sessionState: SessionState;
    cacheOptions: AnthropicCacheOptions;
  },
): Response {
  const recallAccum = recallContext
    ? createRecallAwareAccumulator(RECALL_TOOL_NAME)
    : null;
  const accumulator: StreamAccumulator = recallAccum ?? createStreamAccumulator();
  const encoder = new TextEncoder();

  // Upstream readers opened so far, kept so a failure can cancel them
  const openReaders: Array<{ cancel(reason?: unknown): Promise<void> }> = [];

  /** Open a reader on a response body, failing loudly when there is no body. */
  const openBodyReader = (response: Response) => {
    if (!response.body) {
      throw new Error("upstream response has no body to stream");
    }
    const reader = response.body.getReader();
    openReaders.push(reader);
    return reader;
  };

  /** SSE frames for a complete synthetic "[Searching memory...]" text block at `index`. */
  const searchingMemoryBlock = (index: number): string =>
    [
      formatSSEEvent("content_block_start", JSON.stringify({
        type: "content_block_start",
        index,
        content_block: { type: "text", text: "" },
      })),
      formatSSEEvent("content_block_delta", JSON.stringify({
        type: "content_block_delta",
        index,
        delta: { type: "text_delta", text: "\n\n[Searching memory...]" },
      })),
      formatSSEEvent("content_block_stop", JSON.stringify({
        type: "content_block_stop",
        index,
      })),
    ].join("");

  /** Re-serialize an event payload with `index` shifted; `null` when it has no numeric index or is not JSON. */
  const shiftBlockIndex = (data: string, offset: number): string | null => {
    try {
      const parsed = JSON.parse(data) as Record<string, unknown>;
      const index = parsed.index;
      if (typeof index !== "number") return null;
      parsed.index = index + offset;
      return JSON.stringify(parsed);
    } catch {
      return null;
    }
  };

  const stream = new ReadableStream({
    async start(controller) {
      const send = (text: string): void => {
        controller.enqueue(encoder.encode(text));
      };
      /** Forward whatever message_delta / message_stop the recall accumulator held back. */
      const flushHeldBack = (): void => {
        const heldBack = recallAccum?.heldBackEvents();
        if (heldBack) send(heldBack);
      };

      try {
        // Parse and forward upstream SSE events
        for await (const { event, data } of parseSSEStream(openBodyReader(upstreamResponse))) {
          const forwarded = accumulator.processEvent(event, data);
          if (forwarded) send(forwarded);
        }

        // --- Recall interception (streaming) ---
        if (recallAccum && recallContext && recallAccum.hasRecall()) {
          const resp = recallAccum.getResponse();
          const recallBlock = findRecallToolUse(resp);

          if (recallBlock) {
            const { sessionState } = recallContext;
            const sessionTag = sessionState.sessionID.slice(0, 16);
            const { result, input } = await executeRecall(
              recallBlock,
              sessionState.projectPath,
              sessionState.sessionID,
            );

            if (recallAccum.hasOtherTools()) {
              // Case 2: mixed tools — keep the result for the next request
              sessionState.pendingRecall = {
                toolUseId: recallBlock.id,
                input,
                position: resp.content.indexOf(recallBlock),
                result,
                timestamp: Date.now(),
              };
              log.info(
                `recall (stream, mixed): stored pending result for session ${sessionTag}`,
              );

              // The accumulator already re-indexed the other tools to fill the
              // gap, so the synthetic block goes after them, at clientBlockCount.
              send(searchingMemoryBlock(recallAccum.clientBlockCount()));
              flushHeldBack();
              controller.close();

              // Temporal storage gets the response without the recall block
              onComplete(stripRecallFromResponse(resp));
              return;
            }

            // Case 1: recall-only — send follow-up, pipe continuation
            log.info(
              `recall (stream, only): executing follow-up for session ${sessionTag}`,
            );

            // Shown to the client while the follow-up request is in flight
            send(searchingMemoryBlock(recallAccum.clientBlockCount()));

            const followUp = buildRecallFollowUp(
              recallContext.modifiedReq,
              resp,
              result,
              recallBlock,
            );
            const followUpResponse = await forwardToUpstream(
              followUp,
              recallContext.config,
              undefined,
              recallContext.cacheOptions,
            );

            if (!followUpResponse.ok) {
              const errorBody = await followUpResponse.text();
              log.error(
                `recall follow-up upstream error: ${followUpResponse.status} ${errorBody.slice(0, 500)}`,
              );
              // Close the client stream gracefully with the held-back events
              flushHeldBack();
              controller.close();
              onComplete(stripRecallFromResponse(resp));
              return;
            }

            // Continuation blocks start after everything the client has seen
            // so far, plus one for the synthetic block emitted above.
            const blockOffset = recallAccum.clientBlockCount() + 1;

            for await (const { event: contEvent, data: contData } of parseSSEStream(
              openBodyReader(followUpResponse),
            )) {
              // The client already received a message_start
              if (contEvent === "message_start") continue;

              const isBlockEvent =
                contEvent === "content_block_start" ||
                contEvent === "content_block_delta" ||
                contEvent === "content_block_stop";
              const shifted = isBlockEvent
                ? shiftBlockIndex(contData, blockOffset)
                : null;

              // message_delta, message_stop, and anything that could not be
              // re-indexed are forwarded as-is
              send(formatSSEEvent(contEvent, shifted ?? contData));
            }

            controller.close();

            // The continuation is not accumulated here; the original response
            // is stored so at least the pre-recall content is captured. The
            // continuation's text reaches the client and is picked up by
            // temporal storage on the next turn.
            onComplete(resp);
            return;
          }

          // Recall was detected but its block could not be located. Forward
          // the events held back for it so the client still gets a terminated
          // message (the follow-up failure path above does the same).
          // NOTE(review): assumes heldBackEvents() is empty when nothing was withheld.
          flushHeldBack();
        }

        // No recall — normal path
        controller.close();
        onComplete(accumulator.getResponse());
      } catch (err) {
        log.error("streaming pipeline error:", err);
        // Release upstream connections. cancel() rejects when the lock was
        // already released or the stream already failed; nothing left to do then.
        for (const reader of openReaders) {
          reader.cancel(err).catch(() => {});
        }
        controller.error(err);
      }
    },
  });

  return new Response(stream, {
    status: 200,
    headers: {
      "content-type": "text/event-stream",
      "cache-control": "no-cache",
      connection: "keep-alive",
    },
  });
}
674
+
675
/**
 * Read a non-streaming upstream JSON body and convert it to a GatewayResponse.
 *
 * Only `text`, `thinking`, and `tool_use` content blocks are carried over;
 * blocks of any other type are dropped. A missing `stop_reason` becomes
 * "end_turn", and missing input/output token counts become 0.
 *
 * @param upstreamResponse - Upstream response whose body is JSON.
 * @returns The converted response.
 */
async function accumulateNonStreamResponse(
  upstreamResponse: Response,
): Promise<GatewayResponse> {
  const payload = (await upstreamResponse.json()) as Record<string, unknown>;
  const blocks = payload.content as Array<Record<string, unknown>> | undefined;
  const usage = payload.usage as Record<string, number> | undefined;

  const content: GatewayContentBlock[] = [];
  if (blocks) {
    for (const block of blocks) {
      const kind = block.type;
      if (kind === "text") {
        content.push({ type: "text", text: String(block.text ?? "") });
      } else if (kind === "thinking") {
        content.push({
          type: "thinking",
          thinking: String(block.thinking ?? ""),
          // Only carry a signature when upstream sent one
          ...(block.signature
            ? { signature: String(block.signature) }
            : undefined),
        });
      } else if (kind === "tool_use") {
        content.push({
          type: "tool_use",
          id: String(block.id ?? ""),
          name: String(block.name ?? ""),
          input: block.input,
        });
      }
    }
  }

  return {
    id: String(payload.id ?? ""),
    model: String(payload.model ?? ""),
    content,
    stopReason: String(
      (payload.stop_reason as string) ?? "end_turn",
    ),
    usage: {
      inputTokens: usage?.input_tokens ?? 0,
      outputTokens: usage?.output_tokens ?? 0,
      cacheReadInputTokens: usage?.cache_read_input_tokens,
      cacheCreationInputTokens: usage?.cache_creation_input_tokens,
    },
  };
}
729
+
730
/**
 * Drain a streaming upstream SSE response into a single GatewayResponse.
 *
 * Nothing is forwarded while reading. This serves OpenAI requests, where the
 * accumulated response must be converted to OpenAI format before it is
 * returned to the client.
 *
 * @param upstreamResponse - Streaming upstream response; its body must be present.
 * @returns The fully accumulated response.
 */
async function accumulateStreamResponse(
  upstreamResponse: Response,
): Promise<GatewayResponse> {
  const sink = createStreamAccumulator();
  const events = parseSSEStream(upstreamResponse.body!.getReader());

  for await (const sse of events) {
    sink.processEvent(sse.event, sse.data);
  }

  return sink.getResponse();
}
748
+
749
/**
 * Serialize a GatewayResponse as a non-streaming Anthropic-format HTTP response.
 *
 * @param resp - Response to serialize.
 * @returns An HTTP 200 response with a JSON body.
 */
function nonStreamHttpResponse(resp: GatewayResponse): Response {
  const payload = JSON.stringify(buildAnthropicNonStreamResponse(resp));
  return new Response(payload, {
    status: 200,
    headers: { "content-type": "application/json" },
  });
}
759
+
760
/**
 * Serialize a GatewayResponse as a streaming SSE HTTP response.
 *
 * Only text blocks are emitted: their text is concatenated in order and sent
 * as one text-only SSE body, with the input/output token counts attached.
 *
 * @param resp - Response to serialize.
 * @returns An HTTP 200 `text/event-stream` response.
 */
function streamHttpResponse(resp: GatewayResponse): Response {
  const isTextBlock = (
    block: GatewayContentBlock,
  ): block is { type: "text"; text: string } => block.type === "text";

  let fullText = "";
  for (const block of resp.content) {
    if (isTextBlock(block)) {
      fullText += block.text;
    }
  }

  const { inputTokens, outputTokens } = resp.usage;
  const sseBody = buildSSETextResponse(resp.id, resp.model, fullText, {
    inputTokens,
    outputTokens,
  });

  return new Response(sseBody, {
    status: 200,
    headers: {
      "content-type": "text/event-stream",
      "cache-control": "no-cache",
      connection: "keep-alive",
    },
  });
}
784
+
785
+ // ---------------------------------------------------------------------------
786
+ // Post-response processing
787
+ // ---------------------------------------------------------------------------
788
+
789
/**
 * Bookkeeping after a successful upstream response.
 *
 * In order: calibrates overhead from the real token counts, stores the latest
 * user message and the assistant reply in temporal storage, bumps the
 * session's curation turn counter, and schedules background work. Failures
 * are logged and never propagate to the caller.
 *
 * @param req - The request that was sent for this turn.
 * @param resp - The accumulated upstream response.
 * @param sessionState - State of the session the turn belongs to (mutated).
 * @param config - Gateway config, passed on to background scheduling.
 */
function postResponse(
  req: GatewayRequest,
  resp: GatewayResponse,
  sessionState: SessionState,
  config: GatewayConfig,
): void {
  const { sessionID, projectPath } = sessionState;

  try {
    // --- Calibration: input tokens include cache reads and cache writes ---
    const { inputTokens, cacheReadInputTokens, cacheCreationInputTokens } = resp.usage;
    const observedInput =
      (inputTokens ?? 0) +
      (cacheReadInputTokens ?? 0) +
      (cacheCreationInputTokens ?? 0);
    calibrate(observedInput, sessionID, getLastTransformedCount(sessionID));

    // --- Temporal storage ---
    const turnMessages = gatewayMessagesToLore(req.messages, sessionID);
    resolveToolResults(turnMessages);

    // Only the most recent user message is stored: scan back from the end
    let cursor = turnMessages.length - 1;
    while (cursor >= 0 && turnMessages[cursor].info.role !== "user") {
      cursor--;
    }
    if (cursor >= 0) {
      const { info, parts } = turnMessages[cursor];
      temporal.store({ projectPath, info, parts });
    }

    // The assistant reply is converted the same way, then stamped with usage
    const [assistantMsg] = gatewayMessagesToLore(
      [{ role: "assistant", content: resp.content }],
      sessionID,
    );
    updateAssistantMessageTokens(assistantMsg, resp.usage, resp.model);
    temporal.store({
      projectPath,
      info: assistantMsg.info,
      parts: assistantMsg.parts,
    });

    sessionState.turnsSinceCuration =
      (sessionState.turnsSinceCuration ?? 0) + 1;

    // --- Background work (fire-and-forget) ---
    scheduleBackgroundWork(sessionState, config);
  } catch (e) {
    log.error("post-response processing failed:", e);
  }
}
853
+
854
/**
 * Kick off background distillation and curation for a session (fire-and-forget).
 *
 * Distillation: when the gradient has flagged urgent distillation, a forced,
 * urgent run is started (urgent runs bypass the batch queue because the
 * result is needed before the next user turn). Otherwise an incremental run
 * is started once the undistilled backlog reaches `distillation.maxSegment`.
 * At most one of the two is started per call: a forced run already covers the
 * backlog, so starting both would only duplicate work on the same session.
 *
 * Curation: started when knowledge and `curator.onIdle` are enabled and the
 * session has reached `curator.afterTurns` turns since the last curation. The
 * turn counter is reset when the run is launched, not when it finishes, so
 * turns arriving while curation is in flight do not each launch another run.
 * On failure the counter is restored so the next turn retries.
 * NOTE(review): assumes `curator.run` / `distillation.run` do not de-duplicate
 * concurrent runs themselves — confirm in @loreai/core.
 *
 * Rejections from the background runs are logged, never thrown.
 *
 * @param sessionState - Session to schedule work for (turn counter is mutated).
 * @param config - Gateway config used to obtain the worker LLM client.
 */
function scheduleBackgroundWork(
  sessionState: SessionState,
  config: GatewayConfig,
): void {
  const { sessionID, projectPath } = sessionState;
  const llm = getLLMClient(config);
  const cfg = loreConfig();
  const model = getWorkerModel();

  const reportDistillationFailure = (e: unknown): void =>
    log.error("background distillation failed:", e);

  if (needsUrgentDistillation()) {
    // The gradient is in overflow: force the run and skip the batch queue
    distillation
      .run({
        llm,
        projectPath,
        sessionID,
        model,
        force: true,
        urgent: true,
      })
      .catch(reportDistillationFailure);
  } else {
    // Incremental distillation once the backlog reaches maxSegment
    const pending = temporal.undistilledCount(projectPath, sessionID);
    if (pending >= cfg.distillation.maxSegment) {
      log.info(
        `incremental distillation: ${pending} undistilled messages in ${sessionID.slice(0, 16)}`,
      );
      distillation
        .run({ llm, projectPath, sessionID, model })
        .catch(reportDistillationFailure);
    }
  }

  // Curation: run periodically when the knowledge system is enabled
  const turnsAtLaunch = sessionState.turnsSinceCuration;
  if (
    cfg.knowledge.enabled &&
    cfg.curator.onIdle &&
    turnsAtLaunch >= cfg.curator.afterTurns
  ) {
    const curation = curator.run({ llm, projectPath, sessionID, model });
    // Reset only after the call returned a promise, so a synchronous throw
    // leaves the counter untouched
    sessionState.turnsSinceCuration = 0;
    curation
      .then(() => {
        // Curation changed knowledge entries: the cached LTM text is stale
        ltmSessionCache.delete(sessionID);
      })
      .catch((e) => {
        // Give the turns back so the next turn retries
        sessionState.turnsSinceCuration =
          (sessionState.turnsSinceCuration ?? 0) + turnsAtLaunch;
        log.error("background curation failed:", e);
      });
  }
}
909
+
910
+ // ---------------------------------------------------------------------------
911
+ // Case 1: Compaction interception
912
+ // ---------------------------------------------------------------------------
913
+
914
/**
 * Answer a compaction request locally instead of forwarding the client's
 * request upstream.
 *
 * Flow: resolve project/session → force an urgent distillation of the
 * session → load the resulting distillation chunks → assemble a compact
 * prompt (optionally with project knowledge and the previous summary) →
 * ask the LLM client for a summary → wrap it in a gateway response that is
 * streamed or returned as JSON depending on `req.stream`.
 *
 * @param req    The incoming gateway request identified as a compaction request.
 * @param config Gateway configuration used for init and for the LLM client.
 * @returns The HTTP response carrying the generated summary (or a fixed
 *          fallback text when the LLM returned `null`/`undefined`).
 */
async function handleCompaction(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  // Resolve the project (preferring the cached path) and initialise for it.
  const projectPath =
    cachedProjectPath ?? getProjectPath(req.system, req.rawHeaders);
  await initIfNeeded(projectPath, config);

  const { sessionID } = await identifySession(req, projectPath);
  // The returned state is not needed here; the call is kept for whatever
  // bookkeeping getOrCreateSession performs.
  getOrCreateSession(sessionID, projectPath);
  const llm = getLLMClient(config);

  log.info(`compaction intercepted for session ${sessionID.slice(0, 16)}`);

  // 1. Force-distill everything still undistilled. Urgent, because the
  //    client is blocked waiting for the compaction response.
  const workerModel = getWorkerModel();
  await distillation.run({
    llm,
    projectPath,
    sessionID,
    model: workerModel,
    force: true,
    urgent: true,
  });

  // 2. Distillation chunks for this session.
  const chunks = distillation.loadForSession(projectPath, sessionID);

  // 3. Summary carried over from an earlier compaction, if the request has one.
  const previousSummary = extractPreviousSummary(req);

  // 4. Knowledge block (empty string when disabled or there are no entries).
  const cfg = loreConfig();
  const entries = cfg.knowledge.enabled
    ? ltm.forProject(projectPath, cfg.crossProject)
    : [];
  const knowledge = entries.length
    ? formatKnowledge(
        entries.map(({ category, title, content }) => ({
          category,
          title,
          content,
        })),
      )
    : "";

  // 5. Instruction prompt for the summariser.
  const compactPrompt = buildCompactPrompt({
    hasDistillations: chunks.length > 0,
    knowledge,
    previousSummary,
  });

  // 6. Render the pre-computed chunks as a markdown context section.
  let context = "";
  if (chunks.length > 0) {
    const heading =
      `## Lore Pre-computed Session Summaries\n\n` +
      `The following ${chunks.length} summary chunk(s) were pre-computed ` +
      `from the conversation history. Use these as the authoritative source.\n\n`;
    const renderedChunks = chunks.map((chunk, index) => {
      const suffix = chunk.generation > 0 ? " (consolidated)" : "";
      return `### Chunk ${index + 1}${suffix}\n${chunk.observations}`;
    });
    context = heading + renderedChunks.join("\n\n");
  }

  // 7. Ask the LLM for the summary. The compact prompt is passed as the
  //    first argument and is also appended to the user content.
  const userContent = context
    ? `${context}\n\n---\n\n${compactPrompt}`
    : compactPrompt;

  const summaryText = await llm.prompt(compactPrompt, userContent, {
    model: cfg.model,
    workerID: "lore-compact",
    urgent: true, // Client is blocking on this response
  });

  const summary = summaryText ?? "(Compaction failed — no summary generated.)";

  // 8. Wrap and return in the shape the client asked for.
  const resp = buildCompactionResponse(sessionID, summary, req.model);
  return req.stream ? streamHttpResponse(resp) : nonStreamHttpResponse(resp);
}
1004
+
1005
+ // ---------------------------------------------------------------------------
1006
+ // Case 2: Title/summary passthrough
1007
+ // ---------------------------------------------------------------------------
1008
+
1009
/**
 * Forward a title/summary request upstream and relay the answer untouched.
 *
 * Streaming requests with a response body are piped through as-is, keeping
 * the upstream content type (defaulting to `text/event-stream`). Everything
 * else is read fully as text and returned with an `application/json`
 * content type. The upstream status code is preserved in both cases.
 *
 * @param req    The gateway request to forward.
 * @param config Gateway configuration passed on to `forwardToUpstream`.
 * @returns A new `Response` mirroring the upstream status and body.
 */
async function handlePassthrough(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  const upstream = await forwardToUpstream(req, config);

  // Non-streaming (or a bodiless reply): buffer the text and hand it back.
  if (!req.stream || !upstream.body) {
    const payload = await upstream.text();
    return new Response(payload, {
      status: upstream.status,
      headers: {
        "content-type": "application/json",
      },
    });
  }

  // Streaming: pipe the upstream body through unchanged.
  const contentType =
    upstream.headers.get("content-type") ?? "text/event-stream";
  return new Response(upstream.body, {
    status: upstream.status,
    headers: {
      "content-type": contentType,
    },
  });
}
1036
+
1037
+ // ---------------------------------------------------------------------------
1038
+ // Case 3: Normal conversation turn — full pipeline
1039
+ // ---------------------------------------------------------------------------
1040
+
1041
/**
 * Run a normal conversation turn through the full pipeline.
 *
 * Steps, in order: project init → auth capture → session identification →
 * pending-recall injection → model limits / layer-0 cap → idle-resume check →
 * LTM injection into the system prompt → gradient transform of the messages →
 * recall-tool injection → upstream forwarding → response handling
 * (streaming, plain non-streaming, or non-streaming with recall interception).
 *
 * The order of the calls below is significant: several of them update
 * module-level or session state that later steps read.
 *
 * @param req    The incoming gateway request (may be mutated by pending-recall injection).
 * @param config Gateway configuration.
 * @returns The HTTP response to send back to the client.
 */
async function handleConversationTurn(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  // --- 1. Project path & init ---
  const projectPath = getProjectPath(req.system, req.rawHeaders);
  await initIfNeeded(projectPath, config);

  // --- 2. Capture auth credentials for background workers ---
  const cred = extractAuth(req.rawHeaders);
  if (cred) setLastSeenAuth(cred);

  // --- 3. Session identification ---
  const { sessionID, isNew } = await identifySession(req, projectPath);
  const sessionState = getOrCreateSession(sessionID, projectPath);
  const sessionTag = sessionID.slice(0, 16); // short form used in log lines

  // Bind the credential to this session so background workers can use it.
  if (cred) setSessionAuth(sessionID, cred);

  // New sessions get a fingerprint for future correlation.
  if (isNew) {
    const turns = req.messages.map(({ role, content }) => ({ role, content }));
    sessionState.fingerprint = await fingerprintMessages(turns, {
      model: req.model,
      authSuffix: cred ? authFingerprint(cred) : "",
    });
  }

  // Always refresh the message count (used for proximity matching).
  sessionState.messageCount = req.messages.length;

  // Remember the session model for worker model discovery.
  lastSeenSessionModel = req.model;

  // --- Inject pending recall from previous turn (mixed-tools case) ---
  const pendingRecall = sessionState.pendingRecall;
  if (pendingRecall) {
    if (!isPendingRecallValid(pendingRecall)) {
      log.warn(`discarding expired pending recall for session ${sessionTag}`);
    } else if (injectPendingRecall(req, pendingRecall)) {
      log.info(
        `injected pending recall result into request for session ${sessionTag}`,
      );
    } else {
      log.warn(
        `failed to inject pending recall — conversation structure mismatch`,
      );
    }
    // One-shot: cleared whether or not the injection worked.
    sessionState.pendingRecall = undefined;
  }

  log.info(
    `turn: session=${sessionTag} messages=${req.messages.length} ` +
      `model=${req.model} stream=${req.stream} new=${isNew}`,
  );

  // --- 4. Set model limits ---
  const modelSpec = getModelSpec(req.model);
  setModelLimits({ context: modelSpec.context, output: modelSpec.output });

  // Layer-0 cap: an explicit config value wins; otherwise derive it from the
  // cache-read cost when both the cost and a positive target are available.
  // When neither applies, no cap is set by this call.
  const cfg = loreConfig();
  if (cfg.budget.maxLayer0Tokens !== undefined) {
    setMaxLayer0Tokens(cfg.budget.maxLayer0Tokens);
  } else if (
    modelSpec.cacheReadCost &&
    cfg.budget.targetCacheReadCostPerTurn > 0
  ) {
    const derivedCap = computeLayer0Cap(
      cfg.budget.targetCacheReadCostPerTurn,
      modelSpec.cacheReadCost,
    );
    setMaxLayer0Tokens(derivedCap);
  }

  // --- 5. Cold-cache idle-resume ---
  const idleThresholdMs = cfg.idleResumeMinutes * 60_000;
  const idle = onIdleResume(sessionID, idleThresholdMs);
  if (idle.triggered) {
    ltmSessionCache.delete(sessionID);
    log.info(
      `session idle ${Math.round(idle.idleMs / 60_000)}min — refreshing caches`,
    );
  }

  // --- 6. LTM injection into system prompt ---
  let modifiedSystem = req.system;
  if (!cfg.knowledge.enabled) {
    setLtmTokens(0, sessionID);
    consumeCameOutOfIdle(sessionID);
  } else {
    try {
      let knowledgeBlock = ltmSessionCache.get(sessionID);

      // Cache miss: select entries within the LTM budget and format them.
      if (!knowledgeBlock) {
        const ltmBudget = getLtmBudget(cfg.budget.ltm);
        const entries = ltm.forSession(projectPath, sessionID, ltmBudget);
        if (entries.length) {
          const formatted = formatKnowledge(
            entries.map(({ category, title, content }) => ({
              category,
              title,
              content,
            })),
            ltmBudget,
          );

          if (formatted) {
            // Token count is estimated as one token per three characters.
            knowledgeBlock = {
              formatted,
              tokenCount: Math.ceil(formatted.length / 3),
            };
            ltmSessionCache.set(sessionID, knowledgeBlock);
          }
        }
      }

      if (knowledgeBlock) {
        setLtmTokens(knowledgeBlock.tokenCount, sessionID);
        modifiedSystem = `${req.system}\n\n${knowledgeBlock.formatted}`;
      } else {
        setLtmTokens(0, sessionID);
      }
    } catch (e) {
      // Best-effort: a failure here must not block the turn.
      log.error("LTM injection failed:", e);
      setLtmTokens(0, sessionID);
    } finally {
      consumeCameOutOfIdle(sessionID);
    }
  }

  // First-run greeting
  if (isFirstRun()) {
    modifiedSystem +=
      "\n\n[Lore plugin] This is the first time Lore has been activated. " +
      "Briefly let the user know that Lore is now active and their " +
      "coding agent will get progressively smarter on this codebase " +
      "over time as knowledge accumulates across sessions.";
  }

  // Lore knowledge file commit reminder
  if (cfg.knowledge.enabled) {
    const trackedFiles = cfg.agentsFile.enabled
      ? [".lore.md", cfg.agentsFile.path]
      : [".lore.md"];
    modifiedSystem +=
      `\n\nWhen making git commits, always check if ${trackedFiles.join(" and ")} ` +
      `have unstaged changes and include them in the commit. These files contain ` +
      `shared project knowledge managed by lore and must be version-controlled.`;
  }

  // --- 7. Gradient transform on messages ---
  const loreMessages = gatewayMessagesToLore(req.messages, sessionID);
  resolveToolResults(loreMessages);

  const transformed = transform({
    messages: loreMessages,
    projectPath,
    sessionID,
  });

  // Drop trailing non-user messages that carry no tool parts, to prevent
  // prefill errors. Stops at the first user message or tool-bearing message.
  for (;;) {
    const tail = transformed.messages.at(-1);
    if (!tail || tail.info.role === "user") break;
    if (tail.parts.some((p) => p.type === "tool")) break;
    transformed.messages.pop();
  }

  // --- 8. Build the modified request ---
  // Convert back to gateway messages; removeOrphanedToolResults is a safety
  // net for tool_result blocks left without a matching tool_use.
  const transformedMessages = loreMessagesToGateway(transformed.messages);
  removeOrphanedToolResults(transformedMessages);

  const modifiedReq: GatewayRequest = {
    ...req,
    system: modifiedSystem,
    messages: transformedMessages,
  };

  // --- 8b. Inject recall tool ---
  // Only when the request already carries tools and the client does not
  // bring its own recall tool.
  const clientTools = modifiedReq.tools;
  if (clientTools.length > 0 && !clientHasRecallTool(clientTools)) {
    modifiedReq.tools = [...clientTools, RECALL_GATEWAY_TOOL];
  }

  // --- 9. Forward to upstream ---
  // Prompt caching for conversation turns: 5m TTL on the system prompt and
  // caching of the conversation prefix.
  const cacheOptions: AnthropicCacheOptions = {
    systemTTL: "5m",
    cacheConversation: true,
  };
  const upstreamResponse = await forwardToUpstream(
    modifiedReq,
    config,
    undefined,
    cacheOptions,
  );

  // Upstream failure: log a truncated body and relay it with the same status.
  if (!upstreamResponse.ok) {
    const errorBody = await upstreamResponse.text();
    log.error(
      `upstream error: ${upstreamResponse.status} ${errorBody.slice(0, 500)}`,
    );
    return new Response(errorBody, {
      status: upstreamResponse.status,
      headers: { "content-type": "application/json" },
    });
  }

  if (req.stream && upstreamResponse.body) {
    // Streaming: forward events and accumulate in parallel. The recall
    // context is passed only when the forwarded request offers a recall tool.
    const recallOffered = modifiedReq.tools.some(
      (tool) => tool.name === RECALL_TOOL_NAME,
    );
    const recallContext = recallOffered
      ? { modifiedReq, config, sessionState, cacheOptions }
      : undefined;
    return buildStreamingResponse(
      upstreamResponse,
      (finalResp) => postResponse(req, finalResp, sessionState, config),
      recallContext,
    );
  }

  // Non-streaming
  const resp = await accumulateNonStreamResponse(upstreamResponse);

  // No recall tool_use in the response: finish the turn normally.
  if (!hasRecallToolUse(resp)) {
    postResponse(req, resp, sessionState, config);
    return nonStreamHttpResponse(resp);
  }

  // --- Recall interception (non-streaming) ---
  const recallBlock = findRecallToolUse(resp)!;
  const { result: recallResult, input: recallInput } = await executeRecall(
    recallBlock,
    sessionState.projectPath,
    sessionState.sessionID,
  );

  if (hasOtherToolUse(resp)) {
    // Recall + other tools: stash the recall result for the next turn and
    // hand the client the response with the recall block stripped.
    const position = resp.content.indexOf(recallBlock);
    sessionState.pendingRecall = {
      toolUseId: recallBlock.id,
      input: recallInput,
      position,
      result: recallResult,
      timestamp: Date.now(),
    };
    log.info(
      `recall (non-stream, mixed): stored pending result for session ${sessionState.sessionID.slice(0, 16)}`,
    );
    const strippedResp = stripRecallFromResponse(resp);
    postResponse(req, strippedResp, sessionState, config);
    return nonStreamHttpResponse(strippedResp);
  }

  // Recall only: feed the result back upstream in a follow-up request.
  log.info(
    `recall (non-stream, only): executing follow-up for session ${sessionState.sessionID.slice(0, 16)}`,
  );
  const followUp = buildRecallFollowUp(
    modifiedReq,
    resp,
    recallResult,
    recallBlock,
  );
  const followUpResponse = await forwardToUpstream(
    followUp,
    config,
    undefined,
    cacheOptions,
  );

  if (!followUpResponse.ok) {
    const errorBody = await followUpResponse.text();
    log.error(
      `recall follow-up upstream error: ${followUpResponse.status} ${errorBody.slice(0, 500)}`,
    );
    // Fall back to the original response without the recall block.
    const strippedResp = stripRecallFromResponse(resp);
    postResponse(req, strippedResp, sessionState, config);
    return nonStreamHttpResponse(strippedResp);
  }

  const continuationResp = await accumulateNonStreamResponse(followUpResponse);

  // Report combined usage for both upstream requests.
  const firstUsage = resp.usage;
  const totalUsage = continuationResp.usage;
  totalUsage.inputTokens += firstUsage.inputTokens;
  totalUsage.outputTokens += firstUsage.outputTokens;
  if (firstUsage.cacheReadInputTokens) {
    totalUsage.cacheReadInputTokens =
      (totalUsage.cacheReadInputTokens ?? 0) + firstUsage.cacheReadInputTokens;
  }
  if (firstUsage.cacheCreationInputTokens) {
    totalUsage.cacheCreationInputTokens =
      (totalUsage.cacheCreationInputTokens ?? 0) +
      firstUsage.cacheCreationInputTokens;
  }

  postResponse(req, continuationResp, sessionState, config);
  return nonStreamHttpResponse(continuationResp);
}
1360
+
1361
+ // ---------------------------------------------------------------------------
1362
+ // Lore message → Gateway message conversion
1363
+ // ---------------------------------------------------------------------------
1364
+
1365
/**
 * Convert transformed Lore messages back to gateway message format.
 *
 * This reverses `gatewayMessagesToLore` after gradient transform has
 * potentially trimmed/reordered messages.
 *
 * Completed/error tool parts on assistant messages produce BOTH a `tool_use`
 * block on the assistant AND a corresponding `tool_result` block injected at
 * the start of the following user message. This makes the conversion
 * self-contained: tool pairing is reconstructed from whatever messages
 * survived gradient eviction, without depending on cross-message `tool_result`
 * parts that can become orphaned when the assistant message is evicted.
 *
 * Residual `tool: "result"` parts (normally already stripped before this
 * function runs) are still converted for robustness. A residual part in
 * `error` state is emitted with `isError: true`, matching how reconstructed
 * results are emitted; its content is the part's `output` when present,
 * otherwise its `error` text, otherwise `"[error]"`.
 *
 * @param messages Lore messages in conversation order.
 * @returns One gateway message per input message, in the same order.
 *
 * @internal Exported for tests.
 */
export function loreMessagesToGateway(
  messages: LoreMessageWithParts[],
): Array<{ role: "user" | "assistant"; content: GatewayContentBlock[] }> {
  const out: Array<{
    role: "user" | "assistant";
    content: GatewayContentBlock[];
  }> = [];

  // tool_result blocks reconstructed from the preceding assistant message's
  // completed/error tool parts. Injected at the start of the next user message.
  let pendingToolResults: GatewayContentBlock[] = [];

  for (const msg of messages) {
    const content: GatewayContentBlock[] = [];

    // A user message receives the results queued by the assistant before it.
    // Any other role discards them (back-to-back assistant messages).
    if (msg.info.role === "user") {
      content.push(...pendingToolResults);
    }
    pendingToolResults = [];

    for (const part of msg.parts) {
      switch (part.type) {
        case "text":
          content.push({
            type: "text",
            text: (part as { text: string }).text,
          });
          break;
        case "reasoning":
          content.push({
            type: "thinking",
            thinking: (part as { text: string }).text ?? "",
          });
          break;
        case "tool": {
          const toolPart = part as {
            type: "tool";
            tool: string;
            callID: string;
            state: {
              status: string;
              input?: unknown;
              output?: string;
              error?: string;
            };
          };
          const { callID, state } = toolPart;
          const failed = state.status === "error";

          if (toolPart.tool === "result") {
            // Residual tool_result part. Keep the error flag so a failed
            // call is not presented as an empty successful result.
            if (failed) {
              content.push({
                type: "tool_result",
                toolUseId: callID,
                content: state.output ?? state.error ?? "[error]",
                isError: true,
              });
            } else {
              content.push({
                type: "tool_result",
                toolUseId: callID,
                content: state.output ?? "",
              });
            }
          } else {
            // Emit tool_use on this message.
            content.push({
              type: "tool_use",
              id: callID,
              name: toolPart.tool,
              input: state.input ?? {},
            });
            // Completed/error parts also queue a tool_result for the next
            // user message. Parts in any other state emit tool_use only.
            if (state.status === "completed") {
              pendingToolResults.push({
                type: "tool_result",
                toolUseId: callID,
                content: state.output ?? "",
              });
            } else if (failed) {
              pendingToolResults.push({
                type: "tool_result",
                toolUseId: callID,
                content: state.error ?? "[error]",
                isError: true,
              });
            }
          }
          break;
        }
        // Unknown part types: keep them only if they carry string text.
        default:
          if ("text" in part && typeof part.text === "string") {
            content.push({ type: "text", text: part.text });
          }
          break;
      }
    }

    out.push({ role: msg.info.role as "user" | "assistant", content });
  }

  return out;
}
1487
+
1488
+ // ---------------------------------------------------------------------------
1489
+ // Post-conversion validation: remove orphaned tool_result blocks
1490
+ // ---------------------------------------------------------------------------
1491
+
1492
/**
 * Safety net run after conversion: every `tool_result` block on a user
 * message must reference a `tool_use` block on the message directly before
 * it, and that message must be an assistant message. Blocks that do not are
 * dropped and a warning is logged with the number removed.
 *
 * Only user messages containing at least one `tool_result` are touched.
 * If filtering leaves such a message with no content at all, a placeholder
 * text block is inserted so the content array is never empty.
 *
 * @param messages Gateway messages in conversation order; mutated in place.
 *
 * @internal Exported for tests.
 */
export function removeOrphanedToolResults(
  messages: Array<{
    role: "user" | "assistant";
    content: GatewayContentBlock[];
  }>,
): void {
  messages.forEach((message, index) => {
    if (message.role !== "user") return;

    const carriesResults = message.content.some(
      (block) => block.type === "tool_result",
    );
    if (!carriesResults) return;

    // IDs of the tool calls made by the directly preceding assistant message
    // (empty when there is no such message).
    const knownIds = new Set<GatewayToolUseBlock["id"]>();
    const previous = index > 0 ? messages[index - 1]! : null;
    if (previous !== null && previous.role === "assistant") {
      for (const block of previous.content) {
        if (block.type === "tool_use") {
          knownIds.add((block as GatewayToolUseBlock).id);
        }
      }
    }

    // Keep every non-result block, and results whose call ID is known.
    const kept = message.content.filter((block) => {
      if (block.type !== "tool_result") return true;
      return knownIds.has((block as GatewayToolResultBlock).toolUseId);
    });
    const removedCount = message.content.length - kept.length;
    message.content = kept;

    if (removedCount > 0) {
      log.warn(
        `removed ${removedCount} orphaned tool_result block(s) from message ${index}`,
      );
    }

    // Never leave an empty content array behind.
    if (message.content.length === 0) {
      message.content = [{ type: "text", text: "[tool results provided]" }];
    }
  });
}
1544
+
1545
+ // ---------------------------------------------------------------------------
1546
+ // Error response builder
1547
+ // ---------------------------------------------------------------------------
1548
+
1549
/**
 * Build a JSON error response.
 *
 * The body has the shape
 * `{ type: "error", error: { type: "server_error", message } }`.
 *
 * @param status  HTTP status code for the response.
 * @param message Human-readable error message placed in the body.
 * @returns A `Response` with an `application/json` content type.
 */
function errorResponse(status: number, message: string): Response {
  const payload = {
    type: "error",
    error: {
      type: "server_error",
      message,
    },
  };
  const headers = { "content-type": "application/json" };
  return new Response(JSON.stringify(payload), { status, headers });
}
1564
+
1565
+ // ---------------------------------------------------------------------------
1566
+ // Main entry point
1567
+ // ---------------------------------------------------------------------------
1568
+
1569
/**
 * Entry point: route an incoming gateway request to the right handler.
 *
 * Compaction requests are intercepted, title/summary requests are passed
 * through, and everything else runs the full conversation-turn pipeline.
 * Any error thrown by a handler is logged and converted into a 502 JSON
 * error response, so this function itself does not reject for handler errors.
 *
 * @param req    The incoming gateway request.
 * @param config Gateway configuration.
 * @returns A standard `Response` — streaming or JSON depending on the
 *          handler and the client's `stream` setting.
 */
export async function handleRequest(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  try {
    // Record the caller's credential up front so background workers can use
    // it regardless of which handler runs.
    const earlyAuth = extractAuth(req.rawHeaders);
    if (earlyAuth) setLastSeenAuth(earlyAuth);

    // Pick the handler: compaction is checked first, then title/summary,
    // and a normal conversation turn is the fallback.
    const handler = isCompactionRequest(req)
      ? handleCompaction
      : isTitleOrSummaryRequest(req)
        ? handlePassthrough
        : handleConversationTurn;

    // Awaited inside the try block so handler rejections are caught below.
    return await handler(req, config);
  } catch (err) {
    log.error("pipeline error:", err);
    const message =
      err instanceof Error ? err.message : "Unknown gateway error";
    return errorResponse(502, message);
  }
}