@loreai/gateway 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +3548 -0
- package/dist/index.js.map +7 -0
- package/package.json +53 -0
- package/src/auth.ts +133 -0
- package/src/batch-queue.ts +555 -0
- package/src/compaction.ts +195 -0
- package/src/config.ts +199 -0
- package/src/idle.ts +246 -0
- package/src/index.ts +41 -0
- package/src/llm-adapter.ts +110 -0
- package/src/pipeline.ts +1604 -0
- package/src/recall.ts +301 -0
- package/src/recorder.ts +192 -0
- package/src/server.ts +250 -0
- package/src/session.ts +207 -0
- package/src/stream/anthropic.ts +708 -0
- package/src/temporal-adapter.ts +307 -0
- package/src/translate/anthropic.ts +425 -0
- package/src/translate/openai.ts +536 -0
- package/src/translate/types.ts +177 -0
- package/src/worker-model.ts +408 -0
package/src/pipeline.ts
ADDED
|
@@ -0,0 +1,1604 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core request processing pipeline for the Lore gateway.
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates the full flow for every request:
|
|
5
|
+
* session identification → LTM injection → gradient transform →
|
|
6
|
+
* upstream forwarding → response accumulation → calibration →
|
|
7
|
+
* temporal storage → background work scheduling.
|
|
8
|
+
*
|
|
9
|
+
* Three request classes are handled:
|
|
10
|
+
* 1. Compaction requests → intercepted, never forwarded upstream.
|
|
11
|
+
* 2. Title/summary requests → forwarded transparently, no Lore processing.
|
|
12
|
+
* 3. Normal conversation turns → full pipeline.
|
|
13
|
+
*/
|
|
14
|
+
import type { LoreMessageWithParts, LLMClient } from "@loreai/core";
|
|
15
|
+
import {
|
|
16
|
+
load,
|
|
17
|
+
config as loreConfig,
|
|
18
|
+
ensureProject,
|
|
19
|
+
isFirstRun,
|
|
20
|
+
temporal,
|
|
21
|
+
ltm,
|
|
22
|
+
distillation,
|
|
23
|
+
curator,
|
|
24
|
+
log,
|
|
25
|
+
transform,
|
|
26
|
+
setModelLimits,
|
|
27
|
+
setLtmTokens,
|
|
28
|
+
getLtmBudget,
|
|
29
|
+
setMaxLayer0Tokens,
|
|
30
|
+
computeLayer0Cap,
|
|
31
|
+
calibrate,
|
|
32
|
+
getLastTransformedCount,
|
|
33
|
+
onIdleResume,
|
|
34
|
+
consumeCameOutOfIdle,
|
|
35
|
+
needsUrgentDistillation,
|
|
36
|
+
formatKnowledge,
|
|
37
|
+
buildCompactPrompt,
|
|
38
|
+
} from "@loreai/core";
|
|
39
|
+
|
|
40
|
+
import type {
|
|
41
|
+
GatewayRequest,
|
|
42
|
+
GatewayResponse,
|
|
43
|
+
GatewayContentBlock,
|
|
44
|
+
GatewayToolUseBlock,
|
|
45
|
+
GatewayToolResultBlock,
|
|
46
|
+
SessionState,
|
|
47
|
+
} from "./translate/types";
|
|
48
|
+
import type { GatewayConfig } from "./config";
|
|
49
|
+
import { getProjectPath, resolveUpstreamRoute } from "./config";
|
|
50
|
+
import {
|
|
51
|
+
generateSessionID,
|
|
52
|
+
fingerprintMessages,
|
|
53
|
+
MESSAGE_COUNT_PROXIMITY_THRESHOLD,
|
|
54
|
+
} from "./session";
|
|
55
|
+
import {
|
|
56
|
+
isCompactionRequest,
|
|
57
|
+
isTitleOrSummaryRequest,
|
|
58
|
+
extractPreviousSummary,
|
|
59
|
+
buildCompactionResponse,
|
|
60
|
+
} from "./compaction";
|
|
61
|
+
import {
|
|
62
|
+
buildAnthropicRequest,
|
|
63
|
+
buildAnthropicNonStreamResponse,
|
|
64
|
+
type AnthropicCacheOptions,
|
|
65
|
+
} from "./translate/anthropic";
|
|
66
|
+
import {
|
|
67
|
+
buildOpenAIUpstreamRequest,
|
|
68
|
+
buildOpenAIResponse,
|
|
69
|
+
} from "./translate/openai";
|
|
70
|
+
import {
|
|
71
|
+
createStreamAccumulator,
|
|
72
|
+
createRecallAwareAccumulator,
|
|
73
|
+
parseSSEStream,
|
|
74
|
+
buildSSETextResponse,
|
|
75
|
+
formatSSEEvent,
|
|
76
|
+
type StreamAccumulator,
|
|
77
|
+
} from "./stream/anthropic";
|
|
78
|
+
import {
|
|
79
|
+
gatewayMessagesToLore,
|
|
80
|
+
updateAssistantMessageTokens,
|
|
81
|
+
resolveToolResults,
|
|
82
|
+
} from "./temporal-adapter";
|
|
83
|
+
import { createGatewayLLMClient } from "./llm-adapter";
|
|
84
|
+
import { createBatchLLMClient } from "./batch-queue";
|
|
85
|
+
import {
|
|
86
|
+
extractAuth,
|
|
87
|
+
authFingerprint,
|
|
88
|
+
setLastSeenAuth,
|
|
89
|
+
setSessionAuth,
|
|
90
|
+
resolveAuth,
|
|
91
|
+
} from "./auth";
|
|
92
|
+
import type { UpstreamInterceptor } from "./recorder";
|
|
93
|
+
import { startIdleScheduler, buildIdleWorkHandler } from "./idle";
|
|
94
|
+
import { getWorkerModel, resetWorkerModelState } from "./worker-model";
|
|
95
|
+
import {
|
|
96
|
+
RECALL_GATEWAY_TOOL,
|
|
97
|
+
RECALL_TOOL_NAME,
|
|
98
|
+
executeRecall,
|
|
99
|
+
findRecallToolUse,
|
|
100
|
+
hasRecallToolUse,
|
|
101
|
+
hasOtherToolUse,
|
|
102
|
+
clientHasRecallTool,
|
|
103
|
+
isPendingRecallValid,
|
|
104
|
+
injectPendingRecall,
|
|
105
|
+
buildRecallFollowUp,
|
|
106
|
+
stripRecallFromResponse,
|
|
107
|
+
} from "./recall";
|
|
108
|
+
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
// Module state
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
/** One-time initialization flag. */
|
|
114
|
+
let initialized = false;
|
|
115
|
+
|
|
116
|
+
/** Active upstream interceptor — used for recording/replay. */
|
|
117
|
+
let activeInterceptor: UpstreamInterceptor | undefined;
|
|
118
|
+
|
|
119
|
+
/**
 * Install, replace, or remove the interceptor shared by the whole module.
 *
 * `forwardToUpstream` falls back to this interceptor whenever a call does not
 * pass one of its own, handing it the request body plus a thunk that performs
 * the real `fetch`. Recording and replay scripts use this to capture or
 * substitute upstream traffic without touching individual call sites.
 *
 * @param interceptor - The interceptor to activate, or `undefined` to clear it.
 */
export function setUpstreamInterceptor(
  interceptor: UpstreamInterceptor | undefined,
): void {
  activeInterceptor = interceptor;
}
|
|
132
|
+
|
|
133
|
+
/**
 * Reset all module-level singleton state.
 *
 * Intended for test harnesses only — allows multiple independent gateway
 * instances to run sequentially in the same Bun process without leaking
 * session state, initialization flags, or cached project paths across test
 * suites.
 *
 * The cached LLM client is shut down (when it exposes `shutdown`) before it
 * is dropped. If that shutdown rejects, the rejection still propagates to the
 * caller, but the remaining state is cleared first so a failed shutdown can
 * never leave a stale client, interceptor, or idle timer behind.
 *
 * @returns A promise that settles once shutdown has finished and all state
 *          has been cleared.
 */
export async function resetPipelineState(): Promise<void> {
  initialized = false;
  cachedProjectPath = null;
  sessions.clear();
  ltmSessionCache.clear();

  try {
    // Shut down batch queue gracefully before clearing the client
    if (llmClient && "shutdown" in llmClient) {
      await (llmClient as LLMClient & { shutdown: () => Promise<void> }).shutdown();
    }
  } finally {
    // Runs even when shutdown() rejects, so the next suite starts clean.
    llmClient = null;
    activeInterceptor = undefined;
    if (stopIdleScheduler) {
      stopIdleScheduler();
      stopIdleScheduler = null;
    }
    lastSeenSessionModel = null;
    resetWorkerModelState();
  }
}
|
|
159
|
+
|
|
160
|
+
/** Project path remembered from the first request that carried a system prompt. */
let cachedProjectPath: string | null = null;

/** Session state carried across requests, keyed by session ID. */
const sessions = new Map<string, SessionState>();

/** Formatted LTM text together with its token count. */
type LtmCacheEntry = { formatted: string; tokenCount: number };

/**
 * Per-session LTM cache, kept for byte-stability.
 *
 * Without it, `ltm.forSession()` would re-score entries against the evolving
 * session context on every turn and emit different formatted text each time.
 * The system prompt would then change at byte 0, invalidating the whole
 * prompt cache on every turn.
 */
const ltmSessionCache = new Map<string, LtmCacheEntry>();

/** LLM client reused by background workers; created on first use. */
let llmClient: LLMClient | null = null;

/** Stops the idle scheduler timer; `null` while no scheduler is running. */
let stopIdleScheduler: (() => void) | null = null;

/** Most recently seen session model ID — context for worker model discovery. */
let lastSeenSessionModel: string | null = null;
|
|
186
|
+
|
|
187
|
+
// ---------------------------------------------------------------------------
|
|
188
|
+
// Model limits — hardcoded for known models, fallback for unknown
|
|
189
|
+
// ---------------------------------------------------------------------------
|
|
190
|
+
|
|
191
|
+
/** Token limits and optional cache pricing for one model family. */
type ModelSpec = {
  /** Context size in tokens. */
  context: number;
  /** Output limit in tokens. */
  output: number;
  /** Cache-read cost per token in USD (Anthropic: 10% of input price). */
  cacheReadCost?: number;
};

/**
 * Known model families, keyed by model-ID prefix.
 *
 * Pricing: https://docs.anthropic.com/en/docs/about-claude/models
 * Cache-read = input_price / 1_000_000 * 0.1 (10% of input for Anthropic)
 */
const MODEL_SPECS: Record<string, ModelSpec> = {
  "claude-opus-4": { context: 200_000, output: 32_000, cacheReadCost: 15 / 1_000_000 * 0.1 },
  "claude-sonnet-4": { context: 200_000, output: 16_000, cacheReadCost: 3 / 1_000_000 * 0.1 },
  // Claude 3.5 IDs put the version before the family name
  // ("claude-3-5-sonnet-20241022"), so they need their own prefixes.
  "claude-3-5-sonnet": { context: 200_000, output: 8_192, cacheReadCost: 3 / 1_000_000 * 0.1 },
  "claude-3-5-haiku": { context: 200_000, output: 8_192, cacheReadCost: 0.80 / 1_000_000 * 0.1 },
  // Legacy spellings, kept so existing callers/configs keep resolving.
  "claude-sonnet-3-5": { context: 200_000, output: 8_192, cacheReadCost: 3 / 1_000_000 * 0.1 },
  "claude-haiku-3-5": { context: 200_000, output: 8_192, cacheReadCost: 0.80 / 1_000_000 * 0.1 },
};

/** Limits used for any model that matches no known prefix (no pricing data). */
const DEFAULT_MODEL_SPEC: ModelSpec = { context: 200_000, output: 8_192 };

/**
 * Look up the spec for a model ID by prefix, e.g.
 * "claude-opus-4-20250514" → "claude-opus-4".
 *
 * When several prefixes match, the longest one wins, so the result does not
 * depend on the declaration order of `MODEL_SPECS`.
 *
 * @param model - Model ID as sent by the client.
 * @returns The matching spec, or `DEFAULT_MODEL_SPEC` when nothing matches.
 */
function getModelSpec(model: string): ModelSpec {
  let best: { prefix: string; spec: ModelSpec } | undefined;
  for (const [prefix, spec] of Object.entries(MODEL_SPECS)) {
    if (!model.startsWith(prefix)) continue;
    if (!best || prefix.length > best.prefix.length) {
      best = { prefix, spec };
    }
  }
  return best?.spec ?? DEFAULT_MODEL_SPEC;
}
|
|
216
|
+
|
|
217
|
+
// ---------------------------------------------------------------------------
|
|
218
|
+
// Initialization
|
|
219
|
+
// ---------------------------------------------------------------------------
|
|
220
|
+
|
|
221
|
+
/** In-flight initialization, shared by callers that arrive before it finishes. */
let pipelineInitInFlight: Promise<void> | null = null;

/**
 * One-time init: load Lore config, ensure project exists in DB, start idle scheduler.
 * Safe to call multiple times — only the first call does work.
 *
 * Callers that arrive while the first initialization is still awaiting
 * `load()` wait on that same run instead of starting a second one. If the
 * run fails, the error reaches every waiting caller and the next call tries
 * again, because `initialized` is only set on success.
 *
 * @param projectPath - Project directory to load and register.
 * @param config - Gateway config; the idle scheduler is only started when
 *                 this is provided and no scheduler is running yet.
 */
async function initIfNeeded(projectPath: string, config?: GatewayConfig): Promise<void> {
  if (initialized) return;

  if (!pipelineInitInFlight) {
    pipelineInitInFlight = (async () => {
      try {
        await runPipelineInit(projectPath, config);
      } finally {
        // Only de-duplicates concurrent callers; `initialized` is the
        // long-lived marker, so resetPipelineState() keeps working unchanged.
        pipelineInitInFlight = null;
      }
    })();
  }

  await pipelineInitInFlight;
}

/** Performs the actual initialization work for `initIfNeeded`. */
async function runPipelineInit(projectPath: string, config?: GatewayConfig): Promise<void> {
  await load(projectPath);
  ensureProject(projectPath);
  initialized = true;
  cachedProjectPath = projectPath;

  // Start the idle scheduler for background work (distillation, curation,
  // pruning, AGENTS.md export). Uses a 30s poll interval and fires for any
  // session whose lastRequestTime exceeds the idle timeout.
  if (config && !stopIdleScheduler) {
    const llm = getLLMClient(config);
    const sessionModelID = lastSeenSessionModel ?? (loreConfig().model?.modelID ?? "claude-sonnet-4-20250514");
    const idleHandler = buildIdleWorkHandler(
      projectPath,
      llm,
      config.upstreamAnthropic,
      () => resolveAuth(),
      sessionModelID,
      // onLtmInvalidated: clear the LTM session cache
      () => ltmSessionCache.clear(),
    );
    stopIdleScheduler = startIdleScheduler(config, sessions, idleHandler);
  }

  log.info(`gateway pipeline initialized: ${projectPath}`);
}
|
|
253
|
+
|
|
254
|
+
/**
 * Return the shared background-worker LLM client, building it on first use.
 *
 * The client targets `config.upstreamAnthropic` and uses the model from the
 * Lore config, falling back to `claude-sonnet-4-20250514` on Anthropic.
 * Unless `LORE_BATCH_DISABLED=1` is set, the direct client is wrapped with
 * the batch queue (50% cost savings on non-urgent worker calls).
 *
 * @param config - Gateway config supplying the Anthropic upstream base URL.
 * @returns The cached client; later calls return the same instance.
 */
function getLLMClient(config: GatewayConfig): LLMClient {
  if (llmClient) return llmClient;

  const workerModel = loreConfig().model ?? {
    providerID: "anthropic",
    modelID: "claude-sonnet-4-20250514",
  };
  const direct = createGatewayLLMClient(
    config.upstreamAnthropic,
    resolveAuth,
    workerModel,
  );

  // Batching is on by default; the env var is the only opt-out.
  const useBatchQueue = process.env.LORE_BATCH_DISABLED !== "1";
  const client: LLMClient = useBatchQueue
    ? createBatchLLMClient(direct, config.upstreamAnthropic, resolveAuth, workerModel)
    : direct;

  llmClient = client;
  return client;
}
|
|
283
|
+
|
|
284
|
+
// ---------------------------------------------------------------------------
|
|
285
|
+
// Session management helpers
|
|
286
|
+
// ---------------------------------------------------------------------------
|
|
287
|
+
|
|
288
|
+
/**
 * Fetch the state for `sessionID`, registering a fresh entry when none exists.
 *
 * Every call refreshes `lastRequestTime`. A stored pending recall that
 * `isPendingRecallValid` rejects is dropped here (lazy cleanup) with a
 * warning.
 *
 * @param sessionID - Session identifier used as the map key.
 * @param projectPath - Project path recorded on a newly created session.
 * @returns The live, mutable session state.
 */
function getOrCreateSession(
  sessionID: string,
  projectPath: string,
): SessionState {
  const existing = sessions.get(sessionID);
  const state: SessionState = existing ?? {
    sessionID,
    projectPath,
    fingerprint: "",
    lastRequestTime: Date.now(),
    messageCount: 0,
    turnsSinceCuration: 0,
  };
  if (!existing) {
    sessions.set(sessionID, state);
  }

  state.lastRequestTime = Date.now();

  // Lazy cleanup: an expired pending recall is discarded on access.
  const pending = state.pendingRecall;
  if (pending && !isPendingRecallValid(pending)) {
    log.warn(
      `lazy cleanup: discarding expired pending recall for session ${sessionID.slice(0, 16)}`,
    );
    state.pendingRecall = undefined;
  }

  return state;
}
|
|
316
|
+
|
|
317
|
+
/**
 * Map an incoming request onto a known session, or mint a new session ID.
 *
 * The request is fingerprinted from its messages together with the model and
 * an auth-derived suffix. Among sessions sharing that fingerprint, the one
 * whose stored message count is closest to the request's wins. Candidates
 * whose count exceeds the request's by more than
 * `MESSAGE_COUNT_PROXIMITY_THRESHOLD` are rejected, because a large drop
 * indicates a fork of the parent conversation.
 *
 * @param req - The incoming gateway request.
 * @param _projectPath - Unused.
 * @returns The matched or newly generated session ID, with `isNew` set when
 *          no existing session matched.
 */
async function identifySession(
  req: GatewayRequest,
  _projectPath: string,
): Promise<{ sessionID: string; isNew: boolean }> {
  const rawMessages = req.messages.map(({ role, content }) => ({ role, content }));
  const cred = extractAuth(req.rawHeaders);
  const fingerprint = await fingerprintMessages(rawMessages, {
    model: req.model,
    authSuffix: cred ? authFingerprint(cred) : "",
  });
  const incomingCount = req.messages.length;

  // Track the candidate with the smallest message-count gap.
  let closestID: string | undefined;
  let closestGap = Number.POSITIVE_INFINITY;

  for (const [candidateID, candidate] of sessions) {
    if (candidate.fingerprint !== fingerprint) continue;

    // Normal session: count grows by 2–6 per turn.
    // Fork: count drops significantly (parent at 600, fork at 300).
    const delta = incomingCount - candidate.messageCount;
    if (delta < -MESSAGE_COUNT_PROXIMITY_THRESHOLD) continue;

    const gap = Math.abs(delta);
    if (closestID === undefined || gap < closestGap) {
      closestID = candidateID;
      closestGap = gap;
    }
  }

  if (closestID !== undefined) {
    return { sessionID: closestID, isNew: false };
  }

  // Nothing matched — hand out a fresh ID.
  return { sessionID: generateSessionID(), isNew: true };
}
|
|
367
|
+
|
|
368
|
+
// ---------------------------------------------------------------------------
|
|
369
|
+
// Upstream forwarding
|
|
370
|
+
// ---------------------------------------------------------------------------
|
|
371
|
+
|
|
372
|
+
/**
 * Send a request to the upstream provider (Anthropic or OpenAI).
 *
 * The upstream is inferred from the model name via `resolveUpstreamRoute`.
 * When no route matches, the request's own protocol selects between
 * `config.upstreamOpenAI` and `config.upstreamAnthropic`.
 *
 * If an interceptor is passed, or a module-level one is active, it is called
 * with the request body, model, stream flag, and a thunk that performs the
 * real `fetch`. Otherwise the `fetch` is issued directly.
 *
 * @param req - Request to forward.
 * @param config - Gateway config holding the default upstream base URLs.
 * @param interceptor - Optional per-call interceptor; takes precedence over
 *                      the module-level one.
 * @param cache - Cache options passed to the Anthropic request builder.
 * @returns The raw upstream Response (may be streaming or non-streaming).
 */
async function forwardToUpstream(
  req: GatewayRequest,
  config: GatewayConfig,
  interceptor?: UpstreamInterceptor,
  cache?: AnthropicCacheOptions,
): Promise<Response> {
  // Infer upstream from model name; fall back to protocol + env-var defaults.
  const route = resolveUpstreamRoute(req.model);
  const protocol = route?.protocol ?? req.protocol;
  const defaultBase = protocol === "openai" ? config.upstreamOpenAI : config.upstreamAnthropic;
  const upstreamBase = route?.url ?? defaultBase;

  let target: { url: string; headers: Record<string, string>; body: unknown };
  if (protocol === "openai") {
    const built = buildOpenAIUpstreamRequest(req, upstreamBase);
    target = { url: built.url, headers: built.headers, body: built.body };
  } else {
    // The Anthropic builder returns a path; prefix it with the base URL.
    const built = buildAnthropicRequest(req, cache);
    target = { url: `${upstreamBase}${built.url}`, headers: built.headers, body: built.body };
  }

  const send = (): Promise<Response> =>
    fetch(target.url, {
      method: "POST",
      headers: target.headers,
      body: JSON.stringify(target.body),
    });

  const via = interceptor ?? activeInterceptor;
  return via ? via(target.body, req.model, req.stream, send) : send();
}
|
|
430
|
+
|
|
431
|
+
// ---------------------------------------------------------------------------
|
|
432
|
+
// Response builders
|
|
433
|
+
// ---------------------------------------------------------------------------
|
|
434
|
+
|
|
435
|
+
/**
 * Render the three SSE events (start, delta, stop) of a complete synthetic
 * "[Searching memory...]" text block at content-block position `index`.
 */
function searchingMemoryBlockEvents(index: number): string {
  return [
    formatSSEEvent("content_block_start", JSON.stringify({
      type: "content_block_start",
      index,
      content_block: { type: "text", text: "" },
    })),
    formatSSEEvent("content_block_delta", JSON.stringify({
      type: "content_block_delta",
      index,
      delta: { type: "text_delta", text: "\n\n[Searching memory...]" },
    })),
    formatSSEEvent("content_block_stop", JSON.stringify({
      type: "content_block_stop",
      index,
    })),
  ].join("");
}

/**
 * Create a streaming SSE response from upstream with parallel accumulation.
 *
 * When `recallContext` is provided, uses a recall-aware accumulator that
 * transparently intercepts recall tool_use blocks:
 * - **Case 1 (recall-only)**: pauses client stream, executes recall, sends
 *   a follow-up request, and pipes the continuation into the same HTTP
 *   response stream.
 * - **Case 2 (mixed tools)**: suppresses recall blocks, stores the pending
 *   result for injection into the next request.
 *
 * @param upstreamResponse - Upstream SSE response to relay.
 * @param onComplete - Called once with the accumulated response after the
 *                     client stream has been closed. Not called when the
 *                     stream ends in an error.
 * @param recallContext - Enables recall interception; carries the request,
 *                        config, session state, and cache options needed for
 *                        the follow-up request.
 * @returns A 200 `text/event-stream` Response backed by the relayed stream.
 */
function buildStreamingResponse(
  upstreamResponse: Response,
  onComplete: (response: GatewayResponse) => void,
  recallContext?: {
    modifiedReq: GatewayRequest;
    config: GatewayConfig;
    sessionState: SessionState;
    cacheOptions: AnthropicCacheOptions;
  },
): Response {
  const recallAccum = recallContext
    ? createRecallAwareAccumulator(RECALL_TOOL_NAME)
    : null;
  const accumulator: StreamAccumulator = recallAccum ?? createStreamAccumulator();
  const encoder = new TextEncoder();

  const stream = new ReadableStream({
    async start(controller) {
      const emit = (sse: string): void => {
        controller.enqueue(encoder.encode(sse));
      };

      try {
        const upstreamBody = upstreamResponse.body;
        if (!upstreamBody) {
          throw new Error("upstream streaming response has no body");
        }

        // Parse and forward upstream SSE events
        const reader = upstreamBody.getReader();
        for await (const { event, data } of parseSSEStream(reader)) {
          const forwarded = accumulator.processEvent(event, data);
          if (forwarded) {
            emit(forwarded);
          }
        }

        // --- Recall interception (streaming) ---
        if (recallAccum?.hasRecall()) {
          const resp = recallAccum.getResponse();
          const recallBlock = findRecallToolUse(resp);

          if (recallBlock && recallContext) {
            const { sessionState } = recallContext;
            const { result, input } = await executeRecall(
              recallBlock,
              sessionState.projectPath,
              sessionState.sessionID,
            );

            if (recallAccum.hasOtherTools()) {
              // Case 2: mixed tools — store pending, forward held-back events
              const position = resp.content.indexOf(recallBlock);
              sessionState.pendingRecall = {
                toolUseId: recallBlock.id,
                input,
                position,
                result,
                timestamp: Date.now(),
              };
              log.info(
                `recall (stream, mixed): stored pending result for session ` +
                  `${sessionState.sessionID.slice(0, 16)}`,
              );

              // Emit a synthetic "[Searching memory...]" text block after all
              // other tool blocks. The accumulator already re-indexed other
              // tools to fill the gap, so this goes at clientBlockCount.
              emit(searchingMemoryBlockEvents(recallAccum.clientBlockCount()));

              // Forward the held-back message_delta + message_stop
              const heldBack = recallAccum.heldBackEvents();
              if (heldBack) {
                emit(heldBack);
              }

              controller.close();

              // Post-stream: use stripped response for temporal storage
              onComplete(stripRecallFromResponse(resp));
              return;
            }

            // Case 1: recall-only — send follow-up, pipe continuation
            log.info(
              `recall (stream, only): executing follow-up for session ` +
                `${sessionState.sessionID.slice(0, 16)}`,
            );

            // Emit a synthetic "[Searching memory...]" text block at the
            // suppressed recall index so the client sees a natural indicator
            // during the pause while the recall executes.
            emit(searchingMemoryBlockEvents(recallAccum.clientBlockCount()));

            const followUp = buildRecallFollowUp(
              recallContext.modifiedReq,
              resp,
              result,
              recallBlock,
            );
            const followUpResponse = await forwardToUpstream(
              followUp,
              recallContext.config,
              undefined,
              recallContext.cacheOptions,
            );

            if (!followUpResponse.ok) {
              const errorBody = await followUpResponse.text();
              log.error(
                `recall follow-up upstream error: ${followUpResponse.status} ${errorBody.slice(0, 500)}`,
              );
              // Forward the held-back events to close the stream gracefully
              const heldBack = recallAccum.heldBackEvents();
              if (heldBack) {
                emit(heldBack);
              }
              controller.close();
              onComplete(stripRecallFromResponse(resp));
              return;
            }

            const followUpBody = followUpResponse.body;
            if (!followUpBody) {
              throw new Error("recall follow-up response has no body");
            }

            // Pipe the continuation stream into the same HTTP response.
            // Suppress message_start (client already has one) and re-index
            // content blocks to continue from where the client left off.
            // clientBlockCount is the number of blocks the client had seen
            // before the synthetic block; +1 accounts for that block.
            const blockOffset = recallAccum.clientBlockCount() + 1;
            const contReader = followUpBody.getReader();

            for await (const { event: contEvent, data: contData } of parseSSEStream(contReader)) {
              if (contEvent === "message_start") {
                // Suppress — client already received one
                continue;
              }

              // Re-index content block events
              let reindexed: string | undefined;
              if (
                contEvent === "content_block_start" ||
                contEvent === "content_block_delta" ||
                contEvent === "content_block_stop"
              ) {
                try {
                  const parsed = JSON.parse(contData) as Record<string, unknown>;
                  if (typeof parsed.index === "number") {
                    parsed.index = (parsed.index as number) + blockOffset;
                    reindexed = formatSSEEvent(contEvent, JSON.stringify(parsed));
                  }
                } catch {
                  // Payload could not be re-indexed — forward it untouched
                  // below. Only parsing is guarded here; enqueue failures
                  // surface through the outer handler.
                }
              }

              // message_delta, message_stop, and anything that was not
              // re-indexed are forwarded as-is.
              emit(reindexed ?? formatSSEEvent(contEvent, contData));
            }

            controller.close();

            // Post-stream: the continuation is not accumulated, so only the
            // original (pre-recall) response is reported. The continuation's
            // text reached the client but is not stored separately; the
            // original author deemed this acceptable because temporal storage
            // captures the full conversation on the next turn.
            // NOTE(review): unlike the other recall paths, `resp` is passed
            // here without stripRecallFromResponse — confirm this is intended.
            onComplete(resp);
            return;
          }
        }

        // No recall — normal path
        controller.close();
        onComplete(accumulator.getResponse());
      } catch (err) {
        log.error("streaming pipeline error:", err);
        controller.error(err);
      }
    },
  });

  return new Response(stream, {
    status: 200,
    headers: {
      "content-type": "text/event-stream",
      "cache-control": "no-cache",
      connection: "keep-alive",
    },
  });
}
|
|
674
|
+
|
|
675
|
+
/**
 * Accumulate a non-streaming upstream response into a GatewayResponse.
 *
 * Reads the JSON body and copies `text`, `thinking`, and `tool_use` content
 * blocks; any other block type is ignored. The payload is validated rather
 * than trusted: a non-object body, a non-array `content`, and non-object
 * content entries are treated as empty instead of throwing. Usage counters
 * that are not finite numbers (including `null`) become `0` for input/output
 * tokens and `undefined` for the optional cache counters.
 *
 * @param upstreamResponse - Upstream response whose body is JSON.
 * @returns The normalized response; `stopReason` defaults to "end_turn".
 * @throws Whatever `Response.json()` throws when the body is not valid JSON.
 */
async function accumulateNonStreamResponse(
  upstreamResponse: Response,
): Promise<GatewayResponse> {
  const parsed: unknown = await upstreamResponse.json();
  const json: Record<string, unknown> =
    typeof parsed === "object" && parsed !== null
      ? (parsed as Record<string, unknown>)
      : {};

  const content: GatewayContentBlock[] = [];
  const rawContent: unknown[] = Array.isArray(json.content) ? json.content : [];
  for (const raw of rawContent) {
    if (typeof raw !== "object" || raw === null) continue;
    const block = raw as Record<string, unknown>;

    switch (block.type) {
      case "text":
        content.push({ type: "text", text: String(block.text ?? "") });
        break;
      case "thinking":
        content.push({
          type: "thinking",
          thinking: String(block.thinking ?? ""),
          // Only carry a signature when upstream actually sent one.
          ...(block.signature ? { signature: String(block.signature) } : {}),
        });
        break;
      case "tool_use":
        content.push({
          type: "tool_use",
          id: String(block.id ?? ""),
          name: String(block.name ?? ""),
          input: block.input,
        });
        break;
    }
  }

  const usage: Record<string, unknown> =
    typeof json.usage === "object" && json.usage !== null
      ? (json.usage as Record<string, unknown>)
      : {};
  // Upstream may send `null` for cache counters; keep those out of the
  // numeric fields.
  const tokenCount = (value: unknown): number | undefined =>
    typeof value === "number" && Number.isFinite(value) ? value : undefined;

  return {
    id: String(json.id ?? ""),
    model: String(json.model ?? ""),
    content,
    stopReason: String(json.stop_reason ?? "end_turn"),
    usage: {
      inputTokens: tokenCount(usage.input_tokens) ?? 0,
      outputTokens: tokenCount(usage.output_tokens) ?? 0,
      cacheReadInputTokens: tokenCount(usage.cache_read_input_tokens),
      cacheCreationInputTokens: tokenCount(usage.cache_creation_input_tokens),
    },
  };
}
|
|
729
|
+
|
|
730
|
+
/**
 * Accumulate a streaming upstream SSE response into a GatewayResponse.
 *
 * Used for OpenAI requests where we need to convert the accumulated
 * response to OpenAI format before returning to the client. Every parsed SSE
 * event is fed to a fresh stream accumulator; nothing is forwarded.
 *
 * @param upstreamResponse - Upstream SSE response to drain.
 * @returns The response assembled by the accumulator.
 * @throws Error when the upstream response carries no body.
 */
async function accumulateStreamResponse(
  upstreamResponse: Response,
): Promise<GatewayResponse> {
  const accumulator = createStreamAccumulator();

  // Fail with a descriptive error instead of a null dereference.
  const upstreamBody = upstreamResponse.body;
  if (!upstreamBody) {
    throw new Error("upstream streaming response has no body");
  }
  const reader = upstreamBody.getReader();

  for await (const { event, data } of parseSSEStream(reader)) {
    accumulator.processEvent(event, data);
  }

  return accumulator.getResponse();
}
|
|
748
|
+
|
|
749
|
+
/**
 * Wrap a GatewayResponse in a plain JSON HTTP response.
 *
 * The body is the Anthropic non-streaming representation produced by
 * `buildAnthropicNonStreamResponse`, serialized as JSON.
 *
 * @param resp - Response to serialize.
 * @returns A 200 `application/json` Response.
 */
function nonStreamHttpResponse(resp: GatewayResponse): Response {
  const payload = JSON.stringify(buildAnthropicNonStreamResponse(resp));
  return new Response(payload, {
    status: 200,
    headers: { "content-type": "application/json" },
  });
}
|
|
759
|
+
|
|
760
|
+
/**
 * Wrap a GatewayResponse in an SSE (text/event-stream) HTTP 200 reply.
 *
 * Only text blocks are represented: their text is concatenated in order and
 * passed to `buildSSETextResponse` along with the input/output token counts.
 */
function streamHttpResponse(resp: GatewayResponse): Response {
  // Gather the text of every text block; other block types are not emitted.
  const pieces: string[] = [];
  for (const block of resp.content) {
    if (block.type === "text") {
      pieces.push(block.text);
    }
  }
  const fullText = pieces.join("");

  const { inputTokens, outputTokens } = resp.usage;
  const sseBody = buildSSETextResponse(resp.id, resp.model, fullText, {
    inputTokens,
    outputTokens,
  });

  const headers = {
    "content-type": "text/event-stream",
    "cache-control": "no-cache",
    connection: "keep-alive",
  };
  return new Response(sseBody, { status: 200, headers });
}
|
|
784
|
+
|
|
785
|
+
// ---------------------------------------------------------------------------
|
|
786
|
+
// Post-response processing
|
|
787
|
+
// ---------------------------------------------------------------------------
|
|
788
|
+
|
|
789
|
+
/**
 * Bookkeeping that runs after a successful response.
 *
 * In order: calibrates from the reported token usage, stores the latest user
 * message and the assistant reply in temporal storage, increments the
 * session's turns-since-curation counter, and schedules background work
 * (distillation, curation). Any exception is logged and swallowed, so this
 * never throws to the caller.
 */
function postResponse(
  req: GatewayRequest,
  resp: GatewayResponse,
  sessionState: SessionState,
  config: GatewayConfig,
): void {
  const { sessionID, projectPath } = sessionState;

  try {
    // --- Calibrate overhead from real token counts ---
    const { usage } = resp;
    const actualInput =
      (usage.inputTokens ?? 0) +
      (usage.cacheReadInputTokens ?? 0) +
      (usage.cacheCreationInputTokens ?? 0);
    calibrate(actualInput, sessionID, getLastTransformedCount(sessionID));

    // --- Temporal storage ---
    // Convert the request's gateway messages to Lore format, then persist
    // only the most recent user message.
    const history = gatewayMessagesToLore(req.messages, sessionID);
    resolveToolResults(history);

    const latestUser = history
      .slice()
      .reverse()
      .find((message) => message.info.role === "user");
    if (latestUser) {
      temporal.store({
        projectPath,
        info: latestUser.info,
        parts: latestUser.parts,
      });
    }

    // Persist the assistant reply, annotated with usage and model.
    const [assistantMsg] = gatewayMessagesToLore(
      [{ role: "assistant", content: resp.content }],
      sessionID,
    );
    updateAssistantMessageTokens(assistantMsg, resp.usage, resp.model);
    temporal.store({
      projectPath,
      info: assistantMsg.info,
      parts: assistantMsg.parts,
    });

    // One more turn has elapsed since the last curation.
    sessionState.turnsSinceCuration =
      (sessionState.turnsSinceCuration ?? 0) + 1;

    // --- Schedule background work (fire-and-forget) ---
    scheduleBackgroundWork(sessionState, config);
  } catch (e) {
    log.error("post-response processing failed:", e);
  }
}
|
|
853
|
+
|
|
854
|
+
/**
 * Kick off background distillation and curation without awaiting them.
 *
 * Three independent triggers are evaluated, in this order:
 *  1. urgent distillation when `needsUrgentDistillation()` reports it,
 *  2. incremental distillation once the undistilled message count reaches
 *     `distillation.maxSegment`,
 *  3. curation when knowledge and `curator.onIdle` are enabled and enough
 *     turns have passed since the last curation.
 * Failures of the started promises are logged, never thrown.
 */
function scheduleBackgroundWork(
  sessionState: SessionState,
  config: GatewayConfig,
): void {
  const { sessionID, projectPath } = sessionState;
  const llm = getLLMClient(config);
  const cfg = loreConfig();
  const model = getWorkerModel();

  const onDistillationFailure = (e: unknown) =>
    log.error("background distillation failed:", e);

  // Urgent: flagged by the gradient. `urgent: true` bypasses the batch queue
  // because the result is needed before the next user turn.
  if (needsUrgentDistillation()) {
    distillation
      .run({ llm, projectPath, sessionID, model, force: true, urgent: true })
      .catch(onDistillationFailure);
  }

  // Incremental: pending messages reached the maxSegment threshold.
  const pending = temporal.undistilledCount(projectPath, sessionID);
  if (pending >= cfg.distillation.maxSegment) {
    log.info(
      `incremental distillation: ${pending} undistilled messages in ${sessionID.slice(0, 16)}`,
    );
    distillation
      .run({ llm, projectPath, sessionID, model })
      .catch(onDistillationFailure);
  }

  // Curation: periodic, only when the knowledge system is enabled.
  const curationDue =
    cfg.knowledge.enabled &&
    cfg.curator.onIdle &&
    sessionState.turnsSinceCuration >= cfg.curator.afterTurns;
  if (!curationDue) {
    return;
  }

  curator
    .run({ llm, projectPath, sessionID, model })
    .then(() => {
      sessionState.turnsSinceCuration = 0;
      // Curation changes knowledge entries, so the cached LTM block is stale.
      ltmSessionCache.delete(sessionID);
    })
    .catch((e) => log.error("background curation failed:", e));
}
|
|
909
|
+
|
|
910
|
+
// ---------------------------------------------------------------------------
|
|
911
|
+
// Case 1: Compaction interception
|
|
912
|
+
// ---------------------------------------------------------------------------
|
|
913
|
+
|
|
914
|
+
/**
 * Case 1 handler: answer a compaction request locally instead of forwarding
 * it upstream.
 *
 * Force-distills the session, assembles the stored distillation summaries
 * plus the project knowledge block into a prompt, asks the LLM client for a
 * summary, and returns that summary as a streaming or non-streaming response
 * depending on `req.stream`.
 */
async function handleCompaction(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  // Identify the project and session this request belongs to.
  const projectPath =
    cachedProjectPath ?? getProjectPath(req.system, req.rawHeaders);
  await initIfNeeded(projectPath, config);

  const { sessionID } = await identifySession(req, projectPath);
  // Called for its effect only; the returned state is not used here.
  getOrCreateSession(sessionID, projectPath);
  const llm = getLLMClient(config);

  log.info(`compaction intercepted for session ${sessionID.slice(0, 16)}`);

  // 1. Force-distill everything still undistilled. Urgent, because the
  //    client is blocked waiting for the compaction response.
  const model = getWorkerModel();
  await distillation.run({
    llm,
    projectPath,
    sessionID,
    model,
    force: true,
    urgent: true,
  });

  // 2. Load the distillation summaries for this session.
  const distillations = distillation.loadForSession(projectPath, sessionID);
  const hasDistillations = distillations.length > 0;

  // 3. Pick up a previous summary from the request, if there is one.
  const previousSummary = extractPreviousSummary(req);

  // 4. Build the knowledge block (empty when knowledge is disabled or there
  //    are no entries).
  const cfg = loreConfig();
  const entries = cfg.knowledge.enabled
    ? ltm.forProject(projectPath, cfg.crossProject)
    : [];
  const knowledge =
    entries.length === 0
      ? ""
      : formatKnowledge(
          entries.map(({ category, title, content }) => ({
            category,
            title,
            content,
          })),
        );

  // 5. Build the compact prompt.
  const compactPrompt = buildCompactPrompt({
    hasDistillations,
    knowledge,
    previousSummary,
  });

  // 6. Render the distillation summaries as context, one chunk per entry.
  let context = "";
  if (hasDistillations) {
    const intro =
      `## Lore Pre-computed Session Summaries\n\n` +
      `The following ${distillations.length} summary chunk(s) were pre-computed ` +
      `from the conversation history. Use these as the authoritative source.\n\n`;
    const chunks = distillations.map((d, i) => {
      const tag = d.generation > 0 ? " (consolidated)" : "";
      return `### Chunk ${i + 1}${tag}\n${d.observations}`;
    });
    context = intro + chunks.join("\n\n");
  }

  // 7. Generate the compaction summary via the LLM.
  const userContent = context
    ? `${context}\n\n---\n\n${compactPrompt}`
    : compactPrompt;

  const summaryText = await llm.prompt(compactPrompt, userContent, {
    model: cfg.model,
    workerID: "lore-compact",
    urgent: true, // Client is blocking on this response
  });

  const summary = summaryText ?? "(Compaction failed — no summary generated.)";

  // 8. Build the response and return it in the requested transport.
  const resp = buildCompactionResponse(sessionID, summary, req.model);
  return req.stream ? streamHttpResponse(resp) : nonStreamHttpResponse(resp);
}
|
|
1004
|
+
|
|
1005
|
+
// ---------------------------------------------------------------------------
|
|
1006
|
+
// Case 2: Title/summary passthrough
|
|
1007
|
+
// ---------------------------------------------------------------------------
|
|
1008
|
+
|
|
1009
|
+
/**
 * Case 2 handler: forward a title/summary request upstream and relay the
 * answer without any Lore processing.
 *
 * The upstream status code and content-type are relayed in both modes. When
 * the upstream sends no content-type, the streaming path falls back to
 * `text/event-stream` and the non-streaming path to `application/json`.
 *
 * @param req - The incoming gateway request, forwarded as-is.
 * @param config - Gateway configuration used for the upstream call.
 * @returns The relayed upstream response.
 */
async function handlePassthrough(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  const upstreamResponse = await forwardToUpstream(req, config);
  const upstreamContentType = upstreamResponse.headers.get("content-type");

  // For streaming, pipe the body through unchanged.
  if (req.stream && upstreamResponse.body) {
    return new Response(upstreamResponse.body, {
      status: upstreamResponse.status,
      headers: {
        "content-type": upstreamContentType ?? "text/event-stream",
      },
    });
  }

  // For non-streaming, relay the body text as-is. Keep the upstream
  // content-type so a non-JSON body (e.g. an HTML error page from an
  // intermediary) is not mislabelled as JSON.
  const body = await upstreamResponse.text();
  return new Response(body, {
    status: upstreamResponse.status,
    headers: {
      "content-type": upstreamContentType ?? "application/json",
    },
  });
}
|
|
1036
|
+
|
|
1037
|
+
// ---------------------------------------------------------------------------
|
|
1038
|
+
// Case 3: Normal conversation turn — full pipeline
|
|
1039
|
+
// ---------------------------------------------------------------------------
|
|
1040
|
+
|
|
1041
|
+
/**
 * Case 3 handler: run a normal conversation turn through the full pipeline.
 *
 * Steps, in order: project init, auth capture, session identification,
 * pending-recall injection, model limits and layer-0 cap, idle-resume cache
 * refresh, LTM injection into the system prompt, gradient transform of the
 * messages, recall-tool injection, upstream forwarding, and finally either
 * streaming relay or non-streaming accumulation with recall interception.
 * `postResponse` is invoked with the original `req` on every success path.
 */
async function handleConversationTurn(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  // --- 1. Project path & init ---
  const projectPath = getProjectPath(req.system, req.rawHeaders);
  await initIfNeeded(projectPath, config);

  // --- 2. Capture auth credentials for background workers ---
  const credential = extractAuth(req.rawHeaders);
  if (credential) {
    setLastSeenAuth(credential);
  }

  // --- 3. Session identification ---
  const { sessionID, isNew } = await identifySession(req, projectPath);
  const sessionState = getOrCreateSession(sessionID, projectPath);

  // Bind the credential to this session for background workers.
  if (credential) {
    setSessionAuth(sessionID, credential);
  }

  // Record a fingerprint for new sessions, for future correlation.
  if (isNew) {
    const turns = req.messages.map(({ role, content }) => ({ role, content }));
    sessionState.fingerprint = await fingerprintMessages(turns, {
      model: req.model,
      authSuffix: credential ? authFingerprint(credential) : "",
    });
  }

  // Always update the message count for proximity matching.
  sessionState.messageCount = req.messages.length;

  // Track the session model for worker model discovery.
  lastSeenSessionModel = req.model;

  // --- Inject pending recall from previous turn (Case 2: mixed tools) ---
  // The pending entry is consumed exactly once, whether or not it applies.
  const pendingRecall = sessionState.pendingRecall;
  if (pendingRecall) {
    if (!isPendingRecallValid(pendingRecall)) {
      log.warn(
        `discarding expired pending recall for session ${sessionID.slice(0, 16)}`,
      );
    } else if (injectPendingRecall(req, pendingRecall)) {
      log.info(
        `injected pending recall result into request for session ${sessionID.slice(0, 16)}`,
      );
    } else {
      log.warn(
        `failed to inject pending recall — conversation structure mismatch`,
      );
    }
    sessionState.pendingRecall = undefined;
  }

  log.info(
    `turn: session=${sessionID.slice(0, 16)} messages=${req.messages.length} ` +
      `model=${req.model} stream=${req.stream} new=${isNew}`,
  );

  // --- 4. Set model limits ---
  const modelSpec = getModelSpec(req.model);
  setModelLimits({ context: modelSpec.context, output: modelSpec.output });

  // Cost-aware layer-0 cap: explicit config wins > cost formula > disabled.
  const cfg = loreConfig();
  const { maxLayer0Tokens, targetCacheReadCostPerTurn } = cfg.budget;
  if (maxLayer0Tokens !== undefined) {
    setMaxLayer0Tokens(maxLayer0Tokens);
  } else if (modelSpec.cacheReadCost && targetCacheReadCostPerTurn > 0) {
    setMaxLayer0Tokens(
      computeLayer0Cap(targetCacheReadCostPerTurn, modelSpec.cacheReadCost),
    );
  }

  // --- 5. Cold-cache idle-resume ---
  const idle = onIdleResume(sessionID, cfg.idleResumeMinutes * 60_000);
  if (idle.triggered) {
    ltmSessionCache.delete(sessionID);
    log.info(
      `session idle ${Math.round(idle.idleMs / 60_000)}min — refreshing caches`,
    );
  }

  // --- 6. LTM injection into system prompt ---
  let modifiedSystem = req.system;
  if (!cfg.knowledge.enabled) {
    setLtmTokens(0, sessionID);
    consumeCameOutOfIdle(sessionID);
  } else {
    try {
      let ltmBlock = ltmSessionCache.get(sessionID);

      // Cache miss: format the session's knowledge entries and cache them.
      if (!ltmBlock) {
        const ltmBudget = getLtmBudget(cfg.budget.ltm);
        const entries = ltm.forSession(projectPath, sessionID, ltmBudget);
        const formatted = entries.length
          ? formatKnowledge(
              entries.map(({ category, title, content }) => ({
                category,
                title,
                content,
              })),
              ltmBudget,
            )
          : undefined;

        if (formatted) {
          // Token count is estimated at three characters per token.
          ltmBlock = {
            formatted,
            tokenCount: Math.ceil(formatted.length / 3),
          };
          ltmSessionCache.set(sessionID, ltmBlock);
        }
      }

      if (ltmBlock) {
        setLtmTokens(ltmBlock.tokenCount, sessionID);
        modifiedSystem = `${req.system}\n\n${ltmBlock.formatted}`;
      } else {
        setLtmTokens(0, sessionID);
      }
    } catch (e) {
      log.error("LTM injection failed:", e);
      setLtmTokens(0, sessionID);
    } finally {
      consumeCameOutOfIdle(sessionID);
    }
  }

  // First-run greeting
  if (isFirstRun()) {
    modifiedSystem +=
      "\n\n[Lore plugin] This is the first time Lore has been activated. " +
      "Briefly let the user know that Lore is now active and their " +
      "coding agent will get progressively smarter on this codebase " +
      "over time as knowledge accumulates across sessions.";
  }

  // Lore knowledge file commit reminder
  if (cfg.knowledge.enabled) {
    const trackedFiles = cfg.agentsFile.enabled
      ? [".lore.md", cfg.agentsFile.path]
      : [".lore.md"];
    modifiedSystem +=
      `\n\nWhen making git commits, always check if ${trackedFiles.join(" and ")} ` +
      `have unstaged changes and include them in the commit. These files contain ` +
      `shared project knowledge managed by lore and must be version-controlled.`;
  }

  // --- 7. Gradient transform on messages ---
  const loreMessages = gatewayMessagesToLore(req.messages, sessionID);
  resolveToolResults(loreMessages);

  const transformed = transform({
    messages: loreMessages,
    projectPath,
    sessionID,
  });

  // Drop trailing non-user messages that carry no tool parts, to prevent
  // prefill errors. Stop at the first user message or tool-bearing message.
  for (;;) {
    const tail = transformed.messages.at(-1);
    if (!tail || tail.info.role === "user") break;
    if (tail.parts.some((p) => p.type === "tool")) break;
    transformed.messages.pop();
  }

  // --- 8. Build the modified request ---
  // loreMessagesToGateway rebuilds tool_result blocks from the assistant's
  // completed/error tool parts; removeOrphanedToolResults is a safety net
  // for any tool_result reference that is still orphaned.
  const transformedMessages = loreMessagesToGateway(transformed.messages);
  removeOrphanedToolResults(transformedMessages);

  const modifiedReq: GatewayRequest = {
    ...req,
    system: modifiedSystem,
    messages: transformedMessages,
  };

  // --- 8b. Inject recall tool ---
  // Only when the request already has tools (a coding agent, not a bare
  // chat) and the client does not bring its own recall tool.
  if (modifiedReq.tools.length > 0 && !clientHasRecallTool(modifiedReq.tools)) {
    modifiedReq.tools = [...modifiedReq.tools, RECALL_GATEWAY_TOOL];
  }

  // --- 9. Forward to upstream ---
  // Prompt caching for conversation turns:
  //   - System prompt: explicit breakpoint with 5m TTL (frequent turns)
  //   - Conversation: breakpoint on last block so the prefix is cached
  const cacheOptions: AnthropicCacheOptions = {
    systemTTL: "5m",
    cacheConversation: true,
  };
  const upstreamResponse = await forwardToUpstream(
    modifiedReq,
    config,
    undefined,
    cacheOptions,
  );

  if (!upstreamResponse.ok) {
    const errorBody = await upstreamResponse.text();
    log.error(
      `upstream error: ${upstreamResponse.status} ${errorBody.slice(0, 500)}`,
    );
    return new Response(errorBody, {
      status: upstreamResponse.status,
      headers: { "content-type": "application/json" },
    });
  }

  if (req.stream && upstreamResponse.body) {
    // Streaming: forward events and accumulate in parallel. The recall
    // context is passed only when the recall tool is part of the request.
    const hasRecallTool = modifiedReq.tools.some(
      (t) => t.name === RECALL_TOOL_NAME,
    );
    const recallContext = hasRecallTool
      ? { modifiedReq, config, sessionState, cacheOptions }
      : undefined;
    return buildStreamingResponse(
      upstreamResponse,
      (resp) => postResponse(req, resp, sessionState, config),
      recallContext,
    );
  }

  // Non-streaming
  const resp = await accumulateNonStreamResponse(upstreamResponse);

  // No recall tool_use in the reply: the ordinary path.
  if (!hasRecallToolUse(resp)) {
    postResponse(req, resp, sessionState, config);
    return nonStreamHttpResponse(resp);
  }

  // --- Recall interception (non-streaming) ---
  const recallBlock = findRecallToolUse(resp)!;
  const { result: recallResult, input: recallInput } = await executeRecall(
    recallBlock,
    sessionState.projectPath,
    sessionState.sessionID,
  );

  // Shared exit: reply with the original response minus the recall block.
  const replyWithoutRecall = (): Response => {
    const cleanResp = stripRecallFromResponse(resp);
    postResponse(req, cleanResp, sessionState, config);
    return nonStreamHttpResponse(cleanResp);
  };

  if (hasOtherToolUse(resp)) {
    // Case 2: recall + other tools — keep the result for the next turn and
    // strip the recall block from what the client sees.
    sessionState.pendingRecall = {
      toolUseId: recallBlock.id,
      input: recallInput,
      position: resp.content.indexOf(recallBlock),
      result: recallResult,
      timestamp: Date.now(),
    };
    log.info(
      `recall (non-stream, mixed): stored pending result for session ${sessionState.sessionID.slice(0, 16)}`,
    );
    return replyWithoutRecall();
  }

  // Case 1: recall-only — send a follow-up request carrying the result.
  log.info(
    `recall (non-stream, only): executing follow-up for session ${sessionState.sessionID.slice(0, 16)}`,
  );
  const followUp = buildRecallFollowUp(
    modifiedReq,
    resp,
    recallResult,
    recallBlock,
  );
  const followUpResponse = await forwardToUpstream(
    followUp,
    config,
    undefined,
    cacheOptions,
  );

  if (!followUpResponse.ok) {
    const errorBody = await followUpResponse.text();
    log.error(
      `recall follow-up upstream error: ${followUpResponse.status} ${errorBody.slice(0, 500)}`,
    );
    // Fall back to the original response without recall.
    return replyWithoutRecall();
  }

  const continuationResp = await accumulateNonStreamResponse(followUpResponse);

  // Merge usage from both requests into the continuation's usage record.
  const total = continuationResp.usage;
  const first = resp.usage;
  total.inputTokens += first.inputTokens;
  total.outputTokens += first.outputTokens;
  if (first.cacheReadInputTokens) {
    total.cacheReadInputTokens =
      (total.cacheReadInputTokens ?? 0) + first.cacheReadInputTokens;
  }
  if (first.cacheCreationInputTokens) {
    total.cacheCreationInputTokens =
      (total.cacheCreationInputTokens ?? 0) + first.cacheCreationInputTokens;
  }

  postResponse(req, continuationResp, sessionState, config);
  return nonStreamHttpResponse(continuationResp);
}
|
|
1360
|
+
|
|
1361
|
+
// ---------------------------------------------------------------------------
|
|
1362
|
+
// Lore message → Gateway message conversion
|
|
1363
|
+
// ---------------------------------------------------------------------------
|
|
1364
|
+
|
|
1365
|
+
/**
 * Convert transformed Lore messages back to gateway message format.
 *
 * This reverses `gatewayMessagesToLore` after gradient transform has
 * potentially trimmed/reordered messages.
 *
 * Completed/error tool parts on assistant messages produce BOTH a `tool_use`
 * block on the assistant AND a corresponding `tool_result` block injected at
 * the start of the following user message. This makes the conversion
 * self-contained: tool pairing is reconstructed from whatever messages
 * survived gradient eviction, without depending on cross-message
 * `tool_result` parts that can become orphaned when the assistant message is
 * evicted.
 *
 * Residual `tool: "result"` parts on a message are still converted to
 * `tool_result` blocks as a fallback, except when a reconstructed result for
 * the same call ID was already injected into that message. Emitting both
 * would give one `tool_use` two results.
 *
 * Results queued by an assistant message are discarded if the next message
 * is not a user message.
 *
 * @param messages - Lore messages, in conversation order.
 * @returns One gateway message per input message, in the same order.
 * @internal Exported for tests.
 */
export function loreMessagesToGateway(
  messages: LoreMessageWithParts[],
): Array<{ role: "user" | "assistant"; content: GatewayContentBlock[] }> {
  const out: Array<{
    role: "user" | "assistant";
    content: GatewayContentBlock[];
  }> = [];

  // tool_result blocks reconstructed from the preceding message's
  // completed/error tool parts, plus the call IDs they answer.
  let pendingToolResults: GatewayContentBlock[] = [];
  let pendingToolResultIds = new Set<string>();

  for (const msg of messages) {
    const content: GatewayContentBlock[] = [];

    // Call IDs whose reconstructed result was injected into THIS message.
    let injectedIds = new Set<string>();
    if (msg.info.role === "user") {
      content.push(...pendingToolResults);
      injectedIds = pendingToolResultIds;
    }
    // Either consumed (user) or dropped (non-user, e.g. back-to-back
    // assistants): the queue always starts empty for this message's parts.
    pendingToolResults = [];
    pendingToolResultIds = new Set<string>();

    for (const part of msg.parts) {
      switch (part.type) {
        case "text":
          content.push({
            type: "text",
            text: (part as { text: string }).text,
          });
          break;
        case "reasoning":
          content.push({
            type: "thinking",
            thinking: (part as { text: string }).text ?? "",
          });
          break;
        case "tool": {
          const toolPart = part as {
            type: "tool";
            tool: string;
            callID: string;
            state: {
              status: string;
              input?: unknown;
              output?: string;
              error?: string;
            };
          };
          if (toolPart.tool === "result") {
            // Residual tool_result part. Skip it when the reconstructed
            // result for the same call was already injected above.
            if (!injectedIds.has(toolPart.callID)) {
              content.push({
                type: "tool_result",
                toolUseId: toolPart.callID,
                content: toolPart.state.output ?? "",
              });
            }
          } else {
            // Emit tool_use on this message.
            content.push({
              type: "tool_use",
              id: toolPart.callID,
              name: toolPart.tool,
              input: toolPart.state.input ?? {},
            });
            // Completed/error parts also queue a tool_result for the next
            // user message. Any other status emits tool_use only.
            if (toolPart.state.status === "completed") {
              pendingToolResults.push({
                type: "tool_result",
                toolUseId: toolPart.callID,
                content: toolPart.state.output ?? "",
              });
              pendingToolResultIds.add(toolPart.callID);
            } else if (toolPart.state.status === "error") {
              pendingToolResults.push({
                type: "tool_result",
                toolUseId: toolPart.callID,
                content: toolPart.state.error ?? "[error]",
                isError: true,
              });
              pendingToolResultIds.add(toolPart.callID);
            }
          }
          break;
        }
        // Generic / unknown parts: keep them only if they carry text.
        default:
          if ("text" in part && typeof part.text === "string") {
            content.push({ type: "text", text: part.text });
          }
          break;
      }
    }

    out.push({ role: msg.info.role as "user" | "assistant", content });
  }

  return out;
}
|
|
1487
|
+
|
|
1488
|
+
// ---------------------------------------------------------------------------
|
|
1489
|
+
// Post-conversion validation: remove orphaned tool_result blocks
|
|
1490
|
+
// ---------------------------------------------------------------------------
|
|
1491
|
+
|
|
1492
|
+
/**
 * Safety net applied after conversion: every `tool_result` block on a user
 * message must reference a `tool_use` block on the message directly before
 * it, and that message must be an assistant message. Blocks that do not are
 * removed in place and a warning is logged.
 *
 * User messages without any `tool_result` block are left untouched. A user
 * message that ends up with no content gets a placeholder text block so its
 * content array is never empty.
 *
 * @internal Exported for tests.
 */
export function removeOrphanedToolResults(
  messages: Array<{
    role: "user" | "assistant";
    content: GatewayContentBlock[];
  }>,
): void {
  messages.forEach((message, index) => {
    if (message.role !== "user") return;

    const carriesToolResult = message.content.some(
      (block) => block.type === "tool_result",
    );
    if (!carriesToolResult) return;

    // IDs of the tool_use blocks on the directly preceding assistant message
    // (none when there is no such message).
    const knownIds = new Set<string>();
    const previous = index > 0 ? messages[index - 1] : undefined;
    if (previous !== undefined && previous.role === "assistant") {
      for (const block of previous.content) {
        if (block.type === "tool_use") {
          knownIds.add((block as GatewayToolUseBlock).id);
        }
      }
    }

    // Keep everything except tool_result blocks with an unknown ID.
    const kept = message.content.filter((block) => {
      if (block.type !== "tool_result") return true;
      return knownIds.has((block as GatewayToolResultBlock).toolUseId);
    });
    const removedCount = message.content.length - kept.length;
    message.content = kept;

    if (removedCount > 0) {
      log.warn(
        `removed ${removedCount} orphaned tool_result block(s) from message ${index}`,
      );
    }

    // An empty content array would be rejected by the API.
    if (kept.length === 0) {
      message.content = [{ type: "text", text: "[tool results provided]" }];
    }
  });
}
|
|
1544
|
+
|
|
1545
|
+
// ---------------------------------------------------------------------------
|
|
1546
|
+
// Error response builder
|
|
1547
|
+
// ---------------------------------------------------------------------------
|
|
1548
|
+
|
|
1549
|
+
/**
 * Build a JSON error `Response`.
 *
 * The body is `{ type: "error", error: { type: "server_error", message } }`
 * and is sent with a `content-type: application/json` header.
 *
 * @param status HTTP status code for the response.
 * @param message Human-readable error description placed in `error.message`.
 * @returns The constructed `Response`.
 */
function errorResponse(status: number, message: string): Response {
  const payload = {
    type: "error",
    error: { type: "server_error", message },
  };
  const init = {
    status,
    headers: { "content-type": "application/json" },
  };
  return new Response(JSON.stringify(payload), init);
}
|
|
1564
|
+
|
|
1565
|
+
// ---------------------------------------------------------------------------
|
|
1566
|
+
// Main entry point
|
|
1567
|
+
// ---------------------------------------------------------------------------
|
|
1568
|
+
|
|
1569
|
+
/**
 * Entry point of the gateway pipeline: classifies the incoming request and
 * hands it to the matching handler.
 *
 * Dispatch order:
 *   1. compaction requests  → `handleCompaction`
 *   2. title/summary requests → `handlePassthrough`
 *   3. everything else      → `handleConversationTurn`
 *
 * Before dispatching, any auth credentials found in the raw headers are
 * recorded via `setLastSeenAuth` so background workers can use them.
 *
 * Anything thrown while processing is logged and turned into a 502 JSON
 * error response, so the returned promise resolves with a `Response` in
 * those cases instead of rejecting.
 *
 * @param req The parsed gateway request.
 * @param config Gateway configuration passed through to the handlers.
 * @returns The handler's `Response`, or a 502 error response on failure.
 */
export async function handleRequest(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  try {
    // Remember credentials up front, before any handler runs.
    const credentials = extractAuth(req.rawHeaders);
    if (credentials) setLastSeenAuth(credentials);

    // Case 1: compaction is intercepted by the gateway.
    if (isCompactionRequest(req)) return await handleCompaction(req, config);

    // Case 2: title/summary requests are passed through.
    if (isTitleOrSummaryRequest(req)) return await handlePassthrough(req, config);

    // Case 3: a normal conversation turn runs the full pipeline.
    return await handleConversationTurn(req, config);
  } catch (err) {
    let message = "Unknown gateway error";
    if (err instanceof Error) {
      message = err.message;
    }
    log.error("pipeline error:", err);
    return errorResponse(502, message);
  }
}
|