@loreai/gateway 0.14.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin.cjs +27 -0
- package/dist/index.cjs +1042 -0
- package/dist/index.d.cts +21 -0
- package/package.json +10 -10
- package/dist/index.js +0 -50087
- package/src/auth.ts +0 -133
- package/src/batch-queue.ts +0 -575
- package/src/cache-analytics.ts +0 -344
- package/src/cli/agents.ts +0 -107
- package/src/cli/bin.ts +0 -11
- package/src/cli/help.ts +0 -55
- package/src/cli/lib/binary.ts +0 -353
- package/src/cli/lib/bspatch.ts +0 -306
- package/src/cli/lib/delta-upgrade.ts +0 -790
- package/src/cli/lib/errors.ts +0 -48
- package/src/cli/lib/ghcr.ts +0 -389
- package/src/cli/lib/patch-cache.ts +0 -342
- package/src/cli/lib/upgrade.ts +0 -454
- package/src/cli/lib/version-check.ts +0 -385
- package/src/cli/main.ts +0 -152
- package/src/cli/run.ts +0 -181
- package/src/cli/start.ts +0 -82
- package/src/cli/upgrade.ts +0 -311
- package/src/cli/version.ts +0 -22
- package/src/compaction.ts +0 -195
- package/src/config.ts +0 -199
- package/src/idle.ts +0 -240
- package/src/index.ts +0 -41
- package/src/llm-adapter.ts +0 -182
- package/src/pipeline.ts +0 -1681
- package/src/recall.ts +0 -433
- package/src/recorder.ts +0 -192
- package/src/server.ts +0 -250
- package/src/session.ts +0 -207
- package/src/stream/anthropic.ts +0 -708
- package/src/temporal-adapter.ts +0 -310
- package/src/translate/anthropic.ts +0 -469
- package/src/translate/openai.ts +0 -536
- package/src/translate/types.ts +0 -222
- package/src/worker-model.ts +0 -408
package/src/pipeline.ts
DELETED
|
@@ -1,1681 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Core request processing pipeline for the Lore gateway.
|
|
3
|
-
*
|
|
4
|
-
* Orchestrates the full flow for every request:
|
|
5
|
-
* session identification → LTM injection → gradient transform →
|
|
6
|
-
* upstream forwarding → response accumulation → calibration →
|
|
7
|
-
* temporal storage → background work scheduling.
|
|
8
|
-
*
|
|
9
|
-
* Three request classes are handled:
|
|
10
|
-
* 1. Compaction requests → intercepted, never forwarded upstream.
|
|
11
|
-
* 2. Title/summary requests → forwarded transparently, no Lore processing.
|
|
12
|
-
* 3. Normal conversation turns → full pipeline.
|
|
13
|
-
*/
|
|
14
|
-
import type { LoreMessageWithParts, LLMClient } from "@loreai/core";
|
|
15
|
-
import {
|
|
16
|
-
load,
|
|
17
|
-
config as loreConfig,
|
|
18
|
-
ensureProject,
|
|
19
|
-
temporal,
|
|
20
|
-
ltm,
|
|
21
|
-
distillation,
|
|
22
|
-
curator,
|
|
23
|
-
log,
|
|
24
|
-
transform,
|
|
25
|
-
setModelLimits,
|
|
26
|
-
setLtmTokens,
|
|
27
|
-
getLtmBudget,
|
|
28
|
-
setMaxLayer0Tokens,
|
|
29
|
-
computeLayer0Cap,
|
|
30
|
-
calibrate,
|
|
31
|
-
getLastTransformedCount,
|
|
32
|
-
onIdleResume,
|
|
33
|
-
consumeCameOutOfIdle,
|
|
34
|
-
needsUrgentDistillation,
|
|
35
|
-
formatKnowledge,
|
|
36
|
-
buildCompactPrompt,
|
|
37
|
-
} from "@loreai/core";
|
|
38
|
-
|
|
39
|
-
import type {
|
|
40
|
-
GatewayRequest,
|
|
41
|
-
GatewayResponse,
|
|
42
|
-
GatewayContentBlock,
|
|
43
|
-
GatewayToolUseBlock,
|
|
44
|
-
GatewayToolResultBlock,
|
|
45
|
-
SessionState,
|
|
46
|
-
} from "./translate/types";
|
|
47
|
-
import type { GatewayConfig } from "./config";
|
|
48
|
-
import { getProjectPath, resolveUpstreamRoute } from "./config";
|
|
49
|
-
import {
|
|
50
|
-
generateSessionID,
|
|
51
|
-
fingerprintMessages,
|
|
52
|
-
MESSAGE_COUNT_PROXIMITY_THRESHOLD,
|
|
53
|
-
} from "./session";
|
|
54
|
-
import {
|
|
55
|
-
isCompactionRequest,
|
|
56
|
-
isTitleOrSummaryRequest,
|
|
57
|
-
extractPreviousSummary,
|
|
58
|
-
buildCompactionResponse,
|
|
59
|
-
} from "./compaction";
|
|
60
|
-
import {
|
|
61
|
-
buildAnthropicRequest,
|
|
62
|
-
buildAnthropicNonStreamResponse,
|
|
63
|
-
type AnthropicCacheOptions,
|
|
64
|
-
} from "./translate/anthropic";
|
|
65
|
-
import {
|
|
66
|
-
buildOpenAIUpstreamRequest,
|
|
67
|
-
buildOpenAIResponse,
|
|
68
|
-
} from "./translate/openai";
|
|
69
|
-
import {
|
|
70
|
-
createStreamAccumulator,
|
|
71
|
-
createRecallAwareAccumulator,
|
|
72
|
-
parseSSEStream,
|
|
73
|
-
buildSSETextResponse,
|
|
74
|
-
formatSSEEvent,
|
|
75
|
-
type StreamAccumulator,
|
|
76
|
-
} from "./stream/anthropic";
|
|
77
|
-
import {
|
|
78
|
-
gatewayMessagesToLore,
|
|
79
|
-
updateAssistantMessageTokens,
|
|
80
|
-
resolveToolResults,
|
|
81
|
-
} from "./temporal-adapter";
|
|
82
|
-
import { createGatewayLLMClient } from "./llm-adapter";
|
|
83
|
-
import { createBatchLLMClient } from "./batch-queue";
|
|
84
|
-
import {
|
|
85
|
-
extractAuth,
|
|
86
|
-
authFingerprint,
|
|
87
|
-
setLastSeenAuth,
|
|
88
|
-
setSessionAuth,
|
|
89
|
-
resolveAuth,
|
|
90
|
-
} from "./auth";
|
|
91
|
-
import type { UpstreamInterceptor } from "./recorder";
|
|
92
|
-
import { startIdleScheduler, buildIdleWorkHandler } from "./idle";
|
|
93
|
-
import { getWorkerModel, resetWorkerModelState } from "./worker-model";
|
|
94
|
-
import { analyzeCacheTurn } from "./cache-analytics";
|
|
95
|
-
import {
|
|
96
|
-
RECALL_GATEWAY_TOOL,
|
|
97
|
-
RECALL_TOOL_NAME,
|
|
98
|
-
executeRecall,
|
|
99
|
-
findRecallToolUse,
|
|
100
|
-
hasRecallToolUse,
|
|
101
|
-
hasOtherToolUse,
|
|
102
|
-
clientHasRecallTool,
|
|
103
|
-
buildRecallFollowUp,
|
|
104
|
-
buildRecallMarker,
|
|
105
|
-
recallStoreKey,
|
|
106
|
-
expandRecallMarkers,
|
|
107
|
-
cleanupRecallStore,
|
|
108
|
-
replaceRecallWithMarker,
|
|
109
|
-
} from "./recall";
|
|
110
|
-
|
|
111
|
-
// ---------------------------------------------------------------------------
|
|
112
|
-
// Module state
|
|
113
|
-
// ---------------------------------------------------------------------------
|
|
114
|
-
|
|
115
|
-
/**
 * One-time initialization flag.
 *
 * Flipped to `true` by `initIfNeeded` after the Lore config has been loaded
 * and the project ensured; cleared again by `resetPipelineState`.
 */
let initialized = false;

/**
 * Active upstream interceptor — used for recording/replay.
 *
 * Assigned through `setUpstreamInterceptor`. `forwardToUpstream` falls back
 * to this value when no per-call interceptor is supplied.
 */
let activeInterceptor: UpstreamInterceptor | undefined;
|
|
120
|
-
|
|
121
|
-
/**
 * Install, replace, or remove the module-wide upstream interceptor.
 *
 * While an interceptor is installed, `forwardToUpstream` hands each outgoing
 * request to it (together with a thunk that performs the real `fetch`)
 * instead of calling `fetch` directly. Recording and replay scripts rely on
 * this to capture or substitute upstream traffic without touching the
 * individual call sites.
 *
 * @param interceptor - The interceptor to activate, or `undefined` to go
 *   back to plain `fetch`.
 */
export function setUpstreamInterceptor(
  interceptor: UpstreamInterceptor | undefined,
): void {
  activeInterceptor = interceptor;
}
|
|
134
|
-
|
|
135
|
-
/**
 * Return every module-level singleton to its pristine state.
 *
 * Meant for test harnesses only: it lets several independent gateway
 * instances run one after another in the same Bun process without session
 * state, initialization flags, or cached project paths leaking between
 * suites.
 *
 * @returns A promise that settles once the LLM client (if it exposes a
 *   `shutdown` method) has shut down and all state has been cleared.
 */
export async function resetPipelineState(): Promise<void> {
  initialized = false;
  cachedProjectPath = null;

  for (const store of [sessions, ltmSessionCache, ltmPinnedText]) {
    store.clear();
  }

  // Shut down the batch queue gracefully before dropping the client.
  const client = llmClient;
  if (client && "shutdown" in client) {
    const stoppable = client as LLMClient & { shutdown: () => Promise<void> };
    await stoppable.shutdown();
  }
  llmClient = null;

  activeInterceptor = undefined;

  // Stop the idle scheduler timer if one is running.
  stopIdleScheduler?.();
  stopIdleScheduler = null;

  lastSeenSessionModel = null;
  resetWorkerModelState();
}
|
|
162
|
-
|
|
163
|
-
/**
 * Cached project path from the first request that carried a system prompt.
 *
 * Written by `initIfNeeded` and cleared by `resetPipelineState`.
 */
let cachedProjectPath: string | null = null;

/**
 * Per-session state tracked across requests, keyed by session ID.
 *
 * Populated by `getOrCreateSession`, scanned by `identifySession`, and handed
 * to the idle scheduler in `initIfNeeded`.
 */
const sessions = new Map<string, SessionState>();

/**
 * Per-session LTM cache for byte-stability.
 *
 * Without caching, `ltm.forSession()` re-scores entries against evolving
 * session context every turn, producing different formatted text → system
 * prompt changes at byte 0 → total cache invalidation on every turn.
 *
 * Each entry holds the formatted LTM text together with its token count.
 */
const ltmSessionCache = new Map<
  string,
  { formatted: string; tokenCount: number }
>();

/**
 * Pinned LTM text per session — the text currently being injected into the
 * system prompt. When ltmSessionCache is invalidated and recomputed, we
 * compare the new text against the pin. Only update if >5% character
 * difference to avoid cache busts from minor BM25 re-ranking changes.
 *
 * NOTE(review): the comparison itself is not in this part of the file;
 * presumably it uses `textDiffRatio` — confirm at the call site.
 */
const ltmPinnedText = new Map<
  string,
  { formatted: string; tokenCount: number }
>();
|
|
191
|
-
|
|
192
|
-
/**
 * Estimate how different two strings are, as a ratio in the range 0..1.
 *
 * This is a cheap heuristic rather than a real diff: it counts the characters
 * shared at the start of both strings plus the characters shared at the end
 * (never letting the two regions overlap within the shorter string), and
 * divides by the length of the longer string. 0 means identical; 1 means
 * nothing in common at either end, or exactly one of the inputs is empty.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns `1 - (sharedPrefix + sharedSuffix) / max(a.length, b.length)`.
 */
function textDiffRatio(a: string, b: string): number {
  if (a === b) return 0;
  if (!a || !b) return 1;

  const shorter = Math.min(a.length, b.length);
  const longer = Math.max(a.length, b.length);

  // Length of the shared leading run.
  let prefixLen = 0;
  while (prefixLen < shorter && a[prefixLen] === b[prefixLen]) {
    prefixLen += 1;
  }

  // Length of the shared trailing run, capped so that it cannot reuse
  // characters already counted as part of the prefix.
  const suffixBudget = shorter - prefixLen;
  let suffixLen = 0;
  while (
    suffixLen < suffixBudget &&
    a[a.length - 1 - suffixLen] === b[b.length - 1 - suffixLen]
  ) {
    suffixLen += 1;
  }

  return 1 - (prefixLen + suffixLen) / longer;
}
|
|
220
|
-
|
|
221
|
-
/**
 * Cached LLM client for background workers.
 *
 * Created lazily by `getLLMClient`; shut down (when it exposes `shutdown`)
 * and cleared by `resetPipelineState`.
 */
let llmClient: LLMClient | null = null;

/**
 * Cleanup function for the idle scheduler timer.
 *
 * Holds the value returned by `startIdleScheduler`; `null` while no
 * scheduler is running.
 */
let stopIdleScheduler: (() => void) | null = null;

/** Last seen session model ID — used for worker model discovery context. */
let lastSeenSessionModel: string | null = null;
|
|
229
|
-
|
|
230
|
-
// ---------------------------------------------------------------------------
|
|
231
|
-
// Model limits — hardcoded for known models, fallback for unknown
|
|
232
|
-
// ---------------------------------------------------------------------------
|
|
233
|
-
|
|
234
|
-
/** Static limits (and optional pricing) known for a model family. */
type ModelSpec = {
  context: number;
  output: number;
  /** Cache-read cost per token in USD (Anthropic: 10% of input price). */
  cacheReadCost?: number;
};

/**
 * Known model families, keyed by model-ID prefix.
 *
 * The 3.5-generation models are published with IDs of the form
 * `claude-3-5-sonnet-YYYYMMDD` / `claude-3-5-haiku-YYYYMMDD`, so those
 * prefixes are listed alongside the `claude-sonnet-3-5` / `claude-haiku-3-5`
 * spellings that were already present (kept for backward compatibility).
 */
const MODEL_SPECS: Record<string, ModelSpec> = {
  // Pricing: https://docs.anthropic.com/en/docs/about-claude/models
  // Cache-read = input_price / 1_000_000 * 0.1 (10% of input for Anthropic)
  "claude-opus-4": { context: 200_000, output: 32_000, cacheReadCost: 15 / 1_000_000 * 0.1 },
  "claude-sonnet-4": { context: 200_000, output: 16_000, cacheReadCost: 3 / 1_000_000 * 0.1 },
  "claude-sonnet-3-5": { context: 200_000, output: 8_192, cacheReadCost: 3 / 1_000_000 * 0.1 },
  "claude-3-5-sonnet": { context: 200_000, output: 8_192, cacheReadCost: 3 / 1_000_000 * 0.1 },
  "claude-haiku-3-5": { context: 200_000, output: 8_192, cacheReadCost: 0.80 / 1_000_000 * 0.1 },
  "claude-3-5-haiku": { context: 200_000, output: 8_192, cacheReadCost: 0.80 / 1_000_000 * 0.1 },
};

/** Spec used for any model that matches no known prefix (no pricing data). */
const DEFAULT_MODEL_SPEC: ModelSpec = { context: 200_000, output: 8_192 };

/**
 * Look up the spec for a model ID by prefix.
 *
 * Example: "claude-opus-4-20250514" → the "claude-opus-4" entry.
 * When several prefixes match, the longest one wins, so the result does not
 * depend on the declaration order of `MODEL_SPECS`.
 *
 * @param model - Full model ID as sent by the client.
 * @returns The matching spec, or `DEFAULT_MODEL_SPEC` when nothing matches.
 */
function getModelSpec(model: string): ModelSpec {
  let bestPrefix = "";
  let bestSpec: ModelSpec | undefined;

  for (const [prefix, spec] of Object.entries(MODEL_SPECS)) {
    if (prefix.length > bestPrefix.length && model.startsWith(prefix)) {
      bestPrefix = prefix;
      bestSpec = spec;
    }
  }

  return bestSpec ?? DEFAULT_MODEL_SPEC;
}
|
|
259
|
-
|
|
260
|
-
// ---------------------------------------------------------------------------
|
|
261
|
-
// Initialization
|
|
262
|
-
// ---------------------------------------------------------------------------
|
|
263
|
-
|
|
264
|
-
/**
 * Lazily perform the one-time pipeline setup.
 *
 * On the first call this loads the Lore config for `projectPath`, makes sure
 * the project exists in the DB, and records the project path. If a gateway
 * config is supplied and no idle scheduler is running yet, it also starts
 * the scheduler. Every later call returns immediately.
 *
 * @param projectPath - Project directory the gateway is serving.
 * @param config - Gateway configuration; when omitted the idle scheduler is
 *   not started.
 */
async function initIfNeeded(projectPath: string, config?: GatewayConfig): Promise<void> {
  if (initialized) return;

  await load(projectPath);
  ensureProject(projectPath);
  initialized = true;
  cachedProjectPath = projectPath;

  // Start the idle scheduler for background work (distillation, curation,
  // pruning, AGENTS.md export). Uses a 30s poll interval and fires for any
  // session whose lastRequestTime exceeds the idle timeout.
  if (config && !stopIdleScheduler) {
    const workerClient = getLLMClient(config);
    const modelForIdleWork =
      lastSeenSessionModel ??
      loreConfig().model?.modelID ??
      "claude-sonnet-4-20250514";

    const onIdle = buildIdleWorkHandler(
      projectPath,
      workerClient,
      config.upstreamAnthropic,
      () => resolveAuth(),
      modelForIdleWork,
    );

    stopIdleScheduler = startIdleScheduler(config, sessions, onIdle);
  }

  log.info(`gateway pipeline initialized: ${projectPath}`);
}
|
|
294
|
-
|
|
295
|
-
/**
 * Return the shared LLM client for background workers, building it on first
 * use.
 *
 * The client targets `config.upstreamAnthropic` and defaults to the model
 * from the Lore config (falling back to anthropic/claude-sonnet-4-20250514).
 * Unless `LORE_BATCH_DISABLED=1` is set, the client is wrapped in the batch
 * queue.
 *
 * @param config - Gateway configuration supplying the upstream base URL.
 * @returns The cached client instance.
 */
function getLLMClient(config: GatewayConfig): LLMClient {
  if (llmClient) return llmClient;

  const defaultModel = loreConfig().model ?? {
    providerID: "anthropic",
    modelID: "claude-sonnet-4-20250514",
  };

  const direct = createGatewayLLMClient(
    config.upstreamAnthropic,
    resolveAuth,
    defaultModel,
  );

  // Wrap with batch queue for 50% cost savings on non-urgent worker calls.
  // Enabled by default — disable via LORE_BATCH_DISABLED=1.
  llmClient =
    process.env.LORE_BATCH_DISABLED === "1"
      ? direct
      : createBatchLLMClient(
          direct,
          config.upstreamAnthropic,
          resolveAuth,
          defaultModel,
        );

  return llmClient;
}
|
|
324
|
-
|
|
325
|
-
// ---------------------------------------------------------------------------
|
|
326
|
-
// Session management helpers
|
|
327
|
-
// ---------------------------------------------------------------------------
|
|
328
|
-
|
|
329
|
-
/**
 * Fetch the state for `sessionID`, registering a fresh entry if none exists.
 *
 * Every call refreshes `lastRequestTime`. Entries that lack a `recallStore`
 * (upgrade from older session state) get an empty one.
 *
 * @param sessionID - Session identifier.
 * @param projectPath - Project path stored on a newly created session.
 * @returns The (possibly just created) session state object held in the
 *   `sessions` map.
 */
function getOrCreateSession(
  sessionID: string,
  projectPath: string,
): SessionState {
  const existing = sessions.get(sessionID);

  const state: SessionState = existing ?? {
    sessionID,
    projectPath,
    fingerprint: "",
    lastRequestTime: Date.now(),
    messageCount: 0,
    turnsSinceCuration: 0,
    recallStore: new Map(),
    cacheAnalytics: {
      lastRequestBody: null,
      lastRequestBodyLength: 0,
      lastCacheRead: 0,
      lastCacheCreation: 0,
      turnCount: 0,
      bustCount: 0,
    },
  };

  if (!existing) {
    sessions.set(sessionID, state);
  }

  state.lastRequestTime = Date.now();

  // Ensure recallStore exists (upgrade from older session state)
  if (!state.recallStore) {
    state.recallStore = new Map();
  }

  return state;
}
|
|
363
|
-
|
|
364
|
-
/**
 * Work out which session an incoming request belongs to.
 *
 * A fingerprint is computed from the request messages together with the
 * model and an auth-derived suffix. Among known sessions with the same
 * fingerprint, the one whose stored message count is closest to the
 * request's wins; the first such session in map order is kept on ties.
 * Sessions whose count is more than `MESSAGE_COUNT_PROXIMITY_THRESHOLD`
 * above the request's are skipped — such a drop indicates a fork that shares
 * the same opening message.
 *
 * @param req - The incoming gateway request.
 * @param _projectPath - Unused.
 * @returns The matched session ID with `isNew: false`, or a freshly
 *   generated ID with `isNew: true`.
 */
async function identifySession(
  req: GatewayRequest,
  _projectPath: string,
): Promise<{ sessionID: string; isNew: boolean }> {
  const rawMessages = req.messages.map(({ role, content }) => ({ role, content }));
  const cred = extractAuth(req.rawHeaders);
  const authSuffix = cred ? authFingerprint(cred) : "";
  const fingerprint = await fingerprintMessages(rawMessages, {
    model: req.model,
    authSuffix,
  });
  const msgCount = req.messages.length;

  // Track the closest candidate: same fingerprint + nearest message count.
  let closestSid: string | null = null;
  let closestGap = 0;

  for (const [sid, state] of sessions) {
    if (state.fingerprint !== fingerprint) continue;

    // Normal session: count grows by 2–6 per turn.
    // Fork: count drops significantly (parent at 600, fork at 300).
    const delta = msgCount - state.messageCount;
    if (delta < -MESSAGE_COUNT_PROXIMITY_THRESHOLD) continue;

    const gap = Math.abs(delta);
    if (closestSid === null || gap < closestGap) {
      closestSid = sid;
      closestGap = gap;
    }
  }

  if (closestSid !== null) {
    return { sessionID: closestSid, isNew: false };
  }

  // Nothing matched, so mint a new session.
  return { sessionID: generateSessionID(), isNew: true };
}
|
|
414
|
-
|
|
415
|
-
// ---------------------------------------------------------------------------
|
|
416
|
-
// Upstream forwarding
|
|
417
|
-
// ---------------------------------------------------------------------------
|
|
418
|
-
|
|
419
|
-
/** What `forwardToUpstream` hands back: the response plus the exact body sent. */
type UpstreamResult = {
  response: Response;
  /** The serialized JSON body sent to the upstream provider. */
  serializedBody: string;
};

/**
 * Send a request to the upstream provider (Anthropic or OpenAI).
 *
 * The upstream is inferred from the model name via `resolveUpstreamRoute`;
 * when that yields nothing, the request's own protocol and the configured
 * base URLs are used. If an interceptor is passed — or a module-level one is
 * active — it receives the request body, model, stream flag, and a thunk
 * performing the real `fetch`, and its result is used as the response.
 * Otherwise the `fetch` is issued directly.
 *
 * @param req - Gateway request to forward.
 * @param config - Gateway config providing the default upstream base URLs.
 * @param interceptor - Optional per-call interceptor; takes precedence over
 *   the module-level one.
 * @param cache - Cache options passed to the Anthropic request builder.
 * @returns The upstream response together with the serialized request body
 *   (used for cache analytics prefix comparison).
 */
async function forwardToUpstream(
  req: GatewayRequest,
  config: GatewayConfig,
  interceptor?: UpstreamInterceptor,
  cache?: AnthropicCacheOptions,
): Promise<UpstreamResult> {
  // Infer upstream from model name; fall back to protocol + env-var defaults.
  const route = resolveUpstreamRoute(req.model);
  const effectiveProtocol = route?.protocol ?? req.protocol;
  const useOpenAI = effectiveProtocol === "openai";
  const upstreamBase =
    route?.url ?? (useOpenAI ? config.upstreamOpenAI : config.upstreamAnthropic);

  let target: { url: string; headers: Record<string, string>; body: unknown };
  if (useOpenAI) {
    const built = buildOpenAIUpstreamRequest(req, upstreamBase);
    target = { url: built.url, headers: built.headers, body: built.body };
  } else {
    // The Anthropic builder returns a path that is appended to the base URL.
    const built = buildAnthropicRequest(req, cache);
    target = {
      url: `${upstreamBase}${built.url}`,
      headers: built.headers,
      body: built.body,
    };
  }

  const serializedBody = JSON.stringify(target.body);

  // Single definition of the real network call, shared by both paths below.
  const send = () =>
    fetch(target.url, {
      method: "POST",
      headers: target.headers,
      body: serializedBody,
    });

  const via = interceptor ?? activeInterceptor;
  const response = via
    ? await via(target.body, req.model, req.stream, send)
    : await send();

  return { response, serializedBody };
}
|
|
488
|
-
|
|
489
|
-
// ---------------------------------------------------------------------------
|
|
490
|
-
// Response builders
|
|
491
|
-
// ---------------------------------------------------------------------------
|
|
492
|
-
|
|
493
|
-
/**
 * Create a streaming SSE response from upstream with parallel accumulation.
 *
 * Upstream SSE events are parsed, passed through an accumulator, and whatever
 * the accumulator returns is forwarded to the client. Once the upstream
 * stream ends, `onComplete` receives the accumulated response.
 *
 * When `recallContext` is provided, uses a recall-aware accumulator that
 * transparently intercepts recall tool_use blocks:
 * - **Case 1 (recall-only)**: pauses client stream, executes recall, sends
 *   a follow-up request, and pipes the continuation into the same HTTP
 *   response stream.
 * - **Case 2 (mixed tools)**: suppresses recall blocks, stores the pending
 *   result for injection into the next request.
 *
 * In both recall cases a synthetic text block containing a recall marker is
 * emitted to the client, and `onComplete` receives the first upstream
 * response with the recall block replaced by the marker.
 *
 * @param upstreamResponse - Upstream SSE response; its body is read to
 *   completion.
 * @param onComplete - Called once after the client stream has been closed.
 *   Not called when the pipeline throws.
 * @param recallContext - Enables recall interception; carries the request,
 *   config, session state and cache options needed for the follow-up call.
 * @returns A `Response` with status 200 and `text/event-stream` headers,
 *   regardless of the upstream status.
 */
function buildStreamingResponse(
  upstreamResponse: Response,
  onComplete: (response: GatewayResponse) => void,
  recallContext?: {
    modifiedReq: GatewayRequest;
    config: GatewayConfig;
    sessionState: SessionState;
    cacheOptions: AnthropicCacheOptions;
  },
): Response {
  // The recall-aware accumulator is only created when recall is enabled;
  // otherwise a plain accumulator is used.
  const recallAccum = recallContext
    ? createRecallAwareAccumulator(RECALL_TOOL_NAME)
    : null;
  const accumulator: StreamAccumulator = recallAccum ?? createStreamAccumulator();
  const encoder = new TextEncoder();

  const stream = new ReadableStream({
    async start(controller) {
      try {
        // Parse and forward upstream SSE events
        // NOTE(review): `body!` assumes the upstream response has a body; a
        // null body throws here and is reported through the catch below.
        const reader = upstreamResponse.body!.getReader();
        for await (const { event, data } of parseSSEStream(reader)) {
          // The accumulator decides what (if anything) reaches the client.
          const forwarded = accumulator.processEvent(event, data);
          if (forwarded) {
            controller.enqueue(encoder.encode(forwarded));
          }
        }

        // --- Recall interception (streaming) ---
        if (recallAccum?.hasRecall()) {
          const resp = recallAccum.getResponse();
          const recallBlock = findRecallToolUse(resp);

          if (recallBlock && recallContext) {
            const { result, input } = await executeRecall(
              recallBlock,
              recallContext.sessionState.projectPath,
              recallContext.sessionState.sessionID,
            );

            // Scope defaults to "all" when the tool input omits it.
            const scope = input.scope ?? "all";

            // Store recall result for marker round-trip expansion
            const storeKey = recallStoreKey(input.query, scope);
            const position = resp.content.indexOf(recallBlock);
            recallContext.sessionState.recallStore.set(storeKey, {
              toolUseId: recallBlock.id,
              input,
              position,
              result,
            });

            // Emit marker text block in place of the suppressed recall block
            const markerText = buildRecallMarker(input.query, scope);
            // The marker takes the next content-block index on the client side.
            const markerIdx = recallAccum.clientBlockCount();
            const syntheticMarker = [
              formatSSEEvent("content_block_start", JSON.stringify({
                type: "content_block_start",
                index: markerIdx,
                content_block: { type: "text", text: "" },
              })),
              formatSSEEvent("content_block_delta", JSON.stringify({
                type: "content_block_delta",
                index: markerIdx,
                delta: { type: "text_delta", text: markerText },
              })),
              formatSSEEvent("content_block_stop", JSON.stringify({
                type: "content_block_stop",
                index: markerIdx,
              })),
            ].join("");
            controller.enqueue(encoder.encode(syntheticMarker));

            // Case 2 (mixed tools): no follow-up request is made; the recall
            // result stays in recallStore.
            if (recallAccum.hasOtherTools()) {
              // Forward held-back events, close stream
              log.info(
                `recall (stream, mixed): stored result for session ` +
                  `${recallContext.sessionState.sessionID.slice(0, 16)}`,
              );

              const heldBack = recallAccum.heldBackEvents();
              if (heldBack) {
                controller.enqueue(encoder.encode(heldBack));
              }

              controller.close();

              // Post-stream: store response with marker text (not raw tool_use)
              const markerResp = replaceRecallWithMarker(resp);
              onComplete(markerResp);
              return;
            }

            // Recall-only — send follow-up, pipe continuation
            log.info(
              `recall (stream, only): executing follow-up for session ` +
                `${recallContext.sessionState.sessionID.slice(0, 16)}`,
            );

            const followUp = buildRecallFollowUp(
              recallContext.modifiedReq,
              resp,
              result,
              recallBlock,
            );
            let followUpResponse: Response;
            try {
              // No per-call interceptor is passed, so forwardToUpstream falls
              // back to the module-level interceptor when one is active.
              ({ response: followUpResponse } = await forwardToUpstream(
                followUp,
                recallContext.config,
                undefined,
                recallContext.cacheOptions,
              ));
            } catch (fetchErr) {
              // Network-level failure: finish the client stream with the
              // held-back events instead of erroring it.
              log.error(
                `recall follow-up fetch error for session ${recallContext.sessionState.sessionID.slice(0, 16)}:`,
                fetchErr,
              );
              const heldBack = recallAccum.heldBackEvents();
              if (heldBack) {
                controller.enqueue(encoder.encode(heldBack));
              }
              controller.close();
              const markerResp = replaceRecallWithMarker(resp);
              onComplete(markerResp);
              return;
            }

            log.info(
              `recall follow-up response: status=${followUpResponse.status} ` +
                `hasBody=${!!followUpResponse.body} session=${recallContext.sessionState.sessionID.slice(0, 16)}`,
            );

            if (!followUpResponse.ok) {
              // Only the first 500 characters of the error body are logged.
              const errorBody = await followUpResponse.text();
              log.error(
                `recall follow-up upstream error: ${followUpResponse.status} ${errorBody.slice(0, 500)}`,
              );
              // Forward the held-back events to close the stream gracefully
              const heldBack = recallAccum.heldBackEvents();
              if (heldBack) {
                controller.enqueue(encoder.encode(heldBack));
              }
              controller.close();
              const markerResp = replaceRecallWithMarker(resp);
              onComplete(markerResp);
              return;
            }

            // Pipe the continuation stream into the same HTTP response.
            // Suppress message_start (client already has one) and re-index
            // content blocks to continue from where the client left off.
            // +1 accounts for the synthetic marker block.
            const blockOffset = recallAccum.clientBlockCount() + 1;
            // NOTE(review): `body!` assumes an OK follow-up always has a body.
            const contReader = followUpResponse.body!.getReader();
            // Counts every continuation event, including suppressed ones.
            let contEventCount = 0;

            for await (const { event: contEvent, data: contData } of parseSSEStream(contReader)) {
              contEventCount++;
              if (contEvent === "message_start") {
                // Suppress — client already received one
                continue;
              }

              // Re-index content block events
              if (
                contEvent === "content_block_start" ||
                contEvent === "content_block_delta" ||
                contEvent === "content_block_stop"
              ) {
                try {
                  const parsed = JSON.parse(contData) as Record<string, unknown>;
                  if (typeof parsed.index === "number") {
                    parsed.index = (parsed.index as number) + blockOffset;
                    const adjusted = formatSSEEvent(
                      contEvent,
                      JSON.stringify(parsed),
                    );
                    controller.enqueue(encoder.encode(adjusted));
                    continue;
                  }
                } catch {
                  // Fall through to forward as-is
                }
              }

              // Forward message_delta, message_stop, and other events as-is
              const forwarded = formatSSEEvent(contEvent, contData);
              controller.enqueue(encoder.encode(forwarded));
            }

            log.info(
              `recall follow-up stream complete: ${contEventCount} events piped, ` +
                `session=${recallContext.sessionState.sessionID.slice(0, 16)}`,
            );

            controller.close();

            // Post-stream: store response with marker text for temporal storage.
            // The marker replaces the raw tool_use, so future turns can
            // round-trip the marker ↔ tool_use/tool_result correctly.
            // NOTE(review): the continuation events are piped to the client but
            // not accumulated; `onComplete` only sees the first response.
            const markerResp = replaceRecallWithMarker(resp);
            onComplete(markerResp);
            return;
          }
        }

        // No recall — normal path
        controller.close();
        const response = accumulator.getResponse();
        onComplete(response);
      } catch (err) {
        // Any failure above errors the client stream; onComplete is skipped.
        log.error("streaming pipeline error:", err);
        controller.error(err);
      }
    },
  });

  return new Response(stream, {
    status: 200,
    headers: {
      "content-type": "text/event-stream",
      "cache-control": "no-cache",
      connection: "keep-alive",
    },
  });
}
|
|
731
|
-
|
|
732
|
-
/**
 * Convert a non-streaming upstream JSON response into a `GatewayResponse`.
 *
 * Only `text`, `thinking`, and `tool_use` content blocks are carried over;
 * any other block type is dropped. Missing scalar fields become empty
 * strings, a missing stop reason becomes "end_turn", and missing input or
 * output token counts become 0.
 *
 * @param upstreamResponse - Upstream response whose body is parsed as JSON.
 * @returns The normalized gateway response.
 */
async function accumulateNonStreamResponse(
  upstreamResponse: Response,
): Promise<GatewayResponse> {
  const payload = (await upstreamResponse.json()) as Record<string, unknown>;

  const content: GatewayContentBlock[] = [];
  const rawBlocks = payload.content as Array<Record<string, unknown>> | undefined;

  for (const block of rawBlocks ? rawBlocks : []) {
    if (block.type === "text") {
      content.push({ type: "text", text: String(block.text ?? "") });
    } else if (block.type === "thinking") {
      // The signature key is only present when upstream supplied one.
      const signaturePart = block.signature
        ? { signature: String(block.signature) }
        : {};
      content.push({
        type: "thinking",
        thinking: String(block.thinking ?? ""),
        ...signaturePart,
      });
    } else if (block.type === "tool_use") {
      content.push({
        type: "tool_use",
        id: String(block.id ?? ""),
        name: String(block.name ?? ""),
        input: block.input,
      });
    }
  }

  const usage = payload.usage as Record<string, number> | undefined;
  const stopReason = String((payload.stop_reason as string) ?? "end_turn");

  return {
    id: String(payload.id ?? ""),
    model: String(payload.model ?? ""),
    content,
    stopReason,
    usage: {
      inputTokens: usage?.input_tokens ?? 0,
      outputTokens: usage?.output_tokens ?? 0,
      cacheReadInputTokens: usage?.cache_read_input_tokens,
      cacheCreationInputTokens: usage?.cache_creation_input_tokens,
    },
  };
}
|
|
786
|
-
|
|
787
|
-
/**
 * Accumulate a streaming upstream SSE response into a GatewayResponse.
 *
 * Used for OpenAI requests where we need to convert the accumulated
 * response to OpenAI format before returning to the client.
 *
 * Every parsed SSE event is fed to a fresh stream accumulator; the
 * accumulator's final response is returned once the stream is exhausted.
 *
 * @param upstreamResponse Upstream HTTP response whose body is an SSE stream.
 * @returns The response assembled by the accumulator.
 * @throws TypeError when the upstream response has no body to read.
 */
async function accumulateStreamResponse(
  upstreamResponse: Response,
): Promise<GatewayResponse> {
  // Fail with a clear message instead of a property-access error on a null
  // body (previously hidden behind a non-null assertion).
  const body = upstreamResponse.body;
  if (body == null) {
    throw new TypeError(
      "accumulateStreamResponse: upstream response has no body to read",
    );
  }

  const accumulator = createStreamAccumulator();
  const reader = body.getReader();

  for await (const { event, data } of parseSSEStream(reader)) {
    accumulator.processEvent(event, data);
  }

  return accumulator.getResponse();
}
|
|
805
|
-
|
|
806
|
-
/**
 * Wrap a GatewayResponse in a plain (non-streaming) HTTP 200 reply.
 *
 * The gateway response is first converted with
 * `buildAnthropicNonStreamResponse`, then serialised as the JSON body.
 *
 * @param resp Gateway response to send to the client.
 * @returns A `Response` with status 200 and `content-type: application/json`.
 */
function nonStreamHttpResponse(resp: GatewayResponse): Response {
  const payload = JSON.stringify(buildAnthropicNonStreamResponse(resp));
  const headers = { "content-type": "application/json" };
  return new Response(payload, { status: 200, headers });
}
|
|
816
|
-
|
|
817
|
-
/**
 * Wrap a GatewayResponse in an SSE (streaming) HTTP 200 reply.
 *
 * Only `text` blocks are represented: their text is concatenated in order and
 * handed to `buildSSETextResponse` together with the input/output token
 * counts. Non-text blocks and cache token counters are not forwarded.
 *
 * @param resp Gateway response to send to the client.
 * @returns A `Response` with status 200 and SSE headers.
 */
function streamHttpResponse(resp: GatewayResponse): Response {
  // Collect the text of every text block, preserving block order.
  const fullText = resp.content
    .flatMap((block) => (block.type === "text" ? [block.text] : []))
    .join("");

  const { inputTokens, outputTokens } = resp.usage;
  const sseBody = buildSSETextResponse(resp.id, resp.model, fullText, {
    inputTokens,
    outputTokens,
  });

  const headers = {
    "content-type": "text/event-stream",
    "cache-control": "no-cache",
    connection: "keep-alive",
  };
  return new Response(sseBody, { status: 200, headers });
}
|
|
841
|
-
|
|
842
|
-
// ---------------------------------------------------------------------------
|
|
843
|
-
// Post-response processing
|
|
844
|
-
// ---------------------------------------------------------------------------
|
|
845
|
-
|
|
846
|
-
/**
 * Post-turn bookkeeping, run once a response has been produced for a turn.
 *
 * In order: feeds the real input-token total to `calibrate`, records cache
 * analytics (only when the serialized upstream body is available), persists
 * the most recent user message and the assistant reply through
 * `temporal.store`, increments the session's curation turn counter, and
 * schedules background work.
 *
 * Never throws: any failure in these steps is logged and swallowed.
 *
 * @param req          The request for this turn; its messages supply the user message.
 * @param resp         The response whose content and usage are recorded.
 * @param sessionState Session state; `turnsSinceCuration` is mutated.
 * @param config       Gateway configuration, passed on to background scheduling.
 * @param requestBody  Serialized JSON body sent upstream — for cache prefix comparison.
 */
function postResponse(
  req: GatewayRequest,
  resp: GatewayResponse,
  sessionState: SessionState,
  config: GatewayConfig,
  /** Serialized JSON body sent upstream — for cache prefix comparison. */
  requestBody?: string,
): void {
  const { sessionID, projectPath } = sessionState;

  try {
    const { usage } = resp;

    // Calibration: uncached + cache-read + cache-creation input tokens.
    const uncachedInput = usage.inputTokens ?? 0;
    const cacheReadInput = usage.cacheReadInputTokens ?? 0;
    const cacheWriteInput = usage.cacheCreationInputTokens ?? 0;
    calibrate(
      uncachedInput + cacheReadInput + cacheWriteInput,
      sessionID,
      getLastTransformedCount(sessionID),
    );

    // Cache analytics need the exact body that went upstream.
    if (requestBody) {
      analyzeCacheTurn(sessionState.cacheAnalytics, requestBody, usage, sessionID);
    }

    // Temporal storage: convert the turn's messages to Lore form first.
    const turnMessages = gatewayMessagesToLore(req.messages, sessionID);
    resolveToolResults(turnMessages);

    // Persist only the last user message of the request.
    const latestUser = [...turnMessages]
      .reverse()
      .find((candidate) => candidate.info.role === "user");
    if (latestUser) {
      temporal.store({
        projectPath,
        info: latestUser.info,
        parts: latestUser.parts,
      });
    }

    // Persist the assistant reply, annotated with usage and model.
    const [assistantMsg] = gatewayMessagesToLore(
      [{ role: "assistant", content: resp.content }],
      sessionID,
    );
    updateAssistantMessageTokens(assistantMsg, usage, resp.model);
    temporal.store({
      projectPath,
      info: assistantMsg.info,
      parts: assistantMsg.parts,
    });

    // One more turn has passed since the last curation run.
    sessionState.turnsSinceCuration =
      (sessionState.turnsSinceCuration ?? 0) + 1;

    // Fire-and-forget distillation / curation.
    scheduleBackgroundWork(sessionState, config);
  } catch (e) {
    log.error("post-response processing failed:", e);
  }
}
|
|
917
|
-
|
|
918
|
-
/**
 * Kick off background distillation and curation without awaiting them.
 *
 * Three independent triggers are evaluated in order:
 *  1. urgent distillation, when `needsUrgentDistillation()` reports it
 *     (run with `force` and `urgent` set);
 *  2. incremental distillation, when the undistilled message count has
 *     reached `distillation.maxSegment`;
 *  3. curation, when knowledge and `curator.onIdle` are enabled and enough
 *     turns have elapsed since the last curation.
 *
 * Rejections of the started promises are logged, never propagated.
 *
 * NOTE(review): triggers 1 and 2 can both fire in the same call, starting two
 * distillation runs for one session — confirm `distillation.run` tolerates that.
 * NOTE(review): `turnsSinceCuration` is only reset when curation resolves, so
 * turns arriving meanwhile may start further runs — confirm this is intended.
 *
 * @param sessionState Session whose ID/project path identify the work; its
 *                     `turnsSinceCuration` is reset after a successful curation.
 * @param config       Gateway configuration used to obtain the LLM client.
 */
function scheduleBackgroundWork(
  sessionState: SessionState,
  config: GatewayConfig,
): void {
  const { sessionID, projectPath } = sessionState;
  const llm = getLLMClient(config);
  const cfg = loreConfig();
  const model = getWorkerModel();

  const reportDistillationFailure = (e: unknown) =>
    log.error("background distillation failed:", e);

  // 1. Urgent distillation: flagged as needed right now, so bypass batching.
  if (needsUrgentDistillation()) {
    void distillation
      .run({ llm, projectPath, sessionID, model, force: true, urgent: true })
      .catch(reportDistillationFailure);
  }

  // 2. Incremental distillation once the backlog reaches the segment limit.
  const pending = temporal.undistilledCount(projectPath, sessionID);
  if (pending >= cfg.distillation.maxSegment) {
    log.info(
      `incremental distillation: ${pending} undistilled messages in ${sessionID.slice(0, 16)}`,
    );
    void distillation
      .run({ llm, projectPath, sessionID, model })
      .catch(reportDistillationFailure);
  }

  // 3. Periodic curation, only with the knowledge system switched on.
  const curationDue =
    cfg.knowledge.enabled &&
    cfg.curator.onIdle &&
    sessionState.turnsSinceCuration >= cfg.curator.afterTurns;
  if (!curationDue) return;

  void curator
    .run({ llm, projectPath, sessionID, model })
    .then(() => {
      sessionState.turnsSinceCuration = 0;
      // Curation changes knowledge entries, so the cached LTM text is stale.
      ltmSessionCache.delete(sessionID);
    })
    .catch((e) => log.error("background curation failed:", e));
}
|
|
973
|
-
|
|
974
|
-
// ---------------------------------------------------------------------------
|
|
975
|
-
// Case 1: Compaction interception
|
|
976
|
-
// ---------------------------------------------------------------------------
|
|
977
|
-
|
|
978
|
-
/**
 * Answer a compaction request locally instead of forwarding it upstream.
 *
 * Steps: resolve the project and session, force an urgent distillation of the
 * session, load the resulting distillation summaries, gather the previous
 * summary and (when enabled) project knowledge, build the compaction prompt,
 * ask the LLM for a summary, and return it as a streaming or non-streaming
 * HTTP response depending on `req.stream`.
 *
 * @param req    The intercepted compaction request.
 * @param config Gateway configuration.
 * @returns The HTTP response carrying the compaction summary.
 */
async function handleCompaction(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  // Resolve project + session; a cached project path takes precedence.
  const projectPath =
    cachedProjectPath ?? getProjectPath(req.system, req.rawHeaders);
  await initIfNeeded(projectPath, config);

  const { sessionID } = await identifySession(req, projectPath);
  // Called for its get-or-create effect; the returned state is not used here.
  getOrCreateSession(sessionID, projectPath);
  const llm = getLLMClient(config);

  log.info(`compaction intercepted for session ${sessionID.slice(0, 16)}`);

  // Step 1 — distill everything still pending. Urgent: the client is
  // blocked waiting on this compaction response.
  const model = getWorkerModel();
  await distillation.run({
    llm,
    projectPath,
    sessionID,
    model,
    force: true,
    urgent: true,
  });

  // Step 2 — summaries produced so far for this session.
  const distillations = distillation.loadForSession(projectPath, sessionID);

  // Step 3 — any earlier summary carried in the request.
  const previousSummary = extractPreviousSummary(req);

  // Step 4 — knowledge block (empty when disabled or no entries).
  const cfg = loreConfig();
  const entries = cfg.knowledge.enabled
    ? ltm.forProject(projectPath, cfg.crossProject)
    : [];
  let knowledge = "";
  if (entries.length) {
    knowledge = formatKnowledge(
      entries.map(({ category, title, content }) => ({
        category,
        title,
        content,
      })),
    );
  }

  // Step 5 — the compaction prompt itself.
  const compactPrompt = buildCompactPrompt({
    hasDistillations: distillations.length > 0,
    knowledge,
    previousSummary,
  });

  // Step 6 — context made of the pre-computed summary chunks.
  let context = "";
  if (distillations.length > 0) {
    const preamble =
      `## Lore Pre-computed Session Summaries\n\n` +
      `The following ${distillations.length} summary chunk(s) were pre-computed ` +
      `from the conversation history. Use these as the authoritative source.\n\n`;
    const chunks = distillations.map((d, i) => {
      const tag = d.generation > 0 ? " (consolidated)" : "";
      return `### Chunk ${i + 1}${tag}\n${d.observations}`;
    });
    context = preamble + chunks.join("\n\n");
  }

  // Step 7 — generate the summary. The prompt is sent as the system text and
  // again at the end of the user content (after the context, if any).
  let userContent = compactPrompt;
  if (context) {
    userContent = `${context}\n\n---\n\n${compactPrompt}`;
  }

  const summaryText = await llm.prompt(compactPrompt, userContent, {
    model: cfg.model,
    workerID: "lore-compact",
    urgent: true, // Client is blocking on this response
  });
  const summary = summaryText ?? "(Compaction failed — no summary generated.)";

  // Step 8 — package the summary in the shape the client asked for.
  const resp = buildCompactionResponse(sessionID, summary, req.model);
  return req.stream ? streamHttpResponse(resp) : nonStreamHttpResponse(resp);
}
|
|
1068
|
-
|
|
1069
|
-
// ---------------------------------------------------------------------------
|
|
1070
|
-
// Case 2: Title/summary passthrough
|
|
1071
|
-
// ---------------------------------------------------------------------------
|
|
1072
|
-
|
|
1073
|
-
/**
 * Forward a title/summary request upstream and relay the reply untouched.
 *
 * Streaming requests with a body are piped through as-is; everything else is
 * buffered as text and returned with the upstream status code.
 *
 * Both branches forward the upstream `content-type` header. The buffered
 * branch previously hard-coded `application/json`, mislabelling any non-JSON
 * upstream payload; it now only falls back to JSON when upstream sent no
 * content type, mirroring the streaming branch's fallback.
 *
 * @param req    The request to forward.
 * @param config Gateway configuration.
 * @returns The relayed upstream response.
 */
async function handlePassthrough(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  const { response: upstreamResponse } = await forwardToUpstream(req, config);
  const upstreamContentType = upstreamResponse.headers.get("content-type");

  // For streaming, pipe through unchanged
  if (req.stream && upstreamResponse.body) {
    return new Response(upstreamResponse.body, {
      status: upstreamResponse.status,
      headers: {
        "content-type": upstreamContentType ?? "text/event-stream",
      },
    });
  }

  // For non-streaming, pass the buffered body through as-is
  const body = await upstreamResponse.text();
  return new Response(body, {
    status: upstreamResponse.status,
    headers: {
      "content-type": upstreamContentType ?? "application/json",
    },
  });
}
|
|
1100
|
-
|
|
1101
|
-
// ---------------------------------------------------------------------------
|
|
1102
|
-
// Case 3: Normal conversation turn — full pipeline
|
|
1103
|
-
// ---------------------------------------------------------------------------
|
|
1104
|
-
|
|
1105
|
-
/**
 * Run the full pipeline for a normal conversation turn.
 *
 * Sequence: resolve project and session, bind auth credentials, expand recall
 * markers left by earlier turns, set model limits and the layer-0 cap, handle
 * idle-resume, pick the LTM text to inject, gradient-transform the messages,
 * optionally inject the recall tool, forward upstream with prompt caching,
 * then either stream the reply or accumulate it — intercepting a recall
 * `tool_use` in the non-streaming case.
 *
 * @param req    The incoming gateway request.
 * @param config Gateway configuration.
 * @returns The HTTP response for the client. Upstream errors are relayed with
 *          their status and body.
 */
async function handleConversationTurn(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  // ---- 1. Project path & init ----
  const projectPath = getProjectPath(req.system, req.rawHeaders);
  await initIfNeeded(projectPath, config);

  // ---- 2. Remember auth credentials for background workers ----
  const cred = extractAuth(req.rawHeaders);
  if (cred) setLastSeenAuth(cred);

  // ---- 3. Session identification ----
  const { sessionID, isNew } = await identifySession(req, projectPath);
  const sessionState = getOrCreateSession(sessionID, projectPath);

  // Tie the credential to this session as well.
  if (cred) setSessionAuth(sessionID, cred);

  // New sessions get a fingerprint for later correlation.
  if (isNew) {
    sessionState.fingerprint = await fingerprintMessages(
      req.messages.map(({ role, content }) => ({ role, content })),
      {
        model: req.model,
        authSuffix: cred ? authFingerprint(cred) : "",
      },
    );
  }

  // Message count is refreshed every turn (used for proximity matching).
  sessionState.messageCount = req.messages.length;

  // Remember the model for worker model discovery.
  lastSeenSessionModel = req.model;

  // ---- Restore recall markers from earlier turns ----
  // Marker text blocks in assistant messages are turned back into
  // tool_use + tool_result pairs before the request goes upstream.
  const recallStore = sessionState.recallStore;
  if (recallStore.size > 0) {
    if (expandRecallMarkers(req, recallStore)) {
      log.info(
        `expanded recall markers for session ${sessionID.slice(0, 16)}`,
      );
    }
    // Drop store entries whose markers are no longer in the request.
    cleanupRecallStore(req, recallStore);
  }

  log.info(
    `turn: session=${sessionID.slice(0, 16)} messages=${req.messages.length} model=${req.model} stream=${req.stream} new=${isNew}`,
  );

  // ---- 4. Model limits ----
  const modelSpec = getModelSpec(req.model);
  setModelLimits({ context: modelSpec.context, output: modelSpec.output });

  // Layer-0 cap: explicit config wins, then the cost formula, else untouched.
  const cfg = loreConfig();
  const { budget } = cfg;
  if (budget.maxLayer0Tokens !== undefined) {
    setMaxLayer0Tokens(budget.maxLayer0Tokens);
  } else if (modelSpec.cacheReadCost && budget.targetCacheReadCostPerTurn > 0) {
    setMaxLayer0Tokens(
      computeLayer0Cap(
        budget.targetCacheReadCostPerTurn,
        modelSpec.cacheReadCost,
      ),
    );
  }

  // ---- 5. Cold-cache idle-resume ----
  const idleResult = onIdleResume(sessionID, cfg.idleResumeMinutes * 60_000);
  if (idleResult.triggered) {
    ltmSessionCache.delete(sessionID);
    log.info(
      `session idle ${Math.round(idleResult.idleMs / 60_000)}min — refreshing caches`,
    );
  }

  // ---- 6. LTM injection (kept apart from the host system prompt) ----
  let ltmText: string | undefined;
  if (!cfg.knowledge.enabled) {
    setLtmTokens(0, sessionID);
    consumeCameOutOfIdle(sessionID);
  } else {
    try {
      let cached = ltmSessionCache.get(sessionID);

      if (!cached) {
        // Cache miss: select entries within budget and format them.
        const ltmBudget = getLtmBudget(budget.ltm);
        const entries = ltm.forSession(projectPath, sessionID, ltmBudget);
        const formatted = entries.length
          ? formatKnowledge(
              entries.map(({ category, title, content }) => ({
                category,
                title,
                content,
              })),
              ltmBudget,
            )
          : "";

        if (formatted) {
          // Token count is estimated at one token per three characters.
          cached = { formatted, tokenCount: Math.ceil(formatted.length / 3) };
          ltmSessionCache.set(sessionID, cached);
        }
      }

      if (!cached) {
        setLtmTokens(0, sessionID);
      } else {
        // Content-diff pinning: keep the already-pinned text unless the fresh
        // text differs by 5% or more, so small re-rankings do not change the
        // injected prefix and bust the prompt cache.
        const pinned = ltmPinnedText.get(sessionID);
        let active = cached;
        if (pinned && textDiffRatio(pinned.formatted, cached.formatted) < 0.05) {
          active = pinned;
        } else {
          // First injection or a substantial change — pin the new text.
          ltmPinnedText.set(sessionID, cached);
        }
        ltmText = active.formatted;
        setLtmTokens(active.tokenCount, sessionID);
      }
    } catch (e) {
      log.error("LTM injection failed:", e);
      setLtmTokens(0, sessionID);
    } finally {
      consumeCameOutOfIdle(sessionID);
    }
  }

  // ---- 7. Gradient transform ----
  const loreMessages = gatewayMessagesToLore(req.messages, sessionID);
  resolveToolResults(loreMessages);

  const transformed = transform({
    messages: loreMessages,
    projectPath,
    sessionID,
  });

  // Trim trailing non-user messages that carry no tool parts, so the request
  // does not end on a pure-text assistant message (prefill error).
  for (
    let tail = transformed.messages.at(-1);
    tail !== undefined && tail.info.role !== "user";
    tail = transformed.messages.at(-1)
  ) {
    if (tail.parts.some((part) => part.type === "tool")) break;
    transformed.messages.pop();
  }

  // ---- 8. Build the modified request ----
  // Convert back to gateway messages; removeOrphanedToolResults is the safety
  // net for tool_result blocks whose tool_use did not survive the transform.
  const transformedMessages = loreMessagesToGateway(transformed.messages);
  removeOrphanedToolResults(transformedMessages);

  // The host system prompt is left untouched; LTM travels separately through
  // the cache options below.
  const modifiedReq: GatewayRequest = {
    ...req,
    messages: transformedMessages,
  };

  // ---- 8b. Recall tool injection ----
  // Only for requests that already carry tools and whose client has not
  // supplied its own recall tool.
  if (modifiedReq.tools.length > 0 && !clientHasRecallTool(modifiedReq.tools)) {
    // With knowledge enabled, a git reminder is appended to the tool
    // description so it sits with the tools rather than in the system prompt.
    const gitReminder =
      "\n\nWhen making git commits, always check if .lore.md " +
      "has unstaged changes and include it in the commit. " +
      "This file contains shared project knowledge managed " +
      "by lore and must be version-controlled.";
    const recallTool = cfg.knowledge.enabled
      ? {
          ...RECALL_GATEWAY_TOOL,
          description: RECALL_GATEWAY_TOOL.description + gitReminder,
        }
      : RECALL_GATEWAY_TOOL;
    modifiedReq.tools = [...modifiedReq.tools, recallTool];
  }

  // ---- 9. Forward upstream with prompt caching ----
  // System prompt uses a 1h TTL, LTM goes in as its own system block, and
  // caching is switched on for tools and for the conversation.
  const cacheOptions: AnthropicCacheOptions = {
    systemTTL: "1h",
    ltmSystem: ltmText,
    cacheTools: true,
    cacheConversation: true,
  };
  const upstream = await forwardToUpstream(
    modifiedReq,
    config,
    undefined,
    cacheOptions,
  );
  const upstreamResponse = upstream.response;
  const requestBody = upstream.serializedBody;

  // Upstream failure: log a truncated body and relay status + body.
  if (!upstreamResponse.ok) {
    const errorBody = await upstreamResponse.text();
    log.error(
      `upstream error: ${upstreamResponse.status} ${errorBody.slice(0, 500)}`,
    );
    return new Response(errorBody, {
      status: upstreamResponse.status,
      headers: { "content-type": "application/json" },
    });
  }

  // Streaming: hand over to the streaming builder, passing recall context
  // only when the outgoing request actually contains the recall tool.
  if (req.stream && upstreamResponse.body) {
    const hasRecallTool = modifiedReq.tools.some(
      (tool) => tool.name === RECALL_TOOL_NAME,
    );
    return buildStreamingResponse(
      upstreamResponse,
      (streamed) =>
        postResponse(req, streamed, sessionState, config, requestBody),
      hasRecallTool
        ? { modifiedReq, config, sessionState, cacheOptions }
        : undefined,
    );
  }

  // Shared epilogue for every non-streaming exit: bookkeeping, then reply.
  const respond = (finalResp: GatewayResponse): Response => {
    postResponse(req, finalResp, sessionState, config, requestBody);
    return nonStreamHttpResponse(finalResp);
  };

  // Non-streaming path.
  const resp = await accumulateNonStreamResponse(upstreamResponse);

  // No recall tool_use — plain reply.
  if (!hasRecallToolUse(resp)) {
    return respond(resp);
  }

  // ---- Recall interception (non-streaming) ----
  const recallBlock = findRecallToolUse(resp)!;
  const { result: recallResult, input } = await executeRecall(
    recallBlock,
    sessionState.projectPath,
    sessionState.sessionID,
  );

  // Keep the result so the marker can be expanded again on later turns.
  const storeKey = recallStoreKey(input.query, input.scope ?? "all");
  const position = resp.content.indexOf(recallBlock);
  sessionState.recallStore.set(storeKey, {
    toolUseId: recallBlock.id,
    input,
    position,
    result: recallResult,
  });

  // Variant of the response with the recall tool_use replaced by marker text.
  const markerResp = replaceRecallWithMarker(resp);

  // Other tools were requested too: return the marker variant as-is.
  if (hasOtherToolUse(resp)) {
    log.info(
      `recall (non-stream, mixed): stored result for session ${sessionState.sessionID.slice(0, 16)}`,
    );
    return respond(markerResp);
  }

  // Recall was the only tool: issue a follow-up request with the result.
  log.info(
    `recall (non-stream, only): executing follow-up for session ${sessionState.sessionID.slice(0, 16)}`,
  );
  const followUp = buildRecallFollowUp(
    modifiedReq,
    resp,
    recallResult,
    recallBlock,
  );
  const { response: followUpResponse } = await forwardToUpstream(
    followUp,
    config,
    undefined,
    cacheOptions,
  );

  if (!followUpResponse.ok) {
    const errorBody = await followUpResponse.text();
    log.error(
      `recall follow-up upstream error: ${followUpResponse.status} ${errorBody.slice(0, 500)}`,
    );
    // Follow-up failed: fall back to the marker variant, no continuation.
    return respond(markerResp);
  }

  const continuationResp = await accumulateNonStreamResponse(followUpResponse);

  // Fold the first request's usage into the continuation's totals.
  const firstUsage = resp.usage;
  const totalUsage = continuationResp.usage;
  totalUsage.inputTokens += firstUsage.inputTokens;
  totalUsage.outputTokens += firstUsage.outputTokens;
  if (firstUsage.cacheReadInputTokens) {
    totalUsage.cacheReadInputTokens =
      (totalUsage.cacheReadInputTokens ?? 0) + firstUsage.cacheReadInputTokens;
  }
  if (firstUsage.cacheCreationInputTokens) {
    totalUsage.cacheCreationInputTokens =
      (totalUsage.cacheCreationInputTokens ?? 0) +
      firstUsage.cacheCreationInputTokens;
  }

  return respond(continuationResp);
}
|
|
1434
|
-
|
|
1435
|
-
// ---------------------------------------------------------------------------
|
|
1436
|
-
// Lore message → Gateway message conversion
|
|
1437
|
-
// ---------------------------------------------------------------------------
|
|
1438
|
-
|
|
1439
|
-
/**
|
|
1440
|
-
* Convert transformed Lore messages back to gateway message format.
|
|
1441
|
-
*
|
|
1442
|
-
* This reverses `gatewayMessagesToLore` after gradient transform has
|
|
1443
|
-
* potentially trimmed/reordered messages.
|
|
1444
|
-
*
|
|
1445
|
-
* Completed/error tool parts on assistant messages produce BOTH a `tool_use`
|
|
1446
|
-
* block on the assistant AND a corresponding `tool_result` block injected at
|
|
1447
|
-
* the start of the following user message. This makes the conversion
|
|
1448
|
-
* self-contained: tool pairing is reconstructed from whatever messages
|
|
1449
|
-
* survived gradient eviction, without depending on cross-message `tool_result`
|
|
1450
|
-
* parts that can become orphaned when the assistant message is evicted.
|
|
1451
|
-
*
|
|
1452
|
-
* `resolveToolResults()` strips `tool: "result"` parts from user messages
|
|
1453
|
-
* after pairing, so under normal operation those parts are gone. The fallback
|
|
1454
|
-
* handling for residual `tool: "result"` parts is kept for robustness.
|
|
1455
|
-
*/
|
|
1456
|
-
/** @internal Exported for tests. */
|
|
1457
|
-
export function loreMessagesToGateway(
  messages: LoreMessageWithParts[],
): Array<{ role: "user" | "assistant"; content: GatewayContentBlock[] }> {
  // Converts Lore messages (one message carrying text / reasoning / tool
  // parts) into gateway messages made of content blocks. A tool part that is
  // already "completed" or "error" yields a tool_use block on its own message
  // plus a tool_result block that is carried over to the next user message.

  // Shape read from a `type: "tool"` part.
  type ToolPartShape = {
    type: "tool";
    tool: string;
    callID: string;
    state: {
      status: string;
      input?: unknown;
      output?: string;
      error?: string;
    };
  };

  const converted: Array<{
    role: "user" | "assistant";
    content: GatewayContentBlock[];
  }> = [];

  // tool_result blocks produced while walking the previous message; they are
  // placed at the front of the next message if (and only if) it is a user one.
  let carriedResults: GatewayContentBlock[] = [];

  for (const message of messages) {
    // A user message starts with the carried results. Any other role starts
    // empty, which silently discards whatever was carried (the back-to-back
    // assistant case). Either way the carry is cleared before parts are read.
    const blocks: GatewayContentBlock[] =
      message.info.role === "user" ? [...carriedResults] : [];
    carriedResults = [];

    for (const part of message.parts) {
      switch (part.type) {
        case "text": {
          const { text } = part as { text: string };
          blocks.push({ type: "text", text });
          break;
        }

        case "reasoning": {
          const thinking = (part as { text: string }).text ?? "";
          const signature = (part as { signature?: string }).signature;
          // The signature key is only present when a signature exists; it is
          // never emitted as `signature: undefined`.
          blocks.push(
            signature != null
              ? { type: "thinking", thinking, signature }
              : { type: "thinking", thinking },
          );
          break;
        }

        case "tool": {
          const { tool, callID, state } = part as ToolPartShape;

          if (tool === "result") {
            // Leftover result part: emit it directly on the current message.
            blocks.push({
              type: "tool_result",
              toolUseId: callID,
              content: state.output ?? "",
            });
            break;
          }

          // Every other tool part becomes a tool_use block on this message.
          blocks.push({
            type: "tool_use",
            id: callID,
            name: tool,
            input: state.input ?? {},
          });

          // Resolved calls also queue their result for the next user message.
          // Any other status (e.g. still pending) queues nothing.
          if (state.status === "completed") {
            carriedResults.push({
              type: "tool_result",
              toolUseId: callID,
              content: state.output ?? "",
            });
          } else if (state.status === "error") {
            carriedResults.push({
              type: "tool_result",
              toolUseId: callID,
              content: state.error ?? "[error]",
              isError: true,
            });
          }
          break;
        }

        default:
          // Unknown part kinds survive only when they carry a string `text`.
          if ("text" in part && typeof part.text === "string") {
            blocks.push({ type: "text", text: part.text });
          }
          break;
      }
    }

    converted.push({
      role: message.info.role as "user" | "assistant",
      content: blocks,
    });
  }

  return converted;
}
|
|
1564
|
-
|
|
1565
|
-
// ---------------------------------------------------------------------------
|
|
1566
|
-
// Post-conversion validation: remove orphaned tool_result blocks
|
|
1567
|
-
// ---------------------------------------------------------------------------
|
|
1568
|
-
|
|
1569
|
-
/**
 * Safety net applied to already-converted gateway messages: a `tool_result`
 * block on a user message is kept only when the message directly before it is
 * an assistant message containing a `tool_use` block with the same id.
 *
 * The array is modified in place. Whenever blocks are dropped a warning is
 * logged, and a user message left with no content receives a single
 * placeholder text block so its content array is never empty.
 *
 * User messages without any `tool_result` block, and all assistant messages,
 * are left untouched.
 *
 * @internal Exported for tests.
 */
export function removeOrphanedToolResults(
  messages: Array<{
    role: "user" | "assistant";
    content: GatewayContentBlock[];
  }>,
): void {
  messages.forEach((current, index) => {
    if (current.role !== "user") return;

    const carriesToolResult = current.content.some(
      (block) => block.type === "tool_result",
    );
    if (!carriesToolResult) return;

    // Ids offered by the directly preceding assistant message. The set stays
    // empty when there is no preceding message or it is not an assistant one.
    const knownToolUseIds = new Set<GatewayToolUseBlock["id"]>();
    const previous = index > 0 ? messages[index - 1] : undefined;
    if (previous !== undefined && previous.role === "assistant") {
      for (const block of previous.content) {
        if (block.type === "tool_use") {
          knownToolUseIds.add((block as GatewayToolUseBlock).id);
        }
      }
    }

    // Keep every non-result block, and every result whose id is known.
    const kept = current.content.filter((block) => {
      if (block.type !== "tool_result") return true;
      return knownToolUseIds.has((block as GatewayToolResultBlock).toolUseId);
    });
    const removedCount = current.content.length - kept.length;
    current.content = kept;

    if (removedCount > 0) {
      log.warn(
        `removed ${removedCount} orphaned tool_result block(s) from message ${index}`,
      );
    }

    // Replace an emptied content array with placeholder text.
    if (current.content.length === 0) {
      current.content = [{ type: "text", text: "[tool results provided]" }];
    }
  });
}
|
|
1621
|
-
|
|
1622
|
-
// ---------------------------------------------------------------------------
|
|
1623
|
-
// Error response builder
|
|
1624
|
-
// ---------------------------------------------------------------------------
|
|
1625
|
-
|
|
1626
|
-
/**
 * Builds a JSON error `Response`.
 *
 * @param status  HTTP status code for the response.
 * @param message Human-readable description placed in `error.message`.
 * @returns A response whose body is
 *   `{ type: "error", error: { type: "server_error", message } }` and whose
 *   `content-type` header is `application/json`.
 */
function errorResponse(status: number, message: string): Response {
  const payload = {
    type: "error",
    error: {
      type: "server_error",
      message,
    },
  };
  const init = {
    status,
    headers: { "content-type": "application/json" },
  };
  return new Response(JSON.stringify(payload), init);
}
|
|
1641
|
-
|
|
1642
|
-
// ---------------------------------------------------------------------------
|
|
1643
|
-
// Main entry point
|
|
1644
|
-
// ---------------------------------------------------------------------------
|
|
1645
|
-
|
|
1646
|
-
/**
 * Entry point for an incoming gateway request.
 *
 * The request is routed to exactly one handler, checked in this order:
 * compaction requests, title/summary requests, then ordinary conversation
 * turns. The returned `Response` is whatever the chosen handler produces.
 *
 * Anything thrown or rejected along the way is logged and converted into a
 * 502 JSON error response, so this function itself does not reject.
 */
export async function handleRequest(
  req: GatewayRequest,
  config: GatewayConfig,
): Promise<Response> {
  try {
    // Remember the caller's credentials, when present, before routing
    // (kept for later use by background workers).
    const credentials = extractAuth(req.rawHeaders);
    if (credentials) setLastSeenAuth(credentials);

    // Compaction requests are intercepted.
    if (isCompactionRequest(req)) {
      return await handleCompaction(req, config);
    }

    // Title/summary requests are passed through.
    if (isTitleOrSummaryRequest(req)) {
      return await handlePassthrough(req, config);
    }

    // Everything else is a normal conversation turn.
    // `await` is required on each return so rejections land in the catch below.
    return await handleConversationTurn(req, config);
  } catch (err) {
    let message = "Unknown gateway error";
    if (err instanceof Error) {
      message = err.message;
    }
    log.error("pipeline error:", err);
    return errorResponse(502, message);
  }
}
|