@loreai/core 0.0.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +26 -5
  3. package/dist/bun/agents-file.d.ts +59 -0
  4. package/dist/bun/agents-file.d.ts.map +1 -0
  5. package/dist/bun/config.d.ts +58 -0
  6. package/dist/bun/config.d.ts.map +1 -0
  7. package/dist/bun/curator.d.ts +35 -0
  8. package/dist/bun/curator.d.ts.map +1 -0
  9. package/dist/bun/db/driver.bun.d.ts +5 -0
  10. package/dist/bun/db/driver.bun.d.ts.map +1 -0
  11. package/dist/bun/db/driver.node.d.ts +15 -0
  12. package/dist/bun/db/driver.node.d.ts.map +1 -0
  13. package/dist/bun/db.d.ts +22 -0
  14. package/dist/bun/db.d.ts.map +1 -0
  15. package/dist/bun/distillation.d.ts +32 -0
  16. package/dist/bun/distillation.d.ts.map +1 -0
  17. package/dist/bun/embedding.d.ts +90 -0
  18. package/dist/bun/embedding.d.ts.map +1 -0
  19. package/dist/bun/gradient.d.ts +73 -0
  20. package/dist/bun/gradient.d.ts.map +1 -0
  21. package/dist/bun/index.d.ts +19 -0
  22. package/dist/bun/index.d.ts.map +1 -0
  23. package/dist/bun/index.js +28236 -0
  24. package/dist/bun/index.js.map +7 -0
  25. package/dist/bun/lat-reader.d.ts +69 -0
  26. package/dist/bun/lat-reader.d.ts.map +1 -0
  27. package/dist/bun/log.d.ts +17 -0
  28. package/dist/bun/log.d.ts.map +1 -0
  29. package/dist/bun/ltm.d.ts +138 -0
  30. package/dist/bun/ltm.d.ts.map +1 -0
  31. package/dist/bun/markdown.d.ts +37 -0
  32. package/dist/bun/markdown.d.ts.map +1 -0
  33. package/dist/bun/prompt.d.ts +47 -0
  34. package/dist/bun/prompt.d.ts.map +1 -0
  35. package/dist/bun/recall.d.ts +41 -0
  36. package/dist/bun/recall.d.ts.map +1 -0
  37. package/dist/bun/search.d.ts +113 -0
  38. package/dist/bun/search.d.ts.map +1 -0
  39. package/dist/bun/temporal.d.ts +66 -0
  40. package/dist/bun/temporal.d.ts.map +1 -0
  41. package/dist/bun/types.d.ts +180 -0
  42. package/dist/bun/types.d.ts.map +1 -0
  43. package/dist/bun/worker.d.ts +6 -0
  44. package/dist/bun/worker.d.ts.map +1 -0
  45. package/dist/node/agents-file.d.ts +59 -0
  46. package/dist/node/agents-file.d.ts.map +1 -0
  47. package/dist/node/config.d.ts +58 -0
  48. package/dist/node/config.d.ts.map +1 -0
  49. package/dist/node/curator.d.ts +35 -0
  50. package/dist/node/curator.d.ts.map +1 -0
  51. package/dist/node/db/driver.bun.d.ts +5 -0
  52. package/dist/node/db/driver.bun.d.ts.map +1 -0
  53. package/dist/node/db/driver.node.d.ts +15 -0
  54. package/dist/node/db/driver.node.d.ts.map +1 -0
  55. package/dist/node/db.d.ts +22 -0
  56. package/dist/node/db.d.ts.map +1 -0
  57. package/dist/node/distillation.d.ts +32 -0
  58. package/dist/node/distillation.d.ts.map +1 -0
  59. package/dist/node/embedding.d.ts +90 -0
  60. package/dist/node/embedding.d.ts.map +1 -0
  61. package/dist/node/gradient.d.ts +73 -0
  62. package/dist/node/gradient.d.ts.map +1 -0
  63. package/dist/node/index.d.ts +19 -0
  64. package/dist/node/index.d.ts.map +1 -0
  65. package/dist/node/index.js +28253 -0
  66. package/dist/node/index.js.map +7 -0
  67. package/dist/node/lat-reader.d.ts +69 -0
  68. package/dist/node/lat-reader.d.ts.map +1 -0
  69. package/dist/node/log.d.ts +17 -0
  70. package/dist/node/log.d.ts.map +1 -0
  71. package/dist/node/ltm.d.ts +138 -0
  72. package/dist/node/ltm.d.ts.map +1 -0
  73. package/dist/node/markdown.d.ts +37 -0
  74. package/dist/node/markdown.d.ts.map +1 -0
  75. package/dist/node/prompt.d.ts +47 -0
  76. package/dist/node/prompt.d.ts.map +1 -0
  77. package/dist/node/recall.d.ts +41 -0
  78. package/dist/node/recall.d.ts.map +1 -0
  79. package/dist/node/search.d.ts +113 -0
  80. package/dist/node/search.d.ts.map +1 -0
  81. package/dist/node/temporal.d.ts +66 -0
  82. package/dist/node/temporal.d.ts.map +1 -0
  83. package/dist/node/types.d.ts +180 -0
  84. package/dist/node/types.d.ts.map +1 -0
  85. package/dist/node/worker.d.ts +6 -0
  86. package/dist/node/worker.d.ts.map +1 -0
  87. package/dist/types/agents-file.d.ts +59 -0
  88. package/dist/types/agents-file.d.ts.map +1 -0
  89. package/dist/types/config.d.ts +58 -0
  90. package/dist/types/config.d.ts.map +1 -0
  91. package/dist/types/curator.d.ts +35 -0
  92. package/dist/types/curator.d.ts.map +1 -0
  93. package/dist/types/db/driver.bun.d.ts +5 -0
  94. package/dist/types/db/driver.bun.d.ts.map +1 -0
  95. package/dist/types/db/driver.node.d.ts +15 -0
  96. package/dist/types/db/driver.node.d.ts.map +1 -0
  97. package/dist/types/db.d.ts +22 -0
  98. package/dist/types/db.d.ts.map +1 -0
  99. package/dist/types/distillation.d.ts +32 -0
  100. package/dist/types/distillation.d.ts.map +1 -0
  101. package/dist/types/embedding.d.ts +90 -0
  102. package/dist/types/embedding.d.ts.map +1 -0
  103. package/dist/types/gradient.d.ts +73 -0
  104. package/dist/types/gradient.d.ts.map +1 -0
  105. package/dist/types/index.d.ts +19 -0
  106. package/dist/types/index.d.ts.map +1 -0
  107. package/dist/types/lat-reader.d.ts +69 -0
  108. package/dist/types/lat-reader.d.ts.map +1 -0
  109. package/dist/types/log.d.ts +17 -0
  110. package/dist/types/log.d.ts.map +1 -0
  111. package/dist/types/ltm.d.ts +138 -0
  112. package/dist/types/ltm.d.ts.map +1 -0
  113. package/dist/types/markdown.d.ts +37 -0
  114. package/dist/types/markdown.d.ts.map +1 -0
  115. package/dist/types/prompt.d.ts +47 -0
  116. package/dist/types/prompt.d.ts.map +1 -0
  117. package/dist/types/recall.d.ts +41 -0
  118. package/dist/types/recall.d.ts.map +1 -0
  119. package/dist/types/search.d.ts +113 -0
  120. package/dist/types/search.d.ts.map +1 -0
  121. package/dist/types/temporal.d.ts +66 -0
  122. package/dist/types/temporal.d.ts.map +1 -0
  123. package/dist/types/types.d.ts +180 -0
  124. package/dist/types/types.d.ts.map +1 -0
  125. package/dist/types/worker.d.ts +6 -0
  126. package/dist/types/worker.d.ts.map +1 -0
  127. package/package.json +48 -5
  128. package/src/agents-file.ts +406 -0
  129. package/src/config.ts +132 -0
  130. package/src/curator.ts +220 -0
  131. package/src/db/driver.bun.ts +18 -0
  132. package/src/db/driver.node.ts +54 -0
  133. package/src/db.ts +433 -0
  134. package/src/distillation.ts +433 -0
  135. package/src/embedding.ts +528 -0
  136. package/src/gradient.ts +1387 -0
  137. package/src/index.ts +109 -0
  138. package/src/lat-reader.ts +374 -0
  139. package/src/log.ts +27 -0
  140. package/src/ltm.ts +861 -0
  141. package/src/markdown.ts +129 -0
  142. package/src/prompt.ts +454 -0
  143. package/src/recall.ts +446 -0
  144. package/src/search.ts +330 -0
  145. package/src/temporal.ts +379 -0
  146. package/src/types.ts +199 -0
  147. package/src/worker.ts +26 -0
@@ -0,0 +1,1387 @@
1
+ import type { LoreMessage, LorePart, LoreMessageWithParts, LoreToolPart, LoreTextPart, LoreToolState, LoreToolStateCompleted } from "./types";
2
+ import { isTextPart, isReasoningPart, isToolPart } from "./types";
3
+ import { db, ensureProject, loadForceMinLayer, saveForceMinLayer } from "./db";
4
+ import { config } from "./config";
5
+ import { formatDistillations } from "./prompt";
6
+ import { normalize } from "./markdown";
7
+
8
// Local shorthand for LoreMessageWithParts; code in this file reads its
// `info` and `parts` members.
type MessageWithParts = LoreMessageWithParts;
9
+
10
// Rough token count for a string, assuming ~3 characters per token. Checked
// against real API data over 200+ turn-pairs: chars/3 yields a ~1.68x
// actual/estimate ratio, the best of the heuristics tried. The remaining gap
// is overhead (system prompt, tool definitions, conversation structure),
// which calibratedOverhead absorbs via an EMA.
function estimate(text: string): number {
  const charCount = text.length;
  return Math.ceil(charCount / 3);
}
17
+
18
/**
 * Sums a token estimate over a list of parts.
 *
 * - text parts: estimate of their text
 * - reasoning parts with non-empty text: estimate of their text
 * - completed tool parts: estimate of output + estimate of tool name + 50
 * - anything else: flat 20 tokens of metadata overhead
 */
function estimateParts(parts: LorePart[]): number {
  return parts.reduce((sum, part) => {
    if (isTextPart(part)) {
      return sum + estimate(part.text);
    }
    if (isReasoningPart(part) && part.text) {
      return sum + estimate(part.text);
    }
    if (isToolPart(part) && part.state.status === "completed") {
      return sum + estimate(part.state.output) + estimate(part.tool) + 50;
    }
    // metadata overhead for other part types
    return sum + 20;
  }, 0);
}
30
+
31
/** Token estimate for one message: its parts plus 20 tokens of role/metadata overhead. */
function estimateMessage(msg: MessageWithParts): number {
  const partsEstimate = estimateParts(msg.parts);
  return partsEstimate + 20;
}
34
+
35
// Model context limit, cached here by the system transform hook and read by
// the message transform. Starts at a sensible default.
let contextLimit = 200_000;
// Tokens held back for the model's output.
let outputReserved = 32_000;

// Overhead reserve used on the first turn, before any calibration exists.
// Deliberately conservative: it has to cover the provider system prompt,
// AGENTS.md, tool definitions and environment info.
const FIRST_TURN_OVERHEAD = 15_000;

// Calibrated overhead = actual tokens used minus our own message estimate.
// null means "not calibrated yet" (first turn); it is refreshed after every
// assistant response. The value is shared by all sessions because it models
// model-level overhead (system prompt, tool definitions, provider headers)
// that does not vary per session.
let calibratedOverhead: number | null = null;
48
+
49
+ // ---------------------------------------------------------------------------
50
+ // Per-session state
51
+ //
52
+ // All calibration, layer-tracking, and window-ID state is scoped per session
53
+ // using an in-memory Map. This prevents worker sessions (lore-distill,
54
+ // lore-curator) from corrupting the main session's sticky-layer guard and
55
+ // delta-estimation state when their transform() calls return layer 0.
56
+ //
57
+ // forceMinLayer is the one field that MUST survive process restarts: when the
58
+ // API returns "prompt is too long", the error handler sets forceMinLayer=2.
59
+ // If OpenCode restarts before the next turn, the escalation is lost and the
60
+ // overflow repeats. forceMinLayer is persisted to SQLite (session_state table)
61
+ // and loaded on first access. All other state rebuilds from the first API
62
+ // response via UNCALIBRATED_SAFETY.
63
+ // ---------------------------------------------------------------------------
64
+
65
// Mutable bookkeeping tracked separately for every session ID.
type SessionState = {
  /** Exact input-token count taken from the last successful API response. */
  lastKnownInput: number;
  /** LTM tokens that were in flight at the moment lastKnownInput was recorded. */
  lastKnownLtm: number;
  /** How many messages went to the model last turn (the compressed count on layers 1-4). */
  lastKnownMessageCount: number;
  /** Message count of the most recent transform() output. */
  lastTransformedCount: number;
  /** Layer chosen by the most recent transform() call; backs the sticky-layer guard. */
  lastLayer: SafetyLayer;
  /** IDs of the messages in the most recent transform() output; used for ID-based delta estimation. */
  lastWindowMessageIDs: Set<string>;
  /** One-shot escalation: the next transform() skips every layer below this one. */
  forceMinLayer: SafetyLayer;
  /** Token estimate of the most recent transform() output, i.e. the compressed window. */
  lastTransformEstimate: number;
  /** Cache for the distilled prefix (Approach C). */
  prefixCache: PrefixCache | null;
  /** Cache for the pinned raw window (Approach B). */
  rawWindowCache: RawWindowCache | null;
};
87
+
88
/**
 * Builds a brand-new SessionState: all counters and estimates at 0, both
 * layer fields at 0, an empty window-ID set, and no caches.
 */
function makeSessionState(): SessionState {
  const fresh: SessionState = {
    lastKnownInput: 0,
    lastKnownLtm: 0,
    lastKnownMessageCount: 0,
    lastTransformedCount: 0,
    lastLayer: 0,
    lastWindowMessageIDs: new Set(),
    forceMinLayer: 0,
    lastTransformEstimate: 0,
    prefixCache: null,
    rawWindowCache: null,
  };
  return fresh;
}
102
+
103
// In-memory registry of per-session state, keyed by session ID.
const sessionStates = new Map<string, SessionState>();

/**
 * Returns the state object for a session, creating and registering it on
 * first access.
 *
 * A newly created state has its forceMinLayer restored from the database via
 * loadForceMinLayer(), so a "prompt too long" escalation (forceMinLayer=2 set
 * by the error handler) is not lost if OpenCode restarts before the next turn.
 */
function getSessionState(sessionID: string): SessionState {
  const existing = sessionStates.get(sessionID);
  if (existing) return existing;

  const created = makeSessionState();
  created.forceMinLayer = loadForceMinLayer(sessionID) as SafetyLayer;
  sessionStates.set(sessionID, created);
  return created;
}
118
+
119
// Number of LTM tokens the system transform hook injected for this turn.
// Written through setLtmTokens() once the system hook has run and read back
// later by transform().
let ltmTokens = 0;
122
+
123
/**
 * Records the model's context and output limits for later budget math.
 *
 * A falsy `context` falls back to 200K. The output reservation is the
 * requested output limit (32K when falsy), capped at 32K.
 */
export function setModelLimits(limits: { context: number; output: number }) {
  const { context, output } = limits;
  contextLimit = context || 200_000;

  // NOTE: the 32K cap mirrors what @ai-sdk/anthropic sends as max_tokens for
  // claude-opus-4-6 (the SDK does not recognise the -6 variant and falls back
  // to the generic claude-opus-4- pattern with maxOutputTokens=32K). Should
  // the SDK start sending the model's real limit (128K for opus-4-6), this
  // cap becomes wrong: effective max input would drop from 168K to 72K while
  // our budget would keep assuming 168K. When that happens, remove the cap.
  const requestedOutput = output || 32_000;
  outputReserved = Math.min(requestedOutput, 32_000);
}
133
+
134
/** Stores this turn's LTM token count; the system transform hook calls it after formatting LTM knowledge. */
export function setLtmTokens(tokens: number): void {
  ltmTokens = tokens;
}
138
+
139
/** Reads back the LTM token count last stored by setLtmTokens() (for tests and diagnostics). */
export function getLtmTokens(): number {
  const current = ltmTokens;
  return current;
}
143
+
144
/**
 * Token budget available for injecting LTM into the system prompt.
 *
 * Usable context is the context limit minus the output reservation and the
 * overhead (calibrated when available, otherwise the first-turn reserve),
 * floored at 0. The budget is that figure times `ltmFraction`, rounded down.
 * The system transform hook calls this to cap how many tokens
 * formatKnowledge may use.
 */
export function getLtmBudget(ltmFraction: number): number {
  const overhead = calibratedOverhead ?? FIRST_TURN_OVERHEAD;
  const remaining = contextLimit - outputReserved - overhead;
  const usable = Math.max(0, remaining);
  return Math.floor(usable * ltmFraction);
}
155
+
156
// Invoked once an assistant message has completed and real usage is known.
//   actualInput  = tokens.input + tokens.cache.read + tokens.cache.write
//   sessionID    = session that produced the response (exact-tracking validity)
//   messageCount = number of messages that were sent (delta estimation)
//
// The overhead sample is measured against lastTransformEstimate — the estimate
// of the compressed window that was actually sent — rather than a re-estimate
// of every session message. On compressed sessions the all-message estimate
// is far larger than actualInput, which clamped overhead to 0 and broke the
// budget calculations.
export function calibrate(
  actualInput: number,
  sessionID?: string,
  messageCount?: number,
) {
  // Baseline: transform()'s own estimate of the window the model really saw.
  const messageEstimate = sessionID
    ? getSessionState(sessionID).lastTransformEstimate
    : 0;

  // Global (model-level) overhead, shared across sessions. A sample is only
  // taken when there is a real transform estimate to compare against, or when
  // actualInput is 0 (test setup that zeroes the overhead). A positive
  // actualInput with no estimate has no baseline and is skipped.
  const hasBaseline = messageEstimate > 0 || actualInput === 0;
  if (hasBaseline) {
    const sample = Math.max(0, actualInput - messageEstimate);
    if (calibratedOverhead === null) {
      calibratedOverhead = sample;
    } else {
      // EMA: 70% previous value, 30% new sample.
      calibratedOverhead = Math.round(calibratedOverhead * 0.7 + sample * 0.3);
    }
  }

  // Per-session exact counts feed the proactive layer 0 decision.
  if (sessionID === undefined) return;
  const state = getSessionState(sessionID);
  state.lastKnownInput = actualInput;
  state.lastKnownLtm = ltmTokens;
  if (messageCount !== undefined) {
    state.lastKnownMessageCount = messageCount;
  }
}
196
+
197
/** Current overhead figure: the calibrated value when one exists, otherwise FIRST_TURN_OVERHEAD. */
export function getOverhead(): number {
  const overhead = calibratedOverhead ?? FIRST_TURN_OVERHEAD;
  return overhead;
}
200
+
201
/**
 * Message count of the latest transform() output for `sessionID`, or 0 when
 * the session has no state yet. calibrate() callers use it to track the size
 * of the compressed window.
 */
export function getLastTransformedCount(sessionID: string): number {
  const state = sessionStates.get(sessionID);
  return state?.lastTransformedCount ?? 0;
}
208
+
209
/** Token estimate of the latest transform() output for `sessionID`; 0 when the session has no state yet. */
export function getLastTransformEstimate(sessionID: string): number {
  const state = sessionStates.get(sessionID);
  return state?.lastTransformEstimate ?? 0;
}
213
+
214
/**
 * Layer used by the most recent transform() call (testing aid).
 *
 * With a session ID, that session's layer is returned. Without one, the
 * first registered session state is used — in tests that is usually the only
 * one. Falls back to 0 when no matching state exists.
 */
export function getLastLayer(sessionID?: string): SafetyLayer {
  const state = sessionID
    ? sessionStates.get(sessionID)
    : sessionStates.values().next().value;
  return state?.lastLayer ?? 0;
}
221
+
222
/**
 * Makes the next transform() for a session use at least `layer`, and
 * persists that choice through saveForceMinLayer().
 *
 * Invoked when the API answers "prompt is too long", so that the retry trims
 * the context enough to fit the model's window. When no session ID is given
 * (tests / callers without one), every currently registered session is
 * updated instead.
 */
export function setForceMinLayer(layer: SafetyLayer, sessionID?: string) {
  if (!sessionID) {
    sessionStates.forEach((state, sid) => {
      state.forceMinLayer = layer;
      saveForceMinLayer(sid, layer);
    });
    return;
  }

  getSessionState(sessionID).forceMinLayer = layer;
  saveForceMinLayer(sessionID, layer);
}
239
+
240
// Test-only helper. Clears the global overhead calibration and drops session
// state — for one session when an ID is passed, otherwise for all of them.
// The persisted forceMinLayer of every affected session is reset to 0 first.
export function resetCalibration(sessionID?: string) {
  calibratedOverhead = null;

  if (!sessionID) {
    sessionStates.forEach((_state, sid) => {
      saveForceMinLayer(sid, 0);
    });
    sessionStates.clear();
    return;
  }

  // clear persisted state before forgetting the in-memory entry
  saveForceMinLayer(sessionID, 0);
  sessionStates.delete(sessionID);
}
253
+
254
// Row shape returned by loadDistillations(); the fields mirror the columns it
// selects from the `distillations` table.
type Distillation = {
  id: string;
  observations: string;
  generation: number;
  token_count: number;
  // NOTE(review): presumably an epoch timestamp — confirm the unit against the schema.
  created_at: number;
  session_id: string;
};
262
+
263
// Fetches the non-archived distillations that make up the in-context prefix,
// oldest first. Archived gen-0 rows (kept after meta-distillation) are left
// out here, yet stay reachable through the recall tool's
// searchDistillations(). Passing a sessionID narrows the result to that
// session; otherwise every session of the project is included.
function loadDistillations(
  projectPath: string,
  sessionID?: string,
): Distillation[] {
  const pid = ensureProject(projectPath);

  if (sessionID) {
    return db()
      .query(
        "SELECT id, observations, generation, token_count, created_at, session_id FROM distillations WHERE project_id = ? AND session_id = ? AND archived = 0 ORDER BY created_at ASC",
      )
      .all(pid, sessionID) as Distillation[];
  }

  return db()
    .query(
      "SELECT id, observations, generation, token_count, created_at, session_id FROM distillations WHERE project_id = ? AND archived = 0 ORDER BY created_at ASC",
    )
    .all(pid) as Distillation[];
}
279
+
280
// Removes every <system-reminder>...</system-reminder> block from a message.
// A block that wraps a user message ("The user sent the following message:")
// is replaced by the wrapped user text; every other reminder (build-switch,
// plan reminders, ...) is dropped outright. OpenCode adds these tags either
// in memory or as persisted synthetic parts, and leaving them in the raw
// window makes the model echo the format. Runs of 3+ newlines left behind are
// collapsed to a blank line and the result is trimmed.
function stripSystemReminders(text: string): string {
  const withoutReminders = text.replace(
    /<system-reminder>[\s\S]*?<\/system-reminder>\n?/g,
    (block) => {
      const wrapped = block.match(
        /The user sent the following message:\n([\s\S]*?)\n\nPlease address/,
      );
      if (!wrapped) return "";
      return `${wrapped[1].trim()}\n`;
    },
  );
  const collapsed = withoutReminders.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
296
+
297
/**
 * Strips system reminders from every text part and drops text parts that end
 * up blank. Non-text parts pass through untouched; text parts whose text is
 * unchanged keep their original object identity.
 *
 * If nothing survives (e.g. a user message made purely of build-switch
 * synthetic content) and the first original part is a text part, a single
 * "..." placeholder is returned instead. Without it the message would be
 * dropped by toModelMessages, the conversation would end on an assistant
 * message, and Anthropic would reject it with "does not support assistant
 * message prefill".
 */
function cleanParts(parts: LorePart[]): LorePart[] {
  const kept: LorePart[] = [];
  for (const part of parts) {
    if (!isTextPart(part)) {
      kept.push(part);
      continue;
    }
    const text = stripSystemReminders(part.text);
    // Text parts that became empty after stripping are discarded.
    if (text.trim().length === 0) continue;
    kept.push(text === part.text ? part : ({ ...part, text } as LorePart));
  }
  if (kept.length > 0) return kept;

  const first = parts[0];
  if (parts.length > 0 && isTextPart(first)) {
    return [{ ...first, text: "..." } as LorePart];
  }
  return parts;
}
322
+
323
// Build a metadata annotation for a stripped tool output, preserving key signals
// about what was lost without requiring an LLM call. Inspired by the per-token
// scalar bias β from "Fast KV Compaction via Attention Matching" (Zweiger et al.,
// 2025) — when tokens are removed, preserving metadata about the removed content
// helps the model compensate for information loss and decide whether to recall.
// Reference: https://arxiv.org/abs/2602.16284
//
// The annotation reports the tool name, the output's line count, whether an
// error-like keyword occurred, and up to five distinct path-like strings.
function toolStripAnnotation(toolName: string, output: string): string {
  const lines = output.split("\n").length;

  // Detect key signals via lightweight heuristics — no LLM call
  const hasError = /\b(?:error|fail(?:ed|ure)?|exception|panic|traceback)\b/i.test(output);
  const paths = output.match(/(?:[\w.-]+\/)+[\w.-]+\.\w{1,5}/g);
  // Distinct paths in first-seen order, capped at five to keep the note short.
  const uniquePaths = paths ? [...new Set(paths)].slice(0, 5) : [];

  let annotation = `[output omitted — ${toolName}: ${lines} lines`;
  if (hasError) annotation += ", contained errors";
  if (uniquePaths.length > 0) annotation += `, paths: ${uniquePaths.join(", ")}`;
  annotation += " — use recall for details]";
  return annotation;
}
344
+
345
+ // ---------------------------------------------------------------------------
346
+ // Content-aware deduplication
347
+ // ---------------------------------------------------------------------------
348
+ // Inspired by Dirac's ContextManager file-read deduplication: detects when the
349
+ // same content appears multiple times in the conversation (e.g., the same file
350
+ // read multiple times, or the same command output repeated) and replaces earlier
351
+ // occurrences with compact annotations. This reduces token pressure before layer
352
+ // selection, potentially keeping sessions at lower (less lossy) gradient layers.
353
+
354
// Outputs shorter than this many characters are never deduplicated: for small
// outputs the replacement annotation would cost more tokens than the content
// it replaces.
const DEDUP_MIN_CHARS = 600;
357
+
358
/**
 * Fast 32-bit FNV-1a hash for content comparison.
 *
 * Each UTF-16 code unit of `str` is XORed into the state, which is then
 * multiplied by the FNV prime. The result is an unsigned 32-bit integer.
 */
function simpleHash(str: string): number {
  let hash = 0x811c9dc5; // FNV-1a 32-bit offset basis
  for (let i = 0; i < str.length; i++) {
    hash ^= str.charCodeAt(i);
    // Math.imul performs an exact 32-bit multiply. A plain `*` yields a double
    // that can exceed 2^53 and silently drops low bits before the `>>> 0`
    // truncation, so the value would no longer be FNV-1a.
    hash = Math.imul(hash, 0x01000193) >>> 0;
  }
  return hash;
}
367
+
368
/**
 * Extract a file path from a tool's input.
 *
 * - JSON object input: returns the first of `path`, `filePath`, `file` whose
 *   value is a non-empty string, e.g. {"path": "/foo.ts"} or
 *   {"filePath": "/foo.ts"}. Non-string values under those keys are ignored,
 *   so the result always honours the `string | undefined` contract.
 * - Other valid JSON (null, numbers, strings, booleans): undefined.
 * - Input that is not valid JSON: the first path-like substring, if any.
 */
function extractFilePath(input: string): string | undefined {
  let parsed: unknown;
  try {
    parsed = JSON.parse(input);
  } catch {
    // Plain text — try to extract a path-like string
    return input.match(/(?:[\w.-]+\/)+[\w.-]+\.\w{1,5}/)?.[0];
  }

  if (parsed === null || typeof parsed !== "object") return undefined;

  const record = parsed as Record<string, unknown>;
  for (const key of ["path", "filePath", "file"]) {
    const candidate = record[key];
    if (typeof candidate === "string" && candidate.length > 0) return candidate;
  }
  return undefined;
}
381
+
382
/**
 * Text that stands in for a deduplicated tool output, in the same spirit as
 * toolStripAnnotation(). With a file path the note points at the later read
 * of that file; without one it names the tool whose later output is identical.
 */
function dedupAnnotation(toolName: string, filePath?: string): string {
  return filePath
    ? `[earlier version of ${filePath} — see latest read below for current content]`
    : `[duplicate output — same content as later ${toolName} in this session — use recall for details]`;
}
389
+
390
/**
 * Swaps repeated tool outputs for compact back-references so that only the
 * latest occurrence of each output stays intact in the context.
 *
 * Two notions of "duplicate" are applied to completed tool parts whose output
 * is at least DEDUP_MIN_CHARS long:
 *   1. Exact content: same tool and same output hash (same file read twice,
 *      same command output).
 *   2. Same-file reads: `read_file` / `read` parts targeting the same path,
 *      even when the content differs because of edits in between.
 *
 * Messages at or after `currentTurnIdx` are never modified — the model needs
 * full context for its active work — but they do count when deciding which
 * occurrence is the latest. Tool parts are never removed; only `state.output`
 * is replaced by an annotation.
 *
 * Returns the original array reference (not a copy) when nothing changed.
 */
export function deduplicateToolOutputs(
  messages: MessageWithParts[],
  currentTurnIdx: number,
): MessageWithParts[] {
  const isReadTool = (tool: string) => tool === "read_file" || tool === "read";
  // Tool input may be a string or a structured value; normalise before parsing.
  const readPathOf = (input: unknown) =>
    extractFilePath(typeof input === "string" ? input : JSON.stringify(input));

  // "tool:hash" → index of the last message carrying that exact output
  const lastIdxByContent = new Map<string, number>();
  // "read:path" → index of the last message that read that file
  const lastIdxByFile = new Map<string, number>();

  // Pass 1: scan every message, current turn included, so earlier reads are
  // recognised as duplicates of current-turn content.
  messages.forEach((msg, idx) => {
    for (const part of msg.parts) {
      if (!isToolPart(part) || part.state.status !== "completed") continue;
      const output = part.state.output;
      if (!output || output.length < DEDUP_MIN_CHARS) continue;

      lastIdxByContent.set(`${part.tool}:${simpleHash(output)}`, idx);

      if (!isReadTool(part.tool)) continue;
      const fp = readPathOf(part.state.input);
      if (fp) lastIdxByFile.set(`read:${fp}`, idx);
    }
  });

  // Pass 2: annotate superseded outputs that sit before the current turn.
  let anyChanged = false;
  const rewritten = messages.map((msg, idx) => {
    // sacred boundary — the current turn is left exactly as is
    if (idx >= currentTurnIdx) return msg;

    let touched = false;
    const parts = msg.parts.map((part) => {
      if (!isToolPart(part) || part.state.status !== "completed") return part;
      const output = part.state.output;
      if (!output || output.length < DEDUP_MIN_CHARS) return part;

      const isLatestContent =
        lastIdxByContent.get(`${part.tool}:${simpleHash(output)}`) === idx;

      // Non-read tools (and reads with no detectable path) count as "latest file".
      let filePath: string | undefined;
      let isLatestFile = true;
      if (isReadTool(part.tool)) {
        filePath = readPathOf(part.state.input);
        if (filePath) {
          isLatestFile = lastIdxByFile.get(`read:${filePath}`) === idx;
        }
      }

      if (isLatestContent && isLatestFile) return part;

      // Superseded by a later occurrence — keep the part, replace the output.
      touched = true;
      return {
        ...part,
        state: {
          ...part.state,
          output: dedupAnnotation(part.tool, filePath),
        },
      } as LorePart;
    });

    if (!touched) return msg;
    anyChanged = true;
    return { ...msg, parts };
  });

  return anyChanged ? rewritten : messages;
}
483
+
484
// Guarantees that every tool part on an assistant message is in a terminal
// state (completed or error). A pending/running tool part becomes a tool_use
// block at the API level but has no output from which to build the matching
// tool_result, and Anthropic then rejects the request with "tool_use ids were
// found without tool_result blocks immediately after". This arises when a
// session errors in the middle of a tool execution (e.g. context overflow)
// and the part is still pending/running on the next transform. Rewriting it
// as an error state yields both tool_use and tool_result(is_error=true).
// Returns the original array reference when nothing had to change.
function sanitizeToolParts(
  messages: MessageWithParts[],
): MessageWithParts[] {
  let anyChanged = false;

  const sanitized = messages.map((msg) => {
    if (msg.info.role !== "assistant") return msg;

    let touched = false;
    const parts = msg.parts.map((part) => {
      if (!isToolPart(part)) return part;
      const current = part.state;
      if (current.status === "completed" || current.status === "error") {
        return part;
      }

      // pending or running → convert to error so the SDK emits tool_result
      touched = true;
      const now = Date.now();
      const metadata = "metadata" in current ? current.metadata : undefined;
      // Keep the recorded start time when there is one; otherwise use "now".
      const start = "time" in current ? current.time.start : now;
      return {
        ...part,
        state: {
          status: "error" as const,
          input: current.input,
          error: "[tool execution interrupted — session recovered]",
          metadata,
          time: { start, end: now },
        },
      } as LorePart;
    });

    if (!touched) return msg;
    anyChanged = true;
    return { ...msg, parts };
  });

  return anyChanged ? sanitized : messages;
}
530
+
531
/**
 * Returns a copy of `parts` in which every completed tool part has its output
 * replaced by the toolStripAnnotation() summary. All other parts are passed
 * through unchanged.
 */
function stripToolOutputs(parts: LorePart[]): LorePart[] {
  const result: LorePart[] = [];
  for (const part of parts) {
    if (isToolPart(part) && part.state.status === "completed") {
      const annotated = {
        ...part,
        state: {
          ...part.state,
          output: toolStripAnnotation(part.tool, part.state.output),
        },
      } as LorePart;
      result.push(annotated);
    } else {
      result.push(part);
    }
  }
  return result;
}
544
+
545
/**
 * Reduces a message to its text parts only. Each text is stripped of system
 * reminders and passed through normalize(); parts left blank are dropped.
 *
 * When every text part ends up blank, the first text part is kept with "..."
 * as its text, so the message survives toModelMessages and the conversation
 * does not end on an assistant message. A message with no text parts at all
 * yields an empty array.
 */
function stripToTextOnly(parts: LorePart[]): LorePart[] {
  const textParts = parts.filter(isTextPart);

  const kept: LorePart[] = [];
  for (const p of textParts) {
    const text = normalize(stripSystemReminders(p.text));
    if (text.trim().length > 0) {
      kept.push({ ...p, text } as LorePart);
    }
  }

  if (kept.length === 0 && textParts.length > 0) {
    return [{ ...textParts[0], text: "..." } as LorePart];
  }
  return kept;
}
561
+
562
+ // --- Phase 2: Temporal anchoring at read time ---
563
+
564
/**
 * Describes `date` relative to `now` in coarse, human-readable terms.
 *
 * Past dates read "yesterday", "3 days ago", "2 weeks ago", "1 month ago",
 * "2 years ago", and so on. Future dates use the same buckets phrased
 * forwards ("tomorrow", "in 3 days", "in 2 weeks"), where a negative day
 * count previously leaked into the text as e.g. "-3 days ago". Anything
 * less than one whole day away, in either direction, is "today".
 *
 * @param date The moment being described.
 * @param now  The reference moment.
 * @returns A short relative-time phrase.
 */
function formatRelativeTime(date: Date, now: Date): string {
  const MS_PER_DAY = 1000 * 60 * 60 * 24;
  const diffMs = now.getTime() - date.getTime();
  const isFuture = diffMs < 0;
  // Bucket on whole days of distance so past and future share one ladder.
  const days = Math.floor(Math.abs(diffMs) / MS_PER_DAY);

  if (days === 0) return "today";
  if (days === 1) return isFuture ? "tomorrow" : "yesterday";

  let span: string;
  if (days < 7) {
    span = `${days} days`;
  } else if (days < 14) {
    span = "1 week";
  } else if (days < 30) {
    span = `${Math.floor(days / 7)} weeks`;
  } else if (days < 60) {
    span = "1 month";
  } else if (days < 365) {
    span = `${Math.floor(days / 30)} months`;
  } else {
    const years = Math.floor(days / 365);
    span = `${years} year${years > 1 ? "s" : ""}`;
  }

  return isFuture ? `in ${span}` : `${span} ago`;
}
576
+
577
/**
 * Extract a date from free-form text. Three shapes are tried in order:
 *   1. "Month Day, Year"          e.g. "January 15, 2026"
 *   2. "Month D-D, Year" (range)  the start day is used
 *   3. "early/mid/late Month Year" mapped to day 7 / 15 / 23
 *
 * @returns The first candidate that parses to a valid Date, or null.
 */
function parseDateFromContent(s: string): Date | null {
  // Build a Date from the captured pieces; null when the result is invalid.
  const toDate = (month: string, day: string | number, year: string): Date | null => {
    const parsed = new Date(`${month} ${day}, ${year}`);
    return isNaN(parsed.getTime()) ? null : parsed;
  };

  const exact = /([A-Z][a-z]+)\s+(\d{1,2}),?\s+(\d{4})/.exec(s);
  if (exact) {
    const hit = toDate(exact[1], exact[2], exact[3]);
    if (hit) return hit;
  }

  const span = /([A-Z][a-z]+)\s+(\d{1,2})-\d{1,2},?\s+(\d{4})/.exec(s);
  if (span) {
    const hit = toDate(span[1], span[2], span[3]);
    if (hit) return hit;
  }

  const fuzzy = /(late|early|mid)[- ]?([A-Z][a-z]+)\s+(\d{4})/i.exec(s);
  if (fuzzy) {
    const qualifier = fuzzy[1].toLowerCase();
    let day = 15;
    if (qualifier === "early") day = 7;
    else if (qualifier === "late") day = 23;
    return toDate(fuzzy[2], day, fuzzy[3]);
  }

  return null;
}
604
+
605
/**
 * Expand "(meaning DATE)" and "(estimated DATE)" annotations with a relative
 * offset, e.g. "(meaning March 5, 2026 — 2 weeks ago)".
 *
 * When the annotated date is before `now` and the text preceding the
 * annotation on the same line expresses future intent ("will", "plans to",
 * "going to", …), ", likely already happened" is appended as well.
 * Annotations whose date cannot be parsed are left untouched.
 *
 * @param text Text possibly containing inline date annotations.
 * @param now  Reference instant for the relative offsets.
 * @returns The text with every parseable annotation expanded.
 */
function expandInlineEstimatedDates(text: string, now: Date): string {
  return text.replace(
    /\(((?:meaning|estimated)\s+)([^)]+\d{4})\)/gi,
    (match: string, prefix: string, dateContent: string, offset: number) => {
      const d = parseDateFromContent(dateContent);
      if (!d) return match;
      const rel = formatRelativeTime(d, now);
      // Detect future-intent by looking backwards on the same line. The
      // replacer's `offset` is the position of THIS occurrence; searching for
      // the match text instead would always land on the first identical
      // annotation and inspect the wrong line for any later duplicate.
      const lineStart = text.lastIndexOf("\n", offset) + 1;
      const linePrefix = text.slice(lineStart, offset);
      const isFutureIntent =
        /\b(?:will|plans?\s+to|planning\s+to|going\s+to|intends?\s+to)\b/i.test(
          linePrefix,
        );
      if (d < now && isFutureIntent)
        return `(${prefix}${dateContent} — ${rel}, likely already happened)`;
      return `(${prefix}${dateContent} — ${rel})`;
    },
  );
}
628
+
629
/**
 * Annotate observation text with relative-time hints.
 *
 * Inline "(meaning DATE)" / "(estimated DATE)" annotations are expanded
 * first. Then every "Date: Month D, Year" header line gets a
 * " (<relative time>)" suffix, and a "[N days/weeks/months later]" marker is
 * inserted before a header that is more than one day after the previous one.
 */
function addRelativeTimeToObservations(text: string, now: Date): string {
  const expanded = expandInlineEstimatedDates(text, now);

  // Collect every header line whose date parses.
  const headerPattern = /^(Date:\s*)([A-Z][a-z]+ \d{1,2}, \d{4})$/gm;
  const headers: Array<{
    index: number;
    date: Date;
    full: string;
    prefix: string;
    ds: string;
  }> = [];
  for (
    let hit = headerPattern.exec(expanded);
    hit !== null;
    hit = headerPattern.exec(expanded)
  ) {
    const parsed = new Date(hit[2]);
    if (isNaN(parsed.getTime())) continue;
    headers.push({
      index: hit.index,
      date: parsed,
      full: hit[0],
      prefix: hit[1],
      ds: hit[2],
    });
  }
  if (headers.length === 0) return expanded;

  // Wording for the marker placed between two headers `days` apart.
  const describeGap = (days: number): string => {
    if (days < 7) return `[${days} days later]`;
    if (days < 14) return "[1 week later]";
    if (days < 30) return `[${Math.floor(days / 7)} weeks later]`;
    if (days < 60) return "[1 month later]";
    return `[${Math.floor(days / 30)} months later]`;
  };

  // Stitch the output: untouched text, optional gap marker, annotated header.
  const pieces: string[] = [];
  let cursor = 0;
  headers.forEach((header, i) => {
    pieces.push(expanded.slice(cursor, header.index));
    if (i > 0) {
      const elapsedMs = header.date.getTime() - headers[i - 1].date.getTime();
      const gapDays = Math.floor(elapsedMs / 86400000);
      if (gapDays > 1) pieces.push(`\n${describeGap(gapDays)}\n\n`);
    }
    pieces.push(
      `${header.prefix}${header.ds} (${formatRelativeTime(header.date, now)})`,
    );
    cursor = header.index + header.full.length;
  });
  pieces.push(expanded.slice(cursor));
  return pieces.join("");
}
689
+
690
/**
 * Wrap formatted distillation text in a synthetic user/assistant message
 * pair: a fixed user notice followed by an assistant message that carries
 * `formatted` plus a closing "I'm ready to continue." line.
 * Used by both the cached and the non-cached prefix paths.
 */
function buildPrefixMessages(formatted: string): MessageWithParts[] {
  const userID = "lore-distilled-user";
  const assistantID = "lore-distilled-assistant";

  // One text part owned by `messageID`; every call yields fresh objects.
  const textPart = (messageID: string, text: string) => ({
    id: `${messageID}-part`,
    sessionID: "",
    messageID,
    type: "text" as const,
    text,
    time: { start: 0, end: 0 },
  });

  const userMessage = {
    info: {
      id: userID,
      sessionID: "",
      role: "user" as const,
      time: { created: 0 },
      agent: "",
      model: { providerID: "", modelID: "" },
    },
    parts: [
      textPart(
        userID,
        "[Memory context follows — do not reference this format in your responses]",
      ),
    ],
  };

  const assistantMessage = {
    info: {
      id: assistantID,
      sessionID: "",
      role: "assistant" as const,
      time: { created: 0 },
      parentID: userID,
      modelID: "",
      providerID: "",
      mode: "memory",
      path: { cwd: "", root: "" },
      cost: 0,
      tokens: {
        input: 0,
        output: 0,
        reasoning: 0,
        cache: { read: 0, write: 0 },
      },
    },
    parts: [textPart(assistantID, `${formatted}\n\nI'm ready to continue.`)],
  };

  return [userMessage, assistantMessage];
}
746
+
747
/**
 * Build the synthetic message pair containing the distilled history.
 * Non-cached path: every call re-annotates and re-formats all rows.
 * Returns an empty array when there are no distillations or when formatting
 * yields nothing.
 */
function distilledPrefix(distillations: Distillation[]): MessageWithParts[] {
  if (distillations.length === 0) return [];

  const renderedAt = new Date();
  const withRelativeTimes = distillations.map((row) => {
    const observations = addRelativeTimeToObservations(row.observations, renderedAt);
    return { ...row, observations };
  });

  const body = formatDistillations(withRelativeTimes);
  return body ? buildPrefixMessages(body) : [];
}
760
+
761
+ // --- Approach C: Append-only distillation prefix cache ---
762
+ //
763
+ // Caches the rendered prefix text per session. When new distillations arrive,
764
+ // only renders the new rows and appends them to the cached text. This keeps
765
+ // the prefix byte-identical between distillation runs, preserving the prompt
766
+ // cache. Only meta-distillation (which rewrites gen-0 rows into gen-1) causes
767
+ // a full re-render — and that happens roughly every 80-100 turns.
768
+
769
/** Per-session snapshot of the rendered distillation prefix. */
type PrefixCache = {
  /** Session that owns this snapshot */
  sessionID: string;
  /** ID of the newest distillation row already rendered into cachedText */
  lastDistillationID: string;
  /** How many distillation rows went into cachedText */
  rowCount: number;
  /** Rendered prefix text; newly rendered rows are appended to it */
  cachedText: string;
  /** Message pair built from cachedText, returned as-is on a cache hit */
  prefixMessages: MessageWithParts[];
  /** Estimated token cost of prefixMessages */
  prefixTokens: number;
};
783
+
784
/**
 * Return the distilled prefix messages for a session, reusing the snapshot
 * stored in `sessState.prefixCache` when possible.
 *
 * - No distillations: the snapshot is cleared and an empty prefix is returned.
 * - Snapshot valid, no new rows: the stored `prefixMessages` object is
 *   returned unchanged.
 * - Snapshot valid, rows appended: only the new rows are rendered and
 *   appended to the cached text; the snapshot is replaced.
 * - Otherwise (no snapshot, different session, rows rewritten, or the delta
 *   rendered to nothing): everything is rendered from scratch.
 *
 * @returns The prefix messages and their estimated token count.
 */
function distilledPrefixCached(
  distillations: Distillation[],
  sessionID: string,
  sessState: SessionState,
): { messages: MessageWithParts[]; tokens: number } {
  if (distillations.length === 0) {
    sessState.prefixCache = null;
    return { messages: [], tokens: 0 };
  }

  const newest = distillations[distillations.length - 1];

  // Annotate the given rows with relative times and format them as text.
  const render = (rows: Distillation[]) => {
    const stamp = new Date();
    return formatDistillations(
      rows.map((row) => ({
        ...row,
        observations: addRelativeTimeToObservations(row.observations, stamp),
      })),
    );
  };

  // Build messages for `text`, store them as the new snapshot, and return them.
  const commit = (text: string) => {
    const messages = buildPrefixMessages(text);
    const tokens = messages.reduce((sum, m) => sum + estimateMessage(m), 0);
    sessState.prefixCache = {
      sessionID,
      lastDistillationID: newest.id,
      rowCount: distillations.length,
      cachedText: text,
      prefixMessages: messages,
      prefixTokens: tokens,
    };
    return { messages, tokens };
  };

  // The snapshot is reusable when it belongs to this session, the row list
  // has not shrunk, and the last cached row is still at the same position.
  const snapshot = sessState.prefixCache;
  if (
    snapshot !== null &&
    snapshot.sessionID === sessionID &&
    snapshot.rowCount <= distillations.length &&
    (snapshot.rowCount === 0 ||
      distillations[snapshot.rowCount - 1]?.id === snapshot.lastDistillationID)
  ) {
    if (snapshot.lastDistillationID === newest.id) {
      // Nothing new: hand back the stored messages untouched.
      return {
        messages: snapshot.prefixMessages,
        tokens: snapshot.prefixTokens,
      };
    }

    // Rows were appended: render just those and extend the cached text.
    const delta = render(distillations.slice(snapshot.rowCount));
    if (delta) return commit(snapshot.cachedText + "\n\n" + delta);
  }

  // Full re-render.
  const everything = render(distillations);
  if (!everything) {
    sessState.prefixCache = null;
    return { messages: [], tokens: 0 };
  }
  return commit(everything);
}
876
+
877
/**
 * For testing only: clear the prefix cache of one session, or of every
 * tracked session when no (or an empty) session ID is given.
 */
export function resetPrefixCache(sessionID?: string) {
  if (!sessionID) {
    for (const entry of sessionStates.values()) entry.prefixCache = null;
    return;
  }
  const target = sessionStates.get(sessionID);
  if (target) target.prefixCache = null;
}
886
+
887
+ // --- Approach B: Lazy raw window eviction ---
888
+ //
889
+ // Tracks the ID of the first (oldest) message in the previous raw window.
890
+ // On the next turn, if the window starting at that message still fits within
891
+ // the raw budget, the cutoff is pinned — no messages are evicted and the raw
892
+ // window stays byte-identical for caching purposes. Only when the pinned
893
+ // window no longer fits (e.g. a large tool response pushed us over) is the
894
+ // cutoff allowed to advance forward by one message at a time.
895
+ //
896
+ // This eliminates the "window sliding on every turn" problem that was the
897
+ // dominant source of cache misses in gradient mode: each new turn appends a
898
+ // message to the conversation, but the start of the raw window only moves
899
+ // when it must.
900
+ //
901
+ // Reset conditions: session changes, or layer escalates to 2+ (the pinned
902
+ // window was too large even with stripping — something genuinely changed).
903
+
904
/** Per-session record of where the pinned raw window begins. */
type RawWindowCache = {
  /** Session that owns this record */
  sessionID: string;
  /** ID of the oldest message in the pinned raw window */
  firstMessageID: string;
};
909
+
910
/**
 * For testing only: clear the raw window cache of one session, or of every
 * tracked session when no (or an empty) session ID is given.
 */
export function resetRawWindowCache(sessionID?: string) {
  if (!sessionID) {
    for (const entry of sessionStates.values()) entry.rawWindowCache = null;
    return;
  }
  const target = sessionStates.get(sessionID);
  if (target) target.rawWindowCache = null;
}
919
+
920
/**
 * Layer-1 fit with lazy eviction, driven by `sessState.rawWindowCache`.
 *
 * If a pinned start message is recorded for this session, is still present
 * in `messages`, and the window from it to the end fits within `rawBudget`,
 * that window is returned (after `cleanParts`, reusing message objects whose
 * parts did not change). Otherwise the regular `tryFit` scan runs with
 * strip "none", and the first raw message of its result becomes the new pin.
 *
 * @returns The fitted window with token totals, or null when the prefix
 *          overflows its budget or `tryFit` finds no fit.
 */
function tryFitStable(input: {
  messages: MessageWithParts[];
  prefix: MessageWithParts[];
  prefixTokens: number;
  distilledBudget: number;
  rawBudget: number;
  sessionID: string;
  sessState: SessionState;
}): Omit<TransformResult, "layer" | "usable" | "distilledBudget" | "rawBudget"> | null {
  const { messages, prefix, prefixTokens, distilledBudget, rawBudget, sessionID, sessState } =
    input;

  // A prefix that already overflows its own budget cannot fit at this layer.
  if (prefix.length > 0 && prefixTokens > distilledBudget) return null;

  const pin = sessState.rawWindowCache;
  if (pin !== null && pin.sessionID === sessionID) {
    const startIdx = messages.findIndex((m) => m.info.id === pin.firstMessageID);
    if (startIdx !== -1) {
      const pinnedSlice = messages.slice(startIdx);
      const pinnedCost = pinnedSlice.reduce((sum, m) => sum + estimateMessage(m), 0);

      if (pinnedCost <= rawBudget) {
        // Still fits: keep the same cutoff and only clean the parts.
        const cleaned = pinnedSlice.map((msg) => {
          const parts = cleanParts(msg.parts);
          return parts === msg.parts ? msg : { info: msg.info, parts };
        });
        return {
          messages: [...prefix, ...cleaned],
          distilledTokens: prefixTokens,
          rawTokens: pinnedCost,
          totalTokens: prefixTokens + pinnedCost,
        };
      }
      // Too large now: fall through to the regular scan.
    }
  }

  const scanned = tryFit({
    messages,
    prefix,
    prefixTokens,
    distilledBudget,
    rawBudget,
    strip: "none",
  });

  if (scanned) {
    // The message right after the prefix is the oldest raw message kept.
    const oldestRaw = scanned.messages[prefix.length];
    if (oldestRaw) {
      sessState.rawWindowCache = { sessionID, firstMessageID: oldestRaw.info.id };
    }
  }

  return scanned;
}
1004
+
1005
/** Compression level chosen by the transform: 0 (passthrough) up to 4 (emergency). */
export type SafetyLayer = 0 | 1 | 2 | 3 | 4;
1006
+
1007
/** Outcome of a transform pass: the message window plus its token accounting. */
export type TransformResult = {
  /** Messages to send (distilled prefix, if any, followed by raw messages) */
  messages: MessageWithParts[];
  /** Layer that produced this window */
  layer: SafetyLayer;
  /** Estimated tokens of the distilled prefix */
  distilledTokens: number;
  /** Estimated tokens of the raw messages */
  rawTokens: number;
  /** Estimated tokens overall */
  totalTokens: number;
  // Budget context (for display in context inspector)
  usable: number;
  distilledBudget: number;
  rawBudget: number;
};
1018
+
1019
// Module-level flag: set when a transform pass had to go beyond layer 1.
let urgentDistillation = false;

/**
 * Report whether urgent distillation was requested since the last call.
 * Reading the flag also clears it, so each request is reported once.
 */
export function needsUrgentDistillation(): boolean {
  if (!urgentDistillation) return false;
  urgentDistillation = false;
  return true;
}
1026
+
1027
/**
 * Pick the least aggressive layer whose output fits the context budget.
 *
 *   Layer 0: return `input.messages` untouched when the expected input fits.
 *   Layer 1: distilled prefix + raw window (lazy eviction when a session is known).
 *   Layer 2: strip tool outputs from older messages, protecting the last 2 turns.
 *   Layer 3: strip tool outputs with only the last 5 distillations as prefix.
 *   Layer 4: last 2 distillations + last 3 raw messages, tool parts intact.
 *
 * Side effects: consumes `sessState.forceMinLayer` (and persists the reset),
 * may clear `sessState.rawWindowCache`, and sets the module-level
 * `urgentDistillation` flag for layers 2-4.
 *
 * @param input.messages    Full message list for the session.
 * @param input.projectPath Used to load the session's distillations.
 * @param input.sessionID   Optional; falls back to the first message's sessionID.
 * @returns The chosen window, its layer, and token/budget figures.
 */
function transformInner(input: {
  messages: MessageWithParts[];
  projectPath: string;
  sessionID?: string;
}): TransformResult {
  const cfg = config();
  const overhead = getOverhead();
  // Usable = full context minus output reservation minus fixed overhead (system + tools)
  // minus LTM tokens already injected into the system prompt this turn.
  const usable = Math.max(
    0,
    contextLimit - outputReserved - overhead - ltmTokens,
  );
  const distilledBudget = Math.floor(usable * cfg.budget.distilled);
  const rawBudget = Math.floor(usable * cfg.budget.raw);

  // --- Force escalation (reactive error recovery) ---
  // When the API previously rejected with "prompt is too long", skip layers
  // below the forced minimum to ensure enough trimming on the next attempt.
  // One-shot: consumed here and reset to 0 (both in-memory and on disk).
  const sid = input.sessionID ?? input.messages[0]?.info.sessionID;
  const sessState = sid ? getSessionState(sid) : makeSessionState();
  let effectiveMinLayer = sessState.forceMinLayer;
  sessState.forceMinLayer = 0;
  if (sid && effectiveMinLayer > 0) saveForceMinLayer(sid, 0);

  // --- Approach A: Cache-preserving passthrough ---
  // Use exact token count from the previous API response when available.
  // Only the delta (messages added since last call) uses chars/3 estimation,
  // making the layer-0 decision highly accurate from the API's own tokenizer.
  // maxInput = absolute ceiling the API enforces: input_tokens + max_tokens <= context
  const maxInput = contextLimit - outputReserved;

  // True when we have real API token data from a previous turn in this session.
  // When false (first turn / session change), chars/3 estimates may still diverge
  // from the real tokenizer — so tryFit output must be validated with a safety
  // multiplier before being used.
  const calibrated = sessState.lastKnownInput > 0;

  // On uncalibrated turns, apply this multiplier to tryFit's estimated total to
  // approximate the real token count. chars/3 undercounts by ~1.68x on real data,
  // but overhead EMA captures most of the gap. 1.5 provides a safe margin.
  const UNCALIBRATED_SAFETY = 1.5;

  // Returns true if the tryFit result is safe to use: either we have calibrated
  // data (exact) or the estimated total * safety factor fits within maxInput.
  function fitsWithSafetyMargin(result: { totalTokens: number } | null): boolean {
    if (!result) return false;
    if (calibrated) return true;
    return result.totalTokens * UNCALIBRATED_SAFETY <= maxInput;
  }

  // --- Sticky layer guard (Option C) ---
  // After a compressed turn (layer >= 1), don't allow layer 0 re-entry until
  // the session genuinely shrinks (e.g. after compaction deletes messages).
  // Prevents the calibration oscillation: a compressed turn stores
  // lastKnownInput=100K for a 50-message window, but the next turn's
  // input.messages has 300 raw messages. The delta estimation treats the 250
  // evicted messages as "new" and undercounts their tokens, producing an
  // expectedInput that fits in layer 0 — but the actual tokens are ~190K.
  // Only applied when calibrated (same session, per-session state) to avoid
  // affecting other sessions including worker sessions.
  if (calibrated && sessState.lastLayer >= 1 && input.messages.length >= sessState.lastKnownMessageCount) {
    effectiveMinLayer = Math.max(effectiveMinLayer, 1) as SafetyLayer;
  }

  let expectedInput: number;
  if (calibrated) {
    // Exact approach: prior API count + estimate of only genuinely new messages.
    // Use message ID tracking (Option B) to identify new messages accurately.
    // After compression, the "last window" is a subset of the full message array —
    // counting by index would treat evicted messages as new (off-by-250 error).
    let newMessages: MessageWithParts[];
    if (sessState.lastWindowMessageIDs.size > 0) {
      newMessages = input.messages.filter((m) => !sessState.lastWindowMessageIDs.has(m.info.id));
    } else {
      // Index-based fallback. Guard the zero case explicitly: slice(-0) is
      // slice(0) and would return the WHOLE array, counting every message as
      // new whenever nothing was added since the last call.
      const newCount = Math.max(0, input.messages.length - sessState.lastKnownMessageCount);
      newMessages = newCount > 0 ? input.messages.slice(-newCount) : [];
    }
    const newMsgTokens = newMessages.reduce((s, m) => s + estimateMessage(m), 0);
    const ltmDelta = ltmTokens - sessState.lastKnownLtm;
    expectedInput = sessState.lastKnownInput + newMsgTokens + ltmDelta;
  } else {
    // First turn or session change: fall back to chars/3 estimate + overhead.
    const messageTokens = input.messages.reduce((s, m) => s + estimateMessage(m), 0);
    expectedInput = messageTokens + overhead + ltmTokens;
  }

  // When uncalibrated, apply safety multiplier to the layer-0 decision too.
  // chars/3 undercounts by ~1.63x on real sessions — without this, a session
  // estimated at 146K passes layer 0 but actually costs 214K → overflow.
  const layer0Input = calibrated ? expectedInput : expectedInput * UNCALIBRATED_SAFETY;

  if (effectiveMinLayer === 0 && layer0Input <= maxInput) {
    // All messages fit — return unmodified to preserve append-only prompt-cache pattern.
    // Raw messages are strictly better context than lossy distilled summaries.
    const messageTokens = calibrated
      ? expectedInput - (ltmTokens - sessState.lastKnownLtm) // approximate raw portion
      : expectedInput - overhead - ltmTokens;
    return {
      messages: input.messages,
      layer: 0,
      distilledTokens: 0,
      rawTokens: Math.max(0, messageTokens),
      totalTokens: Math.max(0, messageTokens),
      usable,
      distilledBudget,
      rawBudget,
    };
  }

  // --- Gradient mode: context exhausted (or force-escalated), compress older messages ---

  // Pre-pass: deduplicate repeated tool outputs before layer selection.
  // Keeps only the latest occurrence of each unique output, replacing earlier
  // ones with compact annotations. This can save thousands of tokens for sessions
  // with repeated file reads, potentially avoiding escalation to higher layers.
  const turnStart = currentTurnStart(input.messages);
  const dedupMessages = deduplicateToolOutputs(input.messages, turnStart);

  const distillations = sid ? loadDistillations(input.projectPath, sid) : [];

  // The append-only cached prefix (Approach C) keeps the distilled content
  // byte-identical between distillation runs, preserving the prompt cache.
  // It is used for layers 1 and 2; layers 3 and 4 rebuild a trimmed prefix
  // through the non-cached path.
  const cached = sid
    ? distilledPrefixCached(distillations, sid, sessState)
    : (() => {
        const msgs = distilledPrefix(distillations);
        return { messages: msgs, tokens: msgs.reduce((sum, m) => sum + estimateMessage(m), 0) };
      })();

  // Layer 1: Normal budget allocation with lazy raw window eviction (Approach B).
  // tryFitStable reuses the previous cutoff when it still fits, keeping the raw
  // window byte-identical across turns for prompt caching. Only advances the
  // cutoff when a genuinely oversized message forces eviction.
  // Skipped when force-escalated to layer 2+ (previous attempt already failed at this level).
  if (effectiveMinLayer <= 1) {
    const layer1 = sid
      ? tryFitStable({
          messages: dedupMessages,
          prefix: cached.messages,
          prefixTokens: cached.tokens,
          distilledBudget,
          rawBudget,
          sessionID: sid,
          sessState,
        })
      : tryFit({
          messages: dedupMessages,
          prefix: cached.messages,
          prefixTokens: cached.tokens,
          distilledBudget,
          rawBudget,
          strip: "none",
        });
    if (fitsWithSafetyMargin(layer1)) return { ...layer1!, layer: 1, usable, distilledBudget, rawBudget };
  }

  // Layer 1 didn't fit (or was force-skipped) — reset the raw window cache.
  // Layers 2-4 use full scans and already break the prompt cache.
  sessState.rawWindowCache = null;

  // Layer 2: Strip tool outputs from older messages, keep last 2 turns
  // Skipped when force-escalated to layer 3+.
  if (effectiveMinLayer <= 2) {
    const layer2 = tryFit({
      messages: dedupMessages,
      prefix: cached.messages,
      prefixTokens: cached.tokens,
      distilledBudget,
      rawBudget: Math.floor(usable * 0.5), // give raw more room
      strip: "old-tools",
      protectedTurns: 2,
    });
    if (fitsWithSafetyMargin(layer2)) {
      urgentDistillation = true;
      return { ...layer2!, layer: 2, usable, distilledBudget, rawBudget };
    }
  }

  // Layer 3: Strip ALL tool outputs, drop oldest distillations
  const trimmedDistillations = distillations.slice(-5);
  const trimmedPrefix = distilledPrefix(trimmedDistillations);
  const trimmedPrefixTokens = trimmedPrefix.reduce(
    (sum, m) => sum + estimateMessage(m),
    0,
  );
  const layer3 = tryFit({
    messages: dedupMessages,
    prefix: trimmedPrefix,
    prefixTokens: trimmedPrefixTokens,
    distilledBudget: Math.floor(usable * 0.15),
    rawBudget: Math.floor(usable * 0.55),
    strip: "all-tools",
  });
  if (fitsWithSafetyMargin(layer3)) {
    urgentDistillation = true;
    return { ...layer3!, layer: 3, usable, distilledBudget, rawBudget };
  }

  // Layer 4: Emergency — last 2 distillations, last 3 raw messages with tool parts intact.
  // We do NOT strip tool parts here: doing so would cause an infinite tool-call loop because
  // the model would lose sight of its own in-progress tool calls and re-invoke them endlessly.
  // Instead, we aggressively drop old messages and rely on the `recall` tool (which the model
  // is always instructed to use) to retrieve any older details it needs.
  urgentDistillation = true;
  const nuclearDistillations = distillations.slice(-2);
  const nuclearPrefix = distilledPrefix(nuclearDistillations);
  const nuclearPrefixTokens = nuclearPrefix.reduce(
    (sum, m) => sum + estimateMessage(m),
    0,
  );
  const nuclearRaw = input.messages.slice(-3).map((m) => ({
    info: m.info,
    parts: cleanParts(m.parts),
  }));
  const nuclearRawTokens = nuclearRaw.reduce(
    (sum, m) => sum + estimateMessage(m),
    0,
  );

  return {
    messages: [...nuclearPrefix, ...nuclearRaw],
    layer: 4,
    distilledTokens: nuclearPrefixTokens,
    rawTokens: nuclearRawTokens,
    totalTokens: nuclearPrefixTokens + nuclearRawTokens,
    usable,
    distilledBudget,
    rawBudget,
  };
}
1257
+
1258
/**
 * Public entry point around `transformInner`.
 *
 * Runs the layer selection, passes the resulting window through
 * `sanitizeToolParts`, and then records on the session state what was
 * produced: the window's message count, its token estimate, the layer used,
 * and the set of message IDs in the window. The next turn's calibration
 * reads these, so they must describe the window that was sent rather than
 * the full message list.
 */
export function transform(input: {
  messages: MessageWithParts[];
  projectPath: string;
  sessionID?: string;
}): TransformResult {
  const outcome = transformInner(input);

  // Sanitizing happens after layer selection so that it covers every layer.
  outcome.messages = sanitizeToolParts(outcome.messages);

  const sid = input.sessionID ?? input.messages[0]?.info.sessionID;
  if (!sid) return outcome;

  const state = getSessionState(sid);
  const sentIDs = outcome.messages.map((m) => m.info.id);
  state.lastTransformedCount = outcome.messages.length;
  state.lastTransformEstimate = outcome.totalTokens;
  state.lastLayer = outcome.layer;
  state.lastWindowMessageIDs = new Set(sentIDs);
  return outcome;
}
1288
+
1289
/**
 * Sum the per-message token estimates for a list of messages
 * (message-only estimate, for calibration use).
 */
export function estimateMessages(messages: MessageWithParts[]): number {
  let total = 0;
  messages.forEach((message) => {
    total += estimateMessage(message);
  });
  return total;
}
1293
+
1294
/**
 * Locate the start of the current agentic turn: the index of the last
 * message whose role is "user". Everything from that index onward is treated
 * by callers as one atomic unit.
 *
 * @returns The index of the last user message, or 0 when there is none
 *          (all messages then count as the current turn).
 */
function currentTurnStart(messages: MessageWithParts[]): number {
  let idx = messages.length;
  while (idx-- > 0) {
    if (messages[idx].info.role === "user") return idx;
  }
  return 0;
}
1310
+
1311
/**
 * Try to fit the distilled prefix plus a raw message window into the budgets.
 *
 * The current turn (last user message onward) is always kept whole. Older
 * messages are then added newest-first until the next one would exceed what
 * is left of `rawBudget`. Token figures are estimated before any stripping.
 *
 * Every kept message goes through `cleanParts`. Tool outputs are additionally
 * stripped from messages that are not protected; a message is protected when
 * it belongs to the current turn, when `strip` is "none", or when `strip` is
 * "old-tools" and it is among the last `protectedTurns * 2` messages.
 *
 * @returns The window and its token totals, or null when the prefix exceeds
 *          `distilledBudget` or the current turn alone exceeds `rawBudget`.
 */
function tryFit(input: {
  messages: MessageWithParts[];
  prefix: MessageWithParts[];
  prefixTokens: number;
  distilledBudget: number;
  rawBudget: number;
  strip: "none" | "old-tools" | "all-tools";
  protectedTurns?: number;
}): Omit<TransformResult, "layer" | "usable" | "distilledBudget" | "rawBudget"> | null {
  // A prefix over its budget fails this layer outright.
  if (input.prefix.length > 0 && input.prefixTokens > input.distilledBudget) {
    return null;
  }

  // The current turn is never evicted; if it cannot fit by itself, the
  // caller has to escalate to the next layer.
  const turnStart = currentTurnStart(input.messages);
  const currentTurn = input.messages.slice(turnStart);
  const currentTurnTokens = currentTurn.reduce((s, m) => s + estimateMessage(m), 0);
  if (currentTurnTokens > input.rawBudget) return null;

  // Extend the window backwards while the next older message still fits.
  const older = input.messages.slice(0, turnStart);
  const remaining = input.rawBudget - currentTurnTokens;
  const protectedTurns = input.protectedTurns ?? 0;
  let olderTokens = 0;
  let cutoff = older.length; // nothing older included yet
  while (cutoff > 0) {
    const cost = estimateMessage(older[cutoff - 1]);
    if (olderTokens + cost > remaining) break;
    olderTokens += cost;
    cutoff -= 1;
  }

  const rawMessages = [...older.slice(cutoff), ...currentTurn];
  const rawTokens = olderTokens + currentTurnTokens;

  // Clean every message; strip tool outputs only from unprotected ones.
  const currentTurnIDs = new Set(currentTurn.map((m) => m.info.id));
  const processed = rawMessages.map((msg, idx) => {
    const fromEnd = rawMessages.length - idx;
    const isProtected =
      currentTurnIDs.has(msg.info.id) ||
      input.strip === "none" ||
      (input.strip === "old-tools" && fromEnd <= protectedTurns * 2);
    const source = isProtected ? msg.parts : stripToolOutputs(msg.parts);
    const parts = cleanParts(source);
    return parts === msg.parts ? msg : { info: msg.info, parts };
  });

  return {
    messages: [...input.prefix, ...processed],
    distilledTokens: input.prefixTokens,
    rawTokens,
    totalTokens: input.prefixTokens + rawTokens,
  };
}