@loreai/core 0.0.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +26 -5
- package/dist/bun/agents-file.d.ts +59 -0
- package/dist/bun/agents-file.d.ts.map +1 -0
- package/dist/bun/config.d.ts +58 -0
- package/dist/bun/config.d.ts.map +1 -0
- package/dist/bun/curator.d.ts +35 -0
- package/dist/bun/curator.d.ts.map +1 -0
- package/dist/bun/db/driver.bun.d.ts +5 -0
- package/dist/bun/db/driver.bun.d.ts.map +1 -0
- package/dist/bun/db/driver.node.d.ts +15 -0
- package/dist/bun/db/driver.node.d.ts.map +1 -0
- package/dist/bun/db.d.ts +22 -0
- package/dist/bun/db.d.ts.map +1 -0
- package/dist/bun/distillation.d.ts +32 -0
- package/dist/bun/distillation.d.ts.map +1 -0
- package/dist/bun/embedding.d.ts +90 -0
- package/dist/bun/embedding.d.ts.map +1 -0
- package/dist/bun/gradient.d.ts +73 -0
- package/dist/bun/gradient.d.ts.map +1 -0
- package/dist/bun/index.d.ts +19 -0
- package/dist/bun/index.d.ts.map +1 -0
- package/dist/bun/index.js +28236 -0
- package/dist/bun/index.js.map +7 -0
- package/dist/bun/lat-reader.d.ts +69 -0
- package/dist/bun/lat-reader.d.ts.map +1 -0
- package/dist/bun/log.d.ts +17 -0
- package/dist/bun/log.d.ts.map +1 -0
- package/dist/bun/ltm.d.ts +138 -0
- package/dist/bun/ltm.d.ts.map +1 -0
- package/dist/bun/markdown.d.ts +37 -0
- package/dist/bun/markdown.d.ts.map +1 -0
- package/dist/bun/prompt.d.ts +47 -0
- package/dist/bun/prompt.d.ts.map +1 -0
- package/dist/bun/recall.d.ts +41 -0
- package/dist/bun/recall.d.ts.map +1 -0
- package/dist/bun/search.d.ts +113 -0
- package/dist/bun/search.d.ts.map +1 -0
- package/dist/bun/temporal.d.ts +66 -0
- package/dist/bun/temporal.d.ts.map +1 -0
- package/dist/bun/types.d.ts +180 -0
- package/dist/bun/types.d.ts.map +1 -0
- package/dist/bun/worker.d.ts +6 -0
- package/dist/bun/worker.d.ts.map +1 -0
- package/dist/node/agents-file.d.ts +59 -0
- package/dist/node/agents-file.d.ts.map +1 -0
- package/dist/node/config.d.ts +58 -0
- package/dist/node/config.d.ts.map +1 -0
- package/dist/node/curator.d.ts +35 -0
- package/dist/node/curator.d.ts.map +1 -0
- package/dist/node/db/driver.bun.d.ts +5 -0
- package/dist/node/db/driver.bun.d.ts.map +1 -0
- package/dist/node/db/driver.node.d.ts +15 -0
- package/dist/node/db/driver.node.d.ts.map +1 -0
- package/dist/node/db.d.ts +22 -0
- package/dist/node/db.d.ts.map +1 -0
- package/dist/node/distillation.d.ts +32 -0
- package/dist/node/distillation.d.ts.map +1 -0
- package/dist/node/embedding.d.ts +90 -0
- package/dist/node/embedding.d.ts.map +1 -0
- package/dist/node/gradient.d.ts +73 -0
- package/dist/node/gradient.d.ts.map +1 -0
- package/dist/node/index.d.ts +19 -0
- package/dist/node/index.d.ts.map +1 -0
- package/dist/node/index.js +28253 -0
- package/dist/node/index.js.map +7 -0
- package/dist/node/lat-reader.d.ts +69 -0
- package/dist/node/lat-reader.d.ts.map +1 -0
- package/dist/node/log.d.ts +17 -0
- package/dist/node/log.d.ts.map +1 -0
- package/dist/node/ltm.d.ts +138 -0
- package/dist/node/ltm.d.ts.map +1 -0
- package/dist/node/markdown.d.ts +37 -0
- package/dist/node/markdown.d.ts.map +1 -0
- package/dist/node/prompt.d.ts +47 -0
- package/dist/node/prompt.d.ts.map +1 -0
- package/dist/node/recall.d.ts +41 -0
- package/dist/node/recall.d.ts.map +1 -0
- package/dist/node/search.d.ts +113 -0
- package/dist/node/search.d.ts.map +1 -0
- package/dist/node/temporal.d.ts +66 -0
- package/dist/node/temporal.d.ts.map +1 -0
- package/dist/node/types.d.ts +180 -0
- package/dist/node/types.d.ts.map +1 -0
- package/dist/node/worker.d.ts +6 -0
- package/dist/node/worker.d.ts.map +1 -0
- package/dist/types/agents-file.d.ts +59 -0
- package/dist/types/agents-file.d.ts.map +1 -0
- package/dist/types/config.d.ts +58 -0
- package/dist/types/config.d.ts.map +1 -0
- package/dist/types/curator.d.ts +35 -0
- package/dist/types/curator.d.ts.map +1 -0
- package/dist/types/db/driver.bun.d.ts +5 -0
- package/dist/types/db/driver.bun.d.ts.map +1 -0
- package/dist/types/db/driver.node.d.ts +15 -0
- package/dist/types/db/driver.node.d.ts.map +1 -0
- package/dist/types/db.d.ts +22 -0
- package/dist/types/db.d.ts.map +1 -0
- package/dist/types/distillation.d.ts +32 -0
- package/dist/types/distillation.d.ts.map +1 -0
- package/dist/types/embedding.d.ts +90 -0
- package/dist/types/embedding.d.ts.map +1 -0
- package/dist/types/gradient.d.ts +73 -0
- package/dist/types/gradient.d.ts.map +1 -0
- package/dist/types/index.d.ts +19 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/lat-reader.d.ts +69 -0
- package/dist/types/lat-reader.d.ts.map +1 -0
- package/dist/types/log.d.ts +17 -0
- package/dist/types/log.d.ts.map +1 -0
- package/dist/types/ltm.d.ts +138 -0
- package/dist/types/ltm.d.ts.map +1 -0
- package/dist/types/markdown.d.ts +37 -0
- package/dist/types/markdown.d.ts.map +1 -0
- package/dist/types/prompt.d.ts +47 -0
- package/dist/types/prompt.d.ts.map +1 -0
- package/dist/types/recall.d.ts +41 -0
- package/dist/types/recall.d.ts.map +1 -0
- package/dist/types/search.d.ts +113 -0
- package/dist/types/search.d.ts.map +1 -0
- package/dist/types/temporal.d.ts +66 -0
- package/dist/types/temporal.d.ts.map +1 -0
- package/dist/types/types.d.ts +180 -0
- package/dist/types/types.d.ts.map +1 -0
- package/dist/types/worker.d.ts +6 -0
- package/dist/types/worker.d.ts.map +1 -0
- package/package.json +48 -5
- package/src/agents-file.ts +406 -0
- package/src/config.ts +132 -0
- package/src/curator.ts +220 -0
- package/src/db/driver.bun.ts +18 -0
- package/src/db/driver.node.ts +54 -0
- package/src/db.ts +433 -0
- package/src/distillation.ts +433 -0
- package/src/embedding.ts +528 -0
- package/src/gradient.ts +1387 -0
- package/src/index.ts +109 -0
- package/src/lat-reader.ts +374 -0
- package/src/log.ts +27 -0
- package/src/ltm.ts +861 -0
- package/src/markdown.ts +129 -0
- package/src/prompt.ts +454 -0
- package/src/recall.ts +446 -0
- package/src/search.ts +330 -0
- package/src/temporal.ts +379 -0
- package/src/types.ts +199 -0
- package/src/worker.ts +26 -0
package/src/gradient.ts
ADDED
|
@@ -0,0 +1,1387 @@
|
|
|
1
|
+
import type { LoreMessage, LorePart, LoreMessageWithParts, LoreToolPart, LoreTextPart, LoreToolState, LoreToolStateCompleted } from "./types";
|
|
2
|
+
import { isTextPart, isReasoningPart, isToolPart } from "./types";
|
|
3
|
+
import { db, ensureProject, loadForceMinLayer, saveForceMinLayer } from "./db";
|
|
4
|
+
import { config } from "./config";
|
|
5
|
+
import { formatDistillations } from "./prompt";
|
|
6
|
+
import { normalize } from "./markdown";
|
|
7
|
+
|
|
8
|
+
// Local shorthand for the message-plus-parts shape imported from ./types.
type MessageWithParts = LoreMessageWithParts;
|
|
9
|
+
|
|
10
|
+
// Token estimate: ~3 chars per token. Validated against real API data across
|
|
11
|
+
// 200+ turn-pairs: chars/3 gives ~1.68x ratio (actual/estimate), best among
|
|
12
|
+
// heuristics tested. The gap is overhead (system prompt, tool definitions,
|
|
13
|
+
// conversation structure) which calibratedOverhead captures via EMA.
|
|
14
|
+
/**
 * Rough token count for a string, assuming about three characters per token.
 * The quotient is rounded up, so any non-empty text costs at least one token.
 */
function estimate(text: string): number {
  const charCount = text.length;
  return Math.ceil(charCount / 3);
}
|
|
17
|
+
|
|
18
|
+
/**
 * Sums the estimated token cost of a list of message parts.
 *
 * - text parts: estimate of their text
 * - reasoning parts with non-empty text: estimate of their text
 * - completed tool parts: output + tool name + a flat 50
 * - anything else (including non-completed tool parts): a flat 20
 */
function estimateParts(parts: LorePart[]): number {
  return parts.reduce((sum, part) => {
    if (isTextPart(part)) {
      return sum + estimate(part.text);
    }
    if (isReasoningPart(part) && part.text) {
      return sum + estimate(part.text);
    }
    if (isToolPart(part) && part.state.status === "completed") {
      return sum + (estimate(part.state.output) + estimate(part.tool) + 50);
    }
    // metadata overhead for other part types
    return sum + 20;
  }, 0);
}
|
|
30
|
+
|
|
31
|
+
/**
 * Estimated token cost of one message: the cost of its parts plus a flat
 * 20-token allowance for role/metadata overhead.
 */
function estimateMessage(msg: MessageWithParts): number {
  const partsCost = estimateParts(msg.parts);
  return partsCost + 20;
}
|
|
34
|
+
|
|
35
|
+
// Cached model context limit — set by system transform hook, used by message transform
|
|
36
|
+
// Overwritten by setModelLimits(); read by getLtmBudget() when computing usable context.
let contextLimit = 200_000; // sensible default
|
|
37
|
+
// Tokens held back for the model's output. setModelLimits() replaces this value
// but never raises it above 32_000.
let outputReserved = 32_000;
|
|
38
|
+
|
|
39
|
+
// Conservative overhead reserve for first-turn (before calibration):
|
|
40
|
+
// accounts for provider system prompt + AGENTS.md + tool definitions + env info
|
|
41
|
+
// Fallback returned by getOverhead() / used by getLtmBudget() while calibratedOverhead is still null.
const FIRST_TURN_OVERHEAD = 15_000;
|
|
42
|
+
|
|
43
|
+
// Calibrated overhead: actual tokens used minus our message estimate.
|
|
44
|
+
// Null = not yet calibrated (first turn). Updated after every assistant response.
|
|
45
|
+
// Shared across all sessions — this is model-level overhead (system prompt,
|
|
46
|
+
// tool definitions, provider headers) that doesn't vary per session.
|
|
47
|
+
// Written by calibrate() (first sample as-is, later samples blended 70% old / 30% new)
// and reset to null by resetCalibration().
let calibratedOverhead: number | null = null;
|
|
48
|
+
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
// Per-session state
|
|
51
|
+
//
|
|
52
|
+
// All calibration, layer-tracking, and window-ID state is scoped per session
|
|
53
|
+
// using an in-memory Map. This prevents worker sessions (lore-distill,
|
|
54
|
+
// lore-curator) from corrupting the main session's sticky-layer guard and
|
|
55
|
+
// delta-estimation state when their transform() calls return layer 0.
|
|
56
|
+
//
|
|
57
|
+
// forceMinLayer is the one field that MUST survive process restarts: when the
|
|
58
|
+
// API returns "prompt is too long", the error handler sets forceMinLayer=2.
|
|
59
|
+
// If OpenCode restarts before the next turn, the escalation is lost and the
|
|
60
|
+
// overflow repeats. forceMinLayer is persisted to SQLite (session_state table)
|
|
61
|
+
// and loaded on first access. All other state rebuilds from the first API
|
|
62
|
+
// response via UNCALIBRATED_SAFETY.
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
/** In-memory bookkeeping kept per session ID (see the sessionStates map). */
type SessionState = {
  /** Exact input-token count from the last successful API response; written by calibrate(). */
  lastKnownInput: number;
  /** LTM tokens that were in flight when lastKnownInput was recorded. */
  lastKnownLtm: number;
  /** How many messages were sent to the model last turn (the compressed count on layers 1-4). */
  lastKnownMessageCount: number;
  /** Message count of the most recent transform() output. */
  lastTransformedCount: number;
  /** Layer chosen by the most recent transform() call; serves as the sticky-layer guard. */
  lastLayer: SafetyLayer;
  /** IDs of the messages in the most recent transform() output, for ID-based delta estimation. */
  lastWindowMessageIDs: Set<string>;
  /** One-shot escalation: the next transform() skips every layer below this one. */
  forceMinLayer: SafetyLayer;
  /** Token estimate of the most recent transform() output (the compressed window). */
  lastTransformEstimate: number;
  /** Distilled prefix cache (Approach C). */
  prefixCache: PrefixCache | null;
  /** Raw window pin cache (Approach B). */
  rawWindowCache: RawWindowCache | null;
};
|
|
87
|
+
|
|
88
|
+
/**
 * Creates a blank SessionState: every counter and layer at 0, an empty
 * window-ID set, and both caches unset.
 */
function makeSessionState(): SessionState {
  const blank: SessionState = {
    lastKnownInput: 0,
    lastKnownLtm: 0,
    lastKnownMessageCount: 0,
    lastTransformedCount: 0,
    lastLayer: 0,
    lastWindowMessageIDs: new Set(),
    forceMinLayer: 0,
    lastTransformEstimate: 0,
    prefixCache: null,
    rawWindowCache: null,
  };
  return blank;
}
|
|
102
|
+
|
|
103
|
+
// Session ID → SessionState. Entries are created lazily by getSessionState() and
// removed by resetCalibration().
const sessionStates = new Map<string, SessionState>();
|
|
104
|
+
|
|
105
|
+
/**
 * Returns the state object for a session, creating and registering it on
 * first access.
 */
function getSessionState(sessionID: string): SessionState {
  const existing = sessionStates.get(sessionID);
  if (existing) return existing;

  const created = makeSessionState();
  // forceMinLayer is reloaded from the DB so it survives process restarts.
  // This matters for "prompt too long" recovery: the error handler sets
  // forceMinLayer=2, and if OpenCode restarts before the next turn an
  // in-memory-only escalation would be lost.
  created.forceMinLayer = loadForceMinLayer(sessionID) as SafetyLayer;
  sessionStates.set(sessionID, created);
  return created;
}
|
|
118
|
+
|
|
119
|
+
// LTM tokens injected via system transform hook this turn.
|
|
120
|
+
// Set by setLtmTokens() after the system hook runs; consumed by transform().
|
|
121
|
+
// Module-wide (not per-session) value; calibrate() snapshots it into SessionState.lastKnownLtm.
let ltmTokens = 0;
|
|
122
|
+
|
|
123
|
+
/**
 * Records the model's context window and output reservation.
 *
 * A falsy `context` falls back to 200_000; a falsy `output` falls back to
 * 32_000, and the output reservation is never allowed above 32_000.
 */
export function setModelLimits(limits: { context: number; output: number }) {
  const { context, output } = limits;
  contextLimit = context || 200_000;

  // NOTE: the 32K cap matches what @ai-sdk/anthropic sends as max_tokens for
  // claude-opus-4-6 (the SDK doesn't recognise the -6 variant and falls back to
  // the generic claude-opus-4- pattern with maxOutputTokens=32K). If the SDK is
  // updated to send the model's actual limit (128K for opus-4-6), this cap will
  // become wrong — the effective max input would drop from 168K to 72K but our
  // budget would still assume 168K. At that point, remove the cap.
  const requestedOutput = output || 32_000;
  outputReserved = Math.min(requestedOutput, 32_000);
}
|
|
133
|
+
|
|
134
|
+
/**
 * Stores the LTM token count for the current turn.
 * Intended to be called by the system transform hook once it has formatted
 * the LTM knowledge.
 */
export function setLtmTokens(tokens: number) {
  ltmTokens = tokens;
}
|
|
138
|
+
|
|
139
|
+
/**
 * Reads back the value last stored by setLtmTokens() (0 if never set).
 * Exposed for tests and diagnostics.
 */
export function getLtmTokens(): number {
  const current = ltmTokens;
  return current;
}
|
|
143
|
+
|
|
144
|
+
/**
 * Token budget available for LTM injected into the system prompt.
 *
 * Usable context is the context limit minus the output reservation minus the
 * current overhead (calibrated, or the first-turn default), floored at 0.
 * The budget is that figure scaled by `ltmFraction` and rounded down.
 * Meant to be called from the system transform hook to cap how many tokens
 * formatKnowledge may use.
 */
export function getLtmBudget(ltmFraction: number): number {
  const overhead = getOverhead();
  const remaining = contextLimit - outputReserved - overhead;
  const usable = remaining > 0 ? remaining : 0;
  return Math.floor(usable * ltmFraction);
}
|
|
155
|
+
|
|
156
|
+
// Called after each assistant message completes with real token usage data.
|
|
157
|
+
// actualInput = tokens.input + tokens.cache.read + tokens.cache.write
|
|
158
|
+
// sessionID = session that produced this response (for exact-tracking validity)
|
|
159
|
+
// messageCount = number of messages that were sent (for delta estimation)
|
|
160
|
+
//
|
|
161
|
+
// Overhead calibration uses lastTransformEstimate (the token estimate from the
|
|
162
|
+
// compressed window that was actually sent to the model) instead of re-estimating
|
|
163
|
+
// all session messages. On compressed sessions, all-message estimate >> actualInput,
|
|
164
|
+
// which clamped overhead to 0 and broke budget calculations.
|
|
165
|
+
/**
 * Feeds real token usage from a completed assistant response back into the
 * overhead estimate and the per-session counters.
 *
 * @param actualInput  Input tokens the API reported for the request.
 * @param sessionID    Session that produced the response, if known.
 * @param messageCount Number of messages that were sent, if known.
 */
export function calibrate(
  actualInput: number,
  sessionID?: string,
  messageCount?: number,
) {
  // Baseline is the transform's own estimate of the compressed window it
  // produced, i.e. an estimate of the same messages the model actually saw.
  let messageEstimate = 0;
  if (sessionID) {
    messageEstimate = getSessionState(sessionID).lastTransformEstimate;
  }

  // Global overhead calibration (shared across sessions — model-level).
  // Skipped when actualInput > 0 but there is no transform estimate to compare
  // against. Allowed when both are 0 (test setup that zeroes the overhead) or
  // when a real transform estimate exists.
  const hasBaseline = messageEstimate > 0;
  if (hasBaseline || actualInput === 0) {
    const sample = Math.max(0, actualInput - messageEstimate);
    if (calibratedOverhead === null) {
      calibratedOverhead = sample;
    } else {
      calibratedOverhead = Math.round(calibratedOverhead * 0.7 + sample * 0.3);
    }
  }

  // Per-session exact counts feed the proactive layer 0 decision.
  if (sessionID === undefined) return;
  const state = getSessionState(sessionID);
  state.lastKnownInput = actualInput;
  state.lastKnownLtm = ltmTokens;
  if (messageCount !== undefined) {
    state.lastKnownMessageCount = messageCount;
  }
}
|
|
196
|
+
|
|
197
|
+
/**
 * Current overhead estimate in tokens: the calibrated value once calibrate()
 * has produced one, otherwise FIRST_TURN_OVERHEAD.
 */
export function getOverhead(): number {
  if (calibratedOverhead === null) {
    return FIRST_TURN_OVERHEAD;
  }
  return calibratedOverhead;
}
|
|
200
|
+
|
|
201
|
+
/**
 * Number of messages in the most recent transform() output for a session.
 * Reads the map directly, so an unknown session yields 0 and no state is
 * created for it.
 * NOTE(review): presumably passed by callers as `messageCount` to calibrate() — confirm.
 */
export function getLastTransformedCount(sessionID: string): number {
  const state = sessionStates.get(sessionID);
  if (!state) return 0;
  return state.lastTransformedCount ?? 0;
}
|
|
208
|
+
|
|
209
|
+
/**
 * Token estimate of the most recent transform() output for a session.
 * An unknown session yields 0 and no state is created for it.
 */
export function getLastTransformEstimate(sessionID: string): number {
  const state = sessionStates.get(sessionID);
  if (!state) return 0;
  return state.lastTransformEstimate ?? 0;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Layer used by the most recent transform() call. For testing.
 *
 * With a session ID, only that session is consulted (0 if it has no state).
 * Without one, the first session in the map is used — in tests that is
 * usually the only one — or 0 when the map is empty.
 */
export function getLastLayer(sessionID?: string): SafetyLayer {
  const state = sessionID
    ? sessionStates.get(sessionID)
    : sessionStates.values().next().value;
  return state?.lastLayer ?? 0;
}
|
|
221
|
+
|
|
222
|
+
/**
 * Makes the next transform() call use at least `layer`, and persists that
 * choice via saveForceMinLayer().
 *
 * Called when the API answers "prompt is too long", so the retry trims the
 * context enough to fit the model's window. With a session ID only that
 * session is affected (its state is created if needed); without one — tests
 * and callers that lack an ID — every session currently in memory is updated.
 */
export function setForceMinLayer(layer: SafetyLayer, sessionID?: string) {
  if (!sessionID) {
    sessionStates.forEach((state, sid) => {
      state.forceMinLayer = layer;
      saveForceMinLayer(sid, layer);
    });
    return;
  }

  getSessionState(sessionID).forceMinLayer = layer;
  saveForceMinLayer(sessionID, layer);
}
|
|
239
|
+
|
|
240
|
+
// For testing only — reset all calibration and force-escalation state
|
|
241
|
+
/**
 * Test helper: clears the global calibrated overhead and drops session state.
 *
 * The global overhead is reset in both modes. With a session ID, only that
 * session's persisted forceMinLayer is zeroed and its in-memory state removed;
 * without one, the same is done for every session currently in memory.
 */
export function resetCalibration(sessionID?: string) {
  calibratedOverhead = null;

  if (!sessionID) {
    sessionStates.forEach((_state, sid) => {
      saveForceMinLayer(sid, 0);
    });
    sessionStates.clear();
    return;
  }

  saveForceMinLayer(sessionID, 0); // clear persisted state
  sessionStates.delete(sessionID);
}
|
|
253
|
+
|
|
254
|
+
/**
 * Row shape produced by loadDistillations(); the field names mirror the
 * columns it selects from the `distillations` table.
 */
type Distillation = {
  /** Row identifier. */
  id: string;
  /** Distilled observation text. */
  observations: string;
  /** Distillation generation — presumably 0 for first-pass entries; confirm against the writer. */
  generation: number;
  /** Stored token count for `observations`. */
  token_count: number;
  /** Creation timestamp; loadDistillations() orders by it ascending. Units not visible here. */
  created_at: number;
  /** Session the distillation belongs to. */
  session_id: string;
};
|
|
262
|
+
|
|
263
|
+
// Load non-archived distillations for the in-context prefix.
|
|
264
|
+
// Archived gen-0 entries (preserved after meta-distillation) are excluded here
|
|
265
|
+
// but remain searchable via the recall tool's searchDistillations().
|
|
266
|
+
/**
 * Fetches the non-archived distillations of a project, oldest first.
 *
 * @param projectPath Project path, resolved to a project ID via ensureProject().
 * @param sessionID   When given, only rows for that session are returned.
 */
function loadDistillations(
  projectPath: string,
  sessionID?: string,
): Distillation[] {
  const pid = ensureProject(projectPath);

  if (sessionID) {
    return db()
      .query(
        "SELECT id, observations, generation, token_count, created_at, session_id FROM distillations WHERE project_id = ? AND session_id = ? AND archived = 0 ORDER BY created_at ASC",
      )
      .all(pid, sessionID) as Distillation[];
  }

  return db()
    .query(
      "SELECT id, observations, generation, token_count, created_at, session_id FROM distillations WHERE project_id = ? AND archived = 0 ORDER BY created_at ASC",
    )
    .all(pid) as Distillation[];
}
|
|
279
|
+
|
|
280
|
+
// Strip all <system-reminder>...</system-reminder> blocks from message text.
|
|
281
|
+
// For the user-message wrapper pattern, extracts the actual user text.
|
|
282
|
+
// For all other reminders (build-switch, plan reminders, etc.), drops them entirely.
|
|
283
|
+
// These tags are added by OpenCode in-memory or persisted as synthetic parts —
|
|
284
|
+
// leaving them in the raw window causes the model to echo the format.
|
|
285
|
+
/**
 * Removes every <system-reminder>…</system-reminder> block from `text`.
 *
 * A block that wraps a user message ("The user sent the following message:
 * … Please address") is replaced by the wrapped message; any other block is
 * dropped. Runs of three or more newlines are then collapsed to two and the
 * result is trimmed.
 */
function stripSystemReminders(text: string): string {
  const withoutReminders = text.replace(
    /<system-reminder>[\s\S]*?<\/system-reminder>\n?/g,
    (block) => {
      const wrapped =
        /The user sent the following message:\n([\s\S]*?)\n\nPlease address/.exec(
          block,
        );
      if (!wrapped) return "";
      return wrapped[1].trim() + "\n";
    },
  );
  const collapsed = withoutReminders.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
|
|
296
|
+
|
|
297
|
+
/**
 * Strips system reminders from every text part and drops text parts that end
 * up empty. Non-text parts pass through untouched; text parts whose text is
 * unchanged are returned by reference.
 *
 * If nothing survives (e.g. a user message that was purely build-switch
 * synthetic content), a single "..." placeholder built from the first part is
 * returned so the message survives toModelMessages. Without it the message
 * would be dropped, the conversation would end on an assistant message, and
 * Anthropic would reject it with "does not support assistant message prefill".
 */
function cleanParts(parts: LorePart[]): LorePart[] {
  const kept: LorePart[] = [];
  for (const part of parts) {
    if (!isTextPart(part)) {
      kept.push(part);
      continue;
    }
    const stripped = stripSystemReminders(part.text);
    if (stripped.trim().length === 0) continue; // became (or already was) empty
    kept.push(
      stripped === part.text ? part : ({ ...part, text: stripped } as LorePart),
    );
  }

  if (kept.length > 0) return kept;

  if (parts.length > 0) {
    const first = parts[0];
    if (isTextPart(first)) {
      return [{ ...first, text: "..." } as LorePart];
    }
  }
  return parts;
}
|
|
322
|
+
|
|
323
|
+
// Build a metadata annotation for a stripped tool output, preserving key signals
|
|
324
|
+
// about what was lost without requiring an LLM call. Inspired by the per-token
|
|
325
|
+
// scalar bias β from "Fast KV Compaction via Attention Matching" (Zweiger et al.,
|
|
326
|
+
// 2025) — when tokens are removed, preserving metadata about the removed content
|
|
327
|
+
// helps the model compensate for information loss and decide whether to recall.
|
|
328
|
+
// Reference: https://arxiv.org/abs/2602.16284
|
|
329
|
+
/**
 * Builds the one-line placeholder that replaces a stripped tool output.
 *
 * The placeholder always carries the tool name and the output's line count.
 * It additionally notes whether the output looked like it contained an error
 * and lists up to five distinct path-like strings found in it.
 *
 * @param toolName Name of the tool whose output is being dropped.
 * @param output   The original tool output.
 * @returns The bracketed annotation string.
 */
function toolStripAnnotation(toolName: string, output: string): string {
  const lines = output.split("\n").length;

  // Detect key signals via lightweight heuristics — no LLM call
  const hasError = /\b(?:error|fail(?:ed|ure)?|exception|panic|traceback)\b/i.test(output);
  const paths = output.match(/(?:[\w.-]+\/)+[\w.-]+\.\w{1,5}/g);
  const uniquePaths = paths ? [...new Set(paths)].slice(0, 5) : [];

  let annotation = `[output omitted — ${toolName}: ${lines} lines`;
  if (hasError) annotation += ", contained errors";
  if (uniquePaths.length > 0) annotation += `, paths: ${uniquePaths.join(", ")}`;
  annotation += " — use recall for details]";
  return annotation;
}
|
|
344
|
+
|
|
345
|
+
// ---------------------------------------------------------------------------
|
|
346
|
+
// Content-aware deduplication
|
|
347
|
+
// ---------------------------------------------------------------------------
|
|
348
|
+
// Inspired by Dirac's ContextManager file-read deduplication: detects when the
|
|
349
|
+
// same content appears multiple times in the conversation (e.g., the same file
|
|
350
|
+
// read multiple times, or the same command output repeated) and replaces earlier
|
|
351
|
+
// occurrences with compact annotations. This reduces token pressure before layer
|
|
352
|
+
// selection, potentially keeping sessions at lower (less lossy) gradient layers.
|
|
353
|
+
|
|
354
|
+
// Minimum output size (chars) to consider for dedup — annotations for smaller
|
|
355
|
+
// outputs would cost more tokens than the original content.
|
|
356
|
+
// Threshold on output.length (UTF-16 code units); outputs below it are ignored by deduplicateToolOutputs().
const DEDUP_MIN_CHARS = 600;
|
|
357
|
+
|
|
358
|
+
/**
 * Fast 32-bit FNV-1a hash over the string's UTF-16 code units, used for
 * content comparison.
 *
 * The multiply step uses Math.imul so it wraps at 32 bits exactly. A plain
 * `hash * 0x01000193` can exceed 2^53, where a double can no longer represent
 * the product exactly and the low bits of the hash are rounded away.
 *
 * @param str Text to hash.
 * @returns Unsigned 32-bit hash (0x811c9dc5 for the empty string).
 */
function simpleHash(str: string): number {
  let hash = 0x811c9dc5;
  for (let i = 0; i < str.length; i++) {
    hash ^= str.charCodeAt(i);
    hash = Math.imul(hash, 0x01000193);
  }
  // Math.imul yields a signed int32; normalise to the unsigned range.
  return hash >>> 0;
}
|
|
367
|
+
|
|
368
|
+
/**
 * Extracts a file path from a tool's input.
 *
 * - JSON object: returns the first non-empty string among the `path`,
 *   `filePath` and `file` keys, e.g. {"path": "/foo.ts"} → "/foo.ts".
 *   Non-string values under those keys are ignored, so the declared
 *   `string | undefined` return type always holds.
 * - Anything else (invalid JSON, or JSON that is not an object, such as a
 *   quoted string or `null`): falls back to the first path-like substring of
 *   the raw input.
 *
 * @param input Raw tool input, either JSON text or plain text.
 * @returns The extracted path, or undefined when none is found.
 */
function extractFilePath(input: string): string | undefined {
  const pathLike = /(?:[\w.-]+\/)+[\w.-]+\.\w{1,5}/;

  let parsed: unknown;
  try {
    parsed = JSON.parse(input);
  } catch {
    // Plain text — try to extract a path-like string
    return input.match(pathLike)?.[0];
  }

  if (typeof parsed !== "object" || parsed === null) {
    // JSON primitive: there are no keys to inspect, so treat it as plain text.
    return input.match(pathLike)?.[0];
  }

  const record = parsed as Record<string, unknown>;
  for (const key of ["path", "filePath", "file"]) {
    const value = record[key];
    if (typeof value === "string" && value.length > 0) return value;
  }
  return undefined;
}
|
|
381
|
+
|
|
382
|
+
/**
 * Placeholder text for a tool output that was dropped as a duplicate.
 * When a file path is known the text points at the later read of that file;
 * otherwise it names the tool whose later output has the same content.
 */
function dedupAnnotation(toolName: string, filePath?: string): string {
  return filePath
    ? `[earlier version of ${filePath} — see latest read below for current content]`
    : `[duplicate output — same content as later ${toolName} in this session — use recall for details]`;
}
|
|
389
|
+
|
|
390
|
+
/**
 * Replaces superseded tool outputs with compact annotations so that only the
 * latest occurrence of each output stays in full.
 *
 * An output is considered superseded when either:
 *   1. a later message holds a completed output of the same tool with the
 *      same content hash, or
 *   2. it comes from a read tool ("read_file" / "read") and a later message
 *      reads the same file path (content may differ because of edits).
 *
 * Only completed tool parts whose output is at least DEDUP_MIN_CHARS long are
 * considered. Messages from `currentTurnIdx` onward are never modified, but
 * they do count as "later" occurrences. Tool parts are never removed; only
 * `state.output` is swapped for the annotation.
 *
 * @returns The original `messages` array (same reference) when nothing
 *          changed, otherwise a new array with the affected messages copied.
 */
export function deduplicateToolOutputs(
  messages: MessageWithParts[],
  currentTurnIdx: number,
): MessageWithParts[] {
  const isReadTool = (tool: string): boolean =>
    tool === "read_file" || tool === "read";
  const readPathOf = (input: unknown): string | undefined =>
    extractFilePath(
      typeof input === "string" ? input : JSON.stringify(input),
    );

  // contentKey → index of the last message containing that content
  const contentLatest = new Map<string, number>();
  // "read:path" → index of the last message that read that file
  const fileLatest = new Map<string, number>();

  // Pass 1: index every eligible output, current turn included, so earlier
  // copies are recognised as duplicates of current-turn content.
  messages.forEach((msg, idx) => {
    for (const part of msg.parts) {
      if (!isToolPart(part) || part.state.status !== "completed") continue;
      const output = part.state.output;
      if (!output || output.length < DEDUP_MIN_CHARS) continue;

      contentLatest.set(`${part.tool}:${simpleHash(output)}`, idx);

      if (isReadTool(part.tool)) {
        const fp = readPathOf(part.state.input);
        if (fp) fileLatest.set(`read:${fp}`, idx);
      }
    }
  });

  // Pass 2: rewrite superseded outputs that sit before the current turn.
  let anyMessageChanged = false;
  const rewritten = messages.map((msg, msgIdx) => {
    if (msgIdx >= currentTurnIdx) return msg; // sacred boundary

    let touched = false;
    const parts = msg.parts.map((part) => {
      if (!isToolPart(part) || part.state.status !== "completed") return part;
      const output = part.state.output;
      if (!output || output.length < DEDUP_MIN_CHARS) return part;

      const supersededByContent =
        contentLatest.get(`${part.tool}:${simpleHash(output)}`) !== msgIdx;

      let filePath: string | undefined;
      let supersededByRead = false;
      if (isReadTool(part.tool)) {
        filePath = readPathOf(part.state.input);
        if (filePath) {
          supersededByRead = fileLatest.get(`read:${filePath}`) !== msgIdx;
        }
      }

      // Latest content and (for read tools) latest read of the file: keep as is.
      if (!supersededByContent && !supersededByRead) return part;

      touched = true;
      return {
        ...part,
        state: {
          ...part.state,
          output: dedupAnnotation(part.tool, filePath),
        },
      } as LorePart;
    });

    if (!touched) return msg;
    anyMessageChanged = true;
    return { ...msg, parts };
  });

  return anyMessageChanged ? rewritten : messages;
}
|
|
483
|
+
|
|
484
|
+
// Ensure every tool part in the window has a terminal state (completed or error).
|
|
485
|
+
// Pending/running tool parts produce tool_use blocks at the API level but have no
|
|
486
|
+
// output to generate a matching tool_result — causing Anthropic to reject the request
|
|
487
|
+
// with "tool_use ids were found without tool_result blocks immediately after".
|
|
488
|
+
// This happens when a session errors mid-tool-execution (e.g. context overflow) and
|
|
489
|
+
// the tool part remains in pending/running state on the next transform.
|
|
490
|
+
// Converting to error state generates both tool_use + tool_result(is_error=true).
|
|
491
|
+
/**
 * Gives every tool part of every assistant message a terminal state.
 *
 * Tool parts that are neither "completed" nor "error" (i.e. pending or
 * running) are rewritten to an "error" state that keeps the original input,
 * keeps metadata and start time when the state has them, and ends "now".
 * Non-assistant messages are left alone.
 *
 * @returns The original `messages` array (same reference) when nothing
 *          changed, otherwise a new array with the affected messages copied.
 */
function sanitizeToolParts(
  messages: MessageWithParts[],
): MessageWithParts[] {
  let anyMessageChanged = false;

  const sanitized = messages.map((msg) => {
    if (msg.info.role !== "assistant") return msg;

    let touched = false;
    const parts: LorePart[] = [];
    for (const part of msg.parts) {
      if (
        !isToolPart(part) ||
        part.state.status === "completed" ||
        part.state.status === "error"
      ) {
        parts.push(part);
        continue;
      }

      // pending or running → convert to error so SDK emits tool_result
      touched = true;
      const now = Date.now();
      parts.push({
        ...part,
        state: {
          status: "error" as const,
          input: part.state.input,
          error: "[tool execution interrupted — session recovered]",
          metadata:
            "metadata" in part.state ? part.state.metadata : undefined,
          time: {
            start: "time" in part.state ? part.state.time.start : now,
            end: now,
          },
        },
      } as LorePart);
    }

    if (!touched) return msg;
    anyMessageChanged = true;
    return { ...msg, parts };
  });

  return anyMessageChanged ? sanitized : messages;
}
|
|
530
|
+
|
|
531
|
+
/**
 * Replace the output of every completed tool part with the annotation
 * produced by `toolStripAnnotation(tool, output)`.
 *
 * Non-tool parts and tool parts in any other status are passed through
 * unchanged (same object reference). Always returns a new array.
 */
function stripToolOutputs(parts: LorePart[]): LorePart[] {
  const stripped: LorePart[] = [];
  for (const part of parts) {
    if (isToolPart(part) && part.state.status === "completed") {
      const annotated = toolStripAnnotation(part.tool, part.state.output);
      stripped.push({
        ...part,
        state: { ...part.state, output: annotated },
      } as LorePart);
    } else {
      stripped.push(part);
    }
  }
  return stripped;
}
|
|
544
|
+
|
|
545
|
+
/**
 * Reduce a part list to its text parts only, with system reminders stripped
 * and the text normalized; text parts that end up blank are dropped.
 *
 * If nothing survives but the input contains at least one text part, a single
 * "..." placeholder built from the first text part is returned instead, so the
 * message survives toModelMessages and the conversation doesn't end with an
 * assistant message. When the input has no text part at all the result is an
 * empty array.
 */
function stripToTextOnly(parts: LorePart[]): LorePart[] {
  const kept: LorePart[] = [];
  for (const part of parts) {
    if (!isTextPart(part)) continue;
    const cleaned = normalize(stripSystemReminders(part.text));
    if (cleaned.trim().length === 0) continue;
    kept.push({ ...part, text: cleaned } as LorePart);
  }
  if (kept.length > 0) return kept;

  // Everything was filtered out: fall back to a placeholder when possible.
  const firstText = parts.find(isTextPart);
  return firstText ? [{ ...firstText, text: "..." } as LorePart] : kept;
}
|
|
561
|
+
|
|
562
|
+
// --- Phase 2: Temporal anchoring at read time ---
|
|
563
|
+
|
|
564
|
+
/**
 * Describe the distance between `date` and `now` in coarse human units.
 *
 * Past dates read "today", "yesterday", "N days ago", "1 week ago",
 * "N weeks ago", "1 month ago", "N months ago", "N year(s) ago".
 * Dates later than `now` use the mirrored wording: "today" (less than a full
 * day ahead), "tomorrow", "in N days", "in 1 week", ... "in N year(s)".
 * Without the future branch a later date produced strings such as
 * "-5 days ago".
 *
 * @param date The moment being described.
 * @param now  The reference moment.
 * @returns The relative-time phrase.
 */
function formatRelativeTime(date: Date, now: Date): string {
  const MS_PER_DAY = 1000 * 60 * 60 * 24;
  const diffMs = now.getTime() - date.getTime();
  const isFuture = diffMs < 0;
  // Whole days elapsed (past) or remaining (future). Flooring the magnitude
  // keeps anything less than 24h away as "today" in both directions.
  const days = Math.floor(Math.abs(diffMs) / MS_PER_DAY);

  if (days === 0) return "today";
  if (days === 1) return isFuture ? "tomorrow" : "yesterday";

  let span: string;
  if (days < 7) {
    span = `${days} days`;
  } else if (days < 14) {
    span = "1 week";
  } else if (days < 30) {
    span = `${Math.floor(days / 7)} weeks`;
  } else if (days < 60) {
    span = "1 month";
  } else if (days < 365) {
    span = `${Math.floor(days / 30)} months`;
  } else {
    const years = Math.floor(days / 365);
    span = `${years} year${years > 1 ? "s" : ""}`;
  }

  return isFuture ? `in ${span}` : `${span} ago`;
}
|
|
576
|
+
|
|
577
|
+
/**
 * Extract a date from free text. Three shapes are tried in order and the
 * first one that yields a valid Date wins:
 *   1. "Month Day, Year"   e.g. "January 15, 2026"
 *   2. "Month D-D, Year"   a day range; the start day is used
 *   3. "late/early/mid Month Year"; mapped to day 23 / 7 / 15
 *
 * The matched pieces are handed to the `Date` constructor as
 * "Month Day, Year"; a candidate that it cannot parse is skipped.
 *
 * @returns The parsed Date, or null when no pattern produced a valid date.
 */
function parseDateFromContent(s: string): Date | null {
  // Build a Date from the textual pieces; null when the result is invalid.
  const toDate = (
    month: string,
    day: string | number,
    year: string,
  ): Date | null => {
    const parsed = new Date(`${month} ${day}, ${year}`);
    return Number.isNaN(parsed.getTime()) ? null : parsed;
  };

  const exact = /([A-Z][a-z]+)\s+(\d{1,2}),?\s+(\d{4})/.exec(s);
  if (exact) {
    const hit = toDate(exact[1], exact[2], exact[3]);
    if (hit) return hit;
  }

  const dayRange = /([A-Z][a-z]+)\s+(\d{1,2})-\d{1,2},?\s+(\d{4})/.exec(s);
  if (dayRange) {
    const hit = toDate(dayRange[1], dayRange[2], dayRange[3]);
    if (hit) return hit;
  }

  const fuzzy = /(late|early|mid)[- ]?([A-Z][a-z]+)\s+(\d{4})/i.exec(s);
  if (fuzzy) {
    const qualifier = fuzzy[1].toLowerCase();
    let day = 15; // "mid"
    if (qualifier === "early") day = 7;
    else if (qualifier === "late") day = 23;
    const hit = toDate(fuzzy[2], day, fuzzy[3]);
    if (hit) return hit;
  }

  return null;
}
|
|
604
|
+
|
|
605
|
+
/**
 * Expand "(meaning DATE)" and "(estimated DATE)" annotations with a relative
 * offset, e.g. "(meaning January 15, 2026 — 3 days ago)".
 *
 * When the date is already in the past and the text before the annotation on
 * the same line expresses future intent ("will", "plans to", "going to", ...),
 * ", likely already happened" is appended as well. Annotations whose date
 * cannot be parsed are left untouched.
 *
 * @param text Text that may contain inline date annotations.
 * @param now  Reference moment for the relative offset.
 * @returns The text with every parseable annotation expanded.
 */
function expandInlineEstimatedDates(text: string, now: Date): string {
  const futureIntentRe =
    /\b(?:will|plans?\s+to|planning\s+to|going\s+to|intends?\s+to)\b/i;

  return text.replace(
    /\(((?:meaning|estimated)\s+)([^)]+\d{4})\)/gi,
    (match: string, prefix: string, dateContent: string, offset: number) => {
      const d = parseDateFromContent(dateContent);
      if (!d) return match;
      const rel = formatRelativeTime(d, now);

      // Detect future-intent by looking backwards on the same line. The
      // replacer's own `offset` is used rather than text.indexOf(match):
      // indexOf always finds the FIRST occurrence, so an identical annotation
      // repeated on a later line would be judged against the wrong line.
      const lineStart = text.lastIndexOf("\n", offset) + 1;
      const linePrefix = text.slice(lineStart, offset);
      const isFutureIntent = futureIntentRe.test(linePrefix);

      if (d < now && isFutureIntent)
        return `(${prefix}${dateContent} — ${rel}, likely already happened)`;
      return `(${prefix}${dateContent} — ${rel})`;
    },
  );
}
|
|
628
|
+
|
|
629
|
+
/**
 * Annotate observation text with relative-time hints.
 *
 * 1. Inline "(meaning DATE)" / "(estimated DATE)" annotations are expanded via
 *    expandInlineEstimatedDates.
 * 2. Every "Date: Month D, Year" header line gets " (<relative time>)"
 *    appended.
 * 3. When a header is more than one day after the previous header, a
 *    "[N days later]" / "[N weeks later]" / "[N months later]" marker is
 *    inserted before it.
 *
 * @param text Observation text.
 * @param now  Reference moment for relative times.
 * @returns The annotated text (the inline-expanded text when no header is found).
 */
function addRelativeTimeToObservations(text: string, now: Date): string {
  const MS_PER_DAY = 86400000;

  // First pass: expand inline "(meaning DATE)" annotations
  const withInline = expandInlineEstimatedDates(text, now);

  // Second pass: annotate date headers and add gap markers
  const dateHeaderRe = /^(Date:\s*)([A-Z][a-z]+ \d{1,2}, \d{4})$/gm;
  const found: Array<{
    index: number;
    date: Date;
    full: string;
    prefix: string;
    ds: string;
  }> = [];
  let m: RegExpExecArray | null;
  while ((m = dateHeaderRe.exec(withInline)) !== null) {
    const d = new Date(m[2]);
    // Headers whose date text cannot be parsed are left as plain text.
    if (!Number.isNaN(d.getTime()))
      found.push({
        index: m.index,
        date: d,
        full: m[0],
        prefix: m[1],
        ds: m[2],
      });
  }
  if (!found.length) return withInline;

  let result = "";
  let last = 0;
  for (let i = 0; i < found.length; i++) {
    const curr = found[i];
    const prev = found[i - 1];
    result += withInline.slice(last, curr.index);
    // Gap marker between non-consecutive dates
    if (prev) {
      // Round rather than floor: both headers carry no time of day, and
      // engines typically parse such non-ISO strings as local midnight, so
      // across a daylight-saving change two headers can be 23 or 25 hours
      // apart per calendar day. Flooring 47h would turn a 2-day gap into 1
      // and silently drop the marker.
      const gapDays = Math.round(
        (curr.date.getTime() - prev.date.getTime()) / MS_PER_DAY,
      );
      if (gapDays > 1) {
        const gap =
          gapDays < 7
            ? `[${gapDays} days later]`
            : gapDays < 14
              ? "[1 week later]"
              : gapDays < 30
                ? `[${Math.floor(gapDays / 7)} weeks later]`
                : gapDays < 60
                  ? "[1 month later]"
                  : `[${Math.floor(gapDays / 30)} months later]`;
        result += `\n${gap}\n\n`;
      }
    }
    result += `${curr.prefix}${curr.ds} (${formatRelativeTime(curr.date, now)})`;
    last = curr.index + curr.full.length;
  }
  result += withInline.slice(last);
  return result;
}
|
|
689
|
+
|
|
690
|
+
/**
 * Wrap formatted distillation text in a synthetic user/assistant message pair.
 * Shared by the cached and non-cached prefix paths.
 *
 * The user message carries a fixed "[Memory context follows ...]" notice; the
 * assistant message carries `formatted` followed by "I'm ready to continue.".
 * All ids are fixed "lore-distilled-*" strings and every numeric field is 0.
 *
 * @param formatted Rendered distillation text.
 * @returns A two-element array: [user message, assistant message].
 */
function buildPrefixMessages(formatted: string): MessageWithParts[] {
  const USER_ID = "lore-distilled-user";
  const ASSISTANT_ID = "lore-distilled-assistant";

  const userMessage: MessageWithParts = {
    info: {
      id: USER_ID,
      sessionID: "",
      role: "user" as const,
      time: { created: 0 },
      agent: "",
      model: { providerID: "", modelID: "" },
    },
    parts: [
      {
        id: `${USER_ID}-part`,
        sessionID: "",
        messageID: USER_ID,
        type: "text" as const,
        text: "[Memory context follows — do not reference this format in your responses]",
        time: { start: 0, end: 0 },
      },
    ],
  };

  const assistantMessage: MessageWithParts = {
    info: {
      id: ASSISTANT_ID,
      sessionID: "",
      role: "assistant" as const,
      time: { created: 0 },
      parentID: USER_ID,
      modelID: "",
      providerID: "",
      mode: "memory",
      path: { cwd: "", root: "" },
      cost: 0,
      tokens: {
        input: 0,
        output: 0,
        reasoning: 0,
        cache: { read: 0, write: 0 },
      },
    },
    parts: [
      {
        id: `${ASSISTANT_ID}-part`,
        sessionID: "",
        messageID: ASSISTANT_ID,
        type: "text" as const,
        text: `${formatted}\n\nI'm ready to continue.`,
        time: { start: 0, end: 0 },
      },
    ],
  };

  return [userMessage, assistantMessage];
}
|
|
746
|
+
|
|
747
|
+
/**
 * Build the synthetic message pair containing the distilled history.
 * Non-cached path — used by layers 2-4, which already cause full cache
 * invalidation.
 *
 * Each row's observations are annotated with relative times as of the moment
 * of the call before formatting.
 *
 * @returns The prefix message pair, or [] when there are no rows or the
 *          formatted text is empty.
 */
function distilledPrefix(distillations: Distillation[]): MessageWithParts[] {
  if (distillations.length === 0) return [];

  const renderedAt = new Date();
  const withRelativeTimes = distillations.map((row) => {
    const observations = addRelativeTimeToObservations(
      row.observations,
      renderedAt,
    );
    return { ...row, observations };
  });

  const text = formatDistillations(withRelativeTimes);
  return text ? buildPrefixMessages(text) : [];
}
|
|
760
|
+
|
|
761
|
+
// --- Approach C: Append-only distillation prefix cache ---
|
|
762
|
+
//
|
|
763
|
+
// Caches the rendered prefix text per session. When new distillations arrive,
|
|
764
|
+
// only renders the new rows and appends them to the cached text. This keeps
|
|
765
|
+
// the prefix byte-identical between distillation runs, preserving the prompt
|
|
766
|
+
// cache. Only meta-distillation (which rewrites gen-0 rows into gen-1) causes
|
|
767
|
+
// a full re-render — and that happens roughly every 80-100 turns.
|
|
768
|
+
|
|
769
|
+
/** Per-session snapshot of the rendered distillation prefix. */
type PrefixCache = {
  /** Session that owns this snapshot. */
  sessionID: string;
  /** ID of the newest distillation row already rendered into `cachedText`. */
  lastDistillationID: string;
  /** How many distillation rows went into `cachedText`. */
  rowCount: number;
  /** Rendered prefix text; newly rendered rows are appended to it. */
  cachedText: string;
  /** Message pair built from `cachedText`, ready to be returned as-is. */
  prefixMessages: MessageWithParts[];
  /** Estimated token count of `prefixMessages`. */
  prefixTokens: number;
};
|
|
783
|
+
|
|
784
|
+
/**
 * Return the distilled prefix messages, reusing cached content when possible.
 * State lives in `sessState.prefixCache` (per session, no module-level cache).
 *
 * - No rows: the cache is cleared and an empty result is returned.
 * - Cache hit, no new rows: the cached `prefixMessages` object is returned
 *   unchanged (byte-identical content, prompt cache preserved).
 * - Cache hit, rows appended: only the new rows are rendered, appended to the
 *   cached text after a blank line, and the cache is updated.
 * - Otherwise (first call, rows rewritten by meta-distillation, or an empty
 *   delta render): everything is rendered from scratch.
 *
 * @returns The prefix messages and their estimated token count.
 */
function distilledPrefixCached(
  distillations: Distillation[],
  sessionID: string,
  sessState: SessionState,
): { messages: MessageWithParts[]; tokens: number } {
  if (distillations.length === 0) {
    sessState.prefixCache = null;
    return { messages: [], tokens: 0 };
  }

  const newest = distillations[distillations.length - 1];
  const cache = sessState.prefixCache;

  // Annotate rows with relative times (as of this render) and format them.
  const render = (rows: Distillation[]) => {
    const renderedAt = new Date();
    return formatDistillations(
      rows.map((row) => ({
        ...row,
        observations: addRelativeTimeToObservations(
          row.observations,
          renderedAt,
        ),
      })),
    );
  };

  // Build the message pair for `text`, record it as the new cache, return it.
  const storeAndReturn = (text: string) => {
    const messages = buildPrefixMessages(text);
    const tokens = messages.reduce((sum, m) => sum + estimateMessage(m), 0);
    sessState.prefixCache = {
      sessionID,
      lastDistillationID: newest.id,
      rowCount: distillations.length,
      cachedText: text,
      prefixMessages: messages,
      prefixTokens: tokens,
    };
    return { messages, tokens };
  };

  // Reusable when: same session, the row count only grew (no rewrites), and
  // the last previously-cached row still sits at the same position.
  const reusable =
    cache !== null &&
    cache.sessionID === sessionID &&
    cache.rowCount <= distillations.length &&
    (cache.rowCount === 0 ||
      distillations[cache.rowCount - 1]?.id === cache.lastDistillationID);

  if (cache !== null && reusable) {
    if (cache.lastDistillationID === newest.id) {
      // Nothing new — hand back the cached pair untouched.
      return {
        messages: cache.prefixMessages,
        tokens: cache.prefixTokens,
      };
    }

    // Rows were appended — render just those and extend the cached text.
    const deltaText = render(distillations.slice(cache.rowCount));
    if (deltaText) return storeAndReturn(`${cache.cachedText}\n\n${deltaText}`);
    // An empty delta falls through to the full re-render below.
  }

  // Full re-render: first call or meta-distillation rewrote rows.
  const fullText = render(distillations);
  if (!fullText) {
    sessState.prefixCache = null;
    return { messages: [], tokens: 0 };
  }
  return storeAndReturn(fullText);
}
|
|
876
|
+
|
|
877
|
+
/**
 * For testing only — clear the prefix cache.
 *
 * @param sessionID When given (and non-empty), only that session's cache is
 *                  cleared, if the session has state; otherwise every
 *                  session's cache is cleared.
 */
export function resetPrefixCache(sessionID?: string) {
  if (!sessionID) {
    for (const state of sessionStates.values()) state.prefixCache = null;
    return;
  }
  const target = sessionStates.get(sessionID);
  if (target) target.prefixCache = null;
}
|
|
886
|
+
|
|
887
|
+
// --- Approach B: Lazy raw window eviction ---
|
|
888
|
+
//
|
|
889
|
+
// Tracks the ID of the first (oldest) message in the previous raw window.
|
|
890
|
+
// On the next turn, if the window starting at that message still fits within
|
|
891
|
+
// the raw budget, the cutoff is pinned — no messages are evicted and the raw
|
|
892
|
+
// window stays byte-identical for caching purposes. Only when the pinned
|
|
893
|
+
// window no longer fits (e.g. a large tool response pushed us over) is the
|
|
894
|
+
// cutoff allowed to advance forward by one message at a time.
|
|
895
|
+
//
|
|
896
|
+
// This eliminates the "window sliding on every turn" problem that was the
|
|
897
|
+
// dominant source of cache misses in gradient mode: each new turn appends a
|
|
898
|
+
// message to the conversation, but the start of the raw window only moves
|
|
899
|
+
// when it must.
|
|
900
|
+
//
|
|
901
|
+
// Reset conditions: session changes, or layer escalates to 2+ (the pinned
|
|
902
|
+
// window was too large even with stripping — something genuinely changed).
|
|
903
|
+
|
|
904
|
+
/** Per-session record of where the pinned raw window starts. */
type RawWindowCache = {
  /** Session that owns this record. */
  sessionID: string;
  /** ID of the oldest (first) message in the pinned raw window. */
  firstMessageID: string;
};
|
|
909
|
+
|
|
910
|
+
/**
 * For testing only — clear the raw window cache.
 *
 * @param sessionID When given (and non-empty), only that session's cache is
 *                  cleared, if the session has state; otherwise every
 *                  session's cache is cleared.
 */
export function resetRawWindowCache(sessionID?: string) {
  if (!sessionID) {
    for (const state of sessionStates.values()) state.rawWindowCache = null;
    return;
  }
  const target = sessionStates.get(sessionID);
  if (target) target.rawWindowCache = null;
}
|
|
919
|
+
|
|
920
|
+
/**
 * Layer-1 tryFit with lazy eviction, backed by the per-session
 * `sessState.rawWindowCache` (no module-level cache).
 *
 * The previous raw-window start is tried first: if the message pinned in the
 * cache is still present and the window from it to the end fits `rawBudget`,
 * that window is returned after `cleanParts` cleanup, reusing every message
 * object whose parts were not changed. Otherwise the regular `tryFit` scan
 * (strip: "none") runs and, when it succeeds, the first raw message of its
 * result becomes the new pin.
 *
 * @returns The fitted window with token counts, or null when the prefix alone
 *          overflows `distilledBudget` or `tryFit` finds no fit.
 */
function tryFitStable(input: {
  messages: MessageWithParts[];
  prefix: MessageWithParts[];
  prefixTokens: number;
  distilledBudget: number;
  rawBudget: number;
  sessionID: string;
  sessState: SessionState;
}): Omit<TransformResult, "layer" | "usable" | "distilledBudget" | "rawBudget"> | null {
  const {
    messages,
    prefix,
    prefixTokens,
    distilledBudget,
    rawBudget,
    sessionID,
    sessState,
  } = input;

  // A non-empty prefix that already exceeds its budget cannot fit.
  if (prefix.length > 0 && prefixTokens > distilledBudget) return null;

  const pinned = sessState.rawWindowCache;
  if (pinned !== null && pinned.sessionID === sessionID) {
    const startIdx = messages.findIndex(
      (m) => m.info.id === pinned.firstMessageID,
    );

    if (startIdx >= 0) {
      // Token cost of keeping the window exactly where it was.
      const pinnedWindow = messages.slice(startIdx);
      let pinnedTokens = 0;
      for (const msg of pinnedWindow) pinnedTokens += estimateMessage(msg);

      if (pinnedTokens <= rawBudget) {
        // Still fits — keep the cutoff. Only cleanParts is applied, and a
        // message is re-wrapped only when cleanParts returned a new array.
        const cleaned = pinnedWindow.map((msg) => {
          const parts = cleanParts(msg.parts);
          return parts === msg.parts ? msg : { info: msg.info, parts };
        });
        return {
          messages: [...prefix, ...cleaned],
          distilledTokens: prefixTokens,
          rawTokens: pinnedTokens,
          totalTokens: prefixTokens + pinnedTokens,
        };
      }
      // Too large now — fall through to the regular scan.
    }
  }

  // Regular backward scan for the tightest fitting cutoff.
  const scanned = tryFit({
    messages,
    prefix,
    prefixTokens,
    distilledBudget,
    rawBudget,
    strip: "none",
  });

  // The first non-prefix message of the result is the oldest raw message in
  // the new window; pin its ID for the next turn.
  const oldestRaw = scanned?.messages[prefix.length];
  if (oldestRaw) {
    sessState.rawWindowCache = {
      sessionID,
      firstMessageID: oldestRaw.info.id,
    };
  }

  return scanned;
}
|
|
1004
|
+
|
|
1005
|
+
/** Compression layer chosen for a transform; 0 returns the messages unmodified. */
export type SafetyLayer = 0 | 1 | 2 | 3 | 4;

/** Outcome of a context transform. */
export type TransformResult = {
  /** Messages to send (distilled prefix, if any, followed by raw messages). */
  messages: MessageWithParts[];
  /** Layer that produced this result. */
  layer: SafetyLayer;
  /** Token estimate attributed to the distilled prefix. */
  distilledTokens: number;
  /** Token estimate attributed to the raw messages. */
  rawTokens: number;
  /** Token estimate for the result as a whole. */
  totalTokens: number;
  // Budget context (for display in context inspector)
  usable: number;
  distilledBudget: number;
  rawBudget: number;
};
|
|
1018
|
+
|
|
1019
|
+
// Module-level flag: set when urgent distillation is needed.
let urgentDistillation = false;

/**
 * Read-and-clear accessor for the urgent-distillation flag.
 *
 * @returns true if the flag was set since the previous call; the flag is
 *          false again once this returns.
 */
export function needsUrgentDistillation(): boolean {
  const wasUrgent = urgentDistillation;
  if (wasUrgent) urgentDistillation = false;
  return wasUrgent;
}
|
|
1026
|
+
|
|
1027
|
+
function transformInner(input: {
|
|
1028
|
+
messages: MessageWithParts[];
|
|
1029
|
+
projectPath: string;
|
|
1030
|
+
sessionID?: string;
|
|
1031
|
+
}): TransformResult {
|
|
1032
|
+
const cfg = config();
|
|
1033
|
+
const overhead = getOverhead();
|
|
1034
|
+
// Usable = full context minus output reservation minus fixed overhead (system + tools)
|
|
1035
|
+
// minus LTM tokens already injected into the system prompt this turn.
|
|
1036
|
+
const usable = Math.max(
|
|
1037
|
+
0,
|
|
1038
|
+
contextLimit - outputReserved - overhead - ltmTokens,
|
|
1039
|
+
);
|
|
1040
|
+
const distilledBudget = Math.floor(usable * cfg.budget.distilled);
|
|
1041
|
+
const rawBudget = Math.floor(usable * cfg.budget.raw);
|
|
1042
|
+
|
|
1043
|
+
// --- Force escalation (reactive error recovery) ---
|
|
1044
|
+
// When the API previously rejected with "prompt is too long", skip layers
|
|
1045
|
+
// below the forced minimum to ensure enough trimming on the next attempt.
|
|
1046
|
+
// One-shot: consumed here and reset to 0 (both in-memory and on disk).
|
|
1047
|
+
const sid = input.sessionID ?? input.messages[0]?.info.sessionID;
|
|
1048
|
+
const sessState = sid ? getSessionState(sid) : makeSessionState();
|
|
1049
|
+
let effectiveMinLayer = sessState.forceMinLayer;
|
|
1050
|
+
sessState.forceMinLayer = 0;
|
|
1051
|
+
if (sid && effectiveMinLayer > 0) saveForceMinLayer(sid, 0);
|
|
1052
|
+
|
|
1053
|
+
// --- Approach A: Cache-preserving passthrough ---
|
|
1054
|
+
// Use exact token count from the previous API response when available.
|
|
1055
|
+
// Only the delta (messages added since last call) uses chars/3 estimation,
|
|
1056
|
+
// making the layer-0 decision highly accurate from the API's own tokenizer.
|
|
1057
|
+
// maxInput = absolute ceiling the API enforces: input_tokens + max_tokens <= context
|
|
1058
|
+
const maxInput = contextLimit - outputReserved;
|
|
1059
|
+
|
|
1060
|
+
// True when we have real API token data from a previous turn in this session.
|
|
1061
|
+
// When false (first turn / session change), chars/3 estimates may still diverge
|
|
1062
|
+
// from the real tokenizer — so tryFit output must be validated with a safety
|
|
1063
|
+
// multiplier before being used.
|
|
1064
|
+
const calibrated = sessState.lastKnownInput > 0;
|
|
1065
|
+
|
|
1066
|
+
// On uncalibrated turns, apply this multiplier to tryFit's estimated total to
|
|
1067
|
+
// approximate the real token count. chars/3 undercounts by ~1.68x on real data,
|
|
1068
|
+
// but overhead EMA captures most of the gap. 1.5 provides a safe margin.
|
|
1069
|
+
const UNCALIBRATED_SAFETY = 1.5;
|
|
1070
|
+
|
|
1071
|
+
// Returns true if the tryFit result is safe to use: either we have calibrated
|
|
1072
|
+
// data (exact) or the estimated total * safety factor fits within maxInput.
|
|
1073
|
+
function fitsWithSafetyMargin(result: { totalTokens: number } | null): boolean {
|
|
1074
|
+
if (!result) return false;
|
|
1075
|
+
if (calibrated) return true;
|
|
1076
|
+
return result.totalTokens * UNCALIBRATED_SAFETY <= maxInput;
|
|
1077
|
+
}
|
|
1078
|
+
|
|
1079
|
+
// --- Sticky layer guard (Option C) ---
|
|
1080
|
+
// After a compressed turn (layer >= 1), don't allow layer 0 re-entry until
|
|
1081
|
+
// the session genuinely shrinks (e.g. after compaction deletes messages).
|
|
1082
|
+
// Prevents the calibration oscillation: a compressed turn stores
|
|
1083
|
+
// lastKnownInput=100K for a 50-message window, but the next turn's
|
|
1084
|
+
// input.messages has 300 raw messages. The delta estimation treats the 250
|
|
1085
|
+
// evicted messages as "new" and undercounts their tokens, producing an
|
|
1086
|
+
// expectedInput that fits in layer 0 — but the actual tokens are ~190K.
|
|
1087
|
+
// Only applied when calibrated (same session, per-session state) to avoid
|
|
1088
|
+
// affecting other sessions including worker sessions.
|
|
1089
|
+
if (calibrated && sessState.lastLayer >= 1 && input.messages.length >= sessState.lastKnownMessageCount) {
|
|
1090
|
+
effectiveMinLayer = Math.max(effectiveMinLayer, 1) as SafetyLayer;
|
|
1091
|
+
}
|
|
1092
|
+
|
|
1093
|
+
let expectedInput: number;
|
|
1094
|
+
if (calibrated) {
|
|
1095
|
+
// Exact approach: prior API count + estimate of only genuinely new messages.
|
|
1096
|
+
// Use message ID tracking (Option B) to identify new messages accurately.
|
|
1097
|
+
// After compression, the "last window" is a subset of the full message array —
|
|
1098
|
+
// counting by index would treat evicted messages as new (off-by-250 error).
|
|
1099
|
+
const newMessages = sessState.lastWindowMessageIDs.size > 0
|
|
1100
|
+
? input.messages.filter((m) => !sessState.lastWindowMessageIDs.has(m.info.id))
|
|
1101
|
+
: input.messages.slice(-Math.max(0, input.messages.length - sessState.lastKnownMessageCount));
|
|
1102
|
+
const newMsgTokens = newMessages.reduce((s, m) => s + estimateMessage(m), 0);
|
|
1103
|
+
const ltmDelta = ltmTokens - sessState.lastKnownLtm;
|
|
1104
|
+
expectedInput = sessState.lastKnownInput + newMsgTokens + ltmDelta;
|
|
1105
|
+
} else {
|
|
1106
|
+
// First turn or session change: fall back to chars/3 estimate + overhead.
|
|
1107
|
+
const messageTokens = input.messages.reduce((s, m) => s + estimateMessage(m), 0);
|
|
1108
|
+
expectedInput = messageTokens + overhead + ltmTokens;
|
|
1109
|
+
}
|
|
1110
|
+
|
|
1111
|
+
// When uncalibrated, apply safety multiplier to the layer-0 decision too.
|
|
1112
|
+
// chars/3 undercounts by ~1.63x on real sessions — without this, a session
|
|
1113
|
+
// estimated at 146K passes layer 0 but actually costs 214K → overflow.
|
|
1114
|
+
const layer0Input = calibrated ? expectedInput : expectedInput * UNCALIBRATED_SAFETY;
|
|
1115
|
+
|
|
1116
|
+
if (effectiveMinLayer === 0 && layer0Input <= maxInput) {
|
|
1117
|
+
// All messages fit — return unmodified to preserve append-only prompt-cache pattern.
|
|
1118
|
+
// Raw messages are strictly better context than lossy distilled summaries.
|
|
1119
|
+
const messageTokens = calibrated
|
|
1120
|
+
? expectedInput - (ltmTokens - sessState.lastKnownLtm) // approximate raw portion
|
|
1121
|
+
: expectedInput - overhead - ltmTokens;
|
|
1122
|
+
return {
|
|
1123
|
+
messages: input.messages,
|
|
1124
|
+
layer: 0,
|
|
1125
|
+
distilledTokens: 0,
|
|
1126
|
+
rawTokens: Math.max(0, messageTokens),
|
|
1127
|
+
totalTokens: Math.max(0, messageTokens),
|
|
1128
|
+
usable,
|
|
1129
|
+
distilledBudget,
|
|
1130
|
+
rawBudget,
|
|
1131
|
+
};
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1134
|
+
// --- Gradient mode: context exhausted (or force-escalated), compress older messages ---
|
|
1135
|
+
|
|
1136
|
+
// Pre-pass: deduplicate repeated tool outputs before layer selection.
|
|
1137
|
+
// Keeps only the latest occurrence of each unique output, replacing earlier
|
|
1138
|
+
// ones with compact annotations. This can save thousands of tokens for sessions
|
|
1139
|
+
// with repeated file reads, potentially avoiding escalation to higher layers.
|
|
1140
|
+
const turnStart = currentTurnStart(input.messages);
|
|
1141
|
+
const dedupMessages = deduplicateToolOutputs(input.messages, turnStart);
|
|
1142
|
+
|
|
1143
|
+
const distillations = sid ? loadDistillations(input.projectPath, sid) : [];
|
|
1144
|
+
|
|
1145
|
+
// Layer 1 uses the append-only cached prefix (Approach C) to keep the
|
|
1146
|
+
// distilled content byte-identical between distillation runs, preserving
|
|
1147
|
+
// the prompt cache. Layers 2-4 already cause full cache invalidation via
|
|
1148
|
+
// tool stripping / message restructuring, so they use the non-cached path.
|
|
1149
|
+
const cached = sid
|
|
1150
|
+
? distilledPrefixCached(distillations, sid, sessState)
|
|
1151
|
+
: (() => {
|
|
1152
|
+
const msgs = distilledPrefix(distillations);
|
|
1153
|
+
return { messages: msgs, tokens: msgs.reduce((sum, m) => sum + estimateMessage(m), 0) };
|
|
1154
|
+
})();
|
|
1155
|
+
|
|
1156
|
+
// Layer 1: Normal budget allocation with lazy raw window eviction (Approach B).
|
|
1157
|
+
// tryFitStable reuses the previous cutoff when it still fits, keeping the raw
|
|
1158
|
+
// window byte-identical across turns for prompt caching. Only advances the
|
|
1159
|
+
// cutoff when a genuinely oversized message forces eviction.
|
|
1160
|
+
// Skipped when force-escalated to layer 2+ (previous attempt already failed at this level).
|
|
1161
|
+
if (effectiveMinLayer <= 1) {
|
|
1162
|
+
const layer1 = sid
|
|
1163
|
+
? tryFitStable({
|
|
1164
|
+
messages: dedupMessages,
|
|
1165
|
+
prefix: cached.messages,
|
|
1166
|
+
prefixTokens: cached.tokens,
|
|
1167
|
+
distilledBudget,
|
|
1168
|
+
rawBudget,
|
|
1169
|
+
sessionID: sid,
|
|
1170
|
+
sessState,
|
|
1171
|
+
})
|
|
1172
|
+
: tryFit({
|
|
1173
|
+
messages: dedupMessages,
|
|
1174
|
+
prefix: cached.messages,
|
|
1175
|
+
prefixTokens: cached.tokens,
|
|
1176
|
+
distilledBudget,
|
|
1177
|
+
rawBudget,
|
|
1178
|
+
strip: "none",
|
|
1179
|
+
});
|
|
1180
|
+
if (fitsWithSafetyMargin(layer1)) return { ...layer1!, layer: 1, usable, distilledBudget, rawBudget };
|
|
1181
|
+
}
|
|
1182
|
+
|
|
1183
|
+
// Layer 1 didn't fit (or was force-skipped) — reset the raw window cache.
|
|
1184
|
+
// Layers 2-4 use full scans and already break the prompt cache.
|
|
1185
|
+
sessState.rawWindowCache = null;
|
|
1186
|
+
|
|
1187
|
+
// Layer 2: Strip tool outputs from older messages, keep last 2 turns
|
|
1188
|
+
// Skipped when force-escalated to layer 3+.
|
|
1189
|
+
if (effectiveMinLayer <= 2) {
|
|
1190
|
+
const layer2 = tryFit({
|
|
1191
|
+
messages: dedupMessages,
|
|
1192
|
+
prefix: cached.messages,
|
|
1193
|
+
prefixTokens: cached.tokens,
|
|
1194
|
+
distilledBudget,
|
|
1195
|
+
rawBudget: Math.floor(usable * 0.5), // give raw more room
|
|
1196
|
+
strip: "old-tools",
|
|
1197
|
+
protectedTurns: 2,
|
|
1198
|
+
});
|
|
1199
|
+
if (fitsWithSafetyMargin(layer2)) {
|
|
1200
|
+
urgentDistillation = true;
|
|
1201
|
+
return { ...layer2!, layer: 2, usable, distilledBudget, rawBudget };
|
|
1202
|
+
}
|
|
1203
|
+
}
|
|
1204
|
+
|
|
1205
|
+
// Layer 3: Strip ALL tool outputs, drop oldest distillations
|
|
1206
|
+
const trimmedDistillations = distillations.slice(-5);
|
|
1207
|
+
const trimmedPrefix = distilledPrefix(trimmedDistillations);
|
|
1208
|
+
const trimmedPrefixTokens = trimmedPrefix.reduce(
|
|
1209
|
+
(sum, m) => sum + estimateMessage(m),
|
|
1210
|
+
0,
|
|
1211
|
+
);
|
|
1212
|
+
const layer3 = tryFit({
|
|
1213
|
+
messages: dedupMessages,
|
|
1214
|
+
prefix: trimmedPrefix,
|
|
1215
|
+
prefixTokens: trimmedPrefixTokens,
|
|
1216
|
+
distilledBudget: Math.floor(usable * 0.15),
|
|
1217
|
+
rawBudget: Math.floor(usable * 0.55),
|
|
1218
|
+
strip: "all-tools",
|
|
1219
|
+
});
|
|
1220
|
+
if (fitsWithSafetyMargin(layer3)) {
|
|
1221
|
+
urgentDistillation = true;
|
|
1222
|
+
return { ...layer3!, layer: 3, usable, distilledBudget, rawBudget };
|
|
1223
|
+
}
|
|
1224
|
+
|
|
1225
|
+
// Layer 4: Emergency — last 2 distillations, last 3 raw messages with tool parts intact.
|
|
1226
|
+
// We do NOT strip tool parts here: doing so would cause an infinite tool-call loop because
|
|
1227
|
+
// the model would lose sight of its own in-progress tool calls and re-invoke them endlessly.
|
|
1228
|
+
// Instead, we aggressively drop old messages and rely on the `recall` tool (which the model
|
|
1229
|
+
// is always instructed to use) to retrieve any older details it needs.
|
|
1230
|
+
urgentDistillation = true;
|
|
1231
|
+
const nuclearDistillations = distillations.slice(-2);
|
|
1232
|
+
const nuclearPrefix = distilledPrefix(nuclearDistillations);
|
|
1233
|
+
const nuclearPrefixTokens = nuclearPrefix.reduce(
|
|
1234
|
+
(sum, m) => sum + estimateMessage(m),
|
|
1235
|
+
0,
|
|
1236
|
+
);
|
|
1237
|
+
const nuclearRaw = input.messages.slice(-3).map((m) => ({
|
|
1238
|
+
info: m.info,
|
|
1239
|
+
parts: cleanParts(m.parts),
|
|
1240
|
+
}));
|
|
1241
|
+
const nuclearRawTokens = nuclearRaw.reduce(
|
|
1242
|
+
(sum, m) => sum + estimateMessage(m),
|
|
1243
|
+
0,
|
|
1244
|
+
);
|
|
1245
|
+
|
|
1246
|
+
return {
|
|
1247
|
+
messages: [...nuclearPrefix, ...nuclearRaw],
|
|
1248
|
+
layer: 4,
|
|
1249
|
+
distilledTokens: nuclearPrefixTokens,
|
|
1250
|
+
rawTokens: nuclearRawTokens,
|
|
1251
|
+
totalTokens: nuclearPrefixTokens + nuclearRawTokens,
|
|
1252
|
+
usable,
|
|
1253
|
+
distilledBudget,
|
|
1254
|
+
rawBudget,
|
|
1255
|
+
};
|
|
1256
|
+
}
|
|
1257
|
+
|
|
1258
|
+
// Public entry point around transformInner.
//
// Besides producing the window, it records per-session bookkeeping used for
// calibration. Calibration has to know how many messages were actually SENT
// to the model (the compressed window), not how many exist in the DB. On
// layer 0 the two counts match; on layers 1-4 the window is smaller. If the
// next turn's "new messages" delta were measured against the full DB count
// while the expected input is anchored to the compressed token count,
// newMsgCount would come out ≈ 0 and an overflowing session would fall back
// to layer 0 passthrough.
export function transform(input: {
  messages: MessageWithParts[];
  projectPath: string;
  sessionID?: string;
}): TransformResult {
  const result = transformInner(input);

  // Non-terminal tool parts are sanitized here, after transformInner, so that
  // every layer (0-4) is covered before the window reaches the SDK and before
  // the trailing-drop loop in index.ts looks at the messages.
  result.messages = sanitizeToolParts(result.messages);

  // Prefer the explicit session ID; otherwise fall back to the first message's.
  const sid = input.sessionID ?? input.messages[0]?.info.sessionID;
  if (!sid) return result;

  // Snapshot what was sent so the next turn can be compared against it.
  const { messages: windowMessages, totalTokens, layer } = result;
  const state = getSessionState(sid);
  state.lastTransformedCount = windowMessages.length;
  state.lastTransformEstimate = totalTokens;
  state.lastLayer = layer;
  state.lastWindowMessageIDs = new Set(windowMessages.map((m) => m.info.id));
  return result;
}
|
|
1288
|
+
|
|
1289
|
+
// Total of our own per-message estimates for `messages` (used for calibration).
// Returns 0 for an empty list.
export function estimateMessages(messages: MessageWithParts[]): number {
  let total = 0;
  for (const message of messages) {
    total += estimateMessage(message);
  }
  return total;
}
|
|
1293
|
+
|
|
1294
|
+
// Locate where the current agentic turn begins: the index of the last message
// whose role is "user". Everything from that index to the end of the list is
// treated as one atomic unit by callers — the model must see all of it, or it
// loses track of its own prior tool calls and re-issues them in an infinite loop.
// Only `info.role` is inspected here. Returns 0 when there is no user message
// (including an empty list), i.e. the whole list counts as the current turn.
function currentTurnStart(messages: MessageWithParts[]): number {
  let idx = messages.length - 1;
  // Scan backwards until a user message is found or the list is exhausted.
  while (idx >= 0 && messages[idx].info.role !== "user") {
    idx--;
  }
  return idx < 0 ? 0 : idx;
}
|
|
1310
|
+
|
|
1311
|
+
// Try to assemble a window of `prefix` + a suffix of `messages` within the
// given budgets.
//
// Returns null (so the caller can escalate to the next layer) when either
//   - a non-empty prefix is larger than `distilledBudget`, or
//   - the current turn on its own is larger than `rawBudget`.
// Otherwise returns the combined messages plus token accounting. The token
// figures are estimated on the messages BEFORE any tool-output stripping.
//
// `strip` selects which non-current-turn messages get stripToolOutputs applied:
//   "none"      - no message is stripped
//   "old-tools" - the last `protectedTurns * 2` messages of the raw window are spared
//   "all-tools" - every message outside the current turn is stripped
// cleanParts is applied to every message regardless of `strip`.
function tryFit(input: {
  messages: MessageWithParts[];
  prefix: MessageWithParts[];
  prefixTokens: number;
  distilledBudget: number;
  rawBudget: number;
  strip: "none" | "old-tools" | "all-tools";
  protectedTurns?: number;
}): Omit<TransformResult, "layer" | "usable" | "distilledBudget" | "rawBudget"> | null {
  const { messages, prefix, prefixTokens, distilledBudget, rawBudget, strip } = input;

  // An over-budget distilled prefix fails this layer outright.
  const prefixOverBudget = prefixTokens > distilledBudget && prefix.length > 0;
  if (prefixOverBudget) return null;

  // The current turn (last user message and everything after it) is always
  // kept and never evicted, so its cost is reserved first.
  const turnStart = currentTurnStart(messages);
  const currentTurn = messages.slice(turnStart);
  let currentTurnTokens = 0;
  for (const m of currentTurn) {
    currentTurnTokens += estimateMessage(m);
  }

  // If the current turn alone does not fit, nothing else can help at this
  // layer; report failure so the caller moves on to a stripping layer.
  if (currentTurnTokens > rawBudget) return null;

  // Extend the window backwards over the older messages, newest first, until
  // the next one would overflow what is left of the raw budget. `cutoff` is
  // the index of the oldest older-message that is kept; it starts at
  // olderMessages.length, meaning none are kept.
  const olderMessages = messages.slice(0, turnStart);
  const remainingBudget = rawBudget - currentTurnTokens;
  let olderTokens = 0;
  let cutoff = olderMessages.length;
  while (cutoff > 0) {
    const tokens = estimateMessage(olderMessages[cutoff - 1]);
    if (olderTokens + tokens > remainingBudget) break;
    olderTokens += tokens;
    cutoff--;
  }

  const rawMessages = [...olderMessages.slice(cutoff), ...currentTurn];
  const rawTokens = olderTokens + currentTurnTokens;

  // Clean every message; additionally strip tool outputs from the ones that
  // are not protected. Current-turn messages are always protected.
  const currentTurnIDs = new Set(currentTurn.map((m) => m.info.id));
  const protectedTail = protectedTurns(input) * 2;
  const processed = rawMessages.map((msg, idx) => {
    const distanceFromEnd = rawMessages.length - idx;
    const keepToolOutputs =
      currentTurnIDs.has(msg.info.id) ||
      strip === "none" ||
      (strip === "old-tools" && distanceFromEnd <= protectedTail);
    // "old-tools" and "all-tools" strip an unprotected message the same way;
    // they differ only in which messages count as protected.
    const parts = cleanParts(keepToolOutputs ? msg.parts : stripToolOutputs(msg.parts));
    // Reuse the original message object when cleaning handed back the same parts array.
    return parts === msg.parts ? msg : { info: msg.info, parts };
  });

  return {
    messages: [...prefix, ...processed],
    distilledTokens: prefixTokens,
    rawTokens,
    totalTokens: prefixTokens + rawTokens,
  };

  // Number of protected turns requested by the caller; defaults to 0.
  function protectedTurns(args: { protectedTurns?: number }): number {
    return args.protectedTurns ?? 0;
  }
}
|