@loreai/core 0.20.0 → 0.20.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bun/config.d.ts +1 -1
- package/dist/bun/config.d.ts.map +1 -1
- package/dist/bun/distillation.d.ts +4 -0
- package/dist/bun/distillation.d.ts.map +1 -1
- package/dist/bun/gradient.d.ts +58 -16
- package/dist/bun/gradient.d.ts.map +1 -1
- package/dist/bun/index.d.ts +1 -1
- package/dist/bun/index.d.ts.map +1 -1
- package/dist/bun/index.js +101 -102
- package/dist/bun/index.js.map +3 -3
- package/dist/node/config.d.ts +1 -1
- package/dist/node/config.d.ts.map +1 -1
- package/dist/node/distillation.d.ts +4 -0
- package/dist/node/distillation.d.ts.map +1 -1
- package/dist/node/gradient.d.ts +58 -16
- package/dist/node/gradient.d.ts.map +1 -1
- package/dist/node/index.d.ts +1 -1
- package/dist/node/index.d.ts.map +1 -1
- package/dist/node/index.js +101 -102
- package/dist/node/index.js.map +3 -3
- package/dist/types/config.d.ts +1 -1
- package/dist/types/config.d.ts.map +1 -1
- package/dist/types/distillation.d.ts +4 -0
- package/dist/types/distillation.d.ts.map +1 -1
- package/dist/types/gradient.d.ts +58 -16
- package/dist/types/gradient.d.ts.map +1 -1
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/config.ts +4 -12
- package/src/distillation.ts +12 -1
- package/src/gradient.ts +205 -190
- package/src/index.ts +8 -4
package/src/gradient.ts
CHANGED
|
@@ -37,167 +37,129 @@ function estimateMessage(msg: MessageWithParts): number {
|
|
|
37
37
|
let contextLimit = 200_000; // sensible default
|
|
38
38
|
let outputReserved = 32_000;
|
|
39
39
|
|
|
40
|
-
// Cost-aware layer-0 token cap. When > 0, the layer-0 passthrough gate uses
|
|
41
|
-
// min(maxInput, maxLayer0Tokens) instead of maxInput alone. Derived from the
|
|
42
|
-
// model's cache-read cost: cap = targetCostPerTurn / costPerToken. This prevents
|
|
43
|
-
// expensive models from sending huge contexts at layer 0, where cache-read costs
|
|
44
|
-
// compound linearly across turns. Set to 0 to disable (use full context).
|
|
45
|
-
let maxLayer0Tokens = 0;
|
|
46
|
-
|
|
47
|
-
const MIN_LAYER0_FLOOR = 40_000;
|
|
48
|
-
|
|
49
40
|
// ---------------------------------------------------------------------------
|
|
50
|
-
//
|
|
41
|
+
// Tier-based context management
|
|
51
42
|
//
|
|
52
|
-
//
|
|
53
|
-
//
|
|
54
|
-
//
|
|
43
|
+
// Three quality tiers based on empirical model effectiveness:
|
|
44
|
+
// Tier 1: 0 – 200K tokens (best quality, preferred operating range)
|
|
45
|
+
// Tier 2: 200K – 500K tokens (acceptable quality)
|
|
46
|
+
// Tier 3: 500K – model context limit (degraded, compress when economical)
|
|
55
47
|
//
|
|
56
|
-
//
|
|
57
|
-
//
|
|
58
|
-
//
|
|
48
|
+
// At each tier boundary, a per-turn economic comparison decides whether to
|
|
49
|
+
// compress (bust the cache) or continue growing:
|
|
50
|
+
// bustCost = compressedSize × cacheWriteCostPerToken
|
|
51
|
+
// continueCost = currentSize × cacheReadCostPerToken
|
|
52
|
+
// If bustCost ≥ threshold × continueCost, don't compress — reads are cheap.
|
|
53
|
+
//
|
|
54
|
+
// Rolling bust detection: if 5+ consecutive turns bust the cache, stop trying
|
|
55
|
+
// to compress — something structural is causing busts, and compression just
|
|
56
|
+
// adds cost on top.
|
|
59
57
|
// ---------------------------------------------------------------------------
|
|
60
58
|
|
|
61
|
-
/**
|
|
62
|
-
|
|
63
|
-
let maxContextTokensCeiling = 0;
|
|
59
|
+
/** Upper token bounds of Tier 1 and Tier 2 (inclusive). Declared as a readonly const tuple, so not adjustable at runtime. */
|
|
60
|
+
const TIER_BOUNDARIES = [200_000, 500_000] as const;
|
|
64
61
|
|
|
65
|
-
|
|
62
|
+
/** Cache pricing per token (USD). Set by host adapter via setCachePricing(). */
|
|
63
|
+
let cacheWriteCostPerToken = 0;
|
|
64
|
+
let cacheReadCostPerToken = 0;
|
|
66
65
|
|
|
67
|
-
/**
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
66
|
+
/**
 * Set cache pricing (USD per token) for the current model. Called by the
 * host adapter after looking up model cost data. Required for tier-based
 * bust-vs-continue decisions.
 *
 * Negative or non-finite values (NaN, ±Infinity) are stored as 0, which
 * means "not set". While either price is 0, shouldCompress() returns false,
 * so the tier gate keeps the cache instead of compressing.
 *
 * @param writeCost - cache-write cost per token
 * @param readCost - cache-read cost per token
 */
export function setCachePricing(writeCost: number, readCost: number) {
  // Math.max(0, NaN) evaluates to NaN, so a plain clamp would let an
  // unusable price through; check finiteness explicitly instead.
  const sanitize = (cost: number): number =>
    Number.isFinite(cost) && cost > 0 ? cost : 0;

  cacheWriteCostPerToken = sanitize(writeCost);
  cacheReadCostPerToken = sanitize(readCost);
}
|
|
75
76
|
|
|
76
|
-
/**
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
export function setMaxContextTokens(tokens: number) {
|
|
80
|
-
maxContextTokensCeiling = Math.max(0, Math.floor(tokens));
|
|
77
|
+
/**
 * Snapshot of the per-token cache pricing currently held by this module.
 * Intended for tests.
 *
 * @returns the stored write and read cost per token
 */
export function getCachePricing(): { write: number; read: number } {
  const write = cacheWriteCostPerToken;
  const read = cacheReadCostPerToken;
  return { write, read };
}
|
|
82
81
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
82
|
+
// Cost-aware layer-0 token cap. When > 0, the layer-0 passthrough gate uses
|
|
83
|
+
// min(maxInput, maxLayer0Tokens) instead of maxInput alone. Derived from the
|
|
84
|
+
// model's cache-read cost: cap = targetCostPerTurn / costPerToken. This prevents
|
|
85
|
+
// expensive models from sending huge contexts at layer 0, where cache-read costs
|
|
86
|
+
// compound linearly across turns. Set to 0 to disable (use full context).
|
|
87
|
+
let maxLayer0Tokens = 0;
|
|
88
|
+
|
|
89
|
+
const MIN_LAYER0_FLOOR = 40_000;
|
|
90
|
+
|
|
91
|
+
/**
 * Economic gate for compressing at a tier boundary.
 *
 * Compression rewrites the prompt prefix and therefore busts the cache, so
 * it is only chosen when paying the cache-write price for the smaller
 * context is clearly cheaper than paying the cache-read price for the
 * current one.
 *
 * @param currentTokens - expected input tokens if we stay at the current layer
 * @param compressedTokens - expected tokens after compression
 * @param consecutiveBusts - how many turns in a row the cache was busted
 * @param threshold - compress only when bust cost < threshold × continue cost (default 0.85)
 * @returns true if compression is worth it
 */
export function shouldCompress(
  currentTokens: number,
  compressedTokens: number,
  consecutiveBusts: number,
  threshold = 0.85,
): boolean {
  // Five or more busts in a row: compressing is not stabilising the cache,
  // so stop adding its cost on top.
  const stuckBusting = consecutiveBusts >= 5;
  if (stuckBusting) return false;

  // Without both prices the comparison below is meaningless; keep the cache.
  const pricingMissing =
    cacheWriteCostPerToken <= 0 || cacheReadCostPerToken <= 0;
  if (pricingMissing) return false;

  const costToBust = compressedTokens * cacheWriteCostPerToken;
  const costToContinue = currentTokens * cacheReadCostPerToken;
  const budget = threshold * costToContinue;

  // Strictly cheaper than the discounted continue cost, or don't bother.
  return costToBust < budget;
}
|
|
87
121
|
|
|
88
122
|
/**
|
|
89
|
-
*
|
|
90
|
-
*
|
|
91
|
-
|
|
123
|
+
* Determine which tier the given token count falls into.
|
|
124
|
+
* Returns the zero-based tier index: 0, 1, or 2 for Tier 1, Tier 2, or Tier 3.
|
|
125
|
+
*/
|
|
126
|
+
export function getTier(tokens: number): number {
  // Boundaries are inclusive upper bounds: the first boundary the count
  // fits under names the tier.
  const fitting = TIER_BOUNDARIES.findIndex((limit) => tokens <= limit);
  // Past every boundary means the last tier.
  return fitting === -1 ? TIER_BOUNDARIES.length : fitting;
}
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Record cache usage from an API response. Tracks consecutive busts for
|
|
134
|
+
* the rolling bust detection used by shouldCompress().
|
|
135
|
+
*
|
|
136
|
+
* A "bust" is when cache_write > 50% of total input tokens.
|
|
92
137
|
*
|
|
93
138
|
* @param cacheWrite - cache_creation_input_tokens from the API response
|
|
94
139
|
* @param cacheRead - cache_read_input_tokens from the API response
|
|
140
|
+
* @param inputTokens - total input_tokens from the API response (includes uncached); when not positive, cacheWrite + cacheRead is used as the denominator instead
|
|
95
141
|
* @param sessionID - session that produced this response
|
|
96
142
|
*/
|
|
97
|
-
export function
|
|
143
|
+
export function recordCacheUsage(
|
|
98
144
|
cacheWrite: number,
|
|
99
145
|
cacheRead: number,
|
|
146
|
+
inputTokens: number,
|
|
100
147
|
sessionID?: string,
|
|
101
|
-
lastLayer?: number,
|
|
102
148
|
): void {
|
|
103
149
|
if (!sessionID) return;
|
|
104
150
|
const state = getSessionState(sessionID);
|
|
105
151
|
|
|
106
|
-
//
|
|
107
|
-
//
|
|
108
|
-
//
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
// Recovery hatch: after 5+ consecutive Layer 4 turns, the shrunken cap
|
|
116
|
-
// may be what's trapping us. Relax it by 10% per turn to give layers
|
|
117
|
-
// 1-3 a chance to fit. From 130K floor: turns 5-9 → 143K→157K→173K→190K→209K.
|
|
118
|
-
if (
|
|
119
|
-
state.consecutiveLayer4 >= 5 &&
|
|
120
|
-
state.dynamicContextCap > 0 &&
|
|
121
|
-
maxContextTokensCeiling > 0
|
|
122
|
-
) {
|
|
123
|
-
state.dynamicContextCap = Math.min(
|
|
124
|
-
maxContextTokensCeiling,
|
|
125
|
-
Math.floor(state.dynamicContextCap * 1.10),
|
|
126
|
-
);
|
|
127
|
-
}
|
|
128
|
-
return;
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
// Non-Layer-4 turn: reset the consecutive counter (also before total===0
|
|
132
|
-
// guard — a zero-usage non-L4 turn must not leave a stale count).
|
|
133
|
-
if (lastLayer !== undefined) {
|
|
134
|
-
state.consecutiveLayer4 = 0;
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
const total = cacheWrite + cacheRead;
|
|
138
|
-
if (total === 0) return;
|
|
139
|
-
|
|
140
|
-
// Bust ratio: fraction of total input that was cache-written (0 = all reads, 1 = all writes)
|
|
141
|
-
const bustRatio = cacheWrite / total;
|
|
142
|
-
|
|
143
|
-
// EMA update (α = 0.3 for smoothing — responsive but not twitchy)
|
|
144
|
-
state.bustRateEMA =
|
|
145
|
-
state.bustRateEMA < 0
|
|
146
|
-
? bustRatio // first observation
|
|
147
|
-
: state.bustRateEMA * 0.7 + bustRatio * 0.3;
|
|
148
|
-
|
|
149
|
-
// Inter-bust interval tracking: a "bust" is when >50% of input is writes
|
|
150
|
-
const now = Date.now();
|
|
151
|
-
if (bustRatio > 0.5) {
|
|
152
|
-
if (state.lastBustAt > 0) {
|
|
153
|
-
const interval = now - state.lastBustAt;
|
|
154
|
-
state.interBustIntervalEMA =
|
|
155
|
-
state.interBustIntervalEMA < 0
|
|
156
|
-
? interval
|
|
157
|
-
: state.interBustIntervalEMA * 0.7 + interval * 0.3;
|
|
158
|
-
}
|
|
159
|
-
state.lastBustAt = now;
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
// Adapt per-session cap based on bust rate and interval
|
|
163
|
-
adaptContextCap(state);
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
/** Adapt the per-session context cap based on bust rate and break frequency. */
|
|
167
|
-
function adaptContextCap(state: SessionState): void {
|
|
168
|
-
if (maxContextTokensCeiling <= 0) return; // disabled
|
|
169
|
-
|
|
170
|
-
const cap = state.dynamicContextCap > 0
|
|
171
|
-
? state.dynamicContextCap
|
|
172
|
-
: maxContextTokensCeiling;
|
|
173
|
-
|
|
174
|
-
let newCap = cap;
|
|
175
|
-
|
|
176
|
-
// Primary signal: bust rate EMA
|
|
177
|
-
if (state.bustRateEMA > 0.8) {
|
|
178
|
-
// Mostly writes — tighten by 10%
|
|
179
|
-
newCap = Math.floor(cap * 0.90);
|
|
180
|
-
} else if (state.bustRateEMA < 0.3) {
|
|
181
|
-
// Mostly reads — relax by 5% (slower than tightening)
|
|
182
|
-
newCap = Math.floor(cap * 1.05);
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
// Secondary signal: inter-bust interval
|
|
186
|
-
if (state.interBustIntervalEMA > 0) {
|
|
187
|
-
if (state.interBustIntervalEMA < 2 * 60_000) {
|
|
188
|
-
// Busts less than 2 min apart — proactively tighten by extra 5%
|
|
189
|
-
newCap = Math.floor(newCap * 0.95);
|
|
190
|
-
} else if (state.interBustIntervalEMA > 10 * 60_000) {
|
|
191
|
-
// Busts more than 10 min apart — allow extra relaxation
|
|
192
|
-
newCap = Math.floor(newCap * 1.03);
|
|
152
|
+
// Use total input tokens as denominator (includes uncached input),
|
|
153
|
+
// not just cacheWrite + cacheRead, to avoid inflated bust ratios
|
|
154
|
+
// when a large fraction of tokens is uncached.
|
|
155
|
+
const total = inputTokens > 0 ? inputTokens : cacheWrite + cacheRead;
|
|
156
|
+
if (total > 0) {
|
|
157
|
+
if (cacheWrite / total > 0.5) {
|
|
158
|
+
state.consecutiveBusts++;
|
|
159
|
+
} else {
|
|
160
|
+
state.consecutiveBusts = 0;
|
|
193
161
|
}
|
|
194
162
|
}
|
|
195
|
-
|
|
196
|
-
// Clamp to [floor, ceiling]
|
|
197
|
-
state.dynamicContextCap = Math.max(
|
|
198
|
-
MIN_CONTEXT_FLOOR,
|
|
199
|
-
Math.min(maxContextTokensCeiling, newCap),
|
|
200
|
-
);
|
|
201
163
|
}
|
|
202
164
|
|
|
203
165
|
// Conservative overhead reserve for first-turn (before calibration):
|
|
@@ -286,22 +248,10 @@ type SessionState = {
|
|
|
286
248
|
postIdleCompact: boolean;
|
|
287
249
|
/** Consecutive turns at layer >= 2. When >= 3, log a compaction hint. */
|
|
288
250
|
consecutiveHighLayer: number;
|
|
289
|
-
/** Consecutive
|
|
290
|
-
*
|
|
291
|
-
*
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
// --- Cost-aware context cap dynamic state ---
|
|
295
|
-
|
|
296
|
-
/** EMA of bust ratio (cacheWrite / total). -1 = uninitialized. */
|
|
297
|
-
bustRateEMA: number;
|
|
298
|
-
/** EMA of time between full busts (ms). -1 = uninitialized. */
|
|
299
|
-
interBustIntervalEMA: number;
|
|
300
|
-
/** Epoch ms of the last full bust (cacheWrite > 50% of total). 0 = never. */
|
|
301
|
-
lastBustAt: number;
|
|
302
|
-
/** Per-session dynamic context cap (tokens). Adjusted by adaptContextCap().
|
|
303
|
-
* 0 = use the static ceiling (maxContextTokensCeiling). */
|
|
304
|
-
dynamicContextCap: number;
|
|
251
|
+
/** Consecutive turns where the cache was busted (>50% writes).
|
|
252
|
+
* Used for rolling bust detection: after 5+ consecutive busts, stop
|
|
253
|
+
* trying to compress and warn that the conversation is unsustainable. */
|
|
254
|
+
consecutiveBusts: number;
|
|
305
255
|
|
|
306
256
|
/**
|
|
307
257
|
* Distillation row snapshot — cached to avoid hitting the DB on every
|
|
@@ -335,12 +285,7 @@ function makeSessionState(): SessionState {
|
|
|
335
285
|
cameOutOfIdle: false,
|
|
336
286
|
postIdleCompact: false,
|
|
337
287
|
consecutiveHighLayer: 0,
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
bustRateEMA: -1,
|
|
341
|
-
interBustIntervalEMA: -1,
|
|
342
|
-
lastBustAt: 0,
|
|
343
|
-
dynamicContextCap: 0,
|
|
288
|
+
consecutiveBusts: 0,
|
|
344
289
|
|
|
345
290
|
distillationSnapshot: null,
|
|
346
291
|
};
|
|
@@ -359,9 +304,8 @@ function getSessionState(sessionID: string): SessionState {
|
|
|
359
304
|
state.forceMinLayer = loadForceMinLayer(sessionID) as SafetyLayer;
|
|
360
305
|
|
|
361
306
|
// Restore gradient calibration state from DB (v24) — avoids uncalibrated
|
|
362
|
-
// first turns after restart. Without this,
|
|
363
|
-
//
|
|
364
|
-
// prevents onIdleResume() from detecting idle gaps.
|
|
307
|
+
// first turns after restart. Without this, lastTurnAt=0 prevents
|
|
308
|
+
// onIdleResume() from detecting idle gaps.
|
|
365
309
|
//
|
|
366
310
|
// Atomic restore: lastTurnAt > 0 is the proxy for "gradient state was
|
|
367
311
|
// ever flushed to DB". Restore all fields together or none — avoids
|
|
@@ -369,13 +313,12 @@ function getSessionState(sessionID: string): SessionState {
|
|
|
369
313
|
// could be mistaken for "never persisted".
|
|
370
314
|
const persisted = loadSessionTracking(sessionID);
|
|
371
315
|
if (persisted && persisted.lastTurnAt > 0) {
|
|
372
|
-
state.dynamicContextCap = persisted.dynamicContextCap;
|
|
373
|
-
state.bustRateEMA = persisted.bustRateEMA;
|
|
374
|
-
state.interBustIntervalEMA = persisted.interBustIntervalEMA;
|
|
375
316
|
state.lastLayer = persisted.lastLayer as SafetyLayer;
|
|
376
317
|
state.lastKnownInput = persisted.lastKnownInput;
|
|
377
318
|
state.lastTurnAt = persisted.lastTurnAt;
|
|
378
|
-
|
|
319
|
+
// consecutiveBusts is persisted in the dynamicContextCap column
|
|
320
|
+
// (repurposed, see saveGradientState).
|
|
321
|
+
state.consecutiveBusts = persisted.dynamicContextCap;
|
|
379
322
|
}
|
|
380
323
|
|
|
381
324
|
sessionStates.set(sessionID, state);
|
|
@@ -619,6 +562,8 @@ export function setForceMinLayer(layer: SafetyLayer, sessionID?: string) {
|
|
|
619
562
|
// For testing only — reset all calibration and force-escalation state
|
|
620
563
|
export function resetCalibration(sessionID?: string) {
|
|
621
564
|
calibratedOverhead = null;
|
|
565
|
+
cacheWriteCostPerToken = 0;
|
|
566
|
+
cacheReadCostPerToken = 0;
|
|
622
567
|
if (sessionID) {
|
|
623
568
|
saveForceMinLayer(sessionID, 0); // clear persisted state
|
|
624
569
|
sessionStates.delete(sessionID);
|
|
@@ -643,9 +588,7 @@ export function inspectSessionState(sessionID: string): {
|
|
|
643
588
|
postIdleCompact: boolean;
|
|
644
589
|
lastTurnAt: number;
|
|
645
590
|
distillationSnapshot: DistillationSnapshot | null;
|
|
646
|
-
|
|
647
|
-
dynamicContextCap: number;
|
|
648
|
-
consecutiveLayer4: number;
|
|
591
|
+
consecutiveBusts: number;
|
|
649
592
|
} | null {
|
|
650
593
|
const state = sessionStates.get(sessionID);
|
|
651
594
|
if (!state) return null;
|
|
@@ -656,12 +599,38 @@ export function inspectSessionState(sessionID: string): {
|
|
|
656
599
|
postIdleCompact: state.postIdleCompact,
|
|
657
600
|
lastTurnAt: state.lastTurnAt,
|
|
658
601
|
distillationSnapshot: state.distillationSnapshot,
|
|
659
|
-
|
|
660
|
-
dynamicContextCap: state.dynamicContextCap,
|
|
661
|
-
consecutiveLayer4: state.consecutiveLayer4,
|
|
602
|
+
consecutiveBusts: state.consecutiveBusts,
|
|
662
603
|
};
|
|
663
604
|
}
|
|
664
605
|
|
|
606
|
+
/**
 * Read-only accessor for a session's consecutive-bust counter.
 *
 * Looks the session up directly in the in-memory map rather than through
 * getSessionState(), so a lookup never creates a fresh SessionState as a
 * side effect. A session with no in-memory state reports 0, which callers
 * treat as "no bust pressure".
 *
 * @param sessionID - session to inspect
 * @returns the counter, or 0 when the session (or the counter) is absent
 */
export function getConsecutiveBusts(sessionID: string): number {
  const state = sessionStates.get(sessionID);
  if (state == null) return 0;
  return state.consecutiveBusts ?? 0;
}
|
|
618
|
+
|
|
619
|
+
/** Bust-pressure threshold for meta-distillation: consecutive busts ≥ this
 *  value trigger earlier consolidation of gen-0 segments. */
export const BUST_PRESSURE_THRESHOLD = 3;

/**
 * Compute the effective meta-distillation threshold under bust pressure.
 *
 * Below BUST_PRESSURE_THRESHOLD the configured value is returned unchanged.
 * At or above it, the threshold drops to a quarter of the configured value
 * (rounded down), floored at 3, to consolidate the distilled prefix earlier.
 * The result never exceeds the configured value: a configuration already
 * below the floor is returned as-is rather than being raised to 3.
 *
 * @param busts - current consecutive-bust count for the session
 * @param configThreshold - configured meta-distillation threshold
 * @returns the threshold to apply this turn
 */
export function effectiveMetaThreshold(busts: number, configThreshold: number): number {
  if (busts < BUST_PRESSURE_THRESHOLD) return configThreshold;

  const lowered = Math.max(3, Math.floor(configThreshold / 4));
  // Bust pressure may only lower the threshold, never raise it.
  return Math.min(configThreshold, lowered);
}
|
|
633
|
+
|
|
665
634
|
/**
|
|
666
635
|
* For testing only — set the session's lastTurnAt field. Used to simulate
|
|
667
636
|
* idle gaps without sleeping. Creates the session state if not present so
|
|
@@ -683,13 +652,12 @@ export function saveGradientState(sessionID: string): void {
|
|
|
683
652
|
if (!state) return;
|
|
684
653
|
|
|
685
654
|
saveSessionTracking(sessionID, {
|
|
686
|
-
dynamicContextCap: state.dynamicContextCap,
|
|
687
|
-
bustRateEMA: state.bustRateEMA,
|
|
688
|
-
interBustIntervalEMA: state.interBustIntervalEMA,
|
|
689
655
|
lastLayer: state.lastLayer,
|
|
690
656
|
lastKnownInput: state.lastKnownInput,
|
|
691
657
|
lastTurnAt: state.lastTurnAt,
|
|
692
|
-
|
|
658
|
+
// Repurpose the dead dynamicContextCap column (v24, always 0 now)
|
|
659
|
+
// to persist consecutiveBusts — avoids a new DB migration.
|
|
660
|
+
dynamicContextCap: state.consecutiveBusts,
|
|
693
661
|
});
|
|
694
662
|
}
|
|
695
663
|
|
|
@@ -1591,6 +1559,10 @@ export type TransformResult = {
|
|
|
1591
1559
|
// relevance scoring. Set on Layer 4 (emergency) where the context is
|
|
1592
1560
|
// fully reset and mid-session knowledge may have changed relevance.
|
|
1593
1561
|
refreshLtm: boolean;
|
|
1562
|
+
/** When set, the conversation is growing unsustainably — 5+ consecutive
|
|
1563
|
+
* cache busts detected. The pipeline should inject a warning message
|
|
1564
|
+
* advising the user to compact or start a new conversation. */
|
|
1565
|
+
unsustainable?: boolean;
|
|
1594
1566
|
};
|
|
1595
1567
|
|
|
1596
1568
|
// Per-session urgent distillation tracking.
|
|
@@ -1624,17 +1596,11 @@ function transformInner(input: {
|
|
|
1624
1596
|
contextLimit - outputReserved - overhead - sessLtmTokens,
|
|
1625
1597
|
);
|
|
1626
1598
|
|
|
1627
|
-
//
|
|
1628
|
-
//
|
|
1629
|
-
//
|
|
1630
|
-
//
|
|
1631
|
-
|
|
1632
|
-
const effectiveCap = sid && sessState.dynamicContextCap > 0
|
|
1633
|
-
? sessState.dynamicContextCap
|
|
1634
|
-
: maxContextTokensCeiling;
|
|
1635
|
-
const usable = effectiveCap > 0 && usableRaw > effectiveCap
|
|
1636
|
-
? effectiveCap
|
|
1637
|
-
: usableRaw;
|
|
1599
|
+
// No EMA-driven adaptive cap — use the full available context budget.
|
|
1600
|
+
// The layer-0 cap (maxLayer0Tokens) still applies for per-turn read cost,
|
|
1601
|
+
// and tier-based bust-vs-continue decisions control whether to compress
|
|
1602
|
+
// at quality boundaries.
|
|
1603
|
+
const usable = usableRaw;
|
|
1638
1604
|
|
|
1639
1605
|
const distilledBudget = Math.floor(usable * cfg.budget.distilled);
|
|
1640
1606
|
// Base raw budget. May be overridden below for post-idle compact mode.
|
|
@@ -1705,11 +1671,8 @@ function transformInner(input: {
|
|
|
1705
1671
|
sessState.postIdleCompact = false;
|
|
1706
1672
|
// Skip layer 0 — don't pass through all raw messages on a cold cache.
|
|
1707
1673
|
effectiveMinLayer = Math.max(effectiveMinLayer, 1) as SafetyLayer;
|
|
1708
|
-
// Use a tighter raw budget
|
|
1709
|
-
|
|
1710
|
-
// the cap, use a tighter 20% to limit cold-write cost directly.
|
|
1711
|
-
const postIdleRawFraction = effectiveCap > 0 ? 0.30 : 0.20;
|
|
1712
|
-
rawBudget = Math.floor(usable * postIdleRawFraction);
|
|
1674
|
+
// Use a tighter raw budget on cold cache to limit write cost.
|
|
1675
|
+
rawBudget = Math.floor(usable * 0.20);
|
|
1713
1676
|
log.info(
|
|
1714
1677
|
`post-idle compact: session=${sid} rawBudget=${rawBudget}` +
|
|
1715
1678
|
` (${Math.floor(usable * cfg.budget.raw)}→${rawBudget})`,
|
|
@@ -1768,9 +1731,50 @@ function transformInner(input: {
|
|
|
1768
1731
|
distilledBudget,
|
|
1769
1732
|
rawBudget,
|
|
1770
1733
|
refreshLtm: false,
|
|
1734
|
+
unsustainable: sid ? getSessionState(sid).consecutiveBusts >= 5 : false,
|
|
1771
1735
|
};
|
|
1772
1736
|
}
|
|
1773
1737
|
|
|
1738
|
+
// --- Tier-based bust-vs-continue gate ---
|
|
1739
|
+
// When expectedInput exceeds the layer-0 cap but still fits in the model's
|
|
1740
|
+
// context window, check whether compression is economically justified.
|
|
1741
|
+
// If not (bust cost ≥ 85% of continue cost), skip compression and pass
|
|
1742
|
+
// through at layer 0 — the cache reads are cheap enough to justify the
|
|
1743
|
+
// larger context, and raw messages are better quality than distilled.
|
|
1744
|
+
if (
|
|
1745
|
+
effectiveMinLayer === 0 &&
|
|
1746
|
+
layer0Input > layer0Ceiling &&
|
|
1747
|
+
layer0Input <= maxInput &&
|
|
1748
|
+
sid
|
|
1749
|
+
) {
|
|
1750
|
+
const busts = getSessionState(sid).consecutiveBusts;
|
|
1751
|
+
// For compression, estimate the compressed size as the layer-1 budget
|
|
1752
|
+
// (distilled + raw fractions). This is a rough upper bound — actual
|
|
1753
|
+
// compressed output may be smaller.
|
|
1754
|
+
const compressedEstimate = distilledBudget + rawBudget;
|
|
1755
|
+
if (!shouldCompress(Math.round(layer0Input), compressedEstimate, busts)) {
|
|
1756
|
+
const messageTokens = calibrated
|
|
1757
|
+
? expectedInput - (sessLtmTokens - sessState.lastKnownLtm)
|
|
1758
|
+
: expectedInput - overhead - sessLtmTokens;
|
|
1759
|
+
log.info(
|
|
1760
|
+
`tier gate: session=${sid} skipping compression — bustCost not justified` +
|
|
1761
|
+
` (input=${Math.round(layer0Input)} compressed=${compressedEstimate} busts=${busts})`,
|
|
1762
|
+
);
|
|
1763
|
+
return {
|
|
1764
|
+
messages: input.messages,
|
|
1765
|
+
layer: 0,
|
|
1766
|
+
distilledTokens: 0,
|
|
1767
|
+
rawTokens: Math.max(0, messageTokens),
|
|
1768
|
+
totalTokens: Math.max(0, messageTokens),
|
|
1769
|
+
usable,
|
|
1770
|
+
distilledBudget,
|
|
1771
|
+
rawBudget,
|
|
1772
|
+
refreshLtm: false,
|
|
1773
|
+
unsustainable: busts >= 5,
|
|
1774
|
+
};
|
|
1775
|
+
}
|
|
1776
|
+
}
|
|
1777
|
+
|
|
1774
1778
|
// --- Gradient mode: context exhausted (or force-escalated), compress older messages ---
|
|
1775
1779
|
|
|
1776
1780
|
// Pre-pass: deduplicate repeated tool outputs before layer selection.
|
|
@@ -1855,7 +1859,15 @@ function transformInner(input: {
|
|
|
1855
1859
|
if (sid && (s > 0 || cached.tokens === 0)) {
|
|
1856
1860
|
urgentDistillationMap.set(sid, true);
|
|
1857
1861
|
}
|
|
1858
|
-
return {
|
|
1862
|
+
return {
|
|
1863
|
+
...result!,
|
|
1864
|
+
layer: stageLayer,
|
|
1865
|
+
usable,
|
|
1866
|
+
distilledBudget,
|
|
1867
|
+
rawBudget,
|
|
1868
|
+
refreshLtm: false,
|
|
1869
|
+
unsustainable: sid ? getSessionState(sid).consecutiveBusts >= 5 : false,
|
|
1870
|
+
};
|
|
1859
1871
|
}
|
|
1860
1872
|
}
|
|
1861
1873
|
|
|
@@ -1916,6 +1928,8 @@ function transformInner(input: {
|
|
|
1916
1928
|
const nuclearRaw = [...olderMessages, ...currentTurn];
|
|
1917
1929
|
const nuclearRawTokens = olderTokens + currentTurnTokens;
|
|
1918
1930
|
|
|
1931
|
+
const unsustainable = sid ? getSessionState(sid).consecutiveBusts >= 5 : false;
|
|
1932
|
+
|
|
1919
1933
|
return {
|
|
1920
1934
|
messages: [...nuclearPrefix, ...nuclearRaw],
|
|
1921
1935
|
layer: 4,
|
|
@@ -1926,6 +1940,7 @@ function transformInner(input: {
|
|
|
1926
1940
|
distilledBudget,
|
|
1927
1941
|
rawBudget,
|
|
1928
1942
|
refreshLtm: true,
|
|
1943
|
+
unsustainable,
|
|
1929
1944
|
};
|
|
1930
1945
|
}
|
|
1931
1946
|
|
|
@@ -1977,7 +1992,7 @@ export function transform(input: {
|
|
|
1977
1992
|
log.info(
|
|
1978
1993
|
`gradient: session=${sid} layer=${result.layer} tokens=${result.totalTokens}` +
|
|
1979
1994
|
` (distilled=${result.distilledTokens} raw=${result.rawTokens})` +
|
|
1980
|
-
` usable=${result.usable}
|
|
1995
|
+
` usable=${result.usable} tier=${getTier(result.totalTokens)} l0cap=${maxLayer0Tokens || "off"}`,
|
|
1981
1996
|
);
|
|
1982
1997
|
}
|
|
1983
1998
|
return result;
|
package/src/index.ts
CHANGED
|
@@ -94,10 +94,11 @@ export {
|
|
|
94
94
|
setModelLimits,
|
|
95
95
|
setMaxLayer0Tokens,
|
|
96
96
|
computeLayer0Cap,
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
97
|
+
setCachePricing,
|
|
98
|
+
getCachePricing,
|
|
99
|
+
shouldCompress,
|
|
100
|
+
getTier,
|
|
101
|
+
recordCacheUsage,
|
|
101
102
|
needsUrgentDistillation,
|
|
102
103
|
calibrate,
|
|
103
104
|
setLtmTokens,
|
|
@@ -116,6 +117,9 @@ export {
|
|
|
116
117
|
// gaps without sleeping. Not part of the public API.
|
|
117
118
|
setLastTurnAtForTest,
|
|
118
119
|
inspectSessionState,
|
|
120
|
+
getConsecutiveBusts,
|
|
121
|
+
BUST_PRESSURE_THRESHOLD,
|
|
122
|
+
effectiveMetaThreshold,
|
|
119
123
|
} from "./gradient";
|
|
120
124
|
export {
|
|
121
125
|
formatKnowledge,
|