@loreai/core 0.19.0 → 0.20.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bun/agents-file.d.ts.map +1 -1
- package/dist/bun/config.d.ts +1 -1
- package/dist/bun/config.d.ts.map +1 -1
- package/dist/bun/db.d.ts +13 -1
- package/dist/bun/db.d.ts.map +1 -1
- package/dist/bun/embedding.d.ts.map +1 -1
- package/dist/bun/git.d.ts.map +1 -1
- package/dist/bun/gradient.d.ts +39 -13
- package/dist/bun/gradient.d.ts.map +1 -1
- package/dist/bun/hosted.d.ts +36 -0
- package/dist/bun/hosted.d.ts.map +1 -0
- package/dist/bun/index.d.ts +3 -2
- package/dist/bun/index.d.ts.map +1 -1
- package/dist/bun/index.js +295 -235
- package/dist/bun/index.js.map +4 -4
- package/dist/bun/lat-reader.d.ts.map +1 -1
- package/dist/node/agents-file.d.ts.map +1 -1
- package/dist/node/config.d.ts +1 -1
- package/dist/node/config.d.ts.map +1 -1
- package/dist/node/db.d.ts +13 -1
- package/dist/node/db.d.ts.map +1 -1
- package/dist/node/embedding.d.ts.map +1 -1
- package/dist/node/git.d.ts.map +1 -1
- package/dist/node/gradient.d.ts +39 -13
- package/dist/node/gradient.d.ts.map +1 -1
- package/dist/node/hosted.d.ts +36 -0
- package/dist/node/hosted.d.ts.map +1 -0
- package/dist/node/index.d.ts +3 -2
- package/dist/node/index.d.ts.map +1 -1
- package/dist/node/index.js +295 -235
- package/dist/node/index.js.map +4 -4
- package/dist/node/lat-reader.d.ts.map +1 -1
- package/dist/types/agents-file.d.ts.map +1 -1
- package/dist/types/config.d.ts +1 -1
- package/dist/types/config.d.ts.map +1 -1
- package/dist/types/db.d.ts +13 -1
- package/dist/types/db.d.ts.map +1 -1
- package/dist/types/embedding.d.ts.map +1 -1
- package/dist/types/git.d.ts.map +1 -1
- package/dist/types/gradient.d.ts +39 -13
- package/dist/types/gradient.d.ts.map +1 -1
- package/dist/types/hosted.d.ts +36 -0
- package/dist/types/hosted.d.ts.map +1 -0
- package/dist/types/index.d.ts +3 -2
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/lat-reader.d.ts.map +1 -1
- package/package.json +2 -1
- package/src/agents-file.ts +12 -0
- package/src/config.ts +14 -17
- package/src/db.ts +39 -6
- package/src/embedding.ts +43 -5
- package/src/git.ts +4 -0
- package/src/gradient.ts +167 -145
- package/src/hosted.ts +46 -0
- package/src/index.ts +9 -4
- package/src/lat-reader.ts +4 -0
package/src/embedding.ts
CHANGED
@@ -28,6 +28,27 @@ import type {
  * embedding calls but bounded enough to avoid minutes-long hangs. */
 const EMBED_TIMEOUT_MS = 10_000;
 
+/**
+ * Safe per-text character limit for local ONNX inference. The Nomic v1.5 model
+ * supports up to 8192 tokens, but ONNX runtime OOMs on inputs near that ceiling
+ * (error codes 284432024, 287180544, 144786472). Pre-truncating to ~4096 tokens
+ * worth of characters keeps the tensor well within safe allocation bounds.
+ * The worker's `truncation: true` remains as a safety net.
+ */
+const LOCAL_MAX_CHARS = 4096 * 4; // ~4096 tokens × ~4 chars/token
+
+/**
+ * Truncate a string to LOCAL_MAX_CHARS without splitting a UTF-16 surrogate pair.
+ * If the cut falls on a high surrogate (0xD800-0xDBFF), backs up one char.
+ */
+function safeLocalTruncate(text: string): string {
+  if (text.length <= LOCAL_MAX_CHARS) return text;
+  let end = LOCAL_MAX_CHARS;
+  const code = text.charCodeAt(end - 1);
+  if (code >= 0xD800 && code <= 0xDBFF) end--; // don't split surrogate pair
+  return text.slice(0, end);
+}
+
 // ---------------------------------------------------------------------------
 // Provider interface
 // ---------------------------------------------------------------------------
@@ -332,9 +353,10 @@ class LocalProvider implements EmbeddingProvider {
         localProviderKnownBroken = true;
         if (!localProviderErrorLogged) {
           localProviderErrorLogged = true;
-          log.
+          log.error(
             `local embedding provider failed to init: ${msg.error}. ` +
               `Set VOYAGE_API_KEY/OPENAI_API_KEY for automatic remote fallback.`,
+            new Error(`embedding worker init failed: ${msg.error}`),
           );
         }
         for (const [, p] of this.pendingRequests) {
@@ -351,6 +373,7 @@ class LocalProvider implements EmbeddingProvider {
     this.worker.on("error", (err: Error) => {
       this.workerInitError = err.message;
       this.workerReady = false;
+      log.error("embedding worker crashed:", err);
       for (const [, p] of this.pendingRequests) {
         p.reject(new LocalProviderUnavailableError(err));
       }
@@ -361,6 +384,10 @@ class LocalProvider implements EmbeddingProvider {
     this.worker.on("exit", (code) => {
       if (code !== 0 && !this.workerInitError) {
         this.workerInitError = `embedding worker exited with code ${code}`;
+        log.error(
+          this.workerInitError,
+          new Error(this.workerInitError),
+        );
       }
       this.workerReady = false;
       for (const [, p] of this.pendingRequests) {
@@ -396,9 +423,13 @@ class LocalProvider implements EmbeddingProvider {
   async embed(texts: string[], inputType: "document" | "query"): Promise<Float32Array[]> {
     await this.ensureWorker();
 
+    // Pre-truncate texts that exceed the safe ONNX inference limit.
+    // This prevents OOM on single inputs near the model's 8192-token max.
+    const truncated = texts.map(safeLocalTruncate);
+
     // Prepend Nomic task instruction prefix.
     const prefix = inputType === "document" ? "search_document: " : "search_query: ";
-    const prefixed =
+    const prefixed = truncated.map((t) => prefix + t);
 
     const id = this.nextRequestId++;
     // Recall queries (single query-type texts) get high priority so they
@@ -842,6 +873,7 @@ export function embedKnowledgeEntry(
   title: string,
   content: string,
 ): void {
+  if (!isAvailable()) return;
   const text = `${title}\n${content}`;
   embed([text], "document")
     .then(([vec]) => {
@@ -850,7 +882,7 @@ export function embedKnowledgeEntry(
         .run(toBlob(vec), id);
     })
     .catch((err) => {
-      log.
+      log.error("embedding failed for knowledge entry", id, ":", err);
     });
 }
 
@@ -863,6 +895,7 @@ export function embedDistillation(
   id: string,
   observations: string,
 ): void {
+  if (!isAvailable()) return;
   embed([observations], "document")
     .then(([vec]) => {
       db()
@@ -870,7 +903,7 @@ export function embedDistillation(
         .run(toBlob(vec), id);
     })
     .catch((err) => {
-      log.
+      log.error("embedding failed for distillation", id, ":", err);
     });
 }
 
@@ -884,6 +917,7 @@ export function embedTemporalMessage(
   id: string,
   content: string,
 ): void {
+  if (!isAvailable()) return;
   // Skip very short messages — they don't carry enough semantic signal
   // to be useful in vector search and would waste embedding capacity.
   if (content.length < 50) return;
@@ -895,7 +929,7 @@ export function embedTemporalMessage(
         .run(toBlob(vec), id);
     })
     .catch((err) => {
-      log.
+      log.error("embedding failed for temporal message", id, ":", err);
     });
 }
 
@@ -1199,6 +1233,8 @@ export async function backfillEmbeddings(): Promise<number> {
     } catch (err) {
       // log.error sends to Sentry via captureException
       log.error(`embedding backfill batch failed (${batch.length} items):`, err);
+      // Provider is dead — no point retrying remaining batches.
+      if (err instanceof LocalProviderUnavailableError) break;
     }
     // No yieldToEventLoop() needed — embed() is truly async (worker thread).
   }
@@ -1259,6 +1295,8 @@ export async function backfillDistillationEmbeddings(): Promise<number> {
     } catch (err) {
       // log.error sends to Sentry via captureException
       log.error(`distillation embedding backfill batch failed (${batch.length} items):`, err);
+      // Provider is dead — no point retrying remaining batches.
+      if (err instanceof LocalProviderUnavailableError) break;
    }
 
     if (embedded >= nextProgressAt) {
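A quick illustration of the new truncation path: the sketch below copies LOCAL_MAX_CHARS and safeLocalTruncate verbatim from the hunk above (nothing beyond what the diff adds) and shows the surrogate-pair backoff when the cut point would land inside an astral-plane character such as an emoji.

```ts
// Mirrors the helper added in this diff; standalone for illustration only.
const LOCAL_MAX_CHARS = 4096 * 4; // 16384

function safeLocalTruncate(text: string): string {
  if (text.length <= LOCAL_MAX_CHARS) return text;
  let end = LOCAL_MAX_CHARS;
  const code = text.charCodeAt(end - 1);
  if (code >= 0xd800 && code <= 0xdbff) end--; // don't split a surrogate pair
  return text.slice(0, end);
}

// Build a string whose cut point lands on the high surrogate of "😀".
const long = "a".repeat(LOCAL_MAX_CHARS - 1) + "😀" + "b".repeat(100);
const out = safeLocalTruncate(long);
console.log(out.length); // 16383: backed off one unit instead of splitting the emoji
```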
package/src/git.ts
CHANGED
@@ -13,6 +13,7 @@
  */
 
 import { execSync } from "child_process";
+import { isHostedMode } from "./hosted";
 
 // ---------------------------------------------------------------------------
 // URL normalization
@@ -95,6 +96,9 @@ export function clearGitRemoteCache(): void {
  * subprocess calls — `git remote -v` only runs once per unique path.
  */
 export function getGitRemote(path: string): string | null {
+  // In hosted mode, never run git subprocesses with client-controlled cwd.
+  if (isHostedMode()) return null;
+
   const cached = gitRemoteCache.get(path);
   if (cached !== undefined) return cached;
 
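The guard added to getGitRemote returns before the cache lookup, so no `git remote -v` subprocess is spawned in hosted mode. A test-style sketch, assuming only the exports visible in this diff (enableHostedMode from src/hosted.ts, getGitRemote from src/git.ts); the path argument is a placeholder:

```ts
import { enableHostedMode } from "./hosted";
import { getGitRemote } from "./git";

// With hosted mode on, getGitRemote short-circuits to null:
// the client-controlled path never reaches execSync("git remote -v", ...).
enableHostedMode();
console.log(getGitRemote("/untrusted/client/path")); // null
```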
package/src/gradient.ts
CHANGED
@@ -37,134 +37,129 @@ function estimateMessage(msg: MessageWithParts): number {
 let contextLimit = 200_000; // sensible default
 let outputReserved = 32_000;
 
-// Cost-aware layer-0 token cap. When > 0, the layer-0 passthrough gate uses
-// min(maxInput, maxLayer0Tokens) instead of maxInput alone. Derived from the
-// model's cache-read cost: cap = targetCostPerTurn / costPerToken. This prevents
-// expensive models from sending huge contexts at layer 0, where cache-read costs
-// compound linearly across turns. Set to 0 to disable (use full context).
-let maxLayer0Tokens = 0;
-
-const MIN_LAYER0_FLOOR = 40_000;
-
 // ---------------------------------------------------------------------------
-//
+// Tier-based context management
+//
+// Three quality tiers based on empirical model effectiveness:
+// Tier 1: 0 – 200K tokens (best quality, preferred operating range)
+// Tier 2: 200K – 500K tokens (acceptable quality)
+// Tier 3: 500K – model context limit (degraded, compress when economical)
 //
-//
-//
-//
+// At each tier boundary, a per-turn economic comparison decides whether to
+// compress (bust the cache) or continue growing:
+// bustCost = compressedSize × cacheWriteCostPerToken
+// continueCost = currentSize × cacheReadCostPerToken
+// If bustCost ≥ threshold × continueCost, don't compress — reads are cheap.
 //
-//
-//
-//
+// Rolling bust detection: if 5+ consecutive turns bust the cache, stop trying
+// to compress — something structural is causing busts, and compression just
+// adds cost on top.
 // ---------------------------------------------------------------------------
 
-/**
-
-let maxContextTokensCeiling = 0;
+/** Tier boundary tokens. Configurable for testing. */
+const TIER_BOUNDARIES = [200_000, 500_000] as const;
 
-
+/** Cache pricing per token (USD). Set by host adapter via setCachePricing(). */
+let cacheWriteCostPerToken = 0;
+let cacheReadCostPerToken = 0;
 
-/**
-
-
-
-
-
-
+/**
+ * Set cache pricing for the current model. Called by the host adapter after
+ * looking up model cost data. Required for tier-based bust-vs-continue
+ * decisions. When not set (both 0), tier decisions fall back to conservative
+ * defaults: always compress at tier boundaries.
+ */
+export function setCachePricing(writeCost: number, readCost: number) {
+  cacheWriteCostPerToken = Math.max(0, writeCost);
+  cacheReadCostPerToken = Math.max(0, readCost);
 }
 
-/**
-
-
-
-
+/** Returns current pricing (for tests). */
+export function getCachePricing(): { write: number; read: number } {
+  return { write: cacheWriteCostPerToken, read: cacheReadCostPerToken };
+}
+
+// Cost-aware layer-0 token cap. When > 0, the layer-0 passthrough gate uses
+// min(maxInput, maxLayer0Tokens) instead of maxInput alone. Derived from the
+// model's cache-read cost: cap = targetCostPerTurn / costPerToken. This prevents
+// expensive models from sending huge contexts at layer 0, where cache-read costs
+// compound linearly across turns. Set to 0 to disable (use full context).
+let maxLayer0Tokens = 0;
+
+const MIN_LAYER0_FLOOR = 40_000;
+
+/**
+ * Decide whether compression is economical at a tier boundary.
+ *
+ * @param currentTokens - expected input tokens if we stay at the current layer
+ * @param compressedTokens - expected tokens after compression
+ * @param consecutiveBusts - how many turns in a row we've busted the cache
+ * @param threshold - bust cost must be < threshold × continue cost to compress (default 0.85)
+ * @returns true if compression is worth it
+ */
+export function shouldCompress(
+  currentTokens: number,
+  compressedTokens: number,
+  consecutiveBusts: number,
+  threshold = 0.85,
+): boolean {
+  // Rolling bust detection: if we've been busting 5+ turns in a row,
+  // stop trying to compress — it's clearly not helping.
+  if (consecutiveBusts >= 5) return false;
+
+  // If no pricing data, fall back to conservative: do NOT compress.
+  // Compression busts the cache, which is expensive. Without pricing data
+  // we can't prove it's worthwhile, so err on the side of keeping the cache.
+  if (cacheWriteCostPerToken <= 0 || cacheReadCostPerToken <= 0) return false;
+
+  const bustCost = compressedTokens * cacheWriteCostPerToken;
+  const continueCost = currentTokens * cacheReadCostPerToken;
+
+  // Compress only if the bust cost is meaningfully less than continuing
+  return bustCost < threshold * continueCost;
 }
 
-/**
-
-
+/**
+ * Determine which tier the given token count falls into.
+ * Returns 0, 1, or 2 corresponding to the tier index.
+ */
+export function getTier(tokens: number): number {
+  if (tokens <= TIER_BOUNDARIES[0]) return 0;
+  if (tokens <= TIER_BOUNDARIES[1]) return 1;
+  return 2;
 }
 
 /**
- *
- *
- *
+ * Record cache usage from an API response. Tracks consecutive busts for
+ * the rolling bust detection used by shouldCompress().
+ *
+ * A "bust" is when cache_write > 50% of total input tokens.
  *
  * @param cacheWrite - cache_creation_input_tokens from the API response
  * @param cacheRead - cache_read_input_tokens from the API response
+ * @param inputTokens - total input_tokens from the API response (includes uncached)
  * @param sessionID - session that produced this response
  */
-export function
+export function recordCacheUsage(
   cacheWrite: number,
   cacheRead: number,
+  inputTokens: number,
   sessionID?: string,
 ): void {
   if (!sessionID) return;
   const state = getSessionState(sessionID);
-  const total = cacheWrite + cacheRead;
-  if (total === 0) return;
-
-  // Bust ratio: fraction of total input that was cache-written (0 = all reads, 1 = all writes)
-  const bustRatio = cacheWrite / total;
-
-  // EMA update (α = 0.3 for smoothing — responsive but not twitchy)
-  state.bustRateEMA =
-    state.bustRateEMA < 0
-      ? bustRatio // first observation
-      : state.bustRateEMA * 0.7 + bustRatio * 0.3;
-
-  // Inter-bust interval tracking: a "bust" is when >50% of input is writes
-  const now = Date.now();
-  if (bustRatio > 0.5) {
-    if (state.lastBustAt > 0) {
-      const interval = now - state.lastBustAt;
-      state.interBustIntervalEMA =
-        state.interBustIntervalEMA < 0
-          ? interval
-          : state.interBustIntervalEMA * 0.7 + interval * 0.3;
-    }
-    state.lastBustAt = now;
-  }
-
-  // Adapt per-session cap based on bust rate and interval
-  adaptContextCap(state);
-}
-
-/** Adapt the per-session context cap based on bust rate and break frequency. */
-function adaptContextCap(state: SessionState): void {
-  if (maxContextTokensCeiling <= 0) return; // disabled
-
-  const cap = state.dynamicContextCap > 0
-    ? state.dynamicContextCap
-    : maxContextTokensCeiling;
-
-  let newCap = cap;
-
-  // Primary signal: bust rate EMA
-  if (state.bustRateEMA > 0.8) {
-    // Mostly writes — tighten by 10%
-    newCap = Math.floor(cap * 0.90);
-  } else if (state.bustRateEMA < 0.3) {
-    // Mostly reads — relax by 5% (slower than tightening)
-    newCap = Math.floor(cap * 1.05);
-  }
 
-  //
-
-
-
-
-
-
+  // Use total input tokens as denominator (includes uncached input),
+  // not just cacheWrite + cacheRead, to avoid inflated bust ratios
+  // when a large fraction of tokens is uncached.
+  const total = inputTokens > 0 ? inputTokens : cacheWrite + cacheRead;
+  if (total > 0) {
+    if (cacheWrite / total > 0.5) {
+      state.consecutiveBusts++;
+    } else {
+      state.consecutiveBusts = 0;
     }
   }
-
-  // Clamp to [floor, ceiling]
-  state.dynamicContextCap = Math.max(
-    MIN_CONTEXT_FLOOR,
-    Math.min(maxContextTokensCeiling, newCap),
-  );
 }
 
 // Conservative overhead reserve for first-turn (before calibration):
@@ -253,18 +248,10 @@ type SessionState = {
   postIdleCompact: boolean;
   /** Consecutive turns at layer >= 2. When >= 3, log a compaction hint. */
   consecutiveHighLayer: number;
-
-
-
-
-  bustRateEMA: number;
-  /** EMA of time between full busts (ms). -1 = uninitialized. */
-  interBustIntervalEMA: number;
-  /** Epoch ms of the last full bust (cacheWrite > 50% of total). 0 = never. */
-  lastBustAt: number;
-  /** Per-session dynamic context cap (tokens). Adjusted by adaptContextCap().
-   * 0 = use the static ceiling (maxContextTokensCeiling). */
-  dynamicContextCap: number;
+  /** Consecutive turns where the cache was busted (>50% writes).
+   * Used for rolling bust detection: after 5+ consecutive busts, stop
+   * trying to compress and warn that the conversation is unsustainable. */
+  consecutiveBusts: number;
 
   /**
    * Distillation row snapshot — cached to avoid hitting the DB on every
@@ -298,11 +285,7 @@ function makeSessionState(): SessionState {
     cameOutOfIdle: false,
     postIdleCompact: false,
     consecutiveHighLayer: 0,
-
-    bustRateEMA: -1,
-    interBustIntervalEMA: -1,
-    lastBustAt: 0,
-    dynamicContextCap: 0,
+    consecutiveBusts: 0,
 
     distillationSnapshot: null,
   };
@@ -321,9 +304,8 @@ function getSessionState(sessionID: string): SessionState {
   state.forceMinLayer = loadForceMinLayer(sessionID) as SafetyLayer;
 
   // Restore gradient calibration state from DB (v24) — avoids uncalibrated
-  // first turns after restart. Without this,
-  //
-  // prevents onIdleResume() from detecting idle gaps.
+  // first turns after restart. Without this, lastTurnAt=0 prevents
+  // onIdleResume() from detecting idle gaps.
   //
   // Atomic restore: lastTurnAt > 0 is the proxy for "gradient state was
   // ever flushed to DB". Restore all fields together or none — avoids
@@ -331,13 +313,12 @@ function getSessionState(sessionID: string): SessionState {
   // could be mistaken for "never persisted".
   const persisted = loadSessionTracking(sessionID);
   if (persisted && persisted.lastTurnAt > 0) {
-    state.dynamicContextCap = persisted.dynamicContextCap;
-    state.bustRateEMA = persisted.bustRateEMA;
-    state.interBustIntervalEMA = persisted.interBustIntervalEMA;
     state.lastLayer = persisted.lastLayer as SafetyLayer;
     state.lastKnownInput = persisted.lastKnownInput;
     state.lastTurnAt = persisted.lastTurnAt;
-
+    // consecutiveBusts is persisted in the dynamicContextCap column
+    // (repurposed, see saveGradientState).
+    state.consecutiveBusts = persisted.dynamicContextCap;
   }
 
   sessionStates.set(sessionID, state);
@@ -581,6 +562,8 @@ export function setForceMinLayer(layer: SafetyLayer, sessionID?: string) {
 // For testing only — reset all calibration and force-escalation state
 export function resetCalibration(sessionID?: string) {
   calibratedOverhead = null;
+  cacheWriteCostPerToken = 0;
+  cacheReadCostPerToken = 0;
   if (sessionID) {
     saveForceMinLayer(sessionID, 0); // clear persisted state
     sessionStates.delete(sessionID);
@@ -605,6 +588,7 @@ export function inspectSessionState(sessionID: string): {
   postIdleCompact: boolean;
   lastTurnAt: number;
   distillationSnapshot: DistillationSnapshot | null;
+  consecutiveBusts: number;
 } | null {
   const state = sessionStates.get(sessionID);
   if (!state) return null;
@@ -615,6 +599,7 @@ export function inspectSessionState(sessionID: string): {
     postIdleCompact: state.postIdleCompact,
     lastTurnAt: state.lastTurnAt,
     distillationSnapshot: state.distillationSnapshot,
+    consecutiveBusts: state.consecutiveBusts,
   };
 }
 
@@ -639,13 +624,12 @@ export function saveGradientState(sessionID: string): void {
   if (!state) return;
 
   saveSessionTracking(sessionID, {
-    dynamicContextCap: state.dynamicContextCap,
-    bustRateEMA: state.bustRateEMA,
-    interBustIntervalEMA: state.interBustIntervalEMA,
     lastLayer: state.lastLayer,
     lastKnownInput: state.lastKnownInput,
     lastTurnAt: state.lastTurnAt,
-
+    // Repurpose the dead dynamicContextCap column (v24, always 0 now)
+    // to persist consecutiveBusts — avoids a new DB migration.
+    dynamicContextCap: state.consecutiveBusts,
   });
 }
 
@@ -1547,6 +1531,10 @@ export type TransformResult = {
   // relevance scoring. Set on Layer 4 (emergency) where the context is
   // fully reset and mid-session knowledge may have changed relevance.
   refreshLtm: boolean;
+  /** When set, the conversation is growing unsustainably — 5+ consecutive
+   * cache busts detected. The pipeline should inject a warning message
+   * advising the user to compact or start a new conversation. */
+  unsustainable?: boolean;
 };
 
 // Per-session urgent distillation tracking.
@@ -1580,17 +1568,11 @@ function transformInner(input: {
     contextLimit - outputReserved - overhead - sessLtmTokens,
   );
 
-  //
-  //
-  //
-  //
-
-  const effectiveCap = sid && sessState.dynamicContextCap > 0
-    ? sessState.dynamicContextCap
-    : maxContextTokensCeiling;
-  const usable = effectiveCap > 0 && usableRaw > effectiveCap
-    ? effectiveCap
-    : usableRaw;
+  // No EMA-driven adaptive cap — use the full available context budget.
+  // The layer-0 cap (maxLayer0Tokens) still applies for per-turn read cost,
+  // and tier-based bust-vs-continue decisions control whether to compress
+  // at quality boundaries.
+  const usable = usableRaw;
 
   const distilledBudget = Math.floor(usable * cfg.budget.distilled);
   // Base raw budget. May be overridden below for post-idle compact mode.
@@ -1661,11 +1643,8 @@ function transformInner(input: {
     sessState.postIdleCompact = false;
     // Skip layer 0 — don't pass through all raw messages on a cold cache.
     effectiveMinLayer = Math.max(effectiveMinLayer, 1) as SafetyLayer;
-    // Use a tighter raw budget
-
-    // the cap, use a tighter 20% to limit cold-write cost directly.
-    const postIdleRawFraction = effectiveCap > 0 ? 0.30 : 0.20;
-    rawBudget = Math.floor(usable * postIdleRawFraction);
+    // Use a tighter raw budget on cold cache to limit write cost.
+    rawBudget = Math.floor(usable * 0.20);
     log.info(
       `post-idle compact: session=${sid} rawBudget=${rawBudget}` +
         ` (${Math.floor(usable * cfg.budget.raw)}→${rawBudget})`,
@@ -1727,6 +1706,46 @@ function transformInner(input: {
     };
   }
 
+  // --- Tier-based bust-vs-continue gate ---
+  // When expectedInput exceeds the layer-0 cap but still fits in the model's
+  // context window, check whether compression is economically justified.
+  // If not (bust cost ≥ 85% of continue cost), skip compression and pass
+  // through at layer 0 — the cache reads are cheap enough to justify the
+  // larger context, and raw messages are better quality than distilled.
+  if (
+    effectiveMinLayer === 0 &&
+    layer0Input > layer0Ceiling &&
+    layer0Input <= maxInput &&
+    sid
+  ) {
+    const busts = getSessionState(sid).consecutiveBusts;
+    // For compression, estimate the compressed size as the layer-1 budget
+    // (distilled + raw fractions). This is a rough upper bound — actual
+    // compressed output may be smaller.
+    const compressedEstimate = distilledBudget + rawBudget;
+    if (!shouldCompress(Math.round(layer0Input), compressedEstimate, busts)) {
+      const messageTokens = calibrated
+        ? expectedInput - (sessLtmTokens - sessState.lastKnownLtm)
+        : expectedInput - overhead - sessLtmTokens;
+      log.info(
+        `tier gate: session=${sid} skipping compression — bustCost not justified` +
+          ` (input=${Math.round(layer0Input)} compressed=${compressedEstimate} busts=${busts})`,
      );
+      return {
+        messages: input.messages,
+        layer: 0,
+        distilledTokens: 0,
+        rawTokens: Math.max(0, messageTokens),
+        totalTokens: Math.max(0, messageTokens),
+        usable,
+        distilledBudget,
+        rawBudget,
+        refreshLtm: false,
+        unsustainable: busts >= 5,
+      };
+    }
+  }
+
   // --- Gradient mode: context exhausted (or force-escalated), compress older messages ---
 
   // Pre-pass: deduplicate repeated tool outputs before layer selection.
@@ -1872,6 +1891,8 @@ function transformInner(input: {
     const nuclearRaw = [...olderMessages, ...currentTurn];
     const nuclearRawTokens = olderTokens + currentTurnTokens;
 
+    const unsustainable = sid ? getSessionState(sid).consecutiveBusts >= 5 : false;
+
     return {
       messages: [...nuclearPrefix, ...nuclearRaw],
       layer: 4,
@@ -1882,6 +1903,7 @@ function transformInner(input: {
       distilledBudget,
       rawBudget,
       refreshLtm: true,
+      unsustainable,
     };
   }
 
@@ -1933,7 +1955,7 @@ export function transform(input: {
     log.info(
       `gradient: session=${sid} layer=${result.layer} tokens=${result.totalTokens}` +
         ` (distilled=${result.distilledTokens} raw=${result.rawTokens})` +
-        ` usable=${result.usable}
+        ` usable=${result.usable} tier=${getTier(result.totalTokens)} l0cap=${maxLayer0Tokens || "off"}`,
     );
   }
   return result;
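For a concrete feel of the bust-vs-continue arithmetic, the sketch below calls the exported setCachePricing, getTier, and shouldCompress from the hunks above. The per-token prices are illustrative assumptions (cache write around $3.75 per million tokens, cache read around $0.30 per million tokens), not values shipped in the package:

```ts
import { setCachePricing, shouldCompress, getTier } from "./gradient";

// Illustrative pricing assumption: $3.75/MTok to write cache, $0.30/MTok to read it.
setCachePricing(3.75 / 1_000_000, 0.30 / 1_000_000);

const currentTokens = 220_000;   // just past the 200K tier boundary
const compressedTokens = 80_000; // rough layer-1 budget estimate

console.log(getTier(currentTokens)); // 1 (between 200K and 500K)

// bustCost     = 80_000  * 3.75e-6 = $0.30
// continueCost = 220_000 * 0.30e-6 = $0.066
// 0.30 is not < 0.85 * 0.066, so compression is not economical here.
console.log(shouldCompress(currentTokens, compressedTokens, 0)); // false
```

With reads that cheap relative to writes, a 220K-token context just past the first tier boundary keeps growing: a cache bust would cost about $0.30 against roughly $0.066 per turn to keep reading, so shouldCompress returns false and the session stays at layer 0.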
package/src/hosted.ts
ADDED
@@ -0,0 +1,46 @@
+/**
+ * hosted.ts — Hosted/remote mode flag for @loreai/core.
+ *
+ * When the gateway runs remotely (different machine/container from the
+ * developer's workspace), filesystem operations that use client-controlled
+ * paths are unsafe:
+ *
+ * - `git remote -v` subprocess with attacker-controlled cwd
+ * - `.lore.json` config read from attacker-controlled path
+ * - `.lore.md` / AGENTS.md read/write at attacker-controlled path
+ * - `lat.md/` recursive directory scan at attacker-controlled path
+ * - `fs.watch()` on attacker-controlled paths
+ *
+ * Setting hosted mode causes all these operations to become safe no-ops.
+ * The gateway sets this flag during startup when `LORE_HOSTED_MODE=1`.
+ *
+ * This is a process-wide flag — once set, it cannot be unset (the only
+ * consumer is the gateway process, and hosted mode is a startup decision).
+ */
+
+let _hostedMode = false;
+
+/**
+ * Enable hosted mode. Once enabled, cannot be disabled.
+ * All filesystem operations using client-controlled paths become no-ops.
+ */
+export function enableHostedMode(): void {
+  _hostedMode = true;
+}
+
+/**
+ * Returns true if hosted mode is active — filesystem operations using
+ * client-controlled paths should be skipped.
+ */
+export function isHostedMode(): boolean {
+  return _hostedMode;
+}
+
+/**
+ * Reset hosted mode flag. **Test-only** — production code should never
+ * call this. Exported so tests can toggle hosted mode without process
+ * restarts.
+ */
+export function _resetHostedModeForTest(): void {
+  _hostedMode = false;
+}
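The header comment above says the gateway enables this flag at startup when LORE_HOSTED_MODE=1. A minimal sketch of that wiring, with the caveat that the gateway's startup code is not part of this diff and readWorkspaceConfig below is a hypothetical consumer, not a function from the package:

```ts
import { enableHostedMode, isHostedMode } from "./hosted";

// Startup decision: hosted mode is process-wide and one-way.
if (process.env.LORE_HOSTED_MODE === "1") {
  enableHostedMode();
}

// Hypothetical consumer: any helper taking a client-controlled path
// can no-op safely, mirroring the guard added to getGitRemote().
export function readWorkspaceConfig(path: string): string | null {
  if (isHostedMode()) return null; // skip filesystem access entirely
  // ...otherwise read project config from `path` here
  return null;
}
```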