opencode-lore 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "opencode-lore",
3
- "version": "0.1.4",
3
+ "version": "0.2.0",
4
4
  "type": "module",
5
5
  "license": "MIT",
6
6
  "description": "Three-tier memory architecture for OpenCode — distillation, not summarization",
@@ -34,7 +34,7 @@
34
34
  ],
35
35
  "repository": {
36
36
  "type": "git",
37
- "url": "https://github.com/BYK/opencode-lore.git"
37
+ "url": "git+https://github.com/BYK/opencode-lore.git"
38
38
  },
39
39
  "keywords": [
40
40
  "opencode",
@@ -11,8 +11,7 @@
11
11
  import { existsSync, readFileSync, writeFileSync, mkdirSync } from "fs";
12
12
  import { dirname } from "path";
13
13
  import * as ltm from "./ltm";
14
- import { formatKnowledge } from "./prompt";
15
- import { unescapeMarkdown } from "./markdown";
14
+ import { serialize, inline, h, ul, liph, strong, t, root, unescapeMarkdown } from "./markdown";
16
15
 
17
16
  // ---------------------------------------------------------------------------
18
17
  // Constants
@@ -158,23 +157,40 @@ function buildSection(projectPath: string): string {
158
157
  if (!entries.length) {
159
158
  return "\n";
160
159
  }
161
- const formatted = formatKnowledge(
162
- entries.map((e) => ({ category: e.category, title: e.title, content: e.content })),
163
- );
164
- if (!formatted) return "\n";
165
160
 
166
- // Inject <!-- lore:UUID --> above each bullet line
167
- const idByTitle = new Map(entries.map((e) => [e.title, e.id]));
168
- const lines = formatted.split("\n");
161
+ // Group entries by category, preserving DB order (confidence DESC, updated_at DESC).
162
+ const grouped = new Map<string, typeof entries>();
163
+ for (const e of entries) {
164
+ const group = grouped.get(e.category) ?? [];
165
+ group.push(e);
166
+ grouped.set(e.category, group);
167
+ }
168
+
169
+ // Build the section body by iterating entries directly, emitting each entry
170
+ // with its own <!-- lore:UUID --> marker. This avoids the title-based Map
171
+ // deduplication bug where multiple entries with the same title all got the
172
+ // same UUID marker from the last Map.set() winner.
169
173
  const out: string[] = [""];
170
- for (const line of lines) {
171
- const bulletMatch = line.match(/^\*\s+\*\*(.+?)\*\*/);
172
- if (bulletMatch) {
173
- const id = idByTitle.get(bulletMatch[1]);
174
- if (id) out.push(`<!-- lore:${id} -->`);
174
+
175
+ // Section heading
176
+ out.push("## Long-term Knowledge");
177
+
178
+ for (const [category, items] of grouped) {
179
+ out.push("");
180
+ out.push(`### ${category.charAt(0).toUpperCase() + category.slice(1)}`);
181
+ out.push("");
182
+ for (const entry of items) {
183
+ out.push(`<!-- lore:${entry.id} -->`);
184
+ // Render the bullet using remark serializer for proper markdown escaping.
185
+ // serialize(root(ul([liph(...)]))) produces "* **Title**: content\n".
186
+ // Trim the trailing newline since we join with \n ourselves.
187
+ const bullet = serialize(
188
+ root(ul([liph(strong(inline(entry.title)), t(": " + inline(entry.content)))]))
189
+ ).trimEnd();
190
+ out.push(bullet);
175
191
  }
176
- out.push(line);
177
192
  }
193
+
178
194
  out.push("");
179
195
  return out.join("\n");
180
196
  }
package/src/config.ts CHANGED
@@ -30,6 +30,14 @@ export const LoreConfig = z.object({
30
30
  afterTurns: z.number().min(1).default(10),
31
31
  })
32
32
  .default({}),
33
+ pruning: z
34
+ .object({
35
+ /** Days to keep distilled temporal messages before pruning. Default: 120. */
36
+ retention: z.number().min(1).default(120),
37
+ /** Max total temporal_messages storage in MB before emergency pruning. Default: 1024 (1 GB). */
38
+ maxStorage: z.number().min(50).default(1024),
39
+ })
40
+ .default({}),
33
41
  crossProject: z.boolean().default(true),
34
42
  agentsFile: z
35
43
  .object({
package/src/db.ts CHANGED
@@ -2,7 +2,7 @@ import { Database } from "bun:sqlite";
2
2
  import { join } from "path";
3
3
  import { mkdirSync } from "fs";
4
4
 
5
- const SCHEMA_VERSION = 2;
5
+ const SCHEMA_VERSION = 3;
6
6
 
7
7
  const MIGRATIONS: string[] = [
8
8
  `
@@ -124,6 +124,12 @@ const MIGRATIONS: string[] = [
124
124
  -- Version 2: Replace narrative+facts with observations text
125
125
  ALTER TABLE distillations ADD COLUMN observations TEXT NOT NULL DEFAULT '';
126
126
  `,
127
+ `
128
+ -- Version 3: One-time vacuum to reclaim accumulated free pages, and enable
129
+ -- incremental auto-vacuum so future deletes return pages to the OS.
130
+ -- VACUUM must run outside a transaction and cannot be in a multi-statement
131
+ -- exec, so it is handled specially in the migrate() function.
132
+ `,
127
133
  ];
128
134
 
129
135
  function dataDir() {
@@ -142,10 +148,17 @@ export function db(): Database {
142
148
  instance = new Database(path, { create: true });
143
149
  instance.exec("PRAGMA journal_mode = WAL");
144
150
  instance.exec("PRAGMA foreign_keys = ON");
151
+ // Return freed pages to the OS incrementally on each transaction commit
152
+ // instead of accumulating a free-page list that bloats the file.
153
+ instance.exec("PRAGMA auto_vacuum = INCREMENTAL");
145
154
  migrate(instance);
146
155
  return instance;
147
156
  }
148
157
 
158
+ // Index of the migration that performs a one-time VACUUM.
159
+ // VACUUM cannot run inside a transaction, so migrate() handles it specially.
160
+ const VACUUM_MIGRATION_INDEX = 2; // 0-based index of version-3 migration
161
+
149
162
  function migrate(database: Database) {
150
163
  const row = database
151
164
  .query(
@@ -161,7 +174,16 @@ function migrate(database: Database) {
161
174
  : 0;
162
175
  if (current >= MIGRATIONS.length) return;
163
176
  for (let i = current; i < MIGRATIONS.length; i++) {
164
- database.exec(MIGRATIONS[i]);
177
+ if (i === VACUUM_MIGRATION_INDEX) {
178
+ // VACUUM cannot run inside a transaction. Run it directly.
179
+ // auto_vacuum mode must be set *before* VACUUM — SQLite bakes it into
180
+ // the file header during the rebuild. After this, every subsequent
181
+ // startup's "PRAGMA auto_vacuum = INCREMENTAL" is a no-op (already set).
182
+ database.exec("PRAGMA auto_vacuum = INCREMENTAL");
183
+ database.exec("VACUUM");
184
+ } else {
185
+ database.exec(MIGRATIONS[i]);
186
+ }
165
187
  }
166
188
  // Update version to latest. Migration 0 inserts version=1 via its own INSERT,
167
189
  // but subsequent migrations don't update it, so always normalize to MIGRATIONS.length.
package/src/gradient.ts CHANGED
@@ -40,12 +40,35 @@ const FIRST_TURN_OVERHEAD = 15_000;
40
40
  // Null = not yet calibrated (first turn). Updated after every assistant response.
41
41
  let calibratedOverhead: number | null = null;
42
42
 
43
+ // --- Exact token tracking ---
44
+ // Stores the real input token count from the last successful API response.
45
+ // Used for the layer 0 passthrough decision: instead of estimating the full
46
+ // message array with chars/4, we take the exact count from the previous turn
47
+ // and only estimate the small delta (new messages). 99%+ of the count is
48
+ // exact from the API's own tokenizer, virtually eliminating overflow errors.
49
+ let lastKnownInput = 0;
50
+ let lastKnownLtm = 0;
51
+ let lastKnownSessionID: string | null = null;
52
+ let lastKnownMessageCount = 0;
53
+
54
+ // --- Force escalation ---
55
+ // Set when the API returns "prompt is too long" — forces the transform to skip
56
+ // layer 0 (and optionally layer 1) on the next call to ensure the context is
57
+ // trimmed enough to fit. Cleared after one use (one-shot).
58
+ let forceMinLayer: SafetyLayer = 0;
59
+
43
60
  // LTM tokens injected via system transform hook this turn.
44
61
  // Set by setLtmTokens() after the system hook runs; consumed by transform().
45
62
  let ltmTokens = 0;
46
63
 
47
64
  export function setModelLimits(limits: { context: number; output: number }) {
48
65
  contextLimit = limits.context || 200_000;
66
+ // NOTE: this cap of 32K matches what @ai-sdk/anthropic sends as max_tokens for
67
+ // claude-opus-4-6 (the SDK doesn't recognise the -6 variant and falls back to
68
+ // the generic claude-opus-4- pattern with maxOutputTokens=32K). If the SDK is
69
+ // updated to send the model's actual limit (128K for opus-4-6), this cap will
70
+ // become wrong — the effective max input would drop from 168K to 72K but our
71
+ // budget would still assume 168K. At that point, remove the cap.
49
72
  outputReserved = Math.min(limits.output || 32_000, 32_000);
50
73
  }
51
74
 
@@ -72,9 +95,22 @@ export function getLtmBudget(ltmFraction: number): number {
72
95
  }
73
96
 
74
97
  // Called after each assistant message completes with real token usage data.
75
- // actualInput = tokens.input + tokens.cache.read (all tokens that went into the model)
98
+ // actualInput = tokens.input + tokens.cache.read (all tokens the model saw)
76
99
  // messageEstimate = our chars/4 estimate of the messages we sent
77
- export function calibrate(actualInput: number, messageEstimate: number) {
100
+ // sessionID = session that produced this response (for exact-tracking validity)
101
+ // messageCount = number of messages that were sent (for delta estimation)
102
+ export function calibrate(
103
+ actualInput: number,
104
+ messageEstimate: number,
105
+ sessionID?: string,
106
+ messageCount?: number,
107
+ ) {
108
+ // Store exact counts for the proactive layer 0 decision.
109
+ lastKnownInput = actualInput;
110
+ lastKnownLtm = ltmTokens;
111
+ if (sessionID !== undefined) lastKnownSessionID = sessionID;
112
+ if (messageCount !== undefined) lastKnownMessageCount = messageCount;
113
+
78
114
  const overhead = Math.max(0, actualInput - messageEstimate);
79
115
  // Smooth with EMA (alpha=0.3) once calibrated, or set directly on first call
80
116
  calibratedOverhead =
@@ -87,9 +123,23 @@ export function getOverhead(): number {
87
123
  return calibratedOverhead ?? FIRST_TURN_OVERHEAD;
88
124
  }
89
125
 
90
- // For testing only — reset calibration state
126
+ /**
127
+ * Force the next transform() call to use at least the given layer.
128
+ * Called when the API returns "prompt is too long" so the next attempt
129
+ * trims the context enough to fit within the model's context window.
130
+ */
131
+ export function setForceMinLayer(layer: SafetyLayer) {
132
+ forceMinLayer = layer;
133
+ }
134
+
135
+ // For testing only — reset all calibration and force-escalation state
91
136
  export function resetCalibration() {
92
137
  calibratedOverhead = null;
138
+ lastKnownInput = 0;
139
+ lastKnownLtm = 0;
140
+ lastKnownSessionID = null;
141
+ lastKnownMessageCount = 0;
142
+ forceMinLayer = 0;
93
143
  }
94
144
 
95
145
  type Distillation = {
@@ -317,16 +367,9 @@ function addRelativeTimeToObservations(text: string, now: Date): string {
317
367
  return result;
318
368
  }
319
369
 
320
- // Build a synthetic message pair containing the distilled history
321
- function distilledPrefix(distillations: Distillation[]): MessageWithParts[] {
322
- if (!distillations.length) return [];
323
- const now = new Date();
324
- const annotated = distillations.map((d) => ({
325
- ...d,
326
- observations: addRelativeTimeToObservations(d.observations, now),
327
- }));
328
- const formatted = formatDistillations(annotated);
329
- if (!formatted) return [];
370
+ // Build synthetic user/assistant message pair wrapping formatted distillation text.
371
+ // Shared by the cached and non-cached prefix paths.
372
+ function buildPrefixMessages(formatted: string): MessageWithParts[] {
330
373
  return [
331
374
  {
332
375
  info: {
@@ -381,7 +424,252 @@ function distilledPrefix(distillations: Distillation[]): MessageWithParts[] {
381
424
  ];
382
425
  }
383
426
 
384
- export type SafetyLayer = 1 | 2 | 3 | 4;
427
+ // Build a synthetic message pair containing the distilled history.
428
+ // Non-cached path — used by layers 2-4 which already cause full cache invalidation.
429
+ function distilledPrefix(distillations: Distillation[]): MessageWithParts[] {
430
+ if (!distillations.length) return [];
431
+ const now = new Date();
432
+ const annotated = distillations.map((d) => ({
433
+ ...d,
434
+ observations: addRelativeTimeToObservations(d.observations, now),
435
+ }));
436
+ const formatted = formatDistillations(annotated);
437
+ if (!formatted) return [];
438
+ return buildPrefixMessages(formatted);
439
+ }
440
+
441
+ // --- Approach C: Append-only distillation prefix cache ---
442
+ //
443
+ // Caches the rendered prefix text per session. When new distillations arrive,
444
+ // only renders the new rows and appends them to the cached text. This keeps
445
+ // the prefix byte-identical between distillation runs, preserving the prompt
446
+ // cache. Only meta-distillation (which rewrites gen-0 rows into gen-1) causes
447
+ // a full re-render — and that happens roughly every 80-100 turns.
448
+
449
+ type PrefixCache = {
450
+ /** The session this cache belongs to */
451
+ sessionID: string;
452
+ /** ID of the last distillation row included in the cached text */
453
+ lastDistillationID: string;
454
+ /** Number of rows that produced the cached text */
455
+ rowCount: number;
456
+ /** The rendered text (used to build delta appends) */
457
+ cachedText: string;
458
+ /** Ready-to-use message pair */
459
+ prefixMessages: MessageWithParts[];
460
+ /** Token estimate of prefixMessages */
461
+ prefixTokens: number;
462
+ };
463
+
464
+ let prefixCache: PrefixCache | null = null;
465
+
466
+ /**
467
+ * Return the distilled prefix messages, reusing cached content when possible.
468
+ *
469
+ * Cache hit — no new rows: returns the exact same prefixMessages object
470
+ * (byte-identical content, prompt cache preserved).
471
+ * Cache miss — new rows appended: renders only the delta, appends to cached
472
+ * text, updates cache.
473
+ * Full reset — session changed, or rows were rewritten by meta-distillation:
474
+ * renders everything from scratch.
475
+ */
476
+ function distilledPrefixCached(
477
+ distillations: Distillation[],
478
+ sessionID: string,
479
+ ): { messages: MessageWithParts[]; tokens: number } {
480
+ if (!distillations.length) {
481
+ prefixCache = null;
482
+ return { messages: [], tokens: 0 };
483
+ }
484
+
485
+ const lastRow = distillations[distillations.length - 1];
486
+
487
+ // Cache is valid when: same session, row count only grew (no rewrites),
488
+ // and the last previously-cached row still exists at the same position.
489
+ const cacheValid =
490
+ prefixCache !== null &&
491
+ prefixCache.sessionID === sessionID &&
492
+ prefixCache.rowCount <= distillations.length &&
493
+ (prefixCache.rowCount === 0 ||
494
+ distillations[prefixCache.rowCount - 1]?.id ===
495
+ prefixCache.lastDistillationID);
496
+
497
+ if (cacheValid) {
498
+ if (prefixCache!.lastDistillationID === lastRow.id) {
499
+ // No new rows — return cached prefix as-is (byte-identical for prompt cache)
500
+ return {
501
+ messages: prefixCache!.prefixMessages,
502
+ tokens: prefixCache!.prefixTokens,
503
+ };
504
+ }
505
+
506
+ // New rows appended — render only the delta and append to cached text
507
+ const newRows = distillations.slice(prefixCache!.rowCount);
508
+ const now = new Date();
509
+ const annotated = newRows.map((d) => ({
510
+ ...d,
511
+ observations: addRelativeTimeToObservations(d.observations, now),
512
+ }));
513
+ const deltaText = formatDistillations(annotated);
514
+
515
+ if (deltaText) {
516
+ const fullText = prefixCache!.cachedText + "\n\n" + deltaText;
517
+ const messages = buildPrefixMessages(fullText);
518
+ const tokens = messages.reduce((sum, m) => sum + estimateMessage(m), 0);
519
+ prefixCache = {
520
+ sessionID,
521
+ lastDistillationID: lastRow.id,
522
+ rowCount: distillations.length,
523
+ cachedText: fullText,
524
+ prefixMessages: messages,
525
+ prefixTokens: tokens,
526
+ };
527
+ return { messages, tokens };
528
+ }
529
+ }
530
+
531
+ // Full re-render: first call, session change, or meta-distillation rewrote rows
532
+ const now = new Date();
533
+ const annotated = distillations.map((d) => ({
534
+ ...d,
535
+ observations: addRelativeTimeToObservations(d.observations, now),
536
+ }));
537
+ const fullText = formatDistillations(annotated);
538
+ if (!fullText) {
539
+ prefixCache = null;
540
+ return { messages: [], tokens: 0 };
541
+ }
542
+
543
+ const messages = buildPrefixMessages(fullText);
544
+ const tokens = messages.reduce((sum, m) => sum + estimateMessage(m), 0);
545
+ prefixCache = {
546
+ sessionID,
547
+ lastDistillationID: lastRow.id,
548
+ rowCount: distillations.length,
549
+ cachedText: fullText,
550
+ prefixMessages: messages,
551
+ prefixTokens: tokens,
552
+ };
553
+ return { messages, tokens };
554
+ }
555
+
556
+ // For testing only — reset prefix cache state
557
+ export function resetPrefixCache() {
558
+ prefixCache = null;
559
+ }
560
+
561
+ // --- Approach B: Lazy raw window eviction ---
562
+ //
563
+ // Tracks the ID of the first (oldest) message in the previous raw window.
564
+ // On the next turn, if the window starting at that message still fits within
565
+ // the raw budget, the cutoff is pinned — no messages are evicted and the raw
566
+ // window stays byte-identical for caching purposes. Only when the pinned
567
+ // window no longer fits (e.g. a large tool response pushed us over) is the
568
+ // cutoff allowed to advance forward by one message at a time.
569
+ //
570
+ // This eliminates the "window sliding on every turn" problem that was the
571
+ // dominant source of cache misses in gradient mode: each new turn appends a
572
+ // message to the conversation, but the start of the raw window only moves
573
+ // when it must.
574
+ //
575
+ // Reset conditions: session changes, or layer escalates to 2+ (the pinned
576
+ // window was too large even with stripping — something genuinely changed).
577
+
578
+ type RawWindowCache = {
579
+ sessionID: string;
580
+ /** ID of the first message in the pinned raw window */
581
+ firstMessageID: string;
582
+ };
583
+
584
+ let rawWindowCache: RawWindowCache | null = null;
585
+
586
+ export function resetRawWindowCache() {
587
+ rawWindowCache = null;
588
+ }
589
+
590
+ /**
591
+ * Layer-1 tryFit with lazy eviction.
592
+ *
593
+ * Attempts to reuse the previous raw window cutoff before falling back to a
594
+ * full backward scan. If the pinned window fits, returns it unchanged (same
595
+ * message objects, byte-identical for prompt caching). If it doesn't fit,
596
+ * delegates to the normal tryFit which finds the new minimal cutoff and
597
+ * updates the cache.
598
+ */
599
+ function tryFitStable(input: {
600
+ messages: MessageWithParts[];
601
+ prefix: MessageWithParts[];
602
+ prefixTokens: number;
603
+ distilledBudget: number;
604
+ rawBudget: number;
605
+ sessionID: string;
606
+ }): Omit<TransformResult, "layer" | "usable" | "distilledBudget" | "rawBudget"> | null {
607
+ // If the prefix already overflows its budget there's no point trying.
608
+ if (input.prefixTokens > input.distilledBudget && input.prefix.length > 0)
609
+ return null;
610
+
611
+ const cacheValid =
612
+ rawWindowCache !== null && rawWindowCache.sessionID === input.sessionID;
613
+
614
+ if (cacheValid) {
615
+ const pinnedIdx = input.messages.findIndex(
616
+ (m) => m.info.id === rawWindowCache!.firstMessageID,
617
+ );
618
+
619
+ if (pinnedIdx !== -1) {
620
+ // Measure the token cost of the pinned window.
621
+ const pinnedWindow = input.messages.slice(pinnedIdx);
622
+ const pinnedTokens = pinnedWindow.reduce(
623
+ (sum, m) => sum + estimateMessage(m),
624
+ 0,
625
+ );
626
+
627
+ if (pinnedTokens <= input.rawBudget) {
628
+ // Pinned window still fits — keep it. Apply system-reminder cleanup
629
+ // only (strip:"none" is the layer-1 mode), returning the same message
630
+ // object references wherever nothing changed.
631
+ const processed = pinnedWindow.map((msg) => {
632
+ const parts = cleanParts(msg.parts);
633
+ return parts !== msg.parts ? { info: msg.info, parts } : msg;
634
+ });
635
+ const total = input.prefixTokens + pinnedTokens;
636
+ return {
637
+ messages: [...input.prefix, ...processed],
638
+ distilledTokens: input.prefixTokens,
639
+ rawTokens: pinnedTokens,
640
+ totalTokens: total,
641
+ };
642
+ }
643
+ // Pinned window is too large — fall through to the normal scan below.
644
+ }
645
+ }
646
+
647
+ // Normal backward scan to find the tightest fitting cutoff.
648
+ const result = tryFit({
649
+ messages: input.messages,
650
+ prefix: input.prefix,
651
+ prefixTokens: input.prefixTokens,
652
+ distilledBudget: input.distilledBudget,
653
+ rawBudget: input.rawBudget,
654
+ strip: "none",
655
+ });
656
+
657
+ if (result) {
658
+ // Update the raw window cache: the first non-prefix message is the oldest
659
+ // raw message in the new window. Pin to its ID for the next turn.
660
+ const rawStart = result.messages[input.prefix.length];
661
+ if (rawStart) {
662
+ rawWindowCache = {
663
+ sessionID: input.sessionID,
664
+ firstMessageID: rawStart.info.id,
665
+ };
666
+ }
667
+ }
668
+
669
+ return result;
670
+ }
671
+
672
+ export type SafetyLayer = 0 | 1 | 2 | 3 | 4;
385
673
 
386
674
  export type TransformResult = {
387
675
  messages: MessageWithParts[];
@@ -419,36 +707,115 @@ export function transform(input: {
419
707
  const distilledBudget = Math.floor(usable * cfg.budget.distilled);
420
708
  const rawBudget = Math.floor(usable * cfg.budget.raw);
421
709
 
422
- // Find the session ID from messages
710
+ // --- Force escalation (reactive error recovery) ---
711
+ // When the API previously rejected with "prompt is too long", skip layers
712
+ // below the forced minimum to ensure enough trimming on the next attempt.
713
+ // One-shot: consumed here and reset to 0.
714
+ const effectiveMinLayer = forceMinLayer;
715
+ forceMinLayer = 0;
716
+
717
+ // --- Approach A: Cache-preserving passthrough ---
718
+ // Use exact token count from the previous API response when available.
719
+ // Only the delta (messages added since last call) uses chars/4 estimation,
720
+ // making the layer-0 decision 99%+ accurate from the API's own tokenizer.
721
+ // maxInput = absolute ceiling the API enforces: input_tokens + max_tokens <= context
722
+ const maxInput = contextLimit - outputReserved;
423
723
  const sid = input.sessionID ?? input.messages[0]?.info.sessionID;
724
+
725
+ let expectedInput: number;
726
+ if (lastKnownInput > 0 && sid === lastKnownSessionID) {
727
+ // Exact approach: prior API count + estimate of only the new messages.
728
+ const newMsgCount = Math.max(0, input.messages.length - lastKnownMessageCount);
729
+ const newMsgTokens = newMsgCount > 0
730
+ ? input.messages.slice(-newMsgCount).reduce((s, m) => s + estimateMessage(m), 0)
731
+ : 0;
732
+ const ltmDelta = ltmTokens - lastKnownLtm;
733
+ expectedInput = lastKnownInput + newMsgTokens + ltmDelta;
734
+ } else {
735
+ // First turn or session change: fall back to chars/4 + overhead.
736
+ const messageTokens = input.messages.reduce((s, m) => s + estimateMessage(m), 0);
737
+ expectedInput = messageTokens + overhead + ltmTokens;
738
+ }
739
+
740
+ if (effectiveMinLayer === 0 && expectedInput <= maxInput) {
741
+ // All messages fit — return unmodified to preserve append-only prompt-cache pattern.
742
+ // Raw messages are strictly better context than lossy distilled summaries.
743
+ const messageTokens = lastKnownInput > 0 && sid === lastKnownSessionID
744
+ ? expectedInput - (ltmTokens - lastKnownLtm) // approximate raw portion
745
+ : expectedInput - overhead - ltmTokens;
746
+ return {
747
+ messages: input.messages,
748
+ layer: 0,
749
+ distilledTokens: 0,
750
+ rawTokens: Math.max(0, messageTokens),
751
+ totalTokens: Math.max(0, messageTokens),
752
+ usable,
753
+ distilledBudget,
754
+ rawBudget,
755
+ };
756
+ }
757
+
758
+ // --- Gradient mode: context exhausted (or force-escalated), compress older messages ---
759
+
424
760
  const distillations = sid ? loadDistillations(input.projectPath, sid) : [];
425
- const prefix = distilledPrefix(distillations);
426
- const prefixTokens = prefix.reduce((sum, m) => sum + estimateMessage(m), 0);
427
761
 
428
- // Layer 1: Normal budget allocation
429
- const layer1 = tryFit({
430
- messages: input.messages,
431
- prefix,
432
- prefixTokens,
433
- distilledBudget,
434
- rawBudget,
435
- strip: "none",
436
- });
437
- if (layer1) return { ...layer1, layer: 1, usable, distilledBudget, rawBudget };
762
+ // Layer 1 uses the append-only cached prefix (Approach C) to keep the
763
+ // distilled content byte-identical between distillation runs, preserving
764
+ // the prompt cache. Layers 2-4 already cause full cache invalidation via
765
+ // tool stripping / message restructuring, so they use the non-cached path.
766
+ const cached = sid
767
+ ? distilledPrefixCached(distillations, sid)
768
+ : (() => {
769
+ const msgs = distilledPrefix(distillations);
770
+ return { messages: msgs, tokens: msgs.reduce((sum, m) => sum + estimateMessage(m), 0) };
771
+ })();
772
+
773
+ // Layer 1: Normal budget allocation with lazy raw window eviction (Approach B).
774
+ // tryFitStable reuses the previous cutoff when it still fits, keeping the raw
775
+ // window byte-identical across turns for prompt caching. Only advances the
776
+ // cutoff when a genuinely oversized message forces eviction.
777
+ // Skipped when force-escalated to layer 2+ (previous attempt already failed at this level).
778
+ if (effectiveMinLayer <= 1) {
779
+ const layer1 = sid
780
+ ? tryFitStable({
781
+ messages: input.messages,
782
+ prefix: cached.messages,
783
+ prefixTokens: cached.tokens,
784
+ distilledBudget,
785
+ rawBudget,
786
+ sessionID: sid,
787
+ })
788
+ : tryFit({
789
+ messages: input.messages,
790
+ prefix: cached.messages,
791
+ prefixTokens: cached.tokens,
792
+ distilledBudget,
793
+ rawBudget,
794
+ strip: "none",
795
+ });
796
+ if (layer1) return { ...layer1, layer: 1, usable, distilledBudget, rawBudget };
797
+ }
798
+
799
+ // Layer 1 didn't fit (or was force-skipped) — reset the raw window cache.
800
+ // Layers 2-4 use full scans and already break the prompt cache.
801
+ rawWindowCache = null;
438
802
 
439
803
  // Layer 2: Strip tool outputs from older messages, keep last 2 turns
440
- const layer2 = tryFit({
441
- messages: input.messages,
442
- prefix,
443
- prefixTokens,
444
- distilledBudget,
445
- rawBudget: Math.floor(usable * 0.5), // give raw more room
446
- strip: "old-tools",
447
- protectedTurns: 2,
448
- });
449
- if (layer2) {
450
- urgentDistillation = true;
451
- return { ...layer2, layer: 2, usable, distilledBudget, rawBudget };
804
+ // Skipped when force-escalated to layer 3+.
805
+ if (effectiveMinLayer <= 2) {
806
+ const layer2 = tryFit({
807
+ messages: input.messages,
808
+ prefix: cached.messages,
809
+ prefixTokens: cached.tokens,
810
+ distilledBudget,
811
+ rawBudget: Math.floor(usable * 0.5), // give raw more room
812
+ strip: "old-tools",
813
+ protectedTurns: 2,
814
+ });
815
+ if (layer2) {
816
+ urgentDistillation = true;
817
+ return { ...layer2, layer: 2, usable, distilledBudget, rawBudget };
818
+ }
452
819
  }
453
820
 
454
821
  // Layer 3: Strip ALL tool outputs, drop oldest distillations
package/src/index.ts CHANGED
@@ -13,9 +13,11 @@ import {
13
13
  estimateMessages,
14
14
  setLtmTokens,
15
15
  getLtmBudget,
16
+ setForceMinLayer,
16
17
  } from "./gradient";
17
18
  import { formatKnowledge } from "./prompt";
18
19
  import { createRecallTool } from "./reflect";
20
+ import { shouldImport, importFromFile, exportToFile } from "./agents-file";
19
21
 
20
22
  export const LorePlugin: Plugin = async (ctx) => {
21
23
  const projectPath = ctx.worktree || ctx.directory;
@@ -33,6 +35,23 @@ export const LorePlugin: Plugin = async (ctx) => {
33
35
  }).catch(() => {});
34
36
  }
35
37
 
38
+ // Import from AGENTS.md at startup if it has changed since last export
39
+ // (hand-written entries, edits from other machines, or merge conflicts).
40
+ {
41
+ const cfg = config();
42
+ if (cfg.agentsFile.enabled) {
43
+ const filePath = `${projectPath}/${cfg.agentsFile.path}`;
44
+ if (shouldImport({ projectPath, filePath })) {
45
+ try {
46
+ importFromFile({ projectPath, filePath });
47
+ console.error("[lore] imported knowledge from", cfg.agentsFile.path);
48
+ } catch (e) {
49
+ console.error("[lore] agents-file import error:", e);
50
+ }
51
+ }
52
+ }
53
+ }
54
+
36
55
  // Prune any corrupted/oversized knowledge entries left by the AGENTS.md
37
56
  // backslash-escaping bug or curator hallucinations. Sets confidence → 0
38
57
  // (below the 0.2 query threshold) so they stop polluting the context.
@@ -179,7 +198,9 @@ export const LorePlugin: Plugin = async (ctx) => {
179
198
  backgroundDistill(msg.sessionID);
180
199
  }
181
200
 
182
- // Calibrate overhead estimate using real token counts
201
+ // Calibrate overhead estimate using real token counts.
202
+ // Also store the exact input count + message count for the proactive
203
+ // layer-0 decision (avoids full chars/4 re-estimation each turn).
183
204
  const allMsgs = await ctx.client.session.messages({
184
205
  path: { id: msg.sessionID },
185
206
  });
@@ -189,7 +210,7 @@ export const LorePlugin: Plugin = async (ctx) => {
189
210
  .map((m) => ({ info: m.info, parts: m.parts }));
190
211
  const msgEstimate = estimateMessages(withParts);
191
212
  const actualInput = msg.tokens.input + msg.tokens.cache.read;
192
- calibrate(actualInput, msgEstimate);
213
+ calibrate(actualInput, msgEstimate, msg.sessionID, withParts.length);
193
214
  }
194
215
  }
195
216
  }
@@ -198,13 +219,59 @@ export const LorePlugin: Plugin = async (ctx) => {
198
219
  }
199
220
  }
200
221
 
222
+ if (event.type === "session.error") {
223
+ // Detect "prompt is too long" API errors and auto-recover:
224
+ // 1. Force the gradient transform to escalate on the next call (skip layer 0/1)
225
+ // 2. Force distillation to capture all temporal data before compaction
226
+ // 3. Trigger compaction so the session recovers without user intervention
227
+ const error = (event.properties as Record<string, unknown>).error as
228
+ | { name?: string; data?: { message?: string } }
229
+ | undefined;
230
+ const isPromptTooLong =
231
+ error?.name === "APIError" &&
232
+ typeof error?.data?.message === "string" &&
233
+ (error.data.message.includes("prompt is too long") ||
234
+ error.data.message.includes("context length exceeded") ||
235
+ error.data.message.includes("maximum context length"));
236
+
237
+ if (isPromptTooLong) {
238
+ const sessionID = (event.properties as Record<string, unknown>).sessionID as
239
+ | string
240
+ | undefined;
241
+ console.error(
242
+ `[lore] detected 'prompt too long' error — forcing distillation + compaction (session: ${sessionID?.substring(0, 16)})`,
243
+ );
244
+ // Force layer 2 on next transform — layers 0 and 1 were already too large.
245
+ setForceMinLayer(2);
246
+
247
+ if (sessionID) {
248
+ // Force distillation to capture all undistilled messages before
249
+ // compaction replaces the session message history.
250
+ await backgroundDistill(sessionID, true);
251
+
252
+ // Trigger compaction automatically — the compacting hook will inject
253
+ // Lore's custom distillation-aware prompt.
254
+ try {
255
+ const sessions = await ctx.client.session.list();
256
+ const session = sessions.data?.find((s) => s.id.startsWith(sessionID));
257
+ if (session) {
258
+ // providerID/modelID are optional — omit to use the session's current model
259
+ await ctx.client.session.summarize({ path: { id: session.id } });
260
+ }
261
+ } catch (e) {
262
+ console.error("[lore] auto-compaction failed:", e);
263
+ }
264
+ }
265
+ }
266
+ }
267
+
201
268
  if (event.type === "session.idle") {
202
269
  const sessionID = event.properties.sessionID;
203
270
  if (await shouldSkip(sessionID)) return;
204
271
  if (!activeSessions.has(sessionID)) return;
205
272
 
206
273
  // Run background distillation for any remaining undistilled messages
207
- backgroundDistill(sessionID);
274
+ await backgroundDistill(sessionID);
208
275
 
209
276
  // Run curator periodically
210
277
  const cfg = config();
@@ -212,9 +279,39 @@ export const LorePlugin: Plugin = async (ctx) => {
212
279
  cfg.curator.onIdle ||
213
280
  turnsSinceCuration >= cfg.curator.afterTurns
214
281
  ) {
215
- backgroundCurate(sessionID);
282
+ await backgroundCurate(sessionID);
216
283
  turnsSinceCuration = 0;
217
284
  }
285
+
286
+ // Prune temporal messages after distillation and curation have run.
287
+ // Pass 1: TTL — remove distilled messages older than retention period.
288
+ // Pass 2: Size cap — evict oldest distilled messages if over the limit.
289
+ // Undistilled messages are never touched.
290
+ try {
291
+ const { ttlDeleted, capDeleted } = temporal.prune({
292
+ projectPath,
293
+ retentionDays: cfg.pruning.retention,
294
+ maxStorageMB: cfg.pruning.maxStorage,
295
+ });
296
+ if (ttlDeleted > 0 || capDeleted > 0) {
297
+ console.error(
298
+ `[lore] pruned temporal messages: ${ttlDeleted} by TTL, ${capDeleted} by size cap`,
299
+ );
300
+ }
301
+ } catch (e) {
302
+ console.error("[lore] pruning error:", e);
303
+ }
304
+
305
+ // Export curated knowledge to AGENTS.md after distillation + curation.
306
+ try {
307
+ const agentsCfg = cfg.agentsFile;
308
+ if (agentsCfg.enabled) {
309
+ const filePath = `${projectPath}/${agentsCfg.path}`;
310
+ exportToFile({ projectPath, filePath });
311
+ }
312
+ } catch (e) {
313
+ console.error("[lore] agents-file export error:", e);
314
+ }
218
315
  }
219
316
  },
220
317
 
@@ -264,7 +361,9 @@ export const LorePlugin: Plugin = async (ctx) => {
264
361
  }
265
362
  },
266
363
 
267
- // Transform message history: distilled prefix + raw recent
364
+ // Transform message history: distilled prefix + raw recent.
365
+ // Layer 0 = passthrough (messages fit without compression) — output.messages
366
+ // is left untouched to preserve the append-only pattern for prompt caching.
268
367
  "experimental.chat.messages.transform": async (_input, output) => {
269
368
  if (!output.messages.length) return;
270
369
 
@@ -275,71 +374,44 @@ export const LorePlugin: Plugin = async (ctx) => {
275
374
  projectPath,
276
375
  sessionID,
277
376
  });
278
- while (
279
- result.messages.length > 0 &&
280
- result.messages.at(-1)!.info.role !== "user"
281
- ) {
282
- const last = result.messages.at(-1)!;
283
- if (last.parts.some((p) => p.type === "tool")) break;
284
- const dropped = result.messages.pop()!;
285
- console.error(
286
- "[lore] WARN: dropping trailing",
287
- dropped.info.role,
288
- "message to prevent prefill error. id:",
289
- dropped.info.id,
290
- );
377
+
378
+ // Only restructure messages when the gradient transform is active (layers 1-4).
379
+ // Layer 0 means all messages fit within the context budget — leave them alone
380
+ // so the append-only sequence stays intact for prompt caching.
381
+ if (result.layer > 0) {
382
+ while (
383
+ result.messages.length > 0 &&
384
+ result.messages.at(-1)!.info.role !== "user"
385
+ ) {
386
+ const last = result.messages.at(-1)!;
387
+ if (last.parts.some((p) => p.type === "tool")) break;
388
+ const dropped = result.messages.pop()!;
389
+ console.error(
390
+ "[lore] WARN: dropping trailing",
391
+ dropped.info.role,
392
+ "message to prevent prefill error. id:",
393
+ dropped.info.id,
394
+ );
395
+ }
396
+ output.messages.splice(0, output.messages.length, ...result.messages);
291
397
  }
292
- output.messages.splice(0, output.messages.length, ...result.messages);
293
398
 
294
399
  if (result.layer >= 2 && sessionID) {
295
400
  backgroundDistill(sessionID);
296
401
  }
297
-
298
- // Look up statsPart AFTER the transform so the PATCHed text is clean
299
- // (system-reminder wrappers stripped). Looking up before would persist
300
- // ephemeral system-reminder content, making it visible in the UI.
301
- const lastUserMsg = [...output.messages].reverse().find((m) => m.info.role === "user");
302
- const statsPart = lastUserMsg?.parts.find((p) => p.type === "text");
303
-
304
- if (sessionID && statsPart && lastUserMsg) {
305
- const loreMeta = {
306
- layer: result.layer,
307
- distilledTokens: result.distilledTokens,
308
- rawTokens: result.rawTokens,
309
- totalTokens: result.totalTokens,
310
- usable: result.usable,
311
- distilledBudget: result.distilledBudget,
312
- rawBudget: result.rawBudget,
313
- updatedAt: Date.now(),
314
- };
315
- // Use the SDK's internal HTTP client so the request goes through
316
- // the same base URL, custom fetch, and interceptors that OpenCode
317
- // configured — no dependency on ctx.serverUrl being reachable.
318
- const httpClient = (ctx.client as any)._client;
319
- httpClient.patch({
320
- url: "/session/{sessionID}/message/{messageID}/part/{partID}",
321
- path: {
322
- sessionID,
323
- messageID: lastUserMsg.info.id,
324
- partID: statsPart.id,
325
- },
326
- body: {
327
- ...(statsPart as Record<string, unknown>),
328
- metadata: {
329
- ...((statsPart as { metadata?: Record<string, unknown> }).metadata ?? {}),
330
- lore: loreMeta,
331
- },
332
- },
333
- headers: { "Content-Type": "application/json" },
334
- }).catch(() => {
335
- // Non-critical: gradient stats metadata is for UI display only.
336
- // Server may not be reachable (e.g. TUI-only mode). Silently ignore.
337
- });
338
- }
339
402
  },
340
403
 
341
- // Replace compaction prompt with distillation-aware prompt when manual /compact is used
404
+ // Replace compaction prompt with distillation-aware prompt when manual /compact is used.
405
+ // Also force distillation first so all temporal data is captured before compaction
406
+ // replaces the session message history.
342
407
  "experimental.session.compacting": async (input, output) => {
408
+ // Force distillation to capture any undistilled messages. This is critical:
409
+ // compaction will replace all messages with a summary, so we must persist
410
+ // everything to Lore's temporal store before that happens.
411
+ if (input.sessionID && activeSessions.has(input.sessionID)) {
412
+ await backgroundDistill(input.sessionID, true);
413
+ }
414
+
343
415
  const entries = ltm.forProject(projectPath, config().crossProject);
344
416
  const knowledge = entries.length
345
417
  ? formatKnowledge(
package/src/ltm.ts CHANGED
@@ -36,6 +36,33 @@ export function create(input: {
36
36
  input.scope === "project" && input.projectPath
37
37
  ? ensureProject(input.projectPath)
38
38
  : null;
39
+
40
+ // Dedup guard: if an entry with the same project_id + title already exists,
41
+ // update its content instead of inserting a duplicate. This prevents the
42
+ // curator from creating multiple entries for the same concept across sessions.
43
+ // Note: when an explicit id is provided (cross-machine import), skip dedup —
44
+ // the caller (importFromFile) already handles duplicate detection by UUID.
45
+ if (!input.id) {
46
+ const existing = (
47
+ pid !== null
48
+ ? db()
49
+ .query(
50
+ "SELECT id FROM knowledge WHERE project_id = ? AND LOWER(title) = LOWER(?) AND confidence > 0 LIMIT 1",
51
+ )
52
+ .get(pid, input.title)
53
+ : db()
54
+ .query(
55
+ "SELECT id FROM knowledge WHERE project_id IS NULL AND LOWER(title) = LOWER(?) AND confidence > 0 LIMIT 1",
56
+ )
57
+ .get(input.title)
58
+ ) as { id: string } | null;
59
+
60
+ if (existing) {
61
+ update(existing.id, { content: input.content });
62
+ return existing.id;
63
+ }
64
+ }
65
+
39
66
  const id = input.id ?? uuidv7();
40
67
  const now = Date.now();
41
68
  db()
package/src/prompt.ts CHANGED
@@ -176,18 +176,23 @@ ${entries.join("\n\n---\n\n")}`;
176
176
 
177
177
  export const CURATOR_SYSTEM = `You are a long-term memory curator. Your job is to extract durable knowledge from a conversation that should persist across sessions.
178
178
 
179
- Focus on knowledge that will remain true and useful beyond the current task:
180
- - User preferences and working style
181
- - Architectural decisions and their rationale
182
- - Project conventions and patterns
183
- - Environment setup details
184
- - Recurring gotchas or constraints
185
- - Important relationships between components
179
+ Focus ONLY on knowledge that helps a coding agent work effectively on THIS codebase:
180
+ - Architectural decisions and their rationale (why something was built a certain way)
181
+ - Non-obvious implementation patterns and conventions specific to the project
182
+ - Recurring gotchas, constraints, or traps in the codebase
183
+ - Environment/tooling setup details that affect development
184
+ - Important relationships between components that aren't obvious from reading the code
185
+ - User preferences and working style specific to how they use this project
186
186
 
187
187
  Do NOT extract:
188
188
  - Task-specific details (file currently being edited, current bug being fixed)
189
189
  - Temporary state (current branch, in-progress work)
190
190
  - Information that will change frequently
191
+ - Ecosystem descriptions, product announcements, or marketing content
192
+ - Business strategy, roadmap, or organizational information
193
+ - Information that's readily available in public documentation or READMEs
194
+ - Knowledge about unrelated projects or repositories unless explicitly cross-project
195
+ - Restatements of what the code obviously does (e.g. "the auth module handles authentication")
191
196
 
192
197
  BREVITY IS CRITICAL — each entry must be concise:
193
198
  - content MUST be under 500 words (roughly 2000 characters)
@@ -244,7 +249,10 @@ export function curatorUser(input: {
244
249
  ---
245
250
  Recent conversation to extract knowledge from:
246
251
 
247
- ${input.messages}`;
252
+ ${input.messages}
253
+
254
+ ---
255
+ IMPORTANT: If any new entries you would create are semantically duplicative of existing entries (same concept, different wording), prefer updating the existing entry rather than creating a new one. Only create new entries for genuinely distinct knowledge.`;
248
256
  }
249
257
 
250
258
  // Format distillations for injection into the message context.
package/src/temporal.ts CHANGED
@@ -228,3 +228,94 @@ export function undistilledCount(
228
228
  .get(...params) as { count: number }
229
229
  ).count;
230
230
  }
231
+
232
+ export type PruneResult = {
233
+ /** Rows deleted by the TTL pass (distilled=1 AND older than retention period). */
234
+ ttlDeleted: number;
235
+ /** Rows deleted by the size-cap pass (distilled=1, oldest-first, to get under maxStorage). */
236
+ capDeleted: number;
237
+ };
238
+
239
+ /**
240
+ * Prune temporal messages for a project using a two-pass Hybrid C strategy:
241
+ *
242
+ * Pass 1 — TTL: delete messages where distilled=1 AND created_at is older than
243
+ * retentionDays. This covers normal operation — both distillation and curation
244
+ * have had ample time to process anything that old.
245
+ *
246
+ * Pass 2 — Size cap: if total temporal storage for the project still exceeds
247
+ * maxStorageMB, delete the oldest distilled=1 messages (regardless of age)
248
+ * until under the cap.
249
+ *
250
+ * Invariant: undistilled messages (distilled=0) are NEVER deleted by either pass.
251
+ */
252
+ export function prune(input: {
253
+ projectPath: string;
254
+ retentionDays: number;
255
+ maxStorageMB: number;
256
+ }): PruneResult {
257
+ const database = db();
258
+ const pid = ensureProject(input.projectPath);
259
+ const cutoff = Date.now() - input.retentionDays * 24 * 60 * 60 * 1000;
260
+
261
+ // Pass 1: TTL — delete distilled messages older than the retention window.
262
+ // Note: result.changes is inflated by FTS trigger side-effects, so we count
263
+ // eligible rows before deletion to get the accurate number deleted.
264
+ const ttlEligible = (
265
+ database
266
+ .query(
267
+ "SELECT COUNT(*) as c FROM temporal_messages WHERE project_id = ? AND distilled = 1 AND created_at < ?",
268
+ )
269
+ .get(pid, cutoff) as { c: number }
270
+ ).c;
271
+ if (ttlEligible > 0) {
272
+ database
273
+ .query(
274
+ "DELETE FROM temporal_messages WHERE project_id = ? AND distilled = 1 AND created_at < ?",
275
+ )
276
+ .run(pid, cutoff);
277
+ }
278
+ const ttlDeleted = ttlEligible;
279
+
280
+ // Pass 2: Size cap — check if total storage for this project exceeds the
281
+ // limit and if so, evict the oldest distilled messages until under the cap.
282
+ const maxBytes = input.maxStorageMB * 1024 * 1024;
283
+ const totalBytes = (
284
+ database
285
+ .query("SELECT SUM(LENGTH(content)) as b FROM temporal_messages WHERE project_id = ?")
286
+ .get(pid) as { b: number | null }
287
+ ).b ?? 0;
288
+
289
+ let capDeleted = 0;
290
+ if (totalBytes > maxBytes) {
291
+ // Collect oldest distilled messages until we've accounted for enough bytes
292
+ // to drop below the cap. Delete them in a single batch.
293
+ const candidates = database
294
+ .query(
295
+ "SELECT id, LENGTH(content) as size FROM temporal_messages WHERE project_id = ? AND distilled = 1 ORDER BY created_at ASC",
296
+ )
297
+ .all(pid) as { id: string; size: number }[];
298
+
299
+ const toDelete: string[] = [];
300
+ let freed = 0;
301
+ const excess = totalBytes - maxBytes;
302
+ for (const row of candidates) {
303
+ if (freed >= excess) break;
304
+ toDelete.push(row.id);
305
+ freed += row.size;
306
+ }
307
+
308
+ if (toDelete.length) {
309
+ const placeholders = toDelete.map(() => "?").join(",");
310
+ database
311
+ .query(
312
+ `DELETE FROM temporal_messages WHERE id IN (${placeholders})`,
313
+ )
314
+ .run(...toDelete);
315
+ // toDelete.length is the accurate count — result.changes is inflated by FTS triggers.
316
+ capDeleted = toDelete.length;
317
+ }
318
+ }
319
+
320
+ return { ttlDeleted, capDeleted };
321
+ }