npm - opencode-lore - Versions diffs - 0.1.3 → 0.2.0 - Mend

opencode-lore 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "opencode-lore",
-  "version": "0.1.3",
+  "version": "0.2.0",
   "type": "module",
   "license": "MIT",
   "description": "Three-tier memory architecture for OpenCode — distillation, not summarization",
@@ -34,7 +34,7 @@
   ],
   "repository": {
     "type": "git",
-    "url": "https://github.com/BYK/opencode-lore.git"
+    "url": "git+https://github.com/BYK/opencode-lore.git"
   },
   "keywords": [
     "opencode",

package/src/agents-file.ts CHANGED Viewed

@@ -11,8 +11,7 @@
 import { existsSync, readFileSync, writeFileSync, mkdirSync } from "fs";
 import { dirname } from "path";
 import * as ltm from "./ltm";
-import { formatKnowledge } from "./prompt";
-import { unescapeMarkdown } from "./markdown";
+import { serialize, inline, h, ul, liph, strong, t, root, unescapeMarkdown } from "./markdown";
 // ---------------------------------------------------------------------------
 // Constants
@@ -158,23 +157,40 @@ function buildSection(projectPath: string): string {
   if (!entries.length) {
     return "\n";
   }
-  const formatted = formatKnowledge(
-    entries.map((e) => ({ category: e.category, title: e.title, content: e.content })),
-  );
-  if (!formatted) return "\n";
-  // Inject <!-- lore:UUID --> above each bullet line
-  const idByTitle = new Map(entries.map((e) => [e.title, e.id]));
-  const lines = formatted.split("\n");
+  // Group entries by category, preserving DB order (confidence DESC, updated_at DESC).
+  const grouped = new Map<string, typeof entries>();
+  for (const e of entries) {
+    const group = grouped.get(e.category) ?? [];
+    group.push(e);
+    grouped.set(e.category, group);
+  }
+  // Build the section body by iterating entries directly, emitting each entry
+  // with its own <!-- lore:UUID --> marker. This avoids the title-based Map
+  // deduplication bug where multiple entries with the same title all got the
+  // same UUID marker from the last Map.set() winner.
   const out: string[] = [""];
-  for (const line of lines) {
-    const bulletMatch = line.match(/^\*\s+\*\*(.+?)\*\*/);
-    if (bulletMatch) {
-      const id = idByTitle.get(bulletMatch[1]);
-      if (id) out.push(`<!-- lore:${id} -->`);
+  // Section heading
+  out.push("## Long-term Knowledge");
+  for (const [category, items] of grouped) {
+    out.push("");
+    out.push(`### ${category.charAt(0).toUpperCase() + category.slice(1)}`);
+    out.push("");
+    for (const entry of items) {
+      out.push(`<!-- lore:${entry.id} -->`);
+      // Render the bullet using remark serializer for proper markdown escaping.
+      // serialize(root(ul([liph(...)]))) produces "* **Title**: content\n".
+      // Trim the trailing newline since we join with \n ourselves.
+      const bullet = serialize(
+        root(ul([liph(strong(inline(entry.title)), t(": " + inline(entry.content)))]))
+      ).trimEnd();
+      out.push(bullet);
     }
-    out.push(line);
   }
   out.push("");
   return out.join("\n");
 }

package/src/config.ts CHANGED Viewed

@@ -30,6 +30,14 @@ export const LoreConfig = z.object({
       afterTurns: z.number().min(1).default(10),
     })
     .default({}),
+  pruning: z
+    .object({
+      /** Days to keep distilled temporal messages before pruning. Default: 120. */
+      retention: z.number().min(1).default(120),
+      /** Max total temporal_messages storage in MB before emergency pruning. Default: 1024 (1 GB). */
+      maxStorage: z.number().min(50).default(1024),
+    })
+    .default({}),
   crossProject: z.boolean().default(true),
   agentsFile: z
     .object({

package/src/db.ts CHANGED Viewed

@@ -2,7 +2,7 @@ import { Database } from "bun:sqlite";
 import { join } from "path";
 import { mkdirSync } from "fs";
-const SCHEMA_VERSION = 2;
+const SCHEMA_VERSION = 3;
 const MIGRATIONS: string[] = [
   `
@@ -124,6 +124,12 @@ const MIGRATIONS: string[] = [
   -- Version 2: Replace narrative+facts with observations text
   ALTER TABLE distillations ADD COLUMN observations TEXT NOT NULL DEFAULT '';
   `,
+  `
+  -- Version 3: One-time vacuum to reclaim accumulated free pages, and enable
+  -- incremental auto-vacuum so future deletes return pages to the OS.
+  -- VACUUM must run outside a transaction and cannot be in a multi-statement
+  -- exec, so it is handled specially in the migrate() function.
+  `,
 ];
 function dataDir() {
@@ -142,10 +148,17 @@ export function db(): Database {
   instance = new Database(path, { create: true });
   instance.exec("PRAGMA journal_mode = WAL");
   instance.exec("PRAGMA foreign_keys = ON");
+  // Store the free-page bookkeeping needed to hand pages back to the OS.
+  // NOTE: INCREMENTAL mode does not reclaim at commit (that is FULL mode);
+  // freed pages are only released when "PRAGMA incremental_vacuum" is run.
+  instance.exec("PRAGMA auto_vacuum = INCREMENTAL");
   migrate(instance);
   return instance;
 }
+// Index of the migration that performs a one-time VACUUM.
+// VACUUM cannot run inside a transaction, so migrate() handles it specially.
+const VACUUM_MIGRATION_INDEX = 2; // 0-based index of version-3 migration
 function migrate(database: Database) {
   const row = database
     .query(
@@ -161,7 +174,16 @@ function migrate(database: Database) {
     : 0;
   if (current >= MIGRATIONS.length) return;
   for (let i = current; i < MIGRATIONS.length; i++) {
-    database.exec(MIGRATIONS[i]);
+    if (i === VACUUM_MIGRATION_INDEX) {
+      // VACUUM cannot run inside a transaction. Run it directly.
+      // auto_vacuum mode must be set *before* VACUUM — SQLite bakes it into
+      // the file header during the rebuild. After this, every subsequent
+      // startup's "PRAGMA auto_vacuum = INCREMENTAL" is a no-op (already set).
+      database.exec("PRAGMA auto_vacuum = INCREMENTAL");
+      database.exec("VACUUM");
+    } else {
+      database.exec(MIGRATIONS[i]);
+    }
   }
   // Update version to latest. Migration 0 inserts version=1 via its own INSERT,
   // but subsequent migrations don't update it, so always normalize to MIGRATIONS.length.

package/src/gradient.ts CHANGED Viewed

@@ -40,12 +40,35 @@ const FIRST_TURN_OVERHEAD = 15_000;
 // Null = not yet calibrated (first turn). Updated after every assistant response.
 let calibratedOverhead: number | null = null;
+// --- Exact token tracking ---
+// Stores the real input token count from the last successful API response.
+// Used for the layer 0 passthrough decision: instead of estimating the full
+// message array with chars/4, we take the exact count from the previous turn
+// and only estimate the small delta (new messages). 99%+ of the count is
+// exact from the API's own tokenizer, virtually eliminating overflow errors.
+let lastKnownInput = 0;
+let lastKnownLtm = 0;
+let lastKnownSessionID: string | null = null;
+let lastKnownMessageCount = 0;
+// --- Force escalation ---
+// Set when the API returns "prompt is too long" — forces the transform to skip
+// layer 0 (and optionally layer 1) on the next call to ensure the context is
+// trimmed enough to fit. Cleared after one use (one-shot).
+let forceMinLayer: SafetyLayer = 0;
 // LTM tokens injected via system transform hook this turn.
 // Set by setLtmTokens() after the system hook runs; consumed by transform().
 let ltmTokens = 0;
 export function setModelLimits(limits: { context: number; output: number }) {
   contextLimit = limits.context || 200_000;
+  // NOTE: this cap of 32K matches what @ai-sdk/anthropic sends as max_tokens for
+  // claude-opus-4-6 (the SDK doesn't recognise the -6 variant and falls back to
+  // the generic claude-opus-4- pattern with maxOutputTokens=32K).  If the SDK is
+  // updated to send the model's actual limit (128K for opus-4-6), this cap will
+  // become wrong — the effective max input would drop from 168K to 72K but our
+  // budget would still assume 168K.  At that point, remove the cap.
   outputReserved = Math.min(limits.output || 32_000, 32_000);
 }
@@ -72,9 +95,22 @@ export function getLtmBudget(ltmFraction: number): number {
 }
 // Called after each assistant message completes with real token usage data.
-// actualInput = tokens.input + tokens.cache.read (all tokens that went into the model)
+// actualInput    = tokens.input + tokens.cache.read (all tokens the model saw)
 // messageEstimate = our chars/4 estimate of the messages we sent
-export function calibrate(actualInput: number, messageEstimate: number) {
+// sessionID      = session that produced this response (for exact-tracking validity)
+// messageCount   = number of messages that were sent (for delta estimation)
+export function calibrate(
+  actualInput: number,
+  messageEstimate: number,
+  sessionID?: string,
+  messageCount?: number,
+) {
+  // Store exact counts for the proactive layer 0 decision.
+  lastKnownInput = actualInput;
+  lastKnownLtm = ltmTokens;
+  if (sessionID !== undefined) lastKnownSessionID = sessionID;
+  if (messageCount !== undefined) lastKnownMessageCount = messageCount;
   const overhead = Math.max(0, actualInput - messageEstimate);
   // Smooth with EMA (alpha=0.3) once calibrated, or set directly on first call
   calibratedOverhead =
@@ -87,9 +123,23 @@ export function getOverhead(): number {
   return calibratedOverhead ?? FIRST_TURN_OVERHEAD;
 }
-// For testing only — reset calibration state
+/**
+ * Force the next transform() call to use at least the given layer.
+ * Called when the API returns "prompt is too long" so the next attempt
+ * trims the context enough to fit within the model's context window.
+ */
+export function setForceMinLayer(layer: SafetyLayer) {
+  forceMinLayer = layer;
+}
+// For testing only — reset all calibration and force-escalation state
 export function resetCalibration() {
   calibratedOverhead = null;
+  lastKnownInput = 0;
+  lastKnownLtm = 0;
+  lastKnownSessionID = null;
+  lastKnownMessageCount = 0;
+  forceMinLayer = 0;
 }
 type Distillation = {
@@ -317,16 +367,9 @@ function addRelativeTimeToObservations(text: string, now: Date): string {
   return result;
 }
-// Build a synthetic message pair containing the distilled history
-function distilledPrefix(distillations: Distillation[]): MessageWithParts[] {
-  if (!distillations.length) return [];
-  const now = new Date();
-  const annotated = distillations.map((d) => ({
-    ...d,
-    observations: addRelativeTimeToObservations(d.observations, now),
-  }));
-  const formatted = formatDistillations(annotated);
-  if (!formatted) return [];
+// Build synthetic user/assistant message pair wrapping formatted distillation text.
+// Shared by the cached and non-cached prefix paths.
+function buildPrefixMessages(formatted: string): MessageWithParts[] {
   return [
     {
       info: {
@@ -381,7 +424,252 @@ function distilledPrefix(distillations: Distillation[]): MessageWithParts[] {
   ];
 }
-export type SafetyLayer = 1 | 2 | 3 | 4;
+// Build a synthetic message pair containing the distilled history.
+// Non-cached path — used by layers 2-4 which already cause full cache invalidation.
+function distilledPrefix(distillations: Distillation[]): MessageWithParts[] {
+  if (!distillations.length) return [];
+  const now = new Date();
+  const annotated = distillations.map((d) => ({
+    ...d,
+    observations: addRelativeTimeToObservations(d.observations, now),
+  }));
+  const formatted = formatDistillations(annotated);
+  if (!formatted) return [];
+  return buildPrefixMessages(formatted);
+}
+// --- Approach C: Append-only distillation prefix cache ---
+//
+// Caches the rendered prefix text per session. When new distillations arrive,
+// only renders the new rows and appends them to the cached text. This keeps
+// the prefix byte-identical between distillation runs, preserving the prompt
+// cache. Only meta-distillation (which rewrites gen-0 rows into gen-1) causes
+// a full re-render — and that happens roughly every 80-100 turns.
+type PrefixCache = {
+  /** The session this cache belongs to */
+  sessionID: string;
+  /** ID of the last distillation row included in the cached text */
+  lastDistillationID: string;
+  /** Number of rows that produced the cached text */
+  rowCount: number;
+  /** The rendered text (used to build delta appends) */
+  cachedText: string;
+  /** Ready-to-use message pair */
+  prefixMessages: MessageWithParts[];
+  /** Token estimate of prefixMessages */
+  prefixTokens: number;
+};
+let prefixCache: PrefixCache | null = null;
+/**
+ * Return the distilled prefix messages, reusing cached content when possible.
+ *
+ * Cache hit  — no new rows: returns the exact same prefixMessages object
+ *              (byte-identical content, prompt cache preserved).
+ * Cache miss — new rows appended: renders only the delta, appends to cached
+ *              text, updates cache.
+ * Full reset — session changed, or rows were rewritten by meta-distillation:
+ *              renders everything from scratch.
+ */
+function distilledPrefixCached(
+  distillations: Distillation[],
+  sessionID: string,
+): { messages: MessageWithParts[]; tokens: number } {
+  if (!distillations.length) {
+    prefixCache = null;
+    return { messages: [], tokens: 0 };
+  }
+  const lastRow = distillations[distillations.length - 1];
+  // Cache is valid when: same session, row count only grew (no rewrites),
+  // and the last previously-cached row still exists at the same position.
+  const cacheValid =
+    prefixCache !== null &&
+    prefixCache.sessionID === sessionID &&
+    prefixCache.rowCount <= distillations.length &&
+    (prefixCache.rowCount === 0 ||
+      distillations[prefixCache.rowCount - 1]?.id ===
+        prefixCache.lastDistillationID);
+  if (cacheValid) {
+    if (prefixCache!.lastDistillationID === lastRow.id) {
+      // No new rows — return cached prefix as-is (byte-identical for prompt cache)
+      return {
+        messages: prefixCache!.prefixMessages,
+        tokens: prefixCache!.prefixTokens,
+      };
+    }
+    // New rows appended — render only the delta and append to cached text
+    const newRows = distillations.slice(prefixCache!.rowCount);
+    const now = new Date();
+    const annotated = newRows.map((d) => ({
+      ...d,
+      observations: addRelativeTimeToObservations(d.observations, now),
+    }));
+    const deltaText = formatDistillations(annotated);
+    if (deltaText) {
+      const fullText = prefixCache!.cachedText + "\n\n" + deltaText;
+      const messages = buildPrefixMessages(fullText);
+      const tokens = messages.reduce((sum, m) => sum + estimateMessage(m), 0);
+      prefixCache = {
+        sessionID,
+        lastDistillationID: lastRow.id,
+        rowCount: distillations.length,
+        cachedText: fullText,
+        prefixMessages: messages,
+        prefixTokens: tokens,
+      };
+      return { messages, tokens };
+    }
+  }
+  // Full re-render: first call, session change, or meta-distillation rewrote rows
+  const now = new Date();
+  const annotated = distillations.map((d) => ({
+    ...d,
+    observations: addRelativeTimeToObservations(d.observations, now),
+  }));
+  const fullText = formatDistillations(annotated);
+  if (!fullText) {
+    prefixCache = null;
+    return { messages: [], tokens: 0 };
+  }
+  const messages = buildPrefixMessages(fullText);
+  const tokens = messages.reduce((sum, m) => sum + estimateMessage(m), 0);
+  prefixCache = {
+    sessionID,
+    lastDistillationID: lastRow.id,
+    rowCount: distillations.length,
+    cachedText: fullText,
+    prefixMessages: messages,
+    prefixTokens: tokens,
+  };
+  return { messages, tokens };
+}
+// For testing only — reset prefix cache state
+export function resetPrefixCache() {
+  prefixCache = null;
+}
+// --- Approach B: Lazy raw window eviction ---
+//
+// Tracks the ID of the first (oldest) message in the previous raw window.
+// On the next turn, if the window starting at that message still fits within
+// the raw budget, the cutoff is pinned — no messages are evicted and the raw
+// window stays byte-identical for caching purposes. Only when the pinned
+// window no longer fits (e.g. a large tool response pushed us over) is the
+// cutoff recomputed by the normal backward scan (tryFit) and re-pinned.
+//
+// This eliminates the "window sliding on every turn" problem that was the
+// dominant source of cache misses in gradient mode: each new turn appends a
+// message to the conversation, but the start of the raw window only moves
+// when it must.
+//
+// Reset conditions: session changes, or layer escalates to 2+ (the pinned
+// window was too large even with stripping — something genuinely changed).
+type RawWindowCache = {
+  sessionID: string;
+  /** ID of the first message in the pinned raw window */
+  firstMessageID: string;
+};
+let rawWindowCache: RawWindowCache | null = null;
+export function resetRawWindowCache() {
+  rawWindowCache = null;
+}
+/**
+ * Layer-1 tryFit with lazy eviction.
+ *
+ * Attempts to reuse the previous raw window cutoff before falling back to a
+ * full backward scan. If the pinned window fits, returns it unchanged (same
+ * message objects, byte-identical for prompt caching). If it doesn't fit,
+ * delegates to the normal tryFit which finds the new minimal cutoff and
+ * updates the cache.
+ */
+function tryFitStable(input: {
+  messages: MessageWithParts[];
+  prefix: MessageWithParts[];
+  prefixTokens: number;
+  distilledBudget: number;
+  rawBudget: number;
+  sessionID: string;
+}): Omit<TransformResult, "layer" | "usable" | "distilledBudget" | "rawBudget"> | null {
+  // If the prefix already overflows its budget there's no point trying.
+  if (input.prefixTokens > input.distilledBudget && input.prefix.length > 0)
+    return null;
+  const cacheValid =
+    rawWindowCache !== null && rawWindowCache.sessionID === input.sessionID;
+  if (cacheValid) {
+    const pinnedIdx = input.messages.findIndex(
+      (m) => m.info.id === rawWindowCache!.firstMessageID,
+    );
+    if (pinnedIdx !== -1) {
+      // Measure the token cost of the pinned window.
+      const pinnedWindow = input.messages.slice(pinnedIdx);
+      const pinnedTokens = pinnedWindow.reduce(
+        (sum, m) => sum + estimateMessage(m),
+        0,
+      );
+      if (pinnedTokens <= input.rawBudget) {
+        // Pinned window still fits — keep it. Apply system-reminder cleanup
+        // only (strip:"none" is the layer-1 mode), returning the same message
+        // object references wherever nothing changed.
+        const processed = pinnedWindow.map((msg) => {
+          const parts = cleanParts(msg.parts);
+          return parts !== msg.parts ? { info: msg.info, parts } : msg;
+        });
+        const total = input.prefixTokens + pinnedTokens;
+        return {
+          messages: [...input.prefix, ...processed],
+          distilledTokens: input.prefixTokens,
+          rawTokens: pinnedTokens,
+          totalTokens: total,
+        };
+      }
+      // Pinned window is too large — fall through to the normal scan below.
+    }
+  }
+  // Normal backward scan to find the tightest fitting cutoff.
+  const result = tryFit({
+    messages: input.messages,
+    prefix: input.prefix,
+    prefixTokens: input.prefixTokens,
+    distilledBudget: input.distilledBudget,
+    rawBudget: input.rawBudget,
+    strip: "none",
+  });
+  if (result) {
+    // Update the raw window cache: the first non-prefix message is the oldest
+    // raw message in the new window. Pin to its ID for the next turn.
+    const rawStart = result.messages[input.prefix.length];
+    if (rawStart) {
+      rawWindowCache = {
+        sessionID: input.sessionID,
+        firstMessageID: rawStart.info.id,
+      };
+    }
+  }
+  return result;
+}
+export type SafetyLayer = 0 | 1 | 2 | 3 | 4;
 export type TransformResult = {
   messages: MessageWithParts[];
@@ -419,36 +707,115 @@ export function transform(input: {
   const distilledBudget = Math.floor(usable * cfg.budget.distilled);
   const rawBudget = Math.floor(usable * cfg.budget.raw);
-  // Find the session ID from messages
+  // --- Force escalation (reactive error recovery) ---
+  // When the API previously rejected with "prompt is too long", skip layers
+  // below the forced minimum to ensure enough trimming on the next attempt.
+  // One-shot: consumed here and reset to 0.
+  const effectiveMinLayer = forceMinLayer;
+  forceMinLayer = 0;
+  // --- Approach A: Cache-preserving passthrough ---
+  // Use exact token count from the previous API response when available.
+  // Only the delta (messages added since last call) uses chars/4 estimation,
+  // making the layer-0 decision 99%+ accurate from the API's own tokenizer.
+  // maxInput = absolute ceiling the API enforces: input_tokens + max_tokens <= context
+  const maxInput = contextLimit - outputReserved;
   const sid = input.sessionID ?? input.messages[0]?.info.sessionID;
+  let expectedInput: number;
+  if (lastKnownInput > 0 && sid === lastKnownSessionID) {
+    // Exact approach: prior API count + estimate of only the new messages.
+    const newMsgCount = Math.max(0, input.messages.length - lastKnownMessageCount);
+    const newMsgTokens = newMsgCount > 0
+      ? input.messages.slice(-newMsgCount).reduce((s, m) => s + estimateMessage(m), 0)
+      : 0;
+    const ltmDelta = ltmTokens - lastKnownLtm;
+    expectedInput = lastKnownInput + newMsgTokens + ltmDelta;
+  } else {
+    // First turn or session change: fall back to chars/4 + overhead.
+    const messageTokens = input.messages.reduce((s, m) => s + estimateMessage(m), 0);
+    expectedInput = messageTokens + overhead + ltmTokens;
+  }
+  if (effectiveMinLayer === 0 && expectedInput <= maxInput) {
+    // All messages fit — return unmodified to preserve append-only prompt-cache pattern.
+    // Raw messages are strictly better context than lossy distilled summaries.
+    const messageTokens = lastKnownInput > 0 && sid === lastKnownSessionID
+      ? expectedInput - (ltmTokens - lastKnownLtm)  // approximate raw portion
+      : expectedInput - overhead - ltmTokens;
+    return {
+      messages: input.messages,
+      layer: 0,
+      distilledTokens: 0,
+      rawTokens: Math.max(0, messageTokens),
+      totalTokens: Math.max(0, messageTokens),
+      usable,
+      distilledBudget,
+      rawBudget,
+    };
+  }
+  // --- Gradient mode: context exhausted (or force-escalated), compress older messages ---
   const distillations = sid ? loadDistillations(input.projectPath, sid) : [];
-  const prefix = distilledPrefix(distillations);
-  const prefixTokens = prefix.reduce((sum, m) => sum + estimateMessage(m), 0);
-  // Layer 1: Normal budget allocation
-  const layer1 = tryFit({
-    messages: input.messages,
-    prefix,
-    prefixTokens,
-    distilledBudget,
-    rawBudget,
-    strip: "none",
-  });
-  if (layer1) return { ...layer1, layer: 1, usable, distilledBudget, rawBudget };
+  // Layer 1 uses the append-only cached prefix (Approach C) to keep the
+  // distilled content byte-identical between distillation runs, preserving
+  // the prompt cache. Layers 2-4 already cause full cache invalidation via
+  // tool stripping / message restructuring, so they use the non-cached path.
+  const cached = sid
+    ? distilledPrefixCached(distillations, sid)
+    : (() => {
+        const msgs = distilledPrefix(distillations);
+        return { messages: msgs, tokens: msgs.reduce((sum, m) => sum + estimateMessage(m), 0) };
+      })();
+  // Layer 1: Normal budget allocation with lazy raw window eviction (Approach B).
+  // tryFitStable reuses the previous cutoff when it still fits, keeping the raw
+  // window byte-identical across turns for prompt caching. Only advances the
+  // cutoff when a genuinely oversized message forces eviction.
+  // Skipped when force-escalated to layer 2+ (previous attempt already failed at this level).
+  if (effectiveMinLayer <= 1) {
+    const layer1 = sid
+      ? tryFitStable({
+          messages: input.messages,
+          prefix: cached.messages,
+          prefixTokens: cached.tokens,
+          distilledBudget,
+          rawBudget,
+          sessionID: sid,
+        })
+      : tryFit({
+          messages: input.messages,
+          prefix: cached.messages,
+          prefixTokens: cached.tokens,
+          distilledBudget,
+          rawBudget,
+          strip: "none",
+        });
+    if (layer1) return { ...layer1, layer: 1, usable, distilledBudget, rawBudget };
+  }
+  // Layer 1 didn't fit (or was force-skipped) — reset the raw window cache.
+  // Layers 2-4 use full scans and already break the prompt cache.
+  rawWindowCache = null;
   // Layer 2: Strip tool outputs from older messages, keep last 2 turns
-  const layer2 = tryFit({
-    messages: input.messages,
-    prefix,
-    prefixTokens,
-    distilledBudget,
-    rawBudget: Math.floor(usable * 0.5), // give raw more room
-    strip: "old-tools",
-    protectedTurns: 2,
-  });
-  if (layer2) {
-    urgentDistillation = true;
-    return { ...layer2, layer: 2, usable, distilledBudget, rawBudget };
+  // Skipped when force-escalated to layer 3+.
+  if (effectiveMinLayer <= 2) {
+    const layer2 = tryFit({
+      messages: input.messages,
+      prefix: cached.messages,
+      prefixTokens: cached.tokens,
+      distilledBudget,
+      rawBudget: Math.floor(usable * 0.5), // give raw more room
+      strip: "old-tools",
+      protectedTurns: 2,
+    });
+    if (layer2) {
+      urgentDistillation = true;
+      return { ...layer2, layer: 2, usable, distilledBudget, rawBudget };
+    }
   }
   // Layer 3: Strip ALL tool outputs, drop oldest distillations

package/src/index.ts CHANGED Viewed

@@ -13,9 +13,11 @@ import {
   estimateMessages,
   setLtmTokens,
   getLtmBudget,
+  setForceMinLayer,
 } from "./gradient";
 import { formatKnowledge } from "./prompt";
 import { createRecallTool } from "./reflect";
+import { shouldImport, importFromFile, exportToFile } from "./agents-file";
 export const LorePlugin: Plugin = async (ctx) => {
   const projectPath = ctx.worktree || ctx.directory;
@@ -33,6 +35,23 @@ export const LorePlugin: Plugin = async (ctx) => {
     }).catch(() => {});
   }
+  // Import from AGENTS.md at startup if it has changed since last export
+  // (hand-written entries, edits from other machines, or merge conflicts).
+  {
+    const cfg = config();
+    if (cfg.agentsFile.enabled) {
+      const filePath = `${projectPath}/${cfg.agentsFile.path}`;
+      if (shouldImport({ projectPath, filePath })) {
+        try {
+          importFromFile({ projectPath, filePath });
+          console.error("[lore] imported knowledge from", cfg.agentsFile.path);
+        } catch (e) {
+          console.error("[lore] agents-file import error:", e);
+        }
+      }
+    }
+  }
   // Prune any corrupted/oversized knowledge entries left by the AGENTS.md
   // backslash-escaping bug or curator hallucinations. Sets confidence → 0
   // (below the 0.2 query threshold) so they stop polluting the context.
@@ -179,7 +198,9 @@ export const LorePlugin: Plugin = async (ctx) => {
                 backgroundDistill(msg.sessionID);
               }
-              // Calibrate overhead estimate using real token counts
+              // Calibrate overhead estimate using real token counts.
+              // Also store the exact input count + message count for the proactive
+              // layer-0 decision (avoids full chars/4 re-estimation each turn).
               const allMsgs = await ctx.client.session.messages({
                 path: { id: msg.sessionID },
               });
@@ -189,7 +210,7 @@ export const LorePlugin: Plugin = async (ctx) => {
                   .map((m) => ({ info: m.info, parts: m.parts }));
                 const msgEstimate = estimateMessages(withParts);
                 const actualInput = msg.tokens.input + msg.tokens.cache.read;
-                calibrate(actualInput, msgEstimate);
+                calibrate(actualInput, msgEstimate, msg.sessionID, withParts.length);
               }
             }
           }
@@ -198,13 +219,59 @@ export const LorePlugin: Plugin = async (ctx) => {
         }
       }
+      if (event.type === "session.error") {
+        // Detect "prompt is too long" API errors and auto-recover:
+        // 1. Force the gradient transform to escalate on the next call (skip layer 0/1)
+        // 2. Force distillation to capture all temporal data before compaction
+        // 3. Trigger compaction so the session recovers without user intervention
+        const error = (event.properties as Record<string, unknown>).error as
+          | { name?: string; data?: { message?: string } }
+          | undefined;
+        const isPromptTooLong =
+          error?.name === "APIError" &&
+          typeof error?.data?.message === "string" &&
+          (error.data.message.includes("prompt is too long") ||
+            error.data.message.includes("context length exceeded") ||
+            error.data.message.includes("maximum context length"));
+        if (isPromptTooLong) {
+          const sessionID = (event.properties as Record<string, unknown>).sessionID as
+            | string
+            | undefined;
+          console.error(
+            `[lore] detected 'prompt too long' error — forcing distillation + compaction (session: ${sessionID?.substring(0, 16)})`,
+          );
+          // Force layer 2 on next transform — layers 0 and 1 were already too large.
+          setForceMinLayer(2);
+          if (sessionID) {
+            // Force distillation to capture all undistilled messages before
+            // compaction replaces the session message history.
+            await backgroundDistill(sessionID, true);
+            // Trigger compaction automatically — the compacting hook will inject
+            // Lore's custom distillation-aware prompt.
+            try {
+              const sessions = await ctx.client.session.list();
+              const session = sessions.data?.find((s) => s.id.startsWith(sessionID));
+              if (session) {
+                // providerID/modelID are optional — omit to use the session's current model
+                await ctx.client.session.summarize({ path: { id: session.id } });
+              }
+            } catch (e) {
+              console.error("[lore] auto-compaction failed:", e);
+            }
+          }
+        }
+      }
       if (event.type === "session.idle") {
         const sessionID = event.properties.sessionID;
         if (await shouldSkip(sessionID)) return;
         if (!activeSessions.has(sessionID)) return;
         // Run background distillation for any remaining undistilled messages
-        backgroundDistill(sessionID);
+        await backgroundDistill(sessionID);
         // Run curator periodically
         const cfg = config();
@@ -212,9 +279,39 @@ export const LorePlugin: Plugin = async (ctx) => {
           cfg.curator.onIdle ||
           turnsSinceCuration >= cfg.curator.afterTurns
         ) {
-          backgroundCurate(sessionID);
+          await backgroundCurate(sessionID);
           turnsSinceCuration = 0;
         }
+        // Prune temporal messages after distillation and curation have run.
+        // Pass 1: TTL — remove distilled messages older than retention period.
+        // Pass 2: Size cap — evict oldest distilled messages if over the limit.
+        // Undistilled messages are never touched.
+        try {
+          const { ttlDeleted, capDeleted } = temporal.prune({
+            projectPath,
+            retentionDays: cfg.pruning.retention,
+            maxStorageMB: cfg.pruning.maxStorage,
+          });
+          if (ttlDeleted > 0 || capDeleted > 0) {
+            console.error(
+              `[lore] pruned temporal messages: ${ttlDeleted} by TTL, ${capDeleted} by size cap`,
+            );
+          }
+        } catch (e) {
+          console.error("[lore] pruning error:", e);
+        }
+        // Export curated knowledge to AGENTS.md after distillation + curation.
+        try {
+          const agentsCfg = cfg.agentsFile;
+          if (agentsCfg.enabled) {
+            const filePath = `${projectPath}/${agentsCfg.path}`;
+            exportToFile({ projectPath, filePath });
+          }
+        } catch (e) {
+          console.error("[lore] agents-file export error:", e);
+        }
       }
     },
@@ -264,7 +361,9 @@ export const LorePlugin: Plugin = async (ctx) => {
       }
     },
-    // Transform message history: distilled prefix + raw recent
+    // Transform message history: distilled prefix + raw recent.
+    // Layer 0 = passthrough (messages fit without compression) — output.messages
+    // is left untouched to preserve the append-only pattern for prompt caching.
     "experimental.chat.messages.transform": async (_input, output) => {
       if (!output.messages.length) return;
@@ -275,66 +374,44 @@ export const LorePlugin: Plugin = async (ctx) => {
         projectPath,
         sessionID,
       });
-      while (
-        result.messages.length > 0 &&
-        result.messages.at(-1)!.info.role !== "user"
-      ) {
-        const last = result.messages.at(-1)!;
-        if (last.parts.some((p) => p.type === "tool")) break;
-        const dropped = result.messages.pop()!;
-        console.error(
-          "[lore] WARN: dropping trailing",
-          dropped.info.role,
-          "message to prevent prefill error. id:",
-          dropped.info.id,
-        );
+      // Only restructure messages when the gradient transform is active (layers 1-4).
+      // Layer 0 means all messages fit within the context budget — leave them alone
+      // so the append-only sequence stays intact for prompt caching.
+      if (result.layer > 0) {
+        while (
+          result.messages.length > 0 &&
+          result.messages.at(-1)!.info.role !== "user"
+        ) {
+          const last = result.messages.at(-1)!;
+          if (last.parts.some((p) => p.type === "tool")) break;
+          const dropped = result.messages.pop()!;
+          console.error(
+            "[lore] WARN: dropping trailing",
+            dropped.info.role,
+            "message to prevent prefill error. id:",
+            dropped.info.id,
+          );
+        }
+        output.messages.splice(0, output.messages.length, ...result.messages);
       }
-      output.messages.splice(0, output.messages.length, ...result.messages);
       if (result.layer >= 2 && sessionID) {
         backgroundDistill(sessionID);
       }
-      // Look up statsPart AFTER the transform so the PATCHed text is clean
-      // (system-reminder wrappers stripped). Looking up before would persist
-      // ephemeral system-reminder content, making it visible in the UI.
-      const lastUserMsg = [...output.messages].reverse().find((m) => m.info.role === "user");
-      const statsPart = lastUserMsg?.parts.find((p) => p.type === "text");
-      if (sessionID && statsPart && lastUserMsg) {
-        const loreMeta = {
-          layer: result.layer,
-          distilledTokens: result.distilledTokens,
-          rawTokens: result.rawTokens,
-          totalTokens: result.totalTokens,
-          usable: result.usable,
-          distilledBudget: result.distilledBudget,
-          rawBudget: result.rawBudget,
-          updatedAt: Date.now(),
-        };
-        const url = new URL(
-          `/session/${sessionID}/message/${lastUserMsg.info.id}/part/${statsPart.id}`,
-          ctx.serverUrl,
-        );
-        const updatedPart = {
-          ...(statsPart as Record<string, unknown>),
-          metadata: {
-            ...((statsPart as { metadata?: Record<string, unknown> }).metadata ?? {}),
-            lore: loreMeta,
-          },
-        };
-        fetch(url, {
-          method: "PATCH",
-          headers: { "Content-Type": "application/json" },
-          body: JSON.stringify(updatedPart),
-        }).catch((e: unknown) => {
-          console.error("[lore] failed to write gradient stats to part metadata:", e);
-        });
-      }
     },
-    // Replace compaction prompt with distillation-aware prompt when manual /compact is used
+    // Replace compaction prompt with distillation-aware prompt when manual /compact is used.
+    // Also force distillation first so all temporal data is captured before compaction
+    // replaces the session message history.
     "experimental.session.compacting": async (input, output) => {
+      // Force distillation to capture any undistilled messages. This is critical:
+      // compaction will replace all messages with a summary, so we must persist
+      // everything to Lore's temporal store before that happens.
+      if (input.sessionID && activeSessions.has(input.sessionID)) {
+        await backgroundDistill(input.sessionID, true);
+      }
       const entries = ltm.forProject(projectPath, config().crossProject);
       const knowledge = entries.length
         ? formatKnowledge(

package/src/ltm.ts CHANGED Viewed

@@ -36,6 +36,33 @@ export function create(input: {
     input.scope === "project" && input.projectPath
       ? ensureProject(input.projectPath)
       : null;
+  // Dedup guard: if an entry with the same project_id + title already exists,
+  // update its content instead of inserting a duplicate. This prevents the
+  // curator from creating multiple entries for the same concept across sessions.
+  // Note: when an explicit id is provided (cross-machine import), skip dedup —
+  // the caller (importFromFile) already handles duplicate detection by UUID.
+  if (!input.id) {
+    const existing = (
+      pid !== null
+        ? db()
+            .query(
+              "SELECT id FROM knowledge WHERE project_id = ? AND LOWER(title) = LOWER(?) AND confidence > 0 LIMIT 1",
+            )
+            .get(pid, input.title)
+        : db()
+            .query(
+              "SELECT id FROM knowledge WHERE project_id IS NULL AND LOWER(title) = LOWER(?) AND confidence > 0 LIMIT 1",
+            )
+            .get(input.title)
+    ) as { id: string } | null;
+    if (existing) {
+      update(existing.id, { content: input.content });
+      return existing.id;
+    }
+  }
   const id = input.id ?? uuidv7();
   const now = Date.now();
   db()

package/src/prompt.ts CHANGED Viewed

@@ -176,18 +176,23 @@ ${entries.join("\n\n---\n\n")}`;
 export const CURATOR_SYSTEM = `You are a long-term memory curator. Your job is to extract durable knowledge from a conversation that should persist across sessions.
-Focus on knowledge that will remain true and useful beyond the current task:
-- User preferences and working style
-- Architectural decisions and their rationale
-- Project conventions and patterns
-- Environment setup details
-- Recurring gotchas or constraints
-- Important relationships between components
+Focus ONLY on knowledge that helps a coding agent work effectively on THIS codebase:
+- Architectural decisions and their rationale (why something was built a certain way)
+- Non-obvious implementation patterns and conventions specific to the project
+- Recurring gotchas, constraints, or traps in the codebase
+- Environment/tooling setup details that affect development
+- Important relationships between components that aren't obvious from reading the code
+- User preferences and working style specific to how they use this project
 Do NOT extract:
 - Task-specific details (file currently being edited, current bug being fixed)
 - Temporary state (current branch, in-progress work)
 - Information that will change frequently
+- Ecosystem descriptions, product announcements, or marketing content
+- Business strategy, roadmap, or organizational information
+- Information that's readily available in public documentation or READMEs
+- Knowledge about unrelated projects or repositories unless explicitly cross-project
+- Restatements of what the code obviously does (e.g. "the auth module handles authentication")
 BREVITY IS CRITICAL — each entry must be concise:
 - content MUST be under 500 words (roughly 2000 characters)
@@ -244,7 +249,10 @@ export function curatorUser(input: {
 ---
 Recent conversation to extract knowledge from:
-${input.messages}`;
+${input.messages}
+---
+IMPORTANT: If any new entries you would create are semantically duplicative of existing entries (same concept, different wording), prefer updating the existing entry rather than creating a new one. Only create new entries for genuinely distinct knowledge.`;
 }
 // Format distillations for injection into the message context.

package/src/temporal.ts CHANGED Viewed

@@ -228,3 +228,94 @@ export function undistilledCount(
       .get(...params) as { count: number }
   ).count;
 }
+export type PruneResult = {
+  /** Number of rows removed by the TTL pass (distilled=1 AND created_at older than the retention cutoff). */
+  ttlDeleted: number;
+  /** Number of rows removed by the size-cap pass (distilled=1, oldest-first, until under maxStorageMB). */
+  capDeleted: number;
+};
+/**
+ * Prune temporal messages for a project using a two-pass strategy:
+ *
+ * Pass 1 — TTL: delete messages where distilled=1 AND created_at is older than
+ * retentionDays. This covers normal operation — both distillation and curation
+ * have had ample time to process anything that old.
+ *
+ * Pass 2 — Size cap: if total temporal storage for the project still exceeds
+ * maxStorageMB, delete the oldest distilled=1 messages (regardless of age)
+ * until under the cap, or until no distilled messages remain.
+ *
+ * Invariant: undistilled messages (distilled=0) are NEVER deleted by either pass.
+ * Consequently the project may stay above the cap when most of its storage is
+ * undistilled content.
+ *
+ * @param input.projectPath   Project whose temporal messages are pruned.
+ * @param input.retentionDays Age in days after which distilled messages are
+ *                            removed by the TTL pass.
+ * @param input.maxStorageMB  Upper bound, in mebibytes of message content, that
+ *                            the size-cap pass tries to enforce.
+ * @returns The number of rows removed by each pass.
+ */
+export function prune(input: {
+  projectPath: string;
+  retentionDays: number;
+  maxStorageMB: number;
+}): PruneResult {
+  // Upper bound on ids bound into a single DELETE statement. SQLite limits the
+  // number of host parameters per statement, so large evictions are split into
+  // batches that stay well below that limit.
+  const DELETE_BATCH_SIZE = 500;
+  const database = db();
+  const pid = ensureProject(input.projectPath);
+  const cutoff = Date.now() - input.retentionDays * 24 * 60 * 60 * 1000;
+  // Pass 1: TTL — delete distilled messages older than the retention window.
+  // Note: result.changes is inflated by FTS trigger side-effects, so we count
+  // eligible rows before deletion to get the accurate number deleted.
+  const ttlEligible = (
+    database
+      .query(
+        "SELECT COUNT(*) as c FROM temporal_messages WHERE project_id = ? AND distilled = 1 AND created_at < ?",
+      )
+      .get(pid, cutoff) as { c: number }
+  ).c;
+  if (ttlEligible > 0) {
+    database
+      .query(
+        "DELETE FROM temporal_messages WHERE project_id = ? AND distilled = 1 AND created_at < ?",
+      )
+      .run(pid, cutoff);
+  }
+  const ttlDeleted = ttlEligible;
+  // Pass 2: Size cap — check if total storage for this project exceeds the
+  // limit and if so, evict the oldest distilled messages until under the cap.
+  // LENGTH() on a TEXT value counts characters, not bytes, so the content is
+  // cast to BLOB first: the cap is expressed in megabytes and multi-byte UTF-8
+  // text would otherwise be under-counted.
+  const maxBytes = input.maxStorageMB * 1024 * 1024;
+  const totalBytes = (
+    database
+      .query(
+        "SELECT SUM(LENGTH(CAST(content AS BLOB))) as b FROM temporal_messages WHERE project_id = ?",
+      )
+      .get(pid) as { b: number | null }
+  ).b ?? 0;
+  let capDeleted = 0;
+  if (totalBytes > maxBytes) {
+    // Collect oldest distilled messages until we've accounted for enough bytes
+    // to drop below the cap. COALESCE keeps a NULL content from poisoning the
+    // running total.
+    const candidates = database
+      .query(
+        "SELECT id, COALESCE(LENGTH(CAST(content AS BLOB)), 0) as size FROM temporal_messages WHERE project_id = ? AND distilled = 1 ORDER BY created_at ASC",
+      )
+      .all(pid) as { id: string; size: number }[];
+    const toDelete: string[] = [];
+    let freed = 0;
+    const excess = totalBytes - maxBytes;
+    for (const row of candidates) {
+      if (freed >= excess) break;
+      toDelete.push(row.id);
+      freed += row.size;
+    }
+    // Delete in bounded batches so a large eviction cannot exceed SQLite's
+    // per-statement bound-parameter limit.
+    for (let start = 0; start < toDelete.length; start += DELETE_BATCH_SIZE) {
+      const batch = toDelete.slice(start, start + DELETE_BATCH_SIZE);
+      const placeholders = batch.map(() => "?").join(",");
+      database
+        .query(
+          `DELETE FROM temporal_messages WHERE id IN (${placeholders})`,
+        )
+        .run(...batch);
+    }
+    // toDelete.length is the accurate count — result.changes is inflated by FTS triggers.
+    capDeleted = toDelete.length;
+  }
+  return { ttlDeleted, capDeleted };
+}