npm - opencode-lore - Versions diffs - 0.2.0 → 0.2.2 - Mend

opencode-lore 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "opencode-lore",
-  "version": "0.2.0",
+  "version": "0.2.2",
   "type": "module",
   "license": "MIT",
   "description": "Three-tier memory architecture for OpenCode — distillation, not summarization",

package/src/distillation.ts CHANGED Viewed

@@ -117,6 +117,32 @@ export type Distillation = {
   created_at: number;
 };
+/**
+ * Load all distillations for a session, oldest first.
+ *
+ * Resolves `projectPath` to a project id via `ensureProject`, then selects
+ * every row of the `distillations` table whose `project_id` and `session_id`
+ * match, ordered by `created_at` ascending.
+ *
+ * @param projectPath - Project path handed to `ensureProject` to obtain the
+ *   project id used in the query.
+ * @param sessionID - Session whose distillations are returned.
+ * @returns The matching rows with `source_ids` decoded from its stored JSON
+ *   text via `JSON.parse`; every other column is passed through unchanged.
+ *   Presumably an empty array when nothing matches — confirm against the
+ *   database driver's `.all()` contract.
+ * @throws SyntaxError if a row's `source_ids` column is not valid JSON.
+ *
+ * NOTE(review): the `as string[]` applied to the parsed value is a
+ * compile-time assertion only — nothing here verifies that the decoded value
+ * is actually an array of strings.
+ */
+export function loadForSession(
+  projectPath: string,
+  sessionID: string,
+): Distillation[] {
+  const pid = ensureProject(projectPath);
+  const rows = db()
+    .query(
+      "SELECT id, project_id, session_id, observations, source_ids, generation, token_count, created_at FROM distillations WHERE project_id = ? AND session_id = ? ORDER BY created_at ASC",
+    )
+    .all(pid, sessionID) as Array<{
+    id: string;
+    project_id: string;
+    session_id: string;
+    observations: string;
+    source_ids: string;
+    generation: number;
+    token_count: number;
+    created_at: number;
+  }>;
+  return rows.map((r) => ({
+    ...r,
+    source_ids: JSON.parse(r.source_ids) as string[],
+  }));
+}
 function storeDistillation(input: {
   projectPath: string;
   sessionID: string;

package/src/gradient.ts CHANGED Viewed

@@ -722,8 +722,27 @@ export function transform(input: {
   const maxInput = contextLimit - outputReserved;
   const sid = input.sessionID ?? input.messages[0]?.info.sessionID;
+  // True when we have real API token data from a previous turn in this session.
+  // When false (first turn / session change), chars/4 estimates can undercount by
+  // up to 1.8x — so tryFit output must be validated with a safety multiplier before
+  // being used, to prevent sending an apparently-fitting window that actually overflows.
+  const calibrated = lastKnownInput > 0 && sid === lastKnownSessionID;
+  // On uncalibrated turns, apply this multiplier to tryFit's estimated total to
+  // approximate the real token count. 1.5 is conservative but not so aggressive
+  // that it forces layer 4 on modestly-sized sessions.
+  const UNCALIBRATED_SAFETY = 1.5;
+  // Returns true if the tryFit result is safe to use: false for a null result, true when
+  // calibrated (estimate accepted as-is), else totalTokens * UNCALIBRATED_SAFETY <= maxInput.
+  function fitsWithSafetyMargin(result: { totalTokens: number } | null): boolean {
+    if (!result) return false;
+    if (calibrated) return true;
+    return result.totalTokens * UNCALIBRATED_SAFETY <= maxInput;
+  }
   let expectedInput: number;
-  if (lastKnownInput > 0 && sid === lastKnownSessionID) {
+  if (calibrated) {
     // Exact approach: prior API count + estimate of only the new messages.
     const newMsgCount = Math.max(0, input.messages.length - lastKnownMessageCount);
     const newMsgTokens = newMsgCount > 0
@@ -793,7 +812,7 @@ export function transform(input: {
           rawBudget,
           strip: "none",
         });
-    if (layer1) return { ...layer1, layer: 1, usable, distilledBudget, rawBudget };
+    if (fitsWithSafetyMargin(layer1)) return { ...layer1!, layer: 1, usable, distilledBudget, rawBudget };
   }
   // Layer 1 didn't fit (or was force-skipped) — reset the raw window cache.
@@ -812,9 +831,9 @@ export function transform(input: {
       strip: "old-tools",
       protectedTurns: 2,
     });
-    if (layer2) {
+    if (fitsWithSafetyMargin(layer2)) {
       urgentDistillation = true;
-      return { ...layer2, layer: 2, usable, distilledBudget, rawBudget };
+      return { ...layer2!, layer: 2, usable, distilledBudget, rawBudget };
     }
   }
@@ -833,9 +852,9 @@ export function transform(input: {
     rawBudget: Math.floor(usable * 0.55),
     strip: "all-tools",
   });
-  if (layer3) {
+  if (fitsWithSafetyMargin(layer3)) {
     urgentDistillation = true;
-    return { ...layer3, layer: 3, usable, distilledBudget, rawBudget };
+    return { ...layer3!, layer: 3, usable, distilledBudget, rawBudget };
   }
   // Layer 4: Emergency — last 2 distillations, last 3 raw messages with tool parts intact.

package/src/index.ts CHANGED Viewed

@@ -188,7 +188,12 @@ export const LorePlugin: Plugin = async (ctx) => {
             if (
               msg.role === "assistant" &&
               msg.tokens &&
-              (msg.tokens.input > 0 || msg.tokens.cache.read > 0)
+              // Include cache.write: tokens written to cache were fully sent to the
+              // model (they were processed, just not read from a prior cache slot).
+              // Omitting cache.write causes a dramatic undercount on cold-cache turns
+              // where cache.read=0 but 150K+ tokens were written — leading the gradient
+              // to think only 3 tokens went in and passing the full session as layer 0.
+              (msg.tokens.input > 0 || msg.tokens.cache.read > 0 || msg.tokens.cache.write > 0)
             ) {
               const pending = temporal.undistilledCount(projectPath, msg.sessionID);
               if (pending >= config().distillation.maxSegment) {
@@ -201,6 +206,9 @@ export const LorePlugin: Plugin = async (ctx) => {
               // Calibrate overhead estimate using real token counts.
               // Also store the exact input count + message count for the proactive
               // layer-0 decision (avoids full chars/4 re-estimation each turn).
+              // actualInput = all tokens the model processed as input, regardless of
+              // whether they were new (input), read from cache (cache.read), or newly
+              // written to cache (cache.write). All three contribute to the context window.
               const allMsgs = await ctx.client.session.messages({
                 path: { id: msg.sessionID },
               });
@@ -209,7 +217,8 @@ export const LorePlugin: Plugin = async (ctx) => {
                   .filter((m) => m.info.id !== msg.id)
                   .map((m) => ({ info: m.info, parts: m.parts }));
                 const msgEstimate = estimateMessages(withParts);
-                const actualInput = msg.tokens.input + msg.tokens.cache.read;
+                const actualInput =
+                  msg.tokens.input + msg.tokens.cache.read + msg.tokens.cache.write;
                 calibrate(actualInput, msgEstimate, msg.sessionID, withParts.length);
               }
             }
@@ -224,43 +233,44 @@ export const LorePlugin: Plugin = async (ctx) => {
         // 1. Force the gradient transform to escalate on the next call (skip layer 0/1)
         // 2. Force distillation to capture all temporal data before compaction
         // 3. Trigger compaction so the session recovers without user intervention
-        const error = (event.properties as Record<string, unknown>).error as
-          | { name?: string; data?: { message?: string } }
+        const rawError = (event.properties as Record<string, unknown>).error;
+        // Diagnostic: log the full error shape so we can verify our detection matches
+        console.error("[lore] session.error received:", JSON.stringify(rawError, null, 2));
+        const error = rawError as
+          | { name?: string; message?: string; data?: { message?: string } }
           | undefined;
+        // Match both shapes: error.data.message (APIError wrapper) and error.message (direct)
+        const errorMessage = error?.data?.message ?? error?.message ?? "";
         const isPromptTooLong =
-          error?.name === "APIError" &&
-          typeof error?.data?.message === "string" &&
-          (error.data.message.includes("prompt is too long") ||
-            error.data.message.includes("context length exceeded") ||
-            error.data.message.includes("maximum context length"));
+          typeof errorMessage === "string" &&
+          (errorMessage.includes("prompt is too long") ||
+            errorMessage.includes("context length exceeded") ||
+            errorMessage.includes("maximum context length") ||
+            errorMessage.includes("ContextWindowExceededError") ||
+            errorMessage.includes("too many tokens"));
+        console.error(
+          `[lore] session.error isPromptTooLong=${isPromptTooLong} (name=${error?.name}, message=${errorMessage.substring(0, 120)})`,
+        );
         if (isPromptTooLong) {
           const sessionID = (event.properties as Record<string, unknown>).sessionID as
             | string
             | undefined;
           console.error(
-            `[lore] detected 'prompt too long' error — forcing distillation + compaction (session: ${sessionID?.substring(0, 16)})`,
+            `[lore] detected 'prompt too long' error — forcing distillation + layer escalation (session: ${sessionID?.substring(0, 16)})`,
           );
           // Force layer 2 on next transform — layers 0 and 1 were already too large.
+          // The gradient at layers 2-4 will compress the context enough for the next turn.
+          // Do NOT call session.summarize() here — it sends all messages to the model,
+          // which would overflow again and create a stuck compaction loop.
           setForceMinLayer(2);
           if (sessionID) {
-            // Force distillation to capture all undistilled messages before
-            // compaction replaces the session message history.
+            // Force distillation to capture all undistilled messages into the temporal
+            // store so they're preserved even if the session is later compacted manually.
             await backgroundDistill(sessionID, true);
-            // Trigger compaction automatically — the compacting hook will inject
-            // Lore's custom distillation-aware prompt.
-            try {
-              const sessions = await ctx.client.session.list();
-              const session = sessions.data?.find((s) => s.id.startsWith(sessionID));
-              if (session) {
-                // providerID/modelID are optional — omit to use the session's current model
-                await ctx.client.session.summarize({ path: { id: session.id } });
-              }
-            } catch (e) {
-              console.error("[lore] auto-compaction failed:", e);
-            }
           }
         }
       }
@@ -379,12 +389,13 @@ export const LorePlugin: Plugin = async (ctx) => {
       // Layer 0 means all messages fit within the context budget — leave them alone
       // so the append-only sequence stays intact for prompt caching.
       if (result.layer > 0) {
+        // The API requires the conversation to end with a user message.
+        // Always drop trailing non-user messages — even assistant messages with
+        // tool parts. A hard API error is worse than the model re-invoking a tool.
         while (
           result.messages.length > 0 &&
           result.messages.at(-1)!.info.role !== "user"
         ) {
-          const last = result.messages.at(-1)!;
-          if (last.parts.some((p) => p.type === "tool")) break;
           const dropped = result.messages.pop()!;
           console.error(
             "[lore] WARN: dropping trailing",
@@ -401,17 +412,25 @@ export const LorePlugin: Plugin = async (ctx) => {
       }
     },
-    // Replace compaction prompt with distillation-aware prompt when manual /compact is used.
-    // Also force distillation first so all temporal data is captured before compaction
-    // replaces the session message history.
+    // Replace compaction prompt with distillation-aware prompt when /compact is used.
+    // Strategy: run chunked distillation first so all messages are captured in segments
+    // that each fit within the model's context, then inject the pre-computed summaries
+    // as context so the model consolidates them rather than re-reading all raw messages.
+    // This prevents the overflow→compaction→overflow stuck loop.
     "experimental.session.compacting": async (input, output) => {
-      // Force distillation to capture any undistilled messages. This is critical:
-      // compaction will replace all messages with a summary, so we must persist
-      // everything to Lore's temporal store before that happens.
+      // Chunked distillation: split all undistilled messages into segments that each
+      // fit within the model's context window and distill them independently.
+      // This is safe even when the full session exceeds the context limit.
       if (input.sessionID && activeSessions.has(input.sessionID)) {
         await backgroundDistill(input.sessionID, true);
       }
+      // Load all distillation summaries produced for this session (oldest first).
+      // These are the chunked observations — the model will consolidate them.
+      const distillations = input.sessionID
+        ? distillation.loadForSession(projectPath, input.sessionID)
+        : [];
       const entries = ltm.forProject(projectPath, config().crossProject);
       const knowledge = entries.length
         ? formatKnowledge(
@@ -423,9 +442,24 @@ export const LorePlugin: Plugin = async (ctx) => {
           )
         : "";
+      // Inject each distillation chunk as a context string so the model has access
+      // to pre-computed summaries. Even if the raw messages overflow context, these
+      // summaries are compact and will fit.
+      if (distillations.length > 0) {
+        output.context.push(
+          `## Lore Pre-computed Session Summaries\n\nThe following ${distillations.length} summary chunk(s) were pre-computed from the conversation history. Use these as the authoritative source — do not re-summarize the raw messages above if they conflict.\n\n` +
+            distillations
+              .map(
+                (d, i) =>
+                  `### Chunk ${i + 1}${d.generation > 0 ? " (consolidated)" : ""}\n${d.observations}`,
+              )
+              .join("\n\n"),
+        );
+      }
       output.prompt = `You are creating a distilled memory summary for an AI coding agent. This summary will be the ONLY context available in the next part of the conversation.
-Structure your response as follows:
+${distillations.length > 0 ? "Lore has pre-computed chunked summaries of the session history (injected above as context). Consolidate those summaries into a single coherent narrative. Do NOT re-read or re-summarize the raw conversation messages — trust the pre-computed summaries.\n\n" : ""}Structure your response as follows:
 ## Session History