opencode-lore 0.2.0 → 0.2.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "opencode-lore",
3
- "version": "0.2.0",
3
+ "version": "0.2.2",
4
4
  "type": "module",
5
5
  "license": "MIT",
6
6
  "description": "Three-tier memory architecture for OpenCode — distillation, not summarization",
@@ -117,6 +117,32 @@ export type Distillation = {
117
117
  created_at: number;
118
118
  };
119
119
 
120
+ /** Load all distillations for a session, oldest first. */
121
+ export function loadForSession(
122
+ projectPath: string,
123
+ sessionID: string,
124
+ ): Distillation[] {
125
+ const pid = ensureProject(projectPath);
126
+ const rows = db()
127
+ .query(
128
+ "SELECT id, project_id, session_id, observations, source_ids, generation, token_count, created_at FROM distillations WHERE project_id = ? AND session_id = ? ORDER BY created_at ASC",
129
+ )
130
+ .all(pid, sessionID) as Array<{
131
+ id: string;
132
+ project_id: string;
133
+ session_id: string;
134
+ observations: string;
135
+ source_ids: string;
136
+ generation: number;
137
+ token_count: number;
138
+ created_at: number;
139
+ }>;
140
+ return rows.map((r) => ({
141
+ ...r,
142
+ source_ids: JSON.parse(r.source_ids) as string[],
143
+ }));
144
+ }
145
+
120
146
  function storeDistillation(input: {
121
147
  projectPath: string;
122
148
  sessionID: string;
package/src/gradient.ts CHANGED
@@ -722,8 +722,27 @@ export function transform(input: {
722
722
  const maxInput = contextLimit - outputReserved;
723
723
  const sid = input.sessionID ?? input.messages[0]?.info.sessionID;
724
724
 
725
+ // True when we have real API token data from a previous turn in this session.
726
+ // When false (first turn / session change), chars/4 estimates can undercount by
727
+ // up to 1.8x — so tryFit output must be validated with a safety multiplier before
728
+ // being used, to prevent sending an apparently-fitting window that actually overflows.
729
+ const calibrated = lastKnownInput > 0 && sid === lastKnownSessionID;
730
+
731
+ // On uncalibrated turns, apply this multiplier to tryFit's estimated total to
732
+ // approximate the real token count. 1.5 is conservative but not so aggressive
733
+ // that it forces layer 4 on modestly-sized sessions.
734
+ const UNCALIBRATED_SAFETY = 1.5;
735
+
736
+ // Returns true if the tryFit result is safe to use: either we have calibrated
737
+ // data (exact) or the estimated total * safety factor fits within maxInput.
738
+ function fitsWithSafetyMargin(result: { totalTokens: number } | null): boolean {
739
+ if (!result) return false;
740
+ if (calibrated) return true;
741
+ return result.totalTokens * UNCALIBRATED_SAFETY <= maxInput;
742
+ }
743
+
725
744
  let expectedInput: number;
726
- if (lastKnownInput > 0 && sid === lastKnownSessionID) {
745
+ if (calibrated) {
727
746
  // Exact approach: prior API count + estimate of only the new messages.
728
747
  const newMsgCount = Math.max(0, input.messages.length - lastKnownMessageCount);
729
748
  const newMsgTokens = newMsgCount > 0
@@ -793,7 +812,7 @@ export function transform(input: {
793
812
  rawBudget,
794
813
  strip: "none",
795
814
  });
796
- if (layer1) return { ...layer1, layer: 1, usable, distilledBudget, rawBudget };
815
+ if (fitsWithSafetyMargin(layer1)) return { ...layer1!, layer: 1, usable, distilledBudget, rawBudget };
797
816
  }
798
817
 
799
818
  // Layer 1 didn't fit (or was force-skipped) — reset the raw window cache.
@@ -812,9 +831,9 @@ export function transform(input: {
812
831
  strip: "old-tools",
813
832
  protectedTurns: 2,
814
833
  });
815
- if (layer2) {
834
+ if (fitsWithSafetyMargin(layer2)) {
816
835
  urgentDistillation = true;
817
- return { ...layer2, layer: 2, usable, distilledBudget, rawBudget };
836
+ return { ...layer2!, layer: 2, usable, distilledBudget, rawBudget };
818
837
  }
819
838
  }
820
839
 
@@ -833,9 +852,9 @@ export function transform(input: {
833
852
  rawBudget: Math.floor(usable * 0.55),
834
853
  strip: "all-tools",
835
854
  });
836
- if (layer3) {
855
+ if (fitsWithSafetyMargin(layer3)) {
837
856
  urgentDistillation = true;
838
- return { ...layer3, layer: 3, usable, distilledBudget, rawBudget };
857
+ return { ...layer3!, layer: 3, usable, distilledBudget, rawBudget };
839
858
  }
840
859
 
841
860
  // Layer 4: Emergency — last 2 distillations, last 3 raw messages with tool parts intact.
package/src/index.ts CHANGED
@@ -188,7 +188,12 @@ export const LorePlugin: Plugin = async (ctx) => {
188
188
  if (
189
189
  msg.role === "assistant" &&
190
190
  msg.tokens &&
191
- (msg.tokens.input > 0 || msg.tokens.cache.read > 0)
191
+ // Include cache.write: tokens written to cache were fully sent to the
192
+ // model (they were processed, just not read from a prior cache slot).
193
+ // Omitting cache.write causes a dramatic undercount on cold-cache turns
194
+ // where cache.read=0 but 150K+ tokens were written — leading the gradient
195
+ // to think only 3 tokens went in and passing the full session as layer 0.
196
+ (msg.tokens.input > 0 || msg.tokens.cache.read > 0 || msg.tokens.cache.write > 0)
192
197
  ) {
193
198
  const pending = temporal.undistilledCount(projectPath, msg.sessionID);
194
199
  if (pending >= config().distillation.maxSegment) {
@@ -201,6 +206,9 @@ export const LorePlugin: Plugin = async (ctx) => {
201
206
  // Calibrate overhead estimate using real token counts.
202
207
  // Also store the exact input count + message count for the proactive
203
208
  // layer-0 decision (avoids full chars/4 re-estimation each turn).
209
+ // actualInput = all tokens the model processed as input, regardless of
210
+ // whether they were new (input), read from cache (cache.read), or newly
211
+ // written to cache (cache.write). All three contribute to the context window.
204
212
  const allMsgs = await ctx.client.session.messages({
205
213
  path: { id: msg.sessionID },
206
214
  });
@@ -209,7 +217,8 @@ export const LorePlugin: Plugin = async (ctx) => {
209
217
  .filter((m) => m.info.id !== msg.id)
210
218
  .map((m) => ({ info: m.info, parts: m.parts }));
211
219
  const msgEstimate = estimateMessages(withParts);
212
- const actualInput = msg.tokens.input + msg.tokens.cache.read;
220
+ const actualInput =
221
+ msg.tokens.input + msg.tokens.cache.read + msg.tokens.cache.write;
213
222
  calibrate(actualInput, msgEstimate, msg.sessionID, withParts.length);
214
223
  }
215
224
  }
@@ -224,43 +233,44 @@ export const LorePlugin: Plugin = async (ctx) => {
224
233
  // 1. Force the gradient transform to escalate on the next call (skip layer 0/1)
225
234
  // 2. Force distillation to capture all temporal data before compaction
226
235
  // 3. Trigger compaction so the session recovers without user intervention
227
- const error = (event.properties as Record<string, unknown>).error as
228
- | { name?: string; data?: { message?: string } }
236
+ const rawError = (event.properties as Record<string, unknown>).error;
237
+ // Diagnostic: log the full error shape so we can verify our detection matches
238
+ console.error("[lore] session.error received:", JSON.stringify(rawError, null, 2));
239
+
240
+ const error = rawError as
241
+ | { name?: string; message?: string; data?: { message?: string } }
229
242
  | undefined;
243
+ // Match both shapes: error.data.message (APIError wrapper) and error.message (direct)
244
+ const errorMessage = error?.data?.message ?? error?.message ?? "";
230
245
  const isPromptTooLong =
231
- error?.name === "APIError" &&
232
- typeof error?.data?.message === "string" &&
233
- (error.data.message.includes("prompt is too long") ||
234
- error.data.message.includes("context length exceeded") ||
235
- error.data.message.includes("maximum context length"));
246
+ typeof errorMessage === "string" &&
247
+ (errorMessage.includes("prompt is too long") ||
248
+ errorMessage.includes("context length exceeded") ||
249
+ errorMessage.includes("maximum context length") ||
250
+ errorMessage.includes("ContextWindowExceededError") ||
251
+ errorMessage.includes("too many tokens"));
252
+
253
+ console.error(
254
+ `[lore] session.error isPromptTooLong=${isPromptTooLong} (name=${error?.name}, message=${errorMessage.substring(0, 120)})`,
255
+ );
236
256
 
237
257
  if (isPromptTooLong) {
238
258
  const sessionID = (event.properties as Record<string, unknown>).sessionID as
239
259
  | string
240
260
  | undefined;
241
261
  console.error(
242
- `[lore] detected 'prompt too long' error — forcing distillation + compaction (session: ${sessionID?.substring(0, 16)})`,
262
+ `[lore] detected 'prompt too long' error — forcing distillation + layer escalation (session: ${sessionID?.substring(0, 16)})`,
243
263
  );
244
264
  // Force layer 2 on next transform — layers 0 and 1 were already too large.
265
+ // The gradient at layers 2-4 will compress the context enough for the next turn.
266
+ // Do NOT call session.summarize() here — it sends all messages to the model,
267
+ // which would overflow again and create a stuck compaction loop.
245
268
  setForceMinLayer(2);
246
269
 
247
270
  if (sessionID) {
248
- // Force distillation to capture all undistilled messages before
249
- // compaction replaces the session message history.
271
+ // Force distillation to capture all undistilled messages into the temporal
272
+ // store so they're preserved even if the session is later compacted manually.
250
273
  await backgroundDistill(sessionID, true);
251
-
252
- // Trigger compaction automatically — the compacting hook will inject
253
- // Lore's custom distillation-aware prompt.
254
- try {
255
- const sessions = await ctx.client.session.list();
256
- const session = sessions.data?.find((s) => s.id.startsWith(sessionID));
257
- if (session) {
258
- // providerID/modelID are optional — omit to use the session's current model
259
- await ctx.client.session.summarize({ path: { id: session.id } });
260
- }
261
- } catch (e) {
262
- console.error("[lore] auto-compaction failed:", e);
263
- }
264
274
  }
265
275
  }
266
276
  }
@@ -379,12 +389,13 @@ export const LorePlugin: Plugin = async (ctx) => {
379
389
  // Layer 0 means all messages fit within the context budget — leave them alone
380
390
  // so the append-only sequence stays intact for prompt caching.
381
391
  if (result.layer > 0) {
392
+ // The API requires the conversation to end with a user message.
393
+ // Always drop trailing non-user messages — even assistant messages with
394
+ // tool parts. A hard API error is worse than the model re-invoking a tool.
382
395
  while (
383
396
  result.messages.length > 0 &&
384
397
  result.messages.at(-1)!.info.role !== "user"
385
398
  ) {
386
- const last = result.messages.at(-1)!;
387
- if (last.parts.some((p) => p.type === "tool")) break;
388
399
  const dropped = result.messages.pop()!;
389
400
  console.error(
390
401
  "[lore] WARN: dropping trailing",
@@ -401,17 +412,25 @@ export const LorePlugin: Plugin = async (ctx) => {
401
412
  }
402
413
  },
403
414
 
404
- // Replace compaction prompt with distillation-aware prompt when manual /compact is used.
405
- // Also force distillation first so all temporal data is captured before compaction
406
- // replaces the session message history.
415
+ // Replace compaction prompt with distillation-aware prompt when /compact is used.
416
+ // Strategy: run chunked distillation first so all messages are captured in segments
417
+ // that each fit within the model's context, then inject the pre-computed summaries
418
+ // as context so the model consolidates them rather than re-reading all raw messages.
419
+ // This prevents the overflow→compaction→overflow stuck loop.
407
420
  "experimental.session.compacting": async (input, output) => {
408
- // Force distillation to capture any undistilled messages. This is critical:
409
- // compaction will replace all messages with a summary, so we must persist
410
- // everything to Lore's temporal store before that happens.
421
+ // Chunked distillation: split all undistilled messages into segments that each
422
+ // fit within the model's context window and distill them independently.
423
+ // This is safe even when the full session exceeds the context limit.
411
424
  if (input.sessionID && activeSessions.has(input.sessionID)) {
412
425
  await backgroundDistill(input.sessionID, true);
413
426
  }
414
427
 
428
+ // Load all distillation summaries produced for this session (oldest first).
429
+ // These are the chunked observations — the model will consolidate them.
430
+ const distillations = input.sessionID
431
+ ? distillation.loadForSession(projectPath, input.sessionID)
432
+ : [];
433
+
415
434
  const entries = ltm.forProject(projectPath, config().crossProject);
416
435
  const knowledge = entries.length
417
436
  ? formatKnowledge(
@@ -423,9 +442,24 @@ export const LorePlugin: Plugin = async (ctx) => {
423
442
  )
424
443
  : "";
425
444
 
445
+ // Inject each distillation chunk as a context string so the model has access
446
+ // to pre-computed summaries. Even if the raw messages overflow context, these
447
+ // summaries are compact and will fit.
448
+ if (distillations.length > 0) {
449
+ output.context.push(
450
+ `## Lore Pre-computed Session Summaries\n\nThe following ${distillations.length} summary chunk(s) were pre-computed from the conversation history. Use these as the authoritative source — do not re-summarize the raw messages above if they conflict.\n\n` +
451
+ distillations
452
+ .map(
453
+ (d, i) =>
454
+ `### Chunk ${i + 1}${d.generation > 0 ? " (consolidated)" : ""}\n${d.observations}`,
455
+ )
456
+ .join("\n\n"),
457
+ );
458
+ }
459
+
426
460
  output.prompt = `You are creating a distilled memory summary for an AI coding agent. This summary will be the ONLY context available in the next part of the conversation.
427
461
 
428
- Structure your response as follows:
462
+ ${distillations.length > 0 ? "Lore has pre-computed chunked summaries of the session history (injected above as context). Consolidate those summaries into a single coherent narrative. Do NOT re-read or re-summarize the raw conversation messages — trust the pre-computed summaries.\n\n" : ""}Structure your response as follows:
429
463
 
430
464
  ## Session History
431
465