clawmem 0.4.2 → 0.5.0

This diff compares the contents of the two published package versions as they appear in their public registry, and is provided for informational purposes only.
package/AGENTS.md CHANGED
@@ -354,6 +354,7 @@ Pin, snooze, and forget are **manual MCP tools** — not automated. The agent sh
  - Do NOT pin everything — pin is for persistent high-priority items, not temporary boosting.
  - Do NOT forget memories to "clean up" — let confidence decay and contradiction detection handle it naturally.
  - Do NOT run `build_graphs` after every reindex — A-MEM creates per-doc links automatically. Only after bulk ingestion or when `intent_search` returns weak graph results.
+ - Do NOT run `clawmem mine` autonomously — it is a bulk ingestion command (same category as `update`/`reindex`). Suggest it to the user when they mention old conversation exports, but let them run it. Bulk import has disk/embedding cost implications that need user consent.
 
  ## Tool Selection (one-liner)
 
@@ -435,16 +436,16 @@ compositeScore = (0.10 × searchScore + 0.70 × recencyScore + 0.20 × confidenc
 
  | Content Type | Half-Life | Effect |
  |--------------|-----------|--------|
- | decision, hub | ∞ | Never decay |
+ | decision, preference, hub | ∞ | Never decay |
  | antipattern | ∞ | Never decay — accumulated negative patterns persist |
  | project | 120 days | Slow decay |
  | research | 90 days | Moderate decay |
- | note | 60 days | Default |
- | progress | 45 days | Faster decay |
+ | problem, milestone, note | 60 days | Default |
+ | conversation, progress | 45 days | Faster decay |
  | handoff | 30 days | Fast — recent matters most |
 
  Half-lives extend up to 3× for frequently-accessed memories (access reinforcement decays over 90 days).
- Attention decay: non-durable types (handoff, progress, note, project) lose 5% confidence per week without access. Decision/hub/research/antipattern are exempt.
+ Attention decay: non-durable types (handoff, progress, conversation, note, project) lose 5% confidence per week without access. Decision/preference/hub/research/antipattern are exempt.
 
  ## Indexing & Graph Building
 
package/CLAUDE.md CHANGED
@@ -354,6 +354,7 @@ Pin, snooze, and forget are **manual MCP tools** — not automated. The agent sh
  - Do NOT pin everything — pin is for persistent high-priority items, not temporary boosting.
  - Do NOT forget memories to "clean up" — let confidence decay and contradiction detection handle it naturally.
  - Do NOT run `build_graphs` after every reindex — A-MEM creates per-doc links automatically. Only after bulk ingestion or when `intent_search` returns weak graph results.
+ - Do NOT run `clawmem mine` autonomously — it is a bulk ingestion command (same category as `update`/`reindex`). Suggest it to the user when they mention old conversation exports, but let them run it. Bulk import has disk/embedding cost implications that need user consent.
 
  ## Tool Selection (one-liner)
 
@@ -435,16 +436,16 @@ compositeScore = (0.10 × searchScore + 0.70 × recencyScore + 0.20 × confidenc
 
  | Content Type | Half-Life | Effect |
  |--------------|-----------|--------|
- | decision, hub | ∞ | Never decay |
+ | decision, preference, hub | ∞ | Never decay |
  | antipattern | ∞ | Never decay — accumulated negative patterns persist |
  | project | 120 days | Slow decay |
  | research | 90 days | Moderate decay |
- | note | 60 days | Default |
- | progress | 45 days | Faster decay |
+ | problem, milestone, note | 60 days | Default |
+ | conversation, progress | 45 days | Faster decay |
  | handoff | 30 days | Fast — recent matters most |
 
  Half-lives extend up to 3× for frequently-accessed memories (access reinforcement decays over 90 days).
- Attention decay: non-durable types (handoff, progress, note, project) lose 5% confidence per week without access. Decision/hub/research/antipattern are exempt.
+ Attention decay: non-durable types (handoff, progress, conversation, note, project) lose 5% confidence per week without access. Decision/preference/hub/research/antipattern are exempt.
 
  ## Indexing & Graph Building
 
package/README.md CHANGED
@@ -18,7 +18,8 @@ ClawMem turns your markdown notes, project docs, and research dumps into persist
 
  - **Surfaces relevant context** on every prompt (context-surfacing hook)
  - **Bootstraps sessions** with your profile, latest handoff, recent decisions, and stale notes
- - **Captures decisions** from session transcripts using a local GGUF observer model
+ - **Captures decisions, preferences, milestones, and problems** from session transcripts using a local GGUF observer model
+ - **Imports conversation exports** from Claude Code, ChatGPT, Claude.ai, Slack, and plain text via `clawmem mine`
  - **Generates handoffs** at session end so the next session can pick up where you left off
  - **Learns what matters** via a feedback loop that boosts referenced notes and decays unused ones
  - **Guards against prompt injection** in surfaced content
@@ -643,6 +644,7 @@ clawmem collection list List collections
  clawmem collection remove <name> Remove a collection
 
  clawmem update [--pull] [--embed] Incremental re-scan
+ clawmem mine <dir> [-c name] [--embed] Import conversation exports (Claude, ChatGPT, Slack)
  clawmem embed [-f] Generate fragment embeddings
  clawmem reindex [--force] Full re-index
  clawmem watch File watcher daemon
@@ -759,7 +761,7 @@ Hooks installed by `clawmem setup hooks`:
  | `postcompact-inject` | SessionStart | Re-injects authoritative context after compaction: precompact state + recent decisions + antipatterns + vault context (1200 token budget) |
  | `curator-nudge` | SessionStart | Surfaces curator report actions, nudges when report is stale (>7 days) |
  | `precompact-extract` | PreCompact | Extracts decisions, file paths, open questions before auto-compaction → writes `precompact-state.md` to auto-memory |
- | `decision-extractor` | Stop | GGUF observer extracts structured decisions, infers causal links, detects contradictions with prior decisions |
+ | `decision-extractor` | Stop | GGUF observer extracts structured observations (decisions, preferences, milestones, problems, bugfixes, features, refactors, discoveries), infers causal links, detects contradictions with prior decisions |
  | `handoff-generator` | Stop | GGUF observer generates rich handoff, regex fallback |
  | `feedback-loop` | Stop | Silently boosts referenced notes, decays unused ones, records co-activation + usage relations between co-referenced docs, tracks utility signals (surfaced vs referenced ratio for lifecycle automation) |
 
@@ -813,15 +815,19 @@ For WHY and ENTITY queries, the search pipeline expands results through the memo
  | Type | Half-life | Baseline | Notes |
  |---|---|---|---|
  | `decision` | ∞ | 0.85 | Never decays |
+ | `preference` | ∞ | 0.80 | Never decays — user preferences are durable facts |
  | `hub` | ∞ | 0.80 | Never decays |
+ | `antipattern` | ∞ | 0.75 | Never decays — accumulated negative patterns persist |
+ | `problem` | 60 days | 0.75 | High priority but resolves over time |
  | `research` | 90 days | 0.70 | |
+ | `milestone` | 60 days | 0.70 | Important at the time, fades as project moves forward |
  | `project` | 120 days | 0.65 | |
  | `handoff` | 30 days | 0.60 | Fast decay — most recent matters |
+ | `conversation` | 45 days | 0.55 | Imported chat exchanges |
  | `progress` | 45 days | 0.50 | |
  | `note` | 60 days | 0.50 | Default |
- | `antipattern` | ∞ | 0.75 | Never decays — accumulated negative patterns persist |
 
- Content types are inferred from frontmatter or file path patterns. Half-lives extend up to 3× for frequently-accessed memories (access reinforcement, decays over 90 days). Non-durable types (handoff, progress, note, project) lose 5% confidence per week without access (attention decay). Decision/hub/research/antipattern are exempt.
+ Content types are inferred from frontmatter or file path patterns. Half-lives extend up to 3× for frequently-accessed memories (access reinforcement, decays over 90 days). Non-durable types (handoff, progress, conversation, note, project) lose 5% confidence per week without access (attention decay). Decision/preference/hub/research/antipattern are exempt.
 
  **Quality scoring:** Each document gets a `quality_score` (0.0–1.0) computed during indexing based on length, structure (headings, lists), decision/correction keywords, and frontmatter richness. Applied as `qualityMultiplier = 0.7 + 0.6 × qualityScore` (range: 0.7× penalty to 1.3× boost).
 
@@ -868,7 +874,7 @@ Documents are split into semantic fragments (sections, lists, code blocks, front
 
  ### Local Observer Agent
 
- Uses the LLM server (shared with query expansion and intent classification) to extract structured observations from session transcripts: type, title, facts, narrative, concepts, files read/modified. Falls back to regex patterns if the model is unavailable.
+ Uses the LLM server (shared with query expansion and intent classification) to extract structured observations from session transcripts. Observation types: `decision`, `bugfix`, `feature`, `refactor`, `discovery`, `change`, `preference`, `milestone`, `problem`. Each observation includes title, facts, narrative, concepts, and files read/modified. Preferences, milestones, and problems get first-class content_type treatment with dedicated confidence baselines and half-lives instead of being flattened to generic "observation". Falls back to regex patterns if the model is unavailable.
 
  ### User Profile
 
@@ -943,7 +949,7 @@ title: "Document Title"
  tags: [tag1, tag2]
  domain: "infrastructure"
  workstream: "project-name"
- content_type: "decision" # decision|hub|research|project|handoff|progress|note
+ content_type: "decision" # decision|preference|hub|research|project|handoff|conversation|progress|note
  review_by: "2026-03-01"
  ---
  ```
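The scoring formulas quoted in this README compose as follows — a minimal TypeScript sketch, assuming a standard exponential half-life curve for the recency term (the 0.10/0.70/0.20 weights and the `qualityMultiplier` range come from the README; the function names and shape are illustrative, not the shipped code):

```typescript
// Illustrative only: weights and qualityMultiplier are quoted from the README
// above; the exponential recency curve is an assumption.
const HALF_LIVES: Record<string, number> = { decision: Infinity, note: 60, handoff: 30 };

function recencyScore(ageDays: number, contentType: string): number {
  const halfLife = HALF_LIVES[contentType] ?? 60; // "note" is the documented default
  return Math.pow(0.5, ageDays / halfLife);       // Infinity half-life → exponent 0 → no decay
}

function compositeScore(search: number, recency: number, confidence: number, quality: number): number {
  const qualityMultiplier = 0.7 + 0.6 * quality;  // 0.7× penalty … 1.3× boost
  return (0.10 * search + 0.70 * recency + 0.20 * confidence) * qualityMultiplier;
}
```

Under this sketch a 60-day-old note has a recency score of 0.5, while a year-old decision still scores 1.0.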
package/SKILL.md CHANGED
@@ -442,12 +442,12 @@ compositeScore = (0.10 x searchScore + 0.70 x recencyScore + 0.20 x confidenceSc
 
  | Content Type | Half-Life | Effect |
  |--------------|-----------|--------|
- | decision, hub | infinity | Never decay |
+ | decision, preference, hub | infinity | Never decay |
  | antipattern | infinity | Never decay — accumulated negative patterns persist |
  | project | 120 days | Slow decay |
  | research | 90 days | Moderate decay |
- | note | 60 days | Default |
- | progress | 45 days | Faster decay |
+ | problem, milestone, note | 60 days | Default |
+ | conversation, progress | 45 days | Faster decay |
  | handoff | 30 days | Fast — recent matters most |
 
  Half-lives extend up to 3x for frequently-accessed memories (access reinforcement decays over 90 days).
@@ -566,6 +566,7 @@ When `decision-extractor` detects a new decision contradicting an old one, the o
  - Do NOT pin everything — pin is for persistent high-priority items.
  - Do NOT forget memories to "clean up" — let confidence decay and contradiction detection handle it.
  - Do NOT run `build_graphs` after every reindex — A-MEM creates per-doc links automatically.
+ - Do NOT run `clawmem mine` autonomously — it is a bulk ingestion command. Suggest it to the user when they mention old conversation exports, but let them run it.
 
  ---
 
@@ -657,6 +658,12 @@ Symptom: reindex --force crashes with UNIQUE constraint
  -> Force deactivates rows but UNIQUE(collection, path) doesn't discriminate by active flag.
  -> Fixed: indexer.ts reactivates inactive rows instead of inserting.
 
+ Symptom: `clawmem update` crashes with "Binding expected string, TypedArray, boolean, number, bigint or null"
+ -> YAML frontmatter values like `title: 2023-09-27` or `title: true` are coerced by gray-matter
+ into Date objects or booleans. Bun's SQLite driver rejects these as bind parameters.
+ -> Fixed v0.4.2: `parseDocument()` runtime-checks all frontmatter fields via `str()` helper.
+ -> Affects: title, domain, workstream, content_type, review_by.
+
  Symptom: CLI reindex/update falls back to node-llama-cpp
  -> GPU env vars only in systemd drop-in, not in wrapper script.
  -> Fixed: bin/clawmem wrapper exports CLAWMEM_EMBED_URL/LLM_URL/RERANK_URL defaults.
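The YAML-coercion fix described in the troubleshooting entry above can be sketched as a small coercion guard. This is an illustrative equivalent, not the shipped `parseDocument()` helper (which this diff does not show): YAML parsers turn `title: 2023-09-27` into a `Date` and `title: true` into a boolean, and a SQLite driver expecting string bind parameters rejects both.

```typescript
// Hypothetical sketch of a str() guard: force frontmatter values back to
// strings (or null) before they reach SQLite bind parameters.
function str(value: unknown): string | null {
  if (value == null) return null;
  if (typeof value === "string") return value;
  if (value instanceof Date) return value.toISOString().slice(0, 10); // back to YYYY-MM-DD
  return String(value); // booleans, numbers → "true", "42"
}
```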
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "clawmem",
- "version": "0.4.2",
+ "version": "0.5.0",
  "description": "On-device context engine and memory for AI agents. Claude Code and OpenClaw. Hooks + MCP server + hybrid RAG search.",
  "type": "module",
  "bin": {
package/src/clawmem.ts CHANGED
@@ -235,6 +235,101 @@ async function cmdUpdate(args: string[]) {
  }
  }
 
+ async function cmdMine(args: string[]) {
+ const { values, positionals } = parseArgs({
+ args,
+ options: {
+ collection: { type: "string", short: "c" },
+ embed: { type: "boolean", default: false },
+ "dry-run": { type: "boolean", default: false },
+ },
+ allowPositionals: true,
+ });
+
+ const dir = positionals[0];
+ if (!dir) die("Usage: clawmem mine <directory> [-c collection-name] [--embed] [--dry-run]");
+ const absDir = pathResolve(dir);
+ if (!existsSync(absDir)) die(`Directory not found: ${absDir}`);
+
+ const { scanConversationDir, normalizeFile, chunkConversation } = await import("./normalize.ts");
+
+ console.log(`${c.cyan}Scanning for conversation files${c.reset} in ${absDir}`);
+ const files = scanConversationDir(absDir);
+ if (files.length === 0) die("No conversation files found (.json, .jsonl, .txt, .md)");
+ console.log(` Found ${files.length} candidate files`);
+
+ // Normalize and chunk
+ let totalChunks = 0;
+ let totalConversations = 0;
+ const allChunks: { title: string; body: string; sourcePath: string; chunkIndex: number }[] = [];
+
+ for (const file of files) {
+ const conv = normalizeFile(file);
+ if (!conv) continue;
+ totalConversations++;
+
+ const chunks = chunkConversation(conv);
+ if (chunks.length === 0) continue;
+
+ console.log(` ${c.green}✓${c.reset} ${conv.source} (${conv.format}, ${conv.messages.length} messages → ${chunks.length} chunks)`);
+ for (const chunk of chunks) {
+ chunk.sourcePath = file.replace(absDir + "/", "");
+ }
+ allChunks.push(...chunks);
+ totalChunks += chunks.length;
+ }
+
+ if (totalConversations === 0) die("No conversation files could be parsed");
+ console.log(`\n${c.bold}Parsed:${c.reset} ${totalConversations} conversations → ${totalChunks} exchange chunks`);
+
+ if (values["dry-run"]) {
+ console.log(`${c.yellow}Dry run — no changes made${c.reset}`);
+ return;
+ }
+
+ // Write chunks as markdown to a staging directory (outside source tree), then index
+ const collectionName = values.collection || "conversations";
+ const { tmpdir } = await import("os");
+ const stagingDir = pathResolve(tmpdir(), `clawmem-mine-${Date.now()}`);
+ mkdirSync(stagingDir, { recursive: true });
+
+ const { rmSync } = await import("fs");
+ try {
+ const writePromises: Promise<number>[] = [];
+ for (const chunk of allChunks) {
+ const safeSource = chunk.sourcePath.replace(/[\/\\]/g, "_").replace(/\.[^.]+$/, "");
+ const filename = `${safeSource}_${String(chunk.chunkIndex).padStart(4, "0")}.md`;
+ const esc = (s: string) => s.replace(/"/g, '\\"');
+ const frontmatter = [
+ "---",
+ `title: "${esc(chunk.title)}"`,
+ `content_type: conversation`,
+ `source: "${esc(chunk.sourcePath)}"`,
+ "---",
+ "",
+ chunk.body,
+ ].join("\n");
+ writePromises.push(Bun.write(pathResolve(stagingDir, filename), frontmatter));
+ }
+ await Promise.all(writePromises);
+
+ // Index through existing pipeline
+ const s = getStore();
+ console.log(`\n${c.cyan}Indexing ${totalChunks} conversation chunks${c.reset} as collection '${collectionName}'`);
+ const stats = await indexCollection(s, collectionName, stagingDir, "**/*.md");
+ console.log(` ${c.green}+${stats.added}${c.reset} added, ${c.yellow}~${stats.updated}${c.reset} updated, ${c.dim}=${stats.unchanged}${c.reset} unchanged`);
+
+ if (values.embed) {
+ console.log();
+ await cmdEmbed([]);
+ } else {
+ console.log(`\nRun ${c.cyan}clawmem embed${c.reset} to generate embeddings for the imported conversations`);
+ }
+ } finally {
+ rmSync(stagingDir, { recursive: true, force: true });
+ }
+ }
+
  async function cmdEmbed(args: string[]) {
  const { values } = parseArgs({
  args,
@@ -1695,6 +1790,9 @@ async function main() {
  case "update":
  await cmdUpdate(subArgs);
  break;
+ case "mine":
+ await cmdMine(subArgs);
+ break;
  case "embed":
  await cmdEmbed(subArgs);
  break;
@@ -2289,6 +2387,7 @@ ${c.bold}Setup:${c.reset}
 
  ${c.bold}Indexing:${c.reset}
  clawmem update [--pull] [--embed] Re-scan collections (--embed auto-embeds)
+ clawmem mine <dir> [-c name] [--embed] Import conversation exports (Claude, ChatGPT, Slack)
  clawmem embed [-f] Generate fragment embeddings
  clawmem reindex [--force] [--enrich] Full re-index (--enrich: run entity extraction + links on all docs)
  clawmem watch File watcher daemon
@@ -335,7 +335,11 @@ export async function decisionExtractor(
  const doc = store.findActiveDocument("_clawmem", obsPath);
  if (doc) {
  store.updateDocumentMeta(doc.id, {
- content_type: obs.type === "decision" ? "decision" : "observation",
+ content_type: obs.type === "decision" ? "decision"
+ : obs.type === "preference" ? "preference"
+ : obs.type === "milestone" ? "milestone"
+ : obs.type === "problem" ? "problem"
+ : "observation",
  confidence: 0.80,
  });
  store.updateObservationFields(obsPath, "_clawmem", {
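The chained ternary in the hunk above maps observation types onto first-class content types, with everything else falling back to generic `observation`. An equivalent table-driven form (a sketch only, not the shipped code; `toContentType` is a hypothetical name) makes the rule explicit:

```typescript
// Sketch: observation type → content_type, per the decisionExtractor diff above.
// Only these four types get first-class content_type treatment in 0.5.0.
const FIRST_CLASS = new Set(["decision", "preference", "milestone", "problem"]);

function toContentType(obsType: string): string {
  return FIRST_CLASS.has(obsType) ? obsType : "observation";
}
```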
package/src/memory.ts CHANGED
@@ -12,9 +12,13 @@
  export const HALF_LIVES: Record<string, number> = {
  handoff: 30,
  progress: 45,
+ conversation: 45,
+ problem: 60,
+ milestone: 60,
  note: 60,
  research: 90,
  project: 120,
+ preference: Infinity,
  decision: Infinity,
  hub: Infinity,
  };
@@ -25,10 +29,14 @@ export const HALF_LIVES: Record<string, number> = {
 
  export const TYPE_BASELINES: Record<string, number> = {
  decision: 0.85,
+ preference: 0.80,
  hub: 0.80,
+ problem: 0.75,
  research: 0.70,
+ milestone: 0.70,
  project: 0.65,
  handoff: 0.60,
+ conversation: 0.55,
  progress: 0.50,
  note: 0.50,
  };
@@ -37,7 +45,7 @@ export const TYPE_BASELINES: Record<string, number> = {
  // Content Type Inference
  // =============================================================================
 
- export type ContentType = "decision" | "hub" | "research" | "project" | "handoff" | "progress" | "note";
+ export type ContentType = "decision" | "preference" | "hub" | "research" | "project" | "handoff" | "conversation" | "progress" | "milestone" | "problem" | "note";
 
  export function inferContentType(path: string, explicitType?: string): ContentType {
  if (explicitType && explicitType in TYPE_BASELINES) return explicitType as ContentType;
@@ -48,6 +56,7 @@ export function inferContentType(path: string, explicitType?: string): ContentTy
  if (lower.includes("research") || lower.includes("investigation") || lower.includes("analysis")) return "research";
  if (lower.includes("project") || lower.includes("epic") || lower.includes("initiative")) return "project";
  if (lower.includes("handoff") || lower.includes("handover") || lower.includes("session")) return "handoff";
+ if (lower.includes("conversation") || lower.includes("convo") || lower.includes("chat") || lower.includes("transcript")) return "conversation";
  if (lower.includes("progress") || lower.includes("status") || lower.includes("standup") || lower.includes("changelog")) return "progress";
  return "note";
  }
@@ -65,7 +74,7 @@ export type MemoryType = "episodic" | "semantic" | "procedural";
  * - procedural: how-to, patterns, workflows (actionable)
  */
  export function inferMemoryType(path: string, contentType: string, body?: string): MemoryType {
- if (["handoff", "progress"].includes(contentType)) return "episodic";
+ if (["handoff", "progress", "conversation"].includes(contentType)) return "episodic";
  if (["decision", "hub", "research"].includes(contentType)) return "semantic";
  if (body && /\b(step\s+\d|workflow|recipe|how\s+to|procedure|runbook|playbook)\b/i.test(body)) return "procedural";
  if (path.includes("sop") || path.includes("runbook") || path.includes("playbook")) return "procedural";
@@ -141,7 +150,7 @@ export function confidenceScore(
  // Attention decay: reduce confidence if not accessed recently (5% per week)
  // Only apply to episodic/progress content — skip for durable types (decision, hub, research)
  // Also skip if last_accessed_at was backfilled from modified_at (no real access yet)
- const DECAY_EXEMPT_TYPES = new Set(["decision", "hub", "research", "antipattern"]);
+ const DECAY_EXEMPT_TYPES = new Set(["decision", "hub", "research", "antipattern", "preference"]);
  let attentionDecay = 1.0;
  if (lastAccessedAt && !DECAY_EXEMPT_TYPES.has(contentType)) {
  const lastAccess = typeof lastAccessedAt === "string" ? new Date(lastAccessedAt) : lastAccessedAt;
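Taken together, the `HALF_LIVES` and `TYPE_BASELINES` tables and the 5%-per-week attention-decay rule in `memory.ts` combine roughly as below. This is a sketch under the assumption of a standard exponential half-life curve — the shipped `confidenceScore` is only partially visible in this diff, and `decayedConfidence` is a hypothetical name:

```typescript
// Sketch, not the shipped confidenceScore: a type's baseline decays along its
// half-life curve, then loses a further 5% per full week without access
// (unless the type is decay-exempt). Table values are a subset of the diff above.
const HALF_LIVES: Record<string, number> = { handoff: 30, note: 60, decision: Infinity };
const TYPE_BASELINES: Record<string, number> = { handoff: 0.60, note: 0.50, decision: 0.85 };
const DECAY_EXEMPT_TYPES = new Set(["decision", "hub", "research", "antipattern", "preference"]);

function decayedConfidence(contentType: string, ageDays: number, daysSinceAccess: number): number {
  const baseline = TYPE_BASELINES[contentType] ?? 0.50;
  const halfLife = HALF_LIVES[contentType] ?? 60;
  const recency = Math.pow(0.5, ageDays / halfLife); // Infinity half-life → no decay
  const attention = DECAY_EXEMPT_TYPES.has(contentType)
    ? 1.0
    : Math.pow(0.95, Math.floor(daysSinceAccess / 7)); // 5% per full unaccessed week
  return baseline * recency * attention;
}
```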
package/src/normalize.ts ADDED
@@ -0,0 +1,390 @@
+ /**
+ * normalize.ts — Conversation format normalizer for ClawMem
+ *
+ * Converts chat export files into normalized markdown documents suitable for
+ * ClawMem's indexing pipeline. Supports:
+ * - Claude Code JSONL sessions
+ * - Claude.ai JSON exports (flat + privacy export)
+ * - ChatGPT conversations.json (mapping tree)
+ * - Slack JSON exports (DMs + channels)
+ * - Plain text with user/assistant markers
+ *
+ * Each exchange pair (user + assistant) becomes one markdown chunk.
+ * Inspired by MemPalace normalize.py, rewritten for TypeScript/Bun.
+ */
+
+ import { readFileSync, readdirSync, statSync } from "fs";
+ import { basename, extname, join, relative } from "path";
+
+ // =============================================================================
+ // Types
+ // =============================================================================
+
+ export type Message = { role: "user" | "assistant"; content: string };
+
+ export type NormalizedConversation = {
+ source: string; // original filename
+ format: string; // detected format
+ messages: Message[]; // normalized messages
+ };
+
+ export type ConversationChunk = {
+ title: string; // "Exchange N" or extracted topic
+ body: string; // markdown body
+ sourcePath: string; // relative path of source file
+ chunkIndex: number;
+ };
+
+ // =============================================================================
+ // Format Detection & Normalization
+ // =============================================================================
+
+ const CONVO_EXTENSIONS = new Set([".txt", ".md", ".json", ".jsonl"]);
+ const SKIP_DIRS = new Set([".git", "node_modules", "__pycache__", ".venv", "venv", "dist", "build", ".next", ".mempalace", ".grepai", "tool-results"]);
+
+ export function normalizeFile(filepath: string): NormalizedConversation | null {
+ let content: string;
+ try {
+ content = readFileSync(filepath, "utf-8");
+ } catch {
+ return null;
+ }
+
+ if (!content.trim()) return null;
+
+ const ext = extname(filepath).toLowerCase();
+
+ // Try JSONL formats first (Claude Code, Codex CLI)
+ if (ext === ".jsonl" || (content.trim().startsWith("{") && content.includes("\n{"))) {
+ const cc = tryClaudeCodeJsonl(content);
+ if (cc) return { source: basename(filepath), format: "claude-code", messages: cc };
+
+ const codex = tryCodexJsonl(content);
+ if (codex) return { source: basename(filepath), format: "codex-cli", messages: codex };
+ }
+
+ // Try JSON formats
+ if (ext === ".json" || content.trim().startsWith("{") || content.trim().startsWith("[")) {
+ try {
+ const data = JSON.parse(content);
+
+ const claude = tryClaudeAiJson(data);
+ if (claude) return { source: basename(filepath), format: "claude-ai", messages: claude };
+
+ const chatgpt = tryChatGptJson(data);
+ if (chatgpt) return { source: basename(filepath), format: "chatgpt", messages: chatgpt };
+
+ const slack = trySlackJson(data);
+ if (slack) return { source: basename(filepath), format: "slack", messages: slack };
+ } catch {
+ // Not valid JSON
+ }
+ }
+
+ // Try plain text with user/assistant markers
+ const plain = tryPlainText(content);
+ if (plain) return { source: basename(filepath), format: "plain-text", messages: plain };
+
+ return null;
+ }
+
+ // =============================================================================
+ // Format Parsers
+ // =============================================================================
+
+ function tryClaudeCodeJsonl(content: string): Message[] | null {
+ const lines = content.trim().split("\n").filter(l => l.trim());
+ const messages: Message[] = [];
+
+ for (const line of lines) {
+ let entry: any;
+ try { entry = JSON.parse(line); } catch { continue; }
+ if (typeof entry !== "object" || !entry) continue;
+
+ const msgType = entry.type ?? "";
+ const message = entry.message ?? {};
+
+ if (msgType === "human" || msgType === "user") {
+ const text = extractContent(message.content);
+ if (text) messages.push({ role: "user", content: text });
+ } else if (msgType === "assistant") {
+ const text = extractContent(message.content);
+ if (text) messages.push({ role: "assistant", content: text });
+ }
+ }
+
+ return messages.length >= 2 ? messages : null;
+ }
+
+ function tryCodexJsonl(content: string): Message[] | null {
+ const lines = content.trim().split("\n").filter(l => l.trim());
+ const messages: Message[] = [];
+ let hasSessionMeta = false;
+
+ for (const line of lines) {
+ let entry: any;
+ try { entry = JSON.parse(line); } catch { continue; }
+ if (typeof entry !== "object" || !entry) continue;
+
+ if (entry.type === "session_meta") { hasSessionMeta = true; continue; }
+ if (entry.type !== "event_msg") continue;
+
+ const payload = entry.payload;
+ if (typeof payload !== "object" || !payload) continue;
+
+ const text = typeof payload.message === "string" ? payload.message.trim() : "";
+ if (!text) continue;
+
+ if (payload.type === "user_message") messages.push({ role: "user", content: text });
+ else if (payload.type === "agent_message") messages.push({ role: "assistant", content: text });
+ }
+
+ return messages.length >= 2 && hasSessionMeta ? messages : null;
+ }
+
+ function tryClaudeAiJson(data: any): Message[] | null {
+ // Privacy export: array of conversation objects with chat_messages
+ if (Array.isArray(data) && data.length > 0 && data[0]?.chat_messages) {
+ const messages: Message[] = [];
+ for (const convo of data) {
+ for (const item of convo.chat_messages ?? []) {
+ const role = item.role ?? "";
+ const text = extractContent(item.content);
+ if ((role === "user" || role === "human") && text) messages.push({ role: "user", content: text });
+ else if ((role === "assistant" || role === "ai") && text) messages.push({ role: "assistant", content: text });
+ }
+ }
+ return messages.length >= 2 ? messages : null;
+ }
+
+ // Flat messages list or wrapped in { messages: [...] }
+ let msgs = data;
+ if (typeof data === "object" && !Array.isArray(data)) {
+ msgs = data.messages ?? data.chat_messages ?? [];
+ }
+ if (!Array.isArray(msgs)) return null;
+
+ const messages: Message[] = [];
+ for (const item of msgs) {
+ if (typeof item !== "object" || !item) continue;
+ const role = item.role ?? "";
+ const text = extractContent(item.content);
+ if ((role === "user" || role === "human") && text) messages.push({ role: "user", content: text });
+ else if ((role === "assistant" || role === "ai") && text) messages.push({ role: "assistant", content: text });
+ }
+ return messages.length >= 2 ? messages : null;
+ }
+
+ function tryChatGptJson(data: any): Message[] | null {
+ if (typeof data !== "object" || !data?.mapping) return null;
+ const mapping = data.mapping;
+ const messages: Message[] = [];
+
+ // Find root node (parent=null, no message)
+ let rootId: string | null = null;
+ let fallback: string | null = null;
+ for (const [nodeId, node] of Object.entries(mapping) as [string, any][]) {
+ if (node.parent === null) {
+ if (!node.message) { rootId = nodeId; break; }
+ else if (!fallback) fallback = nodeId;
+ }
+ }
+ rootId = rootId ?? fallback;
+ if (!rootId) return null;
+
+ // Walk the tree
+ let currentId: string | null = rootId;
+ const visited = new Set<string>();
+ while (currentId && !visited.has(currentId)) {
+ visited.add(currentId);
+ const node = (mapping as any)[currentId];
+ if (node?.message) {
+ const role = node.message.author?.role ?? "";
+ const content = node.message.content;
+ const parts = content?.parts ?? [];
+ const text = parts.filter((p: any) => typeof p === "string").join(" ").trim();
+ if (role === "user" && text) messages.push({ role: "user", content: text });
+ else if (role === "assistant" && text) messages.push({ role: "assistant", content: text });
+ }
+ currentId = node?.children?.[0] ?? null;
+ }
+ return messages.length >= 2 ? messages : null;
+ }
+
+ function trySlackJson(data: any): Message[] | null {
+ if (!Array.isArray(data)) return null;
+
+ // Count unique speakers — only support 2-party DMs
+ const speakers = new Set<string>();
+ for (const item of data) {
+ if (typeof item !== "object" || item?.type !== "message") continue;
+ const userId = item.user ?? item.username ?? "";
+ if (userId) speakers.add(userId);
+ if (speakers.size > 2) return null; // multi-person channel, unsupported
+ }
+ if (speakers.size < 2) return null;
+
+ const messages: Message[] = [];
+ const speakerList = [...speakers];
+ const roleMap: Record<string, "user" | "assistant"> = {
+ [speakerList[0]]: "user",
+ [speakerList[1]]: "assistant",
+ };
+
+ for (const item of data) {
+ if (typeof item !== "object" || item?.type !== "message") continue;
+ const userId = item.user ?? item.username ?? "";
+ const text = (item.text ?? "").trim();
+ if (!text || !roleMap[userId]) continue;
+ messages.push({ role: roleMap[userId], content: text });
+ }
+ return messages.length >= 2 ? messages : null;
+ }
+
+ function tryPlainText(content: string): Message[] | null {
+ const messages: Message[] = [];
+ // Only match explicit role prefixes (User:, Human:, Assistant:, etc.)
+ // Do NOT match bare blockquotes (> ) — too many false positives with markdown
+ const lines = content.split("\n");
+ let currentRole: "user" | "assistant" | null = null;
+ let currentText: string[] = [];
+
+ for (const line of lines) {
+ const trimmed = line.trim();
+ let newRole: "user" | "assistant" | null = null;
+
+ if (/^(User|Human)\s*:\s*/i.test(trimmed)) {
+ newRole = "user";
+ } else if (/^(Assistant|AI|Claude|GPT|Bot)\s*:\s*/i.test(trimmed)) {
+ newRole = "assistant";
+ }
+
+ if (newRole) {
+ if (currentRole && currentText.length > 0) {
+ const text = currentText.join("\n").trim();
+ if (text) messages.push({ role: currentRole, content: text });
+ }
+ currentRole = newRole;
+ // Strip the role prefix
+ const cleaned = trimmed.replace(/^(User|Human|Assistant|AI|Claude|GPT|Bot)\s*:\s*/i, "");
+ currentText = cleaned ? [cleaned] : [];
+ } else if (currentRole) {
+ currentText.push(trimmed);
+ }
+ }
+
+ // Flush last
+ if (currentRole && currentText.length > 0) {
+ const text = currentText.join("\n").trim();
+ if (text) messages.push({ role: currentRole, content: text });
+ }
+
282
+ // Require at least 2 exchanges AND both roles present (prevents false positives)
283
+ const hasUser = messages.some(m => m.role === "user");
284
+ const hasAssistant = messages.some(m => m.role === "assistant");
285
+ return messages.length >= 4 && hasUser && hasAssistant ? messages : null;
286
+ }
287
+
288
+ // =============================================================================
289
+ // Content Extraction
290
+ // =============================================================================
291
+
292
+ function extractContent(content: any): string {
293
+ if (typeof content === "string") return content.trim();
294
+ if (Array.isArray(content)) {
295
+ return content
296
+ .map(item => {
297
+ if (typeof item === "string") return item;
298
+ if (typeof item === "object" && item?.type === "text") return item.text ?? "";
299
+ return "";
300
+ })
301
+ .join(" ")
302
+ .trim();
303
+ }
304
+ if (typeof content === "object" && content) return (content.text ?? "").trim();
305
+ return "";
306
+ }
307
+
308
+ // =============================================================================
309
+ // Chunking — Exchange Pairs
310
+ // =============================================================================
311
+
312
+ const MIN_CHUNK_CHARS = 30;
313
+
314
+ export function chunkConversation(conv: NormalizedConversation): ConversationChunk[] {
315
+ const chunks: ConversationChunk[] = [];
316
+ const { messages, source } = conv;
317
+
318
+ for (let i = 0; i < messages.length; i++) {
319
+ if (messages[i].role !== "user") continue;
320
+
321
+ const userMsg = messages[i].content;
322
+ // Collect ALL consecutive assistant messages (handles split replies)
323
+ const assistantParts: string[] = [];
324
+ while (i + 1 < messages.length && messages[i + 1].role === "assistant") {
325
+ assistantParts.push(messages[i + 1].content);
326
+ i++;
327
+ }
328
+ const assistantMsg = assistantParts.join("\n\n");
329
+
330
+ // Build markdown chunk
331
+ const title = extractExchangeTitle(userMsg, chunks.length + 1);
332
+ const body = formatExchangeMarkdown(userMsg, assistantMsg);
333
+
334
+ if (body.length >= MIN_CHUNK_CHARS) {
335
+ chunks.push({
336
+ title,
337
+ body,
338
+ sourcePath: source,
339
+ chunkIndex: chunks.length,
340
+ });
341
+ }
342
+ }
343
+
344
+ return chunks;
345
+ }
346
+
347
+ function extractExchangeTitle(userMessage: string, index: number): string {
348
+ // Use the first line/sentence of the user message, capped at 80 chars
349
+ const firstLine = userMessage.split("\n")[0].trim();
350
+ if (firstLine.length <= 80) return firstLine;
351
+ return firstLine.slice(0, 77) + "...";
352
+ }
353
+
354
+ function formatExchangeMarkdown(userMsg: string, assistantMsg: string): string {
355
+ const lines: string[] = [];
356
+ lines.push("**User:**", userMsg, "");
357
+ if (assistantMsg) {
358
+ lines.push("**Assistant:**", assistantMsg, "");
359
+ }
360
+ return lines.join("\n");
361
+ }
362
+
363
+ // =============================================================================
364
+ // Directory Scanner
365
+ // =============================================================================
366
+
367
+ export function scanConversationDir(dir: string): string[] {
368
+ const files: string[] = [];
369
+
370
+ function walk(d: string) {
371
+ let entries: string[];
372
+ try { entries = readdirSync(d); } catch { return; }
373
+
374
+ for (const entry of entries) {
375
+ const fullPath = join(d, entry);
376
+ try {
377
+ const stat = statSync(fullPath);
378
+ if (stat.isDirectory()) {
379
+ if (!SKIP_DIRS.has(entry)) walk(fullPath);
380
+ } else if (stat.isFile()) {
381
+ const ext = extname(entry).toLowerCase();
382
+ if (CONVO_EXTENSIONS.has(ext)) files.push(fullPath);
383
+ }
384
+ } catch { continue; }
385
+ }
386
+ }
387
+
388
+ walk(dir);
389
+ return files;
390
+ }
package/src/observer.ts CHANGED
@@ -15,7 +15,7 @@ import { MAX_LLM_GENERATE_TIMEOUT_MS } from "./limits.ts";
15
15
  // =============================================================================
16
16
 
17
17
  export type Observation = {
18
- type: "decision" | "bugfix" | "feature" | "refactor" | "discovery" | "change";
18
+ type: "decision" | "bugfix" | "feature" | "refactor" | "discovery" | "change" | "preference" | "milestone" | "problem";
19
19
  title: string;
20
20
  facts: string[];
21
21
  narrative: string;
@@ -51,7 +51,7 @@ const OBSERVATION_SYSTEM_PROMPT = `You are an observer analyzing a coding sessio
51
51
  For each significant action, decision, or discovery, output an <observation> XML element.
52
52
 
53
53
  <observation>
54
- <type>one of: decision, bugfix, feature, refactor, discovery, change</type>
54
+ <type>one of: decision, bugfix, feature, refactor, discovery, change, preference, milestone, problem</type>
55
55
  <title>Brief descriptive title (max 80 chars)</title>
56
56
  <facts>
57
57
  <fact>Individual atomic fact</fact>
@@ -69,7 +69,12 @@ Rules:
69
69
  - Each fact should be a standalone, atomic piece of information
70
70
  - The narrative should explain WHY something was done, not just WHAT
71
71
  - Only include files that were explicitly mentioned in the transcript
72
- - If no significant observations, output nothing`;
72
+ - If no significant observations, output nothing
73
+
74
+ Type guidance:
75
+ - preference: user expresses a preference, habit, or way of working (e.g., "don't use subagents for this", "I prefer single PRs")
76
+ - milestone: significant completion point, version release, deployment, or phase transition
77
+ - problem: persistent issue, recurring bug, architectural limitation, or unresolved blocker`;
73
78
 
74
79
  const SUMMARY_SYSTEM_PROMPT = `You are a session summarizer. Analyze this coding session transcript and output a structured summary.
75
80
 
@@ -118,6 +123,7 @@ function prepareTranscript(messages: TranscriptMessage[]): string {
118
123
 
119
124
  const VALID_OBSERVATION_TYPES = new Set([
120
125
  "decision", "bugfix", "feature", "refactor", "discovery", "change",
126
+ "preference", "milestone", "problem",
121
127
  ]);
122
128
 
123
129
  const VALID_CONCEPTS = new Set([