npm - clementine-agent - Versions diffs - 1.0.32 → 1.0.33 - Mend

clementine-agent 1.0.32 → 1.0.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/dist/memory/embeddings.d.ts +7 -0
package/dist/memory/embeddings.js +14 -0
package/dist/memory/store.d.ts +6 -0
package/dist/memory/store.js +112 -33
package/dist/tools/shared.d.ts +5 -0
package/package.json +1 -1

package/dist/memory/embeddings.d.ts CHANGED Viewed

@@ -35,4 +35,11 @@ export declare function deserializeEmbedding(buf: Buffer): Float32Array;
  * Check if the embedding system is ready (vocabulary loaded with sufficient words).
  */
 export declare function isReady(): boolean;
+/**
+ * Stable hash of the current vocabulary's word→dimension mapping. When this
+ * changes, previously-stored embedding vectors become silently incorrect
+ * because dimension N now represents a different word. Callers (MemoryStore
+ * backfill) use this hash to detect staleness and invalidate stored vectors.
+ */
+export declare function getVocabHash(): string;
 //# sourceMappingURL=embeddings.d.ts.map

package/dist/memory/embeddings.js CHANGED Viewed

@@ -9,6 +9,7 @@
  * Query-time: embed the query, compute cosine similarity against stored vectors.
  */
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
+import { createHash } from 'node:crypto';
 import path from 'node:path';
 import pino from 'pino';
 import { BASE_DIR } from '../config.js';
@@ -163,6 +164,19 @@ export function isReady() {
     loadVocab();
     return vocabWords.length >= 50; // need at least 50 vocab words
 }
+/**
+ * Stable hash of the current vocabulary's word→dimension mapping. When this
+ * changes, previously-stored embedding vectors become silently incorrect
+ * because dimension N now represents a different word. Callers (MemoryStore
+ * backfill) use this hash to detect staleness and invalidate stored vectors.
+ */
+export function getVocabHash() {
+    loadVocab();
+    if (vocabWords.length === 0)
+        return '';
+    // Order-sensitive: dimension assignment depends on insertion order.
+    return createHash('sha1').update(vocabWords.join('|')).digest('hex').slice(0, 16);
+}
 const STOP_WORDS = new Set([
     'the', 'be', 'to', 'of', 'and', 'in', 'that', 'have', 'it', 'for',
     'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but',

package/dist/memory/store.d.ts CHANGED Viewed

@@ -174,10 +174,15 @@ export declare class MemoryStore {
         salienceThreshold?: number;
         accessLogRetentionDays?: number;
         transcriptRetentionDays?: number;
+        behavioralRetentionDays?: number;
     }): {
         episodicPruned: number;
         accessLogPruned: number;
         transcriptsPruned: number;
+        skillUsagePruned: number;
+        feedbackPruned: number;
+        reflectionsPruned: number;
+        usageLogPruned: number;
     };
     /**
      * Get chunks within a date range, ordered chronologically.
@@ -533,6 +538,7 @@ export declare class MemoryStore {
     buildEmbeddings(): {
         vocabSize: number;
         backfilled: number;
+        invalidated: number;
     };
     /**
      * Delete all chunks, wikilinks, file hash, and access log for a given file.

package/dist/memory/store.js CHANGED Viewed

@@ -10,9 +10,10 @@
  * (single-user, one MCP subprocess handles all writes).
  */
 import { createHash } from 'node:crypto';
-import { appendFileSync, existsSync, mkdirSync, readFileSync, readdirSync, statSync } from 'node:fs';
+import { appendFileSync, existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs';
 import path from 'node:path';
 import Database from 'better-sqlite3';
+import { BASE_DIR } from '../config.js';
 import * as embeddingsModule from './embeddings.js';
 import { chunkFile } from './chunker.js';
 import { mmrRerank } from './mmr.js';
@@ -184,6 +185,24 @@ export class MemoryStore {
         catch {
             // Index already exists
         }
+        // Hot-path indices: every chat turn sorts/filters chunks by updated_at
+        // (recency) and by (agent_slug, updated_at) for agent-scoped recent
+        // context. Without these the queries do full table scans.
+        try {
+            this.conn.exec('CREATE INDEX idx_chunks_updated_at ON chunks(updated_at DESC)');
+        }
+        catch { /* already exists */ }
+        try {
+            this.conn.exec('CREATE INDEX idx_chunks_agent_updated ON chunks(agent_slug, updated_at DESC)');
+        }
+        catch { /* already exists */ }
+        // Embedding filter — searchByEmbedding's base predicate is
+        // `embedding IS NOT NULL`; a partial index turns that into an
+        // index-only scan for the candidate set.
+        try {
+            this.conn.exec('CREATE INDEX idx_chunks_has_embedding ON chunks(id) WHERE embedding IS NOT NULL');
+        }
+        catch { /* already exists */ }
         // Access log table for salience tracking
         this.conn.exec(`
       CREATE TABLE IF NOT EXISTS access_log (
@@ -581,32 +600,35 @@ export class MemoryStore {
                 stats.filesDeleted++;
             }
         }
-        // Process changed/new files
-        for (const filePath of filesToUpdate) {
+        // Process changed/new files inside a single transaction so a 1000-file
+        // sync produces one WAL commit instead of 1000+. Prepared statements are
+        // hoisted out of the loop — better-sqlite3 caches by SQL text anyway, but
+        // the explicit handle avoids re-parsing and makes the intent clear.
+        const insertStmt = this.conn.prepare(`INSERT INTO chunks
+       (source_file, section, content, chunk_type, frontmatter_json, content_hash, category, topic)
+       VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
+        const upsertHashStmt = this.conn.prepare(`INSERT OR REPLACE INTO file_hashes (rel_path, content_hash, last_synced)
+       VALUES (?, ?, datetime('now'))`);
+        const processFile = (filePath) => {
             const rel = path.relative(this.vaultDir, filePath);
             const chunks = chunkFile(filePath, this.vaultDir);
             if (chunks.length === 0)
-                continue;
-            // Delete old chunks for this file
+                return;
             this.deleteFileChunks(rel);
-            // Insert new chunks
-            const insertStmt = this.conn.prepare(`INSERT INTO chunks
-         (source_file, section, content, chunk_type, frontmatter_json, content_hash, category, topic)
-         VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
             for (const chunk of chunks) {
                 insertStmt.run(chunk.sourceFile, chunk.section, chunk.content, chunk.chunkType, chunk.frontmatterJson, chunk.contentHash, chunk.category ?? null, chunk.topic ?? null);
             }
-            // Parse and index wikilinks
             this.indexWikilinks(rel, filePath);
-            // Update file hash
             const bytes = readFileSync(filePath);
             const fileHash = createHash('sha256').update(bytes).digest('hex').slice(0, 16);
-            this.conn
-                .prepare(`INSERT OR REPLACE INTO file_hashes (rel_path, content_hash, last_synced)
-           VALUES (?, ?, datetime('now'))`)
-                .run(rel, fileHash);
+            upsertHashStmt.run(rel, fileHash);
             stats.filesUpdated++;
-        }
+        };
+        const processAll = this.conn.transaction((files) => {
+            for (const f of files)
+                processFile(f);
+        });
+        processAll(filesToUpdate);
         // Count total chunks
         const countRow = this.conn
             .prepare('SELECT COUNT(*) as cnt FROM chunks')
@@ -838,17 +860,20 @@ export class MemoryStore {
      * Scans chunks that have stored embeddings and returns top matches.
      */
     searchByEmbedding(queryVec, limit, agentSlug, strict = false) {
-        const rows = this.conn
-            .prepare(`SELECT id, source_file, section, content, chunk_type, embedding, salience, agent_slug, updated_at, category, topic
-         FROM chunks
-         WHERE embedding IS NOT NULL`)
-            .all();
+        // Push agent-isolation into SQL so we don't deserialize embeddings for
+        // rows we'd immediately reject. Soft isolation (non-strict) still loads
+        // all embeddings because the boost is applied post-scoring, but at
+        // least strict mode no longer scans foreign-agent chunks.
+        let sql = 'SELECT id, source_file, section, content, chunk_type, embedding, salience, agent_slug, updated_at, category, topic FROM chunks WHERE embedding IS NOT NULL';
+        const params = [];
+        if (strict && agentSlug) {
+            sql += ' AND (agent_slug IS NULL OR agent_slug = ?)';
+            params.push(agentSlug);
+        }
+        const rows = this.conn.prepare(sql).all(...params);
         const scored = [];
         for (const row of rows) {
             try {
-                // Hard isolation: skip chunks from other agents (allow own + global)
-                if (strict && agentSlug && row.agent_slug !== null && row.agent_slug !== agentSlug)
-                    continue;
                 const vec = embeddingsModule.deserializeEmbedding(row.embedding);
                 const sim = embeddingsModule.cosineSimilarity(queryVec, vec);
                 if (sim < 0.15)
@@ -1148,6 +1173,10 @@ export class MemoryStore {
         const threshold = opts.salienceThreshold ?? 0.01;
         const accessRetention = opts.accessLogRetentionDays ?? 60;
         const transcriptRetention = opts.transcriptRetentionDays ?? 90;
+        // Behavioral telemetry kept longer than transcripts so the feedback loop
+        // (getFeedbackStats, getBehavioralPatterns, getSkillsToSuppress) has a
+        // wide enough window to aggregate meaningful signal.
+        const behavioralRetention = opts.behavioralRetentionDays ?? 180;
         // Prune stale episodic chunks (not vault-sourced content)
         const episodicResult = this.conn
             .prepare(`DELETE FROM chunks
@@ -1167,10 +1196,30 @@ export class MemoryStore {
             .prepare(`DELETE FROM transcripts
          WHERE created_at < datetime('now', ?)`)
             .run(`-${transcriptRetention} days`);
+        // Behavioral telemetry pruning — these tables were previously unbounded.
+        // Each is append-only, so a rolling window is safe; aggregate stats
+        // consume the window directly rather than historical totals.
+        const skillUsageResult = this.conn
+            .prepare(`DELETE FROM skill_usage WHERE retrieved_at < datetime('now', ?)`)
+            .run(`-${behavioralRetention} days`);
+        const feedbackResult = this.conn
+            .prepare(`DELETE FROM feedback WHERE created_at < datetime('now', ?)`)
+            .run(`-${behavioralRetention} days`);
+        const reflectionsResult = this.conn
+            .prepare(`DELETE FROM session_reflections WHERE created_at < datetime('now', ?)`)
+            .run(`-${behavioralRetention} days`);
+        // Usage log is denser (per-exchange) — keep a shorter window.
+        const usageResult = this.conn
+            .prepare(`DELETE FROM usage_log WHERE created_at < datetime('now', ?)`)
+            .run(`-${Math.min(behavioralRetention, 90)} days`);
         return {
             episodicPruned: episodicResult.changes,
             accessLogPruned: accessResult.changes,
             transcriptsPruned: transcriptResult.changes,
+            skillUsagePruned: skillUsageResult.changes,
+            feedbackPruned: feedbackResult.changes,
+            reflectionsPruned: reflectionsResult.changes,
+            usageLogPruned: usageResult.changes,
         };
     }
     // ── Timeline Query ─────────────────────────────────────────────
@@ -2045,25 +2094,55 @@ export class MemoryStore {
             .prepare('SELECT id, content FROM chunks')
             .all();
         if (rows.length === 0)
-            return { vocabSize: 0, backfilled: 0 };
+            return { vocabSize: 0, backfilled: 0, invalidated: 0 };
+        // Capture prior vocab hash BEFORE rebuild. If buildVocab produces a
+        // different word→dimension mapping, previously-stored embedding vectors
+        // become silently wrong (dimension N now represents a different word).
+        const hashFile = path.join(BASE_DIR, '.embedding-vocab.hash');
+        let priorHash = '';
+        try {
+            if (existsSync(hashFile))
+                priorHash = readFileSync(hashFile, 'utf-8').trim();
+        }
+        catch { /* first run */ }
         // Build vocabulary from entire corpus (including consolidated summaries)
         embeddingsModule.buildVocab(rows.map((r) => r.content));
         if (!embeddingsModule.isReady())
-            return { vocabSize: 0, backfilled: 0 };
+            return { vocabSize: 0, backfilled: 0, invalidated: 0 };
+        // If the vocab shifted, invalidate every stored vector so they re-embed
+        // against the new word→dim mapping. Without this, old vectors silently
+        // mismatch query vectors and cosine similarity returns nonsense.
+        const newHash = embeddingsModule.getVocabHash();
+        let invalidated = 0;
+        if (priorHash && priorHash !== newHash) {
+            const res = this.conn.prepare('UPDATE chunks SET embedding = NULL WHERE embedding IS NOT NULL').run();
+            invalidated = res.changes;
+            // Count is returned in the result object — callers (maintenance cycle)
+            // log it there. No local logger in this file to avoid the import.
+        }
+        try {
+            writeFileSync(hashFile, newHash);
+        }
+        catch { /* non-fatal */ }
         // Backfill embeddings for all chunks that don't have one
         const missing = this.conn
             .prepare('SELECT id, content FROM chunks WHERE embedding IS NULL')
             .all();
         const updateStmt = this.conn.prepare('UPDATE chunks SET embedding = ? WHERE id = ?');
         let backfilled = 0;
-        for (const row of missing) {
-            const vec = embeddingsModule.embed(row.content);
-            if (vec) {
-                updateStmt.run(embeddingsModule.serializeEmbedding(vec), row.id);
-                backfilled++;
+        // Wrap backfill in a transaction — potentially thousands of UPDATEs
+        // per vocab shift, and a single WAL commit is dramatically faster.
+        const backfillAll = this.conn.transaction((items) => {
+            for (const row of items) {
+                const vec = embeddingsModule.embed(row.content);
+                if (vec) {
+                    updateStmt.run(embeddingsModule.serializeEmbedding(vec), row.id);
+                    backfilled++;
+                }
             }
-        }
-        return { vocabSize: rows.length, backfilled };
+        });
+        backfillAll(missing);
+        return { vocabSize: rows.length, backfilled, invalidated };
     }
     // ── Helpers ───────────────────────────────────────────────────────
     /**

package/dist/tools/shared.d.ts CHANGED Viewed

@@ -86,10 +86,15 @@ export type MemoryStoreType = {
         salienceThreshold?: number;
         accessLogRetentionDays?: number;
         transcriptRetentionDays?: number;
+        behavioralRetentionDays?: number;
     }): {
         episodicPruned: number;
         accessLogPruned: number;
         transcriptsPruned: number;
+        skillUsagePruned: number;
+        feedbackPruned: number;
+        reflectionsPruned: number;
+        usageLogPruned: number;
     };
     checkDuplicate(content: string, sourceFile?: string): {
         isDuplicate: boolean;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clementine-agent",
-  "version": "1.0.32",
+  "version": "1.0.33",
   "description": "Clementine — Personal AI Assistant (TypeScript)",
   "type": "module",
   "main": "dist/index.js",