persyst-mcp 1.0.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/git.js CHANGED
@@ -1,37 +1,38 @@
1
1
  /**
2
- * git.js — Git Commit Ingestion
2
+ * git.js — Git Commit Ingestion & Analysis
3
3
  *
4
4
  * Reads git log from a repository and converts commits into memories.
5
- * Useful for giving coding agents context about a project's history.
5
+ * Performs commit categorization, file diff analysis, and imports notes.
6
6
  *
7
- * Each commit becomes a memory like:
8
- * "[abc1234] Fix login bug by John on 2024-01-15"
9
- *
10
- * Deduplicates by commit hash so you can ingest safely multiple times.
7
+ * IMPORTANT: Uses async execFile instead of execSync to avoid blocking
8
+ * the Node.js event loop during git operations (Bug 4 fix).
11
9
  */
12
10
 
13
- import { execSync } from 'child_process';
11
+ import { execFile } from 'child_process';
12
+ import { promisify } from 'util';
13
+
14
+ const execFileAsync = promisify(execFile);
14
15
 
15
16
  /**
16
17
  * Read the N most recent git commits from a repository.
17
18
  *
18
19
  * @param {string} repoPath - Absolute path to the git repo
19
20
  * @param {number} count - Number of commits to read (default: 20)
20
- * @returns {Array<{hash: string, message: string, author: string, date: string, fullText: string}>}
21
+ * @returns {Promise<Array<{hash: string, message: string, author: string, date: string, fullText: string, files: string[], importance: number}>>}
21
22
  */
22
- export function getRecentCommits(repoPath, count = 20) {
23
+ export async function getRecentCommits(repoPath, count = 20) {
23
24
  try {
24
25
  // Use a delimiter to split commits reliably
25
26
  const DELIM = '---PERSYST-COMMIT---';
26
27
  const format = `${DELIM}%n%H%n%an%n%ai%n%s%n%b`;
27
28
 
28
- const output = execSync(
29
- `git log -n ${count} --pretty=format:"${format}"`,
29
+ const { stdout: output } = await execFileAsync(
30
+ 'git',
31
+ ['log', `-n`, `${count}`, `--pretty=format:${format}`],
30
32
  {
31
33
  cwd: repoPath,
32
34
  encoding: 'utf-8',
33
35
  timeout: 10000, // 10s timeout
34
- stdio: ['pipe', 'pipe', 'pipe'] // Suppress stderr
35
36
  }
36
37
  );
37
38
 
@@ -49,17 +50,37 @@ export function getRecentCommits(repoPath, count = 20) {
49
50
  const subject = lines[3].trim();
50
51
  const body = lines.slice(4).join(' ').trim();
51
52
 
53
+ // Fetch git notes if available (represents PR metadata)
54
+ const notes = await getGitNotes(repoPath, hash);
55
+
52
56
  // Build a readable memory string
53
- const fullText = body
57
+ let fullText = body
54
58
  ? `[${hash.slice(0, 7)}] ${subject} — by ${author} on ${date}. ${body}`
55
59
  : `[${hash.slice(0, 7)}] ${subject} — by ${author} on ${date}`;
56
60
 
57
- commits.push({ hash, message: subject, author, date, fullText });
61
+ if (notes) {
62
+ fullText += ` [PR Notes] ${notes}`;
63
+ }
64
+
65
+ // Fetch files touched
66
+ const files = await getCommitFiles(repoPath, hash);
67
+
68
+ // Classify importance based on message
69
+ const classification = classifyCommit(subject);
70
+
71
+ commits.push({
72
+ hash,
73
+ message: subject,
74
+ author,
75
+ date,
76
+ fullText,
77
+ files,
78
+ importance: classification.importance
79
+ });
58
80
  }
59
81
 
60
82
  return commits;
61
83
  } catch (err) {
62
- // Not a git repo, or git not installed
63
84
  const message = err.message || String(err);
64
85
  if (message.includes('not a git repository')) {
65
86
  throw new Error(`Not a git repository: ${repoPath}`);
@@ -77,17 +98,17 @@ export function getRecentCommits(repoPath, count = 20) {
77
98
  *
78
99
  * @param {string} repoPath - Absolute path to the git repo
79
100
  * @param {string} hash - Full commit hash
80
- * @returns {string[]} List of changed file paths
101
+ * @returns {Promise<string[]>} List of changed file paths
81
102
  */
82
- export function getCommitFiles(repoPath, hash) {
103
+ export async function getCommitFiles(repoPath, hash) {
83
104
  try {
84
- const output = execSync(
85
- `git diff-tree --no-commit-id --name-only -r ${hash}`,
105
+ const { stdout: output } = await execFileAsync(
106
+ 'git',
107
+ ['diff-tree', '--no-commit-id', '--name-only', '-r', hash],
86
108
  {
87
109
  cwd: repoPath,
88
110
  encoding: 'utf-8',
89
111
  timeout: 5000,
90
- stdio: ['pipe', 'pipe', 'pipe']
91
112
  }
92
113
  );
93
114
  return output.trim().split('\n').filter(Boolean);
@@ -95,3 +116,49 @@ export function getCommitFiles(repoPath, hash) {
95
116
  return [];
96
117
  }
97
118
  }
119
+
120
/**
 * Look up the git note attached to a commit (treated by this module as PR
 * metadata or other free-form annotations).
 *
 * @param {string} repoPath - Directory in which the git command is run
 * @param {string} hash - Commit whose note should be read
 * @returns {Promise<string>} The trimmed note text, or '' when the git call
 *   fails for any reason
 */
export async function getGitNotes(repoPath, hash) {
  const gitArgs = ['notes', 'show', hash];
  const execOptions = {
    cwd: repoPath,
    encoding: 'utf-8',
    timeout: 3000,
  };

  try {
    const result = await execFileAsync('git', gitArgs, execOptions);
    return result.stdout.trim();
  } catch {
    // Best effort: every failure is reported as "no notes".
    return '';
  }
}
139
+
140
/**
 * Categorize a commit by the type prefix of its subject line and assign an
 * importance weight.
 *
 * The prefix is parsed in Conventional-Commits form: `type:`, `type(scope):`
 * and the breaking-change marker `type!:` / `type(scope)!:` are all
 * recognized, case-insensitively and ignoring surrounding whitespace.
 *
 * - feat / fix / refactor / breaking / decision, or any type carrying the
 *   `!` marker            -> { type: 'architectural', importance: 0.9 }
 * - chore / docs / test / style / ci -> { type: 'chore', importance: 0.4 }
 * - anything else (including a missing subject)
 *                         -> { type: 'other', importance: 0.6 }
 *
 * @param {string} subject - Commit subject line
 * @returns {{type: string, importance: number}} Category and weight
 */
export function classifyCommit(subject) {
  const architecturalTypes = ['feat', 'fix', 'refactor', 'breaking', 'decision'];
  const choreTypes = ['chore', 'docs', 'test', 'style', 'ci'];

  // A missing subject is classified as "other" instead of throwing.
  const s = (subject == null ? '' : String(subject)).toLowerCase().trim();

  // Group 1: type; optional "(scope)"; group 2: "!" breaking marker.
  const match = /^([a-z]+)(?:\([^)]*\))?(!)?:/.exec(s);
  if (match) {
    const [, type, breakingMarker] = match;
    if (breakingMarker || architecturalTypes.includes(type)) {
      return { type: 'architectural', importance: 0.9 };
    }
    if (choreTypes.includes(type)) {
      return { type: 'chore', importance: 0.4 };
    }
  }
  return { type: 'other', importance: 0.6 };
}
package/src/search.js CHANGED
@@ -1,49 +1,43 @@
1
1
  /**
2
- * search.js — Hybrid Search Engine
2
+ * search.js — Hybrid Search & Context Optimization Engine
3
3
  *
4
- * Combines two search strategies for best results:
5
- *
6
- * 1. KEYWORD SEARCH (FTS5 + BM25)
7
- * Finds exact word matches. Fast. "React" finds "React".
8
- *
9
- * 2. SEMANTIC SEARCH (sqlite-vec + embeddings)
10
- * → Finds by meaning. "dark mode" matches "night theme".
11
- *
12
- * 3. HYBRID = keyword + semantic merged
13
- * → Keyword matches get a +0.2 score boost on top of semantic score.
14
- * → Best of both worlds.
4
+ * Combines keyword and semantic searches, integrates temporal decay,
5
+ * applies agent reputation scores, generates cryptographic search attestations,
6
+ * builds graph-hopped optimized LLM context prompts, and applies MMR
7
+ * for diverse result retrieval.
15
8
  */
16
9
 
17
- import { generateEmbedding } from './embeddings.js';
18
- import {
10
+ import db, {
19
11
  searchKeyword,
20
12
  searchVector,
21
13
  getMemoryById,
22
- boostMemory
14
+ boostMemory,
15
+ getProvenance,
16
+ getMemoriesByEntity
23
17
  } from './database.js';
24
-
25
- // ============================================================
26
- // HYBRID SEARCH (the main export)
27
- // ============================================================
18
+ import { generateEmbedding } from './embeddings.js';
19
+ import { createAttestation } from './attestation.js';
20
+ import { searchCache, LRUCache } from './cache.js';
28
21
 
29
22
  /**
30
23
  * Search memories using both keyword and semantic strategies.
31
- *
32
- * How it works:
33
- * 1. Run FTS5 keyword search → get matching memory IDs
34
- * 2. Run vector semantic search → get memories ranked by meaning
35
- * 3. If a memory appears in BOTH, boost its score by +0.2
36
- * 4. Sort by combined score, return top N
24
+ * Results are cached in the LRU cache for repeated queries.
37
25
  *
38
26
  * @param {string} queryText - What to search for
39
27
  * @param {number} limit - Max results to return (default: 5)
40
- * @returns {Promise<Array>} Ranked search results with scores
41
- *
42
- * @example
43
- * const results = await searchHybrid("night theme", 5);
44
- * // Will find memories about "dark mode" via semantic match
28
+ * @param {string|null} agentId - Identifying string for the querying agent
29
+ * @param {string|null} sessionId - Session identifier
30
+ * @returns {Promise<Array>} Ranked search results (with .attestation property attached)
45
31
  */
46
- export async function searchHybrid(queryText, limit = 5) {
32
+ export async function searchHybrid(queryText, limit = 5, agentId = null, sessionId = null) {
33
+ // --- Check LRU cache first (Feature 1) ---
34
+ const cacheKey = LRUCache.key(queryText, limit);
35
+ const cached = searchCache.get(cacheKey);
36
+ if (cached) {
37
+ console.error(`[persyst-cache] Cache HIT for query: "${queryText.slice(0, 50)}..."`);
38
+ return cached;
39
+ }
40
+
47
41
  // --- Step 1: Keyword search (fast, exact matches) ---
48
42
  const keywordHits = searchKeyword(queryText, limit * 2);
49
43
  const keywordIds = new Set(keywordHits.map(r => r.id));
@@ -56,17 +50,22 @@ export async function searchHybrid(queryText, limit = 5) {
56
50
  id: r.rowid,
57
51
  distance: r.distance,
58
52
  // Convert L2 distance to 0-1 similarity score
59
- // For normalized vectors: cosine_sim = 1 - (L2_distance² / 2)
60
53
  similarity: Math.max(0, 1 - (r.distance * r.distance) / 2)
61
54
  }));
62
55
 
63
56
  // --- Step 3: Merge results with keyword boost ---
64
- const combined = semanticResults.map(r => ({
65
- id: r.id,
66
- similarity: r.similarity,
67
- hybrid_score: r.similarity + (keywordIds.has(r.id) ? 0.2 : 0),
68
- keyword_match: keywordIds.has(r.id)
69
- }));
57
+ const combined = semanticResults
58
+ .map(r => {
59
+ const isKeywordMatch = keywordIds.has(r.id);
60
+ return {
61
+ id: r.id,
62
+ similarity: r.similarity,
63
+ hybrid_score: r.similarity + (isKeywordMatch ? 0.2 : 0),
64
+ keyword_match: isKeywordMatch
65
+ };
66
+ })
67
+ // Filter out low similarity semantic matches if they have no keyword match (threshold 0.35)
68
+ .filter(r => r.keyword_match || r.similarity >= 0.35);
70
69
 
71
70
  // Add keyword-only hits that semantic search missed
72
71
  const semanticIds = new Set(semanticResults.map(r => r.id));
@@ -81,29 +80,356 @@ export async function searchHybrid(queryText, limit = 5) {
81
80
  }
82
81
  }
83
82
 
84
- // --- Step 4: Sort by score, fetch full data, return top N ---
85
- combined.sort((a, b) => b.hybrid_score - a.hybrid_score);
86
- const topResults = combined.slice(0, limit);
87
-
88
- const results = topResults
83
+ // --- Step 4: Fetch full details, apply reputation adjust, sort and return top N ---
84
+ const finalResults = combined
89
85
  .map(r => {
90
86
  const memory = getMemoryById(r.id);
91
- if (!memory) return null; // Memory was deleted between search and fetch
87
+ if (!memory) return null; // Memory was archived or deleted
92
88
 
93
- // Boost importance since this memory was useful
89
+ // Boost memory access metrics
94
90
  boostMemory(r.id);
95
91
 
92
+ // Fetch reputation stats for weighting
93
+ let reputationScore = 1.0;
94
+ let reputationWarning = false;
95
+ const prov = memory.provenance;
96
+ if (prov && prov.source_type === 'agent' && prov.source_id) {
97
+ const agentRow = db.prepare('SELECT reputation_score FROM agent_stats WHERE agent_id = ?').get(prov.source_id);
98
+ if (agentRow) {
99
+ reputationScore = agentRow.reputation_score;
100
+ if (reputationScore < 0.5) {
101
+ reputationWarning = true;
102
+ }
103
+ }
104
+ }
105
+
106
+ // Final score formula: base_score * agent_reputation
107
+ const finalScore = r.hybrid_score * reputationScore;
108
+
96
109
  return {
97
110
  id: memory.id,
98
111
  content: memory.content,
99
112
  importance_score: memory.importance_score,
100
113
  created_at: memory.created_at,
114
+ last_accessed: memory.last_accessed,
101
115
  similarity: r.similarity.toFixed(4),
102
- hybrid_score: r.hybrid_score.toFixed(4),
103
- keyword_match: r.keyword_match
116
+ hybrid_score: finalScore.toFixed(4),
117
+ keyword_match: r.keyword_match,
118
+ reputation_warning: reputationWarning,
119
+ provenance: prov
104
120
  };
105
121
  })
106
- .filter(Boolean); // Remove nulls from deleted memories
122
+ .filter(Boolean);
123
+
124
+ // Sort by final score descending
125
+ finalResults.sort((a, b) => parseFloat(b.hybrid_score) - parseFloat(a.hybrid_score));
126
+
127
+ // --- Step 5: Apply MMR for diverse retrieval (Feature 3) ---
128
+ const mmrResults = applyMMR(finalResults, limit);
129
+
130
+ // Generate cryptographic attestation for audit trails
131
+ const attestation = createAttestation(queryText, mmrResults, agentId, sessionId);
132
+
133
+ // Attach attestation object directly to the array to preserve compatibility with existing tests
134
+ mmrResults.attestation = attestation;
135
+
136
+ // --- Store in LRU cache (Feature 1) ---
137
+ searchCache.set(cacheKey, mmrResults);
138
+
139
+ return mmrResults;
140
+ }
141
+
142
/**
 * Apply Maximal Marginal Relevance (MMR) re-ranking for diverse results.
 *
 * MMR balances relevance with diversity by penalizing candidates that are
 * too similar to already-selected results. Similarity between two results is
 * the word-level Jaccard similarity of their `content`.
 *
 * Behavior at the edges:
 * - When there are no more candidates than `limit`, the input array itself
 *   is returned unchanged (callers attach properties to the returned array).
 * - When `limit` is not a positive number, an empty array is returned.
 * - A `hybrid_score` that does not parse to a finite number counts as 0
 *   relevance instead of ending the selection early.
 *
 * @param {Array} candidates - Scored search results, best first; each needs
 *   `hybrid_score` (number or numeric string) and `content`
 * @param {number} limit - Max results to return
 * @param {number} lambda - Trade-off parameter (0.7 = 70% relevance, 30% diversity)
 * @returns {Array} MMR-reranked results (at most `limit` items)
 */
function applyMMR(candidates, limit, lambda = 0.7) {
  if (candidates.length <= limit) return candidates;
  if (!(limit > 0)) return [];

  // Always pick the top-scored result first.
  let newest = candidates[0];
  const selected = [newest];

  // Each pool entry carries its running max similarity to the selected set,
  // so every round only compares against the most recently selected item.
  const pool = candidates.slice(1).map((item) => {
    const parsed = parseFloat(item.hybrid_score);
    return {
      item,
      relevance: Number.isFinite(parsed) ? parsed : 0,
      maxSim: 0,
    };
  });

  while (selected.length < limit && pool.length > 0) {
    let bestIdx = -1;
    let bestScore = -Infinity;

    for (let i = 0; i < pool.length; i++) {
      const entry = pool[i];
      const sim = jaccardSimilarity(entry.item.content, newest.content);
      if (sim > entry.maxSim) entry.maxSim = sim;

      // MMR score = λ * relevance - (1 - λ) * max_similarity_to_selected
      const score = lambda * entry.relevance - (1 - lambda) * entry.maxSim;
      if (score > bestScore) {
        bestScore = score;
        bestIdx = i;
      }
    }

    if (bestIdx < 0) break;
    newest = pool.splice(bestIdx, 1)[0].item;
    selected.push(newest);
  }

  return selected;
}
196
+
197
/**
 * Compute the Jaccard similarity between two text strings.
 *
 * Each text is lower-cased and split on whitespace into a set of words; the
 * result is |intersection| / |union| of the two sets. Empty tokens produced
 * by leading/trailing whitespace are discarded, so blank or missing text has
 * no words and yields 0 rather than matching other blank text.
 *
 * @param {string} a - First text
 * @param {string} b - Second text
 * @returns {number} Similarity score between 0 and 1
 */
function jaccardSimilarity(a, b) {
  const toWordSet = (text) =>
    new Set(
      (text == null ? '' : String(text))
        .toLowerCase()
        .split(/\s+/)
        .filter(Boolean)
    );

  const wordsA = toWordSet(a);
  const wordsB = toWordSet(b);

  let intersection = 0;
  for (const word of wordsA) {
    if (wordsB.has(word)) intersection++;
  }

  const union = wordsA.size + wordsB.size - intersection;
  return union === 0 ? 0 : intersection / union;
}
217
+
218
/**
 * Build a token-budgeted context string for LLM injection by combining
 * hybrid search hits with memories reachable through the knowledge graph.
 *
 * Steps:
 *  1. Run `searchHybrid` for the top 20 memories.
 *  2. For each hit, follow `edges` rows linking it to entities and pull in
 *     the other memories of those entities ("graph hop") at half the hit's
 *     score. A memory already collected keeps its existing entry.
 *  3. Apply temporal decay `exp(-0.01 * hours_since_last_accessed)` to every
 *     candidate, and agent reputation weighting to hop candidates only.
 *     Search hits are not re-weighted here because `searchHybrid` already
 *     multiplies their `hybrid_score` by the source agent's reputation.
 *  4. Sort by score and greedily accept candidates whose estimated token
 *     cost still fits into `maxTokens` (candidates that do not fit are
 *     skipped, later smaller ones may still be accepted).
 *  5. Format the accepted memories and create an attestation (skipped when
 *     nothing was accepted, to avoid audit noise).
 *
 * @param {string} queryText - User's query
 * @param {number} maxTokens - Hard limit of tokens for context prompt
 * @param {string|null} agentId - Querying agent identifier
 * @param {string|null} sessionId - Current session ID
 * @returns {Promise<{context: string, memories: Array, attestation: *}>}
 *   The formatted context, the accepted candidates, and the attestation
 *   (null when no memory was accepted)
 */
export async function getOptimizedContext(queryText, maxTokens, agentId = null, sessionId = null) {
  // 1. Run hybrid search to fetch top 20 memories
  const searchHits = await searchHybrid(queryText, 20, agentId, sessionId);
  const candidates = new Map();

  // Prepared once per call instead of once per loop iteration.
  const edgesStmt = db.prepare(`
    SELECT * FROM edges
    WHERE (source_id = ? AND source_type = 'memory')
       OR (target_id = ? AND target_type = 'memory')
  `);
  const reputationStmt = db.prepare('SELECT reputation_score FROM agent_stats WHERE agent_id = ?');

  for (const hit of searchHits) {
    const hitScore = parseFloat(hit.hybrid_score);

    candidates.set(hit.id, {
      id: hit.id,
      content: hit.content,
      importance_score: hit.importance_score,
      created_at: hit.created_at,
      last_accessed: hit.last_accessed,
      score: hitScore,
      provenance: hit.provenance,
      source: 'search'
    });

    // 2. Perform Graph Hop: collect entities on the far side of each edge
    const edges = edgesStmt.all(hit.id, hit.id);
    const entityIds = [];
    for (const edge of edges) {
      if (edge.source_type === 'entity') entityIds.push(edge.source_id);
      if (edge.target_type === 'entity') entityIds.push(edge.target_id);
    }

    for (const entId of entityIds) {
      for (const other of getMemoriesByEntity(entId)) {
        if (other.id === hit.id) continue;
        if (candidates.has(other.id)) continue;

        candidates.set(other.id, {
          id: other.id,
          content: other.content,
          importance_score: other.importance_score,
          created_at: other.created_at,
          last_accessed: other.last_accessed,
          score: hitScore * 0.5, // 50% graph-hop penalty
          provenance: getProvenance(other.id),
          source: 'hop'
        });
      }
    }
  }

  // Reputation of the agent that produced a memory; 1.0 when the memory has
  // no agent provenance or the agent has no stats row. Cached per agent.
  const reputationByAgent = new Map();
  const reputationFor = (prov) => {
    if (!(prov && prov.source_type === 'agent' && prov.source_id)) return 1.0;
    if (!reputationByAgent.has(prov.source_id)) {
      const agentRow = reputationStmt.get(prov.source_id);
      reputationByAgent.set(prov.source_id, agentRow ? agentRow.reputation_score : 1.0);
    }
    return reputationByAgent.get(prov.source_id);
  };

  // 3. Apply Scoring Adjustments
  const now = Math.floor(Date.now() / 1000);
  const list = Array.from(candidates.values());

  for (const c of list) {
    // 3a. Temporal decay: score *= exp(-0.01 * hours_since_accessed).
    // A missing/non-numeric last_accessed applies no decay instead of
    // turning the score into NaN (which would also break the sort below).
    const elapsedSeconds = now - c.last_accessed;
    const hours = Number.isFinite(elapsedSeconds) ? Math.max(0, elapsedSeconds / 3600) : 0;
    c.score *= Math.exp(-0.01 * hours);

    // 3b. Agent reputation weighting. Search hits already carry it inside
    // hybrid_score, so only hop candidates are weighted here.
    if (c.source === 'hop') {
      c.score *= reputationFor(c.provenance);
    }
  }

  // 4. Sort candidates
  list.sort((a, b) => b.score - a.score);

  // 5. Compress context to fit maxTokens
  let currentTokens = 0;
  const accepted = [];

  for (const c of list) {
    // Heuristic: ~4 characters per token + format headers (~15 tokens)
    const estimatedTokens = Math.max(1, Math.ceil(c.content.length / 4) + 15);
    if (currentTokens + estimatedTokens > maxTokens) {
      continue;
    }
    currentTokens += estimatedTokens;
    accepted.push(c);
  }

  // 6. Format LLM injection context string
  let context = '=== RETRIEVED AGENT MEMORY CONTEXT ===\n';
  if (accepted.length === 0) {
    context += 'No relevant memories retrieved.\n';
  } else {
    for (const a of accepted) {
      let sourceTag = 'Source: manual';
      if (a.provenance) {
        sourceTag = `Source: ${a.provenance.source_type}${a.provenance.source_id ? ` (${a.provenance.source_id})` : ''}`;
      }
      context += `[Memory #${a.id}] (Score: ${a.score.toFixed(4)}, ${sourceTag})\n${a.content}\n---\n`;
    }
  }
  context += '=== END OF CONTEXT ===';

  // Skip attestation when no results to avoid audit noise
  let attestation = null;
  if (accepted.length > 0) {
    attestation = createAttestation(queryText, accepted, agentId, sessionId);
  }

  return {
    context,
    memories: accepted,
    attestation
  };
}
341
+
342
/**
 * Consolidate near-duplicate memories.
 *
 * For every active memory (valid_until IS NULL) the 10 nearest vectors are
 * looked up in `memories_vec`; neighbors that are still active and whose
 * similarity `1 - distance² / 2` exceeds 0.85 form a group with it. Within a
 * group the memory with the highest importance_score becomes canonical: its
 * content is replaced by the merged, de-duplicated contents of the group and
 * its embedding is regenerated. Every other member is archived (valid_until
 * set) and linked to the canonical memory through a `contradictions` row.
 *
 * The row updates for one group run inside a single transaction. The
 * embedding is generated before that transaction because it is async, and
 * the vector row is replaced after it.
 *
 * @returns {Promise<{success: boolean, consolidated_groups: number,
 *   details: Array<{canonical_id: *, merged_content: string, archived_ids: Array}>}>}
 */
export async function consolidateMemories() {
  const activeMemories = db.prepare('SELECT * FROM memories WHERE valid_until IS NULL').all();
  const consolidated = [];
  const visited = new Set();

  if (activeMemories.length === 0) {
    return {
      success: true,
      consolidated_groups: 0,
      details: consolidated
    };
  }

  // Statements are prepared once and reused across all iterations.
  const updateCanonicalStmt = db.prepare('UPDATE memories SET content = ?, last_accessed = unixepoch() WHERE id = ?');
  const archiveStmt = db.prepare('UPDATE memories SET valid_until = unixepoch() WHERE id = ?');
  const linkStmt = db.prepare('INSERT INTO contradictions (old_memory_id, new_memory_id, resolution_reason) VALUES (?, ?, ?)');
  const embeddingStmt = db.prepare('SELECT embedding FROM memories_vec WHERE rowid = ?');
  const neighborsStmt = db.prepare(`
    SELECT rowid AS id, distance
    FROM memories_vec
    WHERE embedding MATCH ?
      AND k = 10
  `);
  const activeByIdStmt = db.prepare('SELECT * FROM memories WHERE id = ? AND valid_until IS NULL');
  const deleteVecStmt = db.prepare('DELETE FROM memories_vec WHERE rowid = ?');
  const insertVecStmt = db.prepare('INSERT INTO memories_vec (rowid, embedding) VALUES (?, ?)');

  // Atomic DB mutation for one group: rewrite canonical, archive duplicates.
  const archiveAndMerge = db.transaction((canonicalId, mergedContent, dupIds) => {
    updateCanonicalStmt.run(mergedContent, canonicalId);
    for (const dupId of dupIds) {
      archiveStmt.run(dupId);
      linkStmt.run(dupId, canonicalId, `Consolidated into canonical memory #${canonicalId}`);
    }
  });

  // Join distinct contents into one text. A ". " separator is used unless
  // the text so far already ends in sentence punctuation, so no doubled
  // periods are produced and the contents themselves (e.g. "../src", "...")
  // are never rewritten.
  const mergeContents = (pieces) => {
    let merged = '';
    for (const piece of pieces) {
      if (!piece) continue;
      if (merged === '') {
        merged = piece;
      } else {
        merged += (/[.!?]$/.test(merged) ? ' ' : '. ') + piece;
      }
    }
    return merged;
  };

  for (const mem of activeMemories) {
    if (visited.has(mem.id)) continue;

    // Search for similar memories
    const embedding = embeddingStmt.get(mem.id);
    if (!embedding) continue;

    // sqlite-vec similarity search
    const hits = neighborsStmt.all(embedding.embedding);

    const duplicates = [];
    for (const hit of hits) {
      const hitId = Number(hit.id);
      if (hitId === mem.id) continue;
      if (visited.has(hitId)) continue;

      const sim = Math.max(0, 1 - (hit.distance * hit.distance) / 2);
      if (sim > 0.85) {
        // Re-check the row: the neighbor may have been archived already.
        const dupMemory = activeByIdStmt.get(hitId);
        if (dupMemory) {
          duplicates.push(dupMemory);
        }
      }
    }

    if (duplicates.length === 0) continue;

    // Group found: the most important member becomes canonical.
    const allMemoriesInGroup = [mem, ...duplicates];
    allMemoriesInGroup.sort((a, b) => b.importance_score - a.importance_score);
    const canonical = allMemoriesInGroup[0];
    const dupesToArchive = allMemoriesInGroup.slice(1);
    const archivedIds = dupesToArchive.map(d => d.id);

    // Merge contents (identical texts are kept once)
    const contents = allMemoriesInGroup.map(m => m.content.trim());
    const mergedContent = mergeContents(Array.from(new Set(contents)));

    // Generate new embedding OUTSIDE the transaction (async operation)
    const newEmbedding = await generateEmbedding(mergedContent);

    // Run atomic DB transaction for all row mutations
    archiveAndMerge(canonical.id, mergedContent, archivedIds);

    // Replace the vector row. Only the bytes covered by the embedding view
    // are copied, in case it is a slice of a larger ArrayBuffer.
    // NOTE(review): assumes generateEmbedding returns a typed array — confirm.
    deleteVecStmt.run(canonical.id);
    insertVecStmt.run(
      BigInt(canonical.id),
      Buffer.from(newEmbedding.buffer, newEmbedding.byteOffset, newEmbedding.byteLength)
    );

    for (const dupId of archivedIds) {
      visited.add(dupId);
    }
    visited.add(canonical.id);

    consolidated.push({
      canonical_id: canonical.id,
      merged_content: mergedContent,
      archived_ids: archivedIds
    });
  }

  return {
    success: true,
    consolidated_groups: consolidated.length,
    details: consolidated
  };
}
package/src/server.js CHANGED
@@ -3,6 +3,7 @@
3
3
  *
4
4
  * Creates the MCP server, registers all tools, and connects
5
5
  * via stdio transport (the standard MCP communication method).
6
+ * Sets up hourly temporal decay and daily consolidation background tasks.
6
7
  *
7
8
  * IMPORTANT: Never write to stdout — it's reserved for MCP protocol.
8
9
  * All logging goes to stderr via console.error().
@@ -10,8 +11,9 @@
10
11
 
11
12
  import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
12
13
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
13
- import { registerTools } from './tools.js';
14
+ import { registerTools, cleanupWatchers } from './tools.js';
14
15
  import { applyTemporalDecay, closeDatabase } from './database.js';
16
+ import { consolidateMemories } from './search.js';
15
17
 
16
18
  /**
17
19
  * Start the Persyst MCP server.
@@ -21,7 +23,7 @@ export async function startServer() {
21
23
  // --- Create MCP server ---
22
24
  const server = new McpServer({
23
25
  name: 'persyst',
24
- version: '1.0.1'
26
+ version: '2.0.0'
25
27
  });
26
28
 
27
29
  // --- Register all tools ---
@@ -32,10 +34,24 @@ export async function startServer() {
32
34
  // Runs every hour: reduces importance of memories not accessed in 7+ days
33
35
  const decayTimer = setInterval(applyTemporalDecay, 3600000);
34
36
 
35
- // --- Graceful shutdown ---
37
+ // --- Start daily consolidation sweep ---
38
+ // Runs every 24 hours: merges similar memories (similarity > 0.85)
39
+ const consolidationTimer = setInterval(async () => {
40
+ console.error('[persyst] Running scheduled daily memory consolidation sweep...');
41
+ try {
42
+ const report = await consolidateMemories();
43
+ console.error(`[persyst] Consolidation sweep completed: consolidated ${report.consolidated_groups} duplicate groups.`);
44
+ } catch (err) {
45
+ console.error('[persyst] Daily consolidation sweep failed:', err.message);
46
+ }
47
+ }, 86400000);
48
+
49
+ // --- Graceful shutdown (Bug 3 fix: also cleans up git watchers) ---
36
50
  const shutdown = () => {
37
51
  console.error('[persyst] Shutting down...');
38
52
  clearInterval(decayTimer);
53
+ clearInterval(consolidationTimer);
54
+ cleanupWatchers(); // Bug 3 fix: stop all git repo watchers
39
55
  closeDatabase();
40
56
  process.exit(0);
41
57
  };
@@ -43,7 +59,6 @@ export async function startServer() {
43
59
  process.on('SIGTERM', shutdown);
44
60
 
45
61
  // --- Connect via stdio ---
46
- // This is how Claude Code, Cursor, and Aider communicate with us
47
62
  const transport = new StdioServerTransport();
48
63
  await server.connect(transport);
49
64