persyst-mcp 2.1.0 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,387 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * extract-worker.js — PAMP Background Queue Worker
5
+ *
6
+ * Processes extraction jobs from the disk-based queue at ~/.persyst/queue/.
7
+ * Spawned as a detached child process by the hook — runs independently.
8
+ *
9
+ * Lifecycle:
10
+ * 1. Reads .json job files from ~/.persyst/queue/
11
+ * 2. For each job: runs Tier 2 heuristic extraction
12
+ * 3. Deduplicates facts against existing memories (semantic check)
13
+ * 4. Checks for recent agent-written memories to avoid race conditions
14
+ * 5. Writes validated facts to the database
15
+ * 6. Cleans up job file on success, increments retry on failure
16
+ * 7. Exits when queue is empty
17
+ *
18
+ * Safety bounds:
19
+ * - Max 3 retries per job before archiving to failed/
20
+ * - Queue trimming: deletes jobs older than 7 days
21
+ * - Max 50 jobs per worker run to prevent CPU starvation
22
+ * - Process lock file to prevent multiple concurrent workers
23
+ */
24
+
25
+ import { homedir } from 'os';
26
+ import { join } from 'path';
27
+ import {
28
+ readdirSync, readFileSync, writeFileSync, unlinkSync,
29
+ mkdirSync, existsSync, statSync, renameSync
30
+ } from 'fs';
31
+ import { fileURLToPath } from 'url';
32
+ import { dirname } from 'path';
33
+
34
+ const __filename = fileURLToPath(import.meta.url);
35
+ const __dirname = dirname(__filename);
36
+
37
+ // ============================================================
38
+ // PATHS
39
+ // ============================================================
40
+
41
// Everything the worker reads or writes lives under ~/.persyst.
const PERSYST_DIR = join(homedir(), '.persyst');
// Pending job files.
const QUEUE_DIR = join(PERSYST_DIR, 'queue');
// Jobs that were corrupted or ran out of retries.
const FAILED_DIR = join(QUEUE_DIR, 'failed');
// Holds the PID of the worker that currently owns the queue.
const LOCK_FILE = join(QUEUE_DIR, '.worker.lock');
// Append-only worker log.
const LOG_FILE = join(PERSYST_DIR, 'worker.log');

// Make sure both queue directories exist before any job is read or archived.
for (const dir of [QUEUE_DIR, FAILED_DIR]) {
  mkdirSync(dir, { recursive: true });
}
49
+
50
+ // ============================================================
51
+ // CONSTANTS
52
+ // ============================================================
53
+
54
// Retry / throughput bounds.
const MAX_RETRIES = 3;            // attempts per job before it is archived to failed/
const MAX_JOBS_PER_RUN = 50;      // upper bound on jobs handled by one worker run

// Queue files older than this are deleted (7 days, in milliseconds).
const MAX_QUEUE_AGE_MS = 7 * 24 * 3600 * 1000;

// Deduplication and quality thresholds.
const DEDUP_SIMILARITY_THRESHOLD = 0.8; // similarity at or above this counts as a duplicate
const RECENT_MEMORY_WINDOW_S = 60;      // look-back window (seconds) for the agent race guard
const MIN_CONFIDENCE = 0.65;            // facts below this confidence are skipped
60
+
61
+
62
+
63
+ // ============================================================
64
+ // LOGGING
65
+ // ============================================================
66
+
67
/**
 * Write one timestamped log line to stderr and append the same line to the
 * worker log file.
 *
 * @param {string} level - Severity tag placed in brackets (e.g. 'INFO', 'WARN').
 * @param {string} msg - Message text.
 */
function log(level, msg) {
  const entry = `[${new Date().toISOString()}] [${level}] ${msg}\n`;
  process.stderr.write(entry);
  try {
    writeFileSync(LOG_FILE, entry, { flag: 'a' });
  } catch (_) {
    // File logging is non-critical: stderr already received the line.
  }
}
75
+
76
+ // ============================================================
77
+ // PROCESS LOCK (prevent concurrent workers)
78
+ // ============================================================
79
+
80
/**
 * Try to become the only worker operating on the queue.
 *
 * The lock is a file containing the owner's PID. An existing lock is honoured
 * when its PID belongs to another live process; otherwise it is considered
 * stale and replaced.
 *
 * @returns {boolean} true when this process now holds the lock, false when
 *   another worker holds it or the lock could not be taken.
 */
function acquireLock() {
  try {
    if (existsSync(LOCK_FILE)) {
      const lockContent = readFileSync(LOCK_FILE, 'utf8').trim();
      const lockPid = Number.parseInt(lockContent, 10);

      if (lockPid && lockPid !== process.pid) {
        let ownerAlive = true;
        try {
          process.kill(lockPid, 0); // Signal 0 = existence check, nothing is delivered
        } catch (killErr) {
          // EPERM means the process exists but we may not signal it, so it is
          // still alive. Any other failure means there is no such process.
          ownerAlive = killErr.code === 'EPERM';
        }

        if (ownerAlive) {
          log('WARN', `Another worker is running (PID: ${lockPid}), exiting.`);
          return false;
        }
        log('INFO', `Stale lock from PID ${lockPid}, claiming.`);
      }

      // Stale, unreadable or self-owned lock: remove it so the exclusive
      // create below can succeed.
      try {
        unlinkSync(LOCK_FILE);
      } catch (unlinkErr) {
        if (unlinkErr.code !== 'ENOENT') throw unlinkErr;
      }
    }

    try {
      // 'wx' fails if the file already exists, so two workers starting at the
      // same moment cannot both believe they own the lock.
      writeFileSync(LOCK_FILE, String(process.pid), { flag: 'wx' });
    } catch (writeErr) {
      if (writeErr.code === 'EEXIST') {
        log('WARN', 'Another worker claimed the lock first, exiting.');
        return false;
      }
      throw writeErr;
    }
    return true;
  } catch (err) {
    log('ERROR', `Lock acquisition failed: ${err.message}`);
    return false;
  }
}
106
+
107
/**
 * Remove the lock file, but only when it still records this process's PID.
 * Any failure is ignored: releasing the lock is best-effort.
 */
function releaseLock() {
  try {
    if (!existsSync(LOCK_FILE)) return;

    const owner = readFileSync(LOCK_FILE, 'utf8').trim();
    if (owner !== String(process.pid)) return; // someone else's lock — leave it alone

    unlinkSync(LOCK_FILE);
  } catch (_) {
    // best-effort
  }
}
117
+
118
+ // ============================================================
119
+ // QUEUE MANAGEMENT
120
+ // ============================================================
121
+
122
/**
 * Delete queue files whose modification time is more than MAX_QUEUE_AGE_MS
 * in the past, and log how many were removed. Errors are logged as warnings
 * and stop the cleanup; they are never thrown to the caller.
 */
function cleanOldJobs() {
  const startedAt = Date.now();
  let removed = 0;

  try {
    readdirSync(QUEUE_DIR)
      .filter((name) => name.endsWith('.json'))
      .forEach((name) => {
        const target = join(QUEUE_DIR, name);
        const ageMs = startedAt - statSync(target).mtimeMs;
        if (ageMs > MAX_QUEUE_AGE_MS) {
          unlinkSync(target);
          removed += 1;
        }
      });

    if (removed > 0) {
      log('INFO', `Cleaned ${removed} expired queue files.`);
    }
  } catch (err) {
    log('WARN', `Queue cleanup error: ${err.message}`);
  }
}
146
+
147
/**
 * Read pending job files from the queue, sorted by filename (filenames
 * include timestamps, so this is oldest-first), capped at MAX_JOBS_PER_RUN.
 *
 * A file that cannot be parsed as JSON, or whose JSON is not a plain object,
 * is moved to the failed directory and left out of the result.
 *
 * @returns {Array<{path: string, filename: string, data: Object}>}
 *   Empty when the queue directory cannot be read.
 */
function readJobQueue() {
  try {
    const files = readdirSync(QUEUE_DIR)
      .filter(f => f.endsWith('.json') && !f.startsWith('.'))
      .sort();

    return files.slice(0, MAX_JOBS_PER_RUN).map(file => {
      const filePath = join(QUEUE_DIR, file);
      try {
        const data = JSON.parse(readFileSync(filePath, 'utf8'));
        // Valid JSON is not necessarily a job: `null`, numbers, strings and
        // arrays would break callers that read properties off `data`.
        if (data === null || typeof data !== 'object' || Array.isArray(data)) {
          throw new Error('job payload is not a JSON object');
        }
        return { path: filePath, filename: file, data };
      } catch (parseErr) {
        // Corrupted file — move to failed
        log('WARN', `Corrupted job ${file} (${parseErr.message}), moving to failed/`);
        try { renameSync(filePath, join(FAILED_DIR, file)); } catch (__) { /* best-effort */ }
        return null;
      }
    }).filter(Boolean);
  } catch (err) {
    log('ERROR', `Failed to read queue: ${err.message}`);
    return [];
  }
}
173
+
174
+ // ============================================================
175
+ // DEDUPLICATION
176
+ // ============================================================
177
+
178
/**
 * Decide whether a fact is already stored.
 *
 * First asks the database for an exact content match; if there is none, runs
 * the supplied search function for the top 3 results and treats any result
 * whose similarity reaches DEDUP_SIMILARITY_THRESHOLD as a duplicate.
 * A failing search is logged and the fact is treated as new (fail open).
 *
 * @param {string} factContent - The fact to check
 * @param {Object} db - Database module (must provide memoryExists)
 * @param {Function} searchFn - Hybrid search function, called as (text, limit)
 * @returns {Promise<boolean>} true if duplicate
 */
async function isDuplicate(factContent, db, searchFn) {
  // Fast path: identical content already stored.
  const exactHit = db.memoryExists(factContent);
  if (exactHit) {
    return true;
  }

  // Slow path: look for a near-identical memory.
  try {
    const candidates = await searchFn(factContent, 3);
    for (const candidate of candidates) {
      const score = parseFloat(candidate.similarity || 0);
      if (!(score >= DEDUP_SIMILARITY_THRESHOLD)) {
        continue;
      }
      log('INFO', `Dedup: "${factContent.slice(0, 60)}..." similar to memory #${candidate.id} (sim=${score})`);
      return true;
    }
  } catch (err) {
    // Fail open — a broken search must not block new facts.
    log('WARN', `Dedup search failed: ${err.message}`);
  }

  return false;
}
210
+
211
/**
 * Check if an agent recently wrote a similar memory (race condition guard).
 *
 * Fetches the 20 most recent memories, keeps those created within the last
 * RECENT_MEMORY_WINDOW_S seconds, and reports a match when more than half of
 * the fact's distinct words also appear in one of them. Any error is logged
 * and treated as "no recent memory".
 *
 * @param {string} factContent
 * @param {Object} db - Database module (must provide getRecentMemories)
 * @returns {boolean}
 */
function hasRecentAgentMemory(factContent, db) {
  // Lower-cased distinct words; empty tokens produced by leading or trailing
  // whitespace are dropped so they cannot count as an overlap.
  const toWordSet = (text) =>
    new Set(String(text).toLowerCase().split(/\s+/).filter(Boolean));

  try {
    const recentMemories = db.getRecentMemories(20);
    // NOTE(review): the comparison below assumes created_at is a Unix
    // timestamp in seconds — confirm against the database module.
    const now = Math.floor(Date.now() / 1000);
    // The fact does not change between iterations, so tokenize it once.
    const factWords = toWordSet(factContent);

    for (const mem of recentMemories) {
      if (now - mem.created_at > RECENT_MEMORY_WINDOW_S) continue;

      // Simple word-overlap check for race condition detection
      const memWords = toWordSet(mem.content == null ? '' : mem.content);
      let overlap = 0;
      for (const w of factWords) {
        if (memWords.has(w)) overlap++;
      }
      // Math.max guards the division when the fact has no words at all.
      const overlapRatio = overlap / Math.max(factWords.size, 1);
      if (overlapRatio > 0.5) {
        log('INFO', `Race guard: "${factContent.slice(0, 50)}..." overlaps with recent memory #${mem.id}`);
        return true;
      }
    }
  } catch (err) {
    log('WARN', `Recent memory check failed: ${err.message}`);
  }
  return false;
}
245
+
246
+ // ============================================================
247
+ // MAIN WORKER
248
+ // ============================================================
249
+
250
/**
 * Worker entry point: take the process lock, expire old queue files, then
 * process up to MAX_JOBS_PER_RUN pending jobs.
 *
 * For each job the heuristic extractor is run on `data.text`; the resulting
 * facts are de-duplicated within the job, filtered by MIN_CONFIDENCE, checked
 * against recent and existing memories, and stored together with an
 * embedding. A job file is deleted once its facts have been processed; a job
 * that throws has its retry counter bumped, or is moved to failed/ once the
 * retry limit is reached. The lock is always released on the way out.
 *
 * @returns {Promise<void>}
 */
async function main() {
  log('INFO', '=== PAMP Worker started ===');

  // Acquire process lock
  if (!acquireLock()) {
    process.exit(0);
  }

  try {
    // Clean expired jobs
    cleanOldJobs();

    // Read pending jobs
    const jobs = readJobQueue();
    if (jobs.length === 0) {
      log('INFO', 'No pending jobs. Exiting.');
      return;
    }

    log('INFO', `Processing ${jobs.length} job(s)...`);

    // Lazy-load heavy dependencies only if we have work to do
    const dbModule = await import('../src/database.js');
    const { searchHybrid } = await import('../src/search.js');
    const { generateEmbedding } = await import('../src/embeddings.js');

    let totalExtracted = 0;
    let totalStored = 0;
    let totalDuplicates = 0;
    let totalFailed = 0;

    for (const job of jobs) {
      const { path: jobPath, filename, data } = job;

      // A payload that is not a plain object can never be processed or have
      // its retry counter persisted. Archive it right away so it cannot crash
      // this run or block the head of the queue on every later run.
      if (data === null || typeof data !== 'object' || Array.isArray(data)) {
        totalFailed++;
        log('ERROR', `Job ${filename} has a malformed payload, moving to failed/`);
        try { renameSync(jobPath, join(FAILED_DIR, filename)); } catch (_) { /* best-effort */ }
        continue;
      }

      // Only trust a positive integer counter; anything else would never
      // reach the retry limit.
      const retryCount = Number.isInteger(data._retries) && data._retries > 0
        ? data._retries
        : 0;

      try {
        log('INFO', `Processing: ${filename} (retry: ${retryCount})`);

        const facts = [];

        // 1. Run Tier 2 Heuristic Extraction (always safe, zero cost)
        try {
          const { extractHeuristic } = await import('../src/extractor-heuristic.js');
          const heuristicFacts = extractHeuristic(data.text);
          for (const f of heuristicFacts) {
            facts.push({ ...f, tier: 'heuristic' });
          }
        } catch (heurErr) {
          log('ERROR', `Heuristic extraction failed: ${heurErr.message}`);
        }

        log('INFO', `Extracted ${facts.length} heuristic fact(s)`);

        // Deduplicate facts within this run (case- and whitespace-insensitive)
        const uniqueFacts = [];
        const seenFacts = new Set();
        for (const fact of facts) {
          const key = fact.content.toLowerCase().replace(/\s+/g, ' ').trim();
          if (!seenFacts.has(key)) {
            seenFacts.add(key);
            uniqueFacts.push(fact);
          }
        }

        totalExtracted += uniqueFacts.length;

        // Process each fact
        for (const fact of uniqueFacts) {
          if (fact.confidence < MIN_CONFIDENCE) {
            log('INFO', `Skipping low-confidence fact (${fact.confidence}): "${fact.content.slice(0, 50)}..."`);
            continue;
          }

          // Dedup check 1: recent agent memory race
          if (hasRecentAgentMemory(fact.content, dbModule)) {
            totalDuplicates++;
            continue;
          }

          // Dedup check 2: existing memory search
          if (await isDuplicate(fact.content, dbModule, searchHybrid)) {
            totalDuplicates++;
            continue;
          }

          // Store the new memory; one failing fact does not fail the job
          try {
            const memoryId = dbModule.insertMemory(fact.content, fact.confidence, {
              source_type: 'agent',
              source_id: data.agent_id || 'pamp-worker',
              confidence: fact.confidence
            }, data.namespace || 'shared');

            // Generate and store embedding
            const embedding = await generateEmbedding(fact.content);
            dbModule.insertVector(memoryId, embedding);

            totalStored++;
            log('INFO', `Stored memory #${memoryId}: "${fact.content.slice(0, 60)}..." (${fact.category}, conf=${fact.confidence})`);
          } catch (storeErr) {
            log('ERROR', `Failed to store fact: ${storeErr.message}`);
          }
        }

        // Success — remove job file
        try { unlinkSync(jobPath); } catch (_) { /* best-effort */ }

      } catch (jobErr) {
        totalFailed++;
        log('ERROR', `Job ${filename} failed: ${jobErr.message}`);

        // Retry or move to failed
        if (retryCount >= MAX_RETRIES - 1) {
          log('WARN', `Job ${filename} exceeded max retries, moving to failed/`);
          try { renameSync(jobPath, join(FAILED_DIR, filename)); } catch (_) { /* best-effort */ }
        } else {
          // Persist the incremented retry count for the next run
          try {
            data._retries = retryCount + 1;
            writeFileSync(jobPath, JSON.stringify(data, null, 2));
          } catch (_) { /* best-effort */ }
        }
      }
    }

    log('INFO', `=== Worker complete: extracted=${totalExtracted} stored=${totalStored} dupes=${totalDuplicates} failed=${totalFailed} ===`);

  } finally {
    releaseLock();
  }
}

// Last-resort handler: log the crash, drop the lock and exit non-zero.
main().catch(err => {
  log('ERROR', `Worker crashed: ${err.message}`);
  releaseLock();
  process.exit(1);
});
package/bin/extract.js ADDED
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * extract.js — Manual Extraction CLI
5
+ *
6
+ * Allows developers to test and run extraction on demand.
7
+ *
8
+ * Usage:
9
+ * npx persyst-mcp extract "I prefer TypeScript over JavaScript"
10
+ * npx persyst-mcp extract --file conversation.txt
11
+ * npx persyst-mcp extract --tier heuristic "we decided to use PostgreSQL"
12
+ * npx persyst-mcp extract --provider gemini "our stack uses Next.js"
13
+ * npx persyst-mcp extract --dry-run "always use camelCase"
14
+ */
15
+
16
+ import { argv, stdin, stdout } from 'process';
17
+ import { readFileSync, existsSync } from 'fs';
18
+
19
+ // ============================================================
20
+ // ARGUMENT PARSING
21
+ // ============================================================
22
+
23
// Flags that never take a value. Without this list a boolean flag would
// swallow the text that follows it, so `extract --dry-run "some text"` would
// end up with no input text at all.
const BOOLEAN_FLAGS = new Set(['dry-run', 'json', 'help']);

/**
 * Split raw CLI arguments into `--flag` options and positional arguments.
 *
 * A non-boolean flag takes the next argument as its value unless that
 * argument is itself a `--flag` or is missing, in which case the flag is set
 * to `true`. Boolean flags are always `true`.
 *
 * @param {string[]} rawArgs - Arguments after the script name.
 * @returns {{flags: Object<string, string|boolean>, positional: string[]}}
 */
function parseCliArgs(rawArgs) {
  const parsedFlags = {};
  const parsedPositional = [];

  for (let i = 0; i < rawArgs.length; i++) {
    const arg = rawArgs[i];
    if (!arg.startsWith('--')) {
      parsedPositional.push(arg);
      continue;
    }

    const name = arg.slice(2);
    const next = rawArgs[i + 1];
    const takesValue = !BOOLEAN_FLAGS.has(name)
      && next !== undefined
      && !next.startsWith('--');

    if (takesValue) {
      parsedFlags[name] = next;
      i++; // the value has been consumed
    } else {
      parsedFlags[name] = true;
    }
  }

  return { flags: parsedFlags, positional: parsedPositional };
}

const args = argv.slice(2);
const { flags, positional } = parseCliArgs(args);
41
+
42
+ // ============================================================
43
+ // HELP
44
+ // ============================================================
45
+
46
// Print the usage text and exit with status 0 when --help was given or when
// the CLI was invoked without any arguments at all.
if (flags.help || args.length === 0) {
  console.log(`
Persyst Extract — Manual Fact Extraction CLI

USAGE:
npx persyst-mcp extract <text> Extract from text
npx persyst-mcp extract --file <path> Extract from file
echo "text" | npx persyst-mcp extract - Extract from stdin

OPTIONS:
--dry-run Show extracted facts without storing to database
--json Output results as JSON
--file <path> Read text from a file
--help Show this help message

EXAMPLES:
npx persyst-mcp extract "I prefer Postgres over SQLite"
npx persyst-mcp extract --dry-run --file ./conversation.log
`);
  process.exit(0);
}
67
+
68
+ // ============================================================
69
+ // INPUT RESOLUTION
70
+ // ============================================================
71
+
72
// Resolve the text to extract from, in priority order:
//   1. --file <path>   2. "-" (stdin)   3. positional arguments joined by spaces.
// Exits with status 1 when no usable text is found.
let inputText = '';

if (flags.file) {
  if (!existsSync(flags.file)) {
    console.error(`Error: File not found: ${flags.file}`);
    process.exit(1);
  }
  inputText = readFileSync(flags.file, 'utf8');
} else if (positional.length === 0) {
  console.error('Error: No text provided. Use --help for usage.');
  process.exit(1);
} else if (positional[0] === '-') {
  // File descriptor 0 is stdin.
  inputText = readFileSync(0, 'utf8');
} else {
  inputText = positional.join(' ');
}

// Whitespace-only input is as useless as no input.
if (inputText.trim() === '') {
  console.error('Error: Empty input text.');
  process.exit(1);
}
97
+
98
+ // ============================================================
99
+ // EXTRACTION
100
+ // ============================================================
101
+
102
/**
 * Run heuristic extraction on `inputText`, print the result, and store the
 * extracted facts unless this is a dry run.
 *
 * Human-readable progress is printed unless --json is set, in which case a
 * single JSON document (input length, facts, dry-run flag) is printed at the
 * end. Facts whose exact content already exists in the database are skipped.
 *
 * @returns {Promise<void>}
 */
async function run() {
  // Presence of the flag is what matters. A strict `=== true` check would
  // treat `--dry-run <stray value>` as "not a dry run" and write to the
  // database even though the user asked for a dry run.
  const dryRun = Boolean(flags['dry-run']);
  const jsonOutput = Boolean(flags.json);

  const allFacts = [];

  // --- Tier 2: Heuristic ---
  const { extractHeuristic } = await import('../src/extractor-heuristic.js');
  const heuristicFacts = extractHeuristic(inputText);

  for (const f of heuristicFacts) {
    allFacts.push({ ...f, tier: 'heuristic' });
  }

  if (!jsonOutput) {
    console.log(`\n📋 Heuristic fact(s) extracted: ${heuristicFacts.length}`);
    for (const f of heuristicFacts) {
      console.log(` ✓ [${f.category}] (conf: ${f.confidence}) ${f.content}`);
    }
  }

  // --- Summary ---
  if (!jsonOutput) {
    console.log(`\n━━━ Total: ${allFacts.length} fact(s) ━━━`);
  }

  // --- Store to database (unless dry-run) ---
  if (!dryRun && allFacts.length > 0) {
    if (!jsonOutput) {
      console.log(`\n💾 Storing to database...`);
    }

    // Loaded lazily so a dry run never touches the database or embedding modules.
    const { insertMemory, insertVector, memoryExists } = await import('../src/database.js');
    const { generateEmbedding } = await import('../src/embeddings.js');

    let stored = 0;
    let dupes = 0;

    for (const fact of allFacts) {
      // Exact dedup
      if (memoryExists(fact.content)) {
        dupes++;
        if (!jsonOutput) {
          console.log(` ⏭ Duplicate: "${fact.content.slice(0, 50)}..."`);
        }
        continue;
      }

      const id = insertMemory(fact.content, fact.confidence, {
        source_type: 'agent',
        source_id: `pamp-${fact.tier}`,
        confidence: fact.confidence
      });

      const embedding = await generateEmbedding(fact.content);
      insertVector(id, embedding);

      stored++;
      if (!jsonOutput) {
        console.log(` ✅ Stored memory #${id}: "${fact.content.slice(0, 60)}..."`);
      }
    }

    if (!jsonOutput) {
      console.log(`\n📊 Result: ${stored} stored, ${dupes} duplicates skipped`);
    }
  } else if (dryRun && !jsonOutput) {
    console.log(`\n🔍 Dry run — no facts stored.`);
  }

  // --- JSON output ---
  if (jsonOutput) {
    console.log(JSON.stringify({
      input_length: inputText.length,
      facts: allFacts,
      dry_run: dryRun
    }, null, 2));
  }
}

// Report any failure and exit non-zero.
run().catch(err => {
  console.error(`\n❌ Extraction failed: ${err.message}`);
  process.exit(1);
});
package/bin/ingest.js ADDED
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * persyst-ingest — Direct Git Commit Ingester
5
+ *
6
+ * Usage:
7
+ * npx persyst-mcp ingest [repo_path] [count]
8
+ *
9
+ * This script runs directly without starting the MCP server, allowing
10
+ * git hooks or direct CLI commands to populate the memory database.
11
+ */
12
+
13
+ import { getRecentCommits } from '../src/git.js';
14
+ import {
15
+ insertMemory,
16
+ insertVector,
17
+ insertEntity,
18
+ insertEdge,
19
+ memoryExistsByHashPrefix
20
+ } from '../src/database.js';
21
+ import { generateEmbedding } from '../src/embeddings.js';
22
+ import { searchCache } from '../src/cache.js';
23
+
24
// CLI arguments: [repo_path] [count].
// The repository defaults to the current working directory.
const repoPath = process.argv[2] ? process.argv[2] : process.cwd();
// A missing, non-numeric or zero count falls back to 10.
const count = Number.parseInt(process.argv[3], 10) || 10;
26
+
27
/**
 * Ingest the most recent `count` commits of `repoPath` as memories.
 *
 * Commits whose short hash is already present are skipped. Each new commit is
 * stored with git provenance and an embedding, and linked to its author and
 * to every file it touched. The search cache is invalidated whenever at least
 * one commit was stored — including when a later commit fails. The process
 * exits with status 0 on success and 1 on failure.
 *
 * @returns {Promise<void>}
 */
async function run() {
  console.log(`[persyst] Ingesting git commits for: ${repoPath}`);

  // Declared outside the try block so the failure path can see how much was
  // already written.
  let added = 0;
  let skipped = 0;

  try {
    const commits = await getRecentCommits(repoPath, count);

    for (const commit of commits) {
      const hashPrefix = commit.hash.slice(0, 7);
      // Check if commit already exists in memories
      if (memoryExistsByHashPrefix(`[${hashPrefix}]%`)) {
        skipped++;
        continue;
      }

      // Insert memory with git provenance
      const id = insertMemory(commit.fullText, commit.importance, {
        source_type: 'git',
        source_id: commit.hash,
        confidence: 0.8
      });

      // Generate embedding vector and store
      const embedding = await generateEmbedding(commit.fullText);
      insertVector(id, embedding);

      // Link Author entity
      const authorId = insertEntity(commit.author, 'person');
      if (authorId) {
        insertEdge(authorId, id, 'authored', 'entity', 'memory');
      }

      // Link Files Touched (a commit without a file list simply links nothing)
      const touchedFiles = Array.isArray(commit.files) ? commit.files : [];
      for (const file of touchedFiles) {
        const fileId = insertEntity(file, 'file');
        if (fileId) {
          insertEdge(fileId, id, 'touches', 'entity', 'memory');
        }
      }

      added++;
    }

    if (added > 0) {
      searchCache.invalidate();
    }

    console.log(`[persyst] Success: Ingested ${added} commits (${skipped} already existed)`);
    process.exit(0);
  } catch (err) {
    // Commits stored before the failure are already in the database, so the
    // cache must still be invalidated.
    if (added > 0) {
      try {
        searchCache.invalidate();
      } catch (_) {
        // Already on the failure path; report the original error below.
      }
    }
    console.error(`[persyst] Ingestion failed: ${err.message}`);
    process.exit(1);
  }
}

run();