npm - persyst-mcp - Versions diffs - 2.1.0 → 2.1.2 - Mend

persyst-mcp 2.1.0 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/bin/extract-worker.js +387 -0
package/bin/extract.js +185 -0
package/bin/ingest.js +82 -0
package/bin/init.js +174 -0
package/bin/setup.js +9 -4
package/hooks/persyst-hook.js +195 -10
package/index.js +20 -0
package/package.json +9 -3
package/src/database.js +84 -16
package/src/extractor-heuristic.js +250 -0
package/src/search.js +31 -10
package/src/server.js +1 -1
package/src/tools.js +40 -26

package/bin/extract-worker.js ADDED Viewed

@@ -0,0 +1,387 @@
+#!/usr/bin/env node
+/**
+ * extract-worker.js — PAMP Background Queue Worker
+ *
+ * Processes extraction jobs from the disk-based queue at ~/.persyst/queue/.
+ * Spawned as a detached child process by the hook — runs independently.
+ *
+ * Lifecycle:
+ *   1. Reads .json job files from ~/.persyst/queue/
+ *   2. For each job: runs Tier 2 heuristic extraction (the only extractor this worker invokes)
+ *   3. Deduplicates facts against existing memories (semantic check)
+ *   4. Checks for recent agent-written memories to avoid race conditions
+ *   5. Writes validated facts to the database
+ *   6. Cleans up job file on success, increments retry on failure
+ *   7. Exits when queue is empty
+ *
+ * Safety bounds:
+ *   - Max 3 retries per job before archiving to failed/
+ *   - Queue trimming: deletes jobs older than 7 days
+ *   - Max 50 jobs per worker run to prevent CPU starvation
+ *   - Process lock file to prevent multiple concurrent workers
+ */
+import { homedir } from 'os';
+import { join } from 'path';
+import {
+  readdirSync, readFileSync, writeFileSync, unlinkSync,
+  mkdirSync, existsSync, statSync, renameSync
+} from 'fs';
+import { fileURLToPath } from 'url';
+import { dirname } from 'path';
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+// ============================================================
+// PATHS
+// ============================================================
+const PERSYST_DIR = join(homedir(), '.persyst'); // per-user state directory under the home directory
+const QUEUE_DIR = join(PERSYST_DIR, 'queue'); // pending job files (*.json) are read from here
+const FAILED_DIR = join(PERSYST_DIR, 'queue', 'failed'); // unparseable jobs and jobs out of retries are moved here
+const LOCK_FILE = join(QUEUE_DIR, '.worker.lock'); // holds the PID of the worker that owns the queue
+const LOG_FILE = join(PERSYST_DIR, 'worker.log'); // log() appends every entry to this file
+mkdirSync(QUEUE_DIR, { recursive: true }); // runs at module load; recursive also creates PERSYST_DIR
+mkdirSync(FAILED_DIR, { recursive: true });
+// ============================================================
+// CONSTANTS
+// ============================================================
+const MAX_RETRIES = 3; // a job that fails with _retries already at MAX_RETRIES - 1 is moved to failed/
+const MAX_JOBS_PER_RUN = 50; // upper bound on job files readJobQueue() loads per run
+const MAX_QUEUE_AGE_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
+const DEDUP_SIMILARITY_THRESHOLD = 0.80; // search hits at or above this similarity count as duplicates
+const RECENT_MEMORY_WINDOW_S = 60; // Check last 60 seconds for agent race
+const MIN_CONFIDENCE = 0.65; // facts with lower confidence are skipped, not stored
+// ============================================================
+// LOGGING
+// ============================================================
+function log(level, msg) {
+  const ts = new Date().toISOString();
+  const line = `[${ts}] [${level}] ${msg}\n`;
+  process.stderr.write(line);
+  try {
+    writeFileSync(LOG_FILE, line, { flag: 'a' });
+  } catch (_) { /* non-critical */ }
+}
+// ============================================================
+// PROCESS LOCK (prevent concurrent workers)
+// ============================================================
+function acquireLock() {
+  try {
+    if (existsSync(LOCK_FILE)) {
+      const lockContent = readFileSync(LOCK_FILE, 'utf8').trim();
+      const lockPid = parseInt(lockContent, 10);
+      // Check if the locking process is still alive
+      if (lockPid && lockPid !== process.pid) {
+        try {
+          process.kill(lockPid, 0); // Signal 0 = check existence
+          log('WARN', `Another worker is running (PID: ${lockPid}), exiting.`);
+          return false;
+        } catch (_) {
+          // Process is dead — stale lock, claim it
+          log('INFO', `Stale lock from PID ${lockPid}, claiming.`);
+        }
+      }
+    }
+    writeFileSync(LOCK_FILE, String(process.pid));
+    return true;
+  } catch (err) {
+    log('ERROR', `Lock acquisition failed: ${err.message}`);
+    return false;
+  }
+}
+function releaseLock() {
+  try {
+    if (existsSync(LOCK_FILE)) {
+      const content = readFileSync(LOCK_FILE, 'utf8').trim();
+      if (content === String(process.pid)) {
+        unlinkSync(LOCK_FILE);
+      }
+    }
+  } catch (_) { /* best-effort */ }
+}
+// ============================================================
+// QUEUE MANAGEMENT
+// ============================================================
+/**
+ * Clean old queue files (older than 7 days).
+ */
+function cleanOldJobs() {
+  const now = Date.now();
+  let cleaned = 0;
+  try {
+    const files = readdirSync(QUEUE_DIR).filter(f => f.endsWith('.json'));
+    for (const file of files) {
+      const filePath = join(QUEUE_DIR, file);
+      const stat = statSync(filePath);
+      if (now - stat.mtimeMs > MAX_QUEUE_AGE_MS) {
+        unlinkSync(filePath);
+        cleaned++;
+      }
+    }
+    if (cleaned > 0) {
+      log('INFO', `Cleaned ${cleaned} expired queue files.`);
+    }
+  } catch (err) {
+    log('WARN', `Queue cleanup error: ${err.message}`);
+  }
+}
+/**
+ * Read all pending job files from the queue, sorted oldest-first.
+ * @returns {Array<{path: string, data: Object}>}
+ */
+function readJobQueue() {
+  try {
+    const files = readdirSync(QUEUE_DIR)
+      .filter(f => f.endsWith('.json') && !f.startsWith('.'))
+      .sort(); // Filenames include timestamps, so sort = oldest first
+    return files.slice(0, MAX_JOBS_PER_RUN).map(file => {
+      const filePath = join(QUEUE_DIR, file);
+      try {
+        const data = JSON.parse(readFileSync(filePath, 'utf8'));
+        return { path: filePath, filename: file, data };
+      } catch (_) {
+        // Corrupted file — move to failed
+        try { renameSync(filePath, join(FAILED_DIR, file)); } catch (__) {}
+        return null;
+      }
+    }).filter(Boolean);
+  } catch (err) {
+    log('ERROR', `Failed to read queue: ${err.message}`);
+    return [];
+  }
+}
+// ============================================================
+// DEDUPLICATION
+// ============================================================
+/**
+ * Check if a fact already exists in the database.
+ * Uses exact match first (fast), then semantic similarity (slower).
+ *
+ * @param {string} factContent - The fact to check
+ * @param {Object} db - Database module
+ * @param {Function} searchFn - Hybrid search function
+ * @returns {Promise<boolean>} true if duplicate
+ */
+async function isDuplicate(factContent, db, searchFn) {
+  // 1. Exact content match (instant)
+  if (db.memoryExists(factContent)) {
+    return true;
+  }
+  // 2. Semantic similarity check (needs embedding)
+  try {
+    const results = await searchFn(factContent, 3);
+    for (const result of results) {
+      const similarity = parseFloat(result.similarity || 0);
+      if (similarity >= DEDUP_SIMILARITY_THRESHOLD) {
+        log('INFO', `Dedup: "${factContent.slice(0, 60)}..." similar to memory #${result.id} (sim=${similarity})`);
+        return true;
+      }
+    }
+  } catch (err) {
+    log('WARN', `Dedup search failed: ${err.message}`);
+    // Fail open — allow the fact through if search fails
+  }
+  return false;
+}
+/**
+ * Check if an agent recently wrote a similar memory (race condition guard).
+ * Looks at memories created in the last RECENT_MEMORY_WINDOW_S seconds.
+ *
+ * @param {string} factContent
+ * @param {Object} db
+ * @returns {boolean}
+ */
+function hasRecentAgentMemory(factContent, db) {
+  try {
+    const recentMemories = db.getRecentMemories(20);
+    const now = Math.floor(Date.now() / 1000);
+    for (const mem of recentMemories) {
+      if (now - mem.created_at > RECENT_MEMORY_WINDOW_S) continue;
+      // Simple word-overlap check for race condition detection
+      const factWords = new Set(factContent.toLowerCase().split(/\s+/));
+      const memWords = new Set(mem.content.toLowerCase().split(/\s+/));
+      let overlap = 0;
+      for (const w of factWords) {
+        if (memWords.has(w)) overlap++;
+      }
+      const overlapRatio = overlap / Math.max(factWords.size, 1);
+      if (overlapRatio > 0.5) {
+        log('INFO', `Race guard: "${factContent.slice(0, 50)}..." overlaps with recent memory #${mem.id}`);
+        return true;
+      }
+    }
+  } catch (err) {
+    log('WARN', `Recent memory check failed: ${err.message}`);
+  }
+  return false;
+}
+// ============================================================
+// MAIN WORKER
+// ============================================================
+async function main() { // Worker entry point: take the lock, process one batch of queued jobs, release the lock.
+  log('INFO', '=== PAMP Worker started ===');
+  // Acquire process lock; when it cannot be obtained the worker exits with status 0
+  if (!acquireLock()) {
+    process.exit(0);
+  }
+  try {
+    // Clean expired jobs
+    cleanOldJobs();
+    // Read pending jobs (at most MAX_JOBS_PER_RUN)
+    const jobs = readJobQueue();
+    if (jobs.length === 0) {
+      log('INFO', 'No pending jobs. Exiting.');
+      return; // the finally block below still releases the lock
+    }
+    log('INFO', `Processing ${jobs.length} job(s)...`);
+    // Lazy-load heavy dependencies only if we have work to do
+    const dbModule = await import('../src/database.js');
+    const { searchHybrid } = await import('../src/search.js');
+    const { generateEmbedding } = await import('../src/embeddings.js');
+    let totalExtracted = 0; // unique facts per job, counted before the confidence filter
+    let totalStored = 0;
+    let totalDuplicates = 0;
+    let totalFailed = 0; // jobs whose processing threw, not individual facts
+    for (const job of jobs) {
+      const { path: jobPath, filename, data } = job;
+      const retryCount = data._retries || 0; // NOTE(review): sits outside the per-job try, so a null `data` would abort the whole run
+      try {
+        log('INFO', `Processing: ${filename} (retry: ${retryCount})`);
+        const facts = [];
+        let heuristicFacts = [];
+        // 1. Run Tier 2 Heuristic Extraction (always safe, zero cost)
+        try {
+          const { extractHeuristic } = await import('../src/extractor-heuristic.js');
+          heuristicFacts = extractHeuristic(data.text);
+          for (const f of heuristicFacts) {
+            facts.push({ ...f, tier: 'heuristic' });
+          }
+        } catch (heurErr) { // logged only: the job continues with no facts and is then deleted as processed
+          log('ERROR', `Heuristic extraction failed: ${heurErr.message}`);
+        }
+        log('INFO', `Extracted ${facts.length} heuristic fact(s)`);
+        // Deduplicate facts within this job (the seen-set is rebuilt for every job)
+        const uniqueFacts = [];
+        const seenFacts = new Set();
+        for (const fact of facts) {
+          const key = fact.content.toLowerCase().replace(/\s+/g, ' ').trim(); // case- and whitespace-insensitive key
+          if (!seenFacts.has(key)) {
+            seenFacts.add(key);
+            uniqueFacts.push(fact);
+          }
+        }
+        totalExtracted += uniqueFacts.length;
+        // Process each fact: confidence filter, two dedup checks, then store
+        for (const fact of uniqueFacts) {
+          if (fact.confidence < MIN_CONFIDENCE) {
+            log('INFO', `Skipping low-confidence fact (${fact.confidence}): "${fact.content.slice(0, 50)}..."`);
+            continue;
+          }
+          // Dedup check 1: recent agent memory race
+          if (hasRecentAgentMemory(fact.content, dbModule)) {
+            totalDuplicates++;
+            continue;
+          }
+          // Dedup check 2: existing memory search
+          if (await isDuplicate(fact.content, dbModule, searchHybrid)) {
+            totalDuplicates++;
+            continue;
+          }
+          // Store the new memory
+          try {
+            const memoryId = dbModule.insertMemory(fact.content, fact.confidence, {
+              source_type: 'agent',
+              source_id: data.agent_id || 'pamp-worker',
+              confidence: fact.confidence
+            }, data.namespace || 'shared');
+            // Generate and store embedding
+            const embedding = await generateEmbedding(fact.content); // if this throws, the memory inserted above stays without a vector
+            dbModule.insertVector(memoryId, embedding);
+            totalStored++;
+            log('INFO', `Stored memory #${memoryId}: "${fact.content.slice(0, 60)}..." (${fact.category}, conf=${fact.confidence})`);
+          } catch (storeErr) { // a single fact failing to store does not fail the job
+            log('ERROR', `Failed to store fact: ${storeErr.message}`);
+          }
+        }
+        // Success — remove job file
+        try { unlinkSync(jobPath); } catch (_) {}
+      } catch (jobErr) {
+        totalFailed++;
+        log('ERROR', `Job ${filename} failed: ${jobErr.message}`);
+        // Retry or move to failed
+        if (retryCount >= MAX_RETRIES - 1) { // true when this was attempt number MAX_RETRIES
+          log('WARN', `Job ${filename} exceeded max retries, moving to failed/`);
+          try { renameSync(jobPath, join(FAILED_DIR, filename)); } catch (_) {}
+        } else {
+          // Increment retry count and write it back into the job file for a later run
+          try {
+            data._retries = retryCount + 1;
+            writeFileSync(jobPath, JSON.stringify(data, null, 2));
+          } catch (_) {}
+        }
+      }
+    }
+    log('INFO', `=== Worker complete: extracted=${totalExtracted} stored=${totalStored} dupes=${totalDuplicates} failed=${totalFailed} ===`);
+  } finally {
+    releaseLock();
+  }
+}
+main().catch(err => { // reached when anything escapes main(); the lock is released again defensively
+  log('ERROR', `Worker crashed: ${err.message}`);
+  releaseLock();
+  process.exit(1);
+});

package/bin/extract.js ADDED Viewed

@@ -0,0 +1,185 @@
+#!/usr/bin/env node
+/**
+ * extract.js — Manual Extraction CLI
+ *
+ * Allows developers to test and run extraction on demand.
+ *
+ * Usage:
+ *   npx persyst-mcp extract "I prefer TypeScript over JavaScript"
+ *   npx persyst-mcp extract --file conversation.txt
+ *   npx persyst-mcp extract --tier heuristic "we decided to use PostgreSQL"
+ *   npx persyst-mcp extract --provider gemini "our stack uses Next.js"
+ *   npx persyst-mcp extract --dry-run "always use camelCase"
+ */
+import { argv, stdin, stdout } from 'process';
+import { readFileSync, existsSync } from 'fs';
+// ============================================================
+// ARGUMENT PARSING
+// ============================================================
+const args = argv.slice(2);
+const flags = {};
+const positional = [];
+for (let i = 0; i < args.length; i++) {
+  if (args[i].startsWith('--')) {
+    const flag = args[i].slice(2);
+    // Check if next arg is the value (not another flag)
+    if (i + 1 < args.length && !args[i + 1].startsWith('--')) {
+      flags[flag] = args[i + 1];
+      i++;
+    } else {
+      flags[flag] = true;
+    }
+  } else {
+    positional.push(args[i]);
+  }
+}
+// ============================================================
+// HELP
+// ============================================================
+if (flags.help || args.length === 0) { // --help, or a bare invocation with no arguments at all
+  console.log(`
+  Persyst Extract — Manual Fact Extraction CLI
+  USAGE:
+    npx persyst-mcp extract <text>           Extract from text
+    npx persyst-mcp extract --file <path>    Extract from file
+    echo "text" | npx persyst-mcp extract -  Extract from stdin
+  OPTIONS:
+    --dry-run              Show extracted facts without storing to database
+    --json                 Output results as JSON
+    --file <path>          Read text from a file
+    --help                 Show this help message
+  EXAMPLES:
+    npx persyst-mcp extract "I prefer Postgres over SQLite"
+    npx persyst-mcp extract --dry-run --file ./conversation.log
+  `);
+  process.exit(0); // showing help is a successful run; nothing below executes
+}
+// ============================================================
+// INPUT RESOLUTION
+// ============================================================
+let inputText = '';
+if (flags.file) {
+  // Read from file
+  const filePath = flags.file;
+  if (!existsSync(filePath)) {
+    console.error(`Error: File not found: ${filePath}`);
+    process.exit(1);
+  }
+  inputText = readFileSync(filePath, 'utf8');
+} else if (positional[0] === '-') {
+  // Read from stdin
+  inputText = readFileSync(0, 'utf8');
+} else if (positional.length > 0) {
+  // Read from positional args
+  inputText = positional.join(' ');
+} else {
+  console.error('Error: No text provided. Use --help for usage.');
+  process.exit(1);
+}
+if (!inputText.trim()) {
+  console.error('Error: Empty input text.');
+  process.exit(1);
+}
+// ============================================================
+// EXTRACTION
+// ============================================================
+async function run() {
+  const dryRun = flags['dry-run'] === true;
+  const jsonOutput = flags.json === true;
+  const allFacts = [];
+  // --- Tier 2: Heuristic ---
+  const { extractHeuristic } = await import('../src/extractor-heuristic.js');
+  const heuristicFacts = extractHeuristic(inputText);
+  for (const f of heuristicFacts) {
+    allFacts.push({ ...f, tier: 'heuristic' });
+  }
+  if (!jsonOutput) {
+    console.log(`\n📋 Heuristic fact(s) extracted: ${heuristicFacts.length}`);
+    for (const f of heuristicFacts) {
+      console.log(`  ✓ [${f.category}] (conf: ${f.confidence}) ${f.content}`);
+    }
+  }
+  // --- Summary ---
+  if (!jsonOutput) {
+    console.log(`\n━━━ Total: ${allFacts.length} fact(s) ━━━`);
+  }
+  // --- Store to database (unless dry-run) ---
+  if (!dryRun && allFacts.length > 0) {
+    if (!jsonOutput) {
+      console.log(`\n💾 Storing to database...`);
+    }
+    const { insertMemory, insertVector, memoryExists } = await import('../src/database.js');
+    const { generateEmbedding } = await import('../src/embeddings.js');
+    let stored = 0;
+    let dupes = 0;
+    for (const fact of allFacts) {
+      // Exact dedup
+      if (memoryExists(fact.content)) {
+        dupes++;
+        if (!jsonOutput) {
+          console.log(`  ⏭ Duplicate: "${fact.content.slice(0, 50)}..."`);
+        }
+        continue;
+      }
+      const id = insertMemory(fact.content, fact.confidence, {
+        source_type: 'agent',
+        source_id: `pamp-${fact.tier}`,
+        confidence: fact.confidence
+      });
+      const embedding = await generateEmbedding(fact.content);
+      insertVector(id, embedding);
+      stored++;
+      if (!jsonOutput) {
+        console.log(`  ✅ Stored memory #${id}: "${fact.content.slice(0, 60)}..."`);
+      }
+    }
+    if (!jsonOutput) {
+      console.log(`\n📊 Result: ${stored} stored, ${dupes} duplicates skipped`);
+    }
+  } else if (dryRun && !jsonOutput) {
+    console.log(`\n🔍 Dry run — no facts stored.`);
+  }
+  // --- JSON output ---
+  if (jsonOutput) {
+    console.log(JSON.stringify({
+      input_length: inputText.length,
+      facts: allFacts,
+      dry_run: dryRun
+    }, null, 2));
+  }
+}
+run().catch(err => {
+  console.error(`\n❌ Extraction failed: ${err.message}`);
+  process.exit(1);
+});

package/bin/ingest.js ADDED Viewed

@@ -0,0 +1,82 @@
+#!/usr/bin/env node
+/**
+ * persyst-ingest — Direct Git Commit Ingester
+ *
+ * Usage:
+ *   npx persyst-mcp ingest [repo_path] [count]
+ *
+ * This script runs directly without starting the MCP server, allowing
+ * git hooks or direct CLI commands to populate the memory database.
+ */
+import { getRecentCommits } from '../src/git.js';
+import {
+  insertMemory,
+  insertVector,
+  insertEntity,
+  insertEdge,
+  memoryExistsByHashPrefix
+} from '../src/database.js';
+import { generateEmbedding } from '../src/embeddings.js';
+import { searchCache } from '../src/cache.js';
+const repoPath = process.argv[2] || process.cwd();
+const count = parseInt(process.argv[3], 10) || 10;
+async function run() {
+  console.log(`[persyst] Ingesting git commits for: ${repoPath}`);
+  try {
+    const commits = await getRecentCommits(repoPath, count);
+    let added = 0;
+    let skipped = 0;
+    for (const commit of commits) {
+      const hashPrefix = commit.hash.slice(0, 7);
+      // Check if commit already exists in memories
+      if (memoryExistsByHashPrefix(`[${hashPrefix}]%`)) {
+        skipped++;
+        continue;
+      }
+      // Insert memory with git provenance
+      const id = insertMemory(commit.fullText, commit.importance, {
+        source_type: 'git',
+        source_id: commit.hash,
+        confidence: 0.8
+      });
+      // Generate embedding vector and store
+      const embedding = await generateEmbedding(commit.fullText);
+      insertVector(id, embedding);
+      // Link Author entity
+      const authorId = insertEntity(commit.author, 'person');
+      if (authorId) {
+        insertEdge(authorId, id, 'authored', 'entity', 'memory');
+      }
+      // Link Files Touched
+      for (const file of commit.files) {
+        const fileId = insertEntity(file, 'file');
+        if (fileId) {
+          insertEdge(fileId, id, 'touches', 'entity', 'memory');
+        }
+      }
+      added++;
+    }
+    if (added > 0) {
+      searchCache.invalidate();
+    }
+    console.log(`[persyst] Success: Ingested ${added} commits (${skipped} already existed)`);
+    process.exit(0);
+  } catch (err) {
+    console.error(`[persyst] Ingestion failed: ${err.message}`);
+    process.exit(1);
+  }
+}
+run();