npm - @lojban/semantic-search-mcp - Versions diffs - 1.0.0 → 1.0.2 - Mend

@lojban/semantic-search-mcp 1.0.0 → 1.0.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -12,7 +12,7 @@ Use it in **Cursor**, **Claude Code**, or any IDE that supports MCP to search th
 ## How it works
-- **Indexing**: Scans directories for `.txt`, `.md`, `.tsv`, `.csv`, `.json`, `.html`, `.xml`. Each non-empty line gets a vector embedding (via [Hugging Face Transformers.js](https://huggingface.co/docs/transformers.js), model `Xenova/all-MiniLM-L6-v2`) and is stored in a local SQLite database with [sqlite-vec](https://github.com/asg017/sqlite-vec).
+- **Indexing**: Scans directories for `.txt`, `.md`, `.tsv`, `.csv`, `.json`, `.html`, `.xml`. Each non-empty line gets a vector embedding (via [Hugging Face Transformers.js](https://huggingface.co/docs/transformers.js), model `Xenova/all-MiniLM-L6-v2`) and is stored in a local SQLite database with [@dao-xyz/sqlite3-vec](https://www.npmjs.com/package/@dao-xyz/sqlite3-vec) (SQLite + sqlite-vec for Node and browser).
 - **Search**: You send a natural-language query; the server embeds it and returns the closest lines by cosine similarity.
 - **Storage**: Index is stored in your project's `.semantic-search/data/` (or set `SEMANTIC_SEARCH_DATA_DIR`). No cloud, no API keys.

package/package.json CHANGED Viewed

@@ -1,21 +1,19 @@
 {
   "name": "@lojban/semantic-search-mcp",
-  "version": "1.0.0",
+  "version": "1.0.2",
   "description": "Local-first MCP server for semantic search using transformers.js and SQLite",
   "type": "module",
   "scripts": {
     "dev": "tsx src/index.ts"
   },
   "dependencies": {
+    "@dao-xyz/sqlite3-vec": "^0.0.19",
     "@huggingface/transformers": "^3.0.0",
     "@modelcontextprotocol/sdk": "^1.0.0",
-    "better-sqlite3": "^11.0.0",
     "glob": "^10.3.0",
-    "sqlite-vec": "^0.1.0",
     "tsx": "^4.0.0"
   },
   "devDependencies": {
-    "@types/better-sqlite3": "^7.6.0",
     "@types/node": "^20.0.0",
     "typescript": "^5.0.0"
   },

package/src/index.ts CHANGED Viewed

@@ -7,210 +7,196 @@ import {
 } from '@modelcontextprotocol/sdk/types.js';
 import path from 'path';
 import { getEmbedding, getBatchEmbeddings } from './embeddings.js';
-import { VectorStorage, SearchResult } from './storage.js';
+import { createVectorStorage, type SearchResult } from './storage.js';
 import { scanDirectories } from './scanner.js';
-import { mkdirSync } from 'fs';
 // Data dir: use env, or project cwd so each workspace has its own index when run via npx from Cursor
 const dataDir =
   process.env.SEMANTIC_SEARCH_DATA_DIR ||
   path.join(process.cwd(), '.semantic-search', 'data');
 const DB_PATH = path.join(dataDir, 'vectors.db');
-mkdirSync(path.dirname(DB_PATH), { recursive: true });
-// Initialize storage
-const storage = new VectorStorage(DB_PATH);
+async function main() {
+  const storage = await createVectorStorage(DB_PATH);
-// Create MCP server
-const server = new Server(
-  {
-    name: 'semantic-search',
-    version: '1.0.0',
-  },
-  {
-    capabilities: {
-      tools: {},
+  const server = new Server(
+    {
+      name: 'semantic-search',
+      version: '1.0.0',
     },
-  }
-);
-// Define available tools
-server.setRequestHandler(ListToolsRequestSchema, async () => {
-  return {
-    tools: [
-      {
-        name: 'index_directories',
-        description: 'Scan directories and index all text file lines for semantic search. Each line gets a vector embedding.',
-        inputSchema: {
-          type: 'object',
-          properties: {
-            directories: {
-              type: 'array',
-              items: { type: 'string' },
-              description: 'List of directory paths to scan and index. Defaults to SEMANTIC_SEARCH_INDEX_DIRS (comma-separated) if unset.',
-            },
-            clear_existing: {
-              type: 'boolean',
-              description: 'Whether to clear the existing index before indexing (default: false)',
-              default: false,
+    {
+      capabilities: {
+        tools: {},
+      },
+    }
+  );
+  server.setRequestHandler(ListToolsRequestSchema, async () => {
+    return {
+      tools: [
+        {
+          name: 'index_directories',
+          description: 'Scan directories and index all text file lines for semantic search. Each line gets a vector embedding.',
+          inputSchema: {
+            type: 'object',
+            properties: {
+              directories: {
+                type: 'array',
+                items: { type: 'string' },
+                description: 'List of directory paths to scan and index. Defaults to SEMANTIC_SEARCH_INDEX_DIRS (comma-separated) if unset.',
+              },
+              clear_existing: {
+                type: 'boolean',
+                description: 'Whether to clear the existing index before indexing (default: false)',
+                default: false,
+              },
             },
+            required: [],
           },
-          required: [],
         },
-      },
-      {
-        name: 'search',
-        description: 'Search for lines semantically similar to the query. Returns the most relevant lines from indexed files.',
-        inputSchema: {
-          type: 'object',
-          properties: {
-            query: {
-              type: 'string',
-              description: 'The search query (natural language)',
-            },
-            limit: {
-              type: 'number',
-              description: 'Maximum number of results to return (default: 10)',
-              default: 10,
+        {
+          name: 'search',
+          description: 'Search for lines semantically similar to the query. Returns the most relevant lines from indexed files.',
+          inputSchema: {
+            type: 'object',
+            properties: {
+              query: {
+                type: 'string',
+                description: 'The search query (natural language)',
+              },
+              limit: {
+                type: 'number',
+                description: 'Maximum number of results to return (default: 10)',
+                default: 10,
+              },
             },
+            required: ['query'],
           },
-          required: ['query'],
         },
-      },
-      {
-        name: 'get_index_stats',
-        description: 'Get statistics about the current index (number of files and lines indexed)',
-        inputSchema: {
-          type: 'object',
-          properties: {},
+        {
+          name: 'get_index_stats',
+          description: 'Get statistics about the current index (number of files and lines indexed)',
+          inputSchema: {
+            type: 'object',
+            properties: {},
+          },
         },
-      },
-    ],
-  };
-});
-// Handle tool calls
-server.setRequestHandler(CallToolRequestSchema, async (request) => {
-  const { name, arguments: args } = request.params;
-  try {
-    switch (name) {
-      case 'index_directories': {
-        let directories = (args as { directories?: string[]; clear_existing?: boolean }).directories;
-        if (!directories?.length) {
-          const envDirs = process.env.SEMANTIC_SEARCH_INDEX_DIRS;
-          directories = envDirs ? envDirs.split(',').map((d) => d.trim()).filter(Boolean) : [];
-        }
-        if (!directories.length) {
-          throw new Error('No directories to index. Set directories in the request or SEMANTIC_SEARCH_INDEX_DIRS (comma-separated).');
+      ],
+    };
+  });
+  server.setRequestHandler(CallToolRequestSchema, async (request) => {
+    const { name, arguments: args } = request.params;
+    try {
+      switch (name) {
+        case 'index_directories': {
+          let directories = (args as { directories?: string[]; clear_existing?: boolean }).directories;
+          if (!directories?.length) {
+            const envDirs = process.env.SEMANTIC_SEARCH_INDEX_DIRS;
+            directories = envDirs ? envDirs.split(',').map((d) => d.trim()).filter(Boolean) : [];
+          }
+          if (!directories.length) {
+            throw new Error('No directories to index. Set directories in the request or SEMANTIC_SEARCH_INDEX_DIRS (comma-separated).');
+          }
+          const clearExisting = (args as { directories?: string[]; clear_existing?: boolean }).clear_existing ?? false;
+          if (clearExisting) {
+            storage.clear();
+          }
+          console.error(`Scanning ${directories.length} directories...`);
+          const lines = await scanDirectories(directories);
+          console.error(`Found ${lines.length} lines to index`);
+          const batchSize = 50;
+          let indexed = 0;
+          for (let i = 0; i < lines.length; i += batchSize) {
+            const batch = lines.slice(i, i + batchSize);
+            const texts = batch.map(l => l.content);
+            const embeddings = await getBatchEmbeddings(texts);
+            const batchData = batch.map((line, idx) => ({
+              filePath: line.filePath,
+              lineNumber: line.lineNumber,
+              content: line.content,
+              embedding: embeddings[idx],
+            }));
+            await storage.upsertLinesBatch(batchData);
+            indexed += batch.length;
+            console.error(`Indexed ${indexed}/${lines.length} lines`);
+          }
+          const stats = await storage.getStats();
+          return {
+            content: [
+              {
+                type: 'text',
+                text: JSON.stringify({
+                  success: true,
+                  indexed_lines: stats.totalLines,
+                  indexed_files: stats.totalFiles,
+                  message: `Successfully indexed ${stats.totalLines} lines from ${stats.totalFiles} files`,
+                }),
+              },
+            ],
+          };
         }
-        const clearExisting = (args as { directories?: string[]; clear_existing?: boolean }).clear_existing ?? false;
-        if (clearExisting) {
-          storage.clear();
+        case 'search': {
+          const query = (args as { query: string; limit?: number }).query;
+          const limit = (args as { query: string; limit?: number }).limit ?? 10;
+          const queryEmbedding = await getEmbedding(query);
+          const results = await storage.search(queryEmbedding, limit);
+          return {
+            content: [
+              {
+                type: 'text',
+                text: JSON.stringify({
+                  query,
+                  results: results.map((r: SearchResult) => ({
+                    file: r.file_path,
+                    line: r.line_number,
+                    content: r.content,
+                    score: Math.round(r.score * 1000) / 1000,
+                  })),
+                }),
+              },
+            ],
+          };
         }
-        // Scan directories
-        console.error(`Scanning ${directories.length} directories...`);
-        const lines = await scanDirectories(directories);
-        console.error(`Found ${lines.length} lines to index`);
-        // Generate embeddings and store
-        const batchSize = 50;
-        let indexed = 0;
-        for (let i = 0; i < lines.length; i += batchSize) {
-          const batch = lines.slice(i, i + batchSize);
-          const texts = batch.map(l => l.content);
-          const embeddings = await getBatchEmbeddings(texts);
-          const batchData = batch.map((line, idx) => ({
-            filePath: line.filePath,
-            lineNumber: line.lineNumber,
-            content: line.content,
-            embedding: embeddings[idx],
-          }));
-          storage.upsertLinesBatch(batchData);
-          indexed += batch.length;
-          console.error(`Indexed ${indexed}/${lines.length} lines`);
+        case 'get_index_stats': {
+          const stats = await storage.getStats();
+          return {
+            content: [
+              {
+                type: 'text',
+                text: JSON.stringify({
+                  total_files: stats.totalFiles,
+                  total_lines: stats.totalLines,
+                }),
+              },
+            ],
+          };
         }
-        const stats = storage.getStats();
-        return {
-          content: [
-            {
-              type: 'text',
-              text: JSON.stringify({
-                success: true,
-                indexed_lines: stats.totalLines,
-                indexed_files: stats.totalFiles,
-                message: `Successfully indexed ${stats.totalLines} lines from ${stats.totalFiles} files`,
-              }),
-            },
-          ],
-        };
+        default:
+          throw new Error(`Unknown tool: ${name}`);
       }
-      case 'search': {
-        const query = (args as { query: string; limit?: number }).query;
-        const limit = (args as { query: string; limit?: number }).limit ?? 10;
-        // Generate query embedding
-        const queryEmbedding = await getEmbedding(query);
-        // Search
-        const results = storage.search(queryEmbedding, limit);
-        return {
-          content: [
-            {
-              type: 'text',
-              text: JSON.stringify({
-                query,
-                results: results.map((r: SearchResult) => ({
-                  file: r.file_path,
-                  line: r.line_number,
-                  content: r.content,
-                  score: Math.round(r.score * 1000) / 1000,
-                })),
-              }),
-            },
-          ],
-        };
-      }
-      case 'get_index_stats': {
-        const stats = storage.getStats();
-        return {
-          content: [
-            {
-              type: 'text',
-              text: JSON.stringify({
-                total_files: stats.totalFiles,
-                total_lines: stats.totalLines,
-              }),
-            },
-          ],
-        };
-      }
-      default:
-        throw new Error(`Unknown tool: ${name}`);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        content: [{ type: 'text', text: JSON.stringify({ error: message }) }],
+        isError: true,
+      };
     }
-  } catch (error) {
-    const message = error instanceof Error ? error.message : String(error);
-    return {
-      content: [{ type: 'text', text: JSON.stringify({ error: message }) }],
-      isError: true,
-    };
-  }
-});
+  });
-// Start server
-async function main() {
   const transport = new StdioServerTransport();
   await server.connect(transport);
   console.error('Semantic Search MCP Server running on stdio');

package/src/storage.ts CHANGED Viewed

@@ -1,6 +1,7 @@
-import Database from 'better-sqlite3';
+import pkg from '@dao-xyz/sqlite3-vec';
+const { createDatabase } = pkg;
 import path from 'path';
-import * as sqliteVec from 'sqlite-vec';
+import { mkdirSync } from 'fs';
 const EMBEDDING_DIM = 384; // all-MiniLM-L6-v2 produces 384-dim vectors
@@ -19,20 +20,17 @@ export interface SearchResult {
   score: number;
 }
+type DB = Awaited<ReturnType<typeof createDatabase>>;
 export class VectorStorage {
-  private db: Database.Database;
-  constructor(dbPath: string) {
-    this.db = new Database(dbPath);
-    // Load sqlite-vec extension
-    sqliteVec.load(this.db);
+  private db: DB;
+  constructor(db: DB) {
+    this.db = db;
     this.init();
   }
   private init(): void {
-    // Create regular table for metadata
     this.db.exec(`
       CREATE TABLE IF NOT EXISTS lines (
         id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -41,11 +39,8 @@ export class VectorStorage {
         content TEXT NOT NULL,
         UNIQUE(file_path, line_number)
       );
       CREATE INDEX IF NOT EXISTS idx_file ON lines(file_path);
     `);
-    // Create virtual table for vectors using sqlite-vec
     this.db.exec(`
       CREATE VIRTUAL TABLE IF NOT EXISTS vec_lines USING vec0(
         line_id INTEGER PRIMARY KEY,
@@ -57,126 +52,90 @@ export class VectorStorage {
   /**
    * Insert or update a line with its embedding
    */
-  upsertLine(filePath: string, lineNumber: number, content: string, embedding: Float32Array): void {
-    const insertLine = this.db.prepare(`
-      INSERT INTO lines (file_path, line_number, content)
-      VALUES (?, ?, ?)
-      ON CONFLICT(file_path, line_number) DO UPDATE SET
-        content = excluded.content
-      RETURNING id
-    `);
-    const result = insertLine.get(filePath, lineNumber, content) as { id: number };
-    const lineId = result.id;
-    // Insert/update vector
-    // vec0 tables don't support UPSERT, so we delete first just in case
-    const safeId = BigInt(lineId);
-    this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?').run(safeId);
-    const insertVec = this.db.prepare(`
-      INSERT INTO vec_lines (line_id, embedding)
-      VALUES (?, ?)
-    `);
-    insertVec.run(safeId, JSON.stringify(Array.from(embedding)));
+  async upsertLine(filePath: string, lineNumber: number, content: string, embedding: Float32Array): Promise<void> {
+    const insertLine = await this.db.prepare(
+      `INSERT INTO lines (file_path, line_number, content)
+       VALUES (?, ?, ?)
+       ON CONFLICT(file_path, line_number) DO UPDATE SET content = excluded.content`
+    );
+    insertLine.run([filePath, lineNumber, content]);
+    const sel = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
+    const row = sel.get([filePath, lineNumber]) as { id: number } | undefined;
+    if (row == null) throw new Error('Failed to get line id');
+    const id = Math.trunc(Number(row.id));
+    (await this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?')).run([id]);
+    (await this.db.prepare('INSERT INTO vec_lines (line_id, embedding) VALUES (?, ?)')).run([id, embedding.buffer]);
   }
   /**
    * Batch insert lines for efficiency
    */
-  upsertLinesBatch(lines: Array<{ filePath: string; lineNumber: number; content: string; embedding: Float32Array }>): void {
-    const insertLine = this.db.prepare(`
-      INSERT INTO lines (file_path, line_number, content)
-      VALUES (?, ?, ?)
-      ON CONFLICT(file_path, line_number) DO UPDATE SET
-        content = excluded.content
-      RETURNING id
-    `);
-    // vec0 doesn't support UPSERT, so we use DELETE + INSERT
-    const deleteVec = this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?');
-    const insertVec = this.db.prepare('INSERT INTO vec_lines (line_id, embedding) VALUES (?, ?)');
-    const insertMany = this.db.transaction((items: typeof lines) => {
-      for (const item of items) {
-        const result = insertLine.get(item.filePath, item.lineNumber, item.content) as { id: number | bigint };
-        // Ensure id is treated as appropriate integer type for vec0
-        const id = result.id;
-        const safeId = BigInt(id);
-        deleteVec.run(safeId);
-        insertVec.run(safeId, JSON.stringify(Array.from(item.embedding)));
-      }
-    });
-    return insertMany(lines);
+  async upsertLinesBatch(
+    lines: Array<{ filePath: string; lineNumber: number; content: string; embedding: Float32Array }>
+  ): Promise<void> {
+    const insertLine = await this.db.prepare(
+      `INSERT INTO lines (file_path, line_number, content)
+       VALUES (?, ?, ?)
+       ON CONFLICT(file_path, line_number) DO UPDATE SET content = excluded.content`
+    );
+    const selId = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
+    const deleteVec = await this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?');
+    const insertVec = await this.db.prepare('INSERT INTO vec_lines (line_id, embedding) VALUES (?, ?)');
+    for (const item of lines) {
+      insertLine.run([item.filePath, item.lineNumber, item.content]);
+      const row = selId.get([item.filePath, item.lineNumber]) as { id: number };
+      const id = Math.trunc(Number(row.id));
+      deleteVec.run([id]);
+      insertVec.run([id, item.embedding.buffer]);
+    }
   }
   /**
-   * Search for similar lines using sqlite-vec's native cosine similarity
+   * Search for similar lines using sqlite-vec cosine distance
    */
-  search(queryEmbedding: Float32Array, limit: number = 10): SearchResult[] {
-    const stmt = this.db.prepare(`
-      SELECT
+  async search(queryEmbedding: Float32Array, limit: number = 10): Promise<SearchResult[]> {
+    const stmt = await this.db.prepare(`
+      SELECT
         l.file_path,
         l.line_number,
         l.content,
-        vec_distance_cosine(v.embedding, ?) as distance
+        vec_distance_cosine(v.embedding, ?1) AS distance
       FROM vec_lines v
       INNER JOIN lines l ON v.line_id = l.id
       ORDER BY distance
-      LIMIT ?
+      LIMIT ?2
     `);
-    const rows = stmt.all(JSON.stringify(Array.from(queryEmbedding)), limit) as Array<{
+    const rows = stmt.all([queryEmbedding.buffer, limit]) as Array<{
       file_path: string;
       line_number: number;
       content: string;
       distance: number;
     }>;
-    return rows.map(row => ({
+    return rows.map((row) => ({
       file_path: row.file_path,
       line_number: row.line_number,
       content: row.content,
-      score: 1 - row.distance, // Convert distance to similarity score
+      score: 1 - row.distance,
     }));
   }
-  /**
-   * Get index statistics
-   */
-  getStats(): { totalFiles: number; totalLines: number } {
-    const filesStmt = this.db.prepare('SELECT COUNT(DISTINCT file_path) as count FROM lines');
-    const linesStmt = this.db.prepare('SELECT COUNT(*) as count FROM lines');
-    const totalFiles = (filesStmt.get() as { count: number }).count;
-    const totalLines = (linesStmt.get() as { count: number }).count;
-    return { totalFiles, totalLines };
+  async getStats(): Promise<{ totalFiles: number; totalLines: number }> {
+    const filesRow = (await this.db.prepare('SELECT COUNT(DISTINCT file_path) AS count FROM lines')).get() as { count: number } | undefined;
+    const linesRow = (await this.db.prepare('SELECT COUNT(*) AS count FROM lines')).get() as { count: number } | undefined;
+    return {
+      totalFiles: filesRow?.count ?? 0,
+      totalLines: linesRow?.count ?? 0,
+    };
   }
-  /**
-   * Remove all lines for a specific file
-   */
-  removeFile(filePath: string): void {
-    const deleteVecs = this.db.prepare(`
-      DELETE FROM vec_lines
-      WHERE line_id IN (SELECT id FROM lines WHERE file_path = ?)
-    `);
-    const deleteLines = this.db.prepare('DELETE FROM lines WHERE file_path = ?');
-    const transaction = this.db.transaction(() => {
-      deleteVecs.run(filePath);
-      deleteLines.run(filePath);
-    });
-    transaction();
+  async removeFile(filePath: string): Promise<void> {
+    (await this.db.prepare('DELETE FROM vec_lines WHERE line_id IN (SELECT id FROM lines WHERE file_path = ?)')).run([filePath]);
+    (await this.db.prepare('DELETE FROM lines WHERE file_path = ?')).run([filePath]);
   }
-  /**
-   * Clear the entire index
-   */
   clear(): void {
     this.db.exec('DELETE FROM vec_lines');
     this.db.exec('DELETE FROM lines');
@@ -187,3 +146,15 @@ export class VectorStorage {
   }
 }
+/**
+ * Create and open the vector storage (async). Use this instead of `new VectorStorage()`.
+ */
+export async function createVectorStorage(dbPath: string): Promise<VectorStorage> {
+  mkdirSync(path.dirname(dbPath), { recursive: true });
+  const db = await createDatabase({
+    database: dbPath,
+  });
+  await db.open();
+  return new VectorStorage(db);
+}