npm - @optave/codegraph - Versions diffs - 1.1.0 → 1.4.1 - Mend

@optave/codegraph 1.1.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

package/LICENSE +190 -190
package/README.md +498 -311
package/grammars/tree-sitter-c_sharp.wasm +0 -0
package/grammars/tree-sitter-go.wasm +0 -0
package/grammars/tree-sitter-hcl.wasm +0 -0
package/grammars/tree-sitter-java.wasm +0 -0
package/grammars/tree-sitter-javascript.wasm +0 -0
package/grammars/tree-sitter-php.wasm +0 -0
package/grammars/tree-sitter-python.wasm +0 -0
package/grammars/tree-sitter-ruby.wasm +0 -0
package/grammars/tree-sitter-rust.wasm +0 -0
package/grammars/tree-sitter-tsx.wasm +0 -0
package/grammars/tree-sitter-typescript.wasm +0 -0
package/package.json +90 -69
package/src/builder.js +161 -162
package/src/cli.js +284 -224
package/src/config.js +103 -55
package/src/constants.js +41 -28
package/src/cycles.js +125 -104
package/src/db.js +129 -117
package/src/embedder.js +253 -59
package/src/export.js +150 -138
package/src/index.js +50 -39
package/src/logger.js +24 -20
package/src/mcp.js +311 -139
package/src/native.js +68 -0
package/src/parser.js +2214 -573
package/src/queries.js +334 -128
package/src/resolve.js +171 -0
package/src/watcher.js +81 -53

package/src/embedder.js CHANGED

@@ -1,45 +1,70 @@
-import fs from "fs";
-import path from "path";
+import fs from 'node:fs';
+import path from 'node:path';
 import Database from 'better-sqlite3';
 import { findDbPath, openReadonlyOrFail } from './db.js';
-import { warn, debug } from './logger.js';
+import { warn } from './logger.js';
 // Lazy-load transformers (heavy, optional module)
 let pipeline = null;
-let cos_sim = null;
+let _cos_sim = null;
 let extractor = null;
 let activeModel = null;
 export const MODELS = {
-  'minilm': {
+  minilm: {
     name: 'Xenova/all-MiniLM-L6-v2',
     dim: 384,
     desc: 'Smallest, fastest (~23MB). General text.',
-    quantized: true
+    quantized: true,
   },
   'jina-small': {
     name: 'Xenova/jina-embeddings-v2-small-en',
     dim: 512,
     desc: 'Small, good quality (~33MB). General text.',
-    quantized: false
+    quantized: false,
   },
   'jina-base': {
     name: 'Xenova/jina-embeddings-v2-base-en',
     dim: 768,
     desc: 'Good quality (~137MB). General text, 8192 token context.',
-    quantized: false
+    quantized: false,
+  },
+  'jina-code': {
+    name: 'Xenova/jina-embeddings-v2-base-code',
+    dim: 768,
+    desc: 'Code-aware (~137MB). Trained on code+text, best for code search.',
+    quantized: false,
   },
-  'nomic': {
+  nomic: {
     name: 'Xenova/nomic-embed-text-v1',
     dim: 768,
-    desc: 'Best local quality (~137MB). 8192 context, beats OpenAI ada-002.',
-    quantized: false
-  }
+    desc: 'Good local quality (~137MB). 8192 context.',
+    quantized: false,
+  },
+  'nomic-v1.5': {
+    name: 'nomic-ai/nomic-embed-text-v1.5',
+    dim: 768,
+    desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.',
+    quantized: false,
+  },
+  'bge-large': {
+    name: 'Xenova/bge-large-en-v1.5',
+    dim: 1024,
+    desc: 'Best general retrieval (~335MB). Top MTEB scores.',
+    quantized: false,
+  },
 };
 export const DEFAULT_MODEL = 'minilm';
-const BATCH_SIZE_MAP = { 'minilm': 32, 'jina-small': 16, 'jina-base': 8, 'nomic': 8 };
+const BATCH_SIZE_MAP = {
+  minilm: 32,
+  'jina-small': 16,
+  'jina-base': 8,
+  'jina-code': 8,
+  nomic: 8,
+  'nomic-v1.5': 8,
+  'bge-large': 4,
+};
 const DEFAULT_BATCH_SIZE = 32;
 function getModelConfig(modelKey) {
@@ -62,7 +87,7 @@ async function loadTransformers() {
   } catch {
     console.error(
       'Semantic search requires @huggingface/transformers.\n' +
-      'Install it with: npm install @huggingface/transformers'
+        'Install it with: npm install @huggingface/transformers',
     );
     process.exit(1);
   }
@@ -75,7 +100,7 @@ async function loadModel(modelKey) {
   const transformers = await loadTransformers();
   pipeline = transformers.pipeline;
-  cos_sim = transformers.cos_sim;
+  _cos_sim = transformers.cos_sim;
   console.log(`Loading embedding model: ${config.name} (${config.dim}d)...`);
   const opts = config.quantized ? { quantized: true } : {};
@@ -119,7 +144,9 @@ export async function embed(texts, modelKey) {
  * Cosine similarity between two Float32Arrays.
  */
 export function cosineSim(a, b) {
-  let dot = 0, normA = 0, normB = 0;
+  let dot = 0,
+    normA = 0,
+    normB = 0;
   for (let i = 0; i < a.length; i++) {
     dot += a[i] * b[i];
     normA += a[i] * a[i];
@@ -157,9 +184,11 @@ export async function buildEmbeddings(rootDir, modelKey) {
   db.exec('DELETE FROM embeddings');
   db.exec('DELETE FROM embedding_meta');
-  const nodes = db.prepare(
-    `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`
-  ).all();
+  const nodes = db
+    .prepare(
+      `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`,
+    )
+    .all();
   console.log(`Building embeddings for ${nodes.length} symbols...`);
@@ -200,7 +229,9 @@ export async function buildEmbeddings(rootDir, modelKey) {
   console.log(`Embedding ${texts.length} symbols...`);
   const { vectors, dim } = await embed(texts, modelKey);
-  const insert = db.prepare('INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview) VALUES (?, ?, ?)');
+  const insert = db.prepare(
+    'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview) VALUES (?, ?, ?)',
+  );
   const insertMeta = db.prepare('INSERT OR REPLACE INTO embedding_meta (key, value) VALUES (?, ?)');
   const insertAll = db.transaction(() => {
     for (let i = 0; i < vectors.length; i++) {
@@ -214,32 +245,31 @@ export async function buildEmbeddings(rootDir, modelKey) {
   });
   insertAll();
-  console.log(`\nStored ${vectors.length} embeddings (${dim}d, ${getModelConfig(modelKey).name}) in graph.db`);
+  console.log(
+    `\nStored ${vectors.length} embeddings (${dim}d, ${getModelConfig(modelKey).name}) in graph.db`,
+  );
   db.close();
 }
 /**
- * Semantic search with pre-filter support to reduce the search space.
+ * Shared setup for search functions: opens DB, validates embeddings/model, loads rows.
+ * Returns { db, rows, modelKey, storedDim } or null on failure (prints error).
  */
-export async function search(query, customDbPath, opts = {}) {
-  const limit = opts.limit || 15;
-  const noTests = opts.noTests || false;
-  const minScore = opts.minScore || 0.2;
+function _prepareSearch(customDbPath, opts = {}) {
   const db = openReadonlyOrFail(customDbPath);
   let count;
   try {
-    count = db.prepare("SELECT COUNT(*) as c FROM embeddings").get().c;
+    count = db.prepare('SELECT COUNT(*) as c FROM embeddings').get().c;
   } catch {
     console.log('No embeddings table found. Run `codegraph embed` first.');
     db.close();
-    return;
+    return null;
   }
   if (count === 0) {
     console.log('No embeddings found. Run `codegraph embed` first.');
     db.close();
-    return;
+    return null;
   }
   let storedModel = null;
@@ -248,26 +278,23 @@ export async function search(query, customDbPath, opts = {}) {
     const modelRow = db.prepare("SELECT value FROM embedding_meta WHERE key = 'model'").get();
     const dimRow = db.prepare("SELECT value FROM embedding_meta WHERE key = 'dim'").get();
     if (modelRow) storedModel = modelRow.value;
-    if (dimRow) storedDim = parseInt(dimRow.value);
-  } catch { /* old DB without meta table */ }
+    if (dimRow) storedDim = parseInt(dimRow.value, 10);
+  } catch {
+    /* old DB without meta table */
+  }
   let modelKey = opts.model || null;
   if (!modelKey && storedModel) {
     for (const [key, config] of Object.entries(MODELS)) {
-      if (config.name === storedModel) { modelKey = key; break; }
+      if (config.name === storedModel) {
+        modelKey = key;
+        break;
+      }
     }
   }
-  const { vectors: [queryVec], dim } = await embed([query], modelKey);
-  if (storedDim && dim !== storedDim) {
-    console.log(`Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`);
-    console.log(`  Re-run \`codegraph embed\` with the same model, or use --model to match.`);
-    db.close();
-    return;
-  }
   // Pre-filter: allow filtering by kind or file pattern to reduce search space
+  const noTests = opts.noTests || false;
   const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./;
   let sql = `
     SELECT e.node_id, e.vector, e.text_preview, n.name, n.kind, n.file, n.line
@@ -285,15 +312,45 @@ export async function search(query, customDbPath, opts = {}) {
     params.push(`%${opts.filePattern}%`);
   }
   if (conditions.length > 0) {
-    sql += ' WHERE ' + conditions.join(' AND ');
+    sql += ` WHERE ${conditions.join(' AND ')}`;
+  }
+  let rows = db.prepare(sql).all(...params);
+  if (noTests) {
+    rows = rows.filter((row) => !TEST_PATTERN.test(row.file));
   }
-  const rows = db.prepare(sql).all(...params);
+  return { db, rows, modelKey, storedDim };
+}
+/**
+ * Single-query semantic search — returns data instead of printing.
+ * Returns { results: [{ name, kind, file, line, similarity }] } or null on failure.
+ */
+export async function searchData(query, customDbPath, opts = {}) {
+  const limit = opts.limit || 15;
+  const minScore = opts.minScore || 0.2;
+  const prepared = _prepareSearch(customDbPath, opts);
+  if (!prepared) return null;
+  const { db, rows, modelKey, storedDim } = prepared;
+  const {
+    vectors: [queryVec],
+    dim,
+  } = await embed([query], modelKey);
+  if (storedDim && dim !== storedDim) {
+    console.log(
+      `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
+    );
+    console.log(`  Re-run \`codegraph embed\` with the same model, or use --model to match.`);
+    db.close();
+    return null;
+  }
   const results = [];
   for (const row of rows) {
-    if (noTests && TEST_PATTERN.test(row.file)) continue;
     const vec = new Float32Array(new Uint8Array(row.vector).buffer);
     const sim = cosineSim(queryVec, vec);
@@ -303,28 +360,165 @@ export async function search(query, customDbPath, opts = {}) {
         kind: row.kind,
         file: row.file,
         line: row.line,
-        similarity: sim
+        similarity: sim,
       });
     }
   }
   results.sort((a, b) => b.similarity - a.similarity);
+  db.close();
+  return { results: results.slice(0, limit) };
+}
+/**
+ * Multi-query semantic search with Reciprocal Rank Fusion (RRF).
+ * Returns { results: [{ name, kind, file, line, rrf, queryScores }] } or null on failure.
+ */
+export async function multiSearchData(queries, customDbPath, opts = {}) {
+  const limit = opts.limit || 15;
+  const minScore = opts.minScore || 0.2;
+  const k = opts.rrfK || 60;
+  const prepared = _prepareSearch(customDbPath, opts);
+  if (!prepared) return null;
+  const { db, rows, modelKey, storedDim } = prepared;
+  const { vectors: queryVecs, dim } = await embed(queries, modelKey);
+  // Warn about similar queries that may bias RRF results
+  const SIMILARITY_WARN_THRESHOLD = 0.85;
+  for (let i = 0; i < queryVecs.length; i++) {
+    for (let j = i + 1; j < queryVecs.length; j++) {
+      const sim = cosineSim(queryVecs[i], queryVecs[j]);
+      if (sim >= SIMILARITY_WARN_THRESHOLD) {
+        warn(
+          `Queries "${queries[i]}" and "${queries[j]}" are very similar ` +
+            `(${(sim * 100).toFixed(0)}% cosine similarity). ` +
+            `This may bias RRF results toward their shared matches. ` +
+            `Consider using more distinct queries.`,
+        );
+      }
+    }
+  }
+  if (storedDim && dim !== storedDim) {
+    console.log(
+      `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
+    );
+    console.log(`  Re-run \`codegraph embed\` with the same model, or use --model to match.`);
+    db.close();
+    return null;
+  }
-  console.log(`\nSemantic search: "${query}"\n`);
+  // Parse row vectors once
+  const rowVecs = rows.map((row) => new Float32Array(new Uint8Array(row.vector).buffer));
-  const topResults = results.slice(0, limit);
-  if (topResults.length === 0) {
-    console.log('  No results above threshold.');
-  } else {
-    for (const r of topResults) {
-      const bar = '#'.repeat(Math.round(r.similarity * 20));
-      const kindIcon = r.kind === 'function' ? 'f' : r.kind === 'class' ? '*' : 'o';
-      console.log(`  ${(r.similarity * 100).toFixed(1)}% ${bar}`);
-      console.log(`    ${kindIcon} ${r.name} -- ${r.file}:${r.line}`);
+  // For each query: compute similarities, filter by minScore, rank
+  const perQueryRanked = queries.map((_query, qi) => {
+    const scored = [];
+    for (let ri = 0; ri < rows.length; ri++) {
+      const sim = cosineSim(queryVecs[qi], rowVecs[ri]);
+      if (sim >= minScore) {
+        scored.push({ rowIndex: ri, similarity: sim });
+      }
     }
+    scored.sort((a, b) => b.similarity - a.similarity);
+    // Assign 1-indexed ranks
+    return scored.map((item, rank) => ({ ...item, rank: rank + 1 }));
+  });
+  // Fuse results using RRF: for each unique row, sum 1/(k + rank_i) across queries
+  const fusionMap = new Map(); // rowIndex -> { rrfScore, queryScores[] }
+  for (let qi = 0; qi < queries.length; qi++) {
+    for (const item of perQueryRanked[qi]) {
+      if (!fusionMap.has(item.rowIndex)) {
+        fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] });
+      }
+      const entry = fusionMap.get(item.rowIndex);
+      entry.rrfScore += 1 / (k + item.rank);
+      entry.queryScores.push({
+        query: queries[qi],
+        similarity: item.similarity,
+        rank: item.rank,
+      });
+    }
+  }
+  // Build results sorted by RRF score
+  const results = [];
+  for (const [rowIndex, entry] of fusionMap) {
+    const row = rows[rowIndex];
+    results.push({
+      name: row.name,
+      kind: row.kind,
+      file: row.file,
+      line: row.line,
+      rrf: entry.rrfScore,
+      queryScores: entry.queryScores,
+    });
   }
-  console.log(`\n  ${results.length} results total (showing top ${topResults.length})\n`);
+  results.sort((a, b) => b.rrf - a.rrf);
   db.close();
+  return { results: results.slice(0, limit) };
 }
+/**
+ * Semantic search with pre-filter support — CLI wrapper with multi-query detection.
+ */
+export async function search(query, customDbPath, opts = {}) {
+  // Split by semicolons, trim, filter empties
+  const queries = query
+    .split(';')
+    .map((q) => q.trim())
+    .filter((q) => q.length > 0);
+  if (queries.length <= 1) {
+    // Single-query path — preserve original output format
+    const singleQuery = queries[0] || query;
+    const data = await searchData(singleQuery, customDbPath, opts);
+    if (!data) return;
+    console.log(`\nSemantic search: "${singleQuery}"\n`);
+    if (data.results.length === 0) {
+      console.log('  No results above threshold.');
+    } else {
+      for (const r of data.results) {
+        const bar = '#'.repeat(Math.round(r.similarity * 20));
+        const kindIcon = r.kind === 'function' ? 'f' : r.kind === 'class' ? '*' : 'o';
+        console.log(`  ${(r.similarity * 100).toFixed(1)}% ${bar}`);
+        console.log(`    ${kindIcon} ${r.name} -- ${r.file}:${r.line}`);
+      }
+    }
+    console.log(`\n  ${data.results.length} results shown\n`);
+  } else {
+    // Multi-query path — RRF ranking
+    const data = await multiSearchData(queries, customDbPath, opts);
+    if (!data) return;
+    console.log(`\nMulti-query semantic search (RRF, k=${opts.rrfK || 60}):`);
+    queries.forEach((q, i) => {
+      console.log(`  [${i + 1}] "${q}"`);
+    });
+    console.log();
+    if (data.results.length === 0) {
+      console.log('  No results above threshold.');
+    } else {
+      for (const r of data.results) {
+        const kindIcon = r.kind === 'function' ? 'f' : r.kind === 'class' ? '*' : 'o';
+        console.log(`  RRF ${r.rrf.toFixed(4)}  ${kindIcon} ${r.name} -- ${r.file}:${r.line}`);
+        for (const qs of r.queryScores) {
+          const bar = '#'.repeat(Math.round(qs.similarity * 20));
+          console.log(
+            `    [${queries.indexOf(qs.query) + 1}] ${(qs.similarity * 100).toFixed(1)}% ${bar} (rank ${qs.rank})`,
+          );
+        }
+      }
+    }
+    console.log(`\n  ${data.results.length} results shown\n`);
+  }
+}