npm - @softerist/heuristic-mcp - Versions diffs - 2.1.47 → 3.0.0 - Mend

@softerist/heuristic-mcp 2.1.47 → 3.0.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (109)

package/.agent/workflows/code-review.md +60 -0
package/.prettierrc +7 -0
package/ARCHITECTURE.md +105 -170
package/CONTRIBUTING.md +32 -113
package/GEMINI.md +73 -0
package/LICENSE +21 -21
package/README.md +161 -54
package/config.json +876 -75
package/debug-pids.js +27 -0
package/eslint.config.js +36 -0
package/features/ann-config.js +37 -26
package/features/clear-cache.js +28 -19
package/features/find-similar-code.js +142 -66
package/features/hybrid-search.js +253 -93
package/features/index-codebase.js +1455 -394
package/features/lifecycle.js +813 -180
package/features/register.js +58 -52
package/index.js +450 -306
package/lib/cache-ops.js +22 -0
package/lib/cache-utils.js +68 -0
package/lib/cache.js +1392 -587
package/lib/call-graph.js +165 -50
package/lib/cli.js +154 -0
package/lib/config.js +462 -121
package/lib/embedding-process.js +77 -0
package/lib/embedding-worker.js +545 -30
package/lib/ignore-patterns.js +61 -59
package/lib/json-worker.js +14 -0
package/lib/json-writer.js +344 -0
package/lib/logging.js +88 -0
package/lib/memory-logger.js +13 -0
package/lib/project-detector.js +13 -17
package/lib/server-lifecycle.js +38 -0
package/lib/settings-editor.js +645 -0
package/lib/tokenizer.js +207 -104
package/lib/utils.js +273 -198
package/lib/vector-store-binary.js +592 -0
package/mcp_config.example.json +13 -0
package/package.json +13 -2
package/scripts/clear-cache.js +6 -17
package/scripts/download-model.js +14 -9
package/scripts/postinstall.js +5 -5
package/search-configs.js +36 -0
package/test/ann-config.test.js +179 -0
package/test/ann-fallback.test.js +6 -6
package/test/binary-store.test.js +69 -0
package/test/cache-branches.test.js +120 -0
package/test/cache-errors.test.js +264 -0
package/test/cache-extra.test.js +300 -0
package/test/cache-helpers.test.js +205 -0
package/test/cache-hnsw-failure.test.js +40 -0
package/test/cache-json-worker.test.js +190 -0
package/test/cache-worker.test.js +102 -0
package/test/cache.test.js +443 -0
package/test/call-graph.test.js +103 -4
package/test/clear-cache.test.js +69 -68
package/test/code-review-workflow.test.js +50 -0
package/test/config.test.js +418 -0
package/test/coverage-gap.test.js +497 -0
package/test/coverage-maximizer.test.js +236 -0
package/test/debug-analysis.js +107 -0
package/test/embedding-model.test.js +173 -103
package/test/embedding-worker-extra.test.js +272 -0
package/test/embedding-worker.test.js +158 -0
package/test/features.test.js +139 -0
package/test/final-boost.test.js +271 -0
package/test/final-polish.test.js +183 -0
package/test/final.test.js +95 -0
package/test/find-similar-code.test.js +191 -0
package/test/helpers.js +92 -11
package/test/helpers.test.js +46 -0
package/test/hybrid-search-basic.test.js +62 -0
package/test/hybrid-search-branch.test.js +202 -0
package/test/hybrid-search-callgraph.test.js +229 -0
package/test/hybrid-search-extra.test.js +81 -0
package/test/hybrid-search.test.js +484 -71
package/test/index-cli.test.js +520 -0
package/test/index-codebase-batch.test.js +119 -0
package/test/index-codebase-branches.test.js +585 -0
package/test/index-codebase-core.test.js +1032 -0
package/test/index-codebase-edge-cases.test.js +254 -0
package/test/index-codebase-errors.test.js +132 -0
package/test/index-codebase-gap.test.js +239 -0
package/test/index-codebase-lines.test.js +151 -0
package/test/index-codebase-watcher.test.js +259 -0
package/test/index-codebase-zone.test.js +259 -0
package/test/index-codebase.test.js +371 -69
package/test/index-memory.test.js +220 -0
package/test/indexer-detailed.test.js +176 -0
package/test/integration.test.js +148 -92
package/test/json-worker.test.js +50 -0
package/test/lifecycle.test.js +541 -0
package/test/master.test.js +198 -0
package/test/perfection.test.js +349 -0
package/test/project-detector.test.js +65 -0
package/test/register.test.js +262 -0
package/test/tokenizer.test.js +55 -93
package/test/ultra-maximizer.test.js +116 -0
package/test/utils-branches.test.js +161 -0
package/test/utils-extra.test.js +116 -0
package/test/utils.test.js +131 -0
package/test/verify_fixes.js +76 -0
package/test/worker-errors.test.js +96 -0
package/test/worker-init.test.js +102 -0
package/test/worker_throttling.test.js +93 -0
package/tools/scripts/benchmark-search.js +95 -0
package/tools/scripts/cache-stats.js +71 -0
package/tools/scripts/manual-search.js +34 -0
package/vitest.config.js +19 -9

package/features/hybrid-search.js CHANGED Viewed

@@ -1,7 +1,7 @@
-import path from "path";
-import fs from "fs/promises";
-import { dotSimilarity } from "../lib/utils.js";
-import { extractSymbolsFromContent } from "../lib/call-graph.js";
+import path from 'path';
+import fs from 'fs/promises';
+import { dotSimilarity } from '../lib/utils.js';
+import { extractSymbolsFromContent } from '../lib/call-graph.js';
 export class HybridSearch {
   constructor(embedder, cache, config) {
@@ -11,6 +11,14 @@ export class HybridSearch {
     this.fileModTimes = new Map(); // Cache for file modification times
   }
+  async getChunkContent(chunkOrIndex) {
+    return await this.cache.getChunkContent(chunkOrIndex);
+  }
+  getChunkVector(chunk) {
+    return this.cache.getChunkVector(chunk);
+  }
   getAnnCandidateCount(maxResults, totalChunks) {
     const minCandidates = this.config.annMinCandidates ?? 0;
     const maxCandidates = this.config.annMaxCandidates ?? totalChunks;
@@ -26,7 +34,13 @@ export class HybridSearch {
     for (const file of uniqueFiles) {
       if (!this.fileModTimes.has(file)) {
-        missing.push(file);
+        // Try to get from cache metadata first (fast)
+        const meta = this.cache.getFileMeta(file);
+        if (meta && typeof meta.mtimeMs === 'number') {
+          this.fileModTimes.set(file, meta.mtimeMs);
+        } else {
+          missing.push(file);
+        }
       }
     }
@@ -34,17 +48,31 @@ export class HybridSearch {
       return;
     }
-    const BATCH_SIZE = 200;
-    for (let i = 0; i < missing.length; i += BATCH_SIZE) {
-      const batch = missing.slice(i, i + BATCH_SIZE);
-      await Promise.all(batch.map(async file => {
+    // Concurrency-limited execution to avoid EMFILE
+    const CONCURRENCY = 50;
+    let index = 0;
+    const worker = async () => {
+      while (index < missing.length) {
+        const file = missing[index++];
+        if (!file) break; // Safety check
         try {
           const stats = await fs.stat(file);
           this.fileModTimes.set(file, stats.mtimeMs);
         } catch {
           this.fileModTimes.set(file, null);
         }
-      }));
+      }
+    };
+    await Promise.all(Array.from({ length: Math.min(CONCURRENCY, missing.length) }, worker));
+    // Prevent unbounded growth (simple eviction)
+    if (this.fileModTimes.size > 5000) {
+      for (const [key] of this.fileModTimes) {
+        this.fileModTimes.delete(key);
+        if (this.fileModTimes.size <= 4000) break;
+      }
     }
   }
@@ -54,82 +82,190 @@ export class HybridSearch {
   }
   async search(query, maxResults) {
-    const vectorStore = this.cache.getVectorStore();
-    if (vectorStore.length === 0) {
-      return {
-        results: [],
-        message: "No code has been indexed yet. Please wait for initial indexing to complete."
-      };
-    }
+    try {
+      if (typeof this.cache.ensureLoaded === 'function') {
+        await this.cache.ensureLoaded();
+      }
+      this.cache.startRead();
+      const storeSize = this.cache.getStoreSize();
+      if (storeSize === 0) {
+        return {
+          results: [],
+          message: 'No code has been indexed yet. Please wait for initial indexing to complete.',
+        };
+      }
-    // Generate query embedding
-    const queryEmbed = await this.embedder(query, { pooling: "mean", normalize: true });
-    const queryVector = Array.from(queryEmbed.data);
-    const queryVectorTyped = queryEmbed.data;
+      // Generate query embedding
+      console.info(`[Search] Query: "${query}"`);
+      const queryEmbed = await this.embedder(query, {
+        pooling: 'mean',
+        normalize: true,
+      });
+      const queryVector = queryEmbed.data; // Keep as Float32Array for performance
+      const queryVectorTyped = queryVector;
-    let candidates = vectorStore;
+      let candidateIndices = null; // null implies full scan of all chunks
     let usedAnn = false;
     if (this.config.annEnabled) {
-      const candidateCount = this.getAnnCandidateCount(maxResults, vectorStore.length);
+      const candidateCount = this.getAnnCandidateCount(maxResults, storeSize);
       const annLabels = await this.cache.queryAnn(queryVectorTyped, candidateCount);
       if (annLabels && annLabels.length >= maxResults) {
         usedAnn = true;
-        const seen = new Set();
-        candidates = annLabels
-          .map((index) => {
-            if (seen.has(index)) return null;
-            seen.add(index);
-            return vectorStore[index];
-          })
-          .filter(Boolean);
+        console.info(`[Search] Using ANN index (${annLabels.length} candidates)`);
+        candidateIndices = Array.from(new Set(annLabels)); // dedupe
       }
     }
-    if (usedAnn && candidates.length < maxResults) {
-      candidates = vectorStore;
-      usedAnn = false;
+    if (!usedAnn) {
+      console.info(`[Search] Using full scan (${storeSize} chunks)`);
     }
-    if (this.config.recencyBoost > 0) {
-      await this.populateFileModTimes(candidates.map(chunk => chunk.file));
+    if (usedAnn && candidateIndices && candidateIndices.length < maxResults) {
+      console.info(`[Search] ANN returned fewer results (${candidateIndices.length}) than requested (${maxResults}), augmenting with full scan...`);
+      candidateIndices = null; // Fallback to full scan to ensure we don't miss anything relevant
+      usedAnn = false;
     }
-    // Score all chunks (synchronous map now, much faster)
-    const scoredChunks = candidates.map(chunk => {
-      // Semantic similarity (vectors are normalized)
-      let score = dotSimilarity(queryVector, chunk.vector) * this.config.semanticWeight;
+    const lowerQuery = query.toLowerCase();
+    const queryWords =
+      lowerQuery.length > 1 ? lowerQuery.split(/\s+/).filter((word) => word.length > 2) : [];
+    const queryWordCount = queryWords.length;
+    if (usedAnn && candidateIndices && lowerQuery.length > 1) {
+      let exactMatchCount = 0;
+      for (const index of candidateIndices) {
+        const content = await this.getChunkContent(index);
+        if (content && content.toLowerCase().includes(lowerQuery)) {
+          exactMatchCount++;
+        }
+      }
-      // Exact match boost
-      const lowerQuery = query.toLowerCase();
-      const lowerContent = chunk.content.toLowerCase();
+      if (exactMatchCount < maxResults) {
+        // Fallback to full scan if keyword constraint isn't met in candidates
+        // Note: This is expensive as it iterates everything.
+        // Optimization: Only do this for small-ish codebases to avoid UI freeze
+        const MAX_FULL_SCAN_SIZE = 2000;
+        if (storeSize <= MAX_FULL_SCAN_SIZE) {
+          const seen = new Set(candidateIndices);
+          // Full scan logic for keyword augmentation
+          // Iterate by index with yielding
+          const FALLBACK_BATCH = 100;
+          for (let i = 0; i < storeSize; i += FALLBACK_BATCH) {
+             if (i > 0) await new Promise(r => setTimeout(r, 0)); // Yield
+             const limit = Math.min(storeSize, i + FALLBACK_BATCH);
+             for (let j = i; j < limit; j++) {
+                if (seen.has(j)) continue;
+                // Lazy load content only if needed (this might be slow for huge repo)
+                // But `getChunkContent` should use cache.
+                const content = await this.getChunkContent(j);
+                if (content && content.toLowerCase().includes(lowerQuery)) {
+                    seen.add(j);
+                    candidateIndices.push(j);
+                }
+             }
+          }
+        } else {
+          console.info(`[Search] Skipping full scan fallback (store size ${storeSize} > ${MAX_FULL_SCAN_SIZE})`);
+        }
+      }
+    }
-      if (lowerContent.includes(lowerQuery)) {
-        score += this.config.exactMatchBoost;
+    // Recency pre-processing
+    let recencyBoostEnabled = this.config.recencyBoost > 0;
+    let now = Date.now();
+    let recencyDecayMs = (this.config.recencyDecayDays || 30) * 24 * 60 * 60 * 1000;
+    let semanticWeight = this.config.semanticWeight;
+    let exactMatchBoost = this.config.exactMatchBoost;
+    let recencyBoost = this.config.recencyBoost;
+    if (recencyBoostEnabled) {
+      const candidates = candidateIndices
+        ? candidateIndices.map((idx) => this.cache.getChunk(idx)).filter(Boolean)
+        : Array.from({ length: storeSize }, (_, i) => this.cache.getChunk(i)).filter(Boolean);
+      // optimization: avoid IO storm during full scan fallbacks
+      // For large candidate sets, we strictly rely on cached metadata
+      // For small sets, we allow best-effort fs.stat
+      if (candidates.length <= 1000) {
+        await this.populateFileModTimes(candidates.map((chunk) => chunk.file));
       } else {
-        // Partial word matching
-        const queryWords = lowerQuery.split(/\s+/);
-        const matchedWords = queryWords.filter(word =>
-          word.length > 2 && lowerContent.includes(word)
-        ).length;
-        score += (matchedWords / queryWords.length) * 0.3;
+        // Bulk pre-populate from cache only (no syscalls)
+        for (const chunk of candidates) {
+          if (!this.fileModTimes.has(chunk.file)) {
+            const meta = this.cache.getFileMeta(chunk.file);
+            if (meta && typeof meta.mtimeMs === 'number') {
+              this.fileModTimes.set(chunk.file, meta.mtimeMs);
+            }
+          }
+        }
       }
+    }
-      // Recency boost - recently modified files rank higher
-      if (this.config.recencyBoost > 0) {
-        const mtime = this.fileModTimes.get(chunk.file);
-        if (typeof mtime === "number") {
-          const daysSinceModified = (Date.now() - mtime) / (1000 * 60 * 60 * 24);
-          const decayDays = this.config.recencyDecayDays || 30;
+    // Score all chunks (batched to prevent blocking event loop)
+    const BATCH_SIZE = 500;
+    const scoredChunks = [];
-          // Linear decay: full boost at 0 days, no boost after decayDays
-          const recencyScore = Math.max(0, 1 - (daysSinceModified / decayDays));
-          score += recencyScore * this.config.recencyBoost;
-        }
+    // Process in batches
+    // Candidates is now implicitly range 0..storeSize OR candidateIndices
+    const totalCandidates = candidateIndices ? candidateIndices.length : storeSize;
+    for (let i = 0; i < totalCandidates; i += BATCH_SIZE) {
+      // Allow event loop to tick between batches
+      if (i > 0) {
+        await new Promise((resolve) => setTimeout(resolve, 0));
       }
-      return { ...chunk, score };
-    });
+      const limit = Math.min(totalCandidates, i + BATCH_SIZE);
+      for (let j = i; j < limit; j++) {
+        const idx = candidateIndices ? candidateIndices[j] : j;
+        // Lazy load keys
+        const vector = this.cache.getVector(idx);
+        if (!vector) continue;
+        // Ensure vector compatibility (dotSimilarity now checks length too)
+        let score = dotSimilarity(queryVector, vector) * semanticWeight;
+        // Exact match boost
+        const content = await this.getChunkContent(idx);
+        const lowerContent = content ? content.toLowerCase() : '';
+        if (lowerContent && lowerContent.includes(lowerQuery)) {
+          score += exactMatchBoost;
+        } else if (lowerContent && queryWordCount > 0) {
+          // Partial word matching (optimized)
+          let matchedWords = 0;
+          for (let k = 0; k < queryWordCount; k++) {
+            if (lowerContent.includes(queryWords[k])) matchedWords++;
+          }
+          score += (matchedWords / queryWordCount) * 0.3;
+        }
+        // Needs chunk info for result
+        const chunkInfo = this.cache.getChunk(idx);
+        // Recency boost
+        if (recencyBoostEnabled && chunkInfo) {
+              const mtime = this.fileModTimes.get(chunkInfo.file);
+              if (typeof mtime === 'number') {
+                const ageMs = now - mtime;
+                const recencyFactor = Math.max(0, 1 - ageMs / recencyDecayMs);
+                score += recencyFactor * recencyBoost;
+              }
+        }
+        if (chunkInfo) {
+            scoredChunks.push({ ...chunkInfo, score, content });
+        }
+      }
+    }
     // Sort by initial score
     scoredChunks.sort((a, b) => b.score - a.score);
@@ -140,7 +276,8 @@ export class HybridSearch {
       const topN = Math.min(5, scoredChunks.length);
       const symbolsFromTop = new Set();
       for (let i = 0; i < topN; i++) {
-        const symbols = extractSymbolsFromContent(scoredChunks[i].content);
+        const content = await this.getChunkContent(scoredChunks[i]);
+        const symbols = extractSymbolsFromContent(content || '');
         for (const sym of symbols) {
           symbolsFromTop.add(sym);
         }
@@ -157,62 +294,85 @@ export class HybridSearch {
             chunk.score += proximity * this.config.callGraphBoost;
           }
         }
         // Re-sort after applying call graph boost
         scoredChunks.sort((a, b) => b.score - a.score);
       }
     }
     // Get top results
-    const results = scoredChunks.slice(0, maxResults);
+    const results = await Promise.all(scoredChunks.slice(0, maxResults).map(async (chunk) => {
+      if (chunk.content === undefined || chunk.content === null) {
+        return { ...chunk, content: await this.getChunkContent(chunk) };
+      }
+      return chunk;
+    }));
+    if (results.length > 0) {
+      console.info(`[Search] Found ${results.length} results. Top score: ${results[0].score.toFixed(4)}`);
+    } else {
+      console.info('[Search] No results found.');
+    }
     return { results, message: null };
+    } finally {
+      this.cache.endRead();
+    }
   }
-  formatResults(results) {
+  async formatResults(results) {
     if (results.length === 0) {
-      return "No matching code found for your query.";
+      return 'No matching code found for your query.';
     }
-    return results.map((r, idx) => {
+    const formatted = await Promise.all(results.map(async (r, idx) => {
       const relPath = path.relative(this.config.searchDirectory, r.file);
-      return `## Result ${idx + 1} (Relevance: ${(r.score * 100).toFixed(1)}%)\n` +
-             `**File:** \`${relPath}\`\n` +
-             `**Lines:** ${r.startLine}-${r.endLine}\n\n` +
-             "```" + path.extname(r.file).slice(1) + "\n" +
-             r.content + "\n" +
-             "```\n";
-    }).join("\n");
+      const content = r.content ?? await this.getChunkContent(r);
+      return (
+        `## Result ${idx + 1} (Relevance: ${(r.score * 100).toFixed(1)}%)\n` +
+        `**File:** \`${relPath}\`\n` +
+        `**Lines:** ${r.startLine}-${r.endLine}\n\n` +
+        '```' +
+        path.extname(r.file).slice(1) +
+        '\n' +
+        content +
+        '\n' +
+        '```\n'
+      );
+    }));
+    return formatted.join('\n');
   }
 }
 // MCP Tool definition for this feature
 export function getToolDefinition(config) {
   return {
-    name: "a_semantic_search",
-    description: "Performs intelligent hybrid code search combining semantic understanding with exact text matching. Ideal for finding code by meaning (e.g., 'authentication logic', 'database queries') even with typos or variations. Returns the most relevant code snippets with file locations and line numbers.",
+    name: 'a_semantic_search',
+    description:
+      "Performs intelligent hybrid code search combining semantic understanding with exact text matching. Ideal for finding code by meaning (e.g., 'authentication logic', 'database queries') even with typos or variations. Returns the most relevant code snippets with file locations and line numbers.",
     inputSchema: {
-      type: "object",
+      type: 'object',
       properties: {
         query: {
-          type: "string",
-          description: "Search query - can be natural language (e.g., 'where do we handle user login') or specific terms"
+          type: 'string',
+          description:
+            "Search query - can be natural language (e.g., 'where do we handle user login') or specific terms",
         },
         maxResults: {
-          type: "number",
-          description: "Maximum number of results to return (default: from config)",
-          default: config.maxResults
-        }
+          type: 'number',
+          description: 'Maximum number of results to return (default: from config)',
+          default: config.maxResults,
+        },
       },
-      required: ["query"]
+      required: ['query'],
     },
     annotations: {
-      title: "Semantic Code Search",
+      title: 'Semantic Code Search',
       readOnlyHint: true,
       destructiveHint: false,
       idempotentHint: true,
-      openWorldHint: false
-    }
+      openWorldHint: false,
+    },
   };
 }
@@ -225,13 +385,13 @@ export async function handleToolCall(request, hybridSearch) {
   if (message) {
     return {
-      content: [{ type: "text", text: message }]
+      content: [{ type: 'text', text: message }],
     };
   }
-  const formattedText = hybridSearch.formatResults(results);
+  const formattedText = await hybridSearch.formatResults(results);
   return {
-    content: [{ type: "text", text: formattedText }]
+    content: [{ type: 'text', text: formattedText }],
   };
 }