@zuvia-software-solutions/code-mapper 2.3.12 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,262 @@
1
+ // code-mapper/src/core/embeddings/nl-embedder.ts
2
+ /**
3
+ * @file Natural language embedder using bge-small-en-v1.5.
4
+ *
5
+ * Runs entirely in Node.js via @huggingface/transformers — no Python, no GPU.
6
+ * Embeds human-readable descriptions extracted from code (JSDoc comments,
7
+ * enum values, type patterns, file headers) for conceptual search.
8
+ *
9
+ * 33M params, q8 quantized, 384-dim embeddings, ~6ms/text on CPU.
10
+ */
11
// NL embedder — no schema imports needed
// bge-small-en-v1.5 via transformers.js: 384-dim sentence embeddings, CPU-friendly.
const MODEL_ID = 'Xenova/bge-small-en-v1.5';
// Lazy-loaded feature-extraction pipeline; null until initNlEmbedder resolves.
let extractor = null;
// In-flight initialization promise — dedupes concurrent initNlEmbedder calls.
let loadPromise = null;
16
/**
 * Initialize the NL embedding model (lazy, idempotent).
 *
 * Safe to call concurrently: the first caller starts the model load and all
 * later callers share the same in-flight promise. Fix over the previous
 * version: on load failure the in-flight promise is cleared so a subsequent
 * call can retry, instead of every future call receiving the same
 * permanently-rejected promise (e.g. after a transient network error during
 * the model download).
 */
export async function initNlEmbedder() {
    if (extractor)
        return;
    if (loadPromise)
        return loadPromise;
    loadPromise = (async () => {
        try {
            const { pipeline } = await import('@huggingface/transformers');
            // NOTE(review): `quantized: true` is the transformers.js v2 option;
            // v3 renamed it to `dtype: 'q8'` — confirm against the installed version.
            extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
        }
        catch (err) {
            // Allow retry on the next call rather than caching the failure.
            loadPromise = null;
            throw err;
        }
    })();
    return loadPromise;
}
28
/** True once the embedding pipeline has finished loading. */
export function isNlEmbedderReady() {
    return Boolean(extractor);
}
32
/**
 * Embed a single text for NL search.
 *
 * Lazily initializes the model on first use. Uses CLS pooling with
 * `normalize: true`, so the vectors are unit-length and dot product equals
 * cosine similarity.
 *
 * @returns a plain number[] — not a Float32Array; `Array.from` copies the
 *          typed array's data into a regular array.
 */
export async function nlEmbed(text) {
    if (!extractor)
        await initNlEmbedder();
    const result = await extractor(text, { pooling: 'cls', normalize: true });
    return Array.from(result.data);
}
39
/**
 * Embed several texts, one pipeline call per text (sequential).
 * Returns one number[] vector per input, in input order.
 */
export async function nlEmbedBatch(texts) {
    if (!extractor) {
        await initNlEmbedder();
    }
    const vectors = [];
    for (const item of texts) {
        const output = await extractor(item, { pooling: 'cls', normalize: true });
        vectors.push(Array.from(output.data));
    }
    return vectors;
}
50
/**
 * Collect the first run of comment text in a snippet (up to 10 lines).
 *
 * Understands three styles: JSDoc/block comments, `//` line comments, and
 * `#` comments (shebangs excluded). `@tag` lines inside block comments are
 * dropped, as is any text sharing a line with the closing `*​/`. Scanning
 * stops at the first non-comment line once something has been collected.
 *
 * @returns the collected lines joined with spaces, or '' for empty input.
 */
function extractFullComment(content) {
    if (!content)
        return '';
    const collected = [];
    let insideBlock = false;
    for (const rawLine of content.split('\n')) {
        const line = rawLine.trim();
        if (line.startsWith('/*')) {
            // Opening line of a block comment ('/**' or '/*'); a same-line
            // close (e.g. '/** one-liner */') ends the block immediately.
            insideBlock = !line.includes('*/');
            const text = line.replace(/^\/\*\*?/, '').replace(/\*\/$/, '').trim();
            if (text && !text.startsWith('@'))
                collected.push(text);
            continue;
        }
        if (insideBlock) {
            if (line.includes('*/')) {
                insideBlock = false;
                continue;
            }
            const text = line.replace(/^\*\s?/, '').trim();
            if (text && !text.startsWith('@'))
                collected.push(text);
            if (collected.length >= 10)
                break;
            continue;
        }
        let lineText = null;
        if (line.startsWith('//')) {
            lineText = line.slice(2).trim();
        }
        else if (line.startsWith('#') && !line.startsWith('#!')) {
            lineText = line.slice(1).trim();
        }
        if (lineText !== null) {
            if (lineText)
                collected.push(lineText);
            if (collected.length >= 10)
                break;
            continue;
        }
        // Non-comment line: stop if a comment has already been gathered.
        if (collected.length > 0)
            break;
    }
    return collected.join(' ');
}
101
/**
 * Turn an identifier into lowercase space-separated words.
 * Handles camelCase, PascalCase, ALLCAPS runs (e.g. HTTPServer), snake_case
 * and kebab-case.
 */
function expandIdentifier(name) {
    const spaced = name
        .replace(/([a-z])([A-Z])/g, '$1 $2')
        .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
    return spaced.replace(/[_\-]/g, ' ').toLowerCase();
}
109
/**
 * Derive NL text from enum-like constructs in a snippet.
 * Recognizes `['a', 'b'] as const` tuples (values joined verbatim) and
 * `enum X { A = 1, B }` declarations (member names expanded to words).
 *
 * @returns comma-joined values, or '' when neither pattern matches.
 */
function extractEnumValues(content) {
    // Pattern 1: string-literal tuple frozen with `as const`
    const tuple = content.match(/\[([^\]]+)\]\s*as\s*const/);
    if (tuple?.[1]) {
        const items = tuple[1]
            .replace(/['"]/g, '')
            .split(',')
            .map((item) => item.trim())
            .filter(Boolean);
        if (items.length > 0)
            return items.join(', ');
    }
    // Pattern 2: enum body — keep member names, drop '=' initializers
    const enumBody = content.match(/enum\s+\w+\s*\{([^}]+)\}/);
    if (enumBody?.[1]) {
        const members = enumBody[1]
            .split(',')
            .map((member) => member.trim().split('=')[0].trim())
            .filter(Boolean);
        if (members.length > 0)
            return members.map((member) => expandIdentifier(member)).join(', ');
    }
    return '';
}
127
/**
 * Pull parameter names from the first parenthesized group in a snippet and
 * expand them to words. Type annotations (after ':') and default values
 * (after '=') are stripped.
 *
 * @returns comma-joined expanded names, or '' when there are no parameters.
 */
function extractParamNames(content) {
    const parens = content.match(/\(([^)]*)\)/);
    const raw = parens?.[1];
    if (!raw)
        return '';
    const names = raw
        .split(',')
        .map((piece) => piece.trim().split(':')[0].split('=')[0].trim())
        .filter((piece) => piece && piece !== '');
    return names.map((piece) => expandIdentifier(piece)).join(', ');
}
138
/**
 * Build NL documents for a graph node: a comment-based doc when the node has
 * a leading comment, a name/params doc otherwise, and an enum-values doc for
 * Enum/Const/TypeAlias nodes.
 *
 * Robustness fix: `node.content` and `node.name` are defaulted to '' — the
 * content column is nullable in the query feeding this function, and the
 * previous version crashed inside extractParamNames (unguarded `.match`)
 * when content was null.
 *
 * @param node row with id, name, label, filePath, content
 * @returns array of { nodeId, source, text } documents
 */
export function extractNlTexts(node) {
    const docs = [];
    const name = node.name ?? '';
    const content = node.content ?? '';
    const expandedName = expandIdentifier(name);
    // Last two directory segments, filename excluded
    const dir = node.filePath.split('/').slice(-3, -1).join('/');
    // 1. Comment-based NL text (primary)
    const comment = extractFullComment(content);
    if (comment) {
        docs.push({
            nodeId: node.id,
            source: 'comment',
            text: `${expandedName}: ${comment}. File: ${dir}`,
        });
    }
    // 2. Name + params + location (fallback when no comment, to avoid duplication)
    const params = extractParamNames(content);
    const parts = [expandedName];
    if (params)
        parts.push(`Parameters: ${params}`);
    if (dir)
        parts.push(`in ${dir}`);
    if (!comment) {
        docs.push({
            nodeId: node.id,
            source: 'name',
            text: parts.join('. '),
        });
    }
    // 3. Enum/const values
    if (node.label === 'Enum' || node.label === 'Const' || node.label === 'TypeAlias') {
        const values = extractEnumValues(content);
        if (values) {
            docs.push({
                nodeId: node.id,
                source: 'enum',
                text: `${expandedName}: ${values}`,
            });
        }
    }
    return docs;
}
181
// ---------------------------------------------------------------------------
// Full NL embedding pipeline
// ---------------------------------------------------------------------------
import { createHash } from 'crypto';
/**
 * MD5 hex digest of a text. Used only as a cheap change-detection fingerprint
 * (skip re-embedding unchanged docs) — not security-sensitive.
 */
function md5(text) {
    const hasher = createHash('md5');
    hasher.update(text);
    return hasher.digest('hex');
}
189
/**
 * Build NL embeddings for all eligible nodes in the database.
 * Reads nodes, extracts NL text, embeds with bge-small, writes to nl_embeddings.
 *
 * Incremental: docs whose (nodeId, textHash) pair already exists are skipped
 * and their stored rows are KEPT.
 *
 * BUG FIX: the previous version computed the skip set, then ran a wholesale
 * `DELETE FROM nl_embeddings` — wiping every row, including the ones it had
 * just decided to skip — and re-inserted only the changed docs, so "skipped"
 * docs silently lost their embeddings. Deletes are now targeted per
 * (nodeId, source) for the docs actually being re-embedded.
 *
 * NOTE(review): rows for nodes that no longer exist in `nodes` are not
 * garbage-collected here — confirm whether a separate cleanup pass handles
 * them.
 *
 * @param db         SQLite handle with a prepare/exec API (better-sqlite3 style)
 * @param onProgress optional (done, total) callback, invoked once per batch
 * @returns { embedded, skipped, durationMs }
 */
export async function buildNlEmbeddings(db, onProgress) {
    const t0 = Date.now();
    await initNlEmbedder();
    // Query all nodes (not just EMBEDDABLE_LABELS — we want enums, consts, types too)
    const labels = ['Function', 'Class', 'Method', 'Interface', 'Const', 'Enum', 'TypeAlias', 'Namespace', 'Module', 'Struct'];
    const placeholders = labels.map(() => '?').join(',');
    const rows = db.prepare(`SELECT id, name, label, filePath, content, startLine, description FROM nodes WHERE label IN (${placeholders})`).all(...labels);
    // Skip test files
    const testPatterns = ['/test/', '/tests/', '/spec/', '/fixtures/', '/__tests__/', '/__mocks__/', '.test.', '.spec.', '_test.', '_spec.'];
    const filteredRows = rows.filter(r => !testPatterns.some(p => r.filePath.includes(p)));
    // Extract NL documents
    const allDocs = [];
    for (const row of filteredRows) {
        allDocs.push(...extractNlTexts(row));
    }
    if (allDocs.length === 0) {
        return { embedded: 0, skipped: 0, durationMs: Date.now() - t0 };
    }
    // Existing (nodeId, textHash) pairs — skip docs whose text is unchanged
    const existingHashes = new Set();
    try {
        const hashRows = db.prepare('SELECT nodeId, textHash FROM nl_embeddings WHERE textHash IS NOT NULL').all();
        for (const r of hashRows)
            existingHashes.add(r.nodeId + ':' + r.textHash);
    }
    catch { /* table might not exist yet */ }
    // Filter to docs that need embedding
    const toEmbed = [];
    let skipped = 0;
    for (const doc of allDocs) {
        const hash = md5(doc.text);
        if (existingHashes.has(doc.nodeId + ':' + hash)) {
            skipped++;
            continue;
        }
        toEmbed.push({ ...doc, hash });
    }
    if (toEmbed.length === 0) {
        return { embedded: 0, skipped, durationMs: Date.now() - t0 };
    }
    // Embed in batches and write to DB inside a single transaction.
    // Only the rows being replaced are deleted; skipped docs keep theirs.
    const BATCH = 100;
    const deleteStmt = db.prepare('DELETE FROM nl_embeddings WHERE nodeId = ? AND source = ?');
    const insertStmt = db.prepare('INSERT INTO nl_embeddings (nodeId, embedding, textHash, source, text) VALUES (?, ?, ?, ?, ?)');
    let embedded = 0;
    db.exec('BEGIN');
    try {
        for (let i = 0; i < toEmbed.length; i += BATCH) {
            const batch = toEmbed.slice(i, i + BATCH);
            const vecs = await nlEmbedBatch(batch.map(d => d.text));
            for (let j = 0; j < batch.length; j++) {
                const doc = batch[j];
                // Replace exactly this doc's row; other sources for the same
                // node (comment vs enum) are left alone.
                deleteStmt.run(doc.nodeId, doc.source);
                const blob = Buffer.from(new Float32Array(vecs[j]).buffer);
                insertStmt.run(doc.nodeId, blob, doc.hash, doc.source, doc.text);
                embedded++;
            }
            onProgress?.(Math.min(i + BATCH, toEmbed.length), toEmbed.length);
        }
        db.exec('COMMIT');
    }
    catch (err) {
        db.exec('ROLLBACK');
        throw err;
    }
    return { embedded, skipped, durationMs: Date.now() - t0 };
}
@@ -16,6 +16,13 @@ const getFileName = (filePath) => {
16
16
  const parts = filePath.split('/');
17
17
  return parts[parts.length - 1] || filePath;
18
18
  };
19
/** Directory context for a path: the last two directory segments, filename excluded. */
const getDirectoryContext = (filePath) => {
    const segments = filePath.split('/');
    segments.pop(); // drop the filename itself
    return segments.slice(-2).join('/');
};
19
26
  /**
20
27
  * Extract the first JSDoc/comment block as a natural language description.
21
28
  * This bridges natural language queries to code — "blast radius analysis"
@@ -154,8 +161,9 @@ export const generateEmbeddingText = (node, _config = {}) => {
154
161
  const comment = extractFirstComment(node.content);
155
162
  if (comment)
156
163
  parts.push(comment);
157
- // 3. File location
158
- parts.push(`File: ${getFileName(node.filePath)}`);
164
+ // 3. File location with directory context
165
+ const dir = getDirectoryContext(node.filePath);
166
+ parts.push(`File: ${getFileName(node.filePath)}${dir ? ` in ${dir}` : ''}`);
159
167
  // 4. Code signature (not full body)
160
168
  const sig = extractSignature(node.content, label);
161
169
  if (sig)
@@ -1,5 +1,5 @@
1
1
  /** @file types.ts @description Type definitions for embedding generation and semantic search */
2
- export declare const EMBEDDABLE_LABELS: readonly ["Function", "Class", "Method", "Interface"];
2
+ export declare const EMBEDDABLE_LABELS: readonly ["Function", "Class", "Method", "Interface", "Const", "Enum", "TypeAlias", "Namespace", "Module", "Struct"];
3
3
  export type EmbeddableLabel = typeof EMBEDDABLE_LABELS[number];
4
4
  /** Check if a label is embeddable */
5
5
  export declare const isEmbeddableLabel: (label: string) => label is EmbeddableLabel;
@@ -4,10 +4,8 @@
4
4
  // File nodes removed — their embeddings were low quality (import headers, license text)
5
5
  // and polluted semantic search. BM25 FTS already searches file content effectively.
6
6
  export const EMBEDDABLE_LABELS = [
7
- 'Function',
8
- 'Class',
9
- 'Method',
10
- 'Interface',
7
+ 'Function', 'Class', 'Method', 'Interface',
8
+ 'Const', 'Enum', 'TypeAlias', 'Namespace', 'Module', 'Struct',
11
9
  ];
12
10
  /** Check if a label is embeddable */
13
11
  export const isEmbeddableLabel = (label) => EMBEDDABLE_LABELS.includes(label);
@@ -21,7 +21,7 @@ import { getLanguageFromFilename, getDefinitionNodeFromCaptures } from '../inges
21
21
  import { loadParser, loadLanguage, isLanguageAvailable } from '../tree-sitter/parser-loader.js';
22
22
  import { getTreeSitterBufferSize, TREE_SITTER_MAX_BUFFER } from '../ingestion/constants.js';
23
23
  import { generateId } from '../../lib/utils.js';
24
- import { deleteNodesByFile, insertNode, insertEdge, findNodeAtLine, findNodesByFile, deleteEmbeddingsByFile, insertEmbeddingsBatch, countEmbeddings } from '../db/adapter.js';
24
+ import { deleteNodesByFile, insertNode, insertEdge, findNodeAtLine, findNodesByFile, deleteEmbeddingsByFile, insertEmbeddingsBatch, countEmbeddings, deleteRefsByFile, insertRefsBatch, deleteFileWordsByFile, upsertFileWords } from '../db/adapter.js';
25
25
  import { assertNodeLabel, toNodeId, toEdgeId } from '../db/schema.js';
26
26
  import {} from './types.js';
27
27
  import { getTsgoService } from '../semantic/tsgo-service.js';
@@ -79,10 +79,13 @@ export async function refreshFiles(db, repoPath, dirtyFiles) {
79
79
  let nodesInserted = 0;
80
80
  let edgesInserted = 0;
81
81
  let filesSkipped = 0;
82
- // Phase 1: Delete old nodes for all dirty files
82
+ // Phase 1: Delete old nodes + refs + file_words for all dirty files
83
83
  for (const entry of dirtyFiles) {
84
- const deleted = deleteNodesByFile(db, entry.relativePath);
84
+ const relPath = entry.relativePath;
85
+ const deleted = deleteNodesByFile(db, relPath);
85
86
  nodesDeleted += deleted;
87
+ deleteRefsByFile(db, relPath);
88
+ deleteFileWordsByFile(db, relPath);
86
89
  }
87
90
  // Phase 2: Parse modified/created files with tree-sitter
88
91
  const parser = await loadParser();
@@ -90,6 +93,7 @@ export async function refreshFiles(db, repoPath, dirtyFiles) {
90
93
  const allDefinitions = [];
91
94
  const callSites = [];
92
95
  const insertedFilePaths = new Set();
96
+ const fileContents = new Map(); // for file_words rebuild
93
97
  for (const entry of filesToProcess) {
94
98
  const relPath = entry.relativePath;
95
99
  const absPath = path.resolve(repoPath, relPath);
@@ -110,6 +114,7 @@ export async function refreshFiles(db, repoPath, dirtyFiles) {
110
114
  filesSkipped++;
111
115
  continue;
112
116
  }
117
+ fileContents.set(relPath, content);
113
118
  try {
114
119
  await loadLanguage(language, relPath);
115
120
  }
@@ -247,6 +252,37 @@ export async function refreshFiles(db, repoPath, dirtyFiles) {
247
252
  });
248
253
  edgesInserted++;
249
254
  }
255
+ // Phase 3b+3c: Rebuild refs + file_words for dirty files
256
+ const STOP_WORDS = new Set(['the', 'and', 'for', 'from', 'with', 'this', 'that', 'have', 'has', 'not', 'are', 'was', 'were', 'been', 'being', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'does', 'did', 'let', 'var', 'const', 'new', 'return', 'function', 'class', 'import', 'export', 'default', 'void', 'null', 'undefined', 'true', 'false', 'else', 'case', 'break', 'continue', 'while', 'throw', 'catch', 'try', 'finally', 'async', 'await', 'yield', 'typeof', 'instanceof', 'delete', 'switch', 'interface', 'type', 'enum', 'extends', 'implements', 'static', 'private', 'public', 'protected', 'abstract', 'readonly', 'override', 'declare', 'module', 'namespace', 'require', 'string', 'number', 'boolean', 'object', 'any', 'never', 'unknown', 'symbol']);
257
+ // Phase 3b: Rebuild refs for dirty files (identifier occurrence index)
258
+ for (const [relPath, content] of fileContents) {
259
+ const refs = [];
260
+ const lines = content.split('\n');
261
+ const identRegex = /\b[a-zA-Z_]\w{2,}\b/g;
262
+ for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
263
+ let match;
264
+ while ((match = identRegex.exec(lines[lineIdx])) !== null) {
265
+ if (!STOP_WORDS.has(match[0].toLowerCase())) {
266
+ refs.push({ symbol: match[0], filePath: relPath, line: lineIdx });
267
+ }
268
+ }
269
+ }
270
+ if (refs.length > 0)
271
+ insertRefsBatch(db, refs);
272
+ }
273
+ // Phase 3c: Rebuild file_words for dirty files (conceptual search index)
274
+ for (const [relPath, content] of fileContents) {
275
+ const wordSet = new Set();
276
+ const wordRegex = /\b[a-zA-Z]\w{2,}\b/g;
277
+ let match;
278
+ while ((match = wordRegex.exec(content)) !== null) {
279
+ const w = match[0].toLowerCase();
280
+ if (!STOP_WORDS.has(w))
281
+ wordSet.add(w);
282
+ }
283
+ if (wordSet.size > 0)
284
+ upsertFileWords(db, relPath, [...wordSet].join(' '));
285
+ }
250
286
  // Phase 4 + 5: Resolve call edges and cross-file edges using tsgo LSP
251
287
  // (TS/JS files only — tsgo is optional, skip if unavailable)
252
288
  console.error(`Code Mapper: refresh tsgo init with repoPath=${repoPath}`);
@@ -40,10 +40,22 @@ export declare class LocalBackend {
40
40
  private refreshLocks;
41
41
  /** Per-repo tsgo LSP service instances for live semantic enrichment */
42
42
  private tsgoServices;
43
+ /** Per-repo in-memory embedding cache: nodeId → Float32Array (256-dim) */
44
+ private embeddingCaches;
45
+ /** Per-repo in-memory NL embedding cache: includes source text for match_reason */
46
+ private nlEmbeddingCaches;
43
47
  /** Get (or lazily start) a tsgo LSP service for a repo. Returns null if unavailable. */
44
48
  private getTsgo;
45
49
  /** Get (or lazily open) the SQLite database for a repo. */
46
50
  private getDb;
51
+ /** Load all embeddings into memory for fast vector search */
52
+ private loadEmbeddingCache;
53
+ /** Search embeddings in memory — O(N) dot products, no disk I/O */
54
+ private searchEmbeddingsInMemory;
55
+ /** Load NL embeddings into memory for fast conceptual search */
56
+ private loadNlEmbeddingCache;
57
+ /** Search NL embeddings in memory, returns match_reason text */
58
+ private searchNlEmbeddingsInMemory;
47
59
  /** Hard ceiling — beyond this, incremental is unreliable, warn prominently */
48
60
  private static readonly MAX_INCREMENTAL_FILES;
49
61
  /** Start file system watcher for a repo to detect source changes */
@@ -125,6 +137,31 @@ export declare class LocalBackend {
125
137
  * Semantic vector search helper
126
138
  */
127
139
  private semanticSearch;
140
+ /**
141
+ * NL semantic search: embed query with bge-small, search NL descriptions.
142
+ * Returns match_reason (the NL text that matched) for agent transparency.
143
+ */
144
+ private nlSemanticSearch;
145
+ /**
146
+ * Refs-based search: find symbols referenced in files that contain the query identifiers.
147
+ * Bridges the gap between graph edges (incomplete) and grep (complete for exact names).
148
+ */
149
+ private refsSearch;
150
+ /**
151
+ * File-words FTS search: find files whose content contains conceptual terms,
152
+ * then return the best symbol from each file. Bridges NL → code gap.
153
+ */
154
+ private fileWordsSearch;
155
+ /**
156
+ * Query expansion via embedding nearest neighbors: embed the query,
157
+ * find 5 closest symbols, extract their names as BM25 expansion terms.
158
+ */
159
+ private expandQueryViaNearestNeighbors;
160
+ /**
161
+ * Ripgrep fallback: when all search signals return sparse results,
162
+ * grep the repo for query terms to find any relevant files.
163
+ */
164
+ private ripgrepFallback;
128
165
  executeSql(repoName: string, query: string): Promise<any>;
129
166
  private sqlQuery;
130
167
  /** Format raw SQL result rows as a markdown table, with raw fallback */