@zuvia-software-solutions/code-mapper 1.4.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. package/dist/cli/ai-context.js +1 -1
  2. package/dist/cli/analyze.d.ts +1 -0
  3. package/dist/cli/analyze.js +73 -82
  4. package/dist/cli/augment.js +0 -2
  5. package/dist/cli/eval-server.d.ts +2 -2
  6. package/dist/cli/eval-server.js +6 -6
  7. package/dist/cli/index.js +6 -10
  8. package/dist/cli/mcp.d.ts +1 -3
  9. package/dist/cli/mcp.js +3 -3
  10. package/dist/cli/refresh.d.ts +2 -2
  11. package/dist/cli/refresh.js +24 -29
  12. package/dist/cli/status.js +4 -13
  13. package/dist/cli/tool.d.ts +5 -4
  14. package/dist/cli/tool.js +8 -10
  15. package/dist/config/ignore-service.js +14 -34
  16. package/dist/core/augmentation/engine.js +53 -83
  17. package/dist/core/db/adapter.d.ts +99 -0
  18. package/dist/core/db/adapter.js +402 -0
  19. package/dist/core/db/graph-loader.d.ts +27 -0
  20. package/dist/core/db/graph-loader.js +148 -0
  21. package/dist/core/db/queries.d.ts +160 -0
  22. package/dist/core/db/queries.js +441 -0
  23. package/dist/core/db/schema.d.ts +108 -0
  24. package/dist/core/db/schema.js +136 -0
  25. package/dist/core/embeddings/embedder.d.ts +21 -12
  26. package/dist/core/embeddings/embedder.js +104 -50
  27. package/dist/core/embeddings/embedding-pipeline.d.ts +48 -22
  28. package/dist/core/embeddings/embedding-pipeline.js +220 -262
  29. package/dist/core/embeddings/text-generator.js +4 -19
  30. package/dist/core/embeddings/types.d.ts +1 -1
  31. package/dist/core/graph/graph.d.ts +1 -1
  32. package/dist/core/graph/graph.js +1 -0
  33. package/dist/core/graph/types.d.ts +11 -9
  34. package/dist/core/graph/types.js +4 -1
  35. package/dist/core/incremental/refresh.d.ts +46 -0
  36. package/dist/core/incremental/refresh.js +503 -0
  37. package/dist/core/incremental/types.d.ts +2 -1
  38. package/dist/core/incremental/types.js +42 -44
  39. package/dist/core/ingestion/ast-cache.js +1 -0
  40. package/dist/core/ingestion/call-processor.d.ts +15 -3
  41. package/dist/core/ingestion/call-processor.js +448 -60
  42. package/dist/core/ingestion/cluster-enricher.d.ts +1 -1
  43. package/dist/core/ingestion/cluster-enricher.js +2 -0
  44. package/dist/core/ingestion/community-processor.d.ts +1 -1
  45. package/dist/core/ingestion/community-processor.js +8 -3
  46. package/dist/core/ingestion/export-detection.d.ts +1 -1
  47. package/dist/core/ingestion/export-detection.js +1 -1
  48. package/dist/core/ingestion/filesystem-walker.js +1 -1
  49. package/dist/core/ingestion/heritage-processor.d.ts +2 -2
  50. package/dist/core/ingestion/heritage-processor.js +22 -11
  51. package/dist/core/ingestion/import-processor.d.ts +2 -2
  52. package/dist/core/ingestion/import-processor.js +24 -9
  53. package/dist/core/ingestion/language-config.js +7 -4
  54. package/dist/core/ingestion/mro-processor.d.ts +1 -1
  55. package/dist/core/ingestion/mro-processor.js +23 -11
  56. package/dist/core/ingestion/named-binding-extraction.js +5 -5
  57. package/dist/core/ingestion/parsing-processor.d.ts +4 -4
  58. package/dist/core/ingestion/parsing-processor.js +26 -18
  59. package/dist/core/ingestion/pipeline.d.ts +4 -2
  60. package/dist/core/ingestion/pipeline.js +50 -20
  61. package/dist/core/ingestion/process-processor.d.ts +2 -2
  62. package/dist/core/ingestion/process-processor.js +28 -14
  63. package/dist/core/ingestion/resolution-context.d.ts +1 -1
  64. package/dist/core/ingestion/resolution-context.js +14 -4
  65. package/dist/core/ingestion/resolvers/csharp.js +4 -3
  66. package/dist/core/ingestion/resolvers/go.js +3 -1
  67. package/dist/core/ingestion/resolvers/jvm.js +13 -4
  68. package/dist/core/ingestion/resolvers/standard.js +2 -2
  69. package/dist/core/ingestion/resolvers/utils.js +6 -2
  70. package/dist/core/ingestion/route-stitcher.d.ts +15 -0
  71. package/dist/core/ingestion/route-stitcher.js +92 -0
  72. package/dist/core/ingestion/structure-processor.d.ts +1 -1
  73. package/dist/core/ingestion/structure-processor.js +3 -2
  74. package/dist/core/ingestion/symbol-table.d.ts +2 -0
  75. package/dist/core/ingestion/symbol-table.js +5 -1
  76. package/dist/core/ingestion/tree-sitter-queries.d.ts +2 -2
  77. package/dist/core/ingestion/tree-sitter-queries.js +177 -0
  78. package/dist/core/ingestion/type-env.js +20 -0
  79. package/dist/core/ingestion/type-extractors/csharp.js +4 -3
  80. package/dist/core/ingestion/type-extractors/go.js +23 -12
  81. package/dist/core/ingestion/type-extractors/php.js +18 -10
  82. package/dist/core/ingestion/type-extractors/ruby.js +15 -3
  83. package/dist/core/ingestion/type-extractors/rust.js +3 -2
  84. package/dist/core/ingestion/type-extractors/shared.js +3 -2
  85. package/dist/core/ingestion/type-extractors/typescript.js +11 -5
  86. package/dist/core/ingestion/utils.d.ts +27 -4
  87. package/dist/core/ingestion/utils.js +145 -100
  88. package/dist/core/ingestion/workers/parse-worker.d.ts +1 -0
  89. package/dist/core/ingestion/workers/parse-worker.js +97 -29
  90. package/dist/core/ingestion/workers/worker-pool.js +3 -0
  91. package/dist/core/search/bm25-index.d.ts +15 -8
  92. package/dist/core/search/bm25-index.js +48 -98
  93. package/dist/core/search/hybrid-search.d.ts +9 -3
  94. package/dist/core/search/hybrid-search.js +30 -25
  95. package/dist/core/search/reranker.js +9 -7
  96. package/dist/core/search/types.d.ts +0 -4
  97. package/dist/core/semantic/tsgo-service.d.ts +7 -1
  98. package/dist/core/semantic/tsgo-service.js +165 -66
  99. package/dist/lib/tsgo-test.d.ts +2 -0
  100. package/dist/lib/tsgo-test.js +6 -0
  101. package/dist/lib/type-utils.d.ts +25 -0
  102. package/dist/lib/type-utils.js +22 -0
  103. package/dist/lib/utils.d.ts +3 -2
  104. package/dist/lib/utils.js +3 -2
  105. package/dist/mcp/compatible-stdio-transport.js +1 -1
  106. package/dist/mcp/local/local-backend.d.ts +29 -56
  107. package/dist/mcp/local/local-backend.js +808 -1118
  108. package/dist/mcp/resources.js +35 -25
  109. package/dist/mcp/server.d.ts +1 -1
  110. package/dist/mcp/server.js +5 -5
  111. package/dist/mcp/tools.js +24 -25
  112. package/dist/storage/repo-manager.d.ts +2 -12
  113. package/dist/storage/repo-manager.js +1 -47
  114. package/dist/types/pipeline.d.ts +8 -5
  115. package/dist/types/pipeline.js +5 -0
  116. package/package.json +18 -11
  117. package/dist/cli/serve.d.ts +0 -5
  118. package/dist/cli/serve.js +0 -8
  119. package/dist/core/incremental/child-process.d.ts +0 -8
  120. package/dist/core/incremental/child-process.js +0 -649
  121. package/dist/core/incremental/refresh-coordinator.d.ts +0 -32
  122. package/dist/core/incremental/refresh-coordinator.js +0 -147
  123. package/dist/core/lbug/csv-generator.d.ts +0 -28
  124. package/dist/core/lbug/csv-generator.js +0 -355
  125. package/dist/core/lbug/lbug-adapter.d.ts +0 -96
  126. package/dist/core/lbug/lbug-adapter.js +0 -753
  127. package/dist/core/lbug/schema.d.ts +0 -46
  128. package/dist/core/lbug/schema.js +0 -402
  129. package/dist/mcp/core/embedder.d.ts +0 -24
  130. package/dist/mcp/core/embedder.js +0 -168
  131. package/dist/mcp/core/lbug-adapter.d.ts +0 -29
  132. package/dist/mcp/core/lbug-adapter.js +0 -330
  133. package/dist/server/api.d.ts +0 -5
  134. package/dist/server/api.js +0 -340
  135. package/dist/server/mcp-http.d.ts +0 -7
  136. package/dist/server/mcp-http.js +0 -95
  137. package/models/mlx-embedder.py +0 -185
@@ -2,43 +2,43 @@
2
2
  /**
3
3
  * @file embedding-pipeline.ts
4
4
  * @description Orchestrates the background embedding process:
5
- * 1) Query embeddable nodes from LadybugDB
5
+ * 1) Query embeddable nodes from SQLite
6
6
  * 2) Generate text representations
7
7
  * 3) Batch embed using transformers.js
8
- * 4) Store embeddings in LadybugDB
9
- * 5) Create vector index for semantic search
8
+ * 4) Store embeddings in SQLite
9
+ * 5) Vector search via brute-force cosine similarity in adapter.ts
10
10
  */
11
- import { initEmbedder, embedBatch, embedText, embeddingToArray, isEmbedderReady } from './embedder.js';
11
+ import { initEmbedder, embedBatch, embedQuery, embeddingToArray, isEmbedderReady } from './embedder.js';
12
12
  import { generateEmbeddingText } from './text-generator.js';
13
13
  import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, } from './types.js';
14
- const isDev = process.env.NODE_ENV === 'development';
15
- /** Query all embeddable nodes from LadybugDB (File has different schema than code elements) */
16
- const queryEmbeddableNodes = async (executeQuery) => {
14
+ import { toNodeId } from '../db/schema.js';
15
+ import { createHash } from 'crypto';
16
+ const isDev = process.env['NODE_ENV'] === 'development';
17
+ /** Fast content hash for detecting unchanged embedding text */
18
+ function textHash(text) {
19
+ return createHash('md5').update(text).digest('hex');
20
+ }
21
+ /** Query all embeddable nodes from SQLite */
22
+ const queryEmbeddableNodes = (db) => {
17
23
  const allNodes = [];
18
24
  for (const label of EMBEDDABLE_LABELS) {
19
25
  try {
20
- // All embeddable labels are code elements with startLine/endLine
21
- const query = `
22
- MATCH (n:${label})
23
- RETURN n.id AS id, n.name AS name, '${label}' AS label,
24
- n.filePath AS filePath, n.content AS content,
25
- n.startLine AS startLine, n.endLine AS endLine
26
- `;
27
- const rows = await executeQuery(query);
26
+ const rows = db.prepare(`SELECT id, name, label, filePath, content, startLine, endLine
27
+ FROM nodes WHERE label = ?`).all(label);
28
28
  for (const row of rows) {
29
29
  allNodes.push({
30
- id: row.id ?? row[0],
31
- name: row.name ?? row[1],
32
- label: row.label ?? row[2],
33
- filePath: row.filePath ?? row[3],
34
- content: row.content ?? row[4] ?? '',
35
- startLine: row.startLine ?? row[5],
36
- endLine: row.endLine ?? row[6],
30
+ id: row.id,
31
+ name: row.name,
32
+ label: row.label,
33
+ filePath: row.filePath,
34
+ content: row.content ?? '',
35
+ startLine: row.startLine ?? 0,
36
+ endLine: row.endLine ?? 0,
37
37
  });
38
38
  }
39
39
  }
40
40
  catch (error) {
41
- // Table might not exist or be empty continue
41
+ // Table might not exist or be empty -- continue
42
42
  if (isDev) {
43
43
  console.warn(`Query for ${label} nodes failed:`, error);
44
44
  }
@@ -47,55 +47,111 @@ const queryEmbeddableNodes = async (executeQuery) => {
47
47
  return allNodes;
48
48
  };
49
49
  /**
50
- * Batch INSERT embeddings into the CodeEmbedding table
50
+ * Fetch graph context (callers, callees, community module) for a set of nodes.
51
51
  *
52
- * Separate lightweight table avoids copy-on-write overhead from
53
- * UPDATEing nodes with large content fields
52
+ * This enrichment adds relationship context so that embedding text like
53
+ * "import resolution pipeline" matches `processImports` because its caller
54
+ * "runPipelineFromRepo" contains "pipeline".
55
+ *
56
+ * Reusable by both the full analyze pipeline and incremental refresh.
57
+ *
58
+ * @param db - Open SQLite database instance
59
+ * @param nodes - Nodes to fetch context for (must have `id` field)
60
+ * @returns Map from node ID to graph context
54
61
  */
55
- const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
56
- // INSERT into separate embedding table — avoids large-row COW overhead
57
- const cypher = `CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`;
58
- const paramsList = updates.map(u => ({ nodeId: u.id, embedding: u.embedding }));
59
- await executeWithReusedStatement(cypher, paramsList);
60
- };
61
- /** Create the HNSW vector index on the CodeEmbedding table */
62
- let vectorExtensionLoaded = false;
63
- const createVectorIndex = async (executeQuery) => {
64
- // LadybugDB v0.15+ requires explicit VECTOR extension load (once per session)
65
- if (!vectorExtensionLoaded) {
66
- try {
67
- await executeQuery('INSTALL VECTOR');
68
- await executeQuery('LOAD EXTENSION VECTOR');
69
- vectorExtensionLoaded = true;
62
+ export function fetchGraphContext(db, nodes) {
63
+ const graphContext = new Map();
64
+ const totalNodes = nodes.length;
65
+ if (totalNodes === 0)
66
+ return graphContext;
67
+ try {
68
+ const ph = nodes.map(() => '?').join(',');
69
+ const nodeIds = nodes.map(n => n.id);
70
+ // Batch fetch callers
71
+ const callerRows = db.prepare(`
72
+ SELECT e.targetId AS nid, n.name AS name
73
+ FROM edges e JOIN nodes n ON n.id = e.sourceId
74
+ WHERE e.targetId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7
75
+ LIMIT ${totalNodes * 3}
76
+ `).all(...nodeIds);
77
+ const callerMap = new Map();
78
+ for (const r of callerRows) {
79
+ if (!callerMap.has(r.nid))
80
+ callerMap.set(r.nid, []);
81
+ callerMap.get(r.nid).push(r.name);
70
82
  }
71
- catch {
72
- // Extension may already be loaded — index creation will fail clearly if not
73
- vectorExtensionLoaded = true;
83
+ // Batch fetch callees
84
+ const calleeRows = db.prepare(`
85
+ SELECT e.sourceId AS nid, n.name AS name
86
+ FROM edges e JOIN nodes n ON n.id = e.targetId
87
+ WHERE e.sourceId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7
88
+ LIMIT ${totalNodes * 3}
89
+ `).all(...nodeIds);
90
+ const calleeMap = new Map();
91
+ for (const r of calleeRows) {
92
+ if (!calleeMap.has(r.nid))
93
+ calleeMap.set(r.nid, []);
94
+ calleeMap.get(r.nid).push(r.name);
74
95
  }
75
- }
76
- const cypher = `
77
- CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
78
- `;
79
- try {
80
- await executeQuery(cypher);
81
- }
82
- catch (error) {
83
- // Index might already exist
84
- if (isDev) {
85
- console.warn('Vector index creation warning:', error);
96
+ // Batch fetch module (community membership)
97
+ const moduleRows = db.prepare(`
98
+ SELECT e.sourceId AS nid, c.heuristicLabel AS module
99
+ FROM edges e JOIN nodes c ON c.id = e.targetId
100
+ WHERE e.sourceId IN (${ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community'
101
+ LIMIT ${totalNodes}
102
+ `).all(...nodeIds);
103
+ const moduleMap = new Map();
104
+ for (const r of moduleRows) {
105
+ moduleMap.set(r.nid, r.module ?? '');
106
+ }
107
+ // Assemble
108
+ for (const node of nodes) {
109
+ graphContext.set(node.id, {
110
+ callers: (callerMap.get(node.id) || []).slice(0, 3),
111
+ callees: (calleeMap.get(node.id) || []).slice(0, 3),
112
+ module: moduleMap.get(node.id) || '',
113
+ });
86
114
  }
87
115
  }
88
- };
116
+ catch { } // Non-fatal -- embeddings work without graph context
117
+ return graphContext;
118
+ }
119
+ /**
120
+ * Enrich embedding text with graph context (callers, callees, module).
121
+ *
122
+ * Inserts context lines (Module, Called by, Calls) after the header
123
+ * section of the generated text, before the code snippet.
124
+ *
125
+ * @param text - Base embedding text from generateEmbeddingText
126
+ * @param ctx - Graph context for this node
127
+ * @returns Enriched text
128
+ */
129
+ export function enrichTextWithGraphContext(text, ctx) {
130
+ const parts = [];
131
+ if (ctx.module)
132
+ parts.push(`Module: ${ctx.module}`);
133
+ if (ctx.callers.length > 0)
134
+ parts.push(`Called by: ${ctx.callers.join(', ')}`);
135
+ if (ctx.callees.length > 0)
136
+ parts.push(`Calls: ${ctx.callees.join(', ')}`);
137
+ if (parts.length === 0)
138
+ return text;
139
+ const lines = text.split('\n');
140
+ const insertIdx = lines.findIndex(l => l === '') || 2;
141
+ lines.splice(insertIdx, 0, ...parts);
142
+ return lines.join('\n');
143
+ }
89
144
  /**
90
- * Run the full embedding pipeline (load model, embed nodes, create index)
91
- * @param executeQuery - Execute Cypher queries against LadybugDB
92
- * @param executeWithReusedStatement - Execute with reused prepared statement
145
+ * Run the full embedding pipeline (load model, embed nodes, store in SQLite)
146
+ * @param db - Open SQLite database instance
93
147
  * @param onProgress - Progress callback
94
148
  * @param config - Configuration override
95
149
  * @param skipNodeIds - Node IDs that already have embeddings (incremental mode)
96
150
  */
97
- export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatement, onProgress, config = {}, skipNodeIds) => {
151
+ export async function runEmbeddingPipeline(db, onProgress, config = {}, skipNodeIds) {
98
152
  const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
153
+ // Lazy import to avoid circular dependencies at module load time
154
+ const { insertEmbeddingsBatch } = await import('../db/adapter.js');
99
155
  try {
100
156
  // Phase 1: Load model
101
157
  onProgress({
@@ -117,73 +173,22 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
117
173
  modelDownloadPercent: 100,
118
174
  });
119
175
  if (isDev) {
120
- console.log('🔍 Querying embeddable nodes...');
176
+ console.log('Querying embeddable nodes...');
121
177
  }
122
178
  // Phase 2: Query nodes
123
- let nodes = await queryEmbeddableNodes(executeQuery);
179
+ let nodes = queryEmbeddableNodes(db);
124
180
  // Incremental mode: skip already-embedded nodes
125
181
  if (skipNodeIds && skipNodeIds.size > 0) {
126
182
  const beforeCount = nodes.length;
127
183
  nodes = nodes.filter(n => !skipNodeIds.has(n.id));
128
184
  if (isDev) {
129
- console.log(`📦 Incremental embeddings: ${beforeCount} total, ${skipNodeIds.size} cached, ${nodes.length} to embed`);
185
+ console.log(`Incremental embeddings: ${beforeCount} total, ${skipNodeIds.size} cached, ${nodes.length} to embed`);
130
186
  }
131
187
  }
132
188
  const totalNodes = nodes.length;
133
189
  // Enrich nodes with graph context (callers, callees, module) for better embeddings
134
- // This adds relationship context so "import resolution pipeline" matches processImports
135
- // because its caller "runPipelineFromRepo" contains "pipeline"
136
- const graphContext = new Map();
137
- if (totalNodes > 0) {
138
- try {
139
- const nodeIds = nodes.map(n => `'${String(n.id).replace(/'/g, "''")}'`).join(', ');
140
- // Batch fetch callers
141
- const callerRows = await executeQuery(`
142
- MATCH (caller)-[r:CodeRelation {type: 'CALLS'}]->(n) WHERE n.id IN [${nodeIds}] AND r.confidence >= 0.7
143
- RETURN n.id AS nid, caller.name AS name LIMIT ${totalNodes * 3}
144
- `);
145
- const callerMap = new Map();
146
- for (const r of callerRows) {
147
- const nid = String(r.nid ?? r[0]);
148
- if (!callerMap.has(nid))
149
- callerMap.set(nid, []);
150
- callerMap.get(nid).push(String(r.name ?? r[1]));
151
- }
152
- // Batch fetch callees
153
- const calleeRows = await executeQuery(`
154
- MATCH (n)-[r:CodeRelation {type: 'CALLS'}]->(callee) WHERE n.id IN [${nodeIds}] AND r.confidence >= 0.7
155
- RETURN n.id AS nid, callee.name AS name LIMIT ${totalNodes * 3}
156
- `);
157
- const calleeMap = new Map();
158
- for (const r of calleeRows) {
159
- const nid = String(r.nid ?? r[0]);
160
- if (!calleeMap.has(nid))
161
- calleeMap.set(nid, []);
162
- calleeMap.get(nid).push(String(r.name ?? r[1]));
163
- }
164
- // Batch fetch module
165
- const moduleRows = await executeQuery(`
166
- MATCH (n)-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community) WHERE n.id IN [${nodeIds}]
167
- RETURN n.id AS nid, c.heuristicLabel AS module LIMIT ${totalNodes}
168
- `);
169
- const moduleMap = new Map();
170
- for (const r of moduleRows) {
171
- moduleMap.set(String(r.nid ?? r[0]), String(r.module ?? r[1] ?? ''));
172
- }
173
- // Assemble
174
- for (const node of nodes) {
175
- graphContext.set(node.id, {
176
- callers: (callerMap.get(node.id) || []).slice(0, 3),
177
- callees: (calleeMap.get(node.id) || []).slice(0, 3),
178
- module: moduleMap.get(node.id) || '',
179
- });
180
- }
181
- }
182
- catch { } // Non-fatal — embeddings work without graph context
183
- }
184
- if (isDev) {
185
- console.log(`📊 Found ${totalNodes} embeddable nodes (${graphContext.size} with graph context)`);
186
- }
190
+ const graphContext = fetchGraphContext(db, nodes);
191
+ console.error(`Code Mapper: ${totalNodes} embeddable nodes, ${graphContext.size} with graph context (callers/callees/module)`);
187
192
  if (totalNodes === 0) {
188
193
  onProgress({
189
194
  phase: 'ready',
@@ -194,8 +199,6 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
194
199
  return;
195
200
  }
196
201
  // Phase 3: Batch embed
197
- const batchSize = finalConfig.batchSize;
198
- const totalBatches = Math.ceil(totalNodes / batchSize);
199
202
  let processedNodes = 0;
200
203
  onProgress({
201
204
  phase: 'embedding',
@@ -203,46 +206,62 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
203
206
  nodesProcessed: 0,
204
207
  totalNodes,
205
208
  });
206
- // Generate ALL text representations with graph context enrichment
209
+ // Generate text representations with graph context enrichment
207
210
  const allTexts = nodes.map(node => {
208
- const ctx = graphContext.get(node.id);
209
211
  let text = generateEmbeddingText(node, finalConfig);
212
+ const ctx = graphContext.get(node.id);
210
213
  if (ctx) {
211
- const parts = [];
212
- if (ctx.module)
213
- parts.push(`Module: ${ctx.module}`);
214
- if (ctx.callers.length > 0)
215
- parts.push(`Called by: ${ctx.callers.join(', ')}`);
216
- if (ctx.callees.length > 0)
217
- parts.push(`Calls: ${ctx.callees.join(', ')}`);
218
- if (parts.length > 0) {
219
- const lines = text.split('\n');
220
- const insertIdx = lines.findIndex(l => l === '') || 2;
221
- lines.splice(insertIdx, 0, ...parts);
222
- text = lines.join('\n');
223
- }
214
+ text = enrichTextWithGraphContext(text, ctx);
224
215
  }
225
216
  return text;
226
217
  });
227
- // Send ALL texts to the MLX embedder in one call — it does length-tiered
228
- // batching internally for optimal Metal GPU utilization
229
- const allEmbeddings = await embedBatch(allTexts);
218
+ // Hash-based skip: compare text hashes to skip unchanged nodes
219
+ const { getEmbeddingHashes } = await import('../db/adapter.js');
220
+ const existingHashes = getEmbeddingHashes(db);
221
+ const hashes = allTexts.map(t => textHash(t));
222
+ const toEmbed = [];
223
+ const skipped = [];
224
+ for (let i = 0; i < nodes.length; i++) {
225
+ const node = nodes[i];
226
+ const hash = hashes[i];
227
+ const existing = existingHashes.get(node.id);
228
+ if (existing === hash) {
229
+ skipped.push({ index: i, hash });
230
+ }
231
+ else {
232
+ toEmbed.push({ index: i, text: allTexts[i], hash });
233
+ }
234
+ }
235
+ console.error(`Code Mapper: ${toEmbed.length} nodes to embed, ${skipped.length} unchanged (hash skip)`);
236
+ // Embed only changed nodes
237
+ let embeddingResults = [];
238
+ if (toEmbed.length > 0) {
239
+ const t0Embed = Date.now();
240
+ embeddingResults = await embedBatch(toEmbed.map(e => e.text));
241
+ console.error(`Code Mapper: MLX embedded ${embeddingResults.length} texts in ${Date.now() - t0Embed}ms`);
242
+ }
230
243
  onProgress({
231
244
  phase: 'embedding',
232
245
  percent: 85,
233
246
  nodesProcessed: totalNodes,
234
247
  totalNodes,
235
248
  });
236
- // Insert all embeddings into LadybugDB in batches
249
+ // Insert embeddings with hashes into SQLite in batches
237
250
  const DB_BATCH = 200;
238
- for (let i = 0; i < nodes.length; i += DB_BATCH) {
239
- const batchNodes = nodes.slice(i, i + DB_BATCH);
240
- const batchEmbeddings = allEmbeddings.slice(i, i + DB_BATCH);
241
- const updates = batchNodes.map((node, j) => ({
242
- id: node.id,
243
- embedding: embeddingToArray(batchEmbeddings[j]),
244
- }));
245
- await batchInsertEmbeddings(executeWithReusedStatement, updates);
251
+ const allUpdates = toEmbed.map((entry, j) => {
252
+ const emb = embeddingResults[j];
253
+ if (!emb)
254
+ throw new Error(`Missing embedding at index ${j}`);
255
+ const node = nodes[entry.index];
256
+ return {
257
+ nodeId: toNodeId(node.id),
258
+ embedding: embeddingToArray(emb),
259
+ textHash: entry.hash,
260
+ };
261
+ });
262
+ for (let i = 0; i < allUpdates.length; i += DB_BATCH) {
263
+ const batch = allUpdates.slice(i, i + DB_BATCH);
264
+ insertEmbeddingsBatch(db, batch);
246
265
  processedNodes = Math.min(i + DB_BATCH, nodes.length);
247
266
  onProgress({
248
267
  phase: 'embedding',
@@ -251,17 +270,14 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
251
270
  totalNodes,
252
271
  });
253
272
  }
254
- // Phase 4: Create HNSW vector index
273
+ // Phase 4: No HNSW index needed -- SQLite uses brute-force cosine similarity
274
+ // which is fast enough for <200K vectors at 256 dims
255
275
  onProgress({
256
276
  phase: 'indexing',
257
277
  percent: 90,
258
278
  nodesProcessed: totalNodes,
259
279
  totalNodes,
260
280
  });
261
- if (isDev) {
262
- console.log('📇 Creating vector index...');
263
- }
264
- await createVectorIndex(executeQuery);
265
281
  // Done
266
282
  onProgress({
267
283
  phase: 'ready',
@@ -269,14 +285,12 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
269
285
  nodesProcessed: totalNodes,
270
286
  totalNodes,
271
287
  });
272
- if (isDev) {
273
- console.log('✅ Embedding pipeline complete!');
274
- }
288
+ console.error(`Code Mapper: Embedding pipeline complete (${totalNodes} nodes stored)`);
275
289
  }
276
290
  catch (error) {
277
291
  const errorMessage = error instanceof Error ? error.message : 'Unknown error';
278
292
  if (isDev) {
279
- console.error('Embedding pipeline error:', error);
293
+ console.error('Embedding pipeline error:', error);
280
294
  }
281
295
  onProgress({
282
296
  phase: 'error',
@@ -285,118 +299,62 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
285
299
  });
286
300
  throw error;
287
301
  }
288
- };
302
+ }
303
+ // ---------------------------------------------------------------------------
304
+ // SQLite-backed semantic search (for api.ts and hybrid-search.ts consumers)
305
+ // ---------------------------------------------------------------------------
289
306
  /**
290
- * Perform semantic search via the CodeEmbedding vector index
291
- * @param executeQuery - Execute Cypher queries
307
+ * Semantic vector search against a SQLite database.
308
+ *
309
+ * Uses brute-force cosine similarity via adapter.searchVector, then
310
+ * enriches results with node metadata. This mirrors the pattern in
311
+ * local-backend.ts but as a standalone function for hybrid search.
312
+ *
313
+ * @param db - Open SQLite database instance
292
314
  * @param query - Search query text
293
315
  * @param k - Number of results (default: 10)
294
- * @param maxDistance - Maximum cosine distance threshold (default: 0.5)
295
- * @returns Search results ordered by relevance
316
+ * @param maxDistance - Maximum cosine distance threshold (default: from types.ts)
296
317
  */
297
- export const semanticSearch = async (executeQuery, query, k = 10, maxDistance = 0.5) => {
298
- if (!isEmbedderReady()) {
299
- throw new Error('Embedding model not initialized. Run embedding pipeline first.');
318
+ export async function semanticSearchSqlite(db, query, k = 10) {
319
+ try {
320
+ // Lazy imports to avoid loading heavy model code at module init
321
+ const { searchVector, countEmbeddings } = await import('../db/adapter.js');
322
+ const { findNodesByIds } = await import('../db/queries.js');
323
+ const { DEFAULT_MAX_SEMANTIC_DISTANCE } = await import('../search/types.js');
324
+ // Check if embeddings exist before loading the model
325
+ const embCount = countEmbeddings(db);
326
+ if (embCount === 0)
327
+ return [];
328
+ if (!isEmbedderReady())
329
+ return [];
330
+ const queryVec = await embedQuery(query);
331
+ const vecResults = searchVector(db, queryVec, k, DEFAULT_MAX_SEMANTIC_DISTANCE);
332
+ if (vecResults.length === 0)
333
+ return [];
334
+ // Build distance lookup
335
+ const distanceMap = new Map();
336
+ for (const r of vecResults) {
337
+ distanceMap.set(r.nodeId, r.distance);
338
+ }
339
+ // Batch metadata fetch
340
+ const metaNodes = findNodesByIds(db, vecResults.map(r => r.nodeId));
341
+ return metaNodes.map(node => {
342
+ const result = {
343
+ nodeId: node.id,
344
+ name: node.name,
345
+ label: node.label,
346
+ filePath: node.filePath,
347
+ distance: distanceMap.get(node.id) ?? 1,
348
+ };
349
+ if (node.startLine != null)
350
+ result.startLine = node.startLine;
351
+ if (node.endLine != null)
352
+ result.endLine = node.endLine;
353
+ return result;
354
+ });
300
355
  }
301
- // Embed query text
302
- const queryEmbedding = await embedText(query);
303
- const queryVec = embeddingToArray(queryEmbedding);
304
- const queryVecStr = `[${queryVec.join(',')}]`;
305
- // Query vector index for nearest neighbors
306
- const vectorQuery = `
307
- CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
308
- CAST(${queryVecStr} AS FLOAT[${DEFAULT_EMBEDDING_CONFIG.dimensions}]), ${k})
309
- YIELD node AS emb, distance
310
- WITH emb, distance
311
- WHERE distance < ${maxDistance}
312
- RETURN emb.nodeId AS nodeId, distance
313
- ORDER BY distance
314
- `;
315
- const embResults = await executeQuery(vectorQuery);
316
- if (embResults.length === 0) {
356
+ catch {
357
+ // Expected when embeddings are disabled — silently fall back to BM25-only
317
358
  return [];
318
359
  }
319
- // Group by label for batched metadata queries
320
- const byLabel = new Map();
321
- for (const embRow of embResults) {
322
- const nodeId = embRow.nodeId ?? embRow[0];
323
- const distance = embRow.distance ?? embRow[1];
324
- const labelEndIdx = nodeId.indexOf(':');
325
- const label = labelEndIdx > 0 ? nodeId.substring(0, labelEndIdx) : 'Unknown';
326
- if (!byLabel.has(label))
327
- byLabel.set(label, []);
328
- byLabel.get(label).push({ nodeId, distance });
329
- }
330
- // Batch-fetch node metadata per label
331
- const results = [];
332
- for (const [label, items] of byLabel) {
333
- const idList = items.map(i => `'${i.nodeId.replace(/'/g, "''")}'`).join(', ');
334
- try {
335
- let nodeQuery;
336
- if (label === 'File') {
337
- nodeQuery = `
338
- MATCH (n:File) WHERE n.id IN [${idList}]
339
- RETURN n.id AS id, n.name AS name, n.filePath AS filePath
340
- `;
341
- }
342
- else {
343
- nodeQuery = `
344
- MATCH (n:${label}) WHERE n.id IN [${idList}]
345
- RETURN n.id AS id, n.name AS name, n.filePath AS filePath,
346
- n.startLine AS startLine, n.endLine AS endLine
347
- `;
348
- }
349
- const nodeRows = await executeQuery(nodeQuery);
350
- const rowMap = new Map();
351
- for (const row of nodeRows) {
352
- const id = row.id ?? row[0];
353
- rowMap.set(id, row);
354
- }
355
- for (const item of items) {
356
- const nodeRow = rowMap.get(item.nodeId);
357
- if (nodeRow) {
358
- results.push({
359
- nodeId: item.nodeId,
360
- name: nodeRow.name ?? nodeRow[1] ?? '',
361
- label,
362
- filePath: nodeRow.filePath ?? nodeRow[2] ?? '',
363
- distance: item.distance,
364
- startLine: label !== 'File' ? (nodeRow.startLine ?? nodeRow[3]) : undefined,
365
- endLine: label !== 'File' ? (nodeRow.endLine ?? nodeRow[4]) : undefined,
366
- });
367
- }
368
- }
369
- }
370
- catch {
371
- // Table might not exist — skip
372
- }
373
- }
374
- // Re-sort by distance (batch queries may have mixed order)
375
- results.sort((a, b) => a.distance - b.distance);
376
- return results;
377
- };
378
- /**
379
- * Semantic search with flattened results (graph expansion placeholder)
380
- *
381
- * For full graph traversal, use the execute_vector_cypher tool directly
382
- *
383
- * @param executeQuery - Execute Cypher queries
384
- * @param query - Search query text
385
- * @param k - Number of semantic matches (default: 5)
386
- * @param _hops - Unused, kept for API compatibility
387
- */
388
- export const semanticSearchWithContext = async (executeQuery, query, k = 5, _hops = 1) => {
389
- // Return semantic results directly — use execute_vector_cypher for graph traversal
390
- const results = await semanticSearch(executeQuery, query, k, 0.5);
391
- return results.map(r => ({
392
- matchId: r.nodeId,
393
- matchName: r.name,
394
- matchLabel: r.label,
395
- matchPath: r.filePath,
396
- distance: r.distance,
397
- connectedId: null,
398
- connectedName: null,
399
- connectedLabel: null,
400
- relationType: null,
401
- }));
402
- };
360
+ }
@@ -5,6 +5,7 @@
5
5
  * combining node metadata with code snippets for semantic matching
6
6
  */
7
7
  import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
8
+ import { assertNever } from '../../lib/type-utils.js';
8
9
  /** Extract filename from a file path */
9
10
  const getFileName = (filePath) => {
10
11
  const parts = filePath.split('/');
@@ -110,20 +111,6 @@ const generateInterfaceText = (node, maxSnippetLength) => {
110
111
  }
111
112
  return parts.join('\n');
112
113
  };
113
- /** Generate embedding text for a File node (uses shorter snippet) */
114
- const generateFileText = (node, maxSnippetLength) => {
115
- const parts = [
116
- `File: ${node.name}`,
117
- `Path: ${node.filePath}`,
118
- ];
119
- if (node.content) {
120
- const cleanedContent = cleanContent(node.content);
121
- // Files can be very long — cap at 300 chars
122
- const snippet = truncateContent(cleanedContent, Math.min(maxSnippetLength, 300));
123
- parts.push('', snippet);
124
- }
125
- return parts.join('\n');
126
- };
127
114
  /**
128
115
  * Generate embedding text for any embeddable node (dispatches by label)
129
116
  * @param node - The node to generate text for
@@ -132,7 +119,8 @@ const generateFileText = (node, maxSnippetLength) => {
132
119
  */
133
120
  export const generateEmbeddingText = (node, config = {}) => {
134
121
  const maxSnippetLength = config.maxSnippetLength ?? DEFAULT_EMBEDDING_CONFIG.maxSnippetLength;
135
- switch (node.label) {
122
+ const label = node.label;
123
+ switch (label) {
136
124
  case 'Function':
137
125
  return generateFunctionText(node, maxSnippetLength);
138
126
  case 'Class':
@@ -141,11 +129,8 @@ export const generateEmbeddingText = (node, config = {}) => {
141
129
  return generateMethodText(node, maxSnippetLength);
142
130
  case 'Interface':
143
131
  return generateInterfaceText(node, maxSnippetLength);
144
- case 'File':
145
- return generateFileText(node, maxSnippetLength);
146
132
  default:
147
- // Fallback for any other embeddable type
148
- return `${node.label}: ${node.name}\nPath: ${node.filePath}`;
133
+ return assertNever(label, `Unknown embeddable label: ${node.label}`);
149
134
  }
150
135
  };
151
136
  /**
@@ -40,7 +40,7 @@ export interface SemanticSearchResult {
40
40
  startLine?: number;
41
41
  endLine?: number;
42
42
  }
43
- /** Minimal node data for embedding (from LadybugDB query) */
43
+ /** Minimal node data for embedding (from database query) */
44
44
  export interface EmbeddableNode {
45
45
  id: string;
46
46
  name: string;