@zuvia-software-solutions/code-mapper 1.4.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. package/dist/cli/ai-context.js +1 -1
  2. package/dist/cli/analyze.d.ts +1 -0
  3. package/dist/cli/analyze.js +73 -82
  4. package/dist/cli/augment.js +0 -2
  5. package/dist/cli/eval-server.d.ts +2 -2
  6. package/dist/cli/eval-server.js +6 -6
  7. package/dist/cli/index.js +6 -10
  8. package/dist/cli/mcp.d.ts +1 -3
  9. package/dist/cli/mcp.js +3 -3
  10. package/dist/cli/refresh.d.ts +2 -2
  11. package/dist/cli/refresh.js +24 -29
  12. package/dist/cli/status.js +4 -13
  13. package/dist/cli/tool.d.ts +5 -4
  14. package/dist/cli/tool.js +8 -10
  15. package/dist/config/ignore-service.js +14 -34
  16. package/dist/core/augmentation/engine.js +53 -83
  17. package/dist/core/db/adapter.d.ts +99 -0
  18. package/dist/core/db/adapter.js +402 -0
  19. package/dist/core/db/graph-loader.d.ts +27 -0
  20. package/dist/core/db/graph-loader.js +148 -0
  21. package/dist/core/db/queries.d.ts +160 -0
  22. package/dist/core/db/queries.js +441 -0
  23. package/dist/core/db/schema.d.ts +108 -0
  24. package/dist/core/db/schema.js +136 -0
  25. package/dist/core/embeddings/embedder.d.ts +21 -12
  26. package/dist/core/embeddings/embedder.js +104 -50
  27. package/dist/core/embeddings/embedding-pipeline.d.ts +48 -22
  28. package/dist/core/embeddings/embedding-pipeline.js +220 -262
  29. package/dist/core/embeddings/text-generator.js +4 -19
  30. package/dist/core/embeddings/types.d.ts +1 -1
  31. package/dist/core/graph/graph.d.ts +1 -1
  32. package/dist/core/graph/graph.js +1 -0
  33. package/dist/core/graph/types.d.ts +11 -9
  34. package/dist/core/graph/types.js +4 -1
  35. package/dist/core/incremental/refresh.d.ts +46 -0
  36. package/dist/core/incremental/refresh.js +503 -0
  37. package/dist/core/incremental/types.d.ts +2 -1
  38. package/dist/core/incremental/types.js +42 -44
  39. package/dist/core/ingestion/ast-cache.js +1 -0
  40. package/dist/core/ingestion/call-processor.d.ts +15 -3
  41. package/dist/core/ingestion/call-processor.js +448 -60
  42. package/dist/core/ingestion/cluster-enricher.d.ts +1 -1
  43. package/dist/core/ingestion/cluster-enricher.js +2 -0
  44. package/dist/core/ingestion/community-processor.d.ts +1 -1
  45. package/dist/core/ingestion/community-processor.js +8 -3
  46. package/dist/core/ingestion/export-detection.d.ts +1 -1
  47. package/dist/core/ingestion/export-detection.js +1 -1
  48. package/dist/core/ingestion/filesystem-walker.js +1 -1
  49. package/dist/core/ingestion/heritage-processor.d.ts +2 -2
  50. package/dist/core/ingestion/heritage-processor.js +22 -11
  51. package/dist/core/ingestion/import-processor.d.ts +2 -2
  52. package/dist/core/ingestion/import-processor.js +24 -9
  53. package/dist/core/ingestion/language-config.js +7 -4
  54. package/dist/core/ingestion/mro-processor.d.ts +1 -1
  55. package/dist/core/ingestion/mro-processor.js +23 -11
  56. package/dist/core/ingestion/named-binding-extraction.js +5 -5
  57. package/dist/core/ingestion/parsing-processor.d.ts +4 -4
  58. package/dist/core/ingestion/parsing-processor.js +26 -18
  59. package/dist/core/ingestion/pipeline.d.ts +4 -2
  60. package/dist/core/ingestion/pipeline.js +50 -20
  61. package/dist/core/ingestion/process-processor.d.ts +2 -2
  62. package/dist/core/ingestion/process-processor.js +28 -14
  63. package/dist/core/ingestion/resolution-context.d.ts +1 -1
  64. package/dist/core/ingestion/resolution-context.js +14 -4
  65. package/dist/core/ingestion/resolvers/csharp.js +4 -3
  66. package/dist/core/ingestion/resolvers/go.js +3 -1
  67. package/dist/core/ingestion/resolvers/jvm.js +13 -4
  68. package/dist/core/ingestion/resolvers/standard.js +2 -2
  69. package/dist/core/ingestion/resolvers/utils.js +6 -2
  70. package/dist/core/ingestion/route-stitcher.d.ts +15 -0
  71. package/dist/core/ingestion/route-stitcher.js +92 -0
  72. package/dist/core/ingestion/structure-processor.d.ts +1 -1
  73. package/dist/core/ingestion/structure-processor.js +3 -2
  74. package/dist/core/ingestion/symbol-table.d.ts +2 -0
  75. package/dist/core/ingestion/symbol-table.js +5 -1
  76. package/dist/core/ingestion/tree-sitter-queries.d.ts +2 -2
  77. package/dist/core/ingestion/tree-sitter-queries.js +177 -0
  78. package/dist/core/ingestion/type-env.js +20 -0
  79. package/dist/core/ingestion/type-extractors/csharp.js +4 -3
  80. package/dist/core/ingestion/type-extractors/go.js +23 -12
  81. package/dist/core/ingestion/type-extractors/php.js +18 -10
  82. package/dist/core/ingestion/type-extractors/ruby.js +15 -3
  83. package/dist/core/ingestion/type-extractors/rust.js +3 -2
  84. package/dist/core/ingestion/type-extractors/shared.js +3 -2
  85. package/dist/core/ingestion/type-extractors/typescript.js +11 -5
  86. package/dist/core/ingestion/utils.d.ts +27 -4
  87. package/dist/core/ingestion/utils.js +145 -100
  88. package/dist/core/ingestion/workers/parse-worker.d.ts +1 -0
  89. package/dist/core/ingestion/workers/parse-worker.js +97 -29
  90. package/dist/core/ingestion/workers/worker-pool.js +3 -0
  91. package/dist/core/search/bm25-index.d.ts +15 -8
  92. package/dist/core/search/bm25-index.js +48 -98
  93. package/dist/core/search/hybrid-search.d.ts +9 -3
  94. package/dist/core/search/hybrid-search.js +30 -25
  95. package/dist/core/search/reranker.js +9 -7
  96. package/dist/core/search/types.d.ts +0 -4
  97. package/dist/core/semantic/tsgo-service.d.ts +7 -1
  98. package/dist/core/semantic/tsgo-service.js +165 -66
  99. package/dist/lib/tsgo-test.d.ts +2 -0
  100. package/dist/lib/tsgo-test.js +6 -0
  101. package/dist/lib/type-utils.d.ts +25 -0
  102. package/dist/lib/type-utils.js +22 -0
  103. package/dist/lib/utils.d.ts +3 -2
  104. package/dist/lib/utils.js +3 -2
  105. package/dist/mcp/compatible-stdio-transport.js +1 -1
  106. package/dist/mcp/local/local-backend.d.ts +29 -56
  107. package/dist/mcp/local/local-backend.js +808 -1118
  108. package/dist/mcp/resources.js +35 -25
  109. package/dist/mcp/server.d.ts +1 -1
  110. package/dist/mcp/server.js +5 -5
  111. package/dist/mcp/tools.js +24 -25
  112. package/dist/storage/repo-manager.d.ts +2 -12
  113. package/dist/storage/repo-manager.js +1 -47
  114. package/dist/types/pipeline.d.ts +8 -5
  115. package/dist/types/pipeline.js +5 -0
  116. package/package.json +18 -11
  117. package/dist/cli/serve.d.ts +0 -5
  118. package/dist/cli/serve.js +0 -8
  119. package/dist/core/incremental/child-process.d.ts +0 -8
  120. package/dist/core/incremental/child-process.js +0 -649
  121. package/dist/core/incremental/refresh-coordinator.d.ts +0 -32
  122. package/dist/core/incremental/refresh-coordinator.js +0 -147
  123. package/dist/core/lbug/csv-generator.d.ts +0 -28
  124. package/dist/core/lbug/csv-generator.js +0 -355
  125. package/dist/core/lbug/lbug-adapter.d.ts +0 -96
  126. package/dist/core/lbug/lbug-adapter.js +0 -753
  127. package/dist/core/lbug/schema.d.ts +0 -46
  128. package/dist/core/lbug/schema.js +0 -402
  129. package/dist/mcp/core/embedder.d.ts +0 -24
  130. package/dist/mcp/core/embedder.js +0 -168
  131. package/dist/mcp/core/lbug-adapter.d.ts +0 -29
  132. package/dist/mcp/core/lbug-adapter.js +0 -330
  133. package/dist/server/api.d.ts +0 -5
  134. package/dist/server/api.js +0 -340
  135. package/dist/server/mcp-http.d.ts +0 -7
  136. package/dist/server/mcp-http.js +0 -95
  137. package/models/mlx-embedder.py +0 -185
@@ -2,43 +2,43 @@
2
2
  /**
3
3
  * @file embedding-pipeline.ts
4
4
  * @description Orchestrates the background embedding process:
5
- * 1) Query embeddable nodes from LadybugDB
5
+ * 1) Query embeddable nodes from SQLite
6
6
  * 2) Generate text representations
7
7
  * 3) Batch embed using transformers.js
8
- * 4) Store embeddings in LadybugDB
9
- * 5) Create vector index for semantic search
8
+ * 4) Store embeddings in SQLite
9
+ * 5) Vector search via brute-force cosine similarity in adapter.ts
10
10
  */
11
- import { initEmbedder, embedBatch, embedText, embeddingToArray, isEmbedderReady } from './embedder.js';
11
+ import { initEmbedder, embedBatch, embedQuery, embeddingToArray, isEmbedderReady } from './embedder.js';
12
12
  import { generateEmbeddingText } from './text-generator.js';
13
13
  import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, } from './types.js';
14
- const isDev = process.env.NODE_ENV === 'development';
15
- /** Query all embeddable nodes from LadybugDB (File has different schema than code elements) */
16
- const queryEmbeddableNodes = async (executeQuery) => {
14
+ import { toNodeId } from '../db/schema.js';
15
+ import { createHash } from 'crypto';
16
+ const isDev = process.env['NODE_ENV'] === 'development';
17
+ /** Fast content hash for detecting unchanged embedding text */
18
+ function textHash(text) {
19
+ return createHash('md5').update(text).digest('hex');
20
+ }
21
+ /** Query all embeddable nodes from SQLite */
22
+ const queryEmbeddableNodes = (db) => {
17
23
  const allNodes = [];
18
24
  for (const label of EMBEDDABLE_LABELS) {
19
25
  try {
20
- // All embeddable labels are code elements with startLine/endLine
21
- const query = `
22
- MATCH (n:${label})
23
- RETURN n.id AS id, n.name AS name, '${label}' AS label,
24
- n.filePath AS filePath, n.content AS content,
25
- n.startLine AS startLine, n.endLine AS endLine
26
- `;
27
- const rows = await executeQuery(query);
26
+ const rows = db.prepare(`SELECT id, name, label, filePath, content, startLine, endLine
27
+ FROM nodes WHERE label = ?`).all(label);
28
28
  for (const row of rows) {
29
29
  allNodes.push({
30
- id: row.id ?? row[0],
31
- name: row.name ?? row[1],
32
- label: row.label ?? row[2],
33
- filePath: row.filePath ?? row[3],
34
- content: row.content ?? row[4] ?? '',
35
- startLine: row.startLine ?? row[5],
36
- endLine: row.endLine ?? row[6],
30
+ id: row.id,
31
+ name: row.name,
32
+ label: row.label,
33
+ filePath: row.filePath,
34
+ content: row.content ?? '',
35
+ startLine: row.startLine ?? 0,
36
+ endLine: row.endLine ?? 0,
37
37
  });
38
38
  }
39
39
  }
40
40
  catch (error) {
41
- // Table might not exist or be empty continue
41
+ // Table might not exist or be empty -- continue
42
42
  if (isDev) {
43
43
  console.warn(`Query for ${label} nodes failed:`, error);
44
44
  }
@@ -47,55 +47,111 @@ const queryEmbeddableNodes = async (executeQuery) => {
47
47
  return allNodes;
48
48
  };
49
49
  /**
50
- * Batch INSERT embeddings into the CodeEmbedding table
50
+ * Fetch graph context (callers, callees, community module) for a set of nodes.
51
51
  *
52
- * Separate lightweight table avoids copy-on-write overhead from
53
- * UPDATEing nodes with large content fields
52
+ * This enrichment adds relationship context so that embedding text like
53
+ * "import resolution pipeline" matches `processImports` because its caller
54
+ * "runPipelineFromRepo" contains "pipeline".
55
+ *
56
+ * Reusable by both the full analyze pipeline and incremental refresh.
57
+ *
58
+ * @param db - Open SQLite database instance
59
+ * @param nodes - Nodes to fetch context for (must have `id` field)
60
+ * @returns Map from node ID to graph context
54
61
  */
55
- const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
56
- // INSERT into separate embedding table — avoids large-row COW overhead
57
- const cypher = `CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`;
58
- const paramsList = updates.map(u => ({ nodeId: u.id, embedding: u.embedding }));
59
- await executeWithReusedStatement(cypher, paramsList);
60
- };
61
- /** Create the HNSW vector index on the CodeEmbedding table */
62
- let vectorExtensionLoaded = false;
63
- const createVectorIndex = async (executeQuery) => {
64
- // LadybugDB v0.15+ requires explicit VECTOR extension load (once per session)
65
- if (!vectorExtensionLoaded) {
66
- try {
67
- await executeQuery('INSTALL VECTOR');
68
- await executeQuery('LOAD EXTENSION VECTOR');
69
- vectorExtensionLoaded = true;
62
+ export function fetchGraphContext(db, nodes) {
63
+ const graphContext = new Map();
64
+ const totalNodes = nodes.length;
65
+ if (totalNodes === 0)
66
+ return graphContext;
67
+ try {
68
+ const ph = nodes.map(() => '?').join(',');
69
+ const nodeIds = nodes.map(n => n.id);
70
+ // Batch fetch callers
71
+ const callerRows = db.prepare(`
72
+ SELECT e.targetId AS nid, n.name AS name
73
+ FROM edges e JOIN nodes n ON n.id = e.sourceId
74
+ WHERE e.targetId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7
75
+ LIMIT ${totalNodes * 3}
76
+ `).all(...nodeIds);
77
+ const callerMap = new Map();
78
+ for (const r of callerRows) {
79
+ if (!callerMap.has(r.nid))
80
+ callerMap.set(r.nid, []);
81
+ callerMap.get(r.nid).push(r.name);
70
82
  }
71
- catch {
72
- // Extension may already be loaded — index creation will fail clearly if not
73
- vectorExtensionLoaded = true;
83
+ // Batch fetch callees
84
+ const calleeRows = db.prepare(`
85
+ SELECT e.sourceId AS nid, n.name AS name
86
+ FROM edges e JOIN nodes n ON n.id = e.targetId
87
+ WHERE e.sourceId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7
88
+ LIMIT ${totalNodes * 3}
89
+ `).all(...nodeIds);
90
+ const calleeMap = new Map();
91
+ for (const r of calleeRows) {
92
+ if (!calleeMap.has(r.nid))
93
+ calleeMap.set(r.nid, []);
94
+ calleeMap.get(r.nid).push(r.name);
74
95
  }
75
- }
76
- const cypher = `
77
- CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
78
- `;
79
- try {
80
- await executeQuery(cypher);
81
- }
82
- catch (error) {
83
- // Index might already exist
84
- if (isDev) {
85
- console.warn('Vector index creation warning:', error);
96
+ // Batch fetch module (community membership)
97
+ const moduleRows = db.prepare(`
98
+ SELECT e.sourceId AS nid, c.heuristicLabel AS module
99
+ FROM edges e JOIN nodes c ON c.id = e.targetId
100
+ WHERE e.sourceId IN (${ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community'
101
+ LIMIT ${totalNodes}
102
+ `).all(...nodeIds);
103
+ const moduleMap = new Map();
104
+ for (const r of moduleRows) {
105
+ moduleMap.set(r.nid, r.module ?? '');
106
+ }
107
+ // Assemble
108
+ for (const node of nodes) {
109
+ graphContext.set(node.id, {
110
+ callers: (callerMap.get(node.id) || []).slice(0, 3),
111
+ callees: (calleeMap.get(node.id) || []).slice(0, 3),
112
+ module: moduleMap.get(node.id) || '',
113
+ });
86
114
  }
87
115
  }
88
- };
116
+ catch { } // Non-fatal -- embeddings work without graph context
117
+ return graphContext;
118
+ }
119
+ /**
120
+ * Enrich embedding text with graph context (callers, callees, module).
121
+ *
122
+ * Inserts context lines (Module, Called by, Calls) after the header
123
+ * section of the generated text, before the code snippet.
124
+ *
125
+ * @param text - Base embedding text from generateEmbeddingText
126
+ * @param ctx - Graph context for this node
127
+ * @returns Enriched text
128
+ */
129
+ export function enrichTextWithGraphContext(text, ctx) {
130
+ const parts = [];
131
+ if (ctx.module)
132
+ parts.push(`Module: ${ctx.module}`);
133
+ if (ctx.callers.length > 0)
134
+ parts.push(`Called by: ${ctx.callers.join(', ')}`);
135
+ if (ctx.callees.length > 0)
136
+ parts.push(`Calls: ${ctx.callees.join(', ')}`);
137
+ if (parts.length === 0)
138
+ return text;
139
+ const lines = text.split('\n');
140
+ const insertIdx = lines.findIndex(l => l === '') || 2;
141
+ lines.splice(insertIdx, 0, ...parts);
142
+ return lines.join('\n');
143
+ }
89
144
  /**
90
- * Run the full embedding pipeline (load model, embed nodes, create index)
91
- * @param executeQuery - Execute Cypher queries against LadybugDB
92
- * @param executeWithReusedStatement - Execute with reused prepared statement
145
+ * Run the full embedding pipeline (load model, embed nodes, store in SQLite)
146
+ * @param db - Open SQLite database instance
93
147
  * @param onProgress - Progress callback
94
148
  * @param config - Configuration override
95
149
  * @param skipNodeIds - Node IDs that already have embeddings (incremental mode)
96
150
  */
97
- export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatement, onProgress, config = {}, skipNodeIds) => {
151
+ export async function runEmbeddingPipeline(db, onProgress, config = {}, skipNodeIds) {
98
152
  const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
153
+ // Lazy import to avoid circular dependencies at module load time
154
+ const { insertEmbeddingsBatch } = await import('../db/adapter.js');
99
155
  try {
100
156
  // Phase 1: Load model
101
157
  onProgress({
@@ -117,73 +173,22 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
117
173
  modelDownloadPercent: 100,
118
174
  });
119
175
  if (isDev) {
120
- console.log('🔍 Querying embeddable nodes...');
176
+ console.log('Querying embeddable nodes...');
121
177
  }
122
178
  // Phase 2: Query nodes
123
- let nodes = await queryEmbeddableNodes(executeQuery);
179
+ let nodes = queryEmbeddableNodes(db);
124
180
  // Incremental mode: skip already-embedded nodes
125
181
  if (skipNodeIds && skipNodeIds.size > 0) {
126
182
  const beforeCount = nodes.length;
127
183
  nodes = nodes.filter(n => !skipNodeIds.has(n.id));
128
184
  if (isDev) {
129
- console.log(`📦 Incremental embeddings: ${beforeCount} total, ${skipNodeIds.size} cached, ${nodes.length} to embed`);
185
+ console.log(`Incremental embeddings: ${beforeCount} total, ${skipNodeIds.size} cached, ${nodes.length} to embed`);
130
186
  }
131
187
  }
132
188
  const totalNodes = nodes.length;
133
189
  // Enrich nodes with graph context (callers, callees, module) for better embeddings
134
- // This adds relationship context so "import resolution pipeline" matches processImports
135
- // because its caller "runPipelineFromRepo" contains "pipeline"
136
- const graphContext = new Map();
137
- if (totalNodes > 0) {
138
- try {
139
- const nodeIds = nodes.map(n => `'${String(n.id).replace(/'/g, "''")}'`).join(', ');
140
- // Batch fetch callers
141
- const callerRows = await executeQuery(`
142
- MATCH (caller)-[r:CodeRelation {type: 'CALLS'}]->(n) WHERE n.id IN [${nodeIds}] AND r.confidence >= 0.7
143
- RETURN n.id AS nid, caller.name AS name LIMIT ${totalNodes * 3}
144
- `);
145
- const callerMap = new Map();
146
- for (const r of callerRows) {
147
- const nid = String(r.nid ?? r[0]);
148
- if (!callerMap.has(nid))
149
- callerMap.set(nid, []);
150
- callerMap.get(nid).push(String(r.name ?? r[1]));
151
- }
152
- // Batch fetch callees
153
- const calleeRows = await executeQuery(`
154
- MATCH (n)-[r:CodeRelation {type: 'CALLS'}]->(callee) WHERE n.id IN [${nodeIds}] AND r.confidence >= 0.7
155
- RETURN n.id AS nid, callee.name AS name LIMIT ${totalNodes * 3}
156
- `);
157
- const calleeMap = new Map();
158
- for (const r of calleeRows) {
159
- const nid = String(r.nid ?? r[0]);
160
- if (!calleeMap.has(nid))
161
- calleeMap.set(nid, []);
162
- calleeMap.get(nid).push(String(r.name ?? r[1]));
163
- }
164
- // Batch fetch module
165
- const moduleRows = await executeQuery(`
166
- MATCH (n)-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community) WHERE n.id IN [${nodeIds}]
167
- RETURN n.id AS nid, c.heuristicLabel AS module LIMIT ${totalNodes}
168
- `);
169
- const moduleMap = new Map();
170
- for (const r of moduleRows) {
171
- moduleMap.set(String(r.nid ?? r[0]), String(r.module ?? r[1] ?? ''));
172
- }
173
- // Assemble
174
- for (const node of nodes) {
175
- graphContext.set(node.id, {
176
- callers: (callerMap.get(node.id) || []).slice(0, 3),
177
- callees: (calleeMap.get(node.id) || []).slice(0, 3),
178
- module: moduleMap.get(node.id) || '',
179
- });
180
- }
181
- }
182
- catch { } // Non-fatal — embeddings work without graph context
183
- }
184
- if (isDev) {
185
- console.log(`📊 Found ${totalNodes} embeddable nodes (${graphContext.size} with graph context)`);
186
- }
190
+ const graphContext = fetchGraphContext(db, nodes);
191
+ console.error(`Code Mapper: ${totalNodes} embeddable nodes, ${graphContext.size} with graph context (callers/callees/module)`);
187
192
  if (totalNodes === 0) {
188
193
  onProgress({
189
194
  phase: 'ready',
@@ -194,8 +199,6 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
194
199
  return;
195
200
  }
196
201
  // Phase 3: Batch embed
197
- const batchSize = finalConfig.batchSize;
198
- const totalBatches = Math.ceil(totalNodes / batchSize);
199
202
  let processedNodes = 0;
200
203
  onProgress({
201
204
  phase: 'embedding',
@@ -203,46 +206,62 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
203
206
  nodesProcessed: 0,
204
207
  totalNodes,
205
208
  });
206
- // Generate ALL text representations with graph context enrichment
209
+ // Generate text representations with graph context enrichment
207
210
  const allTexts = nodes.map(node => {
208
- const ctx = graphContext.get(node.id);
209
211
  let text = generateEmbeddingText(node, finalConfig);
212
+ const ctx = graphContext.get(node.id);
210
213
  if (ctx) {
211
- const parts = [];
212
- if (ctx.module)
213
- parts.push(`Module: ${ctx.module}`);
214
- if (ctx.callers.length > 0)
215
- parts.push(`Called by: ${ctx.callers.join(', ')}`);
216
- if (ctx.callees.length > 0)
217
- parts.push(`Calls: ${ctx.callees.join(', ')}`);
218
- if (parts.length > 0) {
219
- const lines = text.split('\n');
220
- const insertIdx = lines.findIndex(l => l === '') || 2;
221
- lines.splice(insertIdx, 0, ...parts);
222
- text = lines.join('\n');
223
- }
214
+ text = enrichTextWithGraphContext(text, ctx);
224
215
  }
225
216
  return text;
226
217
  });
227
- // Send ALL texts to the MLX embedder in one call — it does length-tiered
228
- // batching internally for optimal Metal GPU utilization
229
- const allEmbeddings = await embedBatch(allTexts);
218
+ // Hash-based skip: compare text hashes to skip unchanged nodes
219
+ const { getEmbeddingHashes } = await import('../db/adapter.js');
220
+ const existingHashes = getEmbeddingHashes(db);
221
+ const hashes = allTexts.map(t => textHash(t));
222
+ const toEmbed = [];
223
+ const skipped = [];
224
+ for (let i = 0; i < nodes.length; i++) {
225
+ const node = nodes[i];
226
+ const hash = hashes[i];
227
+ const existing = existingHashes.get(node.id);
228
+ if (existing === hash) {
229
+ skipped.push({ index: i, hash });
230
+ }
231
+ else {
232
+ toEmbed.push({ index: i, text: allTexts[i], hash });
233
+ }
234
+ }
235
+ console.error(`Code Mapper: ${toEmbed.length} nodes to embed, ${skipped.length} unchanged (hash skip)`);
236
+ // Embed only changed nodes
237
+ let embeddingResults = [];
238
+ if (toEmbed.length > 0) {
239
+ const t0Embed = Date.now();
240
+ embeddingResults = await embedBatch(toEmbed.map(e => e.text));
241
+ console.error(`Code Mapper: MLX embedded ${embeddingResults.length} texts in ${Date.now() - t0Embed}ms`);
242
+ }
230
243
  onProgress({
231
244
  phase: 'embedding',
232
245
  percent: 85,
233
246
  nodesProcessed: totalNodes,
234
247
  totalNodes,
235
248
  });
236
- // Insert all embeddings into LadybugDB in batches
249
+ // Insert embeddings with hashes into SQLite in batches
237
250
  const DB_BATCH = 200;
238
- for (let i = 0; i < nodes.length; i += DB_BATCH) {
239
- const batchNodes = nodes.slice(i, i + DB_BATCH);
240
- const batchEmbeddings = allEmbeddings.slice(i, i + DB_BATCH);
241
- const updates = batchNodes.map((node, j) => ({
242
- id: node.id,
243
- embedding: embeddingToArray(batchEmbeddings[j]),
244
- }));
245
- await batchInsertEmbeddings(executeWithReusedStatement, updates);
251
+ const allUpdates = toEmbed.map((entry, j) => {
252
+ const emb = embeddingResults[j];
253
+ if (!emb)
254
+ throw new Error(`Missing embedding at index ${j}`);
255
+ const node = nodes[entry.index];
256
+ return {
257
+ nodeId: toNodeId(node.id),
258
+ embedding: embeddingToArray(emb),
259
+ textHash: entry.hash,
260
+ };
261
+ });
262
+ for (let i = 0; i < allUpdates.length; i += DB_BATCH) {
263
+ const batch = allUpdates.slice(i, i + DB_BATCH);
264
+ insertEmbeddingsBatch(db, batch);
246
265
  processedNodes = Math.min(i + DB_BATCH, nodes.length);
247
266
  onProgress({
248
267
  phase: 'embedding',
@@ -251,17 +270,14 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
251
270
  totalNodes,
252
271
  });
253
272
  }
254
- // Phase 4: Create HNSW vector index
273
+ // Phase 4: No HNSW index needed -- SQLite uses brute-force cosine similarity
274
+ // which is fast enough for <200K vectors at 256 dims
255
275
  onProgress({
256
276
  phase: 'indexing',
257
277
  percent: 90,
258
278
  nodesProcessed: totalNodes,
259
279
  totalNodes,
260
280
  });
261
- if (isDev) {
262
- console.log('📇 Creating vector index...');
263
- }
264
- await createVectorIndex(executeQuery);
265
281
  // Done
266
282
  onProgress({
267
283
  phase: 'ready',
@@ -269,14 +285,12 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
269
285
  nodesProcessed: totalNodes,
270
286
  totalNodes,
271
287
  });
272
- if (isDev) {
273
- console.log('✅ Embedding pipeline complete!');
274
- }
288
+ console.error(`Code Mapper: Embedding pipeline complete (${totalNodes} nodes stored)`);
275
289
  }
276
290
  catch (error) {
277
291
  const errorMessage = error instanceof Error ? error.message : 'Unknown error';
278
292
  if (isDev) {
279
- console.error('Embedding pipeline error:', error);
293
+ console.error('Embedding pipeline error:', error);
280
294
  }
281
295
  onProgress({
282
296
  phase: 'error',
@@ -285,118 +299,62 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
285
299
  });
286
300
  throw error;
287
301
  }
288
- };
302
+ }
303
+ // ---------------------------------------------------------------------------
304
+ // SQLite-backed semantic search (for api.ts and hybrid-search.ts consumers)
305
+ // ---------------------------------------------------------------------------
289
306
  /**
290
- * Perform semantic search via the CodeEmbedding vector index
291
- * @param executeQuery - Execute Cypher queries
307
+ * Semantic vector search against a SQLite database.
308
+ *
309
+ * Uses brute-force cosine similarity via adapter.searchVector, then
310
+ * enriches results with node metadata. This mirrors the pattern in
311
+ * local-backend.ts but as a standalone function for hybrid search.
312
+ *
313
+ * @param db - Open SQLite database instance
292
314
  * @param query - Search query text
293
315
  * @param k - Number of results (default: 10)
294
- * @param maxDistance - Maximum cosine distance threshold (default: 0.5)
295
- * @returns Search results ordered by relevance
316
+ * @param maxDistance - Maximum cosine distance threshold (default: from types.ts)
296
317
  */
297
- export const semanticSearch = async (executeQuery, query, k = 10, maxDistance = 0.5) => {
298
- if (!isEmbedderReady()) {
299
- throw new Error('Embedding model not initialized. Run embedding pipeline first.');
318
+ export async function semanticSearchSqlite(db, query, k = 10) {
319
+ try {
320
+ // Lazy imports to avoid loading heavy model code at module init
321
+ const { searchVector, countEmbeddings } = await import('../db/adapter.js');
322
+ const { findNodesByIds } = await import('../db/queries.js');
323
+ const { DEFAULT_MAX_SEMANTIC_DISTANCE } = await import('../search/types.js');
324
+ // Check if embeddings exist before loading the model
325
+ const embCount = countEmbeddings(db);
326
+ if (embCount === 0)
327
+ return [];
328
+ if (!isEmbedderReady())
329
+ return [];
330
+ const queryVec = await embedQuery(query);
331
+ const vecResults = searchVector(db, queryVec, k, DEFAULT_MAX_SEMANTIC_DISTANCE);
332
+ if (vecResults.length === 0)
333
+ return [];
334
+ // Build distance lookup
335
+ const distanceMap = new Map();
336
+ for (const r of vecResults) {
337
+ distanceMap.set(r.nodeId, r.distance);
338
+ }
339
+ // Batch metadata fetch
340
+ const metaNodes = findNodesByIds(db, vecResults.map(r => r.nodeId));
341
+ return metaNodes.map(node => {
342
+ const result = {
343
+ nodeId: node.id,
344
+ name: node.name,
345
+ label: node.label,
346
+ filePath: node.filePath,
347
+ distance: distanceMap.get(node.id) ?? 1,
348
+ };
349
+ if (node.startLine != null)
350
+ result.startLine = node.startLine;
351
+ if (node.endLine != null)
352
+ result.endLine = node.endLine;
353
+ return result;
354
+ });
300
355
  }
301
- // Embed query text
302
- const queryEmbedding = await embedText(query);
303
- const queryVec = embeddingToArray(queryEmbedding);
304
- const queryVecStr = `[${queryVec.join(',')}]`;
305
- // Query vector index for nearest neighbors
306
- const vectorQuery = `
307
- CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
308
- CAST(${queryVecStr} AS FLOAT[${DEFAULT_EMBEDDING_CONFIG.dimensions}]), ${k})
309
- YIELD node AS emb, distance
310
- WITH emb, distance
311
- WHERE distance < ${maxDistance}
312
- RETURN emb.nodeId AS nodeId, distance
313
- ORDER BY distance
314
- `;
315
- const embResults = await executeQuery(vectorQuery);
316
- if (embResults.length === 0) {
356
+ catch {
357
+ // Expected when embeddings are disabled — silently fall back to BM25-only
317
358
  return [];
318
359
  }
319
- // Group by label for batched metadata queries
320
- const byLabel = new Map();
321
- for (const embRow of embResults) {
322
- const nodeId = embRow.nodeId ?? embRow[0];
323
- const distance = embRow.distance ?? embRow[1];
324
- const labelEndIdx = nodeId.indexOf(':');
325
- const label = labelEndIdx > 0 ? nodeId.substring(0, labelEndIdx) : 'Unknown';
326
- if (!byLabel.has(label))
327
- byLabel.set(label, []);
328
- byLabel.get(label).push({ nodeId, distance });
329
- }
330
- // Batch-fetch node metadata per label
331
- const results = [];
332
- for (const [label, items] of byLabel) {
333
- const idList = items.map(i => `'${i.nodeId.replace(/'/g, "''")}'`).join(', ');
334
- try {
335
- let nodeQuery;
336
- if (label === 'File') {
337
- nodeQuery = `
338
- MATCH (n:File) WHERE n.id IN [${idList}]
339
- RETURN n.id AS id, n.name AS name, n.filePath AS filePath
340
- `;
341
- }
342
- else {
343
- nodeQuery = `
344
- MATCH (n:${label}) WHERE n.id IN [${idList}]
345
- RETURN n.id AS id, n.name AS name, n.filePath AS filePath,
346
- n.startLine AS startLine, n.endLine AS endLine
347
- `;
348
- }
349
- const nodeRows = await executeQuery(nodeQuery);
350
- const rowMap = new Map();
351
- for (const row of nodeRows) {
352
- const id = row.id ?? row[0];
353
- rowMap.set(id, row);
354
- }
355
- for (const item of items) {
356
- const nodeRow = rowMap.get(item.nodeId);
357
- if (nodeRow) {
358
- results.push({
359
- nodeId: item.nodeId,
360
- name: nodeRow.name ?? nodeRow[1] ?? '',
361
- label,
362
- filePath: nodeRow.filePath ?? nodeRow[2] ?? '',
363
- distance: item.distance,
364
- startLine: label !== 'File' ? (nodeRow.startLine ?? nodeRow[3]) : undefined,
365
- endLine: label !== 'File' ? (nodeRow.endLine ?? nodeRow[4]) : undefined,
366
- });
367
- }
368
- }
369
- }
370
- catch {
371
- // Table might not exist — skip
372
- }
373
- }
374
- // Re-sort by distance (batch queries may have mixed order)
375
- results.sort((a, b) => a.distance - b.distance);
376
- return results;
377
- };
378
- /**
379
- * Semantic search with flattened results (graph expansion placeholder)
380
- *
381
- * For full graph traversal, use the execute_vector_cypher tool directly
382
- *
383
- * @param executeQuery - Execute Cypher queries
384
- * @param query - Search query text
385
- * @param k - Number of semantic matches (default: 5)
386
- * @param _hops - Unused, kept for API compatibility
387
- */
388
- export const semanticSearchWithContext = async (executeQuery, query, k = 5, _hops = 1) => {
389
- // Return semantic results directly — use execute_vector_cypher for graph traversal
390
- const results = await semanticSearch(executeQuery, query, k, 0.5);
391
- return results.map(r => ({
392
- matchId: r.nodeId,
393
- matchName: r.name,
394
- matchLabel: r.label,
395
- matchPath: r.filePath,
396
- distance: r.distance,
397
- connectedId: null,
398
- connectedName: null,
399
- connectedLabel: null,
400
- relationType: null,
401
- }));
402
- };
360
+ }
@@ -5,6 +5,7 @@
5
5
  * combining node metadata with code snippets for semantic matching
6
6
  */
7
7
  import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
8
+ import { assertNever } from '../../lib/type-utils.js';
8
9
  /** Extract filename from a file path */
9
10
  const getFileName = (filePath) => {
10
11
  const parts = filePath.split('/');
@@ -110,20 +111,6 @@ const generateInterfaceText = (node, maxSnippetLength) => {
110
111
  }
111
112
  return parts.join('\n');
112
113
  };
113
- /** Generate embedding text for a File node (uses shorter snippet) */
114
- const generateFileText = (node, maxSnippetLength) => {
115
- const parts = [
116
- `File: ${node.name}`,
117
- `Path: ${node.filePath}`,
118
- ];
119
- if (node.content) {
120
- const cleanedContent = cleanContent(node.content);
121
- // Files can be very long — cap at 300 chars
122
- const snippet = truncateContent(cleanedContent, Math.min(maxSnippetLength, 300));
123
- parts.push('', snippet);
124
- }
125
- return parts.join('\n');
126
- };
127
114
  /**
128
115
  * Generate embedding text for any embeddable node (dispatches by label)
129
116
  * @param node - The node to generate text for
@@ -132,7 +119,8 @@ const generateFileText = (node, maxSnippetLength) => {
132
119
  */
133
120
  export const generateEmbeddingText = (node, config = {}) => {
134
121
  const maxSnippetLength = config.maxSnippetLength ?? DEFAULT_EMBEDDING_CONFIG.maxSnippetLength;
135
- switch (node.label) {
122
+ const label = node.label;
123
+ switch (label) {
136
124
  case 'Function':
137
125
  return generateFunctionText(node, maxSnippetLength);
138
126
  case 'Class':
@@ -141,11 +129,8 @@ export const generateEmbeddingText = (node, config = {}) => {
141
129
  return generateMethodText(node, maxSnippetLength);
142
130
  case 'Interface':
143
131
  return generateInterfaceText(node, maxSnippetLength);
144
- case 'File':
145
- return generateFileText(node, maxSnippetLength);
146
132
  default:
147
- // Fallback for any other embeddable type
148
- return `${node.label}: ${node.name}\nPath: ${node.filePath}`;
133
+ return assertNever(label, `Unknown embeddable label: ${node.label}`);
149
134
  }
150
135
  };
151
136
  /**
@@ -40,7 +40,7 @@ export interface SemanticSearchResult {
40
40
  startLine?: number;
41
41
  endLine?: number;
42
42
  }
43
- /** Minimal node data for embedding (from LadybugDB query) */
43
+ /** Minimal node data for embedding (from database query) */
44
44
  export interface EmbeddableNode {
45
45
  id: string;
46
46
  name: string;