@zuvia-software-solutions/code-mapper 1.4.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/ai-context.js +1 -1
- package/dist/cli/analyze.d.ts +1 -0
- package/dist/cli/analyze.js +73 -82
- package/dist/cli/augment.js +0 -2
- package/dist/cli/eval-server.d.ts +2 -2
- package/dist/cli/eval-server.js +6 -6
- package/dist/cli/index.js +6 -10
- package/dist/cli/mcp.d.ts +1 -3
- package/dist/cli/mcp.js +3 -3
- package/dist/cli/refresh.d.ts +2 -2
- package/dist/cli/refresh.js +24 -29
- package/dist/cli/status.js +4 -13
- package/dist/cli/tool.d.ts +5 -4
- package/dist/cli/tool.js +8 -10
- package/dist/config/ignore-service.js +14 -34
- package/dist/core/augmentation/engine.js +53 -83
- package/dist/core/db/adapter.d.ts +99 -0
- package/dist/core/db/adapter.js +402 -0
- package/dist/core/db/graph-loader.d.ts +27 -0
- package/dist/core/db/graph-loader.js +148 -0
- package/dist/core/db/queries.d.ts +160 -0
- package/dist/core/db/queries.js +441 -0
- package/dist/core/db/schema.d.ts +108 -0
- package/dist/core/db/schema.js +136 -0
- package/dist/core/embeddings/embedder.d.ts +21 -12
- package/dist/core/embeddings/embedder.js +104 -50
- package/dist/core/embeddings/embedding-pipeline.d.ts +48 -22
- package/dist/core/embeddings/embedding-pipeline.js +220 -262
- package/dist/core/embeddings/text-generator.js +4 -19
- package/dist/core/embeddings/types.d.ts +1 -1
- package/dist/core/graph/graph.d.ts +1 -1
- package/dist/core/graph/graph.js +1 -0
- package/dist/core/graph/types.d.ts +11 -9
- package/dist/core/graph/types.js +4 -1
- package/dist/core/incremental/refresh.d.ts +46 -0
- package/dist/core/incremental/refresh.js +503 -0
- package/dist/core/incremental/types.d.ts +2 -1
- package/dist/core/incremental/types.js +42 -44
- package/dist/core/ingestion/ast-cache.js +1 -0
- package/dist/core/ingestion/call-processor.d.ts +15 -3
- package/dist/core/ingestion/call-processor.js +448 -60
- package/dist/core/ingestion/cluster-enricher.d.ts +1 -1
- package/dist/core/ingestion/cluster-enricher.js +2 -0
- package/dist/core/ingestion/community-processor.d.ts +1 -1
- package/dist/core/ingestion/community-processor.js +8 -3
- package/dist/core/ingestion/export-detection.d.ts +1 -1
- package/dist/core/ingestion/export-detection.js +1 -1
- package/dist/core/ingestion/filesystem-walker.js +1 -1
- package/dist/core/ingestion/heritage-processor.d.ts +2 -2
- package/dist/core/ingestion/heritage-processor.js +22 -11
- package/dist/core/ingestion/import-processor.d.ts +2 -2
- package/dist/core/ingestion/import-processor.js +24 -9
- package/dist/core/ingestion/language-config.js +7 -4
- package/dist/core/ingestion/mro-processor.d.ts +1 -1
- package/dist/core/ingestion/mro-processor.js +23 -11
- package/dist/core/ingestion/named-binding-extraction.js +5 -5
- package/dist/core/ingestion/parsing-processor.d.ts +4 -4
- package/dist/core/ingestion/parsing-processor.js +26 -18
- package/dist/core/ingestion/pipeline.d.ts +4 -2
- package/dist/core/ingestion/pipeline.js +50 -20
- package/dist/core/ingestion/process-processor.d.ts +2 -2
- package/dist/core/ingestion/process-processor.js +28 -14
- package/dist/core/ingestion/resolution-context.d.ts +1 -1
- package/dist/core/ingestion/resolution-context.js +14 -4
- package/dist/core/ingestion/resolvers/csharp.js +4 -3
- package/dist/core/ingestion/resolvers/go.js +3 -1
- package/dist/core/ingestion/resolvers/jvm.js +13 -4
- package/dist/core/ingestion/resolvers/standard.js +2 -2
- package/dist/core/ingestion/resolvers/utils.js +6 -2
- package/dist/core/ingestion/route-stitcher.d.ts +15 -0
- package/dist/core/ingestion/route-stitcher.js +92 -0
- package/dist/core/ingestion/structure-processor.d.ts +1 -1
- package/dist/core/ingestion/structure-processor.js +3 -2
- package/dist/core/ingestion/symbol-table.d.ts +2 -0
- package/dist/core/ingestion/symbol-table.js +5 -1
- package/dist/core/ingestion/tree-sitter-queries.d.ts +2 -2
- package/dist/core/ingestion/tree-sitter-queries.js +177 -0
- package/dist/core/ingestion/type-env.js +20 -0
- package/dist/core/ingestion/type-extractors/csharp.js +4 -3
- package/dist/core/ingestion/type-extractors/go.js +23 -12
- package/dist/core/ingestion/type-extractors/php.js +18 -10
- package/dist/core/ingestion/type-extractors/ruby.js +15 -3
- package/dist/core/ingestion/type-extractors/rust.js +3 -2
- package/dist/core/ingestion/type-extractors/shared.js +3 -2
- package/dist/core/ingestion/type-extractors/typescript.js +11 -5
- package/dist/core/ingestion/utils.d.ts +27 -4
- package/dist/core/ingestion/utils.js +145 -100
- package/dist/core/ingestion/workers/parse-worker.d.ts +1 -0
- package/dist/core/ingestion/workers/parse-worker.js +97 -29
- package/dist/core/ingestion/workers/worker-pool.js +3 -0
- package/dist/core/search/bm25-index.d.ts +15 -8
- package/dist/core/search/bm25-index.js +48 -98
- package/dist/core/search/hybrid-search.d.ts +9 -3
- package/dist/core/search/hybrid-search.js +30 -25
- package/dist/core/search/reranker.js +9 -7
- package/dist/core/search/types.d.ts +0 -4
- package/dist/core/semantic/tsgo-service.d.ts +7 -1
- package/dist/core/semantic/tsgo-service.js +165 -66
- package/dist/lib/tsgo-test.d.ts +2 -0
- package/dist/lib/tsgo-test.js +6 -0
- package/dist/lib/type-utils.d.ts +25 -0
- package/dist/lib/type-utils.js +22 -0
- package/dist/lib/utils.d.ts +3 -2
- package/dist/lib/utils.js +3 -2
- package/dist/mcp/compatible-stdio-transport.js +1 -1
- package/dist/mcp/local/local-backend.d.ts +29 -56
- package/dist/mcp/local/local-backend.js +808 -1118
- package/dist/mcp/resources.js +35 -25
- package/dist/mcp/server.d.ts +1 -1
- package/dist/mcp/server.js +5 -5
- package/dist/mcp/tools.js +24 -25
- package/dist/storage/repo-manager.d.ts +2 -12
- package/dist/storage/repo-manager.js +1 -47
- package/dist/types/pipeline.d.ts +8 -5
- package/dist/types/pipeline.js +5 -0
- package/package.json +18 -11
- package/dist/cli/serve.d.ts +0 -5
- package/dist/cli/serve.js +0 -8
- package/dist/core/incremental/child-process.d.ts +0 -8
- package/dist/core/incremental/child-process.js +0 -649
- package/dist/core/incremental/refresh-coordinator.d.ts +0 -32
- package/dist/core/incremental/refresh-coordinator.js +0 -147
- package/dist/core/lbug/csv-generator.d.ts +0 -28
- package/dist/core/lbug/csv-generator.js +0 -355
- package/dist/core/lbug/lbug-adapter.d.ts +0 -96
- package/dist/core/lbug/lbug-adapter.js +0 -753
- package/dist/core/lbug/schema.d.ts +0 -46
- package/dist/core/lbug/schema.js +0 -402
- package/dist/mcp/core/embedder.d.ts +0 -24
- package/dist/mcp/core/embedder.js +0 -168
- package/dist/mcp/core/lbug-adapter.d.ts +0 -29
- package/dist/mcp/core/lbug-adapter.js +0 -330
- package/dist/server/api.d.ts +0 -5
- package/dist/server/api.js +0 -340
- package/dist/server/mcp-http.d.ts +0 -7
- package/dist/server/mcp-http.js +0 -95
- package/models/mlx-embedder.py +0 -185
|
@@ -2,43 +2,43 @@
|
|
|
2
2
|
/**
|
|
3
3
|
* @file embedding-pipeline.ts
|
|
4
4
|
* @description Orchestrates the background embedding process:
|
|
5
|
-
* 1) Query embeddable nodes from
|
|
5
|
+
* 1) Query embeddable nodes from SQLite
|
|
6
6
|
* 2) Generate text representations
|
|
7
7
|
* 3) Batch embed using transformers.js
|
|
8
|
-
* 4) Store embeddings in
|
|
9
|
-
* 5)
|
|
8
|
+
* 4) Store embeddings in SQLite
|
|
9
|
+
* 5) Vector search via brute-force cosine similarity in adapter.ts
|
|
10
10
|
*/
|
|
11
|
-
import { initEmbedder, embedBatch,
|
|
11
|
+
import { initEmbedder, embedBatch, embedQuery, embeddingToArray, isEmbedderReady } from './embedder.js';
|
|
12
12
|
import { generateEmbeddingText } from './text-generator.js';
|
|
13
13
|
import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, } from './types.js';
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
const
|
|
14
|
+
import { toNodeId } from '../db/schema.js';
|
|
15
|
+
import { createHash } from 'crypto';
|
|
16
|
+
const isDev = process.env['NODE_ENV'] === 'development';
|
|
17
|
+
/** Fast content hash for detecting unchanged embedding text */
|
|
18
|
+
function textHash(text) {
|
|
19
|
+
return createHash('md5').update(text).digest('hex');
|
|
20
|
+
}
|
|
21
|
+
/** Query all embeddable nodes from SQLite */
|
|
22
|
+
const queryEmbeddableNodes = (db) => {
|
|
17
23
|
const allNodes = [];
|
|
18
24
|
for (const label of EMBEDDABLE_LABELS) {
|
|
19
25
|
try {
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
MATCH (n:${label})
|
|
23
|
-
RETURN n.id AS id, n.name AS name, '${label}' AS label,
|
|
24
|
-
n.filePath AS filePath, n.content AS content,
|
|
25
|
-
n.startLine AS startLine, n.endLine AS endLine
|
|
26
|
-
`;
|
|
27
|
-
const rows = await executeQuery(query);
|
|
26
|
+
const rows = db.prepare(`SELECT id, name, label, filePath, content, startLine, endLine
|
|
27
|
+
FROM nodes WHERE label = ?`).all(label);
|
|
28
28
|
for (const row of rows) {
|
|
29
29
|
allNodes.push({
|
|
30
|
-
id: row.id
|
|
31
|
-
name: row.name
|
|
32
|
-
label: row.label
|
|
33
|
-
filePath: row.filePath
|
|
34
|
-
content: row.content ??
|
|
35
|
-
startLine: row.startLine ??
|
|
36
|
-
endLine: row.endLine ??
|
|
30
|
+
id: row.id,
|
|
31
|
+
name: row.name,
|
|
32
|
+
label: row.label,
|
|
33
|
+
filePath: row.filePath,
|
|
34
|
+
content: row.content ?? '',
|
|
35
|
+
startLine: row.startLine ?? 0,
|
|
36
|
+
endLine: row.endLine ?? 0,
|
|
37
37
|
});
|
|
38
38
|
}
|
|
39
39
|
}
|
|
40
40
|
catch (error) {
|
|
41
|
-
// Table might not exist or be empty
|
|
41
|
+
// Table might not exist or be empty -- continue
|
|
42
42
|
if (isDev) {
|
|
43
43
|
console.warn(`Query for ${label} nodes failed:`, error);
|
|
44
44
|
}
|
|
@@ -47,55 +47,111 @@ const queryEmbeddableNodes = async (executeQuery) => {
|
|
|
47
47
|
return allNodes;
|
|
48
48
|
};
|
|
49
49
|
/**
|
|
50
|
-
*
|
|
50
|
+
* Fetch graph context (callers, callees, community module) for a set of nodes.
|
|
51
51
|
*
|
|
52
|
-
*
|
|
53
|
-
*
|
|
52
|
+
* This enrichment adds relationship context so that embedding text like
|
|
53
|
+
* "import resolution pipeline" matches `processImports` because its caller
|
|
54
|
+
* "runPipelineFromRepo" contains "pipeline".
|
|
55
|
+
*
|
|
56
|
+
* Reusable by both the full analyze pipeline and incremental refresh.
|
|
57
|
+
*
|
|
58
|
+
* @param db - Open SQLite database instance
|
|
59
|
+
* @param nodes - Nodes to fetch context for (must have `id` field)
|
|
60
|
+
* @returns Map from node ID to graph context
|
|
54
61
|
*/
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
const
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
62
|
+
export function fetchGraphContext(db, nodes) {
|
|
63
|
+
const graphContext = new Map();
|
|
64
|
+
const totalNodes = nodes.length;
|
|
65
|
+
if (totalNodes === 0)
|
|
66
|
+
return graphContext;
|
|
67
|
+
try {
|
|
68
|
+
const ph = nodes.map(() => '?').join(',');
|
|
69
|
+
const nodeIds = nodes.map(n => n.id);
|
|
70
|
+
// Batch fetch callers
|
|
71
|
+
const callerRows = db.prepare(`
|
|
72
|
+
SELECT e.targetId AS nid, n.name AS name
|
|
73
|
+
FROM edges e JOIN nodes n ON n.id = e.sourceId
|
|
74
|
+
WHERE e.targetId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7
|
|
75
|
+
LIMIT ${totalNodes * 3}
|
|
76
|
+
`).all(...nodeIds);
|
|
77
|
+
const callerMap = new Map();
|
|
78
|
+
for (const r of callerRows) {
|
|
79
|
+
if (!callerMap.has(r.nid))
|
|
80
|
+
callerMap.set(r.nid, []);
|
|
81
|
+
callerMap.get(r.nid).push(r.name);
|
|
70
82
|
}
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
83
|
+
// Batch fetch callees
|
|
84
|
+
const calleeRows = db.prepare(`
|
|
85
|
+
SELECT e.sourceId AS nid, n.name AS name
|
|
86
|
+
FROM edges e JOIN nodes n ON n.id = e.targetId
|
|
87
|
+
WHERE e.sourceId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7
|
|
88
|
+
LIMIT ${totalNodes * 3}
|
|
89
|
+
`).all(...nodeIds);
|
|
90
|
+
const calleeMap = new Map();
|
|
91
|
+
for (const r of calleeRows) {
|
|
92
|
+
if (!calleeMap.has(r.nid))
|
|
93
|
+
calleeMap.set(r.nid, []);
|
|
94
|
+
calleeMap.get(r.nid).push(r.name);
|
|
74
95
|
}
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
96
|
+
// Batch fetch module (community membership)
|
|
97
|
+
const moduleRows = db.prepare(`
|
|
98
|
+
SELECT e.sourceId AS nid, c.heuristicLabel AS module
|
|
99
|
+
FROM edges e JOIN nodes c ON c.id = e.targetId
|
|
100
|
+
WHERE e.sourceId IN (${ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community'
|
|
101
|
+
LIMIT ${totalNodes}
|
|
102
|
+
`).all(...nodeIds);
|
|
103
|
+
const moduleMap = new Map();
|
|
104
|
+
for (const r of moduleRows) {
|
|
105
|
+
moduleMap.set(r.nid, r.module ?? '');
|
|
106
|
+
}
|
|
107
|
+
// Assemble
|
|
108
|
+
for (const node of nodes) {
|
|
109
|
+
graphContext.set(node.id, {
|
|
110
|
+
callers: (callerMap.get(node.id) || []).slice(0, 3),
|
|
111
|
+
callees: (calleeMap.get(node.id) || []).slice(0, 3),
|
|
112
|
+
module: moduleMap.get(node.id) || '',
|
|
113
|
+
});
|
|
86
114
|
}
|
|
87
115
|
}
|
|
88
|
-
}
|
|
116
|
+
catch { } // Non-fatal -- embeddings work without graph context
|
|
117
|
+
return graphContext;
|
|
118
|
+
}
|
|
119
|
+
/**
|
|
120
|
+
* Enrich embedding text with graph context (callers, callees, module).
|
|
121
|
+
*
|
|
122
|
+
* Inserts context lines (Module, Called by, Calls) after the header
|
|
123
|
+
* section of the generated text, before the code snippet.
|
|
124
|
+
*
|
|
125
|
+
* @param text - Base embedding text from generateEmbeddingText
|
|
126
|
+
* @param ctx - Graph context for this node
|
|
127
|
+
* @returns Enriched text
|
|
128
|
+
*/
|
|
129
|
+
export function enrichTextWithGraphContext(text, ctx) {
|
|
130
|
+
const parts = [];
|
|
131
|
+
if (ctx.module)
|
|
132
|
+
parts.push(`Module: ${ctx.module}`);
|
|
133
|
+
if (ctx.callers.length > 0)
|
|
134
|
+
parts.push(`Called by: ${ctx.callers.join(', ')}`);
|
|
135
|
+
if (ctx.callees.length > 0)
|
|
136
|
+
parts.push(`Calls: ${ctx.callees.join(', ')}`);
|
|
137
|
+
if (parts.length === 0)
|
|
138
|
+
return text;
|
|
139
|
+
const lines = text.split('\n');
|
|
140
|
+
const insertIdx = lines.findIndex(l => l === '') || 2;
|
|
141
|
+
lines.splice(insertIdx, 0, ...parts);
|
|
142
|
+
return lines.join('\n');
|
|
143
|
+
}
|
|
89
144
|
/**
|
|
90
|
-
* Run the full embedding pipeline (load model, embed nodes,
|
|
91
|
-
* @param
|
|
92
|
-
* @param executeWithReusedStatement - Execute with reused prepared statement
|
|
145
|
+
* Run the full embedding pipeline (load model, embed nodes, store in SQLite)
|
|
146
|
+
* @param db - Open SQLite database instance
|
|
93
147
|
* @param onProgress - Progress callback
|
|
94
148
|
* @param config - Configuration override
|
|
95
149
|
* @param skipNodeIds - Node IDs that already have embeddings (incremental mode)
|
|
96
150
|
*/
|
|
97
|
-
export
|
|
151
|
+
export async function runEmbeddingPipeline(db, onProgress, config = {}, skipNodeIds) {
|
|
98
152
|
const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
|
|
153
|
+
// Lazy import to avoid circular dependencies at module load time
|
|
154
|
+
const { insertEmbeddingsBatch } = await import('../db/adapter.js');
|
|
99
155
|
try {
|
|
100
156
|
// Phase 1: Load model
|
|
101
157
|
onProgress({
|
|
@@ -117,73 +173,22 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
|
|
|
117
173
|
modelDownloadPercent: 100,
|
|
118
174
|
});
|
|
119
175
|
if (isDev) {
|
|
120
|
-
console.log('
|
|
176
|
+
console.log('Querying embeddable nodes...');
|
|
121
177
|
}
|
|
122
178
|
// Phase 2: Query nodes
|
|
123
|
-
let nodes =
|
|
179
|
+
let nodes = queryEmbeddableNodes(db);
|
|
124
180
|
// Incremental mode: skip already-embedded nodes
|
|
125
181
|
if (skipNodeIds && skipNodeIds.size > 0) {
|
|
126
182
|
const beforeCount = nodes.length;
|
|
127
183
|
nodes = nodes.filter(n => !skipNodeIds.has(n.id));
|
|
128
184
|
if (isDev) {
|
|
129
|
-
console.log(
|
|
185
|
+
console.log(`Incremental embeddings: ${beforeCount} total, ${skipNodeIds.size} cached, ${nodes.length} to embed`);
|
|
130
186
|
}
|
|
131
187
|
}
|
|
132
188
|
const totalNodes = nodes.length;
|
|
133
189
|
// Enrich nodes with graph context (callers, callees, module) for better embeddings
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
const graphContext = new Map();
|
|
137
|
-
if (totalNodes > 0) {
|
|
138
|
-
try {
|
|
139
|
-
const nodeIds = nodes.map(n => `'${String(n.id).replace(/'/g, "''")}'`).join(', ');
|
|
140
|
-
// Batch fetch callers
|
|
141
|
-
const callerRows = await executeQuery(`
|
|
142
|
-
MATCH (caller)-[r:CodeRelation {type: 'CALLS'}]->(n) WHERE n.id IN [${nodeIds}] AND r.confidence >= 0.7
|
|
143
|
-
RETURN n.id AS nid, caller.name AS name LIMIT ${totalNodes * 3}
|
|
144
|
-
`);
|
|
145
|
-
const callerMap = new Map();
|
|
146
|
-
for (const r of callerRows) {
|
|
147
|
-
const nid = String(r.nid ?? r[0]);
|
|
148
|
-
if (!callerMap.has(nid))
|
|
149
|
-
callerMap.set(nid, []);
|
|
150
|
-
callerMap.get(nid).push(String(r.name ?? r[1]));
|
|
151
|
-
}
|
|
152
|
-
// Batch fetch callees
|
|
153
|
-
const calleeRows = await executeQuery(`
|
|
154
|
-
MATCH (n)-[r:CodeRelation {type: 'CALLS'}]->(callee) WHERE n.id IN [${nodeIds}] AND r.confidence >= 0.7
|
|
155
|
-
RETURN n.id AS nid, callee.name AS name LIMIT ${totalNodes * 3}
|
|
156
|
-
`);
|
|
157
|
-
const calleeMap = new Map();
|
|
158
|
-
for (const r of calleeRows) {
|
|
159
|
-
const nid = String(r.nid ?? r[0]);
|
|
160
|
-
if (!calleeMap.has(nid))
|
|
161
|
-
calleeMap.set(nid, []);
|
|
162
|
-
calleeMap.get(nid).push(String(r.name ?? r[1]));
|
|
163
|
-
}
|
|
164
|
-
// Batch fetch module
|
|
165
|
-
const moduleRows = await executeQuery(`
|
|
166
|
-
MATCH (n)-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community) WHERE n.id IN [${nodeIds}]
|
|
167
|
-
RETURN n.id AS nid, c.heuristicLabel AS module LIMIT ${totalNodes}
|
|
168
|
-
`);
|
|
169
|
-
const moduleMap = new Map();
|
|
170
|
-
for (const r of moduleRows) {
|
|
171
|
-
moduleMap.set(String(r.nid ?? r[0]), String(r.module ?? r[1] ?? ''));
|
|
172
|
-
}
|
|
173
|
-
// Assemble
|
|
174
|
-
for (const node of nodes) {
|
|
175
|
-
graphContext.set(node.id, {
|
|
176
|
-
callers: (callerMap.get(node.id) || []).slice(0, 3),
|
|
177
|
-
callees: (calleeMap.get(node.id) || []).slice(0, 3),
|
|
178
|
-
module: moduleMap.get(node.id) || '',
|
|
179
|
-
});
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
catch { } // Non-fatal — embeddings work without graph context
|
|
183
|
-
}
|
|
184
|
-
if (isDev) {
|
|
185
|
-
console.log(`📊 Found ${totalNodes} embeddable nodes (${graphContext.size} with graph context)`);
|
|
186
|
-
}
|
|
190
|
+
const graphContext = fetchGraphContext(db, nodes);
|
|
191
|
+
console.error(`Code Mapper: ${totalNodes} embeddable nodes, ${graphContext.size} with graph context (callers/callees/module)`);
|
|
187
192
|
if (totalNodes === 0) {
|
|
188
193
|
onProgress({
|
|
189
194
|
phase: 'ready',
|
|
@@ -194,8 +199,6 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
|
|
|
194
199
|
return;
|
|
195
200
|
}
|
|
196
201
|
// Phase 3: Batch embed
|
|
197
|
-
const batchSize = finalConfig.batchSize;
|
|
198
|
-
const totalBatches = Math.ceil(totalNodes / batchSize);
|
|
199
202
|
let processedNodes = 0;
|
|
200
203
|
onProgress({
|
|
201
204
|
phase: 'embedding',
|
|
@@ -203,46 +206,62 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
|
|
|
203
206
|
nodesProcessed: 0,
|
|
204
207
|
totalNodes,
|
|
205
208
|
});
|
|
206
|
-
// Generate
|
|
209
|
+
// Generate text representations with graph context enrichment
|
|
207
210
|
const allTexts = nodes.map(node => {
|
|
208
|
-
const ctx = graphContext.get(node.id);
|
|
209
211
|
let text = generateEmbeddingText(node, finalConfig);
|
|
212
|
+
const ctx = graphContext.get(node.id);
|
|
210
213
|
if (ctx) {
|
|
211
|
-
|
|
212
|
-
if (ctx.module)
|
|
213
|
-
parts.push(`Module: ${ctx.module}`);
|
|
214
|
-
if (ctx.callers.length > 0)
|
|
215
|
-
parts.push(`Called by: ${ctx.callers.join(', ')}`);
|
|
216
|
-
if (ctx.callees.length > 0)
|
|
217
|
-
parts.push(`Calls: ${ctx.callees.join(', ')}`);
|
|
218
|
-
if (parts.length > 0) {
|
|
219
|
-
const lines = text.split('\n');
|
|
220
|
-
const insertIdx = lines.findIndex(l => l === '') || 2;
|
|
221
|
-
lines.splice(insertIdx, 0, ...parts);
|
|
222
|
-
text = lines.join('\n');
|
|
223
|
-
}
|
|
214
|
+
text = enrichTextWithGraphContext(text, ctx);
|
|
224
215
|
}
|
|
225
216
|
return text;
|
|
226
217
|
});
|
|
227
|
-
//
|
|
228
|
-
|
|
229
|
-
const
|
|
218
|
+
// Hash-based skip: compare text hashes to skip unchanged nodes
|
|
219
|
+
const { getEmbeddingHashes } = await import('../db/adapter.js');
|
|
220
|
+
const existingHashes = getEmbeddingHashes(db);
|
|
221
|
+
const hashes = allTexts.map(t => textHash(t));
|
|
222
|
+
const toEmbed = [];
|
|
223
|
+
const skipped = [];
|
|
224
|
+
for (let i = 0; i < nodes.length; i++) {
|
|
225
|
+
const node = nodes[i];
|
|
226
|
+
const hash = hashes[i];
|
|
227
|
+
const existing = existingHashes.get(node.id);
|
|
228
|
+
if (existing === hash) {
|
|
229
|
+
skipped.push({ index: i, hash });
|
|
230
|
+
}
|
|
231
|
+
else {
|
|
232
|
+
toEmbed.push({ index: i, text: allTexts[i], hash });
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
console.error(`Code Mapper: ${toEmbed.length} nodes to embed, ${skipped.length} unchanged (hash skip)`);
|
|
236
|
+
// Embed only changed nodes
|
|
237
|
+
let embeddingResults = [];
|
|
238
|
+
if (toEmbed.length > 0) {
|
|
239
|
+
const t0Embed = Date.now();
|
|
240
|
+
embeddingResults = await embedBatch(toEmbed.map(e => e.text));
|
|
241
|
+
console.error(`Code Mapper: MLX embedded ${embeddingResults.length} texts in ${Date.now() - t0Embed}ms`);
|
|
242
|
+
}
|
|
230
243
|
onProgress({
|
|
231
244
|
phase: 'embedding',
|
|
232
245
|
percent: 85,
|
|
233
246
|
nodesProcessed: totalNodes,
|
|
234
247
|
totalNodes,
|
|
235
248
|
});
|
|
236
|
-
// Insert
|
|
249
|
+
// Insert embeddings with hashes into SQLite in batches
|
|
237
250
|
const DB_BATCH = 200;
|
|
238
|
-
|
|
239
|
-
const
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
251
|
+
const allUpdates = toEmbed.map((entry, j) => {
|
|
252
|
+
const emb = embeddingResults[j];
|
|
253
|
+
if (!emb)
|
|
254
|
+
throw new Error(`Missing embedding at index ${j}`);
|
|
255
|
+
const node = nodes[entry.index];
|
|
256
|
+
return {
|
|
257
|
+
nodeId: toNodeId(node.id),
|
|
258
|
+
embedding: embeddingToArray(emb),
|
|
259
|
+
textHash: entry.hash,
|
|
260
|
+
};
|
|
261
|
+
});
|
|
262
|
+
for (let i = 0; i < allUpdates.length; i += DB_BATCH) {
|
|
263
|
+
const batch = allUpdates.slice(i, i + DB_BATCH);
|
|
264
|
+
insertEmbeddingsBatch(db, batch);
|
|
246
265
|
processedNodes = Math.min(i + DB_BATCH, nodes.length);
|
|
247
266
|
onProgress({
|
|
248
267
|
phase: 'embedding',
|
|
@@ -251,17 +270,14 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
|
|
|
251
270
|
totalNodes,
|
|
252
271
|
});
|
|
253
272
|
}
|
|
254
|
-
// Phase 4:
|
|
273
|
+
// Phase 4: No HNSW index needed -- SQLite uses brute-force cosine similarity
|
|
274
|
+
// which is fast enough for <200K vectors at 256 dims
|
|
255
275
|
onProgress({
|
|
256
276
|
phase: 'indexing',
|
|
257
277
|
percent: 90,
|
|
258
278
|
nodesProcessed: totalNodes,
|
|
259
279
|
totalNodes,
|
|
260
280
|
});
|
|
261
|
-
if (isDev) {
|
|
262
|
-
console.log('📇 Creating vector index...');
|
|
263
|
-
}
|
|
264
|
-
await createVectorIndex(executeQuery);
|
|
265
281
|
// Done
|
|
266
282
|
onProgress({
|
|
267
283
|
phase: 'ready',
|
|
@@ -269,14 +285,12 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
|
|
|
269
285
|
nodesProcessed: totalNodes,
|
|
270
286
|
totalNodes,
|
|
271
287
|
});
|
|
272
|
-
|
|
273
|
-
console.log('✅ Embedding pipeline complete!');
|
|
274
|
-
}
|
|
288
|
+
console.error(`Code Mapper: Embedding pipeline complete (${totalNodes} nodes stored)`);
|
|
275
289
|
}
|
|
276
290
|
catch (error) {
|
|
277
291
|
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
|
278
292
|
if (isDev) {
|
|
279
|
-
console.error('
|
|
293
|
+
console.error('Embedding pipeline error:', error);
|
|
280
294
|
}
|
|
281
295
|
onProgress({
|
|
282
296
|
phase: 'error',
|
|
@@ -285,118 +299,62 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
|
|
|
285
299
|
});
|
|
286
300
|
throw error;
|
|
287
301
|
}
|
|
288
|
-
}
|
|
302
|
+
}
|
|
303
|
+
// ---------------------------------------------------------------------------
|
|
304
|
+
// SQLite-backed semantic search (for api.ts and hybrid-search.ts consumers)
|
|
305
|
+
// ---------------------------------------------------------------------------
|
|
289
306
|
/**
|
|
290
|
-
*
|
|
291
|
-
*
|
|
307
|
+
* Semantic vector search against a SQLite database.
|
|
308
|
+
*
|
|
309
|
+
* Uses brute-force cosine similarity via adapter.searchVector, then
|
|
310
|
+
* enriches results with node metadata. This mirrors the pattern in
|
|
311
|
+
* local-backend.ts but as a standalone function for hybrid search.
|
|
312
|
+
*
|
|
313
|
+
* @param db - Open SQLite database instance
|
|
292
314
|
* @param query - Search query text
|
|
293
315
|
* @param k - Number of results (default: 10)
|
|
294
|
-
* @param maxDistance - Maximum cosine distance threshold (default:
|
|
295
|
-
* @returns Search results ordered by relevance
|
|
316
|
+
* @param maxDistance - Maximum cosine distance threshold (default: from types.ts)
|
|
296
317
|
*/
|
|
297
|
-
export
|
|
298
|
-
|
|
299
|
-
|
|
318
|
+
export async function semanticSearchSqlite(db, query, k = 10) {
|
|
319
|
+
try {
|
|
320
|
+
// Lazy imports to avoid loading heavy model code at module init
|
|
321
|
+
const { searchVector, countEmbeddings } = await import('../db/adapter.js');
|
|
322
|
+
const { findNodesByIds } = await import('../db/queries.js');
|
|
323
|
+
const { DEFAULT_MAX_SEMANTIC_DISTANCE } = await import('../search/types.js');
|
|
324
|
+
// Check if embeddings exist before loading the model
|
|
325
|
+
const embCount = countEmbeddings(db);
|
|
326
|
+
if (embCount === 0)
|
|
327
|
+
return [];
|
|
328
|
+
if (!isEmbedderReady())
|
|
329
|
+
return [];
|
|
330
|
+
const queryVec = await embedQuery(query);
|
|
331
|
+
const vecResults = searchVector(db, queryVec, k, DEFAULT_MAX_SEMANTIC_DISTANCE);
|
|
332
|
+
if (vecResults.length === 0)
|
|
333
|
+
return [];
|
|
334
|
+
// Build distance lookup
|
|
335
|
+
const distanceMap = new Map();
|
|
336
|
+
for (const r of vecResults) {
|
|
337
|
+
distanceMap.set(r.nodeId, r.distance);
|
|
338
|
+
}
|
|
339
|
+
// Batch metadata fetch
|
|
340
|
+
const metaNodes = findNodesByIds(db, vecResults.map(r => r.nodeId));
|
|
341
|
+
return metaNodes.map(node => {
|
|
342
|
+
const result = {
|
|
343
|
+
nodeId: node.id,
|
|
344
|
+
name: node.name,
|
|
345
|
+
label: node.label,
|
|
346
|
+
filePath: node.filePath,
|
|
347
|
+
distance: distanceMap.get(node.id) ?? 1,
|
|
348
|
+
};
|
|
349
|
+
if (node.startLine != null)
|
|
350
|
+
result.startLine = node.startLine;
|
|
351
|
+
if (node.endLine != null)
|
|
352
|
+
result.endLine = node.endLine;
|
|
353
|
+
return result;
|
|
354
|
+
});
|
|
300
355
|
}
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
const queryVec = embeddingToArray(queryEmbedding);
|
|
304
|
-
const queryVecStr = `[${queryVec.join(',')}]`;
|
|
305
|
-
// Query vector index for nearest neighbors
|
|
306
|
-
const vectorQuery = `
|
|
307
|
-
CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
|
|
308
|
-
CAST(${queryVecStr} AS FLOAT[${DEFAULT_EMBEDDING_CONFIG.dimensions}]), ${k})
|
|
309
|
-
YIELD node AS emb, distance
|
|
310
|
-
WITH emb, distance
|
|
311
|
-
WHERE distance < ${maxDistance}
|
|
312
|
-
RETURN emb.nodeId AS nodeId, distance
|
|
313
|
-
ORDER BY distance
|
|
314
|
-
`;
|
|
315
|
-
const embResults = await executeQuery(vectorQuery);
|
|
316
|
-
if (embResults.length === 0) {
|
|
356
|
+
catch {
|
|
357
|
+
// Expected when embeddings are disabled — silently fall back to BM25-only
|
|
317
358
|
return [];
|
|
318
359
|
}
|
|
319
|
-
|
|
320
|
-
const byLabel = new Map();
|
|
321
|
-
for (const embRow of embResults) {
|
|
322
|
-
const nodeId = embRow.nodeId ?? embRow[0];
|
|
323
|
-
const distance = embRow.distance ?? embRow[1];
|
|
324
|
-
const labelEndIdx = nodeId.indexOf(':');
|
|
325
|
-
const label = labelEndIdx > 0 ? nodeId.substring(0, labelEndIdx) : 'Unknown';
|
|
326
|
-
if (!byLabel.has(label))
|
|
327
|
-
byLabel.set(label, []);
|
|
328
|
-
byLabel.get(label).push({ nodeId, distance });
|
|
329
|
-
}
|
|
330
|
-
// Batch-fetch node metadata per label
|
|
331
|
-
const results = [];
|
|
332
|
-
for (const [label, items] of byLabel) {
|
|
333
|
-
const idList = items.map(i => `'${i.nodeId.replace(/'/g, "''")}'`).join(', ');
|
|
334
|
-
try {
|
|
335
|
-
let nodeQuery;
|
|
336
|
-
if (label === 'File') {
|
|
337
|
-
nodeQuery = `
|
|
338
|
-
MATCH (n:File) WHERE n.id IN [${idList}]
|
|
339
|
-
RETURN n.id AS id, n.name AS name, n.filePath AS filePath
|
|
340
|
-
`;
|
|
341
|
-
}
|
|
342
|
-
else {
|
|
343
|
-
nodeQuery = `
|
|
344
|
-
MATCH (n:${label}) WHERE n.id IN [${idList}]
|
|
345
|
-
RETURN n.id AS id, n.name AS name, n.filePath AS filePath,
|
|
346
|
-
n.startLine AS startLine, n.endLine AS endLine
|
|
347
|
-
`;
|
|
348
|
-
}
|
|
349
|
-
const nodeRows = await executeQuery(nodeQuery);
|
|
350
|
-
const rowMap = new Map();
|
|
351
|
-
for (const row of nodeRows) {
|
|
352
|
-
const id = row.id ?? row[0];
|
|
353
|
-
rowMap.set(id, row);
|
|
354
|
-
}
|
|
355
|
-
for (const item of items) {
|
|
356
|
-
const nodeRow = rowMap.get(item.nodeId);
|
|
357
|
-
if (nodeRow) {
|
|
358
|
-
results.push({
|
|
359
|
-
nodeId: item.nodeId,
|
|
360
|
-
name: nodeRow.name ?? nodeRow[1] ?? '',
|
|
361
|
-
label,
|
|
362
|
-
filePath: nodeRow.filePath ?? nodeRow[2] ?? '',
|
|
363
|
-
distance: item.distance,
|
|
364
|
-
startLine: label !== 'File' ? (nodeRow.startLine ?? nodeRow[3]) : undefined,
|
|
365
|
-
endLine: label !== 'File' ? (nodeRow.endLine ?? nodeRow[4]) : undefined,
|
|
366
|
-
});
|
|
367
|
-
}
|
|
368
|
-
}
|
|
369
|
-
}
|
|
370
|
-
catch {
|
|
371
|
-
// Table might not exist — skip
|
|
372
|
-
}
|
|
373
|
-
}
|
|
374
|
-
// Re-sort by distance (batch queries may have mixed order)
|
|
375
|
-
results.sort((a, b) => a.distance - b.distance);
|
|
376
|
-
return results;
|
|
377
|
-
};
|
|
378
|
-
/**
|
|
379
|
-
* Semantic search with flattened results (graph expansion placeholder)
|
|
380
|
-
*
|
|
381
|
-
* For full graph traversal, use the execute_vector_cypher tool directly
|
|
382
|
-
*
|
|
383
|
-
* @param executeQuery - Execute Cypher queries
|
|
384
|
-
* @param query - Search query text
|
|
385
|
-
* @param k - Number of semantic matches (default: 5)
|
|
386
|
-
* @param _hops - Unused, kept for API compatibility
|
|
387
|
-
*/
|
|
388
|
-
export const semanticSearchWithContext = async (executeQuery, query, k = 5, _hops = 1) => {
|
|
389
|
-
// Return semantic results directly — use execute_vector_cypher for graph traversal
|
|
390
|
-
const results = await semanticSearch(executeQuery, query, k, 0.5);
|
|
391
|
-
return results.map(r => ({
|
|
392
|
-
matchId: r.nodeId,
|
|
393
|
-
matchName: r.name,
|
|
394
|
-
matchLabel: r.label,
|
|
395
|
-
matchPath: r.filePath,
|
|
396
|
-
distance: r.distance,
|
|
397
|
-
connectedId: null,
|
|
398
|
-
connectedName: null,
|
|
399
|
-
connectedLabel: null,
|
|
400
|
-
relationType: null,
|
|
401
|
-
}));
|
|
402
|
-
};
|
|
360
|
+
}
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
* combining node metadata with code snippets for semantic matching
|
|
6
6
|
*/
|
|
7
7
|
import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
|
|
8
|
+
import { assertNever } from '../../lib/type-utils.js';
|
|
8
9
|
/** Extract filename from a file path */
|
|
9
10
|
const getFileName = (filePath) => {
|
|
10
11
|
const parts = filePath.split('/');
|
|
@@ -110,20 +111,6 @@ const generateInterfaceText = (node, maxSnippetLength) => {
|
|
|
110
111
|
}
|
|
111
112
|
return parts.join('\n');
|
|
112
113
|
};
|
|
113
|
-
/** Generate embedding text for a File node (uses shorter snippet) */
|
|
114
|
-
const generateFileText = (node, maxSnippetLength) => {
|
|
115
|
-
const parts = [
|
|
116
|
-
`File: ${node.name}`,
|
|
117
|
-
`Path: ${node.filePath}`,
|
|
118
|
-
];
|
|
119
|
-
if (node.content) {
|
|
120
|
-
const cleanedContent = cleanContent(node.content);
|
|
121
|
-
// Files can be very long — cap at 300 chars
|
|
122
|
-
const snippet = truncateContent(cleanedContent, Math.min(maxSnippetLength, 300));
|
|
123
|
-
parts.push('', snippet);
|
|
124
|
-
}
|
|
125
|
-
return parts.join('\n');
|
|
126
|
-
};
|
|
127
114
|
/**
|
|
128
115
|
* Generate embedding text for any embeddable node (dispatches by label)
|
|
129
116
|
* @param node - The node to generate text for
|
|
@@ -132,7 +119,8 @@ const generateFileText = (node, maxSnippetLength) => {
|
|
|
132
119
|
*/
|
|
133
120
|
export const generateEmbeddingText = (node, config = {}) => {
|
|
134
121
|
const maxSnippetLength = config.maxSnippetLength ?? DEFAULT_EMBEDDING_CONFIG.maxSnippetLength;
|
|
135
|
-
|
|
122
|
+
const label = node.label;
|
|
123
|
+
switch (label) {
|
|
136
124
|
case 'Function':
|
|
137
125
|
return generateFunctionText(node, maxSnippetLength);
|
|
138
126
|
case 'Class':
|
|
@@ -141,11 +129,8 @@ export const generateEmbeddingText = (node, config = {}) => {
|
|
|
141
129
|
return generateMethodText(node, maxSnippetLength);
|
|
142
130
|
case 'Interface':
|
|
143
131
|
return generateInterfaceText(node, maxSnippetLength);
|
|
144
|
-
case 'File':
|
|
145
|
-
return generateFileText(node, maxSnippetLength);
|
|
146
132
|
default:
|
|
147
|
-
|
|
148
|
-
return `${node.label}: ${node.name}\nPath: ${node.filePath}`;
|
|
133
|
+
return assertNever(label, `Unknown embeddable label: ${node.label}`);
|
|
149
134
|
}
|
|
150
135
|
};
|
|
151
136
|
/**
|
|
@@ -40,7 +40,7 @@ export interface SemanticSearchResult {
|
|
|
40
40
|
startLine?: number;
|
|
41
41
|
endLine?: number;
|
|
42
42
|
}
|
|
43
|
-
/** Minimal node data for embedding (from
|
|
43
|
+
/** Minimal node data for embedding (from database query) */
|
|
44
44
|
export interface EmbeddableNode {
|
|
45
45
|
id: string;
|
|
46
46
|
name: string;
|