@zuvia-software-solutions/code-mapper 2.4.0 → 2.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/analyze.d.ts +1 -0
- package/dist/cli/analyze.js +16 -0
- package/dist/cli/index.js +2 -2
- package/dist/core/db/adapter.d.ts +9 -0
- package/dist/core/db/adapter.js +40 -0
- package/dist/core/db/schema.d.ts +9 -1
- package/dist/core/db/schema.js +11 -0
- package/dist/core/embeddings/nl-embedder.d.ts +44 -0
- package/dist/core/embeddings/nl-embedder.js +262 -0
- package/dist/mcp/local/local-backend.d.ts +11 -0
- package/dist/mcp/local/local-backend.js +241 -22
- package/package.json +1 -1
package/dist/cli/analyze.d.ts
CHANGED
package/dist/cli/analyze.js
CHANGED
|
@@ -428,6 +428,22 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
428
428
|
// Reopen DB after Python is done
|
|
429
429
|
db = openDb(dbPath);
|
|
430
430
|
}
|
|
431
|
+
// Phase 4b: NL Embeddings (bge-small, CPU, Node.js)
|
|
432
|
+
if (options?.nlEmbeddings) {
|
|
433
|
+
recordPhase('nl-embeddings');
|
|
434
|
+
updateBar(95, 'Generating NL embeddings (bge-small)...');
|
|
435
|
+
const { buildNlEmbeddings } = await import('../core/embeddings/nl-embedder.js');
|
|
436
|
+
try {
|
|
437
|
+
const nlResult = await buildNlEmbeddings(db, (current, total) => {
|
|
438
|
+
const pct = 95 + Math.round((current / Math.max(total, 1)) * 3);
|
|
439
|
+
updateBar(pct, `NL embeddings (${current}/${total})`, 'NL embeddings');
|
|
440
|
+
});
|
|
441
|
+
updateBar(98, `NL embeddings: ${nlResult.embedded} embedded, ${nlResult.skipped} cached (${(nlResult.durationMs / 1000).toFixed(1)}s)`);
|
|
442
|
+
}
|
|
443
|
+
catch (err) {
|
|
444
|
+
console.error(`\n Warning: NL embeddings failed: ${err instanceof Error ? err.message : err}`);
|
|
445
|
+
}
|
|
446
|
+
}
|
|
431
447
|
// Phase 5: Finalize (98-100%)
|
|
432
448
|
recordPhase('finalize');
|
|
433
449
|
updateBar(98, 'Saving metadata...');
|
package/dist/cli/index.js
CHANGED
|
@@ -22,8 +22,8 @@ program
|
|
|
22
22
|
.command('analyze [path]')
|
|
23
23
|
.description('Index a repository (full analysis)')
|
|
24
24
|
.option('-f, --force', 'Force full re-index even if up to date')
|
|
25
|
-
.option('--embeddings', 'Enable embedding generation
|
|
26
|
-
.option('--
|
|
25
|
+
.option('--embeddings', 'Enable code embedding generation (Jina/MLX, GPU)', false)
|
|
26
|
+
.option('--nl-embeddings', 'Enable NL embedding generation (bge-small, CPU, recommended)', false)
|
|
27
27
|
.option('--no-tsgo', 'Skip tsgo LSP for call resolution (faster, less accurate)')
|
|
28
28
|
.option('-v, --verbose', 'Enable verbose ingestion warnings (default: false)')
|
|
29
29
|
.addHelpText('after', '\nEnvironment variables:\n CODE_MAPPER_NO_GITIGNORE=1 Skip .gitignore parsing (still reads .code-mapperignore)')
|
|
@@ -138,6 +138,15 @@ export declare function searchFileWords(db: Database.Database, query: string, li
|
|
|
138
138
|
}>;
|
|
139
139
|
/** Clear all file words (used before full rebuild) */
|
|
140
140
|
export declare function clearFileWords(db: Database.Database): void;
|
|
141
|
+
/** Count NL embeddings in the index */
|
|
142
|
+
export declare function countNlEmbeddings(db: Database.Database): number;
|
|
143
|
+
/** Search NL embeddings via brute-force cosine similarity */
|
|
144
|
+
export declare function searchNlVector(db: Database.Database, queryVec: number[], limit?: number, maxDistance?: number): Array<{
|
|
145
|
+
nodeId: string;
|
|
146
|
+
distance: number;
|
|
147
|
+
source: string;
|
|
148
|
+
text: string;
|
|
149
|
+
}>;
|
|
141
150
|
/** Escape a string for use in SQL single-quoted literals. */
|
|
142
151
|
export declare function escapeSql(value: string): string;
|
|
143
152
|
/** Execute a raw SQL query and return rows. */
|
package/dist/core/db/adapter.js
CHANGED
|
@@ -594,6 +594,46 @@ export function clearFileWords(db) {
|
|
|
594
594
|
db.prepare('DELETE FROM file_words').run();
|
|
595
595
|
}
|
|
596
596
|
// ---------------------------------------------------------------------------
|
|
597
|
+
// NL Embeddings
|
|
598
|
+
// ---------------------------------------------------------------------------
|
|
599
|
+
/** Count NL embeddings in the index */
|
|
600
|
+
export function countNlEmbeddings(db) {
|
|
601
|
+
try {
|
|
602
|
+
const row = db.prepare('SELECT COUNT(*) as cnt FROM nl_embeddings').get();
|
|
603
|
+
return row?.cnt ?? 0;
|
|
604
|
+
}
|
|
605
|
+
catch {
|
|
606
|
+
return 0;
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
/** Search NL embeddings via brute-force cosine similarity */
|
|
610
|
+
export function searchNlVector(db, queryVec, limit = 10, maxDistance = 0.5) {
|
|
611
|
+
let rows;
|
|
612
|
+
try {
|
|
613
|
+
rows = db.prepare('SELECT nodeId, embedding, source, text FROM nl_embeddings').all();
|
|
614
|
+
}
|
|
615
|
+
catch {
|
|
616
|
+
return [];
|
|
617
|
+
}
|
|
618
|
+
const results = [];
|
|
619
|
+
for (const row of rows) {
|
|
620
|
+
const vec = new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.byteLength / 4);
|
|
621
|
+
let dot = 0, normA = 0, normB = 0;
|
|
622
|
+
for (let i = 0; i < queryVec.length && i < vec.length; i++) {
|
|
623
|
+
dot += queryVec[i] * vec[i];
|
|
624
|
+
normA += queryVec[i] * queryVec[i];
|
|
625
|
+
normB += vec[i] * vec[i];
|
|
626
|
+
}
|
|
627
|
+
const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
|
|
628
|
+
const distance = 1 - similarity;
|
|
629
|
+
if (distance < maxDistance) {
|
|
630
|
+
results.push({ nodeId: row.nodeId, distance, source: row.source, text: row.text });
|
|
631
|
+
}
|
|
632
|
+
}
|
|
633
|
+
results.sort((a, b) => a.distance - b.distance);
|
|
634
|
+
return results.slice(0, limit);
|
|
635
|
+
}
|
|
636
|
+
// ---------------------------------------------------------------------------
|
|
597
637
|
// Raw SQL escape (for dynamic queries in local-backend.ts)
|
|
598
638
|
// ---------------------------------------------------------------------------
|
|
599
639
|
/** Escape a string for use in SQL single-quoted literals. */
|
package/dist/core/db/schema.d.ts
CHANGED
|
@@ -68,6 +68,14 @@ export interface EmbeddingRow {
|
|
|
68
68
|
readonly embedding: Buffer;
|
|
69
69
|
readonly textHash: string | null;
|
|
70
70
|
}
|
|
71
|
+
/** A NL embedding row as stored in the `nl_embeddings` table */
|
|
72
|
+
export interface NlEmbeddingRow {
|
|
73
|
+
readonly nodeId: NodeId;
|
|
74
|
+
readonly embedding: Buffer;
|
|
75
|
+
readonly textHash: string | null;
|
|
76
|
+
readonly source: string;
|
|
77
|
+
readonly text: string;
|
|
78
|
+
}
|
|
71
79
|
/** A reference occurrence as stored in the `refs` table */
|
|
72
80
|
export interface RefsRow {
|
|
73
81
|
readonly symbol: string;
|
|
@@ -118,4 +126,4 @@ export interface EdgeInsert {
|
|
|
118
126
|
}
|
|
119
127
|
/** Legacy edge table name constant (kept for compatibility) */
|
|
120
128
|
export declare const REL_TABLE_NAME = "CodeRelation";
|
|
121
|
-
export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT '',\n searchText TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n searchText,\n 
filePath,\n content,\n content='nodes',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\n\n-- Refs: identifier occurrence index (pre-computed grep)\nCREATE TABLE IF NOT EXISTS refs (\n symbol TEXT NOT NULL,\n filePath TEXT NOT NULL,\n line INTEGER NOT NULL\n);\n\nCREATE INDEX IF NOT EXISTS idx_refs_symbol ON refs(symbol);\nCREATE INDEX IF NOT EXISTS idx_refs_filePath ON refs(filePath);\nCREATE INDEX IF NOT EXISTS idx_refs_symbol_file ON refs(symbol, filePath);\n\n-- File-level word index for conceptual search\nCREATE TABLE IF NOT EXISTS file_words (\n filePath TEXT PRIMARY KEY,\n words TEXT NOT NULL DEFAULT ''\n);\n\nCREATE VIRTUAL TABLE IF NOT EXISTS file_words_fts USING fts5(\n words,\n content='file_words',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ai AFTER INSERT ON file_words BEGIN\n INSERT INTO file_words_fts(rowid, words) VALUES (new.rowid, new.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ad AFTER DELETE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, 
rowid, words) VALUES ('delete', old.rowid, old.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_au AFTER UPDATE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES (new.rowid, new.words);\nEND;\n";
|
|
129
|
+
export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT '',\n searchText TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- NL Embeddings: natural language description vectors (bge-small, 384-dim)\nCREATE TABLE IF NOT EXISTS nl_embeddings (\n nodeId TEXT NOT NULL,\n 
embedding BLOB NOT NULL,\n textHash TEXT,\n source TEXT NOT NULL DEFAULT 'comment',\n text TEXT NOT NULL DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nl_emb_nodeId ON nl_embeddings(nodeId);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n searchText,\n filePath,\n content,\n content='nodes',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\n\n-- Refs: identifier occurrence index (pre-computed grep)\nCREATE TABLE IF NOT EXISTS refs (\n symbol TEXT NOT NULL,\n filePath TEXT NOT NULL,\n line INTEGER NOT NULL\n);\n\nCREATE INDEX IF NOT EXISTS idx_refs_symbol ON refs(symbol);\nCREATE INDEX IF NOT EXISTS idx_refs_filePath ON refs(filePath);\nCREATE INDEX IF NOT EXISTS idx_refs_symbol_file ON refs(symbol, filePath);\n\n-- File-level word index for conceptual search\nCREATE TABLE IF NOT EXISTS file_words (\n filePath TEXT PRIMARY KEY,\n words TEXT NOT NULL DEFAULT ''\n);\n\nCREATE VIRTUAL TABLE IF NOT EXISTS file_words_fts USING fts5(\n 
words,\n content='file_words',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ai AFTER INSERT ON file_words BEGIN\n INSERT INTO file_words_fts(rowid, words) VALUES (new.rowid, new.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ad AFTER DELETE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_au AFTER UPDATE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES (new.rowid, new.words);\nEND;\n";
|
package/dist/core/db/schema.js
CHANGED
|
@@ -114,6 +114,17 @@ CREATE TABLE IF NOT EXISTS embeddings (
|
|
|
114
114
|
textHash TEXT
|
|
115
115
|
);
|
|
116
116
|
|
|
117
|
+
-- NL Embeddings: natural language description vectors (bge-small, 384-dim)
|
|
118
|
+
CREATE TABLE IF NOT EXISTS nl_embeddings (
|
|
119
|
+
nodeId TEXT NOT NULL,
|
|
120
|
+
embedding BLOB NOT NULL,
|
|
121
|
+
textHash TEXT,
|
|
122
|
+
source TEXT NOT NULL DEFAULT 'comment',
|
|
123
|
+
text TEXT NOT NULL DEFAULT ''
|
|
124
|
+
);
|
|
125
|
+
|
|
126
|
+
CREATE INDEX IF NOT EXISTS idx_nl_emb_nodeId ON nl_embeddings(nodeId);
|
|
127
|
+
|
|
117
128
|
-- FTS5 virtual table (auto-updated via triggers)
|
|
118
129
|
CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(
|
|
119
130
|
name,
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file Natural language embedder using bge-small-en-v1.5.
|
|
3
|
+
*
|
|
4
|
+
* Runs entirely in Node.js via @huggingface/transformers — no Python, no GPU.
|
|
5
|
+
* Embeds human-readable descriptions extracted from code (JSDoc comments,
|
|
6
|
+
* enum values, type patterns, file headers) for conceptual search.
|
|
7
|
+
*
|
|
8
|
+
* 33M params, q8 quantized, 384-dim embeddings, ~6ms/text on CPU.
|
|
9
|
+
*/
|
|
10
|
+
import type Database from 'better-sqlite3';
|
|
11
|
+
/** Initialize the NL embedding model (lazy, idempotent) */
|
|
12
|
+
export declare function initNlEmbedder(): Promise<void>;
|
|
13
|
+
/** Check if the NL embedder is ready */
|
|
14
|
+
export declare function isNlEmbedderReady(): boolean;
|
|
15
|
+
/** Embed a single text; resolves to the embedding as a number[] */
|
|
16
|
+
export declare function nlEmbed(text: string): Promise<number[]>;
|
|
17
|
+
/** Embed a batch of texts */
|
|
18
|
+
export declare function nlEmbedBatch(texts: string[]): Promise<number[][]>;
|
|
19
|
+
interface NodeForNl {
|
|
20
|
+
id: string;
|
|
21
|
+
name: string;
|
|
22
|
+
label: string;
|
|
23
|
+
filePath: string;
|
|
24
|
+
content: string;
|
|
25
|
+
startLine: number | null;
|
|
26
|
+
description: string;
|
|
27
|
+
}
|
|
28
|
+
interface NlDocument {
|
|
29
|
+
nodeId: string;
|
|
30
|
+
source: string;
|
|
31
|
+
text: string;
|
|
32
|
+
}
|
|
33
|
+
/** Build NL documents from a node */
|
|
34
|
+
export declare function extractNlTexts(node: NodeForNl): NlDocument[];
|
|
35
|
+
/**
|
|
36
|
+
* Build NL embeddings for all eligible nodes in the database.
|
|
37
|
+
* Reads nodes, extracts NL text, embeds with bge-small, writes to nl_embeddings.
|
|
38
|
+
*/
|
|
39
|
+
export declare function buildNlEmbeddings(db: Database.Database, onProgress?: (current: number, total: number) => void): Promise<{
|
|
40
|
+
embedded: number;
|
|
41
|
+
skipped: number;
|
|
42
|
+
durationMs: number;
|
|
43
|
+
}>;
|
|
44
|
+
export {};
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
// code-mapper/src/core/embeddings/nl-embedder.ts
|
|
2
|
+
/**
|
|
3
|
+
* @file Natural language embedder using bge-small-en-v1.5.
|
|
4
|
+
*
|
|
5
|
+
* Runs entirely in Node.js via @huggingface/transformers — no Python, no GPU.
|
|
6
|
+
* Embeds human-readable descriptions extracted from code (JSDoc comments,
|
|
7
|
+
* enum values, type patterns, file headers) for conceptual search.
|
|
8
|
+
*
|
|
9
|
+
* 33M params, q8 quantized, 384-dim embeddings, ~6ms/text on CPU.
|
|
10
|
+
*/
|
|
11
|
+
// NL embedder — no schema imports needed
|
|
12
|
+
const MODEL_ID = 'Xenova/bge-small-en-v1.5';
|
|
13
|
+
// Lazy-loaded pipeline
|
|
14
|
+
let extractor = null;
|
|
15
|
+
let loadPromise = null;
|
|
16
|
+
/** Initialize the NL embedding model (lazy, idempotent) */
|
|
17
|
+
export async function initNlEmbedder() {
|
|
18
|
+
if (extractor)
|
|
19
|
+
return;
|
|
20
|
+
if (loadPromise)
|
|
21
|
+
return loadPromise;
|
|
22
|
+
loadPromise = (async () => {
|
|
23
|
+
const { pipeline } = await import('@huggingface/transformers');
|
|
24
|
+
extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
|
|
25
|
+
})();
|
|
26
|
+
return loadPromise;
|
|
27
|
+
}
|
|
28
|
+
/** Check if the NL embedder is ready */
|
|
29
|
+
export function isNlEmbedderReady() {
|
|
30
|
+
return extractor !== null;
|
|
31
|
+
}
|
|
32
|
+
/** Embed a single text; resolves to the embedding as a plain number[] (copied from the model output) */
|
|
33
|
+
export async function nlEmbed(text) {
|
|
34
|
+
if (!extractor)
|
|
35
|
+
await initNlEmbedder();
|
|
36
|
+
const result = await extractor(text, { pooling: 'cls', normalize: true });
|
|
37
|
+
return Array.from(result.data);
|
|
38
|
+
}
|
|
39
|
+
/** Embed a batch of texts */
|
|
40
|
+
export async function nlEmbedBatch(texts) {
|
|
41
|
+
if (!extractor)
|
|
42
|
+
await initNlEmbedder();
|
|
43
|
+
const results = [];
|
|
44
|
+
for (const text of texts) {
|
|
45
|
+
const result = await extractor(text, { pooling: 'cls', normalize: true });
|
|
46
|
+
results.push(Array.from(result.data));
|
|
47
|
+
}
|
|
48
|
+
return results;
|
|
49
|
+
}
|
|
50
|
+
/** Extract the leading comment text — block, `//`, or `#` style — stopping at the first non-comment line or once about 10 lines are collected */
|
|
51
|
+
function extractFullComment(content) {
|
|
52
|
+
if (!content)
|
|
53
|
+
return '';
|
|
54
|
+
const lines = content.split('\n');
|
|
55
|
+
const commentLines = [];
|
|
56
|
+
let inBlock = false;
|
|
57
|
+
for (const l of lines) {
|
|
58
|
+
const t = l.trim();
|
|
59
|
+
if (t.startsWith('/**') || t.startsWith('/*')) {
|
|
60
|
+
inBlock = true;
|
|
61
|
+
const inner = t.replace(/^\/\*\*?/, '').replace(/\*\/$/, '').trim();
|
|
62
|
+
if (inner && !inner.startsWith('@'))
|
|
63
|
+
commentLines.push(inner);
|
|
64
|
+
if (t.includes('*/'))
|
|
65
|
+
inBlock = false;
|
|
66
|
+
continue;
|
|
67
|
+
}
|
|
68
|
+
if (inBlock) {
|
|
69
|
+
if (t.includes('*/')) {
|
|
70
|
+
inBlock = false;
|
|
71
|
+
continue;
|
|
72
|
+
}
|
|
73
|
+
const inner = t.replace(/^\*\s?/, '').trim();
|
|
74
|
+
if (inner && !inner.startsWith('@'))
|
|
75
|
+
commentLines.push(inner);
|
|
76
|
+
if (commentLines.length >= 10)
|
|
77
|
+
break;
|
|
78
|
+
continue;
|
|
79
|
+
}
|
|
80
|
+
if (t.startsWith('//')) {
|
|
81
|
+
const inner = t.slice(2).trim();
|
|
82
|
+
if (inner)
|
|
83
|
+
commentLines.push(inner);
|
|
84
|
+
if (commentLines.length >= 10)
|
|
85
|
+
break;
|
|
86
|
+
continue;
|
|
87
|
+
}
|
|
88
|
+
if (t.startsWith('#') && !t.startsWith('#!')) {
|
|
89
|
+
const inner = t.slice(1).trim();
|
|
90
|
+
if (inner)
|
|
91
|
+
commentLines.push(inner);
|
|
92
|
+
if (commentLines.length >= 10)
|
|
93
|
+
break;
|
|
94
|
+
continue;
|
|
95
|
+
}
|
|
96
|
+
if (commentLines.length > 0)
|
|
97
|
+
break; // comment ended
|
|
98
|
+
}
|
|
99
|
+
return commentLines.join(' ');
|
|
100
|
+
}
|
|
101
|
+
/** Expand camelCase/PascalCase/snake_case to space-separated words */
|
|
102
|
+
function expandIdentifier(name) {
|
|
103
|
+
return name
|
|
104
|
+
.replace(/([a-z])([A-Z])/g, '$1 $2')
|
|
105
|
+
.replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
|
|
106
|
+
.replace(/[_\-]/g, ' ')
|
|
107
|
+
.toLowerCase();
|
|
108
|
+
}
|
|
109
|
+
/** Extract enum/const array values as NL text */
|
|
110
|
+
function extractEnumValues(content) {
|
|
111
|
+
// Match: ['value1', 'value2', ...] as const
|
|
112
|
+
const match = content.match(/\[([^\]]+)\]\s*as\s*const/);
|
|
113
|
+
if (match?.[1]) {
|
|
114
|
+
const values = match[1].replace(/['"]/g, '').split(',').map(v => v.trim()).filter(Boolean);
|
|
115
|
+
if (values.length > 0)
|
|
116
|
+
return values.join(', ');
|
|
117
|
+
}
|
|
118
|
+
// Match: enum { Value1, Value2 }
|
|
119
|
+
const enumMatch = content.match(/enum\s+\w+\s*\{([^}]+)\}/);
|
|
120
|
+
if (enumMatch?.[1]) {
|
|
121
|
+
const values = enumMatch[1].split(',').map(v => v.trim().split('=')[0].trim()).filter(Boolean);
|
|
122
|
+
if (values.length > 0)
|
|
123
|
+
return values.map(v => expandIdentifier(v)).join(', ');
|
|
124
|
+
}
|
|
125
|
+
return '';
|
|
126
|
+
}
|
|
127
|
+
/** Extract parameter names from function signature */
|
|
128
|
+
function extractParamNames(content) {
|
|
129
|
+
const match = content.match(/\(([^)]*)\)/);
|
|
130
|
+
if (!match?.[1])
|
|
131
|
+
return '';
|
|
132
|
+
return match[1].split(',')
|
|
133
|
+
.map(p => p.trim().split(':')[0].split('=')[0].trim())
|
|
134
|
+
.filter(p => p && p !== '')
|
|
135
|
+
.map(p => expandIdentifier(p))
|
|
136
|
+
.join(', ');
|
|
137
|
+
}
|
|
138
|
+
/** Build NL documents from a node */
|
|
139
|
+
export function extractNlTexts(node) {
|
|
140
|
+
const docs = [];
|
|
141
|
+
const name = node.name;
|
|
142
|
+
const expandedName = expandIdentifier(name);
|
|
143
|
+
const dir = node.filePath.split('/').slice(-3, -1).join('/');
|
|
144
|
+
// 1. Comment-based NL text (primary)
|
|
145
|
+
const comment = extractFullComment(node.content);
|
|
146
|
+
if (comment) {
|
|
147
|
+
docs.push({
|
|
148
|
+
nodeId: node.id,
|
|
149
|
+
source: 'comment',
|
|
150
|
+
text: `${expandedName}: ${comment}. File: ${dir}`,
|
|
151
|
+
});
|
|
152
|
+
}
|
|
153
|
+
// 2. Name + params + directory (fallback, emitted only when there is no comment)
|
|
154
|
+
const params = extractParamNames(node.content);
|
|
155
|
+
const parts = [expandedName];
|
|
156
|
+
if (params)
|
|
157
|
+
parts.push(`Parameters: ${params}`);
|
|
158
|
+
if (dir)
|
|
159
|
+
parts.push(`in ${dir}`);
|
|
160
|
+
if (!comment) {
|
|
161
|
+
// Only add name-based doc if no comment (avoid duplication)
|
|
162
|
+
docs.push({
|
|
163
|
+
nodeId: node.id,
|
|
164
|
+
source: 'name',
|
|
165
|
+
text: parts.join('. '),
|
|
166
|
+
});
|
|
167
|
+
}
|
|
168
|
+
// 3. Enum/const values
|
|
169
|
+
if (node.label === 'Enum' || node.label === 'Const' || node.label === 'TypeAlias') {
|
|
170
|
+
const values = extractEnumValues(node.content);
|
|
171
|
+
if (values) {
|
|
172
|
+
docs.push({
|
|
173
|
+
nodeId: node.id,
|
|
174
|
+
source: 'enum',
|
|
175
|
+
text: `${expandedName}: ${values}`,
|
|
176
|
+
});
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
return docs;
|
|
180
|
+
}
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
// Full NL embedding pipeline
|
|
183
|
+
// ---------------------------------------------------------------------------
|
|
184
|
+
/** Hash text for skip detection */
|
|
185
|
+
import { createHash } from 'crypto';
|
|
186
|
+
function md5(text) {
|
|
187
|
+
return createHash('md5').update(text).digest('hex');
|
|
188
|
+
}
|
|
189
|
+
/**
|
|
190
|
+
* Build NL embeddings for all eligible nodes in the database.
|
|
191
|
+
* Reads nodes, extracts NL text, embeds with bge-small, writes to nl_embeddings.
|
|
192
|
+
*/
|
|
193
|
+
export async function buildNlEmbeddings(db, onProgress) {
|
|
194
|
+
const t0 = Date.now();
|
|
195
|
+
await initNlEmbedder();
|
|
196
|
+
// Query nodes with the labels below (not just EMBEDDABLE_LABELS — we want enums, consts, types too)
|
|
197
|
+
const labels = ['Function', 'Class', 'Method', 'Interface', 'Const', 'Enum', 'TypeAlias', 'Namespace', 'Module', 'Struct'];
|
|
198
|
+
const placeholders = labels.map(() => '?').join(',');
|
|
199
|
+
const rows = db.prepare(`SELECT id, name, label, filePath, content, startLine, description FROM nodes WHERE label IN (${placeholders})`).all(...labels);
|
|
200
|
+
// Skip test, spec, fixture, and mock files (matched by path substring)
|
|
201
|
+
const testPatterns = ['/test/', '/tests/', '/spec/', '/fixtures/', '/__tests__/', '/__mocks__/', '.test.', '.spec.', '_test.', '_spec.'];
|
|
202
|
+
const filteredRows = rows.filter(r => !testPatterns.some(p => r.filePath.includes(p)));
|
|
203
|
+
// Extract NL documents
|
|
204
|
+
const allDocs = [];
|
|
205
|
+
for (const row of filteredRows) {
|
|
206
|
+
const docs = extractNlTexts(row);
|
|
207
|
+
for (const doc of docs)
|
|
208
|
+
allDocs.push(doc);
|
|
209
|
+
}
|
|
210
|
+
if (allDocs.length === 0) {
|
|
211
|
+
return { embedded: 0, skipped: 0, durationMs: Date.now() - t0 };
|
|
212
|
+
}
|
|
213
|
+
// Check existing hashes for skip detection
|
|
214
|
+
const existingHashes = new Map();
|
|
215
|
+
try {
|
|
216
|
+
const hashRows = db.prepare('SELECT nodeId, textHash FROM nl_embeddings WHERE textHash IS NOT NULL').all();
|
|
217
|
+
for (const r of hashRows)
|
|
218
|
+
existingHashes.set(r.nodeId + ':' + r.textHash, '1');
|
|
219
|
+
}
|
|
220
|
+
catch { /* table might not exist yet */ }
|
|
221
|
+
// Filter to docs that need embedding
|
|
222
|
+
const toEmbed = [];
|
|
223
|
+
let skipped = 0;
|
|
224
|
+
for (const doc of allDocs) {
|
|
225
|
+
const hash = md5(doc.text);
|
|
226
|
+
if (existingHashes.has(doc.nodeId + ':' + hash)) {
|
|
227
|
+
skipped++;
|
|
228
|
+
continue;
|
|
229
|
+
}
|
|
230
|
+
toEmbed.push({ ...doc, hash });
|
|
231
|
+
}
|
|
232
|
+
if (toEmbed.length === 0) {
|
|
233
|
+
return { embedded: 0, skipped, durationMs: Date.now() - t0 };
|
|
234
|
+
}
|
|
235
|
+
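// Clear ALL existing NL embeddings, then insert only the docs in toEmbed. NOTE(review): rows for skipped (cached) docs are deleted here too and never re-inserted, and this DELETE runs before BEGIN, so it is presumably outside the transaction that ROLLBACK undoes — confirm intent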
|
|
236
|
+
db.prepare('DELETE FROM nl_embeddings').run();
|
|
237
|
+
// Embed in batches and write to DB
|
|
238
|
+
const BATCH = 100;
|
|
239
|
+
const insertStmt = db.prepare('INSERT INTO nl_embeddings (nodeId, embedding, textHash, source, text) VALUES (?, ?, ?, ?, ?)');
|
|
240
|
+
let embedded = 0;
|
|
241
|
+
db.exec('BEGIN');
|
|
242
|
+
try {
|
|
243
|
+
for (let i = 0; i < toEmbed.length; i += BATCH) {
|
|
244
|
+
const batch = toEmbed.slice(i, i + BATCH);
|
|
245
|
+
const vecs = await nlEmbedBatch(batch.map(d => d.text));
|
|
246
|
+
for (let j = 0; j < batch.length; j++) {
|
|
247
|
+
const doc = batch[j];
|
|
248
|
+
const vec = vecs[j];
|
|
249
|
+
const blob = Buffer.from(new Float32Array(vec).buffer);
|
|
250
|
+
insertStmt.run(doc.nodeId, blob, doc.hash, doc.source, doc.text);
|
|
251
|
+
embedded++;
|
|
252
|
+
}
|
|
253
|
+
onProgress?.(Math.min(i + BATCH, toEmbed.length), toEmbed.length);
|
|
254
|
+
}
|
|
255
|
+
db.exec('COMMIT');
|
|
256
|
+
}
|
|
257
|
+
catch (err) {
|
|
258
|
+
db.exec('ROLLBACK');
|
|
259
|
+
throw err;
|
|
260
|
+
}
|
|
261
|
+
return { embedded, skipped, durationMs: Date.now() - t0 };
|
|
262
|
+
}
|
|
@@ -42,6 +42,8 @@ export declare class LocalBackend {
|
|
|
42
42
|
private tsgoServices;
|
|
43
43
|
/** Per-repo in-memory embedding cache: nodeId → Float32Array (256-dim) */
|
|
44
44
|
private embeddingCaches;
|
|
45
|
+
/** Per-repo in-memory NL embedding cache: includes source text for match_reason */
|
|
46
|
+
private nlEmbeddingCaches;
|
|
45
47
|
/** Get (or lazily start) a tsgo LSP service for a repo. Returns null if unavailable. */
|
|
46
48
|
private getTsgo;
|
|
47
49
|
/** Get (or lazily open) the SQLite database for a repo. */
|
|
@@ -50,6 +52,10 @@ export declare class LocalBackend {
|
|
|
50
52
|
private loadEmbeddingCache;
|
|
51
53
|
/** Search embeddings in memory — O(N) dot products, no disk I/O */
|
|
52
54
|
private searchEmbeddingsInMemory;
|
|
55
|
+
/** Load NL embeddings into memory for fast conceptual search */
|
|
56
|
+
private loadNlEmbeddingCache;
|
|
57
|
+
/** Search NL embeddings in memory, returns match_reason text */
|
|
58
|
+
private searchNlEmbeddingsInMemory;
|
|
53
59
|
/** Hard ceiling — beyond this, incremental is unreliable, warn prominently */
|
|
54
60
|
private static readonly MAX_INCREMENTAL_FILES;
|
|
55
61
|
/** Start file system watcher for a repo to detect source changes */
|
|
@@ -131,6 +137,11 @@ export declare class LocalBackend {
|
|
|
131
137
|
* Semantic vector search helper
|
|
132
138
|
*/
|
|
133
139
|
private semanticSearch;
|
|
140
|
+
/**
|
|
141
|
+
* NL semantic search: embed query with bge-small, search NL descriptions.
|
|
142
|
+
* Returns match_reason (the NL text that matched) for agent transparency.
|
|
143
|
+
*/
|
|
144
|
+
private nlSemanticSearch;
|
|
134
145
|
/**
|
|
135
146
|
* Refs-based search: find symbols referenced in files that contain the query identifiers.
|
|
136
147
|
* Bridges the gap between graph edges (incomplete) and grep (complete for exact names).
|
|
@@ -57,6 +57,8 @@ export class LocalBackend {
|
|
|
57
57
|
tsgoServices = new Map();
|
|
58
58
|
/** Per-repo in-memory embedding cache: nodeId → Float32Array (256-dim) */
|
|
59
59
|
embeddingCaches = new Map();
|
|
60
|
+
/** Per-repo in-memory NL embedding cache: includes source text for match_reason */
|
|
61
|
+
nlEmbeddingCaches = new Map();
|
|
60
62
|
/** Get (or lazily start) a tsgo LSP service for a repo. Returns null if unavailable. */
|
|
61
63
|
async getTsgo(repo) {
|
|
62
64
|
const existing = this.tsgoServices.get(repo.id);
|
|
@@ -140,6 +142,70 @@ export class LocalBackend {
|
|
|
140
142
|
results.sort((a, b) => a.distance - b.distance);
|
|
141
143
|
return results.slice(0, limit);
|
|
142
144
|
}
|
|
145
|
+
/** Load NL embeddings into memory for fast conceptual search */
|
|
146
|
+
loadNlEmbeddingCache(repoId) {
|
|
147
|
+
try {
|
|
148
|
+
const db = this.getDb(repoId);
|
|
149
|
+
let rows;
|
|
150
|
+
try {
|
|
151
|
+
rows = db.prepare('SELECT nodeId, embedding, text FROM nl_embeddings').all();
|
|
152
|
+
}
|
|
153
|
+
catch {
|
|
154
|
+
return;
|
|
155
|
+
} // table might not exist
|
|
156
|
+
if (rows.length === 0) {
|
|
157
|
+
this.nlEmbeddingCaches.delete(repoId);
|
|
158
|
+
return;
|
|
159
|
+
}
|
|
160
|
+
const dims = rows[0].embedding.byteLength / 4;
|
|
161
|
+
const nodeIds = [];
|
|
162
|
+
const texts = [];
|
|
163
|
+
const matrix = new Float32Array(rows.length * dims);
|
|
164
|
+
const norms = new Float32Array(rows.length);
|
|
165
|
+
for (let i = 0; i < rows.length; i++) {
|
|
166
|
+
const row = rows[i];
|
|
167
|
+
nodeIds.push(row.nodeId);
|
|
168
|
+
texts.push(row.text);
|
|
169
|
+
const vec = new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.byteLength / 4);
|
|
170
|
+
matrix.set(vec, i * dims);
|
|
171
|
+
let norm = 0;
|
|
172
|
+
for (let d = 0; d < dims; d++)
|
|
173
|
+
norm += vec[d] * vec[d];
|
|
174
|
+
norms[i] = Math.sqrt(norm);
|
|
175
|
+
}
|
|
176
|
+
this.nlEmbeddingCaches.set(repoId, { nodeIds, texts, matrix, norms });
|
|
177
|
+
}
|
|
178
|
+
catch { /* NL embeddings not available */ }
|
|
179
|
+
}
|
|
180
|
+
/** Search NL embeddings in memory, returns match_reason text */
|
|
181
|
+
searchNlEmbeddingsInMemory(repoId, queryVec, limit = 10, maxDistance = 0.5) {
|
|
182
|
+
const cache = this.nlEmbeddingCaches.get(repoId);
|
|
183
|
+
if (!cache || cache.nodeIds.length === 0)
|
|
184
|
+
return [];
|
|
185
|
+
const dims = queryVec.length;
|
|
186
|
+
const results = [];
|
|
187
|
+
let qNorm = 0;
|
|
188
|
+
for (let d = 0; d < dims; d++)
|
|
189
|
+
qNorm += queryVec[d] * queryVec[d];
|
|
190
|
+
qNorm = Math.sqrt(qNorm);
|
|
191
|
+
if (qNorm === 0)
|
|
192
|
+
return [];
|
|
193
|
+
const cacheDims = cache.matrix.length / cache.nodeIds.length;
|
|
194
|
+
for (let i = 0; i < cache.nodeIds.length; i++) {
|
|
195
|
+
const offset = i * cacheDims;
|
|
196
|
+
let dot = 0;
|
|
197
|
+
const minDims = Math.min(dims, cacheDims);
|
|
198
|
+
for (let d = 0; d < minDims; d++)
|
|
199
|
+
dot += queryVec[d] * cache.matrix[offset + d];
|
|
200
|
+
const similarity = dot / (qNorm * cache.norms[i]);
|
|
201
|
+
const distance = 1 - similarity;
|
|
202
|
+
if (distance < maxDistance) {
|
|
203
|
+
results.push({ nodeId: cache.nodeIds[i], distance, text: cache.texts[i] });
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
results.sort((a, b) => a.distance - b.distance);
|
|
207
|
+
return results.slice(0, limit);
|
|
208
|
+
}
|
|
143
209
|
/** Hard ceiling — beyond this, incremental is unreliable, warn prominently */
|
|
144
210
|
static MAX_INCREMENTAL_FILES = 200;
|
|
145
211
|
/** Start file system watcher for a repo to detect source changes */
|
|
@@ -273,11 +339,12 @@ export class LocalBackend {
|
|
|
273
339
|
for (const [id, handle] of this.repos) {
|
|
274
340
|
this.startWatcher(id, handle);
|
|
275
341
|
this.seedWatcherFromGit(id, handle);
|
|
276
|
-
// Load embedding
|
|
342
|
+
// Load embedding caches into memory for fast vector search
|
|
277
343
|
if ((handle.stats?.embeddings ?? 0) > 0) {
|
|
278
344
|
this.loadEmbeddingCache(id);
|
|
279
345
|
anyEmbeddings = true;
|
|
280
346
|
}
|
|
347
|
+
this.loadNlEmbeddingCache(id); // NL cache loaded regardless (cheap, may not exist)
|
|
281
348
|
}
|
|
282
349
|
// Pre-warm MLX embedder so first query has zero model-load latency
|
|
283
350
|
if (anyEmbeddings) {
|
|
@@ -592,6 +659,9 @@ export class LocalBackend {
|
|
|
592
659
|
const end = d.endLine || 0;
|
|
593
660
|
const isSmall = end > 0 && start > 0 && (end - start) < 10;
|
|
594
661
|
lines.push(` ${sig} — ${d.type} @ ${this.shortPath(d.filePath)}:${start || '?'}${mod}`);
|
|
662
|
+
if (d.match_reason) {
|
|
663
|
+
lines.push(` _"${d.match_reason}"_`);
|
|
664
|
+
}
|
|
595
665
|
if (isSmall && d.content) {
|
|
596
666
|
const src = String(d.content).trim();
|
|
597
667
|
if (src.length < 500) {
|
|
@@ -671,6 +741,9 @@ export class LocalBackend {
|
|
|
671
741
|
renderFlows(2);
|
|
672
742
|
}
|
|
673
743
|
}
|
|
744
|
+
lines.push('');
|
|
745
|
+
lines.push('---');
|
|
746
|
+
lines.push('_Note: Results ranked by BM25 keyword + semantic + refs + file-content signals. Natural language queries may miss code using different terminology. If results seem incomplete, try specific identifiers or `grep -rn "keyword" --include="*.ts"` for exhaustive search._');
|
|
674
747
|
return lines.join('\n');
|
|
675
748
|
}
|
|
676
749
|
formatContextAsText(result) {
|
|
@@ -776,6 +849,10 @@ export class LocalBackend {
|
|
|
776
849
|
lines.push(` ${p.name} (step ${p.step_index}/${p.step_count})`);
|
|
777
850
|
}
|
|
778
851
|
}
|
|
852
|
+
// Guidance footer for agents
|
|
853
|
+
lines.push('');
|
|
854
|
+
lines.push('---');
|
|
855
|
+
lines.push('_Note: Callers are from graph edges + refs index. For widely-used symbols, verify completeness with `grep -rn "symbolName(" --include="*.ts"`. Outgoing calls may miss dynamic dispatch or reflection._');
|
|
779
856
|
return lines.join('\n');
|
|
780
857
|
}
|
|
781
858
|
formatImpactAsText(result) {
|
|
@@ -822,6 +899,9 @@ export class LocalBackend {
|
|
|
822
899
|
lines.push('');
|
|
823
900
|
lines.push(`### Modules: ${mods.map((m) => `${m.name} (${m.hits} ${m.impact})`).join(' | ')}`);
|
|
824
901
|
}
|
|
902
|
+
lines.push('');
|
|
903
|
+
lines.push('---');
|
|
904
|
+
lines.push('_Note: d=1 callers include graph edges + refs index. Indirect deps through dynamic dispatch, config, or type-only references may not appear. For critical changes, verify d=1 with `grep -rn "symbolName" --include="*.ts"`._');
|
|
825
905
|
return lines.join('\n');
|
|
826
906
|
}
|
|
827
907
|
formatDetectChangesAsText(result) {
|
|
@@ -1002,13 +1082,14 @@ export class LocalBackend {
|
|
|
1002
1082
|
// Step 1: Four-signal search in parallel
|
|
1003
1083
|
// BM25 uses expanded query; semantic uses enriched query; refs + file_words use raw query
|
|
1004
1084
|
const searchLimit = processLimit * maxSymbolsPerProcess;
|
|
1005
|
-
const [bm25Results, semanticResults, refsResults, fileWordsResults] = await Promise.all([
|
|
1085
|
+
const [bm25Results, semanticResults, nlSemanticResults, refsResults, fileWordsResults] = await Promise.all([
|
|
1006
1086
|
this.bm25Search(repo, expandedSearchQuery, searchLimit),
|
|
1007
1087
|
this.semanticSearch(repo, semanticQuery, searchLimit),
|
|
1088
|
+
this.nlSemanticSearch(repo, searchQuery, searchLimit),
|
|
1008
1089
|
Promise.resolve(this.refsSearch(repo, searchQuery, searchLimit)),
|
|
1009
1090
|
Promise.resolve(this.fileWordsSearch(repo, searchQuery, searchLimit)),
|
|
1010
1091
|
]);
|
|
1011
|
-
// Step 2: Weighted RRF merge (
|
|
1092
|
+
// Step 2: Weighted RRF merge (5 signals)
|
|
1012
1093
|
const bm25ForRRF = bm25Results.map((r, i) => ({
|
|
1013
1094
|
nodeId: String(r.nodeId ?? ''),
|
|
1014
1095
|
name: String(r.name ?? ''),
|
|
@@ -1038,7 +1119,26 @@ export class LocalBackend {
|
|
|
1038
1119
|
...(r.startLine != null ? { startLine: r.startLine } : {}),
|
|
1039
1120
|
...(r.endLine != null ? { endLine: r.endLine } : {}),
|
|
1040
1121
|
}));
|
|
1041
|
-
|
|
1122
|
+
// NL semantic results get high weight — proven 100% recall on conceptual queries
|
|
1123
|
+
const nlForRRF = nlSemanticResults.map((r) => ({
|
|
1124
|
+
nodeId: String(r.nodeId ?? ''), name: String(r.name ?? ''), label: String(r.type ?? ''),
|
|
1125
|
+
filePath: String(r.filePath ?? ''), distance: Number(r.distance ?? 1),
|
|
1126
|
+
...(r.startLine != null ? { startLine: r.startLine } : {}),
|
|
1127
|
+
...(r.endLine != null ? { endLine: r.endLine } : {}),
|
|
1128
|
+
}));
|
|
1129
|
+
// Merge code + NL semantic into one semantic list (best of both worlds)
|
|
1130
|
+
const combinedSemantic = [...semanticForRRF, ...nlForRRF]
|
|
1131
|
+
.sort((a, b) => a.distance - b.distance)
|
|
1132
|
+
.filter((r, i, arr) => arr.findIndex(x => x.nodeId === r.nodeId) === i) // dedupe by nodeId
|
|
1133
|
+
.slice(0, searchLimit);
|
|
1134
|
+
let rrfMerged = mergeWithRRF(bm25ForRRF, combinedSemantic, { limit: searchLimit });
|
|
1135
|
+
// Store NL match reasons for display
|
|
1136
|
+
const nlMatchReasons = new Map();
|
|
1137
|
+
for (const r of nlSemanticResults) {
|
|
1138
|
+
if (r.match_reason && !nlMatchReasons.has(r.nodeId)) {
|
|
1139
|
+
nlMatchReasons.set(r.nodeId, r.match_reason);
|
|
1140
|
+
}
|
|
1141
|
+
}
|
|
1042
1142
|
// Merge refs + fileWords into the RRF results (lower weight)
|
|
1043
1143
|
if (refsForRRF.length > 0 || fileWordsForRRF.length > 0) {
|
|
1044
1144
|
const supplemental = mergeWithRRF(refsForRRF, fileWordsForRRF.map((r) => ({
|
|
@@ -1084,13 +1184,22 @@ export class LocalBackend {
|
|
|
1084
1184
|
if (!searchDataMap.has(key))
|
|
1085
1185
|
searchDataMap.set(key, r);
|
|
1086
1186
|
}
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1187
|
+
for (const r of nlSemanticResults) {
|
|
1188
|
+
const key = r.nodeId || r.filePath;
|
|
1189
|
+
if (!searchDataMap.has(key))
|
|
1190
|
+
searchDataMap.set(key, r);
|
|
1191
|
+
}
|
|
1192
|
+
let merged = rrfMerged.map(rrf => {
|
|
1193
|
+
const data = searchDataMap.get(rrf.nodeId ?? '') ?? searchDataMap.get(rrf.filePath) ?? {
|
|
1090
1194
|
name: rrf.name ?? rrf.filePath.split('/').pop(), type: rrf.label ?? 'File',
|
|
1091
1195
|
filePath: rrf.filePath, nodeId: rrf.nodeId,
|
|
1092
|
-
}
|
|
1093
|
-
|
|
1196
|
+
};
|
|
1197
|
+
// Attach NL match reason if available
|
|
1198
|
+
const reason = nlMatchReasons.get(rrf.nodeId ?? '') ?? nlMatchReasons.get(data.nodeId ?? '');
|
|
1199
|
+
if (reason)
|
|
1200
|
+
data.match_reason = reason;
|
|
1201
|
+
return { score: rrf.score, data };
|
|
1202
|
+
});
|
|
1094
1203
|
// Filter noise: remove test files, config files, docs from results by default
|
|
1095
1204
|
merged = merged.filter(item => {
|
|
1096
1205
|
const fp = String(item.data.filePath ?? '').toLowerCase();
|
|
@@ -1511,6 +1620,48 @@ export class LocalBackend {
|
|
|
1511
1620
|
return [];
|
|
1512
1621
|
}
|
|
1513
1622
|
}
|
|
1623
|
+
/**
|
|
1624
|
+
* NL semantic search: embed query with bge-small, search NL descriptions.
|
|
1625
|
+
* Returns match_reason (the NL text that matched) for agent transparency.
|
|
1626
|
+
*/
|
|
1627
|
+
async nlSemanticSearch(repo, query, limit) {
|
|
1628
|
+
try {
|
|
1629
|
+
const cache = this.nlEmbeddingCaches.get(repo.id);
|
|
1630
|
+
if (!cache || cache.nodeIds.length === 0)
|
|
1631
|
+
return [];
|
|
1632
|
+
const { nlEmbed } = await import('../../core/embeddings/nl-embedder.js');
|
|
1633
|
+
const queryVec = await nlEmbed(query);
|
|
1634
|
+
const vecResults = this.searchNlEmbeddingsInMemory(repo.id, queryVec, limit, 0.5);
|
|
1635
|
+
if (vecResults.length === 0)
|
|
1636
|
+
return [];
|
|
1637
|
+
// Fetch node metadata
|
|
1638
|
+
const metaDb = this.getDb(repo.id);
|
|
1639
|
+
const seen = new Set();
|
|
1640
|
+
const results = [];
|
|
1641
|
+
for (const r of vecResults) {
|
|
1642
|
+
if (seen.has(r.nodeId))
|
|
1643
|
+
continue;
|
|
1644
|
+
seen.add(r.nodeId);
|
|
1645
|
+
const node = getNode(metaDb, toNodeId(r.nodeId));
|
|
1646
|
+
if (node) {
|
|
1647
|
+
results.push({
|
|
1648
|
+
nodeId: r.nodeId,
|
|
1649
|
+
name: node.name,
|
|
1650
|
+
type: node.label,
|
|
1651
|
+
filePath: node.filePath,
|
|
1652
|
+
distance: r.distance,
|
|
1653
|
+
startLine: node.startLine,
|
|
1654
|
+
endLine: node.endLine,
|
|
1655
|
+
match_reason: r.text, // The NL text that matched — shown to agents
|
|
1656
|
+
});
|
|
1657
|
+
}
|
|
1658
|
+
}
|
|
1659
|
+
return results;
|
|
1660
|
+
}
|
|
1661
|
+
catch {
|
|
1662
|
+
return [];
|
|
1663
|
+
}
|
|
1664
|
+
}
|
|
1514
1665
|
/**
|
|
1515
1666
|
* Refs-based search: find symbols referenced in files that contain the query identifiers.
|
|
1516
1667
|
* Bridges the gap between graph edges (incomplete) and grep (complete for exact names).
|
|
@@ -2033,24 +2184,30 @@ export class LocalBackend {
|
|
|
2033
2184
|
}
|
|
2034
2185
|
// Supplement callers from refs table (catches callers the graph missed)
|
|
2035
2186
|
try {
|
|
2036
|
-
const refCallers = findRefsBySymbol(db, sym.name,
|
|
2187
|
+
const refCallers = findRefsBySymbol(db, sym.name, 200);
|
|
2037
2188
|
const knownFiles = new Set(incomingRows.map(r => r.filePath));
|
|
2189
|
+
let refsAdded = 0;
|
|
2038
2190
|
for (const ref of refCallers) {
|
|
2039
2191
|
if (ref.filePath === sym.filePath)
|
|
2040
2192
|
continue; // skip self-file
|
|
2041
2193
|
if (knownFiles.has(ref.filePath))
|
|
2042
|
-
continue; // already
|
|
2043
|
-
knownFiles.add(ref.filePath);
|
|
2194
|
+
continue; // already have a caller from this file
|
|
2044
2195
|
const enclosing = this.findNodeAtPosition(db, ref.filePath, ref.line);
|
|
2045
|
-
if (enclosing)
|
|
2046
|
-
|
|
2047
|
-
|
|
2048
|
-
|
|
2049
|
-
|
|
2050
|
-
|
|
2196
|
+
if (!enclosing)
|
|
2197
|
+
continue; // no symbol at this line (e.g. import statement)
|
|
2198
|
+
knownFiles.add(ref.filePath); // mark AFTER finding a valid node
|
|
2199
|
+
incomingRows.push({
|
|
2200
|
+
relType: 'CALLS', uid: '', name: enclosing.name, filePath: ref.filePath,
|
|
2201
|
+
kind: enclosing.label, startLine: ref.line, reason: 'refs-index',
|
|
2202
|
+
});
|
|
2203
|
+
refsAdded++;
|
|
2051
2204
|
}
|
|
2205
|
+
if (process.env['CODE_MAPPER_VERBOSE'])
|
|
2206
|
+
console.error(`Code Mapper: refs supplement for '${sym.name}': ${refsAdded} added from ${refCallers.length} refs`);
|
|
2207
|
+
}
|
|
2208
|
+
catch (err) {
|
|
2209
|
+
console.error(`Code Mapper: refs supplement failed: ${err instanceof Error ? err.message : err}`);
|
|
2052
2210
|
}
|
|
2053
|
-
catch { /* refs table may not exist yet */ }
|
|
2054
2211
|
// Outgoing refs — exclude generic method names that produce false positives at low confidence
|
|
2055
2212
|
const GENERIC_NAMES_EXCLUDE = new Set(['has', 'get', 'set', 'add', 'remove', 'delete', 'close', 'stop', 'clear', 'reset', 'toString', 'valueOf', 'push', 'pop', 'entries', 'keys', 'values']);
|
|
2056
2213
|
let outgoingRows = [];
|
|
@@ -2388,10 +2545,44 @@ export class LocalBackend {
|
|
|
2388
2545
|
logQueryError('rename:read-ref', e);
|
|
2389
2546
|
}
|
|
2390
2547
|
}
|
|
2391
|
-
// Step
|
|
2392
|
-
let
|
|
2548
|
+
// Step 3a: Refs table lookup (instant, covers most cases the graph missed)
|
|
2549
|
+
let refsEdits = 0;
|
|
2393
2550
|
const graphFiles = new Set([sym.filePath, ...allIncoming.map(r => r.filePath)].filter(Boolean));
|
|
2394
|
-
|
|
2551
|
+
try {
|
|
2552
|
+
const refsDb = this.getDb(repo.id);
|
|
2553
|
+
const refsForName = findRefsBySymbol(refsDb, oldName, 500);
|
|
2554
|
+
for (const ref of refsForName) {
|
|
2555
|
+
const normalizedFile = ref.filePath.replace(/\\/g, '/');
|
|
2556
|
+
if (graphFiles.has(normalizedFile))
|
|
2557
|
+
continue;
|
|
2558
|
+
graphFiles.add(normalizedFile); // mark so ripgrep doesn't re-process
|
|
2559
|
+
try {
|
|
2560
|
+
const content = await fs.readFile(assertSafePath(normalizedFile), 'utf-8');
|
|
2561
|
+
const lines = content.split('\n');
|
|
2562
|
+
const regex = new RegExp(`\\b${oldName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'g');
|
|
2563
|
+
for (let i = 0; i < lines.length; i++) {
|
|
2564
|
+
const searchLine = lines[i];
|
|
2565
|
+
if (!searchLine)
|
|
2566
|
+
continue;
|
|
2567
|
+
regex.lastIndex = 0;
|
|
2568
|
+
if (regex.test(searchLine)) {
|
|
2569
|
+
regex.lastIndex = 0;
|
|
2570
|
+
addEdit(normalizedFile, i + 1, searchLine.trim(), searchLine.replace(regex, new_name).trim(), 'refs');
|
|
2571
|
+
refsEdits++;
|
|
2572
|
+
}
|
|
2573
|
+
}
|
|
2574
|
+
}
|
|
2575
|
+
catch (e) {
|
|
2576
|
+
logQueryError('rename:refs-read', e);
|
|
2577
|
+
}
|
|
2578
|
+
}
|
|
2579
|
+
}
|
|
2580
|
+
catch (e) {
|
|
2581
|
+
logQueryError('rename:refs', e);
|
|
2582
|
+
}
|
|
2583
|
+
// Step 3b: Ripgrep text search for anything refs + graph missed
|
|
2584
|
+
let astSearchEdits = 0;
|
|
2585
|
+
// Simple text search across the repo for the old name (in files not already covered)
|
|
2395
2586
|
try {
|
|
2396
2587
|
const { execFileSync } = await import('child_process');
|
|
2397
2588
|
const rgArgs = [
|
|
@@ -2456,9 +2647,11 @@ export class LocalBackend {
|
|
|
2456
2647
|
files_affected: allChanges.length,
|
|
2457
2648
|
total_edits: totalEdits,
|
|
2458
2649
|
graph_edits: graphEdits,
|
|
2650
|
+
refs_edits: refsEdits,
|
|
2459
2651
|
text_search_edits: astSearchEdits,
|
|
2460
2652
|
changes: allChanges,
|
|
2461
2653
|
applied: !dry_run,
|
|
2654
|
+
_note: 'Rename uses graph edges + refs index + ripgrep. Always review changes before applying. String literals, comments, and dynamic references (e.g. obj[methodName]) need manual review.',
|
|
2462
2655
|
};
|
|
2463
2656
|
}
|
|
2464
2657
|
async impact(repo, params) {
|
|
@@ -2535,6 +2728,32 @@ export class LocalBackend {
|
|
|
2535
2728
|
}
|
|
2536
2729
|
}
|
|
2537
2730
|
}
|
|
2731
|
+
// Supplement d=1 callers from refs table (catches callers the graph missed)
|
|
2732
|
+
if (direction === 'upstream') {
|
|
2733
|
+
try {
|
|
2734
|
+
const targetName = sym.name;
|
|
2735
|
+
const d1FromRefs = findRefsBySymbol(db, targetName, 200);
|
|
2736
|
+
for (const ref of d1FromRefs) {
|
|
2737
|
+
if (ref.filePath === sym.filePath)
|
|
2738
|
+
continue;
|
|
2739
|
+
const refNode = this.findNodeAtPosition(db, ref.filePath, ref.line);
|
|
2740
|
+
if (refNode && !seenIds.has(refNode.name + ':' + ref.filePath)) {
|
|
2741
|
+
// Find the actual node ID for this position
|
|
2742
|
+
const fullNodes = findNodesByFile(db, ref.filePath);
|
|
2743
|
+
const match = fullNodes.find(n => n.name === refNode.name && n.startLine != null && n.startLine <= ref.line + 1 && (n.endLine ?? 9999) >= ref.line + 1);
|
|
2744
|
+
if (match && !seenIds.has(match.id) && !startIds.some(s => s === match.id)) {
|
|
2745
|
+
seenIds.add(match.id);
|
|
2746
|
+
mergedNodes.push({
|
|
2747
|
+
id: match.id, name: match.name, label: match.label,
|
|
2748
|
+
filePath: match.filePath, depth: 1,
|
|
2749
|
+
relationType: 'CALLS', confidence: 0.8,
|
|
2750
|
+
});
|
|
2751
|
+
}
|
|
2752
|
+
}
|
|
2753
|
+
}
|
|
2754
|
+
}
|
|
2755
|
+
catch { /* refs table may not exist */ }
|
|
2756
|
+
}
|
|
2538
2757
|
const impacted = mergedNodes;
|
|
2539
2758
|
const truncated = anyTruncated;
|
|
2540
2759
|
const grouped = {};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@zuvia-software-solutions/code-mapper",
|
|
3
|
-
"version": "2.4.
|
|
3
|
+
"version": "2.4.1",
|
|
4
4
|
"description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
|
|
5
5
|
"author": "Abhigyan Patwari",
|
|
6
6
|
"license": "PolyForm-Noncommercial-1.0.0",
|