@zuvia-software-solutions/code-mapper 2.4.0 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@
2
2
  export interface AnalyzeOptions {
3
3
  force?: boolean;
4
4
  embeddings?: boolean;
5
+ nlEmbeddings?: boolean;
5
6
  tsgo?: boolean;
6
7
  verbose?: boolean;
7
8
  }
@@ -428,6 +428,22 @@ export const analyzeCommand = async (inputPath, options) => {
428
428
  // Reopen DB after Python is done
429
429
  db = openDb(dbPath);
430
430
  }
431
+ // Phase 4b: NL Embeddings (bge-small, CPU, Node.js)
432
+ if (options?.nlEmbeddings) {
433
+ recordPhase('nl-embeddings');
434
+ updateBar(95, 'Generating NL embeddings (bge-small)...');
435
+ const { buildNlEmbeddings } = await import('../core/embeddings/nl-embedder.js');
436
+ try {
437
+ const nlResult = await buildNlEmbeddings(db, (current, total) => {
438
+ const pct = 95 + Math.round((current / Math.max(total, 1)) * 3);
439
+ updateBar(pct, `NL embeddings (${current}/${total})`, 'NL embeddings');
440
+ });
441
+ updateBar(98, `NL embeddings: ${nlResult.embedded} embedded, ${nlResult.skipped} cached (${(nlResult.durationMs / 1000).toFixed(1)}s)`);
442
+ }
443
+ catch (err) {
444
+ console.error(`\n Warning: NL embeddings failed: ${err instanceof Error ? err.message : err}`);
445
+ }
446
+ }
431
447
  // Phase 5: Finalize (98-100%)
432
448
  recordPhase('finalize');
433
449
  updateBar(98, 'Saving metadata...');
package/dist/cli/index.js CHANGED
@@ -22,8 +22,8 @@ program
22
22
  .command('analyze [path]')
23
23
  .description('Index a repository (full analysis)')
24
24
  .option('-f, --force', 'Force full re-index even if up to date')
25
- .option('--embeddings', 'Enable embedding generation for semantic search (on by default)', true)
26
- .option('--no-embeddings', 'Skip embedding generation')
25
+ .option('--embeddings', 'Enable code embedding generation (Jina/MLX, GPU)', false)
26
+ .option('--nl-embeddings', 'Enable NL embedding generation (bge-small, CPU, recommended)', false)
27
27
  .option('--no-tsgo', 'Skip tsgo LSP for call resolution (faster, less accurate)')
28
28
  .option('-v, --verbose', 'Enable verbose ingestion warnings (default: false)')
29
29
  .addHelpText('after', '\nEnvironment variables:\n CODE_MAPPER_NO_GITIGNORE=1 Skip .gitignore parsing (still reads .code-mapperignore)')
@@ -138,6 +138,15 @@ export declare function searchFileWords(db: Database.Database, query: string, li
138
138
  }>;
139
139
  /** Clear all file words (used before full rebuild) */
140
140
  export declare function clearFileWords(db: Database.Database): void;
141
+ /** Count NL embeddings in the index */
142
+ export declare function countNlEmbeddings(db: Database.Database): number;
143
+ /** Search NL embeddings via brute-force cosine similarity */
144
+ export declare function searchNlVector(db: Database.Database, queryVec: number[], limit?: number, maxDistance?: number): Array<{
145
+ nodeId: string;
146
+ distance: number;
147
+ source: string;
148
+ text: string;
149
+ }>;
141
150
  /** Escape a string for use in SQL single-quoted literals. */
142
151
  export declare function escapeSql(value: string): string;
143
152
  /** Execute a raw SQL query and return rows. */
@@ -594,6 +594,46 @@ export function clearFileWords(db) {
594
594
  db.prepare('DELETE FROM file_words').run();
595
595
  }
596
596
  // ---------------------------------------------------------------------------
597
+ // NL Embeddings
598
+ // ---------------------------------------------------------------------------
599
/** Count NL embeddings in the index */
export function countNlEmbeddings(db) {
    let total = 0;
    try {
        const result = db.prepare('SELECT COUNT(*) as cnt FROM nl_embeddings').get();
        total = result?.cnt ?? 0;
    }
    catch {
        // Table absent (index built before NL embeddings existed) — report zero.
        total = 0;
    }
    return total;
}
609
/**
 * Search NL embeddings via brute-force cosine similarity.
 *
 * Fixes: the query-vector norm was recomputed inside the per-row loop even
 * though it is loop-invariant; zero-length vectors produced NaN similarities
 * that were only excluded by accident (`NaN < maxDistance` is false). The
 * query norm is now hoisted and zero norms are handled explicitly.
 *
 * @param db           better-sqlite3 handle with an `nl_embeddings` table
 * @param queryVec     query embedding (number[])
 * @param limit        max results (default 10)
 * @param maxDistance  cosine-distance cutoff, results with distance >= this are dropped
 * @returns rows sorted by ascending distance: { nodeId, distance, source, text }
 */
export function searchNlVector(db, queryVec, limit = 10, maxDistance = 0.5) {
    let rows;
    try {
        rows = db.prepare('SELECT nodeId, embedding, source, text FROM nl_embeddings').all();
    }
    catch {
        // Table may not exist on indexes built before NL embeddings shipped.
        return [];
    }
    // Hoist the query norm out of the row loop — it is loop-invariant.
    let queryNormSq = 0;
    for (let i = 0; i < queryVec.length; i++)
        queryNormSq += queryVec[i] * queryVec[i];
    const queryNorm = Math.sqrt(queryNormSq);
    if (queryNorm === 0)
        return []; // a zero query vector matches nothing
    const results = [];
    for (const row of rows) {
        // Reinterpret the stored BLOB as float32 without copying.
        const vec = new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.byteLength / 4);
        const n = Math.min(queryVec.length, vec.length);
        let dot = 0, normSq = 0;
        for (let i = 0; i < n; i++) {
            dot += queryVec[i] * vec[i];
            normSq += vec[i] * vec[i];
        }
        if (normSq === 0)
            continue; // zero stored vector — similarity undefined, skip
        const distance = 1 - dot / (queryNorm * Math.sqrt(normSq));
        if (distance < maxDistance) {
            results.push({ nodeId: row.nodeId, distance, source: row.source, text: row.text });
        }
    }
    results.sort((a, b) => a.distance - b.distance);
    return results.slice(0, limit);
}
636
+ // ---------------------------------------------------------------------------
597
637
  // Raw SQL escape (for dynamic queries in local-backend.ts)
598
638
  // ---------------------------------------------------------------------------
599
639
  /** Escape a string for use in SQL single-quoted literals. */
@@ -68,6 +68,14 @@ export interface EmbeddingRow {
68
68
  readonly embedding: Buffer;
69
69
  readonly textHash: string | null;
70
70
  }
71
+ /** A NL embedding row as stored in the `nl_embeddings` table */
72
+ export interface NlEmbeddingRow {
73
+ readonly nodeId: NodeId;
74
+ readonly embedding: Buffer;
75
+ readonly textHash: string | null;
76
+ readonly source: string;
77
+ readonly text: string;
78
+ }
71
79
  /** A reference occurrence as stored in the `refs` table */
72
80
  export interface RefsRow {
73
81
  readonly symbol: string;
@@ -118,4 +126,4 @@ export interface EdgeInsert {
118
126
  }
119
127
  /** Legacy edge table name constant (kept for compatibility) */
120
128
  export declare const REL_TABLE_NAME = "CodeRelation";
121
- export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT '',\n searchText TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n searchText,\n 
filePath,\n content,\n content='nodes',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\n\n-- Refs: identifier occurrence index (pre-computed grep)\nCREATE TABLE IF NOT EXISTS refs (\n symbol TEXT NOT NULL,\n filePath TEXT NOT NULL,\n line INTEGER NOT NULL\n);\n\nCREATE INDEX IF NOT EXISTS idx_refs_symbol ON refs(symbol);\nCREATE INDEX IF NOT EXISTS idx_refs_filePath ON refs(filePath);\nCREATE INDEX IF NOT EXISTS idx_refs_symbol_file ON refs(symbol, filePath);\n\n-- File-level word index for conceptual search\nCREATE TABLE IF NOT EXISTS file_words (\n filePath TEXT PRIMARY KEY,\n words TEXT NOT NULL DEFAULT ''\n);\n\nCREATE VIRTUAL TABLE IF NOT EXISTS file_words_fts USING fts5(\n words,\n content='file_words',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ai AFTER INSERT ON file_words BEGIN\n INSERT INTO file_words_fts(rowid, words) VALUES (new.rowid, new.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ad AFTER DELETE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, 
rowid, words) VALUES ('delete', old.rowid, old.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_au AFTER UPDATE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES (new.rowid, new.words);\nEND;\n";
129
+ export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT '',\n searchText TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- NL Embeddings: natural language description vectors (bge-small, 384-dim)\nCREATE TABLE IF NOT EXISTS nl_embeddings (\n nodeId TEXT NOT NULL,\n 
embedding BLOB NOT NULL,\n textHash TEXT,\n source TEXT NOT NULL DEFAULT 'comment',\n text TEXT NOT NULL DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nl_emb_nodeId ON nl_embeddings(nodeId);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n searchText,\n filePath,\n content,\n content='nodes',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\n\n-- Refs: identifier occurrence index (pre-computed grep)\nCREATE TABLE IF NOT EXISTS refs (\n symbol TEXT NOT NULL,\n filePath TEXT NOT NULL,\n line INTEGER NOT NULL\n);\n\nCREATE INDEX IF NOT EXISTS idx_refs_symbol ON refs(symbol);\nCREATE INDEX IF NOT EXISTS idx_refs_filePath ON refs(filePath);\nCREATE INDEX IF NOT EXISTS idx_refs_symbol_file ON refs(symbol, filePath);\n\n-- File-level word index for conceptual search\nCREATE TABLE IF NOT EXISTS file_words (\n filePath TEXT PRIMARY KEY,\n words TEXT NOT NULL DEFAULT ''\n);\n\nCREATE VIRTUAL TABLE IF NOT EXISTS file_words_fts USING fts5(\n 
words,\n content='file_words',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ai AFTER INSERT ON file_words BEGIN\n INSERT INTO file_words_fts(rowid, words) VALUES (new.rowid, new.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ad AFTER DELETE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_au AFTER UPDATE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES (new.rowid, new.words);\nEND;\n";
@@ -114,6 +114,17 @@ CREATE TABLE IF NOT EXISTS embeddings (
114
114
  textHash TEXT
115
115
  );
116
116
 
117
+ -- NL Embeddings: natural language description vectors (bge-small, 384-dim)
118
+ CREATE TABLE IF NOT EXISTS nl_embeddings (
119
+ nodeId TEXT NOT NULL,
120
+ embedding BLOB NOT NULL,
121
+ textHash TEXT,
122
+ source TEXT NOT NULL DEFAULT 'comment',
123
+ text TEXT NOT NULL DEFAULT ''
124
+ );
125
+
126
+ CREATE INDEX IF NOT EXISTS idx_nl_emb_nodeId ON nl_embeddings(nodeId);
127
+
117
128
  -- FTS5 virtual table (auto-updated via triggers)
118
129
  CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(
119
130
  name,
@@ -0,0 +1,44 @@
1
+ /**
2
+ * @file Natural language embedder using bge-small-en-v1.5.
3
+ *
4
+ * Runs entirely in Node.js via @huggingface/transformers — no Python, no GPU.
5
+ * Embeds human-readable descriptions extracted from code (JSDoc comments,
6
+ * enum values, type patterns, file headers) for conceptual search.
7
+ *
8
+ * 33M params, q8 quantized, 384-dim embeddings, ~6ms/text on CPU.
9
+ */
10
+ import type Database from 'better-sqlite3';
11
+ /** Initialize the NL embedding model (lazy, idempotent) */
12
+ export declare function initNlEmbedder(): Promise<void>;
13
+ /** Check if the NL embedder is ready */
14
+ export declare function isNlEmbedderReady(): boolean;
15
+ /** Embed a single text, returns Float32Array */
16
+ export declare function nlEmbed(text: string): Promise<number[]>;
17
+ /** Embed a batch of texts */
18
+ export declare function nlEmbedBatch(texts: string[]): Promise<number[][]>;
19
+ interface NodeForNl {
20
+ id: string;
21
+ name: string;
22
+ label: string;
23
+ filePath: string;
24
+ content: string;
25
+ startLine: number | null;
26
+ description: string;
27
+ }
28
+ interface NlDocument {
29
+ nodeId: string;
30
+ source: string;
31
+ text: string;
32
+ }
33
+ /** Build NL documents from a node */
34
+ export declare function extractNlTexts(node: NodeForNl): NlDocument[];
35
+ /**
36
+ * Build NL embeddings for all eligible nodes in the database.
37
+ * Reads nodes, extracts NL text, embeds with bge-small, writes to nl_embeddings.
38
+ */
39
+ export declare function buildNlEmbeddings(db: Database.Database, onProgress?: (current: number, total: number) => void): Promise<{
40
+ embedded: number;
41
+ skipped: number;
42
+ durationMs: number;
43
+ }>;
44
+ export {};
@@ -0,0 +1,262 @@
1
+ // code-mapper/src/core/embeddings/nl-embedder.ts
2
+ /**
3
+ * @file Natural language embedder using bge-small-en-v1.5.
4
+ *
5
+ * Runs entirely in Node.js via @huggingface/transformers — no Python, no GPU.
6
+ * Embeds human-readable descriptions extracted from code (JSDoc comments,
7
+ * enum values, type patterns, file headers) for conceptual search.
8
+ *
9
+ * 33M params, q8 quantized, 384-dim embeddings, ~6ms/text on CPU.
10
+ */
11
+ // NL embedder — no schema imports needed
12
// NL embedder — no schema imports needed
const MODEL_ID = 'Xenova/bge-small-en-v1.5';
// Lazy-loaded pipeline (module-level singleton shared by all callers)
let extractor = null;
let loadPromise = null;
/**
 * Initialize the NL embedding model (lazy, idempotent).
 *
 * Fix: previously a failed load cached the rejected promise forever, so every
 * later call re-threw the stale error with no chance to retry (e.g. after a
 * transient network failure during the first-run model download). The cached
 * promise is now cleared on rejection so the next call retries.
 */
export async function initNlEmbedder() {
    if (extractor)
        return;
    if (loadPromise)
        return loadPromise;
    loadPromise = (async () => {
        const { pipeline } = await import('@huggingface/transformers');
        extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
    })().catch((err) => {
        // Allow a retry on the next call instead of caching the failure.
        loadPromise = null;
        throw err;
    });
    return loadPromise;
}
/** Check if the NL embedder is ready */
export function isNlEmbedderReady() {
    return extractor !== null;
}
/** Embed a single text; returns a plain number[] (CLS-pooled, normalized). */
export async function nlEmbed(text) {
    if (!extractor)
        await initNlEmbedder();
    const result = await extractor(text, { pooling: 'cls', normalize: true });
    return Array.from(result.data);
}
/** Embed a batch of texts sequentially; returns one vector per input text. */
export async function nlEmbedBatch(texts) {
    if (!extractor)
        await initNlEmbedder();
    const out = [];
    for (const text of texts) {
        const result = await extractor(text, { pooling: 'cls', normalize: true });
        out.push(Array.from(result.data));
    }
    return out;
}
50
/**
 * Extract the leading comment text from a code snippet (up to 10 lines).
 * Understands JSDoc / block comments, `//` line comments, and `#` comments
 * (shebang lines excluded). Lines that start with a JSDoc tag (`@...`) are
 * dropped. Collection stops at the first non-comment line once any text has
 * been gathered.
 */
function extractFullComment(content) {
    if (!content)
        return '';
    const collected = [];
    let insideBlock = false;
    for (const raw of content.split('\n')) {
        const line = raw.trim();
        // Opening of a /* ... */ or /** ... */ block (possibly single-line).
        if (line.startsWith('/**') || line.startsWith('/*')) {
            insideBlock = !line.includes('*/');
            const body = line.replace(/^\/\*\*?/, '').replace(/\*\/$/, '').trim();
            if (body && !body.startsWith('@'))
                collected.push(body);
            continue;
        }
        if (insideBlock) {
            if (line.includes('*/')) {
                insideBlock = false;
                continue;
            }
            const body = line.replace(/^\*\s?/, '').trim();
            if (body && !body.startsWith('@'))
                collected.push(body);
            if (collected.length >= 10)
                break;
            continue;
        }
        if (line.startsWith('//')) {
            const body = line.slice(2).trim();
            if (body)
                collected.push(body);
            if (collected.length >= 10)
                break;
            continue;
        }
        if (line.startsWith('#') && !line.startsWith('#!')) {
            const body = line.slice(1).trim();
            if (body)
                collected.push(body);
            if (collected.length >= 10)
                break;
            continue;
        }
        // First non-comment line after any collected text ends the comment.
        if (collected.length > 0)
            break;
    }
    return collected.join(' ');
}
101
/** Expand camelCase/PascalCase/snake_case/kebab-case into lowercase words. */
function expandIdentifier(name) {
    const spaced = name
        .replace(/([a-z])([A-Z])/g, '$1 $2') // fooBar -> foo Bar
        .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2') // HTTPServer -> HTTP Server
        .replace(/[_\-]/g, ' '); // snake_case / kebab-case separators
    return spaced.toLowerCase();
}
109
/** Extract enum members or `as const` array values as natural-language text. */
function extractEnumValues(content) {
    // Pattern 1: ['value1', 'value2', ...] as const
    const constArray = content.match(/\[([^\]]+)\]\s*as\s*const/);
    if (constArray?.[1]) {
        const items = constArray[1]
            .replace(/['"]/g, '')
            .split(',')
            .map((v) => v.trim())
            .filter(Boolean);
        if (items.length > 0)
            return items.join(', ');
    }
    // Pattern 2: enum Name { Member1, Member2 = 3 } — initializers stripped
    const enumBody = content.match(/enum\s+\w+\s*\{([^}]+)\}/);
    if (enumBody?.[1]) {
        const members = enumBody[1]
            .split(',')
            .map((m) => m.trim().split('=')[0].trim())
            .filter(Boolean);
        if (members.length > 0)
            return members.map((m) => expandIdentifier(m)).join(', ');
    }
    return '';
}
127
/** Extract parameter names from the first parenthesized signature as words. */
function extractParamNames(content) {
    const sig = content.match(/\(([^)]*)\)/);
    if (!sig?.[1])
        return '';
    const names = [];
    for (const piece of sig[1].split(',')) {
        // Strip the type annotation (after ':') and default value (after '=').
        const bare = piece.trim().split(':')[0].split('=')[0].trim();
        if (bare)
            names.push(expandIdentifier(bare));
    }
    return names.join(', ');
}
138
/**
 * Build NL documents for a node: a comment-based doc when a leading comment
 * exists, otherwise a name/params/location doc (avoids duplication), plus an
 * enum-values doc for Enum/Const/TypeAlias nodes. Each doc carries a `source`
 * tag ('comment' | 'name' | 'enum') used later for match_reason reporting.
 */
export function extractNlTexts(node) {
    const out = [];
    const words = expandIdentifier(node.name);
    const dir = node.filePath.split('/').slice(-3, -1).join('/');
    const comment = extractFullComment(node.content);
    if (comment) {
        // 1. Comment-based NL text (primary signal)
        out.push({
            nodeId: node.id,
            source: 'comment',
            text: `${words}: ${comment}. File: ${dir}`,
        });
    }
    else {
        // 2. Fall back to name + params + location when no comment exists
        const params = extractParamNames(node.content);
        const pieces = [words];
        if (params)
            pieces.push(`Parameters: ${params}`);
        if (dir)
            pieces.push(`in ${dir}`);
        out.push({
            nodeId: node.id,
            source: 'name',
            text: pieces.join('. '),
        });
    }
    // 3. Enum/const/type-alias values, when extractable
    if (['Enum', 'Const', 'TypeAlias'].includes(node.label)) {
        const values = extractEnumValues(node.content);
        if (values) {
            out.push({
                nodeId: node.id,
                source: 'enum',
                text: `${words}: ${values}`,
            });
        }
    }
    return out;
}
181
+ // ---------------------------------------------------------------------------
182
+ // Full NL embedding pipeline
183
+ // ---------------------------------------------------------------------------
184
/** Hash text for skip detection (content-addressed embedding cache). */
import { createHash } from 'crypto';
function md5(text) {
    return createHash('md5').update(text).digest('hex');
}
/**
 * Build NL embeddings for all eligible nodes in the database.
 * Reads nodes, extracts NL text, embeds with bge-small, writes to nl_embeddings.
 *
 * Fixes over the previous revision:
 * - Cached (unchanged) docs are carried over by re-inserting their stored
 *   blobs. Previously the blanket DELETE dropped every row while only the
 *   *new* docs were re-inserted, so rows counted as "skipped" were silently
 *   lost from the table.
 * - The DELETE now runs inside the transaction, so a mid-run embedding
 *   failure rolls back to the previous table contents instead of leaving the
 *   table empty (the old DELETE executed before BEGIN and survived ROLLBACK).
 *
 * @param db         better-sqlite3 handle
 * @param onProgress optional (current, total) callback, reported per batch
 * @returns counts of newly embedded and cache-reused docs, plus wall time
 */
export async function buildNlEmbeddings(db, onProgress) {
    const t0 = Date.now();
    await initNlEmbedder();
    // Query all nodes (not just EMBEDDABLE_LABELS — we want enums, consts, types too)
    const labels = ['Function', 'Class', 'Method', 'Interface', 'Const', 'Enum', 'TypeAlias', 'Namespace', 'Module', 'Struct'];
    const placeholders = labels.map(() => '?').join(',');
    const rows = db.prepare(`SELECT id, name, label, filePath, content, startLine, description FROM nodes WHERE label IN (${placeholders})`).all(...labels);
    // Skip test files
    const testPatterns = ['/test/', '/tests/', '/spec/', '/fixtures/', '/__tests__/', '/__mocks__/', '.test.', '.spec.', '_test.', '_spec.'];
    const filteredRows = rows.filter(r => !testPatterns.some(p => r.filePath.includes(p)));
    // Extract NL documents
    const allDocs = [];
    for (const row of filteredRows) {
        allDocs.push(...extractNlTexts(row));
    }
    if (allDocs.length === 0) {
        return { embedded: 0, skipped: 0, durationMs: Date.now() - t0 };
    }
    // Load existing embeddings keyed by nodeId:textHash so unchanged docs can
    // be carried over without re-running the model.
    const cachedBlobs = new Map();
    try {
        const prior = db.prepare('SELECT nodeId, textHash, embedding FROM nl_embeddings WHERE textHash IS NOT NULL').all();
        for (const r of prior)
            cachedBlobs.set(`${r.nodeId}:${r.textHash}`, r.embedding);
    }
    catch { /* table might not exist yet */ }
    // Partition into cached (reuse stored blob) and new (embed now)
    const toEmbed = [];
    const toReuse = [];
    for (const doc of allDocs) {
        const hash = md5(doc.text);
        const blob = cachedBlobs.get(`${doc.nodeId}:${hash}`);
        if (blob)
            toReuse.push({ ...doc, hash, blob });
        else
            toEmbed.push({ ...doc, hash });
    }
    const skipped = toReuse.length;
    if (toEmbed.length === 0) {
        // Nothing changed — leave the table untouched.
        return { embedded: 0, skipped, durationMs: Date.now() - t0 };
    }
    const BATCH = 100;
    const insertStmt = db.prepare('INSERT INTO nl_embeddings (nodeId, embedding, textHash, source, text) VALUES (?, ?, ?, ?, ?)');
    let embedded = 0;
    // Rebuild atomically: the DELETE participates in the transaction so a
    // failure anywhere below restores the previous contents on ROLLBACK.
    db.exec('BEGIN');
    try {
        db.prepare('DELETE FROM nl_embeddings').run();
        // Re-insert cached docs from their stored blobs (no model call needed).
        for (const doc of toReuse) {
            insertStmt.run(doc.nodeId, doc.blob, doc.hash, doc.source, doc.text);
        }
        // Embed and insert the new/changed docs in batches.
        for (let i = 0; i < toEmbed.length; i += BATCH) {
            const batch = toEmbed.slice(i, i + BATCH);
            const vecs = await nlEmbedBatch(batch.map(d => d.text));
            for (let j = 0; j < batch.length; j++) {
                const blob = Buffer.from(new Float32Array(vecs[j]).buffer);
                insertStmt.run(batch[j].nodeId, blob, batch[j].hash, batch[j].source, batch[j].text);
                embedded++;
            }
            onProgress?.(Math.min(i + BATCH, toEmbed.length), toEmbed.length);
        }
        db.exec('COMMIT');
    }
    catch (err) {
        db.exec('ROLLBACK');
        throw err;
    }
    return { embedded, skipped, durationMs: Date.now() - t0 };
}
@@ -42,6 +42,8 @@ export declare class LocalBackend {
42
42
  private tsgoServices;
43
43
  /** Per-repo in-memory embedding cache: nodeId → Float32Array (256-dim) */
44
44
  private embeddingCaches;
45
+ /** Per-repo in-memory NL embedding cache: includes source text for match_reason */
46
+ private nlEmbeddingCaches;
45
47
  /** Get (or lazily start) a tsgo LSP service for a repo. Returns null if unavailable. */
46
48
  private getTsgo;
47
49
  /** Get (or lazily open) the SQLite database for a repo. */
@@ -50,6 +52,10 @@ export declare class LocalBackend {
50
52
  private loadEmbeddingCache;
51
53
  /** Search embeddings in memory — O(N) dot products, no disk I/O */
52
54
  private searchEmbeddingsInMemory;
55
+ /** Load NL embeddings into memory for fast conceptual search */
56
+ private loadNlEmbeddingCache;
57
+ /** Search NL embeddings in memory, returns match_reason text */
58
+ private searchNlEmbeddingsInMemory;
53
59
  /** Hard ceiling — beyond this, incremental is unreliable, warn prominently */
54
60
  private static readonly MAX_INCREMENTAL_FILES;
55
61
  /** Start file system watcher for a repo to detect source changes */
@@ -131,6 +137,11 @@ export declare class LocalBackend {
131
137
  * Semantic vector search helper
132
138
  */
133
139
  private semanticSearch;
140
+ /**
141
+ * NL semantic search: embed query with bge-small, search NL descriptions.
142
+ * Returns match_reason (the NL text that matched) for agent transparency.
143
+ */
144
+ private nlSemanticSearch;
134
145
  /**
135
146
  * Refs-based search: find symbols referenced in files that contain the query identifiers.
136
147
  * Bridges the gap between graph edges (incomplete) and grep (complete for exact names).
@@ -57,6 +57,8 @@ export class LocalBackend {
57
57
  tsgoServices = new Map();
58
58
  /** Per-repo in-memory embedding cache: nodeId → Float32Array (256-dim) */
59
59
  embeddingCaches = new Map();
60
+ /** Per-repo in-memory NL embedding cache: includes source text for match_reason */
61
+ nlEmbeddingCaches = new Map();
60
62
  /** Get (or lazily start) a tsgo LSP service for a repo. Returns null if unavailable. */
61
63
  async getTsgo(repo) {
62
64
  const existing = this.tsgoServices.get(repo.id);
@@ -140,6 +142,70 @@ export class LocalBackend {
140
142
  results.sort((a, b) => a.distance - b.distance);
141
143
  return results.slice(0, limit);
142
144
  }
145
+ /** Load NL embeddings into memory for fast conceptual search */
146
+ loadNlEmbeddingCache(repoId) {
147
+ try {
148
+ const db = this.getDb(repoId);
149
+ let rows;
150
+ try {
151
+ rows = db.prepare('SELECT nodeId, embedding, text FROM nl_embeddings').all();
152
+ }
153
+ catch {
154
+ return;
155
+ } // table might not exist
156
+ if (rows.length === 0) {
157
+ this.nlEmbeddingCaches.delete(repoId);
158
+ return;
159
+ }
160
+ const dims = rows[0].embedding.byteLength / 4;
161
+ const nodeIds = [];
162
+ const texts = [];
163
+ const matrix = new Float32Array(rows.length * dims);
164
+ const norms = new Float32Array(rows.length);
165
+ for (let i = 0; i < rows.length; i++) {
166
+ const row = rows[i];
167
+ nodeIds.push(row.nodeId);
168
+ texts.push(row.text);
169
+ const vec = new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.byteLength / 4);
170
+ matrix.set(vec, i * dims);
171
+ let norm = 0;
172
+ for (let d = 0; d < dims; d++)
173
+ norm += vec[d] * vec[d];
174
+ norms[i] = Math.sqrt(norm);
175
+ }
176
+ this.nlEmbeddingCaches.set(repoId, { nodeIds, texts, matrix, norms });
177
+ }
178
+ catch { /* NL embeddings not available */ }
179
+ }
180
+ /** Search NL embeddings in memory, returns match_reason text */
181
+ searchNlEmbeddingsInMemory(repoId, queryVec, limit = 10, maxDistance = 0.5) {
182
+ const cache = this.nlEmbeddingCaches.get(repoId);
183
+ if (!cache || cache.nodeIds.length === 0)
184
+ return [];
185
+ const dims = queryVec.length;
186
+ const results = [];
187
+ let qNorm = 0;
188
+ for (let d = 0; d < dims; d++)
189
+ qNorm += queryVec[d] * queryVec[d];
190
+ qNorm = Math.sqrt(qNorm);
191
+ if (qNorm === 0)
192
+ return [];
193
+ const cacheDims = cache.matrix.length / cache.nodeIds.length;
194
+ for (let i = 0; i < cache.nodeIds.length; i++) {
195
+ const offset = i * cacheDims;
196
+ let dot = 0;
197
+ const minDims = Math.min(dims, cacheDims);
198
+ for (let d = 0; d < minDims; d++)
199
+ dot += queryVec[d] * cache.matrix[offset + d];
200
+ const similarity = dot / (qNorm * cache.norms[i]);
201
+ const distance = 1 - similarity;
202
+ if (distance < maxDistance) {
203
+ results.push({ nodeId: cache.nodeIds[i], distance, text: cache.texts[i] });
204
+ }
205
+ }
206
+ results.sort((a, b) => a.distance - b.distance);
207
+ return results.slice(0, limit);
208
+ }
143
209
  /** Hard ceiling — beyond this, incremental is unreliable, warn prominently */
144
210
  static MAX_INCREMENTAL_FILES = 200;
145
211
  /** Start file system watcher for a repo to detect source changes */
@@ -273,11 +339,12 @@ export class LocalBackend {
273
339
  for (const [id, handle] of this.repos) {
274
340
  this.startWatcher(id, handle);
275
341
  this.seedWatcherFromGit(id, handle);
276
- // Load embedding cache into memory for fast vector search
342
+ // Load embedding caches into memory for fast vector search
277
343
  if ((handle.stats?.embeddings ?? 0) > 0) {
278
344
  this.loadEmbeddingCache(id);
279
345
  anyEmbeddings = true;
280
346
  }
347
+ this.loadNlEmbeddingCache(id); // NL cache loaded regardless (cheap, may not exist)
281
348
  }
282
349
  // Pre-warm MLX embedder so first query has zero model-load latency
283
350
  if (anyEmbeddings) {
@@ -592,6 +659,9 @@ export class LocalBackend {
592
659
  const end = d.endLine || 0;
593
660
  const isSmall = end > 0 && start > 0 && (end - start) < 10;
594
661
  lines.push(` ${sig} — ${d.type} @ ${this.shortPath(d.filePath)}:${start || '?'}${mod}`);
662
+ if (d.match_reason) {
663
+ lines.push(` _"${d.match_reason}"_`);
664
+ }
595
665
  if (isSmall && d.content) {
596
666
  const src = String(d.content).trim();
597
667
  if (src.length < 500) {
@@ -671,6 +741,9 @@ export class LocalBackend {
671
741
  renderFlows(2);
672
742
  }
673
743
  }
744
+ lines.push('');
745
+ lines.push('---');
746
+ lines.push('_Note: Results ranked by BM25 keyword + semantic + refs + file-content signals. Natural language queries may miss code using different terminology. If results seem incomplete, try specific identifiers or `grep -rn "keyword" --include="*.ts"` for exhaustive search._');
674
747
  return lines.join('\n');
675
748
  }
676
749
  formatContextAsText(result) {
@@ -776,6 +849,10 @@ export class LocalBackend {
776
849
  lines.push(` ${p.name} (step ${p.step_index}/${p.step_count})`);
777
850
  }
778
851
  }
852
+ // Guidance footer for agents
853
+ lines.push('');
854
+ lines.push('---');
855
+ lines.push('_Note: Callers are from graph edges + refs index. For widely-used symbols, verify completeness with `grep -rn "symbolName(" --include="*.ts"`. Outgoing calls may miss dynamic dispatch or reflection._');
779
856
  return lines.join('\n');
780
857
  }
781
858
  formatImpactAsText(result) {
@@ -822,6 +899,9 @@ export class LocalBackend {
822
899
  lines.push('');
823
900
  lines.push(`### Modules: ${mods.map((m) => `${m.name} (${m.hits} ${m.impact})`).join(' | ')}`);
824
901
  }
902
+ lines.push('');
903
+ lines.push('---');
904
+ lines.push('_Note: d=1 callers include graph edges + refs index. Indirect deps through dynamic dispatch, config, or type-only references may not appear. For critical changes, verify d=1 with `grep -rn "symbolName" --include="*.ts"`._');
825
905
  return lines.join('\n');
826
906
  }
827
907
  formatDetectChangesAsText(result) {
@@ -1002,13 +1082,14 @@ export class LocalBackend {
1002
1082
  // Step 1: Four-signal search in parallel
1003
1083
  // BM25 uses expanded query; semantic uses enriched query; refs + file_words use raw query
1004
1084
  const searchLimit = processLimit * maxSymbolsPerProcess;
1005
- const [bm25Results, semanticResults, refsResults, fileWordsResults] = await Promise.all([
1085
+ const [bm25Results, semanticResults, nlSemanticResults, refsResults, fileWordsResults] = await Promise.all([
1006
1086
  this.bm25Search(repo, expandedSearchQuery, searchLimit),
1007
1087
  this.semanticSearch(repo, semanticQuery, searchLimit),
1088
+ this.nlSemanticSearch(repo, searchQuery, searchLimit),
1008
1089
  Promise.resolve(this.refsSearch(repo, searchQuery, searchLimit)),
1009
1090
  Promise.resolve(this.fileWordsSearch(repo, searchQuery, searchLimit)),
1010
1091
  ]);
1011
- // Step 2: Weighted RRF merge (4 signals)
1092
+ // Step 2: Weighted RRF merge (5 signals)
1012
1093
  const bm25ForRRF = bm25Results.map((r, i) => ({
1013
1094
  nodeId: String(r.nodeId ?? ''),
1014
1095
  name: String(r.name ?? ''),
@@ -1038,7 +1119,26 @@ export class LocalBackend {
1038
1119
  ...(r.startLine != null ? { startLine: r.startLine } : {}),
1039
1120
  ...(r.endLine != null ? { endLine: r.endLine } : {}),
1040
1121
  }));
1041
- let rrfMerged = mergeWithRRF(bm25ForRRF, semanticForRRF, { limit: searchLimit });
1122
+ // NL semantic results get high weight — proven 100% recall on conceptual queries
1123
+ const nlForRRF = nlSemanticResults.map((r) => ({
1124
+ nodeId: String(r.nodeId ?? ''), name: String(r.name ?? ''), label: String(r.type ?? ''),
1125
+ filePath: String(r.filePath ?? ''), distance: Number(r.distance ?? 1),
1126
+ ...(r.startLine != null ? { startLine: r.startLine } : {}),
1127
+ ...(r.endLine != null ? { endLine: r.endLine } : {}),
1128
+ }));
1129
+ // Merge code + NL semantic into one semantic list (best of both worlds)
1130
+ const combinedSemantic = [...semanticForRRF, ...nlForRRF]
1131
+ .sort((a, b) => a.distance - b.distance)
1132
+ .filter((r, i, arr) => arr.findIndex(x => x.nodeId === r.nodeId) === i) // dedupe by nodeId
1133
+ .slice(0, searchLimit);
1134
+ let rrfMerged = mergeWithRRF(bm25ForRRF, combinedSemantic, { limit: searchLimit });
1135
+ // Store NL match reasons for display
1136
+ const nlMatchReasons = new Map();
1137
+ for (const r of nlSemanticResults) {
1138
+ if (r.match_reason && !nlMatchReasons.has(r.nodeId)) {
1139
+ nlMatchReasons.set(r.nodeId, r.match_reason);
1140
+ }
1141
+ }
1042
1142
  // Merge refs + fileWords into the RRF results (lower weight)
1043
1143
  if (refsForRRF.length > 0 || fileWordsForRRF.length > 0) {
1044
1144
  const supplemental = mergeWithRRF(refsForRRF, fileWordsForRRF.map((r) => ({
@@ -1084,13 +1184,22 @@ export class LocalBackend {
1084
1184
  if (!searchDataMap.has(key))
1085
1185
  searchDataMap.set(key, r);
1086
1186
  }
1087
- let merged = rrfMerged.map(rrf => ({
1088
- score: rrf.score,
1089
- data: searchDataMap.get(rrf.nodeId ?? '') ?? searchDataMap.get(rrf.filePath) ?? {
1187
+ for (const r of nlSemanticResults) {
1188
+ const key = r.nodeId || r.filePath;
1189
+ if (!searchDataMap.has(key))
1190
+ searchDataMap.set(key, r);
1191
+ }
1192
+ let merged = rrfMerged.map(rrf => {
1193
+ const data = searchDataMap.get(rrf.nodeId ?? '') ?? searchDataMap.get(rrf.filePath) ?? {
1090
1194
  name: rrf.name ?? rrf.filePath.split('/').pop(), type: rrf.label ?? 'File',
1091
1195
  filePath: rrf.filePath, nodeId: rrf.nodeId,
1092
- },
1093
- }));
1196
+ };
1197
+ // Attach NL match reason if available
1198
+ const reason = nlMatchReasons.get(rrf.nodeId ?? '') ?? nlMatchReasons.get(data.nodeId ?? '');
1199
+ if (reason)
1200
+ data.match_reason = reason;
1201
+ return { score: rrf.score, data };
1202
+ });
1094
1203
  // Filter noise: remove test files, config files, docs from results by default
1095
1204
  merged = merged.filter(item => {
1096
1205
  const fp = String(item.data.filePath ?? '').toLowerCase();
@@ -1511,6 +1620,48 @@ export class LocalBackend {
1511
1620
  return [];
1512
1621
  }
1513
1622
  }
1623
+ /**
1624
+ * NL semantic search: embed query with bge-small, search NL descriptions.
1625
+ * Returns match_reason (the NL text that matched) for agent transparency.
1626
+ */
1627
+ async nlSemanticSearch(repo, query, limit) {
1628
+ try {
1629
+ const cache = this.nlEmbeddingCaches.get(repo.id);
1630
+ if (!cache || cache.nodeIds.length === 0)
1631
+ return [];
1632
+ const { nlEmbed } = await import('../../core/embeddings/nl-embedder.js');
1633
+ const queryVec = await nlEmbed(query);
1634
+ const vecResults = this.searchNlEmbeddingsInMemory(repo.id, queryVec, limit, 0.5);
1635
+ if (vecResults.length === 0)
1636
+ return [];
1637
+ // Fetch node metadata
1638
+ const metaDb = this.getDb(repo.id);
1639
+ const seen = new Set();
1640
+ const results = [];
1641
+ for (const r of vecResults) {
1642
+ if (seen.has(r.nodeId))
1643
+ continue;
1644
+ seen.add(r.nodeId);
1645
+ const node = getNode(metaDb, toNodeId(r.nodeId));
1646
+ if (node) {
1647
+ results.push({
1648
+ nodeId: r.nodeId,
1649
+ name: node.name,
1650
+ type: node.label,
1651
+ filePath: node.filePath,
1652
+ distance: r.distance,
1653
+ startLine: node.startLine,
1654
+ endLine: node.endLine,
1655
+ match_reason: r.text, // The NL text that matched — shown to agents
1656
+ });
1657
+ }
1658
+ }
1659
+ return results;
1660
+ }
1661
+ catch {
1662
+ return [];
1663
+ }
1664
+ }
1514
1665
  /**
1515
1666
  * Refs-based search: find symbols referenced in files that contain the query identifiers.
1516
1667
  * Bridges the gap between graph edges (incomplete) and grep (complete for exact names).
@@ -2033,24 +2184,30 @@ export class LocalBackend {
2033
2184
  }
2034
2185
  // Supplement callers from refs table (catches callers the graph missed)
2035
2186
  try {
2036
- const refCallers = findRefsBySymbol(db, sym.name, 100);
2187
+ const refCallers = findRefsBySymbol(db, sym.name, 200);
2037
2188
  const knownFiles = new Set(incomingRows.map(r => r.filePath));
2189
+ let refsAdded = 0;
2038
2190
  for (const ref of refCallers) {
2039
2191
  if (ref.filePath === sym.filePath)
2040
2192
  continue; // skip self-file
2041
2193
  if (knownFiles.has(ref.filePath))
2042
- continue; // already known
2043
- knownFiles.add(ref.filePath);
2194
+ continue; // already have a caller from this file
2044
2195
  const enclosing = this.findNodeAtPosition(db, ref.filePath, ref.line);
2045
- if (enclosing) {
2046
- incomingRows.push({
2047
- relType: 'CALLS', uid: '', name: enclosing.name, filePath: ref.filePath,
2048
- kind: enclosing.label, startLine: ref.line, reason: 'refs-index',
2049
- });
2050
- }
2196
+ if (!enclosing)
2197
+ continue; // no symbol at this line (e.g. import statement)
2198
+ knownFiles.add(ref.filePath); // mark AFTER finding a valid node
2199
+ incomingRows.push({
2200
+ relType: 'CALLS', uid: '', name: enclosing.name, filePath: ref.filePath,
2201
+ kind: enclosing.label, startLine: ref.line, reason: 'refs-index',
2202
+ });
2203
+ refsAdded++;
2051
2204
  }
2205
+ if (process.env['CODE_MAPPER_VERBOSE'])
2206
+ console.error(`Code Mapper: refs supplement for '${sym.name}': ${refsAdded} added from ${refCallers.length} refs`);
2207
+ }
2208
+ catch (err) {
2209
+ console.error(`Code Mapper: refs supplement failed: ${err instanceof Error ? err.message : err}`);
2052
2210
  }
2053
- catch { /* refs table may not exist yet */ }
2054
2211
  // Outgoing refs — exclude generic method names that produce false positives at low confidence
2055
2212
  const GENERIC_NAMES_EXCLUDE = new Set(['has', 'get', 'set', 'add', 'remove', 'delete', 'close', 'stop', 'clear', 'reset', 'toString', 'valueOf', 'push', 'pop', 'entries', 'keys', 'values']);
2056
2213
  let outgoingRows = [];
@@ -2388,10 +2545,44 @@ export class LocalBackend {
2388
2545
  logQueryError('rename:read-ref', e);
2389
2546
  }
2390
2547
  }
2391
- // Step 3: Text search for refs the graph might have missed
2392
- let astSearchEdits = 0;
2548
+ // Step 3a: Refs table lookup (instant, covers most cases the graph missed)
2549
+ let refsEdits = 0;
2393
2550
  const graphFiles = new Set([sym.filePath, ...allIncoming.map(r => r.filePath)].filter(Boolean));
2394
- // Simple text search across the repo for the old name (in files not already covered by graph)
2551
+ try {
2552
+ const refsDb = this.getDb(repo.id);
2553
+ const refsForName = findRefsBySymbol(refsDb, oldName, 500);
2554
+ for (const ref of refsForName) {
2555
+ const normalizedFile = ref.filePath.replace(/\\/g, '/');
2556
+ if (graphFiles.has(normalizedFile))
2557
+ continue;
2558
+ graphFiles.add(normalizedFile); // mark so ripgrep doesn't re-process
2559
+ try {
2560
+ const content = await fs.readFile(assertSafePath(normalizedFile), 'utf-8');
2561
+ const lines = content.split('\n');
2562
+ const regex = new RegExp(`\\b${oldName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'g');
2563
+ for (let i = 0; i < lines.length; i++) {
2564
+ const searchLine = lines[i];
2565
+ if (!searchLine)
2566
+ continue;
2567
+ regex.lastIndex = 0;
2568
+ if (regex.test(searchLine)) {
2569
+ regex.lastIndex = 0;
2570
+ addEdit(normalizedFile, i + 1, searchLine.trim(), searchLine.replace(regex, new_name).trim(), 'refs');
2571
+ refsEdits++;
2572
+ }
2573
+ }
2574
+ }
2575
+ catch (e) {
2576
+ logQueryError('rename:refs-read', e);
2577
+ }
2578
+ }
2579
+ }
2580
+ catch (e) {
2581
+ logQueryError('rename:refs', e);
2582
+ }
2583
+ // Step 3b: Ripgrep text search for anything refs + graph missed
2584
+ let astSearchEdits = 0;
2585
+ // Simple text search across the repo for the old name (in files not already covered)
2395
2586
  try {
2396
2587
  const { execFileSync } = await import('child_process');
2397
2588
  const rgArgs = [
@@ -2456,9 +2647,11 @@ export class LocalBackend {
2456
2647
  files_affected: allChanges.length,
2457
2648
  total_edits: totalEdits,
2458
2649
  graph_edits: graphEdits,
2650
+ refs_edits: refsEdits,
2459
2651
  text_search_edits: astSearchEdits,
2460
2652
  changes: allChanges,
2461
2653
  applied: !dry_run,
2654
+ _note: 'Rename uses graph edges + refs index + ripgrep. Always review changes before applying. String literals, comments, and dynamic references (e.g. obj[methodName]) need manual review.',
2462
2655
  };
2463
2656
  }
2464
2657
  async impact(repo, params) {
@@ -2535,6 +2728,32 @@ export class LocalBackend {
2535
2728
  }
2536
2729
  }
2537
2730
  }
2731
+ // Supplement d=1 callers from refs table (catches callers the graph missed)
2732
+ if (direction === 'upstream') {
2733
+ try {
2734
+ const targetName = sym.name;
2735
+ const d1FromRefs = findRefsBySymbol(db, targetName, 200);
2736
+ for (const ref of d1FromRefs) {
2737
+ if (ref.filePath === sym.filePath)
2738
+ continue;
2739
+ const refNode = this.findNodeAtPosition(db, ref.filePath, ref.line);
2740
+ if (refNode && !seenIds.has(refNode.name + ':' + ref.filePath)) {
2741
+ // Find the actual node ID for this position
2742
+ const fullNodes = findNodesByFile(db, ref.filePath);
2743
+ const match = fullNodes.find(n => n.name === refNode.name && n.startLine != null && n.startLine <= ref.line + 1 && (n.endLine ?? 9999) >= ref.line + 1);
2744
+ if (match && !seenIds.has(match.id) && !startIds.some(s => s === match.id)) {
2745
+ seenIds.add(match.id);
2746
+ mergedNodes.push({
2747
+ id: match.id, name: match.name, label: match.label,
2748
+ filePath: match.filePath, depth: 1,
2749
+ relationType: 'CALLS', confidence: 0.8,
2750
+ });
2751
+ }
2752
+ }
2753
+ }
2754
+ }
2755
+ catch { /* refs table may not exist */ }
2756
+ }
2538
2757
  const impacted = mergedNodes;
2539
2758
  const truncated = anyTruncated;
2540
2759
  const grouped = {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@zuvia-software-solutions/code-mapper",
3
- "version": "2.4.0",
3
+ "version": "2.4.1",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": "Abhigyan Patwari",
6
6
  "license": "PolyForm-Noncommercial-1.0.0",