@zuvia-software-solutions/code-mapper 2.3.12 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@
2
2
  export interface AnalyzeOptions {
3
3
  force?: boolean;
4
4
  embeddings?: boolean;
5
+ nlEmbeddings?: boolean;
5
6
  tsgo?: boolean;
6
7
  verbose?: boolean;
7
8
  }
@@ -269,10 +269,68 @@ export const analyzeCommand = async (inputPath, options) => {
269
269
  recordPhase('search-text');
270
270
  updateBar(84, 'Building search index...');
271
271
  populateSearchText(db);
272
+ // Phase 2.7: Build refs table (identifier occurrence index)
273
+ recordPhase('refs');
274
+ updateBar(85, 'Building refs index...');
275
+ {
276
+ const { clearRefs, insertRefsBatch, clearFileWords, upsertFileWords } = await import('../core/db/adapter.js');
277
+ const fsRef = await import('fs/promises');
278
+ clearRefs(db);
279
+ clearFileWords(db);
280
+ // Scan all source files for identifier occurrences
281
+ const STOP_WORDS = new Set(['the', 'and', 'for', 'from', 'with', 'this', 'that', 'have', 'has', 'not', 'are', 'was', 'were', 'been', 'being', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'does', 'did', 'let', 'var', 'const', 'new', 'return', 'function', 'class', 'import', 'export', 'default', 'void', 'null', 'undefined', 'true', 'false', 'else', 'case', 'break', 'continue', 'while', 'throw', 'catch', 'try', 'finally', 'async', 'await', 'yield', 'typeof', 'instanceof', 'delete', 'switch', 'interface', 'type', 'enum', 'extends', 'implements', 'static', 'private', 'public', 'protected', 'abstract', 'readonly', 'override', 'declare', 'module', 'namespace', 'require', 'string', 'number', 'boolean', 'object', 'any', 'never', 'unknown', 'symbol']);
282
+ const SRC_EXTENSIONS = new Set(['.ts', '.tsx', '.js', '.jsx', '.py', '.go', '.rs', '.java', '.c', '.h', '.cpp', '.hpp', '.cs', '.rb', '.php', '.kt', '.swift', '.mts', '.mjs', '.cts', '.cjs']);
283
+ const identRegex = /\b[a-zA-Z_]\w{2,}\b/g;
284
+ const wordRegex = /\b[a-zA-Z]\w{2,}\b/g;
285
+ // Get all file paths from the nodes table
286
+ const fileRows = db.prepare("SELECT DISTINCT filePath FROM nodes WHERE label = 'File'").all();
287
+ let refsBuilt = 0;
288
+ for (const { filePath } of fileRows) {
289
+ const ext = path.extname(filePath).toLowerCase();
290
+ if (!SRC_EXTENSIONS.has(ext))
291
+ continue;
292
+ let content;
293
+ try {
294
+ content = await fsRef.readFile(path.resolve(repoPath, filePath), 'utf-8');
295
+ }
296
+ catch {
297
+ continue;
298
+ }
299
+ // Build refs (identifier occurrences — skip language keywords)
300
+ const refs = [];
301
+ const lines = content.split('\n');
302
+ for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
303
+ let match;
304
+ identRegex.lastIndex = 0;
305
+ while ((match = identRegex.exec(lines[lineIdx])) !== null) {
306
+ if (!STOP_WORDS.has(match[0].toLowerCase())) {
307
+ refs.push({ symbol: match[0], filePath, line: lineIdx });
308
+ }
309
+ }
310
+ }
311
+ if (refs.length > 0)
312
+ insertRefsBatch(db, refs);
313
+ // Build file_words (conceptual search)
314
+ const wordSet = new Set();
315
+ let wMatch;
316
+ wordRegex.lastIndex = 0;
317
+ while ((wMatch = wordRegex.exec(content)) !== null) {
318
+ const w = wMatch[0].toLowerCase();
319
+ if (!STOP_WORDS.has(w))
320
+ wordSet.add(w);
321
+ }
322
+ if (wordSet.size > 0)
323
+ upsertFileWords(db, filePath, [...wordSet].join(' '));
324
+ refsBuilt++;
325
+ if (refsBuilt % 500 === 0) {
326
+ updateBar(85, `Building refs index... (${refsBuilt}/${fileRows.length})`);
327
+ }
328
+ }
329
+ }
272
330
  // Phase 3: FTS (85-90%)
273
331
  // FTS5 is auto-created by schema triggers — no manual index creation needed
274
332
  recordPhase('fts');
275
- updateBar(85, 'Search indexes ready');
333
+ updateBar(87, 'Search indexes ready');
276
334
  // Phase 3.5: Re-insert cached embeddings
277
335
  recordPhase('restore-embeddings');
278
336
  if (cachedEmbeddings.length > 0) {
@@ -370,6 +428,22 @@ export const analyzeCommand = async (inputPath, options) => {
370
428
  // Reopen DB after Python is done
371
429
  db = openDb(dbPath);
372
430
  }
431
+ // Phase 4b: NL Embeddings (bge-small, CPU, Node.js)
432
+ if (options?.nlEmbeddings) {
433
+ recordPhase('nl-embeddings');
434
+ updateBar(95, 'Generating NL embeddings (bge-small)...');
435
+ const { buildNlEmbeddings } = await import('../core/embeddings/nl-embedder.js');
436
+ try {
437
+ const nlResult = await buildNlEmbeddings(db, (current, total) => {
438
+ const pct = 95 + Math.round((current / Math.max(total, 1)) * 3);
439
+ updateBar(pct, `NL embeddings (${current}/${total})`, 'NL embeddings');
440
+ });
441
+ updateBar(98, `NL embeddings: ${nlResult.embedded} embedded, ${nlResult.skipped} cached (${(nlResult.durationMs / 1000).toFixed(1)}s)`);
442
+ }
443
+ catch (err) {
444
+ console.error(`\n Warning: NL embeddings failed: ${err instanceof Error ? err.message : err}`);
445
+ }
446
+ }
373
447
  // Phase 5: Finalize (98-100%)
374
448
  recordPhase('finalize');
375
449
  updateBar(98, 'Saving metadata...');
package/dist/cli/index.js CHANGED
@@ -22,8 +22,8 @@ program
22
22
  .command('analyze [path]')
23
23
  .description('Index a repository (full analysis)')
24
24
  .option('-f, --force', 'Force full re-index even if up to date')
25
- .option('--embeddings', 'Enable embedding generation for semantic search (on by default)', true)
26
- .option('--no-embeddings', 'Skip embedding generation')
25
+ .option('--embeddings', 'Enable code embedding generation (Jina/MLX, GPU)', false)
26
+ .option('--nl-embeddings', 'Enable NL embedding generation (bge-small, CPU, recommended)', false)
27
27
  .option('--no-tsgo', 'Skip tsgo LSP for call resolution (faster, less accurate)')
28
28
  .option('-v, --verbose', 'Enable verbose ingestion warnings (default: false)')
29
29
  .addHelpText('after', '\nEnvironment variables:\n CODE_MAPPER_NO_GITIGNORE=1 Skip .gitignore parsing (still reads .code-mapperignore)')
@@ -80,11 +80,12 @@ export declare function searchFTS(db: Database.Database, query: string, limit?:
80
80
  filePath: string;
81
81
  score: number;
82
82
  }>;
83
- /** Get node count, edge count, and embedding count. */
83
+ /** Get node count, edge count, embedding count, and refs count. */
84
84
  export declare function getStats(db: Database.Database): {
85
85
  nodes: number;
86
86
  edges: number;
87
87
  embeddings: number;
88
+ refs: number;
88
89
  };
89
90
  /** Batch insert nodes in a single transaction. */
90
91
  export declare function insertNodesBatch(db: Database.Database, nodes: readonly NodeInsert[]): void;
@@ -104,6 +105,48 @@ export declare function insertEmbeddingsBatch(db: Database.Database, items: read
104
105
  }[]): void;
105
106
  /** Get all textHashes from the embeddings table for hash-based skip on re-index */
106
107
  export declare function getEmbeddingHashes(db: Database.Database): Map<string, string>;
108
+ /** Bulk-insert identifier references */
109
+ export declare function insertRefsBatch(db: Database.Database, refs: ReadonlyArray<{
110
+ symbol: string;
111
+ filePath: string;
112
+ line: number;
113
+ }>): void;
114
+ /** Delete all refs for a given file (used by incremental refresh) */
115
+ export declare function deleteRefsByFile(db: Database.Database, filePath: string): void;
116
+ /** Find all files referencing a symbol name */
117
+ export declare function findRefsBySymbol(db: Database.Database, symbol: string, limit?: number): Array<{
118
+ filePath: string;
119
+ line: number;
120
+ }>;
121
+ /** Count total refs in the index */
122
+ export declare function countRefs(db: Database.Database): number;
123
+ /** Delete all refs (used before full rebuild) */
124
+ export declare function clearRefs(db: Database.Database): void;
125
+ /** Insert or replace file-level word index */
126
+ export declare function upsertFileWords(db: Database.Database, filePath: string, words: string): void;
127
+ /** Bulk insert file words in a transaction */
128
+ export declare function insertFileWordsBatch(db: Database.Database, entries: ReadonlyArray<{
129
+ filePath: string;
130
+ words: string;
131
+ }>): void;
132
+ /** Delete file words for a given file */
133
+ export declare function deleteFileWordsByFile(db: Database.Database, filePath: string): void;
134
+ /** Search file_words_fts for conceptual matches, returns file paths ranked by relevance */
135
+ export declare function searchFileWords(db: Database.Database, query: string, limit?: number): Array<{
136
+ filePath: string;
137
+ score: number;
138
+ }>;
139
+ /** Clear all file words (used before full rebuild) */
140
+ export declare function clearFileWords(db: Database.Database): void;
141
+ /** Count NL embeddings in the index */
142
+ export declare function countNlEmbeddings(db: Database.Database): number;
143
+ /** Search NL embeddings via brute-force cosine similarity */
144
+ export declare function searchNlVector(db: Database.Database, queryVec: number[], limit?: number, maxDistance?: number): Array<{
145
+ nodeId: string;
146
+ distance: number;
147
+ source: string;
148
+ text: string;
149
+ }>;
107
150
  /** Escape a string for use in SQL single-quoted literals. */
108
151
  export declare function escapeSql(value: string): string;
109
152
  /** Execute a raw SQL query and return rows. */
@@ -347,12 +347,13 @@ export function searchFTS(db, query, limit = 20) {
347
347
  // ---------------------------------------------------------------------------
348
348
  // Stats
349
349
  // ---------------------------------------------------------------------------
350
- /** Get node count, edge count, and embedding count. */
350
/** Get node count, edge count, embedding count, and refs count. */
export function getStats(db) {
    const stats = {
        nodes: countNodes(db),
        edges: countEdges(db),
        embeddings: countEmbeddings(db),
        refs: countRefs(db),
    };
    return stats;
}
358
359
  // ---------------------------------------------------------------------------
@@ -513,6 +514,126 @@ export function getEmbeddingHashes(db) {
513
514
  return map;
514
515
  }
515
516
  // ---------------------------------------------------------------------------
517
+ // Refs (identifier occurrence index)
518
+ // ---------------------------------------------------------------------------
519
+ /** Bulk-insert identifier references */
520
+ export function insertRefsBatch(db, refs) {
521
+ if (refs.length === 0)
522
+ return;
523
+ const stmt = db.prepare('INSERT INTO refs (symbol, filePath, line) VALUES (?, ?, ?)');
524
+ const tx = db.transaction(() => {
525
+ for (const ref of refs) {
526
+ stmt.run(ref.symbol, ref.filePath, ref.line);
527
+ }
528
+ });
529
+ tx();
530
+ }
531
+ /** Delete all refs for a given file (used by incremental refresh) */
532
+ export function deleteRefsByFile(db, filePath) {
533
+ db.prepare('DELETE FROM refs WHERE filePath = ?').run(filePath);
534
+ }
535
+ /** Find all files referencing a symbol name */
536
+ export function findRefsBySymbol(db, symbol, limit = 200) {
537
+ return db.prepare('SELECT DISTINCT filePath, line FROM refs WHERE symbol = ? LIMIT ?').all(symbol, limit);
538
+ }
539
+ /** Count total refs in the index */
540
+ export function countRefs(db) {
541
+ const row = db.prepare('SELECT COUNT(*) as cnt FROM refs').get();
542
+ return row?.cnt ?? 0;
543
+ }
544
+ /** Delete all refs (used before full rebuild) */
545
+ export function clearRefs(db) {
546
+ db.prepare('DELETE FROM refs').run();
547
+ }
548
+ // ---------------------------------------------------------------------------
549
+ // File Words (conceptual search index)
550
+ // ---------------------------------------------------------------------------
551
+ /** Insert or replace file-level word index */
552
+ export function upsertFileWords(db, filePath, words) {
553
+ db.prepare('INSERT OR REPLACE INTO file_words (filePath, words) VALUES (?, ?)').run(filePath, words);
554
+ }
555
+ /** Bulk insert file words in a transaction */
556
+ export function insertFileWordsBatch(db, entries) {
557
+ if (entries.length === 0)
558
+ return;
559
+ const stmt = db.prepare('INSERT OR REPLACE INTO file_words (filePath, words) VALUES (?, ?)');
560
+ const tx = db.transaction(() => {
561
+ for (const entry of entries) {
562
+ stmt.run(entry.filePath, entry.words);
563
+ }
564
+ });
565
+ tx();
566
+ }
567
+ /** Delete file words for a given file */
568
+ export function deleteFileWordsByFile(db, filePath) {
569
+ db.prepare('DELETE FROM file_words WHERE filePath = ?').run(filePath);
570
+ }
571
+ /** Search file_words_fts for conceptual matches, returns file paths ranked by relevance */
572
+ export function searchFileWords(db, query, limit = 20) {
573
+ let safeQuery = query.replace(/"/g, '""').replace(/[*(){}[\]^~\\:]/g, ' ').trim();
574
+ if (!safeQuery)
575
+ return [];
576
+ const words = safeQuery.split(/\s+/).filter(w => w.length > 2);
577
+ if (words.length === 0)
578
+ return [];
579
+ safeQuery = words.join(' OR ');
580
+ try {
581
+ return db.prepare(`SELECT fw.filePath, rank as score
582
+ FROM file_words_fts fts
583
+ JOIN file_words fw ON fw.rowid = fts.rowid
584
+ WHERE file_words_fts MATCH ?
585
+ ORDER BY rank
586
+ LIMIT ?`).all(safeQuery, limit);
587
+ }
588
+ catch {
589
+ return [];
590
+ }
591
+ }
592
+ /** Clear all file words (used before full rebuild) */
593
+ export function clearFileWords(db) {
594
+ db.prepare('DELETE FROM file_words').run();
595
+ }
596
+ // ---------------------------------------------------------------------------
597
+ // NL Embeddings
598
+ // ---------------------------------------------------------------------------
599
+ /** Count NL embeddings in the index */
600
+ export function countNlEmbeddings(db) {
601
+ try {
602
+ const row = db.prepare('SELECT COUNT(*) as cnt FROM nl_embeddings').get();
603
+ return row?.cnt ?? 0;
604
+ }
605
+ catch {
606
+ return 0;
607
+ }
608
+ }
609
+ /** Search NL embeddings via brute-force cosine similarity */
610
+ export function searchNlVector(db, queryVec, limit = 10, maxDistance = 0.5) {
611
+ let rows;
612
+ try {
613
+ rows = db.prepare('SELECT nodeId, embedding, source, text FROM nl_embeddings').all();
614
+ }
615
+ catch {
616
+ return [];
617
+ }
618
+ const results = [];
619
+ for (const row of rows) {
620
+ const vec = new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.byteLength / 4);
621
+ let dot = 0, normA = 0, normB = 0;
622
+ for (let i = 0; i < queryVec.length && i < vec.length; i++) {
623
+ dot += queryVec[i] * vec[i];
624
+ normA += queryVec[i] * queryVec[i];
625
+ normB += vec[i] * vec[i];
626
+ }
627
+ const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
628
+ const distance = 1 - similarity;
629
+ if (distance < maxDistance) {
630
+ results.push({ nodeId: row.nodeId, distance, source: row.source, text: row.text });
631
+ }
632
+ }
633
+ results.sort((a, b) => a.distance - b.distance);
634
+ return results.slice(0, limit);
635
+ }
636
+ // ---------------------------------------------------------------------------
516
637
  // Raw SQL escape (for dynamic queries in local-backend.ts)
517
638
  // ---------------------------------------------------------------------------
518
639
  /** Escape a string for use in SQL single-quoted literals. */
@@ -68,6 +68,25 @@ export interface EmbeddingRow {
68
68
  readonly embedding: Buffer;
69
69
  readonly textHash: string | null;
70
70
  }
71
+ /** A NL embedding row as stored in the `nl_embeddings` table */
72
+ export interface NlEmbeddingRow {
73
+ readonly nodeId: NodeId;
74
+ readonly embedding: Buffer;
75
+ readonly textHash: string | null;
76
+ readonly source: string;
77
+ readonly text: string;
78
+ }
79
+ /** A reference occurrence as stored in the `refs` table */
80
+ export interface RefsRow {
81
+ readonly symbol: string;
82
+ readonly filePath: string;
83
+ readonly line: number;
84
+ }
85
+ /** A file-level word index row as stored in the `file_words` table */
86
+ export interface FileWordsRow {
87
+ readonly filePath: string;
88
+ readonly words: string;
89
+ }
71
90
  /** Fields required to insert a node */
72
91
  export interface NodeInsert {
73
92
  readonly id: NodeId;
@@ -107,4 +126,4 @@ export interface EdgeInsert {
107
126
  }
108
127
  /** Legacy edge table name constant (kept for compatibility) */
109
128
  export declare const REL_TABLE_NAME = "CodeRelation";
110
- export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT '',\n searchText TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n searchText,\n 
filePath,\n content,\n content='nodes',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\n";
129
+ export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT '',\n searchText TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- NL Embeddings: natural language description vectors (bge-small, 384-dim)\nCREATE TABLE IF NOT EXISTS nl_embeddings (\n nodeId TEXT NOT NULL,\n 
embedding BLOB NOT NULL,\n textHash TEXT,\n source TEXT NOT NULL DEFAULT 'comment',\n text TEXT NOT NULL DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nl_emb_nodeId ON nl_embeddings(nodeId);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n searchText,\n filePath,\n content,\n content='nodes',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\n\n-- Refs: identifier occurrence index (pre-computed grep)\nCREATE TABLE IF NOT EXISTS refs (\n symbol TEXT NOT NULL,\n filePath TEXT NOT NULL,\n line INTEGER NOT NULL\n);\n\nCREATE INDEX IF NOT EXISTS idx_refs_symbol ON refs(symbol);\nCREATE INDEX IF NOT EXISTS idx_refs_filePath ON refs(filePath);\nCREATE INDEX IF NOT EXISTS idx_refs_symbol_file ON refs(symbol, filePath);\n\n-- File-level word index for conceptual search\nCREATE TABLE IF NOT EXISTS file_words (\n filePath TEXT PRIMARY KEY,\n words TEXT NOT NULL DEFAULT ''\n);\n\nCREATE VIRTUAL TABLE IF NOT EXISTS file_words_fts USING fts5(\n 
words,\n content='file_words',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ai AFTER INSERT ON file_words BEGIN\n INSERT INTO file_words_fts(rowid, words) VALUES (new.rowid, new.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ad AFTER DELETE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_au AFTER UPDATE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES (new.rowid, new.words);\nEND;\n";
@@ -114,6 +114,17 @@ CREATE TABLE IF NOT EXISTS embeddings (
114
114
  textHash TEXT
115
115
  );
116
116
 
117
+ -- NL Embeddings: natural language description vectors (bge-small, 384-dim)
118
+ CREATE TABLE IF NOT EXISTS nl_embeddings (
119
+ nodeId TEXT NOT NULL,
120
+ embedding BLOB NOT NULL,
121
+ textHash TEXT,
122
+ source TEXT NOT NULL DEFAULT 'comment',
123
+ text TEXT NOT NULL DEFAULT ''
124
+ );
125
+
126
+ CREATE INDEX IF NOT EXISTS idx_nl_emb_nodeId ON nl_embeddings(nodeId);
127
+
117
128
  -- FTS5 virtual table (auto-updated via triggers)
118
129
  CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(
119
130
  name,
@@ -135,4 +146,38 @@ CREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN
135
146
  INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);
136
147
  INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);
137
148
  END;
149
+
150
+ -- Refs: identifier occurrence index (pre-computed grep)
151
+ CREATE TABLE IF NOT EXISTS refs (
152
+ symbol TEXT NOT NULL,
153
+ filePath TEXT NOT NULL,
154
+ line INTEGER NOT NULL
155
+ );
156
+
157
+ CREATE INDEX IF NOT EXISTS idx_refs_symbol ON refs(symbol);
158
+ CREATE INDEX IF NOT EXISTS idx_refs_filePath ON refs(filePath);
159
+ CREATE INDEX IF NOT EXISTS idx_refs_symbol_file ON refs(symbol, filePath);
160
+
161
+ -- File-level word index for conceptual search
162
+ CREATE TABLE IF NOT EXISTS file_words (
163
+ filePath TEXT PRIMARY KEY,
164
+ words TEXT NOT NULL DEFAULT ''
165
+ );
166
+
167
+ CREATE VIRTUAL TABLE IF NOT EXISTS file_words_fts USING fts5(
168
+ words,
169
+ content='file_words',
170
+ content_rowid='rowid'
171
+ );
172
+
173
+ CREATE TRIGGER IF NOT EXISTS file_words_fts_ai AFTER INSERT ON file_words BEGIN
174
+ INSERT INTO file_words_fts(rowid, words) VALUES (new.rowid, new.words);
175
+ END;
176
+ CREATE TRIGGER IF NOT EXISTS file_words_fts_ad AFTER DELETE ON file_words BEGIN
177
+ INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);
178
+ END;
179
+ CREATE TRIGGER IF NOT EXISTS file_words_fts_au AFTER UPDATE ON file_words BEGIN
180
+ INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);
181
+ INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES (new.rowid, new.words);
182
+ END;
138
183
  `;
@@ -11,11 +11,13 @@ import { type EmbeddingProgress, type EmbeddingConfig, type SemanticSearchResult
11
11
  import type Database from 'better-sqlite3';
12
12
  /** Progress callback type */
13
13
  export type EmbeddingProgressCallback = (progress: EmbeddingProgress) => void;
14
- /** Graph context for a node: callers, callees, and community module */
14
+ /** Graph context for a node: callers, callees, community module, imports, and siblings */
15
15
  export interface GraphContext {
16
16
  callers: string[];
17
17
  callees: string[];
18
18
  module: string;
19
+ importNames: string[];
20
+ siblingNames: string[];
19
21
  }
20
22
  /**
21
23
  * Fetch graph context (callers, callees, community module) for a set of nodes.
@@ -92,12 +92,51 @@ export function fetchGraphContext(db, nodes) {
92
92
  for (const r of moduleRows) {
93
93
  moduleMap.set(r.nid, r.module ?? '');
94
94
  }
95
+ // Batch fetch import names (what this file imports)
96
+ const chunkSize = 500;
97
+ const importMap = {};
98
+ for (let ci = 0; ci < nodeIds.length; ci += chunkSize) {
99
+ const chunk = nodeIds.slice(ci, ci + chunkSize);
100
+ const ph = chunk.map(() => '?').join(',');
101
+ const importRows = db.prepare(`SELECT DISTINCT n.filePath, tn.name
102
+ FROM nodes n
103
+ JOIN edges e ON e.sourceId = n.id AND e.type = 'IMPORTS'
104
+ JOIN nodes tn ON tn.id = e.targetId
105
+ WHERE n.id IN (${ph})`).all(...chunk);
106
+ for (const row of importRows) {
107
+ if (!importMap[row.filePath])
108
+ importMap[row.filePath] = [];
109
+ if (importMap[row.filePath].length < 10)
110
+ importMap[row.filePath].push(row.name);
111
+ }
112
+ }
113
+ // Batch fetch sibling symbol names (other symbols in same file)
114
+ const siblingMap = {};
115
+ for (let ci = 0; ci < nodeIds.length; ci += chunkSize) {
116
+ const chunk = nodeIds.slice(ci, ci + chunkSize);
117
+ const ph = chunk.map(() => '?').join(',');
118
+ const sibRows = db.prepare(`SELECT n1.id as sourceId, n2.name as sibName
119
+ FROM nodes n1
120
+ JOIN nodes n2 ON n2.filePath = n1.filePath AND n2.id != n1.id
121
+ WHERE n1.id IN (${ph})
122
+ AND n2.label NOT IN ('File', 'Folder', 'Community', 'Process')
123
+ LIMIT ${chunk.length * 5}`).all(...chunk);
124
+ for (const row of sibRows) {
125
+ if (!siblingMap[row.sourceId])
126
+ siblingMap[row.sourceId] = [];
127
+ if (siblingMap[row.sourceId].length < 5)
128
+ siblingMap[row.sourceId].push(row.sibName);
129
+ }
130
+ }
95
131
  // Assemble
96
132
  for (const node of nodes) {
133
+ const n = node;
97
134
  graphContext.set(node.id, {
98
135
  callers: (callerMap.get(node.id) || []).slice(0, 3),
99
136
  callees: (calleeMap.get(node.id) || []).slice(0, 3),
100
137
  module: moduleMap.get(node.id) || '',
138
+ importNames: n.filePath ? (importMap[n.filePath] || []) : [],
139
+ siblingNames: siblingMap[node.id] || [],
101
140
  });
102
141
  }
103
142
  }
@@ -115,7 +154,7 @@ export function fetchGraphContext(db, nodes) {
115
154
  * @returns Enriched text
116
155
  */
117
156
  export function enrichTextWithGraphContext(text, ctx) {
118
- if (!ctx.module && ctx.callers.length === 0 && ctx.callees.length === 0)
157
+ if (!ctx.module && ctx.callers.length === 0 && ctx.callees.length === 0 && ctx.importNames.length === 0 && ctx.siblingNames.length === 0)
119
158
  return text;
120
159
  const lines = text.split('\n');
121
160
  // Append Module to the File: line (matches Python batch format)
@@ -125,7 +164,17 @@ export function enrichTextWithGraphContext(text, ctx) {
125
164
  lines[fileIdx] += ` | Module: ${ctx.module}`;
126
165
  }
127
166
  }
128
- // Insert callers/callees after the File: line
167
+ // Insert imports after the File: line
168
+ if (ctx.importNames.length > 0) {
169
+ const importLine = `Imports: ${ctx.importNames.join(', ')}`;
170
+ // Insert after the File line
171
+ const fileIdx = lines.findIndex(l => l.startsWith('File:'));
172
+ if (fileIdx >= 0)
173
+ lines.splice(fileIdx + 1, 0, importLine);
174
+ else
175
+ lines.push(importLine);
176
+ }
177
+ // Insert callers/callees after the File: line (and imports if present)
129
178
  const insertParts = [];
130
179
  if (ctx.callers.length > 0)
131
180
  insertParts.push(`Called by: ${ctx.callers.join(', ')}`);
@@ -138,6 +187,10 @@ export function enrichTextWithGraphContext(text, ctx) {
138
187
  lines.splice(insertAt, 0, insertParts[i] ?? '');
139
188
  }
140
189
  }
190
+ // Append siblings at the end
191
+ if (ctx.siblingNames.length > 0) {
192
+ lines.push(`Siblings: ${ctx.siblingNames.join(', ')}`);
193
+ }
141
194
  return lines.join('\n');
142
195
  }
143
196
  /**
@@ -0,0 +1,44 @@
1
+ /**
2
+ * @file Natural language embedder using bge-small-en-v1.5.
3
+ *
4
+ * Runs entirely in Node.js via @huggingface/transformers — no Python, no GPU.
5
+ * Embeds human-readable descriptions extracted from code (JSDoc comments,
6
+ * enum values, type patterns, file headers) for conceptual search.
7
+ *
8
+ * 33M params, q8 quantized, 384-dim embeddings, ~6ms/text on CPU.
9
+ */
10
+ import type Database from 'better-sqlite3';
11
+ /** Initialize the NL embedding model (lazy, idempotent) */
12
+ export declare function initNlEmbedder(): Promise<void>;
13
+ /** Check if the NL embedder is ready */
14
+ export declare function isNlEmbedderReady(): boolean;
15
+ /** Embed a single text, returns Float32Array */
16
+ export declare function nlEmbed(text: string): Promise<number[]>;
17
+ /** Embed a batch of texts */
18
+ export declare function nlEmbedBatch(texts: string[]): Promise<number[][]>;
19
+ interface NodeForNl {
20
+ id: string;
21
+ name: string;
22
+ label: string;
23
+ filePath: string;
24
+ content: string;
25
+ startLine: number | null;
26
+ description: string;
27
+ }
28
+ interface NlDocument {
29
+ nodeId: string;
30
+ source: string;
31
+ text: string;
32
+ }
33
+ /** Build NL documents from a node */
34
+ export declare function extractNlTexts(node: NodeForNl): NlDocument[];
35
+ /**
36
+ * Build NL embeddings for all eligible nodes in the database.
37
+ * Reads nodes, extracts NL text, embeds with bge-small, writes to nl_embeddings.
38
+ */
39
+ export declare function buildNlEmbeddings(db: Database.Database, onProgress?: (current: number, total: number) => void): Promise<{
40
+ embedded: number;
41
+ skipped: number;
42
+ durationMs: number;
43
+ }>;
44
+ export {};