@zuvia-software-solutions/code-mapper 2.3.12 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,262 @@
1
+ // code-mapper/src/core/embeddings/nl-embedder.ts
2
+ /**
3
+ * @file Natural language embedder using bge-small-en-v1.5.
4
+ *
5
+ * Runs entirely in Node.js via @huggingface/transformers — no Python, no GPU.
6
+ * Embeds human-readable descriptions extracted from code (JSDoc comments,
7
+ * enum values, type patterns, file headers) for conceptual search.
8
+ *
9
+ * 33M params, q8 quantized, 384-dim embeddings, ~6ms/text on CPU.
10
+ */
11
// NL embedder — no schema imports needed
// bge-small-en-v1.5 via transformers.js: 384-dim sentence embeddings, CPU-friendly.
const MODEL_ID = 'Xenova/bge-small-en-v1.5';
// Lazy-loaded feature-extraction pipeline; null until initNlEmbedder resolves.
let extractor = null;
// In-flight initialization promise — dedupes concurrent initNlEmbedder calls.
let loadPromise = null;
16
/**
 * Initialize the NL embedding model (lazy, idempotent).
 *
 * Safe to call concurrently: the first caller starts the model load and all
 * later callers share the same in-flight promise. Fix over the previous
 * version: on load failure the in-flight promise is cleared so a subsequent
 * call can retry, instead of every future call receiving the same
 * permanently-rejected promise (e.g. after a transient network error during
 * the model download).
 */
export async function initNlEmbedder() {
    if (extractor)
        return;
    if (loadPromise)
        return loadPromise;
    loadPromise = (async () => {
        try {
            const { pipeline } = await import('@huggingface/transformers');
            // NOTE(review): `quantized: true` is the transformers.js v2 option;
            // v3 renamed it to `dtype: 'q8'` — confirm against the installed version.
            extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
        }
        catch (err) {
            // Allow retry on the next call rather than caching the failure.
            loadPromise = null;
            throw err;
        }
    })();
    return loadPromise;
}
28
/** True once the embedding pipeline has finished loading. */
export function isNlEmbedderReady() {
    return Boolean(extractor);
}
32
/**
 * Embed a single text for NL search.
 *
 * Lazily initializes the model on first use. Uses CLS pooling with
 * `normalize: true`, so the vectors are unit-length and dot product equals
 * cosine similarity.
 *
 * @returns a plain number[] — not a Float32Array; `Array.from` copies the
 *          typed array's data into a regular array.
 */
export async function nlEmbed(text) {
    if (!extractor)
        await initNlEmbedder();
    const result = await extractor(text, { pooling: 'cls', normalize: true });
    return Array.from(result.data);
}
39
/**
 * Embed several texts, one pipeline call per text (sequential).
 * Returns one number[] vector per input, in input order.
 */
export async function nlEmbedBatch(texts) {
    if (!extractor) {
        await initNlEmbedder();
    }
    const vectors = [];
    for (const item of texts) {
        const output = await extractor(item, { pooling: 'cls', normalize: true });
        vectors.push(Array.from(output.data));
    }
    return vectors;
}
50
/**
 * Collect the first run of comment text in a snippet (up to 10 lines).
 *
 * Understands three styles: JSDoc/block comments, `//` line comments, and
 * `#` comments (shebangs excluded). `@tag` lines inside block comments are
 * dropped, as is any text sharing a line with the closing `*​/`. Scanning
 * stops at the first non-comment line once something has been collected.
 *
 * @returns the collected lines joined with spaces, or '' for empty input.
 */
function extractFullComment(content) {
    if (!content)
        return '';
    const collected = [];
    let insideBlock = false;
    for (const rawLine of content.split('\n')) {
        const line = rawLine.trim();
        if (line.startsWith('/*')) {
            // Opening line of a block comment ('/**' or '/*'); a same-line
            // close (e.g. '/** one-liner */') ends the block immediately.
            insideBlock = !line.includes('*/');
            const text = line.replace(/^\/\*\*?/, '').replace(/\*\/$/, '').trim();
            if (text && !text.startsWith('@'))
                collected.push(text);
            continue;
        }
        if (insideBlock) {
            if (line.includes('*/')) {
                insideBlock = false;
                continue;
            }
            const text = line.replace(/^\*\s?/, '').trim();
            if (text && !text.startsWith('@'))
                collected.push(text);
            if (collected.length >= 10)
                break;
            continue;
        }
        let lineText = null;
        if (line.startsWith('//')) {
            lineText = line.slice(2).trim();
        }
        else if (line.startsWith('#') && !line.startsWith('#!')) {
            lineText = line.slice(1).trim();
        }
        if (lineText !== null) {
            if (lineText)
                collected.push(lineText);
            if (collected.length >= 10)
                break;
            continue;
        }
        // Non-comment line: stop if a comment has already been gathered.
        if (collected.length > 0)
            break;
    }
    return collected.join(' ');
}
101
/**
 * Turn an identifier into lowercase space-separated words.
 * Handles camelCase, PascalCase, ALLCAPS runs (e.g. HTTPServer), snake_case
 * and kebab-case.
 */
function expandIdentifier(name) {
    const spaced = name
        .replace(/([a-z])([A-Z])/g, '$1 $2')
        .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
    return spaced.replace(/[_\-]/g, ' ').toLowerCase();
}
109
/**
 * Derive NL text from enum-like constructs in a snippet.
 * Recognizes `['a', 'b'] as const` tuples (values joined verbatim) and
 * `enum X { A = 1, B }` declarations (member names expanded to words).
 *
 * @returns comma-joined values, or '' when neither pattern matches.
 */
function extractEnumValues(content) {
    // Pattern 1: string-literal tuple frozen with `as const`
    const tuple = content.match(/\[([^\]]+)\]\s*as\s*const/);
    if (tuple?.[1]) {
        const items = tuple[1]
            .replace(/['"]/g, '')
            .split(',')
            .map((item) => item.trim())
            .filter(Boolean);
        if (items.length > 0)
            return items.join(', ');
    }
    // Pattern 2: enum body — keep member names, drop '=' initializers
    const enumBody = content.match(/enum\s+\w+\s*\{([^}]+)\}/);
    if (enumBody?.[1]) {
        const members = enumBody[1]
            .split(',')
            .map((member) => member.trim().split('=')[0].trim())
            .filter(Boolean);
        if (members.length > 0)
            return members.map((member) => expandIdentifier(member)).join(', ');
    }
    return '';
}
127
/**
 * Pull parameter names from the first parenthesized group in a snippet and
 * expand them to words. Type annotations (after ':') and default values
 * (after '=') are stripped.
 *
 * @returns comma-joined expanded names, or '' when there are no parameters.
 */
function extractParamNames(content) {
    const parens = content.match(/\(([^)]*)\)/);
    const raw = parens?.[1];
    if (!raw)
        return '';
    const names = raw
        .split(',')
        .map((piece) => piece.trim().split(':')[0].split('=')[0].trim())
        .filter((piece) => piece && piece !== '');
    return names.map((piece) => expandIdentifier(piece)).join(', ');
}
138
/**
 * Build NL documents for a graph node: a comment-based doc when the node has
 * a leading comment, a name/params doc otherwise, and an enum-values doc for
 * Enum/Const/TypeAlias nodes.
 *
 * Robustness fix: `node.content` and `node.name` are defaulted to '' — the
 * content column is nullable in the query feeding this function, and the
 * previous version crashed inside extractParamNames (unguarded `.match`)
 * when content was null.
 *
 * @param node row with id, name, label, filePath, content
 * @returns array of { nodeId, source, text } documents
 */
export function extractNlTexts(node) {
    const docs = [];
    const name = node.name ?? '';
    const content = node.content ?? '';
    const expandedName = expandIdentifier(name);
    // Last two directory segments, filename excluded
    const dir = node.filePath.split('/').slice(-3, -1).join('/');
    // 1. Comment-based NL text (primary)
    const comment = extractFullComment(content);
    if (comment) {
        docs.push({
            nodeId: node.id,
            source: 'comment',
            text: `${expandedName}: ${comment}. File: ${dir}`,
        });
    }
    // 2. Name + params + location (fallback when no comment, to avoid duplication)
    const params = extractParamNames(content);
    const parts = [expandedName];
    if (params)
        parts.push(`Parameters: ${params}`);
    if (dir)
        parts.push(`in ${dir}`);
    if (!comment) {
        docs.push({
            nodeId: node.id,
            source: 'name',
            text: parts.join('. '),
        });
    }
    // 3. Enum/const values
    if (node.label === 'Enum' || node.label === 'Const' || node.label === 'TypeAlias') {
        const values = extractEnumValues(content);
        if (values) {
            docs.push({
                nodeId: node.id,
                source: 'enum',
                text: `${expandedName}: ${values}`,
            });
        }
    }
    return docs;
}
181
// ---------------------------------------------------------------------------
// Full NL embedding pipeline
// ---------------------------------------------------------------------------
import { createHash } from 'crypto';
/**
 * MD5 hex digest of a text. Used only as a cheap change-detection fingerprint
 * (skip re-embedding unchanged docs) — not security-sensitive.
 */
function md5(text) {
    const hasher = createHash('md5');
    hasher.update(text);
    return hasher.digest('hex');
}
189
/**
 * Build NL embeddings for all eligible nodes in the database.
 * Reads nodes, extracts NL text, embeds with bge-small, writes to nl_embeddings.
 *
 * Incremental: docs whose (nodeId, textHash) pair already exists are skipped
 * and their stored rows are KEPT.
 *
 * BUG FIX: the previous version computed the skip set, then ran a wholesale
 * `DELETE FROM nl_embeddings` — wiping every row, including the ones it had
 * just decided to skip — and re-inserted only the changed docs, so "skipped"
 * docs silently lost their embeddings. Deletes are now targeted per
 * (nodeId, source) for the docs actually being re-embedded.
 *
 * NOTE(review): rows for nodes that no longer exist in `nodes` are not
 * garbage-collected here — confirm whether a separate cleanup pass handles
 * them.
 *
 * @param db         SQLite handle with a prepare/exec API (better-sqlite3 style)
 * @param onProgress optional (done, total) callback, invoked once per batch
 * @returns { embedded, skipped, durationMs }
 */
export async function buildNlEmbeddings(db, onProgress) {
    const t0 = Date.now();
    await initNlEmbedder();
    // Query all nodes (not just EMBEDDABLE_LABELS — we want enums, consts, types too)
    const labels = ['Function', 'Class', 'Method', 'Interface', 'Const', 'Enum', 'TypeAlias', 'Namespace', 'Module', 'Struct'];
    const placeholders = labels.map(() => '?').join(',');
    const rows = db.prepare(`SELECT id, name, label, filePath, content, startLine, description FROM nodes WHERE label IN (${placeholders})`).all(...labels);
    // Skip test files
    const testPatterns = ['/test/', '/tests/', '/spec/', '/fixtures/', '/__tests__/', '/__mocks__/', '.test.', '.spec.', '_test.', '_spec.'];
    const filteredRows = rows.filter(r => !testPatterns.some(p => r.filePath.includes(p)));
    // Extract NL documents
    const allDocs = [];
    for (const row of filteredRows) {
        allDocs.push(...extractNlTexts(row));
    }
    if (allDocs.length === 0) {
        return { embedded: 0, skipped: 0, durationMs: Date.now() - t0 };
    }
    // Existing (nodeId, textHash) pairs — skip docs whose text is unchanged
    const existingHashes = new Set();
    try {
        const hashRows = db.prepare('SELECT nodeId, textHash FROM nl_embeddings WHERE textHash IS NOT NULL').all();
        for (const r of hashRows)
            existingHashes.add(r.nodeId + ':' + r.textHash);
    }
    catch { /* table might not exist yet */ }
    // Filter to docs that need embedding
    const toEmbed = [];
    let skipped = 0;
    for (const doc of allDocs) {
        const hash = md5(doc.text);
        if (existingHashes.has(doc.nodeId + ':' + hash)) {
            skipped++;
            continue;
        }
        toEmbed.push({ ...doc, hash });
    }
    if (toEmbed.length === 0) {
        return { embedded: 0, skipped, durationMs: Date.now() - t0 };
    }
    // Embed in batches and write to DB inside a single transaction.
    // Only the rows being replaced are deleted; skipped docs keep theirs.
    const BATCH = 100;
    const deleteStmt = db.prepare('DELETE FROM nl_embeddings WHERE nodeId = ? AND source = ?');
    const insertStmt = db.prepare('INSERT INTO nl_embeddings (nodeId, embedding, textHash, source, text) VALUES (?, ?, ?, ?, ?)');
    let embedded = 0;
    db.exec('BEGIN');
    try {
        for (let i = 0; i < toEmbed.length; i += BATCH) {
            const batch = toEmbed.slice(i, i + BATCH);
            const vecs = await nlEmbedBatch(batch.map(d => d.text));
            for (let j = 0; j < batch.length; j++) {
                const doc = batch[j];
                // Replace exactly this doc's row; other sources for the same
                // node (comment vs enum) are left alone.
                deleteStmt.run(doc.nodeId, doc.source);
                const blob = Buffer.from(new Float32Array(vecs[j]).buffer);
                insertStmt.run(doc.nodeId, blob, doc.hash, doc.source, doc.text);
                embedded++;
            }
            onProgress?.(Math.min(i + BATCH, toEmbed.length), toEmbed.length);
        }
        db.exec('COMMIT');
    }
    catch (err) {
        db.exec('ROLLBACK');
        throw err;
    }
    return { embedded, skipped, durationMs: Date.now() - t0 };
}
@@ -16,6 +16,13 @@ const getFileName = (filePath) => {
16
16
  const parts = filePath.split('/');
17
17
  return parts[parts.length - 1] || filePath;
18
18
  };
19
/** Directory context for a path: the last two directory segments, filename excluded. */
const getDirectoryContext = (filePath) => {
    const segments = filePath.split('/');
    segments.pop(); // drop the filename itself
    return segments.slice(-2).join('/');
};
19
26
  /**
20
27
  * Extract the first JSDoc/comment block as a natural language description.
21
28
  * This bridges natural language queries to code — "blast radius analysis"
@@ -154,8 +161,9 @@ export const generateEmbeddingText = (node, _config = {}) => {
154
161
  const comment = extractFirstComment(node.content);
155
162
  if (comment)
156
163
  parts.push(comment);
157
- // 3. File location
158
- parts.push(`File: ${getFileName(node.filePath)}`);
164
+ // 3. File location with directory context
165
+ const dir = getDirectoryContext(node.filePath);
166
+ parts.push(`File: ${getFileName(node.filePath)}${dir ? ` in ${dir}` : ''}`);
159
167
  // 4. Code signature (not full body)
160
168
  const sig = extractSignature(node.content, label);
161
169
  if (sig)
@@ -1,5 +1,5 @@
1
1
  /** @file types.ts @description Type definitions for embedding generation and semantic search */
2
- export declare const EMBEDDABLE_LABELS: readonly ["Function", "Class", "Method", "Interface"];
2
+ export declare const EMBEDDABLE_LABELS: readonly ["Function", "Class", "Method", "Interface", "Const", "Enum", "TypeAlias", "Namespace", "Module", "Struct"];
3
3
  export type EmbeddableLabel = typeof EMBEDDABLE_LABELS[number];
4
4
  /** Check if a label is embeddable */
5
5
  export declare const isEmbeddableLabel: (label: string) => label is EmbeddableLabel;
@@ -4,10 +4,8 @@
4
4
  // File nodes removed — their embeddings were low quality (import headers, license text)
5
5
  // and polluted semantic search. BM25 FTS already searches file content effectively.
6
6
  export const EMBEDDABLE_LABELS = [
7
- 'Function',
8
- 'Class',
9
- 'Method',
10
- 'Interface',
7
+ 'Function', 'Class', 'Method', 'Interface',
8
+ 'Const', 'Enum', 'TypeAlias', 'Namespace', 'Module', 'Struct',
11
9
  ];
12
10
  /** Check if a label is embeddable */
13
11
  export const isEmbeddableLabel = (label) => EMBEDDABLE_LABELS.includes(label);
@@ -21,7 +21,7 @@ import { getLanguageFromFilename, getDefinitionNodeFromCaptures } from '../inges
21
21
  import { loadParser, loadLanguage, isLanguageAvailable } from '../tree-sitter/parser-loader.js';
22
22
  import { getTreeSitterBufferSize, TREE_SITTER_MAX_BUFFER } from '../ingestion/constants.js';
23
23
  import { generateId } from '../../lib/utils.js';
24
- import { deleteNodesByFile, insertNode, insertEdge, findNodeAtLine, findNodesByFile, deleteEmbeddingsByFile, insertEmbeddingsBatch, countEmbeddings } from '../db/adapter.js';
24
+ import { deleteNodesByFile, insertNode, insertEdge, findNodeAtLine, findNodesByFile, deleteEmbeddingsByFile, insertEmbeddingsBatch, countEmbeddings, deleteRefsByFile, insertRefsBatch, deleteFileWordsByFile, upsertFileWords } from '../db/adapter.js';
25
25
  import { assertNodeLabel, toNodeId, toEdgeId } from '../db/schema.js';
26
26
  import {} from './types.js';
27
27
  import { getTsgoService } from '../semantic/tsgo-service.js';
@@ -79,10 +79,13 @@ export async function refreshFiles(db, repoPath, dirtyFiles) {
79
79
  let nodesInserted = 0;
80
80
  let edgesInserted = 0;
81
81
  let filesSkipped = 0;
82
- // Phase 1: Delete old nodes for all dirty files
82
+ // Phase 1: Delete old nodes + refs + file_words for all dirty files
83
83
  for (const entry of dirtyFiles) {
84
- const deleted = deleteNodesByFile(db, entry.relativePath);
84
+ const relPath = entry.relativePath;
85
+ const deleted = deleteNodesByFile(db, relPath);
85
86
  nodesDeleted += deleted;
87
+ deleteRefsByFile(db, relPath);
88
+ deleteFileWordsByFile(db, relPath);
86
89
  }
87
90
  // Phase 2: Parse modified/created files with tree-sitter
88
91
  const parser = await loadParser();
@@ -90,6 +93,7 @@ export async function refreshFiles(db, repoPath, dirtyFiles) {
90
93
  const allDefinitions = [];
91
94
  const callSites = [];
92
95
  const insertedFilePaths = new Set();
96
+ const fileContents = new Map(); // for file_words rebuild
93
97
  for (const entry of filesToProcess) {
94
98
  const relPath = entry.relativePath;
95
99
  const absPath = path.resolve(repoPath, relPath);
@@ -110,6 +114,7 @@ export async function refreshFiles(db, repoPath, dirtyFiles) {
110
114
  filesSkipped++;
111
115
  continue;
112
116
  }
117
+ fileContents.set(relPath, content);
113
118
  try {
114
119
  await loadLanguage(language, relPath);
115
120
  }
@@ -247,6 +252,37 @@ export async function refreshFiles(db, repoPath, dirtyFiles) {
247
252
  });
248
253
  edgesInserted++;
249
254
  }
255
+ // Phase 3b+3c: Rebuild refs + file_words for dirty files
256
+ const STOP_WORDS = new Set(['the', 'and', 'for', 'from', 'with', 'this', 'that', 'have', 'has', 'not', 'are', 'was', 'were', 'been', 'being', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'does', 'did', 'let', 'var', 'const', 'new', 'return', 'function', 'class', 'import', 'export', 'default', 'void', 'null', 'undefined', 'true', 'false', 'else', 'case', 'break', 'continue', 'while', 'throw', 'catch', 'try', 'finally', 'async', 'await', 'yield', 'typeof', 'instanceof', 'delete', 'switch', 'interface', 'type', 'enum', 'extends', 'implements', 'static', 'private', 'public', 'protected', 'abstract', 'readonly', 'override', 'declare', 'module', 'namespace', 'require', 'string', 'number', 'boolean', 'object', 'any', 'never', 'unknown', 'symbol']);
257
+ // Phase 3b: Rebuild refs for dirty files (identifier occurrence index)
258
+ for (const [relPath, content] of fileContents) {
259
+ const refs = [];
260
+ const lines = content.split('\n');
261
+ const identRegex = /\b[a-zA-Z_]\w{2,}\b/g;
262
+ for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
263
+ let match;
264
+ while ((match = identRegex.exec(lines[lineIdx])) !== null) {
265
+ if (!STOP_WORDS.has(match[0].toLowerCase())) {
266
+ refs.push({ symbol: match[0], filePath: relPath, line: lineIdx });
267
+ }
268
+ }
269
+ }
270
+ if (refs.length > 0)
271
+ insertRefsBatch(db, refs);
272
+ }
273
+ // Phase 3c: Rebuild file_words for dirty files (conceptual search index)
274
+ for (const [relPath, content] of fileContents) {
275
+ const wordSet = new Set();
276
+ const wordRegex = /\b[a-zA-Z]\w{2,}\b/g;
277
+ let match;
278
+ while ((match = wordRegex.exec(content)) !== null) {
279
+ const w = match[0].toLowerCase();
280
+ if (!STOP_WORDS.has(w))
281
+ wordSet.add(w);
282
+ }
283
+ if (wordSet.size > 0)
284
+ upsertFileWords(db, relPath, [...wordSet].join(' '));
285
+ }
250
286
  // Phase 4 + 5: Resolve call edges and cross-file edges using tsgo LSP
251
287
  // (TS/JS files only — tsgo is optional, skip if unavailable)
252
288
  console.error(`Code Mapper: refresh tsgo init with repoPath=${repoPath}`);
@@ -40,10 +40,22 @@ export declare class LocalBackend {
40
40
  private refreshLocks;
41
41
  /** Per-repo tsgo LSP service instances for live semantic enrichment */
42
42
  private tsgoServices;
43
+ /** Per-repo in-memory embedding cache: nodeId → Float32Array (256-dim) */
44
+ private embeddingCaches;
45
+ /** Per-repo in-memory NL embedding cache: includes source text for match_reason */
46
+ private nlEmbeddingCaches;
43
47
  /** Get (or lazily start) a tsgo LSP service for a repo. Returns null if unavailable. */
44
48
  private getTsgo;
45
49
  /** Get (or lazily open) the SQLite database for a repo. */
46
50
  private getDb;
51
+ /** Load all embeddings into memory for fast vector search */
52
+ private loadEmbeddingCache;
53
+ /** Search embeddings in memory — O(N) dot products, no disk I/O */
54
+ private searchEmbeddingsInMemory;
55
+ /** Load NL embeddings into memory for fast conceptual search */
56
+ private loadNlEmbeddingCache;
57
+ /** Search NL embeddings in memory, returns match_reason text */
58
+ private searchNlEmbeddingsInMemory;
47
59
  /** Hard ceiling — beyond this, incremental is unreliable, warn prominently */
48
60
  private static readonly MAX_INCREMENTAL_FILES;
49
61
  /** Start file system watcher for a repo to detect source changes */
@@ -125,6 +137,31 @@ export declare class LocalBackend {
125
137
  * Semantic vector search helper
126
138
  */
127
139
  private semanticSearch;
140
+ /**
141
+ * NL semantic search: embed query with bge-small, search NL descriptions.
142
+ * Returns match_reason (the NL text that matched) for agent transparency.
143
+ */
144
+ private nlSemanticSearch;
145
+ /**
146
+ * Refs-based search: find symbols referenced in files that contain the query identifiers.
147
+ * Bridges the gap between graph edges (incomplete) and grep (complete for exact names).
148
+ */
149
+ private refsSearch;
150
+ /**
151
+ * File-words FTS search: find files whose content contains conceptual terms,
152
+ * then return the best symbol from each file. Bridges NL → code gap.
153
+ */
154
+ private fileWordsSearch;
155
+ /**
156
+ * Query expansion via embedding nearest neighbors: embed the query,
157
+ * find 5 closest symbols, extract their names as BM25 expansion terms.
158
+ */
159
+ private expandQueryViaNearestNeighbors;
160
+ /**
161
+ * Ripgrep fallback: when all search signals return sparse results,
162
+ * grep the repo for query terms to find any relevant files.
163
+ */
164
+ private ripgrepFallback;
128
165
  executeSql(repoName: string, query: string): Promise<any>;
129
166
  private sqlQuery;
130
167
  /** Format raw SQL result rows as a markdown table, with raw fallback */