@zuvia-software-solutions/code-mapper 2.4.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,431 @@
1
+ // code-mapper/src/core/embeddings/nl-embedder.ts
2
+ /**
3
+ * @file Natural language embedder using bge-small-en-v1.5.
4
+ *
5
+ * Runs entirely in Node.js via @huggingface/transformers — no Python, no GPU.
6
+ * Embeds human-readable descriptions extracted from code (JSDoc comments,
7
+ * enum values, type patterns, file headers) for conceptual search.
8
+ *
9
+ * 33M params, q8 quantized, 384-dim embeddings, ~6ms/text on CPU.
10
+ */
11
// NL embedder — no schema imports needed
// Hugging Face hub id of the sentence-embedding model (bge-small, 384-dim output).
const MODEL_ID = 'Xenova/bge-small-en-v1.5';
// Lazy-loaded pipeline
// Module-level singleton state shared by every exported function below:
//   extractor   — the loaded feature-extraction pipeline, or null until ready
//   loadPromise — the in-flight load, so concurrent initNlEmbedder() calls share one load
let extractor = null;
let loadPromise = null;
16
/**
 * Initialize the NL embedding model (lazy, idempotent).
 * Concurrent callers share a single in-flight load via `loadPromise`.
 * On load failure the cached promise is cleared so a later call can retry
 * (the original cached the rejected promise forever, making the embedder
 * permanently broken after one transient failure).
 * @returns {Promise<void>} resolves once `extractor` is ready
 */
export async function initNlEmbedder() {
    if (extractor)
        return;
    if (loadPromise)
        return loadPromise;
    loadPromise = (async () => {
        const { pipeline, env } = await import('@huggingface/transformers');
        // Use all available CPU threads for ONNX inference
        if (env.backends?.onnx?.wasm) {
            env.backends.onnx.wasm.numThreads = Math.max(1, (await import('os')).cpus().length);
        }
        extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
    })();
    try {
        await loadPromise;
    }
    catch (err) {
        // Do not cache a rejected promise — allow the next call to retry the load.
        loadPromise = null;
        throw err;
    }
}
32
/**
 * Report whether the NL embedding pipeline has finished loading.
 * @returns {boolean} true once `initNlEmbedder` has completed
 */
export function isNlEmbedderReady() {
    return Boolean(extractor);
}
36
/**
 * Embed a single text with the NL model.
 * @param {string} text - natural-language text to embed
 * @returns {Promise<number[]>} 384-dim normalized embedding as a plain array
 *   (NOT a Float32Array — `Array.from` converts it; the original doc was wrong)
 */
export async function nlEmbed(text) {
    // initNlEmbedder is idempotent and awaits any in-flight load.
    if (!extractor)
        await initNlEmbedder();
    // CLS pooling + normalization is the recommended query mode for bge models.
    const result = await extractor(text, { pooling: 'cls', normalize: true });
    return Array.from(result.data);
}
43
/**
 * Embed many texts, chunked into sub-batches to bound peak memory.
 * @param {string[]} texts - texts to embed
 * @returns {Promise<number[][]>} one embedding per input, in input order
 */
export async function nlEmbedBatch(texts) {
    if (!extractor)
        await initNlEmbedder();
    const SUB_BATCH = 32; // sub-batch size — balances throughput vs memory
    const out = [];
    for (let start = 0; start < texts.length; start += SUB_BATCH) {
        const chunk = texts.slice(start, start + SUB_BATCH);
        // Each text is embedded as its own pipeline call; Promise.all overlaps them.
        const vectors = await Promise.all(chunk.map((t) => extractor(t, { pooling: 'cls', normalize: true })));
        for (const v of vectors)
            out.push(Array.from(v.data));
    }
    return out;
}
59
/**
 * Extract leading human-readable comment text from source content.
 * Handles JSDoc / block comments, `//` line comments, and `#` comments
 * (shebang lines are ignored). Lines that are JSDoc `@tags` are skipped.
 * Collection stops after 10 lines or at the first non-comment line that
 * follows collected text.
 *
 * Fixes vs. the original: text preceding a closing `*​/` on the final line
 * of a block comment is now kept (e.g. "last words *​/" was dropped), and
 * the 10-line cap is enforced uniformly on every branch.
 *
 * @param {string} content - raw source text of a node
 * @returns {string} comment lines joined with single spaces, '' if none
 */
function extractFullComment(content) {
    if (!content)
        return '';
    const MAX_LINES = 10;
    const commentLines = [];
    // Push `inner` unless empty or a JSDoc @tag; false once the cap is reached.
    const collect = (inner) => {
        if (inner && !inner.startsWith('@'))
            commentLines.push(inner);
        return commentLines.length < MAX_LINES;
    };
    let inBlock = false;
    for (const line of content.split('\n')) {
        const t = line.trim();
        if (t.startsWith('/**') || t.startsWith('/*')) {
            // Block opener; single-line comments (`/* x */`) close immediately.
            inBlock = !t.includes('*/');
            const inner = t.replace(/^\/\*\*?/, '').replace(/\*\/$/, '').trim();
            if (!collect(inner))
                break;
            continue;
        }
        if (inBlock) {
            const end = t.indexOf('*/');
            if (end !== -1) {
                inBlock = false;
                // Keep any text before the closing marker (previously dropped).
                const inner = t.slice(0, end).replace(/^\*\s?/, '').trim();
                if (!collect(inner))
                    break;
                continue;
            }
            const inner = t.replace(/^\*\s?/, '').trim();
            if (!collect(inner))
                break;
            continue;
        }
        if (t.startsWith('//')) {
            if (!collect(t.slice(2).trim()))
                break;
            continue;
        }
        if (t.startsWith('#') && !t.startsWith('#!')) {
            if (!collect(t.slice(1).trim()))
                break;
            continue;
        }
        // First non-comment line after we've collected something ends the scan.
        if (commentLines.length > 0)
            break;
    }
    return commentLines.join(' ');
}
110
/**
 * Expand camelCase / PascalCase / snake_case / kebab-case identifiers into
 * lowercase space-separated words, e.g. "getUserName" -> "get user name",
 * "HTTPServer" -> "http server".
 * @param {string} name - identifier to expand
 * @returns {string} lowercase words separated by spaces
 */
function expandIdentifier(name) {
    // Split camel-case boundaries first, including acronym runs (HTTPServer).
    const camelSplit = name
        .replace(/([a-z])([A-Z])/g, '$1 $2')
        .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
    // Then treat underscores and hyphens as word separators.
    return camelSplit.split(/[_\-]/).join(' ').toLowerCase();
}
118
/**
 * Pull member values out of a TS string-literal array (`[...] as const`) or
 * an `enum` declaration, as comma-separated natural-language text.
 * @param {string} content - node source text
 * @returns {string} joined values, or '' when neither pattern is present
 */
function extractEnumValues(content) {
    // Pattern 1: ['value1', 'value2', ...] as const
    const constArray = content.match(/\[([^\]]+)\]\s*as\s*const/);
    if (constArray?.[1]) {
        const items = constArray[1]
            .replace(/['"]/g, '')
            .split(',')
            .map((v) => v.trim())
            .filter(Boolean);
        if (items.length > 0)
            return items.join(', ');
    }
    // Pattern 2: enum Name { Member1, Member2 = 2 } — initializers stripped,
    // member names expanded to words.
    const enumBody = content.match(/enum\s+\w+\s*\{([^}]+)\}/);
    if (enumBody?.[1]) {
        const members = enumBody[1]
            .split(',')
            .map((m) => m.trim().split('=')[0].trim())
            .filter(Boolean);
        if (members.length > 0)
            return members.map((m) => expandIdentifier(m)).join(', ');
    }
    return '';
}
136
/**
 * Extract parameter names from the first parenthesized group in `content`
 * and expand each to natural-language words. `: Type` annotations and
 * `= default` values are stripped; destructured params are not special-cased.
 * @param {string} content - node source text (function signature expected)
 * @returns {string} comma-separated expanded names, '' when none found
 */
function extractParamNames(content) {
    const sig = content.match(/\(([^)]*)\)/);
    if (!sig?.[1])
        return '';
    const names = [];
    for (const raw of sig[1].split(',')) {
        // Strip the type annotation first, then any default-value expression.
        const bare = raw.trim().split(':')[0].split('=')[0].trim();
        if (bare)
            names.push(expandIdentifier(bare));
    }
    return names.join(', ');
}
147
/**
 * Build natural-language documents for a graph node. Up to two docs result:
 *   - 'comment': leading JSDoc/comment text (preferred), OR
 *   - 'name':    expanded identifier + params (only when there is no comment,
 *                to avoid near-duplicate texts), plus
 *   - 'enum':    member values, for Enum / Const / TypeAlias nodes.
 * @param {{id: *, name: string, label: string, filePath: string, content: string}} node
 * @returns {Array<{nodeId: *, source: string, text: string}>}
 */
export function extractNlTexts(node) {
    const expandedName = expandIdentifier(node.name);
    // Last two path segments give lightweight location context for the text.
    const dir = node.filePath.split('/').slice(-3, -1).join('/');
    const docs = [];
    // 1. Comment-based NL text (primary)
    const comment = extractFullComment(node.content);
    if (comment) {
        docs.push({
            nodeId: node.id,
            source: 'comment',
            text: `${expandedName}: ${comment}. File: ${dir}`,
        });
    }
    else {
        // 2. Name-based fallback — only when no comment exists.
        const pieces = [expandedName];
        const params = extractParamNames(node.content);
        if (params)
            pieces.push(`Parameters: ${params}`);
        if (dir)
            pieces.push(`in ${dir}`);
        docs.push({
            nodeId: node.id,
            source: 'name',
            text: pieces.join('. '),
        });
    }
    // 3. Enum/const values
    if (['Enum', 'Const', 'TypeAlias'].includes(node.label)) {
        const values = extractEnumValues(node.content);
        if (values) {
            docs.push({
                nodeId: node.id,
                source: 'enum',
                text: `${expandedName}: ${values}`,
            });
        }
    }
    return docs;
}
190
// ---------------------------------------------------------------------------
// Full NL embedding pipeline
// ---------------------------------------------------------------------------
import { createHash } from 'crypto';
/**
 * MD5 hex digest of `text` — used purely for change/skip detection of
 * embedding inputs, not for anything security-sensitive.
 * @param {string} text
 * @returns {string} 32-char lowercase hex digest
 */
function md5(text) {
    const hasher = createHash('md5');
    return hasher.update(text).digest('hex');
}
198
/**
 * Build NL embeddings for all eligible nodes in the database.
 * Reads nodes, extracts NL text, embeds with bge-small, writes to nl_embeddings.
 *
 * @param {*} db - synchronous SQLite handle exposing prepare()/exec()
 *   (presumably better-sqlite3 — TODO confirm against the db adapter)
 * @param {(done: number, total: number) => void} [onProgress] - optional progress callback
 * @returns {Promise<{embedded: number, skipped: number, durationMs: number}>}
 */
export async function buildNlEmbeddings(db, onProgress) {
    const t0 = Date.now();
    await initNlEmbedder();
    // Query all nodes (not just EMBEDDABLE_LABELS — we want enums, consts, types too)
    const labels = ['Function', 'Class', 'Method', 'Interface', 'Const', 'Enum', 'TypeAlias', 'Namespace', 'Module', 'Struct'];
    const placeholders = labels.map(() => '?').join(',');
    const rows = db.prepare(`SELECT id, name, label, filePath, content, startLine, description FROM nodes WHERE label IN (${placeholders})`).all(...labels);
    // NL embeddings include ALL files (including tests) — test names describe
    // functionality in natural language which helps conceptual search.
    // The bge-small model is fast enough (~6ms/doc) that the cost is trivial.
    const filteredRows = rows;
    // Extract NL documents — each node may yield multiple docs (comment/name/enum).
    const allDocs = [];
    for (const row of filteredRows) {
        const docs = extractNlTexts(row);
        for (const doc of docs)
            allDocs.push(doc);
    }
    if (allDocs.length === 0) {
        return { embedded: 0, skipped: 0, durationMs: Date.now() - t0 };
    }
    // Deduplicate: one embedding per nodeId — prefer 'comment' source over 'name' or 'enum'
    // (lower priority number wins; unknown sources rank last via the ?? 9 fallback).
    const SOURCE_PRIORITY = { comment: 0, enum: 1, name: 2 };
    const bestByNode = new Map();
    for (const doc of allDocs) {
        const existing = bestByNode.get(doc.nodeId);
        if (!existing || (SOURCE_PRIORITY[doc.source] ?? 9) < (SOURCE_PRIORITY[existing.source] ?? 9)) {
            bestByNode.set(doc.nodeId, doc);
        }
    }
    const uniqueDocs = [...bestByNode.values()];
    // Check existing hashes for skip detection — keyed "nodeId:textHash" so a
    // node is skipped only when its current text hashes to a stored hash.
    const existingHashes = new Map();
    try {
        const hashRows = db.prepare('SELECT nodeId, textHash FROM embeddings WHERE textHash IS NOT NULL').all();
        for (const r of hashRows)
            existingHashes.set(r.nodeId + ':' + r.textHash, '1');
    }
    catch { /* table might not exist yet */ }
    // Filter to docs that need embedding
    const toEmbed = [];
    let skipped = 0;
    for (const doc of uniqueDocs) {
        const hash = md5(doc.text);
        if (existingHashes.has(doc.nodeId + ':' + hash)) {
            skipped++;
            continue;
        }
        toEmbed.push({ ...doc, hash });
    }
    if (toEmbed.length === 0) {
        // Nothing changed — existing embeddings are left untouched.
        return { embedded: 0, skipped, durationMs: Date.now() - t0 };
    }
    // Clear existing embeddings and rebuild
    // NOTE(review): this DELETE wipes rows for the `skipped` (hash-unchanged)
    // docs as well, and only `toEmbed` docs are re-inserted below — skipped
    // nodes appear to lose their stored embeddings on a partial rebuild.
    // Confirm whether a full wipe is intended here.
    db.prepare('DELETE FROM embeddings').run();
    try {
        db.prepare('DELETE FROM nl_embeddings').run();
    }
    catch { /* table may not exist */ }
    // Parallel multi-process embedding — same architecture as tsgo
    // Each worker loads its own bge-small model, embeds independently.
    const os = await import('os');
    const { fork } = await import('child_process');
    const { fileURLToPath } = await import('url');
    const pathMod = await import('path');
    const cpuCount = os.cpus().length;
    // Worker count: 75% of cores, at least 1 batch (~50 docs) of work per
    // worker, hard cap of 8 to bound total model memory.
    const maxByCore = Math.max(1, Math.floor(cpuCount * 0.75));
    const maxByWorkload = Math.max(1, Math.floor(toEmbed.length / 50));
    const workerCount = Math.min(maxByCore, maxByWorkload, 8); // cap at 8 for memory
    // Find worker script path — resolved relative to this module's directory.
    const thisDir = pathMod.dirname(fileURLToPath(import.meta.url));
    const workerScript = pathMod.join(thisDir, 'nl-embed-worker.js');
    // Split work across workers
    const ITEMS_PER_BATCH = 50;
    let nextIdx = 0;
    let embedded = 0;
    // Shared batch cursor — called from each worker's dispatch loop; safe
    // because Node callbacks run on a single thread.
    const getNextBatch = () => {
        if (nextIdx >= toEmbed.length)
            return null;
        const batch = toEmbed.slice(nextIdx, nextIdx + ITEMS_PER_BATCH);
        nextIdx += ITEMS_PER_BATCH;
        return batch;
    };
    // Prepare DB statements
    const insertStmt = db.prepare('INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)');
    let nlInsertStmt = null;
    try {
        nlInsertStmt = db.prepare('INSERT INTO nl_embeddings (nodeId, embedding, textHash, source, text) VALUES (?, ?, ?, ?, ?)');
    }
    catch { /* nl_embeddings table may not exist */ }
    // Track doc metadata for nl_embeddings text lookup — one entry per nodeId
    // is sufficient because uniqueDocs was deduplicated per node above.
    const docMap = new Map();
    for (const doc of toEmbed)
        docMap.set(doc.nodeId, { source: doc.source, text: doc.text, hash: doc.hash });
    if (workerCount <= 1) {
        // Single process — use in-process embedding (small workloads)
        await initNlEmbedder();
        // All inserts happen inside one transaction; rolled back on any error.
        db.exec('BEGIN');
        try {
            for (let i = 0; i < toEmbed.length; i += ITEMS_PER_BATCH) {
                const batch = toEmbed.slice(i, i + ITEMS_PER_BATCH);
                const vecs = await nlEmbedBatch(batch.map(d => d.text));
                for (let j = 0; j < batch.length; j++) {
                    const doc = batch[j];
                    const vec = vecs[j];
                    // Embedding stored as raw little-endian float32 bytes.
                    const blob = Buffer.from(new Float32Array(vec).buffer);
                    insertStmt.run(doc.nodeId, blob, doc.hash);
                    if (nlInsertStmt) {
                        try {
                            nlInsertStmt.run(doc.nodeId, blob, doc.hash, doc.source, doc.text);
                        }
                        catch { }
                    }
                    embedded++;
                }
                onProgress?.(Math.min(i + ITEMS_PER_BATCH, toEmbed.length), toEmbed.length);
            }
            db.exec('COMMIT');
        }
        catch (err) {
            db.exec('ROLLBACK');
            throw err;
        }
    }
    else {
        // Multi-process: spawn N workers, dynamic dispatch
        const workers = [];
        const workerReady = [];
        for (let i = 0; i < workerCount; i++) {
            const worker = fork(workerScript, [], { stdio: ['pipe', 'pipe', 'pipe', 'ipc'] });
            workers.push(worker);
            workerReady.push(new Promise((resolve) => {
                const handler = (msg) => {
                    if (msg?.type === 'ready') {
                        worker.removeListener('message', handler);
                        resolve();
                    }
                };
                worker.on('message', handler);
                // Timeout: if worker doesn't ready in 30s, skip it
                // NOTE(review): this timer is never cleared/unref'd, so it can
                // keep the event loop alive up to 30s after completion — confirm.
                setTimeout(() => resolve(), 30000);
            }));
        }
        // Wait for all workers to load model
        await Promise.all(workerReady);
        // Workers that died or never connected are dropped here.
        const activeWorkers = workers.filter(w => w.connected);
        if (activeWorkers.length === 0) {
            // Fallback to single process — same loop as the workerCount<=1 path,
            // minus the nl_embeddings secondary insert.
            await initNlEmbedder();
            db.exec('BEGIN');
            try {
                for (let i = 0; i < toEmbed.length; i += ITEMS_PER_BATCH) {
                    const batch = toEmbed.slice(i, i + ITEMS_PER_BATCH);
                    const vecs = await nlEmbedBatch(batch.map(d => d.text));
                    for (let j = 0; j < batch.length; j++) {
                        const doc = batch[j];
                        const blob = Buffer.from(new Float32Array(vecs[j]).buffer);
                        insertStmt.run(doc.nodeId, blob, doc.hash);
                        embedded++;
                    }
                    onProgress?.(Math.min(i + ITEMS_PER_BATCH, toEmbed.length), toEmbed.length);
                }
                db.exec('COMMIT');
            }
            catch (err) {
                db.exec('ROLLBACK');
                throw err;
            }
        }
        else {
            // Dynamic dispatch: each worker requests next batch when done
            db.exec('BEGIN');
            let batchId = 0;
            // Drive one worker until the shared batch queue is exhausted.
            const runWorker = (worker) => {
                return new Promise((resolve) => {
                    const sendNext = () => {
                        const batch = getNextBatch();
                        if (!batch) {
                            worker.send({ type: 'exit' });
                            resolve();
                            return;
                        }
                        worker.send({
                            type: 'embed',
                            batchId: batchId++,
                            items: batch.map(d => ({ nodeId: d.nodeId, text: d.text })),
                        });
                    };
                    worker.on('message', (msg) => {
                        if (msg?.type === 'results') {
                            // Write results to DB (parent process owns all DB writes)
                            for (const r of msg.results) {
                                const blob = Buffer.from(new Float32Array(r.vec).buffer);
                                const meta = docMap.get(r.nodeId);
                                insertStmt.run(r.nodeId, blob, meta?.hash ?? '');
                                if (nlInsertStmt && meta) {
                                    try {
                                        nlInsertStmt.run(r.nodeId, blob, meta.hash, meta.source, meta.text);
                                    }
                                    catch { }
                                }
                                embedded++;
                            }
                            onProgress?.(embedded, toEmbed.length);
                            sendNext(); // request next batch
                        }
                    });
                    // A crashed worker resolves too; its in-flight batch is not retried.
                    worker.on('exit', () => resolve());
                    sendNext(); // start first batch
                });
            };
            try {
                await Promise.all(activeWorkers.map(w => runWorker(w)));
                db.exec('COMMIT');
            }
            catch (err) {
                db.exec('ROLLBACK');
                throw err;
            }
        }
        // Cleanup workers — kill unconditionally, including non-connected ones.
        for (const w of workers) {
            try {
                w.kill();
            }
            catch { }
        }
    }
    return { embedded, skipped, durationMs: Date.now() - t0 };
}
@@ -492,46 +492,38 @@ export async function refreshEmbeddings(db, dirtyFiles, hasEmbeddings) {
492
492
  }
493
493
  if (newNodes.length === 0)
494
494
  return;
495
- // Step 3: Enrich with graph context same as the full analyze pipeline
496
- // Lazy import to avoid circular dependency at module load time
497
- const { fetchGraphContext, enrichTextWithGraphContext } = await import('../embeddings/embedding-pipeline.js');
498
- const { generateEmbeddingText } = await import('../embeddings/text-generator.js');
499
- const { initEmbedder, embedBatch, embeddingToArray } = await import('../embeddings/embedder.js');
500
- const graphContext = fetchGraphContext(db, newNodes);
501
- // Step 4: Generate enriched text + hash for skip detection
495
+ // Step 3: Extract NL text and embed with bge-small (same model as full analyze)
496
+ const { extractNlTexts, initNlEmbedder, nlEmbed } = await import('../embeddings/nl-embedder.js');
502
497
  const { createHash } = await import('crypto');
503
498
  const { getEmbeddingHashes } = await import('../db/adapter.js');
504
499
  const existingHashes = getEmbeddingHashes(db);
500
+ await initNlEmbedder();
505
501
  const toEmbed = [];
506
502
  for (const node of newNodes) {
507
- let text = generateEmbeddingText(node);
508
- const ctx = graphContext.get(node.id);
509
- if (ctx) {
510
- text = enrichTextWithGraphContext(text, ctx);
511
- }
512
- const hash = createHash('md5').update(text).digest('hex');
513
- // Skip if hash unchanged (content + graph context identical)
503
+ const nlDocs = extractNlTexts({
504
+ id: node.id, name: node.name, label: node.label,
505
+ filePath: node.filePath, content: node.content || '',
506
+ startLine: node.startLine ?? null, description: node.description || '',
507
+ });
508
+ // Pick best doc (prefer comment over name)
509
+ const best = nlDocs.find(d => d.source === 'comment') || nlDocs[0];
510
+ if (!best)
511
+ continue;
512
+ const hash = createHash('md5').update(best.text).digest('hex');
514
513
  if (existingHashes.get(node.id) === hash)
515
514
  continue;
516
- toEmbed.push({ node, text, hash });
515
+ toEmbed.push({ nodeId: node.id, text: best.text, hash, source: best.source });
517
516
  }
518
517
  if (toEmbed.length === 0) {
519
518
  console.error(`Code Mapper: All ${newNodes.length} node(s) unchanged (hash skip)`);
520
519
  return;
521
520
  }
522
521
  console.error(`Code Mapper: Embedding ${toEmbed.length}/${newNodes.length} node(s) (${newNodes.length - toEmbed.length} unchanged)`);
523
- // Step 5: Ensure embedder is ready
524
- await initEmbedder();
525
- // Step 6: Batch embed only changed nodes
526
- const embeddings = await embedBatch(toEmbed.map(e => e.text));
527
- // Step 7: Insert with hashes
522
+ // Step 4: Embed and insert
528
523
  const items = [];
529
- for (let i = 0; i < toEmbed.length; i++) {
530
- const entry = toEmbed[i];
531
- const emb = embeddings[i];
532
- if (entry?.node && emb) {
533
- items.push({ nodeId: toNodeId(entry.node.id), embedding: embeddingToArray(emb), textHash: entry.hash });
534
- }
524
+ for (const entry of toEmbed) {
525
+ const vec = await nlEmbed(entry.text);
526
+ items.push({ nodeId: toNodeId(entry.nodeId), embedding: vec, textHash: entry.hash });
535
527
  }
536
528
  insertEmbeddingsBatch(db, items);
537
529
  console.error(`Code Mapper: Embedded ${items.length} node(s) incrementally`);
@@ -42,6 +42,8 @@ export declare class LocalBackend {
42
42
  private tsgoServices;
43
43
  /** Per-repo in-memory embedding cache: nodeId → Float32Array (256-dim) */
44
44
  private embeddingCaches;
45
+ /** Per-repo in-memory NL embedding cache: includes source text for match_reason */
46
+ private nlEmbeddingCaches;
45
47
  /** Get (or lazily start) a tsgo LSP service for a repo. Returns null if unavailable. */
46
48
  private getTsgo;
47
49
  /** Get (or lazily open) the SQLite database for a repo. */
@@ -50,6 +52,10 @@ export declare class LocalBackend {
50
52
  private loadEmbeddingCache;
51
53
  /** Search embeddings in memory — O(N) dot products, no disk I/O */
52
54
  private searchEmbeddingsInMemory;
55
+ /** Load NL embeddings into memory for fast conceptual search */
56
+ private loadNlEmbeddingCache;
57
+ /** Search NL embeddings in memory, returns match_reason text */
58
+ private searchNlEmbeddingsInMemory;
53
59
  /** Hard ceiling — beyond this, incremental is unreliable, warn prominently */
54
60
  private static readonly MAX_INCREMENTAL_FILES;
55
61
  /** Start file system watcher for a repo to detect source changes */
@@ -131,6 +137,11 @@ export declare class LocalBackend {
131
137
  * Semantic vector search helper
132
138
  */
133
139
  private semanticSearch;
140
+ /**
141
+ * NL semantic search: embed query with bge-small, search NL descriptions.
142
+ * Returns match_reason (the NL text that matched) for agent transparency.
143
+ */
144
+ private nlSemanticSearch;
134
145
  /**
135
146
  * Refs-based search: find symbols referenced in files that contain the query identifiers.
136
147
  * Bridges the gap between graph edges (incomplete) and grep (complete for exact names).