@zuvia-software-solutions/code-mapper 2.4.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -347,86 +347,26 @@ export const analyzeCommand = async (inputPath, options) => {
347
347
  catch { /* some may fail if node was removed, that's fine */ }
348
348
  }
349
349
  }
350
- // Phase 4: Embeddings (90-98%)
350
+ // Phase 4: Embeddings — bge-small NL embeddings (CPU, Node.js, no Python)
351
+ // Extracts natural language from code (comments, names, enums, patterns)
352
+ // and embeds with bge-small-en-v1.5 (33M params, 384-dim, ~6ms/doc).
351
353
  const stats = getStats(db);
352
354
  let embeddingFailed = false;
353
355
  if (options?.embeddings) {
354
356
  recordPhase('embeddings');
355
- updateBar(90, 'Generating embeddings...');
356
- // Close DB so Python can write to it
357
- closeDb(dbPath);
358
- // Run Python embedder in batch mode — reads from SQLite, embeds, writes back.
359
- // Zero IPC overhead: ~3x faster than Node↔Python JSON streaming.
360
- const { spawn: spawnChild } = await import('child_process');
361
- const { fileURLToPath } = await import('url');
362
- const mlxScript = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..', '..', 'models', 'mlx-embedder.py');
363
- await new Promise((resolve) => {
364
- // Use spawn (not execFile) — no internal buffer limit, streams only.
365
- // execFile buffers all stdout in memory which causes OOM/kill on large codebases.
366
- const proc = spawnChild('python3', [mlxScript, 'batch', dbPath, '--dims', '256', '--max-tokens', '2048'], {
367
- stdio: ['ignore', 'pipe', 'pipe'],
368
- });
369
- let stderrBuf = '';
370
- proc.stderr?.on('data', (chunk) => {
371
- stderrBuf += chunk.toString();
372
- if (stderrBuf.length > 10240)
373
- stderrBuf = stderrBuf.slice(-10240);
374
- });
375
- proc.on('close', (code) => {
376
- if (code !== 0) {
377
- // Non-fatal: index is already saved, just embeddings failed
378
- console.error(`\n Warning: Embedding failed (exit code ${code}). Index saved without embeddings.`);
379
- if (stderrBuf.trim())
380
- console.error(` ${stderrBuf.trim().split('\n').slice(-3).join('\n ')}`);
381
- embeddingFailed = true;
382
- }
383
- resolve();
384
- });
385
- proc.on('error', (err) => {
386
- console.error(`\n Warning: Embedding failed: ${err.message}. Index saved without embeddings.`);
387
- embeddingFailed = true;
388
- resolve();
389
- });
390
- // Stream progress from Python's JSON lines on stdout
391
- let lineBuf = '';
392
- proc.stdout?.on('data', (chunk) => {
393
- lineBuf += chunk.toString();
394
- const lines = lineBuf.split('\n');
395
- lineBuf = lines.pop() || '';
396
- for (const line of lines) {
397
- if (!line.trim())
398
- continue;
399
- try {
400
- const msg = JSON.parse(line);
401
- if (msg.phase === 'downloading' || msg.phase === 'converting') {
402
- updateBar(90, msg.message);
403
- }
404
- else if (msg.phase === 'loaded') {
405
- updateBar(91, `Model loaded (${msg.load_ms}ms)`);
406
- }
407
- else if (msg.phase === 'queried') {
408
- updateBar(92, `Found ${msg.nodes} embeddable nodes${msg.skipped_tests ? ` (${msg.skipped_tests} test files skipped)` : ''}`);
409
- }
410
- else if (msg.phase === 'prepared') {
411
- updateBar(93, `${msg.to_embed} to embed, ${msg.skipped} cached`);
412
- }
413
- else if (msg.phase === 'embedding') {
414
- const scaled = 93 + Math.round((msg.progress / 100) * 4);
415
- updateBar(scaled, `Embedding... ${msg.progress}% (${msg.embedded} written)`);
416
- }
417
- else if (msg.phase === 'embedded') {
418
- updateBar(97, `Embedded ${msg.count} nodes (${(msg.ms / 1000).toFixed(1)}s)`);
419
- }
420
- else if (msg.phase === 'done') {
421
- updateBar(98, `Embeddings complete (${msg.embedded} new, ${msg.skipped} cached)`);
422
- }
423
- }
424
- catch { }
425
- }
357
+ updateBar(90, 'Generating embeddings (bge-small)...');
358
+ const { buildNlEmbeddings } = await import('../core/embeddings/nl-embedder.js');
359
+ try {
360
+ const result = await buildNlEmbeddings(db, (current, total) => {
361
+ const pct = 90 + Math.round((current / Math.max(total, 1)) * 8);
362
+ updateBar(pct, `Embeddings (${current}/${total})`, 'Embeddings');
426
363
  });
427
- });
428
- // Reopen DB after Python is done
429
- db = openDb(dbPath);
364
+ updateBar(98, `Embeddings: ${result.embedded} embedded, ${result.skipped} cached (${(result.durationMs / 1000).toFixed(1)}s)`);
365
+ }
366
+ catch (err) {
367
+ console.error(`\n Warning: Embedding failed: ${err instanceof Error ? err.message : err}`);
368
+ embeddingFailed = true;
369
+ }
430
370
  }
431
371
  // Phase 5: Finalize (98-100%)
432
372
  recordPhase('finalize');
@@ -519,7 +459,7 @@ export const analyzeCommand = async (inputPath, options) => {
519
459
  'search-text': 'Search text',
520
460
  fts: 'FTS indexing',
521
461
  'restore-embeddings': 'Restore embeddings',
522
- embeddings: 'Embeddings (MLX)',
462
+ embeddings: 'Embeddings (bge-small)',
523
463
  finalize: 'Finalize & context',
524
464
  done: 'Done',
525
465
  };
package/dist/cli/index.js CHANGED
@@ -22,7 +22,7 @@ program
22
22
  .command('analyze [path]')
23
23
  .description('Index a repository (full analysis)')
24
24
  .option('-f, --force', 'Force full re-index even if up to date')
25
- .option('--embeddings', 'Enable embedding generation for semantic search (on by default)', true)
25
+ .option('--embeddings', 'Generate semantic embeddings (bge-small, CPU, fast)')
26
26
  .option('--no-embeddings', 'Skip embedding generation')
27
27
  .option('--no-tsgo', 'Skip tsgo LSP for call resolution (faster, less accurate)')
28
28
  .option('-v, --verbose', 'Enable verbose ingestion warnings (default: false)')
@@ -138,6 +138,15 @@ export declare function searchFileWords(db: Database.Database, query: string, li
138
138
  }>;
139
139
  /** Clear all file words (used before full rebuild) */
140
140
  export declare function clearFileWords(db: Database.Database): void;
141
+ /** Count NL embeddings in the index */
142
+ export declare function countNlEmbeddings(db: Database.Database): number;
143
+ /** Search NL embeddings via brute-force cosine similarity */
144
+ export declare function searchNlVector(db: Database.Database, queryVec: number[], limit?: number, maxDistance?: number): Array<{
145
+ nodeId: string;
146
+ distance: number;
147
+ source: string;
148
+ text: string;
149
+ }>;
141
150
  /** Escape a string for use in SQL single-quoted literals. */
142
151
  export declare function escapeSql(value: string): string;
143
152
  /** Execute a raw SQL query and return rows. */
@@ -594,6 +594,46 @@ export function clearFileWords(db) {
594
594
  db.prepare('DELETE FROM file_words').run();
595
595
  }
596
596
  // ---------------------------------------------------------------------------
597
+ // NL Embeddings
598
+ // ---------------------------------------------------------------------------
599
/**
 * Report how many rows the `nl_embeddings` table currently holds.
 *
 * The lookup is best-effort: if preparing or running the statement throws
 * for any reason, the error is swallowed and 0 is returned instead.
 *
 * @param db - database handle exposing `prepare(sql).get()`
 * @returns the value of the `cnt` column, or 0 when the query fails or
 *          yields no usable count
 */
export function countNlEmbeddings(db) {
    let total = 0;
    try {
        const countRow = db.prepare('SELECT COUNT(*) as cnt FROM nl_embeddings').get();
        total = countRow?.cnt ?? 0;
    }
    catch {
        total = 0;
    }
    return total;
}
609
/**
 * Search NL embeddings via brute-force cosine similarity.
 *
 * Loads every row of `nl_embeddings`, reinterprets each `embedding` blob as
 * 32-bit floats, and scores it against `queryVec`. Only the dimensions both
 * vectors share are compared.
 *
 * @param db - database handle exposing `prepare(sql).all()`
 * @param queryVec - query embedding as an array of numbers
 * @param limit - maximum number of matches to return (default 10)
 * @param maxDistance - exclusive upper bound on cosine distance
 *                      (1 - cosine similarity); default 0.5
 * @returns matches sorted by ascending distance, at most `limit` entries;
 *          an empty array when the table cannot be queried
 */
export function searchNlVector(db, queryVec, limit = 10, maxDistance = 0.5) {
    let rows;
    try {
        rows = db.prepare('SELECT nodeId, embedding, source, text FROM nl_embeddings').all();
    }
    catch {
        // Best-effort: an unreadable/missing table means "no matches"
        return [];
    }
    const BYTES_PER_FLOAT = Float32Array.BYTES_PER_ELEMENT;
    const results = [];
    for (const row of rows) {
        const blob = row.embedding;
        // Ignore trailing bytes that do not form a whole float
        const floatCount = Math.floor(blob.byteLength / BYTES_PER_FLOAT);
        // A Float32Array view requires a 4-byte-aligned offset; a Buffer that is a
        // slice of a larger ArrayBuffer may not satisfy that, so copy in that case.
        const vec = blob.byteOffset % BYTES_PER_FLOAT === 0
            ? new Float32Array(blob.buffer, blob.byteOffset, floatCount)
            : new Float32Array(blob.buffer.slice(blob.byteOffset, blob.byteOffset + floatCount * BYTES_PER_FLOAT));
        const dims = Math.min(queryVec.length, vec.length);
        let dot = 0, normA = 0, normB = 0;
        for (let i = 0; i < dims; i++) {
            dot += queryVec[i] * vec[i];
            normA += queryVec[i] * queryVec[i];
            normB += vec[i] * vec[i];
        }
        const denom = Math.sqrt(normA) * Math.sqrt(normB);
        // Zero-length or NaN-containing vectors have no defined similarity
        if (!(denom > 0))
            continue;
        const distance = 1 - dot / denom;
        if (distance < maxDistance) {
            results.push({ nodeId: row.nodeId, distance, source: row.source, text: row.text });
        }
    }
    results.sort((a, b) => a.distance - b.distance);
    return results.slice(0, limit);
}
636
+ // ---------------------------------------------------------------------------
597
637
  // Raw SQL escape (for dynamic queries in local-backend.ts)
598
638
  // ---------------------------------------------------------------------------
599
639
  /** Escape a string for use in SQL single-quoted literals. */
@@ -68,6 +68,14 @@ export interface EmbeddingRow {
68
68
  readonly embedding: Buffer;
69
69
  readonly textHash: string | null;
70
70
  }
71
+ /** A NL embedding row as stored in the `nl_embeddings` table */
72
+ export interface NlEmbeddingRow {
73
+ readonly nodeId: NodeId;
74
+ readonly embedding: Buffer;
75
+ readonly textHash: string | null;
76
+ readonly source: string;
77
+ readonly text: string;
78
+ }
71
79
  /** A reference occurrence as stored in the `refs` table */
72
80
  export interface RefsRow {
73
81
  readonly symbol: string;
@@ -118,4 +126,4 @@ export interface EdgeInsert {
118
126
  }
119
127
  /** Legacy edge table name constant (kept for compatibility) */
120
128
  export declare const REL_TABLE_NAME = "CodeRelation";
121
- export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT '',\n searchText TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n searchText,\n 
filePath,\n content,\n content='nodes',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\n\n-- Refs: identifier occurrence index (pre-computed grep)\nCREATE TABLE IF NOT EXISTS refs (\n symbol TEXT NOT NULL,\n filePath TEXT NOT NULL,\n line INTEGER NOT NULL\n);\n\nCREATE INDEX IF NOT EXISTS idx_refs_symbol ON refs(symbol);\nCREATE INDEX IF NOT EXISTS idx_refs_filePath ON refs(filePath);\nCREATE INDEX IF NOT EXISTS idx_refs_symbol_file ON refs(symbol, filePath);\n\n-- File-level word index for conceptual search\nCREATE TABLE IF NOT EXISTS file_words (\n filePath TEXT PRIMARY KEY,\n words TEXT NOT NULL DEFAULT ''\n);\n\nCREATE VIRTUAL TABLE IF NOT EXISTS file_words_fts USING fts5(\n words,\n content='file_words',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ai AFTER INSERT ON file_words BEGIN\n INSERT INTO file_words_fts(rowid, words) VALUES (new.rowid, new.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ad AFTER DELETE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, 
rowid, words) VALUES ('delete', old.rowid, old.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_au AFTER UPDATE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES (new.rowid, new.words);\nEND;\n";
129
+ export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT '',\n searchText TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- NL Embeddings: natural language description vectors (bge-small, 384-dim)\nCREATE TABLE IF NOT EXISTS nl_embeddings (\n nodeId TEXT NOT NULL,\n 
embedding BLOB NOT NULL,\n textHash TEXT,\n source TEXT NOT NULL DEFAULT 'comment',\n text TEXT NOT NULL DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nl_emb_nodeId ON nl_embeddings(nodeId);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n searchText,\n filePath,\n content,\n content='nodes',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\n\n-- Refs: identifier occurrence index (pre-computed grep)\nCREATE TABLE IF NOT EXISTS refs (\n symbol TEXT NOT NULL,\n filePath TEXT NOT NULL,\n line INTEGER NOT NULL\n);\n\nCREATE INDEX IF NOT EXISTS idx_refs_symbol ON refs(symbol);\nCREATE INDEX IF NOT EXISTS idx_refs_filePath ON refs(filePath);\nCREATE INDEX IF NOT EXISTS idx_refs_symbol_file ON refs(symbol, filePath);\n\n-- File-level word index for conceptual search\nCREATE TABLE IF NOT EXISTS file_words (\n filePath TEXT PRIMARY KEY,\n words TEXT NOT NULL DEFAULT ''\n);\n\nCREATE VIRTUAL TABLE IF NOT EXISTS file_words_fts USING fts5(\n 
words,\n content='file_words',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ai AFTER INSERT ON file_words BEGIN\n INSERT INTO file_words_fts(rowid, words) VALUES (new.rowid, new.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_ad AFTER DELETE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\nEND;\nCREATE TRIGGER IF NOT EXISTS file_words_fts_au AFTER UPDATE ON file_words BEGIN\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES ('delete', old.rowid, old.words);\n INSERT INTO file_words_fts(file_words_fts, rowid, words) VALUES (new.rowid, new.words);\nEND;\n";
@@ -114,6 +114,17 @@ CREATE TABLE IF NOT EXISTS embeddings (
114
114
  textHash TEXT
115
115
  );
116
116
 
117
+ -- NL Embeddings: natural language description vectors (bge-small, 384-dim)
118
+ CREATE TABLE IF NOT EXISTS nl_embeddings (
119
+ nodeId TEXT NOT NULL,
120
+ embedding BLOB NOT NULL,
121
+ textHash TEXT,
122
+ source TEXT NOT NULL DEFAULT 'comment',
123
+ text TEXT NOT NULL DEFAULT ''
124
+ );
125
+
126
+ CREATE INDEX IF NOT EXISTS idx_nl_emb_nodeId ON nl_embeddings(nodeId);
127
+
117
128
  -- FTS5 virtual table (auto-updated via triggers)
118
129
  CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(
119
130
  name,
@@ -1,5 +1,4 @@
1
- /** @file index.ts @description Barrel re-exports for the embedding pipeline system */
1
+ /** @file Barrel re-exports for the embedding system (bge-small NL embedder) */
2
2
  export * from './types.js';
3
- export * from './embedder.js';
4
3
  export * from './text-generator.js';
5
- export * from './embedding-pipeline.js';
4
+ export * from './nl-embedder.js';
@@ -1,6 +1,5 @@
1
1
  // code-mapper/src/core/embeddings/index.ts
2
- /** @file index.ts @description Barrel re-exports for the embedding pipeline system */
2
+ /** @file Barrel re-exports for the embedding system (bge-small NL embedder) */
3
3
  export * from './types.js';
4
- export * from './embedder.js';
5
4
  export * from './text-generator.js';
6
- export * from './embedding-pipeline.js';
5
+ export * from './nl-embedder.js';
@@ -0,0 +1,8 @@
1
+ /**
2
+ * @file Worker process for parallel NL embedding.
3
+ * Spawned by buildNlEmbeddings — loads bge-small independently,
4
+ * embeds texts received via IPC, sends vectors back.
5
+ *
6
+ * Same architecture as parallel tsgo: N processes, each with own model.
7
+ */
8
+ export {};
@@ -0,0 +1,38 @@
1
+ // code-mapper/src/core/embeddings/nl-embed-worker.ts
2
+ /**
3
+ * @file Worker process for parallel NL embedding.
4
+ * Spawned by buildNlEmbeddings — loads bge-small independently,
5
+ * embeds texts received via IPC, sends vectors back.
6
+ *
7
+ * Same architecture as parallel tsgo: N processes, each with own model.
8
+ */
9
+ import { pipeline } from '@huggingface/transformers';
10
+ const MODEL_ID = 'Xenova/bge-small-en-v1.5';
11
/**
 * Worker entry point.
 *
 * Loads the feature-extraction pipeline for MODEL_ID, announces readiness to
 * the parent with `{ type: 'ready' }`, then serves messages:
 *   - `{ type: 'embed', items, batchId }`: embeds each item's `text` and
 *     replies with `{ type: 'results', results, failed, batchId }`, where
 *     `results` holds `{ nodeId, vec }` entries and `failed` lists the
 *     nodeIds whose embedding threw.
 *   - `{ type: 'exit' }`: terminates the process with code 0.
 *
 * Rejects (so the caller's catch can exit non-zero) when no IPC channel is
 * available or the model fails to load.
 */
async function main() {
    // Fail fast, before the expensive model load, when there is no parent to talk to
    if (typeof process.send !== 'function') {
        throw new Error('NL embed worker requires an IPC channel (process.send is unavailable)');
    }
    const send = process.send.bind(process);
    // Load model
    const extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
    send({ type: 'ready' });
    // Process messages from parent
    process.on('message', async (msg) => {
        try {
            if (msg?.type === 'embed') {
                // Tolerate a malformed batch so the parent still gets a reply for batchId
                const items = Array.isArray(msg.items) ? msg.items : [];
                const results = [];
                const failed = [];
                for (const item of items) {
                    try {
                        const result = await extractor(item.text, { pooling: 'cls', normalize: true });
                        results.push({ nodeId: item.nodeId, vec: Array.from(result.data) });
                    }
                    catch {
                        // Keep going, but record which node could not be embedded
                        failed.push(item?.nodeId);
                    }
                }
                send({ type: 'results', results, failed, batchId: msg.batchId });
            }
            else if (msg?.type === 'exit') {
                process.exit(0);
            }
        }
        catch (err) {
            // An async listener's rejection would otherwise go unhandled
            // (e.g. the IPC channel closed while replying)
            console.error('NL embed worker failed:', err);
            process.exit(1);
        }
    });
}
35
+ main().catch(err => {
36
+ console.error('NL embed worker failed:', err);
37
+ process.exit(1);
38
+ });
@@ -0,0 +1,44 @@
1
+ /**
2
+ * @file Natural language embedder using bge-small-en-v1.5.
3
+ *
4
+ * Runs entirely in Node.js via @huggingface/transformers — no Python, no GPU.
5
+ * Embeds human-readable descriptions extracted from code (JSDoc comments,
6
+ * enum values, type patterns, file headers) for conceptual search.
7
+ *
8
+ * 33M params, q8 quantized, 384-dim embeddings, ~6ms/text on CPU.
9
+ */
10
+ import type Database from 'better-sqlite3';
11
+ /** Initialize the NL embedding model (lazy, idempotent) */
12
+ export declare function initNlEmbedder(): Promise<void>;
13
+ /** Check if the NL embedder is ready */
14
+ export declare function isNlEmbedderReady(): boolean;
15
+ /** Embed a single text; resolves to the embedding vector as a plain number[] */
16
+ export declare function nlEmbed(text: string): Promise<number[]>;
17
+ /** Embed a batch of texts (processes in sub-batches for memory efficiency) */
18
+ export declare function nlEmbedBatch(texts: string[]): Promise<number[][]>;
19
+ interface NodeForNl {
20
+ id: string;
21
+ name: string;
22
+ label: string;
23
+ filePath: string;
24
+ content: string;
25
+ startLine: number | null;
26
+ description: string;
27
+ }
28
+ interface NlDocument {
29
+ nodeId: string;
30
+ source: string;
31
+ text: string;
32
+ }
33
+ /** Build NL documents from a node */
34
+ export declare function extractNlTexts(node: NodeForNl): NlDocument[];
35
+ /**
36
+ * Build NL embeddings for all eligible nodes in the database.
37
+ * Reads nodes, extracts NL text, embeds with bge-small, writes to nl_embeddings.
38
+ */
39
+ export declare function buildNlEmbeddings(db: Database.Database, onProgress?: (current: number, total: number) => void): Promise<{
40
+ embedded: number;
41
+ skipped: number;
42
+ durationMs: number;
43
+ }>;
44
+ export {};