@optave/codegraph 1.1.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/embedder.js CHANGED
@@ -1,45 +1,70 @@
1
- import fs from "fs";
2
- import path from "path";
3
-
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
4
3
  import Database from 'better-sqlite3';
5
4
  import { findDbPath, openReadonlyOrFail } from './db.js';
6
- import { warn, debug } from './logger.js';
5
+ import { warn } from './logger.js';
7
6
 
8
7
  // Lazy-load transformers (heavy, optional module)
9
8
  let pipeline = null;
10
- let cos_sim = null;
9
+ let _cos_sim = null;
11
10
  let extractor = null;
12
11
  let activeModel = null;
13
12
 
14
13
  export const MODELS = {
15
- 'minilm': {
14
+ minilm: {
16
15
  name: 'Xenova/all-MiniLM-L6-v2',
17
16
  dim: 384,
18
17
  desc: 'Smallest, fastest (~23MB). General text.',
19
- quantized: true
18
+ quantized: true,
20
19
  },
21
20
  'jina-small': {
22
21
  name: 'Xenova/jina-embeddings-v2-small-en',
23
22
  dim: 512,
24
23
  desc: 'Small, good quality (~33MB). General text.',
25
- quantized: false
24
+ quantized: false,
26
25
  },
27
26
  'jina-base': {
28
27
  name: 'Xenova/jina-embeddings-v2-base-en',
29
28
  dim: 768,
30
29
  desc: 'Good quality (~137MB). General text, 8192 token context.',
31
- quantized: false
30
+ quantized: false,
31
+ },
32
+ 'jina-code': {
33
+ name: 'Xenova/jina-embeddings-v2-base-code',
34
+ dim: 768,
35
+ desc: 'Code-aware (~137MB). Trained on code+text, best for code search.',
36
+ quantized: false,
32
37
  },
33
- 'nomic': {
38
+ nomic: {
34
39
  name: 'Xenova/nomic-embed-text-v1',
35
40
  dim: 768,
36
- desc: 'Best local quality (~137MB). 8192 context, beats OpenAI ada-002.',
37
- quantized: false
38
- }
41
+ desc: 'Good local quality (~137MB). 8192 context.',
42
+ quantized: false,
43
+ },
44
+ 'nomic-v1.5': {
45
+ name: 'nomic-ai/nomic-embed-text-v1.5',
46
+ dim: 768,
47
+ desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.',
48
+ quantized: false,
49
+ },
50
+ 'bge-large': {
51
+ name: 'Xenova/bge-large-en-v1.5',
52
+ dim: 1024,
53
+ desc: 'Best general retrieval (~335MB). Top MTEB scores.',
54
+ quantized: false,
55
+ },
39
56
  };
40
57
 
41
58
  export const DEFAULT_MODEL = 'minilm';
42
- const BATCH_SIZE_MAP = { 'minilm': 32, 'jina-small': 16, 'jina-base': 8, 'nomic': 8 };
59
+ const BATCH_SIZE_MAP = {
60
+ minilm: 32,
61
+ 'jina-small': 16,
62
+ 'jina-base': 8,
63
+ 'jina-code': 8,
64
+ nomic: 8,
65
+ 'nomic-v1.5': 8,
66
+ 'bge-large': 4,
67
+ };
43
68
  const DEFAULT_BATCH_SIZE = 32;
44
69
 
45
70
  function getModelConfig(modelKey) {
@@ -62,7 +87,7 @@ async function loadTransformers() {
62
87
  } catch {
63
88
  console.error(
64
89
  'Semantic search requires @huggingface/transformers.\n' +
65
- 'Install it with: npm install @huggingface/transformers'
90
+ 'Install it with: npm install @huggingface/transformers',
66
91
  );
67
92
  process.exit(1);
68
93
  }
@@ -75,7 +100,7 @@ async function loadModel(modelKey) {
75
100
 
76
101
  const transformers = await loadTransformers();
77
102
  pipeline = transformers.pipeline;
78
- cos_sim = transformers.cos_sim;
103
+ _cos_sim = transformers.cos_sim;
79
104
 
80
105
  console.log(`Loading embedding model: ${config.name} (${config.dim}d)...`);
81
106
  const opts = config.quantized ? { quantized: true } : {};
@@ -119,7 +144,9 @@ export async function embed(texts, modelKey) {
119
144
  * Cosine similarity between two Float32Arrays.
120
145
  */
121
146
  export function cosineSim(a, b) {
122
- let dot = 0, normA = 0, normB = 0;
147
+ let dot = 0,
148
+ normA = 0,
149
+ normB = 0;
123
150
  for (let i = 0; i < a.length; i++) {
124
151
  dot += a[i] * b[i];
125
152
  normA += a[i] * a[i];
@@ -157,9 +184,11 @@ export async function buildEmbeddings(rootDir, modelKey) {
157
184
  db.exec('DELETE FROM embeddings');
158
185
  db.exec('DELETE FROM embedding_meta');
159
186
 
160
- const nodes = db.prepare(
161
- `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`
162
- ).all();
187
+ const nodes = db
188
+ .prepare(
189
+ `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`,
190
+ )
191
+ .all();
163
192
 
164
193
  console.log(`Building embeddings for ${nodes.length} symbols...`);
165
194
 
@@ -200,7 +229,9 @@ export async function buildEmbeddings(rootDir, modelKey) {
200
229
  console.log(`Embedding ${texts.length} symbols...`);
201
230
  const { vectors, dim } = await embed(texts, modelKey);
202
231
 
203
- const insert = db.prepare('INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview) VALUES (?, ?, ?)');
232
+ const insert = db.prepare(
233
+ 'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview) VALUES (?, ?, ?)',
234
+ );
204
235
  const insertMeta = db.prepare('INSERT OR REPLACE INTO embedding_meta (key, value) VALUES (?, ?)');
205
236
  const insertAll = db.transaction(() => {
206
237
  for (let i = 0; i < vectors.length; i++) {
@@ -214,32 +245,31 @@ export async function buildEmbeddings(rootDir, modelKey) {
214
245
  });
215
246
  insertAll();
216
247
 
217
- console.log(`\nStored ${vectors.length} embeddings (${dim}d, ${getModelConfig(modelKey).name}) in graph.db`);
248
+ console.log(
249
+ `\nStored ${vectors.length} embeddings (${dim}d, ${getModelConfig(modelKey).name}) in graph.db`,
250
+ );
218
251
  db.close();
219
252
  }
220
253
 
221
254
  /**
222
- * Semantic search with pre-filter support to reduce the search space.
255
+ * Shared setup for search functions: opens DB, validates embeddings/model, loads rows.
256
+ * Returns { db, rows, modelKey, storedDim } or null on failure (prints error).
223
257
  */
224
- export async function search(query, customDbPath, opts = {}) {
225
- const limit = opts.limit || 15;
226
- const noTests = opts.noTests || false;
227
- const minScore = opts.minScore || 0.2;
228
-
258
+ function _prepareSearch(customDbPath, opts = {}) {
229
259
  const db = openReadonlyOrFail(customDbPath);
230
260
 
231
261
  let count;
232
262
  try {
233
- count = db.prepare("SELECT COUNT(*) as c FROM embeddings").get().c;
263
+ count = db.prepare('SELECT COUNT(*) as c FROM embeddings').get().c;
234
264
  } catch {
235
265
  console.log('No embeddings table found. Run `codegraph embed` first.');
236
266
  db.close();
237
- return;
267
+ return null;
238
268
  }
239
269
  if (count === 0) {
240
270
  console.log('No embeddings found. Run `codegraph embed` first.');
241
271
  db.close();
242
- return;
272
+ return null;
243
273
  }
244
274
 
245
275
  let storedModel = null;
@@ -248,26 +278,23 @@ export async function search(query, customDbPath, opts = {}) {
248
278
  const modelRow = db.prepare("SELECT value FROM embedding_meta WHERE key = 'model'").get();
249
279
  const dimRow = db.prepare("SELECT value FROM embedding_meta WHERE key = 'dim'").get();
250
280
  if (modelRow) storedModel = modelRow.value;
251
- if (dimRow) storedDim = parseInt(dimRow.value);
252
- } catch { /* old DB without meta table */ }
281
+ if (dimRow) storedDim = parseInt(dimRow.value, 10);
282
+ } catch {
283
+ /* old DB without meta table */
284
+ }
253
285
 
254
286
  let modelKey = opts.model || null;
255
287
  if (!modelKey && storedModel) {
256
288
  for (const [key, config] of Object.entries(MODELS)) {
257
- if (config.name === storedModel) { modelKey = key; break; }
289
+ if (config.name === storedModel) {
290
+ modelKey = key;
291
+ break;
292
+ }
258
293
  }
259
294
  }
260
295
 
261
- const { vectors: [queryVec], dim } = await embed([query], modelKey);
262
-
263
- if (storedDim && dim !== storedDim) {
264
- console.log(`Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`);
265
- console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`);
266
- db.close();
267
- return;
268
- }
269
-
270
296
  // Pre-filter: allow filtering by kind or file pattern to reduce search space
297
+ const noTests = opts.noTests || false;
271
298
  const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./;
272
299
  let sql = `
273
300
  SELECT e.node_id, e.vector, e.text_preview, n.name, n.kind, n.file, n.line
@@ -285,15 +312,45 @@ export async function search(query, customDbPath, opts = {}) {
285
312
  params.push(`%${opts.filePattern}%`);
286
313
  }
287
314
  if (conditions.length > 0) {
288
- sql += ' WHERE ' + conditions.join(' AND ');
315
+ sql += ` WHERE ${conditions.join(' AND ')}`;
316
+ }
317
+
318
+ let rows = db.prepare(sql).all(...params);
319
+ if (noTests) {
320
+ rows = rows.filter((row) => !TEST_PATTERN.test(row.file));
289
321
  }
290
322
 
291
- const rows = db.prepare(sql).all(...params);
323
+ return { db, rows, modelKey, storedDim };
324
+ }
325
+
326
+ /**
327
+ * Single-query semantic search — returns data instead of printing.
328
+ * Returns { results: [{ name, kind, file, line, similarity }] } or null on failure.
329
+ */
330
+ export async function searchData(query, customDbPath, opts = {}) {
331
+ const limit = opts.limit || 15;
332
+ const minScore = opts.minScore || 0.2;
333
+
334
+ const prepared = _prepareSearch(customDbPath, opts);
335
+ if (!prepared) return null;
336
+ const { db, rows, modelKey, storedDim } = prepared;
337
+
338
+ const {
339
+ vectors: [queryVec],
340
+ dim,
341
+ } = await embed([query], modelKey);
342
+
343
+ if (storedDim && dim !== storedDim) {
344
+ console.log(
345
+ `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
346
+ );
347
+ console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`);
348
+ db.close();
349
+ return null;
350
+ }
292
351
 
293
352
  const results = [];
294
353
  for (const row of rows) {
295
- if (noTests && TEST_PATTERN.test(row.file)) continue;
296
-
297
354
  const vec = new Float32Array(new Uint8Array(row.vector).buffer);
298
355
  const sim = cosineSim(queryVec, vec);
299
356
 
@@ -303,28 +360,165 @@ export async function search(query, customDbPath, opts = {}) {
303
360
  kind: row.kind,
304
361
  file: row.file,
305
362
  line: row.line,
306
- similarity: sim
363
+ similarity: sim,
307
364
  });
308
365
  }
309
366
  }
310
367
 
311
368
  results.sort((a, b) => b.similarity - a.similarity);
369
+ db.close();
370
+ return { results: results.slice(0, limit) };
371
+ }
372
+
373
/**
 * Multi-query semantic search with Reciprocal Rank Fusion (RRF).
 *
 * Each query is embedded, every stored row is scored against every query with
 * cosine similarity, rows scoring below `minScore` are dropped per query, and
 * the per-query rankings are fused as sum(1 / (k + rank)) per row.
 *
 * @param {string[]} queries - Query strings to embed and fuse.
 * @param {string} [customDbPath] - Forwarded unchanged to `_prepareSearch`.
 * @param {object} [opts] - Also forwarded to `_prepareSearch`.
 * @param {number} [opts.limit=15] - Maximum number of fused results returned.
 * @param {number} [opts.minScore=0.2] - Per-query similarity cut-off.
 * @param {number} [opts.rrfK=60] - The RRF constant `k`.
 * @returns {Promise<{results: Array<{name, kind, file, line, rrf, queryScores}>}|null>}
 *   Fused results sorted by descending RRF score, or null when `_prepareSearch`
 *   fails or the query embedding dimension differs from the stored one.
 */
export async function multiSearchData(queries, customDbPath, opts = {}) {
  const limit = opts.limit || 15;
  const minScore = opts.minScore || 0.2;
  const k = opts.rrfK || 60;

  const prepared = _prepareSearch(customDbPath, opts);
  if (!prepared) return null;
  const { db, rows, modelKey, storedDim } = prepared;

  // Everything below runs with an open DB handle. embed() is awaited and may
  // reject, so the handle is released in `finally` rather than on each path.
  try {
    // Nothing to rank; skip the embedding call entirely.
    if (queries.length === 0) return { results: [] };

    const { vectors: queryVecs, dim } = await embed(queries, modelKey);

    // Check dimensions first: scores against vectors of another size are
    // meaningless, and so are the query-similarity warnings below.
    if (storedDim && dim !== storedDim) {
      console.log(
        `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
      );
      console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`);
      return null;
    }

    // Warn about similar queries that may bias RRF results
    const SIMILARITY_WARN_THRESHOLD = 0.85;
    for (let i = 0; i < queryVecs.length; i++) {
      for (let j = i + 1; j < queryVecs.length; j++) {
        const sim = cosineSim(queryVecs[i], queryVecs[j]);
        if (sim >= SIMILARITY_WARN_THRESHOLD) {
          warn(
            `Queries "${queries[i]}" and "${queries[j]}" are very similar ` +
              `(${(sim * 100).toFixed(0)}% cosine similarity). ` +
              `This may bias RRF results toward their shared matches. ` +
              `Consider using more distinct queries.`,
          );
        }
      }
    }

    // Parse row vectors once; they are reused for every query.
    const rowVecs = rows.map((row) => new Float32Array(new Uint8Array(row.vector).buffer));

    // rowIndex -> { rrfScore, queryScores[] }
    const fusionMap = new Map();
    queries.forEach((query, qi) => {
      // Score every row for this query and keep those at or above minScore.
      const scored = [];
      rowVecs.forEach((vec, rowIndex) => {
        const similarity = cosineSim(queryVecs[qi], vec);
        if (similarity >= minScore) {
          scored.push({ rowIndex, similarity });
        }
      });
      scored.sort((a, b) => b.similarity - a.similarity);

      // Fuse: each row gains 1 / (k + rank) for this query (ranks are 1-indexed).
      scored.forEach(({ rowIndex, similarity }, position) => {
        const rank = position + 1;
        let entry = fusionMap.get(rowIndex);
        if (!entry) {
          entry = { rrfScore: 0, queryScores: [] };
          fusionMap.set(rowIndex, entry);
        }
        entry.rrfScore += 1 / (k + rank);
        entry.queryScores.push({ query, similarity, rank });
      });
    });

    // Build results sorted by RRF score
    const results = [];
    for (const [rowIndex, entry] of fusionMap) {
      const row = rows[rowIndex];
      results.push({
        name: row.name,
        kind: row.kind,
        file: row.file,
        line: row.line,
        rrf: entry.rrfScore,
        queryScores: entry.queryScores,
      });
    }

    results.sort((a, b) => b.rrf - a.rrf);
    return { results: results.slice(0, limit) };
  } finally {
    db.close();
  }
}
330
465
 
466
/**
 * One-character marker for a symbol kind: 'f' for functions, '*' for classes,
 * 'o' for anything else.
 */
function _kindIcon(kind) {
  if (kind === 'function') return 'f';
  if (kind === 'class') return '*';
  return 'o';
}

/**
 * Render a similarity score as a run of '#' characters (20 characters = 100%).
 * The length is clamped at zero because String.prototype.repeat throws a
 * RangeError for negative counts, and a cosine similarity can be negative
 * when the caller lowers minScore below 0.
 */
function _similarityBar(similarity) {
  const length = Math.round(similarity * 20);
  return '#'.repeat(length > 0 ? length : 0);
}

/**
 * Semantic search with pre-filter support — CLI wrapper with multi-query detection.
 *
 * The query string is split on ';'. Zero or one non-empty part is run through
 * `searchData` and printed in the single-query format; two or more parts are
 * run through `multiSearchData` and printed with their RRF scores.
 *
 * @param {string} query - Query text; ';' separates multiple queries.
 * @param {string} [customDbPath] - Forwarded to the data functions.
 * @param {object} [opts] - Forwarded to the data functions; `opts.rrfK` is
 *   also echoed in the multi-query header.
 * @returns {Promise<void>} Prints to stdout; prints nothing further when the
 *   data function returns null.
 */
export async function search(query, customDbPath, opts = {}) {
  // Split by semicolons, trim, filter empties
  const queries = query
    .split(';')
    .map((q) => q.trim())
    .filter((q) => q.length > 0);

  if (queries.length <= 1) {
    // Single-query path — preserve original output format
    const singleQuery = queries[0] || query;
    const data = await searchData(singleQuery, customDbPath, opts);
    if (!data) return;

    console.log(`\nSemantic search: "${singleQuery}"\n`);

    if (data.results.length === 0) {
      console.log(' No results above threshold.');
    } else {
      for (const r of data.results) {
        console.log(` ${(r.similarity * 100).toFixed(1)}% ${_similarityBar(r.similarity)}`);
        console.log(` ${_kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`);
      }
    }

    console.log(`\n ${data.results.length} results shown\n`);
    return;
  }

  // Multi-query path — RRF ranking
  const data = await multiSearchData(queries, customDbPath, opts);
  if (!data) return;

  console.log(`\nMulti-query semantic search (RRF, k=${opts.rrfK || 60}):`);
  queries.forEach((q, i) => {
    console.log(` [${i + 1}] "${q}"`);
  });
  console.log();

  if (data.results.length === 0) {
    console.log(' No results above threshold.');
  } else {
    for (const r of data.results) {
      console.log(` RRF ${r.rrf.toFixed(4)} ${_kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`);

      // A plain indexOf would label every copy of a repeated query (e.g.
      // "foo; foo") with the first index. Remember, per query text, where the
      // previous match was found so the next copy resolves to the next index.
      const resumeAt = new Map();
      for (const qs of r.queryScores) {
        let queryIndex = queries.indexOf(qs.query, resumeAt.get(qs.query) || 0);
        if (queryIndex === -1) queryIndex = queries.indexOf(qs.query);
        resumeAt.set(qs.query, queryIndex + 1);
        console.log(
          ` [${queryIndex + 1}] ${(qs.similarity * 100).toFixed(1)}% ${_similarityBar(qs.similarity)} (rank ${qs.rank})`,
        );
      }
    }
  }

  console.log(`\n ${data.results.length} results shown\n`);
}