@comfanion/usethis_search 0.1.5 → 3.0.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  // OpenCode Vectorizer - Semantic Code Search with Multi-Index Support
2
+ // v2/v3: Content cleaning, semantic chunking, hybrid search, metadata, cache, metrics, graph context
2
3
 
3
4
  import { pipeline, env } from "@xenova/transformers";
4
5
  import * as lancedb from "vectordb";
@@ -6,6 +7,17 @@ import fs from "fs/promises";
6
7
  import path from "path";
7
8
  import crypto from "crypto";
8
9
 
10
+ // ── New modules ─────────────────────────────────────────────────────────────
11
+ import { cleanContent, DEFAULT_CLEANING_CONFIG } from "./content-cleaner.ts";
12
+ import { extractFileMetadata, detectFileType, detectLanguage } from "./metadata-extractor.ts";
13
+ import { chunkContent, DEFAULT_CHUNKING_CONFIG } from "./chunkers/chunker-factory.ts";
14
+ import { BM25Index } from "./bm25-index.ts";
15
+ import { mergeResults, DEFAULT_HYBRID_CONFIG } from "./hybrid-search.ts";
16
+ import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
17
+ import { SearchMetrics } from "./search-metrics.ts";
18
+ import { GraphDB } from "./graph-db.ts";
19
+ import { GraphBuilder } from "./graph-builder.ts";
20
+
9
21
  // Suppress transformers.js logs unless DEBUG is set
10
22
  const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
11
23
  if (!DEBUG) {
@@ -57,6 +69,13 @@ let GLOBAL_IGNORE = [];
57
69
  // Default embedding model (fast). Can be overridden by config.
58
70
  let EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2";
59
71
 
72
// ── Extended config parsed from YAML ────────────────────────────────────────
// Module-level settings; loadConfig() overwrites them when the matching YAML
// sections (cleaning / chunking / search / quality) are present, otherwise the
// module defaults below apply.
let CLEANING_CONFIG = { ...DEFAULT_CLEANING_CONFIG };
let CHUNKING_CONFIG = { ...DEFAULT_CHUNKING_CONFIG };
let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
let METRICS_ENABLED = false; // quality.enable_metrics
let CACHE_ENABLED = true; // quality.enable_cache
78
+
60
79
  function defaultVectorizerYaml() {
61
80
  return (
62
81
  `vectorizer:\n` +
@@ -64,6 +83,40 @@ function defaultVectorizerYaml() {
64
83
  ` auto_index: true\n` +
65
84
  ` model: \"${EMBEDDING_MODEL}\"\n` +
66
85
  ` debounce_ms: 1000\n` +
86
+ `\n` +
87
+ ` # Content cleaning before chunking\n` +
88
+ ` cleaning:\n` +
89
+ ` remove_toc: true\n` +
90
+ ` remove_frontmatter_metadata: false\n` +
91
+ ` remove_imports: false\n` +
92
+ ` remove_comments: false\n` +
93
+ `\n` +
94
+ ` # Chunking strategy\n` +
95
+ ` chunking:\n` +
96
+ ` strategy: \"semantic\" # fixed | semantic\n` +
97
+ ` markdown:\n` +
98
+ ` split_by_headings: true\n` +
99
+ ` min_chunk_size: 200\n` +
100
+ ` max_chunk_size: 2000\n` +
101
+ ` preserve_heading_hierarchy: true\n` +
102
+ ` code:\n` +
103
+ ` split_by_functions: true\n` +
104
+ ` include_function_signature: true\n` +
105
+ ` min_chunk_size: 300\n` +
106
+ ` max_chunk_size: 1500\n` +
107
+ ` fixed:\n` +
108
+ ` max_chars: 1500\n` +
109
+ `\n` +
110
+ ` # Hybrid search (vector + BM25)\n` +
111
+ ` search:\n` +
112
+ ` hybrid: false\n` +
113
+ ` bm25_weight: 0.3\n` +
114
+ `\n` +
115
+ ` # Quality monitoring\n` +
116
+ ` quality:\n` +
117
+ ` enable_metrics: false\n` +
118
+ ` enable_cache: true\n` +
119
+ `\n` +
67
120
  ` indexes:\n` +
68
121
  ` code:\n` +
69
122
  ` enabled: true\n` +
@@ -104,8 +157,25 @@ async function ensureDefaultConfig(projectRoot) {
104
157
  }
105
158
  }
106
159
 
160
+ // ── YAML mini-parser helpers ────────────────────────────────────────────────
161
+
162
/**
 * Read a boolean key from an indented line of a YAML section string.
 * Returns `fallback` when the key is absent or not true/false.
 */
function parseBool(section, key, fallback) {
  const pattern = new RegExp(`^\\s+${key}:\\s*(true|false)`, "m");
  const match = pattern.exec(section);
  if (match === null) return fallback;
  return match[1] === "true";
}
166
+
167
/**
 * Read a non-negative integer or decimal key (e.g. "  max_chars: 1500",
 * "  bm25_weight: 0.3") from a YAML section string.
 * Returns `fallback` when the key is absent or not numeric.
 */
function parseNumber(section, key, fallback) {
  const match = new RegExp(`^\\s+${key}:\\s*(\\d+(?:\\.\\d+)?)`, "m").exec(section);
  return match === null ? fallback : Number(match[1]);
}
171
+
172
/**
 * Read a string key from a YAML section string.
 * Quoted values ("..."/'...') are taken verbatim (and may contain '#');
 * unquoted values stop at an inline "# comment". The previous single-regex
 * version captured the trailing comment for unquoted values (the default
 * config writes `strategy: "semantic" # fixed | semantic`).
 * Returns `fallback` when the key is absent or the value is empty.
 */
function parseString(section, key, fallback) {
  const m = section.match(
    new RegExp(`^\\s+${key}:\\s*(?:"([^"\\n]*)"|'([^'\\n]*)'|([^#\\n]+))`, "m")
  );
  if (!m) return fallback;
  const value = (m[1] ?? m[2] ?? m[3] ?? "").trim();
  return value === "" ? fallback : value;
}
176
+
107
177
  /**
108
- * Load index configuration from .opencode/vectorizer.yaml (preferred) or .opencode/config.yaml.
178
+ * Load index configuration from .opencode/vectorizer.yaml.
109
179
  */
110
180
  async function loadConfig(projectRoot) {
111
181
  try {
@@ -142,6 +212,61 @@ async function loadConfig(projectRoot) {
142
212
  if (DEBUG) console.log("[vectorizer] Using model from config:", EMBEDDING_MODEL);
143
213
  }
144
214
 
215
+ // ── Parse cleaning config ───────────────────────────────────────────────
216
+ const cleaningMatch = section.match(/^\s{2}cleaning:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
217
+ if (cleaningMatch) {
218
+ const cs = cleaningMatch[1];
219
+ CLEANING_CONFIG = {
220
+ remove_toc: parseBool(cs, "remove_toc", true),
221
+ remove_frontmatter_metadata: parseBool(cs, "remove_frontmatter_metadata", false),
222
+ remove_imports: parseBool(cs, "remove_imports", false),
223
+ remove_comments: parseBool(cs, "remove_comments", false),
224
+ };
225
+ }
226
+
227
+ // ── Parse chunking config ───────────────────────────────────────────────
228
+ const chunkingMatch = section.match(/^\s{2}chunking:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
229
+ if (chunkingMatch) {
230
+ const cs = chunkingMatch[1];
231
+ const strategy = parseString(cs, "strategy", "semantic");
232
+ CHUNKING_CONFIG = {
233
+ strategy: strategy,
234
+ markdown: {
235
+ split_by_headings: parseBool(cs, "split_by_headings", true),
236
+ min_chunk_size: parseNumber(cs, "min_chunk_size", 200),
237
+ max_chunk_size: parseNumber(cs, "max_chunk_size", 2000),
238
+ preserve_heading_hierarchy: parseBool(cs, "preserve_heading_hierarchy", true),
239
+ },
240
+ code: {
241
+ split_by_functions: parseBool(cs, "split_by_functions", true),
242
+ include_function_signature: parseBool(cs, "include_function_signature", true),
243
+ min_chunk_size: parseNumber(cs, "min_chunk_size", 300),
244
+ max_chunk_size: parseNumber(cs, "max_chunk_size", 1500),
245
+ },
246
+ fixed: {
247
+ max_chars: parseNumber(cs, "max_chars", 1500),
248
+ },
249
+ };
250
+ }
251
+
252
+ // ── Parse search config ─────────────────────────────────────────────────
253
+ const searchMatch = section.match(/^\s{2}search:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
254
+ if (searchMatch) {
255
+ const ss = searchMatch[1];
256
+ HYBRID_CONFIG = {
257
+ enabled: parseBool(ss, "hybrid", false),
258
+ bm25_weight: parseNumber(ss, "bm25_weight", 0.3),
259
+ };
260
+ }
261
+
262
+ // ── Parse quality config ────────────────────────────────────────────────
263
+ const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
264
+ if (qualityMatch) {
265
+ const qs = qualityMatch[1];
266
+ METRICS_ENABLED = parseBool(qs, "enable_metrics", false);
267
+ CACHE_ENABLED = parseBool(qs, "enable_cache", true);
268
+ }
269
+
145
270
  // Parse global exclude
146
271
  const excludeMatch = section.match(/^\s{2}exclude:\s*\n((?:\s{4}-\s+.+\n?)*)/m);
147
272
  if (excludeMatch) {
@@ -196,12 +321,25 @@ async function loadConfig(projectRoot) {
196
321
  }
197
322
  }
198
323
 
199
- if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE });
324
+ if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE, HYBRID_CONFIG, CHUNKING_CONFIG });
200
325
  } catch {
201
326
  if (DEBUG) console.log("[vectorizer] Using default presets (config load failed)");
202
327
  }
203
328
  }
204
329
 
330
+ // ── Shared query cache (singleton per process) ─────────────────────────────
331
let _queryCache = null;

/** Lazily create (once per process) and return the shared query cache. */
function getQueryCache() {
  _queryCache ??= new QueryCache(DEFAULT_CACHE_CONFIG);
  return _queryCache;
}

/** Tear down the shared cache so the next getQueryCache() builds a fresh one. */
function clearQueryCache() {
  if (_queryCache === null) return;
  _queryCache.destroy();
  _queryCache = null;
}
342
+
205
343
  class CodebaseIndexer {
206
344
  constructor(projectRoot, indexName = "code") {
207
345
  this.root = projectRoot;
@@ -212,6 +350,10 @@ class CodebaseIndexer {
212
350
  this.db = null;
213
351
  this.hashes = {};
214
352
  this.configLoaded = false;
353
+ this.bm25 = null; // lazy-built BM25 index
354
+ this.metrics = null; // lazy-loaded SearchMetrics
355
+ this.graphDB = null; // Graph DB for relationships
356
+ this.graphBuilder = null; // Graph builder orchestrator
215
357
  }
216
358
 
217
359
  async init() {
@@ -222,22 +364,42 @@ class CodebaseIndexer {
222
364
  await fs.mkdir(this.cacheDir, { recursive: true });
223
365
  this.db = await lancedb.connect(path.join(this.cacheDir, "lancedb"));
224
366
  await this.loadHashes();
367
+
368
+ const graphType = this.indexName === "docs" ? "doc_graph" : "code_graph";
369
+ const graphPath = path.join(this.root, ".opencode", "graph", graphType);
370
+ await fs.mkdir(path.dirname(graphPath), { recursive: true });
371
+ this.graphDB = await new GraphDB(graphPath).init();
372
+ this.graphBuilder = new GraphBuilder(this.graphDB, this.root);
373
+
225
374
  return this;
226
375
  }
227
376
 
228
377
  async loadModel() {
229
378
  if (!this.model) {
230
- if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
231
- this.model = await pipeline("feature-extraction", EMBEDDING_MODEL, {
232
- progress_callback: DEBUG ? undefined : null,
233
- });
234
- if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
379
+ try {
380
+ if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
381
+ this.model = await pipeline("feature-extraction", EMBEDDING_MODEL, {
382
+ progress_callback: DEBUG ? undefined : null,
383
+ });
384
+ if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
385
+ } catch (error) {
386
+ this.model = null;
387
+ throw new Error(`Model loading failed: ${error.message || error}`);
388
+ }
235
389
  }
236
390
  return this.model;
237
391
  }
238
392
 
239
393
  async unloadModel() {
240
394
  this.model = null;
395
+ // Release BM25 data held in memory
396
+ if (this.bm25) {
397
+ this.bm25.clear();
398
+ this.bm25 = null;
399
+ }
400
+ this._bm25Rows = null;
401
+ this.metrics = null;
402
+ clearQueryCache();
241
403
  if (global.gc) global.gc();
242
404
  }
243
405
 
@@ -274,12 +436,28 @@ class CodebaseIndexer {
274
436
  return false;
275
437
  }
276
438
 
439
+ // ── Embedding (with optional cache) ───────────────────────────────────────
440
+
277
441
  async embed(text) {
278
442
  const model = await this.loadModel();
279
443
  const result = await model(text, { pooling: "mean", normalize: true });
280
444
  return Array.from(result.data);
281
445
  }
282
446
 
447
+ async embedQuery(text) {
448
+ if (CACHE_ENABLED) {
449
+ const cache = getQueryCache();
450
+ const cached = cache.get(text);
451
+ if (cached) return cached;
452
+ const embedding = await this.embed(text);
453
+ cache.set(text, embedding);
454
+ return embedding;
455
+ }
456
+ return this.embed(text);
457
+ }
458
+
459
+ // ── Legacy chunker (kept for backward compat / "fixed" strategy) ──────────
460
+
283
461
  chunkCode(content, maxChars = 1500) {
284
462
  const chunks = [];
285
463
  const lines = content.split("\n");
@@ -309,6 +487,8 @@ class CodebaseIndexer {
309
487
  return this.hashes[relPath] !== currentHash;
310
488
  }
311
489
 
490
+ // ── Index a single file (v2: cleaning + semantic chunking + metadata) ─────
491
+
312
492
  async indexFile(filePath) {
313
493
  const relPath = path.relative(this.root, filePath);
314
494
 
@@ -324,21 +504,47 @@ class CodebaseIndexer {
324
504
  return false;
325
505
  }
326
506
 
327
- const chunks = this.chunkCode(content);
507
+ // Extract metadata
508
+ const fileMeta = await extractFileMetadata(filePath, content);
328
509
  const archived = this.isArchived(relPath, content);
329
- const data = [];
330
510
 
331
- for (let i = 0; i < chunks.length; i++) {
332
- const embedding = await this.embed(chunks[i]);
511
+ // Clean content before chunking
512
+ const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
513
+
514
+ // Semantic chunking
515
+ const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
516
+
517
+ // v3: Assign chunk IDs for graph tracking
518
+ const chunksWithIds = this.graphBuilder.assignChunkIds(relPath, chunks);
519
+
520
+ // v3: Delete old edges for this file and build new ones
521
+ await this.graphDB.deleteByFile(relPath);
522
+ await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
523
+
524
+ const data = [];
525
+ for (let i = 0; i < chunksWithIds.length; i++) {
526
+ const embedding = await this.embed(chunksWithIds[i].content);
333
527
  data.push({
528
+ chunk_id: chunksWithIds[i].chunk_id,
334
529
  file: relPath,
335
530
  chunk_index: i,
336
- content: chunks[i],
531
+ content: chunksWithIds[i].content,
337
532
  vector: embedding,
338
533
  archived: archived,
534
+ // v2 metadata
535
+ file_type: fileMeta.file_type,
536
+ language: fileMeta.language,
537
+ last_modified: fileMeta.last_modified,
538
+ file_size: fileMeta.file_size,
539
+ heading_context: chunksWithIds[i].heading_context || "",
540
+ function_name: chunksWithIds[i].function_name || "",
541
+ class_name: chunksWithIds[i].class_name || "",
542
+ tags: (fileMeta.tags || []).join(","),
339
543
  });
340
544
  }
341
545
 
546
+ if (data.length === 0) return false;
547
+
342
548
  const tableName = "chunks";
343
549
  const tables = await this.db.tableNames();
344
550
  if (tables.includes(tableName)) {
@@ -351,27 +557,244 @@ class CodebaseIndexer {
351
557
  this.hashes[relPath] = hash;
352
558
  await this.saveHashes();
353
559
 
560
+ // Invalidate BM25 index (needs rebuild) — release memory
561
+ if (this.bm25) {
562
+ this.bm25.clear();
563
+ this.bm25 = null;
564
+ }
565
+ this._bm25Rows = null;
566
+
354
567
  return true;
355
568
  }
356
569
 
357
- async search(query, limit = 5, includeArchived = false) {
570
+ // ── BM25 index management ────────────────────────────────────────────────
571
+
572
+ async ensureBM25() {
573
+ if (this.bm25) return this.bm25;
574
+
575
+ const tableName = "chunks";
576
+ const tables = await this.db.tableNames();
577
+ if (!tables.includes(tableName)) return null;
578
+
579
+ const table = await this.db.openTable(tableName);
580
+ const allRows = await table.search([0]).limit(100000).execute();
581
+
582
+ if (allRows.length === 0) return null;
583
+
584
+ // Sort for stable ID mapping between builds
585
+ allRows.sort((a, b) => {
586
+ const ka = `${a.file}:${a.chunk_index}`;
587
+ const kb = `${b.file}:${b.chunk_index}`;
588
+ return ka.localeCompare(kb);
589
+ });
590
+
591
+ // Release previous data before rebuilding
592
+ if (this.bm25) this.bm25.clear();
593
+ this._bm25Rows = null;
594
+
595
+ this.bm25 = new BM25Index();
596
+ this.bm25.build(allRows.map((r) => r.content));
597
+ this._bm25Rows = allRows;
598
+
599
+ return this.bm25;
600
+ }
601
+
602
+ // ── Search (v2: hybrid + metadata filters + metrics) ──────────────────────
603
+
604
+ async search(query, limit = 5, includeArchived = false, options = {}) {
358
605
  const tableName = "chunks";
359
606
  const tables = await this.db.tableNames();
360
607
  if (!tables.includes(tableName)) {
361
608
  return [];
362
609
  }
363
610
 
364
- const queryEmbedding = await this.embed(query);
611
+ const queryEmbedding = await this.embedQuery(query);
365
612
  const table = await this.db.openTable(tableName);
366
613
 
367
- const fetchLimit = includeArchived ? limit : limit * 3;
614
+ // Only over-fetch when filters or hybrid search are active
615
+ const hasFilters = !includeArchived || options.fileType || options.language ||
616
+ options.modifiedAfter || options.modifiedBefore ||
617
+ (options.tags && options.tags.length > 0);
618
+ const isHybrid = HYBRID_CONFIG.enabled || options.hybrid;
619
+ const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit * 3, 50) : limit;
368
620
  let results = await table.search(queryEmbedding).limit(fetchLimit).execute();
369
621
 
622
+ // ── Hybrid search ───────────────────────────────────────────────────────
623
+ if (HYBRID_CONFIG.enabled || options.hybrid) {
624
+ try {
625
+ const bm25 = await this.ensureBM25();
626
+ if (bm25 && this._bm25Rows) {
627
+ const bm25Results = bm25.search(query, fetchLimit);
628
+
629
+ // Build score maps
630
+ const vectorScores = new Map();
631
+ for (let i = 0; i < results.length; i++) {
632
+ const score = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
633
+ vectorScores.set(i, score);
634
+ }
635
+
636
+ const bm25Scores = new Map();
637
+ for (const r of bm25Results) {
638
+ bm25Scores.set(r.id, r.score);
639
+ }
640
+
641
+ // We need a unified ID space. Since vector results and BM25 results
642
+ // reference different row sets, we use the full table rows for BM25
643
+ // and merge by file+chunk_index key.
644
+ const resultMap = new Map();
645
+
646
+ for (let i = 0; i < results.length; i++) {
647
+ const key = `${results[i].file}:${results[i].chunk_index}`;
648
+ const vs = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
649
+ resultMap.set(key, { row: results[i], vectorScore: vs, bm25Score: 0 });
650
+ }
651
+
652
+ for (const br of bm25Results) {
653
+ if (br.id < this._bm25Rows.length) {
654
+ const bRow = this._bm25Rows[br.id];
655
+ const key = `${bRow.file}:${bRow.chunk_index}`;
656
+ if (resultMap.has(key)) {
657
+ resultMap.get(key).bm25Score = br.score;
658
+ } else {
659
+ resultMap.set(key, { row: bRow, vectorScore: 0, bm25Score: br.score });
660
+ }
661
+ }
662
+ }
663
+
664
+ // Normalize BM25 scores
665
+ let maxBM25 = 0;
666
+ for (const v of resultMap.values()) {
667
+ if (v.bm25Score > maxBM25) maxBM25 = v.bm25Score;
668
+ }
669
+
670
+ const bw = (options.bm25_weight ?? HYBRID_CONFIG.bm25_weight) || 0.3;
671
+ const vw = 1 - bw;
672
+
673
+ const merged = [];
674
+ for (const v of resultMap.values()) {
675
+ const normBM25 = maxBM25 > 0 ? v.bm25Score / maxBM25 : 0;
676
+ const combined = vw * v.vectorScore + bw * normBM25;
677
+ merged.push({ ...v.row, _combinedScore: combined, _distance: v.row._distance });
678
+ }
679
+
680
+ merged.sort((a, b) => b._combinedScore - a._combinedScore);
681
+ results = merged;
682
+ }
683
+ } catch (e) {
684
+ if (DEBUG) console.log("[vectorizer] Hybrid search fallback:", e.message);
685
+ // Fall through to vector-only results
686
+ }
687
+ }
688
+
689
+ // ── Metadata filters ──────────────────────────────────────────────────
370
690
  if (!includeArchived) {
371
691
  results = results.filter((r) => !r.archived);
372
692
  }
373
693
 
374
- return results.slice(0, limit);
694
+ if (options.fileType) {
695
+ results = results.filter((r) => r.file_type === options.fileType);
696
+ }
697
+
698
+ if (options.language) {
699
+ results = results.filter((r) => r.language === options.language);
700
+ }
701
+
702
+ if (options.modifiedAfter) {
703
+ const after = new Date(options.modifiedAfter).getTime();
704
+ results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
705
+ }
706
+
707
+ if (options.modifiedBefore) {
708
+ const before = new Date(options.modifiedBefore).getTime();
709
+ results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
710
+ }
711
+
712
+ if (options.tags && options.tags.length > 0) {
713
+ results = results.filter((r) => {
714
+ const rowTags = (r.tags || "").split(",").filter(Boolean);
715
+ return options.tags.some((t) => rowTags.includes(t));
716
+ });
717
+ }
718
+
719
+ const finalResults = results.slice(0, limit);
720
+
721
+ // ── Metrics tracking ────────────────────────────────────────────────────
722
+ if (METRICS_ENABLED) {
723
+ try {
724
+ if (!this.metrics) {
725
+ this.metrics = new SearchMetrics(this.root);
726
+ await this.metrics.load();
727
+ }
728
+ const scores = finalResults.map((r) =>
729
+ r._combinedScore != null
730
+ ? r._combinedScore
731
+ : r._distance != null
732
+ ? 1 - r._distance
733
+ : 0
734
+ );
735
+ this.metrics.recordQuery(query, this.indexName, scores, HYBRID_CONFIG.enabled || !!options.hybrid);
736
+ await this.metrics.save();
737
+ } catch {
738
+ // non-fatal
739
+ }
740
+ }
741
+
742
+ // ── Graph context expansion (v3) ───────────────────────────────────────
743
+ if (this.graphDB) {
744
+ for (const result of finalResults) {
745
+ if (!result.chunk_id) continue;
746
+
747
+ const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
748
+ const incoming = await this.graphDB.getIncoming(result.chunk_id);
749
+ const allEdges = [...outgoing, ...incoming];
750
+
751
+ const neighbors = [];
752
+ for (const edge of allEdges) {
753
+ const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
754
+ const neighborChunk = await this.findChunkById(neighborId);
755
+ if (!neighborChunk) continue;
756
+
757
+ const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
758
+ const score = edge.weight * similarity;
759
+
760
+ neighbors.push({
761
+ chunk_id: neighborId,
762
+ file: neighborChunk.file,
763
+ content: neighborChunk.content,
764
+ relation: edge.predicate,
765
+ score,
766
+ via: edge.source
767
+ });
768
+ }
769
+
770
+ neighbors.sort((a, b) => b.score - a.score);
771
+ result.relatedContext = neighbors.slice(0, 3);
772
+ }
773
+ }
774
+
775
+ return finalResults;
776
+ }
777
+
778
+ async findChunkById(chunkId) {
779
+ const tableName = "chunks";
780
+ const tables = await this.db.tableNames();
781
+ if (!tables.includes(tableName)) return null;
782
+
783
+ const table = await this.db.openTable(tableName);
784
+ const rows = await table.search([0]).limit(100000).execute();
785
+ return rows.find(r => r.chunk_id === chunkId) || null;
786
+ }
787
+
788
+ cosineSimilarity(vecA, vecB) {
789
+ let dotProduct = 0;
790
+ let normA = 0;
791
+ let normB = 0;
792
+ for (let i = 0; i < vecA.length; i++) {
793
+ dotProduct += vecA[i] * vecB[i];
794
+ normA += vecA[i] * vecA[i];
795
+ normB += vecB[i] * vecB[i];
796
+ }
797
+ return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
375
798
  }
376
799
 
377
800
  async checkHealth(extraIgnore = []) {
@@ -478,7 +901,14 @@ class CodebaseIndexer {
478
901
 
479
902
  async indexSingleFile(filePath) {
480
903
  const absPath = path.isAbsolute(filePath) ? filePath : path.join(this.root, filePath);
481
- return await this.indexFile(absPath);
904
+ // Prevent path traversal outside project root
905
+ const normalized = path.normalize(absPath);
906
+ const relative = path.relative(this.root, normalized);
907
+ if (relative.startsWith("..") || path.isAbsolute(relative)) {
908
+ if (DEBUG) console.log(`[vectorizer] Path traversal blocked: ${filePath}`);
909
+ return false;
910
+ }
911
+ return await this.indexFile(normalized);
482
912
  }
483
913
 
484
914
  async getStats() {
@@ -500,6 +930,12 @@ class CodebaseIndexer {
500
930
  model: EMBEDDING_MODEL,
501
931
  fileCount,
502
932
  chunkCount,
933
+ features: {
934
+ chunking: CHUNKING_CONFIG.strategy,
935
+ hybrid: HYBRID_CONFIG.enabled,
936
+ metrics: METRICS_ENABLED,
937
+ cache: CACHE_ENABLED,
938
+ },
503
939
  };
504
940
  }
505
941
 
@@ -525,12 +961,19 @@ class CodebaseIndexer {
525
961
  async clear() {
526
962
  await fs.rm(this.cacheDir, { recursive: true, force: true });
527
963
  this.hashes = {};
964
+ if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
965
+ this._bm25Rows = null;
966
+ this.metrics = null;
528
967
  await this.init();
529
968
  }
530
969
 
531
970
  async clearAll() {
532
971
  await fs.rm(this.baseDir, { recursive: true, force: true });
533
972
  this.hashes = {};
973
+ if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
974
+ this._bm25Rows = null;
975
+ this.metrics = null;
976
+ clearQueryCache();
534
977
  await this.init();
535
978
  }
536
979
 
@@ -546,6 +989,16 @@ class CodebaseIndexer {
546
989
  } catch {}
547
990
  return indexes;
548
991
  }
992
+
993
+ // ── Metrics access ────────────────────────────────────────────────────────
994
+
995
+ async getMetrics() {
996
+ if (!this.metrics) {
997
+ this.metrics = new SearchMetrics(this.root);
998
+ await this.metrics.load();
999
+ }
1000
+ return this.metrics.getSummary();
1001
+ }
549
1002
  }
550
1003
 
551
1004
  function getEmbeddingModel() {