@comfanion/usethis_search 0.1.4 → 0.2.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,97 @@
1
/**
 * Hybrid Search — merges vector similarity and BM25 keyword scores.
 *
 * Merges results from two search backends using a weighted linear
 * combination (BM25 scores are min-max normalized before mixing).
 */
7
+
8
+ // ── Types ───────────────────────────────────────────────────────────────────
9
+
10
+ export interface HybridSearchConfig {
11
+ enabled: boolean
12
+ bm25_weight: number // 0.0–1.0, vector_weight = 1 - bm25_weight
13
+ }
14
+
15
+ export const DEFAULT_HYBRID_CONFIG: HybridSearchConfig = {
16
+ enabled: false,
17
+ bm25_weight: 0.3,
18
+ }
19
+
20
+ export interface ScoredResult {
21
+ id: number // index into the results array
22
+ vectorScore: number // 0–1 (1 = best)
23
+ bm25Score: number // raw BM25 score (unnormalized)
24
+ combinedScore: number
25
+ }
26
+
27
+ // ── Merge logic ─────────────────────────────────────────────────────────────
28
+
29
+ /**
30
+ * Normalize BM25 scores to 0–1 range using min-max scaling.
31
+ */
32
+ function normalizeBM25Scores(scores: Map<number, number>): Map<number, number> {
33
+ if (scores.size === 0) return scores
34
+
35
+ let min = Infinity
36
+ let max = -Infinity
37
+ for (const s of scores.values()) {
38
+ if (s < min) min = s
39
+ if (s > max) max = s
40
+ }
41
+
42
+ const range = max - min
43
+ if (range === 0) {
44
+ // All same score → normalize to 0.5
45
+ const result = new Map<number, number>()
46
+ for (const [id] of scores) result.set(id, 0.5)
47
+ return result
48
+ }
49
+
50
+ const result = new Map<number, number>()
51
+ for (const [id, score] of scores) {
52
+ result.set(id, (score - min) / range)
53
+ }
54
+ return result
55
+ }
56
+
57
+ /**
58
+ * Merge vector and BM25 results using weighted linear combination.
59
+ *
60
+ * @param vectorResults Map of chunkIndex → vectorScore (0–1, higher = better)
61
+ * @param bm25Results Map of chunkIndex → raw BM25 score
62
+ * @param config Hybrid search config (weights)
63
+ * @param limit Max results to return
64
+ */
65
+ export function mergeResults(
66
+ vectorResults: Map<number, number>,
67
+ bm25Results: Map<number, number>,
68
+ config: HybridSearchConfig = DEFAULT_HYBRID_CONFIG,
69
+ limit: number = 10,
70
+ ): ScoredResult[] {
71
+ const vectorWeight = 1 - config.bm25_weight
72
+ const bm25Weight = config.bm25_weight
73
+
74
+ const normalizedBM25 = normalizeBM25Scores(bm25Results)
75
+
76
+ // Collect all unique IDs
77
+ const allIds = new Set<number>()
78
+ for (const id of vectorResults.keys()) allIds.add(id)
79
+ for (const id of normalizedBM25.keys()) allIds.add(id)
80
+
81
+ const results: ScoredResult[] = []
82
+
83
+ for (const id of allIds) {
84
+ const vs = vectorResults.get(id) ?? 0
85
+ const bs = normalizedBM25.get(id) ?? 0
86
+
87
+ results.push({
88
+ id,
89
+ vectorScore: vs,
90
+ bm25Score: bm25Results.get(id) ?? 0,
91
+ combinedScore: vectorWeight * vs + bm25Weight * bs,
92
+ })
93
+ }
94
+
95
+ results.sort((a, b) => b.combinedScore - a.combinedScore)
96
+ return results.slice(0, limit)
97
+ }
@@ -1,4 +1,5 @@
1
1
  // OpenCode Vectorizer - Semantic Code Search with Multi-Index Support
2
+ // v2: Content cleaning, semantic chunking, hybrid search, metadata, cache, metrics
2
3
 
3
4
  import { pipeline, env } from "@xenova/transformers";
4
5
  import * as lancedb from "vectordb";
@@ -6,6 +7,15 @@ import fs from "fs/promises";
6
7
  import path from "path";
7
8
  import crypto from "crypto";
8
9
 
10
+ // ── New modules ─────────────────────────────────────────────────────────────
11
+ import { cleanContent, DEFAULT_CLEANING_CONFIG } from "./content-cleaner.ts";
12
+ import { extractFileMetadata, detectFileType, detectLanguage } from "./metadata-extractor.ts";
13
+ import { chunkContent, DEFAULT_CHUNKING_CONFIG } from "./chunkers/chunker-factory.ts";
14
+ import { BM25Index } from "./bm25-index.ts";
15
+ import { mergeResults, DEFAULT_HYBRID_CONFIG } from "./hybrid-search.ts";
16
+ import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
17
+ import { SearchMetrics } from "./search-metrics.ts";
18
+
9
19
  // Suppress transformers.js logs unless DEBUG is set
10
20
  const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
11
21
  if (!DEBUG) {
@@ -57,6 +67,13 @@ let GLOBAL_IGNORE = [];
57
67
  // Default embedding model (fast). Can be overridden by config.
58
68
  let EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2";
59
69
 
70
+ // ── Extended config parsed from YAML ────────────────────────────────────────
71
+ let CLEANING_CONFIG = { ...DEFAULT_CLEANING_CONFIG };
72
+ let CHUNKING_CONFIG = { ...DEFAULT_CHUNKING_CONFIG };
73
+ let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
74
+ let METRICS_ENABLED = false;
75
+ let CACHE_ENABLED = true;
76
+
60
77
  function defaultVectorizerYaml() {
61
78
  return (
62
79
  `vectorizer:\n` +
@@ -64,6 +81,40 @@ function defaultVectorizerYaml() {
64
81
  ` auto_index: true\n` +
65
82
  ` model: \"${EMBEDDING_MODEL}\"\n` +
66
83
  ` debounce_ms: 1000\n` +
84
+ `\n` +
85
+ ` # Content cleaning before chunking\n` +
86
+ ` cleaning:\n` +
87
+ ` remove_toc: true\n` +
88
+ ` remove_frontmatter_metadata: false\n` +
89
+ ` remove_imports: false\n` +
90
+ ` remove_comments: false\n` +
91
+ `\n` +
92
+ ` # Chunking strategy\n` +
93
+ ` chunking:\n` +
94
+ ` strategy: \"semantic\" # fixed | semantic\n` +
95
+ ` markdown:\n` +
96
+ ` split_by_headings: true\n` +
97
+ ` min_chunk_size: 200\n` +
98
+ ` max_chunk_size: 2000\n` +
99
+ ` preserve_heading_hierarchy: true\n` +
100
+ ` code:\n` +
101
+ ` split_by_functions: true\n` +
102
+ ` include_function_signature: true\n` +
103
+ ` min_chunk_size: 300\n` +
104
+ ` max_chunk_size: 1500\n` +
105
+ ` fixed:\n` +
106
+ ` max_chars: 1500\n` +
107
+ `\n` +
108
+ ` # Hybrid search (vector + BM25)\n` +
109
+ ` search:\n` +
110
+ ` hybrid: false\n` +
111
+ ` bm25_weight: 0.3\n` +
112
+ `\n` +
113
+ ` # Quality monitoring\n` +
114
+ ` quality:\n` +
115
+ ` enable_metrics: false\n` +
116
+ ` enable_cache: true\n` +
117
+ `\n` +
67
118
  ` indexes:\n` +
68
119
  ` code:\n` +
69
120
  ` enabled: true\n` +
@@ -104,8 +155,25 @@ async function ensureDefaultConfig(projectRoot) {
104
155
  }
105
156
  }
106
157
 
158
+ // ── YAML mini-parser helpers ────────────────────────────────────────────────
159
+
160
+ function parseBool(section, key, fallback) {
161
+ const m = section.match(new RegExp(`^\\s+${key}:\\s*(true|false)`, "m"));
162
+ return m ? m[1] === "true" : fallback;
163
+ }
164
+
165
+ function parseNumber(section, key, fallback) {
166
+ const m = section.match(new RegExp(`^\\s+${key}:\\s*(\\d+(?:\\.\\d+)?)`, "m"));
167
+ return m ? parseFloat(m[1]) : fallback;
168
+ }
169
+
170
+ function parseString(section, key, fallback) {
171
+ const m = section.match(new RegExp(`^\\s+${key}:\\s*["']?([^"'\\n]+)["']?`, "m"));
172
+ return m ? m[1].trim() : fallback;
173
+ }
174
+
107
175
  /**
108
- * Load index configuration from .opencode/vectorizer.yaml (preferred) or .opencode/config.yaml.
176
+ * Load index configuration from .opencode/vectorizer.yaml.
109
177
  */
110
178
  async function loadConfig(projectRoot) {
111
179
  try {
@@ -142,6 +210,61 @@ async function loadConfig(projectRoot) {
142
210
  if (DEBUG) console.log("[vectorizer] Using model from config:", EMBEDDING_MODEL);
143
211
  }
144
212
 
213
+ // ── Parse cleaning config ───────────────────────────────────────────────
214
+ const cleaningMatch = section.match(/^\s{2}cleaning:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
215
+ if (cleaningMatch) {
216
+ const cs = cleaningMatch[1];
217
+ CLEANING_CONFIG = {
218
+ remove_toc: parseBool(cs, "remove_toc", true),
219
+ remove_frontmatter_metadata: parseBool(cs, "remove_frontmatter_metadata", false),
220
+ remove_imports: parseBool(cs, "remove_imports", false),
221
+ remove_comments: parseBool(cs, "remove_comments", false),
222
+ };
223
+ }
224
+
225
+ // ── Parse chunking config ───────────────────────────────────────────────
226
+ const chunkingMatch = section.match(/^\s{2}chunking:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
227
+ if (chunkingMatch) {
228
+ const cs = chunkingMatch[1];
229
+ const strategy = parseString(cs, "strategy", "semantic");
230
+ CHUNKING_CONFIG = {
231
+ strategy: strategy,
232
+ markdown: {
233
+ split_by_headings: parseBool(cs, "split_by_headings", true),
234
+ min_chunk_size: parseNumber(cs, "min_chunk_size", 200),
235
+ max_chunk_size: parseNumber(cs, "max_chunk_size", 2000),
236
+ preserve_heading_hierarchy: parseBool(cs, "preserve_heading_hierarchy", true),
237
+ },
238
+ code: {
239
+ split_by_functions: parseBool(cs, "split_by_functions", true),
240
+ include_function_signature: parseBool(cs, "include_function_signature", true),
241
+ min_chunk_size: parseNumber(cs, "min_chunk_size", 300),
242
+ max_chunk_size: parseNumber(cs, "max_chunk_size", 1500),
243
+ },
244
+ fixed: {
245
+ max_chars: parseNumber(cs, "max_chars", 1500),
246
+ },
247
+ };
248
+ }
249
+
250
+ // ── Parse search config ─────────────────────────────────────────────────
251
+ const searchMatch = section.match(/^\s{2}search:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
252
+ if (searchMatch) {
253
+ const ss = searchMatch[1];
254
+ HYBRID_CONFIG = {
255
+ enabled: parseBool(ss, "hybrid", false),
256
+ bm25_weight: parseNumber(ss, "bm25_weight", 0.3),
257
+ };
258
+ }
259
+
260
+ // ── Parse quality config ────────────────────────────────────────────────
261
+ const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
262
+ if (qualityMatch) {
263
+ const qs = qualityMatch[1];
264
+ METRICS_ENABLED = parseBool(qs, "enable_metrics", false);
265
+ CACHE_ENABLED = parseBool(qs, "enable_cache", true);
266
+ }
267
+
145
268
  // Parse global exclude
146
269
  const excludeMatch = section.match(/^\s{2}exclude:\s*\n((?:\s{4}-\s+.+\n?)*)/m);
147
270
  if (excludeMatch) {
@@ -196,12 +319,25 @@ async function loadConfig(projectRoot) {
196
319
  }
197
320
  }
198
321
 
199
- if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE });
322
+ if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE, HYBRID_CONFIG, CHUNKING_CONFIG });
200
323
  } catch {
201
324
  if (DEBUG) console.log("[vectorizer] Using default presets (config load failed)");
202
325
  }
203
326
  }
204
327
 
328
+ // ── Shared query cache (singleton per process) ─────────────────────────────
329
+ let _queryCache = null;
330
+ function getQueryCache() {
331
+ if (!_queryCache) _queryCache = new QueryCache(DEFAULT_CACHE_CONFIG);
332
+ return _queryCache;
333
+ }
334
+ function clearQueryCache() {
335
+ if (_queryCache) {
336
+ _queryCache.destroy();
337
+ _queryCache = null;
338
+ }
339
+ }
340
+
205
341
  class CodebaseIndexer {
206
342
  constructor(projectRoot, indexName = "code") {
207
343
  this.root = projectRoot;
@@ -212,6 +348,8 @@ class CodebaseIndexer {
212
348
  this.db = null;
213
349
  this.hashes = {};
214
350
  this.configLoaded = false;
351
+ this.bm25 = null; // lazy-built BM25 index
352
+ this.metrics = null; // lazy-loaded SearchMetrics
215
353
  }
216
354
 
217
355
  async init() {
@@ -227,17 +365,30 @@ class CodebaseIndexer {
227
365
 
228
366
  async loadModel() {
229
367
  if (!this.model) {
230
- if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
231
- this.model = await pipeline("feature-extraction", EMBEDDING_MODEL, {
232
- progress_callback: DEBUG ? undefined : null,
233
- });
234
- if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
368
+ try {
369
+ if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
370
+ this.model = await pipeline("feature-extraction", EMBEDDING_MODEL, {
371
+ progress_callback: DEBUG ? undefined : null,
372
+ });
373
+ if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
374
+ } catch (error) {
375
+ this.model = null;
376
+ throw new Error(`Model loading failed: ${error.message || error}`);
377
+ }
235
378
  }
236
379
  return this.model;
237
380
  }
238
381
 
239
382
  async unloadModel() {
240
383
  this.model = null;
384
+ // Release BM25 data held in memory
385
+ if (this.bm25) {
386
+ this.bm25.clear();
387
+ this.bm25 = null;
388
+ }
389
+ this._bm25Rows = null;
390
+ this.metrics = null;
391
+ clearQueryCache();
241
392
  if (global.gc) global.gc();
242
393
  }
243
394
 
@@ -274,12 +425,28 @@ class CodebaseIndexer {
274
425
  return false;
275
426
  }
276
427
 
428
+ // ── Embedding (with optional cache) ───────────────────────────────────────
429
+
277
430
  async embed(text) {
278
431
  const model = await this.loadModel();
279
432
  const result = await model(text, { pooling: "mean", normalize: true });
280
433
  return Array.from(result.data);
281
434
  }
282
435
 
436
+ async embedQuery(text) {
437
+ if (CACHE_ENABLED) {
438
+ const cache = getQueryCache();
439
+ const cached = cache.get(text);
440
+ if (cached) return cached;
441
+ const embedding = await this.embed(text);
442
+ cache.set(text, embedding);
443
+ return embedding;
444
+ }
445
+ return this.embed(text);
446
+ }
447
+
448
+ // ── Legacy chunker (kept for backward compat / "fixed" strategy) ──────────
449
+
283
450
  chunkCode(content, maxChars = 1500) {
284
451
  const chunks = [];
285
452
  const lines = content.split("\n");
@@ -309,6 +476,8 @@ class CodebaseIndexer {
309
476
  return this.hashes[relPath] !== currentHash;
310
477
  }
311
478
 
479
+ // ── Index a single file (v2: cleaning + semantic chunking + metadata) ─────
480
+
312
481
  async indexFile(filePath) {
313
482
  const relPath = path.relative(this.root, filePath);
314
483
 
@@ -324,21 +493,39 @@ class CodebaseIndexer {
324
493
  return false;
325
494
  }
326
495
 
327
- const chunks = this.chunkCode(content);
496
+ // Extract metadata
497
+ const fileMeta = await extractFileMetadata(filePath, content);
328
498
  const archived = this.isArchived(relPath, content);
329
- const data = [];
330
499
 
500
+ // Clean content before chunking
501
+ const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
502
+
503
+ // Semantic chunking
504
+ const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
505
+
506
+ const data = [];
331
507
  for (let i = 0; i < chunks.length; i++) {
332
- const embedding = await this.embed(chunks[i]);
508
+ const embedding = await this.embed(chunks[i].content);
333
509
  data.push({
334
510
  file: relPath,
335
511
  chunk_index: i,
336
- content: chunks[i],
512
+ content: chunks[i].content,
337
513
  vector: embedding,
338
514
  archived: archived,
515
+ // v2 metadata
516
+ file_type: fileMeta.file_type,
517
+ language: fileMeta.language,
518
+ last_modified: fileMeta.last_modified,
519
+ file_size: fileMeta.file_size,
520
+ heading_context: chunks[i].heading_context || "",
521
+ function_name: chunks[i].function_name || "",
522
+ class_name: chunks[i].class_name || "",
523
+ tags: (fileMeta.tags || []).join(","),
339
524
  });
340
525
  }
341
526
 
527
+ if (data.length === 0) return false;
528
+
342
529
  const tableName = "chunks";
343
530
  const tables = await this.db.tableNames();
344
531
  if (tables.includes(tableName)) {
@@ -351,27 +538,189 @@ class CodebaseIndexer {
351
538
  this.hashes[relPath] = hash;
352
539
  await this.saveHashes();
353
540
 
541
+ // Invalidate BM25 index (needs rebuild) — release memory
542
+ if (this.bm25) {
543
+ this.bm25.clear();
544
+ this.bm25 = null;
545
+ }
546
+ this._bm25Rows = null;
547
+
354
548
  return true;
355
549
  }
356
550
 
357
- async search(query, limit = 5, includeArchived = false) {
551
+ // ── BM25 index management ────────────────────────────────────────────────
552
+
553
+ async ensureBM25() {
554
+ if (this.bm25) return this.bm25;
555
+
556
+ const tableName = "chunks";
557
+ const tables = await this.db.tableNames();
558
+ if (!tables.includes(tableName)) return null;
559
+
560
+ const table = await this.db.openTable(tableName);
561
+ const allRows = await table.search([0]).limit(100000).execute();
562
+
563
+ if (allRows.length === 0) return null;
564
+
565
+ // Sort for stable ID mapping between builds
566
+ allRows.sort((a, b) => {
567
+ const ka = `${a.file}:${a.chunk_index}`;
568
+ const kb = `${b.file}:${b.chunk_index}`;
569
+ return ka.localeCompare(kb);
570
+ });
571
+
572
+ // Release previous data before rebuilding
573
+ if (this.bm25) this.bm25.clear();
574
+ this._bm25Rows = null;
575
+
576
+ this.bm25 = new BM25Index();
577
+ this.bm25.build(allRows.map((r) => r.content));
578
+ this._bm25Rows = allRows;
579
+
580
+ return this.bm25;
581
+ }
582
+
583
+ // ── Search (v2: hybrid + metadata filters + metrics) ──────────────────────
584
+
585
+ async search(query, limit = 5, includeArchived = false, options = {}) {
358
586
  const tableName = "chunks";
359
587
  const tables = await this.db.tableNames();
360
588
  if (!tables.includes(tableName)) {
361
589
  return [];
362
590
  }
363
591
 
364
- const queryEmbedding = await this.embed(query);
592
+ const queryEmbedding = await this.embedQuery(query);
365
593
  const table = await this.db.openTable(tableName);
366
594
 
367
- const fetchLimit = includeArchived ? limit : limit * 3;
595
+ // Only over-fetch when filters or hybrid search are active
596
+ const hasFilters = !includeArchived || options.fileType || options.language ||
597
+ options.modifiedAfter || options.modifiedBefore ||
598
+ (options.tags && options.tags.length > 0);
599
+ const isHybrid = HYBRID_CONFIG.enabled || options.hybrid;
600
+ const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit * 3, 50) : limit;
368
601
  let results = await table.search(queryEmbedding).limit(fetchLimit).execute();
369
602
 
603
+ // ── Hybrid search ───────────────────────────────────────────────────────
604
+ if (HYBRID_CONFIG.enabled || options.hybrid) {
605
+ try {
606
+ const bm25 = await this.ensureBM25();
607
+ if (bm25 && this._bm25Rows) {
608
+ const bm25Results = bm25.search(query, fetchLimit);
609
+
610
+ // Build score maps
611
+ const vectorScores = new Map();
612
+ for (let i = 0; i < results.length; i++) {
613
+ const score = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
614
+ vectorScores.set(i, score);
615
+ }
616
+
617
+ const bm25Scores = new Map();
618
+ for (const r of bm25Results) {
619
+ bm25Scores.set(r.id, r.score);
620
+ }
621
+
622
+ // We need a unified ID space. Since vector results and BM25 results
623
+ // reference different row sets, we use the full table rows for BM25
624
+ // and merge by file+chunk_index key.
625
+ const resultMap = new Map();
626
+
627
+ for (let i = 0; i < results.length; i++) {
628
+ const key = `${results[i].file}:${results[i].chunk_index}`;
629
+ const vs = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
630
+ resultMap.set(key, { row: results[i], vectorScore: vs, bm25Score: 0 });
631
+ }
632
+
633
+ for (const br of bm25Results) {
634
+ if (br.id < this._bm25Rows.length) {
635
+ const bRow = this._bm25Rows[br.id];
636
+ const key = `${bRow.file}:${bRow.chunk_index}`;
637
+ if (resultMap.has(key)) {
638
+ resultMap.get(key).bm25Score = br.score;
639
+ } else {
640
+ resultMap.set(key, { row: bRow, vectorScore: 0, bm25Score: br.score });
641
+ }
642
+ }
643
+ }
644
+
645
+ // Normalize BM25 scores
646
+ let maxBM25 = 0;
647
+ for (const v of resultMap.values()) {
648
+ if (v.bm25Score > maxBM25) maxBM25 = v.bm25Score;
649
+ }
650
+
651
+ const bw = (options.bm25_weight ?? HYBRID_CONFIG.bm25_weight) || 0.3;
652
+ const vw = 1 - bw;
653
+
654
+ const merged = [];
655
+ for (const v of resultMap.values()) {
656
+ const normBM25 = maxBM25 > 0 ? v.bm25Score / maxBM25 : 0;
657
+ const combined = vw * v.vectorScore + bw * normBM25;
658
+ merged.push({ ...v.row, _combinedScore: combined, _distance: v.row._distance });
659
+ }
660
+
661
+ merged.sort((a, b) => b._combinedScore - a._combinedScore);
662
+ results = merged;
663
+ }
664
+ } catch (e) {
665
+ if (DEBUG) console.log("[vectorizer] Hybrid search fallback:", e.message);
666
+ // Fall through to vector-only results
667
+ }
668
+ }
669
+
670
+ // ── Metadata filters ──────────────────────────────────────────────────
370
671
  if (!includeArchived) {
371
672
  results = results.filter((r) => !r.archived);
372
673
  }
373
674
 
374
- return results.slice(0, limit);
675
+ if (options.fileType) {
676
+ results = results.filter((r) => r.file_type === options.fileType);
677
+ }
678
+
679
+ if (options.language) {
680
+ results = results.filter((r) => r.language === options.language);
681
+ }
682
+
683
+ if (options.modifiedAfter) {
684
+ const after = new Date(options.modifiedAfter).getTime();
685
+ results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
686
+ }
687
+
688
+ if (options.modifiedBefore) {
689
+ const before = new Date(options.modifiedBefore).getTime();
690
+ results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
691
+ }
692
+
693
+ if (options.tags && options.tags.length > 0) {
694
+ results = results.filter((r) => {
695
+ const rowTags = (r.tags || "").split(",").filter(Boolean);
696
+ return options.tags.some((t) => rowTags.includes(t));
697
+ });
698
+ }
699
+
700
+ const finalResults = results.slice(0, limit);
701
+
702
+ // ── Metrics tracking ────────────────────────────────────────────────────
703
+ if (METRICS_ENABLED) {
704
+ try {
705
+ if (!this.metrics) {
706
+ this.metrics = new SearchMetrics(this.root);
707
+ await this.metrics.load();
708
+ }
709
+ const scores = finalResults.map((r) =>
710
+ r._combinedScore != null
711
+ ? r._combinedScore
712
+ : r._distance != null
713
+ ? 1 - r._distance
714
+ : 0
715
+ );
716
+ this.metrics.recordQuery(query, this.indexName, scores, HYBRID_CONFIG.enabled || !!options.hybrid);
717
+ await this.metrics.save();
718
+ } catch {
719
+ // non-fatal
720
+ }
721
+ }
722
+
723
+ return finalResults;
375
724
  }
376
725
 
377
726
  async checkHealth(extraIgnore = []) {
@@ -478,7 +827,14 @@ class CodebaseIndexer {
478
827
 
479
828
  async indexSingleFile(filePath) {
480
829
  const absPath = path.isAbsolute(filePath) ? filePath : path.join(this.root, filePath);
481
- return await this.indexFile(absPath);
830
+ // Prevent path traversal outside project root
831
+ const normalized = path.normalize(absPath);
832
+ const relative = path.relative(this.root, normalized);
833
+ if (relative.startsWith("..") || path.isAbsolute(relative)) {
834
+ if (DEBUG) console.log(`[vectorizer] Path traversal blocked: ${filePath}`);
835
+ return false;
836
+ }
837
+ return await this.indexFile(normalized);
482
838
  }
483
839
 
484
840
  async getStats() {
@@ -500,6 +856,12 @@ class CodebaseIndexer {
500
856
  model: EMBEDDING_MODEL,
501
857
  fileCount,
502
858
  chunkCount,
859
+ features: {
860
+ chunking: CHUNKING_CONFIG.strategy,
861
+ hybrid: HYBRID_CONFIG.enabled,
862
+ metrics: METRICS_ENABLED,
863
+ cache: CACHE_ENABLED,
864
+ },
503
865
  };
504
866
  }
505
867
 
@@ -525,12 +887,19 @@ class CodebaseIndexer {
525
887
  async clear() {
526
888
  await fs.rm(this.cacheDir, { recursive: true, force: true });
527
889
  this.hashes = {};
890
+ if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
891
+ this._bm25Rows = null;
892
+ this.metrics = null;
528
893
  await this.init();
529
894
  }
530
895
 
531
896
  async clearAll() {
532
897
  await fs.rm(this.baseDir, { recursive: true, force: true });
533
898
  this.hashes = {};
899
+ if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
900
+ this._bm25Rows = null;
901
+ this.metrics = null;
902
+ clearQueryCache();
534
903
  await this.init();
535
904
  }
536
905
 
@@ -546,6 +915,16 @@ class CodebaseIndexer {
546
915
  } catch {}
547
916
  return indexes;
548
917
  }
918
+
919
+ // ── Metrics access ────────────────────────────────────────────────────────
920
+
921
+ async getMetrics() {
922
+ if (!this.metrics) {
923
+ this.metrics = new SearchMetrics(this.root);
924
+ await this.metrics.load();
925
+ }
926
+ return this.metrics.getSummary();
927
+ }
549
928
  }
550
929
 
551
930
  function getEmbeddingModel() {