@comfanion/usethis_search 0.1.5 → 0.2.0-dev.0

@@ -1,4 +1,5 @@
 // OpenCode Vectorizer - Semantic Code Search with Multi-Index Support
+// v2: Content cleaning, semantic chunking, hybrid search, metadata, cache, metrics
 
 import { pipeline, env } from "@xenova/transformers";
 import * as lancedb from "vectordb";
@@ -6,6 +7,15 @@ import fs from "fs/promises";
 import path from "path";
 import crypto from "crypto";
 
+// ── New modules ─────────────────────────────────────────────────────────────
+import { cleanContent, DEFAULT_CLEANING_CONFIG } from "./content-cleaner.ts";
+import { extractFileMetadata, detectFileType, detectLanguage } from "./metadata-extractor.ts";
+import { chunkContent, DEFAULT_CHUNKING_CONFIG } from "./chunkers/chunker-factory.ts";
+import { BM25Index } from "./bm25-index.ts";
+import { mergeResults, DEFAULT_HYBRID_CONFIG } from "./hybrid-search.ts";
+import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
+import { SearchMetrics } from "./search-metrics.ts";
+
 // Suppress transformers.js logs unless DEBUG is set
 const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
 if (!DEBUG) {
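Of the seven modules imported above, only metadata-extractor.ts appears later in this diff; the others ship as separate files in the package. Their rough shapes can be inferred from call sites in this file. A TypeScript sketch of the implied surfaces (an inference, not the published source; note that mergeResults is imported but never called here, since search() merges inline):

// Inferred from call sites below; the names are real, the signatures are assumptions.
declare class BM25Index {
  build(docs: string[]): void
  search(query: string, limit: number): Array<{ id: number; score: number }>
  clear(): void
}
declare class QueryCache {
  constructor(config: object)
  get(key: string): number[] | undefined
  set(key: string, embedding: number[]): void
  destroy(): void
}
declare class SearchMetrics {
  constructor(projectRoot: string)
  load(): Promise<void>
  recordQuery(query: string, index: string, scores: number[], hybrid: boolean): void
  save(): Promise<void>
  getSummary(): unknown
}
declare function cleanContent(content: string, fileType: string, config: object): string
declare function chunkContent(
  content: string, fileType: string, language: string, config: object,
): Array<{ content: string; heading_context?: string; function_name?: string; class_name?: string }>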
@@ -57,6 +67,13 @@ let GLOBAL_IGNORE = [];
 // Default embedding model (fast). Can be overridden by config.
 let EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2";
 
+// ── Extended config parsed from YAML ────────────────────────────────────────
+let CLEANING_CONFIG = { ...DEFAULT_CLEANING_CONFIG };
+let CHUNKING_CONFIG = { ...DEFAULT_CHUNKING_CONFIG };
+let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
+let METRICS_ENABLED = false;
+let CACHE_ENABLED = true;
+
 function defaultVectorizerYaml() {
   return (
     `vectorizer:\n` +
@@ -64,6 +81,40 @@ function defaultVectorizerYaml() {
     `  auto_index: true\n` +
     `  model: \"${EMBEDDING_MODEL}\"\n` +
     `  debounce_ms: 1000\n` +
+    `\n` +
+    `  # Content cleaning before chunking\n` +
+    `  cleaning:\n` +
+    `    remove_toc: true\n` +
+    `    remove_frontmatter_metadata: false\n` +
+    `    remove_imports: false\n` +
+    `    remove_comments: false\n` +
+    `\n` +
+    `  # Chunking strategy\n` +
+    `  chunking:\n` +
+    `    strategy: \"semantic\" # fixed | semantic\n` +
+    `    markdown:\n` +
+    `      split_by_headings: true\n` +
+    `      min_chunk_size: 200\n` +
+    `      max_chunk_size: 2000\n` +
+    `      preserve_heading_hierarchy: true\n` +
+    `    code:\n` +
+    `      split_by_functions: true\n` +
+    `      include_function_signature: true\n` +
+    `      min_chunk_size: 300\n` +
+    `      max_chunk_size: 1500\n` +
+    `    fixed:\n` +
+    `      max_chars: 1500\n` +
+    `\n` +
+    `  # Hybrid search (vector + BM25)\n` +
+    `  search:\n` +
+    `    hybrid: false\n` +
+    `    bm25_weight: 0.3\n` +
+    `\n` +
+    `  # Quality monitoring\n` +
+    `  quality:\n` +
+    `    enable_metrics: false\n` +
+    `    enable_cache: true\n` +
+    `\n` +
     `  indexes:\n` +
     `    code:\n` +
     `      enabled: true\n` +
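Rendered, the new template section above produces this YAML in .opencode/vectorizer.yaml (indentation reconstructed at two spaces per level, which is what loadConfig's `^\s{2}` section regexes expect):

vectorizer:
  cleaning:
    remove_toc: true
    remove_frontmatter_metadata: false
    remove_imports: false
    remove_comments: false
  chunking:
    strategy: "semantic" # fixed | semantic
    markdown:
      split_by_headings: true
      min_chunk_size: 200
      max_chunk_size: 2000
      preserve_heading_hierarchy: true
    code:
      split_by_functions: true
      include_function_signature: true
      min_chunk_size: 300
      max_chunk_size: 1500
    fixed:
      max_chars: 1500
  search:
    hybrid: false
    bm25_weight: 0.3
  quality:
    enable_metrics: false
    enable_cache: true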
@@ -104,8 +155,25 @@ async function ensureDefaultConfig(projectRoot) {
   }
 }
 
+// ── YAML mini-parser helpers ────────────────────────────────────────────────
+
+function parseBool(section, key, fallback) {
+  const m = section.match(new RegExp(`^\\s+${key}:\\s*(true|false)`, "m"));
+  return m ? m[1] === "true" : fallback;
+}
+
+function parseNumber(section, key, fallback) {
+  const m = section.match(new RegExp(`^\\s+${key}:\\s*(\\d+(?:\\.\\d+)?)`, "m"));
+  return m ? parseFloat(m[1]) : fallback;
+}
+
+function parseString(section, key, fallback) {
+  const m = section.match(new RegExp(`^\\s+${key}:\\s*["']?([^"'\\n]+)["']?`, "m"));
+  return m ? m[1].trim() : fallback;
+}
+
 /**
- * Load index configuration from .opencode/vectorizer.yaml (preferred) or .opencode/config.yaml.
+ * Load index configuration from .opencode/vectorizer.yaml.
  */
 async function loadConfig(projectRoot) {
   try {
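A quick illustration of how these helpers behave (each regex takes the first match anywhere in the section string it is handed, anchored to line starts by the "m" flag):

// Illustrative only; `section` mimics a captured YAML block.
const section = "    hybrid: true\n    bm25_weight: 0.5\n";
parseBool(section, "hybrid", false);          // → true
parseNumber(section, "bm25_weight", 0.3);     // → 0.5
parseString(section, "strategy", "semantic"); // → "semantic" (key absent, fallback)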
@@ -142,6 +210,61 @@ async function loadConfig(projectRoot) {
       if (DEBUG) console.log("[vectorizer] Using model from config:", EMBEDDING_MODEL);
     }
 
+    // ── Parse cleaning config ───────────────────────────────────────────────
+    const cleaningMatch = section.match(/^\s{2}cleaning:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (cleaningMatch) {
+      const cs = cleaningMatch[1];
+      CLEANING_CONFIG = {
+        remove_toc: parseBool(cs, "remove_toc", true),
+        remove_frontmatter_metadata: parseBool(cs, "remove_frontmatter_metadata", false),
+        remove_imports: parseBool(cs, "remove_imports", false),
+        remove_comments: parseBool(cs, "remove_comments", false),
+      };
+    }
+
+    // ── Parse chunking config ───────────────────────────────────────────────
+    const chunkingMatch = section.match(/^\s{2}chunking:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (chunkingMatch) {
+      const cs = chunkingMatch[1];
+      const strategy = parseString(cs, "strategy", "semantic");
+      CHUNKING_CONFIG = {
+        strategy: strategy,
+        markdown: {
+          split_by_headings: parseBool(cs, "split_by_headings", true),
+          min_chunk_size: parseNumber(cs, "min_chunk_size", 200),
+          max_chunk_size: parseNumber(cs, "max_chunk_size", 2000),
+          preserve_heading_hierarchy: parseBool(cs, "preserve_heading_hierarchy", true),
+        },
+        code: {
+          split_by_functions: parseBool(cs, "split_by_functions", true),
+          include_function_signature: parseBool(cs, "include_function_signature", true),
+          min_chunk_size: parseNumber(cs, "min_chunk_size", 300),
+          max_chunk_size: parseNumber(cs, "max_chunk_size", 1500),
+        },
+        fixed: {
+          max_chars: parseNumber(cs, "max_chars", 1500),
+        },
+      };
+    }
+
+    // ── Parse search config ─────────────────────────────────────────────────
+    const searchMatch = section.match(/^\s{2}search:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (searchMatch) {
+      const ss = searchMatch[1];
+      HYBRID_CONFIG = {
+        enabled: parseBool(ss, "hybrid", false),
+        bm25_weight: parseNumber(ss, "bm25_weight", 0.3),
+      };
+    }
+
+    // ── Parse quality config ────────────────────────────────────────────────
+    const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (qualityMatch) {
+      const qs = qualityMatch[1];
+      METRICS_ENABLED = parseBool(qs, "enable_metrics", false);
+      CACHE_ENABLED = parseBool(qs, "enable_cache", true);
+    }
+
     // Parse global exclude
     const excludeMatch = section.match(/^\s{2}exclude:\s*\n((?:\s{4}-\s+.+\n?)*)/m);
     if (excludeMatch) {
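Two caveats in the section parsing above are worth flagging. First, `\Z` is Python/PCRE syntax; in a JavaScript regex it matches a literal "Z", so these lookaheads only terminate at the next two-space key. That is harmless for the generated default file, where another section always follows, but a cleaning/chunking/search/quality block placed last in a hand-edited YAML would fail to match and silently keep defaults. Second, parseNumber scans the whole chunking block, so keys repeated across markdown, code, and fixed (min_chunk_size, max_chunk_size) all resolve to their first occurrence, i.e. the markdown values when present. A JS-native end-of-input lookahead, should the boundary ever need to cover a trailing section:

// (?![\s\S]) succeeds only at end of input, the JS equivalent of PCRE's \Z.
const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);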
@@ -196,12 +319,25 @@ async function loadConfig(projectRoot) {
       }
     }
 
-    if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE });
+    if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE, HYBRID_CONFIG, CHUNKING_CONFIG });
   } catch {
     if (DEBUG) console.log("[vectorizer] Using default presets (config load failed)");
   }
 }
 
+// ── Shared query cache (singleton per process) ─────────────────────────────
+let _queryCache = null;
+function getQueryCache() {
+  if (!_queryCache) _queryCache = new QueryCache(DEFAULT_CACHE_CONFIG);
+  return _queryCache;
+}
+function clearQueryCache() {
+  if (_queryCache) {
+    _queryCache.destroy();
+    _queryCache = null;
+  }
+}
+
 class CodebaseIndexer {
   constructor(projectRoot, indexName = "code") {
     this.root = projectRoot;
@@ -227,17 +365,30 @@ class CodebaseIndexer {
 
   async loadModel() {
     if (!this.model) {
-      if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
-      this.model = await pipeline("feature-extraction", EMBEDDING_MODEL, {
-        progress_callback: DEBUG ? undefined : null,
-      });
-      if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
+      try {
+        if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
+        this.model = await pipeline("feature-extraction", EMBEDDING_MODEL, {
+          progress_callback: DEBUG ? undefined : null,
+        });
+        if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
+      } catch (error) {
+        this.model = null;
+        throw new Error(`Model loading failed: ${error.message || error}`);
+      }
     }
     return this.model;
   }
 
   async unloadModel() {
     this.model = null;
+    // Release BM25 data held in memory
+    if (this.bm25) {
+      this.bm25.clear();
+      this.bm25 = null;
+    }
+    this._bm25Rows = null;
+    this.metrics = null;
+    clearQueryCache();
     if (global.gc) global.gc();
   }
 
@@ -274,12 +425,28 @@ class CodebaseIndexer {
     return false;
   }
 
+  // ── Embedding (with optional cache) ───────────────────────────────────────
+
   async embed(text) {
     const model = await this.loadModel();
     const result = await model(text, { pooling: "mean", normalize: true });
     return Array.from(result.data);
   }
 
+  async embedQuery(text) {
+    if (CACHE_ENABLED) {
+      const cache = getQueryCache();
+      const cached = cache.get(text);
+      if (cached) return cached;
+      const embedding = await this.embed(text);
+      cache.set(text, embedding);
+      return embedding;
+    }
+    return this.embed(text);
+  }
+
+  // ── Legacy chunker (kept for backward compat / "fixed" strategy) ──────────
+
   chunkCode(content, maxChars = 1500) {
     const chunks = [];
     const lines = content.split("\n");
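embedQuery() is used only on the query path (indexFile() still calls embed() directly), so the process-wide cache holds query embeddings keyed by the raw query text, which the get(text)/set(text, embedding) call pattern implies. A usage sketch:

// Sketch: a repeated query is served from the shared QueryCache.
const first = await indexer.embedQuery("database connection pooling");  // embeds, then caches
const second = await indexer.embedQuery("database connection pooling"); // cache hit, no model call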
@@ -309,6 +476,8 @@ class CodebaseIndexer {
     return this.hashes[relPath] !== currentHash;
   }
 
+  // ── Index a single file (v2: cleaning + semantic chunking + metadata) ─────
+
   async indexFile(filePath) {
     const relPath = path.relative(this.root, filePath);
 
@@ -324,21 +493,39 @@ class CodebaseIndexer {
       return false;
     }
 
-    const chunks = this.chunkCode(content);
+    // Extract metadata
+    const fileMeta = await extractFileMetadata(filePath, content);
     const archived = this.isArchived(relPath, content);
-    const data = [];
 
+    // Clean content before chunking
+    const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
+
+    // Semantic chunking
+    const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+
+    const data = [];
     for (let i = 0; i < chunks.length; i++) {
-      const embedding = await this.embed(chunks[i]);
+      const embedding = await this.embed(chunks[i].content);
       data.push({
         file: relPath,
         chunk_index: i,
-        content: chunks[i],
+        content: chunks[i].content,
         vector: embedding,
         archived: archived,
+        // v2 metadata
+        file_type: fileMeta.file_type,
+        language: fileMeta.language,
+        last_modified: fileMeta.last_modified,
+        file_size: fileMeta.file_size,
+        heading_context: chunks[i].heading_context || "",
+        function_name: chunks[i].function_name || "",
+        class_name: chunks[i].class_name || "",
+        tags: (fileMeta.tags || []).join(","),
       });
     }
 
+    if (data.length === 0) return false;
+
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (tables.includes(tableName)) {
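Each row written to the chunks table now carries the v2 metadata columns alongside the vector. An illustrative record (field names from the data.push() above; values invented):

{
  file: "src/auth/login.ts",
  chunk_index: 0,
  content: "export async function login(req, res) { ... }",
  vector: [/* 384 floats for Xenova/all-MiniLM-L6-v2 */],
  archived: false,
  file_type: "code",
  language: "typescript",
  last_modified: "2025-01-15T10:30:00.000Z",
  file_size: 2048,
  heading_context: "",    // filled for markdown chunks
  function_name: "login", // filled when the chunker detects one
  class_name: "",
  tags: "",               // comma-joined front-matter tags (docs files only)
}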
@@ -351,27 +538,189 @@ class CodebaseIndexer {
     this.hashes[relPath] = hash;
     await this.saveHashes();
 
+    // Invalidate BM25 index (needs rebuild) — release memory
+    if (this.bm25) {
+      this.bm25.clear();
+      this.bm25 = null;
+    }
+    this._bm25Rows = null;
+
     return true;
   }
 
-  async search(query, limit = 5, includeArchived = false) {
+  // ── BM25 index management ────────────────────────────────────────────────
+
+  async ensureBM25() {
+    if (this.bm25) return this.bm25;
+
+    const tableName = "chunks";
+    const tables = await this.db.tableNames();
+    if (!tables.includes(tableName)) return null;
+
+    const table = await this.db.openTable(tableName);
+    const allRows = await table.search([0]).limit(100000).execute();
+
+    if (allRows.length === 0) return null;
+
+    // Sort for stable ID mapping between builds
+    allRows.sort((a, b) => {
+      const ka = `${a.file}:${a.chunk_index}`;
+      const kb = `${b.file}:${b.chunk_index}`;
+      return ka.localeCompare(kb);
+    });
+
+    // Release previous data before rebuilding
+    if (this.bm25) this.bm25.clear();
+    this._bm25Rows = null;
+
+    this.bm25 = new BM25Index();
+    this.bm25.build(allRows.map((r) => r.content));
+    this._bm25Rows = allRows;
+
+    return this.bm25;
+  }
+
+  // ── Search (v2: hybrid + metadata filters + metrics) ──────────────────────
+
+  async search(query, limit = 5, includeArchived = false, options = {}) {
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (!tables.includes(tableName)) {
       return [];
     }
 
-    const queryEmbedding = await this.embed(query);
+    const queryEmbedding = await this.embedQuery(query);
     const table = await this.db.openTable(tableName);
 
-    const fetchLimit = includeArchived ? limit : limit * 3;
+    // Only over-fetch when filters or hybrid search are active
+    const hasFilters = !includeArchived || options.fileType || options.language ||
+                       options.modifiedAfter || options.modifiedBefore ||
+                       (options.tags && options.tags.length > 0);
+    const isHybrid = HYBRID_CONFIG.enabled || options.hybrid;
+    const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit * 3, 50) : limit;
     let results = await table.search(queryEmbedding).limit(fetchLimit).execute();
 
+    // ── Hybrid search ───────────────────────────────────────────────────────
+    if (HYBRID_CONFIG.enabled || options.hybrid) {
+      try {
+        const bm25 = await this.ensureBM25();
+        if (bm25 && this._bm25Rows) {
+          const bm25Results = bm25.search(query, fetchLimit);
+
+          // Build score maps
+          const vectorScores = new Map();
+          for (let i = 0; i < results.length; i++) {
+            const score = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
+            vectorScores.set(i, score);
+          }
+
+          const bm25Scores = new Map();
+          for (const r of bm25Results) {
+            bm25Scores.set(r.id, r.score);
+          }
+
+          // We need a unified ID space. Since vector results and BM25 results
+          // reference different row sets, we use the full table rows for BM25
+          // and merge by file+chunk_index key.
+          const resultMap = new Map();
+
+          for (let i = 0; i < results.length; i++) {
+            const key = `${results[i].file}:${results[i].chunk_index}`;
+            const vs = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
+            resultMap.set(key, { row: results[i], vectorScore: vs, bm25Score: 0 });
+          }
+
+          for (const br of bm25Results) {
+            if (br.id < this._bm25Rows.length) {
+              const bRow = this._bm25Rows[br.id];
+              const key = `${bRow.file}:${bRow.chunk_index}`;
+              if (resultMap.has(key)) {
+                resultMap.get(key).bm25Score = br.score;
+              } else {
+                resultMap.set(key, { row: bRow, vectorScore: 0, bm25Score: br.score });
+              }
+            }
+          }
+
+          // Normalize BM25 scores
+          let maxBM25 = 0;
+          for (const v of resultMap.values()) {
+            if (v.bm25Score > maxBM25) maxBM25 = v.bm25Score;
+          }
+
+          const bw = (options.bm25_weight ?? HYBRID_CONFIG.bm25_weight) || 0.3;
+          const vw = 1 - bw;
+
+          const merged = [];
+          for (const v of resultMap.values()) {
+            const normBM25 = maxBM25 > 0 ? v.bm25Score / maxBM25 : 0;
+            const combined = vw * v.vectorScore + bw * normBM25;
+            merged.push({ ...v.row, _combinedScore: combined, _distance: v.row._distance });
+          }
+
+          merged.sort((a, b) => b._combinedScore - a._combinedScore);
+          results = merged;
+        }
+      } catch (e) {
+        if (DEBUG) console.log("[vectorizer] Hybrid search fallback:", e.message);
+        // Fall through to vector-only results
+      }
+    }
+
+    // ── Metadata filters ──────────────────────────────────────────────────
     if (!includeArchived) {
       results = results.filter((r) => !r.archived);
     }
 
-    return results.slice(0, limit);
+    if (options.fileType) {
+      results = results.filter((r) => r.file_type === options.fileType);
+    }
+
+    if (options.language) {
+      results = results.filter((r) => r.language === options.language);
+    }
+
+    if (options.modifiedAfter) {
+      const after = new Date(options.modifiedAfter).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
+    }
+
+    if (options.modifiedBefore) {
+      const before = new Date(options.modifiedBefore).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
+    }
+
+    if (options.tags && options.tags.length > 0) {
+      results = results.filter((r) => {
+        const rowTags = (r.tags || "").split(",").filter(Boolean);
+        return options.tags.some((t) => rowTags.includes(t));
+      });
+    }
+
+    const finalResults = results.slice(0, limit);
+
+    // ── Metrics tracking ────────────────────────────────────────────────────
+    if (METRICS_ENABLED) {
+      try {
+        if (!this.metrics) {
+          this.metrics = new SearchMetrics(this.root);
+          await this.metrics.load();
+        }
+        const scores = finalResults.map((r) =>
+          r._combinedScore != null
+            ? r._combinedScore
+            : r._distance != null
+              ? 1 - r._distance
+              : 0
+        );
+        this.metrics.recordQuery(query, this.indexName, scores, HYBRID_CONFIG.enabled || !!options.hybrid);
+        await this.metrics.save();
+      } catch {
+        // non-fatal
+      }
+    }
+
+    return finalResults;
   }
 
   async checkHealth(extraIgnore = []) {
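The hybrid score reduces to combined = (1 - bw) * vectorScore + bw * bm25Score / maxBM25, where vectorScore = 1 - _distance (0.5 when the distance is missing) and bw defaults to 0.3. Two details to note: the `|| 0.3` fallback coerces an explicit bm25_weight of 0 back to 0.3, and the vectorScores/bm25Scores maps built at the top of the hybrid branch are never read again (the resultMap merge supersedes them). A sketch of the extended call, with option names exactly as parsed above and values invented:

const results = await indexer.search("jwt refresh token rotation", 10, false, {
  hybrid: true,             // force hybrid even when the YAML leaves it off
  bm25_weight: 0.4,         // per-call override of HYBRID_CONFIG.bm25_weight
  fileType: "code",         // matched against the file_type column
  language: "typescript",
  modifiedAfter: "2025-01-01",
  tags: ["auth"],           // any-match against comma-joined tags
});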
@@ -478,7 +827,14 @@ class CodebaseIndexer {
 
   async indexSingleFile(filePath) {
     const absPath = path.isAbsolute(filePath) ? filePath : path.join(this.root, filePath);
-    return await this.indexFile(absPath);
+    // Prevent path traversal outside project root
+    const normalized = path.normalize(absPath);
+    const relative = path.relative(this.root, normalized);
+    if (relative.startsWith("..") || path.isAbsolute(relative)) {
+      if (DEBUG) console.log(`[vectorizer] Path traversal blocked: ${filePath}`);
+      return false;
+    }
+    return await this.indexFile(normalized);
   }
 
   async getStats() {
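Because the guard normalizes before comparing, `../` segments smuggled into a relative path cannot escape the project root (illustrative paths):

await indexer.indexSingleFile("src/index.ts");      // inside root: indexed
await indexer.indexSingleFile("../../etc/passwd");  // escapes root: blocked, returns false
await indexer.indexSingleFile("/etc/passwd");       // absolute path outside root: blocked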
@@ -500,6 +856,12 @@ class CodebaseIndexer {
       model: EMBEDDING_MODEL,
       fileCount,
       chunkCount,
+      features: {
+        chunking: CHUNKING_CONFIG.strategy,
+        hybrid: HYBRID_CONFIG.enabled,
+        metrics: METRICS_ENABLED,
+        cache: CACHE_ENABLED,
+      },
     };
   }
 
@@ -525,12 +887,19 @@ class CodebaseIndexer {
   async clear() {
     await fs.rm(this.cacheDir, { recursive: true, force: true });
     this.hashes = {};
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this.metrics = null;
     await this.init();
   }
 
   async clearAll() {
     await fs.rm(this.baseDir, { recursive: true, force: true });
     this.hashes = {};
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this.metrics = null;
+    clearQueryCache();
     await this.init();
   }
 
@@ -546,6 +915,16 @@ class CodebaseIndexer {
     } catch {}
     return indexes;
   }
+
+  // ── Metrics access ────────────────────────────────────────────────────────
+
+  async getMetrics() {
+    if (!this.metrics) {
+      this.metrics = new SearchMetrics(this.root);
+      await this.metrics.load();
+    }
+    return this.metrics.getSummary();
+  }
 }
 
 function getEmbeddingModel() {
--- /dev/null
+++ metadata-extractor.ts
@@ -0,0 +1,125 @@
+/**
+ * Metadata Extractor — derives rich metadata from file path + content.
+ *
+ * Adds file_type, language, last_modified, file_size, heading_context,
+ * function_name, class_name, and frontmatter tags to each chunk.
+ */
+
+import path from "path"
+import fs from "fs/promises"
+
+// ── Types ───────────────────────────────────────────────────────────────────
+
+export type FileType = "code" | "docs" | "config"
+
+export interface FileMetadata {
+  file_type: FileType
+  language: string
+  last_modified: string // ISO timestamp
+  file_size: number // bytes
+  tags: string[]
+}
+
+export interface ChunkMetadata extends FileMetadata {
+  file: string
+  chunk_index: number
+  content: string
+  vector: number[]
+  archived: boolean
+  heading_context?: string
+  function_name?: string
+  class_name?: string
+}
+
+// ── Extension maps ──────────────────────────────────────────────────────────
+
+const CODE_EXTENSIONS: Record<string, string> = {
+  ".js": "javascript", ".mjs": "javascript", ".cjs": "javascript",
+  ".ts": "typescript", ".tsx": "typescript", ".jsx": "javascript",
+  ".py": "python",
+  ".go": "go",
+  ".rs": "rust",
+  ".java": "java", ".kt": "kotlin",
+  ".swift": "swift",
+  ".c": "c", ".cpp": "cpp", ".h": "c", ".hpp": "cpp",
+  ".cs": "csharp",
+  ".rb": "ruby",
+  ".php": "php",
+  ".scala": "scala",
+  ".clj": "clojure",
+}
+
+const DOC_EXTENSIONS = new Set([".md", ".mdx", ".txt", ".rst", ".adoc"])
+
+const CONFIG_EXTENSIONS = new Set([
+  ".yaml", ".yml", ".json", ".toml", ".ini", ".xml", ".env",
+])
+
+// ── Helpers ─────────────────────────────────────────────────────────────────
+
+export function detectFileType(filePath: string): FileType {
+  const ext = path.extname(filePath).toLowerCase()
+  if (CODE_EXTENSIONS[ext]) return "code"
+  if (DOC_EXTENSIONS.has(ext)) return "docs"
+  if (CONFIG_EXTENSIONS.has(ext)) return "config"
+  return "code" // fallback
+}
+
+export function detectLanguage(filePath: string): string {
+  const ext = path.extname(filePath).toLowerCase()
+  if (CODE_EXTENSIONS[ext]) return CODE_EXTENSIONS[ext]
+  if (DOC_EXTENSIONS.has(ext)) return ext === ".md" || ext === ".mdx" ? "markdown" : ext.slice(1)
+  if (CONFIG_EXTENSIONS.has(ext)) return ext.slice(1)
+  return "unknown"
+}
+
+/** Extract tags from YAML front-matter (Markdown only). */
+export function extractFrontmatterTags(content: string): string[] {
+  const match = content.match(/^---\n([\s\S]*?)\n---/)
+  if (!match) return []
+  const fm = match[1]
+
+  // Look for `tags:` key — array or inline
+  const tagsMatch = fm.match(/^tags:\s*\n((?:\s+-\s+.+\n?)*)/m)
+  if (tagsMatch) {
+    return tagsMatch[1]
+      .split("\n")
+      .map((l) => l.replace(/^\s*-\s*/, "").trim())
+      .filter(Boolean)
+  }
+
+  // Inline: tags: [a, b, c]
+  const inlineMatch = fm.match(/^tags:\s*\[([^\]]*)\]/m)
+  if (inlineMatch) {
+    return inlineMatch[1].split(",").map((t) => t.trim()).filter(Boolean)
+  }
+
+  return []
+}
+
+// ── Public API ──────────────────────────────────────────────────────────────
+
+/**
+ * Extract base file-level metadata (without per-chunk fields).
+ */
+export async function extractFileMetadata(
+  filePath: string,
+  content: string,
+): Promise<FileMetadata> {
+  let lastModified = new Date().toISOString()
+  let fileSize = Buffer.byteLength(content, "utf8")
+
+  try {
+    const stat = await fs.stat(filePath)
+    lastModified = stat.mtime.toISOString()
+    fileSize = stat.size
+  } catch {
+    // use defaults
+  }
+
+  const fileType = detectFileType(filePath)
+  const language = detectLanguage(filePath)
+  const tags = fileType === "docs" ? extractFrontmatterTags(content) : []
+
+  return { file_type: fileType, language, last_modified: lastModified, file_size: fileSize, tags }
+}
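A worked example of the new extractor (illustrative path and content, following the module's own no-semicolon style):

const md = `---
title: Auth guide
tags: [auth, jwt]
---
# Login flow
...`

const meta = await extractFileMetadata("docs/auth.md", md)
// → { file_type: "docs", language: "markdown",
//     last_modified: <file mtime, or now if stat fails>,
//     file_size: <bytes>, tags: ["auth", "jwt"] }

detectFileType("src/main.rs") // → "code"
detectLanguage("src/main.rs") // → "rust"
detectFileType("Makefile")    // → "code" (no known extension falls back to code)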