@comfanion/usethis_search 0.1.5 → 0.2.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +98 -7
- package/file-indexer.ts +21 -1
- package/package.json +12 -2
- package/tools/codeindex.ts +135 -16
- package/tools/search.ts +46 -11
- package/vectorizer/bm25-index.ts +155 -0
- package/vectorizer/chunkers/chunker-factory.ts +98 -0
- package/vectorizer/chunkers/code-chunker.ts +325 -0
- package/vectorizer/chunkers/markdown-chunker.ts +177 -0
- package/vectorizer/content-cleaner.ts +136 -0
- package/vectorizer/hybrid-search.ts +97 -0
- package/vectorizer/index.js +395 -16
- package/vectorizer/metadata-extractor.ts +125 -0
- package/vectorizer/query-cache.ts +126 -0
- package/vectorizer/search-metrics.ts +155 -0
- package/vectorizer.yaml +81 -0
package/vectorizer/index.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
// OpenCode Vectorizer - Semantic Code Search with Multi-Index Support
|
|
2
|
+
// v2: Content cleaning, semantic chunking, hybrid search, metadata, cache, metrics
|
|
2
3
|
|
|
3
4
|
import { pipeline, env } from "@xenova/transformers";
|
|
4
5
|
import * as lancedb from "vectordb";
|
|
@@ -6,6 +7,15 @@ import fs from "fs/promises";
|
|
|
6
7
|
import path from "path";
|
|
7
8
|
import crypto from "crypto";
|
|
8
9
|
|
|
10
|
+
// ── New modules ─────────────────────────────────────────────────────────────
|
|
11
|
+
import { cleanContent, DEFAULT_CLEANING_CONFIG } from "./content-cleaner.ts";
|
|
12
|
+
import { extractFileMetadata, detectFileType, detectLanguage } from "./metadata-extractor.ts";
|
|
13
|
+
import { chunkContent, DEFAULT_CHUNKING_CONFIG } from "./chunkers/chunker-factory.ts";
|
|
14
|
+
import { BM25Index } from "./bm25-index.ts";
|
|
15
|
+
import { mergeResults, DEFAULT_HYBRID_CONFIG } from "./hybrid-search.ts";
|
|
16
|
+
import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
|
|
17
|
+
import { SearchMetrics } from "./search-metrics.ts";
|
|
18
|
+
|
|
9
19
|
// Suppress transformers.js logs unless DEBUG is set
|
|
10
20
|
const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
|
|
11
21
|
if (!DEBUG) {
|
|
@@ -57,6 +67,13 @@ let GLOBAL_IGNORE = [];
|
|
|
57
67
|
// Default embedding model (fast). Can be overridden by config.
|
|
58
68
|
let EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2";
|
|
59
69
|
|
|
70
|
+
// ── Extended config parsed from YAML ────────────────────────────────────────
|
|
71
|
+
let CLEANING_CONFIG = { ...DEFAULT_CLEANING_CONFIG };
|
|
72
|
+
let CHUNKING_CONFIG = { ...DEFAULT_CHUNKING_CONFIG };
|
|
73
|
+
let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
|
|
74
|
+
let METRICS_ENABLED = false;
|
|
75
|
+
let CACHE_ENABLED = true;
|
|
76
|
+
|
|
60
77
|
function defaultVectorizerYaml() {
|
|
61
78
|
return (
|
|
62
79
|
`vectorizer:\n` +
|
|
@@ -64,6 +81,40 @@ function defaultVectorizerYaml() {
|
|
|
64
81
|
` auto_index: true\n` +
|
|
65
82
|
` model: \"${EMBEDDING_MODEL}\"\n` +
|
|
66
83
|
` debounce_ms: 1000\n` +
|
|
84
|
+
`\n` +
|
|
85
|
+
` # Content cleaning before chunking\n` +
|
|
86
|
+
` cleaning:\n` +
|
|
87
|
+
` remove_toc: true\n` +
|
|
88
|
+
` remove_frontmatter_metadata: false\n` +
|
|
89
|
+
` remove_imports: false\n` +
|
|
90
|
+
` remove_comments: false\n` +
|
|
91
|
+
`\n` +
|
|
92
|
+
` # Chunking strategy\n` +
|
|
93
|
+
` chunking:\n` +
|
|
94
|
+
` strategy: \"semantic\" # fixed | semantic\n` +
|
|
95
|
+
` markdown:\n` +
|
|
96
|
+
` split_by_headings: true\n` +
|
|
97
|
+
` min_chunk_size: 200\n` +
|
|
98
|
+
` max_chunk_size: 2000\n` +
|
|
99
|
+
` preserve_heading_hierarchy: true\n` +
|
|
100
|
+
` code:\n` +
|
|
101
|
+
` split_by_functions: true\n` +
|
|
102
|
+
` include_function_signature: true\n` +
|
|
103
|
+
` min_chunk_size: 300\n` +
|
|
104
|
+
` max_chunk_size: 1500\n` +
|
|
105
|
+
` fixed:\n` +
|
|
106
|
+
` max_chars: 1500\n` +
|
|
107
|
+
`\n` +
|
|
108
|
+
` # Hybrid search (vector + BM25)\n` +
|
|
109
|
+
` search:\n` +
|
|
110
|
+
` hybrid: false\n` +
|
|
111
|
+
` bm25_weight: 0.3\n` +
|
|
112
|
+
`\n` +
|
|
113
|
+
` # Quality monitoring\n` +
|
|
114
|
+
` quality:\n` +
|
|
115
|
+
` enable_metrics: false\n` +
|
|
116
|
+
` enable_cache: true\n` +
|
|
117
|
+
`\n` +
|
|
67
118
|
` indexes:\n` +
|
|
68
119
|
` code:\n` +
|
|
69
120
|
` enabled: true\n` +
|
|
@@ -104,8 +155,25 @@ async function ensureDefaultConfig(projectRoot) {
|
|
|
104
155
|
}
|
|
105
156
|
}
|
|
106
157
|
|
|
158
|
+
// ── YAML mini-parser helpers ────────────────────────────────────────────────
|
|
159
|
+
|
|
160
|
+
/**
 * Read a boolean `key: true|false` entry from a slice of YAML text.
 *
 * The key must be preceded by whitespace (i.e. be indented) and only the
 * first occurrence in `section` is considered.
 *
 * @param {string} section - YAML text to scan.
 * @param {string} key - Key name; interpolated into the pattern as-is.
 * @param {boolean} fallback - Value returned when the key is not found.
 * @returns {boolean} The parsed flag, or `fallback`.
 */
function parseBool(section, key, fallback) {
  const pattern = new RegExp(`^\\s+${key}:\\s*(true|false)`, "m");
  const found = section.match(pattern);
  if (!found) {
    return fallback;
  }
  return found[1] === "true";
}
|
|
164
|
+
|
|
165
|
+
/**
 * Read an unsigned numeric `key: 123` / `key: 0.5` entry from a slice of YAML text.
 *
 * The key must be indented and only the first occurrence in `section` is used.
 *
 * @param {string} section - YAML text to scan.
 * @param {string} key - Key name; interpolated into the pattern as-is.
 * @param {number} fallback - Value returned when the key is not found.
 * @returns {number} The parsed number, or `fallback`.
 */
function parseNumber(section, key, fallback) {
  const pattern = new RegExp(`^\\s+${key}:\\s*(\\d+(?:\\.\\d+)?)`, "m");
  const found = section.match(pattern);
  if (!found) {
    return fallback;
  }
  const digits = found[1];
  return parseFloat(digits);
}
|
|
169
|
+
|
|
170
|
+
/**
 * Read a string `key: value` entry from a slice of YAML text.
 *
 * Handles double-quoted, single-quoted and bare scalars. For bare scalars a
 * trailing inline comment (`#` preceded by whitespace) is removed, so
 * `strategy: semantic  # fixed | semantic` yields `"semantic"` rather than the
 * whole remainder of the line. Only the text on the key's own line is
 * considered; a key with nothing after the colon yields `fallback`.
 *
 * @param {string} section - YAML text to scan.
 * @param {string} key - Key name (regex metacharacters are escaped).
 * @param {string} fallback - Value returned when the key is missing or empty.
 * @returns {string} The trimmed scalar value, or `fallback`.
 */
function parseString(section, key, fallback) {
  const safeKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // [ \t]* (not \s*) keeps the match on the key's own line.
  const line = section.match(new RegExp(`^\\s+${safeKey}:[ \\t]*([^\\n]*)`, "m"));
  if (!line) return fallback;

  const raw = line[1].trim();

  // Quoted scalar: take everything up to the matching closing quote.
  const quoted = raw.match(/^(["'])(.*?)\1/);
  if (quoted) return quoted[2].trim();

  // Bare scalar: a comment starts at the line start or at '#' after whitespace.
  if (raw.startsWith("#")) return fallback;
  const value = raw.replace(/\s+#.*$/, "").trim();
  return value === "" ? fallback : value;
}
|
|
174
|
+
|
|
107
175
|
/**
|
|
108
|
-
* Load index configuration from .opencode/vectorizer.yaml
|
|
176
|
+
* Load index configuration from .opencode/vectorizer.yaml.
|
|
109
177
|
*/
|
|
110
178
|
async function loadConfig(projectRoot) {
|
|
111
179
|
try {
|
|
@@ -142,6 +210,61 @@ async function loadConfig(projectRoot) {
|
|
|
142
210
|
if (DEBUG) console.log("[vectorizer] Using model from config:", EMBEDDING_MODEL);
|
|
143
211
|
}
|
|
144
212
|
|
|
213
|
+
// ── Parse cleaning config ───────────────────────────────────────────────
|
|
214
|
+
const cleaningMatch = section.match(/^\s{2}cleaning:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
|
|
215
|
+
if (cleaningMatch) {
|
|
216
|
+
const cs = cleaningMatch[1];
|
|
217
|
+
CLEANING_CONFIG = {
|
|
218
|
+
remove_toc: parseBool(cs, "remove_toc", true),
|
|
219
|
+
remove_frontmatter_metadata: parseBool(cs, "remove_frontmatter_metadata", false),
|
|
220
|
+
remove_imports: parseBool(cs, "remove_imports", false),
|
|
221
|
+
remove_comments: parseBool(cs, "remove_comments", false),
|
|
222
|
+
};
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// ── Parse chunking config ───────────────────────────────────────────────
|
|
226
|
+
const chunkingMatch = section.match(/^\s{2}chunking:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
|
|
227
|
+
if (chunkingMatch) {
|
|
228
|
+
const cs = chunkingMatch[1];
|
|
229
|
+
const strategy = parseString(cs, "strategy", "semantic");
|
|
230
|
+
CHUNKING_CONFIG = {
|
|
231
|
+
strategy: strategy,
|
|
232
|
+
markdown: {
|
|
233
|
+
split_by_headings: parseBool(cs, "split_by_headings", true),
|
|
234
|
+
min_chunk_size: parseNumber(cs, "min_chunk_size", 200),
|
|
235
|
+
max_chunk_size: parseNumber(cs, "max_chunk_size", 2000),
|
|
236
|
+
preserve_heading_hierarchy: parseBool(cs, "preserve_heading_hierarchy", true),
|
|
237
|
+
},
|
|
238
|
+
code: {
|
|
239
|
+
split_by_functions: parseBool(cs, "split_by_functions", true),
|
|
240
|
+
include_function_signature: parseBool(cs, "include_function_signature", true),
|
|
241
|
+
min_chunk_size: parseNumber(cs, "min_chunk_size", 300),
|
|
242
|
+
max_chunk_size: parseNumber(cs, "max_chunk_size", 1500),
|
|
243
|
+
},
|
|
244
|
+
fixed: {
|
|
245
|
+
max_chars: parseNumber(cs, "max_chars", 1500),
|
|
246
|
+
},
|
|
247
|
+
};
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
// ── Parse search config ─────────────────────────────────────────────────
|
|
251
|
+
const searchMatch = section.match(/^\s{2}search:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
|
|
252
|
+
if (searchMatch) {
|
|
253
|
+
const ss = searchMatch[1];
|
|
254
|
+
HYBRID_CONFIG = {
|
|
255
|
+
enabled: parseBool(ss, "hybrid", false),
|
|
256
|
+
bm25_weight: parseNumber(ss, "bm25_weight", 0.3),
|
|
257
|
+
};
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
// ── Parse quality config ────────────────────────────────────────────────
|
|
261
|
+
const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
|
|
262
|
+
if (qualityMatch) {
|
|
263
|
+
const qs = qualityMatch[1];
|
|
264
|
+
METRICS_ENABLED = parseBool(qs, "enable_metrics", false);
|
|
265
|
+
CACHE_ENABLED = parseBool(qs, "enable_cache", true);
|
|
266
|
+
}
|
|
267
|
+
|
|
145
268
|
// Parse global exclude
|
|
146
269
|
const excludeMatch = section.match(/^\s{2}exclude:\s*\n((?:\s{4}-\s+.+\n?)*)/m);
|
|
147
270
|
if (excludeMatch) {
|
|
@@ -196,12 +319,25 @@ async function loadConfig(projectRoot) {
|
|
|
196
319
|
}
|
|
197
320
|
}
|
|
198
321
|
|
|
199
|
-
if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE });
|
|
322
|
+
if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE, HYBRID_CONFIG, CHUNKING_CONFIG });
|
|
200
323
|
} catch {
|
|
201
324
|
if (DEBUG) console.log("[vectorizer] Using default presets (config load failed)");
|
|
202
325
|
}
|
|
203
326
|
}
|
|
204
327
|
|
|
328
|
+
// ── Shared query cache (singleton per process) ─────────────────────────────
|
|
329
|
+
let _queryCache = null;
|
|
330
|
+
/**
 * Return the module-level query cache, creating it with
 * `DEFAULT_CACHE_CONFIG` on first use.
 *
 * @returns {QueryCache} The shared cache instance held in `_queryCache`.
 */
function getQueryCache() {
  if (_queryCache) {
    return _queryCache;
  }
  _queryCache = new QueryCache(DEFAULT_CACHE_CONFIG);
  return _queryCache;
}
|
|
334
|
+
/**
 * Destroy the module-level query cache, if one exists, and drop the
 * reference so the next `getQueryCache()` call creates a fresh instance.
 */
function clearQueryCache() {
  if (!_queryCache) {
    return;
  }
  _queryCache.destroy();
  _queryCache = null;
}
|
|
340
|
+
|
|
205
341
|
class CodebaseIndexer {
|
|
206
342
|
constructor(projectRoot, indexName = "code") {
|
|
207
343
|
this.root = projectRoot;
|
|
@@ -212,6 +348,8 @@ class CodebaseIndexer {
|
|
|
212
348
|
this.db = null;
|
|
213
349
|
this.hashes = {};
|
|
214
350
|
this.configLoaded = false;
|
|
351
|
+
this.bm25 = null; // lazy-built BM25 index
|
|
352
|
+
this.metrics = null; // lazy-loaded SearchMetrics
|
|
215
353
|
}
|
|
216
354
|
|
|
217
355
|
async init() {
|
|
@@ -227,17 +365,30 @@ class CodebaseIndexer {
|
|
|
227
365
|
|
|
228
366
|
async loadModel() {
|
|
229
367
|
if (!this.model) {
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
368
|
+
try {
|
|
369
|
+
if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
|
|
370
|
+
this.model = await pipeline("feature-extraction", EMBEDDING_MODEL, {
|
|
371
|
+
progress_callback: DEBUG ? undefined : null,
|
|
372
|
+
});
|
|
373
|
+
if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
|
|
374
|
+
} catch (error) {
|
|
375
|
+
this.model = null;
|
|
376
|
+
throw new Error(`Model loading failed: ${error.message || error}`);
|
|
377
|
+
}
|
|
235
378
|
}
|
|
236
379
|
return this.model;
|
|
237
380
|
}
|
|
238
381
|
|
|
239
382
|
async unloadModel() {
|
|
240
383
|
this.model = null;
|
|
384
|
+
// Release BM25 data held in memory
|
|
385
|
+
if (this.bm25) {
|
|
386
|
+
this.bm25.clear();
|
|
387
|
+
this.bm25 = null;
|
|
388
|
+
}
|
|
389
|
+
this._bm25Rows = null;
|
|
390
|
+
this.metrics = null;
|
|
391
|
+
clearQueryCache();
|
|
241
392
|
if (global.gc) global.gc();
|
|
242
393
|
}
|
|
243
394
|
|
|
@@ -274,12 +425,28 @@ class CodebaseIndexer {
|
|
|
274
425
|
return false;
|
|
275
426
|
}
|
|
276
427
|
|
|
428
|
+
// ── Embedding (with optional cache) ───────────────────────────────────────
|
|
429
|
+
|
|
277
430
|
async embed(text) {
|
|
278
431
|
const model = await this.loadModel();
|
|
279
432
|
const result = await model(text, { pooling: "mean", normalize: true });
|
|
280
433
|
return Array.from(result.data);
|
|
281
434
|
}
|
|
282
435
|
|
|
436
|
+
async embedQuery(text) {
|
|
437
|
+
if (CACHE_ENABLED) {
|
|
438
|
+
const cache = getQueryCache();
|
|
439
|
+
const cached = cache.get(text);
|
|
440
|
+
if (cached) return cached;
|
|
441
|
+
const embedding = await this.embed(text);
|
|
442
|
+
cache.set(text, embedding);
|
|
443
|
+
return embedding;
|
|
444
|
+
}
|
|
445
|
+
return this.embed(text);
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
// ── Legacy chunker (kept for backward compat / "fixed" strategy) ──────────
|
|
449
|
+
|
|
283
450
|
chunkCode(content, maxChars = 1500) {
|
|
284
451
|
const chunks = [];
|
|
285
452
|
const lines = content.split("\n");
|
|
@@ -309,6 +476,8 @@ class CodebaseIndexer {
|
|
|
309
476
|
return this.hashes[relPath] !== currentHash;
|
|
310
477
|
}
|
|
311
478
|
|
|
479
|
+
// ── Index a single file (v2: cleaning + semantic chunking + metadata) ─────
|
|
480
|
+
|
|
312
481
|
async indexFile(filePath) {
|
|
313
482
|
const relPath = path.relative(this.root, filePath);
|
|
314
483
|
|
|
@@ -324,21 +493,39 @@ class CodebaseIndexer {
|
|
|
324
493
|
return false;
|
|
325
494
|
}
|
|
326
495
|
|
|
327
|
-
|
|
496
|
+
// Extract metadata
|
|
497
|
+
const fileMeta = await extractFileMetadata(filePath, content);
|
|
328
498
|
const archived = this.isArchived(relPath, content);
|
|
329
|
-
const data = [];
|
|
330
499
|
|
|
500
|
+
// Clean content before chunking
|
|
501
|
+
const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
|
|
502
|
+
|
|
503
|
+
// Semantic chunking
|
|
504
|
+
const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
|
|
505
|
+
|
|
506
|
+
const data = [];
|
|
331
507
|
for (let i = 0; i < chunks.length; i++) {
|
|
332
|
-
const embedding = await this.embed(chunks[i]);
|
|
508
|
+
const embedding = await this.embed(chunks[i].content);
|
|
333
509
|
data.push({
|
|
334
510
|
file: relPath,
|
|
335
511
|
chunk_index: i,
|
|
336
|
-
content: chunks[i],
|
|
512
|
+
content: chunks[i].content,
|
|
337
513
|
vector: embedding,
|
|
338
514
|
archived: archived,
|
|
515
|
+
// v2 metadata
|
|
516
|
+
file_type: fileMeta.file_type,
|
|
517
|
+
language: fileMeta.language,
|
|
518
|
+
last_modified: fileMeta.last_modified,
|
|
519
|
+
file_size: fileMeta.file_size,
|
|
520
|
+
heading_context: chunks[i].heading_context || "",
|
|
521
|
+
function_name: chunks[i].function_name || "",
|
|
522
|
+
class_name: chunks[i].class_name || "",
|
|
523
|
+
tags: (fileMeta.tags || []).join(","),
|
|
339
524
|
});
|
|
340
525
|
}
|
|
341
526
|
|
|
527
|
+
if (data.length === 0) return false;
|
|
528
|
+
|
|
342
529
|
const tableName = "chunks";
|
|
343
530
|
const tables = await this.db.tableNames();
|
|
344
531
|
if (tables.includes(tableName)) {
|
|
@@ -351,27 +538,189 @@ class CodebaseIndexer {
|
|
|
351
538
|
this.hashes[relPath] = hash;
|
|
352
539
|
await this.saveHashes();
|
|
353
540
|
|
|
541
|
+
// Invalidate BM25 index (needs rebuild) — release memory
|
|
542
|
+
if (this.bm25) {
|
|
543
|
+
this.bm25.clear();
|
|
544
|
+
this.bm25 = null;
|
|
545
|
+
}
|
|
546
|
+
this._bm25Rows = null;
|
|
547
|
+
|
|
354
548
|
return true;
|
|
355
549
|
}
|
|
356
550
|
|
|
357
|
-
|
|
551
|
+
// ── BM25 index management ────────────────────────────────────────────────
|
|
552
|
+
|
|
553
|
+
async ensureBM25() {
|
|
554
|
+
if (this.bm25) return this.bm25;
|
|
555
|
+
|
|
556
|
+
const tableName = "chunks";
|
|
557
|
+
const tables = await this.db.tableNames();
|
|
558
|
+
if (!tables.includes(tableName)) return null;
|
|
559
|
+
|
|
560
|
+
const table = await this.db.openTable(tableName);
|
|
561
|
+
const allRows = await table.search([0]).limit(100000).execute();
|
|
562
|
+
|
|
563
|
+
if (allRows.length === 0) return null;
|
|
564
|
+
|
|
565
|
+
// Sort for stable ID mapping between builds
|
|
566
|
+
allRows.sort((a, b) => {
|
|
567
|
+
const ka = `${a.file}:${a.chunk_index}`;
|
|
568
|
+
const kb = `${b.file}:${b.chunk_index}`;
|
|
569
|
+
return ka.localeCompare(kb);
|
|
570
|
+
});
|
|
571
|
+
|
|
572
|
+
// Release previous data before rebuilding
|
|
573
|
+
if (this.bm25) this.bm25.clear();
|
|
574
|
+
this._bm25Rows = null;
|
|
575
|
+
|
|
576
|
+
this.bm25 = new BM25Index();
|
|
577
|
+
this.bm25.build(allRows.map((r) => r.content));
|
|
578
|
+
this._bm25Rows = allRows;
|
|
579
|
+
|
|
580
|
+
return this.bm25;
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
// ── Search (v2: hybrid + metadata filters + metrics) ──────────────────────
|
|
584
|
+
|
|
585
|
+
async search(query, limit = 5, includeArchived = false, options = {}) {
|
|
358
586
|
const tableName = "chunks";
|
|
359
587
|
const tables = await this.db.tableNames();
|
|
360
588
|
if (!tables.includes(tableName)) {
|
|
361
589
|
return [];
|
|
362
590
|
}
|
|
363
591
|
|
|
364
|
-
const queryEmbedding = await this.
|
|
592
|
+
const queryEmbedding = await this.embedQuery(query);
|
|
365
593
|
const table = await this.db.openTable(tableName);
|
|
366
594
|
|
|
367
|
-
|
|
595
|
+
// Only over-fetch when filters or hybrid search are active
|
|
596
|
+
const hasFilters = !includeArchived || options.fileType || options.language ||
|
|
597
|
+
options.modifiedAfter || options.modifiedBefore ||
|
|
598
|
+
(options.tags && options.tags.length > 0);
|
|
599
|
+
const isHybrid = HYBRID_CONFIG.enabled || options.hybrid;
|
|
600
|
+
const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit * 3, 50) : limit;
|
|
368
601
|
let results = await table.search(queryEmbedding).limit(fetchLimit).execute();
|
|
369
602
|
|
|
603
|
+
// ── Hybrid search ───────────────────────────────────────────────────────
|
|
604
|
+
if (HYBRID_CONFIG.enabled || options.hybrid) {
|
|
605
|
+
try {
|
|
606
|
+
const bm25 = await this.ensureBM25();
|
|
607
|
+
if (bm25 && this._bm25Rows) {
|
|
608
|
+
const bm25Results = bm25.search(query, fetchLimit);
|
|
609
|
+
|
|
610
|
+
// Build score maps
|
|
611
|
+
const vectorScores = new Map();
|
|
612
|
+
for (let i = 0; i < results.length; i++) {
|
|
613
|
+
const score = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
|
|
614
|
+
vectorScores.set(i, score);
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
const bm25Scores = new Map();
|
|
618
|
+
for (const r of bm25Results) {
|
|
619
|
+
bm25Scores.set(r.id, r.score);
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
// We need a unified ID space. Since vector results and BM25 results
|
|
623
|
+
// reference different row sets, we use the full table rows for BM25
|
|
624
|
+
// and merge by file+chunk_index key.
|
|
625
|
+
const resultMap = new Map();
|
|
626
|
+
|
|
627
|
+
for (let i = 0; i < results.length; i++) {
|
|
628
|
+
const key = `${results[i].file}:${results[i].chunk_index}`;
|
|
629
|
+
const vs = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
|
|
630
|
+
resultMap.set(key, { row: results[i], vectorScore: vs, bm25Score: 0 });
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
for (const br of bm25Results) {
|
|
634
|
+
if (br.id < this._bm25Rows.length) {
|
|
635
|
+
const bRow = this._bm25Rows[br.id];
|
|
636
|
+
const key = `${bRow.file}:${bRow.chunk_index}`;
|
|
637
|
+
if (resultMap.has(key)) {
|
|
638
|
+
resultMap.get(key).bm25Score = br.score;
|
|
639
|
+
} else {
|
|
640
|
+
resultMap.set(key, { row: bRow, vectorScore: 0, bm25Score: br.score });
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
}
|
|
644
|
+
|
|
645
|
+
// Normalize BM25 scores
|
|
646
|
+
let maxBM25 = 0;
|
|
647
|
+
for (const v of resultMap.values()) {
|
|
648
|
+
if (v.bm25Score > maxBM25) maxBM25 = v.bm25Score;
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
const bw = (options.bm25_weight ?? HYBRID_CONFIG.bm25_weight) || 0.3;
|
|
652
|
+
const vw = 1 - bw;
|
|
653
|
+
|
|
654
|
+
const merged = [];
|
|
655
|
+
for (const v of resultMap.values()) {
|
|
656
|
+
const normBM25 = maxBM25 > 0 ? v.bm25Score / maxBM25 : 0;
|
|
657
|
+
const combined = vw * v.vectorScore + bw * normBM25;
|
|
658
|
+
merged.push({ ...v.row, _combinedScore: combined, _distance: v.row._distance });
|
|
659
|
+
}
|
|
660
|
+
|
|
661
|
+
merged.sort((a, b) => b._combinedScore - a._combinedScore);
|
|
662
|
+
results = merged;
|
|
663
|
+
}
|
|
664
|
+
} catch (e) {
|
|
665
|
+
if (DEBUG) console.log("[vectorizer] Hybrid search fallback:", e.message);
|
|
666
|
+
// Fall through to vector-only results
|
|
667
|
+
}
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
// ── Metadata filters ──────────────────────────────────────────────────
|
|
370
671
|
if (!includeArchived) {
|
|
371
672
|
results = results.filter((r) => !r.archived);
|
|
372
673
|
}
|
|
373
674
|
|
|
374
|
-
|
|
675
|
+
if (options.fileType) {
|
|
676
|
+
results = results.filter((r) => r.file_type === options.fileType);
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
if (options.language) {
|
|
680
|
+
results = results.filter((r) => r.language === options.language);
|
|
681
|
+
}
|
|
682
|
+
|
|
683
|
+
if (options.modifiedAfter) {
|
|
684
|
+
const after = new Date(options.modifiedAfter).getTime();
|
|
685
|
+
results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
if (options.modifiedBefore) {
|
|
689
|
+
const before = new Date(options.modifiedBefore).getTime();
|
|
690
|
+
results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
|
|
691
|
+
}
|
|
692
|
+
|
|
693
|
+
if (options.tags && options.tags.length > 0) {
|
|
694
|
+
results = results.filter((r) => {
|
|
695
|
+
const rowTags = (r.tags || "").split(",").filter(Boolean);
|
|
696
|
+
return options.tags.some((t) => rowTags.includes(t));
|
|
697
|
+
});
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
const finalResults = results.slice(0, limit);
|
|
701
|
+
|
|
702
|
+
// ── Metrics tracking ────────────────────────────────────────────────────
|
|
703
|
+
if (METRICS_ENABLED) {
|
|
704
|
+
try {
|
|
705
|
+
if (!this.metrics) {
|
|
706
|
+
this.metrics = new SearchMetrics(this.root);
|
|
707
|
+
await this.metrics.load();
|
|
708
|
+
}
|
|
709
|
+
const scores = finalResults.map((r) =>
|
|
710
|
+
r._combinedScore != null
|
|
711
|
+
? r._combinedScore
|
|
712
|
+
: r._distance != null
|
|
713
|
+
? 1 - r._distance
|
|
714
|
+
: 0
|
|
715
|
+
);
|
|
716
|
+
this.metrics.recordQuery(query, this.indexName, scores, HYBRID_CONFIG.enabled || !!options.hybrid);
|
|
717
|
+
await this.metrics.save();
|
|
718
|
+
} catch {
|
|
719
|
+
// non-fatal
|
|
720
|
+
}
|
|
721
|
+
}
|
|
722
|
+
|
|
723
|
+
return finalResults;
|
|
375
724
|
}
|
|
376
725
|
|
|
377
726
|
async checkHealth(extraIgnore = []) {
|
|
@@ -478,7 +827,14 @@ class CodebaseIndexer {
|
|
|
478
827
|
|
|
479
828
|
async indexSingleFile(filePath) {
|
|
480
829
|
const absPath = path.isAbsolute(filePath) ? filePath : path.join(this.root, filePath);
|
|
481
|
-
|
|
830
|
+
// Prevent path traversal outside project root
|
|
831
|
+
const normalized = path.normalize(absPath);
|
|
832
|
+
const relative = path.relative(this.root, normalized);
|
|
833
|
+
if (relative.startsWith("..") || path.isAbsolute(relative)) {
|
|
834
|
+
if (DEBUG) console.log(`[vectorizer] Path traversal blocked: ${filePath}`);
|
|
835
|
+
return false;
|
|
836
|
+
}
|
|
837
|
+
return await this.indexFile(normalized);
|
|
482
838
|
}
|
|
483
839
|
|
|
484
840
|
async getStats() {
|
|
@@ -500,6 +856,12 @@ class CodebaseIndexer {
|
|
|
500
856
|
model: EMBEDDING_MODEL,
|
|
501
857
|
fileCount,
|
|
502
858
|
chunkCount,
|
|
859
|
+
features: {
|
|
860
|
+
chunking: CHUNKING_CONFIG.strategy,
|
|
861
|
+
hybrid: HYBRID_CONFIG.enabled,
|
|
862
|
+
metrics: METRICS_ENABLED,
|
|
863
|
+
cache: CACHE_ENABLED,
|
|
864
|
+
},
|
|
503
865
|
};
|
|
504
866
|
}
|
|
505
867
|
|
|
@@ -525,12 +887,19 @@ class CodebaseIndexer {
|
|
|
525
887
|
async clear() {
|
|
526
888
|
await fs.rm(this.cacheDir, { recursive: true, force: true });
|
|
527
889
|
this.hashes = {};
|
|
890
|
+
if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
|
|
891
|
+
this._bm25Rows = null;
|
|
892
|
+
this.metrics = null;
|
|
528
893
|
await this.init();
|
|
529
894
|
}
|
|
530
895
|
|
|
531
896
|
async clearAll() {
|
|
532
897
|
await fs.rm(this.baseDir, { recursive: true, force: true });
|
|
533
898
|
this.hashes = {};
|
|
899
|
+
if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
|
|
900
|
+
this._bm25Rows = null;
|
|
901
|
+
this.metrics = null;
|
|
902
|
+
clearQueryCache();
|
|
534
903
|
await this.init();
|
|
535
904
|
}
|
|
536
905
|
|
|
@@ -546,6 +915,16 @@ class CodebaseIndexer {
|
|
|
546
915
|
} catch {}
|
|
547
916
|
return indexes;
|
|
548
917
|
}
|
|
918
|
+
|
|
919
|
+
// ── Metrics access ────────────────────────────────────────────────────────
|
|
920
|
+
|
|
921
|
+
async getMetrics() {
|
|
922
|
+
if (!this.metrics) {
|
|
923
|
+
this.metrics = new SearchMetrics(this.root);
|
|
924
|
+
await this.metrics.load();
|
|
925
|
+
}
|
|
926
|
+
return this.metrics.getSummary();
|
|
927
|
+
}
|
|
549
928
|
}
|
|
550
929
|
|
|
551
930
|
function getEmbeddingModel() {
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Metadata Extractor — derives rich metadata from file path + content.
|
|
3
|
+
*
|
|
4
|
+
* Adds file_type, language, last_modified, file_size, heading_context,
|
|
5
|
+
* function_name, class_name, and frontmatter tags to each chunk.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import path from "path"
|
|
9
|
+
import fs from "fs/promises"
|
|
10
|
+
|
|
11
|
+
// ── Types ───────────────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
/** Coarse category assigned to a file from its extension. */
export type FileType = "code" | "docs" | "config"

/** File-level metadata shared by every chunk of one file. */
export interface FileMetadata {
  /** Category of the file. */
  file_type: FileType
  /** Language name derived from the extension, or "unknown". */
  language: string
  /** ISO timestamp. */
  last_modified: string
  /** Size in bytes. */
  file_size: number
  /** Tags read from front-matter; empty when none are found. */
  tags: string[]
}

/** One chunk record: file-level metadata plus per-chunk fields. */
export interface ChunkMetadata extends FileMetadata {
  /** Path of the file the chunk belongs to. */
  file: string
  /** Position of the chunk within its file. */
  chunk_index: number
  /** Chunk text. */
  content: string
  /** Embedding vector of the chunk. */
  vector: number[]
  /** Whether the chunk is flagged as archived. */
  archived: boolean
  /** Heading context for the chunk, when available. */
  heading_context?: string
  /** Function name associated with the chunk, when available. */
  function_name?: string
  /** Class name associated with the chunk, when available. */
  class_name?: string
}
|
|
33
|
+
|
|
34
|
+
// ── Extension maps ──────────────────────────────────────────────────────────
|
|
35
|
+
|
|
36
|
+
/** Source-code extensions mapped to their language name. */
const CODE_EXTENSIONS: Record<string, string> = {
  ".js": "javascript", ".mjs": "javascript", ".cjs": "javascript",
  ".ts": "typescript", ".tsx": "typescript", ".jsx": "javascript",
  ".py": "python",
  ".go": "go",
  ".rs": "rust",
  ".java": "java", ".kt": "kotlin",
  ".swift": "swift",
  ".c": "c", ".cpp": "cpp", ".h": "c", ".hpp": "cpp",
  ".cs": "csharp",
  ".rb": "ruby",
  ".php": "php",
  ".scala": "scala",
  ".clj": "clojure",
}

/** Extensions treated as documentation. */
const DOC_EXTENSIONS = new Set([".md", ".mdx", ".txt", ".rst", ".adoc"])

/** Extensions treated as configuration. */
const CONFIG_EXTENSIONS = new Set([
  ".yaml", ".yml", ".json", ".toml", ".ini", ".xml", ".env",
])

// ── Helpers ─────────────────────────────────────────────────────────────────

/**
 * Lower-cased lookup key for the extension maps.
 *
 * `path.extname()` returns "" for dotfiles such as ".env", which made the
 * ".env" entry of CONFIG_EXTENSIONS unreachable. For such files the whole
 * base name is used as the key instead.
 */
function extensionKey(filePath: string): string {
  const ext = path.extname(filePath).toLowerCase()
  if (ext) return ext
  const base = path.basename(filePath).toLowerCase()
  return base.startsWith(".") ? base : ""
}

/**
 * Classify a file as code, docs or config from its extension.
 *
 * @param filePath - Path or file name; only the extension is inspected.
 * @returns The matching category; unrecognised extensions fall back to "code".
 */
export function detectFileType(filePath: string): FileType {
  const ext = extensionKey(filePath)
  if (CODE_EXTENSIONS[ext]) return "code"
  if (DOC_EXTENSIONS.has(ext)) return "docs"
  if (CONFIG_EXTENSIONS.has(ext)) return "config"
  return "code" // fallback
}

/**
 * Derive a language name from a file's extension.
 *
 * @param filePath - Path or file name; only the extension is inspected.
 * @returns The mapped language for code files, "markdown" for .md/.mdx, the
 *   bare extension for other docs and config files, otherwise "unknown".
 */
export function detectLanguage(filePath: string): string {
  const ext = extensionKey(filePath)
  if (CODE_EXTENSIONS[ext]) return CODE_EXTENSIONS[ext]
  if (DOC_EXTENSIONS.has(ext)) return ext === ".md" || ext === ".mdx" ? "markdown" : ext.slice(1)
  if (CONFIG_EXTENSIONS.has(ext)) return ext.slice(1)
  return "unknown"
}
|
|
75
|
+
|
|
76
|
+
/**
 * Extract tags from a leading YAML front-matter block.
 *
 * Supports the block form (`tags:` followed by `- item` lines) and the inline
 * form (`tags: [a, b, c]`). A leading BOM and CRLF line endings are
 * tolerated, and surrounding quotes are removed from each tag.
 *
 * @param content - Full file content.
 * @returns The tags in document order; empty when there is no front-matter
 *   or no `tags` key.
 */
export function extractFrontmatterTags(content: string): string[] {
  // Normalise so the "---\n" delimiters match Windows-authored files too.
  const text = content.replace(/^\uFEFF/, "").replace(/\r\n?/g, "\n")

  const match = text.match(/^---\n([\s\S]*?)\n---/)
  if (!match) return []
  const fm = match[1]

  // Strip one pair of matching quotes: "auth" / 'auth' → auth.
  const unquote = (tag: string): string =>
    tag.trim().replace(/^(["'])(.*)\1$/, "$2").trim()

  // Block form: `tags:` followed by indented `- item` lines.
  const tagsMatch = fm.match(/^tags:\s*\n((?:\s+-\s+.+\n?)*)/m)
  if (tagsMatch) {
    return tagsMatch[1]
      .split("\n")
      .map((l) => unquote(l.replace(/^\s*-\s*/, "")))
      .filter(Boolean)
  }

  // Inline: tags: [a, b, c]
  const inlineMatch = fm.match(/^tags:\s*\[([^\]]*)\]/m)
  if (inlineMatch) {
    return inlineMatch[1].split(",").map(unquote).filter(Boolean)
  }

  return []
}
|
|
99
|
+
|
|
100
|
+
// ── Public API ──────────────────────────────────────────────────────────────
|
|
101
|
+
|
|
102
|
+
/**
 * Build the file-level metadata record (no per-chunk fields).
 *
 * Modification time and size come from `fs.stat`. When the stat call fails,
 * the current time and the UTF-8 byte length of `content` are used instead.
 * Front-matter tags are extracted only for files classified as "docs".
 *
 * @param filePath - Path of the file on disk.
 * @param content - File content already read by the caller.
 * @returns The metadata shared by all chunks of the file.
 */
export async function extractFileMetadata(
  filePath: string,
  content: string,
): Promise<FileMetadata> {
  // Defaults, overwritten below when the file can be stat-ed.
  const meta = {
    last_modified: new Date().toISOString(),
    file_size: Buffer.byteLength(content, "utf8"),
  }

  try {
    const stats = await fs.stat(filePath)
    meta.last_modified = stats.mtime.toISOString()
    meta.file_size = stats.size
  } catch {
    // use defaults
  }

  const file_type = detectFileType(filePath)
  const language = detectLanguage(filePath)
  const tags = file_type === "docs" ? extractFrontmatterTags(content) : []

  return {
    file_type,
    language,
    last_modified: meta.last_modified,
    file_size: meta.file_size,
    tags,
  }
}
|