@comfanion/usethis_search 0.1.5 → 3.0.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +98 -7
- package/file-indexer.ts +21 -1
- package/index.ts +2 -0
- package/package.json +20 -3
- package/tools/codeindex.ts +135 -16
- package/tools/read-interceptor.ts +54 -0
- package/tools/search.ts +60 -12
- package/vectorizer/analyzers/lsp-analyzer.ts +162 -0
- package/vectorizer/analyzers/regex-analyzer.ts +255 -0
- package/vectorizer/bm25-index.ts +155 -0
- package/vectorizer/chunkers/chunker-factory.ts +98 -0
- package/vectorizer/chunkers/code-chunker.ts +325 -0
- package/vectorizer/chunkers/markdown-chunker.ts +177 -0
- package/vectorizer/content-cleaner.ts +136 -0
- package/vectorizer/graph-builder.ts +95 -0
- package/vectorizer/graph-db.ts +97 -0
- package/vectorizer/hybrid-search.ts +97 -0
- package/vectorizer/index.js +470 -17
- package/vectorizer/metadata-extractor.ts +125 -0
- package/vectorizer/query-cache.ts +126 -0
- package/vectorizer/search-metrics.ts +155 -0
- package/vectorizer.yaml +95 -0
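
The bulk of the change is the rewritten vectorizer in package/vectorizer/index.js (diff below). For orientation, a minimal usage sketch of the new v2/v3 search options, assuming a CodebaseIndexer instance constructed as in that file; the project path, query, and option values here are illustrative and not part of the package:

// Sketch only: the search() signature and option names below come from the
// index.js diff; the indexer construction, paths, and values are illustrative.
const indexer = await new CodebaseIndexer("/path/to/project", "code").init();

const results = await indexer.search("where are auth tokens refreshed", 5, false, {
  hybrid: true,            // merge vector similarity with BM25 (can also be enabled via config)
  bm25_weight: 0.3,        // weight of the normalized BM25 score in the combined score
  fileType: "code",        // v2 metadata filters
  language: "typescript",
  tags: ["auth"],
});

for (const r of results) {
  // v3: up to 3 graph neighbours are attached as r.relatedContext
  console.log(r.file, r._combinedScore ?? r._distance, (r.relatedContext || []).length);
}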
package/vectorizer/index.js
CHANGED
@@ -1,4 +1,5 @@
 // OpenCode Vectorizer - Semantic Code Search with Multi-Index Support
+// v2: Content cleaning, semantic chunking, hybrid search, metadata, cache, metrics
 
 import { pipeline, env } from "@xenova/transformers";
 import * as lancedb from "vectordb";
@@ -6,6 +7,17 @@ import fs from "fs/promises";
 import path from "path";
 import crypto from "crypto";
 
+// ── New modules ─────────────────────────────────────────────────────────────
+import { cleanContent, DEFAULT_CLEANING_CONFIG } from "./content-cleaner.ts";
+import { extractFileMetadata, detectFileType, detectLanguage } from "./metadata-extractor.ts";
+import { chunkContent, DEFAULT_CHUNKING_CONFIG } from "./chunkers/chunker-factory.ts";
+import { BM25Index } from "./bm25-index.ts";
+import { mergeResults, DEFAULT_HYBRID_CONFIG } from "./hybrid-search.ts";
+import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
+import { SearchMetrics } from "./search-metrics.ts";
+import { GraphDB } from "./graph-db.ts";
+import { GraphBuilder } from "./graph-builder.ts";
+
 // Suppress transformers.js logs unless DEBUG is set
 const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
 if (!DEBUG) {
@@ -57,6 +69,13 @@ let GLOBAL_IGNORE = [];
 // Default embedding model (fast). Can be overridden by config.
 let EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2";
 
+// ── Extended config parsed from YAML ────────────────────────────────────────
+let CLEANING_CONFIG = { ...DEFAULT_CLEANING_CONFIG };
+let CHUNKING_CONFIG = { ...DEFAULT_CHUNKING_CONFIG };
+let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
+let METRICS_ENABLED = false;
+let CACHE_ENABLED = true;
+
 function defaultVectorizerYaml() {
   return (
     `vectorizer:\n` +
@@ -64,6 +83,40 @@ function defaultVectorizerYaml() {
     `  auto_index: true\n` +
     `  model: \"${EMBEDDING_MODEL}\"\n` +
     `  debounce_ms: 1000\n` +
+    `\n` +
+    `  # Content cleaning before chunking\n` +
+    `  cleaning:\n` +
+    `    remove_toc: true\n` +
+    `    remove_frontmatter_metadata: false\n` +
+    `    remove_imports: false\n` +
+    `    remove_comments: false\n` +
+    `\n` +
+    `  # Chunking strategy\n` +
+    `  chunking:\n` +
+    `    strategy: \"semantic\" # fixed | semantic\n` +
+    `    markdown:\n` +
+    `      split_by_headings: true\n` +
+    `      min_chunk_size: 200\n` +
+    `      max_chunk_size: 2000\n` +
+    `      preserve_heading_hierarchy: true\n` +
+    `    code:\n` +
+    `      split_by_functions: true\n` +
+    `      include_function_signature: true\n` +
+    `      min_chunk_size: 300\n` +
+    `      max_chunk_size: 1500\n` +
+    `    fixed:\n` +
+    `      max_chars: 1500\n` +
+    `\n` +
+    `  # Hybrid search (vector + BM25)\n` +
+    `  search:\n` +
+    `    hybrid: false\n` +
+    `    bm25_weight: 0.3\n` +
+    `\n` +
+    `  # Quality monitoring\n` +
+    `  quality:\n` +
+    `    enable_metrics: false\n` +
+    `    enable_cache: true\n` +
+    `\n` +
     `  indexes:\n` +
     `    code:\n` +
     `      enabled: true\n` +
@@ -104,8 +157,25 @@ async function ensureDefaultConfig(projectRoot) {
   }
 }
 
+// ── YAML mini-parser helpers ────────────────────────────────────────────────
+
+function parseBool(section, key, fallback) {
+  const m = section.match(new RegExp(`^\\s+${key}:\\s*(true|false)`, "m"));
+  return m ? m[1] === "true" : fallback;
+}
+
+function parseNumber(section, key, fallback) {
+  const m = section.match(new RegExp(`^\\s+${key}:\\s*(\\d+(?:\\.\\d+)?)`, "m"));
+  return m ? parseFloat(m[1]) : fallback;
+}
+
+function parseString(section, key, fallback) {
+  const m = section.match(new RegExp(`^\\s+${key}:\\s*["']?([^"'\\n]+)["']?`, "m"));
+  return m ? m[1].trim() : fallback;
+}
+
 /**
- * Load index configuration from .opencode/vectorizer.yaml
+ * Load index configuration from .opencode/vectorizer.yaml.
  */
 async function loadConfig(projectRoot) {
   try {
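
These helpers run line-anchored regexes over one extracted YAML section at a time rather than using a full YAML parser. A small illustration of the intended behaviour, inside this module's scope; the sample section string is invented for the example:

// Hypothetical "search" section, as captured by the section regexes used further below.
const sample = "    hybrid: true\n    bm25_weight: 0.5\n";

parseBool(sample, "hybrid", false);          // -> true
parseNumber(sample, "bm25_weight", 0.3);     // -> 0.5
parseString(sample, "strategy", "semantic"); // -> "semantic" (key absent, fallback returned)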
@@ -142,6 +212,61 @@ async function loadConfig(projectRoot) {
       if (DEBUG) console.log("[vectorizer] Using model from config:", EMBEDDING_MODEL);
     }
 
+    // ── Parse cleaning config ───────────────────────────────────────────────
+    const cleaningMatch = section.match(/^\s{2}cleaning:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (cleaningMatch) {
+      const cs = cleaningMatch[1];
+      CLEANING_CONFIG = {
+        remove_toc: parseBool(cs, "remove_toc", true),
+        remove_frontmatter_metadata: parseBool(cs, "remove_frontmatter_metadata", false),
+        remove_imports: parseBool(cs, "remove_imports", false),
+        remove_comments: parseBool(cs, "remove_comments", false),
+      };
+    }
+
+    // ── Parse chunking config ───────────────────────────────────────────────
+    const chunkingMatch = section.match(/^\s{2}chunking:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (chunkingMatch) {
+      const cs = chunkingMatch[1];
+      const strategy = parseString(cs, "strategy", "semantic");
+      CHUNKING_CONFIG = {
+        strategy: strategy,
+        markdown: {
+          split_by_headings: parseBool(cs, "split_by_headings", true),
+          min_chunk_size: parseNumber(cs, "min_chunk_size", 200),
+          max_chunk_size: parseNumber(cs, "max_chunk_size", 2000),
+          preserve_heading_hierarchy: parseBool(cs, "preserve_heading_hierarchy", true),
+        },
+        code: {
+          split_by_functions: parseBool(cs, "split_by_functions", true),
+          include_function_signature: parseBool(cs, "include_function_signature", true),
+          min_chunk_size: parseNumber(cs, "min_chunk_size", 300),
+          max_chunk_size: parseNumber(cs, "max_chunk_size", 1500),
+        },
+        fixed: {
+          max_chars: parseNumber(cs, "max_chars", 1500),
+        },
+      };
+    }
+
+    // ── Parse search config ─────────────────────────────────────────────────
+    const searchMatch = section.match(/^\s{2}search:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (searchMatch) {
+      const ss = searchMatch[1];
+      HYBRID_CONFIG = {
+        enabled: parseBool(ss, "hybrid", false),
+        bm25_weight: parseNumber(ss, "bm25_weight", 0.3),
+      };
+    }
+
+    // ── Parse quality config ────────────────────────────────────────────────
+    const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (qualityMatch) {
+      const qs = qualityMatch[1];
+      METRICS_ENABLED = parseBool(qs, "enable_metrics", false);
+      CACHE_ENABLED = parseBool(qs, "enable_cache", true);
+    }
+
     // Parse global exclude
     const excludeMatch = section.match(/^\s{2}exclude:\s*\n((?:\s{4}-\s+.+\n?)*)/m);
     if (excludeMatch) {
@@ -196,12 +321,25 @@ async function loadConfig(projectRoot) {
       }
     }
 
-    if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE });
+    if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE, HYBRID_CONFIG, CHUNKING_CONFIG });
   } catch {
     if (DEBUG) console.log("[vectorizer] Using default presets (config load failed)");
   }
 }
 
+// ── Shared query cache (singleton per process) ─────────────────────────────
+let _queryCache = null;
+function getQueryCache() {
+  if (!_queryCache) _queryCache = new QueryCache(DEFAULT_CACHE_CONFIG);
+  return _queryCache;
+}
+function clearQueryCache() {
+  if (_queryCache) {
+    _queryCache.destroy();
+    _queryCache = null;
+  }
+}
+
 class CodebaseIndexer {
   constructor(projectRoot, indexName = "code") {
     this.root = projectRoot;
@@ -212,6 +350,10 @@ class CodebaseIndexer {
     this.db = null;
     this.hashes = {};
     this.configLoaded = false;
+    this.bm25 = null; // lazy-built BM25 index
+    this.metrics = null; // lazy-loaded SearchMetrics
+    this.graphDB = null; // Graph DB for relationships
+    this.graphBuilder = null; // Graph builder orchestrator
   }
 
   async init() {
@@ -222,22 +364,42 @@ class CodebaseIndexer {
     await fs.mkdir(this.cacheDir, { recursive: true });
     this.db = await lancedb.connect(path.join(this.cacheDir, "lancedb"));
     await this.loadHashes();
+
+    const graphType = this.indexName === "docs" ? "doc_graph" : "code_graph";
+    const graphPath = path.join(this.root, ".opencode", "graph", graphType);
+    await fs.mkdir(path.dirname(graphPath), { recursive: true });
+    this.graphDB = await new GraphDB(graphPath).init();
+    this.graphBuilder = new GraphBuilder(this.graphDB, this.root);
+
     return this;
   }
 
   async loadModel() {
     if (!this.model) {
-
-
-
-
-
+      try {
+        if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
+        this.model = await pipeline("feature-extraction", EMBEDDING_MODEL, {
+          progress_callback: DEBUG ? undefined : null,
+        });
+        if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
+      } catch (error) {
+        this.model = null;
+        throw new Error(`Model loading failed: ${error.message || error}`);
+      }
     }
     return this.model;
   }
 
   async unloadModel() {
     this.model = null;
+    // Release BM25 data held in memory
+    if (this.bm25) {
+      this.bm25.clear();
+      this.bm25 = null;
+    }
+    this._bm25Rows = null;
+    this.metrics = null;
+    clearQueryCache();
     if (global.gc) global.gc();
   }
 
@@ -274,12 +436,28 @@ class CodebaseIndexer {
     return false;
   }
 
+  // ── Embedding (with optional cache) ───────────────────────────────────────
+
   async embed(text) {
     const model = await this.loadModel();
     const result = await model(text, { pooling: "mean", normalize: true });
     return Array.from(result.data);
   }
 
+  async embedQuery(text) {
+    if (CACHE_ENABLED) {
+      const cache = getQueryCache();
+      const cached = cache.get(text);
+      if (cached) return cached;
+      const embedding = await this.embed(text);
+      cache.set(text, embedding);
+      return embedding;
+    }
+    return this.embed(text);
+  }
+
+  // ── Legacy chunker (kept for backward compat / "fixed" strategy) ──────────
+
   chunkCode(content, maxChars = 1500) {
     const chunks = [];
     const lines = content.split("\n");
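
The query cache only fronts query-time embeddings; chunk embeddings during indexing still go through embed() directly. A sketch of the intended effect, assuming CACHE_ENABLED is true and indexer is an initialized CodebaseIndexer:

// Sketch: the second call for identical query text is served from the shared
// in-process QueryCache and does not invoke the transformers pipeline again.
const first = await indexer.embedQuery("connect to lancedb");
const second = await indexer.embedQuery("connect to lancedb"); // cache hit
// first and second hold the same embedding; only the first call ran the model.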
@@ -309,6 +487,8 @@ class CodebaseIndexer {
     return this.hashes[relPath] !== currentHash;
   }
 
+  // ── Index a single file (v2: cleaning + semantic chunking + metadata) ─────
+
   async indexFile(filePath) {
     const relPath = path.relative(this.root, filePath);
 
@@ -324,21 +504,47 @@ class CodebaseIndexer {
       return false;
     }
 
-
+    // Extract metadata
+    const fileMeta = await extractFileMetadata(filePath, content);
     const archived = this.isArchived(relPath, content);
-    const data = [];
 
-
-
+    // Clean content before chunking
+    const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
+
+    // Semantic chunking
+    const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+
+    // v3: Assign chunk IDs for graph tracking
+    const chunksWithIds = this.graphBuilder.assignChunkIds(relPath, chunks);
+
+    // v3: Delete old edges for this file and build new ones
+    await this.graphDB.deleteByFile(relPath);
+    await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
+
+    const data = [];
+    for (let i = 0; i < chunksWithIds.length; i++) {
+      const embedding = await this.embed(chunksWithIds[i].content);
       data.push({
+        chunk_id: chunksWithIds[i].chunk_id,
         file: relPath,
         chunk_index: i,
-        content:
+        content: chunksWithIds[i].content,
        vector: embedding,
        archived: archived,
+        // v2 metadata
+        file_type: fileMeta.file_type,
+        language: fileMeta.language,
+        last_modified: fileMeta.last_modified,
+        file_size: fileMeta.file_size,
+        heading_context: chunksWithIds[i].heading_context || "",
+        function_name: chunksWithIds[i].function_name || "",
+        class_name: chunksWithIds[i].class_name || "",
+        tags: (fileMeta.tags || []).join(","),
       });
     }
 
+    if (data.length === 0) return false;
+
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (tables.includes(tableName)) {
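
Each row written to the "chunks" table now carries the v2 metadata and v3 chunk_id shown above. An illustrative row; every value, including the chunk_id format, is invented for the example:

// Illustrative only: field names match the data.push() call above, values are made up.
const exampleRow = {
  chunk_id: "src/auth.ts#3",        // assigned by GraphBuilder.assignChunkIds (format assumed)
  file: "src/auth.ts",
  chunk_index: 3,
  content: "export async function refreshToken(session) { ... }",
  vector: [/* embedding floats, 384 for the default MiniLM model */],
  archived: false,
  file_type: "code",
  language: "typescript",
  last_modified: "2025-01-15T09:30:00.000Z",
  file_size: 4231,
  heading_context: "",
  function_name: "refreshToken",
  class_name: "",
  tags: "auth,token",
};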
@@ -351,27 +557,244 @@ class CodebaseIndexer {
     this.hashes[relPath] = hash;
     await this.saveHashes();
 
+    // Invalidate BM25 index (needs rebuild) — release memory
+    if (this.bm25) {
+      this.bm25.clear();
+      this.bm25 = null;
+    }
+    this._bm25Rows = null;
+
     return true;
   }
 
-
+  // ── BM25 index management ────────────────────────────────────────────────
+
+  async ensureBM25() {
+    if (this.bm25) return this.bm25;
+
+    const tableName = "chunks";
+    const tables = await this.db.tableNames();
+    if (!tables.includes(tableName)) return null;
+
+    const table = await this.db.openTable(tableName);
+    const allRows = await table.search([0]).limit(100000).execute();
+
+    if (allRows.length === 0) return null;
+
+    // Sort for stable ID mapping between builds
+    allRows.sort((a, b) => {
+      const ka = `${a.file}:${a.chunk_index}`;
+      const kb = `${b.file}:${b.chunk_index}`;
+      return ka.localeCompare(kb);
+    });
+
+    // Release previous data before rebuilding
+    if (this.bm25) this.bm25.clear();
+    this._bm25Rows = null;
+
+    this.bm25 = new BM25Index();
+    this.bm25.build(allRows.map((r) => r.content));
+    this._bm25Rows = allRows;
+
+    return this.bm25;
+  }
+
+  // ── Search (v2: hybrid + metadata filters + metrics) ──────────────────────
+
+  async search(query, limit = 5, includeArchived = false, options = {}) {
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (!tables.includes(tableName)) {
       return [];
     }
 
-    const queryEmbedding = await this.
+    const queryEmbedding = await this.embedQuery(query);
     const table = await this.db.openTable(tableName);
 
-
+    // Only over-fetch when filters or hybrid search are active
+    const hasFilters = !includeArchived || options.fileType || options.language ||
+      options.modifiedAfter || options.modifiedBefore ||
+      (options.tags && options.tags.length > 0);
+    const isHybrid = HYBRID_CONFIG.enabled || options.hybrid;
+    const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit * 3, 50) : limit;
     let results = await table.search(queryEmbedding).limit(fetchLimit).execute();
 
+    // ── Hybrid search ───────────────────────────────────────────────────────
+    if (HYBRID_CONFIG.enabled || options.hybrid) {
+      try {
+        const bm25 = await this.ensureBM25();
+        if (bm25 && this._bm25Rows) {
+          const bm25Results = bm25.search(query, fetchLimit);
+
+          // Build score maps
+          const vectorScores = new Map();
+          for (let i = 0; i < results.length; i++) {
+            const score = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
+            vectorScores.set(i, score);
+          }
+
+          const bm25Scores = new Map();
+          for (const r of bm25Results) {
+            bm25Scores.set(r.id, r.score);
+          }
+
+          // We need a unified ID space. Since vector results and BM25 results
+          // reference different row sets, we use the full table rows for BM25
+          // and merge by file+chunk_index key.
+          const resultMap = new Map();
+
+          for (let i = 0; i < results.length; i++) {
+            const key = `${results[i].file}:${results[i].chunk_index}`;
+            const vs = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
+            resultMap.set(key, { row: results[i], vectorScore: vs, bm25Score: 0 });
+          }
+
+          for (const br of bm25Results) {
+            if (br.id < this._bm25Rows.length) {
+              const bRow = this._bm25Rows[br.id];
+              const key = `${bRow.file}:${bRow.chunk_index}`;
+              if (resultMap.has(key)) {
+                resultMap.get(key).bm25Score = br.score;
+              } else {
+                resultMap.set(key, { row: bRow, vectorScore: 0, bm25Score: br.score });
+              }
+            }
+          }
+
+          // Normalize BM25 scores
+          let maxBM25 = 0;
+          for (const v of resultMap.values()) {
+            if (v.bm25Score > maxBM25) maxBM25 = v.bm25Score;
+          }
+
+          const bw = (options.bm25_weight ?? HYBRID_CONFIG.bm25_weight) || 0.3;
+          const vw = 1 - bw;
+
+          const merged = [];
+          for (const v of resultMap.values()) {
+            const normBM25 = maxBM25 > 0 ? v.bm25Score / maxBM25 : 0;
+            const combined = vw * v.vectorScore + bw * normBM25;
+            merged.push({ ...v.row, _combinedScore: combined, _distance: v.row._distance });
+          }
+
+          merged.sort((a, b) => b._combinedScore - a._combinedScore);
+          results = merged;
+        }
+      } catch (e) {
+        if (DEBUG) console.log("[vectorizer] Hybrid search fallback:", e.message);
+        // Fall through to vector-only results
+      }
+    }
+
+    // ── Metadata filters ──────────────────────────────────────────────────
     if (!includeArchived) {
      results = results.filter((r) => !r.archived);
    }
 
-
+    if (options.fileType) {
+      results = results.filter((r) => r.file_type === options.fileType);
+    }
+
+    if (options.language) {
+      results = results.filter((r) => r.language === options.language);
+    }
+
+    if (options.modifiedAfter) {
+      const after = new Date(options.modifiedAfter).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
+    }
+
+    if (options.modifiedBefore) {
+      const before = new Date(options.modifiedBefore).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
+    }
+
+    if (options.tags && options.tags.length > 0) {
+      results = results.filter((r) => {
+        const rowTags = (r.tags || "").split(",").filter(Boolean);
+        return options.tags.some((t) => rowTags.includes(t));
+      });
+    }
+
+    const finalResults = results.slice(0, limit);
+
+    // ── Metrics tracking ────────────────────────────────────────────────────
+    if (METRICS_ENABLED) {
+      try {
+        if (!this.metrics) {
+          this.metrics = new SearchMetrics(this.root);
+          await this.metrics.load();
+        }
+        const scores = finalResults.map((r) =>
+          r._combinedScore != null
+            ? r._combinedScore
+            : r._distance != null
+              ? 1 - r._distance
+              : 0
+        );
+        this.metrics.recordQuery(query, this.indexName, scores, HYBRID_CONFIG.enabled || !!options.hybrid);
+        await this.metrics.save();
+      } catch {
+        // non-fatal
+      }
+    }
+
+    // ── Graph context expansion (v3) ───────────────────────────────────────
+    if (this.graphDB) {
+      for (const result of finalResults) {
+        if (!result.chunk_id) continue;
+
+        const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
+        const incoming = await this.graphDB.getIncoming(result.chunk_id);
+        const allEdges = [...outgoing, ...incoming];
+
+        const neighbors = [];
+        for (const edge of allEdges) {
+          const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
+          const neighborChunk = await this.findChunkById(neighborId);
+          if (!neighborChunk) continue;
+
+          const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
+          const score = edge.weight * similarity;
+
+          neighbors.push({
+            chunk_id: neighborId,
+            file: neighborChunk.file,
+            content: neighborChunk.content,
+            relation: edge.predicate,
+            score,
+            via: edge.source
+          });
+        }
+
+        neighbors.sort((a, b) => b.score - a.score);
+        result.relatedContext = neighbors.slice(0, 3);
+      }
+    }
+
+    return finalResults;
+  }
+
+  async findChunkById(chunkId) {
+    const tableName = "chunks";
+    const tables = await this.db.tableNames();
+    if (!tables.includes(tableName)) return null;
+
+    const table = await this.db.openTable(tableName);
+    const rows = await table.search([0]).limit(100000).execute();
+    return rows.find(r => r.chunk_id === chunkId) || null;
+  }
+
+  cosineSimilarity(vecA, vecB) {
+    let dotProduct = 0;
+    let normA = 0;
+    let normB = 0;
+    for (let i = 0; i < vecA.length; i++) {
+      dotProduct += vecA[i] * vecB[i];
+      normA += vecA[i] * vecA[i];
+      normB += vecB[i] * vecB[i];
+    }
+    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
   }
 
   async checkHealth(extraIgnore = []) {
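
The merge step converts LanceDB distances to similarities (1 - _distance), scales each BM25 score by the per-query maximum, and mixes the two with the configured weight. A worked example with bm25_weight 0.3; the numbers are invented:

// Worked example of the combined score computed above (illustrative numbers).
const bw = 0.3;                        // bm25_weight
const vw = 1 - bw;                     // vector weight = 0.7
const vectorScore = 1 - 0.25;          // _distance 0.25 -> similarity 0.75
const normBM25 = 4.2 / 6.0;            // raw BM25 4.2, max BM25 for this query 6.0 -> 0.7
const combined = vw * vectorScore + bw * normBM25; // 0.7 * 0.75 + 0.3 * 0.7 = 0.735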
@@ -478,7 +901,14 @@ class CodebaseIndexer {
 
   async indexSingleFile(filePath) {
     const absPath = path.isAbsolute(filePath) ? filePath : path.join(this.root, filePath);
-
+    // Prevent path traversal outside project root
+    const normalized = path.normalize(absPath);
+    const relative = path.relative(this.root, normalized);
+    if (relative.startsWith("..") || path.isAbsolute(relative)) {
+      if (DEBUG) console.log(`[vectorizer] Path traversal blocked: ${filePath}`);
+      return false;
+    }
+    return await this.indexFile(normalized);
   }
 
   async getStats() {
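
The guard works because path.relative resolves both paths and returns a result starting with ".." whenever the normalized target lies outside the project root (on Windows, a target on a different drive comes back as an absolute path, which the isAbsolute check catches). For example, with a project root of /repo; the paths are invented:

import path from "path";

// Inside the project root: no ".." prefix, so the file is indexed.
path.relative("/repo", path.normalize("/repo/src/index.ts"));   // "src/index.ts"

// Escapes the root after normalization: ".." prefix, so indexSingleFile returns false.
path.relative("/repo", path.normalize("/repo/../etc/passwd"));  // "../etc/passwd"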
@@ -500,6 +930,12 @@ class CodebaseIndexer {
       model: EMBEDDING_MODEL,
       fileCount,
       chunkCount,
+      features: {
+        chunking: CHUNKING_CONFIG.strategy,
+        hybrid: HYBRID_CONFIG.enabled,
+        metrics: METRICS_ENABLED,
+        cache: CACHE_ENABLED,
+      },
     };
   }
 
@@ -525,12 +961,19 @@ class CodebaseIndexer {
   async clear() {
     await fs.rm(this.cacheDir, { recursive: true, force: true });
     this.hashes = {};
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this.metrics = null;
     await this.init();
   }
 
   async clearAll() {
     await fs.rm(this.baseDir, { recursive: true, force: true });
     this.hashes = {};
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this.metrics = null;
+    clearQueryCache();
     await this.init();
   }
 
@@ -546,6 +989,16 @@ class CodebaseIndexer {
     } catch {}
     return indexes;
   }
+
+  // ── Metrics access ────────────────────────────────────────────────────────
+
+  async getMetrics() {
+    if (!this.metrics) {
+      this.metrics = new SearchMetrics(this.root);
+      await this.metrics.load();
+    }
+    return this.metrics.getSummary();
+  }
 }
 
 function getEmbeddingModel() {