@winci/local-rag 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +24 -0
- package/.mcp.json +11 -0
- package/LICENSE +21 -0
- package/README.md +567 -0
- package/hooks/hooks.json +25 -0
- package/hooks/scripts/reindex-file.sh +19 -0
- package/hooks/scripts/session-start.sh +11 -0
- package/package.json +52 -0
- package/skills/local-rag/SKILL.md +42 -0
- package/src/cli/commands/analytics.ts +58 -0
- package/src/cli/commands/benchmark.ts +30 -0
- package/src/cli/commands/checkpoint.ts +85 -0
- package/src/cli/commands/conversation.ts +102 -0
- package/src/cli/commands/demo.ts +119 -0
- package/src/cli/commands/eval.ts +31 -0
- package/src/cli/commands/index-cmd.ts +26 -0
- package/src/cli/commands/init.ts +35 -0
- package/src/cli/commands/map.ts +21 -0
- package/src/cli/commands/remove.ts +15 -0
- package/src/cli/commands/search-cmd.ts +59 -0
- package/src/cli/commands/serve.ts +5 -0
- package/src/cli/commands/status.ts +13 -0
- package/src/cli/index.ts +117 -0
- package/src/cli/progress.ts +21 -0
- package/src/cli/setup.ts +192 -0
- package/src/config/index.ts +101 -0
- package/src/conversation/indexer.ts +147 -0
- package/src/conversation/parser.ts +323 -0
- package/src/db/analytics.ts +116 -0
- package/src/db/annotations.ts +161 -0
- package/src/db/checkpoints.ts +166 -0
- package/src/db/conversation.ts +241 -0
- package/src/db/files.ts +146 -0
- package/src/db/graph.ts +250 -0
- package/src/db/index.ts +468 -0
- package/src/db/search.ts +244 -0
- package/src/db/types.ts +85 -0
- package/src/embeddings/embed.ts +73 -0
- package/src/graph/resolver.ts +305 -0
- package/src/indexing/chunker.ts +523 -0
- package/src/indexing/indexer.ts +263 -0
- package/src/indexing/parse.ts +99 -0
- package/src/indexing/watcher.ts +84 -0
- package/src/main.ts +8 -0
- package/src/search/benchmark.ts +139 -0
- package/src/search/eval.ts +171 -0
- package/src/search/hybrid.ts +194 -0
- package/src/search/reranker.ts +99 -0
- package/src/search/usages.ts +27 -0
- package/src/server/index.ts +126 -0
- package/src/tools/analytics-tools.ts +58 -0
- package/src/tools/annotation-tools.ts +89 -0
- package/src/tools/checkpoint-tools.ts +147 -0
- package/src/tools/conversation-tools.ts +86 -0
- package/src/tools/git-tools.ts +103 -0
- package/src/tools/graph-tools.ts +163 -0
- package/src/tools/index-tools.ts +91 -0
- package/src/tools/index.ts +33 -0
- package/src/tools/search.ts +238 -0
- package/src/types.ts +9 -0
- package/src/utils/log.ts +39 -0
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
import { relative } from "path";
|
|
2
|
+
import { createHash } from "crypto";
|
|
3
|
+
import { readFile } from "fs/promises";
|
|
4
|
+
import { Glob } from "bun";
|
|
5
|
+
import { parseFile } from "./parse";
|
|
6
|
+
import { embedBatch } from "../embeddings/embed";
|
|
7
|
+
import { chunkText, KNOWN_EXTENSIONS, type ChunkImport, type ChunkExport } from "./chunker";
|
|
8
|
+
import { RagDB } from "../db";
|
|
9
|
+
import { type RagConfig } from "../config";
|
|
10
|
+
import { resolveImports } from "../graph/resolver";
|
|
11
|
+
import { log } from "../utils/log";
|
|
12
|
+
import { type EmbeddedChunk } from "../types";
|
|
13
|
+
|
|
14
|
+
function aggregateGraphData(chunks: { imports?: ChunkImport[]; exports?: ChunkExport[] }[]): {
|
|
15
|
+
imports: { name: string; source: string }[];
|
|
16
|
+
exports: { name: string; type: string }[];
|
|
17
|
+
} {
|
|
18
|
+
const importMap = new Map<string, string>();
|
|
19
|
+
const exportMap = new Map<string, string>();
|
|
20
|
+
|
|
21
|
+
for (const chunk of chunks) {
|
|
22
|
+
if (chunk.imports) {
|
|
23
|
+
for (const imp of chunk.imports) {
|
|
24
|
+
if (!importMap.has(imp.source)) {
|
|
25
|
+
importMap.set(imp.source, imp.name);
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
if (chunk.exports) {
|
|
30
|
+
for (const exp of chunk.exports) {
|
|
31
|
+
if (!exportMap.has(exp.name)) {
|
|
32
|
+
exportMap.set(exp.name, exp.type);
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
return {
|
|
39
|
+
imports: Array.from(importMap, ([source, name]) => ({ name, source })),
|
|
40
|
+
exports: Array.from(exportMap, ([name, type]) => ({ name, type })),
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/** Outcome counters for an indexing run over a directory. */
export interface IndexResult {
  /** Files whose chunks were (re-)embedded and written to the DB. */
  indexed: number;
  /** Files left untouched: unchanged hash, unsupported extension, or empty content. */
  skipped: number;
  /** Files dropped from the index because they no longer exist on disk. */
  pruned: number;
  /** Human-readable messages for files that threw during processing. */
  errors: string[];
}
|
|
50
|
+
|
|
51
|
+
async function fileHash(filePath: string): Promise<string> {
|
|
52
|
+
const content = await readFile(filePath);
|
|
53
|
+
return createHash("sha256").update(content).digest("hex");
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
function matchesAny(filePath: string, globs: Glob[]): boolean {
|
|
57
|
+
return globs.some((g) => g.match(filePath));
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
async function collectFiles(
|
|
61
|
+
directory: string,
|
|
62
|
+
config: RagConfig,
|
|
63
|
+
onWarning?: (msg: string) => void
|
|
64
|
+
): Promise<string[]> {
|
|
65
|
+
const excludeGlobs = config.exclude.map((pat) => new Glob(pat));
|
|
66
|
+
|
|
67
|
+
async function scanPattern(pattern: string): Promise<string[]> {
|
|
68
|
+
const files: string[] = [];
|
|
69
|
+
const glob = new Glob(pattern);
|
|
70
|
+
try {
|
|
71
|
+
for await (const file of glob.scan({ cwd: directory, absolute: true })) {
|
|
72
|
+
const rel = relative(directory, file);
|
|
73
|
+
if (!matchesAny(rel, excludeGlobs)) {
|
|
74
|
+
files.push(file);
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
} catch (err: any) {
|
|
78
|
+
if (err.code === "EPERM" || err.code === "EACCES") {
|
|
79
|
+
onWarning?.(`Skipping inaccessible path (${err.code}): ${err.path ?? pattern}`);
|
|
80
|
+
} else {
|
|
81
|
+
throw err;
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
return files;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
const results = await Promise.all(config.include.map(scanPattern));
|
|
88
|
+
return [...new Set(results.flat())];
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
/** Options shared by the single-file and directory indexing pipelines. */
interface ProcessFileOptions {
  config: RagConfig;
  /** Base directory for relative path display */
  baseDir?: string;
  /** Progress callback; `transient: true` marks messages safe to overwrite in place. */
  onProgress?: (msg: string, opts?: { transient?: boolean }) => void;
  /** Checked between embedding/DB batches for cooperative cancellation. */
  signal?: AbortSignal;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Shared file processing pipeline: hash → parse → chunk → embed → write to DB.
 * Streams DB writes alongside embedding to cap memory at one batch (~50 chunks)
 * instead of buffering all embeddings.
 *
 * Returns "indexed" when chunks were written, "skipped" when the file was
 * unchanged, empty, unsupported, or the signal aborted first.
 * NOTE(review): an abort mid-stream returns "skipped" but leaves the upserted
 * file row and any already-flushed chunk batches in the DB — confirm intended.
 */
async function processFile(
  filePath: string,
  db: RagDB,
  opts: ProcessFileOptions
): Promise<"indexed" | "skipped"> {
  const { config, baseDir, onProgress, signal } = opts;
  const batchSize = config.indexBatchSize ?? 50;

  // Cheap change detection: identical content hash means nothing to do.
  const hash = await fileHash(filePath);
  const existing = db.getFileByPath(filePath);

  if (existing && existing.hash === hash) {
    return "skipped";
  }

  const relPath = baseDir ? relative(baseDir, filePath) : filePath;
  onProgress?.(`Indexing ${relPath}`);

  const parsed = await parseFile(filePath);

  if (!KNOWN_EXTENSIONS.has(parsed.extension)) {
    onProgress?.(`Skipped (unsupported extension "${parsed.extension}"): ${relPath}`);
    return "skipped";
  }

  if (!parsed.content.trim()) {
    return "skipped";
  }

  const chunks = await chunkText(
    parsed.content,
    parsed.extension,
    config.chunkSize,
    config.chunkOverlap,
    filePath
  );

  if (chunks.length > 10000) {
    log.warn(`Large file: ${relPath} produced ${chunks.length} chunks`, "indexer");
  }

  // Stream: embed each batch and write to DB immediately (caps memory at one batch)
  const DB_BATCH = 500;
  const fileId = db.upsertFileStart(filePath, hash);
  let chunkOffset = 0;
  let pendingDbChunks: EmbeddedChunk[] = [];

  for (let i = 0; i < chunks.length; i += batchSize) {
    if (signal?.aborted) break;

    const batch = chunks.slice(i, i + batchSize);
    const embeddings = await embedBatch(batch.map(c => c.text), config.indexThreads);

    for (let j = 0; j < batch.length; j++) {
      const chunk = batch[j];
      // The chunk's first export is treated as its primary entity.
      const primaryExport = chunk.exports?.[0];
      pendingDbChunks.push({
        snippet: chunk.text,
        embedding: embeddings[j],
        entityName: primaryExport?.name ?? null,
        chunkType: primaryExport?.type ?? null,
        startLine: chunk.startLine ?? null,
        endLine: chunk.endLine ?? null,
      });
    }

    // Flush to DB when we hit DB_BATCH size or on last iteration
    if (pendingDbChunks.length >= DB_BATCH || i + batchSize >= chunks.length) {
      if (signal?.aborted) break;
      db.insertChunkBatch(fileId, pendingDbChunks, chunkOffset);
      onProgress?.(`Writing ${Math.min(chunkOffset + pendingDbChunks.length, chunks.length)}/${chunks.length} chunks for ${relPath}`, { transient: true });
      chunkOffset += pendingDbChunks.length;
      pendingDbChunks = [];
      // Yield to the event loop between flushes so other work can run.
      await Bun.sleep(0);
    }
  }

  if (signal?.aborted) return "skipped";

  // Store graph metadata
  const graphData = aggregateGraphData(chunks);
  db.upsertFileGraph(fileId, graphData.imports, graphData.exports);

  onProgress?.(`Indexed: ${relPath} (${chunks.length} chunks)`);
  return "indexed";
}
|
|
190
|
+
|
|
191
|
+
/**
|
|
192
|
+
* Index a single file. Returns true if the file was re-indexed, false if skipped.
|
|
193
|
+
*/
|
|
194
|
+
export async function indexFile(
|
|
195
|
+
filePath: string,
|
|
196
|
+
db: RagDB,
|
|
197
|
+
config: RagConfig
|
|
198
|
+
): Promise<"indexed" | "skipped" | "error"> {
|
|
199
|
+
try {
|
|
200
|
+
return await processFile(filePath, db, { config });
|
|
201
|
+
} catch (err) {
|
|
202
|
+
log.warn(`Failed to index ${filePath}: ${err instanceof Error ? err.message : err}`, "indexFile");
|
|
203
|
+
return "error";
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
/**
 * Index every file under `directory` matching the config's include globs:
 * re-embed changed files, prune DB rows for deleted files, then resolve
 * cross-file import edges. Honors `signal` for cooperative cancellation —
 * an abort returns the partial counters accumulated so far.
 */
export async function indexDirectory(
  directory: string,
  db: RagDB,
  config: RagConfig,
  onProgress?: (msg: string, opts?: { transient?: boolean }) => void,
  signal?: AbortSignal
): Promise<IndexResult> {
  const result: IndexResult = { indexed: 0, skipped: 0, pruned: 0, errors: [] };

  if (signal?.aborted) return result;

  // collectFiles surfaces inaccessible-path warnings through onProgress.
  const matchedFiles = await collectFiles(directory, config, onProgress);

  onProgress?.(`Found ${matchedFiles.length} files to index`);

  for (const filePath of matchedFiles) {
    if (signal?.aborted) break;

    try {
      const status = await processFile(filePath, db, {
        config,
        baseDir: directory,
        onProgress,
        signal,
      });

      if (status === "indexed") {
        result.indexed++;
      } else {
        result.skipped++;
      }
    } catch (err) {
      // Per-file failures are collected, not fatal for the run.
      const msg = `Error indexing ${filePath}: ${err instanceof Error ? err.message : err}`;
      result.errors.push(msg);
      onProgress?.(msg);
    }
  }

  if (signal?.aborted) return result;

  // Prune files that no longer exist
  const existingPaths = new Set(matchedFiles);
  result.pruned = db.pruneDeleted(existingPaths);
  if (result.pruned > 0) {
    onProgress?.(`Pruned ${result.pruned} deleted files from index`);
  }

  // Resolve import paths across all files
  if (result.indexed > 0) {
    const resolved = resolveImports(db, directory);
    if (resolved > 0) {
      onProgress?.(`Resolved ${resolved} import paths`);
    }
  }

  return result;
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import matter from "gray-matter";
import { readFile } from "fs/promises";
import { basename, extname } from "path";
|
|
4
|
+
|
|
5
|
+
/** Result of reading and pre-processing a single file for chunking. */
export interface ParsedFile {
  /** Path exactly as passed to parseFile. */
  path: string;
  /** Trimmed file text; for markdown with frontmatter, a blend of selected frontmatter fields and the body. */
  content: string;
  /** Frontmatter parsed by gray-matter for markdown files, or null when absent. */
  frontmatter: Record<string, unknown> | null;
  /** Lowercased extension, possibly virtual (e.g. ".dockerfile") for well-known extensionless files. */
  extension: string;
}
|
|
11
|
+
|
|
12
|
+
const MARKDOWN_EXTENSIONS = new Set([".md", ".mdx", ".markdown"]);
|
|
13
|
+
|
|
14
|
+
// Files with no real extension matched by exact basename.
|
|
15
|
+
const EXACT_NAME_MAP = new Map<string, string>([
|
|
16
|
+
["makefile", ".makefile"],
|
|
17
|
+
["gnumakefile", ".makefile"],
|
|
18
|
+
["vagrantfile", ".vagrantfile"],
|
|
19
|
+
["gemfile", ".gemfile"],
|
|
20
|
+
["rakefile", ".rakefile"],
|
|
21
|
+
["brewfile", ".brewfile"],
|
|
22
|
+
["procfile", ".procfile"],
|
|
23
|
+
]);
|
|
24
|
+
|
|
25
|
+
// Files whose basename starts with a known prefix (e.g. Dockerfile.dev,
|
|
26
|
+
// Jenkinsfile.staging). Each entry is [lowerPrefix, virtualExtension].
|
|
27
|
+
const PREFIX_NAME_MAP: [string, string][] = [
|
|
28
|
+
["dockerfile", ".dockerfile"],
|
|
29
|
+
["jenkinsfile", ".jenkinsfile"],
|
|
30
|
+
];
|
|
31
|
+
|
|
32
|
+
function resolveExtension(rawExt: string, basename: string): string {
|
|
33
|
+
if (rawExt) {
|
|
34
|
+
// Even with an extension, check prefixes: Dockerfile.dev should win.
|
|
35
|
+
for (const [prefix, virtualExt] of PREFIX_NAME_MAP) {
|
|
36
|
+
if (basename === prefix || basename.startsWith(prefix + ".")) {
|
|
37
|
+
return virtualExt;
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
return rawExt;
|
|
41
|
+
}
|
|
42
|
+
// No extension — check exact names first, then prefixes.
|
|
43
|
+
if (EXACT_NAME_MAP.has(basename)) return EXACT_NAME_MAP.get(basename)!;
|
|
44
|
+
for (const [prefix, virtualExt] of PREFIX_NAME_MAP) {
|
|
45
|
+
if (basename === prefix || basename.startsWith(prefix + ".")) {
|
|
46
|
+
return virtualExt;
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
return rawExt;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
export async function parseFile(filePath: string): Promise<ParsedFile> {
|
|
53
|
+
const raw = await readFile(filePath, "utf-8");
|
|
54
|
+
const rawExt = extname(filePath).toLowerCase();
|
|
55
|
+
const basename = filePath.split("/").pop()?.toLowerCase() ?? "";
|
|
56
|
+
const ext = resolveExtension(rawExt, basename);
|
|
57
|
+
|
|
58
|
+
if (MARKDOWN_EXTENSIONS.has(ext)) {
|
|
59
|
+
const { data, content } = matter(raw);
|
|
60
|
+
const hasFrontmatter = Object.keys(data).length > 0;
|
|
61
|
+
return {
|
|
62
|
+
path: filePath,
|
|
63
|
+
content: hasFrontmatter
|
|
64
|
+
? buildWeightedText(data, content)
|
|
65
|
+
: content.trim(),
|
|
66
|
+
frontmatter: hasFrontmatter ? data : null,
|
|
67
|
+
extension: ext,
|
|
68
|
+
};
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
return {
|
|
72
|
+
path: filePath,
|
|
73
|
+
content: raw.trim(),
|
|
74
|
+
frontmatter: null,
|
|
75
|
+
extension: ext,
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
function buildWeightedText(
|
|
80
|
+
frontmatter: Record<string, unknown>,
|
|
81
|
+
body: string
|
|
82
|
+
): string {
|
|
83
|
+
const parts: string[] = [];
|
|
84
|
+
|
|
85
|
+
if (frontmatter.name) parts.push(`${frontmatter.name}`);
|
|
86
|
+
if (frontmatter.description)
|
|
87
|
+
parts.push(`description: ${frontmatter.description}`);
|
|
88
|
+
if (frontmatter.type) parts.push(`type: ${frontmatter.type}`);
|
|
89
|
+
if (frontmatter.tags) {
|
|
90
|
+
const tags = Array.isArray(frontmatter.tags)
|
|
91
|
+
? frontmatter.tags.join(", ")
|
|
92
|
+
: frontmatter.tags;
|
|
93
|
+
parts.push(`tags: ${tags}`);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
if (body.trim()) parts.push(body.trim());
|
|
97
|
+
|
|
98
|
+
return parts.join("\n");
|
|
99
|
+
}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { watch } from "fs";
|
|
2
|
+
import { resolve, relative } from "path";
|
|
3
|
+
import { existsSync } from "fs";
|
|
4
|
+
import { Glob } from "bun";
|
|
5
|
+
import { indexFile } from "./indexer";
|
|
6
|
+
import { type RagConfig } from "../config";
|
|
7
|
+
import { type RagDB } from "../db";
|
|
8
|
+
import { resolveImportsForFile, buildPathToIdMap, buildIdToPathMap } from "../graph/resolver";
|
|
9
|
+
|
|
10
|
+
// Quiet period after the last fs event for a path before it is re-indexed,
// coalescing rapid event bursts into a single re-index.
const DEBOUNCE_MS = 2000;

/** Handle returned by startWatcher; close() cancels pending work and stops watching. */
export interface Watcher {
  close(): void;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Watch `directory` recursively and keep the index in sync: debounce fs
 * events per file, re-index changed files, remove deleted ones, and refresh
 * import-graph edges for the changed file and everything that imports it.
 */
export function startWatcher(
  directory: string,
  db: RagDB,
  config: RagConfig,
  onEvent?: (msg: string) => void
): Watcher {
  // Per-path debounce timers: only the last event within DEBOUNCE_MS fires.
  const pending = new Map<string, NodeJS.Timeout>();

  // Pre-compile globs once instead of per-event
  const excludeGlobs = config.exclude.map((pat) => new Glob(pat));
  const includeGlobs = config.include.map((pat) => new Glob(pat));

  function matchesAny(filePath: string, globs: Glob[]): boolean {
    return globs.some((g) => g.match(filePath));
  }

  const fsWatcher = watch(directory, { recursive: true }, (_event, filename) => {
    if (!filename) return;

    const rel = filename.toString();

    // Exclusions win over inclusions.
    if (matchesAny(rel, excludeGlobs)) return;
    if (!matchesAny(rel, includeGlobs)) return;

    const absPath = resolve(directory, rel);

    // Reset this path's debounce window.
    const existing = pending.get(absPath);
    if (existing) clearTimeout(existing);

    pending.set(
      absPath,
      setTimeout(async () => {
        pending.delete(absPath);

        // Path gone by the time the timer fired → file was deleted.
        if (!existsSync(absPath)) {
          const removed = db.removeFile(absPath);
          if (removed) {
            onEvent?.(`Removed deleted file: ${rel}`);
          }
          return;
        }

        const result = await indexFile(absPath, db, config);
        if (result === "indexed") {
          const file = db.getFileByPath(absPath);
          if (file) {
            // Build lookups once and reuse for all resolve calls
            const pathToId = buildPathToIdMap(db);
            const idToPath = buildIdToPathMap(pathToId);
            resolveImportsForFile(db, file.id, directory, pathToId, idToPath);
            // Files importing this one may have stale edges — refresh them too.
            for (const importerId of db.getImportersOf(file.id)) {
              resolveImportsForFile(db, importerId, directory, pathToId, idToPath);
            }
          }
          onEvent?.(`Re-indexed: ${rel}`);
        }
      }, DEBOUNCE_MS)
    );
  });

  onEvent?.(`Watching ${directory} for changes`);
  return {
    close() {
      // Cancel queued re-index work before stopping the fs watcher.
      for (const timer of pending.values()) clearTimeout(timer);
      pending.clear();
      fsWatcher.close();
    },
  };
}
|
package/src/search/benchmark.ts
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import { readFile } from "fs/promises";
|
|
2
|
+
import { resolve } from "path";
|
|
3
|
+
import { RagDB } from "../db";
|
|
4
|
+
import { search, type DedupedResult } from "./hybrid";
|
|
5
|
+
import { loadConfig } from "../config";
|
|
6
|
+
|
|
7
|
+
/** A single benchmark case: a search query and the files it should surface. */
export interface BenchmarkQuery {
  query: string;
  expected: string[]; // file paths (relative or absolute)
}

/** Per-query outcome of a benchmark run. */
export interface BenchmarkResult {
  query: string;
  expected: string[];
  /** Top-K search hits, in rank order. */
  results: { path: string; score: number }[];
  recall: number; // fraction of expected files found in top-K
  reciprocalRank: number; // 1/rank of first expected file (0 if none found)
  hit: boolean; // at least one expected file found
}

/** Aggregate metrics across all benchmark queries. */
export interface BenchmarkSummary {
  total: number;
  recallAtK: number; // average recall across queries
  mrr: number; // mean reciprocal rank
  zeroMissRate: number; // fraction of queries that missed all expected files
  results: BenchmarkResult[];
}
|
|
28
|
+
|
|
29
|
+
export async function loadBenchmarkQueries(path: string): Promise<BenchmarkQuery[]> {
|
|
30
|
+
const raw = await readFile(path, "utf-8");
|
|
31
|
+
const parsed = JSON.parse(raw);
|
|
32
|
+
|
|
33
|
+
if (!Array.isArray(parsed)) {
|
|
34
|
+
throw new Error("Benchmark file must be a JSON array of { query, expected } objects");
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
for (const entry of parsed) {
|
|
38
|
+
if (!entry.query || !Array.isArray(entry.expected) || entry.expected.length === 0) {
|
|
39
|
+
throw new Error(`Invalid benchmark entry: ${JSON.stringify(entry)}. Each entry needs "query" (string) and "expected" (string[])`);
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
return parsed;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
function normalizePath(p: string, projectDir: string): string {
|
|
47
|
+
// If already absolute, return as-is; otherwise resolve relative to project
|
|
48
|
+
if (p.startsWith("/")) return p;
|
|
49
|
+
return resolve(projectDir, p);
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/**
 * Run hybrid search for each benchmark query and score it with recall@K,
 * reciprocal rank, and a per-query hit flag.
 *
 * Expected and result paths are matched leniently: exact equality, or either
 * path being a suffix of the other, so relative expectations can match
 * absolute search results.
 */
export async function runBenchmark(
  queries: BenchmarkQuery[],
  db: RagDB,
  projectDir: string,
  topK: number = 5,
  hybridWeight?: number
): Promise<BenchmarkSummary> {
  const config = await loadConfig(projectDir);
  // Explicit argument overrides the project config's hybridWeight.
  const weight = hybridWeight ?? config.hybridWeight;

  const results: BenchmarkResult[] = [];

  for (const q of queries) {
    const searchResults = await search(q.query, db, topK, 0, weight);

    const resultPaths = searchResults.map((r) => r.path);
    const expectedNormalized = q.expected.map((p) => normalizePath(p, projectDir));

    // Recall: fraction of expected files found in results
    const found = expectedNormalized.filter((e) =>
      resultPaths.some((r) => r === e || r.endsWith(e) || e.endsWith(r))
    );
    const recall = found.length / expectedNormalized.length;

    // Reciprocal rank: 1/rank of first expected file in results
    let reciprocalRank = 0;
    for (let i = 0; i < resultPaths.length; i++) {
      const matchesExpected = expectedNormalized.some(
        (e) => resultPaths[i] === e || resultPaths[i].endsWith(e) || e.endsWith(resultPaths[i])
      );
      if (matchesExpected) {
        reciprocalRank = 1 / (i + 1);
        break;
      }
    }

    results.push({
      query: q.query,
      expected: q.expected,
      results: searchResults.map((r) => ({ path: r.path, score: r.score })),
      recall,
      reciprocalRank,
      hit: found.length > 0,
    });
  }

  // Aggregate over all queries; zero queries yields all-zero metrics.
  const total = results.length;
  const recallAtK = total > 0 ? results.reduce((s, r) => s + r.recall, 0) / total : 0;
  const mrr = total > 0 ? results.reduce((s, r) => s + r.reciprocalRank, 0) / total : 0;
  const misses = results.filter((r) => !r.hit).length;
  const zeroMissRate = total > 0 ? misses / total : 0;

  return { total, recallAtK, mrr, zeroMissRate, results };
}
|
|
106
|
+
|
|
107
|
+
export function formatBenchmarkReport(summary: BenchmarkSummary, topK: number = 5): string {
|
|
108
|
+
const lines: string[] = [];
|
|
109
|
+
|
|
110
|
+
lines.push(`Benchmark results (${summary.total} queries, top-${topK}):`);
|
|
111
|
+
lines.push(` Recall@${topK}: ${(summary.recallAtK * 100).toFixed(1)}%`);
|
|
112
|
+
lines.push(` MRR: ${summary.mrr.toFixed(3)}`);
|
|
113
|
+
lines.push(` Zero-miss rate: ${(summary.zeroMissRate * 100).toFixed(1)}% (${summary.results.filter((r) => !r.hit).length} queries)`);
|
|
114
|
+
|
|
115
|
+
// Show failures
|
|
116
|
+
const failures = summary.results.filter((r) => !r.hit);
|
|
117
|
+
if (failures.length > 0) {
|
|
118
|
+
lines.push("\nMissed queries (no expected file in results):");
|
|
119
|
+
for (const f of failures) {
|
|
120
|
+
lines.push(` "${f.query}"`);
|
|
121
|
+
lines.push(` expected: ${f.expected.join(", ")}`);
|
|
122
|
+
const got = f.results.length > 0
|
|
123
|
+
? f.results.map((r) => r.path).join(", ")
|
|
124
|
+
: "(no results)";
|
|
125
|
+
lines.push(` got: ${got}`);
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// Show partial hits (recall < 1 but > 0)
|
|
130
|
+
const partials = summary.results.filter((r) => r.hit && r.recall < 1);
|
|
131
|
+
if (partials.length > 0) {
|
|
132
|
+
lines.push("\nPartial matches (some expected files missing):");
|
|
133
|
+
for (const p of partials) {
|
|
134
|
+
lines.push(` "${p.query}" — recall: ${(p.recall * 100).toFixed(0)}%`);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
return lines.join("\n");
|
|
139
|
+
}
|