@hsingjui/contextweaver 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1496 @@
1
+ import {
2
+ getIndexer,
3
+ getVectorStore
4
+ } from "./chunk-6C2D5Y4R.js";
5
+ import {
6
+ initDb,
7
+ isChunksFtsInitialized,
8
+ isDebugEnabled,
9
+ isFtsInitialized,
10
+ logger,
11
+ searchChunksFts,
12
+ searchFilesFts,
13
+ segmentQuery
14
+ } from "./chunk-5SRSUMKW.js";
15
+ import {
16
+ getEmbeddingConfig,
17
+ getRerankerConfig
18
+ } from "./chunk-PN7DP6XL.js";
19
+
20
// src/api/reranker.ts
var RerankerClient = class {
  // Reranker settings: baseUrl, apiKey, model, topN.
  config;
  /**
   * @param {object} [config] - Reranker configuration; falls back to the
   *   globally configured settings when omitted.
   */
  constructor(config) {
    this.config = config || getRerankerConfig();
  }
  /**
   * Rerank documents against a query via the configured rerank HTTP API.
   *
   * Retries up to `retries` times with linear backoff; backoff is longer
   * when the failure looks like rate limiting (message contains "429" or
   * "rate").
   *
   * @param {string} query - Query text.
   * @param {string[]} documents - Candidate document texts.
   * @param {object} [options] - { topN, maxChunksPerDoc, chunkOverlap, retries }.
   * @returns {Promise<Array<{originalIndex:number, score:number, text:string}>>}
   *   Results sorted by the API, referencing `documents` by index.
   * @throws {Error} When the API keeps failing after all retries.
   */
  async rerank(query, documents, options = {}) {
    if (documents.length === 0) {
      return [];
    }
    const { topN = this.config.topN, maxChunksPerDoc, chunkOverlap, retries = 3 } = options;
    const requestBody = {
      model: this.config.model,
      query,
      documents,
      top_n: Math.min(topN, documents.length),
      return_documents: false
      // original documents are not needed in the response; saves bandwidth
    };
    if (maxChunksPerDoc !== void 0) {
      requestBody.max_chunks_per_doc = maxChunksPerDoc;
    }
    if (chunkOverlap !== void 0) {
      requestBody.overlap = chunkOverlap;
    }
    for (let attempt = 1; attempt <= retries; attempt++) {
      try {
        const response = await fetch(this.config.baseUrl, {
          method: "POST",
          headers: {
            "Content-Type": "application/json",
            Authorization: `Bearer ${this.config.apiKey}`
          },
          body: JSON.stringify(requestBody)
        });
        // FIX: parse the body defensively. Error responses (e.g. HTML
        // gateway pages) are not guaranteed to be JSON; previously
        // `response.json()` threw here and masked the real HTTP status.
        let data = null;
        try {
          data = await response.json();
        } catch {
          data = null;
        }
        if (!response.ok || data?.error) {
          const errorMsg = data?.error?.message || `HTTP ${response.status}`;
          throw new Error(`Rerank API \u9519\u8BEF: ${errorMsg}`);
        }
        // FIX: a 2xx response without a `results` array previously crashed
        // with an opaque TypeError instead of a descriptive error.
        if (!data || !Array.isArray(data.results)) {
          throw new Error(`Rerank API \u9519\u8BEF: invalid response shape`);
        }
        const results = data.results.map((item) => ({
          originalIndex: item.index,
          score: item.relevance_score,
          text: documents[item.index]
        }));
        logger.debug(
          {
            query: query.slice(0, 50),
            inputCount: documents.length,
            outputCount: results.length
          },
          "Rerank \u5B8C\u6210"
        );
        return results;
      } catch (err) {
        const isRateLimited = err.message?.includes("429") || err.message?.includes("rate");
        if (attempt < retries) {
          const delay = isRateLimited ? 1e3 * attempt : 500 * attempt;
          logger.warn(
            { attempt, maxRetries: retries, delay, error: err.message },
            "Rerank \u8BF7\u6C42\u5931\u8D25\uFF0C\u51C6\u5907\u91CD\u8BD5"
          );
          await sleep(delay);
        } else {
          logger.error({ error: err.message, stack: err.stack, query: query.slice(0, 50) }, "Rerank \u8BF7\u6C42\u6700\u7EC8\u5931\u8D25");
          throw err;
        }
      }
    }
    throw new Error("Rerank \u5904\u7406\u5F02\u5E38");
  }
  /**
   * Rerank items that carry metadata, attaching each original item to its
   * result as `data`.
   *
   * @param {string} query - Query text.
   * @param {Array} items - Arbitrary document items.
   * @param {Function} textExtractor - Maps an item to the text to rank.
   * @param {object} [options] - Same options as {@link rerank}.
   */
  async rerankWithData(query, items, textExtractor, options = {}) {
    if (items.length === 0) {
      return [];
    }
    // FIX: wrap the extractor so Array#map's extra (index, array) arguments
    // do not leak into callers' extractor functions.
    const texts = items.map((item) => textExtractor(item));
    const results = await this.rerank(query, texts, options);
    return results.map((result) => ({
      ...result,
      data: items[result.originalIndex]
    }));
  }
  /**
   * Return a shallow copy of the active configuration.
   */
  getConfig() {
    return { ...this.config };
  }
};
122
// Lazily constructed module-level singleton client.
var defaultClient = null;
/**
 * Return the shared RerankerClient, creating it on first use.
 * @returns {RerankerClient}
 */
function getRerankerClient() {
  defaultClient ??= new RerankerClient();
  return defaultClient;
}
129
/**
 * Resolve after the given delay.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
132
+
133
// src/search/ContextPacker.ts
var ContextPacker = class {
  // Project identifier used to open the per-project database.
  projectId;
  // Packing configuration: maxSegmentsPerFile, maxTotalChars.
  config;
  constructor(projectId, config) {
    this.projectId = projectId;
    this.config = config;
  }
  /**
   * Pack: merge overlapping chunks -> aggregate segments per file -> trim
   * to the character budget (config.maxTotalChars).
   *
   * Files are processed in descending order of their best chunk score so
   * the budget is spent on the most relevant files first.
   */
  async pack(chunks) {
    if (chunks.length === 0) return [];
    const byFile = this.groupByFile(chunks);
    const db = initDb(this.projectId);
    const result = [];
    let totalChars = 0;
    const sortedFiles = Object.entries(byFile).map(([filePath, fileChunks]) => ({
      filePath,
      chunks: fileChunks,
      maxScore: Math.max(...fileChunks.map((c) => c.score))
    })).sort((a, b) => b.maxScore - a.maxScore);
    // Fetch all file contents in a single IN(...) query to avoid per-file
    // round trips.
    const allFilePaths = sortedFiles.map((f) => f.filePath);
    const placeholders = allFilePaths.map(() => "?").join(",");
    const rows = db.prepare(
      `SELECT path, content FROM files WHERE path IN (${placeholders})`
    ).all(...allFilePaths);
    const contentMap = new Map(rows.map((r) => [r.path, r.content]));
    for (const { filePath, chunks: fileChunks } of sortedFiles) {
      const content = contentMap.get(filePath);
      // NOTE(review): falsy check also skips files whose stored content is
      // an empty string, not just missing rows.
      if (!content) continue;
      const segments = this.mergeAndSlice(fileChunks, content);
      // Keep the best-scoring segments per file, then restore source order
      // so the packed text reads top-to-bottom.
      const topSegments = segments.sort((a, b) => b.score - a.score).slice(0, this.config.maxSegmentsPerFile).sort((a, b) => a.rawStart - b.rawStart);
      const budgetedSegments = [];
      for (const seg of topSegments) {
        if (totalChars + seg.text.length > this.config.maxTotalChars) {
          break;
        }
        totalChars += seg.text.length;
        budgetedSegments.push(seg);
      }
      if (budgetedSegments.length > 0) {
        result.push({ filePath, segments: budgetedSegments });
      }
      if (totalChars >= this.config.maxTotalChars) break;
    }
    return result;
  }
  /**
   * Group chunks by their file path.
   */
  groupByFile(chunks) {
    const byFile = {};
    for (const chunk of chunks) {
      const key = chunk.filePath;
      if (!byFile[key]) byFile[key] = [];
      byFile[key].push(chunk);
    }
    return byFile;
  }
  /**
   * Merge overlapping [raw_start, raw_end) offset intervals, then slice
   * each merged interval's text out of the original file content.
   */
  mergeAndSlice(chunks, content) {
    if (chunks.length === 0) return [];
    const sorted = [...chunks].sort((a, b) => a.record.raw_start - b.record.raw_start);
    const intervals = [];
    for (const chunk of sorted) {
      const start = chunk.record.raw_start;
      const end = chunk.record.raw_end;
      const last = intervals[intervals.length - 1];
      if (last && start <= last.end) {
        // Overlaps (or touches) the previous interval: extend it in place
        // and keep the higher score.
        last.end = Math.max(last.end, end);
        last.score = Math.max(last.score, chunk.score);
        last.chunks.push(chunk);
      } else {
        intervals.push({
          start,
          end,
          score: chunk.score,
          breadcrumb: chunk.record.breadcrumb,
          chunks: [chunk]
        });
      }
    }
    return intervals.map((iv) => {
      const startLine = this.offsetToLine(content, iv.start);
      const endLine = this.offsetToLine(content, iv.end);
      return {
        filePath: chunks[0].filePath,
        rawStart: iv.start,
        rawEnd: iv.end,
        startLine,
        endLine,
        score: iv.score,
        breadcrumb: iv.breadcrumb,
        text: content.slice(iv.start, iv.end)
      };
    });
  }
  /**
   * Convert a character offset to a 1-indexed line number by counting
   * newlines before the offset.
   */
  offsetToLine(content, offset) {
    let line = 1;
    for (let i = 0; i < offset && i < content.length; i++) {
      if (content[i] === "\n") {
        line++;
      }
    }
    return line;
  }
};
246
+
247
// src/search/resolvers/types.ts
/**
 * Count how many leading slash-separated segments two paths share.
 * @param {string} path1
 * @param {string} path2
 * @returns {number} Number of common leading segments.
 */
function commonPrefixLength(path1, path2) {
  const segsA = path1.split("/");
  const segsB = path2.split("/");
  const limit = Math.min(segsA.length, segsB.length);
  let shared = 0;
  while (shared < limit && segsA[shared] === segsB[shared]) {
    shared++;
  }
  return shared;
}
261
+
262
// src/search/resolvers/JsTsResolver.ts
var JsTsResolver = class {
  // Extensions this resolver handles.
  exts = [".ts", ".tsx", ".js", ".jsx", ".mts", ".mjs", ".cts", ".cjs"];
  // TypeScript ESM projects import with .js extensions while the sources
  // are .ts — map the written extension to the extensions to try on disk.
  extMapping = {
    ".js": [".ts", ".tsx", ".js", ".jsx"],
    ".jsx": [".tsx", ".jsx"],
    ".mjs": [".mts", ".mjs"],
    ".cjs": [".cts", ".cjs"]
  };
  /**
   * True when the file has a JS/TS extension.
   *
   * FIX: the previous `this.exts.includes(\`.${ext}\` || "")` had a dead
   * `|| ""` operand (a template literal is always truthy) and produced
   * strings like ".undefined" for extension-less paths; derive the
   * extension directly from the last dot instead.
   */
  supports(filePath) {
    const dot = filePath.lastIndexOf(".");
    if (dot === -1) return false;
    return this.exts.includes(filePath.slice(dot).toLowerCase());
  }
  /**
   * Extract import specifiers from static imports/exports and dynamic
   * import()/require() calls. Regex-based; no parsing of comments/strings.
   */
  extract(content) {
    const imports = [];
    const patterns = [
      // import xxx from './foo'  /  import { xxx } from './foo'  /  export ... from './foo'
      /(?:import|export)\s+(?:[\w\s{},*]+\s+from\s+)?['"]([^'"]+)['"]/g,
      // import('./foo')  /  require('./foo')
      /(?:import|require)\s*\(\s*['"]([^'"]+)['"]\s*\)/g
    ];
    for (const pattern of patterns) {
      let match;
      while ((match = pattern.exec(content)) !== null) {
        imports.push(match[1]);
      }
    }
    return imports;
  }
  /**
   * Resolve a relative import to an indexed file path.
   * Tries the written extension (via extMapping), then each known
   * extension, then index.* files. Bare specifiers return null.
   *
   * @param {string} importStr - The import specifier as written.
   * @param {string} currentFile - Path of the importing file.
   * @param {Set<string>} allFiles - All indexed file paths.
   * @returns {string|null} Matching file path, or null.
   */
  resolve(importStr, currentFile, allFiles) {
    if (!importStr.startsWith(".")) return null;
    // Normalize ./ and ../ against the importing file's directory.
    const currentDir = currentFile.split("/").slice(0, -1).join("/");
    const parts = [...currentDir.split("/"), ...importStr.split("/")];
    const resolvedParts = [];
    for (const part of parts) {
      if (part === "." || part === "") continue;
      if (part === "..") resolvedParts.pop();
      else resolvedParts.push(part);
    }
    const basePath = resolvedParts.join("/");
    const existingExt = this.exts.find((ext) => basePath.endsWith(ext));
    if (existingExt) {
      const basePathWithoutExt = basePath.slice(0, -existingExt.length);
      const mappedExts = this.extMapping[existingExt] || [existingExt];
      for (const mappedExt of mappedExts) {
        const mappedPath = basePathWithoutExt + mappedExt;
        if (allFiles.has(mappedPath)) return mappedPath;
      }
      return null;
    }
    for (const ext of this.exts) {
      const pathWithExt = basePath + ext;
      if (allFiles.has(pathWithExt)) return pathWithExt;
    }
    for (const ext of this.exts) {
      const indexPath = `${basePath}/index${ext}`;
      if (allFiles.has(indexPath)) return indexPath;
    }
    return null;
  }
};
324
+
325
// src/search/resolvers/PythonResolver.ts
var PythonResolver = class {
  /** Python source files only. */
  supports(filePath) {
    return filePath.endsWith(".py");
  }
  /**
   * Extract module names from `from X import ...` and `import X` lines.
   *
   * FIX: the pattern previously contained `\\s+` (a literal backslash
   * followed by "s") instead of `\s+`, so the `from ... import` branch
   * never matched and all `from` imports were silently dropped.
   */
  extract(content) {
    const pattern = /^\s*(?:from\s+(\.{0,3}[\w\.]*)\s+import|import\s+([\w\.]+))/gm;
    const imports = [];
    let match;
    while ((match = pattern.exec(content)) !== null) {
      const importStr = match[1] || match[2];
      if (importStr) {
        imports.push(importStr);
      }
    }
    return imports;
  }
  /**
   * Resolve a Python import to an indexed file path.
   * @param {string} importStr - Dotted module path, possibly dot-relative.
   * @param {string} currentFile - Path of the importing file.
   * @param {Set<string>} allFiles - All indexed file paths.
   */
  resolve(importStr, currentFile, allFiles) {
    if (importStr.startsWith(".")) {
      return this.resolveRelativeImport(importStr, currentFile, allFiles);
    }
    return this.resolveAbsoluteImport(importStr, currentFile, allFiles);
  }
  /**
   * Resolve a Python relative import:
   * - from . import foo   -> current dir's foo.py or foo/__init__.py
   * - from .. import bar  -> parent dir's bar.py or bar/__init__.py
   * - from ..utils import baz -> parent dir's utils.py or utils/baz.py
   */
  resolveRelativeImport(importStr, currentFile, allFiles) {
    const dotMatch = importStr.match(/^(\.+)/);
    if (!dotMatch) return null;
    const dotCount = dotMatch[1].length;
    const rest = importStr.slice(dotCount);
    const currentParts = currentFile.split("/");
    currentParts.pop();
    // FIX: the previous guard (`targetDirParts.length < 0`) could never
    // fire; reject imports that climb above the indexed root explicitly.
    if (dotCount - 1 > currentParts.length) return null;
    const targetDirParts = currentParts.slice(0, currentParts.length - (dotCount - 1));
    const modulePath = rest.replace(/\./g, "/");
    const basePath = targetDirParts.join("/");
    const candidates = [];
    if (modulePath) {
      candidates.push(
        `${basePath}/${modulePath}.py`,
        `${basePath}/${modulePath}/__init__.py`
      );
    } else {
      candidates.push(`${basePath}/__init__.py`);
    }
    for (const candidate of candidates) {
      if (allFiles.has(candidate)) {
        return candidate;
      }
    }
    return null;
  }
  /**
   * Resolve a Python absolute import (suffix matching + prefix-overlap
   * disambiguation):
   * - from my.pkg import xxx -> a file ending in /my/pkg.py or
   *   /my/pkg/__init__.py (or exactly my/pkg.py at the repo root)
   * - when multiple files match, prefer the one sharing the longest path
   *   prefix with the importing file
   */
  resolveAbsoluteImport(importStr, currentFile, allFiles) {
    const modulePath = importStr.replace(/\./g, "/");
    const suffixes = [
      `/${modulePath}.py`,
      `/${modulePath}/__init__.py`
    ];
    const candidates = [];
    for (const filePath of allFiles) {
      for (const suffix of suffixes) {
        // FIX: the suffix begins with "/", so an endsWith match is already
        // segment-aligned. The previous extra check required the character
        // *before* that slash to also be "/" (i.e. a double slash), which
        // rejected every real candidate and made absolute imports
        // unresolvable. Also accept an exact root-level match without the
        // leading slash.
        if (filePath.endsWith(suffix) || filePath === suffix.slice(1)) {
          candidates.push(filePath);
          break;
        }
      }
    }
    if (candidates.length === 0) return null;
    if (candidates.length === 1) return candidates[0];
    const currentDir = currentFile.split("/").slice(0, -1).join("/");
    candidates.sort((a, b) => {
      const overlapA = commonPrefixLength(a, currentDir);
      const overlapB = commonPrefixLength(b, currentDir);
      return overlapB - overlapA;
    });
    return candidates[0];
  }
};
415
+
416
// src/search/resolvers/GoResolver.ts
var GoResolver = class {
  /** Go source files only. */
  supports(filePath) {
    return filePath.endsWith(".go");
  }
  /**
   * Collect import paths from single-line imports and import ( ... ) blocks.
   */
  extract(content) {
    const found = [];
    for (const single of content.matchAll(/^\s*import\s+"([^"]+)"/gm)) {
      found.push(single[1]);
    }
    for (const block of content.matchAll(/import\s*\(\s*([\s\S]*?)\s*\)/g)) {
      for (const quoted of block[1].matchAll(/"([^"]+)"/g)) {
        found.push(quoted[1]);
      }
    }
    return found;
  }
  /**
   * Resolve an import path to a .go file whose path contains the import
   * path as a slash-delimited infix; prefers non-test files.
   * Import paths without "/" or "." look like standard-library packages
   * and are skipped.
   */
  resolve(importStr, currentFile, allFiles) {
    const looksLocal = importStr.includes("/") || importStr.includes(".");
    if (!looksLocal) {
      return null;
    }
    const needle = `/${importStr}/`;
    const hits = [];
    for (const candidate of allFiles) {
      if (candidate.endsWith(".go") && candidate.includes(needle)) {
        hits.push(candidate);
      }
    }
    if (hits.length === 0) {
      return null;
    }
    const production = hits.find((f) => !f.endsWith("_test.go"));
    return production ?? hits[0];
  }
};
455
+
456
// src/search/resolvers/JavaResolver.ts
var JavaResolver = class {
  /** Java source files only. */
  supports(filePath) {
    return filePath.endsWith(".java");
  }
  /**
   * Collect imported fully-qualified names, including static imports.
   */
  extract(content) {
    const names = [];
    for (const m of content.matchAll(/^\s*import\s+(?:static\s+)?([\w.]+);/gm)) {
      names.push(m[1]);
    }
    return names;
  }
  /**
   * Map a fully-qualified import to a source file.
   * Wildcard imports (a.b.*) resolve to the first .java file found under
   * that package directory; plain imports resolve by class-path suffix.
   */
  resolve(importStr, currentFile, allFiles) {
    if (importStr.endsWith(".*")) {
      const dirInfix = `/${importStr.slice(0, -2).replace(/\./g, "/")}/`;
      for (const candidate of allFiles) {
        if (candidate.endsWith(".java") && candidate.includes(dirInfix)) {
          return candidate;
        }
      }
      return null;
    }
    const fileSuffix = `/${importStr.replace(/\./g, "/")}.java`;
    for (const candidate of allFiles) {
      if (candidate.endsWith(fileSuffix)) {
        return candidate;
      }
    }
    return null;
  }
};
491
+
492
// src/search/resolvers/RustResolver.ts
var RustResolver = class {
  /** Rust source files only. */
  supports(filePath) {
    return filePath.endsWith(".rs");
  }
  /**
   * Collect module declarations (`mod foo;`, tagged "mod:") and
   * crate/super/self use paths (tagged "use:").
   */
  extract(content) {
    const found = [];
    for (const m of content.matchAll(/^\s*(?:pub\s+)?mod\s+(\w+)\s*;/gm)) {
      found.push(`mod:${m[1]}`);
    }
    for (const m of content.matchAll(/^\s*(?:pub\s+)?use\s+((?:crate|super|self)(?:::\w+)+)/gm)) {
      found.push(`use:${m[1]}`);
    }
    return found;
  }
  /**
   * Resolve a tagged import to a file:
   * - "mod:name"          -> sibling name.rs or name/mod.rs
   * - "use:crate::a::b"   -> <crate src root>/a/b.rs or .../b/mod.rs
   * - "use:super::..."    -> relative to the parent directory
   * - "use:self::..."     -> relative to the current directory
   */
  resolve(importStr, currentFile, allFiles) {
    const currentDir = currentFile.split("/").slice(0, -1).join("/");
    if (importStr.startsWith("mod:")) {
      const name = importStr.slice(4);
      return this.#firstExisting(
        [`${currentDir}/${name}.rs`, `${currentDir}/${name}/mod.rs`],
        allFiles
      );
    }
    if (!importStr.startsWith("use:")) {
      return null;
    }
    const segments = importStr.slice(4).split("::");
    let rootParts;
    switch (segments[0]) {
      case "crate": {
        // Crate root: everything up to and including "src", when present.
        const srcIndex = currentFile.indexOf("/src/");
        rootParts = srcIndex !== -1
          ? currentFile.slice(0, srcIndex + 4).split("/")
          : currentDir.split("/");
        break;
      }
      case "super":
        rootParts = currentDir.split("/").slice(0, -1);
        break;
      case "self":
        rootParts = currentDir.split("/");
        break;
      default:
        return null;
    }
    const modulePath = [...rootParts, ...segments.slice(1)].join("/");
    return this.#firstExisting([`${modulePath}.rs`, `${modulePath}/mod.rs`], allFiles);
  }
  // Return the first candidate present in the file index, else null.
  #firstExisting(candidates, allFiles) {
    for (const candidate of candidates) {
      if (allFiles.has(candidate)) {
        return candidate;
      }
    }
    return null;
  }
};
563
+
564
// src/search/resolvers/index.ts
/**
 * Build the ordered list of language resolvers (priority order: the first
 * resolver whose supports() matches a file handles it).
 * @returns {Array} Freshly constructed resolver instances.
 */
function createResolvers() {
  const resolverClasses = [
    JsTsResolver,
    PythonResolver,
    GoResolver,
    JavaResolver,
    RustResolver
  ];
  return resolverClasses.map((ResolverCls) => new ResolverCls());
}
574
+
575
// src/search/GraphExpander.ts
var GraphExpander = class {
  projectId;
  config;
  vectorStore = null;
  db = null;
  // Cached set of all indexed file paths (for fast lookup and fuzzy matching).
  allFilePaths = null;
  // Registered language resolvers, in priority order.
  resolvers = createResolvers();
  constructor(projectId, config) {
    this.projectId = projectId;
    this.config = config;
  }
  /**
   * Open the vector store (sized to the embedding dimensions) and the
   * per-project database. Must complete before expand() is useful.
   */
  async init() {
    const embeddingConfig = getEmbeddingConfig();
    this.vectorStore = await getVectorStore(this.projectId, embeddingConfig.dimensions);
    this.db = initDb(this.projectId);
  }
  /**
   * Load the file index lazily.
   * Loading every path into a Set once is far faster than repeated DB
   * lookups, at negligible memory cost.
   */
  loadFileIndex() {
    if (this.allFilePaths) return;
    if (!this.db) this.db = initDb(this.projectId);
    const rows = this.db.prepare("SELECT path FROM files").all();
    this.allFilePaths = new Set(rows.map((r) => r.path));
    logger.debug({ count: this.allFilePaths.size }, "GraphExpander: \u6587\u4EF6\u7D22\u5F15\u5DF2\u52A0\u8F7D");
  }
  /**
   * Invalidate the cached file index (call after incremental re-indexing).
   */
  invalidateFileIndex() {
    this.allFilePaths = null;
  }
  /**
   * Expand seed chunks via three strategies: same-file neighbors (E1),
   * breadcrumb siblings (E2), and import targets (E3).
   *
   * @param {Array} seeds - Seed chunks ({filePath, chunkIndex, score, record}).
   * @param {Set<string>} [queryTokens] - Lowercased query tokens used to
   *   prioritize import chunks.
   * @returns {Promise<{chunks: Array, stats: object}>} Newly added chunks
   *   (seeds excluded, de-duplicated) plus per-strategy counts.
   */
  async expand(seeds, queryTokens) {
    if (!this.vectorStore || !this.db) {
      await this.init();
    }
    this.loadFileIndex();
    const stats = {
      neighborCount: 0,
      breadcrumbCount: 0,
      importCount: 0,
      importDepth1Count: 0
    };
    if (seeds.length === 0) {
      return { chunks: [], stats };
    }
    // Seed keys pre-populate the de-dup set so seeds are never re-added.
    const existingKeys = new Set(seeds.map((s) => this.getChunkKey(s)));
    const expandedChunks = [];
    const seedsByFile = this.groupByFile(seeds);
    const neighborChunks = await this.expandNeighbors(seedsByFile, existingKeys);
    this.addChunks(neighborChunks, expandedChunks, existingKeys);
    stats.neighborCount = neighborChunks.length;
    const breadcrumbChunks = await this.expandBreadcrumb(seeds, existingKeys);
    this.addChunks(breadcrumbChunks, expandedChunks, existingKeys);
    stats.breadcrumbCount = breadcrumbChunks.length;
    const importChunks = await this.expandImports(seeds, existingKeys, queryTokens, stats);
    this.addChunks(importChunks, expandedChunks, existingKeys);
    stats.importCount = importChunks.length;
    logger.debug(stats, "\u4E0A\u4E0B\u6587\u6269\u5C55\u5B8C\u6210");
    return { chunks: expandedChunks, stats };
  }
  /**
   * Append chunks to `target`, skipping keys already in `keys` and
   * recording the new ones.
   */
  addChunks(newChunks, target, keys) {
    for (const chunk of newChunks) {
      const key = this.getChunkKey(chunk);
      if (!keys.has(key)) {
        keys.add(key);
        target.push(chunk);
      }
    }
  }
  // =========================================
  // E1: same-file neighbor expansion
  // =========================================
  /**
   * For each seed, pull the chunks within ±neighborHops positions in the
   * same file. A neighbor's score is the nearest seed's score decayed by
   * decayNeighbor^distance.
   */
  async expandNeighbors(seedsByFile, existingKeys) {
    const result = [];
    const { neighborHops, decayNeighbor } = this.config;
    const allFilePaths = Array.from(seedsByFile.keys());
    const allChunksMap = await this.vectorStore.getFilesChunks(allFilePaths);
    for (const [filePath, fileSeeds] of seedsByFile) {
      const allChunks = allChunksMap.get(filePath) ?? [];
      if (allChunks.length === 0) continue;
      const sortedChunks = allChunks.sort((a, b) => a.chunk_index - b.chunk_index);
      const chunkMap = new Map(sortedChunks.map((c) => [c.chunk_index, c]));
      const seedIndices = new Set(fileSeeds.map((s) => s.chunkIndex));
      const neighborIndices = /* @__PURE__ */ new Set();
      for (const seed of fileSeeds) {
        const baseIndex = seed.chunkIndex;
        for (let delta = -neighborHops; delta <= neighborHops; delta++) {
          if (delta === 0) continue;
          const neighborIndex = baseIndex + delta;
          if (!seedIndices.has(neighborIndex) && chunkMap.has(neighborIndex)) {
            neighborIndices.add(neighborIndex);
          }
        }
      }
      for (const neighborIndex of neighborIndices) {
        const chunk = chunkMap.get(neighborIndex);
        const key = `${filePath}#${neighborIndex}`;
        if (existingKeys.has(key)) continue;
        // Score by the closest seed; on a distance tie, keep the higher
        // seed score.
        let minDistance = Infinity;
        let maxSeedScore = 0;
        for (const seed of fileSeeds) {
          const distance = Math.abs(neighborIndex - seed.chunkIndex);
          if (distance < minDistance) {
            minDistance = distance;
            maxSeedScore = seed.score;
          } else if (distance === minDistance && seed.score > maxSeedScore) {
            maxSeedScore = seed.score;
          }
        }
        const decayedScore = maxSeedScore * Math.pow(decayNeighbor, minDistance);
        result.push({
          filePath,
          chunkIndex: neighborIndex,
          score: decayedScore,
          source: "neighbor",
          record: { ...chunk, _distance: 0 }
        });
      }
    }
    return result;
  }
  // =========================================
  // E2: breadcrumb sibling expansion
  // =========================================
  /**
   * For each seed, find other chunks in the same file sharing the same
   * breadcrumb parent prefix.
   * Example: a seed with breadcrumb "src/foo.ts > class Foo > method bar"
   * matches other "src/foo.ts > class Foo > ..." chunks.
   */
  async expandBreadcrumb(seeds, existingKeys) {
    const result = [];
    const { breadcrumbExpandLimit, decayBreadcrumb } = this.config;
    const prefixGroups = /* @__PURE__ */ new Map();
    for (const seed of seeds) {
      const prefix = this.extractBreadcrumbPrefix(seed.record.breadcrumb);
      if (!prefix) continue;
      if (!prefixGroups.has(prefix)) {
        prefixGroups.set(prefix, []);
      }
      prefixGroups.get(prefix).push(seed);
    }
    // Batch-fetch chunks for each group's representative file.
    const uniqueFilePaths = /* @__PURE__ */ new Set();
    for (const prefixSeeds of prefixGroups.values()) {
      uniqueFilePaths.add(prefixSeeds[0].filePath);
    }
    const allChunksMap = await this.vectorStore.getFilesChunks(Array.from(uniqueFilePaths));
    for (const [prefix, prefixSeeds] of prefixGroups) {
      const filePath = prefixSeeds[0].filePath;
      const allChunks = allChunksMap.get(filePath) ?? [];
      const matchingChunks = allChunks.filter((chunk) => {
        const chunkPrefix = this.extractBreadcrumbPrefix(chunk.breadcrumb);
        return chunkPrefix === prefix;
      });
      const seedIndices = new Set(prefixSeeds.map((s) => s.chunkIndex));
      const newChunks = matchingChunks.filter((chunk) => !seedIndices.has(chunk.chunk_index)).filter((chunk) => !existingKeys.has(`${filePath}#${chunk.chunk_index}`)).slice(0, breadcrumbExpandLimit);
      const maxSeedScore = Math.max(...prefixSeeds.map((s) => s.score));
      for (const chunk of newChunks) {
        result.push({
          filePath,
          chunkIndex: chunk.chunk_index,
          score: maxSeedScore * decayBreadcrumb,
          source: "breadcrumb",
          record: { ...chunk, _distance: 0 }
        });
      }
    }
    return result;
  }
  /**
   * Extract the parent prefix of a breadcrumb.
   *
   * Examples:
   * - "src/foo.ts > class Foo > method bar" -> "src/foo.ts > class Foo"
   * - "src/foo.ts > function baz" -> "src/foo.ts"
   * - "src/foo.ts" -> null (no parent)
   */
  extractBreadcrumbPrefix(breadcrumb) {
    const parts = breadcrumb.split(" > ");
    if (parts.length <= 1) return null;
    return parts.slice(0, -1).join(" > ");
  }
  // =========================================
  // E3: cross-file import expansion (multi-language)
  // =========================================
  /**
   * Parse the import statements of each seed file and pull chunks from the
   * imported files (TypeScript/JavaScript, Python, Go, Java, Rust).
   * Barrel/index files found at depth 0 are followed one extra level.
   */
  async expandImports(seeds, existingKeys, queryTokens, stats) {
    const result = [];
    const { importFilesPerSeed, chunksPerImportFile, decayImport, decayDepth } = this.config;
    const seedScoreByFile = this.buildSeedScoreByFile(seeds);
    const queue = [];
    const visited = /* @__PURE__ */ new Set();
    for (const [filePath, seedScore] of seedScoreByFile.entries()) {
      queue.push({ filePath, depth: 0, seedScore });
    }
    while (queue.length > 0) {
      const { filePath, depth, seedScore } = queue.shift();
      if (visited.has(filePath)) continue;
      visited.add(filePath);
      // Beyond depth 0, only barrel files are followed further.
      if (depth > 0 && !this.isBarrelFile(filePath)) continue;
      const resolver = this.resolvers.find((r) => r.supports(filePath));
      if (!resolver) continue;
      const row = this.db.prepare("SELECT content FROM files WHERE path = ?").get(filePath);
      if (!row?.content) continue;
      const importStrs = resolver.extract(row.content);
      if (importStrs.length === 0) continue;
      // Tighter per-file budget at depth 1 to bound fan-out.
      const perFileLimit = depth === 0 ? importFilesPerSeed : Math.min(importFilesPerSeed, 2);
      let importCount = 0;
      const processedImports = /* @__PURE__ */ new Set();
      for (const importStr of importStrs) {
        if (importCount >= perFileLimit) break;
        if (processedImports.has(importStr)) continue;
        processedImports.add(importStr);
        const targetPath = resolver.resolve(importStr, filePath, this.allFilePaths);
        if (!targetPath || targetPath === filePath) continue;
        const importChunks = await this.vectorStore.getFileChunks(targetPath);
        if (importChunks.length === 0) continue;
        const selectedChunks = this.selectImportChunks(importChunks, chunksPerImportFile, queryTokens);
        const depthDecay = depth === 0 ? 1 : decayDepth;
        for (const chunk of selectedChunks) {
          const key = `${targetPath}#${chunk.chunk_index}`;
          if (existingKeys.has(key)) continue;
          result.push({
            filePath: targetPath,
            chunkIndex: chunk.chunk_index,
            score: seedScore * decayImport * depthDecay,
            source: "import",
            record: { ...chunk, _distance: 0 }
          });
        }
        importCount++;
        if (depth === 0 && this.isBarrelFile(targetPath)) {
          if (stats) stats.importDepth1Count++;
          queue.push({ filePath: targetPath, depth: 1, seedScore });
        }
      }
    }
    return result;
  }
  // =========================================
  // Utility methods
  // =========================================
  /**
   * Build a chunk's unique key: "<filePath>#<chunkIndex>".
   */
  getChunkKey(chunk) {
    return `${chunk.filePath}#${chunk.chunkIndex}`;
  }
  /**
   * Group chunks by file path.
   */
  groupByFile(chunks) {
    const groups = /* @__PURE__ */ new Map();
    for (const chunk of chunks) {
      if (!groups.has(chunk.filePath)) {
        groups.set(chunk.filePath, []);
      }
      groups.get(chunk.filePath).push(chunk);
    }
    return groups;
  }
  /**
   * Map each file to the maximum score among its seeds.
   */
  buildSeedScoreByFile(seeds) {
    const map = /* @__PURE__ */ new Map();
    for (const seed of seeds) {
      const current = map.get(seed.filePath);
      if (current === void 0 || seed.score > current) {
        map.set(seed.filePath, seed.score);
      }
    }
    return map;
  }
  /**
   * Choose up to `limit` chunks of an imported file, preferring those that
   * overlap the query tokens; falls back to the first chunks by index.
   */
  selectImportChunks(chunks, limit, queryTokens) {
    if (limit <= 0) return [];
    const sortedByIndex = chunks.slice().sort((a, b) => a.chunk_index - b.chunk_index);
    if (!queryTokens || queryTokens.size === 0) {
      return sortedByIndex.slice(0, limit);
    }
    const scored = sortedByIndex.map((chunk) => ({
      chunk,
      score: this.scoreChunkTokenOverlap(chunk, queryTokens)
    }));
    const overlapped = scored.filter((s) => s.score > 0).sort((a, b) => b.score - a.score).slice(0, limit).map((s) => s.chunk);
    return overlapped.length > 0 ? overlapped : sortedByIndex.slice(0, limit);
  }
  /**
   * Score a chunk by token overlap with the query: +1 per token matched on
   * a word boundary, +0.5 per substring-only match.
   */
  scoreChunkTokenOverlap(chunk, queryTokens) {
    const text = (chunk.breadcrumb + " " + chunk.display_code).toLowerCase();
    let score = 0;
    for (const token of queryTokens) {
      if (text.includes(token)) {
        const wordBoundaryRegex = new RegExp(`\\b${token.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b`);
        if (wordBoundaryRegex.test(text)) {
          score += 1;
        } else {
          score += 0.5;
        }
      }
    }
    return score;
  }
  /**
   * True for barrel/index files: index.* (JS/TS), __init__.py, mod.rs.
   */
  isBarrelFile(filePath) {
    const lower = filePath.toLowerCase();
    if (lower.endsWith("/__init__.py")) return true;
    if (lower.endsWith("/mod.rs")) return true;
    return /\/index\.(ts|tsx|js|jsx|mts|mjs|cts|cjs)$/.test(lower);
  }
};
915
// Per-project GraphExpander cache.
var expanders = /* @__PURE__ */ new Map();
/**
 * Return the cached GraphExpander for a project, constructing and
 * initializing one on first access.
 * @param {string} projectId
 * @param {object} config - Expansion configuration.
 * @returns {Promise<GraphExpander>}
 */
async function getGraphExpander(projectId, config) {
  const cached = expanders.get(projectId);
  if (cached) {
    return cached;
  }
  const created = new GraphExpander(projectId, config);
  await created.init();
  expanders.set(projectId, created);
  return created;
}
925
+
926
// src/search/config.ts
// Default search-pipeline tuning parameters. SearchService shallow-merges
// caller-provided overrides on top of this object.
var DEFAULT_CONFIG = {
  // Recall (candidate retrieval)
  vectorTopK: 80,
  vectorTopM: 60,
  ftsTopKFiles: 20,
  lexChunksPerFile: 2,
  lexTotalChunks: 40,
  // Fusion (combining vector and lexical result lists)
  rrfK0: 20,
  wVec: 0.6,
  wLex: 0.4,
  fusedTopM: 60,
  // Rerank
  rerankTopN: 10,
  maxRerankChars: 1e3,
  maxBreadcrumbChars: 250,
  headRatio: 0.67,
  // Expansion (GraphExpander: neighbor/breadcrumb/import strategies)
  neighborHops: 1,
  breadcrumbExpandLimit: 1,
  importFilesPerSeed: 5,
  chunksPerImportFile: 2,
  decayNeighbor: 0.8,
  decayBreadcrumb: 0.7,
  decayImport: 0.6,
  decayDepth: 0.7,
  // ContextPacker (per-file segment cap and total character budget)
  maxSegmentsPerFile: 3,
  maxTotalChars: 48e3,
  // Smart TopK (adaptive cutoff of reranked seeds)
  enableSmartTopK: true,
  smartTopScoreRatio: 0.5,
  smartTopScoreDeltaAbs: 0.25,
  smartMinScore: 0.25,
  smartMinK: 2,
  smartMaxK: 8
};
964
+
965
// src/search/SearchService.ts
// Memoized word-boundary regexes, keyed by the raw token.
var tokenBoundaryRegexCache = /* @__PURE__ */ new Map();
/**
 * Return a cached regex that matches `token` on word boundaries.
 * Regex metacharacters in the token are escaped so it matches literally.
 * @param {string} token
 * @returns {RegExp}
 */
function getTokenBoundaryRegex(token) {
  const cached = tokenBoundaryRegexCache.get(token);
  if (cached) {
    return cached;
  }
  const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const compiled = new RegExp(`\\b${escaped}\\b`);
  tokenBoundaryRegexCache.set(token, compiled);
  return compiled;
}
976
+ var SearchService = class {
977
+ projectId;
978
+ projectPath;
979
+ indexer = null;
980
+ vectorStore = null;
981
+ db = null;
982
+ config;
983
+ constructor(projectId, projectPath, config) {
984
+ this.projectId = projectId;
985
+ this.projectPath = projectPath;
986
+ this.config = { ...DEFAULT_CONFIG, ...config };
987
+ }
988
+ async init() {
989
+ const embeddingConfig = getEmbeddingConfig();
990
+ this.indexer = await getIndexer(this.projectId, embeddingConfig.dimensions);
991
+ this.vectorStore = await getVectorStore(this.projectId, embeddingConfig.dimensions);
992
+ this.db = initDb(this.projectId);
993
+ }
994
+ // 公开接口
995
+ /**
996
+ * 构建上下文包(用于问答/生成)
997
+ */
998
+ async buildContextPack(query) {
999
+ const timingMs = {};
1000
+ let t0 = Date.now();
1001
+ const candidates = await this.hybridRetrieve(query);
1002
+ timingMs.retrieve = Date.now() - t0;
1003
+ t0 = Date.now();
1004
+ const topM = candidates.sort((a, b) => b.score - a.score).slice(0, this.config.fusedTopM);
1005
+ const reranked = await this.rerank(query, topM);
1006
+ timingMs.rerank = Date.now() - t0;
1007
+ t0 = Date.now();
1008
+ const seeds = this.applySmartCutoff(reranked);
1009
+ timingMs.smartCutoff = Date.now() - t0;
1010
+ t0 = Date.now();
1011
+ const queryTokens = this.extractQueryTokens(query);
1012
+ const expanded = await this.expand(seeds, queryTokens);
1013
+ timingMs.expand = Date.now() - t0;
1014
+ t0 = Date.now();
1015
+ const packer = new ContextPacker(this.projectId, this.config);
1016
+ const files = await packer.pack([...seeds, ...expanded]);
1017
+ timingMs.pack = Date.now() - t0;
1018
+ return {
1019
+ query,
1020
+ seeds,
1021
+ expanded,
1022
+ files,
1023
+ debug: {
1024
+ wVec: this.config.wVec,
1025
+ wLex: this.config.wLex,
1026
+ timingMs
1027
+ }
1028
+ };
1029
+ }
1030
+ // 召回方法
1031
+ /**
1032
+ * 混合召回:向量 + 词法
1033
+ */
1034
+ async hybridRetrieve(query) {
1035
+ const [vectorResults, lexicalResults] = await Promise.all([
1036
+ this.vectorRetrieve(query),
1037
+ this.lexicalRetrieve(query)
1038
+ ]);
1039
+ logger.debug({
1040
+ vectorCount: vectorResults.length,
1041
+ lexicalCount: lexicalResults.length
1042
+ }, "\u6DF7\u5408\u53EC\u56DE\u5B8C\u6210");
1043
+ if (lexicalResults.length === 0) {
1044
+ return vectorResults;
1045
+ }
1046
+ return this.fuse(vectorResults, lexicalResults);
1047
+ }
1048
+ /**
1049
+ * 向量召回
1050
+ */
1051
+ async vectorRetrieve(query) {
1052
+ if (!this.indexer) throw new Error("SearchService not initialized");
1053
+ const results = await this.indexer.textSearch(query, this.config.vectorTopK);
1054
+ return results.sort((a, b) => a._distance - b._distance).slice(0, this.config.vectorTopM).map((r, rank) => ({
1055
+ filePath: r.file_path,
1056
+ chunkIndex: r.chunk_index,
1057
+ score: 1 / (1 + r._distance),
1058
+ // 转为相似度(用于调试)
1059
+ source: "vector",
1060
+ record: r,
1061
+ _rank: rank
1062
+ // 用于 RRF
1063
+ }));
1064
+ }
1065
+ /**
1066
+ * 词法召回(FTS)
1067
+ *
1068
+ * 优先使用 chunk 级 FTS(更精准)
1069
+ * 如果 chunks_fts 不可用,降级到文件级 FTS + overlap 下钻
1070
+ */
1071
+ async lexicalRetrieve(query) {
1072
+ if (!this.db || !this.vectorStore) return [];
1073
+ if (isChunksFtsInitialized(this.db)) {
1074
+ return this.lexicalRetrieveFromChunksFts(query);
1075
+ }
1076
+ if (isFtsInitialized(this.db)) {
1077
+ return this.lexicalRetrieveFromFilesFts(query);
1078
+ }
1079
+ logger.debug("FTS \u672A\u521D\u59CB\u5316\uFF0C\u8DF3\u8FC7\u8BCD\u6CD5\u53EC\u56DE");
1080
+ return [];
1081
+ }
1082
+ /**
1083
+ * 从 chunks_fts 直接搜索(最优方案)
1084
+ */
1085
+ async lexicalRetrieveFromChunksFts(query) {
1086
+ const chunkResults = searchChunksFts(this.db, query, this.config.lexTotalChunks);
1087
+ if (chunkResults.length === 0) {
1088
+ logger.debug("Chunk FTS \u65E0\u547D\u4E2D");
1089
+ return [];
1090
+ }
1091
+ const allChunks = [];
1092
+ const fileChunksMap = /* @__PURE__ */ new Map();
1093
+ for (const result of chunkResults) {
1094
+ if (!fileChunksMap.has(result.filePath)) {
1095
+ fileChunksMap.set(result.filePath, /* @__PURE__ */ new Map());
1096
+ }
1097
+ fileChunksMap.get(result.filePath).set(result.chunkIndex, result.score);
1098
+ }
1099
+ const allFilePaths = Array.from(fileChunksMap.keys());
1100
+ const chunksMap = await this.vectorStore.getFilesChunks(allFilePaths);
1101
+ for (const [filePath, chunkScores] of fileChunksMap) {
1102
+ const chunks = chunksMap.get(filePath) ?? [];
1103
+ for (const chunk of chunks) {
1104
+ const score = chunkScores.get(chunk.chunk_index);
1105
+ if (score !== void 0) {
1106
+ allChunks.push({
1107
+ filePath: chunk.file_path,
1108
+ chunkIndex: chunk.chunk_index,
1109
+ score,
1110
+ source: "lexical",
1111
+ record: { ...chunk, _distance: 0 }
1112
+ });
1113
+ }
1114
+ }
1115
+ }
1116
+ logger.debug({
1117
+ totalChunks: allChunks.length,
1118
+ filesWithChunks: fileChunksMap.size
1119
+ }, "Chunk FTS \u53EC\u56DE\u5B8C\u6210");
1120
+ return allChunks.sort((a, b) => b.score - a.score).map((chunk, rank) => ({ ...chunk, _rank: rank }));
1121
+ }
1122
+ /**
1123
+ * 从 files_fts 搜索 + overlap 下钻(降级方案)
1124
+ */
1125
+ async lexicalRetrieveFromFilesFts(query) {
1126
+ const fileResults = searchFilesFts(this.db, query, this.config.ftsTopKFiles);
1127
+ if (fileResults.length === 0) {
1128
+ logger.debug("FTS \u65E0\u547D\u4E2D\u6587\u4EF6");
1129
+ return [];
1130
+ }
1131
+ const queryTokens = this.extractQueryTokens(query);
1132
+ logger.debug({
1133
+ fileCount: fileResults.length,
1134
+ queryTokens: Array.from(queryTokens).slice(0, 10)
1135
+ }, "FTS \u53EC\u56DE\u5F00\u59CB chunk \u9009\u62E9");
1136
+ const allChunks = [];
1137
+ let totalChunks = 0;
1138
+ let skippedFiles = 0;
1139
+ for (const { path: filePath, score: fileScore } of fileResults) {
1140
+ if (totalChunks >= this.config.lexTotalChunks) break;
1141
+ const chunks = await this.vectorStore.getFileChunks(filePath);
1142
+ if (chunks.length === 0) continue;
1143
+ const scoredChunks = chunks.map((chunk) => ({
1144
+ chunk,
1145
+ overlapScore: this.scoreChunkTokenOverlap(chunk, queryTokens)
1146
+ }));
1147
+ const maxOverlap = Math.max(...scoredChunks.map((c) => c.overlapScore));
1148
+ if (maxOverlap === 0) {
1149
+ skippedFiles++;
1150
+ continue;
1151
+ }
1152
+ const topChunks = scoredChunks.filter((c) => c.overlapScore > 0).sort((a, b) => b.overlapScore - a.overlapScore).slice(0, this.config.lexChunksPerFile);
1153
+ for (const { chunk, overlapScore } of topChunks) {
1154
+ if (totalChunks >= this.config.lexTotalChunks) break;
1155
+ const combinedScore = fileScore * (1 + overlapScore * 0.5);
1156
+ allChunks.push({
1157
+ filePath: chunk.file_path,
1158
+ chunkIndex: chunk.chunk_index,
1159
+ score: combinedScore,
1160
+ source: "lexical",
1161
+ record: { ...chunk, _distance: 0 }
1162
+ });
1163
+ totalChunks++;
1164
+ }
1165
+ }
1166
+ if (skippedFiles > 0) {
1167
+ logger.debug({ skippedFiles }, "FTS \u8DF3\u8FC7 overlap=0 \u7684\u6587\u4EF6");
1168
+ }
1169
+ logger.debug({
1170
+ totalChunks: allChunks.length,
1171
+ filesWithChunks: new Set(allChunks.map((c) => c.filePath)).size
1172
+ }, "FTS chunk \u9009\u62E9\u5B8C\u6210");
1173
+ return allChunks.sort((a, b) => b.score - a.score).map((chunk, rank) => ({ ...chunk, _rank: rank }));
1174
+ }
1175
+ /**
1176
+ * 提取查询中的 tokens
1177
+ *
1178
+ * 直接复用 fts.ts 中的 segmentQuery,确保召回和评分逻辑一致
1179
+ */
1180
+ extractQueryTokens(query) {
1181
+ const tokens = segmentQuery(query);
1182
+ return new Set(tokens);
1183
+ }
1184
+ /**
1185
+ * 计算 chunk 与查询的 token overlap 得分
1186
+ *
1187
+ * 匹配策略:
1188
+ * - breadcrumb 和 display_code 都参与匹配
1189
+ * - 精确匹配得 1 分,子串匹配得 0.5 分
1190
+ */
1191
+ scoreChunkTokenOverlap(chunk, queryTokens) {
1192
+ const text = (chunk.breadcrumb + " " + chunk.display_code).toLowerCase();
1193
+ let score = 0;
1194
+ for (const token of queryTokens) {
1195
+ if (text.includes(token)) {
1196
+ const regex = getTokenBoundaryRegex(token);
1197
+ if (regex.test(text)) {
1198
+ score += 1;
1199
+ } else {
1200
+ score += 0.5;
1201
+ }
1202
+ }
1203
+ }
1204
+ return score;
1205
+ }
1206
+ // =========================================
1207
+ // 融合方法
1208
+ // =========================================
1209
+ /**
1210
+ * RRF (Reciprocal Rank Fusion) 融合
1211
+ *
1212
+ * 公式: score = Σ w_i / (k + rank_i)
1213
+ * 其中 k 是平滑常数,rank 从 0 开始
1214
+ */
1215
+ fuse(vectorResults, lexicalResults) {
1216
+ const { rrfK0, wVec, wLex } = this.config;
1217
+ const fusedScores = /* @__PURE__ */ new Map();
1218
+ const getKey = (chunk) => `${chunk.filePath}#${chunk.chunkIndex}`;
1219
+ for (const result of vectorResults) {
1220
+ const key = getKey(result);
1221
+ const rank = result._rank ?? 0;
1222
+ const rrfScore = wVec / (rrfK0 + rank);
1223
+ const existing = fusedScores.get(key);
1224
+ if (existing) {
1225
+ existing.score += rrfScore;
1226
+ existing.sources.add("vector");
1227
+ } else {
1228
+ fusedScores.set(key, {
1229
+ score: rrfScore,
1230
+ chunk: result,
1231
+ sources: /* @__PURE__ */ new Set(["vector"])
1232
+ });
1233
+ }
1234
+ }
1235
+ for (const result of lexicalResults) {
1236
+ const key = getKey(result);
1237
+ const rank = result._rank ?? 0;
1238
+ const rrfScore = wLex / (rrfK0 + rank);
1239
+ const existing = fusedScores.get(key);
1240
+ if (existing) {
1241
+ existing.score += rrfScore;
1242
+ existing.sources.add("lexical");
1243
+ } else {
1244
+ fusedScores.set(key, {
1245
+ score: rrfScore,
1246
+ chunk: result,
1247
+ sources: /* @__PURE__ */ new Set(["lexical"])
1248
+ });
1249
+ }
1250
+ }
1251
+ const fused = Array.from(fusedScores.values()).map(({ score, chunk, sources }) => ({
1252
+ ...chunk,
1253
+ score,
1254
+ source: sources.size > 1 ? "vector" : chunk.source
1255
+ // 保留原始来源
1256
+ })).sort((a, b) => b.score - a.score);
1257
+ if (isDebugEnabled()) {
1258
+ logger.debug({
1259
+ vectorCount: vectorResults.length,
1260
+ lexicalCount: lexicalResults.length,
1261
+ fusedCount: fused.length,
1262
+ bothSources: Array.from(fusedScores.values()).filter((v) => v.sources.size > 1).length
1263
+ }, "RRF \u878D\u5408\u5B8C\u6210");
1264
+ }
1265
+ return fused;
1266
+ }
1267
+ // Rerank 方法
1268
+ /**
1269
+ * Rerank
1270
+ */
1271
+ async rerank(query, candidates) {
1272
+ if (candidates.length === 0) return [];
1273
+ const reranker = getRerankerClient();
1274
+ const queryTokens = this.extractQueryTokens(query);
1275
+ const textExtractor = (chunk) => {
1276
+ const bc = this.truncateMiddle(chunk.record.breadcrumb, this.config.maxBreadcrumbChars);
1277
+ const budget = Math.max(0, this.config.maxRerankChars - bc.length - 1);
1278
+ const code = this.extractAroundHit(chunk.record.display_code, queryTokens, budget);
1279
+ return bc + "\n" + code;
1280
+ };
1281
+ const reranked = await reranker.rerankWithData(
1282
+ query,
1283
+ candidates,
1284
+ textExtractor,
1285
+ { topN: this.config.rerankTopN }
1286
+ );
1287
+ return reranked.map((r) => ({
1288
+ ...r.data,
1289
+ score: r.score
1290
+ }));
1291
+ }
1292
+ // Smart TopK Cutoff
1293
+ /**
1294
+ * 智能截断策略(Anchor & Floor + Safe Harbor + Delta Guard)
1295
+ *
1296
+ * 核心逻辑:
1297
+ * 1. 低置信熔断:topScore < floor → 返回 top1(CLI 友好)或空
1298
+ * 2. 动态阈值:max(floor, min(ratioThreshold, deltaThreshold))
1299
+ * 3. Safe Harbor:前 minK 个只检查 floor,不检查 ratio/delta
1300
+ * 4. 去重 + 补齐:cutoff 后去重,不足 minK 时从后续补齐
1301
+ */
1302
+ applySmartCutoff(candidates) {
1303
+ if (!this.config.enableSmartTopK) {
1304
+ return candidates;
1305
+ }
1306
+ if (candidates.length === 0) return [];
1307
+ const sorted = candidates.slice().sort((a, b) => b.score - a.score);
1308
+ const {
1309
+ smartTopScoreRatio: ratio,
1310
+ smartTopScoreDeltaAbs: deltaAbs,
1311
+ smartMinScore: floor,
1312
+ smartMinK: minK,
1313
+ smartMaxK: maxK
1314
+ } = this.config;
1315
+ const topScore = sorted[0].score;
1316
+ if (topScore < floor) {
1317
+ logger.debug({ topScore, floor }, "SmartTopK: Top1 below floor, returning top1 only");
1318
+ return [sorted[0]];
1319
+ }
1320
+ const ratioThreshold = topScore * ratio;
1321
+ const deltaThreshold = topScore - deltaAbs;
1322
+ const dynamicThreshold = Math.max(floor, Math.min(ratioThreshold, deltaThreshold));
1323
+ const picked = [];
1324
+ for (let i = 0; i < sorted.length; i++) {
1325
+ if (picked.length >= maxK) break;
1326
+ const chunk = sorted[i];
1327
+ if (i < minK) {
1328
+ if (chunk.score >= floor) {
1329
+ picked.push(chunk);
1330
+ continue;
1331
+ }
1332
+ logger.debug(
1333
+ { rank: i, score: chunk.score, floor },
1334
+ "SmartTopK: Safe harbor chunk below floor, breaking"
1335
+ );
1336
+ break;
1337
+ }
1338
+ if (chunk.score < dynamicThreshold) {
1339
+ logger.debug(
1340
+ { rank: i, score: chunk.score, dynamicThreshold, topScore, ratioThreshold, deltaThreshold },
1341
+ "SmartTopK: cutoff at dynamic threshold"
1342
+ );
1343
+ break;
1344
+ }
1345
+ picked.push(chunk);
1346
+ }
1347
+ const deduped = this.dedupChunks(picked);
1348
+ if (deduped.length < Math.min(minK, maxK)) {
1349
+ const seen = new Set(deduped.map((c) => this.chunkKey(c)));
1350
+ for (const c of sorted) {
1351
+ if (deduped.length >= Math.min(minK, maxK)) break;
1352
+ if (c.score < floor) break;
1353
+ const key = this.chunkKey(c);
1354
+ if (!seen.has(key)) {
1355
+ seen.add(key);
1356
+ deduped.push(c);
1357
+ }
1358
+ }
1359
+ }
1360
+ logger.debug({
1361
+ originalCount: candidates.length,
1362
+ pickedCount: picked.length,
1363
+ finalCount: deduped.length,
1364
+ topScore,
1365
+ floor,
1366
+ ratio,
1367
+ deltaAbs,
1368
+ ratioThreshold: ratioThreshold.toFixed(3),
1369
+ deltaThreshold: deltaThreshold.toFixed(3),
1370
+ dynamicThreshold: dynamicThreshold.toFixed(3)
1371
+ }, "SmartTopK: done");
1372
+ return deduped;
1373
+ }
1374
+ /**
1375
+ * 生成 chunk 唯一键(用于去重)
1376
+ */
1377
+ chunkKey(chunk) {
1378
+ return `${chunk.filePath}#${chunk.chunkIndex}`;
1379
+ }
1380
+ /**
1381
+ * 按 file_path + chunk_index 去重
1382
+ */
1383
+ dedupChunks(list) {
1384
+ const seen = /* @__PURE__ */ new Set();
1385
+ const out = [];
1386
+ for (const c of list) {
1387
+ const k = this.chunkKey(c);
1388
+ if (seen.has(k)) continue;
1389
+ seen.add(k);
1390
+ out.push(c);
1391
+ }
1392
+ return out;
1393
+ }
1394
+ // 扩展方法
1395
+ /**
1396
+ * 扩展 seed chunks
1397
+ *
1398
+ * 使用 GraphExpander 执行三种扩展策略:
1399
+ * - E1: 同文件邻居
1400
+ * - E2: breadcrumb 补段
1401
+ * - E3: 相对路径 import 解析
1402
+ */
1403
+ async expand(seeds, queryTokens) {
1404
+ if (seeds.length === 0) return [];
1405
+ const expander = await getGraphExpander(this.projectId, this.config);
1406
+ const { chunks, stats } = await expander.expand(seeds, queryTokens);
1407
+ logger.debug(stats, "\u4E0A\u4E0B\u6587\u6269\u5C55\u7EDF\u8BA1");
1408
+ return chunks;
1409
+ }
1410
+ // 工具方法
1411
+ /**
1412
+ * 中间省略截断(保留首尾)
1413
+ */
1414
+ truncateMiddle(text, maxLen) {
1415
+ if (text.length <= maxLen) return text;
1416
+ const half = Math.floor((maxLen - 3) / 2);
1417
+ return text.slice(0, half) + "..." + text.slice(-half);
1418
+ }
1419
+ /**
1420
+ * 头尾截断(备用方法,当无命中行时使用)
1421
+ */
1422
+ truncateHeadTail(text, maxLen, headRatio) {
1423
+ if (text.length <= maxLen) return text;
1424
+ const headLen = Math.floor(maxLen * headRatio);
1425
+ const tailLen = maxLen - headLen - 3;
1426
+ if (tailLen <= 0) return text.slice(0, maxLen);
1427
+ return text.slice(0, headLen) + "..." + text.slice(-tailLen);
1428
+ }
1429
+ /**
1430
+ * 围绕命中行截取
1431
+ *
1432
+ * 找到第一个包含 query token 的行,截取其上下文
1433
+ * 如果没有命中,降级为头尾截断
1434
+ */
1435
+ extractAroundHit(text, queryTokens, maxLen) {
1436
+ if (text.length <= maxLen) return text;
1437
+ const lines = text.split("\n");
1438
+ const textLower = text.toLowerCase();
1439
+ let hitLineIdx = -1;
1440
+ let bestScore = 0;
1441
+ for (let i = 0; i < lines.length; i++) {
1442
+ const lineLower = lines[i].toLowerCase();
1443
+ let lineScore = 0;
1444
+ for (const token of queryTokens) {
1445
+ if (lineLower.includes(token)) {
1446
+ lineScore++;
1447
+ }
1448
+ }
1449
+ if (lineScore > bestScore) {
1450
+ bestScore = lineScore;
1451
+ hitLineIdx = i;
1452
+ }
1453
+ }
1454
+ if (hitLineIdx === -1) {
1455
+ return this.truncateHeadTail(text, maxLen, this.config.headRatio);
1456
+ }
1457
+ let start = hitLineIdx;
1458
+ let end = hitLineIdx;
1459
+ let currentLen = lines[hitLineIdx].length;
1460
+ while (currentLen < maxLen) {
1461
+ const canUp = start > 0;
1462
+ const canDown = end < lines.length - 1;
1463
+ if (!canUp && !canDown) break;
1464
+ if (canUp) {
1465
+ const upLen = lines[start - 1].length + 1;
1466
+ if (currentLen + upLen <= maxLen) {
1467
+ start--;
1468
+ currentLen += upLen;
1469
+ }
1470
+ }
1471
+ if (canDown) {
1472
+ const downLen = lines[end + 1].length + 1;
1473
+ if (currentLen + downLen <= maxLen) {
1474
+ end++;
1475
+ currentLen += downLen;
1476
+ }
1477
+ }
1478
+ if ((start === 0 || lines[start - 1].length + 1 + currentLen > maxLen) && (end === lines.length - 1 || lines[end + 1].length + 1 + currentLen > maxLen)) {
1479
+ break;
1480
+ }
1481
+ }
1482
+ const result = lines.slice(start, end + 1).join("\n");
1483
+ const prefix = start > 0 ? "..." : "";
1484
+ const suffix = end < lines.length - 1 ? "..." : "";
1485
+ return prefix + result + suffix;
1486
+ }
1487
+ /**
1488
+ * 获取当前配置
1489
+ */
1490
+ getConfig() {
1491
+ return { ...this.config };
1492
+ }
1493
+ };
1494
+ export {
1495
+ SearchService
1496
+ };