@chiway/contextweaver 1.1.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +138 -28
- package/dist/{SearchService-MYPOCM3B.js → SearchService-WVD6THR3.js} +170 -82
- package/dist/chunk-3BNHQV5W.js +373 -0
- package/dist/chunk-BFCIZ52F.js +102 -0
- package/dist/{chunk-NQR4CGQ6.js → chunk-GDVB6PJ4.js} +58 -10
- package/dist/{lock-DVY3KJSK.js → chunk-HHYPQA3X.js} +2 -3
- package/dist/chunk-ISVCQFB4.js +223 -0
- package/dist/chunk-IZ6IUHNN.js +77 -0
- package/dist/{chunk-AMQQK4P7.js → chunk-JVKVSTQ3.js} +1 -2
- package/dist/chunk-LB42CZEB.js +18 -0
- package/dist/{chunk-6Z4JEEVJ.js → chunk-PPLFJGO3.js} +303 -58
- package/dist/chunk-R6CNZXZ7.js +143 -0
- package/dist/{chunk-RJURH22T.js → chunk-SKBAE26T.js} +0 -1
- package/dist/chunk-TPM6YP43.js +38 -0
- package/dist/{chunk-7G5V7YT5.js → chunk-V3K4YVAR.js} +12 -120
- package/dist/chunk-VWBKZ6QL.js +115 -0
- package/dist/chunk-XFIM2T6S.js +57 -0
- package/dist/{chunk-6QMYML5V.js → chunk-XMZZZKG7.js} +361 -295
- package/dist/chunk-XTWNT7KP.js +156 -0
- package/dist/chunk-Y6H7C3NA.js +85 -0
- package/dist/codebaseRetrieval-DIS5RH2C.js +14 -0
- package/dist/{config-BWZ6CU3W.js → config-LCOJHTCF.js} +1 -2
- package/dist/db-GBCLP4GG.js +68 -0
- package/dist/findReferences-N7ML7TUP.js +16 -0
- package/dist/getSymbolDefinition-6KMY4H33.js +17 -0
- package/dist/index.js +271 -40
- package/dist/listFiles-4VT2TPJD.js +14 -0
- package/dist/loadConfig-XTVT2OWW.js +9 -0
- package/dist/lock-HNKQ6X5B.js +8 -0
- package/dist/scanner-QDFZJLP7.js +13 -0
- package/dist/server-UAI3U7AB.js +347 -0
- package/dist/stats-AGKUCJQI.js +12 -0
- package/dist/vectorStore-4ODCERRO.js +12 -0
- package/package.json +9 -23
- package/dist/codebaseRetrieval-NLAMGOA2.js +0 -12
- package/dist/scanner-RFG4YWYI.js +0 -11
- package/dist/server-27HI7WZO.js +0 -147
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
// src/vectorStore/index.ts
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import os from "os";
|
|
4
|
+
import path from "path";
|
|
5
|
+
import * as lancedb from "@lancedb/lancedb";
|
|
6
|
+
var BASE_DIR = path.join(os.homedir(), ".contextweaver");
|
|
7
|
+
/**
 * Spot-check stored `display_code` values against the current file contents.
 *
 * For a sample of rows, the text `content.slice(start_index, end_index)` (with both
 * offsets clamped into the content) is compared to the row's `display_code`.
 *
 * Sampling: at most `sampleSize` rows are inspected, and the inspected indices are
 * spread evenly over the WHOLE array. (The previous integer-step walk degenerated to
 * "the first `sampleSize` rows" whenever `oldRows.length < 2 * sampleSize`, so rows at
 * the tail were never checked.)
 *
 * @param {Array<{file_path: string, start_index: number, end_index: number, display_code: string}>} oldRows
 *   Rows to verify.
 * @param {(filePath: string) => (string|null|undefined)} getContent
 *   Returns the file text, or null/undefined when it is unavailable; such rows are
 *   skipped and do not count as sampled.
 * @param {{sampleSize?: number, maxMismatchRatio?: number}} [options]
 *   `sampleSize` defaults to 100, `maxMismatchRatio` defaults to 0.01.
 * @returns {{abort: boolean, sampled: number, mismatched: number, ratio: number}}
 *   `ratio` is mismatched / sampled (0 when nothing was sampled); `abort` is true when
 *   `ratio` exceeds `maxMismatchRatio`.
 */
function sampleCheckDisplayCode(oldRows, getContent, options = {}) {
  const sampleSize = options.sampleSize ?? 100;
  const maxMismatchRatio = options.maxMismatchRatio ?? 0.01;
  const total = oldRows.length;
  if (total === 0) {
    return { abort: false, sampled: 0, mismatched: 0, ratio: 0 };
  }
  // Never more samples than rows; zero, negative or NaN sizes yield no samples.
  const targetCount = Math.min(total, Math.max(0, Math.ceil(sampleSize)));
  let sampled = 0;
  let mismatched = 0;
  for (let k = 0; k < targetCount; k++) {
    // Evenly spaced index in [0, total): k-th of targetCount equal-width buckets.
    const row = oldRows[Math.floor((k * total) / targetCount)];
    const content = getContent(row.file_path);
    // `== null` also covers undefined, which previously crashed on `.length`.
    if (content == null) continue;
    sampled++;
    // Clamp both offsets into the content and keep end >= start.
    const safeStart = Math.max(0, Math.min(row.start_index, content.length));
    const safeEnd = Math.max(safeStart, Math.min(row.end_index, content.length));
    if (content.slice(safeStart, safeEnd) !== row.display_code) {
      mismatched++;
    }
  }
  const ratio = sampled > 0 ? mismatched / sampled : 0;
  return { abort: ratio > maxMismatchRatio, sampled, mismatched, ratio };
}
|
|
35
|
+
/**
 * Thin wrapper around a LanceDB database holding a single "chunks" table.
 *
 * `db` and `table` stay null until `init()` has run; `table` additionally stays null
 * until the "chunks" table exists (it is created lazily from the first inserted
 * records). Read helpers return empty results while `table` is null; write helpers
 * throw "VectorStore not initialized" while `db` is null.
 */
var VectorStore = class {
  db = null;
  table = null;
  dbPath;
  vectorDim;
  /**
   * @param {string} projectId Used to derive the default database location.
   * @param {number} [vectorDim=1024] Embedding dimension, only stored and reported back.
   * @param {string} [dbPathOverride] Explicit database path; when omitted the path is
   *   `<BASE_DIR>/<projectId>/vectors.lance`.
   */
  constructor(projectId, vectorDim = 1024, dbPathOverride) {
    this.dbPath = dbPathOverride ?? path.join(BASE_DIR, projectId, "vectors.lance");
    this.vectorDim = vectorDim;
  }
  /**
   * Open the connection and, if it already exists, the "chunks" table.
   *
   * No-op when already initialized. State is committed only after every step has
   * succeeded, so a failure in `tableNames()` / `openTable()` no longer leaves a
   * half-initialized store that a retry would skip over.
   */
  async init() {
    if (this.db) return;
    const parent = path.dirname(this.dbPath);
    if (!fs.existsSync(parent)) {
      fs.mkdirSync(parent, { recursive: true });
    }
    const db = await lancedb.connect(this.dbPath);
    try {
      const tableNames = await db.tableNames();
      const table = tableNames.includes("chunks") ? await db.openTable("chunks") : null;
      this.db = db;
      this.table = table;
    } catch (err) {
      // Do not leak the freshly opened connection when initialization fails.
      db.close();
      throw err;
    }
  }
  /**
   * Make sure the "chunks" table exists (called on first insert).
   *
   * When the table has to be created, `records` are written as its initial rows.
   * Does nothing when the table already exists or `records` is empty.
   *
   * @throws {Error} when `init()` has not been called.
   */
  async ensureTable(records) {
    if (this.table) return;
    if (!this.db) throw new Error("VectorStore not initialized");
    if (records.length === 0) return;
    this.table = await this.db.createTable(
      "chunks",
      records
    );
  }
  // ── C2 migration primitives ──────────────────────────────────────────────
  // Original design note (translated): the C2 migration removes the
  // display_code / vector_text columns from "chunks". LanceDB has no
  // ALTER ... DROP COLUMN, so the approach is dropTable + recreate:
  //   1. read all existing chunks, keeping only the new-schema fields
  //      (including raw_start / raw_end, used to look the text up again);
  //   2. sample check: display_code vs files.content.slice(raw_start, raw_end);
  //      abort when the mismatch ratio exceeds sampleMaxMismatchRatio;
  //   3. drop the chunks table and rebuild it with the new schema.
  // It is idempotent: if the table has no display_code column, nothing happens.
  // NOTE(review): the method that orchestrated these steps is not in this class;
  // per the note on dropAndRecreateChunks it presumably lives in the bootstrap
  // module — confirm. Only the three vector-only primitives below remain here.
  /**
   * Report whether the "chunks" table still has a `display_code` column
   * (H3: pure vector-store operation).
   *
   * @returns {Promise<boolean|null>}
   *   - true:  table exists and has `display_code` (migration needed)
   *   - false: table exists without `display_code` (already migrated)
   *   - null:  no table (fresh database / already dropped)
   */
  async hasDisplayCodeColumn() {
    if (!this.table) return null;
    const schema = await this.table.schema();
    return schema.fields.some((f) => f.name === "display_code");
  }
  /**
   * Read every row of the table (H3: pure vector-store operation; per the original
   * note this is used by the bootstrap module's sample check).
   *
   * @returns {Promise<object[]>} all rows, or [] when the table does not exist.
   */
  async readAllRowsRaw() {
    if (!this.table) return [];
    return await this.table.query().toArray();
  }
  /**
   * Drop the "chunks" table and recreate it from `newRows`
   * (H3: pure vector-store operation).
   *
   * The caller must pass rows that already have display_code / vector_text stripped.
   * Per the original note, crash-recovery semantics are the responsibility of the
   * bootstrap module's state machine. When `newRows` is empty the table is left
   * absent.
   *
   * @throws {Error} when `init()` has not been called.
   */
  async dropAndRecreateChunks(newRows) {
    if (!this.db) throw new Error("VectorStore not initialized");
    await this.db.dropTable("chunks");
    this.table = null;
    if (newRows.length > 0) {
      this.table = await this.db.createTable("chunks", newRows);
    }
  }
  /**
   * Monotonic version update for one file: insert the new version first, then delete
   * the rows of that file whose hash differs from `newHash`.
   *
   * This ordering means the worst case after a crash is old and new versions
   * coexisting (never a missing file); normally the old version is cleaned up.
   * Empty `records` deletes every chunk of the file instead.
   *
   * @throws {Error} when `init()` has not been called.
   */
  async upsertFile(filePath, newHash, records) {
    if (!this.db) throw new Error("VectorStore not initialized");
    if (records.length === 0) {
      await this.deleteFile(filePath);
      return;
    }
    if (!this.table) {
      await this.ensureTable(records);
    } else {
      await this.table.add(records);
    }
    if (this.table) {
      await this.table.delete(
        `file_path = '${this.escapeString(filePath)}' AND file_hash != '${this.escapeString(newHash)}'`
      );
    }
  }
  /**
   * Upsert many files in batches.
   *
   * Flow:
   *   1. split `files` into batches of at most BATCH_FILES files; a batch is also
   *      closed before a file that would push it past BATCH_RECORDS records;
   *   2. per batch: remove rows already stored under the new (path, hash), insert the
   *      new records, then delete rows of those paths with any other hash.
   *
   * Batching is required because (per the original note) the LanceDB native module
   * may crash on very large payloads.
   *
   * @param {Array<{path: string, hash: string, records: object[]}>} files
   * @throws {Error} when `init()` has not been called.
   */
  async batchUpsertFiles(files) {
    if (!this.db) throw new Error("VectorStore not initialized");
    if (files.length === 0) return;
    const BATCH_FILES = 50;
    const BATCH_RECORDS = 5e3;
    const batches = [];
    let currentBatch = [];
    let currentRecordCount = 0;
    for (const file of files) {
      if (currentBatch.length >= BATCH_FILES || currentRecordCount + file.records.length > BATCH_RECORDS) {
        if (currentBatch.length > 0) {
          batches.push(currentBatch);
        }
        currentBatch = [];
        currentRecordCount = 0;
      }
      currentBatch.push(file);
      currentRecordCount += file.records.length;
    }
    if (currentBatch.length > 0) {
      batches.push(currentBatch);
    }
    for (const batch of batches) {
      // flatMap instead of push(...records): spreading a very large array as call
      // arguments can exceed the engine's argument limit.
      const batchRecords = batch.flatMap((f) => f.records);
      if (batchRecords.length === 0) {
        // Every file in the batch is empty: just remove their chunks.
        await this.deleteFiles(batch.map((f) => f.path));
        continue;
      }
      if (this.table) {
        // Remove rows already stored under the new hash so the add below does not
        // duplicate them.
        await this.deleteFilesByHash(batch.map((f) => ({ path: f.path, hash: f.hash })));
      }
      if (!this.table) {
        await this.ensureTable(batchRecords);
      } else {
        await this.table.add(batchRecords);
      }
      if (this.table) {
        // Clean up superseded versions of the files in this batch.
        const deleteConditions = batch.map(
          (f) => `(file_path = '${this.escapeString(f.path)}' AND file_hash != '${this.escapeString(f.hash)}')`
        ).join(" OR ");
        await this.table.delete(deleteConditions);
      }
    }
  }
  /**
   * List the distinct (file_path, file_hash) pairs present in the table.
   *
   * Per the original note this feeds the GC phase, which compares against the
   * authoritative SQLite data to find orphan chunks. Only the two columns are
   * selected; duplicates are removed in memory.
   *
   * @returns {Promise<Array<{path: string, hash: string}>>}
   */
  async listFileHashes() {
    if (!this.table) return [];
    const rows = await this.table.query().select(["file_path", "file_hash"]).toArray();
    const seen = /* @__PURE__ */ new Set();
    const result = [];
    for (const r of rows) {
      // NUL separator keeps the combined key unambiguous.
      const key = `${r.file_path}\0${r.file_hash}`;
      if (!seen.has(key)) {
        seen.add(key);
        result.push({ path: r.file_path, hash: r.file_hash });
      }
    }
    return result;
  }
  /**
   * Delete every chunk of one file. No-op when the table does not exist.
   */
  async deleteFile(filePath) {
    if (!this.table) return;
    await this.table.delete(`file_path = '${this.escapeString(filePath)}'`);
  }
  /**
   * Delete chunks matching exact (file_path, file_hash) pairs, 500 pairs per
   * DELETE statement.
   *
   * Per the original note this is used for transaction compensation: when a
   * downstream write (e.g. FTS) fails, the freshly upserted version is removed while
   * the old version is left untouched, keeping LanceDB consistent with a
   * vector_index_hash that still points at the old hash.
   *
   * @param {Array<{path: string, hash: string}>} items
   */
  async deleteFilesByHash(items) {
    if (!this.table || items.length === 0) return;
    const BATCH_SIZE = 500;
    for (let i = 0; i < items.length; i += BATCH_SIZE) {
      const batch = items.slice(i, i + BATCH_SIZE);
      const conditions = batch.map(
        (it) => `(file_path = '${this.escapeString(it.path)}' AND file_hash = '${this.escapeString(it.hash)}')`
      ).join(" OR ");
      await this.table.delete(conditions);
    }
  }
  /**
   * Delete all chunks of many files, one DELETE per group of at most 500 paths so
   * the filter string does not grow too long.
   */
  async deleteFiles(filePaths) {
    if (!this.table || filePaths.length === 0) return;
    const BATCH_SIZE = 500;
    // A single loop covers both the "fits in one batch" and the "many batches" case.
    for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
      const batch = filePaths.slice(i, i + BATCH_SIZE);
      const conditions = batch.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
      await this.table.delete(conditions);
    }
  }
  /**
   * Vector similarity search.
   *
   * @param {number[]} queryVector
   * @param {number} [limit=10] Maximum number of rows.
   * @param {string} [filter] Optional filter expression passed to `where()` as-is;
   *   the caller is responsible for escaping any values embedded in it.
   * @returns {Promise<object[]>} matching rows, or [] when the table does not exist.
   */
  async search(queryVector, limit = 10, filter) {
    if (!this.table) return [];
    let query = this.table.vectorSearch(queryVector).limit(limit);
    if (filter) {
      query = query.where(filter);
    }
    return await query.toArray();
  }
  /**
   * Fetch all chunks of one file, sorted by `chunk_index` ascending.
   */
  async getFileChunks(filePath) {
    if (!this.table) return [];
    const chunks = await this.table.query().where(`file_path = '${this.escapeString(filePath)}'`).toArray();
    return chunks.sort((a, b) => a.chunk_index - b.chunk_index);
  }
  /**
   * Fetch the chunks of many files, querying in groups of at most 500 paths so the
   * filter string does not grow too long.
   *
   * Per the original note this serves bulk consumers such as GraphExpander
   * expansion and lexical recall.
   *
   * @returns {Promise<Map<string, object[]>>} file path → chunks sorted by
   *   `chunk_index`; files without chunks have no entry.
   */
  async getFilesChunks(filePaths) {
    const result = /* @__PURE__ */ new Map();
    if (!this.table || filePaths.length === 0) return result;
    const BATCH_SIZE = 500;
    for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
      const batch = filePaths.slice(i, i + BATCH_SIZE);
      const conditions = batch.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
      const rows = await this.table.query().where(conditions).toArray();
      for (const row of rows) {
        let arr = result.get(row.file_path);
        if (!arr) {
          arr = [];
          result.set(row.file_path, arr);
        }
        arr.push(row);
      }
    }
    for (const arr of result.values()) {
      arr.sort((a, b) => a.chunk_index - b.chunk_index);
    }
    return result;
  }
  /**
   * Total number of rows in the table (0 when the table does not exist).
   */
  async count() {
    if (!this.table) return 0;
    return await this.table.countRows();
  }
  /**
   * Remove all data by dropping the "chunks" table. No-op before `init()`.
   */
  async clear() {
    if (!this.db) return;
    try {
      await this.db.dropTable("chunks");
      this.table = null;
    } catch {
      // Deliberately best-effort: a failed drop is ignored and `table` is left
      // untouched. NOTE(review): presumably this covers "table does not exist";
      // other failures are hidden too — consider logging them.
    }
  }
  /**
   * The embedding dimension passed to the constructor.
   */
  getVectorDim() {
    return this.vectorDim;
  }
  /**
   * Escape a value for use inside a single-quoted SQL string literal by doubling
   * single quotes (guards the filter strings built above against injection).
   */
  escapeString(str) {
    return str.replace(/'/g, "''");
  }
  /**
   * Close the store: release the LanceDB table and connection handles and reset the
   * instance to its uninitialized state. Safe to call repeatedly.
   *
   * Previously this only dropped the references, leaving the underlying handles open.
   */
  async close() {
    try {
      this.table?.close();
      this.db?.close();
    } finally {
      // Always reset, even if releasing a handle throws.
      this.table = null;
      this.db = null;
    }
  }
};
|
|
351
|
+
var vectorStores = /* @__PURE__ */ new Map();
|
|
352
|
+
/**
 * Return the cached VectorStore for a project, creating and initializing it on
 * first use.
 *
 * Concurrency: initialization is asynchronous, so two callers may both miss the
 * cache and each build a store. The cache is therefore re-checked after `init()`:
 * the first store to finish is cached, and any later duplicate is closed and
 * discarded so that every caller receives the same instance. (Previously the later
 * store silently replaced the earlier one, which was then never closed.)
 *
 * @param {string} projectId Cache key and project identifier for the store.
 * @param {number} [vectorDim=1024] Passed to the VectorStore constructor; ignored
 *   when a store for `projectId` is already cached.
 * @returns {Promise<VectorStore>} the shared, initialized store.
 */
async function getVectorStore(projectId, vectorDim = 1024) {
  const cached = vectorStores.get(projectId);
  if (cached) return cached;
  const store = new VectorStore(projectId, vectorDim);
  await store.init();
  // Another caller may have finished initializing while we were awaiting.
  const winner = vectorStores.get(projectId);
  if (winner) {
    await store.close();
    return winner;
  }
  vectorStores.set(projectId, store);
  return store;
}
|
|
361
|
+
/**
 * Close every cached vector store, one after another, then empty the cache.
 *
 * If a `close()` rejects, the error propagates and the cache is left as it was.
 */
async function closeAllVectorStores() {
  for (const [, cachedStore] of vectorStores) {
    await cachedStore.close();
  }
  vectorStores.clear();
}
|
|
367
|
+
|
|
368
|
+
export {
|
|
369
|
+
sampleCheckDisplayCode,
|
|
370
|
+
VectorStore,
|
|
371
|
+
getVectorStore,
|
|
372
|
+
closeAllVectorStores
|
|
373
|
+
};
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
// src/search/config.ts
|
|
2
|
+
/**
 * Allowed range for every numeric search option.
 *
 * Each entry is `{ min, max, integer }`: inclusive-looking lower/upper limits plus a
 * flag telling whether the option must be a whole number. How the limits are
 * enforced is decided by the consumer of this table, not here.
 */
var SEARCH_CONFIG_BOUNDS = (() => {
  // Whole-number option limited to [min, max].
  const whole = (min, max) => ({ min, max, integer: true });
  // Fractional option limited to [min, max].
  const fraction = (min, max) => ({ min, max, integer: false });
  return {
    // Recall
    vectorTopK: whole(40, 200),
    vectorTopM: whole(30, 100),
    ftsTopKFiles: whole(10, 50),
    lexChunksPerFile: whole(1, 5),
    lexTotalChunks: whole(20, 80),
    // RRF fusion
    rrfK0: whole(10, 60),
    wVec: fraction(0, 1),
    wLex: fraction(0, 1),
    fusedTopM: whole(30, 100),
    // Rerank
    rerankTopN: whole(5, 20),
    maxRerankChars: whole(500, 2000),
    maxBreadcrumbChars: whole(100, 500),
    headRatio: fraction(0.5, 0.8),
    // Expansion
    neighborHops: whole(1, 3),
    breadcrumbExpandLimit: whole(1, 5),
    importFilesPerSeed: whole(0, 5),
    chunksPerImportFile: whole(1, 5),
    decayNeighbor: fraction(0.5, 0.9),
    decayBreadcrumb: fraction(0.4, 0.8),
    decayImport: fraction(0.3, 0.7),
    decayDepth: fraction(0.5, 0.9),
    // Context packing
    maxSegmentsPerFile: whole(1, 5),
    maxTotalChars: whole(20000, 80000),
    // Smart top-K
    smartTopScoreRatio: fraction(0.3, 0.7),
    smartTopScoreDeltaAbs: fraction(0.1, 0.4),
    smartMinScore: fraction(0.1, 0.4),
    smartMinK: whole(1, 3),
    smartMaxK: whole(5, 15)
  };
})();
|
|
32
|
+
/**
 * Default search configuration, assembled from one section per pipeline stage.
 * Key order of the resulting object follows the section order below.
 * The "Range" notes are the suggested ranges from the original annotations.
 */
var DEFAULT_CONFIG = (() => {
  // ── Recall (vector + lexical recall) ──
  const recall = {
    // Vector ANN candidates before dedup. Range: 40–200. Higher = better recall, more compute.
    vectorTopK: 80,
    // Vectors kept after dedup. Range: 30–100.
    vectorTopM: 60,
    // Max files returned by FTS5 full-text search. Range: 10–50.
    ftsTopKFiles: 20,
    // Chunks to pull per FTS-matched file. Range: 1–5. Low keeps diversity across files.
    lexChunksPerFile: 2,
    // Hard cap on total lexical chunks. Range: 20–80.
    lexTotalChunks: 40
  };
  // ── RRF fusion (vector + lexical score fusion) ──
  const fusion = {
    // RRF smoothing constant. Range: 10–60. Lower amplifies top ranks.
    rrfK0: 20,
    // Vector weight in fused score; semantic relevance emphasis.
    // Suggested range 0.3–0.8 (SEARCH_CONFIG_BOUNDS permits 0–1).
    wVec: 0.6,
    // Lexical weight in fused score. wVec + wLex should equal 1.0.
    wLex: 0.4,
    // Candidates after fusion, fed into the reranker. Range: 30–100.
    fusedTopM: 60
  };
  // ── Rerank ──
  const rerank = {
    // Final top-N results after reranking. Range: 5–20.
    rerankTopN: 10,
    // Max chars per chunk sent to the reranker; truncated beyond this. Range: 500–2000.
    maxRerankChars: 1000,
    // Max chars for breadcrumb context in rerank input. Range: 100–500.
    maxBreadcrumbChars: 250,
    // Ratio of head vs tail when truncating chunks. Range: 0.5–0.8.
    headRatio: 0.67
  };
  // ── Expansion (context expansion: E1 neighbors / E2 breadcrumbs / E3 cross-file imports) ──
  const expansion = {
    // E1: how many sibling chunks to expand in each direction. Range: 1–3.
    neighborHops: 2,
    // E2: max ancestor breadcrumbs (class/function scope). Range: 1–5.
    breadcrumbExpandLimit: 3,
    // E3: cross-file import files to resolve per seed chunk. Range: 0–5.
    // Set to 3 to enable import-graph expansion for better cross-file context.
    importFilesPerSeed: 3,
    // E3: chunks to pull from each resolved import file. Range: 1–5.
    // Set to 3 for balanced coverage of imported symbols.
    chunksPerImportFile: 3,
    // Score decay per E1 hop. Range: 0.5–0.9. Higher = neighbors stay relevant longer.
    decayNeighbor: 0.8,
    // Score decay per E2 level. Range: 0.4–0.8.
    decayBreadcrumb: 0.7,
    // Score decay for E3 import chunks. Range: 0.3–0.7.
    // Lower than E1/E2 since cross-file is less certain.
    decayImport: 0.6,
    // General depth decay multiplier. Range: 0.5–0.9.
    decayDepth: 0.7
  };
  // ── ContextPacker (context packing) ──
  const packing = {
    // Max non-contiguous segments per file in output. Range: 1–5. Prevents excessive fragmentation.
    maxSegmentsPerFile: 3,
    // Token budget expressed as chars (~12k tokens). Range: 20000–80000.
    maxTotalChars: 48000
  };
  // ── Smart TopK (dynamic result count) ──
  const smartTopK = {
    // Dynamically adjust result count based on score distribution.
    enableSmartTopK: true,
    // Min score as ratio of top-1 score to remain included. Range: 0.3–0.7.
    smartTopScoreRatio: 0.5,
    // Max absolute score drop from top-1 before cutting off. Range: 0.1–0.4.
    smartTopScoreDeltaAbs: 0.25,
    // Hard floor: chunks below this score are always excluded. Range: 0.1–0.4.
    smartMinScore: 0.25,
    // Minimum results to return regardless of scores. Range: 1–3.
    smartMinK: 2,
    // Maximum results when smart topK is active. Range: 5–15.
    smartMaxK: 8
  };
  return { ...recall, ...fusion, ...rerank, ...expansion, ...packing, ...smartTopK };
})();
|
|
98
|
+
|
|
99
|
+
export {
|
|
100
|
+
SEARCH_CONFIG_BOUNDS,
|
|
101
|
+
DEFAULT_CONFIG
|
|
102
|
+
};
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
2
|
closeAllIndexers,
|
|
3
|
-
closeAllVectorStores,
|
|
4
3
|
getIndexer,
|
|
5
4
|
invalidateAllExpanderCaches
|
|
6
|
-
} from "./chunk-
|
|
5
|
+
} from "./chunk-XMZZZKG7.js";
|
|
6
|
+
import {
|
|
7
|
+
closeAllVectorStores
|
|
8
|
+
} from "./chunk-3BNHQV5W.js";
|
|
7
9
|
import {
|
|
8
10
|
batchDelete,
|
|
9
11
|
batchUpdateMtime,
|
|
@@ -15,16 +17,19 @@ import {
|
|
|
15
17
|
getAllPaths,
|
|
16
18
|
getFilesNeedingVectorIndex,
|
|
17
19
|
getStoredEmbeddingDimensions,
|
|
20
|
+
incrementIndexVersion,
|
|
21
|
+
incrementStat,
|
|
18
22
|
initDb,
|
|
23
|
+
setStatJson,
|
|
19
24
|
setStoredEmbeddingDimensions
|
|
20
|
-
} from "./chunk-
|
|
25
|
+
} from "./chunk-PPLFJGO3.js";
|
|
21
26
|
import {
|
|
22
27
|
logger
|
|
23
|
-
} from "./chunk-
|
|
28
|
+
} from "./chunk-JVKVSTQ3.js";
|
|
24
29
|
import {
|
|
25
30
|
getEmbeddingConfig,
|
|
26
31
|
getExcludePatterns
|
|
27
|
-
} from "./chunk-
|
|
32
|
+
} from "./chunk-SKBAE26T.js";
|
|
28
33
|
|
|
29
34
|
// src/scanner/index.ts
|
|
30
35
|
import path3 from "path";
|
|
@@ -523,6 +528,19 @@ var SourceAdapter = class {
|
|
|
523
528
|
/**
|
|
524
529
|
* 将字节偏移转换为字符偏移
|
|
525
530
|
*/
|
|
531
|
+
/**
|
|
532
|
+
* 将 tree-sitter 返回的偏移(可能是 UTF-8 字节或 UTF-16 字符域)
|
|
533
|
+
* 标准化为 UTF-16 字符域偏移。下游 String.prototype.slice 直接可用。
|
|
534
|
+
*
|
|
535
|
+
* 导出供 SemanticSplitter 在生成 ChunkMetadata 时统一域。
|
|
536
|
+
*/
|
|
537
|
+
toCharOffset(offset) {
|
|
538
|
+
if (this.domain === "utf16" || this.domain === "unknown") return offset;
|
|
539
|
+
return this.byteToChar(offset);
|
|
540
|
+
}
|
|
541
|
+
/**
|
|
542
|
+
* 将字节偏移转换为字符偏移(仅 utf8 域有效;utf16/unknown 直接返回原值)
|
|
543
|
+
*/
|
|
526
544
|
byteToChar(byteOffset) {
|
|
527
545
|
if (!this.byteToCharMap) return byteOffset;
|
|
528
546
|
const safeOffset = Math.max(0, Math.min(this.byteToCharMap.length - 1, byteOffset));
|
|
@@ -915,11 +933,12 @@ ${displayCode}`,
|
|
|
915
933
|
const vectorEnd = end;
|
|
916
934
|
const displayCode = this.adapter.slice(start, end);
|
|
917
935
|
const vectorCode = this.adapter.slice(vectorStart, vectorEnd);
|
|
936
|
+
const toChar = (n) => this.adapter.toCharOffset(n);
|
|
918
937
|
const metadata = {
|
|
919
|
-
startIndex: start,
|
|
920
|
-
endIndex: end,
|
|
921
|
-
rawSpan: { start: prevEnd, end: rawSpanEnd },
|
|
922
|
-
vectorSpan: { start: vectorStart, end: vectorEnd },
|
|
938
|
+
startIndex: toChar(start),
|
|
939
|
+
endIndex: toChar(end),
|
|
940
|
+
rawSpan: { start: toChar(prevEnd), end: toChar(rawSpanEnd) },
|
|
941
|
+
vectorSpan: { start: toChar(vectorStart), end: toChar(vectorEnd) },
|
|
923
942
|
filePath,
|
|
924
943
|
language,
|
|
925
944
|
contextPath: w.contextPath
|
|
@@ -1351,6 +1370,34 @@ async function scan(rootPath, options = {}) {
|
|
|
1351
1370
|
}
|
|
1352
1371
|
}
|
|
1353
1372
|
options.onProgress?.(100, 100, "\u7D22\u5F15\u5B8C\u6210");
|
|
1373
|
+
if (options.vectorIndex !== false) {
|
|
1374
|
+
try {
|
|
1375
|
+
const embeddingConfig = getEmbeddingConfig();
|
|
1376
|
+
const indexer = await getIndexer(projectId, embeddingConfig.dimensions);
|
|
1377
|
+
const gcResult = await indexer.gc(db);
|
|
1378
|
+
if (gcResult.orphans > 0) {
|
|
1379
|
+
logger.info({ orphans: gcResult.orphans }, "GC \u5B8C\u6210");
|
|
1380
|
+
} else if (gcResult.truncated) {
|
|
1381
|
+
logger.debug("GC \u8D85\u65F6\u8DF3\u8FC7\uFF0C\u4E0B\u6B21\u626B\u63CF\u91CD\u8BD5");
|
|
1382
|
+
}
|
|
1383
|
+
} catch (err) {
|
|
1384
|
+
const error = err;
|
|
1385
|
+
logger.warn({ error: error.message }, "GC \u8DF3\u8FC7");
|
|
1386
|
+
}
|
|
1387
|
+
}
|
|
1388
|
+
const didWork = stats.added + stats.modified + stats.deleted > 0;
|
|
1389
|
+
if (didWork) {
|
|
1390
|
+
incrementIndexVersion(db);
|
|
1391
|
+
}
|
|
1392
|
+
try {
|
|
1393
|
+
if (didWork) {
|
|
1394
|
+
incrementStat(db, "stats.index.total_runs");
|
|
1395
|
+
}
|
|
1396
|
+
setStatJson(db, "stats.index.last_run_json", stats);
|
|
1397
|
+
setStatJson(db, "stats.index.last_run_at", Date.now());
|
|
1398
|
+
} catch (err) {
|
|
1399
|
+
logger.warn({ error: err.message }, "\u7D22\u5F15\u7EDF\u8BA1\u57CB\u70B9\u5931\u8D25");
|
|
1400
|
+
}
|
|
1354
1401
|
invalidateAllExpanderCaches();
|
|
1355
1402
|
return stats;
|
|
1356
1403
|
} finally {
|
|
@@ -1361,6 +1408,7 @@ async function scan(rootPath, options = {}) {
|
|
|
1361
1408
|
}
|
|
1362
1409
|
|
|
1363
1410
|
export {
|
|
1411
|
+
initFilter,
|
|
1412
|
+
isFiltered,
|
|
1364
1413
|
scan
|
|
1365
1414
|
};
|
|
1366
|
-
//# sourceMappingURL=chunk-NQR4CGQ6.js.map
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
logger
|
|
3
|
-
} from "./chunk-
|
|
4
|
-
import "./chunk-RJURH22T.js";
|
|
3
|
+
} from "./chunk-JVKVSTQ3.js";
|
|
5
4
|
|
|
6
5
|
// src/utils/lock.ts
|
|
7
6
|
import fs from "fs";
|
|
@@ -122,7 +121,7 @@ async function withLock(projectId, operation, fn, timeoutMs = 3e4) {
|
|
|
122
121
|
releaseLock(projectId);
|
|
123
122
|
}
|
|
124
123
|
}
|
|
124
|
+
|
|
125
125
|
export {
|
|
126
126
|
withLock
|
|
127
127
|
};
|
|
128
|
-
//# sourceMappingURL=lock-DVY3KJSK.js.map
|