@chiway/contextweaver 1.1.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +138 -28
  2. package/dist/{SearchService-MYPOCM3B.js → SearchService-WVD6THR3.js} +170 -82
  3. package/dist/chunk-3BNHQV5W.js +373 -0
  4. package/dist/chunk-BFCIZ52F.js +102 -0
  5. package/dist/{chunk-NQR4CGQ6.js → chunk-GDVB6PJ4.js} +58 -10
  6. package/dist/{lock-DVY3KJSK.js → chunk-HHYPQA3X.js} +2 -3
  7. package/dist/chunk-ISVCQFB4.js +223 -0
  8. package/dist/chunk-IZ6IUHNN.js +77 -0
  9. package/dist/{chunk-AMQQK4P7.js → chunk-JVKVSTQ3.js} +1 -2
  10. package/dist/chunk-LB42CZEB.js +18 -0
  11. package/dist/{chunk-6Z4JEEVJ.js → chunk-PPLFJGO3.js} +303 -58
  12. package/dist/chunk-R6CNZXZ7.js +143 -0
  13. package/dist/{chunk-RJURH22T.js → chunk-SKBAE26T.js} +0 -1
  14. package/dist/chunk-TPM6YP43.js +38 -0
  15. package/dist/{chunk-7G5V7YT5.js → chunk-V3K4YVAR.js} +12 -120
  16. package/dist/chunk-VWBKZ6QL.js +115 -0
  17. package/dist/chunk-XFIM2T6S.js +57 -0
  18. package/dist/{chunk-6QMYML5V.js → chunk-XMZZZKG7.js} +361 -295
  19. package/dist/chunk-XTWNT7KP.js +156 -0
  20. package/dist/chunk-Y6H7C3NA.js +85 -0
  21. package/dist/codebaseRetrieval-DIS5RH2C.js +14 -0
  22. package/dist/{config-BWZ6CU3W.js → config-LCOJHTCF.js} +1 -2
  23. package/dist/db-GBCLP4GG.js +68 -0
  24. package/dist/findReferences-N7ML7TUP.js +16 -0
  25. package/dist/getSymbolDefinition-6KMY4H33.js +17 -0
  26. package/dist/index.js +271 -40
  27. package/dist/listFiles-4VT2TPJD.js +14 -0
  28. package/dist/loadConfig-XTVT2OWW.js +9 -0
  29. package/dist/lock-HNKQ6X5B.js +8 -0
  30. package/dist/scanner-QDFZJLP7.js +13 -0
  31. package/dist/server-UAI3U7AB.js +347 -0
  32. package/dist/stats-AGKUCJQI.js +12 -0
  33. package/dist/vectorStore-4ODCERRO.js +12 -0
  34. package/package.json +9 -23
  35. package/dist/codebaseRetrieval-NLAMGOA2.js +0 -12
  36. package/dist/scanner-RFG4YWYI.js +0 -11
  37. package/dist/server-27HI7WZO.js +0 -147
@@ -0,0 +1,373 @@
1
+ // src/vectorStore/index.ts
2
+ import fs from "fs";
3
+ import os from "os";
4
+ import path from "path";
5
+ import * as lancedb from "@lancedb/lancedb";
6
+ var BASE_DIR = path.join(os.homedir(), ".contextweaver");
7
/**
 * Spot-check stored `display_code` values against the current file contents.
 *
 * Up to `sampleSize` rows are picked at evenly spaced positions across the
 * WHOLE of `oldRows` (first row always included), so a problem confined to the
 * tail of the table is as likely to be seen as one at the head. For each
 * sampled row the expected text is `content.slice(start_index, end_index)`,
 * with both offsets clamped into `[0, content.length]`.
 *
 * NOTE(review): the migration notes in the VectorStore class talk about
 * `raw_start` / `raw_end`, while this check reads `start_index` / `end_index`
 * — confirm which pair the caller intends to validate.
 *
 * @param {Array<{file_path: string, start_index: number, end_index: number, display_code: string}>} oldRows
 *   Rows read from the existing chunks table.
 * @param {(filePath: string) => (string | null | undefined)} getContent
 *   Returns the file's text, or null/undefined when it is unavailable; such
 *   rows are skipped and do not count as sampled.
 * @param {{sampleSize?: number, maxMismatchRatio?: number}} [options]
 *   `sampleSize` defaults to 100, `maxMismatchRatio` defaults to 0.01.
 * @returns {{abort: boolean, sampled: number, mismatched: number, ratio: number}}
 *   `abort` is true when `mismatched / sampled` exceeds `maxMismatchRatio`.
 */
function sampleCheckDisplayCode(oldRows, getContent, options = {}) {
  const sampleSize = options.sampleSize ?? 100;
  const maxMismatchRatio = options.maxMismatchRatio ?? 0.01;
  const total = oldRows.length;
  if (total === 0) {
    return { abort: false, sampled: 0, mismatched: 0, ratio: 0 };
  }

  // Never ask for more samples than there are rows. A non-positive or NaN
  // sampleSize yields zero iterations, i.e. nothing is sampled.
  const wanted = Math.min(total, Math.ceil(sampleSize));

  let sampled = 0;
  let mismatched = 0;
  for (let i = 0; i < wanted; i++) {
    // floor(i * total / wanted) spreads the picks uniformly over [0, total)
    // and, because total / wanted >= 1, never repeats an index.
    const row = oldRows[Math.floor((i * total) / wanted)];
    const content = getContent(row.file_path);
    if (content == null) continue;
    sampled++;
    const safeStart = Math.max(0, Math.min(row.start_index, content.length));
    const safeEnd = Math.max(safeStart, Math.min(row.end_index, content.length));
    if (content.slice(safeStart, safeEnd) !== row.display_code) {
      mismatched++;
    }
  }

  const ratio = sampled > 0 ? mismatched / sampled : 0;
  return { abort: ratio > maxMismatchRatio, sampled, mismatched, ratio };
}
35
/**
 * Thin wrapper around a per-project LanceDB database holding one table,
 * "chunks". Every method that touches the table is a no-op (or returns an
 * empty result) while the table does not exist yet; the table is created
 * lazily from the first batch of records.
 */
var VectorStore = class {
  /** LanceDB connection; null before init() and after close(). */
  db = null;
  /** Handle to the "chunks" table; null while the table does not exist. */
  table = null;
  /** Filesystem location of the LanceDB database. */
  dbPath;
  /** Embedding dimension recorded for this store (not validated here). */
  vectorDim;

  /**
   * @param {string} projectId Used to derive the default database path.
   * @param {number} [vectorDim=1024] Embedding dimension.
   * @param {string} [dbPathOverride] Explicit database path; when given,
   *   `projectId` is not used for path derivation.
   */
  constructor(projectId, vectorDim = 1024, dbPathOverride) {
    this.dbPath = dbPathOverride ?? path.join(BASE_DIR, projectId, "vectors.lance");
    this.vectorDim = vectorDim;
  }

  /**
   * Open the connection (idempotent) and attach to the "chunks" table if it
   * already exists. The parent directory is created when missing.
   */
  async init() {
    if (this.db) return;
    const parent = path.dirname(this.dbPath);
    if (!fs.existsSync(parent)) {
      fs.mkdirSync(parent, { recursive: true });
    }
    this.db = await lancedb.connect(this.dbPath);
    const tableNames = await this.db.tableNames();
    if (tableNames.includes("chunks")) {
      this.table = await this.db.openTable("chunks");
    }
  }

  /**
   * Create the "chunks" table from `records` if it does not exist yet
   * (called on first insert). Does nothing when the table already exists or
   * `records` is empty.
   *
   * @throws {Error} If init() has not been called.
   */
  async ensureTable(records) {
    if (this.table) return;
    if (!this.db) throw new Error("VectorStore not initialized");
    if (records.length === 0) return;
    this.table = await this.db.createTable("chunks", records);
  }

  // ── "C2" migration primitives ────────────────────────────────────────────
  // The C2 migration removes the display_code / vector_text columns from the
  // chunks table. Per the original notes, LanceDB offers no ALTER DROP COLUMN,
  // so the approach is drop + recreate:
  //   1. read all existing chunks, keeping only the new-schema fields
  //      (including raw_start / raw_end for looking the text up again);
  //   2. sample-check display_code against the stored file content and abort
  //      when the mismatch ratio is too high;
  //   3. drop the chunks table and rebuild it with the new schema.
  // It is idempotent: a table without a display_code column needs no work.
  // This class only provides the three pure vector-store operations below;
  // per the notes, orchestration and crash recovery live in the bootstrap
  // module.

  /**
   * Report whether the chunks table still has a `display_code` column.
   *
   * @returns {Promise<boolean | null>}
   *   - true:  table exists and has display_code (migration needed)
   *   - false: table exists without display_code (already migrated)
   *   - null:  table does not exist (fresh database / already dropped)
   */
  async hasDisplayCodeColumn() {
    if (!this.table) return null;
    const schema = await this.table.schema();
    return schema.fields.some((f) => f.name === "display_code");
  }

  /**
   * Read every row of the chunks table (used for the migration's sample
   * check). Returns [] when the table does not exist.
   */
  async readAllRowsRaw() {
    if (!this.table) return [];
    return await this.table.query().toArray();
  }

  /**
   * Drop the chunks table and recreate it from `newRows`.
   *
   * The caller must already have stripped display_code / vector_text from
   * `newRows`. When `newRows` is empty the table is left absent. Crash
   * recovery is the caller's responsibility (bootstrap state machine).
   *
   * @throws {Error} If init() has not been called.
   */
  async dropAndRecreateChunks(newRows) {
    if (!this.db) throw new Error("VectorStore not initialized");
    await this.db.dropTable("chunks");
    this.table = null;
    if (newRows.length > 0) {
      this.table = await this.db.createTable("chunks", newRows);
    }
  }

  /**
   * Monotonic version update for one file: insert the new version first,
   * then delete rows of the same path carrying a different hash.
   *
   * Worst case (crash in between) old and new versions coexist — nothing is
   * ever missing. An empty `records` array removes the file entirely.
   *
   * @throws {Error} If init() has not been called.
   */
  async upsertFile(filePath, newHash, records) {
    if (!this.db) throw new Error("VectorStore not initialized");
    if (records.length === 0) {
      await this.deleteFile(filePath);
      return;
    }
    if (!this.table) {
      await this.ensureTable(records);
    } else {
      await this.table.add(records);
    }
    if (this.table) {
      await this.table.delete(
        `file_path = '${this.escapeString(filePath)}' AND file_hash != '${this.escapeString(newHash)}'`
      );
    }
  }

  /**
   * Upsert many files, split into batches of at most 50 files / 5000 records
   * (a single file larger than the record cap gets a batch of its own).
   *
   * Per batch: delete rows already stored under the same (path, hash) so a
   * re-run does not duplicate them, insert the new records, then delete rows
   * of those paths that carry a different hash. Batching is required because,
   * per the original notes, the LanceDB native module may crash on very
   * large inputs.
   *
   * @param {Array<{path: string, hash: string, records: object[]}>} files
   * @throws {Error} If init() has not been called.
   */
  async batchUpsertFiles(files) {
    if (!this.db) throw new Error("VectorStore not initialized");
    if (files.length === 0) return;
    const BATCH_FILES = 50;
    const BATCH_RECORDS = 5e3;

    const batches = [];
    let currentBatch = [];
    let currentRecordCount = 0;
    for (const file of files) {
      const batchIsFull =
        currentBatch.length >= BATCH_FILES ||
        currentRecordCount + file.records.length > BATCH_RECORDS;
      if (batchIsFull) {
        if (currentBatch.length > 0) {
          batches.push(currentBatch);
        }
        currentBatch = [];
        currentRecordCount = 0;
      }
      currentBatch.push(file);
      currentRecordCount += file.records.length;
    }
    if (currentBatch.length > 0) {
      batches.push(currentBatch);
    }

    for (const batch of batches) {
      // flatMap instead of push(...records): spreading a very large records
      // array as call arguments can overflow the engine's argument limit.
      const batchRecords = batch.flatMap((f) => f.records);
      if (batchRecords.length === 0) {
        // Every file in this batch is now empty: just remove their chunks.
        await this.deleteFiles(batch.map((f) => f.path));
        continue;
      }
      if (this.table) {
        await this.deleteFilesByHash(batch.map((f) => ({ path: f.path, hash: f.hash })));
      }
      if (!this.table) {
        await this.ensureTable(batchRecords);
      } else {
        await this.table.add(batchRecords);
      }
      if (this.table) {
        const staleVersions = batch
          .map(
            (f) => `(file_path = '${this.escapeString(f.path)}' AND file_hash != '${this.escapeString(f.hash)}')`
          )
          .join(" OR ");
        await this.table.delete(staleVersions);
      }
    }
  }

  /**
   * List the distinct (file_path, file_hash) pairs present in the table.
   *
   * Per the original notes this feeds the GC phase, which compares against
   * the authoritative SQLite data to find orphaned chunks. Only the two
   * needed columns are selected.
   *
   * @returns {Promise<Array<{path: string, hash: string}>>}
   */
  async listFileHashes() {
    if (!this.table) return [];
    const rows = await this.table.query().select(["file_path", "file_hash"]).toArray();
    const seen = /* @__PURE__ */ new Set();
    const result = [];
    for (const r of rows) {
      // NUL cannot appear in a file path, so it is a safe separator.
      const key = `${r.file_path}\0${r.file_hash}`;
      if (!seen.has(key)) {
        seen.add(key);
        result.push({ path: r.file_path, hash: r.file_hash });
      }
    }
    return result;
  }

  /** Delete every chunk belonging to `filePath`. */
  async deleteFile(filePath) {
    if (!this.table) return;
    await this.table.delete(`file_path = '${this.escapeString(filePath)}'`);
  }

  /**
   * Delete chunks matching exact (file_path, file_hash) pairs, 500 pairs per
   * DELETE.
   *
   * Per the original notes this is used for transactional compensation: when
   * a downstream write (e.g. FTS) fails, the freshly upserted version is
   * removed while the old version is left untouched.
   *
   * @param {Array<{path: string, hash: string}>} items
   */
  async deleteFilesByHash(items) {
    if (!this.table || items.length === 0) return;
    const BATCH_SIZE = 500;
    for (let i = 0; i < items.length; i += BATCH_SIZE) {
      const conditions = items
        .slice(i, i + BATCH_SIZE)
        .map(
          (it) => `(file_path = '${this.escapeString(it.path)}' AND file_hash = '${this.escapeString(it.hash)}')`
        )
        .join(" OR ");
      await this.table.delete(conditions);
    }
  }

  /**
   * Delete all chunks of several files using one DELETE per 500 paths, which
   * keeps the filter string from growing without bound.
   *
   * @param {string[]} filePaths
   */
  async deleteFiles(filePaths) {
    if (!this.table || filePaths.length === 0) return;
    const BATCH_SIZE = 500;
    // A list of <= BATCH_SIZE paths is simply the single-iteration case.
    for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
      const conditions = filePaths
        .slice(i, i + BATCH_SIZE)
        .map((p) => `file_path = '${this.escapeString(p)}'`)
        .join(" OR ");
      await this.table.delete(conditions);
    }
  }

  /**
   * Vector similarity search.
   *
   * @param {number[]} queryVector
   * @param {number} [limit=10]
   * @param {string} [filter] Optional filter expression passed to `where`
   *   verbatim — callers must escape any embedded values themselves.
   * @returns {Promise<object[]>} Matching rows; [] when the table is absent.
   */
  async search(queryVector, limit = 10, filter) {
    if (!this.table) return [];
    let query = this.table.vectorSearch(queryVector).limit(limit);
    if (filter) {
      query = query.where(filter);
    }
    return await query.toArray();
  }

  /** Fetch all chunks of one file, sorted by ascending `chunk_index`. */
  async getFileChunks(filePath) {
    if (!this.table) return [];
    const chunks = await this.table
      .query()
      .where(`file_path = '${this.escapeString(filePath)}'`)
      .toArray();
    return chunks.sort((a, b) => a.chunk_index - b.chunk_index);
  }

  /**
   * Fetch the chunks of many files with one query per 500 paths.
   *
   * @param {string[]} filePaths
   * @returns {Promise<Map<string, object[]>>} Map from file path to its
   *   chunks sorted by `chunk_index`; paths without chunks have no entry.
   */
  async getFilesChunks(filePaths) {
    const result = /* @__PURE__ */ new Map();
    if (!this.table || filePaths.length === 0) return result;
    const BATCH_SIZE = 500;
    for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
      const conditions = filePaths
        .slice(i, i + BATCH_SIZE)
        .map((p) => `file_path = '${this.escapeString(p)}'`)
        .join(" OR ");
      const rows = await this.table.query().where(conditions).toArray();
      for (const row of rows) {
        let arr = result.get(row.file_path);
        if (!arr) {
          arr = [];
          result.set(row.file_path, arr);
        }
        arr.push(row);
      }
    }
    for (const arr of result.values()) {
      arr.sort((a, b) => a.chunk_index - b.chunk_index);
    }
    return result;
  }

  /** Total number of rows in the table (0 when the table is absent). */
  async count() {
    if (!this.table) return 0;
    return await this.table.countRows();
  }

  /**
   * Remove all data by dropping the chunks table. Best effort: a failing
   * drop is ignored and the cached table handle is then left untouched.
   */
  async clear() {
    if (!this.db) return;
    try {
      await this.db.dropTable("chunks");
      this.table = null;
    } catch {
      // Deliberately ignored — presumably the table is already absent.
    }
  }

  /** Embedding dimension passed to the constructor. */
  getVectorDim() {
    return this.vectorDim;
  }

  /**
   * Escape a value for embedding inside a single-quoted SQL string literal
   * by doubling single quotes.
   */
  escapeString(str) {
    return str.replace(/'/g, "''");
  }

  /**
   * Release the table and connection handles and reset the store to its
   * un-initialized state. Safe to call repeatedly.
   *
   * The fields are cleared before the handles are closed so the store is
   * never left pointing at a closed handle, and the connection is closed
   * even if closing the table throws. Handles without a `close` method are
   * tolerated.
   */
  async close() {
    const { table, db } = this;
    this.table = null;
    this.db = null;
    try {
      table?.close?.();
    } finally {
      db?.close?.();
    }
  }
};
351
// Initialized stores, keyed by project id.
var vectorStores = /* @__PURE__ */ new Map();
// In-flight initializations, keyed by project id. Lets concurrent callers
// share one store instead of each opening its own connection.
var pendingVectorStores = /* @__PURE__ */ new Map();

/**
 * Return the shared, initialized VectorStore for `projectId`, creating it on
 * first use.
 *
 * Concurrent calls for the same project made while the first one is still
 * initializing all receive the same instance. A failed initialization is not
 * cached, so the next call retries.
 *
 * @param {string} projectId
 * @param {number} [vectorDim=1024] Only used when the store is created; an
 *   already cached store is returned as-is.
 * @returns {Promise<VectorStore>}
 */
async function getVectorStore(projectId, vectorDim = 1024) {
  const ready = vectorStores.get(projectId);
  if (ready) return ready;

  let pending = pendingVectorStores.get(projectId);
  if (!pending) {
    pending = (async () => {
      const store = new VectorStore(projectId, vectorDim);
      await store.init();
      vectorStores.set(projectId, store);
      return store;
    })().finally(() => {
      pendingVectorStores.delete(projectId);
    });
    pendingVectorStores.set(projectId, pending);
  }
  return pending;
}

/**
 * Close every cached store and empty the registry.
 *
 * The registry is emptied first so a concurrent getVectorStore() cannot be
 * handed a store that is being closed. Every store is closed even if one of
 * them fails; the first failure is rethrown afterwards.
 */
async function closeAllVectorStores() {
  const stores = [...vectorStores.values()];
  vectorStores.clear();
  const failures = [];
  for (const store of stores) {
    try {
      await store.close();
    } catch (err) {
      failures.push(err);
    }
  }
  if (failures.length > 0) {
    throw failures[0];
  }
}
367
+
368
+ export {
369
+ sampleCheckDisplayCode,
370
+ VectorStore,
371
+ getVectorStore,
372
+ closeAllVectorStores
373
+ };
@@ -0,0 +1,102 @@
1
+ // src/search/config.ts
2
/**
 * Allowed range for every numeric search-config key.
 *
 * Each entry has the shape `{ min, max, integer }`, where `integer` tells
 * whether the value must be a whole number. `enableSmartTopK` is a boolean
 * and therefore has no entry.
 */
var SEARCH_CONFIG_BOUNDS = /* @__PURE__ */ (() => {
  const int = (min, max) => ({ min, max, integer: true });
  const real = (min, max) => ({ min, max, integer: false });
  return {
    // Recall
    vectorTopK: int(40, 200),
    vectorTopM: int(30, 100),
    ftsTopKFiles: int(10, 50),
    lexChunksPerFile: int(1, 5),
    lexTotalChunks: int(20, 80),
    // RRF fusion
    rrfK0: int(10, 60),
    wVec: real(0, 1),
    wLex: real(0, 1),
    fusedTopM: int(30, 100),
    // Rerank
    rerankTopN: int(5, 20),
    maxRerankChars: int(500, 2000),
    maxBreadcrumbChars: int(100, 500),
    headRatio: real(0.5, 0.8),
    // Expansion
    neighborHops: int(1, 3),
    breadcrumbExpandLimit: int(1, 5),
    importFilesPerSeed: int(0, 5),
    chunksPerImportFile: int(1, 5),
    decayNeighbor: real(0.5, 0.9),
    decayBreadcrumb: real(0.4, 0.8),
    decayImport: real(0.3, 0.7),
    decayDepth: real(0.5, 0.9),
    // Context packing
    maxSegmentsPerFile: int(1, 5),
    maxTotalChars: int(20000, 80000),
    // Smart top-K
    smartTopScoreRatio: real(0.3, 0.7),
    smartTopScoreDeltaAbs: real(0.1, 0.4),
    smartMinScore: real(0.1, 0.4),
    smartMinK: int(1, 3),
    smartMaxK: int(5, 15),
  };
})();
32
/**
 * Default search configuration. Each comment sits above the key it describes;
 * the quoted ranges are the documented tuning ranges for that key.
 */
var DEFAULT_CONFIG = {
  // ── Recall (vector + lexical recall) ──

  // Vector ANN candidates before dedup. Range: 40–200. Higher = better recall, more compute.
  vectorTopK: 80,
  // Vectors kept after dedup. Range: 30–100.
  vectorTopM: 60,
  // Max files returned by FTS5 full-text search. Range: 10–50.
  ftsTopKFiles: 20,
  // Chunks to pull per FTS-matched file. Range: 1–5. Low keeps diversity across files.
  lexChunksPerFile: 2,
  // Hard cap on total lexical chunks. Range: 20–80.
  lexTotalChunks: 40,

  // ── RRF Fusion (vector + lexical score fusion) ──

  // RRF smoothing constant. Range: 10–60. Lower amplifies top ranks.
  rrfK0: 20,
  // Vector weight in fused score; emphasizes semantic relevance.
  // Suggested range: 0.3–0.8 (SEARCH_CONFIG_BOUNDS itself allows 0–1).
  wVec: 0.6,
  // Lexical weight in fused score. wVec + wLex should equal 1.0.
  wLex: 0.4,
  // Candidates after fusion, fed into reranker. Range: 30–100.
  fusedTopM: 60,

  // ── Rerank (fine ranking) ──

  // Final top-N results after reranking. Range: 5–20.
  rerankTopN: 10,
  // Max chars per chunk sent to reranker. Truncated beyond this. Range: 500–2000.
  maxRerankChars: 1000,
  // Max chars for breadcrumb context in rerank input. Range: 100–500.
  maxBreadcrumbChars: 250,
  // Ratio of head vs tail when truncating chunks. Range: 0.5–0.8.
  headRatio: 0.67,

  // ── Expansion (context expansion: E1 neighbors / E2 breadcrumbs / E3 cross-file imports) ──

  // E1: How many sibling chunks to expand in each direction. Range: 1–3.
  neighborHops: 2,
  // E2: Max ancestor breadcrumbs (class/function scope). Range: 1–5.
  breadcrumbExpandLimit: 3,
  // E3: Cross-file import files to resolve per seed chunk. Range: 0–5.
  // Set to 3 to enable import-graph expansion for better cross-file context.
  importFilesPerSeed: 3,
  // E3: Chunks to pull from each resolved import file. Range: 1–5.
  // Set to 3 for balanced coverage of imported symbols.
  chunksPerImportFile: 3,
  // Score decay per E1 hop. Range: 0.5–0.9. Higher = neighbors stay relevant longer.
  decayNeighbor: 0.8,
  // Score decay per E2 level. Range: 0.4–0.8.
  decayBreadcrumb: 0.7,
  // Score decay for E3 import chunks. Range: 0.3–0.7.
  // Lower than E1/E2 since cross-file is less certain.
  decayImport: 0.6,
  // General depth decay multiplier. Range: 0.5–0.9.
  decayDepth: 0.7,

  // ── ContextPacker (context packing) ──

  // Max non-contiguous segments per file in output. Range: 1–5. Prevents excessive fragmentation.
  maxSegmentsPerFile: 3,
  // Token budget expressed as chars (~12k tokens). Range: 20000–80000.
  maxTotalChars: 48000,

  // ── Smart TopK (dynamic result count) ──

  // Dynamically adjust result count based on score distribution.
  enableSmartTopK: true,
  // Min score as ratio of top-1 score to remain included. Range: 0.3–0.7.
  smartTopScoreRatio: 0.5,
  // Max absolute score drop from top-1 before cutting off. Range: 0.1–0.4.
  smartTopScoreDeltaAbs: 0.25,
  // Hard floor: chunks below this score are always excluded. Range: 0.1–0.4.
  smartMinScore: 0.25,
  // Minimum results to return regardless of scores. Range: 1–3.
  smartMinK: 2,
  // Maximum results when smart topK is active. Range: 5–15.
  smartMaxK: 8,
};
98
+
99
+ export {
100
+ SEARCH_CONFIG_BOUNDS,
101
+ DEFAULT_CONFIG
102
+ };
@@ -1,9 +1,11 @@
1
1
  import {
2
2
  closeAllIndexers,
3
- closeAllVectorStores,
4
3
  getIndexer,
5
4
  invalidateAllExpanderCaches
6
- } from "./chunk-6QMYML5V.js";
5
+ } from "./chunk-XMZZZKG7.js";
6
+ import {
7
+ closeAllVectorStores
8
+ } from "./chunk-3BNHQV5W.js";
7
9
  import {
8
10
  batchDelete,
9
11
  batchUpdateMtime,
@@ -15,16 +17,19 @@ import {
15
17
  getAllPaths,
16
18
  getFilesNeedingVectorIndex,
17
19
  getStoredEmbeddingDimensions,
20
+ incrementIndexVersion,
21
+ incrementStat,
18
22
  initDb,
23
+ setStatJson,
19
24
  setStoredEmbeddingDimensions
20
- } from "./chunk-6Z4JEEVJ.js";
25
+ } from "./chunk-PPLFJGO3.js";
21
26
  import {
22
27
  logger
23
- } from "./chunk-AMQQK4P7.js";
28
+ } from "./chunk-JVKVSTQ3.js";
24
29
  import {
25
30
  getEmbeddingConfig,
26
31
  getExcludePatterns
27
- } from "./chunk-RJURH22T.js";
32
+ } from "./chunk-SKBAE26T.js";
28
33
 
29
34
  // src/scanner/index.ts
30
35
  import path3 from "path";
@@ -523,6 +528,19 @@ var SourceAdapter = class {
523
528
  /**
524
529
  * 将字节偏移转换为字符偏移
525
530
  */
531
+ /**
532
+ * 将 tree-sitter 返回的偏移(可能是 UTF-8 字节或 UTF-16 字符域)
533
+ * 标准化为 UTF-16 字符域偏移。下游 String.prototype.slice 直接可用。
534
+ *
535
+ * 导出供 SemanticSplitter 在生成 ChunkMetadata 时统一域。
536
+ */
537
+ toCharOffset(offset) {
538
+ if (this.domain === "utf16" || this.domain === "unknown") return offset;
539
+ return this.byteToChar(offset);
540
+ }
541
+ /**
542
+ * 将字节偏移转换为字符偏移(仅 utf8 域有效;utf16/unknown 直接返回原值)
543
+ */
526
544
  byteToChar(byteOffset) {
527
545
  if (!this.byteToCharMap) return byteOffset;
528
546
  const safeOffset = Math.max(0, Math.min(this.byteToCharMap.length - 1, byteOffset));
@@ -915,11 +933,12 @@ ${displayCode}`,
915
933
  const vectorEnd = end;
916
934
  const displayCode = this.adapter.slice(start, end);
917
935
  const vectorCode = this.adapter.slice(vectorStart, vectorEnd);
936
+ const toChar = (n) => this.adapter.toCharOffset(n);
918
937
  const metadata = {
919
- startIndex: start,
920
- endIndex: end,
921
- rawSpan: { start: prevEnd, end: rawSpanEnd },
922
- vectorSpan: { start: vectorStart, end: vectorEnd },
938
+ startIndex: toChar(start),
939
+ endIndex: toChar(end),
940
+ rawSpan: { start: toChar(prevEnd), end: toChar(rawSpanEnd) },
941
+ vectorSpan: { start: toChar(vectorStart), end: toChar(vectorEnd) },
923
942
  filePath,
924
943
  language,
925
944
  contextPath: w.contextPath
@@ -1351,6 +1370,34 @@ async function scan(rootPath, options = {}) {
1351
1370
  }
1352
1371
  }
1353
1372
  options.onProgress?.(100, 100, "\u7D22\u5F15\u5B8C\u6210");
1373
+ if (options.vectorIndex !== false) {
1374
+ try {
1375
+ const embeddingConfig = getEmbeddingConfig();
1376
+ const indexer = await getIndexer(projectId, embeddingConfig.dimensions);
1377
+ const gcResult = await indexer.gc(db);
1378
+ if (gcResult.orphans > 0) {
1379
+ logger.info({ orphans: gcResult.orphans }, "GC \u5B8C\u6210");
1380
+ } else if (gcResult.truncated) {
1381
+ logger.debug("GC \u8D85\u65F6\u8DF3\u8FC7\uFF0C\u4E0B\u6B21\u626B\u63CF\u91CD\u8BD5");
1382
+ }
1383
+ } catch (err) {
1384
+ const error = err;
1385
+ logger.warn({ error: error.message }, "GC \u8DF3\u8FC7");
1386
+ }
1387
+ }
1388
+ const didWork = stats.added + stats.modified + stats.deleted > 0;
1389
+ if (didWork) {
1390
+ incrementIndexVersion(db);
1391
+ }
1392
+ try {
1393
+ if (didWork) {
1394
+ incrementStat(db, "stats.index.total_runs");
1395
+ }
1396
+ setStatJson(db, "stats.index.last_run_json", stats);
1397
+ setStatJson(db, "stats.index.last_run_at", Date.now());
1398
+ } catch (err) {
1399
+ logger.warn({ error: err.message }, "\u7D22\u5F15\u7EDF\u8BA1\u57CB\u70B9\u5931\u8D25");
1400
+ }
1354
1401
  invalidateAllExpanderCaches();
1355
1402
  return stats;
1356
1403
  } finally {
@@ -1361,6 +1408,7 @@ async function scan(rootPath, options = {}) {
1361
1408
  }
1362
1409
 
1363
1410
  export {
1411
+ initFilter,
1412
+ isFiltered,
1364
1413
  scan
1365
1414
  };
1366
- //# sourceMappingURL=chunk-NQR4CGQ6.js.map
@@ -1,7 +1,6 @@
1
1
  import {
2
2
  logger
3
- } from "./chunk-AMQQK4P7.js";
4
- import "./chunk-RJURH22T.js";
3
+ } from "./chunk-JVKVSTQ3.js";
5
4
 
6
5
  // src/utils/lock.ts
7
6
  import fs from "fs";
@@ -122,7 +121,7 @@ async function withLock(projectId, operation, fn, timeoutMs = 3e4) {
122
121
  releaseLock(projectId);
123
122
  }
124
123
  }
124
+
125
125
  export {
126
126
  withLock
127
127
  };
128
- //# sourceMappingURL=lock-DVY3KJSK.js.map