@chiway/contextweaver 1.1.0 → 1.5.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/README.md +138 -28
  2. package/dist/{SearchService-MYPOCM3B.js → SearchService-WVD6THR3.js} +170 -82
  3. package/dist/chunk-3BNHQV5W.js +373 -0
  4. package/dist/chunk-BFCIZ52F.js +102 -0
  5. package/dist/{chunk-NQR4CGQ6.js → chunk-GDVB6PJ4.js} +58 -10
  6. package/dist/{lock-DVY3KJSK.js → chunk-HHYPQA3X.js} +2 -3
  7. package/dist/chunk-ISVCQFB4.js +223 -0
  8. package/dist/chunk-IZ6IUHNN.js +77 -0
  9. package/dist/{chunk-AMQQK4P7.js → chunk-JVKVSTQ3.js} +1 -2
  10. package/dist/chunk-LB42CZEB.js +18 -0
  11. package/dist/{chunk-6Z4JEEVJ.js → chunk-PPLFJGO3.js} +303 -58
  12. package/dist/chunk-R6CNZXZ7.js +143 -0
  13. package/dist/{chunk-RJURH22T.js → chunk-SKBAE26T.js} +0 -1
  14. package/dist/chunk-TPM6YP43.js +38 -0
  15. package/dist/{chunk-7G5V7YT5.js → chunk-V3K4YVAR.js} +12 -120
  16. package/dist/chunk-VWBKZ6QL.js +115 -0
  17. package/dist/chunk-XFIM2T6S.js +57 -0
  18. package/dist/{chunk-6QMYML5V.js → chunk-XMZZZKG7.js} +361 -295
  19. package/dist/chunk-XTWNT7KP.js +156 -0
  20. package/dist/chunk-Y6H7C3NA.js +85 -0
  21. package/dist/codebaseRetrieval-DIS5RH2C.js +14 -0
  22. package/dist/{config-BWZ6CU3W.js → config-LCOJHTCF.js} +1 -2
  23. package/dist/db-GBCLP4GG.js +68 -0
  24. package/dist/findReferences-N7ML7TUP.js +16 -0
  25. package/dist/getSymbolDefinition-6KMY4H33.js +17 -0
  26. package/dist/index.js +271 -40
  27. package/dist/listFiles-4VT2TPJD.js +14 -0
  28. package/dist/loadConfig-XTVT2OWW.js +9 -0
  29. package/dist/lock-HNKQ6X5B.js +8 -0
  30. package/dist/scanner-QDFZJLP7.js +13 -0
  31. package/dist/server-UAI3U7AB.js +347 -0
  32. package/dist/stats-AGKUCJQI.js +12 -0
  33. package/dist/vectorStore-4ODCERRO.js +12 -0
  34. package/package.json +9 -23
  35. package/dist/codebaseRetrieval-NLAMGOA2.js +0 -12
  36. package/dist/scanner-RFG4YWYI.js +0 -11
  37. package/dist/server-27HI7WZO.js +0 -147
@@ -1,270 +1,151 @@
1
+ import {
2
+ commonPrefixLength
3
+ } from "./chunk-LB42CZEB.js";
4
+ import {
5
+ ChunkContentLoader
6
+ } from "./chunk-XFIM2T6S.js";
7
+ import {
8
+ getVectorStore,
9
+ sampleCheckDisplayCode
10
+ } from "./chunk-3BNHQV5W.js";
1
11
  import {
2
12
  batchDeleteFileChunksFts,
3
13
  batchUpdateVectorIndexHash,
4
14
  batchUpsertChunkFts,
15
+ clearAllVectorIndexHash,
5
16
  clearVectorIndexHash,
17
+ deletePendingMarks,
18
+ getLanceDbMigrationState,
6
19
  initDb,
7
- isChunksFtsInitialized
8
- } from "./chunk-6Z4JEEVJ.js";
20
+ insertPendingMarks,
21
+ isChunksFtsInitialized,
22
+ releaseLanceDbMigrationLock,
23
+ replayPendingMarks,
24
+ setLanceDbMigrationState,
25
+ tryAcquireLanceDbMigrationLock
26
+ } from "./chunk-PPLFJGO3.js";
9
27
  import {
10
28
  logger
11
- } from "./chunk-AMQQK4P7.js";
29
+ } from "./chunk-JVKVSTQ3.js";
12
30
  import {
13
31
  getEmbeddingConfig
14
- } from "./chunk-RJURH22T.js";
32
+ } from "./chunk-SKBAE26T.js";
15
33
 
16
- // src/vectorStore/index.ts
17
- import fs from "fs";
18
- import os from "os";
19
- import path from "path";
20
- import * as lancedb from "@lancedb/lancedb";
21
- var BASE_DIR = path.join(os.homedir(), ".contextweaver");
22
- var VectorStore = class {
23
- db = null;
24
- table = null;
25
- projectId;
26
- dbPath;
27
- vectorDim;
28
- constructor(projectId, vectorDim = 1024) {
29
- this.projectId = projectId;
30
- this.dbPath = path.join(BASE_DIR, projectId, "vectors.lance");
31
- this.vectorDim = vectorDim;
32
- }
33
- /**
34
- * 初始化连接
35
- */
36
- async init() {
37
- if (this.db) return;
38
- const projectDir = path.join(BASE_DIR, this.projectId);
39
- if (!fs.existsSync(projectDir)) {
40
- fs.mkdirSync(projectDir, { recursive: true });
41
- }
42
- this.db = await lancedb.connect(this.dbPath);
43
- const tableNames = await this.db.tableNames();
44
- if (tableNames.includes("chunks")) {
45
- this.table = await this.db.openTable("chunks");
34
+ // src/db/bootstrap.ts
35
+ async function bootstrap(db, vectorStore, options = {}) {
36
+ const result = {
37
+ replay: { applied: 0, discarded: 0 },
38
+ migration: { migrated: false, totalRows: 0 }
39
+ };
40
+ try {
41
+ result.replay = replayPendingMarks(db);
42
+ if (result.replay.applied > 0 || result.replay.discarded > 0) {
43
+ logger.info(result.replay, "pending_marks \u542F\u52A8\u91CD\u653E\uFF1A\u6807\u8BB0\u4E0A\u6B21\u672A\u6536\u655B\u7684\u7D22\u5F15\u72B6\u6001");
46
44
  }
45
+ } catch (err) {
46
+ const error = err;
47
+ logger.warn({ error: error.message }, "pending_marks \u91CD\u653E\u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
47
48
  }
48
- /**
49
- * 确保表存在(首次插入时调用)
50
- */
51
- async ensureTable(records) {
52
- if (this.table) return;
53
- if (!this.db) throw new Error("VectorStore not initialized");
54
- if (records.length === 0) return;
55
- this.table = await this.db.createTable(
56
- "chunks",
57
- records
58
- );
59
- }
60
- /**
61
- * 单调版本更新:先插入新版本,再删除旧版本
62
- *
63
- * 这保证了:
64
- * - 最坏情况(崩溃)是新旧版本共存(不缺失)
65
- * - 正常情况下旧版本被清理
66
- */
67
- async upsertFile(filePath, newHash, records) {
68
- if (!this.db) throw new Error("VectorStore not initialized");
69
- if (records.length === 0) {
70
- await this.deleteFile(filePath);
71
- return;
72
- }
73
- if (!this.table) {
74
- await this.ensureTable(records);
75
- } else {
76
- await this.table.add(records);
77
- }
78
- if (this.table) {
79
- await this.table.delete(
80
- `file_path = '${this.escapeString(filePath)}' AND file_hash != '${this.escapeString(newHash)}'`
49
+ try {
50
+ result.migration = await migrateRemoveDisplayCode(db, vectorStore, options);
51
+ if (result.migration.migrated) {
52
+ logger.info(
53
+ { totalRows: result.migration.totalRows, reason: result.migration.reason },
54
+ "LanceDB schema \u8FC1\u79FB\u5B8C\u6210\uFF1Achunks \u8868\u5DF2\u79FB\u9664 display_code/vector_text"
55
+ );
56
+ } else if (result.migration.reason?.startsWith("mismatch_ratio_")) {
57
+ logger.error(
58
+ { reason: result.migration.reason, mismatched: result.migration.mismatched },
59
+ "LanceDB schema \u8FC1\u79FB\u4E2D\u6B62\uFF1Adisplay_code \u4E0E files.content \u62BD\u6837\u5DEE\u5F02\u8FC7\u5927\uFF0C\u8BF7\u68C0\u67E5\u7D22\u5F15\u4E00\u81F4\u6027\u6216\u8FD0\u884C `contextweaver migrate --reset`"
81
60
  );
82
61
  }
62
+ } catch (err) {
63
+ const error = err;
64
+ logger.warn({ error: error.message }, "LanceDB schema \u8FC1\u79FB\u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
83
65
  }
84
- /**
85
- * 批量 upsert 多个文件(性能优化版,带分批机制)
86
- *
87
- * 流程:
88
- * 1. 将文件分成小批次(每批最多 BATCH_FILES 个文件或 BATCH_RECORDS 条记录)
89
- * 2. 每批执行:插入新 records 删除旧版本
90
- *
91
- * 分批是必要的,因为 LanceDB native 模块在处理超大数据时可能崩溃
92
- *
93
- * @param files 文件列表,每个包含 path、hash 和 records
94
- */
95
- async batchUpsertFiles(files) {
96
- if (!this.db) throw new Error("VectorStore not initialized");
97
- if (files.length === 0) return;
98
- const BATCH_FILES = 50;
99
- const BATCH_RECORDS = 5e3;
100
- const batches = [];
101
- let currentBatch = [];
102
- let currentRecordCount = 0;
103
- for (const file of files) {
104
- if (currentBatch.length >= BATCH_FILES || currentRecordCount + file.records.length > BATCH_RECORDS) {
105
- if (currentBatch.length > 0) {
106
- batches.push(currentBatch);
107
- }
108
- currentBatch = [];
109
- currentRecordCount = 0;
110
- }
111
- currentBatch.push(file);
112
- currentRecordCount += file.records.length;
113
- }
114
- if (currentBatch.length > 0) {
115
- batches.push(currentBatch);
116
- }
117
- for (const batch of batches) {
118
- const batchRecords = [];
119
- for (const file of batch) {
120
- batchRecords.push(...file.records);
121
- }
122
- if (batchRecords.length === 0) {
123
- const pathsToDelete = batch.map((f) => f.path);
124
- await this.deleteFiles(pathsToDelete);
125
- continue;
126
- }
127
- if (!this.table) {
128
- await this.ensureTable(batchRecords);
129
- } else {
130
- await this.table.add(batchRecords);
131
- }
132
- if (this.table && batch.length > 0) {
133
- const deleteConditions = batch.map(
134
- (f) => `(file_path = '${this.escapeString(f.path)}' AND file_hash != '${this.escapeString(f.hash)}')`
135
- ).join(" OR ");
136
- await this.table.delete(deleteConditions);
137
- }
138
- }
139
- }
140
- /**
141
- * 删除文件的所有 chunks
142
- */
143
- async deleteFile(filePath) {
144
- if (!this.table) return;
145
- await this.table.delete(`file_path = '${this.escapeString(filePath)}'`);
146
- }
147
- /**
148
- * 批量删除文件(性能优化:单次 DELETE 替代 N 次循环)
149
- * 当文件数超过 500 时分批处理,防止 LanceDB filter 字符串过长
150
- */
151
- async deleteFiles(filePaths) {
152
- if (!this.table || filePaths.length === 0) return;
153
- const BATCH_SIZE = 500;
154
- if (filePaths.length <= BATCH_SIZE) {
155
- const conditions = filePaths.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
156
- await this.table.delete(conditions);
157
- } else {
158
- for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
159
- const batch = filePaths.slice(i, i + BATCH_SIZE);
160
- const conditions = batch.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
161
- await this.table.delete(conditions);
66
+ return result;
67
+ }
68
+ async function migrateRemoveDisplayCode(db, vectorStore, options = {}) {
69
+ const earlyState = getLanceDbMigrationState(db);
70
+ if (earlyState === "done") {
71
+ return { migrated: false, totalRows: 0, reason: "already_migrated_persisted" };
72
+ }
73
+ if (earlyState === "aborted") {
74
+ return { migrated: false, totalRows: 0, reason: "aborted_awaiting_manual" };
75
+ }
76
+ if (!tryAcquireLanceDbMigrationLock(db)) {
77
+ return { migrated: false, totalRows: 0, reason: "lock_held_by_other_process" };
78
+ }
79
+ try {
80
+ const persistedState = getLanceDbMigrationState(db);
81
+ if (persistedState === "done") {
82
+ return { migrated: false, totalRows: 0, reason: "already_migrated_persisted" };
83
+ }
84
+ if (persistedState === "aborted") {
85
+ return { migrated: false, totalRows: 0, reason: "aborted_awaiting_manual" };
86
+ }
87
+ const hasCol = await vectorStore.hasDisplayCodeColumn();
88
+ if (persistedState === "pending") {
89
+ if (hasCol === null) {
90
+ setLanceDbMigrationState(db, "done");
91
+ return { migrated: true, totalRows: 0, reason: "recovered_pending_no_table" };
162
92
  }
163
93
  }
164
- }
165
- /**
166
- * 向量搜索
167
- */
168
- async search(queryVector, limit = 10, filter) {
169
- if (!this.table) return [];
170
- let query = this.table.vectorSearch(queryVector).limit(limit);
171
- if (filter) {
172
- query = query.where(filter);
173
- }
174
- const results = await query.toArray();
175
- return results;
176
- }
177
- /**
178
- * 获取文件的所有 chunks(按 chunk_index 排序)
179
- */
180
- async getFileChunks(filePath) {
181
- if (!this.table) return [];
182
- const results = await this.table.query().where(`file_path = '${this.escapeString(filePath)}'`).toArray();
183
- const chunks = results;
184
- return chunks.sort((a, b) => a.chunk_index - b.chunk_index);
185
- }
186
- /**
187
- * 批量获取多个文件的 chunks(性能优化:单次查询替代 N 次循环)
188
- * 当文件数超过 500 时分批处理,防止 LanceDB filter 字符串过长
189
- *
190
- * 适用于 GraphExpander 扩展、词法召回等需要批量获取的场景
191
- * @returns Map<filePath, ChunkRecord[]>,每个文件的 chunks 已按 chunk_index 排序
192
- */
193
- async getFilesChunks(filePaths) {
194
- const result = /* @__PURE__ */ new Map();
195
- if (!this.table || filePaths.length === 0) return result;
196
- const BATCH_SIZE = 500;
197
- for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
198
- const batch = filePaths.slice(i, i + BATCH_SIZE);
199
- const conditions = batch.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
200
- const rows = await this.table.query().where(conditions).toArray();
201
- for (const row of rows) {
202
- let arr = result.get(row.file_path);
203
- if (!arr) {
204
- arr = [];
205
- result.set(row.file_path, arr);
206
- }
207
- arr.push(row);
94
+ if (hasCol === null) {
95
+ setLanceDbMigrationState(db, "done");
96
+ return { migrated: false, totalRows: 0, reason: "empty" };
97
+ }
98
+ if (!hasCol) {
99
+ setLanceDbMigrationState(db, "done");
100
+ return { migrated: false, totalRows: 0, reason: "already_migrated" };
101
+ }
102
+ const sampleSize = options.sampleSize ?? 100;
103
+ const maxMismatchRatio = options.sampleMaxMismatchRatio ?? 0.01;
104
+ const oldRows = await vectorStore.readAllRowsRaw();
105
+ const totalRows = oldRows.length;
106
+ if (totalRows > 0) {
107
+ const stmt = db.prepare("SELECT content FROM files WHERE path = ?");
108
+ const getContent = (path) => {
109
+ const row = stmt.get(path);
110
+ return row?.content ?? null;
111
+ };
112
+ const check = sampleCheckDisplayCode(oldRows, getContent, {
113
+ sampleSize,
114
+ maxMismatchRatio
115
+ });
116
+ if (check.abort) {
117
+ setLanceDbMigrationState(db, "aborted");
118
+ return {
119
+ migrated: false,
120
+ totalRows,
121
+ mismatched: check.mismatched,
122
+ reason: `mismatch_ratio_${check.ratio.toFixed(3)}_exceeds_${maxMismatchRatio}`
123
+ };
208
124
  }
209
125
  }
210
- for (const arr of result.values()) {
211
- arr.sort((a, b) => a.chunk_index - b.chunk_index);
212
- }
213
- return result;
214
- }
215
- /**
216
- * 获取表的总记录数
217
- */
218
- async count() {
219
- if (!this.table) return 0;
220
- return await this.table.countRows();
221
- }
222
- /**
223
- * 清空所有数据
224
- */
225
- async clear() {
226
- if (!this.db) return;
227
- try {
228
- await this.db.dropTable("chunks");
229
- this.table = null;
230
- } catch {
231
- }
232
- }
233
- /**
234
- * 获取向量维度
235
- */
236
- getVectorDim() {
237
- return this.vectorDim;
238
- }
239
- /**
240
- * 转义字符串(防止 SQL 注入)
241
- */
242
- escapeString(str) {
243
- return str.replace(/'/g, "''");
244
- }
245
- /**
246
- * 关闭连接
247
- */
248
- async close() {
249
- this.db = null;
250
- this.table = null;
251
- }
252
- };
253
- var vectorStores = /* @__PURE__ */ new Map();
254
- async function getVectorStore(projectId, vectorDim = 1024) {
255
- let store = vectorStores.get(projectId);
256
- if (!store) {
257
- store = new VectorStore(projectId, vectorDim);
258
- await store.init();
259
- vectorStores.set(projectId, store);
260
- }
261
- return store;
262
- }
263
- async function closeAllVectorStores() {
264
- for (const store of vectorStores.values()) {
265
- await store.close();
126
+ const newRows = oldRows.map((r) => ({
127
+ chunk_id: r.chunk_id,
128
+ file_path: r.file_path,
129
+ file_hash: r.file_hash,
130
+ chunk_index: r.chunk_index,
131
+ vector: Array.from(r.vector),
132
+ language: r.language,
133
+ breadcrumb: r.breadcrumb,
134
+ start_index: r.start_index,
135
+ end_index: r.end_index,
136
+ raw_start: r.raw_start,
137
+ raw_end: r.raw_end,
138
+ vec_start: r.vec_start,
139
+ vec_end: r.vec_end
140
+ }));
141
+ const cleared = clearAllVectorIndexHash(db);
142
+ setLanceDbMigrationState(db, "pending");
143
+ await vectorStore.dropAndRecreateChunks(newRows);
144
+ setLanceDbMigrationState(db, "done");
145
+ return { migrated: true, totalRows, reason: `cleared_${cleared}_vector_index_hash` };
146
+ } finally {
147
+ releaseLanceDbMigrationLock(db);
266
148
  }
267
- vectorStores.clear();
268
149
  }
269
150
 
270
151
  // src/api/embedding.ts
@@ -634,6 +515,8 @@ var Indexer = class {
634
515
  vectorStore = null;
635
516
  embeddingClient;
636
517
  vectorDim;
518
+ /** bootstrap(pending_marks 重放 + LanceDB 迁移)只在每个 db 上执行一次 */
519
+ bootstrappedDbs = /* @__PURE__ */ new WeakSet();
637
520
  constructor(projectId, vectorDim = 1024) {
638
521
  this.projectId = projectId;
639
522
  this.vectorDim = vectorDim;
@@ -656,6 +539,31 @@ var Indexer = class {
656
539
  if (!this.vectorStore) {
657
540
  await this.init();
658
541
  }
542
+ if (!this.bootstrappedDbs.has(db)) {
543
+ this.bootstrappedDbs.add(db);
544
+ try {
545
+ await bootstrap(db, this.vectorStore);
546
+ } catch (err) {
547
+ const error = err;
548
+ logger.warn({ error: error.message }, "bootstrap \u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
549
+ }
550
+ }
551
+ const migrationState = getLanceDbMigrationState(db);
552
+ if (migrationState === "aborted") {
553
+ const errorCount = results.filter(
554
+ (r) => r.status === "added" || r.status === "modified"
555
+ ).length;
556
+ logger.error(
557
+ { migrationState, blockedFiles: errorCount },
558
+ "LanceDB \u5904\u4E8E aborted \u72B6\u6001\uFF0C\u62D2\u7EDD\u5199\u5165\u4EE5\u9632\u6B62 schema \u6C61\u67D3\u3002\u8FD0\u884C `contextweaver migrate --reset` \u6E05\u7A7A LanceDB \u5E76\u91CD\u65B0\u7D22\u5F15\u3002"
559
+ );
560
+ return {
561
+ indexed: 0,
562
+ deleted: 0,
563
+ errors: errorCount,
564
+ skipped: results.length - errorCount
565
+ };
566
+ }
659
567
  const stats = {
660
568
  indexed: 0,
661
569
  deleted: 0,
@@ -699,8 +607,14 @@ var Indexer = class {
699
607
  }
700
608
  }
701
609
  if (toDelete.length > 0) {
702
- await this.deleteFiles(db, toDelete);
703
- stats.deleted = toDelete.length;
610
+ try {
611
+ await this.deleteFiles(db, toDelete);
612
+ stats.deleted = toDelete.length;
613
+ } catch (err) {
614
+ const error = err;
615
+ logger.error({ error: error.message, count: toDelete.length }, "\u5220\u9664\u9636\u6BB5\u5931\u8D25\uFF0C\u5DF2\u6807\u8BB0\u91CD\u8BD5");
616
+ stats.errors += toDelete.length;
617
+ }
704
618
  }
705
619
  if (noChunkSettled.length > 0) {
706
620
  batchUpdateVectorIndexHash(db, noChunkSettled);
@@ -767,16 +681,27 @@ var Indexer = class {
767
681
  continue;
768
682
  }
769
683
  logger.info(
770
- { batch: `${batchNum}/${totalBatches}`, texts: batchTexts.length, files: batchFiles.length },
684
+ {
685
+ batch: `${batchNum}/${totalBatches}`,
686
+ texts: batchTexts.length,
687
+ files: batchFiles.length
688
+ },
771
689
  "\u6279\u6B21 Embedding \u5F00\u59CB"
772
690
  );
773
691
  let embeddings;
774
692
  const EMBED_BATCH_SIZE = 10;
775
693
  try {
776
694
  const batchOnProgress = onProgress ? (_completed, _total) => {
777
- onProgress(completedChunks + Math.min(_completed * EMBED_BATCH_SIZE, batchTexts.length), totalChunks);
695
+ onProgress(
696
+ completedChunks + Math.min(_completed * EMBED_BATCH_SIZE, batchTexts.length),
697
+ totalChunks
698
+ );
778
699
  } : void 0;
779
- const results = await this.embeddingClient.embedBatch(batchTexts, EMBED_BATCH_SIZE, batchOnProgress);
700
+ const results = await this.embeddingClient.embedBatch(
701
+ batchTexts,
702
+ EMBED_BATCH_SIZE,
703
+ batchOnProgress
704
+ );
780
705
  embeddings = results.map((r) => r.embedding);
781
706
  } catch (err) {
782
707
  const error = err;
@@ -812,8 +737,6 @@ var Indexer = class {
812
737
  file_hash: file.hash,
813
738
  chunk_index: chunkIdx,
814
739
  vector: embeddings[embIdx],
815
- display_code: chunk.displayCode,
816
- vector_text: chunk.vectorText,
817
740
  language: chunk.metadata.language,
818
741
  breadcrumb: chunk.metadata.contextPath.join(" > "),
819
742
  start_index: chunk.metadata.startIndex,
@@ -830,7 +753,7 @@ var Indexer = class {
830
753
  chunkIndex: record.chunk_index,
831
754
  breadcrumb: record.breadcrumb,
832
755
  content: `${record.breadcrumb}
833
- ${record.display_code}`
756
+ ${chunk.displayCode}`
834
757
  });
835
758
  }
836
759
  filesToUpsert.push({ path: file.path, hash: file.hash, records });
@@ -858,19 +781,64 @@ ${record.display_code}`
858
781
  completedChunks += batchTexts.length;
859
782
  continue;
860
783
  }
861
- }
862
- if (isChunksFtsInitialized(db) && ftsChunks.length > 0) {
863
- try {
864
- const pathsToDelete = filesToUpsert.map((f) => f.path);
865
- batchDeleteFileChunksFts(db, pathsToDelete);
866
- batchUpsertChunkFts(db, ftsChunks);
867
- } catch (err) {
868
- const error = err;
869
- logger.warn({ error: error.message }, "FTS \u6279\u91CF\u66F4\u65B0\u5931\u8D25\uFF08\u5411\u91CF\u7D22\u5F15\u5DF2\u6210\u529F\uFF09");
784
+ if (isChunksFtsInitialized(db) && ftsChunks.length > 0) {
785
+ try {
786
+ const ftsAndOutboxTx = db.transaction(() => {
787
+ batchUpsertChunkFts(db, ftsChunks);
788
+ insertPendingMarks(db, successFiles);
789
+ });
790
+ ftsAndOutboxTx();
791
+ } catch (err) {
792
+ const error = err;
793
+ logger.error(
794
+ { error: error.message, stack: error.stack, batch: `${batchNum}/${totalBatches}` },
795
+ "FTS/outbox \u5199\u5165\u5931\u8D25\uFF0C\u56DE\u6EDA LanceDB \u65B0\u7248\u672C"
796
+ );
797
+ try {
798
+ await this.vectorStore?.deleteFilesByHash(
799
+ filesToUpsert.map((f) => ({ path: f.path, hash: f.hash }))
800
+ );
801
+ } catch (rollbackErr) {
802
+ const rbError = rollbackErr;
803
+ logger.error(
804
+ { error: rbError.message },
805
+ "LanceDB \u56DE\u6EDA\u5931\u8D25\uFF0C\u5B64\u513F\u6570\u636E\u5C06\u7531\u4E0B\u6B21 GC \u6E05\u7406"
806
+ );
807
+ }
808
+ clearVectorIndexHash(
809
+ db,
810
+ batchFiles.map((f) => f.path)
811
+ );
812
+ totalErrors += batchFiles.length;
813
+ completedChunks += batchTexts.length;
814
+ continue;
815
+ }
816
+ } else if (successFiles.length > 0) {
817
+ try {
818
+ insertPendingMarks(db, successFiles);
819
+ } catch (err) {
820
+ const error = err;
821
+ logger.warn({ error: error.message }, "outbox \u5199\u5165\u5931\u8D25\uFF08\u65E0 FTS \u8DEF\u5F84\uFF09\uFF0C\u7EE7\u7EED stage6");
822
+ }
823
+ }
824
+ if (successFiles.length > 0) {
825
+ try {
826
+ const markTx = db.transaction(() => {
827
+ batchUpdateVectorIndexHash(db, successFiles);
828
+ deletePendingMarks(
829
+ db,
830
+ successFiles.map((f) => f.path)
831
+ );
832
+ });
833
+ markTx();
834
+ } catch (err) {
835
+ const error = err;
836
+ logger.warn(
837
+ { error: error.message, batch: `${batchNum}/${totalBatches}` },
838
+ "stage6 mark \u5931\u8D25\uFF0Coutbox \u5DF2\u4FDD\u7559\uFF0C\u4E0B\u6B21\u542F\u52A8\u5C06\u91CD\u653E"
839
+ );
840
+ }
870
841
  }
871
- }
872
- if (successFiles.length > 0) {
873
- batchUpdateVectorIndexHash(db, successFiles);
874
842
  }
875
843
  totalSuccess += successFiles.length;
876
844
  totalErrors += errorFiles.length;
@@ -889,12 +857,29 @@ ${record.display_code}`
889
857
  }
890
858
  /**
891
859
  * 删除文件的向量和 FTS 索引
860
+ *
861
+ * 顺序:先删 FTS(SQLite 事务,可靠)→ 再删 LanceDB(可能失败)
862
+ * 任一阶段失败均通过 clearVectorIndexHash 触发下次扫描自愈
892
863
  */
893
864
  async deleteFiles(db, paths) {
894
- if (!this.vectorStore) return;
895
- await this.vectorStore.deleteFiles(paths);
865
+ if (!this.vectorStore || paths.length === 0) return;
896
866
  if (isChunksFtsInitialized(db)) {
897
- batchDeleteFileChunksFts(db, paths);
867
+ try {
868
+ batchDeleteFileChunksFts(db, paths);
869
+ } catch (err) {
870
+ const error = err;
871
+ logger.error({ error: error.message, paths }, "FTS \u5220\u9664\u5931\u8D25");
872
+ clearVectorIndexHash(db, paths);
873
+ throw err;
874
+ }
875
+ }
876
+ try {
877
+ await this.vectorStore.deleteFiles(paths);
878
+ } catch (err) {
879
+ const error = err;
880
+ logger.error({ error: error.message, paths }, "LanceDB \u5220\u9664\u5931\u8D25\uFF0C\u5B64\u513F\u6570\u636E\u5C06\u7531 GC \u6E05\u7406");
881
+ clearVectorIndexHash(db, paths);
882
+ throw err;
898
883
  }
899
884
  logger.debug({ count: paths.length }, "\u5220\u9664\u6587\u4EF6\u7D22\u5F15");
900
885
  }
@@ -923,6 +908,67 @@ ${record.display_code}`
923
908
  }
924
909
  await this.vectorStore?.clear();
925
910
  }
911
+ /**
912
+ * 垃圾回收:清理 LanceDB 中的孤儿 chunks
913
+ *
914
+ * 孤儿来源:
915
+ * - 事务补偿失败遗留(FTS 回滚成功但 LanceDB 删除失败)
916
+ * - 跨进程崩溃导致的 hash 不匹配残留
917
+ * - 删除流程失败遗留
918
+ *
919
+ * 算法:以 SQLite files 表 (path, hash) 为权威源,删除 LanceDB 中不存在的组合。
920
+ * 同步清理 chunks_fts:仅当 path 在 SQLite 完全不存在时才删(hash 变化的 FTS 由 upsert 覆盖)。
921
+ *
922
+ * 性能护栏:time budget 默认 5s,超时则跳过避免阻塞扫描主流程。
923
+ */
924
+ async gc(db, options = {}) {
925
+ if (!this.vectorStore) {
926
+ await this.init();
927
+ }
928
+ const startTime = Date.now();
929
+ const timeBudget = options.maxScanMs ?? 5e3;
930
+ let vectorPairs;
931
+ try {
932
+ vectorPairs = await this.vectorStore?.listFileHashes() ?? [];
933
+ } catch (err) {
934
+ const error = err;
935
+ logger.warn({ error: error.message }, "GC: listFileHashes \u5931\u8D25\uFF0C\u8DF3\u8FC7");
936
+ return { orphans: 0 };
937
+ }
938
+ if (vectorPairs.length === 0) return { orphans: 0 };
939
+ if (Date.now() - startTime > timeBudget) {
940
+ logger.warn(
941
+ { elapsed: Date.now() - startTime, budget: timeBudget },
942
+ "GC \u8D85\u65F6\uFF08\u62C9\u53D6\u9636\u6BB5\uFF09\uFF0C\u672C\u6B21\u8DF3\u8FC7"
943
+ );
944
+ return { orphans: 0, truncated: true };
945
+ }
946
+ const sqliteRows = db.prepare("SELECT path, hash FROM files").all();
947
+ const validPairs = new Set(sqliteRows.map((r) => `${r.path} ${r.hash}`));
948
+ const sqlitePaths = new Set(sqliteRows.map((r) => r.path));
949
+ const orphans = vectorPairs.filter((p) => !validPairs.has(`${p.path} ${p.hash}`));
950
+ if (orphans.length === 0) return { orphans: 0 };
951
+ logger.info({ count: orphans.length }, "GC: \u53D1\u73B0\u5B64\u513F chunks");
952
+ try {
953
+ await this.vectorStore?.deleteFilesByHash(orphans);
954
+ } catch (err) {
955
+ const error = err;
956
+ logger.warn({ error: error.message }, "GC: LanceDB \u5220\u9664\u5931\u8D25\uFF0C\u4E0B\u6B21\u91CD\u8BD5");
957
+ return { orphans: 0 };
958
+ }
959
+ const pathsToFtsClean = Array.from(new Set(orphans.map((o) => o.path))).filter(
960
+ (p) => !sqlitePaths.has(p)
961
+ );
962
+ if (pathsToFtsClean.length > 0 && isChunksFtsInitialized(db)) {
963
+ try {
964
+ batchDeleteFileChunksFts(db, pathsToFtsClean);
965
+ } catch (err) {
966
+ const error = err;
967
+ logger.warn({ error: error.message }, "GC: chunks_fts \u6E05\u7406\u5931\u8D25");
968
+ }
969
+ }
970
+ return { orphans: orphans.length };
971
+ }
926
972
  /**
927
973
  * 获取索引统计
928
974
  */
@@ -959,8 +1005,8 @@ function getTokenBoundaryRegex(token) {
959
1005
  }
960
1006
  return regex;
961
1007
  }
962
- function scoreChunkTokenOverlap(chunk, queryTokens) {
963
- const text = `${chunk.breadcrumb} ${chunk.display_code}`.toLowerCase();
1008
+ function scoreChunkTokenOverlap(chunk, code, queryTokens) {
1009
+ const text = `${chunk.breadcrumb} ${code}`.toLowerCase();
964
1010
  let score = 0;
965
1011
  for (const token of queryTokens) {
966
1012
  if (text.includes(token)) {
@@ -975,21 +1021,6 @@ function scoreChunkTokenOverlap(chunk, queryTokens) {
975
1021
  return score;
976
1022
  }
977
1023
 
978
- // src/search/resolvers/types.ts
979
- function commonPrefixLength(path1, path2) {
980
- const parts1 = path1.split("/");
981
- const parts2 = path2.split("/");
982
- let count = 0;
983
- for (let i = 0; i < Math.min(parts1.length, parts2.length); i++) {
984
- if (parts1[i] === parts2[i]) {
985
- count++;
986
- } else {
987
- break;
988
- }
989
- }
990
- return count;
991
- }
992
-
993
1024
  // src/search/resolvers/CppResolver.ts
994
1025
  var CPP_EXTENSIONS = /* @__PURE__ */ new Set([".c", ".cpp", ".cc", ".cxx", ".h", ".hpp", ".hh", ".hxx"]);
995
1026
  var CppResolver = class {
@@ -1643,6 +1674,20 @@ var GraphExpander = class {
1643
1674
  if (allTargetPaths.size === 0) return result;
1644
1675
  const importChunksMap = await this.vectorStore?.getFilesChunks(Array.from(allTargetPaths));
1645
1676
  if (!importChunksMap) return result;
1677
+ const sharedLoader = new ChunkContentLoader(this.db);
1678
+ const allSlices = [];
1679
+ if (queryTokens && queryTokens.size > 0) {
1680
+ for (const chunks of importChunksMap.values()) {
1681
+ for (const c of chunks) {
1682
+ allSlices.push({
1683
+ filePath: c.file_path,
1684
+ start_index: c.start_index,
1685
+ end_index: c.end_index
1686
+ });
1687
+ }
1688
+ }
1689
+ }
1690
+ const sharedCodeMap = sharedLoader.loadMany(allSlices);
1646
1691
  const bestByKey = /* @__PURE__ */ new Map();
1647
1692
  for (const { targetPath, depth, seedScore } of resolvedImports) {
1648
1693
  const importChunks = importChunksMap.get(targetPath);
@@ -1650,7 +1695,8 @@ var GraphExpander = class {
1650
1695
  const selectedChunks = this.selectImportChunks(
1651
1696
  importChunks,
1652
1697
  chunksPerImportFile,
1653
- queryTokens
1698
+ queryTokens,
1699
+ sharedCodeMap
1654
1700
  );
1655
1701
  const depthDecay = depth === 0 ? 1 : decayDepth;
1656
1702
  for (const chunk of selectedChunks) {
@@ -1709,16 +1755,38 @@ var GraphExpander = class {
1709
1755
  /**
1710
1756
  * 选择导入文件的 chunks(优先 query overlap)
1711
1757
  */
1712
- selectImportChunks(chunks, limit, queryTokens) {
1758
+ selectImportChunks(chunks, limit, queryTokens, sharedCodeMap) {
1713
1759
  if (limit <= 0) return [];
1714
1760
  const sortedByIndex = chunks.slice().sort((a, b) => a.chunk_index - b.chunk_index);
1715
1761
  if (!queryTokens || queryTokens.size === 0) {
1716
1762
  return sortedByIndex.slice(0, limit);
1717
1763
  }
1718
- const scored = sortedByIndex.map((chunk) => ({
1719
- chunk,
1720
- score: scoreChunkTokenOverlap(chunk, queryTokens)
1721
- }));
1764
+ let codeMap;
1765
+ if (sharedCodeMap) {
1766
+ codeMap = sharedCodeMap;
1767
+ } else {
1768
+ const loader = new ChunkContentLoader(this.db);
1769
+ codeMap = loader.loadMany(
1770
+ sortedByIndex.map((c) => ({
1771
+ filePath: c.file_path,
1772
+ start_index: c.start_index,
1773
+ end_index: c.end_index
1774
+ }))
1775
+ );
1776
+ }
1777
+ const scored = sortedByIndex.map((chunk) => {
1778
+ const code = codeMap.get(
1779
+ ChunkContentLoader.key({
1780
+ filePath: chunk.file_path,
1781
+ start_index: chunk.start_index,
1782
+ end_index: chunk.end_index
1783
+ })
1784
+ ) ?? "";
1785
+ return {
1786
+ chunk,
1787
+ score: scoreChunkTokenOverlap(chunk, code, queryTokens)
1788
+ };
1789
+ });
1722
1790
  const overlapped = scored.filter((s) => s.score > 0).sort((a, b) => b.score - a.score).slice(0, limit).map((s) => s.chunk);
1723
1791
  return overlapped.length > 0 ? overlapped : sortedByIndex.slice(0, limit);
1724
1792
  }
@@ -1749,12 +1817,10 @@ async function getGraphExpander(projectId, config) {
1749
1817
  }
1750
1818
 
1751
1819
  export {
1752
- getVectorStore,
1753
- closeAllVectorStores,
1820
+ bootstrap,
1754
1821
  getIndexer,
1755
1822
  closeAllIndexers,
1756
1823
  scoreChunkTokenOverlap,
1757
1824
  invalidateAllExpanderCaches,
1758
1825
  getGraphExpander
1759
1826
  };
1760
- //# sourceMappingURL=chunk-6QMYML5V.js.map