@chiway/contextweaver 1.1.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,270 +1,145 @@
1
+ import {
2
+ getVectorStore,
3
+ sampleCheckDisplayCode
4
+ } from "./chunk-ZOMGPIU6.js";
1
5
  import {
2
6
  batchDeleteFileChunksFts,
3
7
  batchUpdateVectorIndexHash,
4
8
  batchUpsertChunkFts,
9
+ clearAllVectorIndexHash,
5
10
  clearVectorIndexHash,
11
+ deletePendingMarks,
12
+ getLanceDbMigrationState,
6
13
  initDb,
7
- isChunksFtsInitialized
8
- } from "./chunk-6Z4JEEVJ.js";
14
+ insertPendingMarks,
15
+ isChunksFtsInitialized,
16
+ releaseLanceDbMigrationLock,
17
+ replayPendingMarks,
18
+ setLanceDbMigrationState,
19
+ tryAcquireLanceDbMigrationLock
20
+ } from "./chunk-RGJSXUFS.js";
9
21
  import {
10
22
  logger
11
- } from "./chunk-AMQQK4P7.js";
23
+ } from "./chunk-JVKVSTQ3.js";
12
24
  import {
13
25
  getEmbeddingConfig
14
- } from "./chunk-RJURH22T.js";
26
+ } from "./chunk-SKBAE26T.js";
15
27
 
16
- // src/vectorStore/index.ts
17
- import fs from "fs";
18
- import os from "os";
19
- import path from "path";
20
- import * as lancedb from "@lancedb/lancedb";
21
- var BASE_DIR = path.join(os.homedir(), ".contextweaver");
22
- var VectorStore = class {
23
- db = null;
24
- table = null;
25
- projectId;
26
- dbPath;
27
- vectorDim;
28
- constructor(projectId, vectorDim = 1024) {
29
- this.projectId = projectId;
30
- this.dbPath = path.join(BASE_DIR, projectId, "vectors.lance");
31
- this.vectorDim = vectorDim;
32
- }
33
- /**
34
- * 初始化连接
35
- */
36
- async init() {
37
- if (this.db) return;
38
- const projectDir = path.join(BASE_DIR, this.projectId);
39
- if (!fs.existsSync(projectDir)) {
40
- fs.mkdirSync(projectDir, { recursive: true });
41
- }
42
- this.db = await lancedb.connect(this.dbPath);
43
- const tableNames = await this.db.tableNames();
44
- if (tableNames.includes("chunks")) {
45
- this.table = await this.db.openTable("chunks");
28
+ // src/db/bootstrap.ts
29
+ async function bootstrap(db, vectorStore, options = {}) {
30
+ const result = {
31
+ replay: { applied: 0, discarded: 0 },
32
+ migration: { migrated: false, totalRows: 0 }
33
+ };
34
+ try {
35
+ result.replay = replayPendingMarks(db);
36
+ if (result.replay.applied > 0 || result.replay.discarded > 0) {
37
+ logger.info(result.replay, "pending_marks \u542F\u52A8\u91CD\u653E\uFF1A\u6807\u8BB0\u4E0A\u6B21\u672A\u6536\u655B\u7684\u7D22\u5F15\u72B6\u6001");
46
38
  }
39
+ } catch (err) {
40
+ const error = err;
41
+ logger.warn({ error: error.message }, "pending_marks \u91CD\u653E\u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
47
42
  }
48
- /**
49
- * 确保表存在(首次插入时调用)
50
- */
51
- async ensureTable(records) {
52
- if (this.table) return;
53
- if (!this.db) throw new Error("VectorStore not initialized");
54
- if (records.length === 0) return;
55
- this.table = await this.db.createTable(
56
- "chunks",
57
- records
58
- );
59
- }
60
- /**
61
- * 单调版本更新:先插入新版本,再删除旧版本
62
- *
63
- * 这保证了:
64
- * - 最坏情况(崩溃)是新旧版本共存(不缺失)
65
- * - 正常情况下旧版本被清理
66
- */
67
- async upsertFile(filePath, newHash, records) {
68
- if (!this.db) throw new Error("VectorStore not initialized");
69
- if (records.length === 0) {
70
- await this.deleteFile(filePath);
71
- return;
72
- }
73
- if (!this.table) {
74
- await this.ensureTable(records);
75
- } else {
76
- await this.table.add(records);
77
- }
78
- if (this.table) {
79
- await this.table.delete(
80
- `file_path = '${this.escapeString(filePath)}' AND file_hash != '${this.escapeString(newHash)}'`
43
+ try {
44
+ result.migration = await migrateRemoveDisplayCode(db, vectorStore, options);
45
+ if (result.migration.migrated) {
46
+ logger.info(
47
+ { totalRows: result.migration.totalRows, reason: result.migration.reason },
48
+ "LanceDB schema \u8FC1\u79FB\u5B8C\u6210\uFF1Achunks \u8868\u5DF2\u79FB\u9664 display_code/vector_text"
49
+ );
50
+ } else if (result.migration.reason?.startsWith("mismatch_ratio_")) {
51
+ logger.error(
52
+ { reason: result.migration.reason, mismatched: result.migration.mismatched },
53
+ "LanceDB schema \u8FC1\u79FB\u4E2D\u6B62\uFF1Adisplay_code \u4E0E files.content \u62BD\u6837\u5DEE\u5F02\u8FC7\u5927\uFF0C\u8BF7\u68C0\u67E5\u7D22\u5F15\u4E00\u81F4\u6027\u6216\u8FD0\u884C `contextweaver migrate --reset`"
81
54
  );
82
55
  }
56
+ } catch (err) {
57
+ const error = err;
58
+ logger.warn({ error: error.message }, "LanceDB schema \u8FC1\u79FB\u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
83
59
  }
84
- /**
85
- * 批量 upsert 多个文件(性能优化版,带分批机制)
86
- *
87
- * 流程:
88
- * 1. 将文件分成小批次(每批最多 BATCH_FILES 个文件或 BATCH_RECORDS 条记录)
89
- * 2. 每批执行:插入新 records 删除旧版本
90
- *
91
- * 分批是必要的,因为 LanceDB native 模块在处理超大数据时可能崩溃
92
- *
93
- * @param files 文件列表,每个包含 path、hash 和 records
94
- */
95
- async batchUpsertFiles(files) {
96
- if (!this.db) throw new Error("VectorStore not initialized");
97
- if (files.length === 0) return;
98
- const BATCH_FILES = 50;
99
- const BATCH_RECORDS = 5e3;
100
- const batches = [];
101
- let currentBatch = [];
102
- let currentRecordCount = 0;
103
- for (const file of files) {
104
- if (currentBatch.length >= BATCH_FILES || currentRecordCount + file.records.length > BATCH_RECORDS) {
105
- if (currentBatch.length > 0) {
106
- batches.push(currentBatch);
107
- }
108
- currentBatch = [];
109
- currentRecordCount = 0;
110
- }
111
- currentBatch.push(file);
112
- currentRecordCount += file.records.length;
113
- }
114
- if (currentBatch.length > 0) {
115
- batches.push(currentBatch);
116
- }
117
- for (const batch of batches) {
118
- const batchRecords = [];
119
- for (const file of batch) {
120
- batchRecords.push(...file.records);
121
- }
122
- if (batchRecords.length === 0) {
123
- const pathsToDelete = batch.map((f) => f.path);
124
- await this.deleteFiles(pathsToDelete);
125
- continue;
126
- }
127
- if (!this.table) {
128
- await this.ensureTable(batchRecords);
129
- } else {
130
- await this.table.add(batchRecords);
131
- }
132
- if (this.table && batch.length > 0) {
133
- const deleteConditions = batch.map(
134
- (f) => `(file_path = '${this.escapeString(f.path)}' AND file_hash != '${this.escapeString(f.hash)}')`
135
- ).join(" OR ");
136
- await this.table.delete(deleteConditions);
137
- }
138
- }
139
- }
140
- /**
141
- * 删除文件的所有 chunks
142
- */
143
- async deleteFile(filePath) {
144
- if (!this.table) return;
145
- await this.table.delete(`file_path = '${this.escapeString(filePath)}'`);
146
- }
147
- /**
148
- * 批量删除文件(性能优化:单次 DELETE 替代 N 次循环)
149
- * 当文件数超过 500 时分批处理,防止 LanceDB filter 字符串过长
150
- */
151
- async deleteFiles(filePaths) {
152
- if (!this.table || filePaths.length === 0) return;
153
- const BATCH_SIZE = 500;
154
- if (filePaths.length <= BATCH_SIZE) {
155
- const conditions = filePaths.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
156
- await this.table.delete(conditions);
157
- } else {
158
- for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
159
- const batch = filePaths.slice(i, i + BATCH_SIZE);
160
- const conditions = batch.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
161
- await this.table.delete(conditions);
60
+ return result;
61
+ }
62
+ async function migrateRemoveDisplayCode(db, vectorStore, options = {}) {
63
+ const earlyState = getLanceDbMigrationState(db);
64
+ if (earlyState === "done") {
65
+ return { migrated: false, totalRows: 0, reason: "already_migrated_persisted" };
66
+ }
67
+ if (earlyState === "aborted") {
68
+ return { migrated: false, totalRows: 0, reason: "aborted_awaiting_manual" };
69
+ }
70
+ if (!tryAcquireLanceDbMigrationLock(db)) {
71
+ return { migrated: false, totalRows: 0, reason: "lock_held_by_other_process" };
72
+ }
73
+ try {
74
+ const persistedState = getLanceDbMigrationState(db);
75
+ if (persistedState === "done") {
76
+ return { migrated: false, totalRows: 0, reason: "already_migrated_persisted" };
77
+ }
78
+ if (persistedState === "aborted") {
79
+ return { migrated: false, totalRows: 0, reason: "aborted_awaiting_manual" };
80
+ }
81
+ const hasCol = await vectorStore.hasDisplayCodeColumn();
82
+ if (persistedState === "pending") {
83
+ if (hasCol === null) {
84
+ setLanceDbMigrationState(db, "done");
85
+ return { migrated: true, totalRows: 0, reason: "recovered_pending_no_table" };
162
86
  }
163
87
  }
164
- }
165
- /**
166
- * 向量搜索
167
- */
168
- async search(queryVector, limit = 10, filter) {
169
- if (!this.table) return [];
170
- let query = this.table.vectorSearch(queryVector).limit(limit);
171
- if (filter) {
172
- query = query.where(filter);
173
- }
174
- const results = await query.toArray();
175
- return results;
176
- }
177
- /**
178
- * 获取文件的所有 chunks(按 chunk_index 排序)
179
- */
180
- async getFileChunks(filePath) {
181
- if (!this.table) return [];
182
- const results = await this.table.query().where(`file_path = '${this.escapeString(filePath)}'`).toArray();
183
- const chunks = results;
184
- return chunks.sort((a, b) => a.chunk_index - b.chunk_index);
185
- }
186
- /**
187
- * 批量获取多个文件的 chunks(性能优化:单次查询替代 N 次循环)
188
- * 当文件数超过 500 时分批处理,防止 LanceDB filter 字符串过长
189
- *
190
- * 适用于 GraphExpander 扩展、词法召回等需要批量获取的场景
191
- * @returns Map<filePath, ChunkRecord[]>,每个文件的 chunks 已按 chunk_index 排序
192
- */
193
- async getFilesChunks(filePaths) {
194
- const result = /* @__PURE__ */ new Map();
195
- if (!this.table || filePaths.length === 0) return result;
196
- const BATCH_SIZE = 500;
197
- for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
198
- const batch = filePaths.slice(i, i + BATCH_SIZE);
199
- const conditions = batch.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
200
- const rows = await this.table.query().where(conditions).toArray();
201
- for (const row of rows) {
202
- let arr = result.get(row.file_path);
203
- if (!arr) {
204
- arr = [];
205
- result.set(row.file_path, arr);
206
- }
207
- arr.push(row);
88
+ if (hasCol === null) {
89
+ setLanceDbMigrationState(db, "done");
90
+ return { migrated: false, totalRows: 0, reason: "empty" };
91
+ }
92
+ if (!hasCol) {
93
+ setLanceDbMigrationState(db, "done");
94
+ return { migrated: false, totalRows: 0, reason: "already_migrated" };
95
+ }
96
+ const sampleSize = options.sampleSize ?? 100;
97
+ const maxMismatchRatio = options.sampleMaxMismatchRatio ?? 0.01;
98
+ const oldRows = await vectorStore.readAllRowsRaw();
99
+ const totalRows = oldRows.length;
100
+ if (totalRows > 0) {
101
+ const stmt = db.prepare("SELECT content FROM files WHERE path = ?");
102
+ const getContent = (path) => {
103
+ const row = stmt.get(path);
104
+ return row?.content ?? null;
105
+ };
106
+ const check = sampleCheckDisplayCode(oldRows, getContent, {
107
+ sampleSize,
108
+ maxMismatchRatio
109
+ });
110
+ if (check.abort) {
111
+ setLanceDbMigrationState(db, "aborted");
112
+ return {
113
+ migrated: false,
114
+ totalRows,
115
+ mismatched: check.mismatched,
116
+ reason: `mismatch_ratio_${check.ratio.toFixed(3)}_exceeds_${maxMismatchRatio}`
117
+ };
208
118
  }
209
119
  }
210
- for (const arr of result.values()) {
211
- arr.sort((a, b) => a.chunk_index - b.chunk_index);
212
- }
213
- return result;
214
- }
215
- /**
216
- * 获取表的总记录数
217
- */
218
- async count() {
219
- if (!this.table) return 0;
220
- return await this.table.countRows();
221
- }
222
- /**
223
- * 清空所有数据
224
- */
225
- async clear() {
226
- if (!this.db) return;
227
- try {
228
- await this.db.dropTable("chunks");
229
- this.table = null;
230
- } catch {
231
- }
232
- }
233
- /**
234
- * 获取向量维度
235
- */
236
- getVectorDim() {
237
- return this.vectorDim;
238
- }
239
- /**
240
- * 转义字符串(防止 SQL 注入)
241
- */
242
- escapeString(str) {
243
- return str.replace(/'/g, "''");
244
- }
245
- /**
246
- * 关闭连接
247
- */
248
- async close() {
249
- this.db = null;
250
- this.table = null;
251
- }
252
- };
253
- var vectorStores = /* @__PURE__ */ new Map();
254
- async function getVectorStore(projectId, vectorDim = 1024) {
255
- let store = vectorStores.get(projectId);
256
- if (!store) {
257
- store = new VectorStore(projectId, vectorDim);
258
- await store.init();
259
- vectorStores.set(projectId, store);
260
- }
261
- return store;
262
- }
263
- async function closeAllVectorStores() {
264
- for (const store of vectorStores.values()) {
265
- await store.close();
120
+ const newRows = oldRows.map((r) => ({
121
+ chunk_id: r.chunk_id,
122
+ file_path: r.file_path,
123
+ file_hash: r.file_hash,
124
+ chunk_index: r.chunk_index,
125
+ vector: Array.from(r.vector),
126
+ language: r.language,
127
+ breadcrumb: r.breadcrumb,
128
+ start_index: r.start_index,
129
+ end_index: r.end_index,
130
+ raw_start: r.raw_start,
131
+ raw_end: r.raw_end,
132
+ vec_start: r.vec_start,
133
+ vec_end: r.vec_end
134
+ }));
135
+ const cleared = clearAllVectorIndexHash(db);
136
+ setLanceDbMigrationState(db, "pending");
137
+ await vectorStore.dropAndRecreateChunks(newRows);
138
+ setLanceDbMigrationState(db, "done");
139
+ return { migrated: true, totalRows, reason: `cleared_${cleared}_vector_index_hash` };
140
+ } finally {
141
+ releaseLanceDbMigrationLock(db);
266
142
  }
267
- vectorStores.clear();
268
143
  }
269
144
 
270
145
  // src/api/embedding.ts
@@ -634,6 +509,8 @@ var Indexer = class {
634
509
  vectorStore = null;
635
510
  embeddingClient;
636
511
  vectorDim;
512
+ /** bootstrap(pending_marks 重放 + LanceDB 迁移)只在每个 db 上执行一次 */
513
+ bootstrappedDbs = /* @__PURE__ */ new WeakSet();
637
514
  constructor(projectId, vectorDim = 1024) {
638
515
  this.projectId = projectId;
639
516
  this.vectorDim = vectorDim;
@@ -656,6 +533,31 @@ var Indexer = class {
656
533
  if (!this.vectorStore) {
657
534
  await this.init();
658
535
  }
536
+ if (!this.bootstrappedDbs.has(db)) {
537
+ this.bootstrappedDbs.add(db);
538
+ try {
539
+ await bootstrap(db, this.vectorStore);
540
+ } catch (err) {
541
+ const error = err;
542
+ logger.warn({ error: error.message }, "bootstrap \u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
543
+ }
544
+ }
545
+ const migrationState = getLanceDbMigrationState(db);
546
+ if (migrationState === "aborted") {
547
+ const errorCount = results.filter(
548
+ (r) => r.status === "added" || r.status === "modified"
549
+ ).length;
550
+ logger.error(
551
+ { migrationState, blockedFiles: errorCount },
552
+ "LanceDB \u5904\u4E8E aborted \u72B6\u6001\uFF0C\u62D2\u7EDD\u5199\u5165\u4EE5\u9632\u6B62 schema \u6C61\u67D3\u3002\u8FD0\u884C `contextweaver migrate --reset` \u6E05\u7A7A LanceDB \u5E76\u91CD\u65B0\u7D22\u5F15\u3002"
553
+ );
554
+ return {
555
+ indexed: 0,
556
+ deleted: 0,
557
+ errors: errorCount,
558
+ skipped: results.length - errorCount
559
+ };
560
+ }
659
561
  const stats = {
660
562
  indexed: 0,
661
563
  deleted: 0,
@@ -699,8 +601,17 @@ var Indexer = class {
699
601
  }
700
602
  }
701
603
  if (toDelete.length > 0) {
702
- await this.deleteFiles(db, toDelete);
703
- stats.deleted = toDelete.length;
604
+ try {
605
+ await this.deleteFiles(db, toDelete);
606
+ stats.deleted = toDelete.length;
607
+ } catch (err) {
608
+ const error = err;
609
+ logger.error(
610
+ { error: error.message, count: toDelete.length },
611
+ "\u5220\u9664\u9636\u6BB5\u5931\u8D25\uFF0C\u5DF2\u6807\u8BB0\u91CD\u8BD5"
612
+ );
613
+ stats.errors += toDelete.length;
614
+ }
704
615
  }
705
616
  if (noChunkSettled.length > 0) {
706
617
  batchUpdateVectorIndexHash(db, noChunkSettled);
@@ -812,8 +723,6 @@ var Indexer = class {
812
723
  file_hash: file.hash,
813
724
  chunk_index: chunkIdx,
814
725
  vector: embeddings[embIdx],
815
- display_code: chunk.displayCode,
816
- vector_text: chunk.vectorText,
817
726
  language: chunk.metadata.language,
818
727
  breadcrumb: chunk.metadata.contextPath.join(" > "),
819
728
  start_index: chunk.metadata.startIndex,
@@ -830,7 +739,7 @@ var Indexer = class {
830
739
  chunkIndex: record.chunk_index,
831
740
  breadcrumb: record.breadcrumb,
832
741
  content: `${record.breadcrumb}
833
- ${record.display_code}`
742
+ ${chunk.displayCode}`
834
743
  });
835
744
  }
836
745
  filesToUpsert.push({ path: file.path, hash: file.hash, records });
@@ -858,19 +767,64 @@ ${record.display_code}`
858
767
  completedChunks += batchTexts.length;
859
768
  continue;
860
769
  }
861
- }
862
- if (isChunksFtsInitialized(db) && ftsChunks.length > 0) {
863
- try {
864
- const pathsToDelete = filesToUpsert.map((f) => f.path);
865
- batchDeleteFileChunksFts(db, pathsToDelete);
866
- batchUpsertChunkFts(db, ftsChunks);
867
- } catch (err) {
868
- const error = err;
869
- logger.warn({ error: error.message }, "FTS \u6279\u91CF\u66F4\u65B0\u5931\u8D25\uFF08\u5411\u91CF\u7D22\u5F15\u5DF2\u6210\u529F\uFF09");
770
+ if (isChunksFtsInitialized(db) && ftsChunks.length > 0) {
771
+ try {
772
+ const ftsAndOutboxTx = db.transaction(() => {
773
+ batchUpsertChunkFts(db, ftsChunks);
774
+ insertPendingMarks(db, successFiles);
775
+ });
776
+ ftsAndOutboxTx();
777
+ } catch (err) {
778
+ const error = err;
779
+ logger.error(
780
+ { error: error.message, stack: error.stack, batch: `${batchNum}/${totalBatches}` },
781
+ "FTS/outbox \u5199\u5165\u5931\u8D25\uFF0C\u56DE\u6EDA LanceDB \u65B0\u7248\u672C"
782
+ );
783
+ try {
784
+ await this.vectorStore?.deleteFilesByHash(
785
+ filesToUpsert.map((f) => ({ path: f.path, hash: f.hash }))
786
+ );
787
+ } catch (rollbackErr) {
788
+ const rbError = rollbackErr;
789
+ logger.error(
790
+ { error: rbError.message },
791
+ "LanceDB \u56DE\u6EDA\u5931\u8D25\uFF0C\u5B64\u513F\u6570\u636E\u5C06\u7531\u4E0B\u6B21 GC \u6E05\u7406"
792
+ );
793
+ }
794
+ clearVectorIndexHash(
795
+ db,
796
+ batchFiles.map((f) => f.path)
797
+ );
798
+ totalErrors += batchFiles.length;
799
+ completedChunks += batchTexts.length;
800
+ continue;
801
+ }
802
+ } else if (successFiles.length > 0) {
803
+ try {
804
+ insertPendingMarks(db, successFiles);
805
+ } catch (err) {
806
+ const error = err;
807
+ logger.warn({ error: error.message }, "outbox \u5199\u5165\u5931\u8D25\uFF08\u65E0 FTS \u8DEF\u5F84\uFF09\uFF0C\u7EE7\u7EED stage6");
808
+ }
809
+ }
810
+ if (successFiles.length > 0) {
811
+ try {
812
+ const markTx = db.transaction(() => {
813
+ batchUpdateVectorIndexHash(db, successFiles);
814
+ deletePendingMarks(
815
+ db,
816
+ successFiles.map((f) => f.path)
817
+ );
818
+ });
819
+ markTx();
820
+ } catch (err) {
821
+ const error = err;
822
+ logger.warn(
823
+ { error: error.message, batch: `${batchNum}/${totalBatches}` },
824
+ "stage6 mark \u5931\u8D25\uFF0Coutbox \u5DF2\u4FDD\u7559\uFF0C\u4E0B\u6B21\u542F\u52A8\u5C06\u91CD\u653E"
825
+ );
826
+ }
870
827
  }
871
- }
872
- if (successFiles.length > 0) {
873
- batchUpdateVectorIndexHash(db, successFiles);
874
828
  }
875
829
  totalSuccess += successFiles.length;
876
830
  totalErrors += errorFiles.length;
@@ -889,12 +843,29 @@ ${record.display_code}`
889
843
  }
890
844
  /**
891
845
  * 删除文件的向量和 FTS 索引
846
+ *
847
+ * 顺序:先删 FTS(SQLite 事务,可靠)→ 再删 LanceDB(可能失败)
848
+ * 任一阶段失败均通过 clearVectorIndexHash 触发下次扫描自愈
892
849
  */
893
850
  async deleteFiles(db, paths) {
894
- if (!this.vectorStore) return;
895
- await this.vectorStore.deleteFiles(paths);
851
+ if (!this.vectorStore || paths.length === 0) return;
896
852
  if (isChunksFtsInitialized(db)) {
897
- batchDeleteFileChunksFts(db, paths);
853
+ try {
854
+ batchDeleteFileChunksFts(db, paths);
855
+ } catch (err) {
856
+ const error = err;
857
+ logger.error({ error: error.message, paths }, "FTS \u5220\u9664\u5931\u8D25");
858
+ clearVectorIndexHash(db, paths);
859
+ throw err;
860
+ }
861
+ }
862
+ try {
863
+ await this.vectorStore.deleteFiles(paths);
864
+ } catch (err) {
865
+ const error = err;
866
+ logger.error({ error: error.message, paths }, "LanceDB \u5220\u9664\u5931\u8D25\uFF0C\u5B64\u513F\u6570\u636E\u5C06\u7531 GC \u6E05\u7406");
867
+ clearVectorIndexHash(db, paths);
868
+ throw err;
898
869
  }
899
870
  logger.debug({ count: paths.length }, "\u5220\u9664\u6587\u4EF6\u7D22\u5F15");
900
871
  }
@@ -923,6 +894,67 @@ ${record.display_code}`
923
894
  }
924
895
  await this.vectorStore?.clear();
925
896
  }
897
+ /**
898
+ * 垃圾回收:清理 LanceDB 中的孤儿 chunks
899
+ *
900
+ * 孤儿来源:
901
+ * - 事务补偿失败遗留(FTS 回滚成功但 LanceDB 删除失败)
902
+ * - 跨进程崩溃导致的 hash 不匹配残留
903
+ * - 删除流程失败遗留
904
+ *
905
+ * 算法:以 SQLite files 表 (path, hash) 为权威源,删除 LanceDB 中不存在的组合。
906
+ * 同步清理 chunks_fts:仅当 path 在 SQLite 完全不存在时才删(hash 变化的 FTS 由 upsert 覆盖)。
907
+ *
908
+ * 性能护栏:time budget 默认 5s,超时则跳过避免阻塞扫描主流程。
909
+ */
910
+ async gc(db, options = {}) {
911
+ if (!this.vectorStore) {
912
+ await this.init();
913
+ }
914
+ const startTime = Date.now();
915
+ const timeBudget = options.maxScanMs ?? 5e3;
916
+ let vectorPairs;
917
+ try {
918
+ vectorPairs = await this.vectorStore?.listFileHashes() ?? [];
919
+ } catch (err) {
920
+ const error = err;
921
+ logger.warn({ error: error.message }, "GC: listFileHashes \u5931\u8D25\uFF0C\u8DF3\u8FC7");
922
+ return { orphans: 0 };
923
+ }
924
+ if (vectorPairs.length === 0) return { orphans: 0 };
925
+ if (Date.now() - startTime > timeBudget) {
926
+ logger.warn(
927
+ { elapsed: Date.now() - startTime, budget: timeBudget },
928
+ "GC \u8D85\u65F6\uFF08\u62C9\u53D6\u9636\u6BB5\uFF09\uFF0C\u672C\u6B21\u8DF3\u8FC7"
929
+ );
930
+ return { orphans: 0, truncated: true };
931
+ }
932
+ const sqliteRows = db.prepare("SELECT path, hash FROM files").all();
933
+ const validPairs = new Set(sqliteRows.map((r) => `${r.path} ${r.hash}`));
934
+ const sqlitePaths = new Set(sqliteRows.map((r) => r.path));
935
+ const orphans = vectorPairs.filter((p) => !validPairs.has(`${p.path} ${p.hash}`));
936
+ if (orphans.length === 0) return { orphans: 0 };
937
+ logger.info({ count: orphans.length }, "GC: \u53D1\u73B0\u5B64\u513F chunks");
938
+ try {
939
+ await this.vectorStore?.deleteFilesByHash(orphans);
940
+ } catch (err) {
941
+ const error = err;
942
+ logger.warn({ error: error.message }, "GC: LanceDB \u5220\u9664\u5931\u8D25\uFF0C\u4E0B\u6B21\u91CD\u8BD5");
943
+ return { orphans: 0 };
944
+ }
945
+ const pathsToFtsClean = Array.from(new Set(orphans.map((o) => o.path))).filter(
946
+ (p) => !sqlitePaths.has(p)
947
+ );
948
+ if (pathsToFtsClean.length > 0 && isChunksFtsInitialized(db)) {
949
+ try {
950
+ batchDeleteFileChunksFts(db, pathsToFtsClean);
951
+ } catch (err) {
952
+ const error = err;
953
+ logger.warn({ error: error.message }, "GC: chunks_fts \u6E05\u7406\u5931\u8D25");
954
+ }
955
+ }
956
+ return { orphans: orphans.length };
957
+ }
926
958
  /**
927
959
  * 获取索引统计
928
960
  */
@@ -959,8 +991,8 @@ function getTokenBoundaryRegex(token) {
959
991
  }
960
992
  return regex;
961
993
  }
962
- function scoreChunkTokenOverlap(chunk, queryTokens) {
963
- const text = `${chunk.breadcrumb} ${chunk.display_code}`.toLowerCase();
994
+ function scoreChunkTokenOverlap(chunk, code, queryTokens) {
995
+ const text = `${chunk.breadcrumb} ${code}`.toLowerCase();
964
996
  let score = 0;
965
997
  for (const token of queryTokens) {
966
998
  if (text.includes(token)) {
@@ -975,6 +1007,60 @@ function scoreChunkTokenOverlap(chunk, queryTokens) {
975
1007
  return score;
976
1008
  }
977
1009
 
1010
+ // src/search/ChunkContentLoader.ts
1011
+ var ChunkContentLoader = class _ChunkContentLoader {
1012
+ constructor(db) {
1013
+ this.db = db;
1014
+ }
1015
+ /**
1016
+ * 生成 cache key
1017
+ */
1018
+ static key(slice) {
1019
+ return `${slice.filePath}#${slice.start_index}#${slice.end_index}`;
1020
+ }
1021
+ /**
1022
+ * 批量加载 chunk 正文
1023
+ *
1024
+ * @returns Map<key, code>,key 由 ChunkContentLoader.key 生成
1025
+ */
1026
+ loadMany(slices) {
1027
+ const result = /* @__PURE__ */ new Map();
1028
+ if (slices.length === 0) return result;
1029
+ const byPath = /* @__PURE__ */ new Map();
1030
+ for (const s of slices) {
1031
+ let arr = byPath.get(s.filePath);
1032
+ if (!arr) {
1033
+ arr = [];
1034
+ byPath.set(s.filePath, arr);
1035
+ }
1036
+ arr.push(s);
1037
+ }
1038
+ const stmt = this.db.prepare("SELECT content FROM files WHERE path = ?");
1039
+ for (const [path, spans] of byPath) {
1040
+ const row = stmt.get(path);
1041
+ const content = row?.content ?? null;
1042
+ for (const s of spans) {
1043
+ const k = _ChunkContentLoader.key(s);
1044
+ if (content === null) {
1045
+ result.set(k, "");
1046
+ continue;
1047
+ }
1048
+ const safeStart = Math.max(0, Math.min(s.start_index, content.length));
1049
+ const safeEnd = Math.max(safeStart, Math.min(s.end_index, content.length));
1050
+ result.set(k, content.slice(safeStart, safeEnd));
1051
+ }
1052
+ }
1053
+ return result;
1054
+ }
1055
+ /**
1056
+ * 加载单个 chunk 正文(便捷方法,不推荐在批量场景使用)
1057
+ */
1058
+ loadOne(slice) {
1059
+ const map = this.loadMany([slice]);
1060
+ return map.get(_ChunkContentLoader.key(slice)) ?? "";
1061
+ }
1062
+ };
1063
+
978
1064
  // src/search/resolvers/types.ts
979
1065
  function commonPrefixLength(path1, path2) {
980
1066
  const parts1 = path1.split("/");
@@ -1643,6 +1729,20 @@ var GraphExpander = class {
1643
1729
  if (allTargetPaths.size === 0) return result;
1644
1730
  const importChunksMap = await this.vectorStore?.getFilesChunks(Array.from(allTargetPaths));
1645
1731
  if (!importChunksMap) return result;
1732
+ const sharedLoader = new ChunkContentLoader(this.db);
1733
+ const allSlices = [];
1734
+ if (queryTokens && queryTokens.size > 0) {
1735
+ for (const chunks of importChunksMap.values()) {
1736
+ for (const c of chunks) {
1737
+ allSlices.push({
1738
+ filePath: c.file_path,
1739
+ start_index: c.start_index,
1740
+ end_index: c.end_index
1741
+ });
1742
+ }
1743
+ }
1744
+ }
1745
+ const sharedCodeMap = sharedLoader.loadMany(allSlices);
1646
1746
  const bestByKey = /* @__PURE__ */ new Map();
1647
1747
  for (const { targetPath, depth, seedScore } of resolvedImports) {
1648
1748
  const importChunks = importChunksMap.get(targetPath);
@@ -1650,7 +1750,8 @@ var GraphExpander = class {
1650
1750
  const selectedChunks = this.selectImportChunks(
1651
1751
  importChunks,
1652
1752
  chunksPerImportFile,
1653
- queryTokens
1753
+ queryTokens,
1754
+ sharedCodeMap
1654
1755
  );
1655
1756
  const depthDecay = depth === 0 ? 1 : decayDepth;
1656
1757
  for (const chunk of selectedChunks) {
@@ -1709,16 +1810,38 @@ var GraphExpander = class {
1709
1810
  /**
1710
1811
  * 选择导入文件的 chunks(优先 query overlap)
1711
1812
  */
1712
- selectImportChunks(chunks, limit, queryTokens) {
1813
+ selectImportChunks(chunks, limit, queryTokens, sharedCodeMap) {
1713
1814
  if (limit <= 0) return [];
1714
1815
  const sortedByIndex = chunks.slice().sort((a, b) => a.chunk_index - b.chunk_index);
1715
1816
  if (!queryTokens || queryTokens.size === 0) {
1716
1817
  return sortedByIndex.slice(0, limit);
1717
1818
  }
1718
- const scored = sortedByIndex.map((chunk) => ({
1719
- chunk,
1720
- score: scoreChunkTokenOverlap(chunk, queryTokens)
1721
- }));
1819
+ let codeMap;
1820
+ if (sharedCodeMap) {
1821
+ codeMap = sharedCodeMap;
1822
+ } else {
1823
+ const loader = new ChunkContentLoader(this.db);
1824
+ codeMap = loader.loadMany(
1825
+ sortedByIndex.map((c) => ({
1826
+ filePath: c.file_path,
1827
+ start_index: c.start_index,
1828
+ end_index: c.end_index
1829
+ }))
1830
+ );
1831
+ }
1832
+ const scored = sortedByIndex.map((chunk) => {
1833
+ const code = codeMap.get(
1834
+ ChunkContentLoader.key({
1835
+ filePath: chunk.file_path,
1836
+ start_index: chunk.start_index,
1837
+ end_index: chunk.end_index
1838
+ })
1839
+ ) ?? "";
1840
+ return {
1841
+ chunk,
1842
+ score: scoreChunkTokenOverlap(chunk, code, queryTokens)
1843
+ };
1844
+ });
1722
1845
  const overlapped = scored.filter((s) => s.score > 0).sort((a, b) => b.score - a.score).slice(0, limit).map((s) => s.chunk);
1723
1846
  return overlapped.length > 0 ? overlapped : sortedByIndex.slice(0, limit);
1724
1847
  }
@@ -1749,12 +1872,11 @@ async function getGraphExpander(projectId, config) {
1749
1872
  }
1750
1873
 
1751
1874
  export {
1752
- getVectorStore,
1753
- closeAllVectorStores,
1875
+ bootstrap,
1754
1876
  getIndexer,
1755
1877
  closeAllIndexers,
1756
1878
  scoreChunkTokenOverlap,
1879
+ ChunkContentLoader,
1757
1880
  invalidateAllExpanderCaches,
1758
1881
  getGraphExpander
1759
1882
  };
1760
- //# sourceMappingURL=chunk-6QMYML5V.js.map