@chiway/contextweaver 1.1.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{SearchService-MYPOCM3B.js → SearchService-OS7CYHNJ.js} +58 -12
- package/dist/{chunk-6QMYML5V.js → chunk-AB24E3Z7.js} +399 -277
- package/dist/{chunk-7G5V7YT5.js → chunk-EMSMLPMK.js} +6 -7
- package/dist/{chunk-AMQQK4P7.js → chunk-JVKVSTQ3.js} +1 -2
- package/dist/{chunk-6Z4JEEVJ.js → chunk-RGJSXUFS.js} +243 -58
- package/dist/{chunk-RJURH22T.js → chunk-SKBAE26T.js} +0 -1
- package/dist/{chunk-NQR4CGQ6.js → chunk-X7PAYQMT.js} +40 -10
- package/dist/chunk-ZOMGPIU6.js +377 -0
- package/dist/codebaseRetrieval-3Z4CRA7X.js +11 -0
- package/dist/{config-BWZ6CU3W.js → config-LCOJHTCF.js} +1 -2
- package/dist/db-PMVM7557.js +54 -0
- package/dist/index.js +37 -9
- package/dist/{lock-DVY3KJSK.js → lock-FL54LIQL.js} +2 -3
- package/dist/scanner-2XGJWYHR.js +11 -0
- package/dist/{server-27HI7WZO.js → server-XK6EINRV.js} +4 -5
- package/dist/vectorStore-HPQZOVWF.js +12 -0
- package/package.json +1 -1
- package/dist/codebaseRetrieval-NLAMGOA2.js +0 -12
- package/dist/scanner-RFG4YWYI.js +0 -11
|
@@ -1,270 +1,145 @@
|
|
|
1
|
+
import {
|
|
2
|
+
getVectorStore,
|
|
3
|
+
sampleCheckDisplayCode
|
|
4
|
+
} from "./chunk-ZOMGPIU6.js";
|
|
1
5
|
import {
|
|
2
6
|
batchDeleteFileChunksFts,
|
|
3
7
|
batchUpdateVectorIndexHash,
|
|
4
8
|
batchUpsertChunkFts,
|
|
9
|
+
clearAllVectorIndexHash,
|
|
5
10
|
clearVectorIndexHash,
|
|
11
|
+
deletePendingMarks,
|
|
12
|
+
getLanceDbMigrationState,
|
|
6
13
|
initDb,
|
|
7
|
-
|
|
8
|
-
|
|
14
|
+
insertPendingMarks,
|
|
15
|
+
isChunksFtsInitialized,
|
|
16
|
+
releaseLanceDbMigrationLock,
|
|
17
|
+
replayPendingMarks,
|
|
18
|
+
setLanceDbMigrationState,
|
|
19
|
+
tryAcquireLanceDbMigrationLock
|
|
20
|
+
} from "./chunk-RGJSXUFS.js";
|
|
9
21
|
import {
|
|
10
22
|
logger
|
|
11
|
-
} from "./chunk-
|
|
23
|
+
} from "./chunk-JVKVSTQ3.js";
|
|
12
24
|
import {
|
|
13
25
|
getEmbeddingConfig
|
|
14
|
-
} from "./chunk-
|
|
26
|
+
} from "./chunk-SKBAE26T.js";
|
|
15
27
|
|
|
16
|
-
// src/
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
dbPath;
|
|
27
|
-
vectorDim;
|
|
28
|
-
constructor(projectId, vectorDim = 1024) {
|
|
29
|
-
this.projectId = projectId;
|
|
30
|
-
this.dbPath = path.join(BASE_DIR, projectId, "vectors.lance");
|
|
31
|
-
this.vectorDim = vectorDim;
|
|
32
|
-
}
|
|
33
|
-
/**
|
|
34
|
-
* 初始化连接
|
|
35
|
-
*/
|
|
36
|
-
async init() {
|
|
37
|
-
if (this.db) return;
|
|
38
|
-
const projectDir = path.join(BASE_DIR, this.projectId);
|
|
39
|
-
if (!fs.existsSync(projectDir)) {
|
|
40
|
-
fs.mkdirSync(projectDir, { recursive: true });
|
|
41
|
-
}
|
|
42
|
-
this.db = await lancedb.connect(this.dbPath);
|
|
43
|
-
const tableNames = await this.db.tableNames();
|
|
44
|
-
if (tableNames.includes("chunks")) {
|
|
45
|
-
this.table = await this.db.openTable("chunks");
|
|
28
|
+
// src/db/bootstrap.ts
|
|
29
|
+
async function bootstrap(db, vectorStore, options = {}) {
|
|
30
|
+
const result = {
|
|
31
|
+
replay: { applied: 0, discarded: 0 },
|
|
32
|
+
migration: { migrated: false, totalRows: 0 }
|
|
33
|
+
};
|
|
34
|
+
try {
|
|
35
|
+
result.replay = replayPendingMarks(db);
|
|
36
|
+
if (result.replay.applied > 0 || result.replay.discarded > 0) {
|
|
37
|
+
logger.info(result.replay, "pending_marks \u542F\u52A8\u91CD\u653E\uFF1A\u6807\u8BB0\u4E0A\u6B21\u672A\u6536\u655B\u7684\u7D22\u5F15\u72B6\u6001");
|
|
46
38
|
}
|
|
39
|
+
} catch (err) {
|
|
40
|
+
const error = err;
|
|
41
|
+
logger.warn({ error: error.message }, "pending_marks \u91CD\u653E\u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
|
|
47
42
|
}
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
}
|
|
60
|
-
/**
|
|
61
|
-
* 单调版本更新:先插入新版本,再删除旧版本
|
|
62
|
-
*
|
|
63
|
-
* 这保证了:
|
|
64
|
-
* - 最坏情况(崩溃)是新旧版本共存(不缺失)
|
|
65
|
-
* - 正常情况下旧版本被清理
|
|
66
|
-
*/
|
|
67
|
-
async upsertFile(filePath, newHash, records) {
|
|
68
|
-
if (!this.db) throw new Error("VectorStore not initialized");
|
|
69
|
-
if (records.length === 0) {
|
|
70
|
-
await this.deleteFile(filePath);
|
|
71
|
-
return;
|
|
72
|
-
}
|
|
73
|
-
if (!this.table) {
|
|
74
|
-
await this.ensureTable(records);
|
|
75
|
-
} else {
|
|
76
|
-
await this.table.add(records);
|
|
77
|
-
}
|
|
78
|
-
if (this.table) {
|
|
79
|
-
await this.table.delete(
|
|
80
|
-
`file_path = '${this.escapeString(filePath)}' AND file_hash != '${this.escapeString(newHash)}'`
|
|
43
|
+
try {
|
|
44
|
+
result.migration = await migrateRemoveDisplayCode(db, vectorStore, options);
|
|
45
|
+
if (result.migration.migrated) {
|
|
46
|
+
logger.info(
|
|
47
|
+
{ totalRows: result.migration.totalRows, reason: result.migration.reason },
|
|
48
|
+
"LanceDB schema \u8FC1\u79FB\u5B8C\u6210\uFF1Achunks \u8868\u5DF2\u79FB\u9664 display_code/vector_text"
|
|
49
|
+
);
|
|
50
|
+
} else if (result.migration.reason?.startsWith("mismatch_ratio_")) {
|
|
51
|
+
logger.error(
|
|
52
|
+
{ reason: result.migration.reason, mismatched: result.migration.mismatched },
|
|
53
|
+
"LanceDB schema \u8FC1\u79FB\u4E2D\u6B62\uFF1Adisplay_code \u4E0E files.content \u62BD\u6837\u5DEE\u5F02\u8FC7\u5927\uFF0C\u8BF7\u68C0\u67E5\u7D22\u5F15\u4E00\u81F4\u6027\u6216\u8FD0\u884C `contextweaver migrate --reset`"
|
|
81
54
|
);
|
|
82
55
|
}
|
|
56
|
+
} catch (err) {
|
|
57
|
+
const error = err;
|
|
58
|
+
logger.warn({ error: error.message }, "LanceDB schema \u8FC1\u79FB\u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
|
|
83
59
|
}
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
const
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
}
|
|
111
|
-
currentBatch.push(file);
|
|
112
|
-
currentRecordCount += file.records.length;
|
|
113
|
-
}
|
|
114
|
-
if (currentBatch.length > 0) {
|
|
115
|
-
batches.push(currentBatch);
|
|
116
|
-
}
|
|
117
|
-
for (const batch of batches) {
|
|
118
|
-
const batchRecords = [];
|
|
119
|
-
for (const file of batch) {
|
|
120
|
-
batchRecords.push(...file.records);
|
|
121
|
-
}
|
|
122
|
-
if (batchRecords.length === 0) {
|
|
123
|
-
const pathsToDelete = batch.map((f) => f.path);
|
|
124
|
-
await this.deleteFiles(pathsToDelete);
|
|
125
|
-
continue;
|
|
126
|
-
}
|
|
127
|
-
if (!this.table) {
|
|
128
|
-
await this.ensureTable(batchRecords);
|
|
129
|
-
} else {
|
|
130
|
-
await this.table.add(batchRecords);
|
|
131
|
-
}
|
|
132
|
-
if (this.table && batch.length > 0) {
|
|
133
|
-
const deleteConditions = batch.map(
|
|
134
|
-
(f) => `(file_path = '${this.escapeString(f.path)}' AND file_hash != '${this.escapeString(f.hash)}')`
|
|
135
|
-
).join(" OR ");
|
|
136
|
-
await this.table.delete(deleteConditions);
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
}
|
|
140
|
-
/**
|
|
141
|
-
* 删除文件的所有 chunks
|
|
142
|
-
*/
|
|
143
|
-
async deleteFile(filePath) {
|
|
144
|
-
if (!this.table) return;
|
|
145
|
-
await this.table.delete(`file_path = '${this.escapeString(filePath)}'`);
|
|
146
|
-
}
|
|
147
|
-
/**
|
|
148
|
-
* 批量删除文件(性能优化:单次 DELETE 替代 N 次循环)
|
|
149
|
-
* 当文件数超过 500 时分批处理,防止 LanceDB filter 字符串过长
|
|
150
|
-
*/
|
|
151
|
-
async deleteFiles(filePaths) {
|
|
152
|
-
if (!this.table || filePaths.length === 0) return;
|
|
153
|
-
const BATCH_SIZE = 500;
|
|
154
|
-
if (filePaths.length <= BATCH_SIZE) {
|
|
155
|
-
const conditions = filePaths.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
|
|
156
|
-
await this.table.delete(conditions);
|
|
157
|
-
} else {
|
|
158
|
-
for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
|
|
159
|
-
const batch = filePaths.slice(i, i + BATCH_SIZE);
|
|
160
|
-
const conditions = batch.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
|
|
161
|
-
await this.table.delete(conditions);
|
|
60
|
+
return result;
|
|
61
|
+
}
|
|
62
|
+
async function migrateRemoveDisplayCode(db, vectorStore, options = {}) {
|
|
63
|
+
const earlyState = getLanceDbMigrationState(db);
|
|
64
|
+
if (earlyState === "done") {
|
|
65
|
+
return { migrated: false, totalRows: 0, reason: "already_migrated_persisted" };
|
|
66
|
+
}
|
|
67
|
+
if (earlyState === "aborted") {
|
|
68
|
+
return { migrated: false, totalRows: 0, reason: "aborted_awaiting_manual" };
|
|
69
|
+
}
|
|
70
|
+
if (!tryAcquireLanceDbMigrationLock(db)) {
|
|
71
|
+
return { migrated: false, totalRows: 0, reason: "lock_held_by_other_process" };
|
|
72
|
+
}
|
|
73
|
+
try {
|
|
74
|
+
const persistedState = getLanceDbMigrationState(db);
|
|
75
|
+
if (persistedState === "done") {
|
|
76
|
+
return { migrated: false, totalRows: 0, reason: "already_migrated_persisted" };
|
|
77
|
+
}
|
|
78
|
+
if (persistedState === "aborted") {
|
|
79
|
+
return { migrated: false, totalRows: 0, reason: "aborted_awaiting_manual" };
|
|
80
|
+
}
|
|
81
|
+
const hasCol = await vectorStore.hasDisplayCodeColumn();
|
|
82
|
+
if (persistedState === "pending") {
|
|
83
|
+
if (hasCol === null) {
|
|
84
|
+
setLanceDbMigrationState(db, "done");
|
|
85
|
+
return { migrated: true, totalRows: 0, reason: "recovered_pending_no_table" };
|
|
162
86
|
}
|
|
163
87
|
}
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
const
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
const result = /* @__PURE__ */ new Map();
|
|
195
|
-
if (!this.table || filePaths.length === 0) return result;
|
|
196
|
-
const BATCH_SIZE = 500;
|
|
197
|
-
for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
|
|
198
|
-
const batch = filePaths.slice(i, i + BATCH_SIZE);
|
|
199
|
-
const conditions = batch.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
|
|
200
|
-
const rows = await this.table.query().where(conditions).toArray();
|
|
201
|
-
for (const row of rows) {
|
|
202
|
-
let arr = result.get(row.file_path);
|
|
203
|
-
if (!arr) {
|
|
204
|
-
arr = [];
|
|
205
|
-
result.set(row.file_path, arr);
|
|
206
|
-
}
|
|
207
|
-
arr.push(row);
|
|
88
|
+
if (hasCol === null) {
|
|
89
|
+
setLanceDbMigrationState(db, "done");
|
|
90
|
+
return { migrated: false, totalRows: 0, reason: "empty" };
|
|
91
|
+
}
|
|
92
|
+
if (!hasCol) {
|
|
93
|
+
setLanceDbMigrationState(db, "done");
|
|
94
|
+
return { migrated: false, totalRows: 0, reason: "already_migrated" };
|
|
95
|
+
}
|
|
96
|
+
const sampleSize = options.sampleSize ?? 100;
|
|
97
|
+
const maxMismatchRatio = options.sampleMaxMismatchRatio ?? 0.01;
|
|
98
|
+
const oldRows = await vectorStore.readAllRowsRaw();
|
|
99
|
+
const totalRows = oldRows.length;
|
|
100
|
+
if (totalRows > 0) {
|
|
101
|
+
const stmt = db.prepare("SELECT content FROM files WHERE path = ?");
|
|
102
|
+
const getContent = (path) => {
|
|
103
|
+
const row = stmt.get(path);
|
|
104
|
+
return row?.content ?? null;
|
|
105
|
+
};
|
|
106
|
+
const check = sampleCheckDisplayCode(oldRows, getContent, {
|
|
107
|
+
sampleSize,
|
|
108
|
+
maxMismatchRatio
|
|
109
|
+
});
|
|
110
|
+
if (check.abort) {
|
|
111
|
+
setLanceDbMigrationState(db, "aborted");
|
|
112
|
+
return {
|
|
113
|
+
migrated: false,
|
|
114
|
+
totalRows,
|
|
115
|
+
mismatched: check.mismatched,
|
|
116
|
+
reason: `mismatch_ratio_${check.ratio.toFixed(3)}_exceeds_${maxMismatchRatio}`
|
|
117
|
+
};
|
|
208
118
|
}
|
|
209
119
|
}
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
}
|
|
233
|
-
/**
|
|
234
|
-
* 获取向量维度
|
|
235
|
-
*/
|
|
236
|
-
getVectorDim() {
|
|
237
|
-
return this.vectorDim;
|
|
238
|
-
}
|
|
239
|
-
/**
|
|
240
|
-
* 转义字符串(防止 SQL 注入)
|
|
241
|
-
*/
|
|
242
|
-
escapeString(str) {
|
|
243
|
-
return str.replace(/'/g, "''");
|
|
244
|
-
}
|
|
245
|
-
/**
|
|
246
|
-
* 关闭连接
|
|
247
|
-
*/
|
|
248
|
-
async close() {
|
|
249
|
-
this.db = null;
|
|
250
|
-
this.table = null;
|
|
251
|
-
}
|
|
252
|
-
};
|
|
253
|
-
var vectorStores = /* @__PURE__ */ new Map();
|
|
254
|
-
async function getVectorStore(projectId, vectorDim = 1024) {
|
|
255
|
-
let store = vectorStores.get(projectId);
|
|
256
|
-
if (!store) {
|
|
257
|
-
store = new VectorStore(projectId, vectorDim);
|
|
258
|
-
await store.init();
|
|
259
|
-
vectorStores.set(projectId, store);
|
|
260
|
-
}
|
|
261
|
-
return store;
|
|
262
|
-
}
|
|
263
|
-
async function closeAllVectorStores() {
|
|
264
|
-
for (const store of vectorStores.values()) {
|
|
265
|
-
await store.close();
|
|
120
|
+
const newRows = oldRows.map((r) => ({
|
|
121
|
+
chunk_id: r.chunk_id,
|
|
122
|
+
file_path: r.file_path,
|
|
123
|
+
file_hash: r.file_hash,
|
|
124
|
+
chunk_index: r.chunk_index,
|
|
125
|
+
vector: Array.from(r.vector),
|
|
126
|
+
language: r.language,
|
|
127
|
+
breadcrumb: r.breadcrumb,
|
|
128
|
+
start_index: r.start_index,
|
|
129
|
+
end_index: r.end_index,
|
|
130
|
+
raw_start: r.raw_start,
|
|
131
|
+
raw_end: r.raw_end,
|
|
132
|
+
vec_start: r.vec_start,
|
|
133
|
+
vec_end: r.vec_end
|
|
134
|
+
}));
|
|
135
|
+
const cleared = clearAllVectorIndexHash(db);
|
|
136
|
+
setLanceDbMigrationState(db, "pending");
|
|
137
|
+
await vectorStore.dropAndRecreateChunks(newRows);
|
|
138
|
+
setLanceDbMigrationState(db, "done");
|
|
139
|
+
return { migrated: true, totalRows, reason: `cleared_${cleared}_vector_index_hash` };
|
|
140
|
+
} finally {
|
|
141
|
+
releaseLanceDbMigrationLock(db);
|
|
266
142
|
}
|
|
267
|
-
vectorStores.clear();
|
|
268
143
|
}
|
|
269
144
|
|
|
270
145
|
// src/api/embedding.ts
|
|
@@ -634,6 +509,8 @@ var Indexer = class {
|
|
|
634
509
|
vectorStore = null;
|
|
635
510
|
embeddingClient;
|
|
636
511
|
vectorDim;
|
|
512
|
+
/** bootstrap(pending_marks 重放 + LanceDB 迁移)只在每个 db 上执行一次 */
|
|
513
|
+
bootstrappedDbs = /* @__PURE__ */ new WeakSet();
|
|
637
514
|
constructor(projectId, vectorDim = 1024) {
|
|
638
515
|
this.projectId = projectId;
|
|
639
516
|
this.vectorDim = vectorDim;
|
|
@@ -656,6 +533,31 @@ var Indexer = class {
|
|
|
656
533
|
if (!this.vectorStore) {
|
|
657
534
|
await this.init();
|
|
658
535
|
}
|
|
536
|
+
if (!this.bootstrappedDbs.has(db)) {
|
|
537
|
+
this.bootstrappedDbs.add(db);
|
|
538
|
+
try {
|
|
539
|
+
await bootstrap(db, this.vectorStore);
|
|
540
|
+
} catch (err) {
|
|
541
|
+
const error = err;
|
|
542
|
+
logger.warn({ error: error.message }, "bootstrap \u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
const migrationState = getLanceDbMigrationState(db);
|
|
546
|
+
if (migrationState === "aborted") {
|
|
547
|
+
const errorCount = results.filter(
|
|
548
|
+
(r) => r.status === "added" || r.status === "modified"
|
|
549
|
+
).length;
|
|
550
|
+
logger.error(
|
|
551
|
+
{ migrationState, blockedFiles: errorCount },
|
|
552
|
+
"LanceDB \u5904\u4E8E aborted \u72B6\u6001\uFF0C\u62D2\u7EDD\u5199\u5165\u4EE5\u9632\u6B62 schema \u6C61\u67D3\u3002\u8FD0\u884C `contextweaver migrate --reset` \u6E05\u7A7A LanceDB \u5E76\u91CD\u65B0\u7D22\u5F15\u3002"
|
|
553
|
+
);
|
|
554
|
+
return {
|
|
555
|
+
indexed: 0,
|
|
556
|
+
deleted: 0,
|
|
557
|
+
errors: errorCount,
|
|
558
|
+
skipped: results.length - errorCount
|
|
559
|
+
};
|
|
560
|
+
}
|
|
659
561
|
const stats = {
|
|
660
562
|
indexed: 0,
|
|
661
563
|
deleted: 0,
|
|
@@ -699,8 +601,17 @@ var Indexer = class {
|
|
|
699
601
|
}
|
|
700
602
|
}
|
|
701
603
|
if (toDelete.length > 0) {
|
|
702
|
-
|
|
703
|
-
|
|
604
|
+
try {
|
|
605
|
+
await this.deleteFiles(db, toDelete);
|
|
606
|
+
stats.deleted = toDelete.length;
|
|
607
|
+
} catch (err) {
|
|
608
|
+
const error = err;
|
|
609
|
+
logger.error(
|
|
610
|
+
{ error: error.message, count: toDelete.length },
|
|
611
|
+
"\u5220\u9664\u9636\u6BB5\u5931\u8D25\uFF0C\u5DF2\u6807\u8BB0\u91CD\u8BD5"
|
|
612
|
+
);
|
|
613
|
+
stats.errors += toDelete.length;
|
|
614
|
+
}
|
|
704
615
|
}
|
|
705
616
|
if (noChunkSettled.length > 0) {
|
|
706
617
|
batchUpdateVectorIndexHash(db, noChunkSettled);
|
|
@@ -812,8 +723,6 @@ var Indexer = class {
|
|
|
812
723
|
file_hash: file.hash,
|
|
813
724
|
chunk_index: chunkIdx,
|
|
814
725
|
vector: embeddings[embIdx],
|
|
815
|
-
display_code: chunk.displayCode,
|
|
816
|
-
vector_text: chunk.vectorText,
|
|
817
726
|
language: chunk.metadata.language,
|
|
818
727
|
breadcrumb: chunk.metadata.contextPath.join(" > "),
|
|
819
728
|
start_index: chunk.metadata.startIndex,
|
|
@@ -830,7 +739,7 @@ var Indexer = class {
|
|
|
830
739
|
chunkIndex: record.chunk_index,
|
|
831
740
|
breadcrumb: record.breadcrumb,
|
|
832
741
|
content: `${record.breadcrumb}
|
|
833
|
-
${
|
|
742
|
+
${chunk.displayCode}`
|
|
834
743
|
});
|
|
835
744
|
}
|
|
836
745
|
filesToUpsert.push({ path: file.path, hash: file.hash, records });
|
|
@@ -858,19 +767,64 @@ ${record.display_code}`
|
|
|
858
767
|
completedChunks += batchTexts.length;
|
|
859
768
|
continue;
|
|
860
769
|
}
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
770
|
+
if (isChunksFtsInitialized(db) && ftsChunks.length > 0) {
|
|
771
|
+
try {
|
|
772
|
+
const ftsAndOutboxTx = db.transaction(() => {
|
|
773
|
+
batchUpsertChunkFts(db, ftsChunks);
|
|
774
|
+
insertPendingMarks(db, successFiles);
|
|
775
|
+
});
|
|
776
|
+
ftsAndOutboxTx();
|
|
777
|
+
} catch (err) {
|
|
778
|
+
const error = err;
|
|
779
|
+
logger.error(
|
|
780
|
+
{ error: error.message, stack: error.stack, batch: `${batchNum}/${totalBatches}` },
|
|
781
|
+
"FTS/outbox \u5199\u5165\u5931\u8D25\uFF0C\u56DE\u6EDA LanceDB \u65B0\u7248\u672C"
|
|
782
|
+
);
|
|
783
|
+
try {
|
|
784
|
+
await this.vectorStore?.deleteFilesByHash(
|
|
785
|
+
filesToUpsert.map((f) => ({ path: f.path, hash: f.hash }))
|
|
786
|
+
);
|
|
787
|
+
} catch (rollbackErr) {
|
|
788
|
+
const rbError = rollbackErr;
|
|
789
|
+
logger.error(
|
|
790
|
+
{ error: rbError.message },
|
|
791
|
+
"LanceDB \u56DE\u6EDA\u5931\u8D25\uFF0C\u5B64\u513F\u6570\u636E\u5C06\u7531\u4E0B\u6B21 GC \u6E05\u7406"
|
|
792
|
+
);
|
|
793
|
+
}
|
|
794
|
+
clearVectorIndexHash(
|
|
795
|
+
db,
|
|
796
|
+
batchFiles.map((f) => f.path)
|
|
797
|
+
);
|
|
798
|
+
totalErrors += batchFiles.length;
|
|
799
|
+
completedChunks += batchTexts.length;
|
|
800
|
+
continue;
|
|
801
|
+
}
|
|
802
|
+
} else if (successFiles.length > 0) {
|
|
803
|
+
try {
|
|
804
|
+
insertPendingMarks(db, successFiles);
|
|
805
|
+
} catch (err) {
|
|
806
|
+
const error = err;
|
|
807
|
+
logger.warn({ error: error.message }, "outbox \u5199\u5165\u5931\u8D25\uFF08\u65E0 FTS \u8DEF\u5F84\uFF09\uFF0C\u7EE7\u7EED stage6");
|
|
808
|
+
}
|
|
809
|
+
}
|
|
810
|
+
if (successFiles.length > 0) {
|
|
811
|
+
try {
|
|
812
|
+
const markTx = db.transaction(() => {
|
|
813
|
+
batchUpdateVectorIndexHash(db, successFiles);
|
|
814
|
+
deletePendingMarks(
|
|
815
|
+
db,
|
|
816
|
+
successFiles.map((f) => f.path)
|
|
817
|
+
);
|
|
818
|
+
});
|
|
819
|
+
markTx();
|
|
820
|
+
} catch (err) {
|
|
821
|
+
const error = err;
|
|
822
|
+
logger.warn(
|
|
823
|
+
{ error: error.message, batch: `${batchNum}/${totalBatches}` },
|
|
824
|
+
"stage6 mark \u5931\u8D25\uFF0Coutbox \u5DF2\u4FDD\u7559\uFF0C\u4E0B\u6B21\u542F\u52A8\u5C06\u91CD\u653E"
|
|
825
|
+
);
|
|
826
|
+
}
|
|
870
827
|
}
|
|
871
|
-
}
|
|
872
|
-
if (successFiles.length > 0) {
|
|
873
|
-
batchUpdateVectorIndexHash(db, successFiles);
|
|
874
828
|
}
|
|
875
829
|
totalSuccess += successFiles.length;
|
|
876
830
|
totalErrors += errorFiles.length;
|
|
@@ -889,12 +843,29 @@ ${record.display_code}`
|
|
|
889
843
|
}
|
|
890
844
|
/**
|
|
891
845
|
* 删除文件的向量和 FTS 索引
|
|
846
|
+
*
|
|
847
|
+
* 顺序:先删 FTS(SQLite 事务,可靠)→ 再删 LanceDB(可能失败)
|
|
848
|
+
* 任一阶段失败均通过 clearVectorIndexHash 触发下次扫描自愈
|
|
892
849
|
*/
|
|
893
850
|
async deleteFiles(db, paths) {
|
|
894
|
-
if (!this.vectorStore) return;
|
|
895
|
-
await this.vectorStore.deleteFiles(paths);
|
|
851
|
+
if (!this.vectorStore || paths.length === 0) return;
|
|
896
852
|
if (isChunksFtsInitialized(db)) {
|
|
897
|
-
|
|
853
|
+
try {
|
|
854
|
+
batchDeleteFileChunksFts(db, paths);
|
|
855
|
+
} catch (err) {
|
|
856
|
+
const error = err;
|
|
857
|
+
logger.error({ error: error.message, paths }, "FTS \u5220\u9664\u5931\u8D25");
|
|
858
|
+
clearVectorIndexHash(db, paths);
|
|
859
|
+
throw err;
|
|
860
|
+
}
|
|
861
|
+
}
|
|
862
|
+
try {
|
|
863
|
+
await this.vectorStore.deleteFiles(paths);
|
|
864
|
+
} catch (err) {
|
|
865
|
+
const error = err;
|
|
866
|
+
logger.error({ error: error.message, paths }, "LanceDB \u5220\u9664\u5931\u8D25\uFF0C\u5B64\u513F\u6570\u636E\u5C06\u7531 GC \u6E05\u7406");
|
|
867
|
+
clearVectorIndexHash(db, paths);
|
|
868
|
+
throw err;
|
|
898
869
|
}
|
|
899
870
|
logger.debug({ count: paths.length }, "\u5220\u9664\u6587\u4EF6\u7D22\u5F15");
|
|
900
871
|
}
|
|
@@ -923,6 +894,67 @@ ${record.display_code}`
|
|
|
923
894
|
}
|
|
924
895
|
await this.vectorStore?.clear();
|
|
925
896
|
}
|
|
897
|
+
/**
|
|
898
|
+
* 垃圾回收:清理 LanceDB 中的孤儿 chunks
|
|
899
|
+
*
|
|
900
|
+
* 孤儿来源:
|
|
901
|
+
* - 事务补偿失败遗留(FTS 回滚成功但 LanceDB 删除失败)
|
|
902
|
+
* - 跨进程崩溃导致的 hash 不匹配残留
|
|
903
|
+
* - 删除流程失败遗留
|
|
904
|
+
*
|
|
905
|
+
* 算法:以 SQLite files 表 (path, hash) 为权威源,删除 LanceDB 中不存在的组合。
|
|
906
|
+
* 同步清理 chunks_fts:仅当 path 在 SQLite 完全不存在时才删(hash 变化的 FTS 由 upsert 覆盖)。
|
|
907
|
+
*
|
|
908
|
+
* 性能护栏:time budget 默认 5s,超时则跳过避免阻塞扫描主流程。
|
|
909
|
+
*/
|
|
910
|
+
async gc(db, options = {}) {
|
|
911
|
+
if (!this.vectorStore) {
|
|
912
|
+
await this.init();
|
|
913
|
+
}
|
|
914
|
+
const startTime = Date.now();
|
|
915
|
+
const timeBudget = options.maxScanMs ?? 5e3;
|
|
916
|
+
let vectorPairs;
|
|
917
|
+
try {
|
|
918
|
+
vectorPairs = await this.vectorStore?.listFileHashes() ?? [];
|
|
919
|
+
} catch (err) {
|
|
920
|
+
const error = err;
|
|
921
|
+
logger.warn({ error: error.message }, "GC: listFileHashes \u5931\u8D25\uFF0C\u8DF3\u8FC7");
|
|
922
|
+
return { orphans: 0 };
|
|
923
|
+
}
|
|
924
|
+
if (vectorPairs.length === 0) return { orphans: 0 };
|
|
925
|
+
if (Date.now() - startTime > timeBudget) {
|
|
926
|
+
logger.warn(
|
|
927
|
+
{ elapsed: Date.now() - startTime, budget: timeBudget },
|
|
928
|
+
"GC \u8D85\u65F6\uFF08\u62C9\u53D6\u9636\u6BB5\uFF09\uFF0C\u672C\u6B21\u8DF3\u8FC7"
|
|
929
|
+
);
|
|
930
|
+
return { orphans: 0, truncated: true };
|
|
931
|
+
}
|
|
932
|
+
const sqliteRows = db.prepare("SELECT path, hash FROM files").all();
|
|
933
|
+
const validPairs = new Set(sqliteRows.map((r) => `${r.path} ${r.hash}`));
|
|
934
|
+
const sqlitePaths = new Set(sqliteRows.map((r) => r.path));
|
|
935
|
+
const orphans = vectorPairs.filter((p) => !validPairs.has(`${p.path} ${p.hash}`));
|
|
936
|
+
if (orphans.length === 0) return { orphans: 0 };
|
|
937
|
+
logger.info({ count: orphans.length }, "GC: \u53D1\u73B0\u5B64\u513F chunks");
|
|
938
|
+
try {
|
|
939
|
+
await this.vectorStore?.deleteFilesByHash(orphans);
|
|
940
|
+
} catch (err) {
|
|
941
|
+
const error = err;
|
|
942
|
+
logger.warn({ error: error.message }, "GC: LanceDB \u5220\u9664\u5931\u8D25\uFF0C\u4E0B\u6B21\u91CD\u8BD5");
|
|
943
|
+
return { orphans: 0 };
|
|
944
|
+
}
|
|
945
|
+
const pathsToFtsClean = Array.from(new Set(orphans.map((o) => o.path))).filter(
|
|
946
|
+
(p) => !sqlitePaths.has(p)
|
|
947
|
+
);
|
|
948
|
+
if (pathsToFtsClean.length > 0 && isChunksFtsInitialized(db)) {
|
|
949
|
+
try {
|
|
950
|
+
batchDeleteFileChunksFts(db, pathsToFtsClean);
|
|
951
|
+
} catch (err) {
|
|
952
|
+
const error = err;
|
|
953
|
+
logger.warn({ error: error.message }, "GC: chunks_fts \u6E05\u7406\u5931\u8D25");
|
|
954
|
+
}
|
|
955
|
+
}
|
|
956
|
+
return { orphans: orphans.length };
|
|
957
|
+
}
|
|
926
958
|
/**
|
|
927
959
|
* 获取索引统计
|
|
928
960
|
*/
|
|
@@ -959,8 +991,8 @@ function getTokenBoundaryRegex(token) {
|
|
|
959
991
|
}
|
|
960
992
|
return regex;
|
|
961
993
|
}
|
|
962
|
-
function scoreChunkTokenOverlap(chunk, queryTokens) {
|
|
963
|
-
const text = `${chunk.breadcrumb} ${
|
|
994
|
+
function scoreChunkTokenOverlap(chunk, code, queryTokens) {
|
|
995
|
+
const text = `${chunk.breadcrumb} ${code}`.toLowerCase();
|
|
964
996
|
let score = 0;
|
|
965
997
|
for (const token of queryTokens) {
|
|
966
998
|
if (text.includes(token)) {
|
|
@@ -975,6 +1007,60 @@ function scoreChunkTokenOverlap(chunk, queryTokens) {
|
|
|
975
1007
|
return score;
|
|
976
1008
|
}
|
|
977
1009
|
|
|
1010
|
+
// src/search/ChunkContentLoader.ts
|
|
1011
|
+
var ChunkContentLoader = class _ChunkContentLoader {
|
|
1012
|
+
constructor(db) {
|
|
1013
|
+
this.db = db;
|
|
1014
|
+
}
|
|
1015
|
+
/**
|
|
1016
|
+
* 生成 cache key
|
|
1017
|
+
*/
|
|
1018
|
+
static key(slice) {
|
|
1019
|
+
return `${slice.filePath}#${slice.start_index}#${slice.end_index}`;
|
|
1020
|
+
}
|
|
1021
|
+
/**
|
|
1022
|
+
* 批量加载 chunk 正文
|
|
1023
|
+
*
|
|
1024
|
+
* @returns Map<key, code>,key 由 ChunkContentLoader.key 生成
|
|
1025
|
+
*/
|
|
1026
|
+
loadMany(slices) {
|
|
1027
|
+
const result = /* @__PURE__ */ new Map();
|
|
1028
|
+
if (slices.length === 0) return result;
|
|
1029
|
+
const byPath = /* @__PURE__ */ new Map();
|
|
1030
|
+
for (const s of slices) {
|
|
1031
|
+
let arr = byPath.get(s.filePath);
|
|
1032
|
+
if (!arr) {
|
|
1033
|
+
arr = [];
|
|
1034
|
+
byPath.set(s.filePath, arr);
|
|
1035
|
+
}
|
|
1036
|
+
arr.push(s);
|
|
1037
|
+
}
|
|
1038
|
+
const stmt = this.db.prepare("SELECT content FROM files WHERE path = ?");
|
|
1039
|
+
for (const [path, spans] of byPath) {
|
|
1040
|
+
const row = stmt.get(path);
|
|
1041
|
+
const content = row?.content ?? null;
|
|
1042
|
+
for (const s of spans) {
|
|
1043
|
+
const k = _ChunkContentLoader.key(s);
|
|
1044
|
+
if (content === null) {
|
|
1045
|
+
result.set(k, "");
|
|
1046
|
+
continue;
|
|
1047
|
+
}
|
|
1048
|
+
const safeStart = Math.max(0, Math.min(s.start_index, content.length));
|
|
1049
|
+
const safeEnd = Math.max(safeStart, Math.min(s.end_index, content.length));
|
|
1050
|
+
result.set(k, content.slice(safeStart, safeEnd));
|
|
1051
|
+
}
|
|
1052
|
+
}
|
|
1053
|
+
return result;
|
|
1054
|
+
}
|
|
1055
|
+
/**
|
|
1056
|
+
* 加载单个 chunk 正文(便捷方法,不推荐在批量场景使用)
|
|
1057
|
+
*/
|
|
1058
|
+
loadOne(slice) {
|
|
1059
|
+
const map = this.loadMany([slice]);
|
|
1060
|
+
return map.get(_ChunkContentLoader.key(slice)) ?? "";
|
|
1061
|
+
}
|
|
1062
|
+
};
|
|
1063
|
+
|
|
978
1064
|
// src/search/resolvers/types.ts
|
|
979
1065
|
function commonPrefixLength(path1, path2) {
|
|
980
1066
|
const parts1 = path1.split("/");
|
|
@@ -1643,6 +1729,20 @@ var GraphExpander = class {
|
|
|
1643
1729
|
if (allTargetPaths.size === 0) return result;
|
|
1644
1730
|
const importChunksMap = await this.vectorStore?.getFilesChunks(Array.from(allTargetPaths));
|
|
1645
1731
|
if (!importChunksMap) return result;
|
|
1732
|
+
const sharedLoader = new ChunkContentLoader(this.db);
|
|
1733
|
+
const allSlices = [];
|
|
1734
|
+
if (queryTokens && queryTokens.size > 0) {
|
|
1735
|
+
for (const chunks of importChunksMap.values()) {
|
|
1736
|
+
for (const c of chunks) {
|
|
1737
|
+
allSlices.push({
|
|
1738
|
+
filePath: c.file_path,
|
|
1739
|
+
start_index: c.start_index,
|
|
1740
|
+
end_index: c.end_index
|
|
1741
|
+
});
|
|
1742
|
+
}
|
|
1743
|
+
}
|
|
1744
|
+
}
|
|
1745
|
+
const sharedCodeMap = sharedLoader.loadMany(allSlices);
|
|
1646
1746
|
const bestByKey = /* @__PURE__ */ new Map();
|
|
1647
1747
|
for (const { targetPath, depth, seedScore } of resolvedImports) {
|
|
1648
1748
|
const importChunks = importChunksMap.get(targetPath);
|
|
@@ -1650,7 +1750,8 @@ var GraphExpander = class {
|
|
|
1650
1750
|
const selectedChunks = this.selectImportChunks(
|
|
1651
1751
|
importChunks,
|
|
1652
1752
|
chunksPerImportFile,
|
|
1653
|
-
queryTokens
|
|
1753
|
+
queryTokens,
|
|
1754
|
+
sharedCodeMap
|
|
1654
1755
|
);
|
|
1655
1756
|
const depthDecay = depth === 0 ? 1 : decayDepth;
|
|
1656
1757
|
for (const chunk of selectedChunks) {
|
|
@@ -1709,16 +1810,38 @@ var GraphExpander = class {
|
|
|
1709
1810
|
/**
|
|
1710
1811
|
* 选择导入文件的 chunks(优先 query overlap)
|
|
1711
1812
|
*/
|
|
1712
|
-
selectImportChunks(chunks, limit, queryTokens) {
|
|
1813
|
+
selectImportChunks(chunks, limit, queryTokens, sharedCodeMap) {
|
|
1713
1814
|
if (limit <= 0) return [];
|
|
1714
1815
|
const sortedByIndex = chunks.slice().sort((a, b) => a.chunk_index - b.chunk_index);
|
|
1715
1816
|
if (!queryTokens || queryTokens.size === 0) {
|
|
1716
1817
|
return sortedByIndex.slice(0, limit);
|
|
1717
1818
|
}
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1721
|
-
}
|
|
1819
|
+
let codeMap;
|
|
1820
|
+
if (sharedCodeMap) {
|
|
1821
|
+
codeMap = sharedCodeMap;
|
|
1822
|
+
} else {
|
|
1823
|
+
const loader = new ChunkContentLoader(this.db);
|
|
1824
|
+
codeMap = loader.loadMany(
|
|
1825
|
+
sortedByIndex.map((c) => ({
|
|
1826
|
+
filePath: c.file_path,
|
|
1827
|
+
start_index: c.start_index,
|
|
1828
|
+
end_index: c.end_index
|
|
1829
|
+
}))
|
|
1830
|
+
);
|
|
1831
|
+
}
|
|
1832
|
+
const scored = sortedByIndex.map((chunk) => {
|
|
1833
|
+
const code = codeMap.get(
|
|
1834
|
+
ChunkContentLoader.key({
|
|
1835
|
+
filePath: chunk.file_path,
|
|
1836
|
+
start_index: chunk.start_index,
|
|
1837
|
+
end_index: chunk.end_index
|
|
1838
|
+
})
|
|
1839
|
+
) ?? "";
|
|
1840
|
+
return {
|
|
1841
|
+
chunk,
|
|
1842
|
+
score: scoreChunkTokenOverlap(chunk, code, queryTokens)
|
|
1843
|
+
};
|
|
1844
|
+
});
|
|
1722
1845
|
const overlapped = scored.filter((s) => s.score > 0).sort((a, b) => b.score - a.score).slice(0, limit).map((s) => s.chunk);
|
|
1723
1846
|
return overlapped.length > 0 ? overlapped : sortedByIndex.slice(0, limit);
|
|
1724
1847
|
}
|
|
@@ -1749,12 +1872,11 @@ async function getGraphExpander(projectId, config) {
|
|
|
1749
1872
|
}
|
|
1750
1873
|
|
|
1751
1874
|
export {
|
|
1752
|
-
|
|
1753
|
-
closeAllVectorStores,
|
|
1875
|
+
bootstrap,
|
|
1754
1876
|
getIndexer,
|
|
1755
1877
|
closeAllIndexers,
|
|
1756
1878
|
scoreChunkTokenOverlap,
|
|
1879
|
+
ChunkContentLoader,
|
|
1757
1880
|
invalidateAllExpanderCaches,
|
|
1758
1881
|
getGraphExpander
|
|
1759
1882
|
};
|
|
1760
|
-
//# sourceMappingURL=chunk-6QMYML5V.js.map
|