@chiway/contextweaver 1.1.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +138 -28
- package/dist/{SearchService-MYPOCM3B.js → SearchService-WVD6THR3.js} +170 -82
- package/dist/chunk-3BNHQV5W.js +373 -0
- package/dist/chunk-BFCIZ52F.js +102 -0
- package/dist/{chunk-NQR4CGQ6.js → chunk-GDVB6PJ4.js} +58 -10
- package/dist/{lock-DVY3KJSK.js → chunk-HHYPQA3X.js} +2 -3
- package/dist/chunk-ISVCQFB4.js +223 -0
- package/dist/chunk-IZ6IUHNN.js +77 -0
- package/dist/{chunk-AMQQK4P7.js → chunk-JVKVSTQ3.js} +1 -2
- package/dist/chunk-LB42CZEB.js +18 -0
- package/dist/{chunk-6Z4JEEVJ.js → chunk-PPLFJGO3.js} +303 -58
- package/dist/chunk-R6CNZXZ7.js +143 -0
- package/dist/{chunk-RJURH22T.js → chunk-SKBAE26T.js} +0 -1
- package/dist/chunk-TPM6YP43.js +38 -0
- package/dist/{chunk-7G5V7YT5.js → chunk-V3K4YVAR.js} +12 -120
- package/dist/chunk-VWBKZ6QL.js +115 -0
- package/dist/chunk-XFIM2T6S.js +57 -0
- package/dist/{chunk-6QMYML5V.js → chunk-XMZZZKG7.js} +361 -295
- package/dist/chunk-XTWNT7KP.js +156 -0
- package/dist/chunk-Y6H7C3NA.js +85 -0
- package/dist/codebaseRetrieval-DIS5RH2C.js +14 -0
- package/dist/{config-BWZ6CU3W.js → config-LCOJHTCF.js} +1 -2
- package/dist/db-GBCLP4GG.js +68 -0
- package/dist/findReferences-N7ML7TUP.js +16 -0
- package/dist/getSymbolDefinition-6KMY4H33.js +17 -0
- package/dist/index.js +271 -40
- package/dist/listFiles-4VT2TPJD.js +14 -0
- package/dist/loadConfig-XTVT2OWW.js +9 -0
- package/dist/lock-HNKQ6X5B.js +8 -0
- package/dist/scanner-QDFZJLP7.js +13 -0
- package/dist/server-UAI3U7AB.js +347 -0
- package/dist/stats-AGKUCJQI.js +12 -0
- package/dist/vectorStore-4ODCERRO.js +12 -0
- package/package.json +9 -23
- package/dist/codebaseRetrieval-NLAMGOA2.js +0 -12
- package/dist/scanner-RFG4YWYI.js +0 -11
- package/dist/server-27HI7WZO.js +0 -147
|
@@ -1,270 +1,151 @@
|
|
|
1
|
+
import {
|
|
2
|
+
commonPrefixLength
|
|
3
|
+
} from "./chunk-LB42CZEB.js";
|
|
4
|
+
import {
|
|
5
|
+
ChunkContentLoader
|
|
6
|
+
} from "./chunk-XFIM2T6S.js";
|
|
7
|
+
import {
|
|
8
|
+
getVectorStore,
|
|
9
|
+
sampleCheckDisplayCode
|
|
10
|
+
} from "./chunk-3BNHQV5W.js";
|
|
1
11
|
import {
|
|
2
12
|
batchDeleteFileChunksFts,
|
|
3
13
|
batchUpdateVectorIndexHash,
|
|
4
14
|
batchUpsertChunkFts,
|
|
15
|
+
clearAllVectorIndexHash,
|
|
5
16
|
clearVectorIndexHash,
|
|
17
|
+
deletePendingMarks,
|
|
18
|
+
getLanceDbMigrationState,
|
|
6
19
|
initDb,
|
|
7
|
-
|
|
8
|
-
|
|
20
|
+
insertPendingMarks,
|
|
21
|
+
isChunksFtsInitialized,
|
|
22
|
+
releaseLanceDbMigrationLock,
|
|
23
|
+
replayPendingMarks,
|
|
24
|
+
setLanceDbMigrationState,
|
|
25
|
+
tryAcquireLanceDbMigrationLock
|
|
26
|
+
} from "./chunk-PPLFJGO3.js";
|
|
9
27
|
import {
|
|
10
28
|
logger
|
|
11
|
-
} from "./chunk-
|
|
29
|
+
} from "./chunk-JVKVSTQ3.js";
|
|
12
30
|
import {
|
|
13
31
|
getEmbeddingConfig
|
|
14
|
-
} from "./chunk-
|
|
32
|
+
} from "./chunk-SKBAE26T.js";
|
|
15
33
|
|
|
16
|
-
// src/
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
dbPath;
|
|
27
|
-
vectorDim;
|
|
28
|
-
constructor(projectId, vectorDim = 1024) {
|
|
29
|
-
this.projectId = projectId;
|
|
30
|
-
this.dbPath = path.join(BASE_DIR, projectId, "vectors.lance");
|
|
31
|
-
this.vectorDim = vectorDim;
|
|
32
|
-
}
|
|
33
|
-
/**
|
|
34
|
-
* 初始化连接
|
|
35
|
-
*/
|
|
36
|
-
async init() {
|
|
37
|
-
if (this.db) return;
|
|
38
|
-
const projectDir = path.join(BASE_DIR, this.projectId);
|
|
39
|
-
if (!fs.existsSync(projectDir)) {
|
|
40
|
-
fs.mkdirSync(projectDir, { recursive: true });
|
|
41
|
-
}
|
|
42
|
-
this.db = await lancedb.connect(this.dbPath);
|
|
43
|
-
const tableNames = await this.db.tableNames();
|
|
44
|
-
if (tableNames.includes("chunks")) {
|
|
45
|
-
this.table = await this.db.openTable("chunks");
|
|
34
|
+
// src/db/bootstrap.ts
|
|
35
|
+
async function bootstrap(db, vectorStore, options = {}) {
|
|
36
|
+
const result = {
|
|
37
|
+
replay: { applied: 0, discarded: 0 },
|
|
38
|
+
migration: { migrated: false, totalRows: 0 }
|
|
39
|
+
};
|
|
40
|
+
try {
|
|
41
|
+
result.replay = replayPendingMarks(db);
|
|
42
|
+
if (result.replay.applied > 0 || result.replay.discarded > 0) {
|
|
43
|
+
logger.info(result.replay, "pending_marks \u542F\u52A8\u91CD\u653E\uFF1A\u6807\u8BB0\u4E0A\u6B21\u672A\u6536\u655B\u7684\u7D22\u5F15\u72B6\u6001");
|
|
46
44
|
}
|
|
45
|
+
} catch (err) {
|
|
46
|
+
const error = err;
|
|
47
|
+
logger.warn({ error: error.message }, "pending_marks \u91CD\u653E\u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
|
|
47
48
|
}
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
}
|
|
60
|
-
/**
|
|
61
|
-
* 单调版本更新:先插入新版本,再删除旧版本
|
|
62
|
-
*
|
|
63
|
-
* 这保证了:
|
|
64
|
-
* - 最坏情况(崩溃)是新旧版本共存(不缺失)
|
|
65
|
-
* - 正常情况下旧版本被清理
|
|
66
|
-
*/
|
|
67
|
-
async upsertFile(filePath, newHash, records) {
|
|
68
|
-
if (!this.db) throw new Error("VectorStore not initialized");
|
|
69
|
-
if (records.length === 0) {
|
|
70
|
-
await this.deleteFile(filePath);
|
|
71
|
-
return;
|
|
72
|
-
}
|
|
73
|
-
if (!this.table) {
|
|
74
|
-
await this.ensureTable(records);
|
|
75
|
-
} else {
|
|
76
|
-
await this.table.add(records);
|
|
77
|
-
}
|
|
78
|
-
if (this.table) {
|
|
79
|
-
await this.table.delete(
|
|
80
|
-
`file_path = '${this.escapeString(filePath)}' AND file_hash != '${this.escapeString(newHash)}'`
|
|
49
|
+
try {
|
|
50
|
+
result.migration = await migrateRemoveDisplayCode(db, vectorStore, options);
|
|
51
|
+
if (result.migration.migrated) {
|
|
52
|
+
logger.info(
|
|
53
|
+
{ totalRows: result.migration.totalRows, reason: result.migration.reason },
|
|
54
|
+
"LanceDB schema \u8FC1\u79FB\u5B8C\u6210\uFF1Achunks \u8868\u5DF2\u79FB\u9664 display_code/vector_text"
|
|
55
|
+
);
|
|
56
|
+
} else if (result.migration.reason?.startsWith("mismatch_ratio_")) {
|
|
57
|
+
logger.error(
|
|
58
|
+
{ reason: result.migration.reason, mismatched: result.migration.mismatched },
|
|
59
|
+
"LanceDB schema \u8FC1\u79FB\u4E2D\u6B62\uFF1Adisplay_code \u4E0E files.content \u62BD\u6837\u5DEE\u5F02\u8FC7\u5927\uFF0C\u8BF7\u68C0\u67E5\u7D22\u5F15\u4E00\u81F4\u6027\u6216\u8FD0\u884C `contextweaver migrate --reset`"
|
|
81
60
|
);
|
|
82
61
|
}
|
|
62
|
+
} catch (err) {
|
|
63
|
+
const error = err;
|
|
64
|
+
logger.warn({ error: error.message }, "LanceDB schema \u8FC1\u79FB\u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
|
|
83
65
|
}
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
const
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
}
|
|
111
|
-
currentBatch.push(file);
|
|
112
|
-
currentRecordCount += file.records.length;
|
|
113
|
-
}
|
|
114
|
-
if (currentBatch.length > 0) {
|
|
115
|
-
batches.push(currentBatch);
|
|
116
|
-
}
|
|
117
|
-
for (const batch of batches) {
|
|
118
|
-
const batchRecords = [];
|
|
119
|
-
for (const file of batch) {
|
|
120
|
-
batchRecords.push(...file.records);
|
|
121
|
-
}
|
|
122
|
-
if (batchRecords.length === 0) {
|
|
123
|
-
const pathsToDelete = batch.map((f) => f.path);
|
|
124
|
-
await this.deleteFiles(pathsToDelete);
|
|
125
|
-
continue;
|
|
126
|
-
}
|
|
127
|
-
if (!this.table) {
|
|
128
|
-
await this.ensureTable(batchRecords);
|
|
129
|
-
} else {
|
|
130
|
-
await this.table.add(batchRecords);
|
|
131
|
-
}
|
|
132
|
-
if (this.table && batch.length > 0) {
|
|
133
|
-
const deleteConditions = batch.map(
|
|
134
|
-
(f) => `(file_path = '${this.escapeString(f.path)}' AND file_hash != '${this.escapeString(f.hash)}')`
|
|
135
|
-
).join(" OR ");
|
|
136
|
-
await this.table.delete(deleteConditions);
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
}
|
|
140
|
-
/**
|
|
141
|
-
* 删除文件的所有 chunks
|
|
142
|
-
*/
|
|
143
|
-
async deleteFile(filePath) {
|
|
144
|
-
if (!this.table) return;
|
|
145
|
-
await this.table.delete(`file_path = '${this.escapeString(filePath)}'`);
|
|
146
|
-
}
|
|
147
|
-
/**
|
|
148
|
-
* 批量删除文件(性能优化:单次 DELETE 替代 N 次循环)
|
|
149
|
-
* 当文件数超过 500 时分批处理,防止 LanceDB filter 字符串过长
|
|
150
|
-
*/
|
|
151
|
-
async deleteFiles(filePaths) {
|
|
152
|
-
if (!this.table || filePaths.length === 0) return;
|
|
153
|
-
const BATCH_SIZE = 500;
|
|
154
|
-
if (filePaths.length <= BATCH_SIZE) {
|
|
155
|
-
const conditions = filePaths.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
|
|
156
|
-
await this.table.delete(conditions);
|
|
157
|
-
} else {
|
|
158
|
-
for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
|
|
159
|
-
const batch = filePaths.slice(i, i + BATCH_SIZE);
|
|
160
|
-
const conditions = batch.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
|
|
161
|
-
await this.table.delete(conditions);
|
|
66
|
+
return result;
|
|
67
|
+
}
|
|
68
|
+
async function migrateRemoveDisplayCode(db, vectorStore, options = {}) {
|
|
69
|
+
const earlyState = getLanceDbMigrationState(db);
|
|
70
|
+
if (earlyState === "done") {
|
|
71
|
+
return { migrated: false, totalRows: 0, reason: "already_migrated_persisted" };
|
|
72
|
+
}
|
|
73
|
+
if (earlyState === "aborted") {
|
|
74
|
+
return { migrated: false, totalRows: 0, reason: "aborted_awaiting_manual" };
|
|
75
|
+
}
|
|
76
|
+
if (!tryAcquireLanceDbMigrationLock(db)) {
|
|
77
|
+
return { migrated: false, totalRows: 0, reason: "lock_held_by_other_process" };
|
|
78
|
+
}
|
|
79
|
+
try {
|
|
80
|
+
const persistedState = getLanceDbMigrationState(db);
|
|
81
|
+
if (persistedState === "done") {
|
|
82
|
+
return { migrated: false, totalRows: 0, reason: "already_migrated_persisted" };
|
|
83
|
+
}
|
|
84
|
+
if (persistedState === "aborted") {
|
|
85
|
+
return { migrated: false, totalRows: 0, reason: "aborted_awaiting_manual" };
|
|
86
|
+
}
|
|
87
|
+
const hasCol = await vectorStore.hasDisplayCodeColumn();
|
|
88
|
+
if (persistedState === "pending") {
|
|
89
|
+
if (hasCol === null) {
|
|
90
|
+
setLanceDbMigrationState(db, "done");
|
|
91
|
+
return { migrated: true, totalRows: 0, reason: "recovered_pending_no_table" };
|
|
162
92
|
}
|
|
163
93
|
}
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
const
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
const result = /* @__PURE__ */ new Map();
|
|
195
|
-
if (!this.table || filePaths.length === 0) return result;
|
|
196
|
-
const BATCH_SIZE = 500;
|
|
197
|
-
for (let i = 0; i < filePaths.length; i += BATCH_SIZE) {
|
|
198
|
-
const batch = filePaths.slice(i, i + BATCH_SIZE);
|
|
199
|
-
const conditions = batch.map((p) => `file_path = '${this.escapeString(p)}'`).join(" OR ");
|
|
200
|
-
const rows = await this.table.query().where(conditions).toArray();
|
|
201
|
-
for (const row of rows) {
|
|
202
|
-
let arr = result.get(row.file_path);
|
|
203
|
-
if (!arr) {
|
|
204
|
-
arr = [];
|
|
205
|
-
result.set(row.file_path, arr);
|
|
206
|
-
}
|
|
207
|
-
arr.push(row);
|
|
94
|
+
if (hasCol === null) {
|
|
95
|
+
setLanceDbMigrationState(db, "done");
|
|
96
|
+
return { migrated: false, totalRows: 0, reason: "empty" };
|
|
97
|
+
}
|
|
98
|
+
if (!hasCol) {
|
|
99
|
+
setLanceDbMigrationState(db, "done");
|
|
100
|
+
return { migrated: false, totalRows: 0, reason: "already_migrated" };
|
|
101
|
+
}
|
|
102
|
+
const sampleSize = options.sampleSize ?? 100;
|
|
103
|
+
const maxMismatchRatio = options.sampleMaxMismatchRatio ?? 0.01;
|
|
104
|
+
const oldRows = await vectorStore.readAllRowsRaw();
|
|
105
|
+
const totalRows = oldRows.length;
|
|
106
|
+
if (totalRows > 0) {
|
|
107
|
+
const stmt = db.prepare("SELECT content FROM files WHERE path = ?");
|
|
108
|
+
const getContent = (path) => {
|
|
109
|
+
const row = stmt.get(path);
|
|
110
|
+
return row?.content ?? null;
|
|
111
|
+
};
|
|
112
|
+
const check = sampleCheckDisplayCode(oldRows, getContent, {
|
|
113
|
+
sampleSize,
|
|
114
|
+
maxMismatchRatio
|
|
115
|
+
});
|
|
116
|
+
if (check.abort) {
|
|
117
|
+
setLanceDbMigrationState(db, "aborted");
|
|
118
|
+
return {
|
|
119
|
+
migrated: false,
|
|
120
|
+
totalRows,
|
|
121
|
+
mismatched: check.mismatched,
|
|
122
|
+
reason: `mismatch_ratio_${check.ratio.toFixed(3)}_exceeds_${maxMismatchRatio}`
|
|
123
|
+
};
|
|
208
124
|
}
|
|
209
125
|
}
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
}
|
|
233
|
-
/**
|
|
234
|
-
* 获取向量维度
|
|
235
|
-
*/
|
|
236
|
-
getVectorDim() {
|
|
237
|
-
return this.vectorDim;
|
|
238
|
-
}
|
|
239
|
-
/**
|
|
240
|
-
* 转义字符串(防止 SQL 注入)
|
|
241
|
-
*/
|
|
242
|
-
escapeString(str) {
|
|
243
|
-
return str.replace(/'/g, "''");
|
|
244
|
-
}
|
|
245
|
-
/**
|
|
246
|
-
* 关闭连接
|
|
247
|
-
*/
|
|
248
|
-
async close() {
|
|
249
|
-
this.db = null;
|
|
250
|
-
this.table = null;
|
|
251
|
-
}
|
|
252
|
-
};
|
|
253
|
-
var vectorStores = /* @__PURE__ */ new Map();
|
|
254
|
-
async function getVectorStore(projectId, vectorDim = 1024) {
|
|
255
|
-
let store = vectorStores.get(projectId);
|
|
256
|
-
if (!store) {
|
|
257
|
-
store = new VectorStore(projectId, vectorDim);
|
|
258
|
-
await store.init();
|
|
259
|
-
vectorStores.set(projectId, store);
|
|
260
|
-
}
|
|
261
|
-
return store;
|
|
262
|
-
}
|
|
263
|
-
async function closeAllVectorStores() {
|
|
264
|
-
for (const store of vectorStores.values()) {
|
|
265
|
-
await store.close();
|
|
126
|
+
const newRows = oldRows.map((r) => ({
|
|
127
|
+
chunk_id: r.chunk_id,
|
|
128
|
+
file_path: r.file_path,
|
|
129
|
+
file_hash: r.file_hash,
|
|
130
|
+
chunk_index: r.chunk_index,
|
|
131
|
+
vector: Array.from(r.vector),
|
|
132
|
+
language: r.language,
|
|
133
|
+
breadcrumb: r.breadcrumb,
|
|
134
|
+
start_index: r.start_index,
|
|
135
|
+
end_index: r.end_index,
|
|
136
|
+
raw_start: r.raw_start,
|
|
137
|
+
raw_end: r.raw_end,
|
|
138
|
+
vec_start: r.vec_start,
|
|
139
|
+
vec_end: r.vec_end
|
|
140
|
+
}));
|
|
141
|
+
const cleared = clearAllVectorIndexHash(db);
|
|
142
|
+
setLanceDbMigrationState(db, "pending");
|
|
143
|
+
await vectorStore.dropAndRecreateChunks(newRows);
|
|
144
|
+
setLanceDbMigrationState(db, "done");
|
|
145
|
+
return { migrated: true, totalRows, reason: `cleared_${cleared}_vector_index_hash` };
|
|
146
|
+
} finally {
|
|
147
|
+
releaseLanceDbMigrationLock(db);
|
|
266
148
|
}
|
|
267
|
-
vectorStores.clear();
|
|
268
149
|
}
|
|
269
150
|
|
|
270
151
|
// src/api/embedding.ts
|
|
@@ -634,6 +515,8 @@ var Indexer = class {
|
|
|
634
515
|
vectorStore = null;
|
|
635
516
|
embeddingClient;
|
|
636
517
|
vectorDim;
|
|
518
|
+
/** bootstrap(pending_marks 重放 + LanceDB 迁移)只在每个 db 上执行一次 */
|
|
519
|
+
bootstrappedDbs = /* @__PURE__ */ new WeakSet();
|
|
637
520
|
constructor(projectId, vectorDim = 1024) {
|
|
638
521
|
this.projectId = projectId;
|
|
639
522
|
this.vectorDim = vectorDim;
|
|
@@ -656,6 +539,31 @@ var Indexer = class {
|
|
|
656
539
|
if (!this.vectorStore) {
|
|
657
540
|
await this.init();
|
|
658
541
|
}
|
|
542
|
+
if (!this.bootstrappedDbs.has(db)) {
|
|
543
|
+
this.bootstrappedDbs.add(db);
|
|
544
|
+
try {
|
|
545
|
+
await bootstrap(db, this.vectorStore);
|
|
546
|
+
} catch (err) {
|
|
547
|
+
const error = err;
|
|
548
|
+
logger.warn({ error: error.message }, "bootstrap \u5931\u8D25\uFF0C\u672C\u6B21\u8DF3\u8FC7");
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
const migrationState = getLanceDbMigrationState(db);
|
|
552
|
+
if (migrationState === "aborted") {
|
|
553
|
+
const errorCount = results.filter(
|
|
554
|
+
(r) => r.status === "added" || r.status === "modified"
|
|
555
|
+
).length;
|
|
556
|
+
logger.error(
|
|
557
|
+
{ migrationState, blockedFiles: errorCount },
|
|
558
|
+
"LanceDB \u5904\u4E8E aborted \u72B6\u6001\uFF0C\u62D2\u7EDD\u5199\u5165\u4EE5\u9632\u6B62 schema \u6C61\u67D3\u3002\u8FD0\u884C `contextweaver migrate --reset` \u6E05\u7A7A LanceDB \u5E76\u91CD\u65B0\u7D22\u5F15\u3002"
|
|
559
|
+
);
|
|
560
|
+
return {
|
|
561
|
+
indexed: 0,
|
|
562
|
+
deleted: 0,
|
|
563
|
+
errors: errorCount,
|
|
564
|
+
skipped: results.length - errorCount
|
|
565
|
+
};
|
|
566
|
+
}
|
|
659
567
|
const stats = {
|
|
660
568
|
indexed: 0,
|
|
661
569
|
deleted: 0,
|
|
@@ -699,8 +607,14 @@ var Indexer = class {
|
|
|
699
607
|
}
|
|
700
608
|
}
|
|
701
609
|
if (toDelete.length > 0) {
|
|
702
|
-
|
|
703
|
-
|
|
610
|
+
try {
|
|
611
|
+
await this.deleteFiles(db, toDelete);
|
|
612
|
+
stats.deleted = toDelete.length;
|
|
613
|
+
} catch (err) {
|
|
614
|
+
const error = err;
|
|
615
|
+
logger.error({ error: error.message, count: toDelete.length }, "\u5220\u9664\u9636\u6BB5\u5931\u8D25\uFF0C\u5DF2\u6807\u8BB0\u91CD\u8BD5");
|
|
616
|
+
stats.errors += toDelete.length;
|
|
617
|
+
}
|
|
704
618
|
}
|
|
705
619
|
if (noChunkSettled.length > 0) {
|
|
706
620
|
batchUpdateVectorIndexHash(db, noChunkSettled);
|
|
@@ -767,16 +681,27 @@ var Indexer = class {
|
|
|
767
681
|
continue;
|
|
768
682
|
}
|
|
769
683
|
logger.info(
|
|
770
|
-
{
|
|
684
|
+
{
|
|
685
|
+
batch: `${batchNum}/${totalBatches}`,
|
|
686
|
+
texts: batchTexts.length,
|
|
687
|
+
files: batchFiles.length
|
|
688
|
+
},
|
|
771
689
|
"\u6279\u6B21 Embedding \u5F00\u59CB"
|
|
772
690
|
);
|
|
773
691
|
let embeddings;
|
|
774
692
|
const EMBED_BATCH_SIZE = 10;
|
|
775
693
|
try {
|
|
776
694
|
const batchOnProgress = onProgress ? (_completed, _total) => {
|
|
777
|
-
onProgress(
|
|
695
|
+
onProgress(
|
|
696
|
+
completedChunks + Math.min(_completed * EMBED_BATCH_SIZE, batchTexts.length),
|
|
697
|
+
totalChunks
|
|
698
|
+
);
|
|
778
699
|
} : void 0;
|
|
779
|
-
const results = await this.embeddingClient.embedBatch(
|
|
700
|
+
const results = await this.embeddingClient.embedBatch(
|
|
701
|
+
batchTexts,
|
|
702
|
+
EMBED_BATCH_SIZE,
|
|
703
|
+
batchOnProgress
|
|
704
|
+
);
|
|
780
705
|
embeddings = results.map((r) => r.embedding);
|
|
781
706
|
} catch (err) {
|
|
782
707
|
const error = err;
|
|
@@ -812,8 +737,6 @@ var Indexer = class {
|
|
|
812
737
|
file_hash: file.hash,
|
|
813
738
|
chunk_index: chunkIdx,
|
|
814
739
|
vector: embeddings[embIdx],
|
|
815
|
-
display_code: chunk.displayCode,
|
|
816
|
-
vector_text: chunk.vectorText,
|
|
817
740
|
language: chunk.metadata.language,
|
|
818
741
|
breadcrumb: chunk.metadata.contextPath.join(" > "),
|
|
819
742
|
start_index: chunk.metadata.startIndex,
|
|
@@ -830,7 +753,7 @@ var Indexer = class {
|
|
|
830
753
|
chunkIndex: record.chunk_index,
|
|
831
754
|
breadcrumb: record.breadcrumb,
|
|
832
755
|
content: `${record.breadcrumb}
|
|
833
|
-
${
|
|
756
|
+
${chunk.displayCode}`
|
|
834
757
|
});
|
|
835
758
|
}
|
|
836
759
|
filesToUpsert.push({ path: file.path, hash: file.hash, records });
|
|
@@ -858,19 +781,64 @@ ${record.display_code}`
|
|
|
858
781
|
completedChunks += batchTexts.length;
|
|
859
782
|
continue;
|
|
860
783
|
}
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
784
|
+
if (isChunksFtsInitialized(db) && ftsChunks.length > 0) {
|
|
785
|
+
try {
|
|
786
|
+
const ftsAndOutboxTx = db.transaction(() => {
|
|
787
|
+
batchUpsertChunkFts(db, ftsChunks);
|
|
788
|
+
insertPendingMarks(db, successFiles);
|
|
789
|
+
});
|
|
790
|
+
ftsAndOutboxTx();
|
|
791
|
+
} catch (err) {
|
|
792
|
+
const error = err;
|
|
793
|
+
logger.error(
|
|
794
|
+
{ error: error.message, stack: error.stack, batch: `${batchNum}/${totalBatches}` },
|
|
795
|
+
"FTS/outbox \u5199\u5165\u5931\u8D25\uFF0C\u56DE\u6EDA LanceDB \u65B0\u7248\u672C"
|
|
796
|
+
);
|
|
797
|
+
try {
|
|
798
|
+
await this.vectorStore?.deleteFilesByHash(
|
|
799
|
+
filesToUpsert.map((f) => ({ path: f.path, hash: f.hash }))
|
|
800
|
+
);
|
|
801
|
+
} catch (rollbackErr) {
|
|
802
|
+
const rbError = rollbackErr;
|
|
803
|
+
logger.error(
|
|
804
|
+
{ error: rbError.message },
|
|
805
|
+
"LanceDB \u56DE\u6EDA\u5931\u8D25\uFF0C\u5B64\u513F\u6570\u636E\u5C06\u7531\u4E0B\u6B21 GC \u6E05\u7406"
|
|
806
|
+
);
|
|
807
|
+
}
|
|
808
|
+
clearVectorIndexHash(
|
|
809
|
+
db,
|
|
810
|
+
batchFiles.map((f) => f.path)
|
|
811
|
+
);
|
|
812
|
+
totalErrors += batchFiles.length;
|
|
813
|
+
completedChunks += batchTexts.length;
|
|
814
|
+
continue;
|
|
815
|
+
}
|
|
816
|
+
} else if (successFiles.length > 0) {
|
|
817
|
+
try {
|
|
818
|
+
insertPendingMarks(db, successFiles);
|
|
819
|
+
} catch (err) {
|
|
820
|
+
const error = err;
|
|
821
|
+
logger.warn({ error: error.message }, "outbox \u5199\u5165\u5931\u8D25\uFF08\u65E0 FTS \u8DEF\u5F84\uFF09\uFF0C\u7EE7\u7EED stage6");
|
|
822
|
+
}
|
|
823
|
+
}
|
|
824
|
+
if (successFiles.length > 0) {
|
|
825
|
+
try {
|
|
826
|
+
const markTx = db.transaction(() => {
|
|
827
|
+
batchUpdateVectorIndexHash(db, successFiles);
|
|
828
|
+
deletePendingMarks(
|
|
829
|
+
db,
|
|
830
|
+
successFiles.map((f) => f.path)
|
|
831
|
+
);
|
|
832
|
+
});
|
|
833
|
+
markTx();
|
|
834
|
+
} catch (err) {
|
|
835
|
+
const error = err;
|
|
836
|
+
logger.warn(
|
|
837
|
+
{ error: error.message, batch: `${batchNum}/${totalBatches}` },
|
|
838
|
+
"stage6 mark \u5931\u8D25\uFF0Coutbox \u5DF2\u4FDD\u7559\uFF0C\u4E0B\u6B21\u542F\u52A8\u5C06\u91CD\u653E"
|
|
839
|
+
);
|
|
840
|
+
}
|
|
870
841
|
}
|
|
871
|
-
}
|
|
872
|
-
if (successFiles.length > 0) {
|
|
873
|
-
batchUpdateVectorIndexHash(db, successFiles);
|
|
874
842
|
}
|
|
875
843
|
totalSuccess += successFiles.length;
|
|
876
844
|
totalErrors += errorFiles.length;
|
|
@@ -889,12 +857,29 @@ ${record.display_code}`
|
|
|
889
857
|
}
|
|
890
858
|
/**
|
|
891
859
|
* 删除文件的向量和 FTS 索引
|
|
860
|
+
*
|
|
861
|
+
* 顺序:先删 FTS(SQLite 事务,可靠)→ 再删 LanceDB(可能失败)
|
|
862
|
+
* 任一阶段失败均通过 clearVectorIndexHash 触发下次扫描自愈
|
|
892
863
|
*/
|
|
893
864
|
async deleteFiles(db, paths) {
|
|
894
|
-
if (!this.vectorStore) return;
|
|
895
|
-
await this.vectorStore.deleteFiles(paths);
|
|
865
|
+
if (!this.vectorStore || paths.length === 0) return;
|
|
896
866
|
if (isChunksFtsInitialized(db)) {
|
|
897
|
-
|
|
867
|
+
try {
|
|
868
|
+
batchDeleteFileChunksFts(db, paths);
|
|
869
|
+
} catch (err) {
|
|
870
|
+
const error = err;
|
|
871
|
+
logger.error({ error: error.message, paths }, "FTS \u5220\u9664\u5931\u8D25");
|
|
872
|
+
clearVectorIndexHash(db, paths);
|
|
873
|
+
throw err;
|
|
874
|
+
}
|
|
875
|
+
}
|
|
876
|
+
try {
|
|
877
|
+
await this.vectorStore.deleteFiles(paths);
|
|
878
|
+
} catch (err) {
|
|
879
|
+
const error = err;
|
|
880
|
+
logger.error({ error: error.message, paths }, "LanceDB \u5220\u9664\u5931\u8D25\uFF0C\u5B64\u513F\u6570\u636E\u5C06\u7531 GC \u6E05\u7406");
|
|
881
|
+
clearVectorIndexHash(db, paths);
|
|
882
|
+
throw err;
|
|
898
883
|
}
|
|
899
884
|
logger.debug({ count: paths.length }, "\u5220\u9664\u6587\u4EF6\u7D22\u5F15");
|
|
900
885
|
}
|
|
@@ -923,6 +908,67 @@ ${record.display_code}`
|
|
|
923
908
|
}
|
|
924
909
|
await this.vectorStore?.clear();
|
|
925
910
|
}
|
|
911
|
+
/**
|
|
912
|
+
* 垃圾回收:清理 LanceDB 中的孤儿 chunks
|
|
913
|
+
*
|
|
914
|
+
* 孤儿来源:
|
|
915
|
+
* - 事务补偿失败遗留(FTS 回滚成功但 LanceDB 删除失败)
|
|
916
|
+
* - 跨进程崩溃导致的 hash 不匹配残留
|
|
917
|
+
* - 删除流程失败遗留
|
|
918
|
+
*
|
|
919
|
+
* 算法:以 SQLite files 表 (path, hash) 为权威源,删除 LanceDB 中不存在的组合。
|
|
920
|
+
* 同步清理 chunks_fts:仅当 path 在 SQLite 完全不存在时才删(hash 变化的 FTS 由 upsert 覆盖)。
|
|
921
|
+
*
|
|
922
|
+
* 性能护栏:time budget 默认 5s,超时则跳过避免阻塞扫描主流程。
|
|
923
|
+
*/
|
|
924
|
+
async gc(db, options = {}) {
|
|
925
|
+
if (!this.vectorStore) {
|
|
926
|
+
await this.init();
|
|
927
|
+
}
|
|
928
|
+
const startTime = Date.now();
|
|
929
|
+
const timeBudget = options.maxScanMs ?? 5e3;
|
|
930
|
+
let vectorPairs;
|
|
931
|
+
try {
|
|
932
|
+
vectorPairs = await this.vectorStore?.listFileHashes() ?? [];
|
|
933
|
+
} catch (err) {
|
|
934
|
+
const error = err;
|
|
935
|
+
logger.warn({ error: error.message }, "GC: listFileHashes \u5931\u8D25\uFF0C\u8DF3\u8FC7");
|
|
936
|
+
return { orphans: 0 };
|
|
937
|
+
}
|
|
938
|
+
if (vectorPairs.length === 0) return { orphans: 0 };
|
|
939
|
+
if (Date.now() - startTime > timeBudget) {
|
|
940
|
+
logger.warn(
|
|
941
|
+
{ elapsed: Date.now() - startTime, budget: timeBudget },
|
|
942
|
+
"GC \u8D85\u65F6\uFF08\u62C9\u53D6\u9636\u6BB5\uFF09\uFF0C\u672C\u6B21\u8DF3\u8FC7"
|
|
943
|
+
);
|
|
944
|
+
return { orphans: 0, truncated: true };
|
|
945
|
+
}
|
|
946
|
+
const sqliteRows = db.prepare("SELECT path, hash FROM files").all();
|
|
947
|
+
const validPairs = new Set(sqliteRows.map((r) => `${r.path} ${r.hash}`));
|
|
948
|
+
const sqlitePaths = new Set(sqliteRows.map((r) => r.path));
|
|
949
|
+
const orphans = vectorPairs.filter((p) => !validPairs.has(`${p.path} ${p.hash}`));
|
|
950
|
+
if (orphans.length === 0) return { orphans: 0 };
|
|
951
|
+
logger.info({ count: orphans.length }, "GC: \u53D1\u73B0\u5B64\u513F chunks");
|
|
952
|
+
try {
|
|
953
|
+
await this.vectorStore?.deleteFilesByHash(orphans);
|
|
954
|
+
} catch (err) {
|
|
955
|
+
const error = err;
|
|
956
|
+
logger.warn({ error: error.message }, "GC: LanceDB \u5220\u9664\u5931\u8D25\uFF0C\u4E0B\u6B21\u91CD\u8BD5");
|
|
957
|
+
return { orphans: 0 };
|
|
958
|
+
}
|
|
959
|
+
const pathsToFtsClean = Array.from(new Set(orphans.map((o) => o.path))).filter(
|
|
960
|
+
(p) => !sqlitePaths.has(p)
|
|
961
|
+
);
|
|
962
|
+
if (pathsToFtsClean.length > 0 && isChunksFtsInitialized(db)) {
|
|
963
|
+
try {
|
|
964
|
+
batchDeleteFileChunksFts(db, pathsToFtsClean);
|
|
965
|
+
} catch (err) {
|
|
966
|
+
const error = err;
|
|
967
|
+
logger.warn({ error: error.message }, "GC: chunks_fts \u6E05\u7406\u5931\u8D25");
|
|
968
|
+
}
|
|
969
|
+
}
|
|
970
|
+
return { orphans: orphans.length };
|
|
971
|
+
}
|
|
926
972
|
/**
|
|
927
973
|
* 获取索引统计
|
|
928
974
|
*/
|
|
@@ -959,8 +1005,8 @@ function getTokenBoundaryRegex(token) {
|
|
|
959
1005
|
}
|
|
960
1006
|
return regex;
|
|
961
1007
|
}
|
|
962
|
-
function scoreChunkTokenOverlap(chunk, queryTokens) {
|
|
963
|
-
const text = `${chunk.breadcrumb} ${
|
|
1008
|
+
function scoreChunkTokenOverlap(chunk, code, queryTokens) {
|
|
1009
|
+
const text = `${chunk.breadcrumb} ${code}`.toLowerCase();
|
|
964
1010
|
let score = 0;
|
|
965
1011
|
for (const token of queryTokens) {
|
|
966
1012
|
if (text.includes(token)) {
|
|
@@ -975,21 +1021,6 @@ function scoreChunkTokenOverlap(chunk, queryTokens) {
|
|
|
975
1021
|
return score;
|
|
976
1022
|
}
|
|
977
1023
|
|
|
978
|
-
// src/search/resolvers/types.ts
|
|
979
|
-
function commonPrefixLength(path1, path2) {
|
|
980
|
-
const parts1 = path1.split("/");
|
|
981
|
-
const parts2 = path2.split("/");
|
|
982
|
-
let count = 0;
|
|
983
|
-
for (let i = 0; i < Math.min(parts1.length, parts2.length); i++) {
|
|
984
|
-
if (parts1[i] === parts2[i]) {
|
|
985
|
-
count++;
|
|
986
|
-
} else {
|
|
987
|
-
break;
|
|
988
|
-
}
|
|
989
|
-
}
|
|
990
|
-
return count;
|
|
991
|
-
}
|
|
992
|
-
|
|
993
1024
|
// src/search/resolvers/CppResolver.ts
|
|
994
1025
|
var CPP_EXTENSIONS = /* @__PURE__ */ new Set([".c", ".cpp", ".cc", ".cxx", ".h", ".hpp", ".hh", ".hxx"]);
|
|
995
1026
|
var CppResolver = class {
|
|
@@ -1643,6 +1674,20 @@ var GraphExpander = class {
|
|
|
1643
1674
|
if (allTargetPaths.size === 0) return result;
|
|
1644
1675
|
const importChunksMap = await this.vectorStore?.getFilesChunks(Array.from(allTargetPaths));
|
|
1645
1676
|
if (!importChunksMap) return result;
|
|
1677
|
+
const sharedLoader = new ChunkContentLoader(this.db);
|
|
1678
|
+
const allSlices = [];
|
|
1679
|
+
if (queryTokens && queryTokens.size > 0) {
|
|
1680
|
+
for (const chunks of importChunksMap.values()) {
|
|
1681
|
+
for (const c of chunks) {
|
|
1682
|
+
allSlices.push({
|
|
1683
|
+
filePath: c.file_path,
|
|
1684
|
+
start_index: c.start_index,
|
|
1685
|
+
end_index: c.end_index
|
|
1686
|
+
});
|
|
1687
|
+
}
|
|
1688
|
+
}
|
|
1689
|
+
}
|
|
1690
|
+
const sharedCodeMap = sharedLoader.loadMany(allSlices);
|
|
1646
1691
|
const bestByKey = /* @__PURE__ */ new Map();
|
|
1647
1692
|
for (const { targetPath, depth, seedScore } of resolvedImports) {
|
|
1648
1693
|
const importChunks = importChunksMap.get(targetPath);
|
|
@@ -1650,7 +1695,8 @@ var GraphExpander = class {
|
|
|
1650
1695
|
const selectedChunks = this.selectImportChunks(
|
|
1651
1696
|
importChunks,
|
|
1652
1697
|
chunksPerImportFile,
|
|
1653
|
-
queryTokens
|
|
1698
|
+
queryTokens,
|
|
1699
|
+
sharedCodeMap
|
|
1654
1700
|
);
|
|
1655
1701
|
const depthDecay = depth === 0 ? 1 : decayDepth;
|
|
1656
1702
|
for (const chunk of selectedChunks) {
|
|
@@ -1709,16 +1755,38 @@ var GraphExpander = class {
|
|
|
1709
1755
|
/**
|
|
1710
1756
|
* 选择导入文件的 chunks(优先 query overlap)
|
|
1711
1757
|
*/
|
|
1712
|
-
selectImportChunks(chunks, limit, queryTokens) {
|
|
1758
|
+
selectImportChunks(chunks, limit, queryTokens, sharedCodeMap) {
|
|
1713
1759
|
if (limit <= 0) return [];
|
|
1714
1760
|
const sortedByIndex = chunks.slice().sort((a, b) => a.chunk_index - b.chunk_index);
|
|
1715
1761
|
if (!queryTokens || queryTokens.size === 0) {
|
|
1716
1762
|
return sortedByIndex.slice(0, limit);
|
|
1717
1763
|
}
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1721
|
-
}
|
|
1764
|
+
let codeMap;
|
|
1765
|
+
if (sharedCodeMap) {
|
|
1766
|
+
codeMap = sharedCodeMap;
|
|
1767
|
+
} else {
|
|
1768
|
+
const loader = new ChunkContentLoader(this.db);
|
|
1769
|
+
codeMap = loader.loadMany(
|
|
1770
|
+
sortedByIndex.map((c) => ({
|
|
1771
|
+
filePath: c.file_path,
|
|
1772
|
+
start_index: c.start_index,
|
|
1773
|
+
end_index: c.end_index
|
|
1774
|
+
}))
|
|
1775
|
+
);
|
|
1776
|
+
}
|
|
1777
|
+
const scored = sortedByIndex.map((chunk) => {
|
|
1778
|
+
const code = codeMap.get(
|
|
1779
|
+
ChunkContentLoader.key({
|
|
1780
|
+
filePath: chunk.file_path,
|
|
1781
|
+
start_index: chunk.start_index,
|
|
1782
|
+
end_index: chunk.end_index
|
|
1783
|
+
})
|
|
1784
|
+
) ?? "";
|
|
1785
|
+
return {
|
|
1786
|
+
chunk,
|
|
1787
|
+
score: scoreChunkTokenOverlap(chunk, code, queryTokens)
|
|
1788
|
+
};
|
|
1789
|
+
});
|
|
1722
1790
|
const overlapped = scored.filter((s) => s.score > 0).sort((a, b) => b.score - a.score).slice(0, limit).map((s) => s.chunk);
|
|
1723
1791
|
return overlapped.length > 0 ? overlapped : sortedByIndex.slice(0, limit);
|
|
1724
1792
|
}
|
|
@@ -1749,12 +1817,10 @@ async function getGraphExpander(projectId, config) {
|
|
|
1749
1817
|
}
|
|
1750
1818
|
|
|
1751
1819
|
export {
|
|
1752
|
-
|
|
1753
|
-
closeAllVectorStores,
|
|
1820
|
+
bootstrap,
|
|
1754
1821
|
getIndexer,
|
|
1755
1822
|
closeAllIndexers,
|
|
1756
1823
|
scoreChunkTokenOverlap,
|
|
1757
1824
|
invalidateAllExpanderCaches,
|
|
1758
1825
|
getGraphExpander
|
|
1759
1826
|
};
|
|
1760
|
-
//# sourceMappingURL=chunk-6QMYML5V.js.map
|