@hsingjui/contextweaver 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +380 -0
- package/dist/SearchService-YLOUJF4S.js +1496 -0
- package/dist/chunk-34YZ2U3O.js +1177 -0
- package/dist/chunk-5SRSUMKW.js +612 -0
- package/dist/chunk-5TV4JNTE.js +258 -0
- package/dist/chunk-6C2D5Y4R.js +798 -0
- package/dist/chunk-PN7DP6XL.js +158 -0
- package/dist/codebaseRetrieval-RDCNIUDM.js +10 -0
- package/dist/config-IEL3M4V5.js +18 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +130 -0
- package/dist/scanner-66CLKCSZ.js +9 -0
- package/dist/server-2SAFEAEY.js +131 -0
- package/package.json +59 -0
|
@@ -0,0 +1,798 @@
|
|
|
1
|
+
import {
|
|
2
|
+
batchDeleteFileChunksFts,
|
|
3
|
+
batchUpdateVectorIndexHash,
|
|
4
|
+
batchUpsertChunkFts,
|
|
5
|
+
clearVectorIndexHash,
|
|
6
|
+
isChunksFtsInitialized,
|
|
7
|
+
logger
|
|
8
|
+
} from "./chunk-5SRSUMKW.js";
|
|
9
|
+
import {
|
|
10
|
+
getEmbeddingConfig
|
|
11
|
+
} from "./chunk-PN7DP6XL.js";
|
|
12
|
+
|
|
13
|
+
// src/vectorStore/index.ts
|
|
14
|
+
import * as lancedb from "@lancedb/lancedb";
|
|
15
|
+
import path from "path";
|
|
16
|
+
import os from "os";
|
|
17
|
+
import fs from "fs";
|
|
18
|
+
const BASE_DIR = path.join(os.homedir(), ".contextweaver");

/**
 * Per-project wrapper around a LanceDB database that holds a single table
 * named "chunks".
 *
 * The database is stored at `~/.contextweaver/<projectId>/vectors.lance`.
 * The table is created lazily from the first non-empty batch of records.
 */
class VectorStore {
  /** LanceDB connection; null until init() resolves and again after close(). */
  db = null;
  /** Handle to the "chunks" table; null while the table does not exist. */
  table = null;
  projectId;
  dbPath;
  vectorDim;

  /**
   * @param {string} projectId - Sub-directory name under the base directory.
   * @param {number} [vectorDim=1024] - Embedding dimension. It is only stored
   *   and reported by getVectorDim(); this class never validates against it.
   */
  constructor(projectId, vectorDim = 1024) {
    this.projectId = projectId;
    this.dbPath = path.join(BASE_DIR, projectId, "vectors.lance");
    this.vectorDim = vectorDim;
  }

  /**
   * Opens the connection and, when it already exists, the "chunks" table.
   * Calling it again while connected is a no-op.
   */
  async init() {
    if (this.db) return;
    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync(path.join(BASE_DIR, this.projectId), { recursive: true });
    this.db = await lancedb.connect(this.dbPath);
    const tableNames = await this.db.tableNames();
    if (tableNames.includes("chunks")) {
      this.table = await this.db.openTable("chunks");
    }
  }

  /**
   * Creates the "chunks" table from `records` if it does not exist yet
   * (used on first insert). Does nothing when the table is already open or
   * `records` is empty.
   *
   * @param {object[]} records - Initial rows; they are written by createTable.
   * @throws {Error} When init() has not been called.
   */
  async ensureTable(records) {
    if (this.table) return;
    if (!this.db) throw new Error("VectorStore not initialized");
    if (records.length === 0) return;
    this.table = await this.db.createTable("chunks", records);
  }

  /**
   * Monotonic version update for one file: insert the new version first,
   * then delete rows of the same path carrying a different hash.
   *
   * Ordering rationale: if the process dies between the two steps the worst
   * case is that old and new versions coexist, never that the file is missing.
   * Rows that already carry `newHash` are not removed by the cleanup step.
   *
   * @param {string} filePath
   * @param {string} newHash
   * @param {object[]} records - Empty means "remove the file entirely".
   * @throws {Error} When init() has not been called.
   */
  async upsertFile(filePath, newHash, records) {
    if (!this.db) throw new Error("VectorStore not initialized");
    if (records.length === 0) {
      await this.deleteFile(filePath);
      return;
    }
    if (this.table) {
      await this.table.add(records);
    } else {
      // createTable() writes `records` itself, so no separate add() is needed.
      await this.ensureTable(records);
    }
    if (this.table) {
      await this.table.delete(
        `file_path = '${this.escapeString(filePath)}' AND file_hash != '${this.escapeString(newHash)}'`
      );
    }
  }

  /**
   * Batched variant of upsertFile().
   *
   * Files are grouped into batches of at most 50 files; a batch is also closed
   * when adding the next file would push it past 5000 records. Each batch then
   * runs "insert new rows, delete stale versions". The original author notes
   * that batching is required because the LanceDB native module may crash on
   * very large payloads.
   *
   * @param {{path: string, hash: string, records: object[]}[]} files
   * @throws {Error} When init() has not been called.
   */
  async batchUpsertFiles(files) {
    if (!this.db) throw new Error("VectorStore not initialized");
    if (files.length === 0) return;
    const BATCH_FILES = 50;
    const BATCH_RECORDS = 5e3;

    const batches = [];
    let currentBatch = [];
    let currentRecordCount = 0;
    for (const file of files) {
      const batchIsFull =
        currentBatch.length >= BATCH_FILES ||
        currentRecordCount + file.records.length > BATCH_RECORDS;
      if (batchIsFull) {
        if (currentBatch.length > 0) {
          batches.push(currentBatch);
        }
        currentBatch = [];
        currentRecordCount = 0;
      }
      // A single file larger than BATCH_RECORDS still gets a batch of its own.
      currentBatch.push(file);
      currentRecordCount += file.records.length;
    }
    if (currentBatch.length > 0) {
      batches.push(currentBatch);
    }

    for (const batch of batches) {
      const batchRecords = batch.flatMap((file) => file.records);
      if (batchRecords.length === 0) {
        // Every file in the batch is empty: treat the batch as deletions.
        await this.deleteFiles(batch.map((f) => f.path));
        continue;
      }
      if (this.table) {
        await this.table.add(batchRecords);
      } else {
        await this.ensureTable(batchRecords);
      }
      if (this.table && batch.length > 0) {
        const deleteConditions = batch
          .map((f) => `(file_path = '${this.escapeString(f.path)}' AND file_hash != '${this.escapeString(f.hash)}')`)
          .join(" OR ");
        await this.table.delete(deleteConditions);
      }
    }
  }

  /**
   * Deletes every chunk of one file. No-op when the table does not exist.
   * @param {string} filePath
   */
  async deleteFile(filePath) {
    if (!this.table) return;
    await this.table.delete(`file_path = '${this.escapeString(filePath)}'`);
  }

  /**
   * Deletes every chunk of several files with a single DELETE instead of one
   * call per file. No-op when the table does not exist or the list is empty.
   * @param {string[]} filePaths
   */
  async deleteFiles(filePaths) {
    if (!this.table || filePaths.length === 0) return;
    const conditions = filePaths
      .map((p) => `file_path = '${this.escapeString(p)}'`)
      .join(" OR ");
    await this.table.delete(conditions);
  }

  /**
   * Vector similarity search.
   *
   * @param {number[]} queryVector
   * @param {number} [limit=10]
   * @param {string} [filter] - Filter expression passed verbatim to `where()`;
   *   callers are responsible for escaping values inside it.
   * @returns {Promise<object[]>} Matching rows, or [] when the table is missing.
   */
  async search(queryVector, limit = 10, filter) {
    if (!this.table) return [];
    let query = this.table.vectorSearch(queryVector).limit(limit);
    if (filter) {
      query = query.where(filter);
    }
    return await query.toArray();
  }

  /**
   * Returns all chunks of one file, sorted by `chunk_index` ascending.
   * @param {string} filePath
   * @returns {Promise<object[]>}
   */
  async getFileChunks(filePath) {
    if (!this.table) return [];
    const rows = await this.table
      .query()
      .where(`file_path = '${this.escapeString(filePath)}'`)
      .toArray();
    return rows.sort((a, b) => a.chunk_index - b.chunk_index);
  }

  /**
   * Fetches the chunks of several files with one query instead of one query
   * per file.
   *
   * @param {string[]} filePaths
   * @returns {Promise<Map<string, object[]>>} Map keyed by `file_path`; each
   *   array is sorted by `chunk_index`. Files without rows have no entry.
   */
  async getFilesChunks(filePaths) {
    const result = new Map();
    if (!this.table || filePaths.length === 0) return result;
    const conditions = filePaths
      .map((p) => `file_path = '${this.escapeString(p)}'`)
      .join(" OR ");
    const rows = await this.table.query().where(conditions).toArray();
    for (const row of rows) {
      const bucket = result.get(row.file_path);
      if (bucket) {
        bucket.push(row);
      } else {
        result.set(row.file_path, [row]);
      }
    }
    for (const bucket of result.values()) {
      bucket.sort((a, b) => a.chunk_index - b.chunk_index);
    }
    return result;
  }

  /**
   * @returns {Promise<number>} Row count of the table, 0 when it is missing.
   */
  async count() {
    if (!this.table) return 0;
    return await this.table.countRows();
  }

  /**
   * Drops the "chunks" table. No-op before init().
   */
  async clear() {
    if (!this.db) return;
    try {
      await this.db.dropTable("chunks");
      this.table = null;
    } catch {
      // Deliberately best-effort: dropping a table that does not exist is
      // expected here. On failure `this.table` is left as it was.
    }
  }

  /**
   * @returns {number} The dimension passed to the constructor.
   */
  getVectorDim() {
    return this.vectorDim;
  }

  /**
   * Escapes a value for use inside a single-quoted SQL string literal by
   * doubling single quotes.
   * @param {string} str
   * @returns {string}
   */
  escapeString(str) {
    return str.replaceAll("'", "''");
  }

  /**
   * Releases the table and connection handles and resets the store to its
   * pre-init() state.
   *
   * The fields are cleared first so the store is unusable even if a handle
   * fails to close. `close` is looked up optionally so a handle that lacks the
   * method is simply dropped.
   */
  async close() {
    const { db, table } = this;
    this.db = null;
    this.table = null;
    try {
      table?.close?.();
    } finally {
      db?.close?.();
    }
  }
}
|
|
228
|
+
/** Initialized stores, keyed by project id. */
const vectorStores = new Map();
/** In-flight initializations, keyed by project id. */
const pendingVectorStoreInits = new Map();

/**
 * Returns the shared, initialized VectorStore for a project, creating it on
 * first use.
 *
 * Callers that arrive while the first initialization is still running share
 * that initialization instead of each opening their own connection. A failed
 * initialization is not cached, so the next call retries.
 *
 * @param {string} projectId
 * @param {number} [vectorDim=1024] - Only used when the store is created;
 *   ignored when a store for `projectId` already exists or is being created.
 * @returns {Promise<VectorStore>}
 */
async function getVectorStore(projectId, vectorDim = 1024) {
  const ready = vectorStores.get(projectId);
  if (ready) return ready;

  let pending = pendingVectorStoreInits.get(projectId);
  if (!pending) {
    pending = (async () => {
      const store = new VectorStore(projectId, vectorDim);
      await store.init();
      vectorStores.set(projectId, store);
      return store;
    })().finally(() => {
      pendingVectorStoreInits.delete(projectId);
    });
    pendingVectorStoreInits.set(projectId, pending);
  }
  return pending;
}
|
|
238
|
+
/**
 * Closes every cached vector store one after another, then empties the cache.
 *
 * If a store fails to close, the error propagates and the cache is left
 * untouched.
 */
async function closeAllVectorStores() {
  for (const [, openStore] of vectorStores) {
    await openStore.close();
  }
  vectorStores.clear();
}
|
|
244
|
+
|
|
245
|
+
// src/indexer/index.ts
|
|
246
|
+
import "better-sqlite3";
|
|
247
|
+
|
|
248
|
+
// src/api/embedding.ts
|
|
249
|
+
/**
 * Tracks progress of a fixed number of embedding batches and writes
 * throttled progress lines plus one final summary through `logger.info`.
 */
class ProgressTracker {
  /** Number of batches recorded so far. */
  completed = 0;
  /** Total number of batches expected. */
  total;
  /** Sum of the token counts passed to recordBatch(). */
  totalTokens = 0;
  /** Creation time in ms since epoch; basis for elapsed time and ETA. */
  startTime;
  /** Time of the last progress line; 0 means "never", so the first batch always logs. */
  lastLogTime = 0;
  /** Minimum gap between two progress lines: 2 seconds. */
  logIntervalMs = 2e3;

  /**
   * @param {number} total - Number of batches that will be recorded.
   */
  constructor(total) {
    this.total = total;
    this.startTime = Date.now();
  }

  /**
   * Records one finished batch and logs progress when at least
   * `logIntervalMs` has passed since the previous progress line.
   * @param {number} tokens - Tokens consumed by the batch.
   */
  recordBatch(tokens) {
    this.completed++;
    this.totalTokens += tokens;
    const now = Date.now();
    if (now - this.lastLogTime >= this.logIntervalMs) {
      this.logProgress();
      this.lastLogTime = now;
    }
  }

  /** Writes one progress line (count, percent, tokens, elapsed, ETA). */
  logProgress() {
    const elapsed = (Date.now() - this.startTime) / 1e3;
    // With nothing to do, report 100% rather than NaN%.
    const percent = this.total > 0 ? Math.round(this.completed / this.total * 100) : 100;
    const rate = this.completed / elapsed;
    // `rate` is NaN or Infinity when elapsed is 0; both end up as an ETA of 0.
    const eta = rate > 0 ? Math.round((this.total - this.completed) / rate) : 0;
    logger.info(
      {
        progress: `${this.completed}/${this.total}`,
        percent: `${percent}%`,
        tokens: this.totalTokens,
        elapsed: `${elapsed.toFixed(1)}s`,
        eta: `${eta}s`
      },
      "Embedding \u8FDB\u5EA6"
    );
  }

  /** Writes the final summary line. */
  complete() {
    const elapsed = (Date.now() - this.startTime) / 1e3;
    logger.info(
      {
        batches: this.total,
        tokens: this.totalTokens,
        elapsed: `${elapsed.toFixed(1)}s`,
        // Guard the division so an empty run reports 0 instead of NaN.
        avgTokensPerBatch: this.total > 0 ? Math.round(this.totalTokens / this.total) : 0
      },
      "Embedding \u5B8C\u6210"
    );
  }
}
|
|
302
|
+
/**
 * Concurrency limiter with a shared "pause everything" switch for HTTP 429.
 *
 * Callers acquire() a slot before a request and release it with one of the
 * release*() methods. On a rate-limit response, triggerRateLimit() drops the
 * concurrency to 1, blocks all acquirers for the current backoff and doubles
 * the backoff for next time. Successful requests restore concurrency and
 * shrink the backoff step by step.
 */
class RateLimitController {
  /** True while a rate-limit pause is in progress. */
  isPaused = false;
  /** Resolves when the current pause ends; null when not paused. */
  pausePromise = null;
  /** Number of requests currently allowed to run in parallel. */
  currentConcurrency;
  /** Configured upper bound for currentConcurrency. */
  maxConcurrency;
  /** Number of slots currently held. */
  activeRequests = 0;
  /** Successes since the last reset; drives concurrency recovery. */
  consecutiveSuccesses = 0;
  /** Backoff (ms) the next pause will wait. */
  backoffMs = 5e3;
  /** Consecutive successes needed to raise concurrency by one. */
  successesPerConcurrencyIncrease = 3;
  /** Lower bound for backoffMs. */
  minBackoffMs = 5e3;
  /** Upper bound for backoffMs. */
  maxBackoffMs = 6e4;

  /**
   * @param {number} maxConcurrency - Maximum number of parallel requests.
   */
  constructor(maxConcurrency) {
    this.maxConcurrency = maxConcurrency;
    this.currentConcurrency = maxConcurrency;
  }

  /**
   * Waits until no pause is active and a slot is free, then takes the slot.
   * Slot availability is polled every 50 ms.
   */
  async acquire() {
    if (this.pausePromise) {
      await this.pausePromise;
    }
    while (this.activeRequests >= this.currentConcurrency) {
      await sleep(50);
      // A pause may have started while this caller was polling.
      if (this.pausePromise) {
        await this.pausePromise;
      }
    }
    this.activeRequests++;
  }

  /**
   * Releases a slot after a successful request. Every
   * `successesPerConcurrencyIncrease` successes raise the concurrency by one
   * (up to the maximum); when the success counter reaches a multiple of 10
   * the backoff is halved (down to the minimum).
   */
  releaseSuccess() {
    this.activeRequests = Math.max(0, this.activeRequests - 1);
    this.consecutiveSuccesses++;
    const canGrow = this.currentConcurrency < this.maxConcurrency;
    if (canGrow && this.consecutiveSuccesses >= this.successesPerConcurrencyIncrease) {
      this.currentConcurrency++;
      this.consecutiveSuccesses = 0;
    }
    if (this.consecutiveSuccesses > 0 && this.consecutiveSuccesses % 10 === 0) {
      this.backoffMs = Math.max(this.minBackoffMs, this.backoffMs / 2);
    }
  }

  /** Releases a slot after a failure that was not a rate limit. */
  releaseFailure() {
    this.activeRequests = Math.max(0, this.activeRequests - 1);
  }

  /** Releases a slot before a rate-limit retry and resets the success counter. */
  releaseForRetry() {
    this.activeRequests = Math.max(0, this.activeRequests - 1);
    this.consecutiveSuccesses = 0;
  }

  /**
   * Starts a rate-limit pause, or joins the one already running.
   *
   * The first caller sets concurrency to 1, sleeps for the current backoff,
   * doubles the backoff (capped at maxBackoffMs) and then releases everyone
   * waiting on `pausePromise`.
   */
  async triggerRateLimit() {
    if (this.isPaused && this.pausePromise) {
      logger.debug("\u901F\u7387\u9650\u5236\uFF1A\u7B49\u5F85\u73B0\u6709\u6682\u505C\u7ED3\u675F");
      await this.pausePromise;
      return;
    }
    this.isPaused = true;
    this.consecutiveSuccesses = 0;
    const previousConcurrency = this.currentConcurrency;
    this.currentConcurrency = 1;
    // Captured now so the resume log reports the time actually waited, not
    // the doubled value prepared for the next pause.
    const waitMs = this.backoffMs;
    logger.warn(
      {
        backoffMs: waitMs,
        previousConcurrency,
        newConcurrency: this.currentConcurrency,
        activeRequests: this.activeRequests
      },
      "\u901F\u7387\u9650\u5236\uFF1A\u89E6\u53D1 429\uFF0C\u6682\u505C\u6240\u6709\u8BF7\u6C42"
    );
    let resumeResolve;
    this.pausePromise = new Promise((resolve) => {
      resumeResolve = resolve;
    });
    await sleep(waitMs);
    this.backoffMs = Math.min(this.maxBackoffMs, this.backoffMs * 2);
    this.isPaused = false;
    // Clear the field before resolving so woken acquirers see "not paused".
    this.pausePromise = null;
    resumeResolve();
    logger.info(
      { waitMs },
      "\u901F\u7387\u9650\u5236\uFF1A\u6062\u590D\u8BF7\u6C42"
    );
  }

  /**
   * @returns {{isPaused: boolean, currentConcurrency: number, maxConcurrency: number, activeRequests: number, backoffMs: number}}
   *   Snapshot of the limiter state, for debugging.
   */
  getStatus() {
    return {
      isPaused: this.isPaused,
      currentConcurrency: this.currentConcurrency,
      maxConcurrency: this.maxConcurrency,
      activeRequests: this.activeRequests,
      backoffMs: this.backoffMs
    };
  }
}
|
|
422
|
+
/** Process-wide limiter instance, created on first request. */
let globalRateLimitController = null;

/**
 * Returns the process-wide RateLimitController, creating it on the first call.
 *
 * @param {number} maxConcurrency - Only used by the call that creates the
 *   instance; later calls return the existing instance unchanged.
 * @returns {RateLimitController}
 */
function getRateLimitController(maxConcurrency) {
  globalRateLimitController ??= new RateLimitController(maxConcurrency);
  return globalRateLimitController;
}
|
|
429
|
+
/**
 * Client for an embeddings HTTP endpoint. Requests are POSTed as JSON to
 * `config.baseUrl` with a Bearer token, and all requests go through the
 * shared rate limiter.
 */
class EmbeddingClient {
  config;
  rateLimiter;

  /**
   * @param {object} [config] - Embedding settings; this class reads
   *   `baseUrl`, `apiKey`, `model` and `maxConcurrency`. Defaults to
   *   getEmbeddingConfig().
   */
  constructor(config) {
    this.config = config || getEmbeddingConfig();
    this.rateLimiter = getRateLimitController(this.config.maxConcurrency);
  }

  /**
   * Decides whether an error means "rate limited, retry after backing off".
   *
   * The HTTP status is authoritative when present. The message check uses
   * word boundaries and ignores case so that "Rate limit reached" matches
   * while words that merely contain "rate" (for example "generate") do not.
   */
  static #isRateLimitError(err) {
    if (err?.status === 429) return true;
    const message = typeof err?.message === "string" ? err.message : "";
    return /\b429\b|\brate\b|rate.?limit|too many requests/i.test(message);
  }

  /**
   * Embeds a single text.
   * @param {string} text
   * @returns {Promise<number[]>} The embedding vector.
   */
  async embed(text) {
    const [first] = await this.embedBatch([text]);
    return first.embedding;
  }

  /**
   * Embeds many texts, split into requests of `batchSize` texts each.
   *
   * @param {string[]} texts
   * @param {number} [batchSize=20] - Texts per request; positive integer.
   * @returns {Promise<{text: string, embedding: number[], index: number}[]>}
   *   One entry per input, in input order; `index` is the position in `texts`.
   * @throws {RangeError} When `batchSize` is not a positive integer.
   */
  async embedBatch(texts, batchSize = 20) {
    if (texts.length === 0) {
      return [];
    }
    // A zero or negative step would never advance the slicing loop below.
    if (!Number.isInteger(batchSize) || batchSize < 1) {
      throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`);
    }
    const batches = [];
    for (let offset = 0; offset < texts.length; offset += batchSize) {
      batches.push(texts.slice(offset, offset + batchSize));
    }
    const progress = new ProgressTracker(batches.length);
    const batchResults = await Promise.all(
      batches.map(
        (batch, batchIndex) => this.processWithRateLimit(batch, batchIndex * batchSize, progress)
      )
    );
    progress.complete();
    return batchResults.flat();
  }

  /**
   * Runs one batch under the rate limiter, retrying for as long as the
   * endpoint reports a rate limit. Written as a loop so every attempt takes
   * and releases exactly one slot.
   *
   * @param {string[]} texts
   * @param {number} startIndex - Position of `texts[0]` in the overall input.
   * @param {ProgressTracker} progress
   * @throws Any error that is not a rate-limit error.
   */
  async processWithRateLimit(texts, startIndex, progress) {
    while (true) {
      await this.rateLimiter.acquire();
      try {
        const result = await this.processBatch(texts, startIndex, progress);
        this.rateLimiter.releaseSuccess();
        return result;
      } catch (err) {
        if (!EmbeddingClient.#isRateLimitError(err)) {
          this.rateLimiter.releaseFailure();
          throw err;
        }
        this.rateLimiter.releaseForRetry();
        await this.rateLimiter.triggerRateLimit();
      }
    }
  }

  /**
   * Sends a single request (no retry logic) and maps the response.
   *
   * @param {string[]} texts
   * @param {number} startIndex - Position of `texts[0]` in the overall input.
   * @param {ProgressTracker} progress - Receives `usage.total_tokens` (or 0).
   * @returns {Promise<{text: string, embedding: number[], index: number}[]>}
   *   Entries ordered by input position, regardless of response order.
   * @throws {Error} On HTTP errors, error payloads, unparsable bodies, or a
   *   response that does not contain exactly one embedding per input. The
   *   error carries the HTTP status in `status`.
   */
  async processBatch(texts, startIndex, progress) {
    const requestBody = {
      model: this.config.model,
      input: texts,
      encoding_format: "float"
    };
    const response = await fetch(this.config.baseUrl, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${this.config.apiKey}`
      },
      body: JSON.stringify(requestBody)
    });

    // Read as text and parse defensively: an error response may not be JSON,
    // and the HTTP status must survive so a 429 can still be recognised.
    const rawBody = await response.text();
    let data = null;
    try {
      data = rawBody ? JSON.parse(rawBody) : null;
    } catch {
      data = null;
    }
    if (!response.ok || !data || data.error) {
      const fallback = response.ok
        ? `invalid JSON response (HTTP ${response.status})`
        : `HTTP ${response.status}`;
      const errorMsg = data?.error?.message || fallback;
      const error = new Error(`Embedding API \u9519\u8BEF: ${errorMsg}`);
      error.status = response.status;
      throw error;
    }

    const items = Array.isArray(data.data) ? data.data : [];
    // Place each item by its declared index (falling back to its position)
    // so callers can rely on positional order.
    const results = new Array(texts.length).fill(null);
    items.forEach((item, position) => {
      const localIndex = Number.isInteger(item.index) ? item.index : position;
      results[localIndex] = {
        text: texts[localIndex],
        embedding: item.embedding,
        index: startIndex + localIndex
      };
    });
    const complete =
      items.length === texts.length &&
      results.length === texts.length &&
      results.every((entry) => entry !== null);
    if (!complete) {
      const error = new Error(
        `Embedding API \u9519\u8BEF: expected ${texts.length} embeddings, got ${items.length}`
      );
      error.status = response.status;
      throw error;
    }

    progress.recordBatch(data.usage?.total_tokens || 0);
    return results;
  }

  /**
   * @returns {object} Shallow copy of the active configuration.
   */
  getConfig() {
    return { ...this.config };
  }

  /**
   * @returns {object} Rate limiter status snapshot, for debugging.
   */
  getRateLimiterStatus() {
    return this.rateLimiter.getStatus();
  }
}
|
|
531
|
+
/** Lazily created client built from the default embedding configuration. */
let defaultClient = null;

/**
 * Returns the shared EmbeddingClient, constructing it (with no explicit
 * config) on the first call.
 * @returns {EmbeddingClient}
 */
function getEmbeddingClient() {
  if (defaultClient) {
    return defaultClient;
  }
  defaultClient = new EmbeddingClient();
  return defaultClient;
}
|
|
538
|
+
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
541
|
+
|
|
542
|
+
// src/indexer/index.ts
|
|
543
|
+
/**
 * Keeps a project's vector index (VectorStore) and full-text index (FTS
 * tables in the SQLite database passed by the caller) in sync with scan
 * results.
 */
class Indexer {
  projectId;
  /** Shared VectorStore for the project; null until init(). */
  vectorStore = null;
  embeddingClient;
  vectorDim;

  /**
   * @param {string} projectId
   * @param {number} [vectorDim=1024] - Forwarded to getVectorStore().
   */
  constructor(projectId, vectorDim = 1024) {
    this.projectId = projectId;
    this.vectorDim = vectorDim;
    this.embeddingClient = getEmbeddingClient();
  }

  /** Obtains the project's vector store. */
  async init() {
    this.vectorStore = await getVectorStore(this.projectId, this.vectorDim);
  }

  /**
   * Applies scan results to the indexes.
   *
   * Status handling:
   * - "added"/"modified" with chunks → embedded and indexed
   * - "added"/"modified" without chunks → removed from the indexes; counted
   *   under both `skipped` and `deleted`
   * - "deleted" → removed from the indexes
   * - "unchanged"/"skipped"/"error" → counted as skipped
   *
   * @param {object} db - SQLite database handle used by the FTS/hash helpers.
   * @param {object[]} results - Per-file scan results; this method reads
   *   `status`, `relPath`, `hash` and `chunks`.
   * @returns {Promise<{indexed: number, deleted: number, errors: number, skipped: number}>}
   */
  async indexFiles(db, results) {
    if (!this.vectorStore) {
      await this.init();
    }
    const stats = { indexed: 0, deleted: 0, errors: 0, skipped: 0 };
    const toIndex = [];
    const toDelete = [];
    for (const result of results) {
      switch (result.status) {
        case "added":
        case "modified":
          if (result.chunks.length > 0) {
            toIndex.push({
              path: result.relPath,
              hash: result.hash,
              chunks: result.chunks
            });
          } else {
            toDelete.push(result.relPath);
            stats.skipped++;
          }
          break;
        case "deleted":
          toDelete.push(result.relPath);
          break;
        case "unchanged":
        case "skipped":
        case "error":
          stats.skipped++;
          break;
      }
    }
    if (toDelete.length > 0) {
      await this.deleteFiles(db, toDelete);
      stats.deleted = toDelete.length;
    }
    if (toIndex.length > 0) {
      const indexResult = await this.batchIndex(db, toIndex);
      stats.indexed = indexResult.success;
      stats.errors = indexResult.errors;
    }
    logger.info(
      { indexed: stats.indexed, deleted: stats.deleted, errors: stats.errors, skipped: stats.skipped },
      "\u5411\u91CF\u7D22\u5F15\u5B8C\u6210"
    );
    return stats;
  }

  /**
   * Embeds and indexes a set of files in bulk:
   * 1. one embedBatch() call for all chunk texts,
   * 2. one batchUpsertFiles() call for the vector store,
   * 3. one batched delete plus one batched insert for FTS,
   * 4. summary logging instead of per-file logging.
   *
   * If embedding or the vector write fails, the vector-index hash of every
   * file in the call is cleared and all files count as errors. An FTS failure
   * is only logged.
   *
   * @param {object} db - SQLite database handle.
   * @param {{path: string, hash: string, chunks: object[]}[]} files
   * @returns {Promise<{success: number, errors: number}>}
   */
  async batchIndex(db, files) {
    if (files.length === 0) {
      return { success: 0, errors: 0 };
    }

    // Flatten all chunk texts, remembering where each file/chunk landed.
    const allTexts = [];
    const globalIndexByFileChunk = [];
    files.forEach((file, fileIdx) => {
      globalIndexByFileChunk[fileIdx] = [];
      file.chunks.forEach((chunk, chunkIdx) => {
        globalIndexByFileChunk[fileIdx][chunkIdx] = allTexts.length;
        allTexts.push(chunk.vectorText);
      });
    });
    if (allTexts.length === 0) {
      return { success: 0, errors: 0 };
    }

    logger.info({ count: allTexts.length, files: files.length }, "\u5F00\u59CB\u6279\u91CF Embedding");
    let embeddings;
    try {
      const results = await this.embeddingClient.embedBatch(allTexts);
      embeddings = results.map((r) => r.embedding);
    } catch (err) {
      logger.error({ error: err.message, stack: err.stack }, "Embedding \u5931\u8D25");
      clearVectorIndexHash(db, files.map((f) => f.path));
      return { success: 0, errors: files.length };
    }

    const filesToUpsert = [];
    const allFtsChunks = [];
    const successFiles = [];
    const errorFiles = [];
    for (let fileIdx = 0; fileIdx < files.length; fileIdx++) {
      const file = files[fileIdx];
      try {
        const records = [];
        // Buffered per file and merged only after the whole file assembled,
        // so a file that fails midway leaves no partial rows in the FTS batch.
        const fileFtsChunks = [];
        for (let chunkIdx = 0; chunkIdx < file.chunks.length; chunkIdx++) {
          const chunk = file.chunks[chunkIdx];
          const globalIdx = globalIndexByFileChunk[fileIdx][chunkIdx];
          const vector = globalIdx === void 0 ? void 0 : embeddings[globalIdx];
          // Check the vector itself too: a short embedding result would
          // otherwise put an undefined vector into the record.
          if (vector == null) {
            throw new Error(`\u627E\u4E0D\u5230 chunk \u7684 embedding: ${file.path}#${chunkIdx}`);
          }
          const record = {
            chunk_id: `${file.path}#${file.hash}#${chunkIdx}`,
            file_path: file.path,
            file_hash: file.hash,
            chunk_index: chunkIdx,
            vector,
            display_code: chunk.displayCode,
            vector_text: chunk.vectorText,
            language: chunk.metadata.language,
            breadcrumb: chunk.metadata.contextPath.join(" > "),
            start_index: chunk.metadata.startIndex,
            end_index: chunk.metadata.endIndex,
            raw_start: chunk.metadata.rawSpan.start,
            raw_end: chunk.metadata.rawSpan.end,
            vec_start: chunk.metadata.vectorSpan.start,
            vec_end: chunk.metadata.vectorSpan.end
          };
          records.push(record);
          fileFtsChunks.push({
            chunkId: record.chunk_id,
            filePath: record.file_path,
            chunkIndex: record.chunk_index,
            breadcrumb: record.breadcrumb,
            content: record.breadcrumb + "\n" + record.display_code
          });
        }
        filesToUpsert.push({ path: file.path, hash: file.hash, records });
        for (const ftsChunk of fileFtsChunks) {
          allFtsChunks.push(ftsChunk);
        }
        successFiles.push({ path: file.path, hash: file.hash });
      } catch (err) {
        logger.error({ path: file.path, error: err.message, stack: err.stack }, "\u7EC4\u88C5 ChunkRecord \u5931\u8D25");
        errorFiles.push(file.path);
      }
    }

    if (filesToUpsert.length > 0) {
      try {
        await this.vectorStore.batchUpsertFiles(filesToUpsert);
        logger.info({ files: filesToUpsert.length, chunks: allFtsChunks.length }, "LanceDB \u6279\u91CF\u5199\u5165\u5B8C\u6210");
      } catch (err) {
        logger.error({ error: err.message, stack: err.stack }, "LanceDB \u6279\u91CF\u5199\u5165\u5931\u8D25");
        clearVectorIndexHash(db, files.map((f) => f.path));
        return { success: 0, errors: files.length };
      }
    }

    if (isChunksFtsInitialized(db) && allFtsChunks.length > 0) {
      try {
        const pathsToDelete = filesToUpsert.map((f) => f.path);
        batchDeleteFileChunksFts(db, pathsToDelete);
        batchUpsertChunkFts(db, allFtsChunks);
        logger.info({ files: pathsToDelete.length, chunks: allFtsChunks.length }, "FTS \u6279\u91CF\u66F4\u65B0\u5B8C\u6210");
      } catch (err) {
        // The vector index is already written; FTS failure is non-fatal.
        logger.warn({ error: err.message }, "FTS \u6279\u91CF\u66F4\u65B0\u5931\u8D25\uFF08\u5411\u91CF\u7D22\u5F15\u5DF2\u6210\u529F\uFF09");
      }
    }

    if (successFiles.length > 0) {
      batchUpdateVectorIndexHash(db, successFiles);
    }
    logger.info(
      { success: successFiles.length, errors: errorFiles.length },
      "\u6279\u91CF\u7D22\u5F15\u5B8C\u6210"
    );
    return { success: successFiles.length, errors: errorFiles.length };
  }

  /**
   * Removes files from the vector store and, when initialized, from FTS.
   * Returns without doing anything when init() has not run.
   *
   * @param {object} db - SQLite database handle.
   * @param {string[]} paths
   */
  async deleteFiles(db, paths) {
    if (!this.vectorStore) return;
    await this.vectorStore.deleteFiles(paths);
    if (isChunksFtsInitialized(db)) {
      batchDeleteFileChunksFts(db, paths);
    }
    logger.debug({ count: paths.length }, "\u5220\u9664\u6587\u4EF6\u7D22\u5F15");
  }

  /**
   * Vector search; initializes the store on demand.
   * @param {number[]} queryVector
   * @param {number} [limit=10]
   * @param {string} [filter] - Passed through to VectorStore.search().
   */
  async search(queryVector, limit = 10, filter) {
    if (!this.vectorStore) {
      await this.init();
    }
    return this.vectorStore.search(queryVector, limit, filter);
  }

  /**
   * Text search: embeds the query, then runs a vector search.
   * @param {string} query
   * @param {number} [limit=10]
   * @param {string} [filter]
   */
  async textSearch(query, limit = 10, filter) {
    const queryVector = await this.embeddingClient.embed(query);
    return this.search(queryVector, limit, filter);
  }

  /** Clears the vector index; initializes the store on demand. */
  async clear() {
    if (!this.vectorStore) {
      await this.init();
    }
    await this.vectorStore.clear();
  }

  /**
   * @returns {Promise<{totalChunks: number}>} Row count of the vector store.
   */
  async getStats() {
    if (!this.vectorStore) {
      await this.init();
    }
    const count = await this.vectorStore.count();
    return { totalChunks: count };
  }
}
|
|
779
|
+
/** Initialized indexers, keyed by project id. */
const indexers = new Map();
/** In-flight indexer initializations, keyed by project id. */
const pendingIndexerInits = new Map();

/**
 * Returns the shared, initialized Indexer for a project, creating it on
 * first use.
 *
 * Callers that arrive while the first initialization is still running share
 * it instead of each constructing their own Indexer. A failed initialization
 * is not cached, so the next call retries.
 *
 * @param {string} projectId
 * @param {number} [vectorDim=1024] - Only used when the indexer is created;
 *   ignored when one for `projectId` already exists or is being created.
 * @returns {Promise<Indexer>}
 */
async function getIndexer(projectId, vectorDim = 1024) {
  const ready = indexers.get(projectId);
  if (ready) return ready;

  let pending = pendingIndexerInits.get(projectId);
  if (!pending) {
    pending = (async () => {
      const indexer = new Indexer(projectId, vectorDim);
      await indexer.init();
      indexers.set(projectId, indexer);
      return indexer;
    })().finally(() => {
      pendingIndexerInits.delete(projectId);
    });
    pendingIndexerInits.set(projectId, pending);
  }
  return pending;
}

/**
 * Forgets all cached indexers. Only the cache is emptied; no method is
 * called on the indexers themselves.
 */
function closeAllIndexers() {
  indexers.clear();
}
|
|
792
|
+
|
|
793
|
+
export {
|
|
794
|
+
getVectorStore,
|
|
795
|
+
closeAllVectorStores,
|
|
796
|
+
getIndexer,
|
|
797
|
+
closeAllIndexers
|
|
798
|
+
};
|