@hsingjui/contextweaver 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +380 -0
- package/dist/SearchService-YLOUJF4S.js +1496 -0
- package/dist/chunk-34YZ2U3O.js +1177 -0
- package/dist/chunk-5SRSUMKW.js +612 -0
- package/dist/chunk-5TV4JNTE.js +258 -0
- package/dist/chunk-6C2D5Y4R.js +798 -0
- package/dist/chunk-PN7DP6XL.js +158 -0
- package/dist/codebaseRetrieval-RDCNIUDM.js +10 -0
- package/dist/config-IEL3M4V5.js +18 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +130 -0
- package/dist/scanner-66CLKCSZ.js +9 -0
- package/dist/server-2SAFEAEY.js +131 -0
- package/package.json +59 -0
|
@@ -0,0 +1,1177 @@
|
|
|
1
|
+
import {
|
|
2
|
+
closeAllIndexers,
|
|
3
|
+
closeAllVectorStores,
|
|
4
|
+
getIndexer
|
|
5
|
+
} from "./chunk-6C2D5Y4R.js";
|
|
6
|
+
import {
|
|
7
|
+
batchDelete,
|
|
8
|
+
batchUpdateMtime,
|
|
9
|
+
batchUpsert,
|
|
10
|
+
clear,
|
|
11
|
+
closeDb,
|
|
12
|
+
generateProjectId,
|
|
13
|
+
getAllFileMeta,
|
|
14
|
+
getAllPaths,
|
|
15
|
+
getFilesNeedingVectorIndex,
|
|
16
|
+
getStoredEmbeddingDimensions,
|
|
17
|
+
initDb,
|
|
18
|
+
logger,
|
|
19
|
+
setStoredEmbeddingDimensions
|
|
20
|
+
} from "./chunk-5SRSUMKW.js";
|
|
21
|
+
import {
|
|
22
|
+
getEmbeddingConfig,
|
|
23
|
+
getExcludePatterns
|
|
24
|
+
} from "./chunk-PN7DP6XL.js";
|
|
25
|
+
|
|
26
|
+
// src/scanner/crawler.ts
|
|
27
|
+
import { fdir } from "fdir";
|
|
28
|
+
import path2 from "path";
|
|
29
|
+
|
|
30
|
+
// src/scanner/filter.ts
|
|
31
|
+
import ignore from "ignore";
|
|
32
|
+
import fs from "fs/promises";
|
|
33
|
+
import path from "path";
|
|
34
|
+
|
|
35
|
+
// src/scanner/language.ts
|
|
36
|
+
// Extension -> language lookup, declared per language and flattened in order so
// that every extension becomes one ".ext" key of LANGUAGE_MAP.
var LANGUAGE_MAP = Object.fromEntries(
  [
    ["typescript", [".ts", ".tsx"]],
    ["javascript", [".js", ".jsx", ".mjs", ".cjs"]],
    ["markdown", [".md"]],
    ["python", [".py"]],
    ["go", [".go"]],
    ["rust", [".rs"]],
    ["java", [".java"]],
    ["kotlin", [".kt"]],
    ["swift", [".swift"]],
    ["cpp", [".cpp", ".cc", ".cxx", ".hpp", ".h"]],
    ["c", [".c"]],
    ["shell", [".sh", ".bash", ".zsh", ".fish"]],
    ["powershell", [".ps1"]],
    ["sql", [".sql"]],
    ["yaml", [".yaml", ".yml"]],
    ["json", [".json"]],
    ["toml", [".toml"]],
    ["xml", [".xml"]],
    ["html", [".html"]],
    ["css", [".css"]],
    ["scss", [".scss"]],
    ["sass", [".sass"]],
    ["less", [".less"]],
    ["vue", [".vue"]],
    ["svelte", [".svelte"]],
    ["ruby", [".rb"]],
    ["php", [".php"]],
    ["dart", [".dart"]],
    ["lua", [".lua"]],
    ["r", [".r"]]
  ].flatMap(([language, extensions]) => extensions.map((extension) => [extension, language]))
);
// Set of every extension that has a LANGUAGE_MAP entry.
var ALLOWED_EXTENSIONS = new Set(Object.keys(LANGUAGE_MAP));
|
|
81
|
+
/**
 * Look up the language identifier for a file path via its extension.
 *
 * @param {string} filePath - Path whose extension is inspected.
 * @returns {string} The LANGUAGE_MAP entry for the extension, or "unknown".
 */
function getLanguage(filePath) {
  const language = LANGUAGE_MAP[getFileExtension(filePath)];
  if (language) {
    return language;
  }
  return "unknown";
}
|
|
85
|
+
/**
 * Tell whether the path's extension is one of ALLOWED_EXTENSIONS.
 *
 * @param {string} filePath - Path whose extension is inspected.
 * @returns {boolean} true when the extension is in the allow-list.
 */
function isAllowedExtension(filePath) {
  return ALLOWED_EXTENSIONS.has(getFileExtension(filePath));
}
|
|
89
|
+
/**
 * Extract the lower-cased extension (including the leading dot) of a path.
 *
 * Only the final path segment is inspected, so dots in directory names are
 * ignored, and a name without any dot has no extension. Previously a dot-less
 * file such as "bin/go" was reported as ".go" and could be mistaken for source.
 *
 * @param {string} filePath - Path using "/" or "\\" as separators.
 * @returns {string} ".ext" in lower case, or "" when the file name has no
 *   extension (no dot, or a trailing dot).
 */
function getFileExtension(filePath) {
  // Accept both separator styles: paths may come from POSIX or Windows.
  const lastSeparator = Math.max(filePath.lastIndexOf("/"), filePath.lastIndexOf("\\"));
  const baseName = filePath.slice(lastSeparator + 1);
  const dotIndex = baseName.lastIndexOf(".");
  if (dotIndex === -1) {
    return "";
  }
  const ext = baseName.slice(dotIndex + 1);
  return ext ? `.${ext.toLowerCase()}` : "";
}
|
|
93
|
+
|
|
94
|
+
// src/scanner/filter.ts
|
|
95
|
+
// Module-level filter state: the active matcher and the config hash it was built from.
// Both start as null.
var ignoreInstance = null;
var lastConfigHash = null;
// Ignore files, relative to the project root, whose contents feed the config hash.
var CONFIG_FILES = [".gitignore", ".contextweaverignore"];
|
|
101
|
+
/**
 * Fingerprint the ignore configuration of a project.
 *
 * Hashes the content of every readable file in CONFIG_FILES (unreadable or
 * missing files are skipped), appends a hash of the IGNORE_PATTERNS
 * environment variable (empty string when unset), and hashes the "|"-joined
 * list once more.
 *
 * @param {string} rootPath - Directory that contains the ignore files.
 * @returns {Promise<string>} Hex-encoded SHA-256 digest.
 */
async function generateConfigHash(rootPath) {
  const { createHash } = await import("crypto");
  const digest = (text) => createHash("sha256").update(text).digest("hex");
  const parts = [];
  for (const name of CONFIG_FILES) {
    let text;
    try {
      text = await fs.readFile(path.join(rootPath, name), "utf-8");
    } catch {
      // File is absent or unreadable: it simply contributes nothing.
      continue;
    }
    parts.push(digest(text));
  }
  parts.push(digest(process.env.IGNORE_PATTERNS || ""));
  return digest(parts.join("|"));
}
|
|
118
|
+
/**
 * Build (or reuse) the module-level ignore matcher for a project root.
 *
 * The matcher is rebuilt only when the configuration hash differs from the
 * one recorded by the previous build, or when no matcher exists yet. Rules are
 * added in this order: patterns from getExcludePatterns(), then .gitignore,
 * then .contextweaverignore. Ignore files that cannot be read are skipped.
 *
 * @param {string} rootPath - Directory that contains the ignore files.
 * @returns {Promise<void>}
 */
async function initFilter(rootPath) {
  const currentHash = await generateConfigHash(rootPath);
  if (ignoreInstance && currentHash === lastConfigHash) {
    return;
  }
  const matcher = ignore();
  matcher.add(getExcludePatterns());
  for (const name of [".gitignore", ".contextweaverignore"]) {
    const ignoreFile = path.join(rootPath, name);
    try {
      await fs.access(ignoreFile);
      matcher.add(await fs.readFile(ignoreFile, "utf-8"));
    } catch {
      // Missing or unreadable ignore file: continue without its rules.
    }
  }
  ignoreInstance = matcher;
  lastConfigHash = currentHash;
}
|
|
141
|
+
/**
 * Ask the active ignore matcher whether a path is excluded.
 *
 * @param {string} relativePath - Path handed to the matcher's ignores().
 * @returns {boolean} Result of ignoreInstance.ignores(relativePath).
 * @throws {Error} When no matcher has been built yet (initFilter not called).
 */
function isFiltered(relativePath) {
  if (ignoreInstance) {
    return ignoreInstance.ignores(relativePath);
  }
  throw new Error("Filter not initialized. Call initFilter() first.");
}
|
|
147
|
+
/**
 * Decide whether a file may be scanned; currently this is purely an
 * extension allow-list check.
 *
 * @param {string} filePath - Path of the candidate file.
 * @returns {boolean} Result of isAllowedExtension(filePath).
 */
function isAllowedFile(filePath) {
  const hasAllowedExtension = isAllowedExtension(filePath);
  return hasAllowedExtension;
}
|
|
150
|
+
|
|
151
|
+
// src/scanner/crawler.ts
|
|
152
|
+
/**
 * Walk a directory tree and collect the files that should be scanned.
 *
 * A file is kept when it is not excluded by the ignore matcher and has an
 * allowed extension. The matcher receives the path relative to `rootPath`;
 * it is derived with path.relative() rather than by string/regex stripping,
 * because interpolating the root into a RegExp throws or mis-matches when the
 * root contains regex metacharacters (e.g. "c++", "proj(1)", backslashes).
 *
 * @param {string} rootPath - Directory to crawl.
 * @returns {Promise<string[]>} Full paths of the accepted files.
 * @throws {Error} Propagates crawl errors (withErrors) and the error raised
 *   by isFiltered() when the filter has not been initialised.
 */
async function crawl(rootPath) {
  const crawler = new fdir().withFullPaths().withErrors().filter((filePath) => {
    const relativePath = path2.relative(rootPath, filePath);
    return !isFiltered(relativePath) && isAllowedFile(filePath);
  });
  const paths = await crawler.crawl(rootPath).withPromise();
  return paths;
}
|
|
160
|
+
|
|
161
|
+
// src/scanner/processor.ts
|
|
162
|
+
import fs3 from "fs/promises";
|
|
163
|
+
import pLimit from "p-limit";
|
|
164
|
+
import path3 from "path";
|
|
165
|
+
import os from "os";
|
|
166
|
+
|
|
167
|
+
// src/scanner/hash.ts
|
|
168
|
+
import crypto from "crypto";
|
|
169
|
+
/**
 * Compute the SHA-256 digest of the given data.
 *
 * @param {string|Buffer} data - Input passed to Hash#update.
 * @returns {string} Hex-encoded digest.
 */
function sha256(data) {
  const hasher = crypto.createHash("sha256");
  hasher.update(data);
  return hasher.digest("hex");
}
|
|
172
|
+
|
|
173
|
+
// src/utils/encoding.ts
|
|
174
|
+
import fs2 from "fs/promises";
|
|
175
|
+
import chardet from "chardet";
|
|
176
|
+
import iconv from "iconv-lite";
|
|
177
|
+
/**
 * Translate a detected encoding label into the lower-case name used for decoding.
 *
 * Known labels are mapped through an alias table; any other label is
 * lower-cased and stripped of every character outside [a-z0-9].
 *
 * @param {string} encoding - Encoding label, e.g. "UTF-16 LE" or "Shift_JIS".
 * @returns {string} Normalized encoding name, e.g. "utf16le" or "shiftjis".
 */
function normalizeEncoding(encoding) {
  const aliases = Object.fromEntries([
    ["UTF-8", "utf8"],
    ["UTF-16 LE", "utf16le"],
    ["UTF-16 BE", "utf16be"],
    ["UTF-32 LE", "utf32le"],
    ["UTF-32 BE", "utf32be"],
    ["GB18030", "gb18030"],
    ["GBK", "gbk"],
    ["GB2312", "gb2312"],
    ["Big5", "big5"],
    ["Shift_JIS", "shiftjis"],
    ["EUC-JP", "eucjp"],
    ["EUC-KR", "euckr"],
    ["ISO-8859-1", "iso88591"],
    ["windows-1252", "win1252"],
    // ASCII is a subset of UTF-8
    ["ASCII", "utf8"]
  ]);
  const alias = aliases[encoding];
  if (alias) {
    return alias;
  }
  return encoding.toLowerCase().replace(/[^a-z0-9]/g, "");
}
|
|
198
|
+
/**
 * Identify a Unicode byte-order mark at the start of a buffer.
 *
 * Signatures are tried in a fixed order: UTF-8, then the 4-byte UTF-32 marks,
 * then the 2-byte UTF-16 marks. UTF-32 LE must precede UTF-16 LE because its
 * mark begins with the same two bytes.
 *
 * @param {Buffer|Uint8Array} buffer - Raw file bytes.
 * @returns {string|null} "UTF-8", "UTF-32 LE", "UTF-32 BE", "UTF-16 LE",
 *   "UTF-16 BE", or null when no mark is present.
 */
function detectBOM(buffer) {
  const signatures = [
    ["UTF-8", [0xef, 0xbb, 0xbf]],
    ["UTF-32 LE", [0xff, 0xfe, 0x00, 0x00]],
    ["UTF-32 BE", [0x00, 0x00, 0xfe, 0xff]],
    ["UTF-16 LE", [0xff, 0xfe]],
    ["UTF-16 BE", [0xfe, 0xff]]
  ];
  for (const [label, mark] of signatures) {
    const longEnough = buffer.length >= mark.length;
    if (longEnough && mark.every((value, index) => buffer[index] === value)) {
      return label;
    }
  }
  return null;
}
|
|
222
|
+
/**
 * Read a file and decode it to a string, detecting its encoding.
 *
 * Detection order: byte-order mark, then chardet, then a "UTF-8" default.
 * The bytes are decoded with iconv when it knows the normalized encoding;
 * otherwise, or when decoding throws, they are decoded as UTF-8.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<{content: string, encoding: string, originalEncoding: string}>}
 *   `encoding` is always "utf-8"; `originalEncoding` is the detected label.
 */
async function readFileWithEncoding(filePath) {
  const raw = await fs2.readFile(filePath);
  const encoding = detectBOM(raw) || chardet.detect(raw) || "UTF-8";
  const iconvName = normalizeEncoding(encoding);
  const decodeAsUtf8 = () => raw.toString("utf-8");
  let content;
  try {
    content = iconv.encodingExists(iconvName) ? iconv.decode(raw, iconvName) : decodeAsUtf8();
  } catch {
    content = decodeAsUtf8();
  }
  return {
    content,
    // The output is always reported as UTF-8
    encoding: "utf-8",
    originalEncoding: encoding
  };
}
|
|
248
|
+
|
|
249
|
+
// src/chunking/SemanticSplitter.ts
|
|
250
|
+
import "tree-sitter";
|
|
251
|
+
|
|
252
|
+
// src/chunking/SourceAdapter.ts
|
|
253
|
+
/**
 * Wraps a source string and reconciles the index domain of externally
 * supplied offsets (UTF-16 code units vs. UTF-8 bytes) with JS string indices.
 *
 * The domain is inferred by comparing `config.endIndex` with the string's
 * UTF-16 length and its UTF-8 byte length; when neither matches the domain is
 * "unknown" and offsets are used as-is.
 */
var SourceAdapter = class {
  code;
  domain;
  buffer;
  // Lookup table: UTF-8 byte offset -> UTF-16 index (only built in the utf8 domain)
  byteToCharMap;
  // Prefix sums of non-whitespace (NWS) counts over UTF-16 code units
  nwsPrefixSum;
  /**
   * @param {{code: string, endIndex: number}} config - Source text and the
   *   end offset reported for the whole text, used to detect the index domain.
   */
  constructor(config) {
    const { code, endIndex } = config;
    this.code = code;
    this.buffer = null;
    this.byteToCharMap = null;
    const utf16Length = code.length;
    const utf8Length = Buffer.byteLength(code, "utf8");
    if (endIndex === utf16Length) {
      this.domain = "utf16";
    } else if (endIndex === utf8Length) {
      this.domain = "utf8";
      this.buffer = Buffer.from(code, "utf8");
      this.byteToCharMap = this.buildByteToCharMap();
    } else {
      this.domain = "unknown";
      console.warn(
        `[SourceAdapter] Index domain unclear: endIndex=${endIndex}, utf16Len=${utf16Length}, utf8Len=${utf8Length}`
      );
    }
    this.nwsPrefixSum = this.buildNwsPrefixSum();
  }
  /**
   * @returns {string} The detected index domain: "utf16", "utf8" or "unknown".
   */
  getDomain() {
    return this.domain;
  }
  /**
   * Slice the source using offsets expressed in the detected domain.
   * In the utf8 domain both offsets are first mapped to string indices, which
   * aligns them to the start of the character they fall into.
   *
   * @param {number} start - Start offset in the detected domain.
   * @param {number} end - End offset in the detected domain.
   * @returns {string} The selected substring.
   */
  slice(start, end) {
    const indexesAreChars = this.domain === "utf16" || this.domain === "unknown";
    if (indexesAreChars || !this.byteToCharMap) {
      return this.code.slice(start, end);
    }
    return this.code.slice(this.byteToChar(start), this.byteToChar(end));
  }
  /**
   * Count the non-whitespace UTF-16 code units in a range.
   * Counting always happens on string indices; utf8-domain offsets are
   * converted first, and indices are clamped to the valid range.
   *
   * @param {number} start - Start offset in the detected domain.
   * @param {number} end - End offset in the detected domain.
   * @returns {number} Non-whitespace count between the two offsets.
   */
  nws(start, end) {
    const viaBytes = this.domain === "utf8" && this.byteToCharMap;
    const from = viaBytes ? this.byteToChar(start) : start;
    const to = viaBytes ? this.byteToChar(end) : end;
    const sums = this.nwsPrefixSum;
    const last = sums.length - 1;
    const clamp = (index) => Math.max(0, Math.min(last, index));
    return sums[clamp(to)] - sums[clamp(from)];
  }
  /**
   * @returns {number} Non-whitespace count of the whole source.
   */
  getTotalNws() {
    const sums = this.nwsPrefixSum;
    return sums[sums.length - 1];
  }
  /**
   * Map a UTF-8 byte offset to a string index; the offset is clamped to the
   * table bounds. Returns the input unchanged when no table exists.
   *
   * @param {number} byteOffset - Byte offset into the UTF-8 encoding.
   * @returns {number} Corresponding string index.
   */
  byteToChar(byteOffset) {
    const table = this.byteToCharMap;
    if (!table) return byteOffset;
    const lastIndex = table.length - 1;
    const clamped = Math.max(0, Math.min(lastIndex, byteOffset));
    return table[clamped];
  }
  /**
   * Build the byte-offset -> string-index table for this.buffer.
   * Every byte of a multi-byte sequence maps to the index where its character
   * starts; a 4-byte sequence advances the string index by 2 (surrogate pair).
   * The extra final slot maps the end of the buffer.
   *
   * @returns {Uint32Array} Table of length buffer.length + 1.
   */
  buildByteToCharMap() {
    const bytes = this.buffer;
    const total = bytes.length;
    const table = new Uint32Array(total + 1);
    let unitIndex = 0;
    let position = 0;
    while (position < total) {
      const lead = bytes[position];
      // Sequence width from the lead byte; anything unrecognised counts as 1.
      let width = 1;
      if ((lead & 0xe0) === 0xc0) {
        width = 2;
      } else if ((lead & 0xf0) === 0xe0) {
        width = 3;
      } else if ((lead & 0xf8) === 0xf0) {
        width = 4;
      }
      table.fill(unitIndex, position, Math.min(position + width, total));
      position += width;
      unitIndex += width === 4 ? 2 : 1;
    }
    table[total] = unitIndex;
    return table;
  }
  /**
   * Build prefix sums of non-whitespace code units; space, tab, LF and CR are
   * the only characters treated as whitespace.
   *
   * @returns {Uint32Array} sums[i] = non-whitespace count in code[0..i).
   */
  buildNwsPrefixSum() {
    const text = this.code;
    const sums = new Uint32Array(text.length + 1);
    for (let i = 0; i < text.length; i++) {
      const unit = text.charCodeAt(i);
      const isWhitespace = unit === 32 || unit === 9 || unit === 10 || unit === 13;
      sums[i + 1] = sums[i] + (isWhitespace ? 0 : 1);
    }
    return sums;
  }
};
|
|
403
|
+
|
|
404
|
+
// src/chunking/LanguageSpec.ts
|
|
405
|
+
// Per-language chunking specs. Each row is [language, nameFields, node types];
// the node types become the `hierarchy` set of AST node kinds that contribute
// a segment to a chunk's context path.
var LANGUAGE_SPECS = Object.fromEntries(
  [
    [
      "typescript",
      ["name", "id"],
      [
        // classes and interfaces
        "class_declaration",
        "abstract_class_declaration",
        "interface_declaration",
        // functions
        "function_declaration",
        "generator_function_declaration",
        "method_definition",
        "arrow_function",
        // modules
        "export_statement",
        "import_statement"
      ]
    ],
    [
      "javascript",
      ["name", "id"],
      [
        "class_declaration",
        "function_declaration",
        "generator_function_declaration",
        "method_definition",
        "arrow_function"
      ]
    ],
    [
      "python",
      ["name"],
      ["class_definition", "function_definition", "decorated_definition"]
    ],
    [
      "go",
      ["name"],
      [
        // functions and methods
        "function_declaration",
        "method_declaration",
        // type definitions
        "type_spec",
        "type_declaration",
        // structs and interfaces
        "struct_type",
        "interface_type"
      ]
    ],
    [
      "rust",
      ["name"],
      [
        // functions
        "function_item",
        // structs, enums, traits
        "struct_item",
        "enum_item",
        "trait_item",
        // impl blocks
        "impl_item",
        // modules
        "mod_item",
        // type aliases
        "type_item"
      ]
    ],
    [
      "java",
      ["name", "identifier"],
      [
        // classes and interfaces
        "class_declaration",
        "interface_declaration",
        "enum_declaration",
        "annotation_type_declaration",
        // methods and constructors
        "method_declaration",
        "constructor_declaration",
        // record types (Java 14+)
        "record_declaration"
      ]
    ]
  ].map(([language, nameFields, nodeTypes]) => [
    language,
    { hierarchy: new Set(nodeTypes), nameFields }
  ])
);
|
|
488
|
+
/**
 * Fetch the chunking spec registered for a language.
 *
 * @param {string} language - Language identifier used as a LANGUAGE_SPECS key.
 * @returns {object|null} The spec, or null when the lookup yields null/undefined.
 */
function getLanguageSpec(language) {
  const spec = LANGUAGE_SPECS[language];
  if (spec === undefined || spec === null) {
    return null;
  }
  return spec;
}
|
|
491
|
+
|
|
492
|
+
// src/chunking/SemanticSplitter.ts
|
|
493
|
+
/**
 * Splits source code into chunks, either by walking a syntax tree
 * (`split`) or line by line (`splitPlainText` / `fallbackSplit`).
 * Chunk sizes are measured in non-whitespace (NWS) characters.
 */
var SemanticSplitter = class {
  config;
  adapter;
  code;
  language;
  /**
   * @param {object} [config]
   * @param {number} [config.maxChunkSize=1000] - NWS budget per chunk.
   * @param {number} [config.minChunkSize=50] - Windows below this NWS size count as tiny.
   * @param {number} [config.chunkOverlap=100] - NWS characters of leading overlap in vector text.
   * @param {number} [config.maxRawChars] - Hard cap on raw length; defaults to
   *   maxChunkSize * 4 (assumes 1 token is roughly 4 characters).
   */
  constructor(config = {}) {
    const maxChunkSize = config.maxChunkSize ?? 1e3;
    const minChunkSize = config.minChunkSize ?? 50;
    const chunkOverlap = config.chunkOverlap ?? 100;
    const maxRawChars = config.maxRawChars ?? maxChunkSize * 4;
    this.config = { maxChunkSize, minChunkSize, chunkOverlap, maxRawChars };
  }
  /**
   * Split code into chunks guided by a parsed syntax tree.
   * Falls back to line-based splitting when the tree's index domain cannot be
   * matched to the source string.
   *
   * @param tree Parsed syntax tree (its rootNode is traversed)
   * @param code Source code string
   * @param filePath File path, used as the first context-path segment
   * @param language Language identifier
   * @returns Array of processed chunks
   */
  split(tree, code, filePath, language) {
    this.adapter = new SourceAdapter({
      code,
      endIndex: tree.rootNode.endIndex
    });
    switch (this.adapter.getDomain()) {
      case "unknown":
        console.warn(
          `[SemanticSplitter] Unknown index domain for ${filePath}, falling back to simple split`
        );
        return this.fallbackSplit(code, filePath, language);
      case "utf8":
        console.info(`[SemanticSplitter] Using UTF-8 byte indexing for ${filePath}`);
        break;
      default:
        break;
    }
    this.code = code;
    this.language = language;
    const windows = this.visitNode(tree.rootNode, [filePath]);
    return this.windowsToChunks(windows, filePath, language);
  }
  /**
   * Public plain-text splitting entry point.
   *
   * Intended for languages without AST support, or as a fallback when AST
   * parsing fails. Works on native string (UTF-16) indices and splits by line.
   *
   * @param code Source code string
   * @param filePath File path
   * @param language Language identifier
   * @returns Array of processed chunks
   */
  splitPlainText(code, filePath, language) {
    return this.fallbackSplit(code, filePath, language);
  }
  /**
   * Line-based fallback splitter, also used when the index domain is unclear.
   *
   * Lines are accumulated until adding the next one would exceed the NWS
   * budget. Works on native string (UTF-16) indices; no overlap is applied.
   */
  fallbackSplit(code, filePath, language) {
    const adapter = new SourceAdapter({
      code,
      endIndex: code.length
    });
    const budget = this.config.maxChunkSize;
    // Single place that shapes a fallback chunk; vectorSpan equals [startIndex, endIndex).
    const buildChunk = (displayCode, nwsSize, startIndex, endIndex, rawSpan) => ({
      displayCode,
      vectorText: `// Context: ${filePath}\n${displayCode}`,
      nwsSize,
      metadata: {
        startIndex,
        endIndex,
        rawSpan,
        vectorSpan: { start: startIndex, end: endIndex },
        filePath,
        language,
        contextPath: [filePath]
      }
    });
    const totalSize = adapter.getTotalNws();
    if (totalSize <= budget) {
      return [buildChunk(code, totalSize, 0, code.length, { start: 0, end: code.length })];
    }
    const chunks = [];
    let pendingLines = [];
    let pendingNws = 0;
    let lineStart = 0;
    let chunkStart = 0;
    let rawStart = 0;
    for (const line of code.split("\n")) {
      const lineEnd = lineStart + line.length;
      const lineNws = adapter.nws(lineStart, lineEnd);
      const overflows = pendingNws + lineNws > budget;
      if (overflows && pendingLines.length > 0) {
        const text = pendingLines.join("\n");
        const chunkEnd = chunkStart + text.length;
        // rawSpan extends one past the chunk to cover the newline separating chunks
        chunks.push(
          buildChunk(text, pendingNws, chunkStart, chunkEnd, { start: rawStart, end: chunkEnd + 1 })
        );
        rawStart = chunkEnd + 1;
        chunkStart += text.length + 1;
        pendingLines = [line];
        pendingNws = lineNws;
      } else {
        pendingLines.push(line);
        pendingNws += lineNws;
      }
      lineStart = lineEnd + 1;
    }
    if (pendingLines.length > 0) {
      const text = pendingLines.join("\n");
      const chunkEnd = chunkStart + text.length;
      chunks.push(
        buildChunk(text, pendingNws, chunkStart, chunkEnd, { start: rawStart, end: code.length })
      );
    }
    return chunks;
  }
  /**
   * Recursively turn an AST node into windows (groups of nodes with a size and context path).
   * A node that fits the NWS budget, or has no children, becomes one window;
   * otherwise its children are visited and their windows merged.
   */
  visitNode(node, context) {
    const nodeSize = this.adapter.nws(node.startIndex, node.endIndex);
    let nextContext = context;
    const spec = getLanguageSpec(this.language);
    if (spec && spec.hierarchy.has(node.type)) {
      // Prefer the first identifier-like named child as the node's name.
      let name = null;
      for (const child of node.namedChildren) {
        const looksLikeName = child.type === "identifier" || child.type === "type_identifier" || child.type === "name";
        if (looksLikeName) {
          name = child.text;
          break;
        }
      }
      // Otherwise accept the first named child when it is short and single-line.
      if (!name && node.firstNamedChild) {
        const candidate = node.firstNamedChild;
        if (candidate.text.length <= 100 && !candidate.text.includes("\n")) {
          name = candidate.text;
        }
      }
      if (name) {
        nextContext = [...context, `${this.getTypePrefix(node.type)}${name}`];
      }
    }
    const asSingleWindow = () => [{ nodes: [node], size: nodeSize, contextPath: nextContext }];
    if (nodeSize <= this.config.maxChunkSize) {
      return asSingleWindow();
    }
    const children = node.children;
    if (children.length === 0) {
      return asSingleWindow();
    }
    const childWindows = [];
    for (const child of children) {
      for (const childWindow of this.visitNode(child, nextContext)) {
        childWindows.push(childWindow);
      }
    }
    return this.mergeAdjacentWindows(childWindows);
  }
  /**
   * Short label prefix derived from a node type; the first keyword contained
   * in the type name wins, and "" is returned when none matches.
   */
  getTypePrefix(nodeType) {
    for (const keyword of ["class", "interface", "method", "function"]) {
      if (nodeType.includes(keyword)) return `${keyword} `;
    }
    return "";
  }
  /**
   * Gap-aware merging of adjacent windows.
   *
   * Two budgets must both hold for a merge:
   * - NWS budget: limits the amount of effective code (tiny windows may
   *   exceed it, up to 1.5x)
   * - Raw budget: limits the raw span length, so long comments cannot blow up
   *   the token count
   */
  mergeAdjacentWindows(windows) {
    if (windows.length === 0) return [];
    const { maxChunkSize, minChunkSize, maxRawChars } = this.config;
    const merged = [];
    let current = windows[0];
    for (const next of windows.slice(1)) {
      const currentStart = current.nodes[0].startIndex;
      const currentEnd = current.nodes[current.nodes.length - 1].endIndex;
      const nextStart = next.nodes[0].startIndex;
      const nextEnd = next.nodes[next.nodes.length - 1].endIndex;
      const combinedNws = current.size + this.adapter.nws(currentEnd, nextStart) + next.size;
      const isTiny = current.size < minChunkSize;
      const fitsNwsBudget = combinedNws <= maxChunkSize || (isTiny && combinedNws < maxChunkSize * 1.5);
      const fitsRawBudget = nextEnd - currentStart <= maxRawChars;
      if (!(fitsNwsBudget && fitsRawBudget)) {
        merged.push(current);
        current = next;
        continue;
      }
      current.nodes.push(...next.nodes);
      current.size = combinedNws;
      current.contextPath = this.commonPrefix(current.contextPath, next.contextPath);
    }
    merged.push(current);
    return merged;
  }
  /**
   * Convert windows into the final chunk objects.
   *
   * Gap ownership: the gap before a chunk belongs to that chunk (its
   * rawSpan.start is the previous chunk's endIndex); the last chunk's rawSpan
   * runs to the end of the source.
   * Overlap: vectorSpan is extended backwards by chunkOverlap NWS characters,
   * unless that would add more than a quarter of maxRawChars.
   */
  windowsToChunks(windows, filePath, language) {
    if (windows.length === 0) return [];
    const { chunkOverlap, maxRawChars } = this.config;
    const lastIndex = windows.length - 1;
    const chunks = [];
    let prevEnd = 0;
    for (const [i, w] of windows.entries()) {
      const start = w.nodes[0].startIndex;
      const end = w.nodes[w.nodes.length - 1].endIndex;
      let rawSpanEnd = end;
      if (i === lastIndex) {
        // End of source, expressed in the adapter's index domain.
        rawSpanEnd = this.adapter.getDomain() === "utf8" ? Buffer.byteLength(this.code, "utf8") : this.code.length;
      }
      let vectorStart = start;
      if (i > 0 && chunkOverlap > 0) {
        const candidateStart = this.findOverlapStart(start, chunkOverlap);
        if (start - candidateStart <= maxRawChars * 0.25) {
          vectorStart = candidateStart;
        }
      }
      const displayCode = this.adapter.slice(start, end);
      const vectorCode = this.adapter.slice(vectorStart, end);
      chunks.push({
        displayCode,
        vectorText: generateVectorText(vectorCode, w.contextPath),
        nwsSize: w.size,
        metadata: {
          startIndex: start,
          endIndex: end,
          rawSpan: { start: prevEnd, end: rawSpanEnd },
          vectorSpan: { start: vectorStart, end },
          filePath,
          language,
          contextPath: w.contextPath
        }
      });
      prevEnd = end;
    }
    return chunks;
  }
  /**
   * Locate where the overlap preceding a chunk should begin.
   *
   * Binary-searches [0, start] for the largest position whose range up to
   * `start` still holds at least targetNws non-whitespace characters. When no
   * position qualifies, `start` itself is returned (no overlap).
   *
   * @param start Start position of the current chunk
   * @param targetNws Desired overlap size in NWS characters
   * @returns Overlap start position
   */
  findOverlapStart(start, targetNws) {
    if (start <= 0 || targetNws <= 0) return start;
    let lower = 0;
    let upper = start;
    let best = start;
    while (lower <= upper) {
      const middle = Math.floor((lower + upper) / 2);
      if (this.adapter.nws(middle, start) >= targetNws) {
        best = middle;
        lower = middle + 1;
      } else {
        upper = middle - 1;
      }
    }
    return Math.max(0, best);
  }
  /**
   * Longest common prefix of two path arrays.
   *
   * Used when merging windows so the merged context path does not mislabel
   * the content, e.g. ["file", "class A", "method foo"] and
   * ["file", "class A", "method bar"] => ["file", "class A"].
   */
  commonPrefix(a, b) {
    const limit = Math.min(a.length, b.length);
    let shared = 0;
    while (shared < limit && a[shared] === b[shared]) {
      shared++;
    }
    return a.slice(0, shared);
  }
};
|
|
824
|
+
/**
 * Prefix code with a one-line breadcrumb comment built from its context path.
 *
 * @param {string} code - Code text of the chunk.
 * @param {string[]} contextPath - Path segments, joined with " > ".
 * @returns {string} "// Context: <breadcrumb>" followed by a newline and the code.
 */
function generateVectorText(code, contextPath) {
  const header = `// Context: ${contextPath.join(" > ")}`;
  return `${header}\n${code}`;
}
|
|
829
|
+
|
|
830
|
+
// src/chunking/ParserPool.ts
|
|
831
|
+
import Parser2 from "tree-sitter";
|
|
832
|
+
// Language -> grammar package name; every package is named "tree-sitter-<language>".
var GRAMMAR_MODULES = Object.fromEntries(
  ["typescript", "javascript", "python", "go", "rust", "java"].map((language) => [
    language,
    `tree-sitter-${language}`
  ])
);
// Grammars that were loaded successfully, keyed by language.
var loadedGrammars = new Map();
// Parser instances already configured with a grammar, keyed by language.
var parserCache = new Map();
|
|
842
|
+
/**
 * Load (and cache) the grammar object for a language.
 *
 * The grammar package is imported dynamically. For "typescript" the
 * `typescript` export is used; for other languages the export is accepted if
 * it is an object carrying `nodeTypeInfo`, else its `language` property, else
 * the property named after the language. Failures are logged and yield null;
 * only successful loads are cached.
 *
 * @param {string} language - Language identifier.
 * @returns {Promise<object|null>} The grammar, or null when unavailable.
 */
async function loadGrammar(language) {
  const known = loadedGrammars.get(language);
  if (known) return known;
  const moduleName = GRAMMAR_MODULES[language];
  if (!moduleName) return null;
  try {
    const grammarModule = await import(moduleName);
    const extractGrammar = () => {
      if (language === "typescript") {
        return grammarModule.default?.typescript ?? grammarModule.typescript;
      }
      const exported = grammarModule.default ?? grammarModule;
      if (exported && typeof exported === "object" && "nodeTypeInfo" in exported) {
        return exported;
      }
      if (exported?.language) {
        return exported.language;
      }
      if (exported?.[language]) {
        return exported[language];
      }
      return null;
    };
    const grammar = extractGrammar();
    if (grammar) {
      loadedGrammars.set(language, grammar);
      return grammar;
    }
    console.error(`[ParserPool] Could not extract grammar for ${language} from module ${moduleName}`);
    return null;
  } catch (err) {
    console.error(`[ParserPool] Failed to load grammar for ${language}:`, err);
    return null;
  }
}
|
|
873
|
+
/**
 * Returns a parser configured for the given language, creating it and storing
 * it in `parserCache` the first time it is requested.
 *
 * @param {string} language - Language id.
 * @returns {Promise<object|null>} The parser, or `null` when `loadGrammar`
 *   yields no grammar for the language.
 */
async function getParser(language) {
  let parser = parserCache.get(language);
  if (parser) return parser;

  const grammar = await loadGrammar(language);
  if (!grammar) return null;

  parser = new Parser2();
  parser.setLanguage(grammar);
  parserCache.set(language, parser);
  return parser;
}
|
|
883
|
+
/**
 * Reports whether a grammar package is registered for the given language id.
 *
 * Uses an own-property check rather than the `in` operator: `in` also matches
 * names inherited from `Object.prototype`, so ids such as "constructor" or
 * "toString" would otherwise be reported as supported.
 *
 * @param {string} language - Language id.
 * @returns {boolean} `true` only when `GRAMMAR_MODULES` has its own entry for it.
 */
function isLanguageSupported(language) {
  return Object.prototype.hasOwnProperty.call(GRAMMAR_MODULES, language);
}
|
|
886
|
+
|
|
887
|
+
// src/scanner/processor.ts
|
|
888
|
+
// Size threshold in bytes (500 * 1024); processFile reports larger files as skipped.
const MAX_FILE_SIZE = 500 * 1024;

// Languages that are chunked as plain text when AST-based splitting produced no chunks.
const FALLBACK_LANGS = /* @__PURE__ */ new Set(["python", "go", "rust", "java", "markdown", "json"]);
|
|
897
|
+
/**
 * Decides whether a JSON file should be left out of indexing: lock files and
 * anything whose path passes through a `node_modules` directory.
 *
 * @param {string} relPath - Path of the JSON file.
 * @returns {boolean} `true` when the file should be skipped.
 */
function shouldSkipJson(relPath) {
  const lockSuffixes = ["-lock.json", "package-lock.json"];
  // Both separator styles are checked so Windows-style paths match too.
  const vendorMarkers = ["node_modules/", "node_modules\\"];
  const isLockFile = lockSuffixes.some((suffix) => relPath.endsWith(suffix));
  return isLockFile || vendorMarkers.some((marker) => relPath.includes(marker));
}
|
|
906
|
+
/**
 * Chooses how many files to process at once: one less than the number of
 * reported CPUs, clamped to the range 4..32.
 *
 * @returns {number} Concurrency limit between 4 and 32 inclusive.
 */
function getAdaptiveConcurrency() {
  const spareCores = os.cpus().length - 1;
  if (spareCores < 4) return 4;
  return spareCores > 32 ? 32 : spareCores;
}
|
|
911
|
+
// Single splitter instance shared by every processFile call.
const splitter = new SemanticSplitter({
  maxChunkSize: 500,
  minChunkSize: 50,
  // Conservative overlap for hybrid retrieval (BM25 + vector + rerank).
  chunkOverlap: 40
});
|
|
917
|
+
/**
 * Inspects one file and classifies it for the scanner.
 *
 * The checks run from cheapest to most expensive, returning as soon as one
 * applies:
 *   1. larger than MAX_FILE_SIZE            -> "skipped"
 *   2. same mtime and size as `known`       -> "unchanged" (file is not read)
 *   3. content contains a NUL character     -> "skipped" (treated as binary)
 *   4. same content hash as `known`         -> "unchanged"
 *   5. JSON rejected by shouldSkipJson      -> "skipped"
 *   6. otherwise chunked                    -> "modified" if `known`, else "added"
 * Any exception thrown along the way yields an "error" result instead of
 * propagating.
 *
 * @param {string} absPath - Path used to stat and read the file.
 * @param {string} relPath - Path used for language detection and reported in the result.
 * @param {{mtime: number, size: number, hash: string}|undefined} known -
 *   Previously recorded metadata for this file, if any.
 * @returns {Promise<object>} Record with absPath, relPath, hash, content,
 *   chunks, language, mtime, size, status, plus `error` for skipped/error results.
 */
async function processFile(absPath, relPath, known) {
  const language = getLanguage(relPath);

  // All results share these fields in this order; callers that need an
  // `error` message spread the record and append it.
  const buildResult = (hash, content, chunks, mtime, size, status) => ({
    absPath,
    relPath,
    hash,
    content,
    chunks,
    language,
    mtime,
    size,
    status
  });

  try {
    const fileStat = await fs3.stat(absPath);
    const mtime = fileStat.mtimeMs;
    const size = fileStat.size;

    if (size > MAX_FILE_SIZE) {
      return {
        ...buildResult("", null, [], mtime, size, "skipped"),
        error: `File too large (${size} bytes > ${MAX_FILE_SIZE} bytes)`
      };
    }

    // Fast path: identical mtime and size means the file is not even read.
    const statMatches = known && known.mtime === mtime && known.size === size;
    if (statMatches) {
      return buildResult(known.hash, null, [], mtime, size, "unchanged");
    }

    const { content, originalEncoding } = await readFileWithEncoding(absPath);
    if (content.includes("\0")) {
      return {
        ...buildResult("", null, [], mtime, size, "skipped"),
        error: `Binary file detected (original encoding: ${originalEncoding})`
      };
    }

    // Slow path: metadata differs but the content may still be the same.
    const hash = sha256(content);
    if (known && known.hash === hash) {
      return buildResult(hash, content, [], mtime, size, "unchanged");
    }

    if (language === "json" && shouldSkipJson(relPath)) {
      return {
        ...buildResult(hash, null, [], mtime, size, "skipped"),
        error: "Lock file or node_modules JSON"
      };
    }

    let chunks = [];
    if (isLanguageSupported(language)) {
      try {
        const parser = await getParser(language);
        if (parser) {
          chunks = splitter.split(parser.parse(content), content, relPath, language);
        }
      } catch (err) {
        // AST chunking is best-effort; the plain-text fallback below may still apply.
        console.warn(`[Chunking] AST failed for ${relPath}: ${err.message}`);
      }
    }

    const needsFallback = chunks.length === 0 && FALLBACK_LANGS.has(language);
    if (needsFallback) {
      chunks = splitter.splitPlainText(content, relPath, language);
    }

    return buildResult(hash, content, chunks, mtime, size, known ? "modified" : "added");
  } catch (err) {
    // mtime and size are reported as 0 because the stat itself may have failed.
    return {
      ...buildResult("", null, [], 0, 0, "error"),
      error: err.message
    };
  }
}
|
|
1034
|
+
/**
 * Runs processFile over a list of files with bounded concurrency.
 *
 * @param {string} rootPath - Root that each file path is made relative to.
 * @param {string[]} filePaths - Files to process.
 * @param {Map<string, object>} knownFiles - Previously recorded metadata keyed
 *   by relative path; a missing entry means the file has not been seen before.
 * @returns {Promise<object[]>} One processFile result per input path, in input order.
 */
async function processFiles(rootPath, filePaths, knownFiles) {
  const limit = pLimit(getAdaptiveConcurrency());
  const pending = [];
  for (const filePath of filePaths) {
    const relPath = path3.relative(rootPath, filePath);
    const known = knownFiles.get(relPath);
    pending.push(limit(() => processFile(filePath, relPath, known)));
  }
  return Promise.all(pending);
}
|
|
1044
|
+
|
|
1045
|
+
// src/scanner/index.ts
|
|
1046
|
+
/**
 * Scans a project directory, brings the file-metadata database in line with
 * what is on disk and, unless disabled, updates the vector index.
 *
 * Steps:
 *   1. If the configured embedding dimensions differ from the stored ones, or
 *      `options.force` is set, clear the stored state first.
 *   2. Crawl the root and process the files in batches of 100.
 *   3. Upsert added/modified files, refresh mtimes of unchanged files and
 *      delete database entries whose path was not seen in this crawl.
 *   4. Hand added, modified, "healing" (unchanged but still needing a vector
 *      index) and deleted files to the indexer.
 * The database, indexers and vector stores are closed on every exit path.
 *
 * @param {string} rootPath - Project root to crawl.
 * @param {object} [options]
 * @param {boolean} [options.force] - Clear stored state and index everything again.
 * @param {boolean} [options.vectorIndex] - Pass `false` to skip all vector-index work.
 * @param {(processed: number, total: number) => void} [options.onProgress] -
 *   Called after each batch with the running count and the total file count.
 * @returns {Promise<object>} Counters `totalFiles`, `added`, `modified`,
 *   `unchanged`, `deleted`, `skipped`, `errors`, plus `vectorIndex`
 *   (`indexed`, `deleted`, `errors`) when the indexer was run.
 */
async function scan(rootPath, options = {}) {
  const projectId = generateProjectId(rootPath);
  const db = initDb(projectId);
  try {
    await initFilter(rootPath);
    const vectorIndexEnabled = options.vectorIndex !== false;
    let forceReindex = options.force ?? false;

    if (vectorIndexEnabled) {
      const currentDimensions = getEmbeddingConfig().dimensions;
      const storedDimensions = getStoredEmbeddingDimensions(db);
      if (storedDimensions !== null && storedDimensions !== currentDimensions) {
        // Log message: "Embedding dimensions changed, forcing re-index".
        logger.warn(
          { stored: storedDimensions, current: currentDimensions },
          "Embedding \u7EF4\u5EA6\u53D8\u5316\uFF0C\u5F3A\u5236\u91CD\u65B0\u7D22\u5F15"
        );
        forceReindex = true;
      }
      setStoredEmbeddingDimensions(db, currentDimensions);
    }

    if (forceReindex) {
      // Log message: "Forcing re-index...".
      logger.info("\u5F3A\u5236\u91CD\u65B0\u7D22\u5F15...");
      clear(db);
      if (vectorIndexEnabled) {
        const embeddingConfig = getEmbeddingConfig();
        const indexer = await getIndexer(projectId, embeddingConfig.dimensions);
        await indexer.clear();
      }
    }

    const knownFiles = getAllFileMeta(db);
    const filePaths = await crawl(rootPath);

    let processedCount = 0;
    const results = [];
    const batchSize = 100;
    for (let i = 0; i < filePaths.length; i += batchSize) {
      const batch = filePaths.slice(i, i + batchSize);
      const batchResults = await processFiles(rootPath, batch, knownFiles);
      results.push(...batchResults);
      processedCount += batch.length;
      options.onProgress?.(processedCount, filePaths.length);
    }

    // The set of paths seen on disk must use exactly the keys that are stored
    // in the database, i.e. the relPath each result carries. Stripping the
    // root with a RegExp built from the raw rootPath breaks as soon as the
    // path contains regex metacharacters ("c++", "(", "[") or Windows
    // backslashes, which makes every indexed file look deleted or throws.
    // NOTE(review): skipped and errored files are part of this set too, so an
    // entry indexed earlier for such a file is neither refreshed nor removed;
    // confirm that is intended.
    const scannedPaths = new Set(results.map((result) => result.relPath));

    const toAdd = [];
    const toUpdateMtime = [];
    const deletedPaths = [];
    for (const result of results) {
      switch (result.status) {
        case "added":
        case "modified":
          toAdd.push({
            path: result.relPath,
            hash: result.hash,
            mtime: result.mtime,
            size: result.size,
            content: result.content,
            language: result.language,
            // New and modified files have to be vector-indexed again.
            vectorIndexHash: null
          });
          break;
        case "unchanged":
          toUpdateMtime.push({ path: result.relPath, mtime: result.mtime });
          break;
        case "skipped":
          // Log message: "Skipping file".
          logger.debug({ path: result.relPath, reason: result.error }, "\u8DF3\u8FC7\u6587\u4EF6");
          break;
        case "error":
          // Log message: "Error processing file".
          logger.error({ path: result.relPath, error: result.error }, "\u5904\u7406\u6587\u4EF6\u9519\u8BEF");
          break;
      }
    }

    // Read before the upsert below, so only previously indexed paths can be
    // reported as deleted.
    const allIndexedPaths = getAllPaths(db);
    for (const indexedPath of allIndexedPaths) {
      if (!scannedPaths.has(indexedPath)) {
        deletedPaths.push(indexedPath);
      }
    }

    batchUpsert(db, toAdd);
    batchUpdateMtime(db, toUpdateMtime);
    batchDelete(db, deletedPaths);

    const countByStatus = (status) => results.filter((r) => r.status === status).length;
    const stats = {
      totalFiles: filePaths.length,
      added: countByStatus("added"),
      modified: countByStatus("modified"),
      unchanged: countByStatus("unchanged"),
      deleted: deletedPaths.length,
      skipped: countByStatus("skipped"),
      errors: countByStatus("error")
    };

    if (vectorIndexEnabled) {
      const embeddingConfig = getEmbeddingConfig();
      const indexer = await getIndexer(projectId, embeddingConfig.dimensions);
      const needsVectorIndex = results.filter(
        (r) => r.status === "added" || r.status === "modified"
      );

      // Unchanged files that the database still lists as lacking a vector index.
      const healingPathSet = new Set(getFilesNeedingVectorIndex(db));
      const healingFiles = results.filter(
        (r) => r.status === "unchanged" && healingPathSet.has(r.relPath)
      );
      if (healingFiles.length > 0) {
        // Log message: "Self-healing: found files that still need indexing".
        logger.info({ count: healingFiles.length }, "\u81EA\u6108\uFF1A\u53D1\u73B0\u9700\u8981\u8865\u7D22\u5F15\u7684\u6587\u4EF6");
      }

      // Deleted paths are passed to the indexer as placeholder records.
      const deletedResults = deletedPaths.map((deletedPath) => ({
        absPath: "",
        relPath: deletedPath,
        hash: "",
        content: null,
        chunks: [],
        language: "",
        mtime: 0,
        size: 0,
        status: "deleted"
      }));

      const allToIndex = [...needsVectorIndex, ...healingFiles, ...deletedResults];
      if (allToIndex.length > 0) {
        const indexStats = await indexer.indexFiles(db, allToIndex);
        stats.vectorIndex = {
          indexed: indexStats.indexed,
          deleted: indexStats.deleted,
          errors: indexStats.errors
        };
      }
    }

    return stats;
  } finally {
    closeDb(db);
    closeAllIndexers();
    await closeAllVectorStores();
  }
}
|
|
1174
|
+
|
|
1175
|
+
export {
|
|
1176
|
+
scan
|
|
1177
|
+
};
|