@kk-irving/knowledge-mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,392 @@
1
+ /**
2
+ * analyze_issue —— 端到端 PR/Bug 分析编排(v1.0.0)。
3
+ *
4
+ * 一次调用串起:
5
+ * 1. 拉 Zmind issue 详情(subject + description + journals + attachments)
6
+ * 2. 准备工作目录(默认 `<workspace_root>/.workspace/issue-<id>/`)
7
+ * 3. 提取查询关键词(标题去停用词 + 描述前 200 字符 token 化)
8
+ * 4. 三源本地知识库混合检索(zmind / gerrit / confluence)
9
+ * 5. 推断潜在 module(从命中的 gerrit/zmind 路径片段反查 module-map)
10
+ * 6. (可选)AOSP 模块级精搜
11
+ * 7. 渲染 `analysis-context.md` 落盘到工作目录
12
+ * 8. 返回 JSON 汇总
13
+ *
14
+ * 任何子步骤失败均 best-effort 继续,错误写入 context.md "已知问题" 段。
15
+ *
16
+ * 注意:本工具**不下载附件**——附件下载由 zmind-mcp-server 的
17
+ * `prepare_issue_workspace` 负责,此处只读取已有 workspace。如果调用时
18
+ * workspace_root 不存在则只创建目录。
19
+ */
20
+ import { mkdir, writeFile, readdir, stat } from "node:fs/promises";
21
+ import * as path from "node:path";
22
+ import { searchLocal } from "./search.js";
23
+ import { searchAosp } from "./aosp/search.js";
24
+ import { loadModuleMap } from "./aosp/module-map-loader.js";
25
/** Endpoint used when the ZMIND_URL environment variable is unset or blank. */
const DEFAULT_ZMIND_URL = "https://zmind.whaletv.com";
/**
 * Normalises a raw base-URL setting.
 *
 * Surrounding whitespace and trailing slashes are removed. When nothing is
 * left (unset, empty, blank or only slashes) the default endpoint is used, so
 * an empty `ZMIND_URL=` entry in the environment does not yield an invalid URL.
 *
 * @param {string | undefined | null} raw - Value read from the environment.
 * @returns {string} Base URL without a trailing slash.
 */
function resolveZmindBaseUrl(raw) {
    const cleaned = (raw ?? "").trim().replace(/\/+$/, "");
    return cleaned === "" ? DEFAULT_ZMIND_URL : cleaned;
}
/** Base URL of the Zmind server, without a trailing slash. */
const ZMIND_URL = resolveZmindBaseUrl(process.env.ZMIND_URL);
/** API key sent with Zmind requests; empty string when not configured. */
const ZMIND_API_KEY = process.env.ZMIND_API_KEY ?? "";
27
+ // =============================================================================
28
+ // Zmind helper
29
+ // =============================================================================
30
/**
 * Fetches one issue (with attachments and journals) from the Zmind REST API.
 *
 * The request path is resolved relative to ZMIND_URL so that a base URL with a
 * sub-path (for example `https://host/redmine`) is preserved, and the issue id
 * is percent-encoded before being placed in the path.
 *
 * @param {number | string} issueId - Identifier of the issue to load.
 * @returns {Promise<object>} The `issue` object of the JSON response.
 * @throws {Error} When the API key is missing, the HTTP status is not OK, or
 *   the response body carries no `issue` field.
 */
async function fetchIssue(issueId) {
    if (!ZMIND_API_KEY) {
        throw new Error("ZMIND_API_KEY 未配置");
    }
    // No leading slash: an absolute path would discard any sub-path of the base URL.
    const url = new URL(`issues/${encodeURIComponent(String(issueId))}.json`, `${ZMIND_URL}/`);
    url.searchParams.set("include", "attachments,journals");
    url.searchParams.set("key", ZMIND_API_KEY);
    const res = await fetch(url.toString(), {
        headers: { Accept: "application/json" },
    });
    if (!res.ok) {
        // Only the first 300 characters of the body are kept in the message.
        throw new Error(`Zmind HTTP ${res.status}: ${(await res.text()).slice(0, 300)}`);
    }
    const data = await res.json();
    if (!data?.issue) {
        // Fail loudly instead of handing `undefined` to callers that dereference the issue.
        throw new Error(`Zmind response has no "issue" field (issue_id=${issueId})`);
    }
    return data.issue;
}
45
+ // =============================================================================
46
+ // 关键词提取
47
+ // =============================================================================
48
/** Chinese tokens that are dropped when a whole Chinese run equals one of them. */
const STOPWORDS_ZH = new Set([
    "的", "了", "和", "是", "在", "有", "也", "就", "都", "这", "那", "我", "你", "他",
    "不", "为", "上", "下", "之", "对", "以", "及", "及其", "或", "等", "与",
    "请", "问题", "情况", "发现", "出现", "存在", "需要", "处理", "解决", "测试",
    "已", "未", "可", "能", "会", "时", "时候",
]);
/** English tokens that carry no search value. */
const STOPWORDS_EN = new Set([
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
    "of", "to", "in", "on", "at", "for", "with", "and", "or", "but",
    "this", "that", "these", "those", "it", "its", "as",
    "i", "we", "you", "they", "he", "she",
    "have", "has", "had", "do", "does", "did", "can", "will", "would", "should",
    "issue", "bug", "test", "fix", "fail", "failed", "problem",
]);
/**
 * Extracts search keywords from an issue subject plus the first 200
 * characters of its description.
 *
 * Strategy:
 *  - lower-case the text;
 *  - English tokens: a letter followed by letters, digits, `_` or `-`;
 *  - Chinese tokens: each run of consecutive CJK characters is one token;
 *  - drop stop words and tokens shorter than two characters;
 *  - keep the first 8 distinct tokens, English tokens first.
 *
 * A missing subject or description is treated as empty text, so a null
 * subject no longer produces the literal keyword "null".
 *
 * @param {string | null | undefined} subject - Issue title.
 * @param {string | null | undefined} description - Issue description.
 * @returns {string[]} At most 8 distinct keywords.
 */
export function extractKeywords(subject, description) {
    const head = subject == null ? "" : String(subject);
    const tail = description ? String(description).slice(0, 200) : "";
    const text = `${head} ${tail}`.toLowerCase();
    const englishTokens = (text.match(/[a-z][a-z0-9_-]+/g) ?? [])
        .filter((t) => t.length >= 2 && !STOPWORDS_EN.has(t));
    const chineseTokens = (text.match(/[\u4e00-\u9fff]+/g) ?? [])
        .filter((t) => t.length >= 2 && !STOPWORDS_ZH.has(t));
    // A Set keeps first-seen order, which gives the de-duplication for free.
    return [...new Set([...englishTokens, ...chineseTokens])].slice(0, 8);
}
88
+ // =============================================================================
89
+ // 平台 / 模块推断
90
+ // =============================================================================
91
/** Keywords (board / project code names) that identify each platform. */
const PLATFORM_KEYWORDS = {
    D4: ["d4", "am30", "at30", "calla", "redi", "soddy", "t982"],
    X5: ["x5", "am50", "br30", "bs30", "anemone", "dahlia", "daisy", "dryas"],
    STB: ["stb", "stb16", "pascal", "qurra", "raman", "ross"],
};
/**
 * One pre-compiled pattern per platform, in declaration order.
 *
 * A keyword only counts when it is not glued to another ASCII letter or digit,
 * so "across" does not match "ross" and "redirect" does not match "redi".
 * `_`, punctuation and CJK characters still act as separators.
 */
const PLATFORM_PATTERNS = Object.entries(PLATFORM_KEYWORDS).map(([platform, keywords]) => {
    const alternatives = keywords
        .map((kw) => kw.toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
        .join("|");
    return [platform, new RegExp(`(?<![a-z0-9])(?:${alternatives})(?![a-z0-9])`)];
});
/**
 * Guesses the platform an issue refers to from free text.
 *
 * @param {string} text - Subject, description and similar text of the issue.
 * @returns {string | undefined} The first platform (in PLATFORM_KEYWORDS order)
 *   with a keyword present as a stand-alone token, or undefined when none match.
 */
function inferPlatform(text) {
    const lower = String(text ?? "").toLowerCase();
    for (const [platform, pattern] of PLATFORM_PATTERNS) {
        if (pattern.test(lower)) {
            return platform;
        }
    }
    return undefined;
}
106
/**
 * Ranks the modules of a platform by how often search hits mention them.
 *
 * For every hit, the lower-cased `project`, `snippet` and `title` are scanned.
 * A module scores one point for each of its paths whose last two segments
 * contain a segment of at least 4 characters found in the text, plus one
 * point when the module name itself appears in the text. The name is compared
 * case-insensitively and counted once per hit rather than once per path.
 *
 * @param {Array<object>} hits - Search results to scan.
 * @param {string | undefined} platform - Platform key, any letter case.
 * @returns {Promise<string[]>} Up to three module names, best score first;
 *   empty when there is no platform, no hits, or the module map cannot be
 *   loaded or has no entry for the platform.
 */
async function inferModulesFromHits(hits, platform) {
    if (!platform || hits.length === 0) {
        return [];
    }
    let map;
    try {
        map = await loadModuleMap();
    }
    catch {
        // Best effort: without a module map nothing can be inferred.
        return [];
    }
    const platformMap = map?.platforms?.[platform.toUpperCase()];
    if (!platformMap) {
        return [];
    }
    // Lower-case every needle once instead of once per hit.
    const probes = Object.entries(platformMap).map(([moduleName, paths]) => ({
        moduleName,
        nameNeedle: moduleName.toLowerCase(),
        pathNeedles: (Array.isArray(paths) ? paths : []).map((p) => String(p)
            .toLowerCase()
            .split("/")
            .filter(Boolean)
            .slice(-2)
            .filter((seg) => seg.length >= 4)),
    }));
    const moduleScores = new Map();
    for (const hit of hits) {
        const text = `${hit?.project ?? ""} ${hit?.snippet ?? ""} ${hit?.title ?? ""}`.toLowerCase();
        for (const probe of probes) {
            let hitCount = 0;
            for (const segments of probe.pathNeedles) {
                // Each path contributes at most one point.
                if (segments.some((seg) => text.includes(seg))) {
                    hitCount++;
                }
            }
            if (probe.nameNeedle.length > 0 && text.includes(probe.nameNeedle)) {
                hitCount++;
            }
            if (hitCount > 0) {
                moduleScores.set(probe.moduleName, (moduleScores.get(probe.moduleName) ?? 0) + hitCount);
            }
        }
    }
    return Array.from(moduleScores.entries())
        .sort((a, b) => b[1] - a[1])
        .slice(0, 3)
        .map(([moduleName]) => moduleName);
}
151
+ // =============================================================================
152
+ // 工作目录
153
+ // =============================================================================
154
/**
 * Creates (if needed) and returns `<workspaceRoot>/.workspace/issue-<issueId>`.
 *
 * The issue id becomes a single directory name, so ids containing a path
 * separator or a NUL character are rejected; otherwise an id such as
 * `1/../../../x` would resolve to a directory outside the workspace.
 *
 * @param {string} workspaceRoot - Root directory; resolved to an absolute path.
 * @param {number | string} issueId - Issue identifier used in the folder name.
 * @returns {Promise<string>} Absolute path of the issue workspace directory.
 * @throws {Error} When the id cannot be used as a directory name, or when the
 *   directory cannot be created.
 */
async function ensureWorkspace(workspaceRoot, issueId) {
    const idText = String(issueId);
    if (/[\\/\0]/.test(idText)) {
        throw new Error(`invalid issue id for workspace path: ${JSON.stringify(idText)}`);
    }
    const target = path.join(path.resolve(workspaceRoot), ".workspace", `issue-${idText}`);
    await mkdir(target, { recursive: true });
    return target;
}
160
/**
 * Summarises the files found directly inside `<workspacePath>/attachments`.
 *
 * @param {string} workspacePath - Issue workspace directory.
 * @returns {Promise<{count: number, sample: string[]}>} Number of regular
 *   files and the names of up to five of them. A missing path, or a path that
 *   is not a directory, gives `{ count: 0, sample: [] }`. Errors raised while
 *   listing an existing directory are passed on to the caller.
 */
async function summarizeAttachments(workspacePath) {
    const nothing = () => ({ count: 0, sample: [] });
    const folder = path.join(workspacePath, "attachments");
    let info;
    try {
        info = await stat(folder);
    }
    catch {
        return nothing();
    }
    if (!info.isDirectory()) {
        return nothing();
    }
    const names = [];
    for (const entry of await readdir(folder, { withFileTypes: true })) {
        if (entry.isFile()) {
            names.push(entry.name);
        }
    }
    return { count: names.length, sample: names.slice(0, 5) };
}
174
+ // =============================================================================
175
+ // Markdown 渲染
176
+ // =============================================================================
177
/**
 * Renders the `analysis-context.md` document for one issue.
 *
 * Sections, in order: issue header, description, attachments, inferred
 * information, similar history per source (zmind / gerrit / confluence),
 * AOSP hits, recommended actions and, when present, run-time errors.
 *
 * Compared with a naive rendering:
 *  - a hit without a url is printed as plain text instead of an empty
 *    markdown link `[title]()`;
 *  - CR, LF and CRLF inside snippets are all flattened to spaces;
 *  - missing `similar`, `aospHits` or `errors` are treated as empty.
 *
 * @param {object} args
 * @param {object} args.issue - Issue object; only `id` is required.
 * @param {string} args.workspace - Workspace directory shown in the header.
 * @param {{count: number, sample: string[]}} args.attachments
 * @param {string[]} args.keywords
 * @param {string | undefined} args.platform
 * @param {string[]} args.modules
 * @param {object} args.similar - Hits keyed by source name.
 * @param {Array<object>} args.aospHits
 * @param {Array<{stage: string, message: string}>} args.errors
 * @returns {string} Markdown text with lines joined by "\n".
 */
function renderContextMd(args) {
    const issue = args.issue;
    const similar = args.similar ?? {};
    const aospHits = args.aospHits ?? [];
    const errors = args.errors ?? [];
    // Collapse every kind of line break so a snippet stays on one list line.
    const oneLine = (value) => String(value ?? "").replace(/\r\n|\r|\n/g, " ");
    const lines = [
        `# Issue #${issue.id} 分析上下文`,
        "",
        `- **标题**: ${issue.subject ?? "(无)"}`,
        `- **类型**: ${issue.tracker?.name ?? ""}`,
        `- **状态**: ${issue.status?.name ?? ""}`,
        `- **指派**: ${issue.assigned_to?.name ?? ""}`,
        `- **项目**: ${issue.project?.name ?? ""}`,
        `- **目标版本**: ${issue.fixed_version?.name ?? ""}`,
        `- **生成时间**: ${new Date().toISOString()}`,
        `- **工作目录**: ${args.workspace}`,
        "",
    ];
    // Description
    lines.push("## 问题描述", "", issue.description ?? "(无描述)", "");
    // Attachments
    lines.push("## 附件", "");
    if (args.attachments.count === 0) {
        lines.push("(暂无)— 如需下载附件请用 zmind-mcp 的 `prepare_issue_workspace` 工具");
    }
    else {
        lines.push(`共 ${args.attachments.count} 个;样本:`);
        for (const f of args.attachments.sample) {
            lines.push(`- \`${f}\``);
        }
    }
    lines.push("");
    // Keywords, platform and modules
    lines.push("## 推断信息", "");
    lines.push(`- **关键词**: ${args.keywords.length > 0 ? args.keywords.join(", ") : "(空)"}`);
    lines.push(`- **平台**: ${args.platform ?? "(未推断)"}`);
    lines.push(`- **可能模块**: ${args.modules.length > 0 ? args.modules.join(", ") : "(未推断)"}`);
    lines.push("");
    // Similar history
    lines.push("## 相似历史(本地知识库 hybrid 检索)", "");
    for (const sourceName of ["zmind", "gerrit", "confluence"]) {
        const hits = similar[sourceName] ?? [];
        lines.push(`### ${sourceName} (${hits.length} 条)`, "");
        if (hits.length === 0) {
            lines.push("(无命中)");
        }
        else {
            for (const h of hits) {
                const titlePart = h.title ?? "(无标题)";
                const url = h.url ?? "";
                // Only emit link syntax when there is a target to link to.
                const heading = url ? `[${titlePart}](${url})` : titlePart;
                const score = typeof h.score === "number" ? h.score.toFixed(3) : "";
                lines.push(`- **[${h.id}]** ${heading} · score=${score} · match=${h.match}`);
                const snippet = oneLine(h.snippet);
                if (snippet) {
                    lines.push(`  - ${snippet.slice(0, 240)}`);
                }
            }
        }
        lines.push("");
    }
    // AOSP
    lines.push("## AOSP 代码(模块级精搜)", "");
    if (aospHits.length === 0) {
        lines.push("(未启用 / 未命中)");
    }
    else {
        for (const h of aospHits) {
            lines.push(`- **${h.module}** ${h.file_path}:${h.line_start}-${h.line_end} (${h.symbol_kind} ${h.symbol_name})`);
            const snip = oneLine(h.snippet);
            if (snip) {
                lines.push(`  - ${snip.slice(0, 240)}`);
            }
        }
    }
    lines.push("");
    // Suggested follow-up actions
    lines.push("## 推荐动作", "");
    lines.push("1. **复现验证** — 在 issue 描述场景下尝试本地复现");
    lines.push("2. **历史修复对比** — 重点查看上面三源 Top-K 命中里 status=Closed 的条目,对比修复方式");
    lines.push("3. **模块改动建议** — 结合 AOSP 命中代码片段定位修改点;如未命中可手动用 `search_aosp` 缩小范围");
    lines.push("");
    // Run-time errors collected by the caller
    if (errors.length > 0) {
        lines.push("## 已知问题(运行期错误)", "");
        for (const e of errors) {
            lines.push(`- **${e.stage}**: ${e.message}`);
        }
        lines.push("");
    }
    return lines.join("\n");
}
271
+ // =============================================================================
272
+ // 主入口
273
+ // =============================================================================
274
/** Converts any thrown value into a printable message. */
function describeError(e) {
    return e instanceof Error ? e.message : String(e);
}
/**
 * End-to-end orchestration for analysing one issue.
 *
 * Steps: fetch the issue, prepare the workspace directory, summarise already
 * present attachments, extract keywords, infer the platform, search the local
 * knowledge base, infer candidate modules, optionally search AOSP code, render
 * `analysis-context.md` into the workspace and return a JSON summary.
 *
 * Every step except workspace creation is best effort: its failure is recorded
 * in `errors` and the run continues. Workspace creation failures reject the
 * returned promise because there is nowhere to write the result.
 *
 * @param {object} args
 * @param {number | string} args.issue_id - Issue to analyse.
 * @param {string} [args.workspace_root] - Defaults to the current directory.
 * @param {number} [args.per_source_limit] - Hits per source, clamped to 1..10;
 *   defaults to 3 and falls back to 3 when not a finite number.
 * @param {string} [args.platform] - Overrides platform inference.
 * @param {boolean} [args.include_aosp] - Enables the AOSP search step.
 * @returns {Promise<object>} Summary with workspace path, issue digest,
 *   keywords, inferred platform/modules, hits, markdown path and errors.
 */
export async function analyzeIssue(args) {
    const errors = [];
    const issueId = args.issue_id;
    const workspaceRoot = args.workspace_root ?? process.cwd();
    // NaN would otherwise flow through Math.min/Math.max into the search calls.
    const requestedLimit = Number(args.per_source_limit ?? 3);
    const perSourceLimit = Number.isFinite(requestedLimit)
        ? Math.max(1, Math.min(10, requestedLimit))
        : 3;
    // 1. Fetch the issue
    let issue;
    try {
        issue = await fetchIssue(issueId);
    }
    catch (e) {
        errors.push({ stage: "fetch_issue", message: describeError(e) });
    }
    if (!issue) {
        // Covers both a failed fetch and a response without an issue object.
        if (errors.length === 0) {
            errors.push({ stage: "fetch_issue", message: "empty issue payload" });
        }
        issue = { id: issueId };
    }
    // 2. Workspace directory
    const wsPath = await ensureWorkspace(workspaceRoot, issueId);
    // 3. Attachment summary
    let attachSummary = { count: 0, sample: [] };
    try {
        attachSummary = await summarizeAttachments(wsPath);
    }
    catch (e) {
        errors.push({ stage: "summarize_attachments", message: describeError(e) });
    }
    // 4. Keywords
    const keywords = extractKeywords(issue.subject ?? "", issue.description ?? "");
    // 5. Platform: an explicit argument wins over inference from issue text
    const platformText = `${issue.subject ?? ""} ${issue.description ?? ""} ${issue.project?.name ?? ""} ${issue.fixed_version?.name ?? ""}`;
    const inferredPlatform = (args.platform ?? inferPlatform(platformText) ?? "").toUpperCase() || undefined;
    // 6. Hybrid search over the three local sources
    let similar = {};
    if (keywords.length > 0) {
        try {
            const result = await searchLocal({
                query: keywords.join(" "),
                source: "all",
                mode: "hybrid",
                limit: perSourceLimit,
            });
            similar = {
                zmind: result?.zmind ?? [],
                gerrit: result?.gerrit ?? [],
                confluence: result?.confluence ?? [],
            };
        }
        catch (e) {
            errors.push({ stage: "search_local", message: describeError(e) });
        }
    }
    // 7. Module inference from zmind and gerrit hits
    let modules = [];
    try {
        modules = await inferModulesFromHits([...(similar.zmind ?? []), ...(similar.gerrit ?? [])], inferredPlatform);
    }
    catch (e) {
        errors.push({ stage: "infer_modules", message: describeError(e) });
    }
    // 8. Optional AOSP search, restricted to the best-ranked module
    let aospHits = [];
    if (args.include_aosp && inferredPlatform && modules.length > 0 && keywords.length > 0) {
        try {
            const result = await searchAosp({
                query: keywords.join(" "),
                platform: inferredPlatform,
                module: modules[0],
                mode: "hybrid",
                limit: perSourceLimit,
            });
            // Keep an array even when the search result carries no hits list.
            aospHits = result?.hits ?? [];
        }
        catch (e) {
            errors.push({ stage: "search_aosp", message: describeError(e) });
        }
    }
    // 9. Render and store the context document
    const md = renderContextMd({
        issue,
        workspace: wsPath,
        attachments: attachSummary,
        keywords,
        platform: inferredPlatform,
        modules,
        similar,
        aospHits,
        errors,
    });
    const ctxPath = path.join(wsPath, "analysis-context.md");
    try {
        await writeFile(ctxPath, md, "utf8");
    }
    catch (e) {
        // Too late to appear in the document itself; reported in the summary only.
        errors.push({ stage: "write_context_md", message: describeError(e) });
    }
    return {
        workspace_path: wsPath,
        issue: {
            id: issueId,
            tracker: issue.tracker?.name ?? "",
            subject: issue.subject ?? "",
            status: issue.status?.name ?? "",
            assignee: issue.assigned_to?.name ?? "",
            project: issue.project?.name ?? "",
            target_version: issue.fixed_version?.name ?? "",
        },
        attachments_summary: attachSummary,
        keywords,
        inferred_platform: inferredPlatform,
        inferred_modules: modules,
        similar,
        aosp_hits: aospHits,
        context_md_path: ctxPath,
        errors,
    };
}
@@ -0,0 +1,221 @@
1
+ /**
2
+ * 文件切块(chunker)。
3
+ *
4
+ * 设计:
5
+ * - 不上 tree-sitter(500MB+ 体积、需各语言 wasm)
6
+ * - 用 **正则边界识别 + 行级硬切**,足够覆盖 AOSP 主流语言(Java/Kotlin/C/C++/Python/Shell/AIDL/HIDL/XML 注释边界)
7
+ * - 每个 chunk ≤ MAX_CHUNK_CHARS(默认 2000 字符)
8
+ * - 优先在边界(class / fn / method)切;落不到边界时按 200 行硬切
9
+ * - 每个 chunk 记录 file_path、line_start、line_end、symbol_kind、symbol_name
10
+ *
11
+ * 黑名单:在 indexer 层处理(跳过 .git / out / build / 二进制 / >5MB)
12
+ */
13
/** Character budget a chunk body is shrunk towards when it spans several lines. */
const MAX_CHUNK_CHARS = 2_000;
/** Largest number of lines taken into one chunk before a hard cut. */
const HARD_LINES = 2_00;
15
+ // =============================================================================
16
+ // 边界识别(正则)
17
+ // =============================================================================
18
/**
 * Finds type and method declarations in Java-like source (also used for
 * `.aidl` and `.hal` files) with line-based regular expressions.
 *
 * Type declarations may carry any number of leading modifiers, so
 * `public final class Foo` and `private static class Bar` are recognised.
 * Method detection requires at least one leading modifier and is heuristic.
 *
 * @param {string[]} lines - File content split into lines.
 * @returns {Array<{line: number, kind: string, name: string}>} Boundaries in
 *   file order; `line` is 1-indexed, `kind` is the type keyword or "method".
 */
function detectBoundariesJava(lines) {
    // class / interface / enum / @interface, preceded by zero or more modifiers
    const typeRe = /^\s*(?:(?:public|private|protected|abstract|final|static|sealed|non-sealed|strictfp)\s+)*(class|interface|enum|@interface)\s+(\w+)/;
    // A modifier, an optional return type, then `name(` — loose but sufficient
    const methodRe = /^\s*(?:public|private|protected|static|final|abstract|synchronized|default|native)\s+(?:[\w<>?,\s.\[\]]+\s+)?(\w+)\s*\(/;
    // Statements that look like `keyword (` are never method declarations
    const controlRe = /^\s*(if|for|while|switch|catch|synchronized|try)\s*\(/;
    const result = [];
    lines.forEach((ln, idx) => {
        if (!ln) {
            return;
        }
        const typeMatch = ln.match(typeRe);
        if (typeMatch) {
            result.push({ line: idx + 1, kind: typeMatch[1], name: typeMatch[2] });
            return;
        }
        const methodMatch = ln.match(methodRe);
        if (methodMatch && !ln.trimStart().startsWith("//") && !controlRe.test(ln)) {
            result.push({ line: idx + 1, kind: "method", name: methodMatch[1] });
        }
    });
    return result;
}
44
/**
 * Finds class-like and function declarations in Kotlin source with
 * line-based regular expressions.
 *
 * Any number of leading modifiers is accepted, so declarations such as
 * `private data class Foo`, `internal class Bar` and
 * `private suspend fun load()` are recognised.
 *
 * @param {string[]} lines - File content split into lines.
 * @returns {Array<{line: number, kind: string, name: string}>} Boundaries in
 *   file order; `line` is 1-indexed, `kind` is the declaration keyword for
 *   types or "function" for functions.
 */
function detectBoundariesKotlin(lines) {
    const classRe = /^\s*(?:(?:public|private|protected|internal|open|abstract|final|sealed|data|enum|inner|annotation|value|inline|companion|fun|object)\s+)*(class|interface|object|enum)\s+(\w+)/;
    // Optional generic parameters and an optional simple receiver (`Type.`) before the name
    const funRe = /^\s*(?:(?:override|open|private|public|protected|internal|suspend|inline|final|abstract|operator|infix|tailrec|external|actual|expect)\s+)*fun\s+(?:<[^>]*>\s*)?(?:(\w+)\.)?(\w+)\s*\(/;
    const result = [];
    lines.forEach((ln, idx) => {
        if (!ln) {
            return;
        }
        const classMatch = ln.match(classRe);
        if (classMatch) {
            result.push({ line: idx + 1, kind: classMatch[1], name: classMatch[2] });
            return;
        }
        const funMatch = ln.match(funRe);
        if (funMatch) {
            result.push({ line: idx + 1, kind: "function", name: funMatch[2] || funMatch[1] || "" });
        }
    });
    return result;
}
64
/**
 * Finds class/struct and function definitions in C / C++ source with
 * line-based regular expressions.
 *
 * Function detection is deliberately loose: a line that ends with a
 * parameter list (optionally followed by `const`, `noexcept`,
 * `override`/`final` and `{`) and is preceded by something that looks like a
 * return type. It accepts:
 *  - qualified names such as `void Foo::bar(int x) {` (name "Foo::bar");
 *  - pointer/reference returns glued to the name, e.g. `int *foo(void)`.
 * It rejects `else if (...) {`, `return f(x)` and other control statements.
 * A line such as `struct node *make(int v) {` is reported as a function
 * rather than as a struct declaration.
 *
 * @param {string[]} lines - File content split into lines.
 * @returns {Array<{line: number, kind: string, name: string}>} Boundaries in
 *   file order; `line` is 1-indexed, `kind` is "class", "struct" or "function".
 */
function detectBoundariesCpp(lines) {
    const classRe = /^\s*(?:template\s*<[^>]*>\s*)?(class|struct)\s+(\w+)/;
    // return-type-ish prefix, a separator (space, `*` or `&`), optionally qualified name, `(...)`
    const funRe = /^\s*[\w:<>*&\s,]+?[\s*&]((?:\w+::)*~?\w+)\s*\([^)]*\)\s*(?:const)?\s*(?:noexcept)?\s*(?:override|final)?\s*\{?\s*$/;
    const notSignatureRe = /^\s*(?:else\b|return\b|(?:if|for|while|switch|catch)\s*\()/;
    const keywordNames = new Set(["if", "for", "while", "switch", "catch", "return", "sizeof"]);
    const result = [];
    lines.forEach((ln, idx) => {
        if (!ln) {
            return;
        }
        const typeMatch = ln.match(classRe);
        // A type keyword on a line without parentheses is a plain declaration.
        if (typeMatch && !ln.includes("(")) {
            result.push({ line: idx + 1, kind: typeMatch[1], name: typeMatch[2] });
            return;
        }
        const maySign = !ln.trim().startsWith("//") && !notSignatureRe.test(ln);
        const funMatch = maySign ? ln.match(funRe) : null;
        if (funMatch && !keywordNames.has(funMatch[1])) {
            result.push({ line: idx + 1, kind: "function", name: funMatch[1] });
        }
        else if (typeMatch) {
            // Parentheses present but no function signature: keep it as a type boundary.
            result.push({ line: idx + 1, kind: typeMatch[1], name: typeMatch[2] });
        }
    });
    return result;
}
89
/**
 * Finds `class`, `def` and `async def` declarations in Python source.
 *
 * @param {string[]} lines - File content split into lines.
 * @returns {Array<{line: number, kind: string, name: string}>} Boundaries in
 *   file order; `line` is 1-indexed and `kind` is "class" or "function".
 */
function detectBoundariesPython(lines) {
    const declRe = /^\s*(class|def|async def)\s+(\w+)/;
    const found = [];
    lines.forEach((text, idx) => {
        // Empty lines cannot hold a declaration.
        const hit = text ? text.match(declRe) : null;
        if (hit === null) {
            return;
        }
        const [, keyword, name] = hit;
        found.push({
            line: idx + 1,
            kind: keyword.includes("class") ? "class" : "function",
            name,
        });
    });
    return found;
}
103
/**
 * Fallback detector for file types without a dedicated detector.
 *
 * @param {string[]} _lines - Ignored.
 * @returns {Array} Always a new empty array, meaning "no boundaries".
 */
function detectBoundariesGeneric(_lines) {
    const noBoundaries = [];
    return noBoundaries;
}
106
/**
 * Chooses the boundary detector for a file extension.
 *
 * @param {string} fileExt - Extension including the leading dot; letter case
 *   is ignored.
 * @returns {Function} The Java detector for `.java`/`.aidl`/`.hal`, the Kotlin
 *   detector for `.kt`/`.kts`, the C/C++ detector for
 *   `.c`/`.cc`/`.cpp`/`.cxx`/`.h`/`.hh`/`.hpp`, the Python detector for `.py`,
 *   and the generic detector for everything else.
 */
function pickDetector(fileExt) {
    const ext = fileExt.toLowerCase();
    if ([".java", ".aidl", ".hal"].includes(ext)) {
        return detectBoundariesJava;
    }
    if (ext === ".kt" || ext === ".kts") {
        return detectBoundariesKotlin;
    }
    if ([".c", ".cc", ".cpp", ".cxx", ".h", ".hh", ".hpp"].includes(ext)) {
        return detectBoundariesCpp;
    }
    if (ext === ".py") {
        return detectBoundariesPython;
    }
    return detectBoundariesGeneric;
}
129
+ // =============================================================================
130
+ // 切块主入口
131
+ // =============================================================================
132
/**
 * Splits the content of one file into chunks.
 *
 * How it works:
 *  1. the detector chosen from the file extension reports boundary lines;
 *  2. the start of the file is added as a virtual "header" boundary;
 *  3. every region between two neighbouring boundaries (or between the last
 *     boundary and the end of the file) is passed to `sliceAndPush`, which
 *     emits the chunks for that region. Regions that would be empty, such as
 *     the header when a boundary sits on line 1, are skipped.
 *
 * Line numbers are 1-indexed and inclusive.
 *
 * @param {string} filePath - Path used to pick the detector.
 * @param {string} content - Full text of the file.
 * @returns {Array<object>} Chunks in file order.
 */
export function chunkFile(filePath, content) {
    // String.prototype.split always yields at least one element.
    const lines = content.split(/\r?\n/);
    const extension = filePath.slice(filePath.lastIndexOf("."));
    const boundaries = pickDetector(extension)(lines);
    const cuts = [{ line: 1, kind: "header", name: "" }].concat(boundaries);
    const chunks = [];
    cuts.forEach((cut, idx) => {
        const following = cuts[idx + 1];
        const lastLine = following ? following.line - 1 : lines.length;
        if (lastLine >= cut.line) {
            sliceAndPush(lines, filePath, cut.line, lastLine, cut.kind, cut.name, chunks);
        }
    });
    return chunks;
}
162
/**
 * Emits the chunks for one region of a file.
 *
 * The region `[startLine..endLine]` (1-indexed, inclusive) is consumed from
 * the top. Each step takes at most HARD_LINES lines; while the joined text is
 * longer than MAX_CHUNK_CHARS and spans more than one line, the span is
 * shrunk to 80% of its length (rounded down). A single line is never split,
 * so one very long line still forms a chunk on its own. Spans that contain
 * only whitespace are skipped.
 *
 * @param {string[]} lines - All lines of the file.
 * @param {string} _filePath - Unused.
 * @param {number} startLine - First line of the region.
 * @param {number} endLine - Last line of the region.
 * @param {string} kind - Stored as `symbol_kind` on every chunk.
 * @param {string} name - Stored as `symbol_name` on every chunk.
 * @param {Array<object>} out - Receives the chunks; mutated in place.
 * @returns {void}
 */
function sliceAndPush(lines, _filePath, startLine, endLine, kind, name, out) {
    const textOf = (first, last) => lines.slice(first - 1, last).join("\n");
    for (let first = startLine; first <= endLine;) {
        const span = Math.min(endLine - first + 1, HARD_LINES);
        let last = first + span - 1;
        let body = textOf(first, last);
        // Apply the character budget on top of the line budget.
        while (body.length > MAX_CHUNK_CHARS && last > first) {
            last = first + Math.max(0, Math.floor((last - first) * 0.8));
            body = textOf(first, last);
        }
        if (body.trim().length !== 0) {
            out.push({
                line_start: first,
                line_end: last,
                symbol_kind: kind,
                symbol_name: name,
                content: body,
            });
        }
        first = last + 1;
    }
}
188
+ // =============================================================================
189
+ // 文件类型过滤(黑名单)
190
+ // =============================================================================
191
/** Lower-case file extensions (with leading dot) that are eligible for indexing. */
const SOURCE_EXTS = new Set([
    // JVM-side sources and interface definitions
    ".java", ".kt", ".kts", ".aidl", ".hal",
    // C / C++
    ".c", ".cc", ".cpp", ".cxx", ".h", ".hh", ".hpp",
    // Scripts, build files and other languages
    ".py", ".sh", ".bp", ".mk", ".rs", ".go",
    // Configuration and policy
    ".xml", ".rc", ".te", ".sepolicy",
    // Web
    ".js", ".ts", ".jsx", ".tsx", ".vue",
    // Documentation
    ".md", ".txt",
]);
/** Directory names that are never descended into; matched exactly and case-sensitively. */
const SKIP_DIR_NAMES = new Set([
    // Version control and build output
    ".git", ".repo", "out", "build", ".gradle", "node_modules",
    // Caches and editor folders
    "__pycache__", ".idea", ".vscode",
    // Native / IDE build folders
    "obj", "bin", "Debug", "Release", "target",
]);
204
/**
 * Decides whether a file should be indexed, based on its size and extension.
 *
 * @param {string} filePath - Path of the file; the text from the last "."
 *   onwards, lower-cased, is treated as the extension.
 * @param {number} sizeBytes - File size in bytes.
 * @returns {boolean} False for empty files, files larger than 5 MiB, paths
 *   without a dot, and extensions not listed in SOURCE_EXTS; true otherwise.
 */
export function shouldIndexFile(filePath, sizeBytes) {
    const maxBytes = 5 * 1024 * 1024; // per-file ceiling of 5 MiB
    const tooLarge = sizeBytes > maxBytes;
    const isEmpty = sizeBytes === 0;
    if (tooLarge || isEmpty) {
        return false;
    }
    const dotAt = filePath.lastIndexOf(".");
    return dotAt >= 0 && SOURCE_EXTS.has(filePath.slice(dotAt).toLowerCase());
}
216
/**
 * Decides whether a directory should be skipped during traversal.
 *
 * The decision is an exact, case-sensitive lookup of the directory name in
 * SKIP_DIR_NAMES. Dot-prefixed names get no special treatment: only the ones
 * listed in the set are skipped.
 *
 * @param {string} dirName - Name of the directory (not a full path).
 * @returns {boolean} True when the name is listed in SKIP_DIR_NAMES.
 */
export function shouldSkipDir(dirName) {
    return SKIP_DIR_NAMES.has(dirName);
}
@@ -0,0 +1,56 @@
1
+ /**
2
+ * AOSP chunks 的嵌入回写。
3
+ *
4
+ * 与三源 embed_pending 拆开:aosp 通常 chunk 数量级巨大(几十万),
5
+ * 单独控制 batch / progress 更合理。
6
+ */
7
+ import { getDb, runInTransaction, vectorToBlob } from "../db.js";
8
+ import { embedTexts } from "../embedder.js";
9
+ import { invalidateAospIndex } from "./search.js";
10
/**
 * Embeds one batch of `aosp_chunks` rows whose `embedding` is still NULL and
 * writes the vectors back.
 *
 * @param {object} [args]
 * @param {number} [args.batch_size] - Rows per call, clamped to 1..5000;
 *   defaults to 200.
 * @param {string} [args.platform] - Restricts the batch to one platform; the
 *   value is upper-cased before it is bound to the query.
 * @param {string} [args.module] - Restricts the batch to one module; the value
 *   is lower-cased before it is bound to the query.
 * @returns {Promise<{total_pending: number, embedded: number, remaining: number}>}
 *   Pending rows counted before this batch, rows embedded by this call, and
 *   the difference between the two.
 * @throws {Error} When `embedTexts` returns a different number of vectors
 *   than texts were supplied.
 */
export async function embedAospPending(args = {}) {
    const batchLimit = Math.max(1, Math.min(5000, args.batch_size ?? 200));
    const db = getDb();
    // Build the WHERE clause and its bound values side by side.
    const conditions = ["embedding IS NULL"];
    const bindings = [];
    const addFilter = (clause, value) => {
        conditions.push(clause);
        bindings.push(value);
    };
    if (args.platform) {
        addFilter("platform = ?", args.platform.toUpperCase());
    }
    if (args.module) {
        addFilter("module = ?", args.module.toLowerCase());
    }
    // 1. Count everything still pending (for progress reporting)
    const countRow = db
        .prepare(`SELECT COUNT(*) AS n FROM aosp_chunks WHERE ${conditions.join(" AND ")}`)
        .get(...bindings);
    const totalPending = countRow?.n ?? 0;
    if (totalPending === 0) {
        return { total_pending: 0, embedded: 0, remaining: 0 };
    }
    // 2. Load this batch, oldest id first
    const batch = db
        .prepare(`SELECT id, content FROM aosp_chunks WHERE ${conditions.join(" AND ")} ORDER BY id LIMIT ?`)
        .all(...bindings, batchLimit);
    if (batch.length === 0) {
        return { total_pending: totalPending, embedded: 0, remaining: totalPending };
    }
    // 3. Embed the batch
    const vectors = await embedTexts(batch.map((row) => row.content));
    if (vectors.length !== batch.length) {
        throw new Error(`embedTexts 返回 ${vectors.length},期望 ${batch.length}`);
    }
    // 4. Write the vectors back; every row of the batch gets the same timestamp
    const stampedAt = new Date().toISOString();
    const update = db.prepare("UPDATE aosp_chunks SET embedding = ?, embedding_updated_at = ? WHERE id = ?");
    runInTransaction(db, () => {
        batch.forEach((row, idx) => {
            update.run(vectorToBlob(vectors[idx]), stampedAt, row.id);
        });
    });
    invalidateAospIndex();
    return {
        total_pending: totalPending,
        embedded: batch.length,
        remaining: totalPending - batch.length,
    };
}