@kk-irving/knowledge-mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * knowledge-mcp-server v1.0.0
4
+ *
5
+ * 本地三源知识库(Zmind PR / Gerrit changes / Confluence pages):
6
+ * - SQLite + FTS5 全文索引
7
+ * - BGE-small-zh ONNX 嵌入(@xenova/transformers)+ Float32Array BLOB 列存向量
8
+ * - vector / fts / hybrid 三模式跨源融合检索
9
+ *
10
+ * 工具:
11
+ * 1. sync_zmind — 拉 Zmind issues 到本地(增量水位)
12
+ * 2. sync_gerrit — 拉 Gerrit changes 到本地(双通道认证)
13
+ * 3. sync_confluence — 拉 Confluence pages 到本地(cookie 认证)
14
+ * 4. embed_pending — 给未嵌入或已过期的行批量计算嵌入
15
+ * 5. search_local — 单源/跨源 vector|fts|hybrid 检索
16
+ * 6. get_indexed — 取本地索引完整记录
17
+ *
18
+ * 数据库:默认 ./data/knowledge.db(KNOWLEDGE_DB_PATH 可覆盖)
19
+ * 模型:默认 Xenova/bge-small-zh-v1.5(首次启动自动下载到 ./data/models/)
20
+ *
21
+ * 凭据复用:
22
+ * - Zmind: ZMIND_URL + ZMIND_API_KEY
23
+ * - Gerrit: GERRIT_URL + (GERRIT_AUTH_HEADER + GERRIT_COOKIE) 或 (GERRIT_USERNAME + GERRIT_HTTP_PASSWORD)
24
+ * - Confluence: CONFLUENCE_BASE_URL + CONFLUENCE_COOKIE
25
+ */
26
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
27
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
28
+ import { z } from "zod";
29
+ import { config } from "./config.js";
30
+ import { syncZmind } from "./sources/zmind-sync.js";
31
+ import { syncGerrit } from "./sources/gerrit-sync.js";
32
+ import { syncConfluence } from "./sources/confluence-sync.js";
33
+ import { embedPending } from "./embed-pending.js";
34
+ import { searchLocal, getIndexed } from "./search.js";
35
+ import { getDb } from "./db.js";
36
+ import { indexAospModule, clearAospIndex } from "./aosp/indexer.js";
37
+ import { searchAosp } from "./aosp/search.js";
38
+ import { embedAospPending } from "./aosp/embed-aosp.js";
39
+ import { listModulesOfPlatform, loadModuleMap } from "./aosp/module-map-loader.js";
40
+ import { analyzeIssue } from "./analyze-issue.js";
41
// The single MCP server instance; every tool below is registered on it.
const server = new McpServer({
    name: "knowledge-mcp-server",
    version: "1.0.0",
});
42
+ // =============================================================================
43
+ // 错误统一包装
44
+ // =============================================================================
45
/**
 * Serializes a value as pretty-printed (2-space) JSON text.
 *
 * `JSON.stringify` returns `undefined` (not a string) for `undefined`,
 * functions and symbols. Tool results are sent as text content, so those
 * values are rendered as the JSON literal "null" to guarantee a string.
 *
 * @param {unknown} value - Any JSON-serializable value.
 * @returns {string} The JSON text.
 * @throws {TypeError} If the value cannot be serialized (cycles, BigInt).
 */
function asJsonText(value) {
    return JSON.stringify(value, null, 2) ?? "null";
}
48
/**
 * Turns a zero-argument handler into an async callback that produces an MCP
 * tool result.
 *
 * On success the handler's (awaited) return value is serialized as JSON into
 * a single text content item. Any thrown value is converted into an
 * `isError` result carrying `{ error: <message> }` instead of propagating.
 *
 * @param {() => unknown} handler - Work to run; may be sync or async.
 * @returns {() => Promise<object>} Callback resolving to the tool result.
 */
function wrap(handler) {
    const asTextContent = (payload) => [{ type: "text", text: asJsonText(payload) }];
    return async () => {
        try {
            const outcome = await handler();
            // Serialization stays inside the try so that unserializable
            // results are reported as tool errors as well.
            return { content: asTextContent(outcome) };
        }
        catch (failure) {
            let message;
            if (failure instanceof Error) {
                message = failure.message;
            }
            else {
                message = String(failure);
            }
            return { isError: true, content: asTextContent({ error: message }) };
        }
    };
}
68
+ // =============================================================================
69
+ // 工具注册
70
+ // =============================================================================
71
// Tool: sync_zmind — pulls Zmind issues into the local database.
const syncZmindShape = {
    since: z.string().optional().describe("(可选) 仅同步 updated_on >= since 的 issues,YYYY-MM-DD"),
    limit: z.number().int().min(1).max(50000).default(1000).describe("最大同步条数"),
    statusId: z.string().optional().describe("(可选) 状态过滤;默认 '*' 全部"),
};
server.tool("sync_zmind", "增量同步 Zmind issues 到本地 SQLite。默认按 sync_state 的 last_full_sync 水位增量拉取,未配置时全量从最新开始按 updated_on 降序拉。", syncZmindShape, (input) => {
    const { since, limit, statusId } = input;
    const run = wrap(() => syncZmind({ since, limit, statusId }));
    return run();
});

// Tool: sync_gerrit — pulls Gerrit changes into the local database.
const syncGerritShape = {
    query: z.string().optional().describe("(可选) Gerrit 原生 query 表达式(拼接到 query 后)"),
    project: z.string().optional().describe("(可选) project 过滤"),
    since: z.string().optional().describe("(可选) after:\"YYYY-MM-DD\" 增量水位;未传则用 sync_state"),
    limit: z.number().int().min(1).max(50000).default(1000).describe("最大同步条数"),
};
server.tool("sync_gerrit", "增量同步 Gerrit changes 到本地 SQLite。使用 v1.1 双通道认证。可指定 query 与 project 缩小范围。", syncGerritShape, (input) => {
    const { query, project, since, limit } = input;
    const run = wrap(() => syncGerrit({ query, project, since, limit }));
    return run();
});

// Tool: sync_confluence — pulls Confluence pages into the local database.
const syncConfluenceShape = {
    space: z.string().optional().describe("(可选) 空间 key(CSV 多个,如 'TVENG,DOC');未传则全空间"),
    since: z.string().optional().describe("(可选) lastmodified > since(YYYY-MM-DD HH:MM)"),
    limit: z.number().int().min(1).max(50000).default(1000).describe("最大同步条数"),
};
server.tool("sync_confluence", "增量同步 Confluence pages 到本地 SQLite。space 不传时遍历所有 global 空间;HTML 自动转纯文本入库。", syncConfluenceShape, (input) => {
    const { space, since, limit } = input;
    const run = wrap(() => syncConfluence({ space, since, limit }));
    return run();
});

// Tool: embed_pending — computes embeddings for rows that lack a fresh one.
const embedPendingShape = {
    source: z.enum(["zmind", "gerrit", "confluence"]).describe("数据源"),
    batch_size: z.number().int().min(1).max(5000).default(200).describe("单次处理上限"),
};
server.tool("embed_pending", "批量为未嵌入或嵌入过期(embedding_updated_at < updated)的行计算向量并回写。完成后内存索引会失效,下次 search_local 自动重建。", embedPendingShape, (input) => {
    const { source, batch_size } = input;
    const run = wrap(() => embedPending({ source, batch_size }));
    return run();
});

// Tool: search_local — vector / fts / hybrid search over one or all sources.
const searchLocalShape = {
    query: z.string().min(1).describe("查询字符串"),
    source: z.enum(["zmind", "gerrit", "confluence", "all"]).default("all").describe("数据源;'all' 跨源"),
    mode: z.enum(["vector", "fts", "hybrid"]).default("hybrid").describe("检索模式"),
    limit: z.number().int().min(1).max(20).default(5).describe("每个源返回上限"),
};
server.tool("search_local", "本地索引混合检索。source='all' 跨三源并行返回 { zmind, gerrit, confluence } 各自 Top-K。mode='hybrid' 走向量+FTS5 合并去重,'vector' 纯语义,'fts' 纯关键词。返回每条命中的 source/id/title/url/snippet/score/match。", searchLocalShape, (input) => {
    const { query, source, mode, limit } = input;
    const run = wrap(() => searchLocal({ query, source, mode, limit }));
    return run();
});

// Tool: get_indexed — reads one full record from the local index.
const getIndexedShape = {
    source: z.enum(["zmind", "gerrit", "confluence"]).describe("数据源"),
    id: z.union([z.string(), z.number()]).describe("主键(zmind=数字 / gerrit=Change-Id / confluence=页面 id)"),
};
server.tool("get_indexed", "从本地索引读取单条记录的完整字段(不含嵌入向量)。", getIndexedShape, (input) => {
    const { source, id } = input;
    // getIndexed is synchronous; the async arrow keeps a promise-returning
    // handler, and a synchronous throw still ends up as a tool error.
    const run = wrap(async () => getIndexed({ source, id }));
    return run();
});
101
+ // =============================================================================
102
+ // P2: AOSP 模块级精搜
103
+ // =============================================================================
104
// Builds a fresh platform enum schema for each field that needs one.
const aospPlatformEnum = () => z.enum(["D4", "X5", "STB"]);

// Tool: list_aosp_modules — module names registered for one platform.
server.tool("list_aosp_modules", "列出 module-path-map 中某个平台已登记的全部模块名(来自 steering/module-path-map.md)。", {
    platform: aospPlatformEnum().describe("平台"),
}, (input) => {
    const { platform } = input;
    const run = wrap(async () => {
        const moduleMap = await loadModuleMap();
        const modules = listModulesOfPlatform(moduleMap, platform);
        return {
            platform,
            modules,
            source: moduleMap.source,
            generated_at: moduleMap.generated_at,
        };
    });
    return run();
});

// Tool: index_aosp_module — chunk-indexes the sources of one module.
server.tool("index_aosp_module", "为指定平台的指定模块建立 chunk 索引:递归扫描 module_path 下源码,按函数/类边界切块,写入 aosp_chunks 表。完成后再调 embed_aosp_pending 计算向量。", {
    platform: aospPlatformEnum().describe("平台"),
    module: z.string().min(1).describe("模块名(与 module-path-map 一致;用作过滤 key)"),
    module_path: z.string().min(1).describe("模块根路径(相对 repo_root 或绝对)"),
    repo_root: z.string().min(1).describe("AOSP 工作树根目录(绝对路径)"),
}, (input) => {
    const { platform, module, module_path, repo_root } = input;
    const run = wrap(() => indexAospModule({ platform, module, module_path, repo_root }));
    return run();
});

// Tool: embed_aosp_pending — embeds chunks that have no vector yet.
server.tool("embed_aosp_pending", "给 aosp_chunks 中未嵌入的行批量计算向量。批量大小默认 200;可按 platform / module 过滤分批跑大模块。", {
    batch_size: z.number().int().min(1).max(5000).default(200).describe("单次处理上限"),
    platform: aospPlatformEnum().optional().describe("(可选) 仅处理某平台"),
    module: z.string().optional().describe("(可选) 仅处理某模块"),
}, (input) => {
    const { batch_size, platform, module } = input;
    const run = wrap(() => embedAospPending({ batch_size, platform, module }));
    return run();
});

// Tool: search_aosp — vector / fts / hybrid search over indexed chunks.
server.tool("search_aosp", "在 aosp_chunks 上做 vector / fts / hybrid 检索,可按 platform + module 过滤搜索域(自动从 module-path-map 翻译为路径前缀)。返回命中文件的相对路径、行号、symbol、snippet、score。", {
    query: z.string().min(1).describe("查询字符串(支持中英文)"),
    platform: aospPlatformEnum().optional().describe("(可选) 平台过滤"),
    module: z.string().optional().describe("(可选) 模块名(需与 platform 同时给)"),
    module_path: z.string().optional().describe("(可选) 直接传路径前缀(绕过 module-map)"),
    mode: z.enum(["vector", "fts", "hybrid"]).default("hybrid").describe("检索模式"),
    limit: z.number().int().min(1).max(20).default(5).describe("返回上限"),
}, (input) => {
    const { query, platform, module, module_path, mode, limit } = input;
    const run = wrap(() => searchAosp({ query, platform, module, module_path, mode, limit }));
    return run();
});

// Tool: clear_aosp_index — deletes chunk rows by platform and/or module.
server.tool("clear_aosp_index", "按 platform / module 删除 aosp_chunks 行(用于源码大改后重建)。两个参数都不传则清空整张表。", {
    platform: aospPlatformEnum().optional(),
    module: z.string().optional(),
}, (input) => {
    const { platform, module } = input;
    const run = wrap(async () => clearAospIndex({ platform, module }));
    return run();
});
138
+ // =============================================================================
139
+ // P2: analyze_issue 端到端工作流
140
+ // =============================================================================
141
// Tool: analyze_issue — end-to-end issue analysis workflow.
const analyzeIssueShape = {
    issue_id: z.number().int().positive().describe("Zmind issue ID"),
    workspace_root: z.string().optional().describe("(可选) 工作目录根,最终路径 <root>/.workspace/issue-<id>/;默认 cwd"),
    include_aosp: z.boolean().default(false).describe("是否启用 AOSP 模块级精搜(需先 index_aosp_module + embed_aosp_pending)"),
    platform: z.enum(["D4", "X5", "STB"]).optional().describe("(可选) 强制指定平台;不传则从 issue/project 推断"),
    per_source_limit: z.number().int().min(1).max(10).default(3).describe("单源命中上限"),
};
server.tool("analyze_issue", "一键 PR/Bug 端到端分析:拉 Zmind issue → 准备工作目录 → 提取关键词 → 三源 hybrid 检索 → 推断平台/模块 → (可选) AOSP 模块级精搜 → 渲染 analysis-context.md 落盘。返回 JSON 汇总,best-effort 模式(任何子步骤失败都继续)。", analyzeIssueShape, (input) => {
    const { issue_id, workspace_root, include_aosp, platform, per_source_limit } = input;
    const run = wrap(() => analyzeIssue({ issue_id, workspace_root, include_aosp, platform, per_source_limit }));
    return run();
});
148
+ // =============================================================================
149
+ // 启动
150
+ // =============================================================================
151
/**
 * Entry point: connects the MCP server to a stdio transport, then opens the
 * database eagerly so the first tool call does not pay the setup cost.
 *
 * A database initialization failure is logged and does not stop the server.
 * All diagnostics go to stderr via console.error.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const transport = new StdioServerTransport();
    await server.connect(transport);
    // Warm up the DB ahead of the first tool call (best effort).
    try {
        getDb();
    }
    catch (e) {
        // Anything can be thrown; only read `.message` from real Errors so
        // the log never prints "undefined".
        const reason = e instanceof Error ? e.message : String(e);
        console.error(`[knowledge-mcp-server] DB 初始化失败: ${reason}`);
    }
    console.error(`[knowledge-mcp-server v1.0.0] started — db=${config.dbPath}, model=${config.embeddingModelId} (dim=${config.embeddingDim}, threads=${config.embeddingThreads})`);
}

// Start the server; a startup failure is fatal and exits with status 1.
main().catch((err) => {
    console.error("[knowledge-mcp-server] fatal error:", err);
    process.exit(1);
});
package/dist/search.js ADDED
@@ -0,0 +1,274 @@
1
+ /**
2
+ * 检索:vector / fts / hybrid 三模式 + 单源/跨源。
3
+ */
4
+ import { config, isSourceName } from "./config.js";
5
+ import { getDb } from "./db.js";
6
+ import { embedOne } from "./embedder.js";
7
+ import { vectorTopK } from "./index-store.js";
8
// Source names searched, in this order, when a search targets "all".
const ALL_SOURCES = [
    "zmind",
    "gerrit",
    "confluence",
];
9
/**
 * Runs an FTS5 keyword search against one source's full-text table.
 *
 * The query is passed through escapeFts5 first so user text is not parsed as
 * FTS5 query syntax. Rows are ordered by bm25() ascending.
 *
 * @param {string} source - "zmind" or "gerrit"; any other value is handled
 *   as "confluence".
 * @param {string} query - Raw user query.
 * @param {number} limit - Maximum number of rows to return.
 * @returns {Array<{key: *, bm25: number, meta: object}>} One entry per row,
 *   with the raw bm25 value and display metadata.
 */
function ftsSearch(source, query, limit) {
    const db = getDb();
    const safeQuery = escapeFts5(query);
    // Shared plumbing for the three per-source branches below.
    const fetchRows = (sql) => db.prepare(sql).all(safeQuery, limit);
    const excerpt = (body) => snippet(body ?? "", config.snippetMaxChars);

    switch (source) {
        case "zmind": {
            const found = fetchRows(`SELECT z.id AS _key,
                bm25(zmind_issues_fts) AS bm,
                z.subject AS title,
                z.description AS body,
                z.status, z.project_name, z.updated_on
                FROM zmind_issues_fts
                JOIN zmind_issues z ON z.id = zmind_issues_fts.rowid
                WHERE zmind_issues_fts MATCH ?
                ORDER BY bm
                LIMIT ?`);
            return found.map((row) => {
                const meta = {
                    source: "zmind",
                    id: row._key,
                    title: row.title ?? "",
                    url: zmindUrl(row._key),
                    snippet: excerpt(row.body),
                    status: row.status ?? "",
                    project: row.project_name ?? "",
                    updated: row.updated_on ?? "",
                };
                return { key: row._key, bm25: row.bm, meta };
            });
        }
        case "gerrit": {
            const found = fetchRows(`SELECT g.change_id AS _key,
                bm25(gerrit_changes_fts) AS bm,
                g.subject AS title,
                g.commit_message AS body,
                g.project, g.number, g.status, g.updated
                FROM gerrit_changes_fts
                JOIN gerrit_changes g ON g.rowid = gerrit_changes_fts.rowid
                WHERE gerrit_changes_fts MATCH ?
                ORDER BY bm
                LIMIT ?`);
            return found.map((row) => {
                const meta = {
                    source: "gerrit",
                    id: row._key,
                    title: row.title ?? "",
                    url: gerritUrl(row.project, row.number),
                    snippet: excerpt(row.body),
                    status: row.status ?? "",
                    project: row.project ?? "",
                    updated: row.updated ?? "",
                    extra: { number: row.number },
                };
                return { key: row._key, bm25: row.bm, meta };
            });
        }
        default: {
            // Everything else is treated as confluence.
            const found = fetchRows(`SELECT c.id AS _key,
                bm25(confluence_pages_fts) AS bm,
                c.title,
                c.body_text AS body,
                c.space_key, c.webui, c.version, c.updated
                FROM confluence_pages_fts
                JOIN confluence_pages c ON c.rowid = confluence_pages_fts.rowid
                WHERE confluence_pages_fts MATCH ?
                ORDER BY bm
                LIMIT ?`);
            return found.map((row) => {
                const meta = {
                    source: "confluence",
                    id: row._key,
                    title: row.title ?? "",
                    url: confluenceUrl(row.webui),
                    snippet: excerpt(row.body),
                    space: row.space_key ?? "",
                    updated: row.updated ?? "",
                    extra: { version: row.version },
                };
                return { key: row._key, bm25: row.bm, meta };
            });
        }
    }
}
98
+ // =============================================================================
99
+ // Hybrid 合并
100
+ // =============================================================================
101
/**
 * Min-max normalizes a list of bm25 values onto [0, 1], inverted so that the
 * smallest value (the best match under FTS5's "lower is better" convention)
 * maps to 1 and the largest maps to 0. When all values are equal, each maps
 * to 1.
 *
 * @param {number[]} bm25Values - Raw bm25 values.
 * @returns {Map<number, number>} Raw bm25 value -> normalized score. Empty
 *   when the input is empty.
 */
function normalizeFtsScore(bm25Values) {
    const normalized = new Map();
    if (bm25Values.length === 0) {
        return normalized;
    }
    const best = Math.min(...bm25Values);
    const worst = Math.max(...bm25Values);
    // A zero-width range would divide by zero; fall back to 1.
    const range = (worst - best) || 1;
    for (const raw of bm25Values) {
        // Linear map: best -> 1, worst -> 0.
        normalized.set(raw, 1 - (raw - best) / range);
    }
    return normalized;
}
116
/**
 * Searches a single source in "vector", "fts" or "hybrid" mode.
 *
 * - vector: embeds the query and returns the nearest hits with their score.
 * - fts: keyword search; bm25 values are normalized via normalizeFtsScore.
 * - hybrid: runs both stages (each over-fetching `limit * 2`), merges by
 *   record id, keeps the higher of the two scores for records found by both
 *   (marked "both"), sorts by score descending and truncates to `limit`.
 *
 * In hybrid mode a failing stage does not fail the search: the error is
 * logged to stderr and the other stage's results are used. In a single-stage
 * mode the error is rethrown.
 *
 * @param {string} source - Source name.
 * @param {string} query - User query; a blank query yields no hits.
 * @param {"vector"|"fts"|"hybrid"} mode - Retrieval mode.
 * @param {number} limit - Maximum number of hits to return.
 * @returns {Promise<object[]>} Hit metadata plus `score` and `match`.
 */
async function searchOneSource(source, query, mode, limit) {
    if (!query.trim())
        return [];
    const describeError = (e) => (e instanceof Error ? e.message : String(e));
    let vectorHits = [];
    let ftsHits = [];
    if (mode === "vector" || mode === "hybrid") {
        try {
            const vec = await embedOne(query);
            vectorHits = vectorTopK(source, vec, limit * 2);
        }
        catch (e) {
            if (mode === "vector")
                throw e;
            // Hybrid degrades to FTS-only, but the reason must stay visible.
            console.error(`[knowledge-mcp-server] hybrid search on ${source}: vector stage failed, using FTS only: ${describeError(e)}`);
        }
    }
    if (mode === "fts" || mode === "hybrid") {
        try {
            ftsHits = ftsSearch(source, query, limit * 2);
        }
        catch (e) {
            if (mode === "fts")
                throw e;
            // Hybrid degrades to vector-only, but the reason must stay visible.
            console.error(`[knowledge-mcp-server] hybrid search on ${source}: FTS stage failed, using vector only: ${describeError(e)}`);
        }
    }
    if (mode === "vector") {
        return vectorHits
            .slice(0, limit)
            .map((h) => ({ ...h.meta, score: h.score, match: "vector" }));
    }
    if (mode === "fts") {
        const norm = normalizeFtsScore(ftsHits.map((h) => h.bm25));
        return ftsHits
            .slice(0, limit)
            .map((h) => ({
            ...h.meta,
            score: norm.get(h.bm25) ?? 0,
            match: "fts",
        }));
    }
    // Hybrid: merge both result lists, de-duplicating by record id + source.
    const merged = new Map();
    for (const h of vectorHits) {
        // NOTE(review): the vector hit shape comes from index-store and is not
        // visible here. `h.id` is kept as the primary key source; `meta.id` is
        // a fallback so hits never collapse onto one "undefined" key.
        const key = String(h.id ?? h.meta.id) + "::" + h.meta.source;
        merged.set(key, { ...h.meta, score: h.score, match: "vector" });
    }
    const norm = normalizeFtsScore(ftsHits.map((h) => h.bm25));
    for (const h of ftsHits) {
        const key = String(h.key) + "::" + h.meta.source;
        const existed = merged.get(key);
        const ftsScore = norm.get(h.bm25) ?? 0;
        if (existed) {
            // Found by both stages: keep the higher score, mark as "both".
            const finalScore = Math.max(existed.score, ftsScore);
            merged.set(key, { ...h.meta, score: finalScore, match: "both" });
        }
        else {
            merged.set(key, { ...h.meta, score: ftsScore, match: "fts" });
        }
    }
    return Array.from(merged.values())
        .sort((a, b) => b.score - a.score)
        .slice(0, limit);
}
180
+ // =============================================================================
181
+ // 公共入口
182
+ // =============================================================================
183
/**
 * Public search entry point.
 *
 * With source "all" (the default) every known source is searched
 * concurrently and the result holds one hit array per source; a source that
 * fails contributes an empty array plus a `<source>_error` message instead
 * of failing the whole call. With a specific source the hits are returned
 * under `hits`.
 *
 * @param {{query: string, source?: string, mode?: string, limit?: number}} args
 *   `limit` is clamped to [1, config.searchMaxLimit] and defaults to
 *   config.searchDefaultLimit; `mode` defaults to "hybrid".
 * @returns {Promise<object>} Search results.
 * @throws {Error} When `source` is neither "all" nor a known source name.
 */
export async function searchLocal(args) {
    const requestedLimit = args.limit ?? config.searchDefaultLimit;
    const limit = Math.max(1, Math.min(config.searchMaxLimit, requestedLimit));
    const mode = args.mode ?? "hybrid";
    const source = args.source ?? "all";

    if (source !== "all") {
        if (!isSourceName(source)) {
            throw new Error(`unknown source: ${source}`);
        }
        const hits = await searchOneSource(source, args.query, mode, limit);
        return { source, query: args.query, mode, hits };
    }

    // Fan out across all sources; each task records its own outcome.
    const perSource = {};
    const tasks = ALL_SOURCES.map(async (name) => {
        try {
            perSource[name] = await searchOneSource(name, args.query, mode, limit);
        }
        catch (err) {
            perSource[name] = [];
            perSource[name + "_error"] = err.message;
        }
    });
    await Promise.all(tasks);
    return { source: "all", query: args.query, mode, ...perSource };
}
206
+ // =============================================================================
207
+ // FTS5 query 转义
208
+ // =============================================================================
209
/**
 * Converts free-form user text into a safe FTS5 MATCH expression.
 *
 * Quote characters are removed, the text is split on whitespace, and every
 * token is wrapped in double quotes so FTS5 treats it as a string/phrase
 * rather than as query syntax. Tokens are joined with spaces, which FTS5
 * interprets as an implicit AND between the phrases.
 *
 * When nothing is left after stripping (empty, blank, or quotes only) an
 * empty phrase `""` is returned rather than the raw input, so unescaped
 * characters are never handed to MATCH.
 *
 * @param {string} q - Raw user query.
 * @returns {string} FTS5 query expression.
 */
function escapeFts5(q) {
    const tokens = q
        .replace(/["']/g, " ")
        .split(/\s+/)
        .filter((t) => t.length > 0);
    if (tokens.length === 0)
        return '""';
    return tokens.map((t) => `"${t}"`).join(" ");
}
220
/**
 * Loads one full record from the local index, without its `embedding`
 * column.
 *
 * @param {{source: string, id: string|number}} args - "zmind" looks up by
 *   `id` (a string id is parsed as a base-10 integer), "gerrit" by
 *   `change_id`; any other source is looked up in confluence_pages by `id`.
 * @returns {{source: string, id: string|number, data: object}|null} The
 *   record, or null when no row matches.
 */
export function getIndexed(args) {
    const db = getDb();
    let kind;
    let sql;
    let toLookupKey;
    switch (args.source) {
        case "zmind":
            kind = "zmind";
            sql = "SELECT * FROM zmind_issues WHERE id = ?";
            toLookupKey = (id) => (typeof id === "string" ? parseInt(id, 10) : id);
            break;
        case "gerrit":
            kind = "gerrit";
            sql = "SELECT * FROM gerrit_changes WHERE change_id = ?";
            toLookupKey = (id) => String(id);
            break;
        default:
            kind = "confluence";
            sql = "SELECT * FROM confluence_pages WHERE id = ?";
            toLookupKey = (id) => String(id);
            break;
    }
    const row = db.prepare(sql).get(toLookupKey(args.id));
    if (!row) {
        return null;
    }
    // The embedding column is never returned to callers.
    delete row.embedding;
    return { source: kind, id: args.id, data: row };
}
248
+ // =============================================================================
249
+ // 辅助函数
250
+ // =============================================================================
251
/**
 * Produces a single-line preview of `text`, at most `n` UTF-16 code units
 * long plus a trailing ellipsis when truncated.
 *
 * Whitespace runs are collapsed to one space and the ends are trimmed. The
 * cut never lands between the two halves of a surrogate pair (e.g. an
 * emoji), so the result never ends in a lone surrogate.
 *
 * @param {string} text - Source text.
 * @param {number} n - Maximum length before the ellipsis.
 * @returns {string} The preview.
 */
function snippet(text, n) {
    const clean = text.replace(/\s+/g, " ").trim();
    if (clean.length <= n)
        return clean;
    let end = n;
    if (end > 0) {
        const last = clean.charCodeAt(end - 1);
        // A high surrogate at the cut point means its low half would be
        // dropped; step back one unit so the whole character is dropped.
        if (last >= 0xd800 && last <= 0xdbff)
            end -= 1;
    }
    return clean.slice(0, end).trimEnd() + "…";
}
257
/**
 * Builds the web URL of a Zmind issue.
 *
 * The base comes from the ZMIND_URL environment variable (trailing slashes
 * removed), falling back to the built-in default host when it is unset.
 *
 * @param {number|string} id - Issue id.
 * @returns {string} Issue URL.
 */
function zmindUrl(id) {
    const configured = process.env.ZMIND_URL ?? "https://zmind.whaletv.com";
    const root = configured.replace(/\/+$/, "");
    return `${root}/issues/${id}`;
}
261
/**
 * Builds the web URL of a Gerrit change.
 *
 * @param {string} project - Gerrit project name.
 * @param {number|string} num - Change number.
 * @returns {string} `<GERRIT_URL>/c/<project>/+/<num>`, or "" when
 *   GERRIT_URL is not configured or `num` is falsy.
 */
function gerritUrl(project, num) {
    const root = (process.env.GERRIT_URL ?? "").replace(/\/+$/, "");
    const canLink = Boolean(root) && Boolean(num);
    return canLink ? `${root}/c/${project}/+/${num}` : "";
}
267
/**
 * Resolves a Confluence `webui` link to a full URL.
 *
 * Absolute http(s) links are returned as-is. Other values are prefixed with
 * CONFLUENCE_BASE_URL (trailing slashes removed) when it is configured, and
 * returned unchanged otherwise.
 *
 * @param {string} webui - Link value stored for the page.
 * @returns {string} Resolved URL, or "" for a falsy input.
 */
function confluenceUrl(webui) {
    if (!webui) {
        return "";
    }
    const isAbsolute = /^https?:\/\//i.test(webui);
    if (isAbsolute) {
        return webui;
    }
    const root = (process.env.CONFLUENCE_BASE_URL ?? "").replace(/\/+$/, "");
    if (!root) {
        return webui;
    }
    return root + webui;
}
@@ -0,0 +1,180 @@
1
+ /**
2
+ * Confluence 同步:拉 pages 写入本地 confluence_pages 表。
3
+ *
4
+ * 使用 cookie 认证(CONFLUENCE_BASE_URL + CONFLUENCE_COOKIE)。
5
+ * 走 GET /rest/api/content?spaceKey=...&type=page 分页拉取所有页面,HTML 转纯文本入库。
6
+ */
7
+ import { getDb, getSyncState, setSyncState, runInTransaction } from "../db.js";
8
// Base URL of the Confluence instance, without trailing slashes ("" if unset).
const CONFLUENCE_BASE_URL = (process.env.CONFLUENCE_BASE_URL ?? "").replace(/\/+$/, "");
// Value sent as the Cookie request header ("" if unset).
const CONFLUENCE_COOKIE = (process.env.CONFLUENCE_COOKIE ?? "").trim();
// Pause between consecutive requests, in milliseconds. Only a plain
// non-negative integer is accepted; anything else falls back to 150.
const CONFLUENCE_REQUEST_DELAY_MS = (() => {
    const configured = (process.env.CONFLUENCE_REQUEST_DELAY_MS ?? "").trim();
    const isPlainInteger = /^\d+$/.test(configured);
    return isPlainInteger ? parseInt(configured, 10) : 150;
})();
16
/**
 * Performs an authenticated GET against the Confluence REST API and returns
 * the parsed JSON body.
 *
 * Redirects are not followed: a 302/303 is reported as an expired cookie.
 *
 * @param {string} pathOrUrl - Path or absolute URL, resolved against
 *   CONFLUENCE_BASE_URL.
 * @param {Object<string, *>} [params={}] - Query parameters; each value is
 *   stringified.
 * @returns {Promise<*>} Parsed JSON response.
 * @throws {Error} When credentials are missing, the server redirects, the
 *   status is not 2xx, or the body is not valid JSON.
 */
async function confluenceGet(pathOrUrl, params = {}) {
    if (!CONFLUENCE_BASE_URL || !CONFLUENCE_COOKIE) {
        throw new Error("缺少 Confluence 凭据:请配置 CONFLUENCE_BASE_URL + CONFLUENCE_COOKIE。运行 scripts/refresh-auth.* 自动生成。");
    }
    const url = new URL(pathOrUrl, CONFLUENCE_BASE_URL);
    for (const [k, v] of Object.entries(params))
        url.searchParams.set(k, String(v));
    const res = await fetch(url.toString(), {
        method: "GET",
        headers: {
            Accept: "application/json",
            Cookie: CONFLUENCE_COOKIE,
        },
        // Keep redirects visible so a bounce to the login page is detected
        // instead of being followed.
        redirect: "manual",
    });
    if (res.status === 302 || res.status === 303) {
        throw new Error("Confluence 302 → /login.action:cookie 已过期。运行 scripts/refresh-auth.* 重新抓取。");
    }
    const text = await res.text();
    if (!res.ok) {
        throw new Error(`Confluence HTTP ${res.status}: ${text.slice(0, 300)}`);
    }
    try {
        return JSON.parse(text);
    }
    catch (cause) {
        // A 2xx response with a non-JSON body (e.g. an HTML page) would
        // otherwise surface as a bare SyntaxError with no context.
        throw new Error(`Confluence HTTP ${res.status}: response is not valid JSON: ${text.slice(0, 300)}`, { cause });
    }
}
40
// Named character references decoded by stripHtml (looked up lower-cased).
const HTML_NAMED_ENTITIES = new Map([
    ["nbsp", " "],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
]);

/**
 * Converts an HTML fragment to plain text.
 *
 * Script/style elements are removed with their content, remaining tags are
 * replaced by spaces, character references are decoded, and whitespace is
 * collapsed.
 *
 * Entities are decoded in a single pass so that already-escaped text is not
 * decoded twice (`&amp;lt;` becomes the literal text `&lt;`, not `<`).
 * Decimal and hexadecimal numeric references are decoded as well; unknown
 * named references are left untouched.
 *
 * @param {string|null|undefined} html - HTML source.
 * @returns {string} Plain text; "" for a falsy input.
 */
function stripHtml(html) {
    if (!html)
        return "";
    let t = html.replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/gi, " ");
    t = t.replace(/<[^>]+>/g, " ");
    t = t.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (match, body) => {
        if (body[0] !== "#") {
            return HTML_NAMED_ENTITIES.get(body.toLowerCase()) ?? match;
        }
        const isHex = body[1] === "x" || body[1] === "X";
        const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
        // Out-of-range references are kept verbatim instead of throwing.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    });
    return t.replace(/\s+/g, " ").trim();
}
55
// Insert-or-update statement for confluence_pages, keyed by `id`. It uses
// named parameters (@id, @space_key, ...); on a conflict every other column
// is overwritten with the incoming value.
const UPSERT_SQL = `
INSERT INTO confluence_pages (
  id, space_key, title, body_text, version, webui, created, updated
) VALUES (
  @id, @space_key, @title, @body_text, @version, @webui, @created, @updated
)
ON CONFLICT(id) DO UPDATE SET
  space_key = excluded.space_key,
  title = excluded.title,
  body_text = excluded.body_text,
  version = excluded.version,
  webui = excluded.webui,
  created = excluded.created,
  updated = excluded.updated
`;
70
/**
 * Collects the keys of all "global" spaces by paging through
 * /rest/api/space, 100 entries per request.
 *
 * Paging stops at the first short page, or after 50 requests. Requests are
 * spaced by CONFLUENCE_REQUEST_DELAY_MS.
 *
 * @returns {Promise<string[]>} Space keys in the order returned.
 */
async function listAllSpaces() {
    const perPage = 100;
    const maxRequests = 50;
    const keys = [];
    let offset = 0;
    let requestCount = 0;
    while (requestCount < maxRequests) {
        const page = await confluenceGet("/rest/api/space", {
            type: "global",
            start: offset,
            limit: perPage,
        });
        const batch = page.results ?? [];
        // Entries without a key are skipped.
        keys.push(...batch.filter((space) => space.key).map((space) => space.key));
        if (batch.length < perPage) {
            break;
        }
        offset += batch.length;
        await sleep(CONFLUENCE_REQUEST_DELAY_MS);
        requestCount += 1;
    }
    return keys;
}
90
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
93
/**
 * Escapes a value for use inside a double-quoted CQL string literal.
 * NOTE(review): assumes CQL uses backslash escaping for `"` and `\` —
 * confirm against the target Confluence version.
 */
function escapeCqlString(value) {
    return String(value).replace(/(["\\])/g, "\\$1");
}

/**
 * Converts an ISO-8601 timestamp (the format of the stored watermark) into
 * the "yyyy-MM-dd HH:mm" form that CQL date fields accept. Values that are
 * not ISO timestamps (e.g. a caller-supplied "YYYY-MM-DD HH:MM") are
 * returned unchanged.
 * NOTE(review): the result is expressed in this process's local time zone;
 * verify it matches the zone Confluence uses to evaluate CQL dates.
 */
function toCqlDate(value) {
    const text = String(value);
    if (!/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}/.test(text))
        return text;
    const d = new Date(text);
    if (Number.isNaN(d.getTime()))
        return text;
    const pad = (x) => String(x).padStart(2, "0");
    return `${d.getFullYear()}-${pad(d.getMonth() + 1)}-${pad(d.getDate())} ${pad(d.getHours())}:${pad(d.getMinutes())}`;
}

/**
 * Full or incremental sync of Confluence pages into confluence_pages.
 *
 * When a lower bound is known (args.since, or the stored "last_full_sync"
 * state) pages are fetched through a CQL search restricted to
 * `lastmodified > since`; otherwise every page of each space is listed.
 * Page bodies are converted to plain text before being upserted. After the
 * run the "last_full_sync" state is set to the time the run started.
 *
 * @param {object} [args]
 * @param {string} [args.space] Only sync these spaces (CSV for several,
 *   e.g. "TVENG,DOC"); all global spaces when omitted.
 * @param {number} [args.limit] Maximum number of pages to sync across all
 *   spaces, clamped to [1, 50000]. Defaults to 1000.
 * @param {string} [args.since] Only sync pages with lastmodified > since
 *   (YYYY-MM-DD HH:MM).
 * @returns {Promise<{source: string, fetched: number, upserted: number,
 *   spaces: number, watermark: string}>} Run summary.
 */
export async function syncConfluence(args = {}) {
    const db = getDb();
    const limit = Math.max(1, Math.min(50000, args.limit ?? 1000));
    const stateSince = args.since ?? getSyncState("confluence", "last_full_sync") ?? "";
    // The stored watermark is ISO-8601, which CQL does not accept as a date
    // literal; normalize it before it goes into the query.
    const cqlSince = stateSince ? escapeCqlString(toCqlDate(stateSince)) : "";
    // Taken before fetching, so pages modified while this run is in progress
    // are picked up by the next incremental run.
    const watermark = new Date().toISOString();
    // Decide which spaces to sync.
    let spaces;
    if (args.space) {
        spaces = args.space.split(",").map((s) => s.trim()).filter((s) => s.length > 0);
    }
    else {
        spaces = await listAllSpaces();
    }
    const upsert = db.prepare(UPSERT_SQL);
    function upsertMany(rows) {
        runInTransaction(db, () => {
            for (const r of rows)
                upsert.run(r);
        });
    }
    // `history` must be expanded explicitly; without it createdDate is absent
    // from the response and `created` would always be stored empty.
    const expand = "body.storage,version,space,history";
    let fetched = 0;
    let upserted = 0;
    const pageSize = 100;
    for (const spaceKey of spaces) {
        let start = 0;
        while (fetched < limit) {
            const remaining = limit - fetched;
            const n = Math.min(pageSize, remaining);
            let resp;
            if (stateSince) {
                // Incremental: CQL search bounded by lastmodified.
                const cql = `space.key = "${escapeCqlString(spaceKey)}" AND type = page AND lastmodified > "${cqlSince}"`;
                resp = await confluenceGet("/rest/api/content/search", {
                    cql,
                    start,
                    limit: n,
                    expand,
                });
            }
            else {
                resp = await confluenceGet("/rest/api/content", {
                    spaceKey,
                    type: "page",
                    start,
                    limit: n,
                    expand,
                });
            }
            const items = resp.results ?? [];
            if (items.length === 0)
                break;
            const rows = items.map((p) => ({
                id: String(p.id),
                space_key: p.space?.key ?? spaceKey,
                title: p.title ?? "",
                body_text: stripHtml(p.body?.storage?.value),
                version: p.version?.number ?? 0,
                webui: p._links?.webui ?? "",
                created: p.history?.createdDate ?? "",
                updated: p.version?.when ?? "",
            }));
            upsertMany(rows);
            fetched += items.length;
            upserted += items.length;
            start += items.length;
            // A short page means this space is exhausted.
            if (items.length < n)
                break;
            await sleep(CONFLUENCE_REQUEST_DELAY_MS);
        }
        if (fetched >= limit)
            break;
    }
    setSyncState("confluence", "last_full_sync", watermark);
    return {
        source: "confluence",
        fetched,
        upserted,
        spaces: spaces.length,
        watermark,
    };
}