agent-wiki 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -5
- package/dist/index.js +18 -4
- package/dist/tools/feedback.d.ts +20 -0
- package/dist/tools/feedback.js +118 -0
- package/dist/tools/ingest.d.ts +4 -3
- package/dist/tools/ingest.js +216 -25
- package/dist/tools/search.js +4 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -6,24 +6,90 @@
|
|
|
6
6
|
|
|
7
7
|
## 安装
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
所有主流 Agent 通用,底层都是同一个 `npx agent-wiki`。装完重启 Agent 就行。
|
|
10
|
+
|
|
11
|
+
### Claude Code
|
|
12
|
+
|
|
10
13
|
```bash
|
|
11
14
|
claude mcp add agent-wiki -- npx -y agent-wiki
|
|
12
15
|
```
|
|
13
16
|
|
|
14
|
-
|
|
17
|
+
### Cursor
|
|
18
|
+
|
|
19
|
+
Settings → MCP → Add new MCP Server:
|
|
20
|
+
|
|
21
|
+
```json
|
|
22
|
+
{
|
|
23
|
+
"mcpServers": {
|
|
24
|
+
"agent-wiki": {
|
|
25
|
+
"command": "npx",
|
|
26
|
+
"args": ["-y", "agent-wiki"]
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Trae
|
|
33
|
+
|
|
34
|
+
Settings → MCP → 添加服务器(或从 MCP Marketplace 搜索安装):
|
|
35
|
+
|
|
36
|
+
```json
|
|
37
|
+
{
|
|
38
|
+
"mcpServers": {
|
|
39
|
+
"agent-wiki": {
|
|
40
|
+
"command": "npx",
|
|
41
|
+
"args": ["-y", "agent-wiki"]
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Qoder
|
|
48
|
+
|
|
49
|
+
MCP Settings → Connect your own MCP server(或从 MCP Square 搜索安装):
|
|
50
|
+
|
|
15
51
|
```json
|
|
16
52
|
{
|
|
17
53
|
"mcpServers": {
|
|
18
54
|
"agent-wiki": {
|
|
19
55
|
"command": "npx",
|
|
20
|
-
"args": ["agent-wiki"]
|
|
56
|
+
"args": ["-y", "agent-wiki"]
|
|
21
57
|
}
|
|
22
58
|
}
|
|
23
59
|
}
|
|
24
60
|
```
|
|
25
61
|
|
|
26
|
-
|
|
62
|
+
### Windsurf
|
|
63
|
+
|
|
64
|
+
Plugins → Browse MCP Servers → Install,或编辑 `mcp_config.json`:
|
|
65
|
+
|
|
66
|
+
```json
|
|
67
|
+
{
|
|
68
|
+
"mcpServers": {
|
|
69
|
+
"agent-wiki": {
|
|
70
|
+
"command": "npx",
|
|
71
|
+
"args": ["-y", "agent-wiki"]
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### OpenClaw
|
|
78
|
+
|
|
79
|
+
编辑 `~/.openclaw/workspace/_workspace/config/mcporter.json`:
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
{
|
|
83
|
+
"mcpServers": {
|
|
84
|
+
"agent-wiki": {
|
|
85
|
+
"command": "npx",
|
|
86
|
+
"args": ["-y", "agent-wiki"],
|
|
87
|
+
"env": {},
|
|
88
|
+
"disabled": false
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
```
|
|
27
93
|
|
|
28
94
|
## 首次使用
|
|
29
95
|
|
|
@@ -50,15 +116,28 @@ Agent:你好!我是 agent-wiki 知识库助手。
|
|
|
50
116
|
你:整理一下知识库
|
|
51
117
|
你:搜一下 RAG 相关的内容
|
|
52
118
|
你:帮我写一篇 Agent 工程化的综述
|
|
119
|
+
你:这段分析不错,存到知识库吧
|
|
53
120
|
你:检查一下知识库健不健康
|
|
54
121
|
```
|
|
55
122
|
|
|
123
|
+
## 支持的文件格式
|
|
124
|
+
|
|
125
|
+
| 格式 | 处理方式 |
|
|
126
|
+
|------|---------|
|
|
127
|
+
| `.md` | 直接摄入 |
|
|
128
|
+
| `.txt` | 自动转为 .md |
|
|
129
|
+
| `.html` | 去标签提取正文,转 .md |
|
|
130
|
+
| `.csv` | 转为 Markdown 表格 |
|
|
131
|
+
| `.pdf` / `.docx` / `.xlsx` | 原样存储,Agent 多模态读取 |
|
|
132
|
+
| 图片 | 存储到 raw/assets/ |
|
|
133
|
+
|
|
56
134
|
## 知识库结构
|
|
57
135
|
|
|
58
136
|
```
|
|
59
137
|
你的知识库目录/
|
|
60
138
|
├── SCHEMA.md # 知识库说明书(Agent 自动读取)
|
|
61
139
|
├── raw/ # 原始文章(不可变)
|
|
140
|
+
│ └── assets/ # 图片等资源
|
|
62
141
|
├── wiki/ # AI 编译后的维基页面
|
|
63
142
|
│ ├── INDEX.md # 分类索引
|
|
64
143
|
│ ├── LOG.md # 操作日志
|
|
@@ -72,12 +151,13 @@ Agent:你好!我是 agent-wiki 知识库助手。
|
|
|
72
151
|
|------|------|
|
|
73
152
|
| wiki_status | 查询知识库状态,首次使用自动引导 |
|
|
74
153
|
| wiki_init | 初始化知识库(创建目录 + SCHEMA.md) |
|
|
75
|
-
| wiki_ingest | 摄入文章到 raw/ |
|
|
154
|
+
| wiki_ingest | 摄入文章到 raw/(支持 md/txt/html/csv/pdf/docx/xlsx/图片) |
|
|
76
155
|
| wiki_tag | 写入标签到 frontmatter |
|
|
77
156
|
| wiki_compile | 生成 Wiki 页面 + 更新索引 + 记录日志 |
|
|
78
157
|
| wiki_search | 标签 + 全文搜索 |
|
|
79
158
|
| wiki_article | 保存原创文章到 outputs/ |
|
|
80
159
|
| wiki_lint | 自检死链接、缺摘要等 |
|
|
160
|
+
| wiki_feedback | 保存问答分析到知识库(用户确认后) |
|
|
81
161
|
|
|
82
162
|
## 写文章风格
|
|
83
163
|
|
|
@@ -86,6 +166,10 @@ Agent:你好!我是 agent-wiki 知识库助手。
|
|
|
86
166
|
- 有自己的观点和态度
|
|
87
167
|
- 适合直接发微信公众号
|
|
88
168
|
|
|
169
|
+
## 知识自进化
|
|
170
|
+
|
|
171
|
+
每次问答中的综合分析,用户确认后会自动存入知识库。问得越多,知识库越厚。存入的内容会标记为 `query-derived`,与原始材料编译的内容区分开。
|
|
172
|
+
|
|
89
173
|
## 技术栈
|
|
90
174
|
|
|
91
175
|
- TypeScript + @modelcontextprotocol/sdk
|
package/dist/index.js
CHANGED
|
@@ -20,6 +20,7 @@ import { handleCompile } from "./tools/compile.js";
|
|
|
20
20
|
import { handleSearch } from "./tools/search.js";
|
|
21
21
|
import { handleArticle } from "./tools/article.js";
|
|
22
22
|
import { handleLint } from "./tools/lint.js";
|
|
23
|
+
import { handleFeedback } from "./tools/feedback.js";
|
|
23
24
|
const server = new McpServer({
|
|
24
25
|
name: "agent-wiki",
|
|
25
26
|
version: "0.1.0",
|
|
@@ -42,11 +43,11 @@ server.tool("wiki_init", "初始化知识库。创建目录结构和 SCHEMA.md
|
|
|
42
43
|
interests: z.array(z.string()).describe("兴趣方向列表,如 ['LLM', '创业', 'RAG']"),
|
|
43
44
|
}, async (params) => handleInit(params));
|
|
44
45
|
/**
|
|
45
|
-
* wiki_ingest —
|
|
46
|
-
*
|
|
47
|
-
*
|
|
46
|
+
* wiki_ingest — 摄入文件到知识库(支持多格式)
|
|
47
|
+
* md/txt/html/csv 零依赖转换为 md
|
|
48
|
+
* pdf/docx/xlsx/图片原样存储,Agent 多模态读取
|
|
48
49
|
*/
|
|
49
|
-
server.tool("wiki_ingest", "
|
|
50
|
+
server.tool("wiki_ingest", "摄入文件到知识库。支持 md/txt/html/csv(自动转 markdown)、pdf/docx/xlsx/图片(原样存储)。从指定路径复制到 raw/ 目录(增量,已摄入的跳过)。返回新文件内容预览,供你用 LLM 提取标签和摘要。", {
|
|
50
51
|
source: z.string().describe("源文件路径或目录路径"),
|
|
51
52
|
force: z.boolean().optional().describe("强制重新摄入已存在的文件,默认 false"),
|
|
52
53
|
}, async (params) => handleIngest(params));
|
|
@@ -99,6 +100,19 @@ server.tool("wiki_article", "保存原创文章到 outputs/ 目录。保存前
|
|
|
99
100
|
server.tool("wiki_lint", "自检知识库健康状态。检测死链接、缺失摘要、内容过短、缺少标签等问题,返回问题列表。", {
|
|
100
101
|
check_only: z.boolean().optional().describe("仅检测不修复,默认 true"),
|
|
101
102
|
}, async (params) => handleLint(params));
|
|
103
|
+
/**
 * wiki_feedback — save a Q&A analysis into the knowledge base (user-confirmed variant).
 *
 * Intended flow: the user asks a question → the agent answers → the answer has
 * synthesis value → the user says "save" → the agent calls this tool.
 * Saved pages are marked `type: query-derived` so they can be told apart from
 * pages compiled from raw material.
 * Guard against compounding hallucinations: nothing is stored automatically;
 * the user must confirm first.
 */
server.tool("wiki_feedback", "保存问答分析到知识库。仅当用户明确说'存'或'保存'时才调用。将问答中的综合分析、对比、洞察保存为新的 Wiki 页面,标记为 query-derived 类型(区分原始材料编译 vs 问答生成)。", {
    // The user's original question.
    question: z.string().describe("用户的原始问题"),
    // The agent's answer (the part that carries the valuable synthesis).
    answer: z.string().describe("Agent 的回答内容(有价值综合分析部分)"),
    // Wiki pages the answer drew on, given as a list of file names.
    sources: z.array(z.string()).describe("参考了哪些 Wiki 页面(文件名列表)"),
    // Tag list for the new page.
    tags: z.array(z.string()).describe("标签列表,如 ['RAG', '对比分析']"),
    // Category name under which the page is indexed.
    category: z.string().describe("分类名,如 'AI工程化'"),
}, async (params) => handleFeedback(params));
|
|
102
116
|
// ─── 启动 MCP Server ───
|
|
103
117
|
async function main() {
|
|
104
118
|
const transport = new StdioServerTransport();
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* wiki_feedback — 保存问答分析到知识库(用户确认版)
|
|
3
|
+
*
|
|
4
|
+
* 核心理念(来自 Karpathy):
|
|
5
|
+
* "good answers can be filed back into the wiki as new pages"
|
|
6
|
+
* 用户的提问和 Agent 的回答,不应该消失在聊天记录里,应该沉淀回知识库
|
|
7
|
+
*
|
|
8
|
+
* 防止幻觉复利的关键设计:
|
|
9
|
+
* ❌ 不自动存:必须用户说"存"才调用
|
|
10
|
+
* ✅ 标记来源:type: query-derived,区分原始材料 vs 问答生成
|
|
11
|
+
* ✅ 搜索排序:compiled 类型排在 query-derived 前面
|
|
12
|
+
*/
|
|
13
|
+
import { ToolResult } from "../types.js";
|
|
14
|
+
/**
 * Handler for the `wiki_feedback` tool: saves a user-confirmed Q&A analysis
 * as a wiki page marked `type: query-derived`.
 *
 * @param params.question The user's original question.
 * @param params.answer The agent's answer (the part worth keeping).
 * @param params.sources File names of the wiki pages the answer referenced.
 * @param params.tags Tags for the new page.
 * @param params.category Category name for the new page.
 * @returns A promise resolving to the tool result.
 */
export declare function handleFeedback(params: {
    question: string;
    answer: string;
    sources: string[];
    tags: string[];
    category: string;
}): Promise<ToolResult>;
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* wiki_feedback — 保存问答分析到知识库(用户确认版)
|
|
3
|
+
*
|
|
4
|
+
* 核心理念(来自 Karpathy):
|
|
5
|
+
* "good answers can be filed back into the wiki as new pages"
|
|
6
|
+
* 用户的提问和 Agent 的回答,不应该消失在聊天记录里,应该沉淀回知识库
|
|
7
|
+
*
|
|
8
|
+
* 防止幻觉复利的关键设计:
|
|
9
|
+
* ❌ 不自动存:必须用户说"存"才调用
|
|
10
|
+
* ✅ 标记来源:type: query-derived,区分原始材料 vs 问答生成
|
|
11
|
+
* ✅ 搜索排序:compiled 类型排在 query-derived 前面
|
|
12
|
+
*/
|
|
13
|
+
import * as path from "path";
|
|
14
|
+
import * as fs from "fs";
|
|
15
|
+
import { getWikiDir, getWikiPagesDir } from "../state.js";
|
|
16
|
+
import { safeFilename, todayStr, nowStr } from "../utils.js";
|
|
17
|
+
import { textResult } from "../types.js";
|
|
18
|
+
/** Matches one INDEX.md entry line such as `- [[Title]] (tag1, tag2)`; the tag list is optional. */
const INDEX_ENTRY_PATTERN = /^- \[\[(.+?)\]\](?:\s*\((.*?)\))?/;

/**
 * Parse an existing INDEX.md into an insertion-ordered map of
 * category name -> array of `{ title, tags }` entries.
 *
 * A Map is used instead of a plain object so that category names such as
 * "constructor" cannot collide with inherited properties.
 *
 * @param {string} indexContent Full text of INDEX.md.
 * @returns {Map<string, Array<{title: string, tags: string[]}>>}
 */
function parseIndexSections(indexContent) {
    const sections = new Map();
    let currentCategory = "";
    for (const line of indexContent.split("\n")) {
        if (line.startsWith("## ") && !line.includes("知识库索引")) {
            currentCategory = line.replace("## ", "").trim();
            if (!sections.has(currentCategory)) {
                sections.set(currentCategory, []);
            }
            continue;
        }
        if (!currentCategory) {
            continue;
        }
        const entryMatch = line.match(INDEX_ENTRY_PATTERN);
        if (!entryMatch) {
            continue;
        }
        const tags = (entryMatch[2] ?? "")
            .split(",")
            .map((tag) => tag.trim())
            .filter((tag) => tag !== "");
        // Keep the existing entry; dropping it here would wipe the index on every save.
        sections.get(currentCategory).push({ title: entryMatch[1], tags });
    }
    return sections;
}

/**
 * Insert `entry` under `category`, first removing any entry with the same
 * title from every category. The page file for a title is overwritten on
 * save, so an older index line for that title would be a stale duplicate.
 *
 * @param {Map<string, Array<{title: string, tags: string[]}>>} sections
 * @param {string} category
 * @param {{title: string, tags: string[]}} entry
 */
function upsertIndexEntry(sections, category, entry) {
    for (const [name, entries] of sections) {
        sections.set(name, entries.filter((existing) => existing.title !== entry.title));
    }
    if (!sections.has(category)) {
        sections.set(category, []);
    }
    sections.get(category).push(entry);
}

/**
 * Render the sections map back into INDEX.md text. Empty categories are omitted.
 *
 * @param {Map<string, Array<{title: string, tags: string[]}>>} sections
 * @param {string} today Date string written to the frontmatter and footer.
 * @returns {string}
 */
function renderIndex(sections, today) {
    const lines = ["---", "title: 知识库索引", `date: ${today}`, "---", "", "# 知识库索引", ""];
    let totalEntries = 0;
    for (const [category, entries] of sections) {
        if (entries.length === 0) {
            continue;
        }
        lines.push(`## ${category}`);
        for (const entry of entries) {
            lines.push(`- [[${entry.title}]] (${entry.tags.join(", ")})`);
        }
        lines.push("");
        totalEntries += entries.length;
    }
    lines.push(`---\n*共 ${totalEntries} 篇 · 更新于 ${today}*`);
    return lines.join("\n");
}

/**
 * Save a user-confirmed Q&A analysis as a wiki page marked `type: query-derived`,
 * then update INDEX.md and append to LOG.md when those files exist.
 *
 * @param {{question: string, answer: string, sources: string[], tags: string[], category: string}} params
 * @returns {Promise<object>} The `textResult` wrapper around a JSON payload:
 *   `{ error }` when no knowledge base is initialised, otherwise
 *   `{ status, file, title, type, note }`.
 */
export async function handleFeedback(params) {
    const wikiDir = getWikiDir();
    if (!wikiDir) {
        return textResult(JSON.stringify({ error: "知识库未初始化" }));
    }
    const { question, answer, sources, tags, category } = params;
    const wikiPagesDir = getWikiPagesDir(wikiDir);

    // Build the title from the first 30 characters of the question. Iterating by
    // code point keeps the cut from landing in the middle of a surrogate pair.
    const questionChars = [...question];
    const title = `Q&A: ${questionChars.slice(0, 30).join("")}${questionChars.length > 30 ? "..." : ""}`;
    const fileName = `${safeFilename(title)}.md`;
    const filePath = path.join(wikiPagesDir, fileName);

    // Frontmatter lists the sources as quoted wikilinks.
    const sourceLinks = sources.map((s) => `- "[[${s}]]"`).join("\n");
    // Body links drop only a trailing ".md", never one in the middle of a name.
    const bodyLinks = sources.map((s) => `- [[${s.replace(/\.md$/, "")}]]`).join("\n");

    const content = `---
title: "${title.replace(/"/g, '\\"')}"
date: ${todayStr()}
tags: [${tags.join(", ")}]
category: ${category}
type: query-derived
sources:
${sourceLinks}
---

# ${title}

## 问题
${question}

## 回答
${answer}

## 参考来源
${bodyLinks}

---
*来源: 用户问答反馈 · 生成时间: ${nowStr()}*
`;

    // Write the wiki page, creating the pages directory if it is missing.
    fs.mkdirSync(wikiPagesDir, { recursive: true });
    fs.writeFileSync(filePath, content, "utf-8");

    // Update INDEX.md, preserving the entries it already contains.
    const indexPath = path.join(wikiPagesDir, "INDEX.md");
    if (fs.existsSync(indexPath)) {
        const sections = parseIndexSections(fs.readFileSync(indexPath, "utf-8"));
        upsertIndexEntry(sections, category, { title, tags: tags.slice(0, 3) });
        fs.writeFileSync(indexPath, renderIndex(sections, todayStr()), "utf-8");
    }

    // Append to LOG.md only when it already exists.
    const logPath = path.join(wikiPagesDir, "LOG.md");
    const logEntry = `\n## [${nowStr()}] feedback | ${title}\n来源: 用户问答 | 分类: ${category} | 标签: ${tags.join(", ")} | 参考: ${sources.join(", ")}\n`;
    if (fs.existsSync(logPath)) {
        fs.appendFileSync(logPath, logEntry, "utf-8");
    }

    return textResult(JSON.stringify({
        status: "ok",
        file: fileName,
        title,
        type: "query-derived",
        note: "已保存为 Wiki 页面。搜索时 compiled 类型结果会排在 query-derived 前面。",
    }));
}
|
package/dist/tools/ingest.d.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* wiki_ingest —
|
|
2
|
+
* wiki_ingest — 从指定路径摄入文件到 raw/(增量,支持多格式)
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* 零依赖转换:.md(直接复制)、.txt(改后缀)、.html(去标签)、.csv(转表格)
|
|
5
|
+
* 原样存储:.pdf、.docx、.xlsx、.xls、图片 → Agent 多模态读取
|
|
5
6
|
* 已摄入的文件跳过(除非 force=true)
|
|
6
|
-
*
|
|
7
|
+
* 返回每个文件的名称、类型和前 500 字预览(供 Agent 用 LLM 处理)
|
|
7
8
|
*/
|
|
8
9
|
import { ToolResult } from "../types.js";
|
|
9
10
|
export declare function handleIngest(params: {
|
package/dist/tools/ingest.js
CHANGED
|
@@ -1,65 +1,256 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* wiki_ingest —
|
|
2
|
+
* wiki_ingest — 从指定路径摄入文件到 raw/(增量,支持多格式)
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* 零依赖转换:.md(直接复制)、.txt(改后缀)、.html(去标签)、.csv(转表格)
|
|
5
|
+
* 原样存储:.pdf、.docx、.xlsx、.xls、图片 → Agent 多模态读取
|
|
5
6
|
* 已摄入的文件跳过(除非 force=true)
|
|
6
|
-
*
|
|
7
|
+
* 返回每个文件的名称、类型和前 500 字预览(供 Agent 用 LLM 处理)
|
|
7
8
|
*/
|
|
8
9
|
import * as path from "path";
|
|
9
10
|
import * as fs from "fs";
|
|
10
11
|
import { getWikiDir, getRawDir, getState, setState } from "../state.js";
|
|
11
12
|
import { readFile_safe, ensureDir } from "../utils.js";
|
|
12
13
|
import { textResult } from "../types.js";
|
|
14
|
+
// ─── 支持的文件类型 ───
|
|
15
|
+
/** 零依赖可转为 .md 的格式 */
|
|
16
|
+
const CONVERTIBLE_EXTENSIONS = new Set([".md", ".txt", ".html", ".csv"]);
|
|
17
|
+
/** 原样存储的二进制格式(Agent 多模态读取) */
|
|
18
|
+
const BINARY_EXTENSIONS = new Set([".pdf", ".docx", ".xlsx", ".xls"]);
|
|
19
|
+
/** 图片格式 */
|
|
20
|
+
const IMAGE_EXTENSIONS = new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".svg"]);
|
|
21
|
+
/** 所有支持的格式 */
|
|
22
|
+
const SUPPORTED_EXTENSIONS = new Set([
|
|
23
|
+
...CONVERTIBLE_EXTENSIONS,
|
|
24
|
+
...BINARY_EXTENSIONS,
|
|
25
|
+
...IMAGE_EXTENSIONS,
|
|
26
|
+
]);
|
|
27
|
+
// ─── 格式转换函数 ───
|
|
28
|
+
/**
|
|
29
|
+
* .txt → .md(直接改后缀,内容不变)
|
|
30
|
+
*/
|
|
31
|
+
/**
 * Convert plain-text content to Markdown.
 * Plain text is already acceptable as Markdown, so the content passes
 * through untouched.
 *
 * @param {string} content Text read from the source file.
 * @returns {string} The same text.
 */
function convertTxt(content) {
    const markdown = content;
    return markdown;
}
|
|
34
|
+
/**
|
|
35
|
+
* .html → .md(去除 HTML 标签,保留纯文本)
|
|
36
|
+
* 零依赖,用正则处理。不追求完美还原,只提取可读文本
|
|
37
|
+
*/
|
|
38
|
+
/** Named/numeric entities decoded by convertHtml, keyed by the text between `&` and `;`. */
const HTML_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', "#39": "'", nbsp: " " };

/**
 * Convert HTML to Markdown-flavoured plain text using regular expressions only.
 * This is a best-effort extraction of readable text, not a faithful renderer.
 *
 * Order matters: headings and list items are converted BEFORE the generic
 * block-tag pass, because that pass replaces `<h1>`..`<h6>` and `<li>` tags
 * with newlines and would otherwise leave nothing for those rules to match.
 *
 * @param {string} html Raw HTML source.
 * @returns {string} Extracted text with basic Markdown markup, trimmed.
 */
function convertHtml(html) {
    let text = html;
    // Drop script and style elements together with their contents.
    text = text.replace(/<script[\s\S]*?<\/script>/gi, "");
    text = text.replace(/<style[\s\S]*?<\/style>/gi, "");
    // Headings -> "#"-prefixed lines (the level comes from the tag name).
    text = text.replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi, (_match, level, inner) => `\n${"#".repeat(Number(level))} ${inner.trim()}\n`);
    // List items -> "- " bullets.
    text = text.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, (_match, inner) => `- ${inner.trim()}\n`);
    // Bold and italic. The \b keeps <b>/<i> from matching <br>, <body>, <img>, <input>, ...
    text = text.replace(/<(strong|b)\b[^>]*>([\s\S]*?)<\/\1>/gi, (_match, _tag, inner) => `**${inner}**`);
    text = text.replace(/<(em|i)\b[^>]*>([\s\S]*?)<\/\1>/gi, (_match, _tag, inner) => `*${inner}*`);
    // Links; the href may be single- or double-quoted.
    text = text.replace(/<a\b[^>]*?\bhref=(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, (_match, _quote, href, label) => `[${label}](${href})`);
    // Remaining block-level tags become line breaks.
    text = text.replace(/<\/?(?:p|div|br|h[1-6]|ul|ol|li|tr|blockquote)\b[^>]*>/gi, "\n");
    // Strip every other tag.
    text = text.replace(/<[^>]+>/g, "");
    // Decode common entities in a single pass so an escaped ampersand followed
    // by "lt;" yields the literal entity text rather than being decoded twice.
    text = text.replace(/&(amp|lt|gt|quot|#39|nbsp);/g, (_match, name) => HTML_ENTITIES[name]);
    // Collapse runs of three or more newlines down to two.
    text = text.replace(/\n{3,}/g, "\n\n");
    return text.trim();
}
|
|
70
|
+
/**
|
|
71
|
+
* .csv → markdown 表格
|
|
72
|
+
* 零依赖,手动解析。处理简单 CSV(带引号的字段、逗号在引号内)
|
|
73
|
+
*/
|
|
74
|
+
/**
 * Convert CSV text to a Markdown table without external dependencies.
 *
 * The first non-blank line is the header row. Quoted fields may contain
 * commas, and `""` inside a quoted field is an escaped quote. Rows are padded
 * or truncated to the header's column count. Literal `|` characters in cells
 * are escaped as `\|` so they cannot break the table layout.
 *
 * Limitation: the input is split on line breaks before parsing, so a quoted
 * field that spans several lines is not supported.
 *
 * @param {string} csv Raw CSV text.
 * @returns {string} Markdown table, or "" when the input has no non-blank lines.
 */
function convertCsv(csv) {
    // Split one CSV line into trimmed fields, honouring quotes.
    function parseLine(line) {
        const fields = [];
        let current = "";
        let inQuotes = false;
        for (let i = 0; i < line.length; i++) {
            const ch = line[i];
            if (ch === '"') {
                if (inQuotes && i + 1 < line.length && line[i + 1] === '"') {
                    // Escaped quote ("") inside a quoted field.
                    current += '"';
                    i++;
                }
                else {
                    inQuotes = !inQuotes;
                }
            }
            else if (ch === "," && !inQuotes) {
                fields.push(current.trim());
                current = "";
            }
            else {
                current += ch;
            }
        }
        fields.push(current.trim());
        return fields;
    }
    // Render one row; a raw "|" would otherwise be read as a column separator.
    const toRow = (cells) => `| ${cells.map((cell) => cell.replace(/\|/g, "\\|")).join(" | ")} |`;

    // A leading byte-order mark would otherwise end up inside the first header cell.
    const lines = csv
        .replace(/^\uFEFF/, "")
        .split(/\r?\n/)
        .filter((line) => line.trim());
    if (lines.length === 0) {
        return "";
    }
    const [headerLine, ...dataLines] = lines;
    const headers = parseLine(headerLine);
    const colCount = headers.length;
    const mdLines = [toRow(headers), toRow(headers.map(() => "---"))];
    for (const dataLine of dataLines) {
        const fields = parseLine(dataLine);
        // Pad short rows; long rows are cut to the header width below.
        while (fields.length < colCount) {
            fields.push("");
        }
        mdLines.push(toRow(fields.slice(0, colCount)));
    }
    return mdLines.join("\n");
}
|
|
125
|
+
/**
|
|
126
|
+
* 根据文件扩展名,将文件内容转为 .md 格式
|
|
127
|
+
* 返回 null 表示无法转换(二进制文件)
|
|
128
|
+
*/
|
|
129
|
+
/**
 * Read a file and turn its content into Markdown according to its extension.
 *
 * @param {string} filePath Path of the file to convert.
 * @returns {string|null} Markdown text, or null when the file could not be
 *   read or its extension is not one of .md / .txt / .html / .csv.
 */
function convertToMarkdown(filePath) {
    const extension = path.extname(filePath).toLowerCase();
    const raw = readFile_safe(filePath);
    if (raw === null) {
        return null;
    }
    if (extension === ".md") {
        // Markdown is ingested as-is.
        return raw;
    }
    if (extension === ".txt") {
        return convertTxt(raw);
    }
    if (extension === ".html") {
        return convertHtml(raw);
    }
    if (extension === ".csv") {
        return convertCsv(raw);
    }
    // Anything else is not text-convertible.
    return null;
}
|
|
147
|
+
// ─── 文件类型判断 ───
|
|
148
|
+
/**
 * Classify a file by its (case-insensitive) extension.
 *
 * @param {string} filePath Path or file name to classify.
 * @returns {"convertible"|"binary"|"image"|"unsupported"} The first category
 *   whose extension set contains the file's extension, checked in that order.
 */
function getFileType(filePath) {
    const extension = path.extname(filePath).toLowerCase();
    const categories = [
        ["convertible", CONVERTIBLE_EXTENSIONS],
        ["binary", BINARY_EXTENSIONS],
        ["image", IMAGE_EXTENSIONS],
    ];
    const match = categories.find(([, extensions]) => extensions.has(extension));
    return match ? match[0] : "unsupported";
}
|
|
158
|
+
// ─── 主处理函数 ───
|
|
13
159
|
export async function handleIngest(params) {
|
|
14
160
|
const wikiDir = getWikiDir();
|
|
15
161
|
if (!wikiDir) {
|
|
16
162
|
return textResult(JSON.stringify({ error: "知识库未初始化,请先调用 wiki_init" }));
|
|
17
163
|
}
|
|
18
164
|
const rawDir = getRawDir(wikiDir);
|
|
165
|
+
const assetsDir = path.join(rawDir, "assets");
|
|
19
166
|
ensureDir(rawDir);
|
|
167
|
+
ensureDir(assetsDir);
|
|
20
168
|
const state = getState(wikiDir);
|
|
21
169
|
const source = params.source.replace(/^~/, process.env.HOME || "~");
|
|
22
170
|
const force = params.force ?? false;
|
|
23
171
|
// 收集要摄入的文件列表
|
|
24
172
|
let filesToIngest = [];
|
|
25
173
|
if (fs.statSync(source).isDirectory()) {
|
|
174
|
+
// 目录:扫描所有支持的文件类型
|
|
26
175
|
filesToIngest = fs
|
|
27
176
|
.readdirSync(source)
|
|
28
|
-
.filter((f) =>
|
|
177
|
+
.filter((f) => SUPPORTED_EXTENSIONS.has(path.extname(f).toLowerCase()))
|
|
29
178
|
.map((f) => path.join(source, f));
|
|
30
179
|
}
|
|
31
|
-
else if (source.endsWith(".md")) {
|
|
32
|
-
filesToIngest = [source];
|
|
33
|
-
}
|
|
34
180
|
else {
|
|
35
|
-
|
|
181
|
+
// 单文件:检查是否支持
|
|
182
|
+
const ext = path.extname(source).toLowerCase();
|
|
183
|
+
if (SUPPORTED_EXTENSIONS.has(ext)) {
|
|
184
|
+
filesToIngest = [source];
|
|
185
|
+
}
|
|
186
|
+
else {
|
|
187
|
+
return textResult(JSON.stringify({
|
|
188
|
+
error: `不支持的文件格式: ${ext}。支持的格式: md, txt, html, csv, pdf, docx, xlsx, xls, png, jpg, jpeg, gif, webp`,
|
|
189
|
+
}));
|
|
190
|
+
}
|
|
36
191
|
}
|
|
37
|
-
// 增量过滤
|
|
192
|
+
// 增量过滤 + 处理
|
|
38
193
|
const results = [];
|
|
39
194
|
let ingested = 0;
|
|
40
195
|
for (const filePath of filesToIngest) {
|
|
41
196
|
const fileName = path.basename(filePath);
|
|
42
|
-
const
|
|
197
|
+
const fileType = getFileType(filePath);
|
|
198
|
+
// 生成 raw/ 中的目标文件名
|
|
199
|
+
let destFileName = fileName;
|
|
200
|
+
if (fileType === "convertible" && !fileName.endsWith(".md")) {
|
|
201
|
+
// 可转换格式:目标文件名改为 .md
|
|
202
|
+
destFileName = path.basename(fileName, path.extname(fileName)) + ".md";
|
|
203
|
+
}
|
|
43
204
|
// 已摄入则跳过(除非 force)
|
|
44
|
-
if (!force && state.ingested.includes(
|
|
45
|
-
results.push({ name: fileName, content_preview: "(已摄入,跳过)", skipped: true });
|
|
205
|
+
if (!force && state.ingested.includes(destFileName)) {
|
|
206
|
+
results.push({ name: fileName, type: fileType, content_preview: "(已摄入,跳过)", skipped: true });
|
|
46
207
|
continue;
|
|
47
208
|
}
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
209
|
+
if (fileType === "convertible") {
|
|
210
|
+
// ── 零依赖转换 ──
|
|
211
|
+
const mdContent = convertToMarkdown(filePath);
|
|
212
|
+
if (!mdContent) {
|
|
213
|
+
results.push({ name: fileName, type: fileType, content_preview: "(无法读取文件)", skipped: true });
|
|
214
|
+
continue;
|
|
215
|
+
}
|
|
216
|
+
const destPath = path.join(rawDir, destFileName);
|
|
217
|
+
fs.writeFileSync(destPath, mdContent, "utf-8");
|
|
218
|
+
if (!state.ingested.includes(destFileName)) {
|
|
219
|
+
state.ingested.push(destFileName);
|
|
220
|
+
}
|
|
221
|
+
results.push({
|
|
222
|
+
name: fileName,
|
|
223
|
+
type: "converted",
|
|
224
|
+
content_preview: mdContent.substring(0, 500),
|
|
225
|
+
note: `已转为 ${destFileName}`,
|
|
226
|
+
});
|
|
227
|
+
}
|
|
228
|
+
else if (fileType === "binary") {
|
|
229
|
+
// ── 原样存储(PDF/DOCX/XLSX) ──
|
|
230
|
+
const destPath = path.join(rawDir, fileName);
|
|
231
|
+
fs.copyFileSync(filePath, destPath);
|
|
232
|
+
if (!state.ingested.includes(fileName)) {
|
|
233
|
+
state.ingested.push(fileName);
|
|
234
|
+
}
|
|
235
|
+
results.push({
|
|
236
|
+
name: fileName,
|
|
237
|
+
type: "binary",
|
|
238
|
+
note: "原文件已存储,请直接读取 raw/ 下的文件",
|
|
239
|
+
});
|
|
53
240
|
}
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
241
|
+
else if (fileType === "image") {
|
|
242
|
+
// ── 图片存储到 raw/assets/ ──
|
|
243
|
+
const destPath = path.join(assetsDir, fileName);
|
|
244
|
+
fs.copyFileSync(filePath, destPath);
|
|
245
|
+
if (!state.ingested.includes(fileName)) {
|
|
246
|
+
state.ingested.push(fileName);
|
|
247
|
+
}
|
|
248
|
+
results.push({
|
|
249
|
+
name: fileName,
|
|
250
|
+
type: "image",
|
|
251
|
+
note: "已存储到 raw/assets/,可直接查看",
|
|
252
|
+
});
|
|
58
253
|
}
|
|
59
|
-
results.push({
|
|
60
|
-
name: fileName,
|
|
61
|
-
content_preview: content.substring(0, 500),
|
|
62
|
-
});
|
|
63
254
|
ingested++;
|
|
64
255
|
}
|
|
65
256
|
// 保存状态
|
package/dist/tools/search.js
CHANGED
|
@@ -80,12 +80,15 @@ export async function handleSearch(params) {
|
|
|
80
80
|
}
|
|
81
81
|
// 至少有一点匹配才加入结果
|
|
82
82
|
if (score > 0) {
|
|
83
|
+
// 类型权重:compiled(原始材料编译)排在 query-derived(问答生成)前面
|
|
84
|
+
const typeBoost = String(meta.type) === "query-derived" ? 0.9 : 1.0;
|
|
83
85
|
results.push({
|
|
84
86
|
title,
|
|
85
87
|
file: fileName,
|
|
86
88
|
tags: fileTags,
|
|
87
89
|
summary,
|
|
88
|
-
score: Math.round(score * 100) / 100,
|
|
90
|
+
score: Math.round(score * typeBoost * 100) / 100,
|
|
91
|
+
type: String(meta.type || "compiled"),
|
|
89
92
|
});
|
|
90
93
|
}
|
|
91
94
|
}
|