@claudeink/mcp-server 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +104 -76
- package/package.json +2 -1
- package/workflow/CLAUDE.md +167 -0
- package/workflow/accounts/_template.yaml +70 -0
- package/workflow/base-rules.md +235 -0
- package/workflow/platforms/blog.md +52 -0
- package/workflow/platforms/toutiao.md +51 -0
- package/workflow/platforms/wechat.md +94 -0
- package/workflow/platforms/x-twitter.md +48 -0
- package/workflow/platforms/xiaohongshu.md +50 -0
- package/workflow/tools/crawler/SOURCES.md +85 -0
- package/workflow/tools/crawler/config.json +173 -0
- package/workflow/tools/crawler/config.test.json +35 -0
- package/workflow/tools/crawler/crawl.mjs +398 -0
- package/workflow/tools/crawler/package-lock.json +817 -0
- package/workflow/tools/crawler/package.json +16 -0
- package/workflow/tools/crawler/test-robot-report.mjs +31 -0
- package/workflow/tools/pack.sh +35 -0
- package/workflow/tools/setup.sh +93 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
{
|
|
2
|
+
"sources": [
|
|
3
|
+
{
|
|
4
|
+
"id": "beomniscient",
|
|
5
|
+
"name": "Omniscient Digital Blog",
|
|
6
|
+
"blogUrl": "https://beomniscient.com/blog/",
|
|
7
|
+
"type": "paginated",
|
|
8
|
+
"pagination": {
|
|
9
|
+
"pattern": "https://beomniscient.com/blog/page/{page}/",
|
|
10
|
+
"startPage": 1,
|
|
11
|
+
"maxPages": 100
|
|
12
|
+
},
|
|
13
|
+
"articleSelector": "a[href*='/blog/']",
|
|
14
|
+
"articleUrlPattern": "^https://beomniscient\\.com/blog/[a-z0-9-]+/?$",
|
|
15
|
+
"excludePatterns": [
|
|
16
|
+
"/blog/page/",
|
|
17
|
+
"/blog/category/",
|
|
18
|
+
"/blog/author/",
|
|
19
|
+
"/blog/tag/"
|
|
20
|
+
],
|
|
21
|
+
"enabled": true
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"id": "9to5mac",
|
|
25
|
+
"name": "9to5Mac",
|
|
26
|
+
"blogUrl": "https://9to5mac.com/",
|
|
27
|
+
"type": "paginated",
|
|
28
|
+
"pagination": {
|
|
29
|
+
"pattern": "https://9to5mac.com/page/{page}/",
|
|
30
|
+
"startPage": 1,
|
|
31
|
+
"maxPages": 1
|
|
32
|
+
},
|
|
33
|
+
"articleSelector": "a[href*='9to5mac.com/2']",
|
|
34
|
+
"articleUrlPattern": "^https://9to5mac\\.com/\\d{4}/\\d{2}/\\d{2}/[a-z0-9-]+/?$",
|
|
35
|
+
"excludePatterns": [
|
|
36
|
+
"extended-comments",
|
|
37
|
+
"#comments",
|
|
38
|
+
"#more-",
|
|
39
|
+
"/page/"
|
|
40
|
+
],
|
|
41
|
+
"enabled": true
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
"id": "macrumors",
|
|
45
|
+
"name": "MacRumors",
|
|
46
|
+
"blogUrl": "https://www.macrumors.com/",
|
|
47
|
+
"type": "paginated",
|
|
48
|
+
"pagination": {
|
|
49
|
+
"pattern": "https://www.macrumors.com/page/{page}/",
|
|
50
|
+
"startPage": 1,
|
|
51
|
+
"maxPages": 1
|
|
52
|
+
},
|
|
53
|
+
"articleSelector": "a[href*='macrumors.com/2']",
|
|
54
|
+
"articleUrlPattern": "^https://www\\.macrumors\\.com/\\d{4}/\\d{2}/\\d{2}/[a-z0-9-]+/?$",
|
|
55
|
+
"excludePatterns": [
|
|
56
|
+
"/page/",
|
|
57
|
+
"/roundup/",
|
|
58
|
+
"/guide/",
|
|
59
|
+
"/deals/"
|
|
60
|
+
],
|
|
61
|
+
"enabled": true
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
"id": "appleinsider",
|
|
65
|
+
"name": "AppleInsider",
|
|
66
|
+
"blogUrl": "https://appleinsider.com/",
|
|
67
|
+
"type": "paginated",
|
|
68
|
+
"pagination": {
|
|
69
|
+
"pattern": "https://appleinsider.com/articles/page/{page}",
|
|
70
|
+
"startPage": 1,
|
|
71
|
+
"maxPages": 1
|
|
72
|
+
},
|
|
73
|
+
"articleSelector": "a[href*='/articles/']",
|
|
74
|
+
"articleUrlPattern": "^https://appleinsider\\.com/articles/\\d{2}/\\d{2}/\\d{2}/[a-z0-9-]+",
|
|
75
|
+
"excludePatterns": [
|
|
76
|
+
"/articles/page/"
|
|
77
|
+
],
|
|
78
|
+
"enabled": true
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
"id": "simonwillison",
|
|
82
|
+
"name": "Simon Willison's Weblog",
|
|
83
|
+
"blogUrl": "https://simonwillison.net/",
|
|
84
|
+
"type": "paginated",
|
|
85
|
+
"pagination": {
|
|
86
|
+
"pattern": "https://simonwillison.net/{page}/",
|
|
87
|
+
"startPage": 1,
|
|
88
|
+
"maxPages": 1
|
|
89
|
+
},
|
|
90
|
+
"articleSelector": "a[href*='/202']",
|
|
91
|
+
"articleUrlPattern": "^https://simonwillison\\.net/\\d{4}/[A-Za-z]+/\\d+/[a-z0-9-]+/?$",
|
|
92
|
+
"excludePatterns": [
|
|
93
|
+
"/tags/",
|
|
94
|
+
"/search/"
|
|
95
|
+
],
|
|
96
|
+
"maxArticles": 15,
|
|
97
|
+
"enabled": true
|
|
98
|
+
},
|
|
99
|
+
{
|
|
100
|
+
"id": "qbitai",
|
|
101
|
+
"name": "量子位",
|
|
102
|
+
"blogUrl": "https://www.qbitai.com/",
|
|
103
|
+
"type": "paginated",
|
|
104
|
+
"pagination": {
|
|
105
|
+
"pattern": "https://www.qbitai.com/page/{page}",
|
|
106
|
+
"startPage": 1,
|
|
107
|
+
"maxPages": 1
|
|
108
|
+
},
|
|
109
|
+
"articleSelector": "a[href*='qbitai.com/2']",
|
|
110
|
+
"articleUrlPattern": "^https://www\\.qbitai\\.com/\\d{4}/\\d{2}/\\d+\\.html$",
|
|
111
|
+
"excludePatterns": [
|
|
112
|
+
"/page/",
|
|
113
|
+
"/category/",
|
|
114
|
+
"/tag/",
|
|
115
|
+
"/author/"
|
|
116
|
+
],
|
|
117
|
+
"maxArticles": 15,
|
|
118
|
+
"enabled": true
|
|
119
|
+
},
|
|
120
|
+
{
|
|
121
|
+
"id": "importai",
|
|
122
|
+
"name": "Import AI (Jack Clark)",
|
|
123
|
+
"blogUrl": "https://jack-clark.net/",
|
|
124
|
+
"type": "paginated",
|
|
125
|
+
"pagination": {
|
|
126
|
+
"pattern": "https://jack-clark.net/page/{page}/",
|
|
127
|
+
"startPage": 1,
|
|
128
|
+
"maxPages": 1
|
|
129
|
+
},
|
|
130
|
+
"articleSelector": "a[href*='jack-clark.net/202']",
|
|
131
|
+
"articleUrlPattern": "^https://jack-clark\\.net/\\d{4}/\\d{2}/\\d{2}/import-ai-[a-z0-9-]+/?$",
|
|
132
|
+
"excludePatterns": [
|
|
133
|
+
"/page/",
|
|
134
|
+
"/category/",
|
|
135
|
+
"/tag/"
|
|
136
|
+
],
|
|
137
|
+
"maxArticles": 15,
|
|
138
|
+
"enabled": true
|
|
139
|
+
},
|
|
140
|
+
{
|
|
141
|
+
"id": "techcrunch-robotics",
|
|
142
|
+
"name": "TechCrunch Robotics",
|
|
143
|
+
"blogUrl": "https://techcrunch.com/category/robotics/",
|
|
144
|
+
"type": "paginated",
|
|
145
|
+
"pagination": {
|
|
146
|
+
"pattern": "https://techcrunch.com/category/robotics/page/{page}/",
|
|
147
|
+
"startPage": 1,
|
|
148
|
+
"maxPages": 1
|
|
149
|
+
},
|
|
150
|
+
"articleSelector": "a[href*='techcrunch.com/202']",
|
|
151
|
+
"articleUrlPattern": "^https://techcrunch\\.com/\\d{4}/\\d{2}/\\d{2}/[a-z0-9-]+/?$",
|
|
152
|
+
"excludePatterns": [
|
|
153
|
+
"/page/",
|
|
154
|
+
"/category/",
|
|
155
|
+
"/tag/",
|
|
156
|
+
"/author/"
|
|
157
|
+
],
|
|
158
|
+
"maxArticles": 15,
|
|
159
|
+
"enabled": true
|
|
160
|
+
}
|
|
161
|
+
],
|
|
162
|
+
"output": {
|
|
163
|
+
"baseDir": "../../sources/articles",
|
|
164
|
+
"filenameFormat": "{date}-{slug}",
|
|
165
|
+
"includeMetadata": true
|
|
166
|
+
},
|
|
167
|
+
"settings": {
|
|
168
|
+
"requestDelay": 1500,
|
|
169
|
+
"maxConcurrent": 3,
|
|
170
|
+
"userAgent": "AustonBot/1.0 (Content Research)",
|
|
171
|
+
"maxRetries": 2
|
|
172
|
+
}
|
|
173
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"sources": [
|
|
3
|
+
{
|
|
4
|
+
"id": "beomniscient",
|
|
5
|
+
"name": "Omniscient Digital Blog",
|
|
6
|
+
"blogUrl": "https://beomniscient.com/blog/",
|
|
7
|
+
"type": "paginated",
|
|
8
|
+
"pagination": {
|
|
9
|
+
"pattern": "https://beomniscient.com/blog/page/{page}/",
|
|
10
|
+
"startPage": 1,
|
|
11
|
+
"maxPages": 1
|
|
12
|
+
},
|
|
13
|
+
"articleSelector": "a[href*='/blog/']",
|
|
14
|
+
"articleUrlPattern": "^https://beomniscient\\.com/blog/[a-z0-9-]+/?$",
|
|
15
|
+
"excludePatterns": [
|
|
16
|
+
"/blog/page/",
|
|
17
|
+
"/blog/category/",
|
|
18
|
+
"/blog/author/",
|
|
19
|
+
"/blog/tag/"
|
|
20
|
+
],
|
|
21
|
+
"enabled": true
|
|
22
|
+
}
|
|
23
|
+
],
|
|
24
|
+
"output": {
|
|
25
|
+
"baseDir": "../../sources/articles",
|
|
26
|
+
"filenameFormat": "{date}-{slug}",
|
|
27
|
+
"includeMetadata": true
|
|
28
|
+
},
|
|
29
|
+
"settings": {
|
|
30
|
+
"requestDelay": 1500,
|
|
31
|
+
"maxConcurrent": 3,
|
|
32
|
+
"userAgent": "AustonBot/1.0 (Content Research)",
|
|
33
|
+
"maxRetries": 2
|
|
34
|
+
}
|
|
35
|
+
}
|
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Auston Blog Crawler
|
|
5
|
+
*
|
|
6
|
+
* 订阅式 Blog 爬虫,基于 Defuddle 提取正文
|
|
7
|
+
*
|
|
8
|
+
* 用法:
|
|
9
|
+
* node crawl.mjs # 增量抓取所有已启用的源(默认模式)
|
|
10
|
+
* node crawl.mjs --full # 全量抓取所有源(首次使用)
|
|
11
|
+
* node crawl.mjs --full --source beomniscient # 全量抓取指定源
|
|
12
|
+
* node crawl.mjs --source beomniscient # 增量抓取指定源
|
|
13
|
+
* node crawl.mjs --url https://... # 抓取单篇文章
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { JSDOM } from 'jsdom';
|
|
17
|
+
import { Defuddle } from 'defuddle/node';
|
|
18
|
+
import fs from 'fs';
|
|
19
|
+
import path from 'path';
|
|
20
|
+
import { fileURLToPath } from 'url';
|
|
21
|
+
|
|
22
|
+
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// The crawler configuration sits next to this script.
const CONFIG_PATH = path.resolve(__dirname, 'config.json');
// Crawl state is kept two directories above this script.
const STATE_PATH = path.resolve(__dirname, '..', '..', '.crawler-state.json');
|
|
25
|
+
|
|
26
|
+
// ============================================================
|
|
27
|
+
// Utility Functions
|
|
28
|
+
// ============================================================
|
|
29
|
+
|
|
30
|
+
/**
 * Read and parse the crawler configuration file at CONFIG_PATH.
 *
 * @returns {object} The parsed JSON configuration.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function loadConfig() {
  const rawJson = fs.readFileSync(CONFIG_PATH, { encoding: 'utf-8' });
  return JSON.parse(rawJson);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Load the persisted crawl state from STATE_PATH.
 *
 * A missing file yields a fresh empty state. The result always carries a
 * `sources` object, because callers index `state.sources[sourceId]` directly
 * and would otherwise crash on a state file that lacks that key.
 *
 * @returns {{sources: object}} The crawl state.
 * @throws If the file exists but cannot be read or is not valid JSON.
 */
function loadState() {
  let rawJson;
  try {
    // Read directly instead of existsSync-then-read, so a file removed in
    // between cannot cause a spurious failure.
    rawJson = fs.readFileSync(STATE_PATH, 'utf-8');
  } catch (err) {
    if (err.code === 'ENOENT') {
      return { sources: {} };
    }
    throw err;
  }

  const state = JSON.parse(rawJson);
  const isPlainObject = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  if (!isPlainObject(state)) {
    return { sources: {} };
  }
  if (!isPlainObject(state.sources)) {
    state.sources = {};
  }
  return state;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Persist the crawl state to STATE_PATH as pretty-printed JSON.
 *
 * @param {object} state - The crawl state to serialize.
 */
function saveState(state) {
  const serialized = JSON.stringify(state, null, 2);
  fs.writeFileSync(STATE_PATH, serialized, { encoding: 'utf-8' });
}
|
|
44
|
+
|
|
45
|
+
/**
 * Return a promise that resolves after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
48
|
+
|
|
49
|
+
/**
 * Convert arbitrary text into a filename-safe slug.
 *
 * The text is lower-cased, every run of characters other than a-z, 0-9 and
 * CJK ideographs (U+4E00-U+9FFF) is collapsed into a single hyphen, and the
 * result is limited to 60 characters.
 *
 * @param {string} text - Title or path segment to slugify; null/undefined are treated as empty.
 * @returns {string} Slug of at most 60 characters without a leading or trailing hyphen (may be empty).
 */
function slugify(text) {
  return String(text ?? '')
    .toLowerCase()
    .replace(/[^a-z0-9\u4e00-\u9fff]+/g, '-')
    .replace(/^-/, '')
    .slice(0, 60)
    // Trim the trailing hyphen only after truncating: the 60-character cut
    // can land right behind a separator and would otherwise leave it dangling.
    .replace(/-$/, '');
}
|
|
56
|
+
|
|
57
|
+
/**
 * Create a directory (including missing parents) unless the path already exists.
 *
 * @param {string} dirPath - Directory path to create.
 */
function ensureDir(dirPath) {
  if (fs.existsSync(dirPath)) {
    return;
  }
  fs.mkdirSync(dirPath, { recursive: true });
}
|
|
62
|
+
|
|
63
|
+
/**
 * Print a timestamped, icon-prefixed message to stdout.
 *
 * @param {string} level - One of 'info', 'ok', 'warn', 'error', 'skip'; any other value gets a bullet.
 * @param {string} msg - Message text.
 */
function log(level, msg) {
  // "YYYY-MM-DD HH:MM:SS" in UTC, taken from the ISO timestamp.
  const stamp = new Date().toISOString().substring(0, 19).replace('T', ' ');
  const icons = { info: 'ℹ️', ok: '✅', warn: '⚠️', error: '❌', skip: '⏭️' };
  const icon = icons[level] || '•';
  console.log(`[${stamp}] ${icon} ${msg}`);
}
|
|
68
|
+
|
|
69
|
+
// ============================================================
|
|
70
|
+
// Core: Discover Article URLs from a Blog
|
|
71
|
+
// ============================================================
|
|
72
|
+
|
|
73
|
+
/**
 * Walk a source's listing pages and collect article URLs that have not been
 * fetched before.
 *
 * Links are selected with `source.articleSelector`, dropped when they contain
 * any of `source.excludePatterns` or fail `source.articleUrlPattern`, and
 * de-duplicated against both the persisted state and the current run.
 *
 * Paging stops when any of the following holds:
 *   - the page limit is reached (full mode: `pagination.maxPages`; incremental
 *     mode: the smaller of `pagination.maxPages` and 10),
 *   - `source.maxArticles` new URLs have been collected,
 *   - incremental mode sees 15 previously fetched articles in a row,
 *   - a page past the first has no pagination link, or
 *   - a page request fails with an error whose message mentions 404.
 *
 * @param {object} source - Source entry from the config (id, name, blogUrl, pagination, selectors).
 * @param {object} config - Full crawler config; `settings.userAgent` and `settings.requestDelay` are used.
 * @param {object} state - Crawl state; `state.sources[source.id].fetchedUrls` lists known URLs.
 * @param {boolean} fullMode - True for a full crawl, false for an incremental one.
 * @returns {Promise<string[]>} New article URLs in discovery order, capped at `maxArticles` when set.
 */
async function discoverArticles(source, config, state, fullMode) {
  // Upper bound on listing pages walked in incremental mode.
  const INCREMENTAL_MAX_PAGES = 10;
  // Incremental mode stops after this many already-fetched articles in a row.
  const KNOWN_STREAK_LIMIT = 15;

  const sourceState = state.sources[source.id] || { fetchedUrls: [], lastFetch: null };
  const fetchedSet = new Set(sourceState.fetchedUrls || []);
  // A Set keeps insertion order and gives O(1) duplicate checks.
  const discovered = new Set();
  const excludePatterns = source.excludePatterns || [];
  const urlRegex = source.articleUrlPattern ? new RegExp(source.articleUrlPattern) : null;
  const maxArticles = source.maxArticles || 0; // 0 means unlimited

  log('info', `开始发现文章: ${source.name} (${fullMode ? '全量' : '增量'}模式)`);

  // Incremental runs never exceed the configured page limit; previously a
  // source configured with maxPages: 1 was still walked for up to 10 pages.
  const configuredMaxPages = source.pagination.maxPages;
  const maxPages = fullMode
    ? configuredMaxPages
    : Math.min(configuredMaxPages ?? INCREMENTAL_MAX_PAGES, INCREMENTAL_MAX_PAGES);

  let page = source.pagination.startPage;
  let consecutiveKnown = 0;

  while (page <= maxPages) {
    const pageUrl = page === 1
      ? source.blogUrl
      : source.pagination.pattern.replace('{page}', String(page));

    log('info', `抓取列表页: ${pageUrl}`);

    let shouldStop = false;
    let dom = null;

    try {
      dom = await JSDOM.fromURL(pageUrl, {
        pretendToBeVisual: true,
        userAgent: config.settings.userAgent,
      });

      const doc = dom.window.document;
      let pageNewCount = 0;

      for (const link of doc.querySelectorAll(source.articleSelector)) {
        let href = link.href;
        if (!href) continue;

        // Make sure we work with an absolute URL.
        if (href.startsWith('/')) {
          href = new URL(href, source.blogUrl).href;
        }

        if (excludePatterns.some((pattern) => href.includes(pattern))) continue;
        if (urlRegex && !urlRegex.test(href)) continue;

        if (fetchedSet.has(href)) {
          consecutiveKnown++;
          continue;
        }
        // A repeated link to an article found earlier in this run (e.g. the
        // thumbnail and the headline) is not evidence that we reached the
        // previous crawl position, so it does not extend the known streak.
        if (discovered.has(href)) continue;

        discovered.add(href);
        pageNewCount++;
        consecutiveKnown = 0;
      }

      log('info', `第 ${page} 页发现 ${pageNewCount} 篇新文章`);

      if (maxArticles > 0 && discovered.size >= maxArticles) {
        log('info', `已达到 maxArticles 上限 (${maxArticles}),停止翻页`);
        shouldStop = true;
      } else if (!fullMode && consecutiveKnown >= KNOWN_STREAK_LIMIT) {
        log('info', `连续 ${consecutiveKnown} 篇已知文章,停止翻页`);
        shouldStop = true;
      } else {
        // Simple heuristic for "is there another page?".
        const hasNext = doc.querySelector('a[href*="page/"]') ||
          doc.querySelector('.next') ||
          doc.querySelector('a.next');

        if (!hasNext && page > 1) {
          log('info', '没有更多分页了');
          shouldStop = true;
        }
      }
    } catch (err) {
      if (err.message && err.message.includes('404')) {
        log('info', `第 ${page} 页不存在,停止翻页`);
        shouldStop = true;
      } else {
        log('error', `抓取第 ${page} 页失败: ${err.message}`);
      }
    } finally {
      // Release the jsdom window so long crawls do not accumulate documents.
      dom?.window.close();
    }

    if (shouldStop) break;

    page++;
    // Also pause after a failed page so a struggling server is not hammered.
    await sleep(config.settings.requestDelay);
  }

  let uniqueUrls = [...discovered];
  if (maxArticles > 0 && uniqueUrls.length > maxArticles) {
    uniqueUrls = uniqueUrls.slice(0, maxArticles);
  }
  log('ok', `${source.name}: 共发现 ${uniqueUrls.length} 篇新文章${maxArticles > 0 ? ` (上限 ${maxArticles})` : ''}`);
  return uniqueUrls;
}
|
|
174
|
+
|
|
175
|
+
// ============================================================
|
|
176
|
+
// Core: Fetch & Extract a Single Article
|
|
177
|
+
// ============================================================
|
|
178
|
+
|
|
179
|
+
/**
 * Render a value as a double-quoted YAML scalar.
 *
 * A JSON string literal is also a valid YAML double-quoted scalar, so this
 * escapes quotes, backslashes, newlines and other control characters at once.
 *
 * @param {*} value - Value to render; null/undefined become the empty string.
 * @returns {string} The quoted, escaped scalar.
 */
function toYamlString(value) {
  return JSON.stringify(String(value ?? ''));
}

/**
 * Download one article, extract its main content as Markdown via Defuddle, and
 * save it with YAML front matter under `<config.output.baseDir>/<sourceId>/`.
 *
 * The file is named `<date>-<slug>.md`, where the date is the article's
 * published date when it parses, otherwise today's date (UTC).
 *
 * @param {string} url - Article URL.
 * @param {string} sourceId - Source id used as the output sub-directory; falls back to 'manual'.
 * @param {object} config - Crawler config; `settings.userAgent` and `output.baseDir` are used.
 * @returns {Promise<object|null>} Metadata ({url, title, author, published, wordCount, filename, filepath}),
 *   or null when the fetch fails or the extracted content is shorter than 100 characters.
 */
async function fetchArticle(url, sourceId, config) {
  let dom = null;
  try {
    dom = await JSDOM.fromURL(url, {
      pretendToBeVisual: true,
      userAgent: config.settings.userAgent,
    });

    const result = await Defuddle(dom, url, { markdown: true });

    if (!result || !result.content || result.content.trim().length < 100) {
      log('warn', `内容过短或为空: ${url}`);
      return null;
    }

    const today = new Date().toISOString().slice(0, 10);
    let date = today;
    if (result.published) {
      // An unparseable value yields an Invalid Date (it does not throw), so a
      // NaN check is all that is needed; today's date remains the fallback.
      const parsed = new Date(result.published);
      if (!Number.isNaN(parsed.getTime())) {
        date = parsed.toISOString().slice(0, 10);
      }
    }

    // A title made only of characters the slugifier drops would produce an
    // empty slug and a "<date>-.md" file, so fall back to the URL's last path
    // segment and finally to a fixed placeholder.
    const slug = slugify(result.title || '')
      || slugify(path.basename(new URL(url).pathname))
      || 'untitled';
    const filename = `${date}-${slug}.md`;

    const frontMatter = [
      '---',
      `title: ${toYamlString(result.title)}`,
      `author: ${toYamlString(result.author)}`,
      `source: ${toYamlString(url)}`,
      `domain: ${toYamlString(result.domain)}`,
      `site: ${toYamlString(result.site)}`,
      `published: ${toYamlString(result.published)}`,
      `fetched: "${today}"`,
      `wordCount: ${result.wordCount || 0}`,
      `description: ${toYamlString(result.description)}`,
      '---',
    ].join('\n');

    const output = `${frontMatter}\n\n${result.content}\n`;

    const outputDir = path.resolve(__dirname, config.output.baseDir, sourceId || 'manual');
    ensureDir(outputDir);
    const filepath = path.join(outputDir, filename);
    fs.writeFileSync(filepath, output, 'utf-8');

    return {
      url,
      title: result.title,
      author: result.author,
      published: result.published,
      wordCount: result.wordCount,
      filename,
      filepath,
    };
  } catch (err) {
    log('error', `抓取失败 ${url}: ${err.message}`);
    return null;
  } finally {
    // Release the jsdom window; everything needed has been copied out of it.
    dom?.window.close();
  }
}
|
|
242
|
+
|
|
243
|
+
// ============================================================
|
|
244
|
+
// Core: Batch Fetch with Concurrency Control
|
|
245
|
+
// ============================================================
|
|
246
|
+
|
|
247
|
+
/**
 * Fetch a list of article URLs in fixed-size batches.
 *
 * Every batch is fetched in parallel; batches run one after another with a
 * pause of `config.settings.requestDelay` milliseconds between them.
 *
 * @param {string[]} urls - Article URLs to fetch.
 * @param {string} sourceId - Source id forwarded to fetchArticle.
 * @param {object} config - Crawler config; `settings.maxConcurrent` (default 3) sets the batch size.
 * @returns {Promise<object[]>} Metadata of the articles that were fetched successfully.
 */
async function batchFetch(urls, sourceId, config) {
  const chunkSize = config.settings.maxConcurrent || 3;
  const totalBatches = Math.ceil(urls.length / chunkSize);
  const collected = [];

  for (let offset = 0; offset < urls.length; offset += chunkSize) {
    const chunk = urls.slice(offset, offset + chunkSize);
    const batchNum = Math.floor(offset / chunkSize) + 1;

    log('info', `批次 ${batchNum}/${totalBatches}: 抓取 ${chunk.length} 篇`);

    const pending = chunk.map((url) => fetchArticle(url, sourceId, config));
    const settled = await Promise.all(pending);

    // fetchArticle reports a failure as null; keep successful results only.
    for (const article of settled) {
      if (article) {
        collected.push(article);
      }
    }

    const moreToFetch = offset + chunkSize < urls.length;
    if (moreToFetch) {
      await sleep(config.settings.requestDelay);
    }
  }

  return collected;
}
|
|
271
|
+
|
|
272
|
+
// ============================================================
|
|
273
|
+
// Report Generation
|
|
274
|
+
// ============================================================
|
|
275
|
+
|
|
276
|
+
/**
 * Build a Markdown summary of a crawl run.
 *
 * Results are grouped by source, taken from the name of the directory each
 * article was saved in, and listed in the order they were fetched.
 *
 * @param {object[]} allResults - Article metadata objects as returned by fetchArticle.
 * @param {number} startTime - Crawl start time in epoch milliseconds.
 * @returns {string} The report as Markdown text.
 */
function generateReport(allResults, startTime) {
  // Take one timestamp so date, time and duration are mutually consistent.
  const now = new Date();
  const duration = ((now.getTime() - startTime) / 1000).toFixed(1);
  const iso = now.toISOString();
  const date = iso.slice(0, 10);
  const time = iso.slice(11, 19);

  const lines = [
    `# 抓取报告`,
    ``,
    `- 时间: ${date} ${time}`,
    `- 耗时: ${duration}s`,
    `- 总计抓取: ${allResults.length} 篇`,
    ``,
  ];

  // Group by source. A Map avoids clashes with Object.prototype keys such as
  // "constructor" and keeps the sources in first-seen order.
  const bySource = new Map();
  for (const r of allResults) {
    const source = path.basename(path.dirname(r.filepath));
    const bucket = bySource.get(source);
    if (bucket) {
      bucket.push(r);
    } else {
      bySource.set(source, [r]);
    }
  }

  for (const [source, results] of bySource) {
    lines.push(`## ${source} (${results.length} 篇)`);
    lines.push('');
    for (const r of results) {
      // Title and word count may be missing from the extraction result; fall
      // back to the file name and 0 rather than printing "undefined".
      const title = r.title || r.filename || path.basename(r.filepath);
      const wordCount = r.wordCount ?? 0;
      lines.push(`- **${title}** — ${r.author || '未知作者'} — ${r.published || '日期未知'} — ${wordCount} 词`);
    }
    lines.push('');
  }

  return lines.join('\n');
}
|
|
309
|
+
|
|
310
|
+
// ============================================================
|
|
311
|
+
// Main
|
|
312
|
+
// ============================================================
|
|
313
|
+
|
|
314
|
+
/**
 * Return the value that follows `flag` on the command line.
 *
 * Exits the process with status 1 when the flag is present but has no value
 * (it is last, or is followed by another `--option`).
 *
 * @param {string[]} args - Command-line arguments without the node/script prefix.
 * @param {string} flag - Flag to look up, e.g. '--source'.
 * @returns {string|null} The flag's value, or null when the flag is absent.
 */
function readFlagValue(args, flag) {
  const index = args.indexOf(flag);
  if (index === -1) {
    return null;
  }
  const value = args[index + 1];
  if (!value || value.startsWith('--')) {
    log('error', `参数 ${flag} 缺少取值`);
    process.exit(1);
  }
  return value;
}

/**
 * CLI entry point.
 *
 * Modes:
 *   --url <url>      fetch a single article into the 'manual' source folder;
 *   otherwise        crawl every enabled source (optionally only --source <id>),
 *                    incrementally by default or completely with --full.
 *
 * After each source the fetched URLs are merged into the persisted state, and
 * when at least one article was saved a crawl-report.md is written two
 * directories above this script.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);
  const fullMode = args.includes('--full');
  // A flag without a value used to be ignored silently, so "--source" alone
  // crawled every source; it is now rejected up front.
  const sourceFilter = readFlagValue(args, '--source');
  const singleUrl = readFlagValue(args, '--url');

  const config = loadConfig();
  const state = loadState();
  const startTime = Date.now();
  const allResults = [];

  if (singleUrl) {
    // Mode 1: a single article URL.
    log('info', `单篇抓取: ${singleUrl}`);
    const result = await fetchArticle(singleUrl, 'manual', config);
    if (result) {
      log('ok', `已保存: ${result.filename}`);
      allResults.push(result);
    }
  } else {
    // Mode 2: crawl the subscribed sources.
    const sources = config.sources
      .filter((s) => s.enabled)
      .filter((s) => !sourceFilter || s.id === sourceFilter);

    if (sources.length === 0) {
      log('warn', sourceFilter
        ? `未找到源: ${sourceFilter}`
        : '没有已启用的订阅源');
      process.exit(1);
    }

    for (const source of sources) {
      // Step 1: discover new article URLs.
      const newUrls = await discoverArticles(source, config, state, fullMode);

      if (newUrls.length === 0) {
        log('skip', `${source.name}: 没有新文章`);
        continue;
      }

      // Step 2: fetch them in batches.
      const results = await batchFetch(newUrls, source.id, config);
      allResults.push(...results);

      // Step 3: record only the URLs that were fetched successfully, so
      // failed ones are retried on the next run.
      if (!state.sources[source.id]) {
        state.sources[source.id] = { fetchedUrls: [], lastFetch: null };
      }
      const fetchedUrls = results.map((r) => r.url);
      state.sources[source.id].fetchedUrls = [
        ...new Set([
          ...fetchedUrls,
          ...(state.sources[source.id].fetchedUrls || []),
        ]),
      ];
      state.sources[source.id].lastFetch = new Date().toISOString();
      // Save after every source so an interrupted run keeps its progress.
      saveState(state);

      log('ok', `${source.name}: 成功抓取 ${results.length}/${newUrls.length} 篇`);
    }
  }

  // Write the report only when something was actually fetched.
  if (allResults.length > 0) {
    const report = generateReport(allResults, startTime);
    const reportDir = path.resolve(__dirname, '..', '..');
    const reportPath = path.join(reportDir, 'crawl-report.md');
    fs.writeFileSync(reportPath, report, 'utf-8');
    log('ok', `报告已生成: crawl-report.md`);
  }

  log('ok', `完成! 共抓取 ${allResults.length} 篇文章`);
}
|
|
393
|
+
|
|
394
|
+
// Run the CLI; any error that escapes main() is logged with its stack trace
// and turned into a non-zero exit status.
main().catch((fatal) => {
  const summary = `致命错误: ${fatal.message}`;
  log('error', summary);
  console.error(fatal);
  process.exit(1);
});
|