079project 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/crawler/agent.cjs +97 -0
- package/crawler/index.cjs +515 -0
- package/crawler/storage.cjs +163 -0
- package/groupmanager.cjs +2 -1
- package/main_Serve.cjs +1136 -210
- package/main_Study.cjs +1584 -349
- package/package.json +2 -1
- package/robots/seeds.txt +2 -0
- package/schedule.cjs +745 -0
- package/todo-list.txt +0 -86
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
const urlLib = require('url');
|
|
2
|
+
|
|
3
|
+
// 基础 tokenizer(英文/中文混合,清停用词可在调用层做)
|
|
4
|
+
/**
 * Split text into lowercase word tokens.
 *
 * Everything except ASCII letters/digits, characters in the range
 * U+4E00..U+9FA5 and whitespace is replaced by a space before splitting, so
 * punctuation acts as a separator. A run of CJK characters with no whitespace
 * inside it stays a single token. Stop-word removal is left to the caller.
 *
 * @param {*} s - Value to tokenize; falsy values are treated as an empty string.
 * @returns {string[]} Non-empty tokens in their original order.
 */
function tokenize(s) {
  const lowered = String(s || '').toLowerCase();
  const cleaned = lowered.replace(/[^a-z0-9\u4e00-\u9fa5\s]/g, ' ');
  const words = [];
  for (const piece of cleaned.split(/\s+/)) {
    if (piece) words.push(piece);
  }
  return words;
}
|
|
11
|
+
|
|
12
|
+
// 简单查询规划:原始提示 + 派生关键词 + 站点/纵向限定
|
|
13
|
+
/**
 * Build a small set of search queries from a free-text prompt.
 *
 * The plan consists of the leading tokens of the prompt, a longer variant,
 * two derived "analysis" / "review" queries, a Wikipedia-restricted and a
 * PDF-restricted query, plus one site-restricted query per domain configured
 * for the requested vertical.
 *
 * @param {string} prompt - User prompt; tokenized with `tokenize`.
 * @param {string} [vertical='general'] - One of 'science', 'news', 'tech',
 *   'finance', 'general'. Unknown values add no site-restricted queries.
 * @returns {string[]} Up to 10 unique queries; empty when the prompt yields
 *   no tokens.
 */
function planQueries(prompt, vertical = 'general') {
  const base = String(prompt || '').trim();
  const tokens = tokenize(base);

  // Without tokens every template would degrade to a bare operator such as
  // " site:wikipedia.org", which is useless as a search query.
  if (tokens.length === 0) return [];

  const head = tokens.slice(0, 6).join(' ');
  const queries = new Set([head]);

  if (tokens.length > 6) queries.add(tokens.slice(0, 10).join(' '));
  if (tokens.length >= 2) {
    queries.add(`${tokens[0]} ${tokens[1]} analysis`);
    queries.add(`${tokens[0]} vs ${tokens[1]} review`);
  }
  queries.add(`${head} site:wikipedia.org`);
  queries.add(`${head} filetype:pdf`);

  // Site restrictions per vertical.
  const vMap = {
    science: ['site:nature.com', 'site:sciencedirect.com', 'site:arxiv.org'],
    news: ['site:reuters.com', 'site:bbc.com', 'site:apnews.com'],
    tech: ['site:stackoverflow.com', 'site:github.com', 'site:arstechnica.com'],
    finance: ['site:bloomberg.com', 'site:ft.com', 'site:wsj.com'],
    general: []
  };
  // Own-property check so that a vertical like "constructor" does not resolve
  // to something inherited from Object.prototype.
  const sites = Object.prototype.hasOwnProperty.call(vMap, vertical) ? vMap[vertical] : [];
  for (const site of sites) queries.add(`${head} ${site}`);

  return Array.from(queries).slice(0, 10);
}
|
|
38
|
+
|
|
39
|
+
// 从 SERP HTML 提取候选链接(DuckDuckGo/简单通用版)
|
|
40
|
+
/**
 * Extract candidate result links from a search-result page (DuckDuckGo HTML
 * endpoint or similarly simple markup).
 *
 * DuckDuckGo's HTML results point at a redirector of the form
 * `//duckduckgo.com/l/?uddg=<encoded target>`; those are unwrapped to the
 * real target before filtering, otherwise the search-engine self-link filter
 * below would discard every actual result.
 *
 * @param {string} html - Raw SERP markup.
 * @param {string} [base='https://duckduckgo.com'] - Base used to resolve
 *   relative hrefs.
 * @returns {{url: string, title: string}[]} Unique http(s) links in document
 *   order, excluding links that stay on DuckDuckGo/Bing/Google hosts.
 */
function extractSerpLinks(html, base = 'https://duckduckgo.com') {
  const anchorRe = /<a\s+[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
  const source = String(html || '');
  const results = [];
  const seen = new Set();
  let m;
  while ((m = anchorRe.exec(source)) !== null) {
    // Attribute values are HTML-escaped in the markup; undo "&amp;" so the
    // query string parses into the intended parameters.
    const href = (m[1] || '').trim().replace(/&amp;/gi, '&');
    if (!href) continue;
    const title = String(m[2] || '').replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
    try {
      let u = new urlLib.URL(href, base);

      // Unwrap the DuckDuckGo redirector to the real destination.
      if (/(^|\.)duckduckgo\.com$/i.test(u.hostname) && /^\/l\/?$/.test(u.pathname)) {
        const target = u.searchParams.get('uddg');
        if (target) u = new urlLib.URL(target);
      }

      if (!/^https?:$/i.test(u.protocol)) continue;
      // Drop links that stay on the search engines themselves.
      if (/duckduckgo\.com|bing\.com|google\./i.test(u.hostname)) continue;

      const url = u.toString();
      if (seen.has(url)) continue;
      seen.add(url);
      results.push({ url, title });
    } catch {
      // Malformed href: skip this anchor, keep scanning.
    }
  }
  return results;
}
|
|
58
|
+
|
|
59
|
+
// 相关性与方向性评分
|
|
60
|
+
/**
 * Score how relevant a search candidate is to a prompt.
 *
 * The base score is the Jaccard similarity between the prompt tokens and the
 * tokens of `title + snippet`. Small fixed bonuses are then added:
 * +0.08 when the host belongs to a domain listed for the vertical,
 * +0.05 when the URL path ends in `.pdf`, +0.03 for wikipedia.org hosts.
 *
 * @param {{url: string, title?: string, snippet?: string}} candidate
 * @param {string} prompt - Prompt the candidate is compared against.
 * @param {string} [vertical='general'] - Vertical used for the domain bonus.
 * @returns {number} 0 when either side has no tokens, otherwise
 *   Jaccard similarity plus bonuses.
 */
function relevanceScore({ url, title = '', snippet = '' }, prompt, vertical = 'general') {
  const promptTokens = new Set(tokenize(prompt));
  const textTokens = new Set(tokenize(title + ' ' + snippet));
  if (promptTokens.size === 0 || textTokens.size === 0) return 0;

  let shared = 0;
  for (const w of promptTokens) if (textTokens.has(w)) shared++;

  // Jaccard similarity: |A ∩ B| / |A ∪ B|.
  const unionSize = promptTokens.size + textTokens.size - shared;
  const jaccard = shared / Math.max(1, unionSize);

  const host = safeHost(url);
  // Match the domain itself or a true subdomain; a plain endsWith() would
  // also accept look-alikes such as "notnature.com".
  const onDomain = (d) => host === d || host.endsWith('.' + d);

  const verticalDomains = {
    science: ['nature.com', 'sciencedirect.com', 'arxiv.org', 'acm.org'],
    news: ['reuters.com', 'bbc.com', 'apnews.com', 'nytimes.com', 'theguardian.com'],
    tech: ['stackoverflow.com', 'github.com', 'arstechnica.com', 'wired.com'],
    finance: ['bloomberg.com', 'ft.com', 'wsj.com', 'reuters.com'],
    general: []
  };
  const domains = Object.prototype.hasOwnProperty.call(verticalDomains, vertical)
    ? verticalDomains[vertical]
    : [];

  let boost = 0;
  if (domains.some(onDomain)) boost += 0.08;
  // Ignore query string and fragment so "paper.pdf?download=1" still counts.
  const pathPart = String(url || '').split(/[?#]/)[0];
  if (/\.pdf$/i.test(pathPart)) boost += 0.05;
  if (onDomain('wikipedia.org')) boost += 0.03;

  return jaccard + boost;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Return the hostname of a URL without throwing.
 *
 * @param {string} u - Absolute URL.
 * @returns {string} Hostname as parsed by the URL class, or '' when `u`
 *   cannot be parsed.
 */
function safeHost(u) {
  let host = '';
  try {
    const parsed = new urlLib.URL(u);
    host = parsed.hostname || '';
  } catch {
    host = '';
  }
  return host;
}
|
|
92
|
+
|
|
93
|
+
module.exports = {
|
|
94
|
+
planQueries,
|
|
95
|
+
extractSerpLinks,
|
|
96
|
+
relevanceScore
|
|
97
|
+
};
|
|
@@ -0,0 +1,515 @@
|
|
|
1
|
+
// ...existing code...
|
|
2
|
+
const fs = require('fs');
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const urlLib = require('url');
|
|
5
|
+
const http = require('http');
|
|
6
|
+
const https = require('https');
|
|
7
|
+
const { CrawlerStorage } = require('./storage.cjs');
|
|
8
|
+
// 新增:定向检索工具(若不存在也不影响本改造)
|
|
9
|
+
let planQueries, extractSerpLinks, relevanceScore;
|
|
10
|
+
try {
|
|
11
|
+
({ planQueries, extractSerpLinks, relevanceScore } = require('./agent.cjs'));
|
|
12
|
+
} catch { /* 忽略 */ }
|
|
13
|
+
|
|
14
|
+
// 可选代理 Agent(若未安装对应包,将自动降级直连)
|
|
15
|
+
let HttpsProxyAgent = null;
|
|
16
|
+
let SocksProxyAgent = null;
|
|
17
|
+
try { HttpsProxyAgent = require('https-proxy-agent'); } catch {}
|
|
18
|
+
try { SocksProxyAgent = require('socks-proxy-agent'); } catch {}
|
|
19
|
+
|
|
20
|
+
// 新增:尝试动态渲染(Puppeteer / Puppeteer-Core)
|
|
21
|
+
let Puppeteer = null;
|
|
22
|
+
try { Puppeteer = require('puppeteer'); } catch {
|
|
23
|
+
try { Puppeteer = require('puppeteer-core'); } catch {}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
const DEFAULT_UAS = [
|
|
27
|
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36',
|
|
28
|
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0',
|
|
29
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
|
|
30
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36',
|
|
31
|
+
];
|
|
32
|
+
|
|
33
|
+
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay in milliseconds, passed to setTimeout.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
34
|
+
/**
 * Produce a random dotted-quad string.
 *
 * @returns {string} Four octets joined by '.', each drawn from 1..254.
 */
function randomIPv4() {
  const octets = [];
  for (let i = 0; i < 4; i += 1) {
    octets.push(Math.floor(Math.random() * 254) + 1);
  }
  return octets.join('.');
}
|
|
35
|
+
|
|
36
|
+
/**
 * Per-host cache of robots.txt rules that apply to the wildcard user agent.
 *
 * Only `Allow` / `Disallow` records of `User-agent: *` groups are kept.
 * Matching is plain prefix matching; the longest matching rule wins and an
 * Allow wins a tie.
 */
class RobotsCache {
  constructor() {
    // host -> { fetchedAt: number, rules: [{ type: 'allow'|'disallow', path }] }
    this.cache = new Map();
    // Cache entries are reused for 12 hours.
    this.ttl = 12 * 60 * 60 * 1000;
  }

  /**
   * Return the rules for a host, downloading robots.txt when the cached entry
   * is missing or older than the TTL.
   *
   * @param {string} host - Host (optionally with port) to query over HTTPS.
   * @param {(url: string, opts: {timeout: number}) => Promise<{status: number, body?: string}>} fetchFn
   *   Function used to download the file.
   * @returns {Promise<Array<{type: string, path: string}>>} Parsed rules; an
   *   empty list (cached as well) when the file is unavailable or the
   *   download throws.
   */
  async fetch(host, fetchFn) {
    const hit = this.cache.get(host);
    const now = Date.now();
    if (hit && now - hit.fetchedAt < this.ttl) return hit.rules;

    const url = `https://${host}/robots.txt`;
    let rules = [];
    try {
      const { status, body } = await fetchFn(url, { timeout: 8000 });
      if (status === 200 && body) rules = this.parse(body);
    } catch {
      // Best effort: an unreachable robots.txt is treated as "no rules".
      rules = [];
    }
    this.cache.set(host, { fetchedAt: now, rules });
    return rules;
  }

  /**
   * Parse robots.txt content into the allow/disallow rules of the `*` groups.
   *
   * @param {string} content - Raw robots.txt text.
   * @returns {Array<{type: 'allow'|'disallow', path: string}>} Rules in file
   *   order. Records with an empty value are skipped.
   */
  parse(content) {
    const rules = [];
    let inStarGroup = false;   // does the current group include "User-agent: *"?
    let readingAgents = false; // was the previous record a User-agent line?

    for (const rawLine of String(content).split(/\r?\n/)) {
      // Strip full-line and trailing comments.
      const hashAt = rawLine.indexOf('#');
      const line = (hashAt === -1 ? rawLine : rawLine.slice(0, hashAt)).trim();
      if (!line) continue;

      // Split on the FIRST colon only, so values containing ':' stay intact.
      const colonAt = line.indexOf(':');
      if (colonAt === -1) continue;
      const key = line.slice(0, colonAt).trim().toLowerCase();
      const value = line.slice(colonAt + 1).trim();

      if (key === 'user-agent') {
        // Consecutive User-agent lines belong to the same group; a User-agent
        // line after a rule starts a new group.
        inStarGroup = readingAgents ? (inStarGroup || value === '*') : value === '*';
        readingAgents = true;
        continue;
      }

      if (key !== 'allow' && key !== 'disallow') continue; // e.g. Sitemap, Crawl-delay
      readingAgents = false;
      if (inStarGroup && value) rules.push({ type: key, path: value });
    }
    return rules;
  }

  /**
   * Decide whether a path may be fetched under the given rules.
   *
   * @param {string} host - Unused by the decision; kept for the caller's signature.
   * @param {string} pathName - URL path to test.
   * @param {Array<{type: string, path: string}>} rules - Output of `parse` / `fetch`.
   * @returns {boolean} false only when the longest matching Disallow prefix is
   *   longer than the longest matching Allow prefix.
   */
  allowed(host, pathName, rules) {
    if (!rules || rules.length === 0) return true;
    let allowLen = -1;
    let disallowLen = -1;
    for (const r of rules) {
      if (!r.path || !pathName.startsWith(r.path)) continue;
      if (r.type === 'allow') allowLen = Math.max(allowLen, r.path.length);
      else if (r.type === 'disallow') disallowLen = Math.max(disallowLen, r.path.length);
    }
    return !(disallowLen > allowLen);
  }
}
|
|
91
|
+
|
|
92
|
+
/**
 * Crawler front end: static fetching with optional proxy, optional
 * Puppeteer rendering, robots.txt checks, per-host pacing, recursive crawling
 * and a background worker pool fed from CrawlerStorage.
 */
class CrawlerManager {
  /**
   * @param {object} [options] - Overrides merged over the defaults below.
   */
  constructor(options = {}) {
    this.opts = Object.assign({
      concurrency: 4,
      perHostDelayMs: 1500,
      requestTimeoutMs: 12000,
      userAgents: DEFAULT_UAS,
      proxiesFile: path.join(__dirname, 'proxies.txt'),
      seedsFile: path.join(__dirname, '..', 'robots', 'seeds.txt'),
      maxDocPerTick: 8,
      allowLang: ['en', 'zh'],
      minTextLen: 200,
      // Dynamic rendering and recursive crawling.
      renderDynamic: 'auto',   // 'never' | 'auto' | 'always'
      renderTimeoutMs: 15000,
      renderConcurrency: 1,    // pages rendered at the same time
      dynamicHosts: [],        // host suffixes that are always rendered
      dynamicPathRegex: null,  // paths matching this are rendered
      maxDepth: 2,             // default depth for crawlRecursive
      sameDomainOnly: true,    // keep recursion on the seed's hostname
      maxPagesPerTask: 100     // default page budget for crawlRecursive
    }, options);
    this.running = false;
    this.storage = new CrawlerStorage();
    this.robots = new RobotsCache();
    this.perHostTime = new Map(); // host -> timestamp of the last granted slot
    this.queue = new Set();       // URLs currently being crawled by workers
    this.uaIdx = 0;
    this.proxies = this._loadProxies();
    // Renderer state.
    this._puppeteer = Puppeteer || null;
    this._browser = null;
    this._renderInFlight = 0;
  }

  // ---------- Directed search (requires agent.cjs) ----------

  /**
   * Plan queries for a prompt, collect SERP links, rank them, enqueue the
   * best ones and optionally crawl the top results immediately.
   *
   * @param {string} prompt
   * @param {object} [o]
   * @param {string} [o.vertical='general'] - Vertical passed to the planner/scorer.
   * @param {number} [o.perQuery=8] - Links kept per query.
   * @param {number} [o.maxEnqueue=30] - Candidates handed to storage.
   * @param {boolean} [o.crawl=true] - Crawl the top candidates right away.
   * @param {number} [o.maxCrawl=12] - Number of candidates crawled right away.
   * @returns {Promise<object>} Summary with queries, top candidates and counters.
   */
  async directedSearch(prompt, {
    vertical = 'general',
    perQuery = 8,
    maxEnqueue = 30,
    crawl = true,
    maxCrawl = 12
  } = {}) {
    if (!planQueries || !extractSerpLinks || !relevanceScore) {
      return { prompt, vertical, queries: [], candidates: [], enqueued: 0, crawled: 0, saved: 0, note: 'agent.cjs 不可用' };
    }
    const queries = planQueries(prompt, vertical);
    const serpHost = 'https://html.duckduckgo.com/html/';
    const candidates = [];

    for (const q of queries) {
      const url = `${serpHost}?q=${encodeURIComponent(q)}`;
      const { status, body } = await this.fetchUrl(url, { timeout: 12000 });
      if (status !== 200 || !body) continue;

      const links = extractSerpLinks(body, serpHost).slice(0, perQuery).map(x => ({ ...x, query: q }));
      for (const it of links) {
        const score = relevanceScore({ url: it.url, title: it.title }, prompt, vertical);
        candidates.push({ ...it, score });
      }
      // Small randomized pause between SERP requests.
      await sleep(300 + Math.floor(Math.random() * 400));
    }

    candidates.sort((a, b) => b.score - a.score);
    const picked = candidates.slice(0, maxEnqueue);
    const added = this.storage.enqueueLinks(picked.map(p => p.url), maxEnqueue);

    let crawled = 0;
    let savedDocs = 0;
    if (crawl) {
      for (const p of picked.slice(0, maxCrawl)) {
        const r = await this._crawlOne(p.url, { parent: null, depth: 0 });
        if (r && r.ok) { crawled++; savedDocs += r.saved ? 1 : 0; }
      }
    }

    return {
      prompt, vertical, queries,
      candidates: candidates.slice(0, 50).map(({ url, title, score }) => ({ url, title, score: Number(score.toFixed(3)) })),
      enqueued: added, crawled, saved: savedDocs
    };
  }

  // ---------- Proxy / User-Agent ----------

  /** Read the proxy list (one entry per line); [] when missing or unreadable. */
  _loadProxies() {
    try {
      if (!fs.existsSync(this.opts.proxiesFile)) return [];
      return fs.readFileSync(this.opts.proxiesFile, 'utf-8').split(/\r?\n/).map(s => s.trim()).filter(Boolean);
    } catch { return []; }
  }

  /** Return the next configured User-Agent, cycling round-robin. */
  _pickUA() {
    const ua = this.opts.userAgents[this.uaIdx % this.opts.userAgents.length];
    this.uaIdx++;
    return ua;
  }

  /** Return a random proxy string, or null when none are configured. */
  _pickProxy() {
    if (!this.proxies.length) return null;
    const i = Math.floor(Math.random() * this.proxies.length);
    return this.proxies[i];
  }

  /**
   * Build an agent for the given proxy string when the matching optional
   * package is installed; null means "connect directly".
   */
  _agentForUrl(targetUrl, proxyStr) {
    if (!proxyStr) return null;
    try {
      if (/^socks/i.test(proxyStr) && SocksProxyAgent) return new SocksProxyAgent.SocksProxyAgent(proxyStr);
      if (/^http/i.test(proxyStr) && HttpsProxyAgent) return new HttpsProxyAgent.HttpsProxyAgent(proxyStr);
    } catch {}
    return null;
  }

  // ---------- Static fetch ----------

  /**
   * GET a URL. Never rejects: failures resolve with `status: 0` and an
   * `error` code ('timeout', 'network', 'badurl' or 'too-large').
   * Redirects are not followed; `finalUrl` echoes the requested URL.
   *
   * @param {string} targetUrl
   * @param {{timeout?: number}} [o] - Timeout in ms passed to the request.
   * @returns {Promise<{status: number, headers?: object, body?: string, finalUrl?: string, error?: string}>}
   */
  async fetchUrl(targetUrl, { timeout = 8000 } = {}) {
    return new Promise((resolve) => {
      // Several events (timeout + error, end + close) can fire for the same
      // request; only the first outcome is reported.
      let settled = false;
      const finish = (result) => {
        if (settled) return;
        settled = true;
        resolve(result);
      };

      try {
        const u = new urlLib.URL(targetUrl);
        const proxy = this._pickProxy();
        const agent = this._agentForUrl(targetUrl, proxy);
        const client = u.protocol === 'https:' ? https : http;
        const headers = {
          'User-Agent': this._pickUA(),
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': Math.random() < 0.5 ? 'en-US,en;q=0.8' : 'zh-CN,zh;q=0.8,en;q=0.6',
          'Connection': 'keep-alive',
          'X-Forwarded-For': randomIPv4(),
          'X-Real-IP': randomIPv4()
        };
        const opts = {
          protocol: u.protocol,
          hostname: u.hostname,
          port: u.port || (u.protocol === 'https:' ? 443 : 80),
          path: u.pathname + (u.search || ''),
          method: 'GET',
          headers,
          timeout,
          agent: agent || undefined,
        };
        const req = client.request(opts, (res) => {
          const { statusCode } = res;
          const chunks = [];
          const limit = 3 * 1024 * 1024; // body cap: 3 MiB
          let received = 0;
          res.on('data', (d) => {
            chunks.push(d);
            received += d.length;
            if (received > limit) {
              // Settle explicitly: after destroy() neither 'end' nor a
              // request-level 'error' is guaranteed to fire.
              finish({ status: 0, error: 'too-large' });
              req.destroy();
            }
          });
          res.on('end', () => {
            const buf = Buffer.concat(chunks);
            finish({ status: statusCode, headers: res.headers, body: buf.toString('utf-8'), finalUrl: targetUrl });
          });
          // Connection dropped mid-body.
          res.on('error', () => finish({ status: 0, error: 'network' }));
          res.on('close', () => finish({ status: 0, error: 'network' }));
        });
        req.on('timeout', () => { finish({ status: 0, error: 'timeout' }); req.destroy(); });
        req.on('error', () => finish({ status: 0, error: 'network' }));
        req.end();
      } catch {
        finish({ status: 0, error: 'badurl' });
      }
    });
  }

  // ---------- Dynamic rendering ----------

  /** Launch the headless browser on first use; false when unavailable. */
  async _ensureRenderer() {
    if (!this._puppeteer) return false;
    if (this._browser) return true;
    try {
      this._browser = await this._puppeteer.launch({
        headless: 'new',
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage'
        ]
      });
      return true;
    } catch (e) {
      console.warn('[CRAWLER][RENDER] 启动失败:', e.message);
      this._browser = null;
      return false;
    }
  }

  /**
   * Render a page in the headless browser and return its HTML and links.
   *
   * @param {string} url
   * @param {{timeout?: number}} [o] - Navigation timeout in ms.
   * @returns {Promise<{ok: boolean, html?: string, links?: string[], finalUrl?: string, error?: string}>}
   */
  async _renderHtml(url, { timeout = 15000 } = {}) {
    if (!await this._ensureRenderer()) return { ok: false };
    // Wait for a free render slot.
    while (this._renderInFlight >= Math.max(1, this.opts.renderConcurrency)) {
      await sleep(100);
    }
    this._renderInFlight++;
    let page = null;
    try {
      page = await this._browser.newPage();
      await page.setUserAgent(this._pickUA());
      await page.setExtraHTTPHeaders({
        'Accept-Language': Math.random() < 0.5 ? 'en-US,en;q=0.8' : 'zh-CN,zh;q=0.8,en;q=0.6',
        'X-Forwarded-For': randomIPv4(),
        'X-Real-IP': randomIPv4()
      });
      await page.setRequestInterception(true);
      page.on('request', (req) => {
        const rtype = req.resourceType();
        // Skip heavy resources that are irrelevant for text extraction.
        if (['image', 'media', 'font'].includes(rtype)) return req.abort();
        req.continue();
      });
      await page.goto(url, { waitUntil: 'networkidle2', timeout });
      // Scroll for about two seconds to trigger lazy-loaded content.
      await page.evaluate(async () => {
        await new Promise(resolve => {
          let t = 0;
          const id = setInterval(() => {
            window.scrollBy(0, 800);
            if ((t += 200) >= 2000) { clearInterval(id); resolve(); }
          }, 200);
        });
      });
      const content = await page.content();
      // Links as seen after rendering.
      const links = await page.$$eval('a[href]', as => as.map(a => a.href).filter(Boolean));
      const finalUrl = page.url();
      return { ok: true, html: content, links, finalUrl };
    } catch (e) {
      return { ok: false, error: e.message };
    } finally {
      // Always release the page, including after navigation errors or
      // timeouts; otherwise failed renders accumulate open tabs.
      if (page) {
        try { await page.close(); } catch {}
      }
      this._renderInFlight--;
    }
  }

  // ---------- Parsing ----------

  /**
   * Extract title, plain text, absolute links and a coarse language guess
   * from HTML.
   *
   * @param {string} html
   * @param {string} baseUrl - Base for resolving relative hrefs.
   * @returns {{title: string, text: string, links: string[], lang: 'zh'|'en'}}
   *   `lang` is 'zh' when more than 30% of the text is non-ASCII.
   */
  extractContent(html, baseUrl) {
    const source = String(html || '');
    const text = source.replace(/<script[\s\S]*?<\/script>/gi, ' ')
      .replace(/<style[\s\S]*?<\/style>/gi, ' ')
      .replace(/<!--[\s\S]*?-->/g, ' ');
    const mTitle = text.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    const title = mTitle ? mTitle[1].replace(/\s+/g, ' ').trim() : '';
    const body = text.replace(/<\/(p|div|h\d|br|li)>/gi, '\n')
      .replace(/<[^>]+>/g, ' ')
      // Decode the common named entities; "&amp;" goes last so that
      // "&amp;lt;" ends up as the literal text "&lt;".
      .replace(/&nbsp;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      .replace(/&amp;/g, '&')
      .replace(/\s+\n/g, '\n')
      .replace(/\n{3,}/g, '\n\n')
      .replace(/[ \t]{2,}/g, ' ')
      .trim();

    const links = [];
    const aTagRe = /<a\s+[^>]*href=["']([^"']+)["'][^>]*>/gi;
    let base;
    try { base = new urlLib.URL(baseUrl); } catch { base = null; }
    let m;
    while ((m = aTagRe.exec(source)) !== null) {
      const href = (m[1] || '').trim();
      if (!href) continue;
      try {
        const abs = base ? new urlLib.URL(href, base).toString() : href;
        links.push(abs);
      } catch {}
    }

    const nonAscii = (body.match(/[^\x00-\x7F]/g) || []).length;
    const ratio = nonAscii / Math.max(1, body.length);
    const lang = ratio > 0.3 ? 'zh' : 'en';
    return { title, text: body, links, lang };
  }

  /**
   * Per-host pacing: grants (and records) a slot when at least
   * `perHostDelayMs` has passed since the last granted slot for the host.
   */
  _hostDelayOk(host) {
    const last = this.perHostTime.get(host) || 0;
    const now = Date.now();
    if (now - last >= this.opts.perHostDelayMs) {
      this.perHostTime.set(host, now);
      return true;
    }
    return false;
  }

  // ---------- Single page crawl (with rendering fallback) ----------

  /**
   * Crawl one URL: robots check, pacing, static fetch, optional render,
   * language/length filter, save, enqueue discovered links.
   *
   * @param {string} url
   * @param {{parent: ?string, depth: number}} [ctx]
   * @returns {Promise<object>} `{ ok: true, saved, added, lang, links }` on
   *   success, otherwise `{ ok: false, reason }`.
   */
  async _crawlOne(url, ctx = { parent: null, depth: 0 }) {
    try {
      const u = new urlLib.URL(url);
      const rules = await this.robots.fetch(u.host, (u2, o) => this.fetchUrl(u2, o));
      const ok = this.robots.allowed(u.host, u.pathname, rules);
      if (!ok) return { ok: false, reason: 'robots' };

      while (!this._hostDelayOk(u.host)) await sleep(200);

      // Static fetch; anything but 200 is treated as an empty body.
      let { status, body, finalUrl } = await this.fetchUrl(url, { timeout: this.opts.requestTimeoutMs });
      if (status !== 200) body = '';
      body = body || '';

      let parsed = this.extractContent(body, finalUrl || url);

      // Conditions that trigger dynamic rendering.
      const alwaysRenderHost = this.opts.dynamicHosts.some(h => u.hostname.endsWith(h));
      const matchPath = this.opts.dynamicPathRegex && this.opts.dynamicPathRegex.test(u.pathname);
      const heuristicHeavy = (body.match(/<script/gi) || []).length >= 10 || (parsed.text || '').length < this.opts.minTextLen;
      const useRender = Boolean(
        this.opts.renderDynamic === 'always' || alwaysRenderHost || matchPath ||
        (this.opts.renderDynamic === 'auto' && heuristicHeavy)
      );

      let renderedLinks = [];
      if (useRender) {
        const r = await this._renderHtml(url, { timeout: this.opts.renderTimeoutMs });
        if (r.ok && r.html) {
          parsed = this.extractContent(r.html, r.finalUrl || url);
          renderedLinks = Array.isArray(r.links) ? r.links : [];
          finalUrl = r.finalUrl || finalUrl || url;
        }
      }

      if (!this.opts.allowLang.includes(parsed.lang) || !parsed.text || parsed.text.length < this.opts.minTextLen) {
        this.storage.state.stats.deduped++;
        return { ok: false, reason: 'short-or-lang', lang: parsed.lang };
      }

      const saved = this.storage.saveDocument({
        url: finalUrl || url,
        title: parsed.title,
        text: parsed.text,
        lang: parsed.lang,
        depth: ctx.depth || 0,
        parent: ctx.parent || ''
      });

      // Merge statically parsed links with links seen after rendering.
      const links = Array.from(new Set([...(parsed.links || []), ...renderedLinks])).slice(0, 800);
      const added = this.storage.enqueueLinks(links, 200);

      this.storage.markVisited(url);
      this.storage.state.stats.fetched++;
      return { ok: true, saved, added, lang: parsed.lang, links };
    } catch (e) {
      this.storage.state.stats.errors++;
      return { ok: false, reason: e.message };
    }
  }

  // ---------- Recursive crawl (multi-level links) ----------

  /**
   * Breadth-first crawl starting at `seedUrl`.
   *
   * @param {string} seedUrl
   * @param {object} [o]
   * @param {number} [o.maxDepth] - Maximum link depth (defaults to opts.maxDepth).
   * @param {boolean} [o.sameDomainOnly] - Stay on the seed's hostname.
   * @param {number} [o.maxPages] - Stop after this many successful pages.
   * @returns {Promise<object>} Summary, or `{ ok: false, error }` for a bad seed.
   */
  async crawlRecursive(seedUrl, {
    maxDepth = this.opts.maxDepth,
    sameDomainOnly = this.opts.sameDomainOnly,
    maxPages = this.opts.maxPagesPerTask
  } = {}) {
    let seed;
    try { seed = new urlLib.URL(seedUrl); } catch { return { ok: false, error: 'bad seed url' }; }

    const visitedLocal = new Set();
    const q = [{ url: seedUrl, depth: 0, parent: '' }];
    let crawled = 0;
    let savedDocs = 0;

    while (q.length > 0) {
      if (crawled >= maxPages) break;
      const { url, depth, parent } = q.shift();
      if (visitedLocal.has(url)) continue;
      visitedLocal.add(url);

      // Domain restriction.
      if (sameDomainOnly) {
        try {
          const u = new urlLib.URL(url);
          if (u.hostname !== seed.hostname) continue;
        } catch { continue; }
      }

      const res = await this._crawlOne(url, { parent, depth });
      if (res && res.ok) {
        crawled++;
        if (res.saved) savedDocs++;
        if (depth + 1 <= maxDepth && Array.isArray(res.links)) {
          for (const l of res.links) {
            if (!l || visitedLocal.has(l)) continue;
            q.push({ url: l, depth: depth + 1, parent: url });
          }
        }
      }

      // Light back-off so a single site is not hammered.
      await sleep(40 + Math.floor(Math.random() * 120));
    }

    return { ok: true, seed: seedUrl, crawled, saved: savedDocs, maxDepth, sameDomainOnly, maxPages };
  }

  /** Worker: pull URLs from storage and crawl them until `stop()` is called. */
  async _workerLoop(slotId) {
    while (this.running) {
      const url = this.storage.nextUrl();
      if (!url) { await sleep(1000); continue; }
      if (this.queue.has(url)) continue;
      this.queue.add(url);
      try {
        await this._crawlOne(url, { parent: null, depth: 0 });
      } finally {
        // Release the in-flight marker even if the crawl throws.
        this.queue.delete(url);
      }
      if (!this.running) break;
      await sleep(50 + Math.floor(Math.random() * 150));
    }
  }

  /** Load seeds from the seeds file (http/https lines only) and start the workers. */
  start() {
    if (this.running) return;
    if (fs.existsSync(this.opts.seedsFile)) {
      const seeds = fs.readFileSync(this.opts.seedsFile, 'utf-8').split(/\r?\n/).map(s => s.trim()).filter(x => /^https?:\/\//i.test(x));
      if (seeds.length) this.storage.addSeed(seeds);
    }
    this.running = true;
    this.workers = Array.from({ length: this.opts.concurrency }, (_, i) => this._workerLoop(i));
    console.log(`[CRAWLER] started, concurrency=${this.opts.concurrency}, proxies=${this.proxies.length}, render=${this.opts.renderDynamic}${this._puppeteer ? '' : ' (no-puppeteer)'}`);
  }

  /** Signal the workers to stop and close the renderer asynchronously. */
  stop() {
    this.running = false;
    console.log('[CRAWLER] stopping...');
    // Close the renderer.
    setTimeout(async () => {
      try { if (this._browser) await this._browser.close(); } catch {}
      this._browser = null;
    }, 0);
  }

  /** Add seed URLs to storage. */
  addSeeds(urls = []) { return this.storage.addSeed(urls); }

  /** Storage statistics extended with the manager's runtime counters. */
  stats() {
    const s = this.storage.stats();
    return Object.assign(s, { running: this.running, inflight: this.queue.size, renderInFlight: this._renderInFlight });
  }

  /** Load the most recently stored documents from storage. */
  loadRecentDocs(maxFiles = 20) { return this.storage.loadRecentDocs(maxFiles); }
}
|
|
513
|
+
|
|
514
|
+
module.exports = { CrawlerManager };
|
|
515
|
+
// ...existing code...
|