079project 1.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/GroupStarter.cjs +211 -35
- package/README.md +3 -3
- package/crawler/agent.cjs +97 -0
- package/crawler/index.cjs +515 -0
- package/crawler/storage.cjs +163 -0
- package/groupmanager.cjs +2 -1
- package/loggerworker.cjs +202 -0
- package/main_Serve.cjs +1132 -115
- package/main_Study.cjs +1749 -365
- package/package.json +2 -1
- package/robots/seeds.txt +2 -0
- package/schedule.cjs +745 -0
- package/wikitext/wikitext-103-all.txt +0 -0
- package/wikitext/.gitattributes +0 -27
- package/wikitext/README.md +0 -344
- package/wikitext/describtion.txt +0 -1
|
@@ -0,0 +1,515 @@
|
|
|
1
|
+
// ...existing code...
|
|
2
|
+
const fs = require('fs');
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const urlLib = require('url');
|
|
5
|
+
const http = require('http');
|
|
6
|
+
const https = require('https');
|
|
7
|
+
const { CrawlerStorage } = require('./storage.cjs');
|
|
8
|
+
// 新增:定向检索工具(若不存在也不影响本改造)
|
|
9
|
+
let planQueries, extractSerpLinks, relevanceScore;
|
|
10
|
+
try {
|
|
11
|
+
({ planQueries, extractSerpLinks, relevanceScore } = require('./agent.cjs'));
|
|
12
|
+
} catch { /* 忽略 */ }
|
|
13
|
+
|
|
14
|
+
// 可选代理 Agent(若未安装对应包,将自动降级直连)
|
|
15
|
+
let HttpsProxyAgent = null;
|
|
16
|
+
let SocksProxyAgent = null;
|
|
17
|
+
try { HttpsProxyAgent = require('https-proxy-agent'); } catch {}
|
|
18
|
+
try { SocksProxyAgent = require('socks-proxy-agent'); } catch {}
|
|
19
|
+
|
|
20
|
+
// 新增:尝试动态渲染(Puppeteer / Puppeteer-Core)
|
|
21
|
+
let Puppeteer = null;
|
|
22
|
+
try { Puppeteer = require('puppeteer'); } catch {
|
|
23
|
+
try { Puppeteer = require('puppeteer-core'); } catch {}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
// Desktop browser User-Agent strings used when the caller supplies no
// `userAgents` option; CrawlerManager hands them out round-robin.
const DEFAULT_UAS = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36',
];
|
|
32
|
+
|
|
33
|
+
/**
 * Pause an async flow without blocking the event loop.
 * @param {number} ms - Delay handed straight to `setTimeout`.
 * @returns {Promise<void>} Settles (with `undefined`) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
34
|
+
/**
 * Build a dotted-quad string from four independent random octets.
 * Each octet is drawn with `Math.random()` and lies in 1..254 inclusive.
 * @returns {string} e.g. "23.201.7.144"
 */
function randomIPv4() {
  const octets = [];
  for (let i = 0; i < 4; i += 1) {
    octets.push(1 + Math.floor(Math.random() * 254));
  }
  return octets.join('.');
}
|
|
35
|
+
|
|
36
|
+
/**
 * Per-host cache of the robots.txt rules that apply to the wildcard (`*`)
 * user agent.
 *
 * Rules are stored as `{ type: 'allow' | 'disallow', path }` objects. Lookups
 * are cached for `ttl` milliseconds; a failed or non-200 fetch is cached as
 * "no rules" (everything allowed) for the same period.
 */
class RobotsCache {
  constructor() {
    this.cache = new Map(); // host -> { fetchedAt, rules: [{ type, path }] }
    this.ttl = 12 * 60 * 60 * 1000; // 12h
  }

  /**
   * Return the cached rules for `host`, fetching `https://<host>/robots.txt`
   * through `fetchFn` when the cache entry is missing or older than `ttl`.
   *
   * @param {string} host - Host (optionally with port) to look up.
   * @param {(url: string, opts: {timeout: number}) => Promise<{status: number, body?: string}>} fetchFn
   *   Fetcher supplied by the caller.
   * @returns {Promise<Array<{type: string, path: string}>>} Never rejects;
   *   any fetch/parse failure yields an empty rule list.
   */
  async fetch(host, fetchFn) {
    const hit = this.cache.get(host);
    const now = Date.now();
    if (hit && now - hit.fetchedAt < this.ttl) return hit.rules;

    let rules = [];
    try {
      const { status, body } = await fetchFn(`https://${host}/robots.txt`, { timeout: 8000 });
      if (status === 200 && body) rules = this.parse(body);
    } catch {
      // Best effort: an unreachable or malformed robots.txt means "no rules".
      rules = [];
    }
    this.cache.set(host, { fetchedAt: now, rules });
    return rules;
  }

  /**
   * Extract the allow/disallow rules of every group that names `User-agent: *`.
   *
   * - The key/value split happens on the FIRST colon only, so paths that
   *   contain `:` survive intact.
   * - Anything after `#` on a line is a comment and is discarded.
   * - Consecutive `User-agent` lines form one group; the group applies when
   *   any of them is `*`.
   * - Empty `Allow:` / `Disallow:` values are ignored.
   *
   * @param {string} content - Raw robots.txt text.
   * @returns {Array<{type: 'allow'|'disallow', path: string}>}
   */
  parse(content) {
    const rules = [];
    let groupApplies = false; // current group includes `User-agent: *`
    let inAgentRun = false; // the previous directive was a User-agent line

    for (const rawLine of String(content).split(/\r?\n/)) {
      const hashAt = rawLine.indexOf('#');
      const line = (hashAt === -1 ? rawLine : rawLine.slice(0, hashAt)).trim();
      if (!line) continue;

      const colonAt = line.indexOf(':');
      if (colonAt === -1) continue;
      const key = line.slice(0, colonAt).trim().toLowerCase();
      const value = line.slice(colonAt + 1).trim();

      if (key === 'user-agent') {
        // A User-agent line that follows rules starts a brand-new group.
        if (!inAgentRun) groupApplies = false;
        if (value === '*') groupApplies = true;
        inAgentRun = true;
      } else if (key === 'allow' || key === 'disallow') {
        inAgentRun = false;
        if (groupApplies && value) rules.push({ type: key, path: value });
      }
      // Other directives (Sitemap, Crawl-delay, ...) are ignored and do not
      // terminate a run of User-agent lines.
    }
    return rules;
  }

  /**
   * Decide whether `pathName` may be fetched under `rules`.
   *
   * The longest matching pattern wins; when an allow and a disallow rule of
   * equal length both match, the path is allowed.
   *
   * @param {string} host - Unused; kept for interface compatibility.
   * @param {string} pathName - URL path (optionally including the query).
   * @param {Array<{type: string, path: string}>} rules - Output of `parse`.
   * @returns {boolean}
   */
  allowed(host, pathName, rules) {
    if (!rules || rules.length === 0) return true;
    const target = String(pathName || '/');
    let allowLen = -1;
    let disallowLen = -1;
    for (const rule of rules) {
      if (!rule.path || !this._matches(rule.path, target)) continue;
      if (rule.type === 'allow') allowLen = Math.max(allowLen, rule.path.length);
      else if (rule.type === 'disallow') disallowLen = Math.max(disallowLen, rule.path.length);
    }
    return disallowLen <= allowLen;
  }

  /**
   * Match a robots.txt path pattern against a path. `*` matches any run of
   * characters and a trailing `$` anchors the end; everything else is a
   * literal prefix match. Implemented with plain string scanning rather than
   * a generated RegExp, because the pattern comes from an untrusted server.
   *
   * @param {string} pattern
   * @param {string} target
   * @returns {boolean}
   */
  _matches(pattern, target) {
    const anchored = pattern.endsWith('$');
    const parts = (anchored ? pattern.slice(0, -1) : pattern).split('*');

    if (!target.startsWith(parts[0])) return false;
    let pos = parts[0].length;
    if (parts.length === 1) return anchored ? pos === target.length : true;

    const last = parts.length - 1;
    for (let i = 1; i < last; i += 1) {
      const at = target.indexOf(parts[i], pos);
      if (at === -1) return false;
      pos = at + parts[i].length;
    }
    const tail = parts[last];
    if (anchored) return target.length - tail.length >= pos && target.endsWith(tail);
    return target.indexOf(tail, pos) !== -1;
  }
}
|
|
91
|
+
|
|
92
|
+
/**
 * Crawl orchestrator: fetches pages (statically, with an optional Puppeteer
 * fallback), honours robots.txt and a per-host delay, extracts text and
 * links, and persists results through CrawlerStorage.
 *
 * Relies on module-level names defined elsewhere in this file: `fs`, `path`,
 * `urlLib`, `http`, `https`, `CrawlerStorage`, `RobotsCache`, `DEFAULT_UAS`,
 * `sleep`, `randomIPv4`, and the optional `Puppeteer`, `HttpsProxyAgent`,
 * `SocksProxyAgent`, `planQueries`, `extractSerpLinks`, `relevanceScore`.
 */
class CrawlerManager {
  /**
   * @param {object} [options] - Overrides for the defaults listed below.
   */
  constructor(options = {}) {
    this.opts = Object.assign({
      concurrency: 4,
      perHostDelayMs: 1500,
      requestTimeoutMs: 12000,
      userAgents: DEFAULT_UAS,
      proxiesFile: path.join(__dirname, 'proxies.txt'),
      seedsFile: path.join(__dirname, '..', 'robots', 'seeds.txt'),
      maxDocPerTick: 8,
      allowLang: ['en', 'zh'],
      minTextLen: 200,
      // Dynamic rendering and recursive crawling.
      renderDynamic: 'auto', // 'never' | 'auto' | 'always'
      renderTimeoutMs: 15000,
      renderConcurrency: 1, // pages rendered at the same time
      dynamicHosts: [], // hosts that are always rendered
      dynamicPathRegex: null, // render when the path matches
      maxDepth: 2, // default depth for crawlRecursive
      sameDomainOnly: true, // keep recursion on the seed's host
      maxPagesPerTask: 100 // default page budget for crawlRecursive
    }, options);
    this.running = false;
    this.workers = [];
    this.storage = new CrawlerStorage();
    this.robots = new RobotsCache();
    this.perHostTime = new Map();
    this.queue = new Set();
    this.uaIdx = 0;
    this.proxies = this._loadProxies();
    // Renderer state.
    this._puppeteer = Puppeteer || null;
    this._browser = null;
    this._browserLaunch = null; // in-flight launch promise, shared by concurrent callers
    this._renderInFlight = 0;
  }

  // ---------- Directed search (only when agent.cjs is available) ----------

  /**
   * Plan search queries for `prompt`, scrape the result pages, score the
   * links, enqueue the best ones and optionally crawl them right away.
   *
   * A URL returned by several queries is kept once (with its best score) so
   * it is neither listed nor crawled twice.
   *
   * @param {string} prompt
   * @param {object} [o]
   * @param {string} [o.vertical='general']
   * @param {number} [o.perQuery=8]   - Links taken from each result page.
   * @param {number} [o.maxEnqueue=30]
   * @param {boolean} [o.crawl=true]  - Crawl the picked URLs immediately.
   * @param {number} [o.maxCrawl=12]
   * @returns {Promise<object>} Summary: queries, candidates, enqueued, crawled, saved.
   */
  async directedSearch(prompt, {
    vertical = 'general',
    perQuery = 8,
    maxEnqueue = 30,
    crawl = true,
    maxCrawl = 12
  } = {}) {
    if (!planQueries || !extractSerpLinks || !relevanceScore) {
      return { prompt, vertical, queries: [], candidates: [], enqueued: 0, crawled: 0, saved: 0, note: 'agent.cjs 不可用' };
    }
    const queries = planQueries(prompt, vertical);
    const serpHost = 'https://html.duckduckgo.com/html/';
    const bestByUrl = new Map();

    for (const q of queries) {
      const url = `${serpHost}?q=${encodeURIComponent(q)}`;
      const { status, body } = await this.fetchUrl(url, { timeout: 12000 });
      if (status !== 200 || !body) continue;

      const links = extractSerpLinks(body, serpHost).slice(0, perQuery);
      for (const it of links) {
        const raw = Number(relevanceScore({ url: it.url, title: it.title }, prompt, vertical));
        const score = Number.isFinite(raw) ? raw : 0; // keeps sort/toFixed below safe
        const prev = bestByUrl.get(it.url);
        if (!prev || score > prev.score) bestByUrl.set(it.url, { ...it, query: q, score });
      }
      await sleep(300 + Math.floor(Math.random() * 400));
    }

    const candidates = [...bestByUrl.values()].sort((a, b) => b.score - a.score);
    const picked = candidates.slice(0, maxEnqueue);
    const added = this.storage.enqueueLinks(picked.map(p => p.url), maxEnqueue);

    let crawled = 0;
    let savedDocs = 0;
    if (crawl) {
      for (const p of picked.slice(0, maxCrawl)) {
        const r = await this._crawlOne(p.url, { parent: null, depth: 0 });
        if (r && r.ok) {
          crawled++;
          if (r.saved) savedDocs++;
        }
      }
    }

    return {
      prompt, vertical, queries,
      candidates: candidates.slice(0, 50).map(({ url, title, score }) => ({ url, title, score: Number(score.toFixed(3)) })),
      enqueued: added, crawled, saved: savedDocs
    };
  }

  // ---------- Proxy / User-Agent ----------

  /** Read one proxy URL per non-empty line of `opts.proxiesFile`; `[]` on any failure. */
  _loadProxies() {
    try {
      if (!fs.existsSync(this.opts.proxiesFile)) return [];
      return fs.readFileSync(this.opts.proxiesFile, 'utf-8')
        .split(/\r?\n/)
        .map(s => s.trim())
        .filter(Boolean);
    } catch {
      return [];
    }
  }

  /** Round-robin over the configured User-Agent list (DEFAULT_UAS when the list is empty). */
  _pickUA() {
    const configured = this.opts.userAgents;
    const pool = Array.isArray(configured) && configured.length > 0 ? configured : DEFAULT_UAS;
    const ua = pool[this.uaIdx % pool.length];
    this.uaIdx++;
    return ua;
  }

  /** Pick a random proxy string, or `null` when none are configured. */
  _pickProxy() {
    if (!this.proxies.length) return null;
    return this.proxies[Math.floor(Math.random() * this.proxies.length)];
  }

  /**
   * Build an http(s) Agent for `proxyStr`, or `null` for a direct connection.
   * Accepts both export shapes of the proxy-agent packages: a named class
   * (`mod.SocksProxyAgent`) or the class as the module itself.
   *
   * @param {string} targetUrl - Currently unused.
   * @param {string|null} proxyStr - e.g. "socks5://host:1080" or "http://host:8080".
   */
  _agentForUrl(targetUrl, proxyStr) {
    if (!proxyStr) return null;
    try {
      if (/^socks/i.test(proxyStr) && SocksProxyAgent) {
        const SocksCtor = SocksProxyAgent.SocksProxyAgent || SocksProxyAgent;
        return new SocksCtor(proxyStr);
      }
      if (/^http/i.test(proxyStr) && HttpsProxyAgent) {
        const HttpCtor = HttpsProxyAgent.HttpsProxyAgent || HttpsProxyAgent;
        return new HttpCtor(proxyStr);
      }
    } catch (e) {
      // Fall back to a direct connection. The proxy string is deliberately
      // not logged because it may embed credentials.
      console.warn('[CRAWLER][PROXY] agent creation failed:', e && e.message);
    }
    return null;
  }

  // ---------- Static fetch ----------

  /**
   * GET `targetUrl` and resolve with `{ status, headers, body, finalUrl }`.
   * Never rejects: failures resolve as `{ status: 0, error }` where `error`
   * is 'timeout', 'network', 'toolarge' or 'badurl'. Redirects are NOT
   * followed, so `finalUrl` is always `targetUrl`.
   *
   * Bodies larger than 3 MiB abort the request and resolve with 'toolarge'
   * (previously the promise was left pending forever in that case).
   *
   * @param {string} targetUrl
   * @param {object} [o]
   * @param {number} [o.timeout=8000] - Socket idle timeout in ms.
   */
  async fetchUrl(targetUrl, { timeout = 8000 } = {}) {
    return new Promise((resolve) => {
      let settled = false;
      const settle = (result) => {
        if (settled) return;
        settled = true;
        resolve(result);
      };
      try {
        const u = new urlLib.URL(targetUrl);
        const proxy = this._pickProxy();
        const agent = this._agentForUrl(targetUrl, proxy);
        const client = u.protocol === 'https:' ? https : http;
        // NOTE(review): X-Forwarded-For / X-Real-IP carry fabricated addresses.
        // Kept for behavioural compatibility; confirm this is really wanted.
        const headers = {
          'User-Agent': this._pickUA(),
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': Math.random() < 0.5 ? 'en-US,en;q=0.8' : 'zh-CN,zh;q=0.8,en;q=0.6',
          'Connection': 'keep-alive',
          'X-Forwarded-For': randomIPv4(),
          'X-Real-IP': randomIPv4()
        };
        const opts = {
          protocol: u.protocol,
          hostname: u.hostname,
          port: u.port || (u.protocol === 'https:' ? 443 : 80),
          path: u.pathname + (u.search || ''),
          method: 'GET',
          headers,
          timeout,
          agent: agent || undefined,
        };
        const req = client.request(opts, (res) => {
          const { statusCode } = res;
          const chunks = [];
          const limit = 3 * 1024 * 1024;
          let received = 0; // running total; avoids re-summing every chunk
          res.on('data', (d) => {
            if (settled) return;
            received += d.length;
            if (received > limit) {
              settle({ status: 0, error: 'toolarge' });
              req.destroy();
              return;
            }
            chunks.push(d);
          });
          res.on('end', () => {
            settle({
              status: statusCode,
              headers: res.headers,
              body: Buffer.concat(chunks).toString('utf-8'),
              finalUrl: targetUrl
            });
          });
          // A response that errors or closes before 'end' must still settle.
          res.on('error', () => settle({ status: 0, error: 'network' }));
          res.on('close', () => settle({ status: 0, error: 'network' }));
        });
        req.on('timeout', () => {
          settle({ status: 0, error: 'timeout' });
          req.destroy();
        });
        req.on('error', () => settle({ status: 0, error: 'network' }));
        req.end();
      } catch {
        settle({ status: 0, error: 'badurl' });
      }
    });
  }

  // ---------- Dynamic rendering ----------

  /**
   * Make sure a headless browser is running.
   * Concurrent callers share one launch instead of each starting a browser.
   * @returns {Promise<boolean>} false when Puppeteer is missing or launch failed.
   */
  async _ensureRenderer() {
    if (!this._puppeteer) return false;
    if (this._browser) return true;
    if (!this._browserLaunch) {
      this._browserLaunch = Promise.resolve()
        .then(() => this._puppeteer.launch({
          headless: 'new',
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage'
          ]
        }))
        .then(
          (browser) => {
            this._browser = browser;
            return true;
          },
          (e) => {
            console.warn('[CRAWLER][RENDER] 启动失败:', e && e.message);
            this._browser = null;
            return false;
          }
        )
        .finally(() => {
          this._browserLaunch = null;
        });
    }
    return this._browserLaunch;
  }

  /**
   * Render `url` in the headless browser and return its HTML and links.
   * The page is always closed, including when navigation fails or times out.
   *
   * @param {string} url
   * @param {object} [o]
   * @param {number} [o.timeout=15000] - Navigation timeout in ms.
   * @returns {Promise<{ok: boolean, html?: string, links?: string[], finalUrl?: string, error?: string}>}
   */
  async _renderHtml(url, { timeout = 15000 } = {}) {
    if (!await this._ensureRenderer()) return { ok: false };
    // Limit the number of pages rendered at once.
    while (this._renderInFlight >= Math.max(1, this.opts.renderConcurrency)) {
      await sleep(100);
    }
    this._renderInFlight++;
    let page = null;
    try {
      page = await this._browser.newPage();
      await page.setUserAgent(this._pickUA());
      await page.setExtraHTTPHeaders({
        'Accept-Language': Math.random() < 0.5 ? 'en-US,en;q=0.8' : 'zh-CN,zh;q=0.8,en;q=0.6',
        'X-Forwarded-For': randomIPv4(),
        'X-Real-IP': randomIPv4()
      });
      await page.setRequestInterception(true);
      page.on('request', (req) => {
        // Skip heavy resources; swallow "already handled" rejections so they
        // do not surface as unhandled promise rejections.
        const heavy = ['image', 'media', 'font'].includes(req.resourceType());
        Promise.resolve(heavy ? req.abort() : req.continue()).catch(() => {});
      });
      await page.goto(url, { waitUntil: 'networkidle2', timeout });
      // Scroll for ~2s to trigger lazy loading.
      await page.evaluate(async () => {
        await new Promise(resolve => {
          let t = 0;
          const id = setInterval(() => {
            window.scrollBy(0, 800);
            if ((t += 200) >= 2000) { clearInterval(id); resolve(); }
          }, 200);
        });
      });
      const content = await page.content();
      // Links as seen after rendering.
      const links = await page.$$eval('a[href]', as => as.map(a => a.href).filter(Boolean));
      const finalUrl = page.url();
      return { ok: true, html: content, links, finalUrl };
    } catch (e) {
      return { ok: false, error: e.message };
    } finally {
      this._renderInFlight--;
      if (page) {
        try { await page.close(); } catch { /* page or browser already gone */ }
      }
    }
  }

  // ---------- Parsing ----------

  /** Decode the handful of HTML entities that matter for plain-text output (single pass). */
  _decodeEntities(s) {
    const table = { nbsp: ' ', amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", '#39': "'" };
    return String(s).replace(/&(nbsp|amp|lt|gt|quot|apos|#39);/gi, (whole, name) => table[name.toLowerCase()]);
  }

  /**
   * Reduce an HTML document to `{ title, text, links, lang }`.
   *
   * - `text`: markup stripped, block ends and `<br>` turned into newlines,
   *   common entities decoded, whitespace collapsed.
   * - `links`: every quoted `<a href>` resolved against `baseUrl`, with the
   *   fragment removed; left raw when `baseUrl` is not a valid URL.
   * - `lang`: 'zh' when more than 30% of the text is non-ASCII, else 'en'.
   *   NOTE(review): this labels any non-ASCII-heavy page as 'zh'.
   *
   * @param {string} html
   * @param {string} baseUrl
   */
  extractContent(html, baseUrl) {
    const source = String(html || '');
    const text = source
      .replace(/<script[\s\S]*?<\/script>/gi, ' ')
      .replace(/<style[\s\S]*?<\/style>/gi, ' ')
      .replace(/<!--[\s\S]*?-->/g, ' ');

    const mTitle = text.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    const title = mTitle ? this._decodeEntities(mTitle[1]).replace(/\s+/g, ' ').trim() : '';

    const stripped = text
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/(p|div|h\d|br|li)>/gi, '\n')
      .replace(/<[^>]+>/g, ' ');
    const body = this._decodeEntities(stripped)
      .replace(/\s+\n/g, '\n')
      .replace(/\n{3,}/g, '\n\n')
      .replace(/[ \t]{2,}/g, ' ')
      .trim();

    let base;
    try { base = new urlLib.URL(baseUrl); } catch { base = null; }

    const links = [];
    const aTagRe = /<a\s+[^>]*href=["']([^"']+)["'][^>]*>/gi;
    let m;
    while ((m = aTagRe.exec(source)) !== null) {
      const href = (m[1] || '').trim();
      if (!href) continue;
      try {
        if (base) {
          const abs = new urlLib.URL(href, base);
          abs.hash = ''; // fragments never reach the server; drop them to avoid duplicates
          links.push(abs.toString());
        } else {
          links.push(href);
        }
      } catch { /* unparsable href: skip */ }
    }

    const nonAscii = (body.match(/[^\x00-\x7F]/g) || []).length;
    const ratio = nonAscii / Math.max(1, body.length);
    const lang = ratio > 0.3 ? 'zh' : 'en';
    return { title, text: body, links, lang };
  }

  /**
   * Per-host rate limit: returns true (and records "now") when at least
   * `perHostDelayMs` has elapsed since the last granted request to `host`.
   */
  _hostDelayOk(host) {
    const last = this.perHostTime.get(host) || 0;
    const now = Date.now();
    if (now - last < this.opts.perHostDelayMs) return false;
    this.perHostTime.set(host, now);
    return true;
  }

  // ---------- Single page crawl (with dynamic-render fallback) ----------

  /**
   * Fetch, parse, filter and persist one URL.
   *
   * @param {string} url
   * @param {{parent: string|null, depth: number}} [ctx]
   * @returns {Promise<object>} `{ ok: true, saved, added, lang, links }` or
   *   `{ ok: false, reason }`. Never rejects.
   */
  async _crawlOne(url, ctx = { parent: null, depth: 0 }) {
    try {
      const u = new urlLib.URL(url);
      const rules = await this.robots.fetch(u.host, (u2, o) => this.fetchUrl(u2, o));
      // Include the query string: robots.txt rules may target it.
      if (!this.robots.allowed(u.host, u.pathname + u.search, rules)) {
        return { ok: false, reason: 'robots' };
      }

      while (!this._hostDelayOk(u.host)) await sleep(200);

      // Static fetch.
      let { status, body, finalUrl } = await this.fetchUrl(url, { timeout: this.opts.requestTimeoutMs });
      if (status !== 200) body = '';

      let parsed = this.extractContent(body || '', finalUrl || url);

      // Conditions that trigger dynamic rendering.
      const hostname = u.hostname;
      // Match the host itself or a true subdomain ("notexample.com" must not match "example.com").
      const alwaysRenderHost = this.opts.dynamicHosts.some(h => hostname === h || hostname.endsWith(`.${h}`));
      const pathRe = this.opts.dynamicPathRegex;
      if (pathRe) pathRe.lastIndex = 0; // a /g or /y regex is stateful across test() calls
      const matchPath = pathRe && pathRe.test(u.pathname);
      const heuristicHeavy = (body.match(/<script/gi) || []).length >= 10 ||
        (parsed.text || '').length < this.opts.minTextLen;

      const useRender = this.opts.renderDynamic === 'always' || alwaysRenderHost || matchPath ||
        (this.opts.renderDynamic === 'auto' && heuristicHeavy);

      let renderedLinks = [];
      if (useRender) {
        const r = await this._renderHtml(url, { timeout: this.opts.renderTimeoutMs });
        if (r.ok && r.html) {
          parsed = this.extractContent(r.html, r.finalUrl || url);
          renderedLinks = Array.isArray(r.links) ? r.links : [];
          finalUrl = r.finalUrl || finalUrl || url;
        }
      }

      if (!this.opts.allowLang.includes(parsed.lang) || !parsed.text || parsed.text.length < this.opts.minTextLen) {
        this.storage.state.stats.deduped++;
        return { ok: false, reason: 'short-or-lang', lang: parsed.lang };
      }

      const saved = this.storage.saveDocument({
        url: finalUrl || url,
        title: parsed.title,
        text: parsed.text,
        lang: parsed.lang,
        depth: ctx.depth || 0,
        parent: ctx.parent || ''
      });

      // Expand the frontier with static + rendered links.
      const links = Array.from(new Set([...(parsed.links || []), ...renderedLinks])).slice(0, 800);
      const added = this.storage.enqueueLinks(links, 200);

      this.storage.markVisited(url);
      this.storage.state.stats.fetched++;
      return { ok: true, saved, added, lang: parsed.lang, links };
    } catch (e) {
      this.storage.state.stats.errors++;
      return { ok: false, reason: e.message };
    }
  }

  // ---------- Recursive crawl (breadth-first over links) ----------

  /**
   * Breadth-first crawl starting at `seedUrl`.
   *
   * @param {string} seedUrl
   * @param {object} [o]
   * @param {number} [o.maxDepth]        - Link depth limit (seed is depth 0).
   * @param {boolean} [o.sameDomainOnly] - Skip URLs whose hostname differs from the seed's.
   * @param {number} [o.maxPages]        - Stop after this many successful pages.
   * @returns {Promise<object>} `{ ok, seed, crawled, saved, ... }` or `{ ok: false, error }`.
   */
  async crawlRecursive(seedUrl, {
    maxDepth = this.opts.maxDepth,
    sameDomainOnly = this.opts.sameDomainOnly,
    maxPages = this.opts.maxPagesPerTask
  } = {}) {
    let seed;
    try { seed = new urlLib.URL(seedUrl); } catch { return { ok: false, error: 'bad seed url' }; }

    // URLs are recorded when queued, so each one enters the queue at most once.
    const seen = new Set([seedUrl]);
    const q = [{ url: seedUrl, depth: 0, parent: '' }];
    let crawled = 0;
    let savedDocs = 0;

    while (q.length > 0) {
      if (crawled >= maxPages) break;
      const { url, depth, parent } = q.shift();

      if (sameDomainOnly) {
        try {
          if (new urlLib.URL(url).hostname !== seed.hostname) continue;
        } catch { continue; }
      }

      const res = await this._crawlOne(url, { parent, depth });
      if (res && res.ok) {
        crawled++;
        if (res.saved) savedDocs++;
        if (depth + 1 <= maxDepth && Array.isArray(res.links)) {
          for (const l of res.links) {
            if (!l || seen.has(l)) continue;
            seen.add(l);
            q.push({ url: l, depth: depth + 1, parent: url });
          }
        }
      }

      // Small back-off so a single site is not hammered.
      await sleep(40 + Math.floor(Math.random() * 120));
    }

    return { ok: true, seed: seedUrl, crawled, saved: savedDocs, maxDepth, sameDomainOnly, maxPages };
  }

  /** Worker: pull URLs from storage and crawl them until `stop()` is called. */
  async _workerLoop(slotId) {
    while (this.running) {
      const url = this.storage.nextUrl();
      if (!url) { await sleep(1000); continue; }
      if (this.queue.has(url)) continue;
      this.queue.add(url);
      try {
        await this._crawlOne(url, { parent: null, depth: 0 });
      } finally {
        this.queue.delete(url);
      }
      if (!this.running) break;
      await sleep(50 + Math.floor(Math.random() * 150));
    }
  }

  /** Load seeds from `opts.seedsFile` (if present) and start `opts.concurrency` workers. */
  start() {
    if (this.running) return;
    if (fs.existsSync(this.opts.seedsFile)) {
      const seeds = fs.readFileSync(this.opts.seedsFile, 'utf-8')
        .split(/\r?\n/)
        .map(s => s.trim())
        .filter(x => /^https?:\/\//i.test(x));
      if (seeds.length) this.storage.addSeed(seeds);
    }
    this.running = true;
    this.workers = Array.from({ length: this.opts.concurrency }, (_, i) =>
      // Never leave a worker promise floating: report a crashed worker instead.
      this._workerLoop(i).catch((e) => {
        console.warn('[CRAWLER] worker crashed:', e && e.message);
      }));
    console.log(`[CRAWLER] started, concurrency=${this.opts.concurrency}, proxies=${this.proxies.length}, render=${this.opts.renderDynamic}${this._puppeteer ? '' : ' (no-puppeteer)'}`);
  }

  /** Ask workers to stop and close the headless browser asynchronously. */
  stop() {
    this.running = false;
    console.log('[CRAWLER] stopping...');
    // Close the renderer on the next tick so stop() itself stays synchronous.
    setTimeout(async () => {
      try { if (this._browser) await this._browser.close(); } catch { /* already closed */ }
      this._browser = null;
    }, 0);
  }

  /** Add seed URLs to the frontier; returns how many were new. */
  addSeeds(urls = []) { return this.storage.addSeed(urls); }

  /** Storage counters plus live crawler state. */
  stats() {
    const s = this.storage.stats();
    return Object.assign(s, { running: this.running, inflight: this.queue.size, renderInFlight: this._renderInFlight });
  }

  /** Return up to `maxFiles` most recently saved documents as `{ path, text }`. */
  loadRecentDocs(maxFiles = 20) { return this.storage.loadRecentDocs(maxFiles); }
}
|
|
513
|
+
|
|
514
|
+
module.exports = { CrawlerManager };
|
|
515
|
+
// ...existing code...
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
const fs = require('fs');
|
|
2
|
+
const path = require('path');
|
|
3
|
+
const crypto = require('crypto');
|
|
4
|
+
const { planQueries, extractSerpLinks, relevanceScore } = require('./agent.cjs');
|
|
5
|
+
// ...existing code...
|
|
6
|
+
|
|
7
|
+
/**
 * Build a dotted-quad string from four random octets, each in 1..254.
 * NOTE(review): nothing visible in this module calls this helper; it appears
 * to duplicate the one in index.cjs — confirm before removing.
 * @returns {string} e.g. "23.201.7.144"
 */
function randomIPv4() {
  return Array(4).fill(0).map(() => Math.floor(Math.random() * 254) + 1).join('.');
}
|
|
10
|
+
/**
 * File-backed crawl state: the URL frontier, visited/enqueued sets (keyed by
 * SHA-1 of the URL), counters, and the saved documents.
 *
 * Layout under `baseDir`:
 *   state.json            - persisted `this.state`
 *   docs/YYYY-MM-DD/*.txt - one file per saved document (metadata header + text)
 */
class CrawlerStorage {
  /**
   * @param {string} [baseDir] - Data directory; created when missing.
   */
  constructor(baseDir = path.join(__dirname, '..', 'crawler_data')) {
    this.baseDir = baseDir;
    this.stateFile = path.join(this.baseDir, 'state.json');
    this.docsDir = path.join(this.baseDir, 'docs');
    this.ensureDirs();
    this.state = this._emptyState();
    this._loadState();
  }

  /** Fresh state object with every field the class relies on. */
  _emptyState() {
    return {
      frontier: [], // URLs waiting to be crawled
      visited: {}, // urlHash -> timestamp
      enqueued: {}, // urlHash -> 1
      stats: { fetched: 0, saved: 0, deduped: 0, errors: 0, lastSave: 0 }
    };
  }

  /** Create the data and docs directories if they do not exist. */
  ensureDirs() {
    fs.mkdirSync(this.baseDir, { recursive: true });
    fs.mkdirSync(this.docsDir, { recursive: true });
  }

  /** SHA-1 hex digest used as the dedupe key for a URL. */
  _hash(str) {
    return crypto.createHash('sha1').update(String(str)).digest('hex');
  }

  /**
   * Load state.json if present. Missing or wrongly-typed fields fall back to
   * their defaults so a partial or hand-edited file cannot break later calls.
   * Any read/parse failure is logged and leaves the current state untouched.
   */
  _loadState() {
    try {
      if (!fs.existsSync(this.stateFile)) return;
      const obj = JSON.parse(fs.readFileSync(this.stateFile, 'utf-8'));
      const isDict = v => Boolean(v) && typeof v === 'object' && !Array.isArray(v);
      if (!isDict(obj)) return;
      const base = this._emptyState();
      this.state = {
        ...obj,
        frontier: Array.isArray(obj.frontier) ? obj.frontier : base.frontier,
        visited: isDict(obj.visited) ? obj.visited : base.visited,
        enqueued: isDict(obj.enqueued) ? obj.enqueued : base.enqueued,
        stats: { ...base.stats, ...(isDict(obj.stats) ? obj.stats : {}) }
      };
    } catch (e) {
      console.warn('[CRAWLER][STATE] load failed:', e.message);
    }
  }

  /**
   * Persist the state. The file is written to a temporary sibling and then
   * renamed over state.json, so an interrupted write cannot leave a truncated
   * state file behind. `stats.lastSave` is stamped before serialising so the
   * persisted value is current; it is rolled back if the write fails.
   */
  _saveState() {
    const previous = this.state.stats.lastSave;
    try {
      this.state.stats.lastSave = Date.now();
      const tmp = `${this.stateFile}.tmp`;
      fs.writeFileSync(tmp, JSON.stringify(this.state, null, 2), 'utf-8');
      fs.renameSync(tmp, this.stateFile);
    } catch (e) {
      this.state.stats.lastSave = previous;
      console.warn('[CRAWLER][STATE] save failed:', e.message);
    }
  }

  /** Queue `url` unless it was already enqueued or visited. Returns true when added. */
  _admit(url) {
    const h = this._hash(url);
    if (this.state.enqueued[h] || this.state.visited[h]) return false;
    this.state.frontier.push(url);
    this.state.enqueued[h] = 1;
    return true;
  }

  /**
   * Add seed URLs (http/https only, trimmed, de-duplicated).
   * @param {string[]|string} [urls] - A list of URLs; a single URL string is also accepted.
   * @returns {number} How many URLs were newly queued.
   */
  addSeed(urls = []) {
    const list = typeof urls === 'string' ? [urls] : urls;
    let n = 0;
    for (const u of list) {
      const url = String(u || '').trim();
      if (!url || !/^https?:\/\//i.test(url)) continue;
      if (this._admit(url)) n++;
    }
    if (n > 0) this._saveState();
    return n;
  }

  /** Pop the next not-yet-visited URL from the frontier, or `null` when it is empty. */
  nextUrl() {
    while (this.state.frontier.length > 0) {
      const url = this.state.frontier.shift();
      if (!url) continue;
      if (this.state.visited[this._hash(url)]) continue;
      return url;
    }
    return null;
  }

  /** Record `url` as visited (with the current time) and persist. */
  markVisited(url) {
    this.state.visited[this._hash(url)] = Date.now();
    this._saveState();
  }

  /**
   * Queue newly discovered links (http/https only, de-duplicated).
   * @param {string[]} [links]
   * @param {number} [limitPerBatch=200] - Stop after this many new links.
   * @returns {number} How many links were newly queued.
   */
  enqueueLinks(links = [], limitPerBatch = 200) {
    let n = 0;
    for (const l of links) {
      if (!l || !/^https?:\/\//i.test(l)) continue;
      if (!this._admit(l)) continue;
      n++;
      if (n >= limitPerBatch) break;
    }
    if (n > 0) this._saveState();
    return n;
  }

  /**
   * Write the cleaned text of a page to `docs/<today>/<timestamp>_<rand>.txt`
   * preceded by a small metadata header (URL, Title, FetchedAt, Lang, Depth,
   * Parent).
   *
   * @param {{url?: string, title?: string, text?: string, lang?: string, depth?: number, parent?: string}} doc
   * @returns {string|null} Path of the written file; `null` when the text is
   *   empty (nothing is created on disk) or when writing failed.
   */
  saveDocument(doc) {
    try {
      const body = (doc.text || '').trim();
      if (!body) return null; // checked first so no empty date directory is created

      const day = new Date();
      const dir = path.join(this.docsDir,
        `${day.getFullYear()}-${String(day.getMonth() + 1).padStart(2, '0')}-${String(day.getDate()).padStart(2, '0')}`
      );
      fs.mkdirSync(dir, { recursive: true });
      const file = path.join(dir, `${Date.now()}_${Math.floor(Math.random() * 1e6)}.txt`);
      const meta = [
        `URL: ${doc.url || ''}`,
        `Title: ${doc.title || ''}`,
        `FetchedAt: ${new Date().toISOString()}`,
        `Lang: ${doc.lang || ''}`,
        `Depth: ${doc.depth ?? 0}`,
        `Parent: ${doc.parent || ''}`
      ].join('\n');
      fs.writeFileSync(file, meta + '\n\n' + body, 'utf-8');
      this.state.stats.saved++;
      this._saveState();
      return file;
    } catch (e) {
      this.state.stats.errors++;
      this._saveState();
      return null;
    }
  }

  /** Counters plus the current frontier length and visited count. */
  stats() {
    return Object.assign({}, this.state.stats, {
      frontier: this.state.frontier.length,
      visited: Object.keys(this.state.visited).length
    });
  }

  /**
   * Read back the most recently saved documents, newest day and newest file
   * name first. Entries that cannot be read (including date-named entries
   * that are not directories) are skipped.
   *
   * @param {number} [maxFiles=20]
   * @returns {Array<{path: string, text: string}>}
   */
  loadRecentDocs(maxFiles = 20) {
    const out = [];
    if (!(maxFiles > 0) || !fs.existsSync(this.docsDir)) return out;
    const days = fs.readdirSync(this.docsDir)
      .filter(f => /^\d{4}-\d{2}-\d{2}$/.test(f))
      .sort()
      .reverse();
    for (const day of days) {
      const dir = path.join(this.docsDir, day);
      let files;
      try {
        files = fs.readdirSync(dir).filter(f => f.endsWith('.txt')).sort().reverse();
      } catch {
        continue; // not a directory, or unreadable
      }
      for (const f of files) {
        try {
          const p = path.join(dir, f);
          out.push({ path: p, text: fs.readFileSync(p, 'utf-8') });
          if (out.length >= maxFiles) return out;
        } catch { /* unreadable file: skip */ }
      }
    }
    return out;
  }
}
|
|
162
|
+
|
|
163
|
+
module.exports = { CrawlerStorage };
|