079project 2.0.0 → 3.0.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- package/crawler/agent.cjs +97 -0
- package/crawler/index.cjs +515 -0
- package/crawler/storage.cjs +163 -0
- package/groupmanager.cjs +2 -1
- package/main_Serve.cjs +1136 -210
- package/main_Study.cjs +1584 -349
- package/package.json +2 -1
- package/robots/seeds.txt +2 -0
- package/schedule.cjs +745 -0
- package/todo-list.txt +0 -86
package/crawler/storage.cjs
@@ -0,0 +1,163 @@
+const fs = require('fs');
+const path = require('path');
+const crypto = require('crypto');
+const { planQueries, extractSerpLinks, relevanceScore } = require('./agent.cjs');
+// ...existing code...
+
+function randomIPv4() {
+  return Array(4).fill(0).map(() => Math.floor(Math.random() * 254) + 1).join('.');
+}
+class CrawlerStorage {
+  constructor(baseDir = path.join(__dirname, '..', 'crawler_data')) {
+    this.baseDir = baseDir;
+    this.stateFile = path.join(this.baseDir, 'state.json');
+    this.docsDir = path.join(this.baseDir, 'docs');
+    this.ensureDirs();
+    this.state = {
+      frontier: [], // pending-fetch queue (urls)
+      visited: {}, // urlHash -> timestamp
+      enqueued: {}, // urlHash -> 1
+      stats: { fetched: 0, saved: 0, deduped: 0, errors: 0, lastSave: 0 }
+    };
+    this._loadState();
+  }
+
+  ensureDirs() {
+    fs.mkdirSync(this.baseDir, { recursive: true });
+    fs.mkdirSync(this.docsDir, { recursive: true });
+  }
+
+  _hash(str) {
+    return crypto.createHash('sha1').update(String(str)).digest('hex');
+  }
+
+  _loadState() {
+    try {
+      if (fs.existsSync(this.stateFile)) {
+        const obj = JSON.parse(fs.readFileSync(this.stateFile, 'utf-8'));
+        if (obj && typeof obj === 'object') this.state = obj;
+      }
+    } catch (e) {
+      console.warn('[CRAWLER][STATE] load failed:', e.message);
+    }
+  }
+
+  _saveState() {
+    try {
+      this.state.stats.lastSave = Date.now();
+      fs.writeFileSync(this.stateFile, JSON.stringify(this.state, null, 2), 'utf-8');
+    } catch (e) {
+      console.warn('[CRAWLER][STATE] save failed:', e.message);
+    }
+  }
+
+  addSeed(urls = []) {
+    let n = 0;
+    for (const u of urls) {
+      const url = String(u || '').trim();
+      if (!url || !/^https?:\/\//i.test(url)) continue;
+      const h = this._hash(url);
+      if (this.state.enqueued[h] || this.state.visited[h]) continue;
+      this.state.frontier.push(url);
+      this.state.enqueued[h] = 1;
+      n++;
+    }
+    if (n > 0) this._saveState();
+    return n;
+  }
+
+  nextUrl() {
+    while (this.state.frontier.length > 0) {
+      const url = this.state.frontier.shift();
+      if (!url) continue;
+      const h = this._hash(url);
+      if (this.state.visited[h]) continue;
+      return url;
+    }
+    return null;
+  }
+
+  markVisited(url) {
+    const h = this._hash(url);
+    this.state.visited[h] = Date.now();
+    this._saveState();
+  }
+
+  // enqueue new links (deduplicated)
+  enqueueLinks(links = [], limitPerBatch = 200) {
+    let n = 0;
+    for (const l of links) {
+      if (!l || !/^https?:\/\//i.test(l)) continue;
+      const h = this._hash(l);
+      if (this.state.enqueued[h] || this.state.visited[h]) continue;
+      this.state.frontier.push(l);
+      this.state.enqueued[h] = 1;
+      n++;
+      if (n >= limitPerBatch) break;
+    }
+    if (n > 0) this._saveState();
+    return n;
+  }
+
+  // save the cleaned body text as .txt and return the saved path (enhanced: records parent/depth)
+  saveDocument(doc) {
+    try {
+      const body = (doc.text || '').trim();
+      if (!body) return null;
+      const day = new Date();
+      const dir = path.join(this.docsDir,
+        `${day.getFullYear()}-${String(day.getMonth() + 1).padStart(2, '0')}-${String(day.getDate()).padStart(2, '0')}`
+      );
+      fs.mkdirSync(dir, { recursive: true });
+      const name = `${Date.now()}_${Math.floor(Math.random() * 1e6)}.txt`;
+      const file = path.join(dir, name);
+      const meta = [
+        `URL: ${doc.url || ''}`,
+        `Title: ${doc.title || ''}`,
+        `FetchedAt: ${new Date().toISOString()}`,
+        `Lang: ${doc.lang || ''}`,
+        `Depth: ${doc.depth ?? 0}`,
+        `Parent: ${doc.parent || ''}`
+      ].join('\n');
+      fs.writeFileSync(file, meta + '\n\n' + body, 'utf-8');
+      this.state.stats.saved++;
+      this._saveState();
+      return file;
+    } catch (e) {
+      this.state.stats.errors++;
+      this._saveState();
+      return null;
+    }
+  }
+
+  stats() {
+    return Object.assign({}, this.state.stats, {
+      frontier: this.state.frontier.length,
+      visited: Object.keys(this.state.visited).length
+    });
+  }
+
+  // consume saved documents: read them back in batches and return an array of texts
+  // returns: [{ path, text }]
+  loadRecentDocs(maxFiles = 20) {
+    const d = this.docsDir;
+    if (!fs.existsSync(d)) return [];
+    const days = fs.readdirSync(d).filter(f => /^\d{4}-\d{2}-\d{2}$/.test(f)).sort().reverse();
+    const out = [];
+    for (const day of days) {
+      const dir = path.join(d, day);
+      const files = fs.readdirSync(dir).filter(f => f.endsWith('.txt')).sort().reverse();
+      for (const f of files) {
+        try {
+          const p = path.join(dir, f);
+          const text = fs.readFileSync(p, 'utf-8');
+          out.push({ path: p, text });
+          if (out.length >= maxFiles) return out;
+        } catch {}
+      }
+    }
+    return out;
+  }
+}
+
+module.exports = { CrawlerStorage };
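For reviewers tracing how this new storage layer is meant to be driven, below is a minimal usage sketch based only on the CrawlerStorage API visible in this hunk. The fetchPage helper is a hypothetical stand-in: the real fetch-and-clean loop lives in package/crawler/index.cjs, which this section does not show, and the seed list path is illustrative.

// Minimal driver sketch for the CrawlerStorage API above (Node 18+ for global fetch).
// Assumption: fetchPage() stands in for the real fetch/clean logic in crawler/index.cjs.
const { CrawlerStorage } = require('./package/crawler/storage.cjs');

async function fetchPage(url) {
  // Hypothetical: fetch a page and return { title, text, links }.
  const res = await fetch(url);
  const html = await res.text();
  return { title: '', text: html, links: [] };
}

async function crawlOnce(store) {
  const url = store.nextUrl();          // pop the next unvisited URL
  if (!url) return false;               // frontier exhausted
  try {
    const page = await fetchPage(url);
    store.saveDocument({ url, title: page.title, text: page.text, depth: 0, parent: '' });
    store.enqueueLinks(page.links);     // sha1-based dedup happens inside
  } catch (e) {
    console.warn('[CRAWL] fetch failed:', url, e.message);
  } finally {
    store.markVisited(url);             // popped URLs are never retried
  }
  return true;
}

(async () => {
  const store = new CrawlerStorage();
  store.addSeed(['https://example.com/']);
  while (await crawlOnce(store)) {}
  console.log(store.stats());           // { fetched, saved, deduped, errors, lastSave, frontier, visited }
})();

Note the persistence model this implies: markVisited and enqueueLinks each synchronously rewrite all of state.json, so a long crawl pays one full-state serialization per URL, in exchange for being resumable after a crash from the last saved frontier.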