@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
|
@@ -0,0 +1,518 @@
|
|
|
1
|
+
import chalk from 'chalk';
|
|
2
|
+
import pLimit from 'p-limit';
|
|
3
|
+
import robotsParser from 'robots-parser';
|
|
4
|
+
import { Fetcher } from './fetcher.js';
|
|
5
|
+
import { Parser } from './parser.js';
|
|
6
|
+
import { Sitemap } from './sitemap.js';
|
|
7
|
+
import { normalizeUrl } from './normalize.js';
|
|
8
|
+
import { TrapDetector } from './trap.js';
|
|
9
|
+
import { ScopeManager } from '../core/scope/scopeManager.js';
|
|
10
|
+
import { getDb } from '../db/index.js';
|
|
11
|
+
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
12
|
+
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
13
|
+
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
14
|
+
import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
|
|
15
|
+
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
16
|
+
import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
|
|
17
|
+
import { analyzeLinks } from '../analysis/links.js';
|
|
18
|
+
// Fallback context for backward compatibility or when no context is provided.
// Only 'error' and 'warn' events are surfaced (on the console); every other
// event type is dropped so that callers without a listener stay quiet.
const nullContext = {
    /**
     * Minimal event sink used when the caller supplies no context.
     * @param event Event object; `type`, `message` and (for errors) `error` are read.
     */
    emit: (event) => {
        if (event.type === 'error') {
            // Pass the error payload only when there is one, so that plain
            // messages are not printed with a dangling empty argument.
            if (event.error) {
                console.error(event.message, event.error);
            }
            else {
                console.error(event.message);
            }
        }
        else if (event.type === 'warn') {
            console.warn(event.message);
        }
    }
};
|
|
31
|
+
/**
 * Crawls a site starting from `startUrl` and persists what it finds (pages,
 * edges and per-page metrics) under a newly created snapshot.
 *
 * URLs are taken from a FIFO queue and dispatched through `p-limit`.
 * Progress, warnings and errors are reported through `context.emit`; when no
 * context is supplied the module-level `nullContext` fallback is used.
 */
export class Crawler {
    startUrl;
    options;
    context;
    visited;
    uniqueQueue;
    queue;
    active;
    pagesCrawled;
    reachedLimit;
    maxDepthInCrawl;
    concurrency;
    limitConcurrency;
    // Repositories (created in initialize())
    siteRepo = null;
    snapshotRepo = null;
    pageRepo = null;
    edgeRepo = null;
    metricsRepo = null;
    // Site/Snapshot info
    siteId = null;
    snapshotId = null;
    rootOrigin = '';
    // Discovery tracking: url -> smallest depth at which it has been seen
    discoveryDepths = new Map();
    // Buffers for batch operations
    pageBuffer = new Map();
    edgeBuffer = [];
    metricsBuffer = [];
    // Modules (created in setupModules(), robots in fetchRobots())
    scopeManager = null;
    fetcher = null;
    parser = null;
    sitemapFetcher = null;
    trapDetector = null;
    robots = null;
    /**
     * @param startUrl URL the crawl starts from (normalized later by initialize()).
     * @param options Crawl options; `depth` is capped at 10 and `concurrency`
     *   defaults to 2 and is capped at 10.
     * @param context Optional event sink exposing `emit(event)`.
     */
    constructor(startUrl, options, context) {
        this.startUrl = startUrl;
        this.options = options;
        this.context = context || nullContext;
        this.visited = new Set();
        this.uniqueQueue = new Set();
        this.queue = [];
        this.active = 0;
        this.pagesCrawled = 0;
        this.reachedLimit = false;
        this.maxDepthInCrawl = Math.min(options.depth, 10);
        this.concurrency = Math.min(options.concurrency || 2, 10);
        this.limitConcurrency = pLimit(this.concurrency);
    }
    /**
     * Creates the repositories, resolves the site record and opens a snapshot.
     * Also replaces `startUrl` with its normalized form and seeds its depth.
     * @throws Error('Invalid start URL') when the start URL cannot be normalized.
     */
    async initialize() {
        const db = getDb();
        this.siteRepo = new SiteRepository(db);
        this.snapshotRepo = new SnapshotRepository(db);
        this.pageRepo = new PageRepository(db);
        this.edgeRepo = new EdgeRepository(db);
        this.metricsRepo = new MetricsRepository(db);
        const rootUrl = normalizeUrl(this.startUrl, '', { stripQuery: this.options.stripQuery });
        if (!rootUrl)
            throw new Error('Invalid start URL');
        const urlObj = new URL(rootUrl);
        // Strip only a leading "www." label. A plain string pattern would remove
        // the first "www." found anywhere in the hostname
        // (e.g. "awww.example.com" would become "aexample.com").
        const domain = urlObj.hostname.replace(/^www\./, '');
        const site = this.siteRepo.firstOrCreateSite(domain);
        this.siteId = site.id;
        const type = this.options.snapshotType || (this.options.previousGraph ? 'incremental' : 'full');
        this.snapshotId = this.snapshotRepo.createSnapshot(this.siteId, type);
        this.rootOrigin = urlObj.origin;
        this.startUrl = rootUrl;
        // Seed discovery depth for root
        this.discoveryDepths.set(this.startUrl, 0);
    }
    /**
     * Instantiates the scope manager, fetcher, parser, sitemap fetcher and trap
     * detector from the crawl options. Must run after initialize(), because the
     * scope manager is rooted at the normalized `startUrl`.
     */
    setupModules() {
        this.scopeManager = new ScopeManager({
            allowedDomains: this.options.allowedDomains || [],
            deniedDomains: this.options.deniedDomains || [],
            includeSubdomains: this.options.includeSubdomains || false,
            rootUrl: this.startUrl
        });
        this.fetcher = new Fetcher({
            rate: this.options.rate,
            proxyUrl: this.options.proxyUrl,
            scopeManager: this.scopeManager,
            maxRedirects: this.options.maxRedirects,
            userAgent: this.options.userAgent
        });
        this.parser = new Parser();
        this.sitemapFetcher = new Sitemap(this.context);
        this.trapDetector = new TrapDetector();
    }
    /**
     * Best-effort download of `/robots.txt` from the root origin. On a 2xx
     * response the parsed rules are stored in `this.robots`; any failure is
     * reported as a 'warn' event and the crawl proceeds without robots rules.
     */
    async fetchRobots() {
        try {
            const robotsUrl = new URL('/robots.txt', this.rootOrigin).toString();
            const res = await this.fetcher.fetch(robotsUrl, { maxBytes: 500000 });
            if (res && typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
                this.robots = robotsParser(robotsUrl, res.body);
            }
        }
        catch (e) {
            // Report through the event context, like every other warning in this
            // class, instead of writing to the console directly.
            this.context.emit({ type: 'warn', message: 'Failed to fetch robots.txt, proceeding...', context: e });
        }
    }
    /**
     * Decides whether a discovered URL may be queued.
     * @param url Normalized URL.
     * @param depth Depth at which the URL was discovered.
     * @returns false when the URL was already visited or queued, is deeper than
     *   the depth cap, is out of scope, or (with `detectTraps`) has a trap risk
     *   above 0.8; true otherwise.
     */
    shouldEnqueue(url, depth) {
        if (this.visited.has(url))
            return false;
        if (this.uniqueQueue.has(url))
            return false;
        if (depth > this.maxDepthInCrawl)
            return false;
        if (this.scopeManager.isUrlEligible(url) !== 'allowed')
            return false;
        if (this.options.detectTraps) {
            const trap = this.trapDetector.checkTrap(url, depth);
            if (trap.risk > 0.8)
                return false;
        }
        return true;
    }
    /**
     * Appends an in-scope URL to the queue once, emits 'queue:enqueue' and
     * records the smallest discovery depth seen for it.
     * @param u Normalized URL.
     * @param d Depth at which it was discovered.
     */
    addToQueue(u, d) {
        if (this.scopeManager.isUrlEligible(u) !== 'allowed')
            return;
        if (!this.uniqueQueue.has(u)) {
            this.uniqueQueue.add(u);
            this.queue.push({ url: u, depth: d });
            this.context.emit({ type: 'queue:enqueue', url: u, depth: d });
            const currentDiscovery = this.discoveryDepths.get(u);
            if (currentDiscovery === undefined || d < currentDiscovery) {
                this.discoveryDepths.set(u, d);
            }
        }
    }
    /**
     * Fills the queue from the sitemap (when `options.sitemap` is set) and then
     * from the start URL. `true` / 'true' mean "<origin>/sitemap.xml"; any other
     * value is used as the sitemap URL itself. Sitemap failures are reported as
     * a 'warn' event and do not abort seeding.
     */
    async seedQueue() {
        // Seed from Sitemap
        const sitemapOption = this.options.sitemap;
        if (sitemapOption) {
            try {
                // Accept the boolean flag as well as its string form; a bare `true`
                // has no startsWith() and used to end up in the catch below.
                const useDefaultLocation = sitemapOption === true || sitemapOption === 'true';
                const sitemapUrl = useDefaultLocation
                    ? new URL('/sitemap.xml', this.rootOrigin).toString()
                    : sitemapOption;
                if (sitemapUrl.startsWith('http')) {
                    this.context.emit({ type: 'info', message: 'Fetching sitemap', context: { url: sitemapUrl } });
                    const sitemapUrls = await this.sitemapFetcher.fetch(sitemapUrl);
                    for (const u of sitemapUrls) {
                        const normalized = normalizeUrl(u, '', this.options);
                        if (normalized)
                            this.addToQueue(normalized, 0);
                    }
                }
            }
            catch (e) {
                this.context.emit({ type: 'warn', message: 'Sitemap fetch failed', context: e });
            }
        }
        // Seed from startUrl
        this.addToQueue(this.startUrl, 0);
    }
    /**
     * Stages a page row for the next batch upsert, merging with a row already
     * staged for the same URL. The page buffer is flushed once it holds 50 rows.
     * @param url Normalized URL of the page.
     * @param depth Depth at which this sighting happened.
     * @param status HTTP status, or 0 when the page has not been fetched.
     * @param data Extra columns to merge into the row.
     */
    bufferPage(url, depth, status, data = {}) {
        const existing = this.pageBuffer.get(url);
        const knownDiscovery = this.discoveryDepths.get(url);
        // Always use the best (minimum) depth discovered for this URL
        const finalDepth = knownDiscovery !== undefined ? Math.min(knownDiscovery, depth) : depth;
        if (knownDiscovery === undefined || depth < knownDiscovery) {
            this.discoveryDepths.set(url, depth);
        }
        // If we already have a buffered record, only update if the new one is more
        // "complete" (has status), has a better depth, or carries extra data.
        if (existing) {
            const isStatusUpdate = status !== 0 && existing.http_status === 0;
            const isBetterDepth = finalDepth < existing.depth;
            if (!isStatusUpdate && !isBetterDepth && Object.keys(data).length === 0) {
                return;
            }
            this.pageBuffer.set(url, {
                ...existing,
                depth: finalDepth,
                http_status: status !== 0 ? status : existing.http_status,
                ...data
            });
        }
        else {
            this.pageBuffer.set(url, {
                site_id: this.siteId,
                normalized_url: url,
                depth: finalDepth,
                http_status: status,
                last_seen_snapshot_id: this.snapshotId,
                ...data
            });
        }
        if (this.pageBuffer.size >= 50) {
            this.flushPages();
        }
    }
    /** Writes all staged page rows through the page repository and empties the buffer. */
    flushPages() {
        if (this.pageBuffer.size === 0)
            return;
        this.pageRepo.upsertMany(Array.from(this.pageBuffer.values()));
        this.pageBuffer.clear();
    }
    /**
     * Stages a link between two URLs; the edge buffer is flushed at 100 entries.
     * @param sourceUrl Normalized URL of the linking page.
     * @param targetUrl Normalized URL of the linked page.
     * @param weight Edge weight (default 1.0).
     * @param rel Edge relation label (default 'internal').
     */
    bufferEdge(sourceUrl, targetUrl, weight = 1.0, rel = 'internal') {
        this.edgeBuffer.push({ sourceUrl, targetUrl, weight, rel });
        if (this.edgeBuffer.length >= 100) {
            this.flushEdges();
        }
    }
    /**
     * Resolves staged edges to page ids and inserts them. Edges whose source or
     * target page cannot be resolved in this snapshot are dropped.
     */
    flushEdges() {
        if (this.edgeBuffer.length === 0)
            return;
        // To resolve URLs to IDs, we need to make sure pages are flushed first
        this.flushPages();
        const identities = this.pageRepo.getPagesIdentityBySnapshot(this.snapshotId);
        const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
        const edgesToInsert = this.edgeBuffer
            .map(e => ({
            snapshot_id: this.snapshotId,
            source_page_id: urlToId.get(e.sourceUrl),
            target_page_id: urlToId.get(e.targetUrl),
            weight: e.weight,
            rel: e.rel
        }))
            .filter(e => e.source_page_id !== undefined && e.target_page_id !== undefined);
        if (edgesToInsert.length > 0) {
            this.edgeRepo.insertEdges(edgesToInsert);
        }
        this.edgeBuffer = [];
    }
    /**
     * Stages a metrics record for a URL; the metrics buffer is flushed at 50 entries.
     * @param url Normalized URL the metrics belong to.
     * @param data Metric columns overriding the null defaults applied on flush.
     */
    bufferMetrics(url, data) {
        this.metricsBuffer.push({ url, data });
        if (this.metricsBuffer.length >= 50) {
            this.flushMetrics();
        }
    }
    /**
     * Resolves staged metrics to page ids and inserts them with null defaults
     * for every column the record does not set. Records whose page cannot be
     * resolved in this snapshot are dropped.
     */
    flushMetrics() {
        if (this.metricsBuffer.length === 0)
            return;
        // Pages must exist before their ids can be looked up.
        this.flushPages();
        const identities = this.pageRepo.getPagesIdentityBySnapshot(this.snapshotId);
        const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
        const metricsList = this.metricsBuffer.map(item => {
            const pageId = urlToId.get(item.url);
            if (!pageId)
                return null;
            return {
                snapshot_id: this.snapshotId,
                page_id: pageId,
                authority_score: null,
                hub_score: null,
                pagerank: null,
                pagerank_score: null,
                link_role: null,
                crawl_status: null,
                word_count: null,
                thin_content_score: null,
                external_link_ratio: null,
                orphan_score: null,
                duplicate_cluster_id: null,
                duplicate_type: null,
                is_cluster_primary: 0,
                ...item.data
            };
        }).filter(m => m !== null);
        if (metricsList.length > 0) {
            this.metricsRepo.insertMany(metricsList);
        }
        this.metricsBuffer = [];
    }
    /** Flushes pages, then edges, then metrics. */
    async flushAll() {
        this.flushPages();
        this.flushEdges();
        this.flushMetrics();
    }
    /**
     * Fetches one URL and emits 'crawl:start' followed by 'crawl:success' or
     * 'crawl:error'.
     * @param url URL to fetch.
     * @param depth Depth of the URL (reported in the events).
     * @param prevNode Node from the previous graph, if any; its `etag` and
     *   `lastModified` are forwarded to the fetcher.
     * @returns The fetcher result, or null when the fetch threw.
     */
    async fetchPage(url, depth, prevNode) {
        const startTime = Date.now();
        try {
            this.context.emit({ type: 'crawl:start', url });
            const res = await this.fetcher.fetch(url, {
                maxBytes: this.options.maxBytes,
                crawlDelay: this.robots ? this.robots.getCrawlDelay('crawlith') : undefined,
                etag: prevNode?.etag,
                lastModified: prevNode?.lastModified
            });
            const durationMs = Date.now() - startTime;
            this.context.emit({
                type: 'crawl:success',
                url,
                // String statuses are fetcher-level failure codes; report them as 0.
                status: typeof res.status === 'number' ? res.status : 0,
                durationMs,
                depth
            });
            return res;
        }
        catch (e) {
            this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
            return null;
        }
    }
    /**
     * Handles a 304 response: re-stages the page from the previous graph's node,
     * marks it 'cached', and replays the node's outgoing links from the previous
     * graph so the crawl can continue past it.
     * @param url URL as it was requested (used to look up previous edges).
     * @param finalUrl Normalized final URL of the response.
     * @param depth Depth of the page.
     * @param prevNode Node for `url` in the previous graph.
     */
    handleCachedResponse(url, finalUrl, depth, prevNode) {
        this.bufferPage(finalUrl, depth, 200, {
            html: prevNode.html,
            canonical_url: prevNode.canonical,
            content_hash: prevNode.contentHash,
            simhash: prevNode.simhash,
            etag: prevNode.etag,
            last_modified: prevNode.lastModified,
            noindex: prevNode.noindex ? 1 : 0,
            nofollow: prevNode.nofollow ? 1 : 0
        });
        this.bufferMetrics(finalUrl, {
            crawl_status: 'cached'
        });
        // Re-discovery links from previous graph to continue crawling if needed
        const prevLinks = this.options.previousGraph?.getEdges()
            .filter(e => e.source === url)
            .map(e => e.target);
        if (prevLinks) {
            for (const link of prevLinks) {
                const normalizedLink = normalizeUrl(link, '', this.options);
                if (normalizedLink && normalizedLink !== finalUrl) {
                    this.bufferPage(normalizedLink, depth + 1, 0);
                    this.bufferEdge(finalUrl, normalizedLink, 1.0, 'internal');
                    if (this.shouldEnqueue(normalizedLink, depth + 1)) {
                        this.addToQueue(normalizedLink, depth + 1);
                    }
                }
            }
        }
    }
    /**
     * Records every hop of a redirect chain as a page pair plus a connecting edge.
     * @param chain Redirect steps; `url`, `target` and `status` are read from each.
     * @param depth Depth assigned to both ends of every hop.
     */
    handleRedirects(chain, depth) {
        for (const step of chain) {
            const source = normalizeUrl(step.url, '', this.options);
            const target = normalizeUrl(step.target, '', this.options);
            if (source && target) {
                this.bufferPage(source, depth, step.status);
                this.bufferPage(target, depth, 0);
                this.bufferEdge(source, target);
            }
        }
    }
    /**
     * Handles a 200 response. Non-HTML content is recorded with its status only;
     * HTML is parsed, staged with its extracted fields and per-page metrics, and
     * its links are staged as pages/edges and queued when eligible.
     * @param res Fetcher result.
     * @param finalUrl Normalized final URL of the response.
     * @param depth Depth of the page.
     * @param isBlocked Whether robots rules disallow this URL (only reachable
     *   when `ignoreRobots` is set); recorded as the page's crawl status.
     */
    handleSuccessResponse(res, finalUrl, depth, isBlocked = false) {
        const contentTypeHeader = res.headers['content-type'];
        const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
        if (!contentType || !contentType.toLowerCase().includes('text/html')) {
            this.bufferPage(finalUrl, depth, typeof res.status === 'number' ? res.status : 0);
            return;
        }
        const parseResult = this.parser.parse(res.body, finalUrl, res.status);
        this.bufferPage(finalUrl, depth, res.status, {
            html: parseResult.html,
            canonical_url: parseResult.canonical || undefined,
            noindex: parseResult.noindex ? 1 : 0,
            nofollow: parseResult.nofollow ? 1 : 0,
            content_hash: parseResult.contentHash,
            simhash: parseResult.simhash,
            soft404_score: parseResult.soft404Score,
            etag: res.etag,
            last_modified: res.lastModified,
            retries: res.retries
        });
        try {
            const contentAnalysis = analyzeContent(parseResult.html);
            const linkAnalysis = analyzeLinks(parseResult.html, finalUrl, this.rootOrigin);
            const thinScore = calculateThinContentScore(contentAnalysis, 0);
            this.bufferMetrics(finalUrl, {
                crawl_status: isBlocked ? 'blocked_by_robots' : 'fetched',
                word_count: contentAnalysis.wordCount,
                thin_content_score: thinScore,
                external_link_ratio: linkAnalysis.externalRatio
            });
        }
        catch (e) {
            // Metrics are optional; a failure here must not stop link discovery.
            this.context.emit({ type: 'error', message: 'Error calculating per-page metrics', error: e, context: { url: finalUrl } });
        }
        for (const linkItem of parseResult.links) {
            const normalizedLink = normalizeUrl(linkItem.url, '', this.options);
            if (normalizedLink && normalizedLink !== finalUrl) {
                this.bufferPage(normalizedLink, depth + 1, 0);
                this.bufferEdge(finalUrl, normalizedLink, 1.0, 'internal');
                if (this.shouldEnqueue(normalizedLink, depth + 1)) {
                    this.addToQueue(normalizedLink, depth + 1);
                }
            }
        }
    }
    /**
     * Fetches and records a single queued item: out-of-scope URLs are recorded
     * as blocked, 304s are served from the previous graph, redirect hops are
     * recorded, failures (string statuses or status >= 300) are stored with a
     * crawl status, and 200 responses are handed to handleSuccessResponse().
     * @param item Queue entry with `url` and `depth`.
     * @param isBlocked Whether robots rules disallow the URL.
     */
    async processPage(item, isBlocked = false) {
        const { url, depth } = item;
        if (this.scopeManager.isUrlEligible(url) !== 'allowed') {
            // Same snake_case key as the fetch-failure branch below, so both kinds
            // of security error land in one field.
            this.bufferPage(url, depth, 0, { security_error: 'blocked_by_domain_filter' });
            return;
        }
        try {
            const prevNode = this.options.previousGraph?.nodes.get(url);
            const res = await this.fetchPage(url, depth, prevNode);
            if (!res)
                return;
            const finalUrl = normalizeUrl(res.finalUrl, '', this.options);
            if (!finalUrl)
                return;
            if (res.status === 304 && prevNode) {
                this.handleCachedResponse(url, finalUrl, depth, prevNode);
                return;
            }
            this.handleRedirects(res.redirectChain, depth);
            const isStringStatus = typeof res.status === 'string';
            if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
                const statusNum = typeof res.status === 'number' ? res.status : 0;
                this.bufferPage(finalUrl, depth, statusNum, {
                    security_error: isStringStatus ? res.status : undefined,
                    retries: res.retries
                });
                this.bufferMetrics(finalUrl, {
                    crawl_status: isStringStatus ? res.status : 'fetched_error'
                });
                return;
            }
            if (res.status === 200) {
                this.handleSuccessResponse(res, finalUrl, depth, isBlocked);
            }
        }
        catch (e) {
            this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
        }
    }
    /**
     * Runs the whole crawl: initialize, set up modules, fetch robots.txt, seed
     * the queue, then process the queue until it drains or `options.limit`
     * pages have been counted.
     * @returns Promise resolving to the snapshot id once all buffers are flushed
     *   and the snapshot is marked 'completed'. It rejects if scheduling,
     *   flushing or page processing throws.
     */
    async run() {
        await this.initialize();
        this.setupModules();
        await this.fetchRobots();
        await this.seedQueue();
        return new Promise((resolve, reject) => {
            // Set once the crawl is being finalized (or has failed) so that late
            // callbacks neither finalize twice nor dispatch new work.
            let finished = false;
            const fail = (err) => {
                finished = true;
                reject(err);
            };
            // Flush everything, mark the snapshot completed and settle the promise.
            const finish = async (limitReached, announceLimit) => {
                finished = true;
                await this.flushAll();
                this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
                    limit_reached: limitReached ? 1 : 0
                });
                if (announceLimit) {
                    this.context.emit({ type: 'crawl:limit-reached', limit: this.options.limit });
                }
                resolve(this.snapshotId);
            };
            const next = async () => {
                if (finished)
                    return;
                if (this.queue.length === 0 && this.active === 0) {
                    await finish(this.reachedLimit, false);
                    return;
                }
                if (this.pagesCrawled >= this.options.limit) {
                    this.reachedLimit = true;
                    if (this.active === 0) {
                        await finish(true, true);
                    }
                    return;
                }
                while (this.queue.length > 0 && this.active < this.concurrency && this.pagesCrawled < this.options.limit) {
                    const item = this.queue.shift();
                    if (this.visited.has(item.url))
                        continue;
                    // Robust robots check: if path doesn't end in /, check both /path and /path/
                    // to handle cases where normalization stripped a slash that robots.txt relies on.
                    const isBlocked = this.robots && (!this.robots.isAllowed(item.url, 'crawlith') ||
                        (!item.url.endsWith('/') && !this.robots.isAllowed(item.url + '/', 'crawlith')));
                    if (isBlocked) {
                        if (this.options.debug) {
                            console.log(`${chalk.yellow('⊘ Robots')} ${chalk.gray(item.url)}`);
                        }
                        // Stage the page before its metric: a metrics flush resolves
                        // page ids and drops records whose page does not exist yet.
                        this.bufferPage(item.url, item.depth, 0);
                        // Tag as blocked for reporting
                        this.bufferMetrics(item.url, {
                            crawl_status: 'blocked_by_robots'
                        });
                        if (!this.options.ignoreRobots) {
                            this.visited.add(item.url);
                            this.pagesCrawled++;
                            continue;
                        }
                    }
                    this.active++;
                    this.pagesCrawled++;
                    this.visited.add(item.url);
                    this.limitConcurrency(() => this.processPage(item, isBlocked))
                        .finally(() => {
                        this.active--;
                        pump();
                    })
                        .catch(fail);
                }
                // Pages still in flight call pump() again when they settle.
                if (this.active !== 0)
                    return;
                if (this.queue.length === 0) {
                    await finish(this.reachedLimit, false);
                }
                else if (this.pagesCrawled >= this.options.limit) {
                    // The limit was hit inside the loop with nothing in flight (for
                    // example the last counted URLs were robots-blocked). Nothing
                    // would call next() again, so finalize here.
                    this.reachedLimit = true;
                    await finish(true, true);
                }
            };
            // Starts a scheduling pass and routes any failure into the returned promise.
            const pump = () => {
                next().catch(fail);
            };
            pump();
        });
    }
}
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Extracts all links from an HTML document.
|
|
3
3
|
* Returns absolute URLs.
|
|
4
|
+
* @param html The HTML content string
|
|
5
|
+
* @param baseUrl The base URL to resolve relative links against
|
|
6
|
+
* @param onError Optional callback for handling extraction errors
|
|
4
7
|
*/
|
|
5
|
-
export declare function extractLinks(html: string, baseUrl: string): string[];
|
|
8
|
+
export declare function extractLinks(html: string, baseUrl: string, onError?: (error: unknown) => void): string[];
|
package/dist/crawler/extract.js
CHANGED
|
@@ -2,8 +2,11 @@ import * as cheerio from 'cheerio';
|
|
|
2
2
|
/**
|
|
3
3
|
* Extracts all links from an HTML document.
|
|
4
4
|
* Returns absolute URLs.
|
|
5
|
+
* @param html The HTML content string
|
|
6
|
+
* @param baseUrl The base URL to resolve relative links against
|
|
7
|
+
* @param onError Optional callback for handling extraction errors
|
|
5
8
|
*/
|
|
6
|
-
export function extractLinks(html, baseUrl) {
|
|
9
|
+
export function extractLinks(html, baseUrl, onError) {
|
|
7
10
|
try {
|
|
8
11
|
const $ = cheerio.load(html);
|
|
9
12
|
const links = new Set();
|
|
@@ -27,7 +30,9 @@ export function extractLinks(html, baseUrl) {
|
|
|
27
30
|
return Array.from(links);
|
|
28
31
|
}
|
|
29
32
|
catch (e) {
|
|
30
|
-
|
|
33
|
+
if (onError) {
|
|
34
|
+
onError(e);
|
|
35
|
+
}
|
|
31
36
|
return [];
|
|
32
37
|
}
|
|
33
38
|
}
|
package/dist/crawler/fetcher.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { request } from 'undici';
|
|
2
|
+
import * as net from 'net';
|
|
2
3
|
import { IPGuard } from '../core/security/ipGuard.js';
|
|
3
4
|
import { RateLimiter } from '../core/network/rateLimiter.js';
|
|
4
5
|
import { RetryPolicy } from '../core/network/retryPolicy.js';
|
|
@@ -10,11 +11,18 @@ export class Fetcher {
|
|
|
10
11
|
userAgent = 'crawlith/1.0';
|
|
11
12
|
rateLimiter;
|
|
12
13
|
proxyAdapter;
|
|
14
|
+
secureDispatcher;
|
|
13
15
|
scopeManager;
|
|
14
16
|
maxRedirects;
|
|
15
17
|
constructor(options = {}) {
|
|
16
18
|
this.rateLimiter = new RateLimiter(options.rate || 2);
|
|
17
19
|
this.proxyAdapter = new ProxyAdapter(options.proxyUrl);
|
|
20
|
+
if (this.proxyAdapter.dispatcher) {
|
|
21
|
+
this.secureDispatcher = this.proxyAdapter.dispatcher;
|
|
22
|
+
}
|
|
23
|
+
else {
|
|
24
|
+
this.secureDispatcher = IPGuard.getSecureDispatcher();
|
|
25
|
+
}
|
|
18
26
|
this.scopeManager = options.scopeManager;
|
|
19
27
|
this.maxRedirects = Math.min(options.maxRedirects ?? 2, 11);
|
|
20
28
|
this.userAgent = options.userAgent || `crawlith/${version}`;
|
|
@@ -28,10 +36,14 @@ export class Fetcher {
|
|
|
28
36
|
// Use a while(true) and explicit return/continue to handle redirects
|
|
29
37
|
while (true) {
|
|
30
38
|
const urlObj = new URL(currentUrl);
|
|
31
|
-
// 1. SSRF Guard
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
39
|
+
// 1. SSRF Guard (IP Literals only)
|
|
40
|
+
// We only check explicit IP literals here to fail fast.
|
|
41
|
+
// For domains, we rely on the secureDispatcher (which uses IPGuard.secureLookup)
|
|
42
|
+
// to resolve and validate the IP at connection time, preventing TOCTOU attacks.
|
|
43
|
+
if (net.isIP(urlObj.hostname)) {
|
|
44
|
+
if (IPGuard.isInternal(urlObj.hostname)) {
|
|
45
|
+
return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
|
|
46
|
+
}
|
|
35
47
|
}
|
|
36
48
|
// 2. Scope Validation (Domain & Subdomain)
|
|
37
49
|
if (this.scopeManager) {
|
|
@@ -61,7 +73,7 @@ export class Fetcher {
|
|
|
61
73
|
method: 'GET',
|
|
62
74
|
headers,
|
|
63
75
|
maxRedirections: 0,
|
|
64
|
-
dispatcher: this.
|
|
76
|
+
dispatcher: this.secureDispatcher,
|
|
65
77
|
headersTimeout: 10000,
|
|
66
78
|
bodyTimeout: 10000
|
|
67
79
|
});
|
|
@@ -141,6 +153,9 @@ export class Fetcher {
|
|
|
141
153
|
catch (error) {
|
|
142
154
|
// Map common network errors to specific statuses if needed
|
|
143
155
|
const isProxyError = error.message?.toLowerCase().includes('proxy') || error.code === 'ECONNREFUSED';
|
|
156
|
+
if (error.code === 'EBLOCKED' || error.message?.includes('Blocked internal IP')) {
|
|
157
|
+
return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
|
|
158
|
+
}
|
|
144
159
|
const finalStatus = isProxyError ? 'proxy_connection_failed' : 'network_error';
|
|
145
160
|
return this.errorResult(totalRetries >= RetryPolicy.DEFAULT_CONFIG.maxRetries ? 'failed_after_retries' : finalStatus, currentUrl, redirectChain, totalRetries);
|
|
146
161
|
}
|
|
@@ -1 +1,3 @@
|
|
|
1
|
-
|
|
1
|
+
import { EngineContext } from '../events.js';
|
|
2
|
+
import { Graph } from '../graph/graph.js';
|
|
3
|
+
export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, context?: EngineContext, limitReached?: boolean, graphInstance?: Graph): void;
|