@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/crawler/crawler.js
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import chalk from 'chalk';
|
|
1
|
+
import chalk from '../utils/chalk.js';
|
|
2
2
|
import pLimit from 'p-limit';
|
|
3
3
|
import robotsParser from 'robots-parser';
|
|
4
4
|
import { Fetcher } from './fetcher.js';
|
|
5
5
|
import { Parser } from './parser.js';
|
|
6
6
|
import { Sitemap } from './sitemap.js';
|
|
7
|
-
import { normalizeUrl } from './normalize.js';
|
|
8
|
-
import {
|
|
7
|
+
import { normalizeUrl, UrlUtil } from './normalize.js';
|
|
8
|
+
import { UrlResolver } from './resolver.js';
|
|
9
9
|
import { ScopeManager } from '../core/scope/scopeManager.js';
|
|
10
10
|
import { getDb } from '../db/index.js';
|
|
11
11
|
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
@@ -15,6 +15,7 @@ import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
|
|
|
15
15
|
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
16
16
|
import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
|
|
17
17
|
import { analyzeLinks } from '../analysis/links.js';
|
|
18
|
+
import { DEFAULTS } from '../constants.js';
|
|
18
19
|
// Fallback context for backward compatibility or when no context is provided
|
|
19
20
|
const nullContext = {
|
|
20
21
|
emit: (event) => {
|
|
@@ -32,6 +33,7 @@ export class Crawler {
|
|
|
32
33
|
startUrl;
|
|
33
34
|
options;
|
|
34
35
|
context;
|
|
36
|
+
registry;
|
|
35
37
|
visited;
|
|
36
38
|
uniqueQueue;
|
|
37
39
|
queue;
|
|
@@ -50,6 +52,8 @@ export class Crawler {
|
|
|
50
52
|
// Site/Snapshot info
|
|
51
53
|
siteId = null;
|
|
52
54
|
snapshotId = null;
|
|
55
|
+
reusingSnapshot = false;
|
|
56
|
+
runType = 'completed';
|
|
53
57
|
rootOrigin = '';
|
|
54
58
|
// Discovery tracking
|
|
55
59
|
discoveryDepths = new Map();
|
|
@@ -57,27 +61,34 @@ export class Crawler {
|
|
|
57
61
|
pageBuffer = new Map();
|
|
58
62
|
edgeBuffer = [];
|
|
59
63
|
metricsBuffer = [];
|
|
64
|
+
pendingSitemaps = 0;
|
|
65
|
+
edgesFound = 0;
|
|
66
|
+
lastProgressEmitAt = 0;
|
|
67
|
+
progressPhase = 'crawling';
|
|
60
68
|
// Modules
|
|
61
69
|
scopeManager = null;
|
|
62
70
|
fetcher = null;
|
|
63
71
|
parser = null;
|
|
64
72
|
sitemapFetcher = null;
|
|
65
|
-
trapDetector = null;
|
|
66
73
|
robots = null;
|
|
67
74
|
constructor(startUrl, options, context) {
|
|
68
75
|
this.startUrl = startUrl;
|
|
69
76
|
this.options = options;
|
|
70
77
|
this.context = context || nullContext;
|
|
78
|
+
this.registry = options.registry;
|
|
71
79
|
this.visited = new Set();
|
|
72
80
|
this.uniqueQueue = new Set();
|
|
73
81
|
this.queue = [];
|
|
74
82
|
this.active = 0;
|
|
75
83
|
this.pagesCrawled = 0;
|
|
76
84
|
this.reachedLimit = false;
|
|
77
|
-
this.maxDepthInCrawl = Math.min(options.depth,
|
|
78
|
-
this.concurrency = Math.min(options.concurrency ||
|
|
85
|
+
this.maxDepthInCrawl = Math.min(options.depth || DEFAULTS.MAX_DEPTH, DEFAULTS.MAX_DEPTH_LIMIT);
|
|
86
|
+
this.concurrency = Math.min(options.concurrency || DEFAULTS.CONCURRENCY, DEFAULTS.CONCURRENCY_LIMIT);
|
|
79
87
|
this.limitConcurrency = pLimit(this.concurrency);
|
|
80
88
|
}
|
|
89
|
+
toStorageUrl(url) {
|
|
90
|
+
return UrlUtil.isInternal(url, this.rootOrigin) ? UrlUtil.toPath(url, this.rootOrigin) : url;
|
|
91
|
+
}
|
|
81
92
|
async initialize() {
|
|
82
93
|
const db = getDb();
|
|
83
94
|
this.siteRepo = new SiteRepository(db);
|
|
@@ -85,41 +96,64 @@ export class Crawler {
|
|
|
85
96
|
this.pageRepo = new PageRepository(db);
|
|
86
97
|
this.edgeRepo = new EdgeRepository(db);
|
|
87
98
|
this.metricsRepo = new MetricsRepository(db);
|
|
88
|
-
|
|
99
|
+
// Use resolver to find canonical origin and SSL
|
|
100
|
+
const resolver = new UrlResolver();
|
|
101
|
+
const tempFetcher = new Fetcher({ userAgent: this.options.userAgent, rate: this.options.rate });
|
|
102
|
+
const resolved = await resolver.resolve(this.startUrl, tempFetcher);
|
|
103
|
+
this.rootOrigin = resolved.url;
|
|
104
|
+
// Use the resolved absolute URL as the base — NOT this.startUrl which may be
|
|
105
|
+
// a bare domain (e.g. 'callforpaper.org') that would be treated as a relative
|
|
106
|
+
// path when passed to normalizeUrl, producing '/callforpaper.org'.
|
|
107
|
+
const rootUrl = normalizeUrl(this.rootOrigin, '', { stripQuery: this.options.stripQuery });
|
|
89
108
|
if (!rootUrl)
|
|
90
109
|
throw new Error('Invalid start URL');
|
|
91
|
-
const urlObj = new URL(
|
|
110
|
+
const urlObj = new URL(this.rootOrigin);
|
|
92
111
|
const domain = urlObj.hostname.replace('www.', '');
|
|
93
112
|
const site = this.siteRepo.firstOrCreateSite(domain);
|
|
94
113
|
this.siteId = site.id;
|
|
95
|
-
|
|
96
|
-
this.
|
|
114
|
+
// Persist the resolved preferred URL and SSL status
|
|
115
|
+
this.siteRepo.updateSitePreference(this.siteId, {
|
|
116
|
+
preferred_url: this.rootOrigin,
|
|
117
|
+
ssl: this.rootOrigin.startsWith('https') ? 1 : 0
|
|
118
|
+
});
|
|
97
119
|
this.rootOrigin = urlObj.origin;
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
this.
|
|
101
|
-
|
|
102
|
-
setupModules() {
|
|
120
|
+
// Keep storage path-first for internal URLs and reconcile any legacy absolute rows.
|
|
121
|
+
this.pageRepo.reconcileInternalUrls(this.siteId, this.rootOrigin);
|
|
122
|
+
this.startUrl = this.toStorageUrl(rootUrl);
|
|
123
|
+
// Now that rootOrigin is resolved, initialize ScopeManager with the correct absolute origin
|
|
103
124
|
this.scopeManager = new ScopeManager({
|
|
104
125
|
allowedDomains: this.options.allowedDomains || [],
|
|
105
126
|
deniedDomains: this.options.deniedDomains || [],
|
|
106
127
|
includeSubdomains: this.options.includeSubdomains || false,
|
|
107
|
-
rootUrl: this.
|
|
128
|
+
rootUrl: this.rootOrigin
|
|
108
129
|
});
|
|
130
|
+
// Update fetcher with the now-initialized scopeManager
|
|
131
|
+
if (this.fetcher) {
|
|
132
|
+
this.fetcher.scopeManager = this.scopeManager;
|
|
133
|
+
}
|
|
134
|
+
// Every scan now creates a new snapshot (no reuse)
|
|
135
|
+
const runType = this.options.snapshotRunType || (this.options.previousGraph ? 'incremental' : 'completed');
|
|
136
|
+
this.snapshotId = this.snapshotRepo.createSnapshot(this.siteId, runType);
|
|
137
|
+
this.runType = runType;
|
|
138
|
+
// Expose snapshot context for plugins that persist per-snapshot data.
|
|
139
|
+
this.context.snapshotId = this.snapshotId;
|
|
140
|
+
// Seed discovery depth for root
|
|
141
|
+
this.discoveryDepths.set(this.startUrl, 0);
|
|
142
|
+
}
|
|
143
|
+
setupModules() {
|
|
109
144
|
this.fetcher = new Fetcher({
|
|
110
145
|
rate: this.options.rate,
|
|
111
146
|
proxyUrl: this.options.proxyUrl,
|
|
112
|
-
scopeManager: this.scopeManager,
|
|
147
|
+
scopeManager: this.scopeManager ?? undefined,
|
|
113
148
|
maxRedirects: this.options.maxRedirects,
|
|
114
149
|
userAgent: this.options.userAgent
|
|
115
150
|
});
|
|
116
151
|
this.parser = new Parser();
|
|
117
|
-
this.sitemapFetcher = new Sitemap(this.context);
|
|
118
|
-
this.trapDetector = new TrapDetector();
|
|
152
|
+
this.sitemapFetcher = new Sitemap(this.context, this.fetcher);
|
|
119
153
|
}
|
|
120
154
|
async fetchRobots() {
|
|
155
|
+
const robotsUrl = new URL('/robots.txt', this.rootOrigin).toString();
|
|
121
156
|
try {
|
|
122
|
-
const robotsUrl = new URL('/robots.txt', this.rootOrigin).toString();
|
|
123
157
|
const res = await this.fetcher.fetch(robotsUrl, { maxBytes: 500000 });
|
|
124
158
|
if (res && typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
|
|
125
159
|
this.robots = robotsParser(robotsUrl, res.body);
|
|
@@ -139,20 +173,22 @@ export class Crawler {
|
|
|
139
173
|
return false;
|
|
140
174
|
if (this.scopeManager.isUrlEligible(url) !== 'allowed')
|
|
141
175
|
return false;
|
|
142
|
-
if (this.
|
|
143
|
-
const
|
|
144
|
-
if (
|
|
176
|
+
if (this.registry) {
|
|
177
|
+
const allowed = this.registry.runSyncBailHook('shouldEnqueueUrl', this.context, url, depth);
|
|
178
|
+
if (allowed === false)
|
|
145
179
|
return false;
|
|
146
180
|
}
|
|
147
181
|
return true;
|
|
148
182
|
}
|
|
149
|
-
addToQueue(u, d) {
|
|
183
|
+
addToQueue(u, d, data = {}) {
|
|
150
184
|
if (this.scopeManager.isUrlEligible(u) !== 'allowed')
|
|
151
185
|
return;
|
|
152
186
|
if (!this.uniqueQueue.has(u)) {
|
|
153
187
|
this.uniqueQueue.add(u);
|
|
154
188
|
this.queue.push({ url: u, depth: d });
|
|
155
189
|
this.context.emit({ type: 'queue:enqueue', url: u, depth: d });
|
|
190
|
+
this.emitProgress();
|
|
191
|
+
this.bufferPage(u, d, 0, data);
|
|
156
192
|
const currentDiscovery = this.discoveryDepths.get(u);
|
|
157
193
|
if (currentDiscovery === undefined || d < currentDiscovery) {
|
|
158
194
|
this.discoveryDepths.set(u, d);
|
|
@@ -160,26 +196,72 @@ export class Crawler {
|
|
|
160
196
|
}
|
|
161
197
|
}
|
|
162
198
|
async seedQueue() {
|
|
163
|
-
// Seed from
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
this.addToQueue(normalized, 0);
|
|
174
|
-
}
|
|
175
|
-
}
|
|
199
|
+
// Seed from startUrl first to ensure it's prioritized in the queue
|
|
200
|
+
this.addToQueue(this.startUrl, 0);
|
|
201
|
+
const sitemapsToFetch = new Set();
|
|
202
|
+
// 1. Explicitly configured sitemap
|
|
203
|
+
if (this.options.sitemap && this.runType !== 'single') {
|
|
204
|
+
const explicitUrl = this.options.sitemap === 'true' || this.options.sitemap === true
|
|
205
|
+
? new URL('/sitemap.xml', this.rootOrigin).toString()
|
|
206
|
+
: this.options.sitemap;
|
|
207
|
+
if (typeof explicitUrl === 'string' && explicitUrl.startsWith('http')) {
|
|
208
|
+
sitemapsToFetch.add(explicitUrl);
|
|
176
209
|
}
|
|
177
|
-
|
|
178
|
-
|
|
210
|
+
}
|
|
211
|
+
// 2. Discover sitemaps from robots.txt (unless explicitly disabled)
|
|
212
|
+
// Only auto-fetch on the FIRST real crawl (full/incremental).
|
|
213
|
+
// page --live reuses snapshots and should NOT trigger sitemap fetch.
|
|
214
|
+
const isFirstFullCrawl = this.runType !== 'single' && !this.snapshotRepo?.hasFullCrawl(this.siteId);
|
|
215
|
+
if (this.options.sitemap !== false && (this.options.sitemap || isFirstFullCrawl) && this.robots && this.runType !== 'single') {
|
|
216
|
+
const robotsSitemaps = this.robots.getSitemaps();
|
|
217
|
+
for (const s of robotsSitemaps) {
|
|
218
|
+
if (s)
|
|
219
|
+
sitemapsToFetch.add(s);
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
// Process all discovered sitemaps in background
|
|
223
|
+
if (sitemapsToFetch.size > 0) {
|
|
224
|
+
for (const sitemapUrl of sitemapsToFetch) {
|
|
225
|
+
this.pendingSitemaps++;
|
|
226
|
+
// KICK OFF BACKGROUND TASK (Un-awaited)
|
|
227
|
+
(async () => {
|
|
228
|
+
try {
|
|
229
|
+
this.context.emit({ type: 'debug', message: 'Fetching sitemap in background', context: { url: sitemapUrl } });
|
|
230
|
+
const sitemapUrls = await this.sitemapFetcher.fetch(sitemapUrl);
|
|
231
|
+
if (sitemapUrls.length > 0) {
|
|
232
|
+
this.context.emit({ type: 'debug', message: `Mapping ${sitemapUrls.length} URLs from sitemap... (Background)` });
|
|
233
|
+
const sitemapEntries = sitemapUrls.map(u => {
|
|
234
|
+
const normalized = normalizeUrl(u, this.rootOrigin, this.options);
|
|
235
|
+
if (!normalized)
|
|
236
|
+
return null;
|
|
237
|
+
const path = this.toStorageUrl(normalized);
|
|
238
|
+
return {
|
|
239
|
+
site_id: this.siteId,
|
|
240
|
+
normalized_url: path,
|
|
241
|
+
first_seen_snapshot_id: this.snapshotId,
|
|
242
|
+
last_seen_snapshot_id: this.snapshotId,
|
|
243
|
+
discovered_via_sitemap: 1,
|
|
244
|
+
depth: 0,
|
|
245
|
+
http_status: 0
|
|
246
|
+
};
|
|
247
|
+
}).filter((p) => p !== null);
|
|
248
|
+
// Bulk register to DB
|
|
249
|
+
this.pageRepo.upsertMany(sitemapEntries);
|
|
250
|
+
// Add to queue for Actual Crawling
|
|
251
|
+
for (const entry of sitemapEntries) {
|
|
252
|
+
this.addToQueue(entry.normalized_url, 0, { discovered_via_sitemap: 1 });
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
catch (e) {
|
|
257
|
+
this.context.emit({ type: 'warn', message: 'Sitemap fetch failed', context: { url: sitemapUrl, error: String(e) } });
|
|
258
|
+
}
|
|
259
|
+
finally {
|
|
260
|
+
this.pendingSitemaps--;
|
|
261
|
+
}
|
|
262
|
+
})();
|
|
179
263
|
}
|
|
180
264
|
}
|
|
181
|
-
// Seed from startUrl
|
|
182
|
-
this.addToQueue(this.startUrl, 0);
|
|
183
265
|
}
|
|
184
266
|
bufferPage(url, depth, status, data = {}) {
|
|
185
267
|
const existing = this.pageBuffer.get(url);
|
|
@@ -226,10 +308,27 @@ export class Crawler {
|
|
|
226
308
|
}
|
|
227
309
|
bufferEdge(sourceUrl, targetUrl, weight = 1.0, rel = 'internal') {
|
|
228
310
|
this.edgeBuffer.push({ sourceUrl, targetUrl, weight, rel });
|
|
311
|
+
this.edgesFound += 1;
|
|
312
|
+
this.emitProgress();
|
|
229
313
|
if (this.edgeBuffer.length >= 100) {
|
|
230
314
|
this.flushEdges();
|
|
231
315
|
}
|
|
232
316
|
}
|
|
317
|
+
emitProgress(force = false) {
|
|
318
|
+
const now = Date.now();
|
|
319
|
+
if (!force && now - this.lastProgressEmitAt < 200)
|
|
320
|
+
return;
|
|
321
|
+
this.lastProgressEmitAt = now;
|
|
322
|
+
this.context.emit({
|
|
323
|
+
type: 'crawl:progress',
|
|
324
|
+
pagesCrawled: this.pagesCrawled,
|
|
325
|
+
queued: this.queue.length,
|
|
326
|
+
active: this.active,
|
|
327
|
+
nodesFound: this.uniqueQueue.size,
|
|
328
|
+
edgesFound: this.edgesFound,
|
|
329
|
+
phase: this.progressPhase
|
|
330
|
+
});
|
|
331
|
+
}
|
|
233
332
|
flushEdges() {
|
|
234
333
|
if (this.edgeBuffer.length === 0)
|
|
235
334
|
return;
|
|
@@ -237,6 +336,13 @@ export class Crawler {
|
|
|
237
336
|
this.flushPages();
|
|
238
337
|
const identities = this.pageRepo.getPagesIdentityBySnapshot(this.snapshotId);
|
|
239
338
|
const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
|
|
339
|
+
// When reusing a snapshot, clean up stale edges for pages being re-crawled
|
|
340
|
+
if (this.reusingSnapshot) {
|
|
341
|
+
const sourcePageIds = new Set(this.edgeBuffer.map(e => urlToId.get(e.sourceUrl)).filter((id) => id !== undefined));
|
|
342
|
+
for (const pageId of sourcePageIds) {
|
|
343
|
+
this.edgeRepo.deleteEdgesForPage(this.snapshotId, pageId);
|
|
344
|
+
}
|
|
345
|
+
}
|
|
240
346
|
const edgesToInsert = this.edgeBuffer
|
|
241
347
|
.map(e => ({
|
|
242
348
|
snapshot_id: this.snapshotId,
|
|
@@ -270,18 +376,23 @@ export class Crawler {
|
|
|
270
376
|
return {
|
|
271
377
|
snapshot_id: this.snapshotId,
|
|
272
378
|
page_id: pageId,
|
|
273
|
-
authority_score: null,
|
|
274
|
-
hub_score: null,
|
|
275
|
-
pagerank: null,
|
|
276
|
-
pagerank_score: null,
|
|
277
|
-
link_role: null,
|
|
278
379
|
crawl_status: null,
|
|
279
380
|
word_count: null,
|
|
280
381
|
thin_content_score: null,
|
|
281
382
|
external_link_ratio: null,
|
|
282
|
-
|
|
383
|
+
pagerank_score: null,
|
|
384
|
+
hub_score: null,
|
|
385
|
+
auth_score: null,
|
|
386
|
+
link_role: null,
|
|
283
387
|
duplicate_cluster_id: null,
|
|
284
388
|
duplicate_type: null,
|
|
389
|
+
cluster_id: null,
|
|
390
|
+
soft404_score: null,
|
|
391
|
+
heading_score: null,
|
|
392
|
+
orphan_score: null,
|
|
393
|
+
orphan_type: null,
|
|
394
|
+
impact_level: null,
|
|
395
|
+
heading_data: null,
|
|
285
396
|
is_cluster_primary: 0,
|
|
286
397
|
...item.data
|
|
287
398
|
};
|
|
@@ -322,31 +433,39 @@ export class Crawler {
|
|
|
322
433
|
}
|
|
323
434
|
}
|
|
324
435
|
handleCachedResponse(url, finalUrl, depth, prevNode) {
|
|
325
|
-
|
|
436
|
+
const path = url;
|
|
437
|
+
const finalPath = this.toStorageUrl(finalUrl);
|
|
438
|
+
this.bufferPage(finalPath, depth, prevNode.status, {
|
|
326
439
|
html: prevNode.html,
|
|
327
440
|
canonical_url: prevNode.canonical,
|
|
441
|
+
noindex: prevNode.noindex ? 1 : 0,
|
|
442
|
+
nofollow: prevNode.nofollow ? 1 : 0,
|
|
328
443
|
content_hash: prevNode.contentHash,
|
|
329
444
|
simhash: prevNode.simhash,
|
|
330
445
|
etag: prevNode.etag,
|
|
331
|
-
last_modified: prevNode.lastModified
|
|
332
|
-
noindex: prevNode.noindex ? 1 : 0,
|
|
333
|
-
nofollow: prevNode.nofollow ? 1 : 0
|
|
446
|
+
last_modified: prevNode.lastModified
|
|
334
447
|
});
|
|
335
|
-
this.bufferMetrics(
|
|
336
|
-
crawl_status: 'cached'
|
|
448
|
+
this.bufferMetrics(finalPath, {
|
|
449
|
+
crawl_status: 'cached',
|
|
450
|
+
word_count: prevNode.wordCount,
|
|
451
|
+
thin_content_score: prevNode.thinContentScore,
|
|
452
|
+
external_link_ratio: prevNode.externalLinkRatio
|
|
337
453
|
});
|
|
338
454
|
// Re-discovery links from previous graph to continue crawling if needed
|
|
339
455
|
const prevLinks = this.options.previousGraph?.getEdges()
|
|
340
|
-
.filter(e => e.source ===
|
|
456
|
+
.filter(e => e.source === path)
|
|
341
457
|
.map(e => e.target);
|
|
342
458
|
if (prevLinks) {
|
|
343
459
|
for (const link of prevLinks) {
|
|
344
|
-
const normalizedLink = normalizeUrl(link,
|
|
345
|
-
if (normalizedLink
|
|
346
|
-
this.
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
this.
|
|
460
|
+
const normalizedLink = normalizeUrl(link, this.rootOrigin, this.options);
|
|
461
|
+
if (normalizedLink) {
|
|
462
|
+
const path = this.toStorageUrl(normalizedLink);
|
|
463
|
+
if (path !== url) {
|
|
464
|
+
this.bufferPage(path, depth + 1, 0);
|
|
465
|
+
this.bufferEdge(url, path, 1.0, 'internal');
|
|
466
|
+
if (this.shouldEnqueue(path, depth + 1)) {
|
|
467
|
+
this.addToQueue(path, depth + 1);
|
|
468
|
+
}
|
|
350
469
|
}
|
|
351
470
|
}
|
|
352
471
|
}
|
|
@@ -354,40 +473,53 @@ export class Crawler {
|
|
|
354
473
|
}
|
|
355
474
|
handleRedirects(chain, depth) {
|
|
356
475
|
for (const step of chain) {
|
|
357
|
-
const
|
|
358
|
-
const
|
|
359
|
-
if (
|
|
360
|
-
this.
|
|
361
|
-
this.
|
|
362
|
-
|
|
476
|
+
const sourceAbs = normalizeUrl(step.url, this.rootOrigin, this.options);
|
|
477
|
+
const targetAbs = normalizeUrl(step.target, this.rootOrigin, this.options);
|
|
478
|
+
if (sourceAbs && targetAbs) {
|
|
479
|
+
const sourcePath = this.toStorageUrl(sourceAbs);
|
|
480
|
+
const targetPath = this.toStorageUrl(targetAbs);
|
|
481
|
+
const sourceInternal = UrlUtil.isInternal(sourceAbs, this.rootOrigin);
|
|
482
|
+
const targetInternal = UrlUtil.isInternal(targetAbs, this.rootOrigin);
|
|
483
|
+
this.bufferPage(sourcePath, depth, step.status, { is_internal: sourceInternal ? 1 : 0 });
|
|
484
|
+
this.bufferPage(targetPath, depth, 0, { is_internal: targetInternal ? 1 : 0 });
|
|
485
|
+
this.bufferEdge(sourcePath, targetPath, 1.0, targetInternal ? 'internal' : 'external');
|
|
363
486
|
}
|
|
364
487
|
}
|
|
365
488
|
}
|
|
366
|
-
handleSuccessResponse(res,
|
|
489
|
+
handleSuccessResponse(res, path, absoluteUrl, depth, isBlocked = false) {
|
|
367
490
|
const contentTypeHeader = res.headers['content-type'];
|
|
368
491
|
const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
|
|
369
492
|
if (!contentType || !contentType.toLowerCase().includes('text/html')) {
|
|
370
|
-
this.bufferPage(
|
|
493
|
+
this.bufferPage(path, depth, typeof res.status === 'number' ? res.status : 0);
|
|
371
494
|
return;
|
|
372
495
|
}
|
|
373
|
-
const parseResult = this.parser.parse(res.body,
|
|
374
|
-
this.
|
|
496
|
+
const parseResult = this.parser.parse(res.body, absoluteUrl, res.status);
|
|
497
|
+
if (this.registry) {
|
|
498
|
+
this.registry.runHook('onPageParsed', this.context, {
|
|
499
|
+
url: absoluteUrl,
|
|
500
|
+
status: res.status,
|
|
501
|
+
depth: depth,
|
|
502
|
+
headers: res.headers,
|
|
503
|
+
...parseResult
|
|
504
|
+
});
|
|
505
|
+
}
|
|
506
|
+
this.bufferPage(path, depth, res.status, {
|
|
375
507
|
html: parseResult.html,
|
|
376
508
|
canonical_url: parseResult.canonical || undefined,
|
|
377
509
|
noindex: parseResult.noindex ? 1 : 0,
|
|
378
510
|
nofollow: parseResult.nofollow ? 1 : 0,
|
|
379
511
|
content_hash: parseResult.contentHash,
|
|
380
512
|
simhash: parseResult.simhash,
|
|
381
|
-
soft404_score: parseResult.soft404Score,
|
|
382
513
|
etag: res.etag,
|
|
383
514
|
last_modified: res.lastModified,
|
|
384
|
-
retries: res.retries
|
|
515
|
+
retries: res.retries,
|
|
516
|
+
bytes_received: res.bytesReceived
|
|
385
517
|
});
|
|
386
518
|
try {
|
|
387
519
|
const contentAnalysis = analyzeContent(parseResult.html);
|
|
388
|
-
const linkAnalysis = analyzeLinks(parseResult.html,
|
|
520
|
+
const linkAnalysis = analyzeLinks(parseResult.html, absoluteUrl, this.rootOrigin);
|
|
389
521
|
const thinScore = calculateThinContentScore(contentAnalysis, 0);
|
|
390
|
-
this.bufferMetrics(
|
|
522
|
+
this.bufferMetrics(path, {
|
|
391
523
|
crawl_status: isBlocked ? 'blocked_by_robots' : 'fetched',
|
|
392
524
|
word_count: contentAnalysis.wordCount,
|
|
393
525
|
thin_content_score: thinScore,
|
|
@@ -395,15 +527,19 @@ export class Crawler {
|
|
|
395
527
|
});
|
|
396
528
|
}
|
|
397
529
|
catch (e) {
|
|
398
|
-
this.context.emit({ type: 'error', message: 'Error calculating per-page metrics', error: e, context: { url:
|
|
530
|
+
this.context.emit({ type: 'error', message: 'Error calculating per-page metrics', error: e, context: { url: absoluteUrl } });
|
|
399
531
|
}
|
|
400
532
|
for (const linkItem of parseResult.links) {
|
|
401
|
-
const normalizedLink = normalizeUrl(linkItem.url,
|
|
402
|
-
if (normalizedLink
|
|
403
|
-
this.
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
this.
|
|
533
|
+
const normalizedLink = normalizeUrl(linkItem.url, absoluteUrl, this.options);
|
|
534
|
+
if (normalizedLink) {
|
|
535
|
+
const targetPath = this.toStorageUrl(normalizedLink);
|
|
536
|
+
if (targetPath !== path) {
|
|
537
|
+
const isInternal = UrlUtil.isInternal(normalizedLink, this.rootOrigin);
|
|
538
|
+
this.bufferPage(targetPath, depth + 1, 0, { is_internal: isInternal ? 1 : 0 });
|
|
539
|
+
this.bufferEdge(path, targetPath, 1.0, isInternal ? 'internal' : 'external');
|
|
540
|
+
if (isInternal && this.shouldEnqueue(targetPath, depth + 1)) {
|
|
541
|
+
this.addToQueue(targetPath, depth + 1);
|
|
542
|
+
}
|
|
407
543
|
}
|
|
408
544
|
}
|
|
409
545
|
}
|
|
@@ -414,14 +550,19 @@ export class Crawler {
|
|
|
414
550
|
this.bufferPage(url, depth, 0, { securityError: 'blocked_by_domain_filter' });
|
|
415
551
|
return;
|
|
416
552
|
}
|
|
553
|
+
// Convert stored path to absolute URL for fetching.
|
|
554
|
+
// External/subdomain URLs are already absolute (UrlUtil.toPath returns them as-is).
|
|
555
|
+
const fetchUrl = UrlUtil.toAbsolute(url, this.rootOrigin);
|
|
417
556
|
try {
|
|
418
557
|
const prevNode = this.options.previousGraph?.nodes.get(url);
|
|
419
|
-
const res = await this.fetchPage(
|
|
558
|
+
const res = await this.fetchPage(fetchUrl, depth, prevNode);
|
|
420
559
|
if (!res)
|
|
421
560
|
return;
|
|
422
|
-
const finalUrl = normalizeUrl(res.finalUrl,
|
|
561
|
+
const finalUrl = normalizeUrl(res.finalUrl, this.rootOrigin, this.options);
|
|
423
562
|
if (!finalUrl)
|
|
424
563
|
return;
|
|
564
|
+
const fullUrl = finalUrl; // Already absolute
|
|
565
|
+
const finalPath = this.toStorageUrl(finalUrl);
|
|
425
566
|
if (res.status === 304 && prevNode) {
|
|
426
567
|
this.handleCachedResponse(url, finalUrl, depth, prevNode);
|
|
427
568
|
return;
|
|
@@ -430,17 +571,17 @@ export class Crawler {
|
|
|
430
571
|
const isStringStatus = typeof res.status === 'string';
|
|
431
572
|
if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
|
|
432
573
|
const statusNum = typeof res.status === 'number' ? res.status : 0;
|
|
433
|
-
this.bufferPage(
|
|
574
|
+
this.bufferPage(finalPath, depth, statusNum, {
|
|
434
575
|
security_error: isStringStatus ? res.status : undefined,
|
|
435
576
|
retries: res.retries
|
|
436
577
|
});
|
|
437
|
-
this.bufferMetrics(
|
|
578
|
+
this.bufferMetrics(finalPath, {
|
|
438
579
|
crawl_status: isStringStatus ? res.status : 'fetched_error'
|
|
439
580
|
});
|
|
440
581
|
return;
|
|
441
582
|
}
|
|
442
583
|
if (res.status === 200) {
|
|
443
|
-
this.handleSuccessResponse(res,
|
|
584
|
+
this.handleSuccessResponse(res, finalPath, fullUrl, depth, isBlocked);
|
|
444
585
|
}
|
|
445
586
|
}
|
|
446
587
|
catch (e) {
|
|
@@ -448,17 +589,30 @@ export class Crawler {
|
|
|
448
589
|
}
|
|
449
590
|
}
|
|
450
591
|
async run() {
|
|
451
|
-
|
|
592
|
+
// 1. Setup fetcher and basic modules
|
|
452
593
|
this.setupModules();
|
|
453
|
-
|
|
594
|
+
// 2. Initialize repositories, resolve URL (SSL/WWW), and set up site context
|
|
595
|
+
await this.initialize();
|
|
596
|
+
if (this.options.robots) {
|
|
597
|
+
this.robots = this.options.robots;
|
|
598
|
+
}
|
|
599
|
+
else {
|
|
600
|
+
await this.fetchRobots();
|
|
601
|
+
}
|
|
454
602
|
await this.seedQueue();
|
|
455
603
|
return new Promise((resolve) => {
|
|
456
604
|
const checkDone = async () => {
|
|
457
|
-
if (this.queue.length === 0 && this.active === 0) {
|
|
605
|
+
if (this.queue.length === 0 && this.active === 0 && this.pendingSitemaps === 0) {
|
|
606
|
+
this.progressPhase = 'finalizing';
|
|
607
|
+
this.emitProgress(true);
|
|
458
608
|
await this.flushAll();
|
|
459
609
|
this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
|
|
460
610
|
limit_reached: this.reachedLimit ? 1 : 0
|
|
461
611
|
});
|
|
612
|
+
this.snapshotRepo.pruneSnapshots(this.siteId, DEFAULTS.MAX_SNAPSHOTS, DEFAULTS.MAX_SINGLE_SNAPSHOTS, this.snapshotId);
|
|
613
|
+
if (this.reusingSnapshot) {
|
|
614
|
+
this.snapshotRepo.touchSnapshot(this.snapshotId);
|
|
615
|
+
}
|
|
462
616
|
resolve(this.snapshotId);
|
|
463
617
|
return true;
|
|
464
618
|
}
|
|
@@ -469,12 +623,20 @@ export class Crawler {
|
|
|
469
623
|
return;
|
|
470
624
|
if (this.pagesCrawled >= this.options.limit) {
|
|
471
625
|
this.reachedLimit = true;
|
|
626
|
+
this.progressPhase = 'limit reached';
|
|
627
|
+
this.emitProgress();
|
|
472
628
|
if (this.active === 0) {
|
|
629
|
+
this.context.emit({ type: 'crawl:limit-reached', limit: this.options.limit });
|
|
630
|
+
this.progressPhase = 'finalizing';
|
|
631
|
+
this.emitProgress(true);
|
|
473
632
|
await this.flushAll();
|
|
474
633
|
this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
|
|
475
634
|
limit_reached: 1
|
|
476
635
|
});
|
|
477
|
-
this.
|
|
636
|
+
this.snapshotRepo.pruneSnapshots(this.siteId, DEFAULTS.MAX_SNAPSHOTS, DEFAULTS.MAX_SINGLE_SNAPSHOTS, this.snapshotId);
|
|
637
|
+
if (this.reusingSnapshot) {
|
|
638
|
+
this.snapshotRepo.touchSnapshot(this.snapshotId);
|
|
639
|
+
}
|
|
478
640
|
resolve(this.snapshotId);
|
|
479
641
|
}
|
|
480
642
|
return;
|
|
@@ -483,10 +645,11 @@ export class Crawler {
|
|
|
483
645
|
const item = this.queue.shift();
|
|
484
646
|
if (this.visited.has(item.url))
|
|
485
647
|
continue;
|
|
486
|
-
// Robust robots check:
|
|
487
|
-
//
|
|
488
|
-
const
|
|
489
|
-
|
|
648
|
+
// Robust robots check: reconstruct absolute URL since robots-parser needs full URLs,
|
|
649
|
+
// not root-relative paths. Also check /path/ variant in case robots.txt uses trailing slash.
|
|
650
|
+
const absUrlForRobots = UrlUtil.toAbsolute(item.url, this.rootOrigin);
|
|
651
|
+
const isBlocked = this.robots && (!this.robots.isAllowed(absUrlForRobots, 'crawlith') ||
|
|
652
|
+
(!absUrlForRobots.endsWith('/') && !this.robots.isAllowed(absUrlForRobots + '/', 'crawlith')));
|
|
490
653
|
if (isBlocked) {
|
|
491
654
|
if (this.options.debug) {
|
|
492
655
|
console.log(`${chalk.yellow('⊘ Robots')} ${chalk.gray(item.url)}`);
|
|
@@ -507,9 +670,11 @@ export class Crawler {
|
|
|
507
670
|
this.visited.add(item.url);
|
|
508
671
|
this.limitConcurrency(() => this.processPage(item, isBlocked)).finally(() => {
|
|
509
672
|
this.active--;
|
|
673
|
+
this.emitProgress();
|
|
510
674
|
next();
|
|
511
675
|
});
|
|
512
676
|
}
|
|
677
|
+
this.emitProgress();
|
|
513
678
|
await checkDone();
|
|
514
679
|
};
|
|
515
680
|
next();
|
package/dist/crawler/fetcher.js
CHANGED
|
@@ -6,16 +6,16 @@ import { RetryPolicy } from '../core/network/retryPolicy.js';
|
|
|
6
6
|
import { ResponseLimiter } from '../core/network/responseLimiter.js';
|
|
7
7
|
import { RedirectController } from '../core/network/redirectController.js';
|
|
8
8
|
import { ProxyAdapter } from '../core/network/proxyAdapter.js';
|
|
9
|
-
import {
|
|
9
|
+
import { DEFAULTS } from '../constants.js';
|
|
10
10
|
export class Fetcher {
|
|
11
|
-
userAgent =
|
|
11
|
+
userAgent = DEFAULTS.USER_AGENT;
|
|
12
12
|
rateLimiter;
|
|
13
13
|
proxyAdapter;
|
|
14
14
|
secureDispatcher;
|
|
15
15
|
scopeManager;
|
|
16
16
|
maxRedirects;
|
|
17
17
|
constructor(options = {}) {
|
|
18
|
-
this.rateLimiter = new RateLimiter(options.rate ||
|
|
18
|
+
this.rateLimiter = new RateLimiter(options.rate || DEFAULTS.RATE_LIMIT);
|
|
19
19
|
this.proxyAdapter = new ProxyAdapter(options.proxyUrl);
|
|
20
20
|
if (this.proxyAdapter.dispatcher) {
|
|
21
21
|
this.secureDispatcher = this.proxyAdapter.dispatcher;
|
|
@@ -24,11 +24,11 @@ export class Fetcher {
|
|
|
24
24
|
this.secureDispatcher = IPGuard.getSecureDispatcher();
|
|
25
25
|
}
|
|
26
26
|
this.scopeManager = options.scopeManager;
|
|
27
|
-
this.maxRedirects = Math.min(options.maxRedirects ??
|
|
28
|
-
this.userAgent = options.userAgent ||
|
|
27
|
+
this.maxRedirects = Math.min(options.maxRedirects ?? DEFAULTS.MAX_REDIRECTS, DEFAULTS.MAX_REDIRECTS_LIMIT);
|
|
28
|
+
this.userAgent = options.userAgent || DEFAULTS.USER_AGENT;
|
|
29
29
|
}
|
|
30
30
|
async fetch(url, options = {}) {
|
|
31
|
-
const maxBytes = options.maxBytes ||
|
|
31
|
+
const maxBytes = options.maxBytes || DEFAULTS.MAX_BYTES;
|
|
32
32
|
const redirectChain = [];
|
|
33
33
|
const redirectController = new RedirectController(this.maxRedirects, url);
|
|
34
34
|
let currentUrl = url;
|