@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -0,0 +1,683 @@
|
|
|
1
|
+
import chalk from '../utils/chalk.js';
|
|
2
|
+
import pLimit from 'p-limit';
|
|
3
|
+
import robotsParser from 'robots-parser';
|
|
4
|
+
import { Fetcher } from './fetcher.js';
|
|
5
|
+
import { Parser } from './parser.js';
|
|
6
|
+
import { Sitemap } from './sitemap.js';
|
|
7
|
+
import { normalizeUrl, UrlUtil } from './normalize.js';
|
|
8
|
+
import { UrlResolver } from './resolver.js';
|
|
9
|
+
import { ScopeManager } from '../core/scope/scopeManager.js';
|
|
10
|
+
import { getDb } from '../db/index.js';
|
|
11
|
+
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
12
|
+
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
13
|
+
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
14
|
+
import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
|
|
15
|
+
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
16
|
+
import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
|
|
17
|
+
import { analyzeLinks } from '../analysis/links.js';
|
|
18
|
+
import { DEFAULTS } from '../constants.js';
|
|
19
|
+
// Default event context used when the caller does not supply one.
// Only 'error' and 'warn' events are surfaced (on the console); every other
// event type is silently dropped.
const nullContext = {
    emit(event) {
        switch (event.type) {
            case 'error':
                console.error(event.message, event.error || '');
                break;
            case 'warn':
                console.warn(event.message);
                break;
            default:
                // No listener attached: nothing to do for other event types.
                break;
        }
    }
};
|
|
32
|
+
export class Crawler {
|
|
33
|
+
startUrl;
|
|
34
|
+
options;
|
|
35
|
+
context;
|
|
36
|
+
registry;
|
|
37
|
+
visited;
|
|
38
|
+
uniqueQueue;
|
|
39
|
+
queue;
|
|
40
|
+
active;
|
|
41
|
+
pagesCrawled;
|
|
42
|
+
reachedLimit;
|
|
43
|
+
maxDepthInCrawl;
|
|
44
|
+
concurrency;
|
|
45
|
+
limitConcurrency;
|
|
46
|
+
// Repositories
|
|
47
|
+
siteRepo = null;
|
|
48
|
+
snapshotRepo = null;
|
|
49
|
+
pageRepo = null;
|
|
50
|
+
edgeRepo = null;
|
|
51
|
+
metricsRepo = null;
|
|
52
|
+
// Site/Snapshot info
|
|
53
|
+
siteId = null;
|
|
54
|
+
snapshotId = null;
|
|
55
|
+
reusingSnapshot = false;
|
|
56
|
+
runType = 'completed';
|
|
57
|
+
rootOrigin = '';
|
|
58
|
+
// Discovery tracking
|
|
59
|
+
discoveryDepths = new Map();
|
|
60
|
+
// Buffers for batch operations
|
|
61
|
+
pageBuffer = new Map();
|
|
62
|
+
edgeBuffer = [];
|
|
63
|
+
metricsBuffer = [];
|
|
64
|
+
pendingSitemaps = 0;
|
|
65
|
+
edgesFound = 0;
|
|
66
|
+
lastProgressEmitAt = 0;
|
|
67
|
+
progressPhase = 'crawling';
|
|
68
|
+
// Modules
|
|
69
|
+
scopeManager = null;
|
|
70
|
+
fetcher = null;
|
|
71
|
+
parser = null;
|
|
72
|
+
sitemapFetcher = null;
|
|
73
|
+
robots = null;
|
|
74
|
+
constructor(startUrl, options, context) {
|
|
75
|
+
this.startUrl = startUrl;
|
|
76
|
+
this.options = options;
|
|
77
|
+
this.context = context || nullContext;
|
|
78
|
+
this.registry = options.registry;
|
|
79
|
+
this.visited = new Set();
|
|
80
|
+
this.uniqueQueue = new Set();
|
|
81
|
+
this.queue = [];
|
|
82
|
+
this.active = 0;
|
|
83
|
+
this.pagesCrawled = 0;
|
|
84
|
+
this.reachedLimit = false;
|
|
85
|
+
this.maxDepthInCrawl = Math.min(options.depth || DEFAULTS.MAX_DEPTH, DEFAULTS.MAX_DEPTH_LIMIT);
|
|
86
|
+
this.concurrency = Math.min(options.concurrency || DEFAULTS.CONCURRENCY, DEFAULTS.CONCURRENCY_LIMIT);
|
|
87
|
+
this.limitConcurrency = pLimit(this.concurrency);
|
|
88
|
+
}
|
|
89
|
+
toStorageUrl(url) {
|
|
90
|
+
return UrlUtil.isInternal(url, this.rootOrigin) ? UrlUtil.toPath(url, this.rootOrigin) : url;
|
|
91
|
+
}
|
|
92
|
+
async initialize() {
|
|
93
|
+
const db = getDb();
|
|
94
|
+
this.siteRepo = new SiteRepository(db);
|
|
95
|
+
this.snapshotRepo = new SnapshotRepository(db);
|
|
96
|
+
this.pageRepo = new PageRepository(db);
|
|
97
|
+
this.edgeRepo = new EdgeRepository(db);
|
|
98
|
+
this.metricsRepo = new MetricsRepository(db);
|
|
99
|
+
// Use resolver to find canonical origin and SSL
|
|
100
|
+
const resolver = new UrlResolver();
|
|
101
|
+
const tempFetcher = new Fetcher({ userAgent: this.options.userAgent, rate: this.options.rate });
|
|
102
|
+
const resolved = await resolver.resolve(this.startUrl, tempFetcher);
|
|
103
|
+
this.rootOrigin = resolved.url;
|
|
104
|
+
// Use the resolved absolute URL as the base — NOT this.startUrl which may be
|
|
105
|
+
// a bare domain (e.g. 'callforpaper.org') that would be treated as a relative
|
|
106
|
+
// path when passed to normalizeUrl, producing '/callforpaper.org'.
|
|
107
|
+
const rootUrl = normalizeUrl(this.rootOrigin, '', { stripQuery: this.options.stripQuery });
|
|
108
|
+
if (!rootUrl)
|
|
109
|
+
throw new Error('Invalid start URL');
|
|
110
|
+
const urlObj = new URL(this.rootOrigin);
|
|
111
|
+
const domain = urlObj.hostname.replace('www.', '');
|
|
112
|
+
const site = this.siteRepo.firstOrCreateSite(domain);
|
|
113
|
+
this.siteId = site.id;
|
|
114
|
+
// Persist the resolved preferred URL and SSL status
|
|
115
|
+
this.siteRepo.updateSitePreference(this.siteId, {
|
|
116
|
+
preferred_url: this.rootOrigin,
|
|
117
|
+
ssl: this.rootOrigin.startsWith('https') ? 1 : 0
|
|
118
|
+
});
|
|
119
|
+
this.rootOrigin = urlObj.origin;
|
|
120
|
+
// Keep storage path-first for internal URLs and reconcile any legacy absolute rows.
|
|
121
|
+
this.pageRepo.reconcileInternalUrls(this.siteId, this.rootOrigin);
|
|
122
|
+
this.startUrl = this.toStorageUrl(rootUrl);
|
|
123
|
+
// Now that rootOrigin is resolved, initialize ScopeManager with the correct absolute origin
|
|
124
|
+
this.scopeManager = new ScopeManager({
|
|
125
|
+
allowedDomains: this.options.allowedDomains || [],
|
|
126
|
+
deniedDomains: this.options.deniedDomains || [],
|
|
127
|
+
includeSubdomains: this.options.includeSubdomains || false,
|
|
128
|
+
rootUrl: this.rootOrigin
|
|
129
|
+
});
|
|
130
|
+
// Update fetcher with the now-initialized scopeManager
|
|
131
|
+
if (this.fetcher) {
|
|
132
|
+
this.fetcher.scopeManager = this.scopeManager;
|
|
133
|
+
}
|
|
134
|
+
// Every scan now creates a new snapshot (no reuse)
|
|
135
|
+
const runType = this.options.snapshotRunType || (this.options.previousGraph ? 'incremental' : 'completed');
|
|
136
|
+
this.snapshotId = this.snapshotRepo.createSnapshot(this.siteId, runType);
|
|
137
|
+
this.runType = runType;
|
|
138
|
+
// Expose snapshot context for plugins that persist per-snapshot data.
|
|
139
|
+
this.context.snapshotId = this.snapshotId;
|
|
140
|
+
// Seed discovery depth for root
|
|
141
|
+
this.discoveryDepths.set(this.startUrl, 0);
|
|
142
|
+
}
|
|
143
|
+
setupModules() {
|
|
144
|
+
this.fetcher = new Fetcher({
|
|
145
|
+
rate: this.options.rate,
|
|
146
|
+
proxyUrl: this.options.proxyUrl,
|
|
147
|
+
scopeManager: this.scopeManager ?? undefined,
|
|
148
|
+
maxRedirects: this.options.maxRedirects,
|
|
149
|
+
userAgent: this.options.userAgent
|
|
150
|
+
});
|
|
151
|
+
this.parser = new Parser();
|
|
152
|
+
this.sitemapFetcher = new Sitemap(this.context, this.fetcher);
|
|
153
|
+
}
|
|
154
|
+
async fetchRobots() {
|
|
155
|
+
const robotsUrl = new URL('/robots.txt', this.rootOrigin).toString();
|
|
156
|
+
try {
|
|
157
|
+
const res = await this.fetcher.fetch(robotsUrl, { maxBytes: 500000 });
|
|
158
|
+
if (res && typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
|
|
159
|
+
this.robots = robotsParser(robotsUrl, res.body);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
catch {
|
|
163
|
+
// Suppressed expected network warnings when robots block
|
|
164
|
+
console.warn('Failed to fetch robots.txt, proceeding...');
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
shouldEnqueue(url, depth) {
|
|
168
|
+
if (this.visited.has(url))
|
|
169
|
+
return false;
|
|
170
|
+
if (this.uniqueQueue.has(url))
|
|
171
|
+
return false;
|
|
172
|
+
if (depth > this.maxDepthInCrawl)
|
|
173
|
+
return false;
|
|
174
|
+
if (this.scopeManager.isUrlEligible(url) !== 'allowed')
|
|
175
|
+
return false;
|
|
176
|
+
if (this.registry) {
|
|
177
|
+
const allowed = this.registry.runSyncBailHook('shouldEnqueueUrl', this.context, url, depth);
|
|
178
|
+
if (allowed === false)
|
|
179
|
+
return false;
|
|
180
|
+
}
|
|
181
|
+
return true;
|
|
182
|
+
}
|
|
183
|
+
addToQueue(u, d, data = {}) {
|
|
184
|
+
if (this.scopeManager.isUrlEligible(u) !== 'allowed')
|
|
185
|
+
return;
|
|
186
|
+
if (!this.uniqueQueue.has(u)) {
|
|
187
|
+
this.uniqueQueue.add(u);
|
|
188
|
+
this.queue.push({ url: u, depth: d });
|
|
189
|
+
this.context.emit({ type: 'queue:enqueue', url: u, depth: d });
|
|
190
|
+
this.emitProgress();
|
|
191
|
+
this.bufferPage(u, d, 0, data);
|
|
192
|
+
const currentDiscovery = this.discoveryDepths.get(u);
|
|
193
|
+
if (currentDiscovery === undefined || d < currentDiscovery) {
|
|
194
|
+
this.discoveryDepths.set(u, d);
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
async seedQueue() {
|
|
199
|
+
// Seed from startUrl first to ensure it's prioritized in the queue
|
|
200
|
+
this.addToQueue(this.startUrl, 0);
|
|
201
|
+
const sitemapsToFetch = new Set();
|
|
202
|
+
// 1. Explicitly configured sitemap
|
|
203
|
+
if (this.options.sitemap && this.runType !== 'single') {
|
|
204
|
+
const explicitUrl = this.options.sitemap === 'true' || this.options.sitemap === true
|
|
205
|
+
? new URL('/sitemap.xml', this.rootOrigin).toString()
|
|
206
|
+
: this.options.sitemap;
|
|
207
|
+
if (typeof explicitUrl === 'string' && explicitUrl.startsWith('http')) {
|
|
208
|
+
sitemapsToFetch.add(explicitUrl);
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
// 2. Discover sitemaps from robots.txt (unless explicitly disabled)
|
|
212
|
+
// Only auto-fetch on the FIRST real crawl (full/incremental).
|
|
213
|
+
// page --live reuses snapshots and should NOT trigger sitemap fetch.
|
|
214
|
+
const isFirstFullCrawl = this.runType !== 'single' && !this.snapshotRepo?.hasFullCrawl(this.siteId);
|
|
215
|
+
if (this.options.sitemap !== false && (this.options.sitemap || isFirstFullCrawl) && this.robots && this.runType !== 'single') {
|
|
216
|
+
const robotsSitemaps = this.robots.getSitemaps();
|
|
217
|
+
for (const s of robotsSitemaps) {
|
|
218
|
+
if (s)
|
|
219
|
+
sitemapsToFetch.add(s);
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
// Process all discovered sitemaps in background
|
|
223
|
+
if (sitemapsToFetch.size > 0) {
|
|
224
|
+
for (const sitemapUrl of sitemapsToFetch) {
|
|
225
|
+
this.pendingSitemaps++;
|
|
226
|
+
// KICK OFF BACKGROUND TASK (Un-awaited)
|
|
227
|
+
(async () => {
|
|
228
|
+
try {
|
|
229
|
+
this.context.emit({ type: 'debug', message: 'Fetching sitemap in background', context: { url: sitemapUrl } });
|
|
230
|
+
const sitemapUrls = await this.sitemapFetcher.fetch(sitemapUrl);
|
|
231
|
+
if (sitemapUrls.length > 0) {
|
|
232
|
+
this.context.emit({ type: 'debug', message: `Mapping ${sitemapUrls.length} URLs from sitemap... (Background)` });
|
|
233
|
+
const sitemapEntries = sitemapUrls.map(u => {
|
|
234
|
+
const normalized = normalizeUrl(u, this.rootOrigin, this.options);
|
|
235
|
+
if (!normalized)
|
|
236
|
+
return null;
|
|
237
|
+
const path = this.toStorageUrl(normalized);
|
|
238
|
+
return {
|
|
239
|
+
site_id: this.siteId,
|
|
240
|
+
normalized_url: path,
|
|
241
|
+
first_seen_snapshot_id: this.snapshotId,
|
|
242
|
+
last_seen_snapshot_id: this.snapshotId,
|
|
243
|
+
discovered_via_sitemap: 1,
|
|
244
|
+
depth: 0,
|
|
245
|
+
http_status: 0
|
|
246
|
+
};
|
|
247
|
+
}).filter((p) => p !== null);
|
|
248
|
+
// Bulk register to DB
|
|
249
|
+
this.pageRepo.upsertMany(sitemapEntries);
|
|
250
|
+
// Add to queue for Actual Crawling
|
|
251
|
+
for (const entry of sitemapEntries) {
|
|
252
|
+
this.addToQueue(entry.normalized_url, 0, { discovered_via_sitemap: 1 });
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
catch (e) {
|
|
257
|
+
this.context.emit({ type: 'warn', message: 'Sitemap fetch failed', context: { url: sitemapUrl, error: String(e) } });
|
|
258
|
+
}
|
|
259
|
+
finally {
|
|
260
|
+
this.pendingSitemaps--;
|
|
261
|
+
}
|
|
262
|
+
})();
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
bufferPage(url, depth, status, data = {}) {
|
|
267
|
+
const existing = this.pageBuffer.get(url);
|
|
268
|
+
const knownDiscovery = this.discoveryDepths.get(url);
|
|
269
|
+
// Always use the best (minimum) depth discovered for this URL
|
|
270
|
+
const finalDepth = knownDiscovery !== undefined ? Math.min(knownDiscovery, depth) : depth;
|
|
271
|
+
if (knownDiscovery === undefined || depth < knownDiscovery) {
|
|
272
|
+
this.discoveryDepths.set(url, depth);
|
|
273
|
+
}
|
|
274
|
+
// If we already have a buffered record, only update if the new one is more "complete" (has status)
|
|
275
|
+
// or if the depth is better.
|
|
276
|
+
if (existing) {
|
|
277
|
+
const isStatusUpdate = status !== 0 && existing.http_status === 0;
|
|
278
|
+
const isBetterDepth = finalDepth < existing.depth;
|
|
279
|
+
if (!isStatusUpdate && !isBetterDepth && Object.keys(data).length === 0) {
|
|
280
|
+
return;
|
|
281
|
+
}
|
|
282
|
+
this.pageBuffer.set(url, {
|
|
283
|
+
...existing,
|
|
284
|
+
depth: finalDepth,
|
|
285
|
+
http_status: status !== 0 ? status : existing.http_status,
|
|
286
|
+
...data
|
|
287
|
+
});
|
|
288
|
+
}
|
|
289
|
+
else {
|
|
290
|
+
this.pageBuffer.set(url, {
|
|
291
|
+
site_id: this.siteId,
|
|
292
|
+
normalized_url: url,
|
|
293
|
+
depth: finalDepth,
|
|
294
|
+
http_status: status,
|
|
295
|
+
last_seen_snapshot_id: this.snapshotId,
|
|
296
|
+
...data
|
|
297
|
+
});
|
|
298
|
+
}
|
|
299
|
+
if (this.pageBuffer.size >= 50) {
|
|
300
|
+
this.flushPages();
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
flushPages() {
|
|
304
|
+
if (this.pageBuffer.size === 0)
|
|
305
|
+
return;
|
|
306
|
+
this.pageRepo.upsertMany(Array.from(this.pageBuffer.values()));
|
|
307
|
+
this.pageBuffer.clear();
|
|
308
|
+
}
|
|
309
|
+
bufferEdge(sourceUrl, targetUrl, weight = 1.0, rel = 'internal') {
|
|
310
|
+
this.edgeBuffer.push({ sourceUrl, targetUrl, weight, rel });
|
|
311
|
+
this.edgesFound += 1;
|
|
312
|
+
this.emitProgress();
|
|
313
|
+
if (this.edgeBuffer.length >= 100) {
|
|
314
|
+
this.flushEdges();
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
emitProgress(force = false) {
|
|
318
|
+
const now = Date.now();
|
|
319
|
+
if (!force && now - this.lastProgressEmitAt < 200)
|
|
320
|
+
return;
|
|
321
|
+
this.lastProgressEmitAt = now;
|
|
322
|
+
this.context.emit({
|
|
323
|
+
type: 'crawl:progress',
|
|
324
|
+
pagesCrawled: this.pagesCrawled,
|
|
325
|
+
queued: this.queue.length,
|
|
326
|
+
active: this.active,
|
|
327
|
+
nodesFound: this.uniqueQueue.size,
|
|
328
|
+
edgesFound: this.edgesFound,
|
|
329
|
+
phase: this.progressPhase
|
|
330
|
+
});
|
|
331
|
+
}
|
|
332
|
+
flushEdges() {
|
|
333
|
+
if (this.edgeBuffer.length === 0)
|
|
334
|
+
return;
|
|
335
|
+
// To resolve URLs to IDs, we need to make sure pages are flushed first
|
|
336
|
+
this.flushPages();
|
|
337
|
+
const identities = this.pageRepo.getPagesIdentityBySnapshot(this.snapshotId);
|
|
338
|
+
const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
|
|
339
|
+
// When reusing a snapshot, clean up stale edges for pages being re-crawled
|
|
340
|
+
if (this.reusingSnapshot) {
|
|
341
|
+
const sourcePageIds = new Set(this.edgeBuffer.map(e => urlToId.get(e.sourceUrl)).filter((id) => id !== undefined));
|
|
342
|
+
for (const pageId of sourcePageIds) {
|
|
343
|
+
this.edgeRepo.deleteEdgesForPage(this.snapshotId, pageId);
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
const edgesToInsert = this.edgeBuffer
|
|
347
|
+
.map(e => ({
|
|
348
|
+
snapshot_id: this.snapshotId,
|
|
349
|
+
source_page_id: urlToId.get(e.sourceUrl),
|
|
350
|
+
target_page_id: urlToId.get(e.targetUrl),
|
|
351
|
+
weight: e.weight,
|
|
352
|
+
rel: e.rel
|
|
353
|
+
}))
|
|
354
|
+
.filter(e => e.source_page_id !== undefined && e.target_page_id !== undefined);
|
|
355
|
+
if (edgesToInsert.length > 0) {
|
|
356
|
+
this.edgeRepo.insertEdges(edgesToInsert);
|
|
357
|
+
}
|
|
358
|
+
this.edgeBuffer = [];
|
|
359
|
+
}
|
|
360
|
+
bufferMetrics(url, data) {
|
|
361
|
+
this.metricsBuffer.push({ url, data });
|
|
362
|
+
if (this.metricsBuffer.length >= 50) {
|
|
363
|
+
this.flushMetrics();
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
flushMetrics() {
|
|
367
|
+
if (this.metricsBuffer.length === 0)
|
|
368
|
+
return;
|
|
369
|
+
this.flushPages();
|
|
370
|
+
const identities = this.pageRepo.getPagesIdentityBySnapshot(this.snapshotId);
|
|
371
|
+
const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
|
|
372
|
+
const metricsList = this.metricsBuffer.map(item => {
|
|
373
|
+
const pageId = urlToId.get(item.url);
|
|
374
|
+
if (!pageId)
|
|
375
|
+
return null;
|
|
376
|
+
return {
|
|
377
|
+
snapshot_id: this.snapshotId,
|
|
378
|
+
page_id: pageId,
|
|
379
|
+
crawl_status: null,
|
|
380
|
+
word_count: null,
|
|
381
|
+
thin_content_score: null,
|
|
382
|
+
external_link_ratio: null,
|
|
383
|
+
pagerank_score: null,
|
|
384
|
+
hub_score: null,
|
|
385
|
+
auth_score: null,
|
|
386
|
+
link_role: null,
|
|
387
|
+
duplicate_cluster_id: null,
|
|
388
|
+
duplicate_type: null,
|
|
389
|
+
cluster_id: null,
|
|
390
|
+
soft404_score: null,
|
|
391
|
+
heading_score: null,
|
|
392
|
+
orphan_score: null,
|
|
393
|
+
orphan_type: null,
|
|
394
|
+
impact_level: null,
|
|
395
|
+
heading_data: null,
|
|
396
|
+
is_cluster_primary: 0,
|
|
397
|
+
...item.data
|
|
398
|
+
};
|
|
399
|
+
}).filter(m => m !== null);
|
|
400
|
+
if (metricsList.length > 0) {
|
|
401
|
+
this.metricsRepo.insertMany(metricsList);
|
|
402
|
+
}
|
|
403
|
+
this.metricsBuffer = [];
|
|
404
|
+
}
|
|
405
|
+
async flushAll() {
|
|
406
|
+
this.flushPages();
|
|
407
|
+
this.flushEdges();
|
|
408
|
+
this.flushMetrics();
|
|
409
|
+
}
|
|
410
|
+
async fetchPage(url, depth, prevNode) {
|
|
411
|
+
const startTime = Date.now();
|
|
412
|
+
try {
|
|
413
|
+
this.context.emit({ type: 'crawl:start', url });
|
|
414
|
+
const res = await this.fetcher.fetch(url, {
|
|
415
|
+
maxBytes: this.options.maxBytes,
|
|
416
|
+
crawlDelay: this.robots ? this.robots.getCrawlDelay('crawlith') : undefined,
|
|
417
|
+
etag: prevNode?.etag,
|
|
418
|
+
lastModified: prevNode?.lastModified
|
|
419
|
+
});
|
|
420
|
+
const durationMs = Date.now() - startTime;
|
|
421
|
+
this.context.emit({
|
|
422
|
+
type: 'crawl:success',
|
|
423
|
+
url,
|
|
424
|
+
status: typeof res.status === 'number' ? res.status : 0,
|
|
425
|
+
durationMs,
|
|
426
|
+
depth
|
|
427
|
+
});
|
|
428
|
+
return res;
|
|
429
|
+
}
|
|
430
|
+
catch (e) {
|
|
431
|
+
this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
|
|
432
|
+
return null;
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
handleCachedResponse(url, finalUrl, depth, prevNode) {
|
|
436
|
+
const path = url;
|
|
437
|
+
const finalPath = this.toStorageUrl(finalUrl);
|
|
438
|
+
this.bufferPage(finalPath, depth, prevNode.status, {
|
|
439
|
+
html: prevNode.html,
|
|
440
|
+
canonical_url: prevNode.canonical,
|
|
441
|
+
noindex: prevNode.noindex ? 1 : 0,
|
|
442
|
+
nofollow: prevNode.nofollow ? 1 : 0,
|
|
443
|
+
content_hash: prevNode.contentHash,
|
|
444
|
+
simhash: prevNode.simhash,
|
|
445
|
+
etag: prevNode.etag,
|
|
446
|
+
last_modified: prevNode.lastModified
|
|
447
|
+
});
|
|
448
|
+
this.bufferMetrics(finalPath, {
|
|
449
|
+
crawl_status: 'cached',
|
|
450
|
+
word_count: prevNode.wordCount,
|
|
451
|
+
thin_content_score: prevNode.thinContentScore,
|
|
452
|
+
external_link_ratio: prevNode.externalLinkRatio
|
|
453
|
+
});
|
|
454
|
+
// Re-discovery links from previous graph to continue crawling if needed
|
|
455
|
+
const prevLinks = this.options.previousGraph?.getEdges()
|
|
456
|
+
.filter(e => e.source === path)
|
|
457
|
+
.map(e => e.target);
|
|
458
|
+
if (prevLinks) {
|
|
459
|
+
for (const link of prevLinks) {
|
|
460
|
+
const normalizedLink = normalizeUrl(link, this.rootOrigin, this.options);
|
|
461
|
+
if (normalizedLink) {
|
|
462
|
+
const path = this.toStorageUrl(normalizedLink);
|
|
463
|
+
if (path !== url) {
|
|
464
|
+
this.bufferPage(path, depth + 1, 0);
|
|
465
|
+
this.bufferEdge(url, path, 1.0, 'internal');
|
|
466
|
+
if (this.shouldEnqueue(path, depth + 1)) {
|
|
467
|
+
this.addToQueue(path, depth + 1);
|
|
468
|
+
}
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
}
|
|
474
|
+
handleRedirects(chain, depth) {
|
|
475
|
+
for (const step of chain) {
|
|
476
|
+
const sourceAbs = normalizeUrl(step.url, this.rootOrigin, this.options);
|
|
477
|
+
const targetAbs = normalizeUrl(step.target, this.rootOrigin, this.options);
|
|
478
|
+
if (sourceAbs && targetAbs) {
|
|
479
|
+
const sourcePath = this.toStorageUrl(sourceAbs);
|
|
480
|
+
const targetPath = this.toStorageUrl(targetAbs);
|
|
481
|
+
const sourceInternal = UrlUtil.isInternal(sourceAbs, this.rootOrigin);
|
|
482
|
+
const targetInternal = UrlUtil.isInternal(targetAbs, this.rootOrigin);
|
|
483
|
+
this.bufferPage(sourcePath, depth, step.status, { is_internal: sourceInternal ? 1 : 0 });
|
|
484
|
+
this.bufferPage(targetPath, depth, 0, { is_internal: targetInternal ? 1 : 0 });
|
|
485
|
+
this.bufferEdge(sourcePath, targetPath, 1.0, targetInternal ? 'internal' : 'external');
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
handleSuccessResponse(res, path, absoluteUrl, depth, isBlocked = false) {
|
|
490
|
+
const contentTypeHeader = res.headers['content-type'];
|
|
491
|
+
const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
|
|
492
|
+
if (!contentType || !contentType.toLowerCase().includes('text/html')) {
|
|
493
|
+
this.bufferPage(path, depth, typeof res.status === 'number' ? res.status : 0);
|
|
494
|
+
return;
|
|
495
|
+
}
|
|
496
|
+
const parseResult = this.parser.parse(res.body, absoluteUrl, res.status);
|
|
497
|
+
if (this.registry) {
|
|
498
|
+
this.registry.runHook('onPageParsed', this.context, {
|
|
499
|
+
url: absoluteUrl,
|
|
500
|
+
status: res.status,
|
|
501
|
+
depth: depth,
|
|
502
|
+
headers: res.headers,
|
|
503
|
+
...parseResult
|
|
504
|
+
});
|
|
505
|
+
}
|
|
506
|
+
this.bufferPage(path, depth, res.status, {
|
|
507
|
+
html: parseResult.html,
|
|
508
|
+
canonical_url: parseResult.canonical || undefined,
|
|
509
|
+
noindex: parseResult.noindex ? 1 : 0,
|
|
510
|
+
nofollow: parseResult.nofollow ? 1 : 0,
|
|
511
|
+
content_hash: parseResult.contentHash,
|
|
512
|
+
simhash: parseResult.simhash,
|
|
513
|
+
etag: res.etag,
|
|
514
|
+
last_modified: res.lastModified,
|
|
515
|
+
retries: res.retries,
|
|
516
|
+
bytes_received: res.bytesReceived
|
|
517
|
+
});
|
|
518
|
+
try {
|
|
519
|
+
const contentAnalysis = analyzeContent(parseResult.html);
|
|
520
|
+
const linkAnalysis = analyzeLinks(parseResult.html, absoluteUrl, this.rootOrigin);
|
|
521
|
+
const thinScore = calculateThinContentScore(contentAnalysis, 0);
|
|
522
|
+
this.bufferMetrics(path, {
|
|
523
|
+
crawl_status: isBlocked ? 'blocked_by_robots' : 'fetched',
|
|
524
|
+
word_count: contentAnalysis.wordCount,
|
|
525
|
+
thin_content_score: thinScore,
|
|
526
|
+
external_link_ratio: linkAnalysis.externalRatio
|
|
527
|
+
});
|
|
528
|
+
}
|
|
529
|
+
catch (e) {
|
|
530
|
+
this.context.emit({ type: 'error', message: 'Error calculating per-page metrics', error: e, context: { url: absoluteUrl } });
|
|
531
|
+
}
|
|
532
|
+
for (const linkItem of parseResult.links) {
|
|
533
|
+
const normalizedLink = normalizeUrl(linkItem.url, absoluteUrl, this.options);
|
|
534
|
+
if (normalizedLink) {
|
|
535
|
+
const targetPath = this.toStorageUrl(normalizedLink);
|
|
536
|
+
if (targetPath !== path) {
|
|
537
|
+
const isInternal = UrlUtil.isInternal(normalizedLink, this.rootOrigin);
|
|
538
|
+
this.bufferPage(targetPath, depth + 1, 0, { is_internal: isInternal ? 1 : 0 });
|
|
539
|
+
this.bufferEdge(path, targetPath, 1.0, isInternal ? 'internal' : 'external');
|
|
540
|
+
if (isInternal && this.shouldEnqueue(targetPath, depth + 1)) {
|
|
541
|
+
this.addToQueue(targetPath, depth + 1);
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
async processPage(item, isBlocked = false) {
|
|
548
|
+
const { url, depth } = item;
|
|
549
|
+
if (this.scopeManager.isUrlEligible(url) !== 'allowed') {
|
|
550
|
+
this.bufferPage(url, depth, 0, { securityError: 'blocked_by_domain_filter' });
|
|
551
|
+
return;
|
|
552
|
+
}
|
|
553
|
+
// Convert stored path to absolute URL for fetching.
|
|
554
|
+
// External/subdomain URLs are already absolute (UrlUtil.toPath returns them as-is).
|
|
555
|
+
const fetchUrl = UrlUtil.toAbsolute(url, this.rootOrigin);
|
|
556
|
+
try {
|
|
557
|
+
const prevNode = this.options.previousGraph?.nodes.get(url);
|
|
558
|
+
const res = await this.fetchPage(fetchUrl, depth, prevNode);
|
|
559
|
+
if (!res)
|
|
560
|
+
return;
|
|
561
|
+
const finalUrl = normalizeUrl(res.finalUrl, this.rootOrigin, this.options);
|
|
562
|
+
if (!finalUrl)
|
|
563
|
+
return;
|
|
564
|
+
const fullUrl = finalUrl; // Already absolute
|
|
565
|
+
const finalPath = this.toStorageUrl(finalUrl);
|
|
566
|
+
if (res.status === 304 && prevNode) {
|
|
567
|
+
this.handleCachedResponse(url, finalUrl, depth, prevNode);
|
|
568
|
+
return;
|
|
569
|
+
}
|
|
570
|
+
this.handleRedirects(res.redirectChain, depth);
|
|
571
|
+
const isStringStatus = typeof res.status === 'string';
|
|
572
|
+
if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
|
|
573
|
+
const statusNum = typeof res.status === 'number' ? res.status : 0;
|
|
574
|
+
this.bufferPage(finalPath, depth, statusNum, {
|
|
575
|
+
security_error: isStringStatus ? res.status : undefined,
|
|
576
|
+
retries: res.retries
|
|
577
|
+
});
|
|
578
|
+
this.bufferMetrics(finalPath, {
|
|
579
|
+
crawl_status: isStringStatus ? res.status : 'fetched_error'
|
|
580
|
+
});
|
|
581
|
+
return;
|
|
582
|
+
}
|
|
583
|
+
if (res.status === 200) {
|
|
584
|
+
this.handleSuccessResponse(res, finalPath, fullUrl, depth, isBlocked);
|
|
585
|
+
}
|
|
586
|
+
}
|
|
587
|
+
catch (e) {
|
|
588
|
+
this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
|
|
589
|
+
}
|
|
590
|
+
}
|
|
591
|
+
async run() {
|
|
592
|
+
// 1. Setup fetcher and basic modules
|
|
593
|
+
this.setupModules();
|
|
594
|
+
// 2. Initialize repositories, resolve URL (SSL/WWW), and set up site context
|
|
595
|
+
await this.initialize();
|
|
596
|
+
if (this.options.robots) {
|
|
597
|
+
this.robots = this.options.robots;
|
|
598
|
+
}
|
|
599
|
+
else {
|
|
600
|
+
await this.fetchRobots();
|
|
601
|
+
}
|
|
602
|
+
await this.seedQueue();
|
|
603
|
+
return new Promise((resolve) => {
|
|
604
|
+
const checkDone = async () => {
|
|
605
|
+
if (this.queue.length === 0 && this.active === 0 && this.pendingSitemaps === 0) {
|
|
606
|
+
this.progressPhase = 'finalizing';
|
|
607
|
+
this.emitProgress(true);
|
|
608
|
+
await this.flushAll();
|
|
609
|
+
this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
|
|
610
|
+
limit_reached: this.reachedLimit ? 1 : 0
|
|
611
|
+
});
|
|
612
|
+
this.snapshotRepo.pruneSnapshots(this.siteId, DEFAULTS.MAX_SNAPSHOTS, DEFAULTS.MAX_SINGLE_SNAPSHOTS, this.snapshotId);
|
|
613
|
+
if (this.reusingSnapshot) {
|
|
614
|
+
this.snapshotRepo.touchSnapshot(this.snapshotId);
|
|
615
|
+
}
|
|
616
|
+
resolve(this.snapshotId);
|
|
617
|
+
return true;
|
|
618
|
+
}
|
|
619
|
+
return false;
|
|
620
|
+
};
|
|
621
|
+
const next = async () => {
|
|
622
|
+
if (await checkDone())
|
|
623
|
+
return;
|
|
624
|
+
if (this.pagesCrawled >= this.options.limit) {
|
|
625
|
+
this.reachedLimit = true;
|
|
626
|
+
this.progressPhase = 'limit reached';
|
|
627
|
+
this.emitProgress();
|
|
628
|
+
if (this.active === 0) {
|
|
629
|
+
this.context.emit({ type: 'crawl:limit-reached', limit: this.options.limit });
|
|
630
|
+
this.progressPhase = 'finalizing';
|
|
631
|
+
this.emitProgress(true);
|
|
632
|
+
await this.flushAll();
|
|
633
|
+
this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
|
|
634
|
+
limit_reached: 1
|
|
635
|
+
});
|
|
636
|
+
this.snapshotRepo.pruneSnapshots(this.siteId, DEFAULTS.MAX_SNAPSHOTS, DEFAULTS.MAX_SINGLE_SNAPSHOTS, this.snapshotId);
|
|
637
|
+
if (this.reusingSnapshot) {
|
|
638
|
+
this.snapshotRepo.touchSnapshot(this.snapshotId);
|
|
639
|
+
}
|
|
640
|
+
resolve(this.snapshotId);
|
|
641
|
+
}
|
|
642
|
+
return;
|
|
643
|
+
}
|
|
644
|
+
while (this.queue.length > 0 && this.active < this.concurrency && this.pagesCrawled < this.options.limit) {
|
|
645
|
+
const item = this.queue.shift();
|
|
646
|
+
if (this.visited.has(item.url))
|
|
647
|
+
continue;
|
|
648
|
+
// Robust robots check: reconstruct absolute URL since robots-parser needs full URLs,
|
|
649
|
+
// not root-relative paths. Also check /path/ variant in case robots.txt uses trailing slash.
|
|
650
|
+
const absUrlForRobots = UrlUtil.toAbsolute(item.url, this.rootOrigin);
|
|
651
|
+
const isBlocked = this.robots && (!this.robots.isAllowed(absUrlForRobots, 'crawlith') ||
|
|
652
|
+
(!absUrlForRobots.endsWith('/') && !this.robots.isAllowed(absUrlForRobots + '/', 'crawlith')));
|
|
653
|
+
if (isBlocked) {
|
|
654
|
+
if (this.options.debug) {
|
|
655
|
+
console.log(`${chalk.yellow('⊘ Robots')} ${chalk.gray(item.url)}`);
|
|
656
|
+
}
|
|
657
|
+
// Tag as blocked for reporting
|
|
658
|
+
this.bufferMetrics(item.url, {
|
|
659
|
+
crawl_status: 'blocked_by_robots'
|
|
660
|
+
});
|
|
661
|
+
this.bufferPage(item.url, item.depth, 0);
|
|
662
|
+
if (!this.options.ignoreRobots) {
|
|
663
|
+
this.visited.add(item.url);
|
|
664
|
+
this.pagesCrawled++;
|
|
665
|
+
continue;
|
|
666
|
+
}
|
|
667
|
+
}
|
|
668
|
+
this.active++;
|
|
669
|
+
this.pagesCrawled++;
|
|
670
|
+
this.visited.add(item.url);
|
|
671
|
+
this.limitConcurrency(() => this.processPage(item, isBlocked)).finally(() => {
|
|
672
|
+
this.active--;
|
|
673
|
+
this.emitProgress();
|
|
674
|
+
next();
|
|
675
|
+
});
|
|
676
|
+
}
|
|
677
|
+
this.emitProgress();
|
|
678
|
+
await checkDone();
|
|
679
|
+
};
|
|
680
|
+
next();
|
|
681
|
+
});
|
|
682
|
+
}
|
|
683
|
+
}
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Extracts all links from an HTML document.
|
|
3
3
|
* Returns absolute URLs.
|
|
4
|
+
* @param html The HTML content string
|
|
5
|
+
* @param baseUrl The base URL to resolve relative links against
|
|
6
|
+
* @param onError Optional callback for handling extraction errors
|
|
4
7
|
*/
|
|
5
|
-
export declare function extractLinks(html: string, baseUrl: string): string[];
|
|
8
|
+
// `onError` is optional, so calls passing only `html` and `baseUrl` still type-check.
export declare function extractLinks(html: string, baseUrl: string, onError?: (error: unknown) => void): string[];
|