@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/src/crawler/crawler.ts
DELETED
|
@@ -1,601 +0,0 @@
|
|
|
1
|
-
import chalk from 'chalk';
|
|
2
|
-
import pLimit from 'p-limit';
|
|
3
|
-
import robotsParser from 'robots-parser';
|
|
4
|
-
import { Graph, GraphNode } from '../graph/graph.js';
|
|
5
|
-
import { Fetcher, FetchResult } from './fetcher.js';
|
|
6
|
-
import { Parser } from './parser.js';
|
|
7
|
-
import { Sitemap } from './sitemap.js';
|
|
8
|
-
import { normalizeUrl } from './normalize.js';
|
|
9
|
-
import { TrapDetector } from './trap.js';
|
|
10
|
-
import { ScopeManager } from '../core/scope/scopeManager.js';
|
|
11
|
-
import { getDb } from '../db/index.js';
|
|
12
|
-
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
13
|
-
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
14
|
-
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
15
|
-
import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
|
|
16
|
-
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
17
|
-
import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
|
|
18
|
-
import { analyzeLinks } from '../analysis/links.js';
|
|
19
|
-
import { EngineContext } from '../events.js';
|
|
20
|
-
|
|
21
|
-
export interface CrawlOptions {
|
|
22
|
-
limit: number;
|
|
23
|
-
depth: number;
|
|
24
|
-
concurrency?: number;
|
|
25
|
-
ignoreRobots?: boolean;
|
|
26
|
-
stripQuery?: boolean;
|
|
27
|
-
previousGraph?: Graph;
|
|
28
|
-
sitemap?: string;
|
|
29
|
-
debug?: boolean;
|
|
30
|
-
detectSoft404?: boolean;
|
|
31
|
-
detectTraps?: boolean;
|
|
32
|
-
rate?: number;
|
|
33
|
-
maxBytes?: number;
|
|
34
|
-
allowedDomains?: string[];
|
|
35
|
-
deniedDomains?: string[];
|
|
36
|
-
includeSubdomains?: boolean;
|
|
37
|
-
proxyUrl?: string;
|
|
38
|
-
maxRedirects?: number;
|
|
39
|
-
userAgent?: string;
|
|
40
|
-
snapshotType?: 'full' | 'partial' | 'incremental';
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
interface QueueItem {
|
|
44
|
-
url: string;
|
|
45
|
-
depth: number;
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
// Fallback context for backward compatibility or when no context is provided
|
|
49
|
-
const nullContext: EngineContext = {
|
|
50
|
-
emit: (event) => {
|
|
51
|
-
// Basic console fallback for critical events if no listener is attached
|
|
52
|
-
// This maintains some visibility for consumers not using the event system
|
|
53
|
-
if (event.type === 'error') {
|
|
54
|
-
console.error(event.message, event.error || '');
|
|
55
|
-
} else if (event.type === 'warn') {
|
|
56
|
-
console.warn(event.message);
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
};
|
|
60
|
-
|
|
61
|
-
export class Crawler {
|
|
62
|
-
private startUrl: string;
|
|
63
|
-
private options: CrawlOptions;
|
|
64
|
-
private context: EngineContext;
|
|
65
|
-
private visited: Set<string>;
|
|
66
|
-
private uniqueQueue: Set<string>;
|
|
67
|
-
private queue: QueueItem[];
|
|
68
|
-
private active: number;
|
|
69
|
-
private pagesCrawled: number;
|
|
70
|
-
private reachedLimit: boolean;
|
|
71
|
-
private maxDepthInCrawl: number;
|
|
72
|
-
private concurrency: number;
|
|
73
|
-
private limitConcurrency: ReturnType<typeof pLimit>;
|
|
74
|
-
|
|
75
|
-
// Repositories
|
|
76
|
-
private siteRepo: SiteRepository | null = null;
|
|
77
|
-
private snapshotRepo: SnapshotRepository | null = null;
|
|
78
|
-
private pageRepo: PageRepository | null = null;
|
|
79
|
-
private edgeRepo: EdgeRepository | null = null;
|
|
80
|
-
private metricsRepo: MetricsRepository | null = null;
|
|
81
|
-
|
|
82
|
-
// Site/Snapshot info
|
|
83
|
-
private siteId: number | null = null;
|
|
84
|
-
private snapshotId: number | null = null;
|
|
85
|
-
private rootOrigin: string = '';
|
|
86
|
-
|
|
87
|
-
// Discovery tracking
|
|
88
|
-
private discoveryDepths: Map<string, number> = new Map();
|
|
89
|
-
|
|
90
|
-
// Buffers for batch operations
|
|
91
|
-
private pageBuffer: Map<string, any> = new Map();
|
|
92
|
-
private edgeBuffer: { sourceUrl: string; targetUrl: string; weight: number; rel: string }[] = [];
|
|
93
|
-
private metricsBuffer: any[] = [];
|
|
94
|
-
|
|
95
|
-
// Modules
|
|
96
|
-
private scopeManager: ScopeManager | null = null;
|
|
97
|
-
private fetcher: Fetcher | null = null;
|
|
98
|
-
private parser: Parser | null = null;
|
|
99
|
-
private sitemapFetcher: Sitemap | null = null;
|
|
100
|
-
private trapDetector: TrapDetector | null = null;
|
|
101
|
-
private robots: any = null;
|
|
102
|
-
|
|
103
|
-
constructor(startUrl: string, options: CrawlOptions, context?: EngineContext) {
|
|
104
|
-
this.startUrl = startUrl;
|
|
105
|
-
this.options = options;
|
|
106
|
-
this.context = context || nullContext;
|
|
107
|
-
this.visited = new Set<string>();
|
|
108
|
-
this.uniqueQueue = new Set<string>();
|
|
109
|
-
this.queue = [];
|
|
110
|
-
this.active = 0;
|
|
111
|
-
this.pagesCrawled = 0;
|
|
112
|
-
this.reachedLimit = false;
|
|
113
|
-
this.maxDepthInCrawl = Math.min(options.depth, 10);
|
|
114
|
-
this.concurrency = Math.min(options.concurrency || 2, 10);
|
|
115
|
-
this.limitConcurrency = pLimit(this.concurrency);
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
async initialize(): Promise<void> {
|
|
119
|
-
const db = getDb();
|
|
120
|
-
this.siteRepo = new SiteRepository(db);
|
|
121
|
-
this.snapshotRepo = new SnapshotRepository(db);
|
|
122
|
-
this.pageRepo = new PageRepository(db);
|
|
123
|
-
this.edgeRepo = new EdgeRepository(db);
|
|
124
|
-
this.metricsRepo = new MetricsRepository(db);
|
|
125
|
-
|
|
126
|
-
const rootUrl = normalizeUrl(this.startUrl, '', { stripQuery: this.options.stripQuery });
|
|
127
|
-
if (!rootUrl) throw new Error('Invalid start URL');
|
|
128
|
-
|
|
129
|
-
const urlObj = new URL(rootUrl);
|
|
130
|
-
const domain = urlObj.hostname.replace('www.', '');
|
|
131
|
-
const site = this.siteRepo.firstOrCreateSite(domain);
|
|
132
|
-
this.siteId = site.id;
|
|
133
|
-
const type = this.options.snapshotType || (this.options.previousGraph ? 'incremental' : 'full');
|
|
134
|
-
this.snapshotId = this.snapshotRepo.createSnapshot(this.siteId, type);
|
|
135
|
-
this.rootOrigin = urlObj.origin;
|
|
136
|
-
this.startUrl = rootUrl;
|
|
137
|
-
|
|
138
|
-
// Seed discovery depth for root
|
|
139
|
-
this.discoveryDepths.set(this.startUrl, 0);
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
setupModules(): void {
|
|
143
|
-
this.scopeManager = new ScopeManager({
|
|
144
|
-
allowedDomains: this.options.allowedDomains || [],
|
|
145
|
-
deniedDomains: this.options.deniedDomains || [],
|
|
146
|
-
includeSubdomains: this.options.includeSubdomains || false,
|
|
147
|
-
rootUrl: this.startUrl
|
|
148
|
-
});
|
|
149
|
-
|
|
150
|
-
this.fetcher = new Fetcher({
|
|
151
|
-
rate: this.options.rate,
|
|
152
|
-
proxyUrl: this.options.proxyUrl,
|
|
153
|
-
scopeManager: this.scopeManager,
|
|
154
|
-
maxRedirects: this.options.maxRedirects,
|
|
155
|
-
userAgent: this.options.userAgent
|
|
156
|
-
});
|
|
157
|
-
|
|
158
|
-
this.parser = new Parser();
|
|
159
|
-
this.sitemapFetcher = new Sitemap(this.context);
|
|
160
|
-
this.trapDetector = new TrapDetector();
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
async fetchRobots(): Promise<void> {
|
|
164
|
-
try {
|
|
165
|
-
const robotsUrl = new URL('/robots.txt', this.rootOrigin).toString();
|
|
166
|
-
const res = await this.fetcher!.fetch(robotsUrl, { maxBytes: 500000 });
|
|
167
|
-
if (res && typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
|
|
168
|
-
this.robots = (robotsParser as any)(robotsUrl, res.body);
|
|
169
|
-
}
|
|
170
|
-
} catch {
|
|
171
|
-
// Suppressed expected network warnings when robots block
|
|
172
|
-
console.warn('Failed to fetch robots.txt, proceeding...');
|
|
173
|
-
}
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
shouldEnqueue(url: string, depth: number): boolean {
|
|
177
|
-
if (this.visited.has(url)) return false;
|
|
178
|
-
if (this.uniqueQueue.has(url)) return false;
|
|
179
|
-
if (depth > this.maxDepthInCrawl) return false;
|
|
180
|
-
if (this.scopeManager!.isUrlEligible(url) !== 'allowed') return false;
|
|
181
|
-
|
|
182
|
-
if (this.options.detectTraps) {
|
|
183
|
-
const trap = this.trapDetector!.checkTrap(url, depth);
|
|
184
|
-
if (trap.risk > 0.8) return false;
|
|
185
|
-
}
|
|
186
|
-
return true;
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
addToQueue(u: string, d: number): void {
|
|
190
|
-
if (this.scopeManager!.isUrlEligible(u) !== 'allowed') return;
|
|
191
|
-
if (!this.uniqueQueue.has(u)) {
|
|
192
|
-
this.uniqueQueue.add(u);
|
|
193
|
-
this.queue.push({ url: u, depth: d });
|
|
194
|
-
this.context.emit({ type: 'queue:enqueue', url: u, depth: d });
|
|
195
|
-
|
|
196
|
-
const currentDiscovery = this.discoveryDepths.get(u);
|
|
197
|
-
if (currentDiscovery === undefined || d < currentDiscovery) {
|
|
198
|
-
this.discoveryDepths.set(u, d);
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
async seedQueue(): Promise<void> {
|
|
204
|
-
// Seed from Sitemap
|
|
205
|
-
if (this.options.sitemap) {
|
|
206
|
-
try {
|
|
207
|
-
const sitemapUrl = this.options.sitemap === 'true' ? new URL('/sitemap.xml', this.rootOrigin).toString() : this.options.sitemap;
|
|
208
|
-
if (sitemapUrl.startsWith('http')) {
|
|
209
|
-
this.context.emit({ type: 'info', message: 'Fetching sitemap', context: { url: sitemapUrl } });
|
|
210
|
-
const sitemapUrls = await this.sitemapFetcher!.fetch(sitemapUrl);
|
|
211
|
-
for (const u of sitemapUrls) {
|
|
212
|
-
const normalized = normalizeUrl(u, '', this.options);
|
|
213
|
-
if (normalized) this.addToQueue(normalized, 0);
|
|
214
|
-
}
|
|
215
|
-
}
|
|
216
|
-
} catch (e) {
|
|
217
|
-
this.context.emit({ type: 'warn', message: 'Sitemap fetch failed', context: e });
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
// Seed from startUrl
|
|
222
|
-
this.addToQueue(this.startUrl, 0);
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
private bufferPage(url: string, depth: number, status: number, data: any = {}): void {
|
|
226
|
-
const existing = this.pageBuffer.get(url);
|
|
227
|
-
const knownDiscovery = this.discoveryDepths.get(url);
|
|
228
|
-
|
|
229
|
-
// Always use the best (minimum) depth discovered for this URL
|
|
230
|
-
const finalDepth = knownDiscovery !== undefined ? Math.min(knownDiscovery, depth) : depth;
|
|
231
|
-
if (knownDiscovery === undefined || depth < knownDiscovery) {
|
|
232
|
-
this.discoveryDepths.set(url, depth);
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
// If we already have a buffered record, only update if the new one is more "complete" (has status)
|
|
236
|
-
// or if the depth is better.
|
|
237
|
-
if (existing) {
|
|
238
|
-
const isStatusUpdate = status !== 0 && existing.http_status === 0;
|
|
239
|
-
const isBetterDepth = finalDepth < existing.depth;
|
|
240
|
-
|
|
241
|
-
if (!isStatusUpdate && !isBetterDepth && Object.keys(data).length === 0) {
|
|
242
|
-
return;
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
this.pageBuffer.set(url, {
|
|
246
|
-
...existing,
|
|
247
|
-
depth: finalDepth,
|
|
248
|
-
http_status: status !== 0 ? status : existing.http_status,
|
|
249
|
-
...data
|
|
250
|
-
});
|
|
251
|
-
} else {
|
|
252
|
-
this.pageBuffer.set(url, {
|
|
253
|
-
site_id: this.siteId!,
|
|
254
|
-
normalized_url: url,
|
|
255
|
-
depth: finalDepth,
|
|
256
|
-
http_status: status,
|
|
257
|
-
last_seen_snapshot_id: this.snapshotId!,
|
|
258
|
-
...data
|
|
259
|
-
});
|
|
260
|
-
}
|
|
261
|
-
|
|
262
|
-
if (this.pageBuffer.size >= 50) {
|
|
263
|
-
this.flushPages();
|
|
264
|
-
}
|
|
265
|
-
}
|
|
266
|
-
|
|
267
|
-
private flushPages(): void {
|
|
268
|
-
if (this.pageBuffer.size === 0) return;
|
|
269
|
-
this.pageRepo!.upsertMany(Array.from(this.pageBuffer.values()));
|
|
270
|
-
this.pageBuffer.clear();
|
|
271
|
-
}
|
|
272
|
-
|
|
273
|
-
private bufferEdge(sourceUrl: string, targetUrl: string, weight: number = 1.0, rel: string = 'internal'): void {
|
|
274
|
-
this.edgeBuffer.push({ sourceUrl, targetUrl, weight, rel });
|
|
275
|
-
if (this.edgeBuffer.length >= 100) {
|
|
276
|
-
this.flushEdges();
|
|
277
|
-
}
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
private flushEdges(): void {
|
|
281
|
-
if (this.edgeBuffer.length === 0) return;
|
|
282
|
-
|
|
283
|
-
// To resolve URLs to IDs, we need to make sure pages are flushed first
|
|
284
|
-
this.flushPages();
|
|
285
|
-
|
|
286
|
-
const identities = this.pageRepo!.getPagesIdentityBySnapshot(this.snapshotId!);
|
|
287
|
-
const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
|
|
288
|
-
|
|
289
|
-
const edgesToInsert = this.edgeBuffer
|
|
290
|
-
.map(e => ({
|
|
291
|
-
snapshot_id: this.snapshotId!,
|
|
292
|
-
source_page_id: urlToId.get(e.sourceUrl)!,
|
|
293
|
-
target_page_id: urlToId.get(e.targetUrl)!,
|
|
294
|
-
weight: e.weight,
|
|
295
|
-
rel: e.rel as any
|
|
296
|
-
}))
|
|
297
|
-
.filter(e => e.source_page_id !== undefined && e.target_page_id !== undefined);
|
|
298
|
-
|
|
299
|
-
if (edgesToInsert.length > 0) {
|
|
300
|
-
this.edgeRepo!.insertEdges(edgesToInsert);
|
|
301
|
-
}
|
|
302
|
-
this.edgeBuffer = [];
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
private bufferMetrics(url: string, data: any): void {
|
|
306
|
-
this.metricsBuffer.push({ url, data });
|
|
307
|
-
if (this.metricsBuffer.length >= 50) {
|
|
308
|
-
this.flushMetrics();
|
|
309
|
-
}
|
|
310
|
-
}
|
|
311
|
-
|
|
312
|
-
private flushMetrics(): void {
|
|
313
|
-
if (this.metricsBuffer.length === 0) return;
|
|
314
|
-
|
|
315
|
-
this.flushPages();
|
|
316
|
-
const identities = this.pageRepo!.getPagesIdentityBySnapshot(this.snapshotId!);
|
|
317
|
-
const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
|
|
318
|
-
|
|
319
|
-
const metricsList = this.metricsBuffer.map(item => {
|
|
320
|
-
const pageId = urlToId.get(item.url);
|
|
321
|
-
if (!pageId) return null;
|
|
322
|
-
return {
|
|
323
|
-
snapshot_id: this.snapshotId!,
|
|
324
|
-
page_id: pageId,
|
|
325
|
-
authority_score: null,
|
|
326
|
-
hub_score: null,
|
|
327
|
-
pagerank: null,
|
|
328
|
-
pagerank_score: null,
|
|
329
|
-
link_role: null,
|
|
330
|
-
crawl_status: null,
|
|
331
|
-
word_count: null,
|
|
332
|
-
thin_content_score: null,
|
|
333
|
-
external_link_ratio: null,
|
|
334
|
-
orphan_score: null,
|
|
335
|
-
duplicate_cluster_id: null,
|
|
336
|
-
duplicate_type: null,
|
|
337
|
-
is_cluster_primary: 0,
|
|
338
|
-
...item.data
|
|
339
|
-
};
|
|
340
|
-
}).filter(m => m !== null);
|
|
341
|
-
|
|
342
|
-
if (metricsList.length > 0) {
|
|
343
|
-
this.metricsRepo!.insertMany(metricsList as any[]);
|
|
344
|
-
}
|
|
345
|
-
this.metricsBuffer = [];
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
async flushAll(): Promise<void> {
|
|
349
|
-
this.flushPages();
|
|
350
|
-
this.flushEdges();
|
|
351
|
-
this.flushMetrics();
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
private async fetchPage(url: string, depth: number, prevNode?: GraphNode): Promise<FetchResult | null> {
|
|
355
|
-
const startTime = Date.now();
|
|
356
|
-
try {
|
|
357
|
-
this.context.emit({ type: 'crawl:start', url });
|
|
358
|
-
const res = await this.fetcher!.fetch(url, {
|
|
359
|
-
maxBytes: this.options.maxBytes,
|
|
360
|
-
crawlDelay: this.robots ? this.robots.getCrawlDelay('crawlith') : undefined,
|
|
361
|
-
etag: prevNode?.etag,
|
|
362
|
-
lastModified: prevNode?.lastModified
|
|
363
|
-
});
|
|
364
|
-
|
|
365
|
-
const durationMs = Date.now() - startTime;
|
|
366
|
-
|
|
367
|
-
this.context.emit({
|
|
368
|
-
type: 'crawl:success',
|
|
369
|
-
url,
|
|
370
|
-
status: typeof res.status === 'number' ? res.status : 0,
|
|
371
|
-
durationMs,
|
|
372
|
-
depth
|
|
373
|
-
});
|
|
374
|
-
|
|
375
|
-
return res;
|
|
376
|
-
} catch (e) {
|
|
377
|
-
this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
|
|
378
|
-
return null;
|
|
379
|
-
}
|
|
380
|
-
}
|
|
381
|
-
|
|
382
|
-
private handleCachedResponse(url: string, finalUrl: string, depth: number, prevNode: GraphNode): void {
|
|
383
|
-
this.bufferPage(finalUrl, depth, 200, {
|
|
384
|
-
html: prevNode.html,
|
|
385
|
-
canonical_url: prevNode.canonical,
|
|
386
|
-
content_hash: prevNode.contentHash,
|
|
387
|
-
simhash: prevNode.simhash,
|
|
388
|
-
etag: prevNode.etag,
|
|
389
|
-
last_modified: prevNode.lastModified,
|
|
390
|
-
noindex: prevNode.noindex ? 1 : 0,
|
|
391
|
-
nofollow: prevNode.nofollow ? 1 : 0
|
|
392
|
-
});
|
|
393
|
-
this.bufferMetrics(finalUrl, {
|
|
394
|
-
crawl_status: 'cached'
|
|
395
|
-
});
|
|
396
|
-
|
|
397
|
-
// Re-discovery links from previous graph to continue crawling if needed
|
|
398
|
-
const prevLinks = this.options.previousGraph?.getEdges()
|
|
399
|
-
.filter(e => e.source === url)
|
|
400
|
-
.map(e => e.target);
|
|
401
|
-
|
|
402
|
-
if (prevLinks) {
|
|
403
|
-
for (const link of prevLinks) {
|
|
404
|
-
const normalizedLink = normalizeUrl(link, '', this.options);
|
|
405
|
-
if (normalizedLink && normalizedLink !== finalUrl) {
|
|
406
|
-
this.bufferPage(normalizedLink, depth + 1, 0);
|
|
407
|
-
this.bufferEdge(finalUrl, normalizedLink, 1.0, 'internal');
|
|
408
|
-
if (this.shouldEnqueue(normalizedLink, depth + 1)) {
|
|
409
|
-
this.addToQueue(normalizedLink, depth + 1);
|
|
410
|
-
}
|
|
411
|
-
}
|
|
412
|
-
}
|
|
413
|
-
}
|
|
414
|
-
}
|
|
415
|
-
|
|
416
|
-
private handleRedirects(chain: FetchResult['redirectChain'], depth: number): void {
|
|
417
|
-
for (const step of chain) {
|
|
418
|
-
const source = normalizeUrl(step.url, '', this.options);
|
|
419
|
-
const target = normalizeUrl(step.target, '', this.options);
|
|
420
|
-
if (source && target) {
|
|
421
|
-
this.bufferPage(source, depth, step.status);
|
|
422
|
-
this.bufferPage(target, depth, 0);
|
|
423
|
-
this.bufferEdge(source, target);
|
|
424
|
-
}
|
|
425
|
-
}
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
private handleSuccessResponse(res: FetchResult, finalUrl: string, depth: number, isBlocked: boolean = false): void {
|
|
429
|
-
const contentTypeHeader = res.headers['content-type'];
|
|
430
|
-
const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
|
|
431
|
-
if (!contentType || !contentType.toLowerCase().includes('text/html')) {
|
|
432
|
-
this.bufferPage(finalUrl, depth, typeof res.status === 'number' ? res.status : 0);
|
|
433
|
-
return;
|
|
434
|
-
}
|
|
435
|
-
|
|
436
|
-
const parseResult = this.parser!.parse(res.body, finalUrl, res.status as number);
|
|
437
|
-
|
|
438
|
-
this.bufferPage(finalUrl, depth, res.status as number, {
|
|
439
|
-
html: parseResult.html,
|
|
440
|
-
canonical_url: parseResult.canonical || undefined,
|
|
441
|
-
noindex: parseResult.noindex ? 1 : 0,
|
|
442
|
-
nofollow: parseResult.nofollow ? 1 : 0,
|
|
443
|
-
content_hash: parseResult.contentHash,
|
|
444
|
-
simhash: parseResult.simhash,
|
|
445
|
-
soft404_score: parseResult.soft404Score,
|
|
446
|
-
etag: res.etag,
|
|
447
|
-
last_modified: res.lastModified,
|
|
448
|
-
retries: res.retries
|
|
449
|
-
});
|
|
450
|
-
|
|
451
|
-
try {
|
|
452
|
-
const contentAnalysis = analyzeContent(parseResult.html);
|
|
453
|
-
const linkAnalysis = analyzeLinks(parseResult.html, finalUrl, this.rootOrigin);
|
|
454
|
-
const thinScore = calculateThinContentScore(contentAnalysis, 0);
|
|
455
|
-
|
|
456
|
-
this.bufferMetrics(finalUrl, {
|
|
457
|
-
crawl_status: isBlocked ? 'blocked_by_robots' : 'fetched',
|
|
458
|
-
word_count: contentAnalysis.wordCount,
|
|
459
|
-
thin_content_score: thinScore,
|
|
460
|
-
external_link_ratio: linkAnalysis.externalRatio
|
|
461
|
-
});
|
|
462
|
-
} catch (e) {
|
|
463
|
-
this.context.emit({ type: 'error', message: 'Error calculating per-page metrics', error: e, context: { url: finalUrl } });
|
|
464
|
-
}
|
|
465
|
-
|
|
466
|
-
for (const linkItem of parseResult.links) {
|
|
467
|
-
const normalizedLink = normalizeUrl(linkItem.url, '', this.options);
|
|
468
|
-
if (normalizedLink && normalizedLink !== finalUrl) {
|
|
469
|
-
this.bufferPage(normalizedLink, depth + 1, 0);
|
|
470
|
-
this.bufferEdge(finalUrl, normalizedLink, 1.0, 'internal');
|
|
471
|
-
if (this.shouldEnqueue(normalizedLink, depth + 1)) {
|
|
472
|
-
this.addToQueue(normalizedLink, depth + 1);
|
|
473
|
-
}
|
|
474
|
-
}
|
|
475
|
-
}
|
|
476
|
-
}
|
|
477
|
-
|
|
478
|
-
private async processPage(item: QueueItem, isBlocked: boolean = false): Promise<void> {
|
|
479
|
-
const { url, depth } = item;
|
|
480
|
-
if (this.scopeManager!.isUrlEligible(url) !== 'allowed') {
|
|
481
|
-
this.bufferPage(url, depth, 0, { securityError: 'blocked_by_domain_filter' });
|
|
482
|
-
return;
|
|
483
|
-
}
|
|
484
|
-
|
|
485
|
-
try {
|
|
486
|
-
const prevNode = this.options.previousGraph?.nodes.get(url);
|
|
487
|
-
const res = await this.fetchPage(url, depth, prevNode);
|
|
488
|
-
|
|
489
|
-
if (!res) return;
|
|
490
|
-
|
|
491
|
-
const finalUrl = normalizeUrl(res.finalUrl, '', this.options);
|
|
492
|
-
if (!finalUrl) return;
|
|
493
|
-
|
|
494
|
-
if (res.status === 304 && prevNode) {
|
|
495
|
-
this.handleCachedResponse(url, finalUrl, depth, prevNode);
|
|
496
|
-
return;
|
|
497
|
-
}
|
|
498
|
-
|
|
499
|
-
this.handleRedirects(res.redirectChain, depth);
|
|
500
|
-
|
|
501
|
-
const isStringStatus = typeof res.status === 'string';
|
|
502
|
-
if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
|
|
503
|
-
const statusNum = typeof res.status === 'number' ? res.status : 0;
|
|
504
|
-
this.bufferPage(finalUrl, depth, statusNum, {
|
|
505
|
-
security_error: isStringStatus ? res.status : undefined,
|
|
506
|
-
retries: res.retries
|
|
507
|
-
});
|
|
508
|
-
this.bufferMetrics(finalUrl, {
|
|
509
|
-
crawl_status: isStringStatus ? res.status : 'fetched_error'
|
|
510
|
-
});
|
|
511
|
-
return;
|
|
512
|
-
}
|
|
513
|
-
|
|
514
|
-
if (res.status === 200) {
|
|
515
|
-
this.handleSuccessResponse(res, finalUrl, depth, isBlocked);
|
|
516
|
-
}
|
|
517
|
-
} catch (e) {
|
|
518
|
-
this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
|
|
519
|
-
}
|
|
520
|
-
}
|
|
521
|
-
|
|
522
|
-
/**
 * Executes the full crawl: initializes storage/modules, fetches robots.txt,
 * seeds the queue, then drains it with bounded concurrency. Resolves with the
 * snapshot id once the queue is empty (or the page limit is reached) and all
 * in-flight work has finished.
 *
 * Scheduling model: `next()` fills free worker slots from the queue; each
 * finished page calls `next()` again, so the loop is driven by completions.
 * NOTE(review): `checkDone` can be entered concurrently by several completing
 * workers — `resolve` is idempotent, but `flushAll`/`updateSnapshotStatus`
 * could run more than once; confirm both are safe to repeat.
 */
async run(): Promise<number> {
    await this.initialize();
    this.setupModules();
    await this.fetchRobots();
    await this.seedQueue();

    return new Promise((resolve) => {
        // Completes the crawl when no work is queued and none is in flight.
        // Returns true if the promise was resolved.
        const checkDone = async () => {
            if (this.queue.length === 0 && this.active === 0) {
                await this.flushAll();
                this.snapshotRepo!.updateSnapshotStatus(this.snapshotId!, 'completed', {
                    limit_reached: this.reachedLimit ? 1 : 0
                });
                resolve(this.snapshotId!);
                return true;
            }
            return false;
        };

        const next = async () => {
            if (await checkDone()) return;

            // Page-limit reached: stop scheduling; finish once in-flight
            // workers drain (each of them re-enters next() on completion).
            if (this.pagesCrawled >= this.options.limit) {
                this.reachedLimit = true;
                if (this.active === 0) {
                    await this.flushAll();
                    this.snapshotRepo!.updateSnapshotStatus(this.snapshotId!, 'completed', {
                        limit_reached: 1
                    });
                    this.context.emit({ type: 'crawl:limit-reached', limit: this.options.limit });
                    resolve(this.snapshotId!);
                }
                return;
            }

            // Fill every free concurrency slot from the front of the queue.
            while (this.queue.length > 0 && this.active < this.concurrency && this.pagesCrawled < this.options.limit) {
                const item = this.queue.shift()!;
                if (this.visited.has(item.url)) continue;

                // Robust robots check: if path doesn't end in /, check both /path and /path/
                // to handle cases where normalization stripped a slash that robots.txt relies on.
                const isBlocked = this.robots && (
                    !this.robots.isAllowed(item.url, 'crawlith') ||
                    (!item.url.endsWith('/') && !this.robots.isAllowed(item.url + '/', 'crawlith'))
                );

                if (isBlocked) {
                    if (this.options.debug) {
                        console.log(`${chalk.yellow('⊘ Robots')} ${chalk.gray(item.url)}`);
                    }

                    // Tag as blocked for reporting
                    this.bufferMetrics(item.url, {
                        crawl_status: 'blocked_by_robots'
                    });
                    this.bufferPage(item.url, item.depth, 0);

                    // Respect robots: count the URL as handled and skip the
                    // fetch. With ignoreRobots set we fall through and crawl
                    // it anyway, passing isBlocked for reporting.
                    if (!this.options.ignoreRobots) {
                        this.visited.add(item.url);
                        this.pagesCrawled++;
                        continue;
                    }
                }

                // Claim the slot before the async work starts so the while
                // condition sees up-to-date counters on the next iteration.
                this.active++;
                this.pagesCrawled++;
                this.visited.add(item.url);

                // Fire-and-forget: completion (success or failure) frees the
                // slot and re-enters the scheduler.
                this.limitConcurrency(() => this.processPage(item, isBlocked)).finally(() => {
                    this.active--;
                    next();
                });
            }

            // Re-check after filling slots: the queue may have been exhausted
            // by already-visited entries without dispatching any work.
            await checkDone();
        };
        next();
    });
}
|
|
601
|
-
}
|
package/src/crawler/extract.ts
DELETED
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
import * as cheerio from 'cheerio';
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Extracts all links from an HTML document.
|
|
5
|
-
* Returns absolute URLs.
|
|
6
|
-
* @param html The HTML content string
|
|
7
|
-
* @param baseUrl The base URL to resolve relative links against
|
|
8
|
-
* @param onError Optional callback for handling extraction errors
|
|
9
|
-
*/
|
|
10
|
-
export function extractLinks(html: string, baseUrl: string, onError?: (error: unknown) => void): string[] {
|
|
11
|
-
try {
|
|
12
|
-
const $ = cheerio.load(html);
|
|
13
|
-
const links = new Set<string>();
|
|
14
|
-
|
|
15
|
-
$('a').each((_, element) => {
|
|
16
|
-
const href = $(element).attr('href');
|
|
17
|
-
if (href) {
|
|
18
|
-
try {
|
|
19
|
-
const absoluteUrl = new URL(href, baseUrl);
|
|
20
|
-
// Only http(s) links
|
|
21
|
-
if (absoluteUrl.protocol === 'http:' || absoluteUrl.protocol === 'https:') {
|
|
22
|
-
// Remove hash fragments immediately as they are irrelevant for crawling
|
|
23
|
-
absoluteUrl.hash = '';
|
|
24
|
-
links.add(absoluteUrl.toString());
|
|
25
|
-
}
|
|
26
|
-
} catch (_e) {
|
|
27
|
-
// Invalid URL, skip
|
|
28
|
-
}
|
|
29
|
-
}
|
|
30
|
-
});
|
|
31
|
-
|
|
32
|
-
return Array.from(links);
|
|
33
|
-
} catch (e) {
|
|
34
|
-
if (onError) {
|
|
35
|
-
onError(e);
|
|
36
|
-
}
|
|
37
|
-
return [];
|
|
38
|
-
}
|
|
39
|
-
}
|