@crawlith/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +17 -3
- package/dist/analysis/analyze.js +192 -248
- package/dist/analysis/scoring.js +7 -1
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +75 -0
- package/dist/crawler/crawler.js +518 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +1 -0
- package/dist/crawler/fetcher.js +20 -5
- package/dist/crawler/metricsRunner.d.ts +3 -1
- package/dist/crawler/metricsRunner.js +55 -46
- package/dist/crawler/sitemap.d.ts +3 -0
- package/dist/crawler/sitemap.js +5 -1
- package/dist/db/graphLoader.js +32 -3
- package/dist/db/index.d.ts +3 -0
- package/dist/db/index.js +4 -0
- package/dist/db/repositories/EdgeRepository.d.ts +8 -0
- package/dist/db/repositories/EdgeRepository.js +13 -0
- package/dist/db/repositories/MetricsRepository.d.ts +3 -0
- package/dist/db/repositories/MetricsRepository.js +14 -1
- package/dist/db/repositories/PageRepository.d.ts +11 -0
- package/dist/db/repositories/PageRepository.js +112 -19
- package/dist/db/repositories/SiteRepository.d.ts +3 -0
- package/dist/db/repositories/SiteRepository.js +9 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
- package/dist/db/repositories/SnapshotRepository.js +23 -2
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/graph/cluster.js +62 -14
- package/dist/graph/duplicate.js +242 -191
- package/dist/graph/graph.d.ts +16 -0
- package/dist/graph/graph.js +17 -4
- package/dist/graph/metrics.js +12 -0
- package/dist/graph/pagerank.js +2 -0
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +5 -2
- package/dist/index.js +5 -2
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +4 -1
- package/dist/lock/lockManager.js +23 -13
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/html.js +15 -216
- package/dist/scoring/health.d.ts +50 -0
- package/dist/scoring/health.js +170 -0
- package/dist/scoring/hits.d.ts +1 -0
- package/dist/scoring/hits.js +64 -44
- package/dist/scoring/orphanSeverity.d.ts +5 -5
- package/package.json +3 -3
- package/scripts/copy-assets.js +37 -0
- package/src/analysis/analysis_list.html +35 -0
- package/src/analysis/analysis_page.html +123 -0
- package/src/analysis/analyze.ts +218 -261
- package/src/analysis/scoring.ts +8 -1
- package/src/analysis/templates.ts +9 -0
- package/src/core/security/ipGuard.ts +82 -3
- package/src/crawler/crawl.ts +6 -379
- package/src/crawler/crawler.ts +601 -0
- package/src/crawler/extract.ts +7 -2
- package/src/crawler/fetcher.ts +24 -6
- package/src/crawler/metricsRunner.ts +60 -47
- package/src/crawler/sitemap.ts +4 -1
- package/src/db/graphLoader.ts +33 -3
- package/src/db/index.ts +5 -0
- package/src/db/repositories/EdgeRepository.ts +14 -0
- package/src/db/repositories/MetricsRepository.ts +15 -1
- package/src/db/repositories/PageRepository.ts +119 -19
- package/src/db/repositories/SiteRepository.ts +11 -0
- package/src/db/repositories/SnapshotRepository.ts +28 -3
- package/src/events.ts +16 -0
- package/src/graph/cluster.ts +69 -15
- package/src/graph/duplicate.ts +249 -185
- package/src/graph/graph.ts +24 -4
- package/src/graph/metrics.ts +15 -0
- package/src/graph/pagerank.ts +1 -0
- package/src/graph/simhash.ts +15 -0
- package/src/index.ts +5 -2
- package/src/lock/hashKey.ts +1 -1
- package/src/lock/lockManager.ts +21 -13
- package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
- package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
- package/src/report/crawl_template.ts +9 -0
- package/src/report/html.ts +17 -217
- package/src/scoring/health.ts +241 -0
- package/src/scoring/hits.ts +67 -45
- package/src/scoring/orphanSeverity.ts +8 -8
- package/tests/analysis.unit.test.ts +44 -0
- package/tests/analyze.integration.test.ts +88 -53
- package/tests/analyze_markdown.test.ts +98 -0
- package/tests/audit/audit.test.ts +101 -0
- package/tests/audit/scoring.test.ts +25 -25
- package/tests/audit/transport.test.ts +0 -1
- package/tests/clustering_risk.test.ts +118 -0
- package/tests/crawler.test.ts +19 -13
- package/tests/db/index.test.ts +134 -0
- package/tests/db/repositories.test.ts +115 -0
- package/tests/db_repos.test.ts +72 -0
- package/tests/duplicate.test.ts +2 -2
- package/tests/extract.test.ts +86 -0
- package/tests/fetcher.test.ts +5 -1
- package/tests/fetcher_safety.test.ts +9 -3
- package/tests/graph/graph.test.ts +100 -0
- package/tests/graphLoader.test.ts +124 -0
- package/tests/html_report.test.ts +52 -51
- package/tests/ipGuard.test.ts +73 -0
- package/tests/lock/lockManager.test.ts +77 -17
- package/tests/normalize.test.ts +6 -19
- package/tests/orphanSeverity.test.ts +9 -9
- package/tests/redirect_safety.test.ts +5 -1
- package/tests/renderAnalysisCsv.test.ts +183 -0
- package/tests/safety.test.ts +12 -0
- package/tests/scope.test.ts +18 -0
- package/tests/scoring.test.ts +25 -24
- package/tests/sitemap.test.ts +13 -1
- package/tests/ssrf_fix.test.ts +69 -0
- package/tests/visualization_data.test.ts +10 -10
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
package/src/analysis/scoring.ts
CHANGED
|
@@ -8,6 +8,9 @@ export interface SiteScore {
|
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
export function scorePageSeo(page: PageAnalysis): number {
|
|
11
|
+
if (page.meta.crawlStatus === 'blocked_by_robots') {
|
|
12
|
+
return 0;
|
|
13
|
+
}
|
|
11
14
|
const titleMeta = (scoreTextStatus(page.title.status) + scoreTextStatus(page.metaDescription.status)) / 2;
|
|
12
15
|
const h1 = page.h1.status === 'ok' ? 100 : page.h1.status === 'warning' ? 60 : 10;
|
|
13
16
|
const wordQuality = Math.min(100, (page.content.wordCount / 600) * 100) * 0.7 + Math.min(100, page.content.textHtmlRatio * 500) * 0.3;
|
|
@@ -49,7 +52,11 @@ export function aggregateSiteScore(metrics: Metrics, pages: PageAnalysis[]): Sit
|
|
|
49
52
|
const orphanPenalty = metrics.totalPages === 0 ? 0 : (metrics.orphanPages.length / metrics.totalPages) * 100;
|
|
50
53
|
const authorityEntropyOrphanScore = Math.max(0, Math.min(100, (avgAuthority * 100 * 0.4) + (entropyScore * 0.35) + ((100 - orphanPenalty) * 0.25)));
|
|
51
54
|
|
|
52
|
-
|
|
55
|
+
let overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
|
|
56
|
+
|
|
57
|
+
if (pages.some(p => p.meta.crawlStatus === 'blocked_by_robots')) {
|
|
58
|
+
overallScore = 0;
|
|
59
|
+
}
|
|
53
60
|
|
|
54
61
|
return {
|
|
55
62
|
seoHealthScore: Number(seoHealthScore.toFixed(2)),
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
|
|
5
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
6
|
+
const __dirname = path.dirname(__filename);
|
|
7
|
+
|
|
8
|
+
export const ANALYSIS_LIST_TEMPLATE = fs.readFileSync(path.join(__dirname, 'analysis_list.html'), 'utf-8');
|
|
9
|
+
export const ANALYSIS_PAGE_TEMPLATE = fs.readFileSync(path.join(__dirname, 'analysis_page.html'), 'utf-8');
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import * as dns from 'dns';
|
|
2
2
|
import * as net from 'net';
|
|
3
3
|
import { promisify } from 'util';
|
|
4
|
+
import { Agent } from 'undici';
|
|
4
5
|
|
|
5
6
|
const resolve4 = promisify(dns.resolve4);
|
|
6
7
|
const resolve6 = promisify(dns.resolve6);
|
|
@@ -48,6 +49,15 @@ export class IPGuard {
|
|
|
48
49
|
// fe80::/10 (Link Local)
|
|
49
50
|
if ((firstWord & 0xffc0) === 0xfe80) return true;
|
|
50
51
|
|
|
52
|
+
// IPv4-mapped IPv6: ::ffff:0:0/96
|
|
53
|
+
if (expanded.startsWith('0000:0000:0000:0000:0000:ffff:')) {
|
|
54
|
+
const parts = expanded.split(':');
|
|
55
|
+
const p7 = parseInt(parts[6], 16);
|
|
56
|
+
const p8 = parseInt(parts[7], 16);
|
|
57
|
+
const ip4 = `${(p7 >> 8) & 255}.${p7 & 255}.${(p8 >> 8) & 255}.${p8 & 255}`;
|
|
58
|
+
return IPGuard.isInternal(ip4);
|
|
59
|
+
}
|
|
60
|
+
|
|
51
61
|
return false;
|
|
52
62
|
}
|
|
53
63
|
|
|
@@ -76,11 +86,80 @@ export class IPGuard {
|
|
|
76
86
|
}
|
|
77
87
|
}
|
|
78
88
|
|
|
89
|
+
/**
|
|
90
|
+
* Custom lookup function for undici that validates the resolved IP.
|
|
91
|
+
* Prevents DNS Rebinding attacks by checking the IP immediately before connection.
|
|
92
|
+
*/
|
|
93
|
+
static secureLookup(
|
|
94
|
+
hostname: string,
|
|
95
|
+
options: dns.LookupOneOptions | dns.LookupAllOptions,
|
|
96
|
+
callback: (err: NodeJS.ErrnoException | null, address: string | dns.LookupAddress[], family: number) => void
|
|
97
|
+
): void {
|
|
98
|
+
dns.lookup(hostname, options as any, (err: NodeJS.ErrnoException | null, address: string | dns.LookupAddress[], family: number) => {
|
|
99
|
+
if (err) {
|
|
100
|
+
return callback(err, address as any, family);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
const checkIP = (ip: string) => {
|
|
104
|
+
if (IPGuard.isInternal(ip)) {
|
|
105
|
+
return new Error(`Blocked internal IP: ${ip}`);
|
|
106
|
+
}
|
|
107
|
+
return null;
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
if (typeof address === 'string') {
|
|
111
|
+
const error = checkIP(address);
|
|
112
|
+
if (error) {
|
|
113
|
+
// Return a custom error that undici will propagate
|
|
114
|
+
const blockedError = new Error(`Blocked internal IP: ${address}`);
|
|
115
|
+
(blockedError as any).code = 'EBLOCKED';
|
|
116
|
+
return callback(blockedError, address, family);
|
|
117
|
+
}
|
|
118
|
+
} else if (Array.isArray(address)) {
|
|
119
|
+
// Handle array of addresses (if options.all is true)
|
|
120
|
+
for (const addr of address) {
|
|
121
|
+
const error = checkIP(addr.address);
|
|
122
|
+
if (error) {
|
|
123
|
+
const blockedError = new Error(`Blocked internal IP: ${addr.address}`);
|
|
124
|
+
(blockedError as any).code = 'EBLOCKED';
|
|
125
|
+
return callback(blockedError, address, family);
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
callback(null, address, family);
|
|
131
|
+
});
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
/**
|
|
135
|
+
* Returns an undici Agent configured with secure DNS lookup.
|
|
136
|
+
*/
|
|
137
|
+
static getSecureDispatcher(): Agent {
|
|
138
|
+
return new Agent({
|
|
139
|
+
connect: {
|
|
140
|
+
lookup: IPGuard.secureLookup as any
|
|
141
|
+
}
|
|
142
|
+
});
|
|
143
|
+
}
|
|
144
|
+
|
|
79
145
|
private static expandIPv6(ip: string): string {
|
|
80
146
|
if (ip === '::') return '0000:0000:0000:0000:0000:0000:0000:0000';
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
147
|
+
|
|
148
|
+
let normalizedIp = ip;
|
|
149
|
+
if (ip.includes('.')) {
|
|
150
|
+
const lastColonIndex = ip.lastIndexOf(':');
|
|
151
|
+
const lastPart = ip.substring(lastColonIndex + 1);
|
|
152
|
+
if (net.isIPv4(lastPart)) {
|
|
153
|
+
const parts = lastPart.split('.').map(Number);
|
|
154
|
+
const hex1 = ((parts[0] << 8) | parts[1]).toString(16);
|
|
155
|
+
const hex2 = ((parts[2] << 8) | parts[3]).toString(16);
|
|
156
|
+
normalizedIp = ip.substring(0, lastColonIndex + 1) + hex1 + ':' + hex2;
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
let full = normalizedIp;
|
|
161
|
+
if (normalizedIp.includes('::')) {
|
|
162
|
+
const parts = normalizedIp.split('::');
|
|
84
163
|
const left = parts[0].split(':').filter(x => x !== '');
|
|
85
164
|
const right = parts[1].split(':').filter(x => x !== '');
|
|
86
165
|
const missing = 8 - (left.length + right.length);
|
package/src/crawler/crawl.ts
CHANGED
|
@@ -1,382 +1,9 @@
|
|
|
1
|
-
import { request } from 'undici';
|
|
2
|
-
import pLimit from 'p-limit';
|
|
3
|
-
import chalk from 'chalk';
|
|
4
|
-
import robotsParser from 'robots-parser';
|
|
5
|
-
import { Graph } from '../graph/graph.js';
|
|
6
|
-
import { Fetcher } from './fetcher.js';
|
|
7
|
-
import { Parser } from './parser.js';
|
|
8
|
-
import { Sitemap } from './sitemap.js';
|
|
9
|
-
import { normalizeUrl } from './normalize.js';
|
|
10
|
-
import { TrapDetector } from './trap.js';
|
|
11
|
-
import { ScopeManager } from '../core/scope/scopeManager.js';
|
|
12
|
-
import { getDb } from '../db/index.js';
|
|
13
|
-
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
14
|
-
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
15
|
-
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
16
|
-
import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
|
|
17
|
-
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
18
|
-
import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
|
|
19
|
-
import { analyzeLinks } from '../analysis/links.js';
|
|
1
|
+
import { Crawler, CrawlOptions } from './crawler.js';
|
|
2
|
+
import { EngineContext } from '../events.js';
|
|
20
3
|
|
|
21
|
-
export interface CrawlOptions {
|
|
22
|
-
limit: number;
|
|
23
|
-
depth: number;
|
|
24
|
-
concurrency?: number;
|
|
25
|
-
ignoreRobots?: boolean;
|
|
26
|
-
stripQuery?: boolean;
|
|
27
|
-
previousGraph?: Graph;
|
|
28
|
-
sitemap?: string;
|
|
29
|
-
debug?: boolean;
|
|
30
|
-
detectSoft404?: boolean;
|
|
31
|
-
detectTraps?: boolean;
|
|
32
|
-
rate?: number;
|
|
33
|
-
maxBytes?: number;
|
|
34
|
-
allowedDomains?: string[];
|
|
35
|
-
deniedDomains?: string[];
|
|
36
|
-
includeSubdomains?: boolean;
|
|
37
|
-
proxyUrl?: string;
|
|
38
|
-
maxRedirects?: number;
|
|
39
|
-
userAgent?: string;
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
interface QueueItem {
|
|
43
|
-
url: string;
|
|
44
|
-
depth: number;
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
export async function crawl(startUrl: string, options: CrawlOptions): Promise<number> {
|
|
48
|
-
const visited = new Set<string>();
|
|
49
|
-
const concurrency = Math.min(options.concurrency || 2, 10);
|
|
50
|
-
const limitConcurrency = pLimit(concurrency);
|
|
51
|
-
const trapDetector = new TrapDetector();
|
|
52
|
-
|
|
53
|
-
const db = getDb();
|
|
54
|
-
const siteRepo = new SiteRepository(db);
|
|
55
|
-
const snapshotRepo = new SnapshotRepository(db);
|
|
56
|
-
const pageRepo = new PageRepository(db);
|
|
57
|
-
const edgeRepo = new EdgeRepository(db);
|
|
58
|
-
const metricsRepo = new MetricsRepository(db);
|
|
59
|
-
|
|
60
|
-
const rootUrl = normalizeUrl(startUrl, '', { stripQuery: options.stripQuery });
|
|
61
|
-
if (!rootUrl) throw new Error('Invalid start URL');
|
|
62
|
-
|
|
63
|
-
const urlObj = new URL(rootUrl);
|
|
64
|
-
const domain = urlObj.hostname.replace('www.', '');
|
|
65
|
-
const site = siteRepo.firstOrCreateSite(domain);
|
|
66
|
-
const siteId = site.id;
|
|
67
|
-
|
|
68
|
-
const snapshotId = snapshotRepo.createSnapshot(siteId, options.previousGraph ? 'incremental' : 'full');
|
|
69
|
-
const rootOrigin = urlObj.origin;
|
|
70
|
-
|
|
71
|
-
// DB Helper
|
|
72
|
-
const savePageToDb = (url: string, depth: number, status: number, data: any = {}): number | null => {
|
|
73
|
-
try {
|
|
74
|
-
const existing = pageRepo.getPage(siteId!, url);
|
|
75
|
-
const isSameSnapshot = existing?.last_seen_snapshot_id === snapshotId;
|
|
76
|
-
|
|
77
|
-
return pageRepo.upsertAndGetId({
|
|
78
|
-
site_id: siteId!,
|
|
79
|
-
normalized_url: url,
|
|
80
|
-
depth: isSameSnapshot ? existing.depth : depth,
|
|
81
|
-
http_status: status,
|
|
82
|
-
first_seen_snapshot_id: existing ? existing.first_seen_snapshot_id : snapshotId,
|
|
83
|
-
last_seen_snapshot_id: snapshotId,
|
|
84
|
-
canonical_url: data.canonical !== undefined ? data.canonical : existing?.canonical_url,
|
|
85
|
-
content_hash: data.contentHash !== undefined ? data.contentHash : existing?.content_hash,
|
|
86
|
-
simhash: data.simhash !== undefined ? data.simhash : existing?.simhash,
|
|
87
|
-
etag: data.etag !== undefined ? data.etag : existing?.etag,
|
|
88
|
-
last_modified: data.lastModified !== undefined ? data.lastModified : existing?.last_modified,
|
|
89
|
-
html: data.html !== undefined ? data.html : existing?.html,
|
|
90
|
-
soft404_score: data.soft404Score !== undefined ? data.soft404Score : existing?.soft404_score,
|
|
91
|
-
noindex: data.noindex !== undefined ? (data.noindex ? 1 : 0) : existing?.noindex,
|
|
92
|
-
nofollow: data.nofollow !== undefined ? (data.nofollow ? 1 : 0) : existing?.nofollow,
|
|
93
|
-
security_error: data.securityError !== undefined ? data.securityError : existing?.security_error,
|
|
94
|
-
retries: data.retries !== undefined ? data.retries : existing?.retries
|
|
95
|
-
});
|
|
96
|
-
} catch (e) {
|
|
97
|
-
console.error(`Failed to save page ${url}:`, e);
|
|
98
|
-
return null;
|
|
99
|
-
}
|
|
100
|
-
};
|
|
101
|
-
|
|
102
|
-
const saveEdgeToDb = (sourceUrl: string, targetUrl: string, weight: number = 1.0, rel: string = 'internal') => {
|
|
103
|
-
try {
|
|
104
|
-
const sourceId = pageRepo.getIdByUrl(siteId!, sourceUrl);
|
|
105
|
-
const targetId = pageRepo.getIdByUrl(siteId!, targetUrl);
|
|
106
|
-
if (sourceId && targetId) {
|
|
107
|
-
edgeRepo.insertEdge(snapshotId, sourceId, targetId, weight, rel);
|
|
108
|
-
}
|
|
109
|
-
} catch (e) {
|
|
110
|
-
console.error(`Failed to save edge ${sourceUrl} -> ${targetUrl}:`, e);
|
|
111
|
-
}
|
|
112
|
-
};
|
|
113
|
-
|
|
114
|
-
// Initialize Modules
|
|
115
|
-
const scopeManager = new ScopeManager({
|
|
116
|
-
allowedDomains: options.allowedDomains || [],
|
|
117
|
-
deniedDomains: options.deniedDomains || [],
|
|
118
|
-
includeSubdomains: options.includeSubdomains || false,
|
|
119
|
-
rootUrl: startUrl
|
|
120
|
-
});
|
|
121
|
-
|
|
122
|
-
const fetcher = new Fetcher({
|
|
123
|
-
rate: options.rate,
|
|
124
|
-
proxyUrl: options.proxyUrl,
|
|
125
|
-
scopeManager,
|
|
126
|
-
maxRedirects: options.maxRedirects,
|
|
127
|
-
userAgent: options.userAgent
|
|
128
|
-
});
|
|
129
|
-
|
|
130
|
-
const parser = new Parser();
|
|
131
|
-
const sitemapFetcher = new Sitemap();
|
|
132
|
-
|
|
133
|
-
// Handle robots.txt
|
|
134
|
-
let robots: any = null;
|
|
135
|
-
if (!options.ignoreRobots) {
|
|
136
|
-
try {
|
|
137
|
-
const robotsUrl = new URL('/robots.txt', rootOrigin).toString();
|
|
138
|
-
const res = await request(robotsUrl, {
|
|
139
|
-
maxRedirections: 3,
|
|
140
|
-
headers: { 'User-Agent': 'crawlith/1.0' },
|
|
141
|
-
headersTimeout: 5000,
|
|
142
|
-
bodyTimeout: 5000
|
|
143
|
-
});
|
|
144
|
-
if (res.statusCode >= 200 && res.statusCode < 300) {
|
|
145
|
-
const txt = await res.body.text();
|
|
146
|
-
robots = (robotsParser as any)(robotsUrl, txt);
|
|
147
|
-
} else {
|
|
148
|
-
await res.body.dump();
|
|
149
|
-
}
|
|
150
|
-
} catch {
|
|
151
|
-
console.warn('Failed to fetch robots.txt, proceeding...');
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
// Queue Setup
|
|
156
|
-
const queue: QueueItem[] = [];
|
|
157
|
-
const uniqueQueue = new Set<string>();
|
|
158
|
-
|
|
159
|
-
const addToQueue = (u: string, d: number) => {
|
|
160
|
-
if (scopeManager.isUrlEligible(u) !== 'allowed') return;
|
|
161
|
-
if (!uniqueQueue.has(u)) {
|
|
162
|
-
uniqueQueue.add(u);
|
|
163
|
-
queue.push({ url: u, depth: d });
|
|
164
|
-
}
|
|
165
|
-
};
|
|
166
|
-
|
|
167
|
-
// Seed from Sitemap
|
|
168
|
-
if (options.sitemap) {
|
|
169
|
-
try {
|
|
170
|
-
const sitemapUrl = options.sitemap === 'true' ? new URL('/sitemap.xml', rootOrigin).toString() : options.sitemap;
|
|
171
|
-
if (sitemapUrl.startsWith('http')) {
|
|
172
|
-
console.log(`Fetching sitemap: ${sitemapUrl}`);
|
|
173
|
-
const sitemapUrls = await sitemapFetcher.fetch(sitemapUrl);
|
|
174
|
-
for (const u of sitemapUrls) {
|
|
175
|
-
const normalized = normalizeUrl(u, '', options);
|
|
176
|
-
if (normalized) addToQueue(normalized, 0);
|
|
177
|
-
}
|
|
178
|
-
}
|
|
179
|
-
} catch (e) {
|
|
180
|
-
console.warn('Sitemap fetch failed', e);
|
|
181
|
-
}
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
// Seed from startUrl
|
|
185
|
-
addToQueue(rootUrl, 0);
|
|
186
|
-
|
|
187
|
-
let pagesCrawled = 0;
|
|
188
|
-
let active = 0;
|
|
189
|
-
let reachedLimit = false;
|
|
190
|
-
const maxDepthInCrawl = Math.min(options.depth, 10);
|
|
191
|
-
|
|
192
|
-
const shouldEnqueue = (url: string, depth: number) => {
|
|
193
|
-
if (visited.has(url)) return false;
|
|
194
|
-
if (uniqueQueue.has(url)) return false;
|
|
195
|
-
if (depth > maxDepthInCrawl) return false;
|
|
196
|
-
if (scopeManager.isUrlEligible(url) !== 'allowed') return false;
|
|
197
|
-
|
|
198
|
-
if (options.detectTraps) {
|
|
199
|
-
const trap = trapDetector.checkTrap(url, depth);
|
|
200
|
-
if (trap.risk > 0.8) return false;
|
|
201
|
-
}
|
|
202
|
-
return true;
|
|
203
|
-
};
|
|
204
|
-
|
|
205
|
-
return new Promise((resolve) => {
|
|
206
|
-
const checkDone = () => {
|
|
207
|
-
if (queue.length === 0 && active === 0) {
|
|
208
|
-
snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
|
|
209
|
-
limit_reached: reachedLimit ? 1 : 0
|
|
210
|
-
});
|
|
211
|
-
resolve(snapshotId);
|
|
212
|
-
return true;
|
|
213
|
-
}
|
|
214
|
-
return false;
|
|
215
|
-
};
|
|
216
|
-
|
|
217
|
-
const next = () => {
|
|
218
|
-
if (checkDone()) return;
|
|
219
|
-
if (pagesCrawled >= options.limit) {
|
|
220
|
-
reachedLimit = true;
|
|
221
|
-
if (active === 0) {
|
|
222
|
-
snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
|
|
223
|
-
limit_reached: 1
|
|
224
|
-
});
|
|
225
|
-
resolve(snapshotId);
|
|
226
|
-
}
|
|
227
|
-
return;
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
while (queue.length > 0 && active < concurrency && pagesCrawled < options.limit) {
|
|
231
|
-
const item = queue.shift()!;
|
|
232
|
-
if (visited.has(item.url)) continue;
|
|
233
|
-
if (robots && !robots.isAllowed(item.url, 'crawlith')) continue;
|
|
234
|
-
|
|
235
|
-
active++;
|
|
236
|
-
pagesCrawled++;
|
|
237
|
-
visited.add(item.url);
|
|
238
|
-
|
|
239
|
-
limitConcurrency(() => processPage(item)).finally(() => {
|
|
240
|
-
active--;
|
|
241
|
-
next();
|
|
242
|
-
});
|
|
243
|
-
}
|
|
244
|
-
};
|
|
245
|
-
|
|
246
|
-
const processPage = async (item: QueueItem) => {
|
|
247
|
-
const { url, depth } = item;
|
|
248
|
-
if (scopeManager.isUrlEligible(url) !== 'allowed') {
|
|
249
|
-
savePageToDb(url, depth, 0, { securityError: 'blocked_by_domain_filter' });
|
|
250
|
-
return;
|
|
251
|
-
}
|
|
252
|
-
|
|
253
|
-
const existingInDb = pageRepo.getPage(siteId!, url);
|
|
254
|
-
savePageToDb(url, depth, 0);
|
|
255
|
-
|
|
256
|
-
try {
|
|
257
|
-
const res = await fetcher.fetch(url, {
|
|
258
|
-
etag: existingInDb?.etag || undefined,
|
|
259
|
-
lastModified: existingInDb?.last_modified || undefined,
|
|
260
|
-
maxBytes: options.maxBytes,
|
|
261
|
-
crawlDelay: robots ? robots.getCrawlDelay('crawlith') : undefined
|
|
262
|
-
});
|
|
263
|
-
|
|
264
|
-
if (options.debug) {
|
|
265
|
-
console.log(`${chalk.gray(`[D:${depth}]`)} ${res.status} ${chalk.blue(url)}`);
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
if (res.status === 304) {
|
|
269
|
-
savePageToDb(url, depth, 304);
|
|
270
|
-
metricsRepo.insertMetrics({
|
|
271
|
-
snapshot_id: snapshotId,
|
|
272
|
-
page_id: existingInDb!.id,
|
|
273
|
-
authority_score: null,
|
|
274
|
-
hub_score: null,
|
|
275
|
-
pagerank: null,
|
|
276
|
-
pagerank_score: null,
|
|
277
|
-
link_role: null,
|
|
278
|
-
crawl_status: 'cached',
|
|
279
|
-
word_count: null,
|
|
280
|
-
thin_content_score: null,
|
|
281
|
-
external_link_ratio: null,
|
|
282
|
-
orphan_score: null,
|
|
283
|
-
duplicate_cluster_id: null,
|
|
284
|
-
duplicate_type: null,
|
|
285
|
-
is_cluster_primary: 0
|
|
286
|
-
});
|
|
287
|
-
return;
|
|
288
|
-
}
|
|
289
|
-
|
|
290
|
-
const chain = res.redirectChain;
|
|
291
|
-
for (const step of chain) {
|
|
292
|
-
const source = normalizeUrl(step.url, '', options);
|
|
293
|
-
const target = normalizeUrl(step.target, '', options);
|
|
294
|
-
if (source && target) {
|
|
295
|
-
savePageToDb(source, depth, step.status);
|
|
296
|
-
savePageToDb(target, depth, 0);
|
|
297
|
-
saveEdgeToDb(source, target);
|
|
298
|
-
}
|
|
299
|
-
}
|
|
300
|
-
|
|
301
|
-
const finalUrl = normalizeUrl(res.finalUrl, '', options);
|
|
302
|
-
if (!finalUrl) return;
|
|
303
|
-
|
|
304
|
-
const isStringStatus = typeof res.status === 'string';
|
|
305
|
-
if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
|
|
306
|
-
savePageToDb(finalUrl, depth, typeof res.status === 'number' ? res.status : 0, {
|
|
307
|
-
securityError: isStringStatus ? res.status : undefined,
|
|
308
|
-
retries: res.retries
|
|
309
|
-
});
|
|
310
|
-
return;
|
|
311
|
-
}
|
|
312
|
-
|
|
313
|
-
if (res.status === 200) {
|
|
314
|
-
const contentTypeHeader = res.headers['content-type'];
|
|
315
|
-
const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
|
|
316
|
-
if (!contentType || !contentType.toLowerCase().includes('text/html')) {
|
|
317
|
-
savePageToDb(finalUrl, depth, res.status);
|
|
318
|
-
return;
|
|
319
|
-
}
|
|
320
|
-
|
|
321
|
-
savePageToDb(finalUrl, depth, res.status);
|
|
322
|
-
const parseResult = parser.parse(res.body, finalUrl, res.status);
|
|
323
|
-
|
|
324
|
-
const pageId = savePageToDb(finalUrl, depth, res.status, {
|
|
325
|
-
html: parseResult.html,
|
|
326
|
-
canonical: parseResult.canonical || undefined,
|
|
327
|
-
noindex: parseResult.noindex,
|
|
328
|
-
nofollow: parseResult.nofollow,
|
|
329
|
-
contentHash: parseResult.contentHash,
|
|
330
|
-
simhash: parseResult.simhash,
|
|
331
|
-
soft404Score: parseResult.soft404Score,
|
|
332
|
-
etag: res.etag,
|
|
333
|
-
lastModified: res.lastModified,
|
|
334
|
-
retries: res.retries
|
|
335
|
-
});
|
|
336
|
-
|
|
337
|
-
if (pageId) {
|
|
338
|
-
try {
|
|
339
|
-
const contentAnalysis = analyzeContent(parseResult.html);
|
|
340
|
-
const linkAnalysis = analyzeLinks(parseResult.html, finalUrl, rootOrigin);
|
|
341
|
-
const thinScore = calculateThinContentScore(contentAnalysis, 0);
|
|
342
|
-
|
|
343
|
-
metricsRepo.insertMetrics({
|
|
344
|
-
snapshot_id: snapshotId,
|
|
345
|
-
page_id: pageId,
|
|
346
|
-
authority_score: null,
|
|
347
|
-
hub_score: null,
|
|
348
|
-
pagerank: null,
|
|
349
|
-
pagerank_score: null,
|
|
350
|
-
link_role: null,
|
|
351
|
-
crawl_status: 'fetched',
|
|
352
|
-
word_count: contentAnalysis.wordCount,
|
|
353
|
-
thin_content_score: thinScore,
|
|
354
|
-
external_link_ratio: linkAnalysis.externalRatio,
|
|
355
|
-
orphan_score: null,
|
|
356
|
-
duplicate_cluster_id: null,
|
|
357
|
-
duplicate_type: null,
|
|
358
|
-
is_cluster_primary: 0
|
|
359
|
-
});
|
|
360
|
-
} catch (e) {
|
|
361
|
-
console.error(`Error calculating per-page metrics for ${finalUrl}:`, e);
|
|
362
|
-
}
|
|
363
|
-
}
|
|
4
|
+
export { CrawlOptions };
|
|
364
5
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
savePageToDb(normalizedLink, depth + 1, 0);
|
|
369
|
-
saveEdgeToDb(finalUrl, normalizedLink, 1.0, 'internal');
|
|
370
|
-
if (shouldEnqueue(normalizedLink, depth + 1)) {
|
|
371
|
-
addToQueue(normalizedLink, depth + 1);
|
|
372
|
-
}
|
|
373
|
-
}
|
|
374
|
-
}
|
|
375
|
-
}
|
|
376
|
-
} catch (e) {
|
|
377
|
-
console.error(`Error processing ${url}:`, e);
|
|
378
|
-
}
|
|
379
|
-
};
|
|
380
|
-
next();
|
|
381
|
-
});
|
|
6
|
+
export async function crawl(startUrl: string, options: CrawlOptions, context?: EngineContext): Promise<number> {
|
|
7
|
+
const crawler = new Crawler(startUrl, options, context);
|
|
8
|
+
return crawler.run();
|
|
382
9
|
}
|