@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import * as dns from 'dns';
|
|
2
|
+
import * as net from 'net';
|
|
3
|
+
import { promisify } from 'util';
|
|
4
|
+
const resolve4 = promisify(dns.resolve4);
const resolve6 = promisify(dns.resolve6);
/**
 * SSRF guard: classifies IP addresses as internal/private and validates
 * hostnames (literal or DNS-resolved) before the crawler connects to them.
 */
export class IPGuard {
    /**
     * Checks if an IP address is internal/private.
     *
     * IPv4: loopback (127/8), RFC 1918 private ranges, link-local (169.254/16)
     * and 0.0.0.0/8. IPv6: loopback, unspecified, unique-local (fc00::/7),
     * link-local (fe80::/10), plus IPv4-mapped / IPv4-compatible addresses whose
     * embedded IPv4 address is itself internal.
     *
     * @param {string} ip - Textual IPv4 or IPv6 address (no brackets).
     * @returns {boolean} true when the address must be blocked; anything that
     *   is not a valid IP is also reported as internal (fail closed).
     */
    static isInternal(ip) {
        if (net.isIPv4(ip)) {
            return IPGuard.isInternalIPv4Octets(ip.split('.').map(Number));
        }
        if (net.isIPv6(ip)) {
            // Normalize to eight numeric 16-bit words
            const words = IPGuard.expandIPv6(ip).split(':').map(group => parseInt(group, 16));
            if (words.length !== 8 || words.some(word => Number.isNaN(word))) {
                return true; // Could not normalize, block it for safety
            }
            // ::/96 (covers ::, ::1 and IPv4-compatible) and ::ffff:0:0/96 (IPv4-mapped):
            // the last 32 bits carry an IPv4 address, so judge that address instead.
            // Without this, ::ffff:127.0.0.1 would be classified as public.
            const highWordsAreZero = words.slice(0, 5).every(word => word === 0);
            if (highWordsAreZero && (words[5] === 0 || words[5] === 0xffff)) {
                return IPGuard.isInternalIPv4Octets([
                    words[6] >> 8,
                    words[6] & 0xff,
                    words[7] >> 8,
                    words[7] & 0xff
                ]);
            }
            const firstWord = words[0];
            // fc00::/7 (Unique Local Address) -> fc or fd
            if ((firstWord & 0xfe00) === 0xfc00)
                return true;
            // fe80::/10 (Link Local)
            if ((firstWord & 0xffc0) === 0xfe80)
                return true;
            return false;
        }
        return true; // Unknown format, block it for safety
    }
    /**
     * Internal helper: applies the IPv4 block list to four numeric octets.
     *
     * @param {number[]} parts - The four octets of an IPv4 address.
     * @returns {boolean} true when the address falls in a blocked range.
     */
    static isInternalIPv4Octets(parts) {
        // 127.0.0.0/8
        if (parts[0] === 127)
            return true;
        // 10.0.0.0/8
        if (parts[0] === 10)
            return true;
        // 192.168.0.0/16
        if (parts[0] === 192 && parts[1] === 168)
            return true;
        // 172.16.0.0 – 172.31.255.255
        if (parts[0] === 172 && parts[1] >= 16 && parts[1] <= 31)
            return true;
        // 169.254.0.0/16
        if (parts[0] === 169 && parts[1] === 254)
            return true;
        // 0.0.0.0/8
        if (parts[0] === 0)
            return true;
        return false;
    }
    /**
     * Resolves a hostname and validates all result IPs.
     *
     * @param {string} host - Hostname or IP literal. IPv6 literals may be wrapped
     *   in brackets, which is how `URL.hostname` serializes them.
     * @returns {Promise<boolean>} true when the host may be contacted. A host with
     *   no A/AAAA records yields true so the fetcher can report the DNS failure.
     */
    static async validateHost(host) {
        // `new URL('http://[::1]/').hostname` is '[::1]'; net.isIP rejects the
        // bracketed form, which previously let IPv6 literals skip the check.
        const bareHost = host.startsWith('[') && host.endsWith(']') ? host.slice(1, -1) : host;
        if (net.isIP(bareHost)) {
            return !IPGuard.isInternal(bareHost);
        }
        try {
            // The two lookups are independent, so run them concurrently.
            const [res4, res6] = await Promise.all([
                resolve4(bareHost).catch(() => []),
                resolve6(bareHost).catch(() => [])
            ]);
            const ips = [...res4, ...res6];
            if (ips.length === 0)
                return true; // Let the fetcher handle DNS failures
            return ips.every(ip => !IPGuard.isInternal(ip));
        }
        catch (_e) {
            // If resolution fails drastically, block for safety
            return false;
        }
    }
    /**
     * Expands an IPv6 address to eight zero-padded 4-digit hex groups.
     *
     * A zone id (`%eth0`) is dropped and a trailing dotted-quad IPv4 part
     * (`::ffff:10.0.0.1`) is converted to two hex groups first, so the result
     * always has the canonical eight-group shape for valid input.
     *
     * @param {string} ip - IPv6 address, possibly using `::` compression.
     * @returns {string} Fully expanded address; hex digit case is not changed.
     */
    static expandIPv6(ip) {
        let address = ip.split('%')[0];
        const lastColon = address.lastIndexOf(':');
        const tail = address.slice(lastColon + 1);
        if (net.isIPv4(tail)) {
            const [a, b, c, d] = tail.split('.').map(Number);
            const high = ((a << 8) | b).toString(16);
            const low = ((c << 8) | d).toString(16);
            address = `${address.slice(0, lastColon + 1)}${high}:${low}`;
        }
        if (address === '::')
            return '0000:0000:0000:0000:0000:0000:0000:0000';
        let full = address;
        if (address.includes('::')) {
            const parts = address.split('::');
            const left = parts[0].split(':').filter(x => x !== '');
            const right = parts[1].split(':').filter(x => x !== '');
            // Clamp so malformed input cannot trigger a RangeError from Array()
            const missing = Math.max(8 - (left.length + right.length), 0);
            const middle = Array(missing).fill('0000');
            full = [...left, ...middle, ...right].join(':');
        }
        return full.split(':').map(part => part.padStart(4, '0')).join(':');
    }
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { Graph } from '../graph/graph.js';
|
|
2
|
+
/**
 * Options accepted by `crawl()`.
 */
export interface CrawlOptions {
    /** Maximum number of pages to crawl; no new pages are scheduled once this count is reached. */
    limit: number;
    /** Maximum link depth to enqueue; the crawler caps this value at 10. */
    depth: number;
    /** Number of pages processed in parallel; defaults to 2 and is capped at 10. */
    concurrency?: number;
    /** When true, robots.txt is not fetched and its rules are not applied. */
    ignoreRobots?: boolean;
    /** Forwarded to URL normalization (presumably drops query strings; confirm in normalize.js). */
    stripQuery?: boolean;
    /** When provided, the snapshot is recorded as 'incremental' instead of 'full'. */
    previousGraph?: Graph;
    /**
     * Sitemap used to seed the queue: the string 'true' means `/sitemap.xml` on the
     * start URL's origin; any other value is used as the sitemap URL if it starts with 'http'.
     */
    sitemap?: string;
    /** When true, logs depth, status and URL for each fetched page. */
    debug?: boolean;
    /** Soft-404 detection toggle. */
    detectSoft404?: boolean;
    /** When true, links the trap detector rates above 0.8 risk are not enqueued. */
    detectTraps?: boolean;
    /** Request rate handed to the fetcher's rate limiter (the fetcher falls back to 2). */
    rate?: number;
    /** Per-response byte limit handed to the fetcher (the fetcher falls back to 2000000). */
    maxBytes?: number;
    /** Domains handed to the scope manager as allowed. */
    allowedDomains?: string[];
    /** Domains handed to the scope manager as denied. */
    deniedDomains?: string[];
    /** Whether the scope manager should include subdomains; defaults to false. */
    includeSubdomains?: boolean;
    /** Proxy URL handed to the fetcher. */
    proxyUrl?: string;
    /** Redirect limit handed to the fetcher (the fetcher defaults to 2 and caps at 11). */
    maxRedirects?: number;
    /** User-Agent used by the fetcher; defaults to `crawlith/<version>`. */
    userAgent?: string;
}
|
|
22
|
+
/**
 * Crawls a site starting at `startUrl` and persists pages, edges and per-page
 * metrics to the database under a new snapshot.
 *
 * @param startUrl - URL to start from; an error is thrown if it cannot be normalized.
 * @param options - Crawl limits, scope and fetch settings.
 * @returns Promise resolving to the id of the snapshot created for this crawl.
 */
export declare function crawl(startUrl: string, options: CrawlOptions): Promise<number>;
|
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
import { request } from 'undici';
|
|
2
|
+
import pLimit from 'p-limit';
|
|
3
|
+
import chalk from 'chalk';
|
|
4
|
+
import robotsParser from 'robots-parser';
|
|
5
|
+
import { Fetcher } from './fetcher.js';
|
|
6
|
+
import { Parser } from './parser.js';
|
|
7
|
+
import { Sitemap } from './sitemap.js';
|
|
8
|
+
import { normalizeUrl } from './normalize.js';
|
|
9
|
+
import { TrapDetector } from './trap.js';
|
|
10
|
+
import { ScopeManager } from '../core/scope/scopeManager.js';
|
|
11
|
+
import { getDb } from '../db/index.js';
|
|
12
|
+
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
13
|
+
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
14
|
+
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
15
|
+
import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
|
|
16
|
+
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
17
|
+
import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
|
|
18
|
+
import { analyzeLinks } from '../analysis/links.js';
|
|
19
|
+
/**
 * Crawls a site starting at `startUrl`, storing pages, edges and per-page
 * metrics in the database under a freshly created snapshot.
 *
 * Work is pulled from a FIFO queue seeded with the start URL (and optionally a
 * sitemap). Up to `concurrency` pages are processed at once until the queue is
 * drained or `options.limit` pages have been scheduled.
 *
 * @param {string} startUrl - URL to start crawling from.
 * @param {object} options - Crawl options (limit, depth, concurrency, scope, fetch settings).
 * @returns {Promise<number>} Resolves with the snapshot id once the crawl has finished.
 * @throws {Error} 'Invalid start URL' when `startUrl` cannot be normalized.
 */
export async function crawl(startUrl, options) {
    const visited = new Set();
    const concurrency = Math.min(options.concurrency || 2, 10);
    const limitConcurrency = pLimit(concurrency);
    const trapDetector = new TrapDetector();
    const db = getDb();
    const siteRepo = new SiteRepository(db);
    const snapshotRepo = new SnapshotRepository(db);
    const pageRepo = new PageRepository(db);
    const edgeRepo = new EdgeRepository(db);
    const metricsRepo = new MetricsRepository(db);
    const rootUrl = normalizeUrl(startUrl, '', { stripQuery: options.stripQuery });
    if (!rootUrl)
        throw new Error('Invalid start URL');
    const urlObj = new URL(rootUrl);
    // Only strip a leading "www." label; an unanchored replace would also
    // mangle hosts such as "awww.example.com".
    const domain = urlObj.hostname.replace(/^www\./, '');
    const site = siteRepo.firstOrCreateSite(domain);
    const siteId = site.id;
    const snapshotId = snapshotRepo.createSnapshot(siteId, options.previousGraph ? 'incremental' : 'full');
    const rootOrigin = urlObj.origin;
    // DB Helper: upserts a page row, keeping previously stored values for any
    // field not supplied in `data`. Returns the page id, or null on failure.
    const savePageToDb = (url, depth, status, data = {}) => {
        try {
            const existing = pageRepo.getPage(siteId, url);
            const isSameSnapshot = existing?.last_seen_snapshot_id === snapshotId;
            return pageRepo.upsertAndGetId({
                site_id: siteId,
                normalized_url: url,
                // Keep the depth recorded when the page was first seen in this snapshot
                depth: isSameSnapshot ? existing.depth : depth,
                http_status: status,
                first_seen_snapshot_id: existing ? existing.first_seen_snapshot_id : snapshotId,
                last_seen_snapshot_id: snapshotId,
                canonical_url: data.canonical !== undefined ? data.canonical : existing?.canonical_url,
                content_hash: data.contentHash !== undefined ? data.contentHash : existing?.content_hash,
                simhash: data.simhash !== undefined ? data.simhash : existing?.simhash,
                etag: data.etag !== undefined ? data.etag : existing?.etag,
                last_modified: data.lastModified !== undefined ? data.lastModified : existing?.last_modified,
                html: data.html !== undefined ? data.html : existing?.html,
                soft404_score: data.soft404Score !== undefined ? data.soft404Score : existing?.soft404_score,
                noindex: data.noindex !== undefined ? (data.noindex ? 1 : 0) : existing?.noindex,
                nofollow: data.nofollow !== undefined ? (data.nofollow ? 1 : 0) : existing?.nofollow,
                security_error: data.securityError !== undefined ? data.securityError : existing?.security_error,
                retries: data.retries !== undefined ? data.retries : existing?.retries
            });
        }
        catch (e) {
            console.error(`Failed to save page ${url}:`, e);
            return null;
        }
    };
    // DB Helper: records an edge when both endpoints already have page rows.
    const saveEdgeToDb = (sourceUrl, targetUrl, weight = 1.0, rel = 'internal') => {
        try {
            const sourceId = pageRepo.getIdByUrl(siteId, sourceUrl);
            const targetId = pageRepo.getIdByUrl(siteId, targetUrl);
            if (sourceId && targetId) {
                edgeRepo.insertEdge(snapshotId, sourceId, targetId, weight, rel);
            }
        }
        catch (e) {
            console.error(`Failed to save edge ${sourceUrl} -> ${targetUrl}:`, e);
        }
    };
    // Metrics row with every graph-level score left empty; crawl-time values
    // are supplied through `overrides`.
    const buildMetricsRow = (pageId, crawlStatus, overrides = {}) => ({
        snapshot_id: snapshotId,
        page_id: pageId,
        authority_score: null,
        hub_score: null,
        pagerank: null,
        pagerank_score: null,
        link_role: null,
        crawl_status: crawlStatus,
        word_count: null,
        thin_content_score: null,
        external_link_ratio: null,
        orphan_score: null,
        duplicate_cluster_id: null,
        duplicate_type: null,
        is_cluster_primary: 0,
        ...overrides
    });
    // Initialize Modules
    const scopeManager = new ScopeManager({
        allowedDomains: options.allowedDomains || [],
        deniedDomains: options.deniedDomains || [],
        includeSubdomains: options.includeSubdomains || false,
        rootUrl: startUrl
    });
    const fetcher = new Fetcher({
        rate: options.rate,
        proxyUrl: options.proxyUrl,
        scopeManager,
        maxRedirects: options.maxRedirects,
        userAgent: options.userAgent
    });
    const parser = new Parser();
    const sitemapFetcher = new Sitemap();
    // Handle robots.txt
    let robots = null;
    if (!options.ignoreRobots) {
        try {
            const robotsUrl = new URL('/robots.txt', rootOrigin).toString();
            const res = await request(robotsUrl, {
                maxRedirections: 3,
                headers: { 'User-Agent': 'crawlith/1.0' },
                headersTimeout: 5000,
                bodyTimeout: 5000
            });
            if (res.statusCode >= 200 && res.statusCode < 300) {
                const txt = await res.body.text();
                robots = robotsParser(robotsUrl, txt);
            }
            else {
                // Drain the unused body
                await res.body.dump();
            }
        }
        catch {
            console.warn('Failed to fetch robots.txt, proceeding...');
        }
    }
    // Queue Setup
    const queue = [];
    const uniqueQueue = new Set();
    const addToQueue = (u, d) => {
        if (scopeManager.isUrlEligible(u) !== 'allowed')
            return;
        if (!uniqueQueue.has(u)) {
            uniqueQueue.add(u);
            queue.push({ url: u, depth: d });
        }
    };
    // Seed from Sitemap
    if (options.sitemap) {
        try {
            const sitemapUrl = options.sitemap === 'true' ? new URL('/sitemap.xml', rootOrigin).toString() : options.sitemap;
            if (sitemapUrl.startsWith('http')) {
                console.log(`Fetching sitemap: ${sitemapUrl}`);
                const sitemapUrls = await sitemapFetcher.fetch(sitemapUrl);
                for (const u of sitemapUrls) {
                    const normalized = normalizeUrl(u, '', options);
                    if (normalized)
                        addToQueue(normalized, 0);
                }
            }
        }
        catch (e) {
            console.warn('Sitemap fetch failed', e);
        }
    }
    // Seed from startUrl
    addToQueue(rootUrl, 0);
    let pagesCrawled = 0;
    let active = 0;
    let reachedLimit = false;
    const maxDepthInCrawl = Math.min(options.depth, 10);
    const shouldEnqueue = (url, depth) => {
        if (visited.has(url))
            return false;
        if (uniqueQueue.has(url))
            return false;
        if (depth > maxDepthInCrawl)
            return false;
        if (scopeManager.isUrlEligible(url) !== 'allowed')
            return false;
        if (options.detectTraps) {
            const trap = trapDetector.checkTrap(url, depth);
            if (trap.risk > 0.8)
                return false;
        }
        return true;
    };
    return new Promise((resolve) => {
        let finished = false;
        // Marks the snapshot completed and resolves exactly once.
        const finish = (limitReached) => {
            if (finished)
                return;
            finished = true;
            snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
                limit_reached: limitReached ? 1 : 0
            });
            resolve(snapshotId);
        };
        const checkDone = () => {
            if (queue.length === 0 && active === 0) {
                finish(reachedLimit);
                return true;
            }
            return false;
        };
        const next = () => {
            if (checkDone())
                return;
            if (pagesCrawled >= options.limit) {
                reachedLimit = true;
                if (active === 0) {
                    finish(true);
                }
                return;
            }
            while (queue.length > 0 && active < concurrency && pagesCrawled < options.limit) {
                const item = queue.shift();
                if (visited.has(item.url))
                    continue;
                if (robots && !robots.isAllowed(item.url, 'crawlith'))
                    continue;
                active++;
                pagesCrawled++;
                visited.add(item.url);
                limitConcurrency(() => processPage(item)).finally(() => {
                    active--;
                    next();
                });
            }
            // If every remaining queue item was skipped (already visited or
            // disallowed by robots.txt) no task is left to call next() again,
            // so completion has to be detected here or the promise never settles.
            checkDone();
        };
        const processPage = async (item) => {
            const { url, depth } = item;
            if (scopeManager.isUrlEligible(url) !== 'allowed') {
                savePageToDb(url, depth, 0, { securityError: 'blocked_by_domain_filter' });
                return;
            }
            // Read the stored row before the placeholder save so its etag /
            // last-modified values can drive a conditional request.
            const existingInDb = pageRepo.getPage(siteId, url);
            savePageToDb(url, depth, 0);
            try {
                const res = await fetcher.fetch(url, {
                    etag: existingInDb?.etag || undefined,
                    lastModified: existingInDb?.last_modified || undefined,
                    maxBytes: options.maxBytes,
                    crawlDelay: robots ? robots.getCrawlDelay('crawlith') : undefined
                });
                if (options.debug) {
                    console.log(`${chalk.gray(`[D:${depth}]`)} ${res.status} ${chalk.blue(url)}`);
                }
                if (res.status === 304) {
                    savePageToDb(url, depth, 304);
                    // A 304 without a stored row has no page id to attach metrics to.
                    if (existingInDb) {
                        metricsRepo.insertMetrics(buildMetricsRow(existingInDb.id, 'cached'));
                    }
                    return;
                }
                const chain = res.redirectChain;
                for (const step of chain) {
                    const source = normalizeUrl(step.url, '', options);
                    const target = normalizeUrl(step.target, '', options);
                    if (source && target) {
                        savePageToDb(source, depth, step.status);
                        savePageToDb(target, depth, 0);
                        saveEdgeToDb(source, target);
                    }
                }
                const finalUrl = normalizeUrl(res.finalUrl, '', options);
                if (!finalUrl)
                    return;
                // String statuses are fetcher error codes, not HTTP status codes
                const isStringStatus = typeof res.status === 'string';
                if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
                    savePageToDb(finalUrl, depth, typeof res.status === 'number' ? res.status : 0, {
                        securityError: isStringStatus ? res.status : undefined,
                        retries: res.retries
                    });
                    return;
                }
                if (res.status === 200) {
                    const contentTypeHeader = res.headers['content-type'];
                    const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
                    if (!contentType || !contentType.toLowerCase().includes('text/html')) {
                        // Non-HTML responses are recorded but not parsed
                        savePageToDb(finalUrl, depth, res.status);
                        return;
                    }
                    savePageToDb(finalUrl, depth, res.status);
                    const parseResult = parser.parse(res.body, finalUrl, res.status);
                    const pageId = savePageToDb(finalUrl, depth, res.status, {
                        html: parseResult.html,
                        canonical: parseResult.canonical || undefined,
                        noindex: parseResult.noindex,
                        nofollow: parseResult.nofollow,
                        contentHash: parseResult.contentHash,
                        simhash: parseResult.simhash,
                        soft404Score: parseResult.soft404Score,
                        etag: res.etag,
                        lastModified: res.lastModified,
                        retries: res.retries
                    });
                    if (pageId) {
                        try {
                            const contentAnalysis = analyzeContent(parseResult.html);
                            const linkAnalysis = analyzeLinks(parseResult.html, finalUrl, rootOrigin);
                            const thinScore = calculateThinContentScore(contentAnalysis, 0);
                            metricsRepo.insertMetrics(buildMetricsRow(pageId, 'fetched', {
                                word_count: contentAnalysis.wordCount,
                                thin_content_score: thinScore,
                                external_link_ratio: linkAnalysis.externalRatio
                            }));
                        }
                        catch (e) {
                            console.error(`Error calculating per-page metrics for ${finalUrl}:`, e);
                        }
                    }
                    for (const linkItem of parseResult.links) {
                        const normalizedLink = normalizeUrl(linkItem.url, '', options);
                        if (normalizedLink && normalizedLink !== finalUrl) {
                            savePageToDb(normalizedLink, depth + 1, 0);
                            saveEdgeToDb(finalUrl, normalizedLink, 1.0, 'internal');
                            if (shouldEnqueue(normalizedLink, depth + 1)) {
                                addToQueue(normalizedLink, depth + 1);
                            }
                        }
                    }
                }
            }
            catch (e) {
                console.error(`Error processing ${url}:`, e);
            }
        };
        next();
    });
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import * as cheerio from 'cheerio';
|
|
2
|
+
/**
 * Collects the distinct http(s) link targets of every `<a>` element in an
 * HTML document, resolved against `baseUrl` and stripped of hash fragments.
 *
 * @param {string} html - HTML markup to scan.
 * @param {string} baseUrl - Base used to resolve relative hrefs.
 * @returns {string[]} Unique absolute URLs; an empty array if extraction fails.
 */
export function extractLinks(html, baseUrl) {
    // Resolves one href to a crawlable absolute URL, or null when it must be skipped.
    const toCrawlableUrl = (rawHref) => {
        let resolved;
        try {
            resolved = new URL(rawHref, baseUrl);
        }
        catch (_e) {
            // Invalid URL, skip
            return null;
        }
        const isWeb = resolved.protocol === 'http:' || resolved.protocol === 'https:';
        if (!isWeb) {
            return null;
        }
        // Fragments never change which document is fetched
        resolved.hash = '';
        return resolved.toString();
    };
    try {
        const $ = cheerio.load(html);
        const found = new Set();
        $('a').each((_index, anchor) => {
            const rawHref = $(anchor).attr('href');
            if (!rawHref) {
                return;
            }
            const link = toCrawlableUrl(rawHref);
            if (link !== null) {
                found.add(link);
            }
        });
        return [...found];
    }
    catch (e) {
        console.error(`Error extracting links from ${baseUrl}:`, e);
        return [];
    }
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { ScopeManager } from '../core/scope/scopeManager.js';
|
|
2
|
+
/**
 * One hop of a redirect chain followed by the fetcher.
 */
export interface RedirectStep {
    /** URL that answered with a redirect. */
    url: string;
    /** HTTP status returned for `url`. */
    status: number;
    /** URL the redirect pointed to. */
    target: string;
}
|
|
7
|
+
/**
 * Outcome of `Fetcher.fetch`.
 */
export interface FetchResult {
    /**
     * HTTP status code of the final response, or a string error code when the
     * request was blocked or failed before a usable response was obtained.
     */
    status:
        | number
        | 'blocked_internal_ip'
        | 'blocked_by_domain_filter'
        | 'blocked_subdomain'
        | 'oversized'
        | 'failed_after_retries'
        | 'network_error'
        | 'redirect_limit_exceeded'
        | 'redirect_loop'
        | 'proxy_connection_failed';
    /** Response headers; a header may hold a single value, several values, or be absent. */
    headers: Record<string, string | string[] | undefined>;
    /** Response body as text. */
    body: string;
    /** Redirect hops followed before the final response. */
    redirectChain: RedirectStep[];
    /** Value of the `etag` response header, or null when absent. */
    etag: string | null;
    /** Value of the `last-modified` response header, or null when absent. */
    lastModified: string | null;
    /** URL the result refers to after following redirects. */
    finalUrl: string;
    /** Number of retry attempts made. */
    retries?: number;
    /** Byte count of the response (presumably the bytes read from the body; confirm in fetcher.js). */
    bytesReceived?: number;
}
|
|
18
|
+
/**
 * Per-request options for `Fetcher.fetch`.
 */
export interface FetchOptions {
    /** Sent as `If-None-Match` on the first request of a redirect chain. */
    etag?: string;
    /** Sent as `If-Modified-Since` on the first request of a redirect chain. */
    lastModified?: string;
    /** Per-request rate setting (usage not confirmed from the declaration; see fetcher.js). */
    rate?: number;
    /** Byte limit for the response; the fetcher falls back to 2000000 when unset. */
    maxBytes?: number;
    /** Crawl delay handed to the rate limiter together with the target hostname. */
    crawlDelay?: number;
}
|
|
25
|
+
/**
 * HTTP fetcher used by the crawler. Each request passes an internal-IP (SSRF)
 * check, an optional scope check and rate limiting before it is sent.
 */
export declare class Fetcher {
    private userAgent;
    private rateLimiter;
    private proxyAdapter;
    private scopeManager?;
    private maxRedirects;
    /**
     * @param options.rate - Rate for the rate limiter; falls back to 2.
     * @param options.proxyUrl - Proxy URL handed to the proxy adapter.
     * @param options.scopeManager - When set, every URL in a redirect chain must be eligible.
     * @param options.maxRedirects - Redirect limit; defaults to 2 and is capped at 11.
     * @param options.userAgent - User-Agent header; defaults to `crawlith/<version>`.
     */
    constructor(options?: {
        rate?: number;
        proxyUrl?: string;
        scopeManager?: ScopeManager;
        maxRedirects?: number;
        userAgent?: string;
    });
    /**
     * Fetches `url` with a GET request.
     *
     * @param url - Absolute URL to request.
     * @param options - Conditional-request values and limits for this request.
     * @returns The fetch outcome; blocked or failed requests are reported through
     *   a string `status`.
     */
    fetch(url: string, options?: FetchOptions): Promise<FetchResult>;
    private errorResult;
}
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import { request } from 'undici';
|
|
2
|
+
import { IPGuard } from '../core/security/ipGuard.js';
|
|
3
|
+
import { RateLimiter } from '../core/network/rateLimiter.js';
|
|
4
|
+
import { RetryPolicy } from '../core/network/retryPolicy.js';
|
|
5
|
+
import { ResponseLimiter } from '../core/network/responseLimiter.js';
|
|
6
|
+
import { RedirectController } from '../core/network/redirectController.js';
|
|
7
|
+
import { ProxyAdapter } from '../core/network/proxyAdapter.js';
|
|
8
|
+
import { version } from '../utils/version.js';
|
|
9
|
+
/**
 * HTTP GET client used by the crawler.
 *
 * Every hop of a request (the initial URL and each redirect target) goes
 * through the same pipeline: SSRF guard, optional scope check, per-host rate
 * limiting, then a retried `undici` request. Redirects are followed manually
 * (`maxRedirections: 0`) so that each target is re-validated before it is
 * requested.
 *
 * Expected failures are reported through the `status` field of the returned
 * result (for example `'blocked_internal_ip'`, `'oversized'`,
 * `'network_error'`) instead of being thrown.
 */
export class Fetcher {
    userAgent = 'crawlith/1.0';
    rateLimiter;
    proxyAdapter;
    scopeManager;
    maxRedirects;
    /**
     * @param {object} [options]
     * @param {number} [options.rate] - Value handed to `RateLimiter`; falls back to 2 when falsy.
     * @param {string} [options.proxyUrl] - Handed to `ProxyAdapter`.
     * @param {object} [options.scopeManager] - Optional; must expose `isUrlEligible(url)`.
     * @param {number} [options.maxRedirects] - Defaults to 2 and is capped at 11.
     * @param {string} [options.userAgent] - Defaults to `crawlith/<package version>`.
     */
    constructor(options = {}) {
        this.rateLimiter = new RateLimiter(options.rate || 2);
        this.proxyAdapter = new ProxyAdapter(options.proxyUrl);
        this.scopeManager = options.scopeManager;
        this.maxRedirects = Math.min(options.maxRedirects ?? 2, 11);
        this.userAgent = options.userAgent || `crawlith/${version}`;
    }
    /**
     * Fetches `url`, following redirects manually.
     *
     * @param {string} url - Absolute URL. It is parsed with `new URL` before
     *   the guarded section, so an unparseable URL rejects the returned promise.
     * @param {object} [options]
     * @param {number} [options.maxBytes] - Body size limit; falls back to 2000000 when falsy.
     * @param {string} [options.etag] - Sent as `If-None-Match` on the first hop only.
     * @param {string} [options.lastModified] - Sent as `If-Modified-Since` on the first hop only.
     * @param {number} [options.crawlDelay] - Forwarded to the rate limiter.
     * @returns {Promise<object>} Result with `status`, `headers`, `body`,
     *   `redirectChain`, `etag`, `lastModified`, `finalUrl`, `retries` and,
     *   when a response body was processed, `bytesReceived`.
     */
    async fetch(url, options = {}) {
        const maxBytes = options.maxBytes || 2000000;
        const redirectChain = [];
        const redirectController = new RedirectController(this.maxRedirects, url);
        let currentUrl = url;
        let totalRetries = 0;
        // Each iteration handles one hop; a redirect `continue`s with the new
        // target, every other outcome returns.
        while (true) {
            const urlObj = new URL(currentUrl);
            // 1. SSRF Guard
            const isSafe = await IPGuard.validateHost(urlObj.hostname);
            if (!isSafe) {
                return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
            }
            // 2. Scope Validation (Domain & Subdomain)
            if (this.scopeManager) {
                const eligibility = this.scopeManager.isUrlEligible(currentUrl);
                if (eligibility !== 'allowed') {
                    return this.errorResult(eligibility, currentUrl, redirectChain, totalRetries);
                }
            }
            // 3. Rate Limiting
            await this.rateLimiter.waitForToken(urlObj.hostname, options.crawlDelay);
            try {
                // 4. Retry Strategy
                const result = await RetryPolicy.execute(async (attempt) => {
                    if (attempt > 0) {
                        totalRetries++;
                    }
                    const headers = {
                        'User-Agent': this.userAgent
                    };
                    // Conditional GET only for the FIRST request in a chain
                    if (redirectChain.length === 0) {
                        if (options.etag) {
                            headers['If-None-Match'] = options.etag;
                        }
                        if (options.lastModified) {
                            headers['If-Modified-Since'] = options.lastModified;
                        }
                    }
                    const res = await request(currentUrl, {
                        method: 'GET',
                        headers,
                        maxRedirections: 0,
                        dispatcher: this.proxyAdapter.dispatcher,
                        headersTimeout: 10000,
                        bodyTimeout: 10000
                    });
                    if (RetryPolicy.isRetryableStatus(res.statusCode)) {
                        // Discard the body before signalling the retry.
                        await res.body.dump();
                        throw new Error(`Status ${res.statusCode}`);
                    }
                    return res;
                }, (error) => RetryPolicy.isNetworkError(error)
                    // Guarded: a rejection without a string `message` must not
                    // crash the predicate and hide the original error.
                    || (typeof error?.message === 'string' && error.message.startsWith('Status ')));
                const status = result.statusCode;
                const resHeaders = result.headers;
                // First value of a (possibly multi-valued) header, or null.
                const getHeader = (name) => {
                    const val = resHeaders[name.toLowerCase()];
                    if (Array.isArray(val)) {
                        return val[0];
                    }
                    return val || null;
                };
                const etag = getHeader('etag');
                const lastModified = getHeader('last-modified');
                // Handle Redirects. A 3xx without a Location header falls
                // through and is returned like any other final response.
                if (status >= 300 && status < 400 && status !== 304) {
                    const location = getHeader('location');
                    if (location) {
                        let targetUrl;
                        try {
                            targetUrl = new URL(location, currentUrl).toString();
                        }
                        catch (_e) {
                            // Unparseable Location: treat this response as final.
                            // Validators stay null here; the shared reader adds
                            // the size limit handling and `bytesReceived`.
                            return await this.readBody(result, maxBytes, {
                                status,
                                headers: resHeaders,
                                redirectChain,
                                etag: null,
                                lastModified: null,
                                finalUrl: currentUrl,
                                retries: totalRetries
                            });
                        }
                        const redirectError = redirectController.nextHop(targetUrl);
                        if (redirectError) {
                            await result.body.dump();
                            return this.errorResult(redirectError, currentUrl, redirectChain, totalRetries);
                        }
                        redirectChain.push({ url: currentUrl, status, target: targetUrl });
                        await result.body.dump();
                        currentUrl = targetUrl;
                        continue; // Next iteration for redirect target
                    }
                }
                // 5. Max Response Size (Streaming)
                // `return await` keeps unexpected read failures inside this try
                // so they are mapped to an error result below.
                return await this.readBody(result, maxBytes, {
                    status,
                    headers: resHeaders,
                    redirectChain,
                    etag,
                    lastModified,
                    finalUrl: currentUrl,
                    retries: totalRetries
                });
            }
            catch (error) {
                // Map common network errors to specific statuses if needed
                // NOTE(review): ECONNREFUSED is classified as a proxy failure even
                // though the check does not look at whether a proxy is in use - confirm intent.
                const isProxyError = error.message?.toLowerCase().includes('proxy') || error.code === 'ECONNREFUSED';
                const finalStatus = isProxyError ? 'proxy_connection_failed' : 'network_error';
                // NOTE(review): `totalRetries` accumulates across redirect hops.
                return this.errorResult(totalRetries >= RetryPolicy.DEFAULT_CONFIG.maxRetries ? 'failed_after_retries' : finalStatus, currentUrl, redirectChain, totalRetries);
            }
        }
    }
    /**
     * Reads the body of a final response under the size limit and builds the
     * result object.
     *
     * @param {object} result - Response returned by `request`.
     * @param {number} maxBytes - Limit handed to `ResponseLimiter.streamToString`.
     * @param {object} meta - `status`, `headers`, `redirectChain`, `etag`,
     *   `lastModified`, `finalUrl` and `retries` to copy into the result.
     * @returns {Promise<object>} The fetch result. When the limiter throws
     *   `'Oversized response'`, `status` is `'oversized'` with an empty body and
     *   null validators.
     * @throws Re-throws any other error raised while reading the body.
     */
    async readBody(result, maxBytes, meta) {
        const { status, headers, redirectChain, finalUrl, retries } = meta;
        let bytesReceived = 0;
        try {
            // The body of a 304 is not read; it is reported as empty.
            const body = status === 304
                ? ''
                : await ResponseLimiter.streamToString(result.body, maxBytes, (bytes) => { bytesReceived = bytes; });
            return {
                status,
                headers,
                body,
                redirectChain,
                etag: meta.etag,
                lastModified: meta.lastModified,
                finalUrl,
                retries,
                bytesReceived
            };
        }
        catch (e) {
            if (e.message === 'Oversized response') {
                return {
                    status: 'oversized',
                    headers,
                    body: '',
                    redirectChain,
                    etag: null,
                    lastModified: null,
                    finalUrl,
                    retries,
                    bytesReceived
                };
            }
            throw e;
        }
    }
    /**
     * Builds a result for a request that produced no usable response.
     *
     * @param {string} status - Error status label stored in `status`.
     * @param {string} finalUrl - URL being processed when the error occurred.
     * @param {Array<object>} redirectChain - Hops followed so far.
     * @param {number} retries - Retry count so far.
     * @returns {object} Result with empty `headers`/`body` and null validators.
     */
    errorResult(status, finalUrl, redirectChain, retries) {
        return {
            status,
            headers: {},
            body: '',
            redirectChain,
            etag: null,
            lastModified: null,
            finalUrl,
            retries
        };
    }
}
|