@crawlith/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/analysis/analyze.d.ts +70 -0
- package/dist/analysis/analyze.js +436 -0
- package/dist/analysis/content.d.ts +12 -0
- package/dist/analysis/content.js +33 -0
- package/dist/analysis/images.d.ts +6 -0
- package/dist/analysis/images.js +18 -0
- package/dist/analysis/links.d.ts +7 -0
- package/dist/analysis/links.js +30 -0
- package/dist/analysis/scoring.d.ts +9 -0
- package/dist/analysis/scoring.js +42 -0
- package/dist/analysis/seo.d.ts +15 -0
- package/dist/analysis/seo.js +64 -0
- package/dist/analysis/structuredData.d.ts +6 -0
- package/dist/analysis/structuredData.js +51 -0
- package/dist/audit/dns.d.ts +2 -0
- package/dist/audit/dns.js +42 -0
- package/dist/audit/headers.d.ts +2 -0
- package/dist/audit/headers.js +95 -0
- package/dist/audit/index.d.ts +2 -0
- package/dist/audit/index.js +50 -0
- package/dist/audit/scoring.d.ts +14 -0
- package/dist/audit/scoring.js +214 -0
- package/dist/audit/transport.d.ts +6 -0
- package/dist/audit/transport.js +207 -0
- package/dist/audit/types.d.ts +88 -0
- package/dist/audit/types.js +1 -0
- package/dist/core/network/proxyAdapter.d.ts +6 -0
- package/dist/core/network/proxyAdapter.js +19 -0
- package/dist/core/network/rateLimiter.d.ts +6 -0
- package/dist/core/network/rateLimiter.js +31 -0
- package/dist/core/network/redirectController.d.ts +13 -0
- package/dist/core/network/redirectController.js +41 -0
- package/dist/core/network/responseLimiter.d.ts +4 -0
- package/dist/core/network/responseLimiter.js +26 -0
- package/dist/core/network/retryPolicy.d.ts +10 -0
- package/dist/core/network/retryPolicy.js +41 -0
- package/dist/core/scope/domainFilter.d.ts +11 -0
- package/dist/core/scope/domainFilter.js +40 -0
- package/dist/core/scope/scopeManager.d.ts +14 -0
- package/dist/core/scope/scopeManager.js +39 -0
- package/dist/core/scope/subdomainPolicy.d.ts +6 -0
- package/dist/core/scope/subdomainPolicy.js +35 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +84 -0
- package/dist/crawler/crawl.d.ts +22 -0
- package/dist/crawler/crawl.js +336 -0
- package/dist/crawler/extract.d.ts +5 -0
- package/dist/crawler/extract.js +33 -0
- package/dist/crawler/fetcher.d.ts +40 -0
- package/dist/crawler/fetcher.js +161 -0
- package/dist/crawler/metricsRunner.d.ts +1 -0
- package/dist/crawler/metricsRunner.js +108 -0
- package/dist/crawler/normalize.d.ts +7 -0
- package/dist/crawler/normalize.js +88 -0
- package/dist/crawler/parser.d.ts +22 -0
- package/dist/crawler/parser.js +158 -0
- package/dist/crawler/sitemap.d.ts +8 -0
- package/dist/crawler/sitemap.js +70 -0
- package/dist/crawler/trap.d.ts +24 -0
- package/dist/crawler/trap.js +78 -0
- package/dist/db/graphLoader.d.ts +2 -0
- package/dist/db/graphLoader.js +96 -0
- package/dist/db/index.d.ts +4 -0
- package/dist/db/index.js +61 -0
- package/dist/db/repositories/EdgeRepository.d.ts +16 -0
- package/dist/db/repositories/EdgeRepository.js +17 -0
- package/dist/db/repositories/MetricsRepository.d.ts +26 -0
- package/dist/db/repositories/MetricsRepository.js +27 -0
- package/dist/db/repositories/PageRepository.d.ts +47 -0
- package/dist/db/repositories/PageRepository.js +93 -0
- package/dist/db/repositories/SiteRepository.d.ts +15 -0
- package/dist/db/repositories/SiteRepository.js +22 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
- package/dist/db/repositories/SnapshotRepository.js +55 -0
- package/dist/db/schema.d.ts +2 -0
- package/dist/db/schema.js +169 -0
- package/dist/diff/compare.d.ts +26 -0
- package/dist/diff/compare.js +64 -0
- package/dist/graph/cluster.d.ts +6 -0
- package/dist/graph/cluster.js +173 -0
- package/dist/graph/duplicate.d.ts +10 -0
- package/dist/graph/duplicate.js +251 -0
- package/dist/graph/graph.d.ts +103 -0
- package/dist/graph/graph.js +106 -0
- package/dist/graph/metrics.d.ts +29 -0
- package/dist/graph/metrics.js +74 -0
- package/dist/graph/pagerank.d.ts +12 -0
- package/dist/graph/pagerank.js +102 -0
- package/dist/graph/simhash.d.ts +17 -0
- package/dist/graph/simhash.js +56 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.js +30 -0
- package/dist/lock/hashKey.d.ts +1 -0
- package/dist/lock/hashKey.js +44 -0
- package/dist/lock/lockManager.d.ts +7 -0
- package/dist/lock/lockManager.js +112 -0
- package/dist/lock/pidCheck.d.ts +1 -0
- package/dist/lock/pidCheck.js +14 -0
- package/dist/report/html.d.ts +2 -0
- package/dist/report/html.js +223 -0
- package/dist/report/sitegraphExport.d.ts +3 -0
- package/dist/report/sitegraphExport.js +52 -0
- package/dist/report/sitegraph_template.d.ts +1 -0
- package/dist/report/sitegraph_template.js +630 -0
- package/dist/scoring/hits.d.ts +9 -0
- package/dist/scoring/hits.js +111 -0
- package/dist/scoring/orphanSeverity.d.ts +39 -0
- package/dist/scoring/orphanSeverity.js +125 -0
- package/dist/utils/version.d.ts +2 -0
- package/dist/utils/version.js +15 -0
- package/package.json +33 -0
- package/src/analysis/analyze.ts +548 -0
- package/src/analysis/content.ts +62 -0
- package/src/analysis/images.ts +28 -0
- package/src/analysis/links.ts +41 -0
- package/src/analysis/scoring.ts +59 -0
- package/src/analysis/seo.ts +82 -0
- package/src/analysis/structuredData.ts +62 -0
- package/src/audit/dns.ts +49 -0
- package/src/audit/headers.ts +98 -0
- package/src/audit/index.ts +66 -0
- package/src/audit/scoring.ts +232 -0
- package/src/audit/transport.ts +258 -0
- package/src/audit/types.ts +102 -0
- package/src/core/network/proxyAdapter.ts +21 -0
- package/src/core/network/rateLimiter.ts +39 -0
- package/src/core/network/redirectController.ts +47 -0
- package/src/core/network/responseLimiter.ts +34 -0
- package/src/core/network/retryPolicy.ts +57 -0
- package/src/core/scope/domainFilter.ts +45 -0
- package/src/core/scope/scopeManager.ts +52 -0
- package/src/core/scope/subdomainPolicy.ts +39 -0
- package/src/core/security/ipGuard.ts +92 -0
- package/src/crawler/crawl.ts +382 -0
- package/src/crawler/extract.ts +34 -0
- package/src/crawler/fetcher.ts +233 -0
- package/src/crawler/metricsRunner.ts +124 -0
- package/src/crawler/normalize.ts +108 -0
- package/src/crawler/parser.ts +190 -0
- package/src/crawler/sitemap.ts +73 -0
- package/src/crawler/trap.ts +96 -0
- package/src/db/graphLoader.ts +105 -0
- package/src/db/index.ts +70 -0
- package/src/db/repositories/EdgeRepository.ts +29 -0
- package/src/db/repositories/MetricsRepository.ts +49 -0
- package/src/db/repositories/PageRepository.ts +128 -0
- package/src/db/repositories/SiteRepository.ts +32 -0
- package/src/db/repositories/SnapshotRepository.ts +74 -0
- package/src/db/schema.ts +177 -0
- package/src/diff/compare.ts +84 -0
- package/src/graph/cluster.ts +192 -0
- package/src/graph/duplicate.ts +286 -0
- package/src/graph/graph.ts +172 -0
- package/src/graph/metrics.ts +110 -0
- package/src/graph/pagerank.ts +125 -0
- package/src/graph/simhash.ts +61 -0
- package/src/index.ts +30 -0
- package/src/lock/hashKey.ts +51 -0
- package/src/lock/lockManager.ts +124 -0
- package/src/lock/pidCheck.ts +13 -0
- package/src/report/html.ts +227 -0
- package/src/report/sitegraphExport.ts +58 -0
- package/src/report/sitegraph_template.ts +630 -0
- package/src/scoring/hits.ts +131 -0
- package/src/scoring/orphanSeverity.ts +176 -0
- package/src/utils/version.ts +18 -0
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
- package/tests/analysis.unit.test.ts +98 -0
- package/tests/analyze.integration.test.ts +98 -0
- package/tests/audit/dns.test.ts +31 -0
- package/tests/audit/headers.test.ts +45 -0
- package/tests/audit/scoring.test.ts +133 -0
- package/tests/audit/security.test.ts +12 -0
- package/tests/audit/transport.test.ts +112 -0
- package/tests/clustering.test.ts +118 -0
- package/tests/crawler.test.ts +358 -0
- package/tests/db.test.ts +159 -0
- package/tests/diff.test.ts +67 -0
- package/tests/duplicate.test.ts +110 -0
- package/tests/fetcher.test.ts +106 -0
- package/tests/fetcher_safety.test.ts +85 -0
- package/tests/fixtures/analyze-crawl.json +26 -0
- package/tests/hits.test.ts +134 -0
- package/tests/html_report.test.ts +58 -0
- package/tests/lock/lockManager.test.ts +138 -0
- package/tests/metrics.test.ts +196 -0
- package/tests/normalize.test.ts +101 -0
- package/tests/orphanSeverity.test.ts +160 -0
- package/tests/pagerank.test.ts +98 -0
- package/tests/parser.test.ts +117 -0
- package/tests/proxy_safety.test.ts +57 -0
- package/tests/redirect_safety.test.ts +73 -0
- package/tests/safety.test.ts +114 -0
- package/tests/scope.test.ts +66 -0
- package/tests/scoring.test.ts +59 -0
- package/tests/sitemap.test.ts +88 -0
- package/tests/soft404.test.ts +41 -0
- package/tests/trap.test.ts +39 -0
- package/tests/visualization_data.test.ts +46 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import https from 'node:https';
|
|
2
|
+
import http from 'node:http';
|
|
3
|
+
import tls from 'node:tls';
|
|
4
|
+
import { URL } from 'node:url';
|
|
5
|
+
import { IPGuard } from '../core/security/ipGuard.js';
|
|
6
|
+
export async function analyzeTransport(targetUrl, timeout) {
|
|
7
|
+
const maxRedirects = 10;
|
|
8
|
+
let currentUrl = targetUrl;
|
|
9
|
+
let redirectCount = 0;
|
|
10
|
+
const redirects = [];
|
|
11
|
+
const issues = [];
|
|
12
|
+
// Cumulative metrics
|
|
13
|
+
let totalRedirectTime = 0;
|
|
14
|
+
for (let i = 0; i < maxRedirects; i++) {
|
|
15
|
+
const urlObj = new URL(currentUrl);
|
|
16
|
+
const isSafe = await IPGuard.validateHost(urlObj.hostname);
|
|
17
|
+
if (!isSafe) {
|
|
18
|
+
throw new Error(`Blocked: Redirect to internal/private IP prohibited (${currentUrl})`);
|
|
19
|
+
}
|
|
20
|
+
try {
|
|
21
|
+
const result = await executeRequest(currentUrl, timeout);
|
|
22
|
+
if (result.redirectUrl) {
|
|
23
|
+
redirectCount++;
|
|
24
|
+
totalRedirectTime += result.timings.total;
|
|
25
|
+
redirects.push({
|
|
26
|
+
url: currentUrl,
|
|
27
|
+
statusCode: result.response.statusCode || 0,
|
|
28
|
+
location: result.redirectUrl
|
|
29
|
+
});
|
|
30
|
+
currentUrl = result.redirectUrl;
|
|
31
|
+
continue;
|
|
32
|
+
}
|
|
33
|
+
// Final destination reached
|
|
34
|
+
const { response, body, timings, socket } = result;
|
|
35
|
+
// Collect Certificate Info
|
|
36
|
+
let certInfo = null;
|
|
37
|
+
let tlsVersion = null;
|
|
38
|
+
let cipherSuite = null;
|
|
39
|
+
let alpnProtocol = null;
|
|
40
|
+
if (socket instanceof tls.TLSSocket) {
|
|
41
|
+
const cert = socket.getPeerCertificate(true);
|
|
42
|
+
tlsVersion = socket.getProtocol();
|
|
43
|
+
const cipher = socket.getCipher();
|
|
44
|
+
cipherSuite = cipher ? cipher.name : null;
|
|
45
|
+
alpnProtocol = socket.alpnProtocol || null;
|
|
46
|
+
if (cert && Object.keys(cert).length > 0) {
|
|
47
|
+
certInfo = {
|
|
48
|
+
subject: (cert.subject && cert.subject.CN) ? cert.subject.CN : 'Unknown',
|
|
49
|
+
issuer: (cert.issuer && cert.issuer.CN) ? cert.issuer.CN : 'Unknown',
|
|
50
|
+
validFrom: cert.valid_from,
|
|
51
|
+
validTo: cert.valid_to,
|
|
52
|
+
daysUntilExpiry: Math.floor((new Date(cert.valid_to).getTime() - Date.now()) / (1000 * 60 * 60 * 24)),
|
|
53
|
+
isSelfSigned: cert.issuer && cert.subject && cert.issuer.CN === cert.subject.CN,
|
|
54
|
+
isValidChain: socket.authorized,
|
|
55
|
+
fingerprint: cert.fingerprint,
|
|
56
|
+
serialNumber: cert.serialNumber,
|
|
57
|
+
subjectAltName: cert.subjectaltname
|
|
58
|
+
};
|
|
59
|
+
if (!socket.authorized) {
|
|
60
|
+
issues.push({
|
|
61
|
+
id: 'cert-invalid',
|
|
62
|
+
severity: 'severe',
|
|
63
|
+
category: 'tls',
|
|
64
|
+
message: `Certificate validation failed: ${socket.authorizationError}`,
|
|
65
|
+
scorePenalty: 30
|
|
66
|
+
});
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
const httpVersion = response.httpVersion;
|
|
71
|
+
const contentEncoding = response.headers['content-encoding'];
|
|
72
|
+
const compression = [];
|
|
73
|
+
if (contentEncoding) {
|
|
74
|
+
compression.push(contentEncoding);
|
|
75
|
+
}
|
|
76
|
+
const connectionHeader = response.headers['connection'];
|
|
77
|
+
const keepAlive = connectionHeader ? connectionHeader.toLowerCase() !== 'close' : true;
|
|
78
|
+
const serverHeader = response.headers['server'] || null;
|
|
79
|
+
const headerText = `HTTP/${response.httpVersion} ${response.statusCode} ${response.statusMessage}\r\n` +
|
|
80
|
+
Object.entries(response.headers).map(([k, v]) => `${k}: ${v}`).join('\r\n') +
|
|
81
|
+
'\r\n\r\n';
|
|
82
|
+
const headerSize = Buffer.byteLength(headerText);
|
|
83
|
+
const htmlSize = body.length;
|
|
84
|
+
const transport = {
|
|
85
|
+
tlsVersion,
|
|
86
|
+
cipherSuite,
|
|
87
|
+
alpnProtocol: alpnProtocol || (httpVersion === '2.0' ? 'h2' : 'http/1.1'),
|
|
88
|
+
certificate: certInfo,
|
|
89
|
+
httpVersion,
|
|
90
|
+
compression,
|
|
91
|
+
keepAlive,
|
|
92
|
+
transferEncoding: response.headers['transfer-encoding'] || null,
|
|
93
|
+
redirectCount,
|
|
94
|
+
redirects,
|
|
95
|
+
serverHeader,
|
|
96
|
+
headers: response.headers
|
|
97
|
+
};
|
|
98
|
+
const performance = {
|
|
99
|
+
dnsLookupTime: timings.dns,
|
|
100
|
+
tcpConnectTime: timings.tcp,
|
|
101
|
+
tlsHandshakeTime: timings.tls,
|
|
102
|
+
ttfb: timings.ttfb,
|
|
103
|
+
totalTime: timings.total + totalRedirectTime,
|
|
104
|
+
htmlSize,
|
|
105
|
+
headerSize,
|
|
106
|
+
redirectTime: totalRedirectTime
|
|
107
|
+
};
|
|
108
|
+
return { transport, performance, issues };
|
|
109
|
+
}
|
|
110
|
+
catch (error) {
|
|
111
|
+
throw new Error(`Transport analysis failed for ${currentUrl}: ${error.message}`, { cause: error });
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
throw new Error(`Too many redirects (limit: ${maxRedirects})`);
|
|
115
|
+
}
|
|
116
|
+
function executeRequest(urlStr, timeout) {
|
|
117
|
+
return new Promise((resolve, reject) => {
|
|
118
|
+
let url;
|
|
119
|
+
try {
|
|
120
|
+
url = new URL(urlStr);
|
|
121
|
+
}
|
|
122
|
+
catch (_e) {
|
|
123
|
+
return reject(new Error(`Invalid URL: ${urlStr}`));
|
|
124
|
+
}
|
|
125
|
+
const isHttps = url.protocol === 'https:';
|
|
126
|
+
const requestModule = isHttps ? https : http;
|
|
127
|
+
const timings = {
|
|
128
|
+
dns: 0,
|
|
129
|
+
tcp: 0,
|
|
130
|
+
tls: 0,
|
|
131
|
+
ttfb: 0,
|
|
132
|
+
total: 0
|
|
133
|
+
};
|
|
134
|
+
const t0 = performance.now();
|
|
135
|
+
let tDNS = t0;
|
|
136
|
+
let tTCP = t0;
|
|
137
|
+
let tTLS = t0;
|
|
138
|
+
let tReqSent = 0;
|
|
139
|
+
// We use agent: false to force new connection for accurate timing
|
|
140
|
+
const options = {
|
|
141
|
+
method: 'GET',
|
|
142
|
+
timeout,
|
|
143
|
+
rejectUnauthorized: false,
|
|
144
|
+
agent: false,
|
|
145
|
+
headers: {
|
|
146
|
+
'User-Agent': 'Crawlith/Audit',
|
|
147
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
148
|
+
'Accept-Encoding': 'gzip, deflate, br'
|
|
149
|
+
}
|
|
150
|
+
};
|
|
151
|
+
const req = requestModule.request(url, options, (res) => {
|
|
152
|
+
// TTFB: Time from request sent to first byte of headers received
|
|
153
|
+
timings.ttfb = performance.now() - (tReqSent || t0);
|
|
154
|
+
const chunks = [];
|
|
155
|
+
res.on('data', (chunk) => chunks.push(chunk));
|
|
156
|
+
res.on('end', () => {
|
|
157
|
+
timings.total = performance.now() - t0;
|
|
158
|
+
const body = Buffer.concat(chunks);
|
|
159
|
+
let redirectUrl = null;
|
|
160
|
+
if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
|
|
161
|
+
try {
|
|
162
|
+
redirectUrl = new URL(res.headers.location, urlStr).toString();
|
|
163
|
+
}
|
|
164
|
+
catch (_e) {
|
|
165
|
+
// Ignore invalid redirect
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
resolve({
|
|
169
|
+
url: urlStr,
|
|
170
|
+
response: res,
|
|
171
|
+
body,
|
|
172
|
+
timings,
|
|
173
|
+
socket: res.socket,
|
|
174
|
+
redirectUrl
|
|
175
|
+
});
|
|
176
|
+
});
|
|
177
|
+
});
|
|
178
|
+
req.on('socket', (socket) => {
|
|
179
|
+
socket.on('lookup', () => {
|
|
180
|
+
tDNS = performance.now();
|
|
181
|
+
timings.dns = tDNS - t0;
|
|
182
|
+
});
|
|
183
|
+
socket.on('connect', () => {
|
|
184
|
+
tTCP = performance.now();
|
|
185
|
+
if (timings.dns === 0 && tDNS === t0) {
|
|
186
|
+
// No lookup event
|
|
187
|
+
timings.dns = 0;
|
|
188
|
+
tDNS = t0;
|
|
189
|
+
}
|
|
190
|
+
timings.tcp = tTCP - tDNS;
|
|
191
|
+
});
|
|
192
|
+
socket.on('secureConnect', () => {
|
|
193
|
+
tTLS = performance.now();
|
|
194
|
+
timings.tls = tTLS - tTCP;
|
|
195
|
+
});
|
|
196
|
+
});
|
|
197
|
+
req.on('finish', () => {
|
|
198
|
+
tReqSent = performance.now();
|
|
199
|
+
});
|
|
200
|
+
req.on('error', (err) => reject(err));
|
|
201
|
+
req.on('timeout', () => {
|
|
202
|
+
req.destroy();
|
|
203
|
+
reject(new Error('Request timed out'));
|
|
204
|
+
});
|
|
205
|
+
req.end();
|
|
206
|
+
});
|
|
207
|
+
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/** Combined outcome of an audit: per-area diagnostics plus an overall score. */
export interface AuditResult {
    /** URL this result belongs to. */
    url: string;
    /** TLS/HTTP transport findings. */
    transport: TransportDiagnostics;
    /** Security-header findings. */
    securityHeaders: SecurityHeadersResult;
    /** DNS findings. */
    dns: DnsDiagnostics;
    /** Timing and size measurements. */
    performance: PerformanceMetrics;
    /** Overall numeric score. NOTE(review): range is not visible here — confirm with the scoring module. */
    score: number;
    /** Letter grade for the overall result. */
    grade:
        | 'A'
        | 'B'
        | 'C'
        | 'D'
        | 'F';
    /** Every issue raised during the audit. */
    issues: AuditIssue[];
}
|
|
11
|
+
/** Transport-level facts about the final response of a request chain. */
export interface TransportDiagnostics {
    /** Negotiated TLS protocol version, or `null` when the connection was not TLS. */
    tlsVersion: string | null;
    /** Name of the negotiated cipher, or `null` when unavailable. */
    cipherSuite: string | null;
    /** Negotiated ALPN protocol, or a value derived from the HTTP version when none was negotiated. */
    alpnProtocol: string | null;
    /** Peer certificate summary, or `null` when no certificate was available. */
    certificate: CertificateInfo | null;
    /** HTTP version reported by the response (for example `"1.1"`). */
    httpVersion: string;
    /** `Content-Encoding` values seen on the response; empty when the header is absent. */
    compression: string[];
    /** `false` only when the response carries `Connection: close`. */
    keepAlive: boolean;
    /** Value of the `Transfer-Encoding` header, or `null`. */
    transferEncoding: string | null;
    /** Number of redirects followed before the final response. */
    redirectCount: number;
    /** One entry per followed redirect, in order. */
    redirects: RedirectInfo[];
    /** Value of the `Server` header, or `null`. */
    serverHeader: string | null;
    /** Parsed response headers of the final response. */
    headers: Record<string, string | string[] | undefined>;
}
|
|
25
|
+
/** Summary of the peer's TLS certificate. */
export interface CertificateInfo {
    /** Issuer common name, or `'Unknown'` when the certificate has none. */
    issuer: string;
    /** Subject common name, or `'Unknown'` when the certificate has none. */
    subject: string;
    /** Start of the validity period, as reported by the certificate. */
    validFrom: string;
    /** End of the validity period, as reported by the certificate. */
    validTo: string;
    /** Whole days from now until `validTo`; negative once expired. */
    daysUntilExpiry: number;
    /** `true` when issuer and subject common names are identical. */
    isSelfSigned: boolean;
    /** Whether the TLS socket reported the peer as authorized. */
    isValidChain: boolean;
    /** Certificate fingerprint. */
    fingerprint: string;
    /** Certificate serial number. */
    serialNumber: string;
    /** Raw subject alternative name string, when present. */
    subjectAltName?: string;
}
|
|
37
|
+
/** One hop of a followed redirect chain. */
export interface RedirectInfo {
    /** URL that answered with the redirect. */
    url: string;
    /** HTTP status code of that response (`0` when none was reported). */
    statusCode: number;
    /** Absolute URL the redirect points to, or `null`. */
    location: string | null;
}
|
|
42
|
+
/**
 * Findings for the security-related response headers.
 *
 * NOTE(review): the code that fills this structure is not visible here; the
 * mapping of each field to an HTTP header is inferred from the field names.
 */
export interface SecurityHeadersResult {
    /** Presumably `Strict-Transport-Security`. */
    strictTransportSecurity: HeaderStatus;
    /** Presumably `Content-Security-Policy`. */
    contentSecurityPolicy: HeaderStatus;
    /** Presumably `X-Frame-Options`. */
    xFrameOptions: HeaderStatus;
    /** Presumably `X-Content-Type-Options`. */
    xContentTypeOptions: HeaderStatus;
    /** Presumably `Referrer-Policy`. */
    referrerPolicy: HeaderStatus;
    /** Presumably `Permissions-Policy`. */
    permissionsPolicy: HeaderStatus;
    /** Free-form string details keyed by name. */
    details: Record<string, string>;
    /** Numeric score for this section. */
    score: number;
}
|
|
52
|
+
/** Evaluation of a single response header. */
export interface HeaderStatus {
    /** Whether the header was present. */
    present: boolean;
    /** Header value, or `null`. */
    value: string | null;
    /** Whether the value was judged valid. NOTE(review): the criteria are defined by the producer, not visible here. */
    valid: boolean;
    /** Optional list of problems found with the header. */
    issues?: string[];
}
|
|
58
|
+
/**
 * DNS findings for the audited host.
 *
 * NOTE(review): the resolver code is not visible here; the record-type
 * meanings below are inferred from the field names.
 */
export interface DnsDiagnostics {
    /** Presumably A-record results. */
    a: string[];
    /** Presumably AAAA-record results. */
    aaaa: string[];
    /** Presumably CNAME results. */
    cname: string[];
    /** Presumably reverse-lookup results. */
    reverse: string[];
    /** Number of IP addresses counted. */
    ipCount: number;
    /** Whether IPv6 support was detected. */
    ipv6Support: boolean;
    /** Time spent resolving. NOTE(review): unit not visible here — confirm. */
    resolutionTime: number;
}
|
|
67
|
+
/** Timing (milliseconds) and size (bytes) measurements for the final request. */
export interface PerformanceMetrics {
    /** Time until the DNS lookup finished. */
    dnsLookupTime: number;
    /** Time from the DNS lookup to the TCP connection being established. */
    tcpConnectTime: number;
    /** Time from the TCP connection to the TLS handshake finishing (`0` without TLS). */
    tlsHandshakeTime: number;
    /** Time from the request being sent to the response headers arriving. */
    ttfb: number;
    /** Total time of the final request plus the time spent on redirects. */
    totalTime: number;
    /** Size of the response body as received. */
    htmlSize: number;
    /** Size of the response header block, rebuilt from the parsed headers. */
    headerSize: number;
    /** Time spent on requests that answered with a redirect. */
    redirectTime?: number;
}
|
|
77
|
+
/** A single problem found during an audit. */
export interface AuditIssue {
    /** Stable identifier of the issue type (for example `'cert-invalid'`). */
    id: string;
    /** How serious the issue is. */
    severity:
        | 'critical'
        | 'severe'
        | 'moderate'
        | 'minor'
        | 'info';
    /** Audit area the issue belongs to. */
    category:
        | 'tls'
        | 'http'
        | 'headers'
        | 'dns'
        | 'performance';
    /** Human-readable description. */
    message: string;
    /** Penalty value associated with the issue. */
    scorePenalty: number;
}
|
|
84
|
+
/** Optional settings for an audit run. */
export interface AuditOptions {
    /** Request timeout. NOTE(review): presumably milliseconds — confirm with the caller. */
    timeout?: number;
    /** Verbose-output flag. */
    verbose?: boolean;
    /** Debug-output flag. */
    debug?: boolean;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { ProxyAgent } from 'undici';
|
|
2
|
+
export class ProxyAdapter {
|
|
3
|
+
agent;
|
|
4
|
+
constructor(proxyUrl) {
|
|
5
|
+
if (proxyUrl) {
|
|
6
|
+
try {
|
|
7
|
+
// Validate URL
|
|
8
|
+
new URL(proxyUrl);
|
|
9
|
+
this.agent = new ProxyAgent(proxyUrl);
|
|
10
|
+
}
|
|
11
|
+
catch {
|
|
12
|
+
throw new Error(`Invalid proxy URL: ${proxyUrl}`);
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
get dispatcher() {
|
|
17
|
+
return this.agent;
|
|
18
|
+
}
|
|
19
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
export class RateLimiter {
|
|
2
|
+
buckets = new Map();
|
|
3
|
+
rate; // tokens per second
|
|
4
|
+
constructor(rate = 2) {
|
|
5
|
+
this.rate = rate;
|
|
6
|
+
}
|
|
7
|
+
async waitForToken(host, crawlDelay = 0) {
|
|
8
|
+
const effectiveRate = crawlDelay > 0 ? Math.min(this.rate, 1 / crawlDelay) : this.rate;
|
|
9
|
+
const interval = 1000 / effectiveRate;
|
|
10
|
+
if (!this.buckets.has(host)) {
|
|
11
|
+
this.buckets.set(host, { tokens: this.rate - 1, lastRefill: Date.now() });
|
|
12
|
+
return;
|
|
13
|
+
}
|
|
14
|
+
const bucket = this.buckets.get(host);
|
|
15
|
+
while (true) {
|
|
16
|
+
const now = Date.now();
|
|
17
|
+
const elapsed = now - bucket.lastRefill;
|
|
18
|
+
if (elapsed > 0) {
|
|
19
|
+
const newTokens = elapsed / interval;
|
|
20
|
+
bucket.tokens = Math.min(this.rate, bucket.tokens + newTokens);
|
|
21
|
+
bucket.lastRefill = now;
|
|
22
|
+
}
|
|
23
|
+
if (bucket.tokens >= 1) {
|
|
24
|
+
bucket.tokens -= 1;
|
|
25
|
+
return;
|
|
26
|
+
}
|
|
27
|
+
const waitTime = Math.max(0, interval - (Date.now() - bucket.lastRefill));
|
|
28
|
+
await new Promise(resolve => setTimeout(resolve, waitTime));
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Tracks the hops of one redirect chain, enforcing a hop limit and detecting loops.
 */
export declare class RedirectController {
    private maxHops;
    private currentHops;
    private history;
    /**
     * @param maxHops Maximum number of hops that may be recorded.
     * @param seedUrl Optional starting URL; redirecting back to it counts as a loop.
     */
    constructor(maxHops?: number, seedUrl?: string);
    /**
     * Records a hop after checking it against the loop history and the hop limit.
     *
     * @returns `null` when the hop is allowed, otherwise the reason it was blocked.
     */
    nextHop(url: string): 'redirect_limit_exceeded' | 'redirect_loop' | null;
    /** Number of hops recorded so far. */
    get hops(): number;
    private normalize;
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
 * Tracks the hops of one redirect chain, enforcing a hop limit and detecting loops.
 */
export class RedirectController {
    maxHops;
    currentHops = 0;
    history = new Set();
    /**
     * @param maxHops Maximum number of hops that may be recorded.
     * @param seedUrl Optional starting URL; redirecting back to it counts as a loop.
     */
    constructor(maxHops = 5, seedUrl) {
        this.maxHops = maxHops;
        if (seedUrl) {
            const seedKey = this.normalize(seedUrl);
            this.history.add(seedKey);
        }
    }
    /**
     * Records a hop after checking it against the loop history and the hop limit.
     * The loop check runs first, so a repeated URL is reported as a loop even
     * when the limit has also been reached.
     *
     * @returns `null` when the hop is allowed, otherwise the reason it was blocked.
     */
    nextHop(url) {
        const key = this.normalize(url);
        const alreadyVisited = this.history.has(key);
        if (alreadyVisited) {
            return 'redirect_loop';
        }
        const limitReached = this.currentHops >= this.maxHops;
        if (limitReached) {
            return 'redirect_limit_exceeded';
        }
        this.history.add(key);
        this.currentHops += 1;
        return null;
    }
    /** Number of hops recorded so far. */
    get hops() {
        return this.currentHops;
    }
    /**
     * Canonical form used for loop detection: the parsed URL without its
     * fragment. Strings that do not parse are compared as-is.
     */
    normalize(url) {
        let parsed;
        try {
            parsed = new URL(url);
        }
        catch {
            return url;
        }
        parsed.hash = '';
        return parsed.toString();
    }
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
export class ResponseLimiter {
|
|
2
|
+
static async streamToString(stream, maxBytes, onOversized) {
|
|
3
|
+
return new Promise((resolve, reject) => {
|
|
4
|
+
let accumulated = 0;
|
|
5
|
+
const chunks = [];
|
|
6
|
+
stream.on('data', (chunk) => {
|
|
7
|
+
const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
|
|
8
|
+
accumulated += buffer.length;
|
|
9
|
+
if (accumulated > maxBytes) {
|
|
10
|
+
stream.destroy();
|
|
11
|
+
if (onOversized)
|
|
12
|
+
onOversized(accumulated);
|
|
13
|
+
reject(new Error('Oversized response'));
|
|
14
|
+
return;
|
|
15
|
+
}
|
|
16
|
+
chunks.push(buffer);
|
|
17
|
+
});
|
|
18
|
+
stream.on('end', () => {
|
|
19
|
+
resolve(Buffer.concat(chunks).toString('utf-8'));
|
|
20
|
+
});
|
|
21
|
+
stream.on('error', (err) => {
|
|
22
|
+
reject(err);
|
|
23
|
+
});
|
|
24
|
+
});
|
|
25
|
+
}
|
|
26
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/** Settings for `RetryPolicy.execute`. */
export interface RetryConfig {
    /** Number of retries allowed after the first attempt. */
    maxRetries: number;
    /** Delay before the first retry in milliseconds; doubled for each later retry. */
    baseDelay: number;
}
/** Retry helper with exponential backoff. */
export declare class RetryPolicy {
    /** Configuration used when `execute` is called without one. */
    static DEFAULT_CONFIG: RetryConfig;
    /**
     * Runs `operation`, retrying while `isRetryable` accepts the thrown error
     * and retries remain.
     *
     * @param operation   Receives the zero-based attempt number.
     * @param isRetryable Decides whether a thrown error is worth another attempt.
     * @param config      Retry budget and base delay.
     */
    static execute<T>(operation: (attempt: number) => Promise<T>, isRetryable: (error: any) => boolean, config?: RetryConfig): Promise<T>;
    /** Whether an HTTP status code should be retried. */
    static isRetryableStatus(status: number): boolean;
    /** Whether an error carries a retryable network error code. */
    static isNetworkError(error: any): boolean;
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
export class RetryPolicy {
|
|
2
|
+
static DEFAULT_CONFIG = {
|
|
3
|
+
maxRetries: 3,
|
|
4
|
+
baseDelay: 500
|
|
5
|
+
};
|
|
6
|
+
static async execute(operation, isRetryable, config = RetryPolicy.DEFAULT_CONFIG) {
|
|
7
|
+
let lastError;
|
|
8
|
+
for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
|
|
9
|
+
try {
|
|
10
|
+
return await operation(attempt);
|
|
11
|
+
}
|
|
12
|
+
catch (error) {
|
|
13
|
+
lastError = error;
|
|
14
|
+
if (attempt === config.maxRetries || !isRetryable(error)) {
|
|
15
|
+
throw error;
|
|
16
|
+
}
|
|
17
|
+
const delay = config.baseDelay * Math.pow(2, attempt);
|
|
18
|
+
const jitter = delay * 0.1 * (Math.random() * 2 - 1);
|
|
19
|
+
const finalDelay = Math.max(0, delay + jitter);
|
|
20
|
+
await new Promise(resolve => setTimeout(resolve, finalDelay));
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
throw lastError;
|
|
24
|
+
}
|
|
25
|
+
static isRetryableStatus(status) {
|
|
26
|
+
return status === 429 || (status >= 500 && status <= 599);
|
|
27
|
+
}
|
|
28
|
+
static isNetworkError(error) {
|
|
29
|
+
const code = error?.code || error?.cause?.code;
|
|
30
|
+
return [
|
|
31
|
+
'ETIMEDOUT',
|
|
32
|
+
'ECONNRESET',
|
|
33
|
+
'EADDRINUSE',
|
|
34
|
+
'ECONNREFUSED',
|
|
35
|
+
'EPIPE',
|
|
36
|
+
'ENOTFOUND',
|
|
37
|
+
'ENETUNREACH',
|
|
38
|
+
'EAI_AGAIN'
|
|
39
|
+
].includes(code);
|
|
40
|
+
}
|
|
41
|
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/** Allow/deny filter over hostnames. */
export declare class DomainFilter {
    private allowed;
    private denied;
    /**
     * @param allowed Hostnames to allow; an empty list allows everything not denied.
     * @param denied  Hostnames to reject; takes precedence over `allowed`.
     */
    constructor(allowed?: string[], denied?: string[]);
    /**
     * Canonicalises a hostname (lowercase, no trailing dot).
     * Expects a bare hostname, not a URL.
     */
    private normalize;
    /** Whether `hostname` passes the deny list and, when non-empty, the allow list. */
    isAllowed(hostname: string): boolean;
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/** Allow/deny filter over hostnames. */
export class DomainFilter {
    allowed;
    denied;
    /**
     * @param allowed Hostnames to allow; an empty list allows everything not denied.
     * @param denied  Hostnames to reject; takes precedence over `allowed`.
     */
    constructor(allowed = [], denied = []) {
        const toHostSet = (hosts) => new Set(hosts.map((host) => this.normalize(host)));
        this.allowed = toHostSet(allowed);
        this.denied = toHostSet(denied);
    }
    /**
     * Canonicalises a hostname: lowercase, trimmed, one trailing dot removed,
     * then passed through the URL parser so its host normalisation (such as
     * punycode) applies. Input the parser rejects is returned as cleaned text.
     * Expects a bare hostname, not a URL.
     */
    normalize(hostname) {
        const cleaned = hostname.toLowerCase().trim();
        const bare = cleaned.endsWith('.') ? cleaned.slice(0, -1) : cleaned;
        try {
            return new URL(`http://${bare}`).hostname;
        }
        catch {
            return bare;
        }
    }
    /**
     * @returns `false` when the host is denied, or when an allow list exists
     *          and the host is not on it; `true` otherwise.
     */
    isAllowed(hostname) {
        const host = this.normalize(hostname);
        if (this.denied.has(host)) {
            return false;
        }
        const allowListActive = this.allowed.size > 0;
        return !allowListActive || this.allowed.has(host);
    }
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/** Settings that define which URLs are in scope. */
export interface ScopeOptions {
    /** Hostnames to allow explicitly. */
    allowedDomains?: string[];
    /** Hostnames to reject. */
    deniedDomains?: string[];
    /** Whether subdomains of the root host are in scope. */
    includeSubdomains?: boolean;
    /** URL whose host defines the root of the scope. */
    rootUrl: string;
}
/** Outcome of a scope check: allowed, or the reason it was blocked. */
export type EligibilityResult =
    | 'allowed'
    | 'blocked_by_domain_filter'
    | 'blocked_subdomain';
/** Combines the domain filter and the subdomain policy into one scope check. */
export declare class ScopeManager {
    private domainFilter;
    private subdomainPolicy;
    private explicitAllowed;
    constructor(options: ScopeOptions);
    /** Classifies `url` as allowed or blocked. */
    isUrlEligible(url: string): EligibilityResult;
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { DomainFilter } from './domainFilter.js';
|
|
2
|
+
import { SubdomainPolicy } from './subdomainPolicy.js';
|
|
3
|
+
export class ScopeManager {
|
|
4
|
+
domainFilter;
|
|
5
|
+
subdomainPolicy;
|
|
6
|
+
explicitAllowed;
|
|
7
|
+
constructor(options) {
|
|
8
|
+
this.domainFilter = new DomainFilter(options.allowedDomains, options.deniedDomains);
|
|
9
|
+
this.subdomainPolicy = new SubdomainPolicy(options.rootUrl, options.includeSubdomains);
|
|
10
|
+
this.explicitAllowed = new Set((options.allowedDomains || []).map(d => {
|
|
11
|
+
let h = d.toLowerCase().trim();
|
|
12
|
+
if (h.endsWith('.'))
|
|
13
|
+
h = h.slice(0, -1);
|
|
14
|
+
return h;
|
|
15
|
+
}));
|
|
16
|
+
}
|
|
17
|
+
isUrlEligible(url) {
|
|
18
|
+
let hostname;
|
|
19
|
+
try {
|
|
20
|
+
hostname = new URL(url).hostname.toLowerCase();
|
|
21
|
+
if (hostname.endsWith('.'))
|
|
22
|
+
hostname = hostname.slice(0, -1);
|
|
23
|
+
}
|
|
24
|
+
catch {
|
|
25
|
+
return 'blocked_by_domain_filter'; // Invalid URL is effectively blocked
|
|
26
|
+
}
|
|
27
|
+
if (!this.domainFilter.isAllowed(hostname)) {
|
|
28
|
+
return 'blocked_by_domain_filter';
|
|
29
|
+
}
|
|
30
|
+
// If explicit whitelist is used, and this domain is in it, allow it
|
|
31
|
+
if (this.explicitAllowed.has(hostname)) {
|
|
32
|
+
return 'allowed';
|
|
33
|
+
}
|
|
34
|
+
if (!this.subdomainPolicy.isAllowed(hostname)) {
|
|
35
|
+
return 'blocked_subdomain';
|
|
36
|
+
}
|
|
37
|
+
return 'allowed';
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
 * Decides whether a hostname belongs to the crawl's root host, optionally
 * including its subdomains.
 */
export class SubdomainPolicy {
    rootHost;
    includeSubdomains;
    /**
     * @param rootUrl           URL whose hostname becomes the root host. When
     *                          it cannot be parsed the root host is empty and
     *                          no hostname is allowed.
     * @param includeSubdomains Whether subdomains of the root host are allowed.
     */
    constructor(rootUrl, includeSubdomains = false) {
        try {
            this.rootHost = new URL(rootUrl).hostname.toLowerCase();
            if (this.rootHost.endsWith('.')) {
                this.rootHost = this.rootHost.slice(0, -1);
            }
        }
        catch {
            this.rootHost = '';
        }
        this.includeSubdomains = includeSubdomains;
    }
    /**
     * @param hostname Hostname to test; case and one trailing dot are ignored.
     * @returns `true` for the root host itself and, when subdomains are
     *          included, for any hostname ending in `.<rootHost>`.
     */
    isAllowed(hostname) {
        // Without a root host there is nothing to match against. Previously an
        // empty hostname (or one ending in "..") matched the empty root.
        if (!this.rootHost) {
            return false;
        }
        let target = hostname.toLowerCase().trim();
        if (target.endsWith('.')) {
            target = target.slice(0, -1);
        }
        // Exact match is always allowed.
        if (target === this.rootHost) {
            return true;
        }
        if (!this.includeSubdomains) {
            return false;
        }
        // Matching on ".<rootHost>" keeps look-alike hosts such as
        // "notexample.com" out when the root is "example.com".
        return target.endsWith(`.${this.rootHost}`);
    }
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/** Static checks that tell internal/private addresses apart from other ones. */
export declare class IPGuard {
    /**
     * Reports whether `ip` is an internal/private address.
     */
    static isInternal(ip: string): boolean;
    /**
     * Resolves `host` and validates every resulting IP address.
     * Callers in this package treat a `false` result as "do not contact this host".
     */
    static validateHost(host: string): Promise<boolean>;
    private static expandIPv6;
}
|