@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1,207 @@
1
+ import https from 'node:https';
2
+ import http from 'node:http';
3
+ import tls from 'node:tls';
4
+ import { URL } from 'node:url';
5
+ import { IPGuard } from '../core/security/ipGuard.js';
6
/**
 * Audits the transport layer of `targetUrl`, following HTTP redirects manually.
 *
 * Before each request the hop's hostname is checked with `IPGuard.validateHost`;
 * a falsy result aborts the audit. Redirect hops are recorded and their total
 * request time is accumulated. For the final (non-redirect) response the TLS
 * session, certificate, selected response headers and timing data are collected.
 *
 * @param targetUrl Absolute URL to audit.
 * @param timeout Forwarded unchanged to `executeRequest`.
 * @returns `{ transport, performance, issues }` for the final response.
 * @throws Error when a hop is rejected by IPGuard, when a request fails (the
 *   original error is attached as `cause`), or when no final response is
 *   reached within `maxRedirects` requests. `new URL` throws for an
 *   unparsable hop URL.
 */
export async function analyzeTransport(targetUrl, timeout) {
    const maxRedirects = 10;
    const redirects = [];
    const issues = [];
    let currentUrl = targetUrl;
    let redirectCount = 0;
    // Sum of the total request time of every redirect hop
    let totalRedirectTime = 0;
    // Each iteration issues exactly one request, so at most `maxRedirects`
    // requests are made; a redirect answer on the last one falls through to
    // the "Too many redirects" error below.
    for (let i = 0; i < maxRedirects; i++) {
        const urlObj = new URL(currentUrl);
        const isSafe = await IPGuard.validateHost(urlObj.hostname);
        if (!isSafe) {
            throw new Error(`Blocked: Redirect to internal/private IP prohibited (${currentUrl})`);
        }
        try {
            const result = await executeRequest(currentUrl, timeout);
            if (result.redirectUrl) {
                redirectCount++;
                totalRedirectTime += result.timings.total;
                redirects.push({
                    url: currentUrl,
                    statusCode: result.response.statusCode || 0,
                    location: result.redirectUrl
                });
                currentUrl = result.redirectUrl;
                continue;
            }
            // Final destination reached
            const { response, body, timings, socket } = result;
            const { certInfo, tlsVersion, cipherSuite, alpnProtocol } = collectTlsDetails(socket, issues);
            const httpVersion = response.httpVersion;
            const contentEncoding = response.headers['content-encoding'];
            const compression = contentEncoding ? [contentEncoding] : [];
            const connectionHeader = response.headers['connection'];
            // A missing Connection header is treated as keep-alive
            const keepAlive = connectionHeader ? connectionHeader.toLowerCase() !== 'close' : true;
            const serverHeader = response.headers['server'] || null;
            // The header block is re-serialized from the parsed headers, so
            // headerSize is an approximation of the bytes on the wire.
            const headerText = `HTTP/${response.httpVersion} ${response.statusCode} ${response.statusMessage}\r\n` +
                Object.entries(response.headers).map(([k, v]) => `${k}: ${v}`).join('\r\n') +
                '\r\n\r\n';
            const headerSize = Buffer.byteLength(headerText);
            // Size of the body buffer as received (not decoded)
            const htmlSize = body.length;
            const transport = {
                tlsVersion,
                cipherSuite,
                alpnProtocol: alpnProtocol || (httpVersion === '2.0' ? 'h2' : 'http/1.1'),
                certificate: certInfo,
                httpVersion,
                compression,
                keepAlive,
                transferEncoding: response.headers['transfer-encoding'] || null,
                redirectCount,
                redirects,
                serverHeader,
                headers: response.headers
            };
            // Named to avoid shadowing the global `performance` object
            const performanceMetrics = {
                dnsLookupTime: timings.dns,
                tcpConnectTime: timings.tcp,
                tlsHandshakeTime: timings.tls,
                ttfb: timings.ttfb,
                totalTime: timings.total + totalRedirectTime,
                htmlSize,
                headerSize,
                redirectTime: totalRedirectTime
            };
            return { transport, performance: performanceMetrics, issues };
        }
        catch (error) {
            // Tolerate non-Error throwables when building the message
            const reason = error instanceof Error ? error.message : String(error);
            throw new Error(`Transport analysis failed for ${currentUrl}: ${reason}`, { cause: error });
        }
    }
    throw new Error(`Too many redirects (limit: ${maxRedirects})`);
}
/**
 * Reads protocol, cipher, ALPN and peer-certificate details from `socket`.
 * Returns all-null fields when the socket is not a TLS socket. Appends a
 * `cert-invalid` issue to `issues` when a certificate is present but the
 * socket reports it as not authorized.
 */
function collectTlsDetails(socket, issues) {
    let certInfo = null;
    let tlsVersion = null;
    let cipherSuite = null;
    let alpnProtocol = null;
    if (socket instanceof tls.TLSSocket) {
        const cert = socket.getPeerCertificate(true);
        tlsVersion = socket.getProtocol();
        const cipher = socket.getCipher();
        cipherSuite = cipher ? cipher.name : null;
        alpnProtocol = socket.alpnProtocol || null;
        if (cert && Object.keys(cert).length > 0) {
            certInfo = {
                subject: (cert.subject && cert.subject.CN) ? cert.subject.CN : 'Unknown',
                issuer: (cert.issuer && cert.issuer.CN) ? cert.issuer.CN : 'Unknown',
                validFrom: cert.valid_from,
                validTo: cert.valid_to,
                daysUntilExpiry: Math.floor((new Date(cert.valid_to).getTime() - Date.now()) / (1000 * 60 * 60 * 24)),
                // Coerced so the field is always a boolean, even if issuer/subject are absent
                isSelfSigned: Boolean(cert.issuer && cert.subject && cert.issuer.CN === cert.subject.CN),
                isValidChain: socket.authorized,
                fingerprint: cert.fingerprint,
                serialNumber: cert.serialNumber,
                subjectAltName: cert.subjectaltname
            };
            if (!socket.authorized) {
                issues.push({
                    id: 'cert-invalid',
                    severity: 'severe',
                    category: 'tls',
                    message: `Certificate validation failed: ${socket.authorizationError}`,
                    scorePenalty: 30
                });
            }
        }
    }
    return { certInfo, tlsVersion, cipherSuite, alpnProtocol };
}
116
/**
 * Performs one GET request on a fresh connection (redirects are NOT followed)
 * and measures the connection phases with `performance.now()`.
 *
 * @param urlStr Absolute http(s) URL to request.
 * @param timeout Passed as the `timeout` option of `http(s).request`.
 * @returns Promise resolving to `{ url, response, body, timings, socket, redirectUrl }`.
 *   `redirectUrl` is the absolute `Location` target for 3xx responses that
 *   carry a parsable `Location` header, otherwise null.
 *   The promise rejects on an invalid URL, a request/response error, a
 *   timeout, or a connection that closes before the response is complete.
 */
function executeRequest(urlStr, timeout) {
    return new Promise((resolve, reject) => {
        let url;
        try {
            url = new URL(urlStr);
        }
        catch (_e) {
            return reject(new Error(`Invalid URL: ${urlStr}`));
        }
        const requestModule = url.protocol === 'https:' ? https : http;
        const timings = {
            dns: 0,
            tcp: 0,
            tls: 0,
            ttfb: 0,
            total: 0
        };
        const t0 = performance.now();
        // Both default to t0 so a missing 'lookup' event yields dns = 0
        // and tcp measured from the start of the request.
        let tDNS = t0;
        let tTCP = t0;
        let tReqSent = 0;
        // We use agent: false to force new connection for accurate timing
        const options = {
            method: 'GET',
            timeout,
            // Certificate errors must not abort the request: the caller
            // inspects socket.authorized and reports them as audit issues.
            rejectUnauthorized: false,
            agent: false,
            headers: {
                'User-Agent': 'Crawlith/Audit',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br'
            }
        };
        const req = requestModule.request(url, options, (res) => {
            // TTFB: Time from request sent to first byte of headers received
            timings.ttfb = performance.now() - (tReqSent || t0);
            const chunks = [];
            res.on('data', (chunk) => chunks.push(chunk));
            // Without these two handlers a response that fails mid-body would
            // leave the promise pending forever (req 'error' no longer fires
            // once the response has started).
            res.on('error', (err) => reject(err));
            res.on('close', () => {
                if (!res.complete) {
                    reject(new Error('Connection closed before the response was fully received'));
                }
            });
            res.on('end', () => {
                timings.total = performance.now() - t0;
                const body = Buffer.concat(chunks);
                let redirectUrl = null;
                if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
                    try {
                        // Resolve relative Location values against the requested URL
                        redirectUrl = new URL(res.headers.location, urlStr).toString();
                    }
                    catch (_e) {
                        // Ignore invalid redirect: the response is then treated as final
                    }
                }
                resolve({
                    url: urlStr,
                    response: res,
                    body,
                    timings,
                    socket: res.socket,
                    redirectUrl
                });
            });
        });
        req.on('socket', (socket) => {
            socket.on('lookup', () => {
                tDNS = performance.now();
                timings.dns = tDNS - t0;
            });
            socket.on('connect', () => {
                tTCP = performance.now();
                timings.tcp = tTCP - tDNS;
            });
            socket.on('secureConnect', () => {
                timings.tls = performance.now() - tTCP;
            });
        });
        req.on('finish', () => {
            tReqSent = performance.now();
        });
        req.on('error', (err) => reject(err));
        req.on('timeout', () => {
            // 'timeout' only signals the condition; the request must be destroyed explicitly
            req.destroy();
            reject(new Error('Request timed out'));
        });
        req.end();
    });
}
@@ -0,0 +1,88 @@
1
+ export interface AuditResult { // Combined outcome of auditing a single URL.
2
+ url: string; // URL the audit refers to.
3
+ transport: TransportDiagnostics; // TLS/HTTP connection details of the final response.
4
+ securityHeaders: SecurityHeadersResult; // Findings for the individual security headers.
5
+ dns: DnsDiagnostics; // DNS records and resolution data.
6
+ performance: PerformanceMetrics; // Timing and size measurements.
7
+ score: number; // Overall score; scale is defined by the scoring module (not visible here) — TODO confirm.
8
+ grade: 'A' | 'B' | 'C' | 'D' | 'F'; // Letter grade; presumably derived from score — confirm against scoring.
9
+ issues: AuditIssue[]; // Individual findings collected during the audit.
10
+ }
11
+ export interface TransportDiagnostics { // Connection-level facts about the final (non-redirect) response.
12
+ tlsVersion: string | null; // Negotiated TLS protocol; null when the connection is not TLS.
13
+ cipherSuite: string | null; // Name of the negotiated cipher; null when not TLS.
14
+ alpnProtocol: string | null; // ALPN protocol; the transport audit falls back to 'h2'/'http/1.1' based on the HTTP version.
15
+ certificate: CertificateInfo | null; // Peer certificate summary; null when no certificate was available.
16
+ httpVersion: string; // HTTP version string reported by the response (e.g. '1.1').
17
+ compression: string[]; // Content-Encoding value of the response, if any; empty otherwise.
18
+ keepAlive: boolean; // False only when the Connection header equals 'close' (case-insensitive).
19
+ transferEncoding: string | null; // Transfer-Encoding header value, or null when absent.
20
+ redirectCount: number; // Number of redirect hops followed before the final response.
21
+ redirects: RedirectInfo[]; // One entry per redirect hop, in the order followed.
22
+ serverHeader: string | null; // Server header value, or null when absent.
23
+ headers: Record<string, string | string[] | undefined>; // Response headers of the final response.
24
+ }
25
+ export interface CertificateInfo { // Summary of the peer TLS certificate.
26
+ issuer: string; // Issuer common name; 'Unknown' when the certificate has none.
27
+ subject: string; // Subject common name; 'Unknown' when the certificate has none.
28
+ validFrom: string; // Start of validity, as the date string reported by the TLS layer.
29
+ validTo: string; // End of validity, as the date string reported by the TLS layer.
30
+ daysUntilExpiry: number; // Whole days from now until validTo; negative once expired.
31
+ isSelfSigned: boolean; // True when issuer and subject common names are equal.
32
+ isValidChain: boolean; // Whether the TLS socket reported the peer certificate as authorized.
33
+ fingerprint: string; // Certificate fingerprint as reported by the TLS layer.
34
+ serialNumber: string; // Certificate serial number as reported by the TLS layer.
35
+ subjectAltName?: string; // Raw subjectAltName string, when present.
36
+ }
37
+ export interface RedirectInfo { // One hop of a redirect chain.
38
+ url: string; // URL that answered with a redirect.
39
+ statusCode: number; // HTTP status of that response (0 when the status was unavailable).
40
+ location: string | null; // Absolute redirect target resolved from the Location header.
41
+ }
42
+ export interface SecurityHeadersResult { // Per-header status for the audited security headers.
43
+ strictTransportSecurity: HeaderStatus; // Presumably Strict-Transport-Security — confirm against the headers audit.
44
+ contentSecurityPolicy: HeaderStatus; // Presumably Content-Security-Policy — confirm.
45
+ xFrameOptions: HeaderStatus; // Presumably X-Frame-Options — confirm.
46
+ xContentTypeOptions: HeaderStatus; // Presumably X-Content-Type-Options — confirm.
47
+ referrerPolicy: HeaderStatus; // Presumably Referrer-Policy — confirm.
48
+ permissionsPolicy: HeaderStatus; // Presumably Permissions-Policy — confirm.
49
+ details: Record<string, string>; // String-keyed extra details; key scheme is not visible here — TODO confirm.
50
+ score: number; // Header sub-score; scale is not visible here — TODO confirm.
51
+ }
52
+ export interface HeaderStatus { // Evaluation result for a single response header.
53
+ present: boolean; // Whether the header was found.
54
+ value: string | null; // Header value; presumably null when the header is absent — confirm.
55
+ valid: boolean; // Whether the value passed validation; rules live in the headers audit (not visible here).
56
+ issues?: string[]; // Optional list of problems found with the value.
57
+ }
58
+ export interface DnsDiagnostics { // DNS lookup results for the audited host.
59
+ a: string[]; // Presumably A (IPv4) records — confirm against the DNS audit.
60
+ aaaa: string[]; // Presumably AAAA (IPv6) records — confirm.
61
+ cname: string[]; // Presumably CNAME records — confirm.
62
+ reverse: string[]; // Presumably reverse-lookup hostnames — confirm.
63
+ ipCount: number; // Number of resolved addresses; exact counting rule not visible here — TODO confirm.
64
+ ipv6Support: boolean; // Presumably true when at least one AAAA record exists — confirm.
65
+ resolutionTime: number; // Duration of the DNS resolution; unit not visible here (likely ms) — TODO confirm.
66
+ }
67
+ export interface PerformanceMetrics { // Timings (milliseconds, from performance.now()) and sizes (bytes) of the final request.
68
+ dnsLookupTime: number; // Time from request start to the socket 'lookup' event; 0 when no lookup occurred.
69
+ tcpConnectTime: number; // Time from DNS completion (or request start) to the socket 'connect' event.
70
+ tlsHandshakeTime: number; // Time from 'connect' to 'secureConnect'; 0 for plain HTTP.
71
+ ttfb: number; // Time from the request being sent to the response headers arriving.
72
+ totalTime: number; // Total time of the final request plus the time spent on redirect hops.
73
+ htmlSize: number; // Length of the response body buffer as received (not decoded).
74
+ headerSize: number; // Byte length of the response head re-serialized from the parsed headers.
75
+ redirectTime?: number; // Sum of the total request times of all redirect hops.
76
+ }
77
+ export interface AuditIssue { // A single finding produced by an audit step.
78
+ id: string; // Stable identifier of the finding (e.g. 'cert-invalid').
79
+ severity: 'critical' | 'severe' | 'moderate' | 'minor' | 'info'; // Severity level of the finding.
80
+ category: 'tls' | 'http' | 'headers' | 'dns' | 'performance'; // Audit area the finding belongs to.
81
+ message: string; // Human-readable description.
82
+ scorePenalty: number; // Penalty associated with the finding; how it is applied is decided by the scoring module (not visible here).
83
+ }
84
+ export interface AuditOptions { // Optional settings for an audit run.
85
+ timeout?: number; // Request timeout; presumably milliseconds — confirm against the audit entry point.
86
+ verbose?: boolean; // Presumably enables more detailed output — confirm.
87
+ debug?: boolean; // Presumably enables debug output — confirm.
88
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,6 @@
1
+ import { ProxyAgent } from 'undici';
2
+ export declare class ProxyAdapter { // Holds an optional undici ProxyAgent built from a proxy URL.
3
+ private agent?; // The ProxyAgent instance; left unset when no proxy URL is supplied.
4
+ constructor(proxyUrl?: string); // The implementation throws Error("Invalid proxy URL: …") when the URL cannot be parsed or the agent cannot be created.
5
+ get dispatcher(): ProxyAgent | undefined; // The configured agent, or undefined when no proxy is in use.
6
+ }
@@ -0,0 +1,19 @@
1
+ import { ProxyAgent } from 'undici';
2
/**
 * Thin holder for an optional undici ProxyAgent.
 * Without a proxy URL no agent is created and `dispatcher` is undefined.
 */
export class ProxyAdapter {
    agent;
    /**
     * @param proxyUrl Optional proxy URL. Any failure while parsing it or
     *   while constructing the agent is reported as "Invalid proxy URL".
     */
    constructor(proxyUrl) {
        if (!proxyUrl) {
            return;
        }
        try {
            // Parsed only for validation; the result itself is not needed
            new URL(proxyUrl);
            this.agent = new ProxyAgent(proxyUrl);
        }
        catch {
            throw new Error(`Invalid proxy URL: ${proxyUrl}`);
        }
    }
    /** The proxy agent, or undefined when no proxy was configured. */
    get dispatcher() {
        return this.agent;
    }
}
@@ -0,0 +1,6 @@
1
+ export declare class RateLimiter { // Per-host token-bucket rate limiter.
2
+ private buckets; // Map from host to its bucket state.
3
+ private rate; // Tokens per second.
4
+ constructor(rate?: number); // rate defaults to 2 in the implementation.
5
+ waitForToken(host: string, crawlDelay?: number): Promise<void>; // Resolves once a token for host is available; a positive crawlDelay lowers the effective rate to at most 1 / crawlDelay.
6
+ }
@@ -0,0 +1,31 @@
1
/**
 * Per-host token-bucket rate limiter.
 *
 * Each host gets its own bucket that refills at the effective rate. Without
 * a crawl delay the bucket holds up to `max(1, rate)` tokens, so short bursts
 * are possible; with a crawl delay the bucket holds a single token so
 * consecutive requests are always spaced out.
 */
export class RateLimiter {
    buckets = new Map();
    rate; // tokens per second
    /**
     * @param rate Tokens per second (default 2). Must be a positive finite number.
     * @throws RangeError for a zero, negative, NaN or infinite rate, which
     *   would otherwise make waitForToken spin or wait forever.
     */
    constructor(rate = 2) {
        if (!Number.isFinite(rate) || rate <= 0) {
            throw new RangeError(`RateLimiter rate must be a positive finite number, got ${rate}`);
        }
        this.rate = rate;
    }
    /**
     * Resolves once a token for `host` is available and consumes it.
     *
     * @param host Key of the bucket to draw from.
     * @param crawlDelay Optional minimum delay between requests in seconds;
     *   when positive, the effective rate is capped at `1 / crawlDelay`.
     */
    async waitForToken(host, crawlDelay = 0) {
        const hasDelay = crawlDelay > 0;
        const effectiveRate = hasDelay ? Math.min(this.rate, 1 / crawlDelay) : this.rate;
        const interval = 1000 / effectiveRate; // ms needed to earn one token
        // Capacity must be at least 1, otherwise a rate below one token per
        // second could never accumulate a full token. With a crawl delay no
        // burst is allowed at all.
        const capacity = hasDelay ? 1 : Math.max(1, this.rate);
        const bucket = this.buckets.get(host);
        if (!bucket) {
            // First request for this host: start with a full bucket minus the token used now
            this.buckets.set(host, { tokens: capacity - 1, lastRefill: Date.now() });
            return;
        }
        while (true) {
            const now = Date.now();
            const elapsed = now - bucket.lastRefill;
            if (elapsed > 0) {
                bucket.tokens = Math.min(capacity, bucket.tokens + elapsed / interval);
                bucket.lastRefill = now;
            }
            if (bucket.tokens >= 1) {
                bucket.tokens -= 1;
                return;
            }
            // Sleep just long enough to earn the missing fraction of a token
            const waitTime = Math.max(1, Math.ceil((1 - bucket.tokens) * interval));
            await new Promise(resolve => setTimeout(resolve, waitTime));
        }
    }
}
@@ -0,0 +1,13 @@
1
+ export declare class RedirectController { // Tracks the hops of one redirect chain and flags loops or too many hops.
2
+ private maxHops; // Maximum number of hops nextHop() accepts.
3
+ private currentHops; // Number of hops accepted so far.
4
+ private history; // Normalized URLs already seen, including the seed URL when given.
5
+ constructor(maxHops?: number, seedUrl?: string); // maxHops defaults to 5 in the implementation; seedUrl pre-populates the loop history.
6
+ /**
7
+ * Records a hop and checks if it's within limits and not a loop.
8
+ * Returns null if allowed, or an error status string if blocked.
9
+ */
10
+ nextHop(url: string): 'redirect_limit_exceeded' | 'redirect_loop' | null;
11
+ get hops(): number; // Number of hops accepted so far.
12
+ private normalize; // Strips the URL fragment for loop detection; unparsable input is used as-is.
13
+ }
@@ -0,0 +1,41 @@
1
/**
 * Bookkeeping for a single redirect chain: counts accepted hops and
 * remembers every visited URL (fragment removed) to detect loops.
 */
export class RedirectController {
    maxHops;
    currentHops = 0;
    history = new Set();
    /**
     * @param maxHops Maximum number of hops that may be accepted (default 5).
     * @param seedUrl Optional starting URL, remembered so a redirect back to it counts as a loop.
     */
    constructor(maxHops = 5, seedUrl) {
        this.maxHops = maxHops;
        if (seedUrl) {
            const seedKey = this.normalize(seedUrl);
            this.history.add(seedKey);
        }
    }
    /**
     * Registers the next redirect target.
     * The loop check runs before the limit check.
     *
     * @returns null when the hop is accepted, otherwise the reason it was refused.
     */
    nextHop(url) {
        const key = this.normalize(url);
        const seenBefore = this.history.has(key);
        if (seenBefore) {
            return 'redirect_loop';
        }
        const limitReached = this.currentHops >= this.maxHops;
        if (limitReached) {
            return 'redirect_limit_exceeded';
        }
        this.currentHops += 1;
        this.history.add(key);
        return null;
    }
    /** Number of hops accepted so far. */
    get hops() {
        return this.currentHops;
    }
    /** Returns the URL without its fragment; unparsable input is returned unchanged. */
    normalize(url) {
        let parsed;
        try {
            parsed = new URL(url);
        }
        catch {
            return url;
        }
        parsed.hash = '';
        return parsed.toString();
    }
}
@@ -0,0 +1,4 @@
1
+ import { Readable } from 'stream';
2
+ export declare class ResponseLimiter { // Reads a stream into a string while enforcing a byte limit.
3
+ static streamToString(stream: Readable, maxBytes: number, onOversized?: (bytes: number) => void): Promise<string>; // Resolves with the UTF-8 decoded content; once more than maxBytes were received the stream is destroyed, onOversized gets the byte count seen so far, and the promise rejects with Error('Oversized response').
4
+ }
@@ -0,0 +1,26 @@
1
/**
 * Helper for reading a response stream with an upper bound on its size.
 */
export class ResponseLimiter {
    /**
     * Collects `stream` into a UTF-8 string.
     *
     * @param stream Readable stream to consume.
     * @param maxBytes Maximum number of bytes accepted.
     * @param onOversized Optional callback invoked with the number of bytes
     *   received so far when the limit is exceeded.
     * @returns Promise resolving to the decoded content; it rejects with
     *   'Oversized response' when the limit is exceeded, or with the
     *   stream's own error.
     */
    static async streamToString(stream, maxBytes, onOversized) {
        return new Promise((resolve, reject) => {
            const parts = [];
            let received = 0;
            const handleChunk = (chunk) => {
                const piece = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
                received += piece.length;
                const tooLarge = received > maxBytes;
                if (!tooLarge) {
                    parts.push(piece);
                    return;
                }
                // Stop reading before reporting the overflow
                stream.destroy();
                if (onOversized) {
                    onOversized(received);
                }
                reject(new Error('Oversized response'));
            };
            const handleEnd = () => {
                const whole = Buffer.concat(parts);
                resolve(whole.toString('utf-8'));
            };
            const handleError = (err) => {
                reject(err);
            };
            stream.on('data', handleChunk);
            stream.on('end', handleEnd);
            stream.on('error', handleError);
        });
    }
}
@@ -0,0 +1,10 @@
1
+ export interface RetryConfig { // Settings for RetryPolicy.execute.
2
+ maxRetries: number; // Number of retries after the first attempt.
3
+ baseDelay: number; // Delay before the first retry, in milliseconds; doubled for each further retry.
4
+ }
5
+ export declare class RetryPolicy { // Retry helper with exponential backoff and jitter.
6
+ static DEFAULT_CONFIG: RetryConfig; // Used when execute() gets no config (3 retries, 500 ms base delay in the implementation).
7
+ static execute<T>(operation: (attempt: number) => Promise<T>, isRetryable: (error: any) => boolean, config?: RetryConfig): Promise<T>; // Runs operation (attempt starts at 0) and retries while isRetryable(error) is true and retries remain; rethrows the last error otherwise.
8
+ static isRetryableStatus(status: number): boolean; // True for HTTP 429 and 5xx.
9
+ static isNetworkError(error: any): boolean; // True when error.code or error.cause.code is one of the listed transient network error codes.
10
+ }
@@ -0,0 +1,41 @@
1
/** Error codes that RetryPolicy.isNetworkError treats as transient network failures. */
const RETRYABLE_NETWORK_CODES = new Set([
    'ETIMEDOUT',
    'ECONNRESET',
    'EADDRINUSE',
    'ECONNREFUSED',
    'EPIPE',
    'ENOTFOUND',
    'ENETUNREACH',
    'EAI_AGAIN'
]);
/**
 * Retry helper with exponential backoff (base * 2^attempt) and +/-10% jitter.
 */
export class RetryPolicy {
    static DEFAULT_CONFIG = {
        maxRetries: 3,
        baseDelay: 500
    };
    /**
     * Runs `operation`, retrying it after failures that `isRetryable` accepts.
     *
     * @param operation Receives the zero-based attempt number.
     * @param isRetryable Decides whether a thrown error may be retried.
     * @param config Retry settings; defaults to DEFAULT_CONFIG.
     * @returns The first successful result of `operation`.
     * @throws The error of the last attempt when it is not retryable or no
     *   retries remain.
     */
    static async execute(operation, isRetryable, config = RetryPolicy.DEFAULT_CONFIG) {
        // The operation always runs at least once: a negative or NaN
        // maxRetries is treated as 0 instead of skipping the loop entirely
        // (which previously ended in `throw undefined`).
        const maxRetries = Math.max(0, Math.floor(config.maxRetries) || 0);
        for (let attempt = 0;; attempt++) {
            try {
                return await operation(attempt);
            }
            catch (error) {
                if (attempt >= maxRetries || !isRetryable(error)) {
                    throw error;
                }
                const delay = config.baseDelay * Math.pow(2, attempt);
                const jitter = delay * 0.1 * (Math.random() * 2 - 1);
                const finalDelay = Math.max(0, delay + jitter);
                await new Promise(resolve => setTimeout(resolve, finalDelay));
            }
        }
    }
    /** True for HTTP 429 (Too Many Requests) and any 5xx status. */
    static isRetryableStatus(status) {
        return status === 429 || (status >= 500 && status <= 599);
    }
    /** True when `error.code` (or `error.cause.code`) is a known transient network error code. */
    static isNetworkError(error) {
        const code = error?.code || error?.cause?.code;
        return RETRYABLE_NETWORK_CODES.has(code);
    }
}
@@ -0,0 +1,11 @@
1
+ export declare class DomainFilter { // Exact-match allow/deny filter for hostnames.
2
+ private allowed; // Set of normalized allowed hostnames; empty means "no allow-list".
3
+ private denied; // Set of normalized denied hostnames.
4
+ constructor(allowed?: string[], denied?: string[]); // Both lists default to empty in the implementation.
5
+ /**
6
+ * Normalizes a hostname: lowercase, strip trailing dot.
7
+ * Note: We expect hostnames, not URLs.
8
+ */
9
+ private normalize;
10
+ isAllowed(hostname: string): boolean; // False when the host is denied, or when an allow-list exists and the host is not on it; true otherwise.
11
+ }
@@ -0,0 +1,40 @@
1
/**
 * Exact-match hostname filter. The deny-list always wins; an empty
 * allow-list means every host that is not denied is accepted.
 */
export class DomainFilter {
    allowed;
    denied;
    /**
     * @param allowed Hostnames to allow (empty = no restriction).
     * @param denied Hostnames to reject.
     */
    constructor(allowed = [], denied = []) {
        const toHostSet = (hosts) => new Set(hosts.map((host) => this.normalize(host)));
        this.allowed = toHostSet(allowed);
        this.denied = toHostSet(denied);
    }
    /**
     * Canonical form of a hostname: lower-cased, trimmed, one trailing dot
     * removed, then passed through the URL parser (which also handles
     * punycode). Input the URL parser rejects is returned in its cleaned form.
     * Expects hostnames, not URLs.
     */
    normalize(hostname) {
        const cleaned = hostname.toLowerCase().trim();
        const bare = cleaned.endsWith('.') ? cleaned.slice(0, -1) : cleaned;
        try {
            // A dummy scheme lets the URL parser do the host normalization
            return new URL(`http://${bare}`).hostname;
        }
        catch {
            return bare;
        }
    }
    /** Whether `hostname` passes the deny-list and (if present) the allow-list. */
    isAllowed(hostname) {
        const host = this.normalize(hostname);
        if (this.denied.has(host)) {
            return false;
        }
        const unrestricted = this.allowed.size === 0;
        return unrestricted || this.allowed.has(host);
    }
}
@@ -0,0 +1,14 @@
1
+ export interface ScopeOptions { // Configuration for ScopeManager.
2
+ allowedDomains?: string[]; // Hostnames that are explicitly allowed; these also bypass the subdomain policy.
3
+ deniedDomains?: string[]; // Hostnames that are always rejected.
4
+ includeSubdomains?: boolean; // Whether subdomains of the root host are in scope.
5
+ rootUrl: string; // URL whose hostname defines the root host of the crawl scope.
6
+ }
7
+ export type EligibilityResult = 'allowed' | 'blocked_by_domain_filter' | 'blocked_subdomain'; // Verdict of ScopeManager.isUrlEligible; unparsable URLs are reported as 'blocked_by_domain_filter'.
8
+ export declare class ScopeManager { // Decides whether a URL is inside the crawl scope.
9
+ private domainFilter; // Allow/deny filter built from the configured domain lists.
10
+ private subdomainPolicy; // Root-host / subdomain rule built from rootUrl and includeSubdomains.
11
+ private explicitAllowed; // Set of explicitly allowed hostnames; these skip the subdomain policy.
12
+ constructor(options: ScopeOptions);
13
+ isUrlEligible(url: string): EligibilityResult; // Checks the domain filter first, then the explicit allow-list, then the subdomain policy.
14
+ }
@@ -0,0 +1,39 @@
1
+ import { DomainFilter } from './domainFilter.js';
2
+ import { SubdomainPolicy } from './subdomainPolicy.js';
3
/**
 * Canonical hostname form used for the explicit allow-list: lower-cased,
 * trimmed, one trailing dot removed, then run through the URL parser so the
 * result matches what `new URL(url).hostname` yields (e.g. punycode for
 * internationalized names). Mirrors the normalization DomainFilter applies.
 */
function canonicalHost(raw) {
    let host = raw.toLowerCase().trim();
    if (host.endsWith('.')) {
        host = host.slice(0, -1);
    }
    try {
        return new URL(`http://${host}`).hostname;
    }
    catch {
        return host;
    }
}
/**
 * Combines the domain allow/deny filter with the subdomain policy to decide
 * whether a URL is inside the crawl scope.
 */
export class ScopeManager {
    domainFilter;
    subdomainPolicy;
    explicitAllowed;
    /**
     * @param options Scope configuration; `rootUrl` defines the root host,
     *   `allowedDomains` / `deniedDomains` feed the domain filter.
     */
    constructor(options) {
        this.domainFilter = new DomainFilter(options.allowedDomains, options.deniedDomains);
        this.subdomainPolicy = new SubdomainPolicy(options.rootUrl, options.includeSubdomains);
        // Normalized the same way as the domain filter, so an allow-listed
        // internationalized or port-qualified entry is recognized below
        // instead of falling through to the subdomain policy.
        this.explicitAllowed = new Set((options.allowedDomains || []).map(canonicalHost));
    }
    /**
     * Classifies `url`.
     *
     * @returns 'allowed', 'blocked_by_domain_filter' (also used for
     *   unparsable URLs) or 'blocked_subdomain'.
     */
    isUrlEligible(url) {
        let hostname;
        try {
            hostname = new URL(url).hostname.toLowerCase();
            if (hostname.endsWith('.'))
                hostname = hostname.slice(0, -1);
        }
        catch {
            return 'blocked_by_domain_filter'; // Invalid URL is effectively blocked
        }
        if (!this.domainFilter.isAllowed(hostname)) {
            return 'blocked_by_domain_filter';
        }
        // If explicit whitelist is used, and this domain is in it, allow it
        if (this.explicitAllowed.has(hostname)) {
            return 'allowed';
        }
        if (!this.subdomainPolicy.isAllowed(hostname)) {
            return 'blocked_subdomain';
        }
        return 'allowed';
    }
}
@@ -0,0 +1,6 @@
1
+ export declare class SubdomainPolicy { // Accepts the root host and, optionally, its subdomains.
2
+ private rootHost; // Lower-cased hostname of rootUrl without a trailing dot; '' when rootUrl cannot be parsed.
3
+ private includeSubdomains; // Whether hosts ending in ".<rootHost>" are accepted.
4
+ constructor(rootUrl: string, includeSubdomains?: boolean); // includeSubdomains defaults to false in the implementation.
5
+ isAllowed(hostname: string): boolean; // True for the root host itself and, when enabled, for its subdomains.
6
+ }
@@ -0,0 +1,35 @@
1
/**
 * Accepts the root host of a crawl and, optionally, its subdomains.
 */
export class SubdomainPolicy {
    rootHost;
    includeSubdomains;
    /**
     * @param rootUrl URL whose hostname becomes the root host. When it cannot
     *   be parsed the root host is left empty and no host is ever accepted.
     * @param includeSubdomains Whether subdomains of the root host are accepted (default false).
     */
    constructor(rootUrl, includeSubdomains = false) {
        try {
            this.rootHost = new URL(rootUrl).hostname.toLowerCase();
            if (this.rootHost.endsWith('.')) {
                this.rootHost = this.rootHost.slice(0, -1);
            }
        }
        catch {
            this.rootHost = '';
        }
        this.includeSubdomains = includeSubdomains;
    }
    /**
     * @param hostname Hostname to test (case, surrounding whitespace and one
     *   trailing dot are ignored).
     * @returns true for the root host itself, and for its subdomains when
     *   `includeSubdomains` is enabled.
     */
    isAllowed(hostname) {
        // Without a root host nothing can match. Previously an empty root
        // host matched the empty hostname and, with subdomains enabled, any
        // host still ending in '.' after normalization.
        if (!this.rootHost) {
            return false;
        }
        let target = hostname.toLowerCase().trim();
        if (target.endsWith('.')) {
            target = target.slice(0, -1);
        }
        // Exact match is always allowed
        if (target === this.rootHost) {
            return true;
        }
        if (!this.includeSubdomains) {
            return false;
        }
        // Label-based check: the leading dot prevents "notexample.com"
        // from matching the root "example.com".
        return target.endsWith(`.${this.rootHost}`);
    }
}
@@ -0,0 +1,11 @@
1
+ export declare class IPGuard { // Static helpers for rejecting internal/private network targets; implementation is not visible in this declaration.
2
+ /**
3
+ * Checks if an IP address is internal/private
4
+ */
5
+ static isInternal(ip: string): boolean; // NOTE(review): which address ranges count as internal is defined in the implementation — confirm there.
6
+ /**
7
+ * Resolves a hostname and validates all result IPs
8
+ */
9
+ static validateHost(host: string): Promise<boolean>; // The transport audit treats a falsy result as "blocked"; presumably true means every resolved IP is public — confirm.
10
+ private static expandIPv6; // Presumably expands abbreviated IPv6 notation — implementation not visible, confirm.
11
+ }