jaku.sh 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +52 -0
- package/README.md +636 -0
- package/action.yml +264 -0
- package/bin/jaku +2 -0
- package/package.json +62 -0
- package/src/agents/ai-agent.js +175 -0
- package/src/agents/api-agent.js +95 -0
- package/src/agents/base-agent.js +158 -0
- package/src/agents/crawl-agent.js +175 -0
- package/src/agents/event-bus.js +59 -0
- package/src/agents/findings-ledger.js +410 -0
- package/src/agents/logic-agent.js +144 -0
- package/src/agents/orchestrator.js +323 -0
- package/src/agents/qa-agent.js +149 -0
- package/src/agents/security-agent.js +211 -0
- package/src/cli.js +423 -0
- package/src/core/accessibility-checker.js +171 -0
- package/src/core/ai/ai-endpoint-detector.js +227 -0
- package/src/core/ai/guardrail-prober.js +362 -0
- package/src/core/ai/indirect-injector.js +106 -0
- package/src/core/ai/jailbreak-tester.js +212 -0
- package/src/core/ai/model-dos-tester.js +174 -0
- package/src/core/ai/model-fingerprinter.js +246 -0
- package/src/core/ai/multi-turn-attacker.js +297 -0
- package/src/core/ai/output-analyzer.js +182 -0
- package/src/core/ai/prompt-injector.js +543 -0
- package/src/core/ai/system-prompt-extractor.js +244 -0
- package/src/core/api/api-key-auditor.js +266 -0
- package/src/core/api/auth-flow-tester.js +430 -0
- package/src/core/api/cors-ws-tester.js +263 -0
- package/src/core/api/graphql-tester.js +287 -0
- package/src/core/api/oauth-prober.js +343 -0
- package/src/core/auth-manager.js +902 -0
- package/src/core/broken-flow-detector.js +207 -0
- package/src/core/browser-manager.js +119 -0
- package/src/core/console-monitor.js +111 -0
- package/src/core/crawler.js +430 -0
- package/src/core/csr-waiter.js +410 -0
- package/src/core/form-validator.js +240 -0
- package/src/core/logic/abuse-pattern-scanner.js +291 -0
- package/src/core/logic/access-boundary-tester.js +448 -0
- package/src/core/logic/business-rule-inferrer.js +196 -0
- package/src/core/logic/graphql-auditor.js +298 -0
- package/src/core/logic/parameter-polluter.js +212 -0
- package/src/core/logic/pricing-exploiter.js +299 -0
- package/src/core/logic/race-condition-detector.js +222 -0
- package/src/core/logic/workflow-enforcer.js +284 -0
- package/src/core/performance-checker.js +204 -0
- package/src/core/responsive-checker.js +228 -0
- package/src/core/security/cors-prober.js +150 -0
- package/src/core/security/csrf-prober.js +217 -0
- package/src/core/security/dependency-auditor.js +182 -0
- package/src/core/security/file-upload-tester.js +340 -0
- package/src/core/security/header-analyzer.js +324 -0
- package/src/core/security/infra-scanner.js +391 -0
- package/src/core/security/path-traversal.js +112 -0
- package/src/core/security/prototype-pollution.js +147 -0
- package/src/core/security/secret-detector.js +517 -0
- package/src/core/security/sqli-prober.js +257 -0
- package/src/core/security/tls-checker.js +223 -0
- package/src/core/security/xss-scanner.js +225 -0
- package/src/core/test-generator.js +339 -0
- package/src/core/test-runner.js +398 -0
- package/src/reporting/diff-reporter.js +172 -0
- package/src/reporting/report-generator.js +408 -0
- package/src/reporting/sarif-generator.js +190 -0
- package/src/utils/config.js +57 -0
- package/src/utils/finding.js +67 -0
- package/src/utils/logger.js +50 -0
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
import { BrowserManager } from './browser-manager.js';
|
|
2
|
+
import pLimit from 'p-limit';
|
|
3
|
+
import { createFinding } from '../utils/finding.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Breadth-first, same-origin site crawler built on a Playwright-style browser
 * context. It records every visited page, the forms on it, JSON API responses
 * seen on the wire, console errors and failed requests, and returns them as a
 * single inventory object from {@link Crawler#crawl}.
 */
export class Crawler {
  /**
   * @param {object} config - Tool configuration. Reads the optional
   *   `crawler.max_pages`, `crawler.max_depth`, `crawler.timeout`,
   *   `crawler.concurrency` and `crawler.delay_ms` keys.
   * @param {object} [logger] - Optional logger; `debug`, `info` and `warn`
   *   are each called only if present.
   */
  constructor(config, logger) {
    this.config = config;
    this.logger = logger;

    // Accumulated crawl state.
    this.visited = new Set(); // normalized URLs already queued or crawled
    this.surfaces = [];
    this.apiEndpoints = [];
    this.forms = [];
    this.consoleErrors = [];
    this.failedRequests = [];

    // Limits.
    this.maxPages = config.crawler?.max_pages || 50;
    this.maxDepth = config.crawler?.max_depth || 5;
    this.timeout = config.crawler?.timeout || 30000;

    // Concurrency and politeness. `??` keeps an explicit delay of 0.
    this.concurrency = config.crawler?.concurrency || 5;
    this.delayMs = config.crawler?.delay_ms ?? 100;

    this.baseUrl = null;
    this._queue = [];
    this._limit = null;
    // Epoch-ms deadline set by a Retry-After response; page loads wait until
    // it has passed. Kept separate from `delayMs` so the slowdown expires.
    this._backoffUntil = 0;
  }

  /**
   * Main crawl entry point.
   *
   * @param {string} targetUrl - Absolute URL to start from; its origin bounds the crawl.
   * @param {object} [authState] - Playwright storageState for authenticated crawling.
   * @param {string[]} [seedLinks] - Additional URLs to crawl (e.g. from a post-login page).
   * @returns {Promise<object>} Inventory with `baseUrl`, `pages`, `apiEndpoints`,
   *   `forms`, the three `total*` counters, `crawledAt` and `authenticated`.
   * @throws {TypeError} If `targetUrl` is not a valid absolute URL.
   */
  async crawl(targetUrl, authState = null, seedLinks = []) {
    this.baseUrl = new URL(targetUrl);
    this._limit = pLimit(this.concurrency);

    const browser = await BrowserManager.launch({ headless: true });

    try {
      const contextOptions = {
        viewport: { width: 1440, height: 900 },
        ignoreHTTPSErrors: true,
      };
      if (authState) {
        contextOptions.storageState = authState;
      }

      // Created inside the try so the browser is closed even if this throws.
      const context = await browser.newContext(contextOptions);

      this._enqueue(targetUrl, 0);
      // _enqueue drops cross-origin and already-seen URLs itself.
      for (const link of seedLinks) {
        this._enqueue(link, 0);
      }

      await this._processQueue(context);

      // Backup discovery: if the crawl found very few pages, try sitemap.xml and robots.txt.
      if (this.surfaces.length <= 2) {
        this.logger?.info?.('Few pages discovered — trying sitemap.xml and robots.txt as backup discovery');
        const backupLinks = await this._discoverBackupLinks(targetUrl);
        for (const link of backupLinks) {
          this._enqueue(link, 1);
        }
        await this._processQueue(context);
      }
    } finally {
      await browser.close();
    }

    const inventory = {
      baseUrl: targetUrl,
      pages: this.surfaces,
      apiEndpoints: this.apiEndpoints,
      forms: this.forms,
      totalPages: this.surfaces.length,
      totalApis: this.apiEndpoints.length,
      totalForms: this.forms.length,
      crawledAt: new Date().toISOString(),
      authenticated: !!authState,
    };

    this.logger?.info?.(`Crawl complete: ${inventory.totalPages} pages, ${inventory.totalApis} APIs, ${inventory.totalForms} forms${authState ? ' (authenticated)' : ''}`);
    return inventory;
  }

  /**
   * Add a URL to the crawl queue unless it was already seen, is cross-origin,
   * is deeper than `maxDepth`, or the page budget is exhausted.
   *
   * @param {string} url
   * @param {number} depth - Link distance from the start URL.
   */
  _enqueue(url, depth) {
    const normalized = this._normalizeUrl(url);
    if (this.visited.has(normalized)) return;
    if (!this._isSameOrigin(url)) return;
    if (depth > this.maxDepth) return;
    if (this.visited.size >= this.maxPages) return;

    // Mark as visited immediately to prevent duplicate queueing.
    this.visited.add(normalized);
    this._queue.push({ url, depth });
  }

  /**
   * Drain the queue in batches, running each batch through the concurrency
   * limiter. Pages crawled in one batch may enqueue links for the next.
   *
   * @param {object} context - Browser context used to open pages.
   */
  async _processQueue(context) {
    while (this._queue.length > 0 && this.surfaces.length < this.maxPages) {
      const batch = this._queue.splice(0, this._queue.length);
      const tasks = batch.map(({ url, depth }) =>
        this._limit(async () => {
          // Checked when the task actually starts; a check made while
          // building the batch would run before any page had been crawled.
          if (this.surfaces.length >= this.maxPages) return;
          await this._crawlPage(context, url, depth);
        })
      );
      await Promise.allSettled(tasks);
    }
  }

  /**
   * Crawl a single page: load it, record its metadata, links, forms, console
   * errors, failed requests and JSON API responses, then enqueue its links.
   *
   * @param {object} context - Browser context used to open the page.
   * @param {string} url - URL to navigate to (un-normalized).
   * @param {number} depth - Depth of this page in the crawl.
   */
  async _crawlPage(context, url, depth) {
    // Polite delay, extended while a Retry-After back-off is still active.
    const backoffMs = Math.max(0, this._backoffUntil - Date.now());
    const pauseMs = Math.max(this.delayMs, backoffMs);
    if (pauseMs > 0) {
      await new Promise(resolve => setTimeout(resolve, pauseMs));
    }

    const normalizedUrl = this._normalizeUrl(url);
    const page = await context.newPage();
    const pageData = {
      url: normalizedUrl,
      type: 'page',
      status: null,
      title: '',
      links: [],
      forms: [],
      consoleErrors: [],
      failedRequests: [],
      loadTime: 0,
    };

    // Monitor console and network.
    const consoleMessages = [];
    const failedReqs = [];

    page.on('console', msg => {
      if (msg.type() === 'error') {
        consoleMessages.push({
          type: msg.type(),
          text: msg.text(),
          url: normalizedUrl,
        });
      }
    });

    page.on('pageerror', error => {
      consoleMessages.push({
        type: 'exception',
        text: error.message,
        url: normalizedUrl,
      });
    });

    page.on('requestfailed', request => {
      failedReqs.push({
        url: request.url(),
        method: request.method(),
        failure: request.failure()?.errorText || 'Unknown',
        page: normalizedUrl,
      });
    });

    // Intercept responses: honour rate limiting and collect JSON endpoints.
    page.on('response', response => {
      const reqUrl = response.url();
      const headers = response.headers();
      const status = response.status();
      const contentType = headers['content-type'] || '';

      const retryAfter = headers['retry-after'];
      if (retryAfter && (status === 429 || status === 503)) {
        const waitMs = this._parseRetryAfterMs(retryAfter);
        this.logger?.warn?.(`Rate limited (${status}) — backing off ${waitMs}ms`);
        // A deadline rather than a permanent bump of delayMs, so the crawl
        // returns to its configured pace once the server's window has passed.
        this._backoffUntil = Math.max(this._backoffUntil, Date.now() + waitMs);
      }

      if (contentType.includes('application/json') && this._isSameOrigin(reqUrl)) {
        const method = response.request().method();
        const existing = this.apiEndpoints.find(e => e.url === reqUrl && e.method === method);
        if (!existing) {
          this.apiEndpoints.push({
            url: reqUrl,
            method,
            status,
            contentType,
          });
        }
      }
    });

    try {
      const startTime = Date.now();

      // Progressive fallback: networkidle → load → domcontentloaded.
      let response = null;
      const strategies = ['networkidle', 'load', 'domcontentloaded'];

      for (const [i, strategy] of strategies.entries()) {
        try {
          const strategyTimeout = strategy === 'networkidle' ? this.timeout : Math.min(this.timeout, 15000);
          response = await page.goto(url, {
            waitUntil: strategy,
            timeout: strategyTimeout,
          });
          this.logger?.debug?.(`Page loaded with '${strategy}' strategy: ${normalizedUrl}`);
          break;
        } catch (navErr) {
          if (i < strategies.length - 1) {
            this.logger?.debug?.(`'${strategy}' timed out for ${normalizedUrl}, trying '${strategies[i + 1]}'`);
          } else {
            this.logger?.warn?.(`All load strategies failed for ${normalizedUrl}: ${navErr.message}`);
          }
        }
      }

      pageData.loadTime = Date.now() - startTime;
      pageData.status = response?.status() || null;
      pageData.title = await page.title().catch(() => '');

      const links = await this._extractLinks(page);
      pageData.links = [...new Set(links)];

      // Extract forms. The callback runs inside the page, so it must be self-contained.
      const pageForms = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('form')).map((form, formIdx) => {
          const fields = Array.from(form.querySelectorAll('input, select, textarea')).map((field, fieldIdx) => ({
            tag: field.tagName.toLowerCase(),
            type: field.type || field.tagName.toLowerCase(),
            // Fall back to the field's own position so unnamed fields in the
            // same form do not all receive the same placeholder name.
            name: field.name || field.id || `field-${fieldIdx}`,
            required: field.required,
            placeholder: field.placeholder || '',
            pattern: field.pattern || '',
            minLength: field.minLength > 0 ? field.minLength : null,
            maxLength: field.maxLength > 0 ? field.maxLength : null,
          }));

          return {
            action: form.action || window.location.href,
            method: (form.method || 'get').toUpperCase(),
            id: form.id || `form-${formIdx}`,
            fields,
            hasSubmitButton: !!form.querySelector('button[type="submit"], input[type="submit"]'),
          };
        });
      }).catch(() => []);

      for (const form of pageForms) {
        form.page = normalizedUrl;
        this.forms.push(form);
      }
      pageData.forms = pageForms;
      pageData.consoleErrors = consoleMessages;
      pageData.failedRequests = failedReqs;

      this.consoleErrors.push(...consoleMessages);
      this.failedRequests.push(...failedReqs);
      this.surfaces.push(pageData);

      this.logger?.debug?.(`Crawled: ${normalizedUrl} (${pageData.status}) - ${links.length} links, ${pageForms.length} forms`);

      // Enqueue discovered links; the next _processQueue batch picks them up.
      if (depth < this.maxDepth) {
        for (const link of pageData.links) {
          this._enqueue(link, depth + 1);
        }
      }
    } catch (err) {
      // Salvage whatever links are present in the partially loaded DOM.
      const partialLinks = await this._extractLinks(page);

      pageData.status = 'error';
      pageData.error = err.message;
      pageData.links = [...new Set(partialLinks)];
      this.surfaces.push(pageData);
      this.logger?.warn?.(`Failed to crawl ${normalizedUrl}: ${err.message}${partialLinks.length > 0 ? ` (extracted ${partialLinks.length} partial links)` : ''}`);

      if (depth < this.maxDepth) {
        for (const link of partialLinks) {
          this._enqueue(link, depth + 1);
        }
      }
    } finally {
      await page.close();
    }
  }

  /**
   * Collect the resolved `href` of every anchor on the page, skipping
   * `javascript:` pseudo-links. Resolves to `[]` if evaluation fails.
   *
   * @param {object} page - Page to read anchors from.
   * @returns {Promise<string[]>}
   */
  _extractLinks(page) {
    return page.evaluate(() => {
      const anchors = Array.from(document.querySelectorAll('a[href]'));
      return anchors.map(a => a.href).filter(href => href && !href.startsWith('javascript:'));
    }).catch(() => []);
  }

  /**
   * Convert a Retry-After header value (delta-seconds or an HTTP-date) into a
   * wait in milliseconds. Unparseable, zero or past values fall back to 5000;
   * the result never exceeds 30000.
   *
   * @param {string} value - Raw header value.
   * @param {number} [now] - Reference time in epoch ms (injectable for tests).
   * @returns {number}
   */
  _parseRetryAfterMs(value, now = Date.now()) {
    const FALLBACK_MS = 5000;
    const MAX_MS = 30000;
    const text = String(value).trim();

    let waitMs;
    const seconds = Number.parseInt(text, 10);
    if (Number.isFinite(seconds)) {
      waitMs = seconds * 1000;
    } else {
      // Not numeric: try the HTTP-date form. An unparseable date yields NaN.
      waitMs = Date.parse(text) - now;
    }

    if (!(waitMs > 0)) {
      waitMs = FALLBACK_MS;
    }
    return Math.min(waitMs, MAX_MS);
  }

  /**
   * Canonical form of a URL used for de-duplication: the fragment is dropped
   * and one trailing slash is removed from the path (never from the query).
   * Unparseable input is returned unchanged.
   *
   * @param {string} url
   * @returns {string}
   */
  _normalizeUrl(url) {
    try {
      const u = new URL(url);
      u.hash = '';
      // Trim on the pathname so a query value ending in "/" is left intact.
      if (u.pathname !== '/' && u.pathname.endsWith('/')) {
        u.pathname = u.pathname.slice(0, -1);
      }
      return u.toString();
    } catch {
      return url;
    }
  }

  /**
   * @param {string} url
   * @returns {boolean} True when `url` parses and shares the crawl's origin;
   *   false for unparseable URLs or before `baseUrl` is set.
   */
  _isSameOrigin(url) {
    try {
      const u = new URL(url);
      return u.origin === this.baseUrl.origin;
    } catch {
      return false;
    }
  }

  /**
   * Backup link discovery via sitemap.xml and robots.txt.
   *
   * @param {string} targetUrl
   * @returns {Promise<string[]>} Same-origin URLs not yet visited.
   */
  async _discoverBackupLinks(targetUrl) {
    const discovered = new Set();
    // Shared so a sitemap named in robots.txt is not fetched a second time.
    const seenSitemaps = new Set();
    await this._discoverFromSitemap(targetUrl, discovered, seenSitemaps);
    await this._discoverFromRobots(targetUrl, discovered, seenSitemaps);
    const newLinks = [...discovered].filter(link => !this.visited.has(this._normalizeUrl(link)));
    if (newLinks.length > 0) {
      this.logger?.info?.(`Backup discovery found ${newLinks.length} new URLs from sitemap/robots`);
    }
    return newLinks;
  }

  /**
   * Collect same-origin page URLs from sitemaps.
   *
   * When `targetUrl` points at an `.xml` file, that file itself is fetched;
   * otherwise the `/sitemap.xml` and `/sitemap_index.xml` locations on its
   * origin are probed. Nested sitemaps are followed recursively.
   *
   * @param {string} targetUrl - Site URL or the URL of a specific sitemap file.
   * @param {Set<string>} discovered - Receives same-origin `<loc>` URLs.
   * @param {Set<string>} [seen] - Sitemap URLs already fetched; guards against
   *   index files that reference themselves or each other.
   */
  async _discoverFromSitemap(targetUrl, discovered, seen = new Set()) {
    const MAX_SITEMAPS = 50; // upper bound on sitemap documents fetched per discovery run

    let sitemapUrls;
    try {
      const parsed = new URL(targetUrl);
      sitemapUrls = parsed.pathname.toLowerCase().endsWith('.xml')
        ? [parsed.toString()]
        : [
          new URL('/sitemap.xml', parsed).toString(),
          new URL('/sitemap_index.xml', parsed).toString(),
        ];
    } catch {
      return; // not a usable URL; discovery is best-effort
    }

    for (const sitemapUrl of sitemapUrls) {
      if (seen.has(sitemapUrl) || seen.size >= MAX_SITEMAPS) continue;
      seen.add(sitemapUrl);

      try {
        const resp = await fetch(sitemapUrl, {
          signal: AbortSignal.timeout(10000),
          redirect: 'follow',
        });
        if (!resp.ok) continue;

        const contentType = resp.headers.get('content-type') || '';
        if (!contentType.includes('xml') && !contentType.includes('text')) continue;

        const body = await resp.text();
        for (const match of body.matchAll(/<loc>\s*(https?:\/\/[^<]+)\s*<\/loc>/gi)) {
          // <loc> values are XML-escaped (e.g. "&amp;" in query strings).
          const url = this._decodeXmlEntities(match[1].trim());
          if (this._isSameOrigin(url)) {
            discovered.add(url);
          }
          if (url.includes('sitemap') && url.endsWith('.xml')) {
            await this._discoverFromSitemap(url, discovered, seen);
          }
        }

        this.logger?.debug?.(`Parsed sitemap: ${sitemapUrl} → ${discovered.size} URLs`);
      } catch (err) {
        // Sitemap not available; this is optional discovery, so only note it.
        this.logger?.debug?.(`Sitemap unavailable: ${sitemapUrl} (${err?.message})`);
      }
    }
  }

  /**
   * Replace the five predefined XML entities with their characters.
   *
   * @param {string} text
   * @returns {string}
   */
  _decodeXmlEntities(text) {
    const named = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
    // Single pass, so "&amp;lt;" decodes to "&lt;" rather than "<".
    return text.replace(/&(amp|lt|gt|quot|apos);/g, (_, name) => named[name]);
  }

  /**
   * Read robots.txt: follow its `Sitemap:` entries and add every wildcard-free
   * `Disallow:` path on this origin as a crawl candidate.
   *
   * @param {string} targetUrl
   * @param {Set<string>} discovered - Receives same-origin URLs.
   * @param {Set<string>} [seen] - Sitemap URLs already fetched.
   */
  async _discoverFromRobots(targetUrl, discovered, seen = new Set()) {
    try {
      const robotsUrl = new URL('/robots.txt', targetUrl).toString();
      const resp = await fetch(robotsUrl, {
        signal: AbortSignal.timeout(10000),
        redirect: 'follow',
      });
      if (!resp.ok) return;

      const body = await resp.text();

      for (const line of body.split('\n')) {
        const trimmed = line.trim();
        const lowered = trimmed.toLowerCase();

        if (lowered.startsWith('sitemap:')) {
          const sitemapUrl = trimmed.substring(8).trim();
          if (sitemapUrl.startsWith('http')) {
            await this._discoverFromSitemap(sitemapUrl, discovered, seen);
          }
        }

        if (lowered.startsWith('disallow:')) {
          // Drop a trailing "# comment" before using the value as a path.
          const path = trimmed.substring(9).split('#')[0].trim();
          if (path && path !== '/' && path !== '*' && !path.includes('*')) {
            try {
              const fullUrl = new URL(path, targetUrl).toString();
              if (this._isSameOrigin(fullUrl)) {
                discovered.add(fullUrl);
              }
            } catch {
              // Invalid path: skip this rule.
            }
          }
        }
      }

      this.logger?.debug?.(`Parsed robots.txt → ${discovered.size} URLs`);
    } catch (err) {
      // robots.txt not available; this is optional discovery, so only note it.
      this.logger?.debug?.(`robots.txt unavailable (${err?.message})`);
    }
  }
}
|
|
429
|
+
|
|
430
|
+
// Default export of the same class that is also exported by name above,
// so both `import Crawler from …` and `import { Crawler } from …` resolve.
export default Crawler;
|