jaku.sh 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/LICENSE +52 -0
  2. package/README.md +636 -0
  3. package/action.yml +264 -0
  4. package/bin/jaku +2 -0
  5. package/package.json +62 -0
  6. package/src/agents/ai-agent.js +175 -0
  7. package/src/agents/api-agent.js +95 -0
  8. package/src/agents/base-agent.js +158 -0
  9. package/src/agents/crawl-agent.js +175 -0
  10. package/src/agents/event-bus.js +59 -0
  11. package/src/agents/findings-ledger.js +410 -0
  12. package/src/agents/logic-agent.js +144 -0
  13. package/src/agents/orchestrator.js +323 -0
  14. package/src/agents/qa-agent.js +149 -0
  15. package/src/agents/security-agent.js +211 -0
  16. package/src/cli.js +423 -0
  17. package/src/core/accessibility-checker.js +171 -0
  18. package/src/core/ai/ai-endpoint-detector.js +227 -0
  19. package/src/core/ai/guardrail-prober.js +362 -0
  20. package/src/core/ai/indirect-injector.js +106 -0
  21. package/src/core/ai/jailbreak-tester.js +212 -0
  22. package/src/core/ai/model-dos-tester.js +174 -0
  23. package/src/core/ai/model-fingerprinter.js +246 -0
  24. package/src/core/ai/multi-turn-attacker.js +297 -0
  25. package/src/core/ai/output-analyzer.js +182 -0
  26. package/src/core/ai/prompt-injector.js +543 -0
  27. package/src/core/ai/system-prompt-extractor.js +244 -0
  28. package/src/core/api/api-key-auditor.js +266 -0
  29. package/src/core/api/auth-flow-tester.js +430 -0
  30. package/src/core/api/cors-ws-tester.js +263 -0
  31. package/src/core/api/graphql-tester.js +287 -0
  32. package/src/core/api/oauth-prober.js +343 -0
  33. package/src/core/auth-manager.js +902 -0
  34. package/src/core/broken-flow-detector.js +207 -0
  35. package/src/core/browser-manager.js +119 -0
  36. package/src/core/console-monitor.js +111 -0
  37. package/src/core/crawler.js +430 -0
  38. package/src/core/csr-waiter.js +410 -0
  39. package/src/core/form-validator.js +240 -0
  40. package/src/core/logic/abuse-pattern-scanner.js +291 -0
  41. package/src/core/logic/access-boundary-tester.js +448 -0
  42. package/src/core/logic/business-rule-inferrer.js +196 -0
  43. package/src/core/logic/graphql-auditor.js +298 -0
  44. package/src/core/logic/parameter-polluter.js +212 -0
  45. package/src/core/logic/pricing-exploiter.js +299 -0
  46. package/src/core/logic/race-condition-detector.js +222 -0
  47. package/src/core/logic/workflow-enforcer.js +284 -0
  48. package/src/core/performance-checker.js +204 -0
  49. package/src/core/responsive-checker.js +228 -0
  50. package/src/core/security/cors-prober.js +150 -0
  51. package/src/core/security/csrf-prober.js +217 -0
  52. package/src/core/security/dependency-auditor.js +182 -0
  53. package/src/core/security/file-upload-tester.js +340 -0
  54. package/src/core/security/header-analyzer.js +324 -0
  55. package/src/core/security/infra-scanner.js +391 -0
  56. package/src/core/security/path-traversal.js +112 -0
  57. package/src/core/security/prototype-pollution.js +147 -0
  58. package/src/core/security/secret-detector.js +517 -0
  59. package/src/core/security/sqli-prober.js +257 -0
  60. package/src/core/security/tls-checker.js +223 -0
  61. package/src/core/security/xss-scanner.js +225 -0
  62. package/src/core/test-generator.js +339 -0
  63. package/src/core/test-runner.js +398 -0
  64. package/src/reporting/diff-reporter.js +172 -0
  65. package/src/reporting/report-generator.js +408 -0
  66. package/src/reporting/sarif-generator.js +190 -0
  67. package/src/utils/config.js +57 -0
  68. package/src/utils/finding.js +67 -0
  69. package/src/utils/logger.js +50 -0
@@ -0,0 +1,430 @@
1
+ import { BrowserManager } from './browser-manager.js';
2
+ import pLimit from 'p-limit';
3
+ import { createFinding } from '../utils/finding.js';
4
+
5
/**
 * Same-origin site crawler driven by a Playwright browser context.
 *
 * Starting from a target URL it visits pages breadth-first (bounded by
 * `max_pages` / `max_depth`), and records for every page its links, forms,
 * console errors and failed requests. Same-origin JSON responses observed
 * while pages load are collected as API endpoints. When very few pages are
 * found, sitemap.xml and robots.txt are consulted as a backup link source.
 */
export class Crawler {
  /** Maximum nesting of sitemap index files followed during backup discovery. */
  static MAX_SITEMAP_DEPTH = 5;

  /**
   * @param {object} config - Tool configuration; only `config.crawler` is read here
   *   (`max_pages`, `max_depth`, `timeout`, `concurrency`, `delay_ms`).
   * @param {object} [logger] - Optional logger; `info`, `warn` and `debug` are
   *   called only if present.
   */
  constructor(config, logger) {
    const settings = config?.crawler ?? {};

    this.config = config;
    this.logger = logger;
    this.visited = new Set();
    this.surfaces = [];
    this.apiEndpoints = [];
    this.forms = [];
    this.consoleErrors = [];
    this.failedRequests = [];
    this.maxPages = settings.max_pages || 50;
    this.maxDepth = settings.max_depth || 5;
    this.timeout = settings.timeout || 30000;

    // Concurrency + rate limiting
    this.concurrency = settings.concurrency || 5;
    this.delayMs = settings.delay_ms ?? 100; // 100ms default polite delay

    this.baseUrl = null;
    this._queue = [];
    this._limit = null;
    // Epoch-ms timestamp before which new page loads are held back after a
    // 429/503 response carrying Retry-After. 0 means "no back-off active".
    this._backoffUntil = 0;
  }

  /**
   * Main crawl entry point. Returns a SurfaceInventory.
   * @param {string} targetUrl - URL to crawl
   * @param {object} [authState] - Playwright storageState for authenticated crawling
   * @param {string[]} [seedLinks] - Additional URLs to crawl (e.g., from post-login page)
   * @returns {Promise<object>} Inventory with `pages`, `apiEndpoints`, `forms`,
   *   their totals, `crawledAt` and `authenticated`.
   */
  async crawl(targetUrl, authState = null, seedLinks = []) {
    this.baseUrl = new URL(targetUrl);

    // Concurrency limiter (default: 5 pages in parallel)
    this._limit = pLimit(this.concurrency);

    const contextOptions = {
      viewport: { width: 1440, height: 900 },
      ignoreHTTPSErrors: true,
    };
    if (authState) {
      contextOptions.storageState = authState;
    }

    const browser = await BrowserManager.launch({ headless: true });

    try {
      // Created inside the try so the browser is closed even if this fails.
      const context = await browser.newContext(contextOptions);

      // Seed the initial URL plus any links found during login.
      // _enqueue itself rejects cross-origin and already-seen URLs.
      this._enqueue(targetUrl, 0);
      for (const link of seedLinks) {
        this._enqueue(link, 0);
      }

      await this._processQueue(context);

      // Backup discovery: if crawl found very few pages, try sitemap.xml and robots.txt
      if (this.surfaces.length <= 2) {
        this.logger?.info?.('Few pages discovered — trying sitemap.xml and robots.txt as backup discovery');
        const backupLinks = await this._discoverBackupLinks(targetUrl);
        for (const link of backupLinks) {
          this._enqueue(link, 1);
        }
        await this._processQueue(context);
      }
    } finally {
      await browser.close();
    }

    const inventory = {
      baseUrl: targetUrl,
      pages: this.surfaces,
      apiEndpoints: this.apiEndpoints,
      forms: this.forms,
      totalPages: this.surfaces.length,
      totalApis: this.apiEndpoints.length,
      totalForms: this.forms.length,
      crawledAt: new Date().toISOString(),
      authenticated: !!authState,
    };

    this.logger?.info?.(`Crawl complete: ${inventory.totalPages} pages, ${inventory.totalApis} APIs, ${inventory.totalForms} forms${authState ? ' (authenticated)' : ''}`);
    return inventory;
  }

  /**
   * Add a URL to the crawl queue. Silently ignores URLs that were already
   * seen, are cross-origin, are too deep, or would exceed the page budget.
   */
  _enqueue(url, depth) {
    const normalized = this._normalizeUrl(url);
    if (this.visited.has(normalized)) return;
    if (!this._isSameOrigin(url)) return;
    if (depth > this.maxDepth) return;
    if (this.visited.size >= this.maxPages) return;

    // Mark as visited immediately to prevent duplicate queueing
    this.visited.add(normalized);
    this._queue.push({ url, depth });
  }

  /**
   * Drain the queue in batches, running at most `concurrency` pages at once.
   * Pages crawled in one batch may enqueue links that form the next batch.
   */
  async _processQueue(context) {
    while (this._queue.length > 0 && this.surfaces.length < this.maxPages) {
      const batch = this._queue.splice(0, this._queue.length);
      const tasks = batch.map(({ url, depth }) =>
        this._limit(() => {
          // Checked when the task actually starts, so the page budget is
          // evaluated against pages recorded so far rather than at map time.
          if (this.surfaces.length >= this.maxPages) return undefined;
          return this._crawlPage(context, url, depth);
        })
      );
      await Promise.allSettled(tasks);
    }
  }

  /**
   * Record a rate-limit hint from a response.
   * @param {number} status - HTTP status of the response
   * @param {string} [retryAfter] - Raw Retry-After header (delta-seconds or HTTP-date)
   * @returns {number} Back-off applied in ms (capped at 30000), or 0 if none.
   */
  _registerBackoff(status, retryAfter) {
    if (!retryAfter || (status !== 429 && status !== 503)) return 0;

    const seconds = Number.parseInt(retryAfter, 10);
    let requestedMs = seconds * 1000;
    if (!(requestedMs > 0)) {
      // Not a positive number of seconds: try the HTTP-date form.
      const at = Date.parse(retryAfter);
      requestedMs = Number.isNaN(at) ? Number.NaN : at - Date.now();
    }
    if (!(requestedMs > 0)) requestedMs = 5000; // unparsable or already elapsed

    const waitMs = Math.min(requestedMs, 30000);
    // Time-boxed: the configured delayMs is left untouched, so the crawl
    // returns to its normal pace once the window has passed.
    this._backoffUntil = Math.max(this._backoffUntil, Date.now() + waitMs);
    return waitMs;
  }

  /**
   * Attach console/network listeners to a page.
   * @returns {{consoleMessages: object[], failedReqs: object[]}} Arrays that the
   *   listeners append to while the page is open.
   */
  _attachMonitors(page, normalizedUrl) {
    const consoleMessages = [];
    const failedReqs = [];

    page.on('console', (msg) => {
      if (msg.type() !== 'error') return;
      consoleMessages.push({ type: msg.type(), text: msg.text(), url: normalizedUrl });
    });

    page.on('pageerror', (error) => {
      consoleMessages.push({ type: 'exception', text: error.message, url: normalizedUrl });
    });

    page.on('requestfailed', (request) => {
      failedReqs.push({
        url: request.url(),
        method: request.method(),
        failure: request.failure()?.errorText || 'Unknown',
        page: normalizedUrl,
      });
    });

    // Intercept API calls
    page.on('response', (response) => {
      const reqUrl = response.url();
      const headers = response.headers();
      const contentType = headers['content-type'] || '';
      const status = response.status();

      // Respect Retry-After on 429/503
      const waitMs = this._registerBackoff(status, headers['retry-after']);
      if (waitMs > 0) {
        this.logger?.warn?.(`Rate limited (${status}) — backing off ${waitMs}ms`);
      }

      if (!contentType.includes('application/json') || !this._isSameOrigin(reqUrl)) return;

      const method = response.request().method();
      const known = this.apiEndpoints.some((e) => e.url === reqUrl && e.method === method);
      if (!known) {
        this.apiEndpoints.push({ url: reqUrl, method, status, contentType });
      }
    });

    return { consoleMessages, failedReqs };
  }

  /**
   * Navigate with progressive fallback: networkidle → load → domcontentloaded.
   * @returns {Promise<object|null>} The navigation response, or null if every
   *   strategy failed.
   */
  async _navigate(page, url, normalizedUrl) {
    const strategies = ['networkidle', 'load', 'domcontentloaded'];

    for (const [index, strategy] of strategies.entries()) {
      // Only the first (strictest) strategy gets the full timeout.
      const strategyTimeout = strategy === 'networkidle' ? this.timeout : Math.min(this.timeout, 15000);
      try {
        const response = await page.goto(url, { waitUntil: strategy, timeout: strategyTimeout });
        this.logger?.debug?.(`Page loaded with '${strategy}' strategy: ${normalizedUrl}`);
        return response;
      } catch (navErr) {
        const next = strategies[index + 1];
        if (next) {
          this.logger?.debug?.(`'${strategy}' timed out for ${normalizedUrl}, trying '${next}'`);
        } else {
          this.logger?.warn?.(`All load strategies failed for ${normalizedUrl}: ${navErr.message}`);
        }
      }
    }
    return null;
  }

  /** Collect absolute hrefs of all anchors on the page; [] if evaluation fails. */
  _extractLinks(page) {
    return page.evaluate(() => {
      const anchors = Array.from(document.querySelectorAll('a[href]'));
      return anchors.map(a => a.href).filter(href => href && !href.startsWith('javascript:'));
    }).catch(() => []);
  }

  /** Describe every <form> on the page and its fields; [] if evaluation fails. */
  _extractForms(page) {
    return page.evaluate(() => {
      return Array.from(document.querySelectorAll('form')).map((form, idx) => {
        const fields = Array.from(form.querySelectorAll('input, select, textarea')).map(field => ({
          tag: field.tagName.toLowerCase(),
          type: field.type || field.tagName.toLowerCase(),
          // Fallback name uses the form index (idx), not a per-field index.
          name: field.name || field.id || `field-${idx}`,
          required: field.required,
          placeholder: field.placeholder || '',
          pattern: field.pattern || '',
          minLength: field.minLength > 0 ? field.minLength : null,
          maxLength: field.maxLength > 0 ? field.maxLength : null,
        }));

        return {
          action: form.action || window.location.href,
          method: (form.method || 'get').toUpperCase(),
          id: form.id || `form-${idx}`,
          fields,
          hasSubmitButton: !!form.querySelector('button[type="submit"], input[type="submit"]'),
        };
      });
    }).catch(() => []);
  }

  /**
   * Crawls a single page, records it in `surfaces`, and enqueues discovered links.
   */
  async _crawlPage(context, url, depth) {
    // Polite delay between requests, extended while a Retry-After window is active.
    const waitMs = Math.max(this.delayMs, this._backoffUntil - Date.now());
    if (waitMs > 0) {
      await new Promise(r => setTimeout(r, waitMs));
    }

    const normalizedUrl = this._normalizeUrl(url);
    const page = await context.newPage();
    const pageData = {
      url: normalizedUrl,
      type: 'page',
      status: null,
      title: '',
      links: [],
      forms: [],
      consoleErrors: [],
      failedRequests: [],
      loadTime: 0,
    };

    const { consoleMessages, failedReqs } = this._attachMonitors(page, normalizedUrl);

    try {
      const startTime = Date.now();
      const response = await this._navigate(page, url, normalizedUrl);

      pageData.loadTime = Date.now() - startTime;
      pageData.status = response?.status() || null;
      pageData.title = await page.title().catch(() => '');

      const links = await this._extractLinks(page);
      pageData.links = [...new Set(links)];

      const pageForms = await this._extractForms(page);
      for (const form of pageForms) {
        form.page = normalizedUrl;
        this.forms.push(form);
      }
      pageData.forms = pageForms;
      pageData.consoleErrors = consoleMessages;
      pageData.failedRequests = failedReqs;

      this.consoleErrors.push(...consoleMessages);
      this.failedRequests.push(...failedReqs);
      this.surfaces.push(pageData);

      this.logger?.debug?.(`Crawled: ${normalizedUrl} (${pageData.status}) - ${links.length} links, ${pageForms.length} forms`);

      // Enqueue discovered links; the queue loop picks them up in the next batch.
      if (depth < this.maxDepth) {
        for (const link of pageData.links) {
          this._enqueue(link, depth + 1);
        }
      }
    } catch (err) {
      // Salvage whatever links the page exposes even though processing failed.
      const partialLinks = await this._extractLinks(page);

      pageData.status = 'error';
      pageData.error = err.message;
      pageData.links = [...new Set(partialLinks)];
      this.surfaces.push(pageData);
      this.logger?.warn?.(`Failed to crawl ${normalizedUrl}: ${err.message}${partialLinks.length > 0 ? ` (extracted ${partialLinks.length} partial links)` : ''}`);

      if (depth < this.maxDepth) {
        for (const link of partialLinks) {
          this._enqueue(link, depth + 1);
        }
      }
    } finally {
      await page.close();
    }
  }

  /**
   * Canonical form of a URL used as the de-duplication key: fragment removed
   * and a single trailing slash dropped from the path (the root path "/" is
   * kept). The query string is never altered. Unparsable input is returned
   * unchanged.
   */
  _normalizeUrl(url) {
    try {
      const u = new URL(url);
      u.hash = '';
      // Trim on the pathname rather than the serialized URL, so a query
      // value that happens to end in "/" (e.g. ?next=/) is left intact.
      if (u.pathname !== '/' && u.pathname.endsWith('/')) {
        u.pathname = u.pathname.slice(0, -1);
      }
      return u.toString();
    } catch {
      return url;
    }
  }

  /** True when `url` parses and shares the origin of the crawl target. */
  _isSameOrigin(url) {
    try {
      const u = new URL(url);
      return u.origin === this.baseUrl.origin;
    } catch {
      // Unparsable URL, or crawl() has not set baseUrl yet.
      return false;
    }
  }

  /**
   * Backup link discovery via sitemap.xml and robots.txt.
   * @returns {Promise<string[]>} Same-origin URLs not yet visited.
   */
  async _discoverBackupLinks(targetUrl) {
    const discovered = new Set();
    await this._discoverFromSitemap(targetUrl, discovered);
    await this._discoverFromRobots(targetUrl, discovered);
    const newLinks = [...discovered].filter(link => !this.visited.has(this._normalizeUrl(link)));
    if (newLinks.length > 0) {
      this.logger?.info?.(`Backup discovery found ${newLinks.length} new URLs from sitemap/robots`);
    }
    return newLinks;
  }

  /**
   * Probe the two conventional sitemap locations on the target's origin and
   * add every same-origin <loc> URL to `discovered`.
   */
  async _discoverFromSitemap(targetUrl, discovered) {
    const seen = new Set();
    const candidates = [
      new URL('/sitemap.xml', targetUrl).toString(),
      new URL('/sitemap_index.xml', targetUrl).toString(),
    ];

    for (const sitemapUrl of candidates) {
      await this._parseSitemap(sitemapUrl, discovered, seen, 0);
    }
  }

  /**
   * Fetch one sitemap document at exactly `sitemapUrl` and harvest its <loc>
   * entries. Entries that look like further sitemaps are followed, guarded by
   * `seen` (cycle protection) and MAX_SITEMAP_DEPTH.
   */
  async _parseSitemap(sitemapUrl, discovered, seen = new Set(), depth = 0) {
    if (depth > Crawler.MAX_SITEMAP_DEPTH || seen.has(sitemapUrl)) return;
    seen.add(sitemapUrl);

    try {
      const resp = await fetch(sitemapUrl, {
        signal: AbortSignal.timeout(10000),
        redirect: 'follow',
      });
      if (!resp.ok) return;

      const contentType = resp.headers.get('content-type') || '';
      if (!contentType.includes('xml') && !contentType.includes('text')) return;

      const body = await resp.text();
      for (const match of body.matchAll(/<loc>\s*(https?:\/\/[^<]+)\s*<\/loc>/gi)) {
        const url = match[1].trim();
        if (this._isSameOrigin(url)) {
          discovered.add(url);
        }
        if (url.includes('sitemap') && url.endsWith('.xml')) {
          await this._parseSitemap(url, discovered, seen, depth + 1);
        }
      }

      this.logger?.debug?.(`Parsed sitemap: ${sitemapUrl} → ${discovered.size} URLs`);
    } catch (err) {
      // Best effort: a missing or unreachable sitemap is not an error for the crawl.
      this.logger?.debug?.(`Sitemap not available: ${sitemapUrl} (${err?.message})`);
    }
  }

  /**
   * Read robots.txt on the target's origin: follow `Sitemap:` directives and
   * treat literal (wildcard-free) `Disallow:` paths as candidate URLs.
   */
  async _discoverFromRobots(targetUrl, discovered) {
    try {
      const robotsUrl = new URL('/robots.txt', targetUrl).toString();
      const resp = await fetch(robotsUrl, {
        signal: AbortSignal.timeout(10000),
        redirect: 'follow',
      });
      if (!resp.ok) return;

      const body = await resp.text();
      const seenSitemaps = new Set();

      for (const line of body.split('\n')) {
        const trimmed = line.trim();
        const lowered = trimmed.toLowerCase();

        if (lowered.startsWith('sitemap:')) {
          const sitemapUrl = trimmed.substring(8).trim();
          if (sitemapUrl.startsWith('http')) {
            // Fetch the sitemap the directive actually names.
            await this._parseSitemap(sitemapUrl, discovered, seenSitemaps, 0);
          }
        }

        if (lowered.startsWith('disallow:')) {
          const path = trimmed.substring(9).trim();
          if (path && path !== '/' && path !== '*' && !path.includes('*')) {
            try {
              const fullUrl = new URL(path, targetUrl).toString();
              if (this._isSameOrigin(fullUrl)) {
                discovered.add(fullUrl);
              }
            } catch {
              // Invalid path — skip this directive.
            }
          }
        }
      }

      this.logger?.debug?.(`Parsed robots.txt → ${discovered.size} URLs`);
    } catch (err) {
      // Best effort: robots.txt is optional.
      this.logger?.debug?.(`robots.txt not available (${err?.message})`);
    }
  }
}
429
+
430
+ export default Crawler;