@govtechsg/oobee 0.10.90 → 0.10.92
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +289 -0
- package/README.md +3 -0
- package/dist/cli.js +3 -0
- package/dist/combine.js +14 -2
- package/dist/constants/cliFunctions.js +7 -0
- package/dist/constants/common.js +119 -70
- package/dist/constants/constants.js +1 -0
- package/dist/crawlers/commonCrawlerFunc.js +93 -15
- package/dist/crawlers/crawlDomain.js +45 -57
- package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
- package/dist/crawlers/crawlRateController.js +47 -0
- package/dist/crawlers/crawlSitemap.js +51 -62
- package/dist/generateOobeeClientScanner.js +31 -0
- package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
- package/dist/mergeAxeResults.js +121 -68
- package/dist/npmIndex.js +1 -0
- package/dist/utils.js +23 -28
- package/oobee-client-scanner.js +33 -2
- package/package.json +2 -2
- package/src/cli.ts +4 -0
- package/src/combine.ts +15 -1
- package/src/constants/cliFunctions.ts +7 -0
- package/src/constants/common.ts +131 -79
- package/src/constants/constants.ts +1 -0
- package/src/crawlers/commonCrawlerFunc.ts +103 -14
- package/src/crawlers/crawlDomain.ts +52 -65
- package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
- package/src/crawlers/crawlRateController.ts +63 -0
- package/src/crawlers/crawlSitemap.ts +57 -70
- package/src/generateOobeeClientScanner.ts +31 -0
- package/src/index.ts +1 -0
- package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
- package/src/mergeAxeResults.ts +141 -75
- package/src/npmIndex.ts +1 -0
- package/src/utils.ts +25 -33
- /package/{fb85adb0-5db6-4a09-8c80-05f030115004.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
import crawlee from 'crawlee';
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import { createCrawleeSubFolders, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
|
|
2
|
+
import { CrawlRateController } from './crawlRateController.js';
|
|
3
|
+
import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
|
|
5
4
|
import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
|
|
6
5
|
import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
|
|
7
|
-
import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
|
|
6
|
+
import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
|
|
8
7
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
|
|
9
8
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
10
9
|
const isBlacklisted = (url, blacklistedPatterns) => {
|
|
@@ -258,9 +257,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
258
257
|
// same-domain strategy) still contribute their <a> links above, but
|
|
259
258
|
// clicking every interactive element on them is too slow and starves
|
|
260
259
|
// the crawler of time to discover pages on the primary hostname.
|
|
261
|
-
|
|
262
|
-
const seedHostname = new URL(url).hostname;
|
|
263
|
-
if (currentHostname === seedHostname) {
|
|
260
|
+
if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
|
|
264
261
|
// Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
|
|
265
262
|
try {
|
|
266
263
|
await customEnqueueLinksByClickingElements(page, browserContext);
|
|
@@ -277,39 +274,27 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
277
274
|
}
|
|
278
275
|
};
|
|
279
276
|
let isAbortingScanNow = false;
|
|
277
|
+
const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
|
|
280
278
|
const crawler = register(new crawlee.PlaywrightCrawler({
|
|
281
279
|
launchContext: {
|
|
282
280
|
launcher: constants.launcher,
|
|
283
281
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
|
284
|
-
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
285
|
-
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
286
282
|
},
|
|
287
283
|
retryOnBlocked: true,
|
|
288
284
|
browserPoolOptions: {
|
|
289
285
|
useFingerprints: false,
|
|
290
286
|
preLaunchHooks: [
|
|
287
|
+
getPreLaunchHook(userDataDirectory),
|
|
291
288
|
async (_pageId, launchContext) => {
|
|
292
|
-
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
|
|
293
|
-
// Ensure base exists
|
|
294
|
-
await fsp.mkdir(baseDir, { recursive: true });
|
|
295
|
-
// Create a unique subdir per browser
|
|
296
|
-
const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
|
|
297
|
-
await fsp.mkdir(subProfileDir, { recursive: true });
|
|
298
|
-
// Assign to Crawlee's launcher
|
|
299
|
-
// Crawlee preLaunchHooks expects launchContext to be mutated in-place.
|
|
300
|
-
// eslint-disable-next-line no-param-reassign
|
|
301
|
-
launchContext.userDataDir = subProfileDir;
|
|
302
|
-
// Safely extend launchOptions
|
|
303
289
|
// eslint-disable-next-line no-param-reassign
|
|
304
290
|
launchContext.launchOptions = {
|
|
305
291
|
...launchContext.launchOptions,
|
|
306
292
|
ignoreHTTPSErrors: true,
|
|
307
293
|
...playwrightDeviceDetailsObject,
|
|
294
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
308
295
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
309
296
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
310
297
|
};
|
|
311
|
-
// Optionally log for debugging
|
|
312
|
-
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
|
|
313
298
|
},
|
|
314
299
|
],
|
|
315
300
|
},
|
|
@@ -390,11 +375,9 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
390
375
|
return;
|
|
391
376
|
}
|
|
392
377
|
const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
|
393
|
-
if (
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
durationExceeded = true;
|
|
397
|
-
}
|
|
378
|
+
if (hasExceededDuration) {
|
|
379
|
+
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
|
|
380
|
+
durationExceeded = true;
|
|
398
381
|
isAbortingScanNow = true;
|
|
399
382
|
activeCrawler.autoscaledPool.abort();
|
|
400
383
|
return;
|
|
@@ -527,8 +510,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
527
510
|
});
|
|
528
511
|
return;
|
|
529
512
|
}
|
|
530
|
-
|
|
531
|
-
if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
|
|
513
|
+
if (rateController.claimSlot()) {
|
|
532
514
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
533
515
|
numScanned: urlsCrawled.scanned.length,
|
|
534
516
|
urlScanned: request.url,
|
|
@@ -538,6 +520,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
538
520
|
pageTitle: results.pageTitle,
|
|
539
521
|
actualUrl, // i.e. actualUrl
|
|
540
522
|
});
|
|
523
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
524
|
+
if (rateController.isLimitReached()) {
|
|
525
|
+
isAbortingScanNow = true;
|
|
526
|
+
activeCrawler.autoscaledPool.abort();
|
|
527
|
+
}
|
|
541
528
|
scannedUrlSet.add(normUrl(request.url));
|
|
542
529
|
scannedResolvedUrlSet.add(normUrl(actualUrl));
|
|
543
530
|
urlsCrawled.scannedRedirects.push({
|
|
@@ -549,8 +536,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
549
536
|
await dataset.pushData(results);
|
|
550
537
|
}
|
|
551
538
|
}
|
|
552
|
-
else if (
|
|
553
|
-
// One more check if scanned pages have reached limit due to multi-instances of handler running
|
|
539
|
+
else if (rateController.claimSlot()) {
|
|
554
540
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
555
541
|
numScanned: urlsCrawled.scanned.length,
|
|
556
542
|
urlScanned: request.url,
|
|
@@ -560,6 +546,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
560
546
|
actualUrl: request.url,
|
|
561
547
|
pageTitle: results.pageTitle,
|
|
562
548
|
});
|
|
549
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
550
|
+
if (rateController.isLimitReached()) {
|
|
551
|
+
isAbortingScanNow = true;
|
|
552
|
+
activeCrawler.autoscaledPool.abort();
|
|
553
|
+
}
|
|
563
554
|
scannedUrlSet.add(normUrl(request.url));
|
|
564
555
|
scannedResolvedUrlSet.add(normUrl(request.url));
|
|
565
556
|
await dataset.pushData(results);
|
|
@@ -611,30 +602,29 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
611
602
|
}
|
|
612
603
|
}
|
|
613
604
|
catch {
|
|
614
|
-
//
|
|
615
|
-
}
|
|
616
|
-
// when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
|
|
617
|
-
// a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
|
|
618
|
-
if (!isAbortingScanNow) {
|
|
619
|
-
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
620
|
-
numScanned: urlsCrawled.scanned.length,
|
|
621
|
-
urlScanned: request.url,
|
|
622
|
-
});
|
|
623
|
-
urlsCrawled.error.push({
|
|
624
|
-
url: request.url,
|
|
625
|
-
pageTitle: request.url,
|
|
626
|
-
actualUrl: request.url,
|
|
627
|
-
metadata: STATUS_CODE_METADATA[2],
|
|
628
|
-
});
|
|
605
|
+
// Recovery failed; Crawlee will retry the request automatically
|
|
629
606
|
}
|
|
607
|
+
// Do not push to urlsCrawled.error here — Crawlee will retry the request
|
|
608
|
+
// (up to maxRequestRetries, default 3). If all retries are exhausted,
|
|
609
|
+
// failedRequestHandler will record the error. Pushing here causes
|
|
610
|
+
// duplicates and false positives for URLs that succeed on retry.
|
|
630
611
|
}
|
|
631
612
|
},
|
|
632
613
|
failedRequestHandler: async ({ request, response }) => {
|
|
614
|
+
if (isAbortingScanNow) {
|
|
615
|
+
return;
|
|
616
|
+
}
|
|
617
|
+
const status = response?.status();
|
|
618
|
+
if (rateController.onFailure(status, crawler.autoscaledPool)) {
|
|
619
|
+
consoleLogger.info(`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`);
|
|
620
|
+
isAbortingScanNow = true;
|
|
621
|
+
crawler.autoscaledPool?.abort();
|
|
622
|
+
return;
|
|
623
|
+
}
|
|
633
624
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
634
625
|
numScanned: urlsCrawled.scanned.length,
|
|
635
626
|
urlScanned: request.url,
|
|
636
627
|
});
|
|
637
|
-
const status = response?.status();
|
|
638
628
|
const metadata = typeof status === 'number'
|
|
639
629
|
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
640
630
|
: STATUS_CODE_METADATA[2];
|
|
@@ -648,15 +638,13 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
648
638
|
},
|
|
649
639
|
maxRequestsPerCrawl: Infinity,
|
|
650
640
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
},
|
|
659
|
-
}),
|
|
641
|
+
autoscaledPoolOptions: {
|
|
642
|
+
minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
|
|
643
|
+
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
644
|
+
desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
|
|
645
|
+
scaleUpStepRatio: 0.99, // Scale up faster
|
|
646
|
+
scaleDownStepRatio: 0.1, // Scale down slower
|
|
647
|
+
},
|
|
660
648
|
}));
|
|
661
649
|
await crawler.run();
|
|
662
650
|
// Additional passes: keep re-visiting scanned seed-hostname pages for
|
|
@@ -675,7 +663,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
675
663
|
.map(item => item.actualUrl || item.url)
|
|
676
664
|
.filter(pageUrl => {
|
|
677
665
|
try {
|
|
678
|
-
return new URL(pageUrl).hostname
|
|
666
|
+
return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
|
|
679
667
|
}
|
|
680
668
|
catch {
|
|
681
669
|
return false;
|
|
@@ -3,7 +3,7 @@ import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/consta
|
|
|
3
3
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
4
4
|
import crawlDomain from './crawlDomain.js';
|
|
5
5
|
import crawlSitemap from './crawlSitemap.js';
|
|
6
|
-
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
|
|
6
|
+
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
|
|
7
7
|
import { register } from '../utils.js';
|
|
8
8
|
const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, safeMode, scanDuration) => {
|
|
9
9
|
const startTime = Date.now(); // Track start time
|
|
@@ -15,6 +15,9 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
15
15
|
let sitemapUrl;
|
|
16
16
|
let durationExceeded = false;
|
|
17
17
|
({ dataset } = await createCrawleeSubFolders(randomToken));
|
|
18
|
+
// Initialise modified User-Agent early so sitemap discovery requests
|
|
19
|
+
// don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
|
|
20
|
+
await initModifiedUserAgent(browser);
|
|
18
21
|
function getHomeUrl(parsedUrl) {
|
|
19
22
|
const urlObject = new URL(parsedUrl);
|
|
20
23
|
return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
|
|
@@ -30,6 +33,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
30
33
|
context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
|
31
34
|
...launchOptions,
|
|
32
35
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
36
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
33
37
|
});
|
|
34
38
|
register(context);
|
|
35
39
|
}
|
|
@@ -39,6 +43,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
39
43
|
register(browserInstance);
|
|
40
44
|
context = await browserInstance.newContext({
|
|
41
45
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
46
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
42
47
|
});
|
|
43
48
|
}
|
|
44
49
|
const page = await context.newPage();
|
|
@@ -59,7 +64,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
59
64
|
const checkUrlExists = async (page, parsedUrl) => {
|
|
60
65
|
try {
|
|
61
66
|
const response = await page.goto(parsedUrl);
|
|
62
|
-
return response
|
|
67
|
+
return response?.ok() ?? false;
|
|
63
68
|
}
|
|
64
69
|
catch (e) {
|
|
65
70
|
consoleLogger.error(e);
|
|
@@ -71,7 +76,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
71
76
|
try {
|
|
72
77
|
sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
|
|
73
78
|
if (sitemapUrls.length > 0) {
|
|
74
|
-
|
|
79
|
+
consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
|
|
75
80
|
sitemapExist = true;
|
|
76
81
|
}
|
|
77
82
|
}
|
|
@@ -91,7 +96,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
91
96
|
}
|
|
92
97
|
}
|
|
93
98
|
if (!sitemapExist) {
|
|
94
|
-
|
|
99
|
+
consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
|
|
95
100
|
return await crawlDomain({
|
|
96
101
|
url,
|
|
97
102
|
randomToken,
|
|
@@ -121,7 +126,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
121
126
|
durationExceeded = true;
|
|
122
127
|
break;
|
|
123
128
|
}
|
|
124
|
-
|
|
129
|
+
consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
|
|
125
130
|
urlsCrawledFinal = await crawlSitemap({
|
|
126
131
|
sitemapUrl: currentSitemapUrl,
|
|
127
132
|
randomToken,
|
|
@@ -149,7 +154,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
149
154
|
const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
|
|
150
155
|
const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
|
|
151
156
|
if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
|
|
152
|
-
|
|
157
|
+
consoleLogger.info(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
|
|
153
158
|
urlsCrawledFinal = await crawlDomain({
|
|
154
159
|
url,
|
|
155
160
|
randomToken,
|
|
@@ -173,7 +178,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
173
178
|
});
|
|
174
179
|
}
|
|
175
180
|
else if (!hasDurationRemaining) {
|
|
176
|
-
|
|
181
|
+
consoleLogger.info(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
|
|
177
182
|
durationExceeded = true;
|
|
178
183
|
}
|
|
179
184
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { consoleLogger } from '../logs.js';
|
|
2
|
+
/**
 * Bookkeeping shared by the crawlers: counts claimed scan slots against a
 * page limit, tracks streaks of successes and HTTP failures, and adjusts the
 * `maxConcurrency` property of the pool object handed to it.
 */
export class CrawlRateController {
    /**
     * @param {number} maxRequestsPerCrawl - Maximum number of slots `claimSlot()` will grant.
     * @param {number} maxConcurrency - Upper bound that `onSuccess()` restores concurrency towards.
     */
    constructor(maxRequestsPerCrawl, maxConcurrency) {
        this.scannedCount = 0;
        this.consecutiveFailures = 0;
        this.consecutiveSuccesses = 0;
        this.maxPages = maxRequestsPerCrawl;
        // Only a value greater than zero is a usable threshold. Unset, non-numeric,
        // zero, or negative settings fall back to the default; a negative threshold
        // would otherwise make the very first HTTP failure abort the crawl.
        const configuredThreshold = Number(process.env.OOBEE_CONSECUTIVE_MAX_RETRIES);
        this.maxConsecutiveFailures = configuredThreshold > 0 ? configuredThreshold : 100;
        this.originalMaxConcurrency = maxConcurrency;
    }

    /**
     * Reserves one scan slot if the page limit has not been reached yet.
     *
     * @returns {boolean} `true` when a slot was reserved (the counter is incremented),
     *   `false` when the limit is already reached.
     */
    claimSlot() {
        if (this.scannedCount >= this.maxPages) {
            return false;
        }
        this.scannedCount++;
        return true;
    }

    /**
     * Records a successful scan: clears the failure streak and, on every
     * `RECOVERY_INTERVAL`-th consecutive success, raises the pool's
     * `maxConcurrency` by 2, capped at the original maximum.
     *
     * @param {{ maxConcurrency: number } | null | undefined} pool - Object whose
     *   `maxConcurrency` is adjusted; callers in this package pass
     *   `crawler.autoscaledPool`. Skipped when falsy.
     * @returns {void}
     */
    onSuccess(pool) {
        this.consecutiveFailures = 0;
        this.consecutiveSuccesses++;
        if (pool && this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0) {
            if (pool.maxConcurrency < this.originalMaxConcurrency) {
                pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
                consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
            }
        }
    }

    /**
     * Records a failed request. Only numeric HTTP statuses of 400 or above count;
     * anything else is ignored and leaves all counters untouched. A counted
     * failure clears the success streak and halves the pool's `maxConcurrency`
     * (never below 1).
     *
     * @param {number | undefined} httpStatus - HTTP status of the failed response, if any.
     * @param {{ maxConcurrency: number } | null | undefined} pool - Object whose
     *   `maxConcurrency` is adjusted. Skipped when falsy.
     * @returns {boolean} `true` once the consecutive-failure threshold is reached,
     *   signalling that the caller should abort the crawl.
     */
    onFailure(httpStatus, pool) {
        if (typeof httpStatus !== 'number' || httpStatus < 400) {
            return false;
        }
        this.consecutiveSuccesses = 0;
        this.consecutiveFailures++;
        if (pool && pool.maxConcurrency > 1) {
            pool.maxConcurrency = Math.max(1, Math.floor(pool.maxConcurrency / 2));
            consoleLogger.info(`Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`);
        }
        return this.consecutiveFailures >= this.maxConsecutiveFailures;
    }

    /**
     * @returns {boolean} `true` when every available scan slot has been claimed.
     */
    isLimitReached() {
        return this.scannedCount >= this.maxPages;
    }
}
// Number of consecutive successes between concurrency recovery steps.
CrawlRateController.RECOVERY_INTERVAL = 10;
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
import crawlee, { EnqueueStrategy, RequestList } from 'crawlee';
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
|
|
2
|
+
import { CrawlRateController } from './crawlRateController.js';
|
|
3
|
+
import { createCrawleeSubFolders, getPreLaunchHook, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
|
|
5
4
|
import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, disallowedListOfPatterns, FileTypes, } from '../constants/constants.js';
|
|
6
5
|
import { getLinksFromSitemap, getPlaywrightLaunchOptions, isSkippedUrl, waitForPageLoaded, isFilePath, } from '../constants/common.js';
|
|
7
6
|
import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
|
|
8
7
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
|
|
9
|
-
import { guiInfoLog } from '../logs.js';
|
|
8
|
+
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
10
9
|
const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = '', scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
|
|
11
10
|
const crawlStartTime = Date.now();
|
|
12
11
|
let dataset;
|
|
13
12
|
let urlsCrawled;
|
|
14
13
|
let durationExceeded = false;
|
|
15
14
|
let isAbortingScan = false;
|
|
15
|
+
const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
|
|
16
16
|
if (fromCrawlIntelligentSitemap) {
|
|
17
17
|
dataset = datasetFromIntelligent;
|
|
18
18
|
urlsCrawled = urlsCrawledFromIntelligent;
|
|
@@ -40,31 +40,20 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
40
40
|
launchContext: {
|
|
41
41
|
launcher: constants.launcher,
|
|
42
42
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
|
43
|
-
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
44
|
-
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
45
43
|
},
|
|
46
44
|
retryOnBlocked: true,
|
|
47
45
|
browserPoolOptions: {
|
|
48
46
|
useFingerprints: false,
|
|
49
47
|
preLaunchHooks: [
|
|
48
|
+
getPreLaunchHook(userDataDirectory),
|
|
50
49
|
async (_pageId, launchContext) => {
|
|
51
|
-
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
|
|
52
|
-
// Ensure base exists
|
|
53
|
-
await fsp.mkdir(baseDir, { recursive: true });
|
|
54
|
-
// Create a unique subdir per browser
|
|
55
|
-
const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
|
|
56
|
-
await fsp.mkdir(subProfileDir, { recursive: true });
|
|
57
|
-
// Assign to Crawlee's launcher
|
|
58
|
-
launchContext.userDataDir = subProfileDir;
|
|
59
|
-
// Safely extend launchOptions
|
|
60
50
|
launchContext.launchOptions = {
|
|
61
51
|
...launchContext.launchOptions,
|
|
62
52
|
ignoreHTTPSErrors: true,
|
|
63
53
|
...playwrightDeviceDetailsObject,
|
|
54
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
64
55
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
65
56
|
};
|
|
66
|
-
// Optionally log for debugging
|
|
67
|
-
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
|
|
68
57
|
},
|
|
69
58
|
],
|
|
70
59
|
},
|
|
@@ -149,13 +138,11 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
149
138
|
await waitForPageLoaded(page, 10000);
|
|
150
139
|
const actualUrl = page.url() || request.loadedUrl || request.url;
|
|
151
140
|
const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
|
152
|
-
if (
|
|
141
|
+
if (hasExceededDuration) {
|
|
142
|
+
consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
|
|
143
|
+
durationExceeded = true;
|
|
153
144
|
isAbortingScan = true;
|
|
154
|
-
|
|
155
|
-
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
|
|
156
|
-
durationExceeded = true;
|
|
157
|
-
}
|
|
158
|
-
crawler.autoscaledPool.abort(); // stops new requests
|
|
145
|
+
crawler.autoscaledPool.abort();
|
|
159
146
|
return;
|
|
160
147
|
}
|
|
161
148
|
if (request.skipNavigation && actualUrl === 'about:blank') {
|
|
@@ -245,22 +232,29 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
245
232
|
catch (_) {
|
|
246
233
|
// Page/context was destroyed during navigation — handled by outer catch
|
|
247
234
|
}
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
235
|
+
if (rateController.claimSlot()) {
|
|
236
|
+
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
237
|
+
numScanned: urlsCrawled.scanned.length,
|
|
238
|
+
urlScanned: request.url,
|
|
239
|
+
});
|
|
240
|
+
urlsCrawled.scanned.push({
|
|
241
|
+
url: request.url,
|
|
242
|
+
pageTitle: results.pageTitle,
|
|
243
|
+
actualUrl, // i.e. actualUrl
|
|
244
|
+
});
|
|
245
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
246
|
+
if (rateController.isLimitReached()) {
|
|
247
|
+
isAbortingScan = true;
|
|
248
|
+
crawler.autoscaledPool.abort();
|
|
249
|
+
}
|
|
250
|
+
urlsCrawled.scannedRedirects.push({
|
|
251
|
+
fromUrl: request.url,
|
|
252
|
+
toUrl: actualUrl,
|
|
253
|
+
});
|
|
254
|
+
results.url = request.url;
|
|
255
|
+
results.actualUrl = actualUrl;
|
|
256
|
+
await dataset.pushData(results);
|
|
257
|
+
}
|
|
264
258
|
}
|
|
265
259
|
else {
|
|
266
260
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
@@ -284,30 +278,27 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
284
278
|
}
|
|
285
279
|
}
|
|
286
280
|
catch (e) {
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
});
|
|
292
|
-
urlsCrawled.error.push({
|
|
293
|
-
url: request.url,
|
|
294
|
-
pageTitle: request.url,
|
|
295
|
-
actualUrl: request.url,
|
|
296
|
-
metadata: STATUS_CODE_METADATA[2],
|
|
297
|
-
httpStatusCode: 0,
|
|
298
|
-
});
|
|
299
|
-
}
|
|
281
|
+
// Do not push to urlsCrawled.error here — Crawlee will retry the request
|
|
282
|
+
// (up to maxRequestRetries, default 3). If all retries are exhausted,
|
|
283
|
+
// failedRequestHandler will record the error. Pushing here causes
|
|
284
|
+
// duplicates and false positives for URLs that succeed on retry.
|
|
300
285
|
}
|
|
301
286
|
},
|
|
302
287
|
failedRequestHandler: async ({ request, response, error }) => {
|
|
303
288
|
if (isAbortingScan) {
|
|
304
289
|
return;
|
|
305
290
|
}
|
|
291
|
+
const status = response?.status();
|
|
292
|
+
if (rateController.onFailure(status, crawler.autoscaledPool)) {
|
|
293
|
+
consoleLogger.info(`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`);
|
|
294
|
+
isAbortingScan = true;
|
|
295
|
+
crawler.autoscaledPool?.abort();
|
|
296
|
+
return;
|
|
297
|
+
}
|
|
306
298
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
307
299
|
numScanned: urlsCrawled.scanned.length,
|
|
308
300
|
urlScanned: request.url,
|
|
309
301
|
});
|
|
310
|
-
const status = response?.status();
|
|
311
302
|
const metadata = typeof status === 'number'
|
|
312
303
|
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
313
304
|
: STATUS_CODE_METADATA[2];
|
|
@@ -322,15 +313,13 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
322
313
|
},
|
|
323
314
|
maxRequestsPerCrawl: Infinity,
|
|
324
315
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
},
|
|
333
|
-
}),
|
|
316
|
+
autoscaledPoolOptions: {
|
|
317
|
+
minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
|
|
318
|
+
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
319
|
+
desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
|
|
320
|
+
scaleUpStepRatio: 0.99, // Scale up faster
|
|
321
|
+
scaleDownStepRatio: 0.1, // Scale down slower
|
|
322
|
+
},
|
|
334
323
|
}));
|
|
335
324
|
await crawler.run();
|
|
336
325
|
await requestList.isFinished();
|
|
@@ -444,6 +444,37 @@ const scanApiScript = (shortDescMap, longDescMap, stepByStepMap) => `
|
|
|
444
444
|
// Run axe-core + oobee custom checks
|
|
445
445
|
var scanResult = await window.runA11yScan(elementsToScan, '');
|
|
446
446
|
|
|
447
|
+
// Re-verify aria-hidden-focus violations against the live DOM to handle
|
|
448
|
+
// race conditions with JS that sets tabindex="-1" after aria-hidden
|
|
449
|
+
var axeViolations = scanResult.axeScanResults.violations || [];
|
|
450
|
+
var ariaHiddenViolation = axeViolations.find(function(v) { return v.id === 'aria-hidden-focus'; });
|
|
451
|
+
if (ariaHiddenViolation) {
|
|
452
|
+
await new Promise(function(resolve) { setTimeout(resolve, 0); });
|
|
453
|
+
ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(function(node) {
|
|
454
|
+
var selector = node.target && node.target[0];
|
|
455
|
+
if (typeof selector !== 'string') return true;
|
|
456
|
+
try {
|
|
457
|
+
var el = document.querySelector(selector);
|
|
458
|
+
if (!el) return true;
|
|
459
|
+
var focusables = el.querySelectorAll(
|
|
460
|
+
'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]'
|
|
461
|
+
);
|
|
462
|
+
if (focusables.length === 0) return false;
|
|
463
|
+
return Array.from(focusables).some(function(child) {
|
|
464
|
+
var tabindex = child.getAttribute('tabindex');
|
|
465
|
+
if (tabindex === null) return true;
|
|
466
|
+
var parsed = parseInt(tabindex, 10);
|
|
467
|
+
return isNaN(parsed) || parsed >= 0;
|
|
468
|
+
});
|
|
469
|
+
} catch (e) { return true; }
|
|
470
|
+
});
|
|
471
|
+
if (ariaHiddenViolation.nodes.length === 0) {
|
|
472
|
+
scanResult.axeScanResults.violations = axeViolations.filter(function(v) {
|
|
473
|
+
return v.id !== 'aria-hidden-focus';
|
|
474
|
+
});
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
|
|
447
478
|
// Convert raw axe results into oobee category structure
|
|
448
479
|
var filtered = _oobeeFilterAxeResults(scanResult.axeScanResults, scanResult.pageTitle);
|
|
449
480
|
|
|
@@ -114,6 +114,9 @@ const sendWcagBreakdownToSentry = async (appVersion, wcagBreakdown, ruleIdJson,
|
|
|
114
114
|
...(process.env.OOBEE_SCAN_PRODUCT && {
|
|
115
115
|
scanProduct: process.env.OOBEE_SCAN_PRODUCT,
|
|
116
116
|
}),
|
|
117
|
+
...(process.env.OOBEE_TAGGED_WEBSITE && {
|
|
118
|
+
websiteTag: process.env.OOBEE_TAGGED_WEBSITE,
|
|
119
|
+
}),
|
|
117
120
|
},
|
|
118
121
|
user: {
|
|
119
122
|
...(scanInfo.email && scanInfo.name
|