@govtechsg/oobee 0.10.91 → 0.10.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +303 -0
- package/README.md +22 -0
- package/dist/cli.js +3 -0
- package/dist/combine.js +15 -3
- package/dist/constants/cliFunctions.js +7 -0
- package/dist/constants/common.js +149 -80
- package/dist/constants/constants.js +1 -0
- package/dist/crawlers/commonCrawlerFunc.js +136 -15
- package/dist/crawlers/crawlDomain.js +55 -58
- package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
- package/dist/crawlers/crawlRateController.js +47 -0
- package/dist/crawlers/crawlSitemap.js +51 -62
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/generateOobeeClientScanner.js +32 -1
- package/dist/mergeAxeResults/itemsStore.js +32 -3
- package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
- package/dist/mergeAxeResults.js +120 -92
- package/dist/npmIndex.js +1 -0
- package/dist/utils.js +23 -28
- package/oobee-client-scanner.js +35 -4
- package/package.json +3 -3
- package/src/cli.ts +4 -0
- package/src/combine.ts +16 -1
- package/src/constants/cliFunctions.ts +7 -0
- package/src/constants/common.ts +162 -90
- package/src/constants/constants.ts +1 -0
- package/src/crawlers/commonCrawlerFunc.ts +148 -14
- package/src/crawlers/crawlDomain.ts +64 -66
- package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
- package/src/crawlers/crawlRateController.ts +63 -0
- package/src/crawlers/crawlSitemap.ts +57 -70
- package/src/crawlers/runCustom.ts +10 -1
- package/src/generateOobeeClientScanner.ts +32 -1
- package/src/index.ts +1 -0
- package/src/mergeAxeResults/itemsStore.ts +37 -3
- package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
- package/src/mergeAxeResults.ts +139 -99
- package/src/npmIndex.ts +1 -0
- package/src/utils.ts +25 -33
- /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
import crawlee from 'crawlee';
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import { createCrawleeSubFolders, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
|
|
2
|
+
import { CrawlRateController } from './crawlRateController.js';
|
|
3
|
+
import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, splitAuthHeaders, } from './commonCrawlerFunc.js';
|
|
5
4
|
import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
|
|
6
5
|
import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
|
|
7
|
-
import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
|
|
6
|
+
import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
|
|
8
7
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
|
|
9
8
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
10
9
|
const isBlacklisted = (url, blacklistedPatterns) => {
|
|
@@ -258,9 +257,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
258
257
|
// same-domain strategy) still contribute their <a> links above, but
|
|
259
258
|
// clicking every interactive element on them is too slow and starves
|
|
260
259
|
// the crawler of time to discover pages on the primary hostname.
|
|
261
|
-
|
|
262
|
-
const seedHostname = new URL(url).hostname;
|
|
263
|
-
if (currentHostname === seedHostname) {
|
|
260
|
+
if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
|
|
264
261
|
// Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
|
|
265
262
|
try {
|
|
266
263
|
await customEnqueueLinksByClickingElements(page, browserContext);
|
|
@@ -277,43 +274,40 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
277
274
|
}
|
|
278
275
|
};
|
|
279
276
|
let isAbortingScanNow = false;
|
|
277
|
+
const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
|
|
278
|
+
const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
280
279
|
const crawler = register(new crawlee.PlaywrightCrawler({
|
|
281
280
|
launchContext: {
|
|
282
281
|
launcher: constants.launcher,
|
|
283
282
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
|
284
|
-
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
285
|
-
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
286
283
|
},
|
|
287
284
|
retryOnBlocked: true,
|
|
288
285
|
browserPoolOptions: {
|
|
289
286
|
useFingerprints: false,
|
|
290
287
|
preLaunchHooks: [
|
|
288
|
+
getPreLaunchHook(userDataDirectory),
|
|
291
289
|
async (_pageId, launchContext) => {
|
|
292
|
-
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
|
|
293
|
-
// Ensure base exists
|
|
294
|
-
await fsp.mkdir(baseDir, { recursive: true });
|
|
295
|
-
// Create a unique subdir per browser
|
|
296
|
-
const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
|
|
297
|
-
await fsp.mkdir(subProfileDir, { recursive: true });
|
|
298
|
-
// Assign to Crawlee's launcher
|
|
299
|
-
// Crawlee preLaunchHooks expects launchContext to be mutated in-place.
|
|
300
|
-
// eslint-disable-next-line no-param-reassign
|
|
301
|
-
launchContext.userDataDir = subProfileDir;
|
|
302
|
-
// Safely extend launchOptions
|
|
303
290
|
// eslint-disable-next-line no-param-reassign
|
|
304
291
|
launchContext.launchOptions = {
|
|
305
292
|
...launchContext.launchOptions,
|
|
306
293
|
ignoreHTTPSErrors: true,
|
|
307
294
|
...playwrightDeviceDetailsObject,
|
|
295
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
308
296
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
309
|
-
...(
|
|
297
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
298
|
+
...(httpCredentials && { httpCredentials }),
|
|
310
299
|
};
|
|
311
|
-
// Optionally log for debugging
|
|
312
|
-
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
|
|
313
300
|
},
|
|
314
301
|
],
|
|
315
302
|
},
|
|
316
303
|
requestQueue,
|
|
304
|
+
preNavigationHooks: [
|
|
305
|
+
async (crawlingContext) => {
|
|
306
|
+
if (extraHTTPHeaders) {
|
|
307
|
+
crawlingContext.request.headers = extraHTTPHeaders;
|
|
308
|
+
}
|
|
309
|
+
},
|
|
310
|
+
],
|
|
317
311
|
postNavigationHooks: [
|
|
318
312
|
async (crawlingContext) => {
|
|
319
313
|
const { page, request } = crawlingContext;
|
|
@@ -390,11 +384,9 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
390
384
|
return;
|
|
391
385
|
}
|
|
392
386
|
const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
|
393
|
-
if (
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
durationExceeded = true;
|
|
397
|
-
}
|
|
387
|
+
if (hasExceededDuration) {
|
|
388
|
+
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
|
|
389
|
+
durationExceeded = true;
|
|
398
390
|
isAbortingScanNow = true;
|
|
399
391
|
activeCrawler.autoscaledPool.abort();
|
|
400
392
|
return;
|
|
@@ -527,8 +519,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
527
519
|
});
|
|
528
520
|
return;
|
|
529
521
|
}
|
|
530
|
-
|
|
531
|
-
if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
|
|
522
|
+
if (rateController.claimSlot()) {
|
|
532
523
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
533
524
|
numScanned: urlsCrawled.scanned.length,
|
|
534
525
|
urlScanned: request.url,
|
|
@@ -538,6 +529,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
538
529
|
pageTitle: results.pageTitle,
|
|
539
530
|
actualUrl, // i.e. actualUrl
|
|
540
531
|
});
|
|
532
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
533
|
+
if (rateController.isLimitReached()) {
|
|
534
|
+
isAbortingScanNow = true;
|
|
535
|
+
activeCrawler.autoscaledPool.abort();
|
|
536
|
+
}
|
|
541
537
|
scannedUrlSet.add(normUrl(request.url));
|
|
542
538
|
scannedResolvedUrlSet.add(normUrl(actualUrl));
|
|
543
539
|
urlsCrawled.scannedRedirects.push({
|
|
@@ -549,8 +545,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
549
545
|
await dataset.pushData(results);
|
|
550
546
|
}
|
|
551
547
|
}
|
|
552
|
-
else if (
|
|
553
|
-
// One more check if scanned pages have reached limit due to multi-instances of handler running
|
|
548
|
+
else if (rateController.claimSlot()) {
|
|
554
549
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
555
550
|
numScanned: urlsCrawled.scanned.length,
|
|
556
551
|
urlScanned: request.url,
|
|
@@ -560,6 +555,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
560
555
|
actualUrl: request.url,
|
|
561
556
|
pageTitle: results.pageTitle,
|
|
562
557
|
});
|
|
558
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
559
|
+
if (rateController.isLimitReached()) {
|
|
560
|
+
isAbortingScanNow = true;
|
|
561
|
+
activeCrawler.autoscaledPool.abort();
|
|
562
|
+
}
|
|
563
563
|
scannedUrlSet.add(normUrl(request.url));
|
|
564
564
|
scannedResolvedUrlSet.add(normUrl(request.url));
|
|
565
565
|
await dataset.pushData(results);
|
|
@@ -611,30 +611,29 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
611
611
|
}
|
|
612
612
|
}
|
|
613
613
|
catch {
|
|
614
|
-
//
|
|
615
|
-
}
|
|
616
|
-
// when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
|
|
617
|
-
// a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
|
|
618
|
-
if (!isAbortingScanNow) {
|
|
619
|
-
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
620
|
-
numScanned: urlsCrawled.scanned.length,
|
|
621
|
-
urlScanned: request.url,
|
|
622
|
-
});
|
|
623
|
-
urlsCrawled.error.push({
|
|
624
|
-
url: request.url,
|
|
625
|
-
pageTitle: request.url,
|
|
626
|
-
actualUrl: request.url,
|
|
627
|
-
metadata: STATUS_CODE_METADATA[2],
|
|
628
|
-
});
|
|
614
|
+
// Recovery failed; Crawlee will retry the request automatically
|
|
629
615
|
}
|
|
616
|
+
// Do not push to urlsCrawled.error here — Crawlee will retry the request
|
|
617
|
+
// (up to maxRequestRetries, default 3). If all retries are exhausted,
|
|
618
|
+
// failedRequestHandler will record the error. Pushing here causes
|
|
619
|
+
// duplicates and false positives for URLs that succeed on retry.
|
|
630
620
|
}
|
|
631
621
|
},
|
|
632
622
|
failedRequestHandler: async ({ request, response }) => {
|
|
623
|
+
if (isAbortingScanNow) {
|
|
624
|
+
return;
|
|
625
|
+
}
|
|
626
|
+
const status = response?.status();
|
|
627
|
+
if (rateController.onFailure(status, crawler.autoscaledPool)) {
|
|
628
|
+
consoleLogger.info(`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`);
|
|
629
|
+
isAbortingScanNow = true;
|
|
630
|
+
crawler.autoscaledPool?.abort();
|
|
631
|
+
return;
|
|
632
|
+
}
|
|
633
633
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
634
634
|
numScanned: urlsCrawled.scanned.length,
|
|
635
635
|
urlScanned: request.url,
|
|
636
636
|
});
|
|
637
|
-
const status = response?.status();
|
|
638
637
|
const metadata = typeof status === 'number'
|
|
639
638
|
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
640
639
|
: STATUS_CODE_METADATA[2];
|
|
@@ -648,15 +647,13 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
648
647
|
},
|
|
649
648
|
maxRequestsPerCrawl: Infinity,
|
|
650
649
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
},
|
|
659
|
-
}),
|
|
650
|
+
autoscaledPoolOptions: {
|
|
651
|
+
minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
|
|
652
|
+
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
653
|
+
desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
|
|
654
|
+
scaleUpStepRatio: 0.99, // Scale up faster
|
|
655
|
+
scaleDownStepRatio: 0.1, // Scale down slower
|
|
656
|
+
},
|
|
660
657
|
}));
|
|
661
658
|
await crawler.run();
|
|
662
659
|
// Additional passes: keep re-visiting scanned seed-hostname pages for
|
|
@@ -675,7 +672,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
675
672
|
.map(item => item.actualUrl || item.url)
|
|
676
673
|
.filter(pageUrl => {
|
|
677
674
|
try {
|
|
678
|
-
return new URL(pageUrl).hostname
|
|
675
|
+
return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
|
|
679
676
|
}
|
|
680
677
|
catch {
|
|
681
678
|
return false;
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
1
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
2
2
|
import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
|
|
3
3
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
4
4
|
import crawlDomain from './crawlDomain.js';
|
|
5
5
|
import crawlSitemap from './crawlSitemap.js';
|
|
6
|
-
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
|
|
6
|
+
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
|
|
7
7
|
import { register } from '../utils.js';
|
|
8
8
|
const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, safeMode, scanDuration) => {
|
|
9
9
|
const startTime = Date.now(); // Track start time
|
|
@@ -15,6 +15,9 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
15
15
|
let sitemapUrl;
|
|
16
16
|
let durationExceeded = false;
|
|
17
17
|
({ dataset } = await createCrawleeSubFolders(randomToken));
|
|
18
|
+
// Initialise modified User-Agent early so sitemap discovery requests
|
|
19
|
+
// don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
|
|
20
|
+
await initModifiedUserAgent(browser);
|
|
18
21
|
function getHomeUrl(parsedUrl) {
|
|
19
22
|
const urlObject = new URL(parsedUrl);
|
|
20
23
|
return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
|
|
@@ -23,24 +26,31 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
23
26
|
const homeUrl = getHomeUrl(link);
|
|
24
27
|
let sitemapLink = '';
|
|
25
28
|
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
29
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
26
30
|
let context;
|
|
27
31
|
let browserInstance;
|
|
28
32
|
if (process.env.CRAWLEE_HEADLESS === '1') {
|
|
29
33
|
const effectiveUserDataDirectory = userDataDirectory || '';
|
|
30
34
|
context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
|
31
35
|
...launchOptions,
|
|
32
|
-
...(
|
|
36
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
37
|
+
...(httpCredentials && { httpCredentials }),
|
|
38
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
33
39
|
});
|
|
34
40
|
register(context);
|
|
35
41
|
}
|
|
36
42
|
else {
|
|
37
|
-
// In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
|
|
38
43
|
browserInstance = await constants.launcher.launch(launchOptions);
|
|
39
44
|
register(browserInstance);
|
|
40
45
|
context = await browserInstance.newContext({
|
|
41
|
-
...(
|
|
46
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
47
|
+
...(httpCredentials && { httpCredentials }),
|
|
48
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
42
49
|
});
|
|
43
50
|
}
|
|
51
|
+
if (authHeader) {
|
|
52
|
+
await addAuthRouteHandler(context, link, authHeader);
|
|
53
|
+
}
|
|
44
54
|
const page = await context.newPage();
|
|
45
55
|
for (const path of sitemapPaths) {
|
|
46
56
|
sitemapLink = homeUrl + path;
|
|
@@ -59,7 +69,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
59
69
|
const checkUrlExists = async (page, parsedUrl) => {
|
|
60
70
|
try {
|
|
61
71
|
const response = await page.goto(parsedUrl);
|
|
62
|
-
return response
|
|
72
|
+
return response?.ok() ?? false;
|
|
63
73
|
}
|
|
64
74
|
catch (e) {
|
|
65
75
|
consoleLogger.error(e);
|
|
@@ -71,7 +81,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
71
81
|
try {
|
|
72
82
|
sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
|
|
73
83
|
if (sitemapUrls.length > 0) {
|
|
74
|
-
|
|
84
|
+
consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
|
|
75
85
|
sitemapExist = true;
|
|
76
86
|
}
|
|
77
87
|
}
|
|
@@ -91,7 +101,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
91
101
|
}
|
|
92
102
|
}
|
|
93
103
|
if (!sitemapExist) {
|
|
94
|
-
|
|
104
|
+
consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
|
|
95
105
|
return await crawlDomain({
|
|
96
106
|
url,
|
|
97
107
|
randomToken,
|
|
@@ -121,7 +131,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
121
131
|
durationExceeded = true;
|
|
122
132
|
break;
|
|
123
133
|
}
|
|
124
|
-
|
|
134
|
+
consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
|
|
125
135
|
urlsCrawledFinal = await crawlSitemap({
|
|
126
136
|
sitemapUrl: currentSitemapUrl,
|
|
127
137
|
randomToken,
|
|
@@ -149,7 +159,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
149
159
|
const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
|
|
150
160
|
const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
|
|
151
161
|
if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
|
|
152
|
-
|
|
162
|
+
consoleLogger.info(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
|
|
153
163
|
urlsCrawledFinal = await crawlDomain({
|
|
154
164
|
url,
|
|
155
165
|
randomToken,
|
|
@@ -173,7 +183,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
173
183
|
});
|
|
174
184
|
}
|
|
175
185
|
else if (!hasDurationRemaining) {
|
|
176
|
-
|
|
186
|
+
consoleLogger.info(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
|
|
177
187
|
durationExceeded = true;
|
|
178
188
|
}
|
|
179
189
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { consoleLogger } from '../logs.js';
|
|
2
|
+
export class CrawlRateController {
|
|
3
|
+
constructor(maxRequestsPerCrawl, maxConcurrency) {
|
|
4
|
+
this.scannedCount = 0;
|
|
5
|
+
this.consecutiveFailures = 0;
|
|
6
|
+
this.consecutiveSuccesses = 0;
|
|
7
|
+
this.maxPages = maxRequestsPerCrawl;
|
|
8
|
+
this.maxConsecutiveFailures = Number(process.env.OOBEE_CONSECUTIVE_MAX_RETRIES) || 100;
|
|
9
|
+
this.originalMaxConcurrency = maxConcurrency;
|
|
10
|
+
}
|
|
11
|
+
claimSlot() {
|
|
12
|
+
if (this.scannedCount >= this.maxPages) {
|
|
13
|
+
return false;
|
|
14
|
+
}
|
|
15
|
+
this.scannedCount++;
|
|
16
|
+
return true;
|
|
17
|
+
}
|
|
18
|
+
onSuccess(pool) {
|
|
19
|
+
this.consecutiveFailures = 0;
|
|
20
|
+
this.consecutiveSuccesses++;
|
|
21
|
+
if (pool && this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0) {
|
|
22
|
+
if (pool.maxConcurrency < this.originalMaxConcurrency) {
|
|
23
|
+
pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
|
|
24
|
+
consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
onFailure(httpStatus, pool) {
|
|
29
|
+
if (typeof httpStatus !== 'number' || httpStatus < 400) {
|
|
30
|
+
return false;
|
|
31
|
+
}
|
|
32
|
+
this.consecutiveSuccesses = 0;
|
|
33
|
+
this.consecutiveFailures++;
|
|
34
|
+
if (pool && pool.maxConcurrency > 1) {
|
|
35
|
+
pool.maxConcurrency = Math.max(1, Math.floor(pool.maxConcurrency / 2));
|
|
36
|
+
consoleLogger.info(`Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`);
|
|
37
|
+
}
|
|
38
|
+
if (this.consecutiveFailures >= this.maxConsecutiveFailures) {
|
|
39
|
+
return true;
|
|
40
|
+
}
|
|
41
|
+
return false;
|
|
42
|
+
}
|
|
43
|
+
isLimitReached() {
|
|
44
|
+
return this.scannedCount >= this.maxPages;
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
CrawlRateController.RECOVERY_INTERVAL = 10;
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
import crawlee, { EnqueueStrategy, RequestList } from 'crawlee';
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
|
|
2
|
+
import { CrawlRateController } from './crawlRateController.js';
|
|
3
|
+
import { createCrawleeSubFolders, getPreLaunchHook, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
|
|
5
4
|
import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, disallowedListOfPatterns, FileTypes, } from '../constants/constants.js';
|
|
6
5
|
import { getLinksFromSitemap, getPlaywrightLaunchOptions, isSkippedUrl, waitForPageLoaded, isFilePath, } from '../constants/common.js';
|
|
7
6
|
import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
|
|
8
7
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
|
|
9
|
-
import { guiInfoLog } from '../logs.js';
|
|
8
|
+
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
10
9
|
const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = '', scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
|
|
11
10
|
const crawlStartTime = Date.now();
|
|
12
11
|
let dataset;
|
|
13
12
|
let urlsCrawled;
|
|
14
13
|
let durationExceeded = false;
|
|
15
14
|
let isAbortingScan = false;
|
|
15
|
+
const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
|
|
16
16
|
if (fromCrawlIntelligentSitemap) {
|
|
17
17
|
dataset = datasetFromIntelligent;
|
|
18
18
|
urlsCrawled = urlsCrawledFromIntelligent;
|
|
@@ -40,31 +40,20 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
40
40
|
launchContext: {
|
|
41
41
|
launcher: constants.launcher,
|
|
42
42
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
|
43
|
-
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
44
|
-
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
45
43
|
},
|
|
46
44
|
retryOnBlocked: true,
|
|
47
45
|
browserPoolOptions: {
|
|
48
46
|
useFingerprints: false,
|
|
49
47
|
preLaunchHooks: [
|
|
48
|
+
getPreLaunchHook(userDataDirectory),
|
|
50
49
|
async (_pageId, launchContext) => {
|
|
51
|
-
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
|
|
52
|
-
// Ensure base exists
|
|
53
|
-
await fsp.mkdir(baseDir, { recursive: true });
|
|
54
|
-
// Create a unique subdir per browser
|
|
55
|
-
const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
|
|
56
|
-
await fsp.mkdir(subProfileDir, { recursive: true });
|
|
57
|
-
// Assign to Crawlee's launcher
|
|
58
|
-
launchContext.userDataDir = subProfileDir;
|
|
59
|
-
// Safely extend launchOptions
|
|
60
50
|
launchContext.launchOptions = {
|
|
61
51
|
...launchContext.launchOptions,
|
|
62
52
|
ignoreHTTPSErrors: true,
|
|
63
53
|
...playwrightDeviceDetailsObject,
|
|
54
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
64
55
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
65
56
|
};
|
|
66
|
-
// Optionally log for debugging
|
|
67
|
-
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
|
|
68
57
|
},
|
|
69
58
|
],
|
|
70
59
|
},
|
|
@@ -149,13 +138,11 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
149
138
|
await waitForPageLoaded(page, 10000);
|
|
150
139
|
const actualUrl = page.url() || request.loadedUrl || request.url;
|
|
151
140
|
const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
|
152
|
-
if (
|
|
141
|
+
if (hasExceededDuration) {
|
|
142
|
+
consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
|
|
143
|
+
durationExceeded = true;
|
|
153
144
|
isAbortingScan = true;
|
|
154
|
-
|
|
155
|
-
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
|
|
156
|
-
durationExceeded = true;
|
|
157
|
-
}
|
|
158
|
-
crawler.autoscaledPool.abort(); // stops new requests
|
|
145
|
+
crawler.autoscaledPool.abort();
|
|
159
146
|
return;
|
|
160
147
|
}
|
|
161
148
|
if (request.skipNavigation && actualUrl === 'about:blank') {
|
|
@@ -245,22 +232,29 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
245
232
|
catch (_) {
|
|
246
233
|
// Page/context was destroyed during navigation — handled by outer catch
|
|
247
234
|
}
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
235
|
+
if (rateController.claimSlot()) {
|
|
236
|
+
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
237
|
+
numScanned: urlsCrawled.scanned.length,
|
|
238
|
+
urlScanned: request.url,
|
|
239
|
+
});
|
|
240
|
+
urlsCrawled.scanned.push({
|
|
241
|
+
url: request.url,
|
|
242
|
+
pageTitle: results.pageTitle,
|
|
243
|
+
actualUrl, // i.e. actualUrl
|
|
244
|
+
});
|
|
245
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
246
|
+
if (rateController.isLimitReached()) {
|
|
247
|
+
isAbortingScan = true;
|
|
248
|
+
crawler.autoscaledPool.abort();
|
|
249
|
+
}
|
|
250
|
+
urlsCrawled.scannedRedirects.push({
|
|
251
|
+
fromUrl: request.url,
|
|
252
|
+
toUrl: actualUrl,
|
|
253
|
+
});
|
|
254
|
+
results.url = request.url;
|
|
255
|
+
results.actualUrl = actualUrl;
|
|
256
|
+
await dataset.pushData(results);
|
|
257
|
+
}
|
|
264
258
|
}
|
|
265
259
|
else {
|
|
266
260
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
@@ -284,30 +278,27 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
284
278
|
}
|
|
285
279
|
}
|
|
286
280
|
catch (e) {
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
});
|
|
292
|
-
urlsCrawled.error.push({
|
|
293
|
-
url: request.url,
|
|
294
|
-
pageTitle: request.url,
|
|
295
|
-
actualUrl: request.url,
|
|
296
|
-
metadata: STATUS_CODE_METADATA[2],
|
|
297
|
-
httpStatusCode: 0,
|
|
298
|
-
});
|
|
299
|
-
}
|
|
281
|
+
// Do not push to urlsCrawled.error here — Crawlee will retry the request
|
|
282
|
+
// (up to maxRequestRetries, default 3). If all retries are exhausted,
|
|
283
|
+
// failedRequestHandler will record the error. Pushing here causes
|
|
284
|
+
// duplicates and false positives for URLs that succeed on retry.
|
|
300
285
|
}
|
|
301
286
|
},
|
|
302
287
|
failedRequestHandler: async ({ request, response, error }) => {
|
|
303
288
|
if (isAbortingScan) {
|
|
304
289
|
return;
|
|
305
290
|
}
|
|
291
|
+
const status = response?.status();
|
|
292
|
+
if (rateController.onFailure(status, crawler.autoscaledPool)) {
|
|
293
|
+
consoleLogger.info(`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`);
|
|
294
|
+
isAbortingScan = true;
|
|
295
|
+
crawler.autoscaledPool?.abort();
|
|
296
|
+
return;
|
|
297
|
+
}
|
|
306
298
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
307
299
|
numScanned: urlsCrawled.scanned.length,
|
|
308
300
|
urlScanned: request.url,
|
|
309
301
|
});
|
|
310
|
-
const status = response?.status();
|
|
311
302
|
const metadata = typeof status === 'number'
|
|
312
303
|
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
313
304
|
: STATUS_CODE_METADATA[2];
|
|
@@ -322,15 +313,13 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
322
313
|
},
|
|
323
314
|
maxRequestsPerCrawl: Infinity,
|
|
324
315
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
},
|
|
333
|
-
}),
|
|
316
|
+
autoscaledPoolOptions: {
|
|
317
|
+
minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
|
|
318
|
+
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
319
|
+
desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
|
|
320
|
+
scaleUpStepRatio: 0.99, // Scale up faster
|
|
321
|
+
scaleDownStepRatio: 0.1, // Scale down slower
|
|
322
|
+
},
|
|
334
323
|
}));
|
|
335
324
|
await crawler.run();
|
|
336
325
|
await requestList.isFinished();
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/* eslint-env browser */
|
|
2
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
2
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
3
3
|
import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
|
|
4
4
|
import constants, { getIntermediateScreenshotsPath, guiInfoStatusTypes, } from '../constants/constants.js';
|
|
5
5
|
import { initNewPage, log } from './custom/utils.js';
|
|
@@ -18,7 +18,7 @@ export class ProcessPageParams {
|
|
|
18
18
|
this.randomToken = randomToken;
|
|
19
19
|
}
|
|
20
20
|
}
|
|
21
|
-
const runCustom = async (url, randomToken, browserToRun, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, initialCustomFlowLabel) => {
|
|
21
|
+
const runCustom = async (url, randomToken, browserToRun, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, initialCustomFlowLabel, extraHTTPHeaders) => {
|
|
22
22
|
// checks and delete datasets path if it already exists
|
|
23
23
|
process.env.CRAWLEE_STORAGE_DIR = randomToken;
|
|
24
24
|
const urlsCrawled = { ...constants.urlsCrawledObj };
|
|
@@ -47,6 +47,7 @@ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, view
|
|
|
47
47
|
...baseArgs.filter(a => !a.startsWith('--window-size') && a !== '--start-maximized'),
|
|
48
48
|
...customArgs,
|
|
49
49
|
];
|
|
50
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
50
51
|
const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
|
|
51
52
|
...baseLaunchOptions,
|
|
52
53
|
args: mergedArgs,
|
|
@@ -56,7 +57,12 @@ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, view
|
|
|
56
57
|
viewport: null,
|
|
57
58
|
...(hasCustomViewport ? contextDeviceOptions : {}),
|
|
58
59
|
userAgent: process.env.OOBEE_USER_AGENT || deviceUserAgent,
|
|
60
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
61
|
+
...(httpCredentials && { httpCredentials }),
|
|
59
62
|
});
|
|
63
|
+
if (authHeader) {
|
|
64
|
+
await addAuthRouteHandler(context, url, authHeader);
|
|
65
|
+
}
|
|
60
66
|
register(context);
|
|
61
67
|
processPageParams.stopAll = async () => {
|
|
62
68
|
try {
|
|
@@ -51,7 +51,7 @@ const SENTRY_NODE_VERSION = (() => {
|
|
|
51
51
|
return _require('@sentry/node/package.json').version;
|
|
52
52
|
}
|
|
53
53
|
catch {
|
|
54
|
-
return '
|
|
54
|
+
return '10.58.0'; // safe fallback matching currently installed version
|
|
55
55
|
}
|
|
56
56
|
})();
|
|
57
57
|
// ---------------------------------------------------------------------------
|
|
@@ -444,6 +444,37 @@ const scanApiScript = (shortDescMap, longDescMap, stepByStepMap) => `
|
|
|
444
444
|
// Run axe-core + oobee custom checks
|
|
445
445
|
var scanResult = await window.runA11yScan(elementsToScan, '');
|
|
446
446
|
|
|
447
|
+
// Re-verify aria-hidden-focus violations against the live DOM to handle
|
|
448
|
+
// race conditions with JS that sets tabindex="-1" after aria-hidden
|
|
449
|
+
var axeViolations = scanResult.axeScanResults.violations || [];
|
|
450
|
+
var ariaHiddenViolation = axeViolations.find(function(v) { return v.id === 'aria-hidden-focus'; });
|
|
451
|
+
if (ariaHiddenViolation) {
|
|
452
|
+
await new Promise(function(resolve) { setTimeout(resolve, 0); });
|
|
453
|
+
ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(function(node) {
|
|
454
|
+
var selector = node.target && node.target[0];
|
|
455
|
+
if (typeof selector !== 'string') return true;
|
|
456
|
+
try {
|
|
457
|
+
var el = document.querySelector(selector);
|
|
458
|
+
if (!el) return true;
|
|
459
|
+
var focusables = el.querySelectorAll(
|
|
460
|
+
'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]'
|
|
461
|
+
);
|
|
462
|
+
if (focusables.length === 0) return false;
|
|
463
|
+
return Array.from(focusables).some(function(child) {
|
|
464
|
+
var tabindex = child.getAttribute('tabindex');
|
|
465
|
+
if (tabindex === null) return true;
|
|
466
|
+
var parsed = parseInt(tabindex, 10);
|
|
467
|
+
return isNaN(parsed) || parsed >= 0;
|
|
468
|
+
});
|
|
469
|
+
} catch (e) { return true; }
|
|
470
|
+
});
|
|
471
|
+
if (ariaHiddenViolation.nodes.length === 0) {
|
|
472
|
+
scanResult.axeScanResults.violations = axeViolations.filter(function(v) {
|
|
473
|
+
return v.id !== 'aria-hidden-focus';
|
|
474
|
+
});
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
|
|
447
478
|
// Convert raw axe results into oobee category structure
|
|
448
479
|
var filtered = _oobeeFilterAxeResults(scanResult.axeScanResults, scanResult.pageTitle);
|
|
449
480
|
|