@govtechsg/oobee 0.10.91 → 0.10.92
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +289 -0
- package/README.md +3 -0
- package/dist/cli.js +3 -0
- package/dist/combine.js +14 -2
- package/dist/constants/cliFunctions.js +7 -0
- package/dist/constants/common.js +119 -70
- package/dist/constants/constants.js +1 -0
- package/dist/crawlers/commonCrawlerFunc.js +93 -15
- package/dist/crawlers/crawlDomain.js +45 -57
- package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
- package/dist/crawlers/crawlRateController.js +47 -0
- package/dist/crawlers/crawlSitemap.js +51 -62
- package/dist/generateOobeeClientScanner.js +31 -0
- package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
- package/dist/mergeAxeResults.js +120 -92
- package/dist/npmIndex.js +1 -0
- package/dist/utils.js +23 -28
- package/oobee-client-scanner.js +33 -2
- package/package.json +2 -2
- package/src/cli.ts +4 -0
- package/src/combine.ts +15 -1
- package/src/constants/cliFunctions.ts +7 -0
- package/src/constants/common.ts +131 -79
- package/src/constants/constants.ts +1 -0
- package/src/crawlers/commonCrawlerFunc.ts +103 -14
- package/src/crawlers/crawlDomain.ts +52 -65
- package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
- package/src/crawlers/crawlRateController.ts +63 -0
- package/src/crawlers/crawlSitemap.ts +57 -70
- package/src/generateOobeeClientScanner.ts +31 -0
- package/src/index.ts +1 -0
- package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
- package/src/mergeAxeResults.ts +139 -99
- package/src/npmIndex.ts +1 -0
- package/src/utils.ts +25 -33
- /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import crawlee, { EnqueueStrategy } from 'crawlee';
|
|
2
|
+
import { CrawlRateController } from './crawlRateController.js';
|
|
2
3
|
import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
|
|
3
4
|
import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
|
|
4
|
-
import * as path from 'path';
|
|
5
|
-
import fsp from 'fs/promises';
|
|
6
5
|
import {
|
|
7
6
|
createCrawleeSubFolders,
|
|
7
|
+
getPreLaunchHook,
|
|
8
8
|
runAxeScript,
|
|
9
9
|
isUrlPdf,
|
|
10
10
|
shouldSkipClickDueToDisallowedHref,
|
|
@@ -29,7 +29,7 @@ import {
|
|
|
29
29
|
getUrlsFromRobotsTxt,
|
|
30
30
|
waitForPageLoaded,
|
|
31
31
|
} from '../constants/common.js';
|
|
32
|
-
import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
|
|
32
|
+
import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
|
|
33
33
|
import {
|
|
34
34
|
handlePdfDownload,
|
|
35
35
|
runPdfScan,
|
|
@@ -364,9 +364,7 @@ const crawlDomain = async ({
|
|
|
364
364
|
// same-domain strategy) still contribute their <a> links above, but
|
|
365
365
|
// clicking every interactive element on them is too slow and starves
|
|
366
366
|
// the crawler of time to discover pages on the primary hostname.
|
|
367
|
-
|
|
368
|
-
const seedHostname = new URL(url).hostname;
|
|
369
|
-
if (currentHostname === seedHostname) {
|
|
367
|
+
if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
|
|
370
368
|
// Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
|
|
371
369
|
try {
|
|
372
370
|
await customEnqueueLinksByClickingElements(page, browserContext);
|
|
@@ -382,49 +380,32 @@ const crawlDomain = async ({
|
|
|
382
380
|
};
|
|
383
381
|
|
|
384
382
|
let isAbortingScanNow = false;
|
|
383
|
+
const rateController = new CrawlRateController(
|
|
384
|
+
maxRequestsPerCrawl,
|
|
385
|
+
specifiedMaxConcurrency || constants.maxConcurrency,
|
|
386
|
+
);
|
|
385
387
|
|
|
386
388
|
const crawler = register(
|
|
387
389
|
new crawlee.PlaywrightCrawler({
|
|
388
390
|
launchContext: {
|
|
389
391
|
launcher: constants.launcher,
|
|
390
392
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
|
391
|
-
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
392
|
-
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
393
393
|
},
|
|
394
394
|
retryOnBlocked: true,
|
|
395
395
|
browserPoolOptions: {
|
|
396
396
|
useFingerprints: false,
|
|
397
397
|
preLaunchHooks: [
|
|
398
|
+
getPreLaunchHook(userDataDirectory),
|
|
398
399
|
async (_pageId, launchContext) => {
|
|
399
|
-
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
|
|
400
|
-
|
|
401
|
-
// Ensure base exists
|
|
402
|
-
await fsp.mkdir(baseDir, { recursive: true });
|
|
403
|
-
|
|
404
|
-
// Create a unique subdir per browser
|
|
405
|
-
const subProfileDir = path.join(
|
|
406
|
-
baseDir,
|
|
407
|
-
`profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
|
|
408
|
-
);
|
|
409
|
-
await fsp.mkdir(subProfileDir, { recursive: true });
|
|
410
|
-
|
|
411
|
-
// Assign to Crawlee's launcher
|
|
412
|
-
// Crawlee preLaunchHooks expects launchContext to be mutated in-place.
|
|
413
|
-
// eslint-disable-next-line no-param-reassign
|
|
414
|
-
launchContext.userDataDir = subProfileDir;
|
|
415
|
-
|
|
416
|
-
// Safely extend launchOptions
|
|
417
400
|
// eslint-disable-next-line no-param-reassign
|
|
418
401
|
launchContext.launchOptions = {
|
|
419
402
|
...launchContext.launchOptions,
|
|
420
403
|
ignoreHTTPSErrors: true,
|
|
421
404
|
...playwrightDeviceDetailsObject,
|
|
405
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
422
406
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
423
407
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
424
408
|
};
|
|
425
|
-
|
|
426
|
-
// Optionally log for debugging
|
|
427
|
-
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
|
|
428
409
|
},
|
|
429
410
|
],
|
|
430
411
|
},
|
|
@@ -527,11 +508,9 @@ const crawlDomain = async ({
|
|
|
527
508
|
const hasExceededDuration =
|
|
528
509
|
scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
|
529
510
|
|
|
530
|
-
if (
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
durationExceeded = true;
|
|
534
|
-
}
|
|
511
|
+
if (hasExceededDuration) {
|
|
512
|
+
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
|
|
513
|
+
durationExceeded = true;
|
|
535
514
|
isAbortingScanNow = true;
|
|
536
515
|
activeCrawler.autoscaledPool.abort();
|
|
537
516
|
return;
|
|
@@ -691,8 +670,7 @@ const crawlDomain = async ({
|
|
|
691
670
|
return;
|
|
692
671
|
}
|
|
693
672
|
|
|
694
|
-
|
|
695
|
-
if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
|
|
673
|
+
if (rateController.claimSlot()) {
|
|
696
674
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
697
675
|
numScanned: urlsCrawled.scanned.length,
|
|
698
676
|
urlScanned: request.url,
|
|
@@ -703,6 +681,11 @@ const crawlDomain = async ({
|
|
|
703
681
|
pageTitle: results.pageTitle,
|
|
704
682
|
actualUrl, // i.e. actualUrl
|
|
705
683
|
});
|
|
684
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
685
|
+
if (rateController.isLimitReached()) {
|
|
686
|
+
isAbortingScanNow = true;
|
|
687
|
+
activeCrawler.autoscaledPool.abort();
|
|
688
|
+
}
|
|
706
689
|
scannedUrlSet.add(normUrl(request.url));
|
|
707
690
|
scannedResolvedUrlSet.add(normUrl(actualUrl));
|
|
708
691
|
|
|
@@ -715,8 +698,7 @@ const crawlDomain = async ({
|
|
|
715
698
|
results.actualUrl = actualUrl;
|
|
716
699
|
await dataset.pushData(results);
|
|
717
700
|
}
|
|
718
|
-
} else if (
|
|
719
|
-
// One more check if scanned pages have reached limit due to multi-instances of handler running
|
|
701
|
+
} else if (rateController.claimSlot()) {
|
|
720
702
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
721
703
|
numScanned: urlsCrawled.scanned.length,
|
|
722
704
|
urlScanned: request.url,
|
|
@@ -726,6 +708,11 @@ const crawlDomain = async ({
|
|
|
726
708
|
actualUrl: request.url,
|
|
727
709
|
pageTitle: results.pageTitle,
|
|
728
710
|
});
|
|
711
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
712
|
+
if (rateController.isLimitReached()) {
|
|
713
|
+
isAbortingScanNow = true;
|
|
714
|
+
activeCrawler.autoscaledPool.abort();
|
|
715
|
+
}
|
|
729
716
|
scannedUrlSet.add(normUrl(request.url));
|
|
730
717
|
scannedResolvedUrlSet.add(normUrl(request.url));
|
|
731
718
|
await dataset.pushData(results);
|
|
@@ -777,33 +764,35 @@ const crawlDomain = async ({
|
|
|
777
764
|
});
|
|
778
765
|
}
|
|
779
766
|
} catch {
|
|
780
|
-
//
|
|
767
|
+
// Recovery failed; Crawlee will retry the request automatically
|
|
781
768
|
}
|
|
782
769
|
|
|
783
|
-
//
|
|
784
|
-
//
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
numScanned: urlsCrawled.scanned.length,
|
|
788
|
-
urlScanned: request.url,
|
|
789
|
-
});
|
|
790
|
-
|
|
791
|
-
urlsCrawled.error.push({
|
|
792
|
-
url: request.url,
|
|
793
|
-
pageTitle: request.url,
|
|
794
|
-
actualUrl: request.url,
|
|
795
|
-
metadata: STATUS_CODE_METADATA[2],
|
|
796
|
-
});
|
|
797
|
-
}
|
|
770
|
+
// Do not push to urlsCrawled.error here — Crawlee will retry the request
|
|
771
|
+
// (up to maxRequestRetries, default 3). If all retries are exhausted,
|
|
772
|
+
// failedRequestHandler will record the error. Pushing here causes
|
|
773
|
+
// duplicates and false positives for URLs that succeed on retry.
|
|
798
774
|
}
|
|
799
775
|
},
|
|
800
776
|
failedRequestHandler: async ({ request, response }) => {
|
|
777
|
+
if (isAbortingScanNow) {
|
|
778
|
+
return;
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
const status = response?.status();
|
|
782
|
+
if (rateController.onFailure(status, crawler.autoscaledPool)) {
|
|
783
|
+
consoleLogger.info(
|
|
784
|
+
`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
|
|
785
|
+
);
|
|
786
|
+
isAbortingScanNow = true;
|
|
787
|
+
crawler.autoscaledPool?.abort();
|
|
788
|
+
return;
|
|
789
|
+
}
|
|
790
|
+
|
|
801
791
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
802
792
|
numScanned: urlsCrawled.scanned.length,
|
|
803
793
|
urlScanned: request.url,
|
|
804
794
|
});
|
|
805
795
|
|
|
806
|
-
const status = response?.status();
|
|
807
796
|
const metadata =
|
|
808
797
|
typeof status === 'number'
|
|
809
798
|
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
@@ -819,15 +808,13 @@ const crawlDomain = async ({
|
|
|
819
808
|
},
|
|
820
809
|
maxRequestsPerCrawl: Infinity,
|
|
821
810
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
},
|
|
830
|
-
}),
|
|
811
|
+
autoscaledPoolOptions: {
|
|
812
|
+
minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
|
|
813
|
+
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
814
|
+
desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
|
|
815
|
+
scaleUpStepRatio: 0.99, // Scale up faster
|
|
816
|
+
scaleDownStepRatio: 0.1, // Scale down slower
|
|
817
|
+
},
|
|
831
818
|
}),
|
|
832
819
|
);
|
|
833
820
|
|
|
@@ -850,7 +837,7 @@ const crawlDomain = async ({
|
|
|
850
837
|
.map(item => item.actualUrl || item.url)
|
|
851
838
|
.filter(pageUrl => {
|
|
852
839
|
try {
|
|
853
|
-
return new URL(pageUrl).hostname
|
|
840
|
+
return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
|
|
854
841
|
} catch {
|
|
855
842
|
return false;
|
|
856
843
|
}
|
|
@@ -7,7 +7,7 @@ import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
|
7
7
|
import crawlDomain from './crawlDomain.js';
|
|
8
8
|
import crawlSitemap from './crawlSitemap.js';
|
|
9
9
|
import { ViewportSettingsClass } from '../combine.js';
|
|
10
|
-
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
|
|
10
|
+
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
|
|
11
11
|
import { register } from '../utils.js';
|
|
12
12
|
|
|
13
13
|
const crawlIntelligentSitemap = async (
|
|
@@ -40,6 +40,10 @@ const crawlIntelligentSitemap = async (
|
|
|
40
40
|
|
|
41
41
|
({ dataset } = await createCrawleeSubFolders(randomToken));
|
|
42
42
|
|
|
43
|
+
// Initialise modified User-Agent early so sitemap discovery requests
|
|
44
|
+
// don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
|
|
45
|
+
await initModifiedUserAgent(browser);
|
|
46
|
+
|
|
43
47
|
function getHomeUrl(parsedUrl: string) {
|
|
44
48
|
const urlObject = new URL(parsedUrl);
|
|
45
49
|
return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
|
|
@@ -62,6 +66,7 @@ const crawlIntelligentSitemap = async (
|
|
|
62
66
|
context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
|
63
67
|
...launchOptions,
|
|
64
68
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
69
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
65
70
|
});
|
|
66
71
|
register(context);
|
|
67
72
|
} else {
|
|
@@ -70,6 +75,7 @@ const crawlIntelligentSitemap = async (
|
|
|
70
75
|
register(browserInstance as unknown as { close: () => Promise<void> });
|
|
71
76
|
context = await browserInstance.newContext({
|
|
72
77
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
78
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
73
79
|
});
|
|
74
80
|
}
|
|
75
81
|
|
|
@@ -93,7 +99,7 @@ const crawlIntelligentSitemap = async (
|
|
|
93
99
|
const checkUrlExists = async (page: Page, parsedUrl: string) => {
|
|
94
100
|
try {
|
|
95
101
|
const response = await page.goto(parsedUrl);
|
|
96
|
-
return response
|
|
102
|
+
return response?.ok() ?? false;
|
|
97
103
|
} catch (e) {
|
|
98
104
|
consoleLogger.error(e);
|
|
99
105
|
return false;
|
|
@@ -105,7 +111,7 @@ const crawlIntelligentSitemap = async (
|
|
|
105
111
|
try {
|
|
106
112
|
sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
|
|
107
113
|
if (sitemapUrls.length > 0) {
|
|
108
|
-
|
|
114
|
+
consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
|
|
109
115
|
sitemapExist = true;
|
|
110
116
|
}
|
|
111
117
|
} catch (error) {
|
|
@@ -125,7 +131,7 @@ const crawlIntelligentSitemap = async (
|
|
|
125
131
|
}
|
|
126
132
|
|
|
127
133
|
if (!sitemapExist) {
|
|
128
|
-
|
|
134
|
+
consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
|
|
129
135
|
return await crawlDomain({
|
|
130
136
|
url,
|
|
131
137
|
randomToken,
|
|
@@ -157,7 +163,7 @@ const crawlIntelligentSitemap = async (
|
|
|
157
163
|
break;
|
|
158
164
|
}
|
|
159
165
|
|
|
160
|
-
|
|
166
|
+
consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
|
|
161
167
|
urlsCrawledFinal = await crawlSitemap({
|
|
162
168
|
sitemapUrl: currentSitemapUrl,
|
|
163
169
|
randomToken,
|
|
@@ -187,7 +193,7 @@ const crawlIntelligentSitemap = async (
|
|
|
187
193
|
const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
|
|
188
194
|
|
|
189
195
|
if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
|
|
190
|
-
|
|
196
|
+
consoleLogger.info(
|
|
191
197
|
`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`,
|
|
192
198
|
);
|
|
193
199
|
urlsCrawledFinal = await crawlDomain({
|
|
@@ -212,7 +218,7 @@ const crawlIntelligentSitemap = async (
|
|
|
212
218
|
scanDuration: remainingScanDuration,
|
|
213
219
|
});
|
|
214
220
|
} else if (!hasDurationRemaining) {
|
|
215
|
-
|
|
221
|
+
consoleLogger.info(
|
|
216
222
|
`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`,
|
|
217
223
|
);
|
|
218
224
|
durationExceeded = true;
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { consoleLogger } from '../logs.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Tracks the page budget of a crawl and adapts pool concurrency to HTTP failures.
 *
 * What the class does:
 *  - Page budget: `claimSlot()` grants at most `maxRequestsPerCrawl` slots.
 *  - Back-off: every failure with an HTTP status >= 400 halves the pool's
 *    `maxConcurrency` (never below 1).
 *  - Recovery: every `RECOVERY_INTERVAL` consecutive successes raise the pool's
 *    `maxConcurrency` by 2, capped at the concurrency given to the constructor.
 *  - Abort signal: `onFailure()` returns `true` once the number of consecutive
 *    HTTP failures reaches the configured threshold.
 *
 * The threshold is read from the `OOBEE_CONSECUTIVE_MAX_RETRIES` environment
 * variable at construction time and defaults to 100.
 */
export class CrawlRateController {
  /** Number of slots handed out so far by `claimSlot()`. */
  private scannedCount = 0;

  /** Upper bound on the number of slots `claimSlot()` may hand out. */
  private readonly maxPages: number;

  /** HTTP failures seen since the last success. */
  private consecutiveFailures = 0;

  /** Successes seen since the last HTTP failure. */
  private consecutiveSuccesses = 0;

  /** Consecutive-failure count at which `onFailure()` starts returning `true`. */
  private readonly maxConsecutiveFailures: number;

  /** Concurrency ceiling that recovery is allowed to climb back to. */
  private readonly originalMaxConcurrency: number;

  /** Number of consecutive successes between two recovery steps. */
  private static readonly RECOVERY_INTERVAL = 10;

  /** Threshold used when the environment variable is unset or unusable. */
  private static readonly DEFAULT_MAX_CONSECUTIVE_FAILURES = 100;

  /**
   * @param maxRequestsPerCrawl - Maximum number of slots `claimSlot()` will grant.
   * @param maxConcurrency - Concurrency ceiling used when recovering after back-off.
   */
  constructor(maxRequestsPerCrawl: number, maxConcurrency: number) {
    this.maxPages = maxRequestsPerCrawl;

    // Only a positive number is a usable threshold. Unset, empty, non-numeric,
    // zero and negative values all fall back to the default; a negative value
    // would otherwise make the very first HTTP failure abort the crawl.
    const configuredThreshold = Number(process.env.OOBEE_CONSECUTIVE_MAX_RETRIES);
    this.maxConsecutiveFailures =
      configuredThreshold > 0
        ? configuredThreshold
        : CrawlRateController.DEFAULT_MAX_CONSECUTIVE_FAILURES;

    this.originalMaxConcurrency = maxConcurrency;
  }

  /**
   * Reserves one slot of the page budget.
   *
   * @returns `true` if a slot was reserved, `false` if the budget is exhausted.
   */
  claimSlot(): boolean {
    if (this.scannedCount >= this.maxPages) {
      return false;
    }
    this.scannedCount++;
    return true;
  }

  /**
   * Records a success: resets the failure streak and, every
   * `RECOVERY_INTERVAL` consecutive successes, raises `pool.maxConcurrency`
   * by 2 without exceeding the original ceiling.
   *
   * @param pool - Optional object whose `maxConcurrency` is adjusted in place.
   */
  onSuccess(pool?: { maxConcurrency: number }): void {
    this.consecutiveFailures = 0;
    this.consecutiveSuccesses++;

    if (pool && this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0) {
      if (pool.maxConcurrency < this.originalMaxConcurrency) {
        pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
        consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
      }
    }
  }

  /**
   * Records a failed request.
   *
   * Failures without a numeric status, or with a status below 400, are ignored
   * entirely: no counter changes and the pool is left untouched.
   *
   * @param httpStatus - HTTP status of the failed response, if there was one.
   * @param pool - Optional object whose `maxConcurrency` is halved in place.
   * @returns `true` when the consecutive-failure threshold has been reached.
   */
  onFailure(httpStatus: number | undefined, pool?: { maxConcurrency: number }): boolean {
    if (typeof httpStatus !== 'number' || httpStatus < 400) {
      return false;
    }

    this.consecutiveSuccesses = 0;
    this.consecutiveFailures++;

    if (pool && pool.maxConcurrency > 1) {
      pool.maxConcurrency = Math.max(1, Math.floor(pool.maxConcurrency / 2));
      consoleLogger.info(
        `Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`,
      );
    }

    return this.consecutiveFailures >= this.maxConsecutiveFailures;
  }

  /**
   * @returns `true` once every slot of the page budget has been claimed.
   */
  isLimitReached(): boolean {
    return this.scannedCount >= this.maxPages;
  }
}
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import crawlee, { EnqueueStrategy, LaunchContext, Request, RequestList, Dataset } from 'crawlee';
|
|
2
|
+
import { CrawlRateController } from './crawlRateController.js';
|
|
2
3
|
import fs from 'fs';
|
|
3
|
-
import * as path from 'path';
|
|
4
|
-
import fsp from 'fs/promises';
|
|
5
4
|
import {
|
|
6
5
|
createCrawleeSubFolders,
|
|
6
|
+
getPreLaunchHook,
|
|
7
7
|
preNavigationHooks,
|
|
8
8
|
runAxeScript,
|
|
9
9
|
isUrlPdf,
|
|
@@ -30,7 +30,7 @@ import {
|
|
|
30
30
|
mapPdfScanResults,
|
|
31
31
|
doPdfScreenshots,
|
|
32
32
|
} from './pdfScanFunc.js';
|
|
33
|
-
import { guiInfoLog } from '../logs.js';
|
|
33
|
+
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
34
34
|
import { ViewportSettingsClass } from '../combine.js';
|
|
35
35
|
|
|
36
36
|
const crawlSitemap = async ({
|
|
@@ -81,6 +81,10 @@ const crawlSitemap = async ({
|
|
|
81
81
|
let urlsCrawled: UrlsCrawled;
|
|
82
82
|
let durationExceeded = false;
|
|
83
83
|
let isAbortingScan = false;
|
|
84
|
+
const rateController = new CrawlRateController(
|
|
85
|
+
maxRequestsPerCrawl,
|
|
86
|
+
specifiedMaxConcurrency || constants.maxConcurrency,
|
|
87
|
+
);
|
|
84
88
|
|
|
85
89
|
if (fromCrawlIntelligentSitemap) {
|
|
86
90
|
dataset = datasetFromIntelligent;
|
|
@@ -125,39 +129,20 @@ const crawlSitemap = async ({
|
|
|
125
129
|
launchContext: {
|
|
126
130
|
launcher: constants.launcher,
|
|
127
131
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
|
128
|
-
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
129
|
-
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
130
132
|
},
|
|
131
133
|
retryOnBlocked: true,
|
|
132
134
|
browserPoolOptions: {
|
|
133
135
|
useFingerprints: false,
|
|
134
136
|
preLaunchHooks: [
|
|
137
|
+
getPreLaunchHook(userDataDirectory),
|
|
135
138
|
async (_pageId, launchContext) => {
|
|
136
|
-
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
|
|
137
|
-
|
|
138
|
-
// Ensure base exists
|
|
139
|
-
await fsp.mkdir(baseDir, { recursive: true });
|
|
140
|
-
|
|
141
|
-
// Create a unique subdir per browser
|
|
142
|
-
const subProfileDir = path.join(
|
|
143
|
-
baseDir,
|
|
144
|
-
`profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
|
|
145
|
-
);
|
|
146
|
-
await fsp.mkdir(subProfileDir, { recursive: true });
|
|
147
|
-
|
|
148
|
-
// Assign to Crawlee's launcher
|
|
149
|
-
launchContext.userDataDir = subProfileDir;
|
|
150
|
-
|
|
151
|
-
// Safely extend launchOptions
|
|
152
139
|
launchContext.launchOptions = {
|
|
153
140
|
...launchContext.launchOptions,
|
|
154
141
|
ignoreHTTPSErrors: true,
|
|
155
142
|
...playwrightDeviceDetailsObject,
|
|
143
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
156
144
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
157
145
|
};
|
|
158
|
-
|
|
159
|
-
// Optionally log for debugging
|
|
160
|
-
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
|
|
161
146
|
},
|
|
162
147
|
],
|
|
163
148
|
},
|
|
@@ -259,13 +244,11 @@ const crawlSitemap = async ({
|
|
|
259
244
|
const hasExceededDuration =
|
|
260
245
|
scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
|
261
246
|
|
|
262
|
-
if (
|
|
247
|
+
if (hasExceededDuration) {
|
|
248
|
+
consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
|
|
249
|
+
durationExceeded = true;
|
|
263
250
|
isAbortingScan = true;
|
|
264
|
-
|
|
265
|
-
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
|
|
266
|
-
durationExceeded = true;
|
|
267
|
-
}
|
|
268
|
-
crawler.autoscaledPool.abort(); // stops new requests
|
|
251
|
+
crawler.autoscaledPool.abort();
|
|
269
252
|
return;
|
|
270
253
|
}
|
|
271
254
|
|
|
@@ -376,26 +359,33 @@ const crawlSitemap = async ({
|
|
|
376
359
|
// Page/context was destroyed during navigation — handled by outer catch
|
|
377
360
|
}
|
|
378
361
|
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
362
|
+
if (rateController.claimSlot()) {
|
|
363
|
+
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
364
|
+
numScanned: urlsCrawled.scanned.length,
|
|
365
|
+
urlScanned: request.url,
|
|
366
|
+
});
|
|
383
367
|
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
368
|
+
urlsCrawled.scanned.push({
|
|
369
|
+
url: request.url,
|
|
370
|
+
pageTitle: results.pageTitle,
|
|
371
|
+
actualUrl, // i.e. actualUrl
|
|
372
|
+
});
|
|
373
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
374
|
+
if (rateController.isLimitReached()) {
|
|
375
|
+
isAbortingScan = true;
|
|
376
|
+
crawler.autoscaledPool.abort();
|
|
377
|
+
}
|
|
389
378
|
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
379
|
+
urlsCrawled.scannedRedirects.push({
|
|
380
|
+
fromUrl: request.url,
|
|
381
|
+
toUrl: actualUrl,
|
|
382
|
+
});
|
|
394
383
|
|
|
395
|
-
|
|
396
|
-
|
|
384
|
+
results.url = request.url;
|
|
385
|
+
results.actualUrl = actualUrl;
|
|
397
386
|
|
|
398
|
-
|
|
387
|
+
await dataset.pushData(results);
|
|
388
|
+
}
|
|
399
389
|
} else {
|
|
400
390
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
401
391
|
numScanned: urlsCrawled.scanned.length,
|
|
@@ -420,20 +410,10 @@ const crawlSitemap = async ({
|
|
|
420
410
|
}
|
|
421
411
|
}
|
|
422
412
|
} catch (e) {
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
});
|
|
428
|
-
|
|
429
|
-
urlsCrawled.error.push({
|
|
430
|
-
url: request.url,
|
|
431
|
-
pageTitle: request.url,
|
|
432
|
-
actualUrl: request.url,
|
|
433
|
-
metadata: STATUS_CODE_METADATA[2],
|
|
434
|
-
httpStatusCode: 0,
|
|
435
|
-
});
|
|
436
|
-
}
|
|
413
|
+
// Do not push to urlsCrawled.error here — Crawlee will retry the request
|
|
414
|
+
// (up to maxRequestRetries, default 3). If all retries are exhausted,
|
|
415
|
+
// failedRequestHandler will record the error. Pushing here causes
|
|
416
|
+
// duplicates and false positives for URLs that succeed on retry.
|
|
437
417
|
}
|
|
438
418
|
},
|
|
439
419
|
failedRequestHandler: async ({ request, response, error }) => {
|
|
@@ -441,12 +421,21 @@ const crawlSitemap = async ({
|
|
|
441
421
|
return;
|
|
442
422
|
}
|
|
443
423
|
|
|
424
|
+
const status = response?.status();
|
|
425
|
+
if (rateController.onFailure(status, crawler.autoscaledPool)) {
|
|
426
|
+
consoleLogger.info(
|
|
427
|
+
`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
|
|
428
|
+
);
|
|
429
|
+
isAbortingScan = true;
|
|
430
|
+
crawler.autoscaledPool?.abort();
|
|
431
|
+
return;
|
|
432
|
+
}
|
|
433
|
+
|
|
444
434
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
445
435
|
numScanned: urlsCrawled.scanned.length,
|
|
446
436
|
urlScanned: request.url,
|
|
447
437
|
});
|
|
448
438
|
|
|
449
|
-
const status = response?.status();
|
|
450
439
|
const metadata =
|
|
451
440
|
typeof status === 'number'
|
|
452
441
|
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
@@ -463,15 +452,13 @@ const crawlSitemap = async ({
|
|
|
463
452
|
},
|
|
464
453
|
maxRequestsPerCrawl: Infinity,
|
|
465
454
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
},
|
|
474
|
-
}),
|
|
455
|
+
autoscaledPoolOptions: {
|
|
456
|
+
minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
|
|
457
|
+
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
458
|
+
desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
|
|
459
|
+
scaleUpStepRatio: 0.99, // Scale up faster
|
|
460
|
+
scaleDownStepRatio: 0.1, // Scale down slower
|
|
461
|
+
},
|
|
475
462
|
}),
|
|
476
463
|
);
|
|
477
464
|
|
|
@@ -461,6 +461,37 @@ const scanApiScript = (
|
|
|
461
461
|
// Run axe-core + oobee custom checks
|
|
462
462
|
var scanResult = await window.runA11yScan(elementsToScan, '');
|
|
463
463
|
|
|
464
|
+
// Re-verify aria-hidden-focus violations against the live DOM to handle
|
|
465
|
+
// race conditions with JS that sets tabindex="-1" after aria-hidden
|
|
466
|
+
var axeViolations = scanResult.axeScanResults.violations || [];
|
|
467
|
+
var ariaHiddenViolation = axeViolations.find(function(v) { return v.id === 'aria-hidden-focus'; });
|
|
468
|
+
if (ariaHiddenViolation) {
|
|
469
|
+
await new Promise(function(resolve) { setTimeout(resolve, 0); });
|
|
470
|
+
ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(function(node) {
|
|
471
|
+
var selector = node.target && node.target[0];
|
|
472
|
+
if (typeof selector !== 'string') return true;
|
|
473
|
+
try {
|
|
474
|
+
var el = document.querySelector(selector);
|
|
475
|
+
if (!el) return true;
|
|
476
|
+
var focusables = el.querySelectorAll(
|
|
477
|
+
'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]'
|
|
478
|
+
);
|
|
479
|
+
if (focusables.length === 0) return false;
|
|
480
|
+
return Array.from(focusables).some(function(child) {
|
|
481
|
+
var tabindex = child.getAttribute('tabindex');
|
|
482
|
+
if (tabindex === null) return true;
|
|
483
|
+
var parsed = parseInt(tabindex, 10);
|
|
484
|
+
return isNaN(parsed) || parsed >= 0;
|
|
485
|
+
});
|
|
486
|
+
} catch (e) { return true; }
|
|
487
|
+
});
|
|
488
|
+
if (ariaHiddenViolation.nodes.length === 0) {
|
|
489
|
+
scanResult.axeScanResults.violations = axeViolations.filter(function(v) {
|
|
490
|
+
return v.id !== 'aria-hidden-focus';
|
|
491
|
+
});
|
|
492
|
+
}
|
|
493
|
+
}
|
|
494
|
+
|
|
464
495
|
// Convert raw axe results into oobee category structure
|
|
465
496
|
var filtered = _oobeeFilterAxeResults(scanResult.axeScanResults, scanResult.pageTitle);
|
|
466
497
|
|
package/src/index.ts
CHANGED
|
@@ -144,6 +144,9 @@ const sendWcagBreakdownToSentry = async (
|
|
|
144
144
|
...(process.env.OOBEE_SCAN_PRODUCT && {
|
|
145
145
|
scanProduct: process.env.OOBEE_SCAN_PRODUCT,
|
|
146
146
|
}),
|
|
147
|
+
...(process.env.OOBEE_TAGGED_WEBSITE && {
|
|
148
|
+
websiteTag: process.env.OOBEE_TAGGED_WEBSITE,
|
|
149
|
+
}),
|
|
147
150
|
},
|
|
148
151
|
user: {
|
|
149
152
|
...(scanInfo.email && scanInfo.name
|