@govtechsg/oobee 0.10.90 → 0.10.92

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36):
  1. package/AGENTS.md +289 -0
  2. package/README.md +3 -0
  3. package/dist/cli.js +3 -0
  4. package/dist/combine.js +14 -2
  5. package/dist/constants/cliFunctions.js +7 -0
  6. package/dist/constants/common.js +119 -70
  7. package/dist/constants/constants.js +1 -0
  8. package/dist/crawlers/commonCrawlerFunc.js +93 -15
  9. package/dist/crawlers/crawlDomain.js +45 -57
  10. package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
  11. package/dist/crawlers/crawlRateController.js +47 -0
  12. package/dist/crawlers/crawlSitemap.js +51 -62
  13. package/dist/generateOobeeClientScanner.js +31 -0
  14. package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
  15. package/dist/mergeAxeResults.js +121 -68
  16. package/dist/npmIndex.js +1 -0
  17. package/dist/utils.js +23 -28
  18. package/oobee-client-scanner.js +33 -2
  19. package/package.json +2 -2
  20. package/src/cli.ts +4 -0
  21. package/src/combine.ts +15 -1
  22. package/src/constants/cliFunctions.ts +7 -0
  23. package/src/constants/common.ts +131 -79
  24. package/src/constants/constants.ts +1 -0
  25. package/src/crawlers/commonCrawlerFunc.ts +103 -14
  26. package/src/crawlers/crawlDomain.ts +52 -65
  27. package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
  28. package/src/crawlers/crawlRateController.ts +63 -0
  29. package/src/crawlers/crawlSitemap.ts +57 -70
  30. package/src/generateOobeeClientScanner.ts +31 -0
  31. package/src/index.ts +1 -0
  32. package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
  33. package/src/mergeAxeResults.ts +141 -75
  34. package/src/npmIndex.ts +1 -0
  35. package/src/utils.ts +25 -33
  36. /package/{fb85adb0-5db6-4a09-8c80-05f030115004.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0
@@ -1,10 +1,9 @@
1
1
  import crawlee from 'crawlee';
2
- import * as path from 'path';
3
- import fsp from 'fs/promises';
4
- import { createCrawleeSubFolders, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
2
+ import { CrawlRateController } from './crawlRateController.js';
3
+ import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
5
4
  import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
6
5
  import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
7
- import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
6
+ import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
8
7
  import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
9
8
  import { consoleLogger, guiInfoLog } from '../logs.js';
10
9
  const isBlacklisted = (url, blacklistedPatterns) => {
@@ -258,9 +257,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
258
257
  // same-domain strategy) still contribute their <a> links above, but
259
258
  // clicking every interactive element on them is too slow and starves
260
259
  // the crawler of time to discover pages on the primary hostname.
261
- const currentHostname = new URL(page.url()).hostname;
262
- const seedHostname = new URL(url).hostname;
263
- if (currentHostname === seedHostname) {
260
+ if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
264
261
  // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
265
262
  try {
266
263
  await customEnqueueLinksByClickingElements(page, browserContext);
@@ -277,39 +274,27 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
277
274
  }
278
275
  };
279
276
  let isAbortingScanNow = false;
277
+ const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
280
278
  const crawler = register(new crawlee.PlaywrightCrawler({
281
279
  launchContext: {
282
280
  launcher: constants.launcher,
283
281
  launchOptions: getPlaywrightLaunchOptions(browser),
284
- // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
285
- ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
286
282
  },
287
283
  retryOnBlocked: true,
288
284
  browserPoolOptions: {
289
285
  useFingerprints: false,
290
286
  preLaunchHooks: [
287
+ getPreLaunchHook(userDataDirectory),
291
288
  async (_pageId, launchContext) => {
292
- const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
293
- // Ensure base exists
294
- await fsp.mkdir(baseDir, { recursive: true });
295
- // Create a unique subdir per browser
296
- const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
297
- await fsp.mkdir(subProfileDir, { recursive: true });
298
- // Assign to Crawlee's launcher
299
- // Crawlee preLaunchHooks expects launchContext to be mutated in-place.
300
- // eslint-disable-next-line no-param-reassign
301
- launchContext.userDataDir = subProfileDir;
302
- // Safely extend launchOptions
303
289
  // eslint-disable-next-line no-param-reassign
304
290
  launchContext.launchOptions = {
305
291
  ...launchContext.launchOptions,
306
292
  ignoreHTTPSErrors: true,
307
293
  ...playwrightDeviceDetailsObject,
294
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
308
295
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
309
296
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
310
297
  };
311
- // Optionally log for debugging
312
- // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
313
298
  },
314
299
  ],
315
300
  },
@@ -390,11 +375,9 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
390
375
  return;
391
376
  }
392
377
  const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
393
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
394
- if (hasExceededDuration) {
395
- console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
396
- durationExceeded = true;
397
- }
378
+ if (hasExceededDuration) {
379
+ console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
380
+ durationExceeded = true;
398
381
  isAbortingScanNow = true;
399
382
  activeCrawler.autoscaledPool.abort();
400
383
  return;
@@ -527,8 +510,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
527
510
  });
528
511
  return;
529
512
  }
530
- // One more check if scanned pages have reached limit due to multi-instances of handler running
531
- if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
513
+ if (rateController.claimSlot()) {
532
514
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
533
515
  numScanned: urlsCrawled.scanned.length,
534
516
  urlScanned: request.url,
@@ -538,6 +520,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
538
520
  pageTitle: results.pageTitle,
539
521
  actualUrl, // i.e. actualUrl
540
522
  });
523
+ rateController.onSuccess(crawler.autoscaledPool);
524
+ if (rateController.isLimitReached()) {
525
+ isAbortingScanNow = true;
526
+ activeCrawler.autoscaledPool.abort();
527
+ }
541
528
  scannedUrlSet.add(normUrl(request.url));
542
529
  scannedResolvedUrlSet.add(normUrl(actualUrl));
543
530
  urlsCrawled.scannedRedirects.push({
@@ -549,8 +536,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
549
536
  await dataset.pushData(results);
550
537
  }
551
538
  }
552
- else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
553
- // One more check if scanned pages have reached limit due to multi-instances of handler running
539
+ else if (rateController.claimSlot()) {
554
540
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
555
541
  numScanned: urlsCrawled.scanned.length,
556
542
  urlScanned: request.url,
@@ -560,6 +546,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
560
546
  actualUrl: request.url,
561
547
  pageTitle: results.pageTitle,
562
548
  });
549
+ rateController.onSuccess(crawler.autoscaledPool);
550
+ if (rateController.isLimitReached()) {
551
+ isAbortingScanNow = true;
552
+ activeCrawler.autoscaledPool.abort();
553
+ }
563
554
  scannedUrlSet.add(normUrl(request.url));
564
555
  scannedResolvedUrlSet.add(normUrl(request.url));
565
556
  await dataset.pushData(results);
@@ -611,30 +602,29 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
611
602
  }
612
603
  }
613
604
  catch {
614
- // Do nothing since the error will be pushed
615
- }
616
- // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
617
- // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
618
- if (!isAbortingScanNow) {
619
- guiInfoLog(guiInfoStatusTypes.ERROR, {
620
- numScanned: urlsCrawled.scanned.length,
621
- urlScanned: request.url,
622
- });
623
- urlsCrawled.error.push({
624
- url: request.url,
625
- pageTitle: request.url,
626
- actualUrl: request.url,
627
- metadata: STATUS_CODE_METADATA[2],
628
- });
605
+ // Recovery failed; Crawlee will retry the request automatically
629
606
  }
607
+ // Do not push to urlsCrawled.error here — Crawlee will retry the request
608
+ // (up to maxRequestRetries, default 3). If all retries are exhausted,
609
+ // failedRequestHandler will record the error. Pushing here causes
610
+ // duplicates and false positives for URLs that succeed on retry.
630
611
  }
631
612
  },
632
613
  failedRequestHandler: async ({ request, response }) => {
614
+ if (isAbortingScanNow) {
615
+ return;
616
+ }
617
+ const status = response?.status();
618
+ if (rateController.onFailure(status, crawler.autoscaledPool)) {
619
+ consoleLogger.info(`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`);
620
+ isAbortingScanNow = true;
621
+ crawler.autoscaledPool?.abort();
622
+ return;
623
+ }
633
624
  guiInfoLog(guiInfoStatusTypes.ERROR, {
634
625
  numScanned: urlsCrawled.scanned.length,
635
626
  urlScanned: request.url,
636
627
  });
637
- const status = response?.status();
638
628
  const metadata = typeof status === 'number'
639
629
  ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
640
630
  : STATUS_CODE_METADATA[2];
@@ -648,15 +638,13 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
648
638
  },
649
639
  maxRequestsPerCrawl: Infinity,
650
640
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
651
- ...(process.env.OOBEE_FAST_CRAWLER && {
652
- autoscaledPoolOptions: {
653
- minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
654
- maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
655
- desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
656
- scaleUpStepRatio: 0.99, // Scale up faster
657
- scaleDownStepRatio: 0.1, // Scale down slower
658
- },
659
- }),
641
+ autoscaledPoolOptions: {
642
+ minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
643
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
644
+ desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
645
+ scaleUpStepRatio: 0.99, // Scale up faster
646
+ scaleDownStepRatio: 0.1, // Scale down slower
647
+ },
660
648
  }));
661
649
  await crawler.run();
662
650
  // Additional passes: keep re-visiting scanned seed-hostname pages for
@@ -675,7 +663,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
675
663
  .map(item => item.actualUrl || item.url)
676
664
  .filter(pageUrl => {
677
665
  try {
678
- return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
666
+ return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
679
667
  }
680
668
  catch {
681
669
  return false;
@@ -3,7 +3,7 @@ import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/consta
3
3
  import { consoleLogger, guiInfoLog } from '../logs.js';
4
4
  import crawlDomain from './crawlDomain.js';
5
5
  import crawlSitemap from './crawlSitemap.js';
6
- import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
6
+ import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
7
7
  import { register } from '../utils.js';
8
8
  const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, safeMode, scanDuration) => {
9
9
  const startTime = Date.now(); // Track start time
@@ -15,6 +15,9 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
15
15
  let sitemapUrl;
16
16
  let durationExceeded = false;
17
17
  ({ dataset } = await createCrawleeSubFolders(randomToken));
18
+ // Initialise modified User-Agent early so sitemap discovery requests
19
+ // don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
20
+ await initModifiedUserAgent(browser);
18
21
  function getHomeUrl(parsedUrl) {
19
22
  const urlObject = new URL(parsedUrl);
20
23
  return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
@@ -30,6 +33,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
30
33
  context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
31
34
  ...launchOptions,
32
35
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
36
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
33
37
  });
34
38
  register(context);
35
39
  }
@@ -39,6 +43,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
39
43
  register(browserInstance);
40
44
  context = await browserInstance.newContext({
41
45
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
46
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
42
47
  });
43
48
  }
44
49
  const page = await context.newPage();
@@ -59,7 +64,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
59
64
  const checkUrlExists = async (page, parsedUrl) => {
60
65
  try {
61
66
  const response = await page.goto(parsedUrl);
62
- return response.ok();
67
+ return response?.ok() ?? false;
63
68
  }
64
69
  catch (e) {
65
70
  consoleLogger.error(e);
@@ -71,7 +76,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
71
76
  try {
72
77
  sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
73
78
  if (sitemapUrls.length > 0) {
74
- console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
79
+ consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
75
80
  sitemapExist = true;
76
81
  }
77
82
  }
@@ -91,7 +96,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
91
96
  }
92
97
  }
93
98
  if (!sitemapExist) {
94
- console.log('Unable to find sitemap. Commencing website crawl instead.');
99
+ consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
95
100
  return await crawlDomain({
96
101
  url,
97
102
  randomToken,
@@ -121,7 +126,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
121
126
  durationExceeded = true;
122
127
  break;
123
128
  }
124
- console.log(`Processing sitemap: ${currentSitemapUrl}`);
129
+ consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
125
130
  urlsCrawledFinal = await crawlSitemap({
126
131
  sitemapUrl: currentSitemapUrl,
127
132
  randomToken,
@@ -149,7 +154,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
149
154
  const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
150
155
  const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
151
156
  if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
152
- console.log(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
157
+ consoleLogger.info(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
153
158
  urlsCrawledFinal = await crawlDomain({
154
159
  url,
155
160
  randomToken,
@@ -173,7 +178,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
173
178
  });
174
179
  }
175
180
  else if (!hasDurationRemaining) {
176
- console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
181
+ consoleLogger.info(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
177
182
  durationExceeded = true;
178
183
  }
179
184
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
@@ -0,0 +1,47 @@
1
+ import { consoleLogger } from '../logs.js';
2
/**
 * Tracks how many pages a crawl has scanned and reacts to HTTP failures by
 * throttling the crawler's autoscaled pool.
 *
 * Responsibilities:
 *  - hand out "scan slots" up to the page limit (`claimSlot` / `isLimitReached`);
 *  - halve the pool's concurrency on every HTTP error response (`onFailure`);
 *  - step the concurrency back up after a run of successes (`onSuccess`);
 *  - signal when too many HTTP failures happened in a row so the caller can abort.
 *
 * The `pool` arguments are expected to expose numeric `maxConcurrency`,
 * `minConcurrency` and `desiredConcurrency` properties (as Crawlee's
 * AutoscaledPool does); missing properties are simply left untouched.
 */
export class CrawlRateController {
  /**
   * @param {number} maxRequestsPerCrawl - Maximum number of pages that may claim a scan slot.
   * @param {number} maxConcurrency - Concurrency ceiling the pool may recover back to.
   */
  constructor(maxRequestsPerCrawl, maxConcurrency) {
    this.scannedCount = 0;
    this.consecutiveFailures = 0;
    this.consecutiveSuccesses = 0;
    this.maxPages = maxRequestsPerCrawl;
    // Only accept a positive threshold. A negative or non-numeric value would
    // otherwise make the very first HTTP failure abort the whole crawl.
    const configuredThreshold = Number(process.env.OOBEE_CONSECUTIVE_MAX_RETRIES);
    this.maxConsecutiveFailures = configuredThreshold > 0 ? configuredThreshold : 100;
    this.originalMaxConcurrency = maxConcurrency;
    // Remembered the first time throttling has to lower the pool's minimum,
    // so recovery can raise it again.
    this.originalMinConcurrency = undefined;
  }

  /**
   * Reserves one scan slot if the page limit has not been reached yet.
   *
   * @returns {boolean} `true` when a slot was reserved, `false` when the limit is already hit.
   */
  claimSlot() {
    if (this.scannedCount >= this.maxPages) {
      return false;
    }
    this.scannedCount++;
    return true;
  }

  /**
   * Records a successful scan and, every RECOVERY_INTERVAL consecutive
   * successes, raises the pool's concurrency ceiling by 2 (never above the
   * original maximum).
   *
   * @param {object} [pool] - Autoscaled pool to recover; may be undefined.
   */
  onSuccess(pool) {
    this.consecutiveFailures = 0;
    this.consecutiveSuccesses++;
    if (pool && this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0) {
      if (pool.maxConcurrency < this.originalMaxConcurrency) {
        pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
        if (this.originalMinConcurrency !== undefined) {
          // Restore the floor that throttling lowered, but never above the current ceiling.
          pool.minConcurrency = Math.min(this.originalMinConcurrency, pool.maxConcurrency);
        }
        consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
      }
    }
  }

  /**
   * Records a failed request. Only failures that carry an HTTP status >= 400
   * count; anything else (no response, status < 400) is ignored.
   *
   * @param {number|undefined} httpStatus - Status code of the failed response, if any.
   * @param {object} [pool] - Autoscaled pool to throttle; may be undefined.
   * @returns {boolean} `true` when the consecutive-failure threshold has been reached
   *   and the caller should abort the crawl.
   */
  onFailure(httpStatus, pool) {
    if (typeof httpStatus !== 'number' || httpStatus < 400) {
      return false;
    }
    this.consecutiveSuccesses = 0;
    this.consecutiveFailures++;
    if (pool && pool.maxConcurrency > 1) {
      const reduced = Math.max(1, Math.floor(pool.maxConcurrency / 2));
      pool.maxConcurrency = reduced;
      // The pool starts tasks against desiredConcurrency and never scales
      // below minConcurrency, so lowering the ceiling alone does not slow
      // anything down. Clamp both so the throttle takes effect immediately.
      if (pool.minConcurrency > reduced) {
        if (this.originalMinConcurrency === undefined) {
          this.originalMinConcurrency = pool.minConcurrency;
        }
        pool.minConcurrency = reduced;
      }
      if (pool.desiredConcurrency > reduced) {
        pool.desiredConcurrency = reduced;
      }
      consoleLogger.info(`Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`);
    }
    return this.consecutiveFailures >= this.maxConsecutiveFailures;
  }

  /**
   * @returns {boolean} `true` once every scan slot has been claimed.
   */
  isLimitReached() {
    return this.scannedCount >= this.maxPages;
  }
}
// Number of consecutive successes between each concurrency recovery step.
// Assigned after the class body to match the rest of this compiled output.
CrawlRateController.RECOVERY_INTERVAL = 10;
@@ -1,18 +1,18 @@
1
1
  import crawlee, { EnqueueStrategy, RequestList } from 'crawlee';
2
- import * as path from 'path';
3
- import fsp from 'fs/promises';
4
- import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
2
+ import { CrawlRateController } from './crawlRateController.js';
3
+ import { createCrawleeSubFolders, getPreLaunchHook, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
5
4
  import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, disallowedListOfPatterns, FileTypes, } from '../constants/constants.js';
6
5
  import { getLinksFromSitemap, getPlaywrightLaunchOptions, isSkippedUrl, waitForPageLoaded, isFilePath, } from '../constants/common.js';
7
6
  import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
8
7
  import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
9
- import { guiInfoLog } from '../logs.js';
8
+ import { consoleLogger, guiInfoLog } from '../logs.js';
10
9
  const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = '', scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
11
10
  const crawlStartTime = Date.now();
12
11
  let dataset;
13
12
  let urlsCrawled;
14
13
  let durationExceeded = false;
15
14
  let isAbortingScan = false;
15
+ const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
16
16
  if (fromCrawlIntelligentSitemap) {
17
17
  dataset = datasetFromIntelligent;
18
18
  urlsCrawled = urlsCrawledFromIntelligent;
@@ -40,31 +40,20 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
40
40
  launchContext: {
41
41
  launcher: constants.launcher,
42
42
  launchOptions: getPlaywrightLaunchOptions(browser),
43
- // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
44
- ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
45
43
  },
46
44
  retryOnBlocked: true,
47
45
  browserPoolOptions: {
48
46
  useFingerprints: false,
49
47
  preLaunchHooks: [
48
+ getPreLaunchHook(userDataDirectory),
50
49
  async (_pageId, launchContext) => {
51
- const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
52
- // Ensure base exists
53
- await fsp.mkdir(baseDir, { recursive: true });
54
- // Create a unique subdir per browser
55
- const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
56
- await fsp.mkdir(subProfileDir, { recursive: true });
57
- // Assign to Crawlee's launcher
58
- launchContext.userDataDir = subProfileDir;
59
- // Safely extend launchOptions
60
50
  launchContext.launchOptions = {
61
51
  ...launchContext.launchOptions,
62
52
  ignoreHTTPSErrors: true,
63
53
  ...playwrightDeviceDetailsObject,
54
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
64
55
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
65
56
  };
66
- // Optionally log for debugging
67
- // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
68
57
  },
69
58
  ],
70
59
  },
@@ -149,13 +138,11 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
149
138
  await waitForPageLoaded(page, 10000);
150
139
  const actualUrl = page.url() || request.loadedUrl || request.url;
151
140
  const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
152
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
141
+ if (hasExceededDuration) {
142
+ consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
143
+ durationExceeded = true;
153
144
  isAbortingScan = true;
154
- if (hasExceededDuration) {
155
- console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
156
- durationExceeded = true;
157
- }
158
- crawler.autoscaledPool.abort(); // stops new requests
145
+ crawler.autoscaledPool.abort();
159
146
  return;
160
147
  }
161
148
  if (request.skipNavigation && actualUrl === 'about:blank') {
@@ -245,22 +232,29 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
245
232
  catch (_) {
246
233
  // Page/context was destroyed during navigation — handled by outer catch
247
234
  }
248
- guiInfoLog(guiInfoStatusTypes.SCANNED, {
249
- numScanned: urlsCrawled.scanned.length,
250
- urlScanned: request.url,
251
- });
252
- urlsCrawled.scanned.push({
253
- url: request.url,
254
- pageTitle: results.pageTitle,
255
- actualUrl, // i.e. actualUrl
256
- });
257
- urlsCrawled.scannedRedirects.push({
258
- fromUrl: request.url,
259
- toUrl: actualUrl,
260
- });
261
- results.url = request.url;
262
- results.actualUrl = actualUrl;
263
- await dataset.pushData(results);
235
+ if (rateController.claimSlot()) {
236
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
237
+ numScanned: urlsCrawled.scanned.length,
238
+ urlScanned: request.url,
239
+ });
240
+ urlsCrawled.scanned.push({
241
+ url: request.url,
242
+ pageTitle: results.pageTitle,
243
+ actualUrl, // i.e. actualUrl
244
+ });
245
+ rateController.onSuccess(crawler.autoscaledPool);
246
+ if (rateController.isLimitReached()) {
247
+ isAbortingScan = true;
248
+ crawler.autoscaledPool.abort();
249
+ }
250
+ urlsCrawled.scannedRedirects.push({
251
+ fromUrl: request.url,
252
+ toUrl: actualUrl,
253
+ });
254
+ results.url = request.url;
255
+ results.actualUrl = actualUrl;
256
+ await dataset.pushData(results);
257
+ }
264
258
  }
265
259
  else {
266
260
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
@@ -284,30 +278,27 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
284
278
  }
285
279
  }
286
280
  catch (e) {
287
- if (!isAbortingScan) {
288
- guiInfoLog(guiInfoStatusTypes.ERROR, {
289
- numScanned: urlsCrawled.scanned.length,
290
- urlScanned: request.url,
291
- });
292
- urlsCrawled.error.push({
293
- url: request.url,
294
- pageTitle: request.url,
295
- actualUrl: request.url,
296
- metadata: STATUS_CODE_METADATA[2],
297
- httpStatusCode: 0,
298
- });
299
- }
281
+ // Do not push to urlsCrawled.error here — Crawlee will retry the request
282
+ // (up to maxRequestRetries, default 3). If all retries are exhausted,
283
+ // failedRequestHandler will record the error. Pushing here causes
284
+ // duplicates and false positives for URLs that succeed on retry.
300
285
  }
301
286
  },
302
287
  failedRequestHandler: async ({ request, response, error }) => {
303
288
  if (isAbortingScan) {
304
289
  return;
305
290
  }
291
+ const status = response?.status();
292
+ if (rateController.onFailure(status, crawler.autoscaledPool)) {
293
+ consoleLogger.info(`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`);
294
+ isAbortingScan = true;
295
+ crawler.autoscaledPool?.abort();
296
+ return;
297
+ }
306
298
  guiInfoLog(guiInfoStatusTypes.ERROR, {
307
299
  numScanned: urlsCrawled.scanned.length,
308
300
  urlScanned: request.url,
309
301
  });
310
- const status = response?.status();
311
302
  const metadata = typeof status === 'number'
312
303
  ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
313
304
  : STATUS_CODE_METADATA[2];
@@ -322,15 +313,13 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
322
313
  },
323
314
  maxRequestsPerCrawl: Infinity,
324
315
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
325
- ...(process.env.OOBEE_FAST_CRAWLER && {
326
- autoscaledPoolOptions: {
327
- minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
328
- maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
329
- desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
330
- scaleUpStepRatio: 0.99, // Scale up faster
331
- scaleDownStepRatio: 0.1, // Scale down slower
332
- },
333
- }),
316
+ autoscaledPoolOptions: {
317
+ minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
318
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
319
+ desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
320
+ scaleUpStepRatio: 0.99, // Scale up faster
321
+ scaleDownStepRatio: 0.1, // Scale down slower
322
+ },
334
323
  }));
335
324
  await crawler.run();
336
325
  await requestList.isFinished();
@@ -444,6 +444,37 @@ const scanApiScript = (shortDescMap, longDescMap, stepByStepMap) => `
444
444
  // Run axe-core + oobee custom checks
445
445
  var scanResult = await window.runA11yScan(elementsToScan, '');
446
446
 
447
+ // Re-verify aria-hidden-focus violations against the live DOM to handle
448
+ // race conditions with JS that sets tabindex="-1" after aria-hidden
449
+ var axeViolations = scanResult.axeScanResults.violations || [];
450
+ var ariaHiddenViolation = axeViolations.find(function(v) { return v.id === 'aria-hidden-focus'; });
451
+ if (ariaHiddenViolation) {
452
+ await new Promise(function(resolve) { setTimeout(resolve, 0); });
453
+ ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(function(node) {
454
+ var selector = node.target && node.target[0];
455
+ if (typeof selector !== 'string') return true;
456
+ try {
457
+ var el = document.querySelector(selector);
458
+ if (!el) return true;
459
+ var focusables = el.querySelectorAll(
460
+ 'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]'
461
+ );
462
+ if (focusables.length === 0) return false;
463
+ return Array.from(focusables).some(function(child) {
464
+ var tabindex = child.getAttribute('tabindex');
465
+ if (tabindex === null) return true;
466
+ var parsed = parseInt(tabindex, 10);
467
+ return isNaN(parsed) || parsed >= 0;
468
+ });
469
+ } catch (e) { return true; }
470
+ });
471
+ if (ariaHiddenViolation.nodes.length === 0) {
472
+ scanResult.axeScanResults.violations = axeViolations.filter(function(v) {
473
+ return v.id !== 'aria-hidden-focus';
474
+ });
475
+ }
476
+ }
477
+
447
478
  // Convert raw axe results into oobee category structure
448
479
  var filtered = _oobeeFilterAxeResults(scanResult.axeScanResults, scanResult.pageTitle);
449
480
 
@@ -114,6 +114,9 @@ const sendWcagBreakdownToSentry = async (appVersion, wcagBreakdown, ruleIdJson,
114
114
  ...(process.env.OOBEE_SCAN_PRODUCT && {
115
115
  scanProduct: process.env.OOBEE_SCAN_PRODUCT,
116
116
  }),
117
+ ...(process.env.OOBEE_TAGGED_WEBSITE && {
118
+ websiteTag: process.env.OOBEE_TAGGED_WEBSITE,
119
+ }),
117
120
  },
118
121
  user: {
119
122
  ...(scanInfo.email && scanInfo.name