@govtechsg/oobee 0.10.91 → 0.10.92

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/AGENTS.md +289 -0
  2. package/README.md +3 -0
  3. package/dist/cli.js +3 -0
  4. package/dist/combine.js +14 -2
  5. package/dist/constants/cliFunctions.js +7 -0
  6. package/dist/constants/common.js +119 -70
  7. package/dist/constants/constants.js +1 -0
  8. package/dist/crawlers/commonCrawlerFunc.js +93 -15
  9. package/dist/crawlers/crawlDomain.js +45 -57
  10. package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
  11. package/dist/crawlers/crawlRateController.js +47 -0
  12. package/dist/crawlers/crawlSitemap.js +51 -62
  13. package/dist/generateOobeeClientScanner.js +31 -0
  14. package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
  15. package/dist/mergeAxeResults.js +120 -92
  16. package/dist/npmIndex.js +1 -0
  17. package/dist/utils.js +23 -28
  18. package/oobee-client-scanner.js +33 -2
  19. package/package.json +2 -2
  20. package/src/cli.ts +4 -0
  21. package/src/combine.ts +15 -1
  22. package/src/constants/cliFunctions.ts +7 -0
  23. package/src/constants/common.ts +131 -79
  24. package/src/constants/constants.ts +1 -0
  25. package/src/crawlers/commonCrawlerFunc.ts +103 -14
  26. package/src/crawlers/crawlDomain.ts +52 -65
  27. package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
  28. package/src/crawlers/crawlRateController.ts +63 -0
  29. package/src/crawlers/crawlSitemap.ts +57 -70
  30. package/src/generateOobeeClientScanner.ts +31 -0
  31. package/src/index.ts +1 -0
  32. package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
  33. package/src/mergeAxeResults.ts +139 -99
  34. package/src/npmIndex.ts +1 -0
  35. package/src/utils.ts +25 -33
  36. /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0
@@ -1,10 +1,10 @@
1
1
  import crawlee, { EnqueueStrategy } from 'crawlee';
2
+ import { CrawlRateController } from './crawlRateController.js';
2
3
  import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
3
4
  import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
4
- import * as path from 'path';
5
- import fsp from 'fs/promises';
6
5
  import {
7
6
  createCrawleeSubFolders,
7
+ getPreLaunchHook,
8
8
  runAxeScript,
9
9
  isUrlPdf,
10
10
  shouldSkipClickDueToDisallowedHref,
@@ -29,7 +29,7 @@ import {
29
29
  getUrlsFromRobotsTxt,
30
30
  waitForPageLoaded,
31
31
  } from '../constants/common.js';
32
- import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
32
+ import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
33
33
  import {
34
34
  handlePdfDownload,
35
35
  runPdfScan,
@@ -364,9 +364,7 @@ const crawlDomain = async ({
364
364
  // same-domain strategy) still contribute their <a> links above, but
365
365
  // clicking every interactive element on them is too slow and starves
366
366
  // the crawler of time to discover pages on the primary hostname.
367
- const currentHostname = new URL(page.url()).hostname;
368
- const seedHostname = new URL(url).hostname;
369
- if (currentHostname === seedHostname) {
367
+ if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
370
368
  // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
371
369
  try {
372
370
  await customEnqueueLinksByClickingElements(page, browserContext);
@@ -382,49 +380,32 @@ const crawlDomain = async ({
382
380
  };
383
381
 
384
382
  let isAbortingScanNow = false;
383
+ const rateController = new CrawlRateController(
384
+ maxRequestsPerCrawl,
385
+ specifiedMaxConcurrency || constants.maxConcurrency,
386
+ );
385
387
 
386
388
  const crawler = register(
387
389
  new crawlee.PlaywrightCrawler({
388
390
  launchContext: {
389
391
  launcher: constants.launcher,
390
392
  launchOptions: getPlaywrightLaunchOptions(browser),
391
- // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
392
- ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
393
393
  },
394
394
  retryOnBlocked: true,
395
395
  browserPoolOptions: {
396
396
  useFingerprints: false,
397
397
  preLaunchHooks: [
398
+ getPreLaunchHook(userDataDirectory),
398
399
  async (_pageId, launchContext) => {
399
- const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
400
-
401
- // Ensure base exists
402
- await fsp.mkdir(baseDir, { recursive: true });
403
-
404
- // Create a unique subdir per browser
405
- const subProfileDir = path.join(
406
- baseDir,
407
- `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
408
- );
409
- await fsp.mkdir(subProfileDir, { recursive: true });
410
-
411
- // Assign to Crawlee's launcher
412
- // Crawlee preLaunchHooks expects launchContext to be mutated in-place.
413
- // eslint-disable-next-line no-param-reassign
414
- launchContext.userDataDir = subProfileDir;
415
-
416
- // Safely extend launchOptions
417
400
  // eslint-disable-next-line no-param-reassign
418
401
  launchContext.launchOptions = {
419
402
  ...launchContext.launchOptions,
420
403
  ignoreHTTPSErrors: true,
421
404
  ...playwrightDeviceDetailsObject,
405
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
422
406
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
423
407
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
424
408
  };
425
-
426
- // Optionally log for debugging
427
- // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
428
409
  },
429
410
  ],
430
411
  },
@@ -527,11 +508,9 @@ const crawlDomain = async ({
527
508
  const hasExceededDuration =
528
509
  scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
529
510
 
530
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
531
- if (hasExceededDuration) {
532
- console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
533
- durationExceeded = true;
534
- }
511
+ if (hasExceededDuration) {
512
+ console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
513
+ durationExceeded = true;
535
514
  isAbortingScanNow = true;
536
515
  activeCrawler.autoscaledPool.abort();
537
516
  return;
@@ -691,8 +670,7 @@ const crawlDomain = async ({
691
670
  return;
692
671
  }
693
672
 
694
- // One more check if scanned pages have reached limit due to multi-instances of handler running
695
- if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
673
+ if (rateController.claimSlot()) {
696
674
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
697
675
  numScanned: urlsCrawled.scanned.length,
698
676
  urlScanned: request.url,
@@ -703,6 +681,11 @@ const crawlDomain = async ({
703
681
  pageTitle: results.pageTitle,
704
682
  actualUrl, // i.e. actualUrl
705
683
  });
684
+ rateController.onSuccess(crawler.autoscaledPool);
685
+ if (rateController.isLimitReached()) {
686
+ isAbortingScanNow = true;
687
+ activeCrawler.autoscaledPool.abort();
688
+ }
706
689
  scannedUrlSet.add(normUrl(request.url));
707
690
  scannedResolvedUrlSet.add(normUrl(actualUrl));
708
691
 
@@ -715,8 +698,7 @@ const crawlDomain = async ({
715
698
  results.actualUrl = actualUrl;
716
699
  await dataset.pushData(results);
717
700
  }
718
- } else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
719
- // One more check if scanned pages have reached limit due to multi-instances of handler running
701
+ } else if (rateController.claimSlot()) {
720
702
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
721
703
  numScanned: urlsCrawled.scanned.length,
722
704
  urlScanned: request.url,
@@ -726,6 +708,11 @@ const crawlDomain = async ({
726
708
  actualUrl: request.url,
727
709
  pageTitle: results.pageTitle,
728
710
  });
711
+ rateController.onSuccess(crawler.autoscaledPool);
712
+ if (rateController.isLimitReached()) {
713
+ isAbortingScanNow = true;
714
+ activeCrawler.autoscaledPool.abort();
715
+ }
729
716
  scannedUrlSet.add(normUrl(request.url));
730
717
  scannedResolvedUrlSet.add(normUrl(request.url));
731
718
  await dataset.pushData(results);
@@ -777,33 +764,35 @@ const crawlDomain = async ({
777
764
  });
778
765
  }
779
766
  } catch {
780
- // Do nothing since the error will be pushed
767
+ // Recovery failed; Crawlee will retry the request automatically
781
768
  }
782
769
 
783
- // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
784
- // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
785
- if (!isAbortingScanNow) {
786
- guiInfoLog(guiInfoStatusTypes.ERROR, {
787
- numScanned: urlsCrawled.scanned.length,
788
- urlScanned: request.url,
789
- });
790
-
791
- urlsCrawled.error.push({
792
- url: request.url,
793
- pageTitle: request.url,
794
- actualUrl: request.url,
795
- metadata: STATUS_CODE_METADATA[2],
796
- });
797
- }
770
+ // Do not push to urlsCrawled.error here — Crawlee will retry the request
771
+ // (up to maxRequestRetries, default 3). If all retries are exhausted,
772
+ // failedRequestHandler will record the error. Pushing here causes
773
+ // duplicates and false positives for URLs that succeed on retry.
798
774
  }
799
775
  },
800
776
  failedRequestHandler: async ({ request, response }) => {
777
+ if (isAbortingScanNow) {
778
+ return;
779
+ }
780
+
781
+ const status = response?.status();
782
+ if (rateController.onFailure(status, crawler.autoscaledPool)) {
783
+ consoleLogger.info(
784
+ `Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
785
+ );
786
+ isAbortingScanNow = true;
787
+ crawler.autoscaledPool?.abort();
788
+ return;
789
+ }
790
+
801
791
  guiInfoLog(guiInfoStatusTypes.ERROR, {
802
792
  numScanned: urlsCrawled.scanned.length,
803
793
  urlScanned: request.url,
804
794
  });
805
795
 
806
- const status = response?.status();
807
796
  const metadata =
808
797
  typeof status === 'number'
809
798
  ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
@@ -819,15 +808,13 @@ const crawlDomain = async ({
819
808
  },
820
809
  maxRequestsPerCrawl: Infinity,
821
810
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
822
- ...(process.env.OOBEE_FAST_CRAWLER && {
823
- autoscaledPoolOptions: {
824
- minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
825
- maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
826
- desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
827
- scaleUpStepRatio: 0.99, // Scale up faster
828
- scaleDownStepRatio: 0.1, // Scale down slower
829
- },
830
- }),
811
+ autoscaledPoolOptions: {
812
+ minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
813
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
814
+ desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
815
+ scaleUpStepRatio: 0.99, // Scale up faster
816
+ scaleDownStepRatio: 0.1, // Scale down slower
817
+ },
831
818
  }),
832
819
  );
833
820
 
@@ -850,7 +837,7 @@ const crawlDomain = async ({
850
837
  .map(item => item.actualUrl || item.url)
851
838
  .filter(pageUrl => {
852
839
  try {
853
- return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
840
+ return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
854
841
  } catch {
855
842
  return false;
856
843
  }
@@ -7,7 +7,7 @@ import { consoleLogger, guiInfoLog } from '../logs.js';
7
7
  import crawlDomain from './crawlDomain.js';
8
8
  import crawlSitemap from './crawlSitemap.js';
9
9
  import { ViewportSettingsClass } from '../combine.js';
10
- import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
10
+ import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
11
11
  import { register } from '../utils.js';
12
12
 
13
13
  const crawlIntelligentSitemap = async (
@@ -40,6 +40,10 @@ const crawlIntelligentSitemap = async (
40
40
 
41
41
  ({ dataset } = await createCrawleeSubFolders(randomToken));
42
42
 
43
+ // Initialise modified User-Agent early so sitemap discovery requests
44
+ // don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
45
+ await initModifiedUserAgent(browser);
46
+
43
47
  function getHomeUrl(parsedUrl: string) {
44
48
  const urlObject = new URL(parsedUrl);
45
49
  return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
@@ -62,6 +66,7 @@ const crawlIntelligentSitemap = async (
62
66
  context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
63
67
  ...launchOptions,
64
68
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
69
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
65
70
  });
66
71
  register(context);
67
72
  } else {
@@ -70,6 +75,7 @@ const crawlIntelligentSitemap = async (
70
75
  register(browserInstance as unknown as { close: () => Promise<void> });
71
76
  context = await browserInstance.newContext({
72
77
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
78
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
73
79
  });
74
80
  }
75
81
 
@@ -93,7 +99,7 @@ const crawlIntelligentSitemap = async (
93
99
  const checkUrlExists = async (page: Page, parsedUrl: string) => {
94
100
  try {
95
101
  const response = await page.goto(parsedUrl);
96
- return response.ok();
102
+ return response?.ok() ?? false;
97
103
  } catch (e) {
98
104
  consoleLogger.error(e);
99
105
  return false;
@@ -105,7 +111,7 @@ const crawlIntelligentSitemap = async (
105
111
  try {
106
112
  sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
107
113
  if (sitemapUrls.length > 0) {
108
- console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
114
+ consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
109
115
  sitemapExist = true;
110
116
  }
111
117
  } catch (error) {
@@ -125,7 +131,7 @@ const crawlIntelligentSitemap = async (
125
131
  }
126
132
 
127
133
  if (!sitemapExist) {
128
- console.log('Unable to find sitemap. Commencing website crawl instead.');
134
+ consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
129
135
  return await crawlDomain({
130
136
  url,
131
137
  randomToken,
@@ -157,7 +163,7 @@ const crawlIntelligentSitemap = async (
157
163
  break;
158
164
  }
159
165
 
160
- console.log(`Processing sitemap: ${currentSitemapUrl}`);
166
+ consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
161
167
  urlsCrawledFinal = await crawlSitemap({
162
168
  sitemapUrl: currentSitemapUrl,
163
169
  randomToken,
@@ -187,7 +193,7 @@ const crawlIntelligentSitemap = async (
187
193
  const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
188
194
 
189
195
  if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
190
- console.log(
196
+ consoleLogger.info(
191
197
  `Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`,
192
198
  );
193
199
  urlsCrawledFinal = await crawlDomain({
@@ -212,7 +218,7 @@ const crawlIntelligentSitemap = async (
212
218
  scanDuration: remainingScanDuration,
213
219
  });
214
220
  } else if (!hasDurationRemaining) {
215
- console.log(
221
+ consoleLogger.info(
216
222
  `Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`,
217
223
  );
218
224
  durationExceeded = true;
@@ -0,0 +1,63 @@
1
+ import { consoleLogger } from '../logs.js';
2
+
3
/**
 * Minimal view of the autoscaled pool that the controller adjusts.
 * `desiredConcurrency` is optional so callers that only expose
 * `maxConcurrency` keep working unchanged.
 */
type ConcurrencyPool = { maxConcurrency: number; desiredConcurrency?: number };

/**
 * Shared bookkeeping for a crawl: enforces the page budget, counts
 * consecutive HTTP failures, and halves / gradually restores the pool's
 * concurrency ceiling in response to failures and successes.
 */
export class CrawlRateController {
  /** Number of consecutive successes between each concurrency recovery step. */
  private static readonly RECOVERY_INTERVAL = 10;
  /** Failure threshold used when OOBEE_CONSECUTIVE_MAX_RETRIES is unset or invalid. */
  private static readonly DEFAULT_MAX_CONSECUTIVE_FAILURES = 100;

  private scannedCount = 0;
  private readonly maxPages: number;
  private consecutiveFailures = 0;
  private consecutiveSuccesses = 0;
  private readonly maxConsecutiveFailures: number;
  private readonly originalMaxConcurrency: number;

  /**
   * @param maxRequestsPerCrawl Maximum number of pages that may claim a slot.
   * @param maxConcurrency Concurrency ceiling that recovery will never exceed.
   */
  constructor(maxRequestsPerCrawl: number, maxConcurrency: number) {
    this.maxPages = maxRequestsPerCrawl;
    this.originalMaxConcurrency = maxConcurrency;

    // Unset, non-numeric, zero and negative values all fall back to the default;
    // a negative threshold would otherwise abort the crawl on the first failure.
    const configured = Number(process.env.OOBEE_CONSECUTIVE_MAX_RETRIES);
    this.maxConsecutiveFailures =
      configured > 0 ? configured : CrawlRateController.DEFAULT_MAX_CONSECUTIVE_FAILURES;
  }

  /**
   * Reserves one page from the budget.
   * @returns `true` if a slot was reserved, `false` if the budget is used up.
   */
  claimSlot(): boolean {
    if (this.scannedCount >= this.maxPages) {
      return false;
    }
    this.scannedCount++;
    return true;
  }

  /**
   * Records a successful page. Resets the failure streak and, every
   * RECOVERY_INTERVAL consecutive successes, raises the pool's
   * `maxConcurrency` by 2 up to the original ceiling.
   * @param pool Pool whose `maxConcurrency` may be raised; omitted → counters only.
   */
  onSuccess(pool?: ConcurrencyPool): void {
    this.consecutiveFailures = 0;
    this.consecutiveSuccesses++;

    if (pool && this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0) {
      if (pool.maxConcurrency < this.originalMaxConcurrency) {
        // Only the ceiling is raised; desiredConcurrency is left for the pool to grow itself.
        pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
        consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
      }
    }
  }

  /**
   * Records a failed request. Failures without an HTTP status, or with a
   * status below 400, are ignored. Otherwise the success streak is reset,
   * the failure streak grows, and the pool's concurrency is halved (min 1).
   * @param httpStatus HTTP status of the failed response, if any.
   * @param pool Pool to throttle; omitted → counters only.
   * @returns `true` once the consecutive-failure threshold has been reached.
   */
  onFailure(httpStatus: number | undefined, pool?: ConcurrencyPool): boolean {
    if (typeof httpStatus !== 'number' || httpStatus < 400) {
      return false;
    }

    this.consecutiveSuccesses = 0;
    this.consecutiveFailures++;

    if (pool && pool.maxConcurrency > 1) {
      const reduced = Math.max(1, Math.floor(pool.maxConcurrency / 2));
      pool.maxConcurrency = reduced;
      // NOTE(review): Crawlee's AutoscaledPool appears to gate new tasks on
      // desiredConcurrency and use maxConcurrency only as the scale-up ceiling,
      // so clamp the desired value too, otherwise the throttle has no immediate effect.
      if (typeof pool.desiredConcurrency === 'number' && pool.desiredConcurrency > reduced) {
        pool.desiredConcurrency = reduced;
      }
      consoleLogger.info(
        `Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`,
      );
    }

    return this.consecutiveFailures >= this.maxConsecutiveFailures;
  }

  /** @returns `true` when every slot in the page budget has been claimed. */
  isLimitReached(): boolean {
    return this.scannedCount >= this.maxPages;
  }
}
@@ -1,9 +1,9 @@
1
1
  import crawlee, { EnqueueStrategy, LaunchContext, Request, RequestList, Dataset } from 'crawlee';
2
+ import { CrawlRateController } from './crawlRateController.js';
2
3
  import fs from 'fs';
3
- import * as path from 'path';
4
- import fsp from 'fs/promises';
5
4
  import {
6
5
  createCrawleeSubFolders,
6
+ getPreLaunchHook,
7
7
  preNavigationHooks,
8
8
  runAxeScript,
9
9
  isUrlPdf,
@@ -30,7 +30,7 @@ import {
30
30
  mapPdfScanResults,
31
31
  doPdfScreenshots,
32
32
  } from './pdfScanFunc.js';
33
- import { guiInfoLog } from '../logs.js';
33
+ import { consoleLogger, guiInfoLog } from '../logs.js';
34
34
  import { ViewportSettingsClass } from '../combine.js';
35
35
 
36
36
  const crawlSitemap = async ({
@@ -81,6 +81,10 @@ const crawlSitemap = async ({
81
81
  let urlsCrawled: UrlsCrawled;
82
82
  let durationExceeded = false;
83
83
  let isAbortingScan = false;
84
+ const rateController = new CrawlRateController(
85
+ maxRequestsPerCrawl,
86
+ specifiedMaxConcurrency || constants.maxConcurrency,
87
+ );
84
88
 
85
89
  if (fromCrawlIntelligentSitemap) {
86
90
  dataset = datasetFromIntelligent;
@@ -125,39 +129,20 @@ const crawlSitemap = async ({
125
129
  launchContext: {
126
130
  launcher: constants.launcher,
127
131
  launchOptions: getPlaywrightLaunchOptions(browser),
128
- // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
129
- ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
130
132
  },
131
133
  retryOnBlocked: true,
132
134
  browserPoolOptions: {
133
135
  useFingerprints: false,
134
136
  preLaunchHooks: [
137
+ getPreLaunchHook(userDataDirectory),
135
138
  async (_pageId, launchContext) => {
136
- const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
137
-
138
- // Ensure base exists
139
- await fsp.mkdir(baseDir, { recursive: true });
140
-
141
- // Create a unique subdir per browser
142
- const subProfileDir = path.join(
143
- baseDir,
144
- `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
145
- );
146
- await fsp.mkdir(subProfileDir, { recursive: true });
147
-
148
- // Assign to Crawlee's launcher
149
- launchContext.userDataDir = subProfileDir;
150
-
151
- // Safely extend launchOptions
152
139
  launchContext.launchOptions = {
153
140
  ...launchContext.launchOptions,
154
141
  ignoreHTTPSErrors: true,
155
142
  ...playwrightDeviceDetailsObject,
143
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
156
144
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
157
145
  };
158
-
159
- // Optionally log for debugging
160
- // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
161
146
  },
162
147
  ],
163
148
  },
@@ -259,13 +244,11 @@ const crawlSitemap = async ({
259
244
  const hasExceededDuration =
260
245
  scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
261
246
 
262
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
247
+ if (hasExceededDuration) {
248
+ consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
249
+ durationExceeded = true;
263
250
  isAbortingScan = true;
264
- if (hasExceededDuration) {
265
- console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
266
- durationExceeded = true;
267
- }
268
- crawler.autoscaledPool.abort(); // stops new requests
251
+ crawler.autoscaledPool.abort();
269
252
  return;
270
253
  }
271
254
 
@@ -376,26 +359,33 @@ const crawlSitemap = async ({
376
359
  // Page/context was destroyed during navigation — handled by outer catch
377
360
  }
378
361
 
379
- guiInfoLog(guiInfoStatusTypes.SCANNED, {
380
- numScanned: urlsCrawled.scanned.length,
381
- urlScanned: request.url,
382
- });
362
+ if (rateController.claimSlot()) {
363
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
364
+ numScanned: urlsCrawled.scanned.length,
365
+ urlScanned: request.url,
366
+ });
383
367
 
384
- urlsCrawled.scanned.push({
385
- url: request.url,
386
- pageTitle: results.pageTitle,
387
- actualUrl, // i.e. actualUrl
388
- });
368
+ urlsCrawled.scanned.push({
369
+ url: request.url,
370
+ pageTitle: results.pageTitle,
371
+ actualUrl, // i.e. actualUrl
372
+ });
373
+ rateController.onSuccess(crawler.autoscaledPool);
374
+ if (rateController.isLimitReached()) {
375
+ isAbortingScan = true;
376
+ crawler.autoscaledPool.abort();
377
+ }
389
378
 
390
- urlsCrawled.scannedRedirects.push({
391
- fromUrl: request.url,
392
- toUrl: actualUrl,
393
- });
379
+ urlsCrawled.scannedRedirects.push({
380
+ fromUrl: request.url,
381
+ toUrl: actualUrl,
382
+ });
394
383
 
395
- results.url = request.url;
396
- results.actualUrl = actualUrl;
384
+ results.url = request.url;
385
+ results.actualUrl = actualUrl;
397
386
 
398
- await dataset.pushData(results);
387
+ await dataset.pushData(results);
388
+ }
399
389
  } else {
400
390
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
401
391
  numScanned: urlsCrawled.scanned.length,
@@ -420,20 +410,10 @@ const crawlSitemap = async ({
420
410
  }
421
411
  }
422
412
  } catch (e) {
423
- if (!isAbortingScan) {
424
- guiInfoLog(guiInfoStatusTypes.ERROR, {
425
- numScanned: urlsCrawled.scanned.length,
426
- urlScanned: request.url,
427
- });
428
-
429
- urlsCrawled.error.push({
430
- url: request.url,
431
- pageTitle: request.url,
432
- actualUrl: request.url,
433
- metadata: STATUS_CODE_METADATA[2],
434
- httpStatusCode: 0,
435
- });
436
- }
413
+ // Do not push to urlsCrawled.error here — Crawlee will retry the request
414
+ // (up to maxRequestRetries, default 3). If all retries are exhausted,
415
+ // failedRequestHandler will record the error. Pushing here causes
416
+ // duplicates and false positives for URLs that succeed on retry.
437
417
  }
438
418
  },
439
419
  failedRequestHandler: async ({ request, response, error }) => {
@@ -441,12 +421,21 @@ const crawlSitemap = async ({
441
421
  return;
442
422
  }
443
423
 
424
+ const status = response?.status();
425
+ if (rateController.onFailure(status, crawler.autoscaledPool)) {
426
+ consoleLogger.info(
427
+ `Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
428
+ );
429
+ isAbortingScan = true;
430
+ crawler.autoscaledPool?.abort();
431
+ return;
432
+ }
433
+
444
434
  guiInfoLog(guiInfoStatusTypes.ERROR, {
445
435
  numScanned: urlsCrawled.scanned.length,
446
436
  urlScanned: request.url,
447
437
  });
448
438
 
449
- const status = response?.status();
450
439
  const metadata =
451
440
  typeof status === 'number'
452
441
  ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
@@ -463,15 +452,13 @@ const crawlSitemap = async ({
463
452
  },
464
453
  maxRequestsPerCrawl: Infinity,
465
454
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
466
- ...(process.env.OOBEE_FAST_CRAWLER && {
467
- autoscaledPoolOptions: {
468
- minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
469
- maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
470
- desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
471
- scaleUpStepRatio: 0.99, // Scale up faster
472
- scaleDownStepRatio: 0.1, // Scale down slower
473
- },
474
- }),
455
+ autoscaledPoolOptions: {
456
+ minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
457
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
458
+ desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
459
+ scaleUpStepRatio: 0.99, // Scale up faster
460
+ scaleDownStepRatio: 0.1, // Scale down slower
461
+ },
475
462
  }),
476
463
  );
477
464
 
@@ -461,6 +461,37 @@ const scanApiScript = (
461
461
  // Run axe-core + oobee custom checks
462
462
  var scanResult = await window.runA11yScan(elementsToScan, '');
463
463
 
464
+ // Re-verify aria-hidden-focus violations against the live DOM to handle
465
+ // race conditions with JS that sets tabindex="-1" after aria-hidden
466
+ var axeViolations = scanResult.axeScanResults.violations || [];
467
+ var ariaHiddenViolation = axeViolations.find(function(v) { return v.id === 'aria-hidden-focus'; });
468
+ if (ariaHiddenViolation) {
469
+ await new Promise(function(resolve) { setTimeout(resolve, 0); });
470
+ ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(function(node) {
471
+ var selector = node.target && node.target[0];
472
+ if (typeof selector !== 'string') return true;
473
+ try {
474
+ var el = document.querySelector(selector);
475
+ if (!el) return true;
476
+ var focusables = el.querySelectorAll(
477
+ 'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]'
478
+ );
479
+ if (focusables.length === 0) return false;
480
+ return Array.from(focusables).some(function(child) {
481
+ var tabindex = child.getAttribute('tabindex');
482
+ if (tabindex === null) return true;
483
+ var parsed = parseInt(tabindex, 10);
484
+ return isNaN(parsed) || parsed >= 0;
485
+ });
486
+ } catch (e) { return true; }
487
+ });
488
+ if (ariaHiddenViolation.nodes.length === 0) {
489
+ scanResult.axeScanResults.violations = axeViolations.filter(function(v) {
490
+ return v.id !== 'aria-hidden-focus';
491
+ });
492
+ }
493
+ }
494
+
464
495
  // Convert raw axe results into oobee category structure
465
496
  var filtered = _oobeeFilterAxeResults(scanResult.axeScanResults, scanResult.pageTitle);
466
497
 
package/src/index.ts CHANGED
@@ -52,6 +52,7 @@ export type Answers = {
52
52
  ruleset: RuleFlags[];
53
53
  generateJsonFiles: boolean;
54
54
  scanDuration?: number;
55
+ websiteTag?: string;
55
56
  };
56
57
 
57
58
  export type Data = {
@@ -144,6 +144,9 @@ const sendWcagBreakdownToSentry = async (
144
144
  ...(process.env.OOBEE_SCAN_PRODUCT && {
145
145
  scanProduct: process.env.OOBEE_SCAN_PRODUCT,
146
146
  }),
147
+ ...(process.env.OOBEE_TAGGED_WEBSITE && {
148
+ websiteTag: process.env.OOBEE_TAGGED_WEBSITE,
149
+ }),
147
150
  },
148
151
  user: {
149
152
  ...(scanInfo.email && scanInfo.name