@govtechsg/oobee 0.10.91 → 0.10.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40):
  1. package/AGENTS.md +303 -0
  2. package/README.md +22 -0
  3. package/dist/cli.js +3 -0
  4. package/dist/combine.js +15 -3
  5. package/dist/constants/cliFunctions.js +7 -0
  6. package/dist/constants/common.js +149 -80
  7. package/dist/constants/constants.js +1 -0
  8. package/dist/crawlers/commonCrawlerFunc.js +136 -15
  9. package/dist/crawlers/crawlDomain.js +55 -58
  10. package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
  11. package/dist/crawlers/crawlRateController.js +47 -0
  12. package/dist/crawlers/crawlSitemap.js +51 -62
  13. package/dist/crawlers/runCustom.js +8 -2
  14. package/dist/generateOobeeClientScanner.js +32 -1
  15. package/dist/mergeAxeResults/itemsStore.js +32 -3
  16. package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
  17. package/dist/mergeAxeResults.js +120 -92
  18. package/dist/npmIndex.js +1 -0
  19. package/dist/utils.js +23 -28
  20. package/oobee-client-scanner.js +35 -4
  21. package/package.json +3 -3
  22. package/src/cli.ts +4 -0
  23. package/src/combine.ts +16 -1
  24. package/src/constants/cliFunctions.ts +7 -0
  25. package/src/constants/common.ts +162 -90
  26. package/src/constants/constants.ts +1 -0
  27. package/src/crawlers/commonCrawlerFunc.ts +148 -14
  28. package/src/crawlers/crawlDomain.ts +64 -66
  29. package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
  30. package/src/crawlers/crawlRateController.ts +63 -0
  31. package/src/crawlers/crawlSitemap.ts +57 -70
  32. package/src/crawlers/runCustom.ts +10 -1
  33. package/src/generateOobeeClientScanner.ts +32 -1
  34. package/src/index.ts +1 -0
  35. package/src/mergeAxeResults/itemsStore.ts +37 -3
  36. package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
  37. package/src/mergeAxeResults.ts +139 -99
  38. package/src/npmIndex.ts +1 -0
  39. package/src/utils.ts +25 -33
  40. /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0
@@ -1,10 +1,9 @@
1
1
  import crawlee from 'crawlee';
2
- import * as path from 'path';
3
- import fsp from 'fs/promises';
4
- import { createCrawleeSubFolders, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
2
+ import { CrawlRateController } from './crawlRateController.js';
3
+ import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, splitAuthHeaders, } from './commonCrawlerFunc.js';
5
4
  import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
6
5
  import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
7
- import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
6
+ import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
8
7
  import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
9
8
  import { consoleLogger, guiInfoLog } from '../logs.js';
10
9
  const isBlacklisted = (url, blacklistedPatterns) => {
@@ -258,9 +257,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
258
257
  // same-domain strategy) still contribute their <a> links above, but
259
258
  // clicking every interactive element on them is too slow and starves
260
259
  // the crawler of time to discover pages on the primary hostname.
261
- const currentHostname = new URL(page.url()).hostname;
262
- const seedHostname = new URL(url).hostname;
263
- if (currentHostname === seedHostname) {
260
+ if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
264
261
  // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
265
262
  try {
266
263
  await customEnqueueLinksByClickingElements(page, browserContext);
@@ -277,43 +274,40 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
277
274
  }
278
275
  };
279
276
  let isAbortingScanNow = false;
277
+ const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
278
+ const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
280
279
  const crawler = register(new crawlee.PlaywrightCrawler({
281
280
  launchContext: {
282
281
  launcher: constants.launcher,
283
282
  launchOptions: getPlaywrightLaunchOptions(browser),
284
- // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
285
- ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
286
283
  },
287
284
  retryOnBlocked: true,
288
285
  browserPoolOptions: {
289
286
  useFingerprints: false,
290
287
  preLaunchHooks: [
288
+ getPreLaunchHook(userDataDirectory),
291
289
  async (_pageId, launchContext) => {
292
- const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
293
- // Ensure base exists
294
- await fsp.mkdir(baseDir, { recursive: true });
295
- // Create a unique subdir per browser
296
- const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
297
- await fsp.mkdir(subProfileDir, { recursive: true });
298
- // Assign to Crawlee's launcher
299
- // Crawlee preLaunchHooks expects launchContext to be mutated in-place.
300
- // eslint-disable-next-line no-param-reassign
301
- launchContext.userDataDir = subProfileDir;
302
- // Safely extend launchOptions
303
290
  // eslint-disable-next-line no-param-reassign
304
291
  launchContext.launchOptions = {
305
292
  ...launchContext.launchOptions,
306
293
  ignoreHTTPSErrors: true,
307
294
  ...playwrightDeviceDetailsObject,
295
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
308
296
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
309
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
297
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
298
+ ...(httpCredentials && { httpCredentials }),
310
299
  };
311
- // Optionally log for debugging
312
- // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
313
300
  },
314
301
  ],
315
302
  },
316
303
  requestQueue,
304
+ preNavigationHooks: [
305
+ async (crawlingContext) => {
306
+ if (extraHTTPHeaders) {
307
+ crawlingContext.request.headers = extraHTTPHeaders;
308
+ }
309
+ },
310
+ ],
317
311
  postNavigationHooks: [
318
312
  async (crawlingContext) => {
319
313
  const { page, request } = crawlingContext;
@@ -390,11 +384,9 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
390
384
  return;
391
385
  }
392
386
  const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
393
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
394
- if (hasExceededDuration) {
395
- console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
396
- durationExceeded = true;
397
- }
387
+ if (hasExceededDuration) {
388
+ console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
389
+ durationExceeded = true;
398
390
  isAbortingScanNow = true;
399
391
  activeCrawler.autoscaledPool.abort();
400
392
  return;
@@ -527,8 +519,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
527
519
  });
528
520
  return;
529
521
  }
530
- // One more check if scanned pages have reached limit due to multi-instances of handler running
531
- if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
522
+ if (rateController.claimSlot()) {
532
523
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
533
524
  numScanned: urlsCrawled.scanned.length,
534
525
  urlScanned: request.url,
@@ -538,6 +529,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
538
529
  pageTitle: results.pageTitle,
539
530
  actualUrl, // i.e. actualUrl
540
531
  });
532
+ rateController.onSuccess(crawler.autoscaledPool);
533
+ if (rateController.isLimitReached()) {
534
+ isAbortingScanNow = true;
535
+ activeCrawler.autoscaledPool.abort();
536
+ }
541
537
  scannedUrlSet.add(normUrl(request.url));
542
538
  scannedResolvedUrlSet.add(normUrl(actualUrl));
543
539
  urlsCrawled.scannedRedirects.push({
@@ -549,8 +545,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
549
545
  await dataset.pushData(results);
550
546
  }
551
547
  }
552
- else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
553
- // One more check if scanned pages have reached limit due to multi-instances of handler running
548
+ else if (rateController.claimSlot()) {
554
549
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
555
550
  numScanned: urlsCrawled.scanned.length,
556
551
  urlScanned: request.url,
@@ -560,6 +555,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
560
555
  actualUrl: request.url,
561
556
  pageTitle: results.pageTitle,
562
557
  });
558
+ rateController.onSuccess(crawler.autoscaledPool);
559
+ if (rateController.isLimitReached()) {
560
+ isAbortingScanNow = true;
561
+ activeCrawler.autoscaledPool.abort();
562
+ }
563
563
  scannedUrlSet.add(normUrl(request.url));
564
564
  scannedResolvedUrlSet.add(normUrl(request.url));
565
565
  await dataset.pushData(results);
@@ -611,30 +611,29 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
611
611
  }
612
612
  }
613
613
  catch {
614
- // Do nothing since the error will be pushed
615
- }
616
- // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
617
- // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
618
- if (!isAbortingScanNow) {
619
- guiInfoLog(guiInfoStatusTypes.ERROR, {
620
- numScanned: urlsCrawled.scanned.length,
621
- urlScanned: request.url,
622
- });
623
- urlsCrawled.error.push({
624
- url: request.url,
625
- pageTitle: request.url,
626
- actualUrl: request.url,
627
- metadata: STATUS_CODE_METADATA[2],
628
- });
614
+ // Recovery failed; Crawlee will retry the request automatically
629
615
  }
616
+ // Do not push to urlsCrawled.error here — Crawlee will retry the request
617
+ // (up to maxRequestRetries, default 3). If all retries are exhausted,
618
+ // failedRequestHandler will record the error. Pushing here causes
619
+ // duplicates and false positives for URLs that succeed on retry.
630
620
  }
631
621
  },
632
622
  failedRequestHandler: async ({ request, response }) => {
623
+ if (isAbortingScanNow) {
624
+ return;
625
+ }
626
+ const status = response?.status();
627
+ if (rateController.onFailure(status, crawler.autoscaledPool)) {
628
+ consoleLogger.info(`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`);
629
+ isAbortingScanNow = true;
630
+ crawler.autoscaledPool?.abort();
631
+ return;
632
+ }
633
633
  guiInfoLog(guiInfoStatusTypes.ERROR, {
634
634
  numScanned: urlsCrawled.scanned.length,
635
635
  urlScanned: request.url,
636
636
  });
637
- const status = response?.status();
638
637
  const metadata = typeof status === 'number'
639
638
  ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
640
639
  : STATUS_CODE_METADATA[2];
@@ -648,15 +647,13 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
648
647
  },
649
648
  maxRequestsPerCrawl: Infinity,
650
649
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
651
- ...(process.env.OOBEE_FAST_CRAWLER && {
652
- autoscaledPoolOptions: {
653
- minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
654
- maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
655
- desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
656
- scaleUpStepRatio: 0.99, // Scale up faster
657
- scaleDownStepRatio: 0.1, // Scale down slower
658
- },
659
- }),
650
+ autoscaledPoolOptions: {
651
+ minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
652
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
653
+ desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
654
+ scaleUpStepRatio: 0.99, // Scale up faster
655
+ scaleDownStepRatio: 0.1, // Scale down slower
656
+ },
660
657
  }));
661
658
  await crawler.run();
662
659
  // Additional passes: keep re-visiting scanned seed-hostname pages for
@@ -675,7 +672,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
675
672
  .map(item => item.actualUrl || item.url)
676
673
  .filter(pageUrl => {
677
674
  try {
678
- return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
675
+ return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
679
676
  }
680
677
  catch {
681
678
  return false;
@@ -1,9 +1,9 @@
1
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
1
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
2
2
  import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
3
3
  import { consoleLogger, guiInfoLog } from '../logs.js';
4
4
  import crawlDomain from './crawlDomain.js';
5
5
  import crawlSitemap from './crawlSitemap.js';
6
- import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
6
+ import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
7
7
  import { register } from '../utils.js';
8
8
  const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, safeMode, scanDuration) => {
9
9
  const startTime = Date.now(); // Track start time
@@ -15,6 +15,9 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
15
15
  let sitemapUrl;
16
16
  let durationExceeded = false;
17
17
  ({ dataset } = await createCrawleeSubFolders(randomToken));
18
+ // Initialise modified User-Agent early so sitemap discovery requests
19
+ // don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
20
+ await initModifiedUserAgent(browser);
18
21
  function getHomeUrl(parsedUrl) {
19
22
  const urlObject = new URL(parsedUrl);
20
23
  return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
@@ -23,24 +26,31 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
23
26
  const homeUrl = getHomeUrl(link);
24
27
  let sitemapLink = '';
25
28
  const launchOptions = getPlaywrightLaunchOptions(browser);
29
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
26
30
  let context;
27
31
  let browserInstance;
28
32
  if (process.env.CRAWLEE_HEADLESS === '1') {
29
33
  const effectiveUserDataDirectory = userDataDirectory || '';
30
34
  context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
31
35
  ...launchOptions,
32
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
36
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
37
+ ...(httpCredentials && { httpCredentials }),
38
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
33
39
  });
34
40
  register(context);
35
41
  }
36
42
  else {
37
- // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
38
43
  browserInstance = await constants.launcher.launch(launchOptions);
39
44
  register(browserInstance);
40
45
  context = await browserInstance.newContext({
41
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
46
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
47
+ ...(httpCredentials && { httpCredentials }),
48
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
42
49
  });
43
50
  }
51
+ if (authHeader) {
52
+ await addAuthRouteHandler(context, link, authHeader);
53
+ }
44
54
  const page = await context.newPage();
45
55
  for (const path of sitemapPaths) {
46
56
  sitemapLink = homeUrl + path;
@@ -59,7 +69,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
59
69
  const checkUrlExists = async (page, parsedUrl) => {
60
70
  try {
61
71
  const response = await page.goto(parsedUrl);
62
- return response.ok();
72
+ return response?.ok() ?? false;
63
73
  }
64
74
  catch (e) {
65
75
  consoleLogger.error(e);
@@ -71,7 +81,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
71
81
  try {
72
82
  sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
73
83
  if (sitemapUrls.length > 0) {
74
- console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
84
+ consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
75
85
  sitemapExist = true;
76
86
  }
77
87
  }
@@ -91,7 +101,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
91
101
  }
92
102
  }
93
103
  if (!sitemapExist) {
94
- console.log('Unable to find sitemap. Commencing website crawl instead.');
104
+ consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
95
105
  return await crawlDomain({
96
106
  url,
97
107
  randomToken,
@@ -121,7 +131,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
121
131
  durationExceeded = true;
122
132
  break;
123
133
  }
124
- console.log(`Processing sitemap: ${currentSitemapUrl}`);
134
+ consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
125
135
  urlsCrawledFinal = await crawlSitemap({
126
136
  sitemapUrl: currentSitemapUrl,
127
137
  randomToken,
@@ -149,7 +159,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
149
159
  const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
150
160
  const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
151
161
  if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
152
- console.log(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
162
+ consoleLogger.info(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
153
163
  urlsCrawledFinal = await crawlDomain({
154
164
  url,
155
165
  randomToken,
@@ -173,7 +183,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
173
183
  });
174
184
  }
175
185
  else if (!hasDurationRemaining) {
176
- console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
186
+ consoleLogger.info(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
177
187
  durationExceeded = true;
178
188
  }
179
189
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
@@ -0,0 +1,47 @@
1
+ import { consoleLogger } from '../logs.js';
2
/**
 * Bookkeeping for a single crawl: counts claimed page slots against a page
 * budget, tracks consecutive HTTP failures/successes, and adjusts the
 * `maxConcurrency` of the pool object handed to it in response.
 *
 * The abort threshold for consecutive HTTP failures is read once, at
 * construction time, from the OOBEE_CONSECUTIVE_MAX_RETRIES environment
 * variable (default 100).
 */
export class CrawlRateController {
    /**
     * @param {number} maxRequestsPerCrawl - Page budget; `claimSlot()` stops
     *   handing out slots once this many have been claimed.
     * @param {number} maxConcurrency - Upper bound that `onSuccess()` will
     *   restore the pool's `maxConcurrency` towards after it has been reduced.
     */
    constructor(maxRequestsPerCrawl, maxConcurrency) {
        this.scannedCount = 0;
        this.consecutiveFailures = 0;
        this.consecutiveSuccesses = 0;
        this.maxPages = maxRequestsPerCrawl;
        // Only a positive number is a usable threshold. Unset, non-numeric,
        // zero and negative values fall back to the default; a negative
        // threshold would otherwise make the very first HTTP failure
        // satisfy `consecutiveFailures >= maxConsecutiveFailures`.
        const configuredThreshold = Number(process.env.OOBEE_CONSECUTIVE_MAX_RETRIES);
        this.maxConsecutiveFailures = configuredThreshold > 0
            ? configuredThreshold
            : CrawlRateController.DEFAULT_MAX_CONSECUTIVE_FAILURES;
        this.originalMaxConcurrency = maxConcurrency;
    }

    /**
     * Reserves one page slot if the budget has not been used up.
     *
     * @returns {boolean} `true` when a slot was claimed (the counter is
     *   incremented), `false` when the budget is already exhausted.
     */
    claimSlot() {
        if (this.scannedCount >= this.maxPages) {
            return false;
        }
        this.scannedCount++;
        return true;
    }

    /**
     * Records a successful page. Resets the failure streak and, on every
     * RECOVERY_INTERVAL-th consecutive success, raises the pool's
     * `maxConcurrency` by 2, capped at the original maximum.
     *
     * @param {{ maxConcurrency: number } | null | undefined} pool - Object
     *   whose `maxConcurrency` is adjusted; skipped when falsy.
     */
    onSuccess(pool) {
        this.consecutiveFailures = 0;
        this.consecutiveSuccesses++;
        if (pool && this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0) {
            if (pool.maxConcurrency < this.originalMaxConcurrency) {
                pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
                consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
            }
        }
    }

    /**
     * Records a failed request. Failures without a numeric HTTP status, or
     * with a status below 400, are ignored entirely (streaks are untouched).
     * Otherwise the success streak is reset, the failure streak grows, and
     * the pool's `maxConcurrency` is halved (never below 1).
     *
     * NOTE(review): every status >= 400 (including e.g. 404) is handled as a
     * rate-limit signal here — confirm this is intended.
     *
     * @param {number | undefined} httpStatus - HTTP status of the failed response, if any.
     * @param {{ maxConcurrency: number } | null | undefined} pool - Object
     *   whose `maxConcurrency` is adjusted; skipped when falsy.
     * @returns {boolean} `true` when the consecutive-failure threshold has
     *   been reached and the caller should abort the crawl.
     */
    onFailure(httpStatus, pool) {
        if (typeof httpStatus !== 'number' || httpStatus < 400) {
            return false;
        }
        this.consecutiveSuccesses = 0;
        this.consecutiveFailures++;
        if (pool && pool.maxConcurrency > 1) {
            pool.maxConcurrency = Math.max(1, Math.floor(pool.maxConcurrency / 2));
            consoleLogger.info(`Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`);
        }
        return this.consecutiveFailures >= this.maxConsecutiveFailures;
    }

    /**
     * @returns {boolean} `true` once the number of claimed slots has reached the page budget.
     */
    isLimitReached() {
        return this.scannedCount >= this.maxPages;
    }
}
// Number of consecutive successes between each concurrency recovery step.
CrawlRateController.RECOVERY_INTERVAL = 10;
// Abort threshold used when OOBEE_CONSECUTIVE_MAX_RETRIES is not a positive number.
CrawlRateController.DEFAULT_MAX_CONSECUTIVE_FAILURES = 100;
@@ -1,18 +1,18 @@
1
1
  import crawlee, { EnqueueStrategy, RequestList } from 'crawlee';
2
- import * as path from 'path';
3
- import fsp from 'fs/promises';
4
- import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
2
+ import { CrawlRateController } from './crawlRateController.js';
3
+ import { createCrawleeSubFolders, getPreLaunchHook, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
5
4
  import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, disallowedListOfPatterns, FileTypes, } from '../constants/constants.js';
6
5
  import { getLinksFromSitemap, getPlaywrightLaunchOptions, isSkippedUrl, waitForPageLoaded, isFilePath, } from '../constants/common.js';
7
6
  import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
8
7
  import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
9
- import { guiInfoLog } from '../logs.js';
8
+ import { consoleLogger, guiInfoLog } from '../logs.js';
10
9
  const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = '', scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
11
10
  const crawlStartTime = Date.now();
12
11
  let dataset;
13
12
  let urlsCrawled;
14
13
  let durationExceeded = false;
15
14
  let isAbortingScan = false;
15
+ const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
16
16
  if (fromCrawlIntelligentSitemap) {
17
17
  dataset = datasetFromIntelligent;
18
18
  urlsCrawled = urlsCrawledFromIntelligent;
@@ -40,31 +40,20 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
40
40
  launchContext: {
41
41
  launcher: constants.launcher,
42
42
  launchOptions: getPlaywrightLaunchOptions(browser),
43
- // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
44
- ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
45
43
  },
46
44
  retryOnBlocked: true,
47
45
  browserPoolOptions: {
48
46
  useFingerprints: false,
49
47
  preLaunchHooks: [
48
+ getPreLaunchHook(userDataDirectory),
50
49
  async (_pageId, launchContext) => {
51
- const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
52
- // Ensure base exists
53
- await fsp.mkdir(baseDir, { recursive: true });
54
- // Create a unique subdir per browser
55
- const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
56
- await fsp.mkdir(subProfileDir, { recursive: true });
57
- // Assign to Crawlee's launcher
58
- launchContext.userDataDir = subProfileDir;
59
- // Safely extend launchOptions
60
50
  launchContext.launchOptions = {
61
51
  ...launchContext.launchOptions,
62
52
  ignoreHTTPSErrors: true,
63
53
  ...playwrightDeviceDetailsObject,
54
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
64
55
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
65
56
  };
66
- // Optionally log for debugging
67
- // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
68
57
  },
69
58
  ],
70
59
  },
@@ -149,13 +138,11 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
149
138
  await waitForPageLoaded(page, 10000);
150
139
  const actualUrl = page.url() || request.loadedUrl || request.url;
151
140
  const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
152
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
141
+ if (hasExceededDuration) {
142
+ consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
143
+ durationExceeded = true;
153
144
  isAbortingScan = true;
154
- if (hasExceededDuration) {
155
- console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
156
- durationExceeded = true;
157
- }
158
- crawler.autoscaledPool.abort(); // stops new requests
145
+ crawler.autoscaledPool.abort();
159
146
  return;
160
147
  }
161
148
  if (request.skipNavigation && actualUrl === 'about:blank') {
@@ -245,22 +232,29 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
245
232
  catch (_) {
246
233
  // Page/context was destroyed during navigation — handled by outer catch
247
234
  }
248
- guiInfoLog(guiInfoStatusTypes.SCANNED, {
249
- numScanned: urlsCrawled.scanned.length,
250
- urlScanned: request.url,
251
- });
252
- urlsCrawled.scanned.push({
253
- url: request.url,
254
- pageTitle: results.pageTitle,
255
- actualUrl, // i.e. actualUrl
256
- });
257
- urlsCrawled.scannedRedirects.push({
258
- fromUrl: request.url,
259
- toUrl: actualUrl,
260
- });
261
- results.url = request.url;
262
- results.actualUrl = actualUrl;
263
- await dataset.pushData(results);
235
+ if (rateController.claimSlot()) {
236
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
237
+ numScanned: urlsCrawled.scanned.length,
238
+ urlScanned: request.url,
239
+ });
240
+ urlsCrawled.scanned.push({
241
+ url: request.url,
242
+ pageTitle: results.pageTitle,
243
+ actualUrl, // i.e. actualUrl
244
+ });
245
+ rateController.onSuccess(crawler.autoscaledPool);
246
+ if (rateController.isLimitReached()) {
247
+ isAbortingScan = true;
248
+ crawler.autoscaledPool.abort();
249
+ }
250
+ urlsCrawled.scannedRedirects.push({
251
+ fromUrl: request.url,
252
+ toUrl: actualUrl,
253
+ });
254
+ results.url = request.url;
255
+ results.actualUrl = actualUrl;
256
+ await dataset.pushData(results);
257
+ }
264
258
  }
265
259
  else {
266
260
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
@@ -284,30 +278,27 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
284
278
  }
285
279
  }
286
280
  catch (e) {
287
- if (!isAbortingScan) {
288
- guiInfoLog(guiInfoStatusTypes.ERROR, {
289
- numScanned: urlsCrawled.scanned.length,
290
- urlScanned: request.url,
291
- });
292
- urlsCrawled.error.push({
293
- url: request.url,
294
- pageTitle: request.url,
295
- actualUrl: request.url,
296
- metadata: STATUS_CODE_METADATA[2],
297
- httpStatusCode: 0,
298
- });
299
- }
281
+ // Do not push to urlsCrawled.error here — Crawlee will retry the request
282
+ // (up to maxRequestRetries, default 3). If all retries are exhausted,
283
+ // failedRequestHandler will record the error. Pushing here causes
284
+ // duplicates and false positives for URLs that succeed on retry.
300
285
  }
301
286
  },
302
287
  failedRequestHandler: async ({ request, response, error }) => {
303
288
  if (isAbortingScan) {
304
289
  return;
305
290
  }
291
+ const status = response?.status();
292
+ if (rateController.onFailure(status, crawler.autoscaledPool)) {
293
+ consoleLogger.info(`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`);
294
+ isAbortingScan = true;
295
+ crawler.autoscaledPool?.abort();
296
+ return;
297
+ }
306
298
  guiInfoLog(guiInfoStatusTypes.ERROR, {
307
299
  numScanned: urlsCrawled.scanned.length,
308
300
  urlScanned: request.url,
309
301
  });
310
- const status = response?.status();
311
302
  const metadata = typeof status === 'number'
312
303
  ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
313
304
  : STATUS_CODE_METADATA[2];
@@ -322,15 +313,13 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
322
313
  },
323
314
  maxRequestsPerCrawl: Infinity,
324
315
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
325
- ...(process.env.OOBEE_FAST_CRAWLER && {
326
- autoscaledPoolOptions: {
327
- minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
328
- maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
329
- desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
330
- scaleUpStepRatio: 0.99, // Scale up faster
331
- scaleDownStepRatio: 0.1, // Scale down slower
332
- },
333
- }),
316
+ autoscaledPoolOptions: {
317
+ minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
318
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
319
+ desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
320
+ scaleUpStepRatio: 0.99, // Scale up faster
321
+ scaleDownStepRatio: 0.1, // Scale down slower
322
+ },
334
323
  }));
335
324
  await crawler.run();
336
325
  await requestList.isFinished();
@@ -1,5 +1,5 @@
1
1
  /* eslint-env browser */
2
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
2
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
3
3
  import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
4
4
  import constants, { getIntermediateScreenshotsPath, guiInfoStatusTypes, } from '../constants/constants.js';
5
5
  import { initNewPage, log } from './custom/utils.js';
@@ -18,7 +18,7 @@ export class ProcessPageParams {
18
18
  this.randomToken = randomToken;
19
19
  }
20
20
  }
21
- const runCustom = async (url, randomToken, browserToRun, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, initialCustomFlowLabel) => {
21
+ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, initialCustomFlowLabel, extraHTTPHeaders) => {
22
22
  // checks and delete datasets path if it already exists
23
23
  process.env.CRAWLEE_STORAGE_DIR = randomToken;
24
24
  const urlsCrawled = { ...constants.urlsCrawledObj };
@@ -47,6 +47,7 @@ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, view
47
47
  ...baseArgs.filter(a => !a.startsWith('--window-size') && a !== '--start-maximized'),
48
48
  ...customArgs,
49
49
  ];
50
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
50
51
  const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
51
52
  ...baseLaunchOptions,
52
53
  args: mergedArgs,
@@ -56,7 +57,12 @@ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, view
56
57
  viewport: null,
57
58
  ...(hasCustomViewport ? contextDeviceOptions : {}),
58
59
  userAgent: process.env.OOBEE_USER_AGENT || deviceUserAgent,
60
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
61
+ ...(httpCredentials && { httpCredentials }),
59
62
  });
63
+ if (authHeader) {
64
+ await addAuthRouteHandler(context, url, authHeader);
65
+ }
60
66
  register(context);
61
67
  processPageParams.stopAll = async () => {
62
68
  try {
@@ -51,7 +51,7 @@ const SENTRY_NODE_VERSION = (() => {
51
51
  return _require('@sentry/node/package.json').version;
52
52
  }
53
53
  catch {
54
- return '9.47.1'; // safe fallback matching currently installed version
54
+ return '10.58.0'; // safe fallback matching currently installed version
55
55
  }
56
56
  })();
57
57
  // ---------------------------------------------------------------------------
@@ -444,6 +444,37 @@ const scanApiScript = (shortDescMap, longDescMap, stepByStepMap) => `
444
444
  // Run axe-core + oobee custom checks
445
445
  var scanResult = await window.runA11yScan(elementsToScan, '');
446
446
 
447
+ // Re-verify aria-hidden-focus violations against the live DOM to handle
448
+ // race conditions with JS that sets tabindex="-1" after aria-hidden
449
+ var axeViolations = scanResult.axeScanResults.violations || [];
450
+ var ariaHiddenViolation = axeViolations.find(function(v) { return v.id === 'aria-hidden-focus'; });
451
+ if (ariaHiddenViolation) {
452
+ await new Promise(function(resolve) { setTimeout(resolve, 0); });
453
+ ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(function(node) {
454
+ var selector = node.target && node.target[0];
455
+ if (typeof selector !== 'string') return true;
456
+ try {
457
+ var el = document.querySelector(selector);
458
+ if (!el) return true;
459
+ var focusables = el.querySelectorAll(
460
+ 'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]'
461
+ );
462
+ if (focusables.length === 0) return false;
463
+ return Array.from(focusables).some(function(child) {
464
+ var tabindex = child.getAttribute('tabindex');
465
+ if (tabindex === null) return true;
466
+ var parsed = parseInt(tabindex, 10);
467
+ return isNaN(parsed) || parsed >= 0;
468
+ });
469
+ } catch (e) { return true; }
470
+ });
471
+ if (ariaHiddenViolation.nodes.length === 0) {
472
+ scanResult.axeScanResults.violations = axeViolations.filter(function(v) {
473
+ return v.id !== 'aria-hidden-focus';
474
+ });
475
+ }
476
+ }
477
+
447
478
  // Convert raw axe results into oobee category structure
448
479
  var filtered = _oobeeFilterAxeResults(scanResult.axeScanResults, scanResult.pageTitle);
449
480