npm - @govtechsg/oobee - Versions diffs - 0.10.91 → 0.10.92 - Mend

@govtechsg/oobee 0.10.91 → 0.10.92

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/AGENTS.md +289 -0
package/README.md +3 -0
package/dist/cli.js +3 -0
package/dist/combine.js +14 -2
package/dist/constants/cliFunctions.js +7 -0
package/dist/constants/common.js +119 -70
package/dist/constants/constants.js +1 -0
package/dist/crawlers/commonCrawlerFunc.js +93 -15
package/dist/crawlers/crawlDomain.js +45 -57
package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
package/dist/crawlers/crawlRateController.js +47 -0
package/dist/crawlers/crawlSitemap.js +51 -62
package/dist/generateOobeeClientScanner.js +31 -0
package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
package/dist/mergeAxeResults.js +120 -92
package/dist/npmIndex.js +1 -0
package/dist/utils.js +23 -28
package/oobee-client-scanner.js +33 -2
package/package.json +2 -2
package/src/cli.ts +4 -0
package/src/combine.ts +15 -1
package/src/constants/cliFunctions.ts +7 -0
package/src/constants/common.ts +131 -79
package/src/constants/constants.ts +1 -0
package/src/crawlers/commonCrawlerFunc.ts +103 -14
package/src/crawlers/crawlDomain.ts +52 -65
package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
package/src/crawlers/crawlRateController.ts +63 -0
package/src/crawlers/crawlSitemap.ts +57 -70
package/src/generateOobeeClientScanner.ts +31 -0
package/src/index.ts +1 -0
package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
package/src/mergeAxeResults.ts +139 -99
package/src/npmIndex.ts +1 -0
package/src/utils.ts +25 -33
/package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0

package/src/crawlers/crawlDomain.ts CHANGED Viewed

@@ -1,10 +1,10 @@
 import crawlee, { EnqueueStrategy } from 'crawlee';
+import { CrawlRateController } from './crawlRateController.js';
 import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
 import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
-import * as path from 'path';
-import fsp from 'fs/promises';
 import {
   createCrawleeSubFolders,
+  getPreLaunchHook,
   runAxeScript,
   isUrlPdf,
   shouldSkipClickDueToDisallowedHref,
@@ -29,7 +29,7 @@ import {
   getUrlsFromRobotsTxt,
   waitForPageLoaded,
 } from '../constants/common.js';
-import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
+import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
 import {
   handlePdfDownload,
   runPdfScan,
@@ -364,9 +364,7 @@ const crawlDomain = async ({
         // same-domain strategy) still contribute their <a> links above, but
         // clicking every interactive element on them is too slow and starves
         // the crawler of time to discover pages on the primary hostname.
-        const currentHostname = new URL(page.url()).hostname;
-        const seedHostname = new URL(url).hostname;
-        if (currentHostname === seedHostname) {
+        if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
           // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
           try {
             await customEnqueueLinksByClickingElements(page, browserContext);
@@ -382,49 +380,32 @@ const crawlDomain = async ({
   };
   let isAbortingScanNow = false;
+  const rateController = new CrawlRateController(
+    maxRequestsPerCrawl,
+    specifiedMaxConcurrency || constants.maxConcurrency,
+  );
   const crawler = register(
     new crawlee.PlaywrightCrawler({
       launchContext: {
         launcher: constants.launcher,
         launchOptions: getPlaywrightLaunchOptions(browser),
-        // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
-        ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
       },
       retryOnBlocked: true,
       browserPoolOptions: {
         useFingerprints: false,
         preLaunchHooks: [
+          getPreLaunchHook(userDataDirectory),
           async (_pageId, launchContext) => {
-            const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
-            // Ensure base exists
-            await fsp.mkdir(baseDir, { recursive: true });
-            // Create a unique subdir per browser
-            const subProfileDir = path.join(
-              baseDir,
-              `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
-            );
-            await fsp.mkdir(subProfileDir, { recursive: true });
-            // Assign to Crawlee's launcher
-            // Crawlee preLaunchHooks expects launchContext to be mutated in-place.
-            // eslint-disable-next-line no-param-reassign
-            launchContext.userDataDir = subProfileDir;
-            // Safely extend launchOptions
             // eslint-disable-next-line no-param-reassign
             launchContext.launchOptions = {
               ...launchContext.launchOptions,
               ignoreHTTPSErrors: true,
               ...playwrightDeviceDetailsObject,
+              ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
               ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
               ...(extraHTTPHeaders && { extraHTTPHeaders }),
             };
-            // Optionally log for debugging
-            // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
           },
         ],
       },
@@ -527,11 +508,9 @@ const crawlDomain = async ({
           const hasExceededDuration =
             scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
-          if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
-            if (hasExceededDuration) {
-              console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
-              durationExceeded = true;
-            }
+          if (hasExceededDuration) {
+            console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
+            durationExceeded = true;
             isAbortingScanNow = true;
             activeCrawler.autoscaledPool.abort();
             return;
@@ -691,8 +670,7 @@ const crawlDomain = async ({
                 return;
               }
-              // One more check if scanned pages have reached limit due to multi-instances of handler running
-              if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
+              if (rateController.claimSlot()) {
                 guiInfoLog(guiInfoStatusTypes.SCANNED, {
                   numScanned: urlsCrawled.scanned.length,
                   urlScanned: request.url,
@@ -703,6 +681,11 @@ const crawlDomain = async ({
                   pageTitle: results.pageTitle,
                   actualUrl, // i.e. actualUrl
                 });
+                rateController.onSuccess(crawler.autoscaledPool);
+                if (rateController.isLimitReached()) {
+                  isAbortingScanNow = true;
+                  activeCrawler.autoscaledPool.abort();
+                }
                 scannedUrlSet.add(normUrl(request.url));
                 scannedResolvedUrlSet.add(normUrl(actualUrl));
@@ -715,8 +698,7 @@ const crawlDomain = async ({
                 results.actualUrl = actualUrl;
                 await dataset.pushData(results);
               }
-            } else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
-              // One more check if scanned pages have reached limit due to multi-instances of handler running
+            } else if (rateController.claimSlot()) {
               guiInfoLog(guiInfoStatusTypes.SCANNED, {
                 numScanned: urlsCrawled.scanned.length,
                 urlScanned: request.url,
@@ -726,6 +708,11 @@ const crawlDomain = async ({
                 actualUrl: request.url,
                 pageTitle: results.pageTitle,
               });
+              rateController.onSuccess(crawler.autoscaledPool);
+              if (rateController.isLimitReached()) {
+                isAbortingScanNow = true;
+                activeCrawler.autoscaledPool.abort();
+              }
               scannedUrlSet.add(normUrl(request.url));
               scannedResolvedUrlSet.add(normUrl(request.url));
               await dataset.pushData(results);
@@ -777,33 +764,35 @@ const crawlDomain = async ({
               });
             }
           } catch {
-            // Do nothing since the error will be pushed
+            // Recovery failed; Crawlee will retry the request automatically
           }
-          // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
-          // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
-          if (!isAbortingScanNow) {
-            guiInfoLog(guiInfoStatusTypes.ERROR, {
-              numScanned: urlsCrawled.scanned.length,
-              urlScanned: request.url,
-            });
-            urlsCrawled.error.push({
-              url: request.url,
-              pageTitle: request.url,
-              actualUrl: request.url,
-              metadata: STATUS_CODE_METADATA[2],
-            });
-          }
+          // Do not push to urlsCrawled.error here — Crawlee will retry the request
+          // (up to maxRequestRetries, default 3). If all retries are exhausted,
+          // failedRequestHandler will record the error. Pushing here causes
+          // duplicates and false positives for URLs that succeed on retry.
         }
       },
       failedRequestHandler: async ({ request, response }) => {
+        if (isAbortingScanNow) {
+          return;
+        }
+        const status = response?.status();
+        if (rateController.onFailure(status, crawler.autoscaledPool)) {
+          consoleLogger.info(
+            `Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
+          );
+          isAbortingScanNow = true;
+          crawler.autoscaledPool?.abort();
+          return;
+        }
         guiInfoLog(guiInfoStatusTypes.ERROR, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: request.url,
         });
-        const status = response?.status();
         const metadata =
           typeof status === 'number'
             ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
@@ -819,15 +808,13 @@ const crawlDomain = async ({
       },
       maxRequestsPerCrawl: Infinity,
       maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-      ...(process.env.OOBEE_FAST_CRAWLER && {
-        autoscaledPoolOptions: {
-          minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
-          maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-          desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
-          scaleUpStepRatio: 0.99, // Scale up faster
-          scaleDownStepRatio: 0.1, // Scale down slower
-        },
-      }),
+      autoscaledPoolOptions: {
+        minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+        maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+        desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+        scaleUpStepRatio: 0.99, // Scale up faster
+        scaleDownStepRatio: 0.1, // Scale down slower
+      },
     }),
   );
@@ -850,7 +837,7 @@ const crawlDomain = async ({
         .map(item => item.actualUrl || item.url)
         .filter(pageUrl => {
           try {
-            return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
+            return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
           } catch {
             return false;
           }

package/src/crawlers/crawlIntelligentSitemap.ts CHANGED Viewed

@@ -7,7 +7,7 @@ import { consoleLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
 import { ViewportSettingsClass } from '../combine.js';
-import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
+import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
 import { register } from '../utils.js';
 const crawlIntelligentSitemap = async (
@@ -40,6 +40,10 @@ const crawlIntelligentSitemap = async (
   ({ dataset } = await createCrawleeSubFolders(randomToken));
+  // Initialise modified User-Agent early so sitemap discovery requests
+  // don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
+  await initModifiedUserAgent(browser);
   function getHomeUrl(parsedUrl: string) {
     const urlObject = new URL(parsedUrl);
     return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
@@ -62,6 +66,7 @@ const crawlIntelligentSitemap = async (
       context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
         ...launchOptions,
         ...(extraHTTPHeaders && { extraHTTPHeaders }),
+        ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
       });
       register(context);
     } else {
@@ -70,6 +75,7 @@ const crawlIntelligentSitemap = async (
       register(browserInstance as unknown as { close: () => Promise<void> });
       context = await browserInstance.newContext({
         ...(extraHTTPHeaders && { extraHTTPHeaders }),
+        ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
       });
     }
@@ -93,7 +99,7 @@ const crawlIntelligentSitemap = async (
   const checkUrlExists = async (page: Page, parsedUrl: string) => {
     try {
       const response = await page.goto(parsedUrl);
-      return response.ok();
+      return response?.ok() ?? false;
     } catch (e) {
       consoleLogger.error(e);
       return false;
@@ -105,7 +111,7 @@ const crawlIntelligentSitemap = async (
   try {
     sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
     if (sitemapUrls.length > 0) {
-      console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
+      consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
       sitemapExist = true;
     }
   } catch (error) {
@@ -125,7 +131,7 @@ const crawlIntelligentSitemap = async (
   }
   if (!sitemapExist) {
-    console.log('Unable to find sitemap. Commencing website crawl instead.');
+    consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
     return await crawlDomain({
       url,
       randomToken,
@@ -157,7 +163,7 @@ const crawlIntelligentSitemap = async (
       break;
     }
-    console.log(`Processing sitemap: ${currentSitemapUrl}`);
+    consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
     urlsCrawledFinal = await crawlSitemap({
       sitemapUrl: currentSitemapUrl,
       randomToken,
@@ -187,7 +193,7 @@ const crawlIntelligentSitemap = async (
   const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
   if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
-    console.log(
+    consoleLogger.info(
       `Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`,
     );
     urlsCrawledFinal = await crawlDomain({
@@ -212,7 +218,7 @@ const crawlIntelligentSitemap = async (
       scanDuration: remainingScanDuration,
     });
   } else if (!hasDurationRemaining) {
-    console.log(
+    consoleLogger.info(
       `Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`,
     );
     durationExceeded = true;

package/src/crawlers/crawlRateController.ts ADDED Viewed

@@ -0,0 +1,63 @@
+import { consoleLogger } from '../logs.js';
/**
 * Tracks how many pages a crawl has accepted and reacts to HTTP failures by
 * lowering (and later restoring) the `maxConcurrency` of a supplied pool.
 *
 * Responsibilities:
 * - Page budget: `claimSlot()` hands out at most `maxRequestsPerCrawl` slots.
 * - Back-off: each HTTP failure (status >= 400) halves `pool.maxConcurrency`,
 *   never going below 1.
 * - Recovery: every `RECOVERY_INTERVAL` consecutive successes raise
 *   `pool.maxConcurrency` by 2, capped at the concurrency given at construction.
 * - Abort signal: `onFailure()` returns `true` once the number of consecutive
 *   HTTP failures reaches the configured threshold.
 *
 * The threshold is read once, at construction, from the
 * `OOBEE_CONSECUTIVE_MAX_RETRIES` environment variable.
 */
export class CrawlRateController {
  /** Number of consecutive successes between two concurrency recovery steps. */
  private static readonly RECOVERY_INTERVAL = 10;

  /** Threshold used when `OOBEE_CONSECUTIVE_MAX_RETRIES` is unset or not a positive number. */
  private static readonly DEFAULT_MAX_CONSECUTIVE_FAILURES = 100;

  private scannedCount = 0;

  private consecutiveFailures = 0;

  private consecutiveSuccesses = 0;

  private readonly maxPages: number;

  private readonly maxConsecutiveFailures: number;

  private readonly originalMaxConcurrency: number;

  /**
   * @param maxRequestsPerCrawl Maximum number of slots `claimSlot()` will grant.
   * @param maxConcurrency Upper bound that recovery will restore `pool.maxConcurrency` to.
   */
  constructor(maxRequestsPerCrawl: number, maxConcurrency: number) {
    this.maxPages = maxRequestsPerCrawl;
    this.originalMaxConcurrency = maxConcurrency;

    // Only a positive number is a usable threshold. Anything else (unset,
    // non-numeric, zero, negative) falls back to the default; a negative value
    // would otherwise make the very first HTTP failure abort the crawl.
    const configuredThreshold = Number(process.env.OOBEE_CONSECUTIVE_MAX_RETRIES);
    this.maxConsecutiveFailures =
      configuredThreshold > 0
        ? configuredThreshold
        : CrawlRateController.DEFAULT_MAX_CONSECUTIVE_FAILURES;
  }

  /**
   * Reserves one page slot from the budget.
   *
   * @returns `true` if a slot was reserved, `false` if the budget is already used up.
   */
  claimSlot(): boolean {
    if (this.isLimitReached()) {
      return false;
    }
    this.scannedCount++;
    return true;
  }

  /**
   * Records a successful request: clears the failure streak and, on every
   * `RECOVERY_INTERVAL`-th consecutive success, raises `pool.maxConcurrency`
   * by 2 up to the concurrency given at construction.
   *
   * @param pool Optional object whose `maxConcurrency` is adjusted in place.
   */
  onSuccess(pool?: { maxConcurrency: number }): void {
    this.consecutiveFailures = 0;
    this.consecutiveSuccesses++;

    const isRecoveryTick =
      this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0;
    if (pool && isRecoveryTick && pool.maxConcurrency < this.originalMaxConcurrency) {
      pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
      consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
    }
  }

  /**
   * Records a failed request. Failures without an HTTP status, or with a
   * status below 400, are ignored entirely (no counters change).
   *
   * NOTE(review): every status >= 400 (including e.g. 404) is treated as a
   * throttling signal and logged as "Rate limited" — confirm this is intended.
   *
   * @param httpStatus HTTP status of the failed response, if any.
   * @param pool Optional object whose `maxConcurrency` is halved in place (minimum 1).
   * @returns `true` when the consecutive-failure threshold has been reached and
   *   the caller should abort the crawl; otherwise `false`.
   */
  onFailure(httpStatus: number | undefined, pool?: { maxConcurrency: number }): boolean {
    if (typeof httpStatus !== 'number' || httpStatus < 400) {
      return false;
    }

    this.consecutiveSuccesses = 0;
    this.consecutiveFailures++;

    if (pool && pool.maxConcurrency > 1) {
      pool.maxConcurrency = Math.max(1, Math.floor(pool.maxConcurrency / 2));
      consoleLogger.info(
        `Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`,
      );
    }

    return this.consecutiveFailures >= this.maxConsecutiveFailures;
  }

  /**
   * @returns `true` once the number of claimed slots has reached the page budget.
   */
  isLimitReached(): boolean {
    return this.scannedCount >= this.maxPages;
  }
}

package/src/crawlers/crawlSitemap.ts CHANGED Viewed

@@ -1,9 +1,9 @@
 import crawlee, { EnqueueStrategy, LaunchContext, Request, RequestList, Dataset } from 'crawlee';
+import { CrawlRateController } from './crawlRateController.js';
 import fs from 'fs';
-import * as path from 'path';
-import fsp from 'fs/promises';
 import {
   createCrawleeSubFolders,
+  getPreLaunchHook,
   preNavigationHooks,
   runAxeScript,
   isUrlPdf,
@@ -30,7 +30,7 @@ import {
   mapPdfScanResults,
   doPdfScreenshots,
 } from './pdfScanFunc.js';
-import { guiInfoLog } from '../logs.js';
+import { consoleLogger, guiInfoLog } from '../logs.js';
 import { ViewportSettingsClass } from '../combine.js';
 const crawlSitemap = async ({
@@ -81,6 +81,10 @@ const crawlSitemap = async ({
   let urlsCrawled: UrlsCrawled;
   let durationExceeded = false;
   let isAbortingScan = false;
+  const rateController = new CrawlRateController(
+    maxRequestsPerCrawl,
+    specifiedMaxConcurrency || constants.maxConcurrency,
+  );
   if (fromCrawlIntelligentSitemap) {
     dataset = datasetFromIntelligent;
@@ -125,39 +129,20 @@ const crawlSitemap = async ({
       launchContext: {
         launcher: constants.launcher,
         launchOptions: getPlaywrightLaunchOptions(browser),
-        // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
-        ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
       },
       retryOnBlocked: true,
       browserPoolOptions: {
         useFingerprints: false,
         preLaunchHooks: [
+          getPreLaunchHook(userDataDirectory),
           async (_pageId, launchContext) => {
-            const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
-            // Ensure base exists
-            await fsp.mkdir(baseDir, { recursive: true });
-            // Create a unique subdir per browser
-            const subProfileDir = path.join(
-              baseDir,
-              `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
-            );
-            await fsp.mkdir(subProfileDir, { recursive: true });
-            // Assign to Crawlee's launcher
-            launchContext.userDataDir = subProfileDir;
-            // Safely extend launchOptions
             launchContext.launchOptions = {
               ...launchContext.launchOptions,
               ignoreHTTPSErrors: true,
               ...playwrightDeviceDetailsObject,
+              ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
               ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
             };
-            // Optionally log for debugging
-            // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
           },
         ],
       },
@@ -259,13 +244,11 @@ const crawlSitemap = async ({
           const hasExceededDuration =
             scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
-          if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
+          if (hasExceededDuration) {
+            consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
+            durationExceeded = true;
             isAbortingScan = true;
-            if (hasExceededDuration) {
-              console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
-              durationExceeded = true;
-            }
-            crawler.autoscaledPool.abort(); // stops new requests
+            crawler.autoscaledPool.abort();
             return;
           }
@@ -376,26 +359,33 @@ const crawlSitemap = async ({
               // Page/context was destroyed during navigation — handled by outer catch
             }
-            guiInfoLog(guiInfoStatusTypes.SCANNED, {
-              numScanned: urlsCrawled.scanned.length,
-              urlScanned: request.url,
-            });
+            if (rateController.claimSlot()) {
+              guiInfoLog(guiInfoStatusTypes.SCANNED, {
+                numScanned: urlsCrawled.scanned.length,
+                urlScanned: request.url,
+              });
-            urlsCrawled.scanned.push({
-              url: request.url,
-              pageTitle: results.pageTitle,
-              actualUrl, // i.e. actualUrl
-            });
+              urlsCrawled.scanned.push({
+                url: request.url,
+                pageTitle: results.pageTitle,
+                actualUrl, // i.e. actualUrl
+              });
+              rateController.onSuccess(crawler.autoscaledPool);
+              if (rateController.isLimitReached()) {
+                isAbortingScan = true;
+                crawler.autoscaledPool.abort();
+              }
-            urlsCrawled.scannedRedirects.push({
-              fromUrl: request.url,
-              toUrl: actualUrl,
-            });
+              urlsCrawled.scannedRedirects.push({
+                fromUrl: request.url,
+                toUrl: actualUrl,
+              });
-            results.url = request.url;
-            results.actualUrl = actualUrl;
+              results.url = request.url;
+              results.actualUrl = actualUrl;
-            await dataset.pushData(results);
+              await dataset.pushData(results);
+            }
           } else {
             guiInfoLog(guiInfoStatusTypes.SKIPPED, {
               numScanned: urlsCrawled.scanned.length,
@@ -420,20 +410,10 @@ const crawlSitemap = async ({
             }
           }
         } catch (e) {
-          if (!isAbortingScan) {
-            guiInfoLog(guiInfoStatusTypes.ERROR, {
-              numScanned: urlsCrawled.scanned.length,
-              urlScanned: request.url,
-            });
-            urlsCrawled.error.push({
-              url: request.url,
-              pageTitle: request.url,
-              actualUrl: request.url,
-              metadata: STATUS_CODE_METADATA[2],
-              httpStatusCode: 0,
-            });
-          }
+          // Do not push to urlsCrawled.error here — Crawlee will retry the request
+          // (up to maxRequestRetries, default 3). If all retries are exhausted,
+          // failedRequestHandler will record the error. Pushing here causes
+          // duplicates and false positives for URLs that succeed on retry.
         }
       },
       failedRequestHandler: async ({ request, response, error }) => {
@@ -441,12 +421,21 @@ const crawlSitemap = async ({
           return;
         }
+        const status = response?.status();
+        if (rateController.onFailure(status, crawler.autoscaledPool)) {
+          consoleLogger.info(
+            `Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
+          );
+          isAbortingScan = true;
+          crawler.autoscaledPool?.abort();
+          return;
+        }
         guiInfoLog(guiInfoStatusTypes.ERROR, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: request.url,
         });
-        const status = response?.status();
         const metadata =
           typeof status === 'number'
             ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
@@ -463,15 +452,13 @@ const crawlSitemap = async ({
       },
       maxRequestsPerCrawl: Infinity,
       maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-      ...(process.env.OOBEE_FAST_CRAWLER && {
-        autoscaledPoolOptions: {
-          minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
-          maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-          desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
-          scaleUpStepRatio: 0.99, // Scale up faster
-          scaleDownStepRatio: 0.1, // Scale down slower
-        },
-      }),
+      autoscaledPoolOptions: {
+        minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+        maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+        desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+        scaleUpStepRatio: 0.99, // Scale up faster
+        scaleDownStepRatio: 0.1, // Scale down slower
+      },
     }),
   );

package/src/generateOobeeClientScanner.ts CHANGED Viewed

@@ -461,6 +461,37 @@ const scanApiScript = (
       // Run axe-core + oobee custom checks
       var scanResult = await window.runA11yScan(elementsToScan, '');
+      // Re-verify aria-hidden-focus violations against the live DOM to handle
+      // race conditions with JS that sets tabindex="-1" after aria-hidden
+      var axeViolations = scanResult.axeScanResults.violations || [];
+      var ariaHiddenViolation = axeViolations.find(function(v) { return v.id === 'aria-hidden-focus'; });
+      if (ariaHiddenViolation) {
+        await new Promise(function(resolve) { setTimeout(resolve, 0); });
+        ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(function(node) {
+          var selector = node.target && node.target[0];
+          if (typeof selector !== 'string') return true;
+          try {
+            var el = document.querySelector(selector);
+            if (!el) return true;
+            var focusables = el.querySelectorAll(
+              'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]'
+            );
+            if (focusables.length === 0) return false;
+            return Array.from(focusables).some(function(child) {
+              var tabindex = child.getAttribute('tabindex');
+              if (tabindex === null) return true;
+              var parsed = parseInt(tabindex, 10);
+              return isNaN(parsed) || parsed >= 0;
+            });
+          } catch (e) { return true; }
+        });
+        if (ariaHiddenViolation.nodes.length === 0) {
+          scanResult.axeScanResults.violations = axeViolations.filter(function(v) {
+            return v.id !== 'aria-hidden-focus';
+          });
+        }
+      }
       // Convert raw axe results into oobee category structure
       var filtered = _oobeeFilterAxeResults(scanResult.axeScanResults, scanResult.pageTitle);

package/src/index.ts CHANGED Viewed

@@ -52,6 +52,7 @@ export type Answers = {
   ruleset: RuleFlags[];
   generateJsonFiles: boolean;
   scanDuration?: number;
+  websiteTag?: string;
 };
 export type Data = {

package/src/mergeAxeResults/sentryTelemetry.ts CHANGED Viewed

@@ -144,6 +144,9 @@ const sendWcagBreakdownToSentry = async (
         ...(process.env.OOBEE_SCAN_PRODUCT && {
           scanProduct: process.env.OOBEE_SCAN_PRODUCT,
         }),
+        ...(process.env.OOBEE_TAGGED_WEBSITE && {
+          websiteTag: process.env.OOBEE_TAGGED_WEBSITE,
+        }),
       },
       user: {
         ...(scanInfo.email && scanInfo.name