npm - @govtechsg/oobee - Versions diffs - 0.10.91 → 0.10.93 - Mend

@govtechsg/oobee 0.10.91 → 0.10.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

package/AGENTS.md +303 -0
package/README.md +22 -0
package/dist/cli.js +3 -0
package/dist/combine.js +15 -3
package/dist/constants/cliFunctions.js +7 -0
package/dist/constants/common.js +149 -80
package/dist/constants/constants.js +1 -0
package/dist/crawlers/commonCrawlerFunc.js +136 -15
package/dist/crawlers/crawlDomain.js +55 -58
package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
package/dist/crawlers/crawlRateController.js +47 -0
package/dist/crawlers/crawlSitemap.js +51 -62
package/dist/crawlers/runCustom.js +8 -2
package/dist/generateOobeeClientScanner.js +32 -1
package/dist/mergeAxeResults/itemsStore.js +32 -3
package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
package/dist/mergeAxeResults.js +120 -92
package/dist/npmIndex.js +1 -0
package/dist/utils.js +23 -28
package/oobee-client-scanner.js +35 -4
package/package.json +3 -3
package/src/cli.ts +4 -0
package/src/combine.ts +16 -1
package/src/constants/cliFunctions.ts +7 -0
package/src/constants/common.ts +162 -90
package/src/constants/constants.ts +1 -0
package/src/crawlers/commonCrawlerFunc.ts +148 -14
package/src/crawlers/crawlDomain.ts +64 -66
package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
package/src/crawlers/crawlRateController.ts +63 -0
package/src/crawlers/crawlSitemap.ts +57 -70
package/src/crawlers/runCustom.ts +10 -1
package/src/generateOobeeClientScanner.ts +32 -1
package/src/index.ts +1 -0
package/src/mergeAxeResults/itemsStore.ts +37 -3
package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
package/src/mergeAxeResults.ts +139 -99
package/src/npmIndex.ts +1 -0
package/src/utils.ts +25 -33
/package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0

package/dist/crawlers/crawlDomain.js CHANGED

@@ -1,10 +1,9 @@
 import crawlee from 'crawlee';
-import * as path from 'path';
-import fsp from 'fs/promises';
-import { createCrawleeSubFolders, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
+import { CrawlRateController } from './crawlRateController.js';
+import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, splitAuthHeaders, } from './commonCrawlerFunc.js';
 import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
 import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
-import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
+import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
 import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
 import { consoleLogger, guiInfoLog } from '../logs.js';
 const isBlacklisted = (url, blacklistedPatterns) => {
@@ -258,9 +257,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                 // same-domain strategy) still contribute their <a> links above, but
                 // clicking every interactive element on them is too slow and starves
                 // the crawler of time to discover pages on the primary hostname.
-                const currentHostname = new URL(page.url()).hostname;
-                const seedHostname = new URL(url).hostname;
-                if (currentHostname === seedHostname) {
+                if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
                     // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
                     try {
                         await customEnqueueLinksByClickingElements(page, browserContext);
@@ -277,43 +274,40 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
         }
     };
     let isAbortingScanNow = false;
+    const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
+    const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
     const crawler = register(new crawlee.PlaywrightCrawler({
         launchContext: {
             launcher: constants.launcher,
             launchOptions: getPlaywrightLaunchOptions(browser),
-            // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
-            ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
         },
         retryOnBlocked: true,
         browserPoolOptions: {
             useFingerprints: false,
             preLaunchHooks: [
+                getPreLaunchHook(userDataDirectory),
                 async (_pageId, launchContext) => {
-                    const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
-                    // Ensure base exists
-                    await fsp.mkdir(baseDir, { recursive: true });
-                    // Create a unique subdir per browser
-                    const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
-                    await fsp.mkdir(subProfileDir, { recursive: true });
-                    // Assign to Crawlee's launcher
-                    // Crawlee preLaunchHooks expects launchContext to be mutated in-place.
-                    // eslint-disable-next-line no-param-reassign
-                    launchContext.userDataDir = subProfileDir;
-                    // Safely extend launchOptions
                     // eslint-disable-next-line no-param-reassign
                     launchContext.launchOptions = {
                         ...launchContext.launchOptions,
                         ignoreHTTPSErrors: true,
                         ...playwrightDeviceDetailsObject,
+                        ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
                         ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
-                        ...(extraHTTPHeaders && { extraHTTPHeaders }),
+                        ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+                        ...(httpCredentials && { httpCredentials }),
                     };
-                    // Optionally log for debugging
-                    // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
                 },
             ],
         },
         requestQueue,
+        preNavigationHooks: [
+            async (crawlingContext) => {
+                if (extraHTTPHeaders) {
+                    crawlingContext.request.headers = extraHTTPHeaders;
+                }
+            },
+        ],
         postNavigationHooks: [
             async (crawlingContext) => {
                 const { page, request } = crawlingContext;
@@ -390,11 +384,9 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                     return;
                 }
                 const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
-                if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
-                    if (hasExceededDuration) {
-                        console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
-                        durationExceeded = true;
-                    }
+                if (hasExceededDuration) {
+                    console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
+                    durationExceeded = true;
                     isAbortingScanNow = true;
                     activeCrawler.autoscaledPool.abort();
                     return;
@@ -527,8 +519,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                             });
                             return;
                         }
-                        // One more check if scanned pages have reached limit due to multi-instances of handler running
-                        if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
+                        if (rateController.claimSlot()) {
                             guiInfoLog(guiInfoStatusTypes.SCANNED, {
                                 numScanned: urlsCrawled.scanned.length,
                                 urlScanned: request.url,
@@ -538,6 +529,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                                 pageTitle: results.pageTitle,
                                 actualUrl, // i.e. actualUrl
                             });
+                            rateController.onSuccess(crawler.autoscaledPool);
+                            if (rateController.isLimitReached()) {
+                                isAbortingScanNow = true;
+                                activeCrawler.autoscaledPool.abort();
+                            }
                             scannedUrlSet.add(normUrl(request.url));
                             scannedResolvedUrlSet.add(normUrl(actualUrl));
                             urlsCrawled.scannedRedirects.push({
@@ -549,8 +545,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                             await dataset.pushData(results);
                         }
                     }
-                    else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
-                        // One more check if scanned pages have reached limit due to multi-instances of handler running
+                    else if (rateController.claimSlot()) {
                         guiInfoLog(guiInfoStatusTypes.SCANNED, {
                             numScanned: urlsCrawled.scanned.length,
                             urlScanned: request.url,
@@ -560,6 +555,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                             actualUrl: request.url,
                             pageTitle: results.pageTitle,
                         });
+                        rateController.onSuccess(crawler.autoscaledPool);
+                        if (rateController.isLimitReached()) {
+                            isAbortingScanNow = true;
+                            activeCrawler.autoscaledPool.abort();
+                        }
                         scannedUrlSet.add(normUrl(request.url));
                         scannedResolvedUrlSet.add(normUrl(request.url));
                         await dataset.pushData(results);
@@ -611,30 +611,29 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                     }
                 }
                 catch {
-                    // Do nothing since the error will be pushed
-                }
-                // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
-                // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
-                if (!isAbortingScanNow) {
-                    guiInfoLog(guiInfoStatusTypes.ERROR, {
-                        numScanned: urlsCrawled.scanned.length,
-                        urlScanned: request.url,
-                    });
-                    urlsCrawled.error.push({
-                        url: request.url,
-                        pageTitle: request.url,
-                        actualUrl: request.url,
-                        metadata: STATUS_CODE_METADATA[2],
-                    });
+                    // Recovery failed; Crawlee will retry the request automatically
                 }
+                // Do not push to urlsCrawled.error here — Crawlee will retry the request
+                // (up to maxRequestRetries, default 3). If all retries are exhausted,
+                // failedRequestHandler will record the error. Pushing here causes
+                // duplicates and false positives for URLs that succeed on retry.
             }
         },
         failedRequestHandler: async ({ request, response }) => {
+            if (isAbortingScanNow) {
+                return;
+            }
+            const status = response?.status();
+            if (rateController.onFailure(status, crawler.autoscaledPool)) {
+                consoleLogger.info(`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`);
+                isAbortingScanNow = true;
+                crawler.autoscaledPool?.abort();
+                return;
+            }
             guiInfoLog(guiInfoStatusTypes.ERROR, {
                 numScanned: urlsCrawled.scanned.length,
                 urlScanned: request.url,
             });
-            const status = response?.status();
             const metadata = typeof status === 'number'
                 ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
                 : STATUS_CODE_METADATA[2];
@@ -648,15 +647,13 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
         },
         maxRequestsPerCrawl: Infinity,
         maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-        ...(process.env.OOBEE_FAST_CRAWLER && {
-            autoscaledPoolOptions: {
-                minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
-                maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-                desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
-                scaleUpStepRatio: 0.99, // Scale up faster
-                scaleDownStepRatio: 0.1, // Scale down slower
-            },
-        }),
+        autoscaledPoolOptions: {
+            minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+            maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+            desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+            scaleUpStepRatio: 0.99, // Scale up faster
+            scaleDownStepRatio: 0.1, // Scale down slower
+        },
     }));
     await crawler.run();
     // Additional passes: keep re-visiting scanned seed-hostname pages for
@@ -675,7 +672,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                 .map(item => item.actualUrl || item.url)
                 .filter(pageUrl => {
                 try {
-                    return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
+                    return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
                 }
                 catch {
                     return false;

package/dist/crawlers/crawlIntelligentSitemap.js CHANGED

@@ -1,9 +1,9 @@
-import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
+import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
 import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
 import { consoleLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
-import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
+import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
 import { register } from '../utils.js';
 const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, safeMode, scanDuration) => {
     const startTime = Date.now(); // Track start time
@@ -15,6 +15,9 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
     let sitemapUrl;
     let durationExceeded = false;
     ({ dataset } = await createCrawleeSubFolders(randomToken));
+    // Initialise modified User-Agent early so sitemap discovery requests
+    // don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
+    await initModifiedUserAgent(browser);
     function getHomeUrl(parsedUrl) {
         const urlObject = new URL(parsedUrl);
         return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
@@ -23,24 +26,31 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
         const homeUrl = getHomeUrl(link);
         let sitemapLink = '';
         const launchOptions = getPlaywrightLaunchOptions(browser);
+        const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
         let context;
         let browserInstance;
         if (process.env.CRAWLEE_HEADLESS === '1') {
             const effectiveUserDataDirectory = userDataDirectory || '';
             context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
                 ...launchOptions,
-                ...(extraHTTPHeaders && { extraHTTPHeaders }),
+                ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+                ...(httpCredentials && { httpCredentials }),
+                ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
             });
             register(context);
         }
         else {
-            // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
             browserInstance = await constants.launcher.launch(launchOptions);
             register(browserInstance);
             context = await browserInstance.newContext({
-                ...(extraHTTPHeaders && { extraHTTPHeaders }),
+                ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+                ...(httpCredentials && { httpCredentials }),
+                ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
             });
         }
+        if (authHeader) {
+            await addAuthRouteHandler(context, link, authHeader);
+        }
         const page = await context.newPage();
         for (const path of sitemapPaths) {
             sitemapLink = homeUrl + path;
@@ -59,7 +69,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
     const checkUrlExists = async (page, parsedUrl) => {
         try {
             const response = await page.goto(parsedUrl);
-            return response.ok();
+            return response?.ok() ?? false;
         }
         catch (e) {
             consoleLogger.error(e);
@@ -71,7 +81,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
     try {
         sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
         if (sitemapUrls.length > 0) {
-            console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
+            consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
             sitemapExist = true;
         }
     }
@@ -91,7 +101,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
         }
     }
     if (!sitemapExist) {
-        console.log('Unable to find sitemap. Commencing website crawl instead.');
+        consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
         return await crawlDomain({
             url,
             randomToken,
@@ -121,7 +131,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
             durationExceeded = true;
             break;
         }
-        console.log(`Processing sitemap: ${currentSitemapUrl}`);
+        consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
         urlsCrawledFinal = await crawlSitemap({
             sitemapUrl: currentSitemapUrl,
             randomToken,
@@ -149,7 +159,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
     const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
     const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
     if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
-        console.log(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
+        consoleLogger.info(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
         urlsCrawledFinal = await crawlDomain({
             url,
             randomToken,
@@ -173,7 +183,7 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
         });
     }
     else if (!hasDurationRemaining) {
-        console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
+        consoleLogger.info(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
         durationExceeded = true;
     }
     guiInfoLog(guiInfoStatusTypes.COMPLETED, {});

package/dist/crawlers/crawlRateController.js ADDED

@@ -0,0 +1,47 @@
+import { consoleLogger } from '../logs.js';
+export class CrawlRateController {
+    constructor(maxRequestsPerCrawl, maxConcurrency) {
+        this.scannedCount = 0;
+        this.consecutiveFailures = 0;
+        this.consecutiveSuccesses = 0;
+        this.maxPages = maxRequestsPerCrawl;
+        this.maxConsecutiveFailures = Number(process.env.OOBEE_CONSECUTIVE_MAX_RETRIES) || 100;
+        this.originalMaxConcurrency = maxConcurrency;
+    }
+    claimSlot() {
+        if (this.scannedCount >= this.maxPages) {
+            return false;
+        }
+        this.scannedCount++;
+        return true;
+    }
+    onSuccess(pool) {
+        this.consecutiveFailures = 0;
+        this.consecutiveSuccesses++;
+        if (pool && this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0) {
+            if (pool.maxConcurrency < this.originalMaxConcurrency) {
+                pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
+                consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
+            }
+        }
+    }
+    onFailure(httpStatus, pool) {
+        if (typeof httpStatus !== 'number' || httpStatus < 400) {
+            return false;
+        }
+        this.consecutiveSuccesses = 0;
+        this.consecutiveFailures++;
+        if (pool && pool.maxConcurrency > 1) {
+            pool.maxConcurrency = Math.max(1, Math.floor(pool.maxConcurrency / 2));
+            consoleLogger.info(`Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`);
+        }
+        if (this.consecutiveFailures >= this.maxConsecutiveFailures) {
+            return true;
+        }
+        return false;
+    }
+    isLimitReached() {
+        return this.scannedCount >= this.maxPages;
+    }
+}
+CrawlRateController.RECOVERY_INTERVAL = 10;

package/dist/crawlers/crawlSitemap.js CHANGED

@@ -1,18 +1,18 @@
 import crawlee, { EnqueueStrategy, RequestList } from 'crawlee';
-import * as path from 'path';
-import fsp from 'fs/promises';
-import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
+import { CrawlRateController } from './crawlRateController.js';
+import { createCrawleeSubFolders, getPreLaunchHook, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
 import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, disallowedListOfPatterns, FileTypes, } from '../constants/constants.js';
 import { getLinksFromSitemap, getPlaywrightLaunchOptions, isSkippedUrl, waitForPageLoaded, isFilePath, } from '../constants/common.js';
 import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
 import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
-import { guiInfoLog } from '../logs.js';
+import { consoleLogger, guiInfoLog } from '../logs.js';
 const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = '', scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
     const crawlStartTime = Date.now();
     let dataset;
     let urlsCrawled;
     let durationExceeded = false;
     let isAbortingScan = false;
+    const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
     if (fromCrawlIntelligentSitemap) {
         dataset = datasetFromIntelligent;
         urlsCrawled = urlsCrawledFromIntelligent;
@@ -40,31 +40,20 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
         launchContext: {
             launcher: constants.launcher,
             launchOptions: getPlaywrightLaunchOptions(browser),
-            // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
-            ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
         },
         retryOnBlocked: true,
         browserPoolOptions: {
             useFingerprints: false,
             preLaunchHooks: [
+                getPreLaunchHook(userDataDirectory),
                 async (_pageId, launchContext) => {
-                    const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
-                    // Ensure base exists
-                    await fsp.mkdir(baseDir, { recursive: true });
-                    // Create a unique subdir per browser
-                    const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
-                    await fsp.mkdir(subProfileDir, { recursive: true });
-                    // Assign to Crawlee's launcher
-                    launchContext.userDataDir = subProfileDir;
-                    // Safely extend launchOptions
                     launchContext.launchOptions = {
                         ...launchContext.launchOptions,
                         ignoreHTTPSErrors: true,
                         ...playwrightDeviceDetailsObject,
+                        ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
                         ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
                     };
-                    // Optionally log for debugging
-                    // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
                 },
             ],
         },
@@ -149,13 +138,11 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
                 await waitForPageLoaded(page, 10000);
                 const actualUrl = page.url() || request.loadedUrl || request.url;
                 const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
-                if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
+                if (hasExceededDuration) {
+                    consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
+                    durationExceeded = true;
                     isAbortingScan = true;
-                    if (hasExceededDuration) {
-                        console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
-                        durationExceeded = true;
-                    }
-                    crawler.autoscaledPool.abort(); // stops new requests
+                    crawler.autoscaledPool.abort();
                     return;
                 }
                 if (request.skipNavigation && actualUrl === 'about:blank') {
@@ -245,22 +232,29 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
                     catch (_) {
                         // Page/context was destroyed during navigation — handled by outer catch
                     }
-                    guiInfoLog(guiInfoStatusTypes.SCANNED, {
-                        numScanned: urlsCrawled.scanned.length,
-                        urlScanned: request.url,
-                    });
-                    urlsCrawled.scanned.push({
-                        url: request.url,
-                        pageTitle: results.pageTitle,
-                        actualUrl, // i.e. actualUrl
-                    });
-                    urlsCrawled.scannedRedirects.push({
-                        fromUrl: request.url,
-                        toUrl: actualUrl,
-                    });
-                    results.url = request.url;
-                    results.actualUrl = actualUrl;
-                    await dataset.pushData(results);
+                    if (rateController.claimSlot()) {
+                        guiInfoLog(guiInfoStatusTypes.SCANNED, {
+                            numScanned: urlsCrawled.scanned.length,
+                            urlScanned: request.url,
+                        });
+                        urlsCrawled.scanned.push({
+                            url: request.url,
+                            pageTitle: results.pageTitle,
+                            actualUrl, // i.e. actualUrl
+                        });
+                        rateController.onSuccess(crawler.autoscaledPool);
+                        if (rateController.isLimitReached()) {
+                            isAbortingScan = true;
+                            crawler.autoscaledPool.abort();
+                        }
+                        urlsCrawled.scannedRedirects.push({
+                            fromUrl: request.url,
+                            toUrl: actualUrl,
+                        });
+                        results.url = request.url;
+                        results.actualUrl = actualUrl;
+                        await dataset.pushData(results);
+                    }
                 }
                 else {
                     guiInfoLog(guiInfoStatusTypes.SKIPPED, {
@@ -284,30 +278,27 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
                 }
             }
             catch (e) {
-                if (!isAbortingScan) {
-                    guiInfoLog(guiInfoStatusTypes.ERROR, {
-                        numScanned: urlsCrawled.scanned.length,
-                        urlScanned: request.url,
-                    });
-                    urlsCrawled.error.push({
-                        url: request.url,
-                        pageTitle: request.url,
-                        actualUrl: request.url,
-                        metadata: STATUS_CODE_METADATA[2],
-                        httpStatusCode: 0,
-                    });
-                }
+                // Do not push to urlsCrawled.error here — Crawlee will retry the request
+                // (up to maxRequestRetries, default 3). If all retries are exhausted,
+                // failedRequestHandler will record the error. Pushing here causes
+                // duplicates and false positives for URLs that succeed on retry.
             }
         },
         failedRequestHandler: async ({ request, response, error }) => {
             if (isAbortingScan) {
                 return;
             }
+            const status = response?.status();
+            if (rateController.onFailure(status, crawler.autoscaledPool)) {
+                consoleLogger.info(`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`);
+                isAbortingScan = true;
+                crawler.autoscaledPool?.abort();
+                return;
+            }
             guiInfoLog(guiInfoStatusTypes.ERROR, {
                 numScanned: urlsCrawled.scanned.length,
                 urlScanned: request.url,
             });
-            const status = response?.status();
             const metadata = typeof status === 'number'
                 ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
                 : STATUS_CODE_METADATA[2];
@@ -322,15 +313,13 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
         },
         maxRequestsPerCrawl: Infinity,
         maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-        ...(process.env.OOBEE_FAST_CRAWLER && {
-            autoscaledPoolOptions: {
-                minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
-                maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-                desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
-                scaleUpStepRatio: 0.99, // Scale up faster
-                scaleDownStepRatio: 0.1, // Scale down slower
-            },
-        }),
+        autoscaledPoolOptions: {
+            minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+            maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+            desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+            scaleUpStepRatio: 0.99, // Scale up faster
+            scaleDownStepRatio: 0.1, // Scale down slower
+        },
     }));
     await crawler.run();
     await requestList.isFinished();

package/dist/crawlers/runCustom.js CHANGED Viewed

@@ -1,5 +1,5 @@
 /* eslint-env browser */
-import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
+import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
 import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
 import constants, { getIntermediateScreenshotsPath, guiInfoStatusTypes, } from '../constants/constants.js';
 import { initNewPage, log } from './custom/utils.js';
@@ -18,7 +18,7 @@ export class ProcessPageParams {
         this.randomToken = randomToken;
     }
 }
-const runCustom = async (url, randomToken, browserToRun, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, initialCustomFlowLabel) => {
+const runCustom = async (url, randomToken, browserToRun, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, initialCustomFlowLabel, extraHTTPHeaders) => {
     // checks and delete datasets path if it already exists
     process.env.CRAWLEE_STORAGE_DIR = randomToken;
     const urlsCrawled = { ...constants.urlsCrawledObj };
@@ -47,6 +47,7 @@ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, view
             ...baseArgs.filter(a => !a.startsWith('--window-size') && a !== '--start-maximized'),
             ...customArgs,
         ];
+        const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
         const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
             ...baseLaunchOptions,
             args: mergedArgs,
@@ -56,7 +57,12 @@ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, view
             viewport: null,
             ...(hasCustomViewport ? contextDeviceOptions : {}),
             userAgent: process.env.OOBEE_USER_AGENT || deviceUserAgent,
+            ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+            ...(httpCredentials && { httpCredentials }),
         });
+        if (authHeader) {
+            await addAuthRouteHandler(context, url, authHeader);
+        }
         register(context);
         processPageParams.stopAll = async () => {
             try {

package/dist/generateOobeeClientScanner.js CHANGED Viewed

@@ -51,7 +51,7 @@ const SENTRY_NODE_VERSION = (() => {
         return _require('@sentry/node/package.json').version;
     }
     catch {
-        return '9.47.1'; // safe fallback matching currently installed version
+        return '10.58.0'; // safe fallback matching currently installed version
     }
 })();
 // ---------------------------------------------------------------------------
@@ -444,6 +444,37 @@ const scanApiScript = (shortDescMap, longDescMap, stepByStepMap) => `
       // Run axe-core + oobee custom checks
       var scanResult = await window.runA11yScan(elementsToScan, '');
+      // Re-verify aria-hidden-focus violations against the live DOM to handle
+      // race conditions with JS that sets tabindex="-1" after aria-hidden
+      var axeViolations = scanResult.axeScanResults.violations || [];
+      var ariaHiddenViolation = axeViolations.find(function(v) { return v.id === 'aria-hidden-focus'; });
+      if (ariaHiddenViolation) {
+        await new Promise(function(resolve) { setTimeout(resolve, 0); });
+        ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(function(node) {
+          var selector = node.target && node.target[0];
+          if (typeof selector !== 'string') return true;
+          try {
+            var el = document.querySelector(selector);
+            if (!el) return true;
+            var focusables = el.querySelectorAll(
+              'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]'
+            );
+            if (focusables.length === 0) return false;
+            return Array.from(focusables).some(function(child) {
+              var tabindex = child.getAttribute('tabindex');
+              if (tabindex === null) return true;
+              var parsed = parseInt(tabindex, 10);
+              return isNaN(parsed) || parsed >= 0;
+            });
+          } catch (e) { return true; }
+        });
+        if (ariaHiddenViolation.nodes.length === 0) {
+          scanResult.axeScanResults.violations = axeViolations.filter(function(v) {
+            return v.id !== 'aria-hidden-focus';
+          });
+        }
+      }
       // Convert raw axe results into oobee category structure
       var filtered = _oobeeFilterAxeResults(scanResult.axeScanResults, scanResult.pageTitle);