npm - @govtechsg/oobee - Versions diffs - 0.10.91 → 0.10.93 - Mend

@govtechsg/oobee 0.10.91 → 0.10.93

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (40)

package/AGENTS.md +303 -0
package/README.md +22 -0
package/dist/cli.js +3 -0
package/dist/combine.js +15 -3
package/dist/constants/cliFunctions.js +7 -0
package/dist/constants/common.js +149 -80
package/dist/constants/constants.js +1 -0
package/dist/crawlers/commonCrawlerFunc.js +136 -15
package/dist/crawlers/crawlDomain.js +55 -58
package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
package/dist/crawlers/crawlRateController.js +47 -0
package/dist/crawlers/crawlSitemap.js +51 -62
package/dist/crawlers/runCustom.js +8 -2
package/dist/generateOobeeClientScanner.js +32 -1
package/dist/mergeAxeResults/itemsStore.js +32 -3
package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
package/dist/mergeAxeResults.js +120 -92
package/dist/npmIndex.js +1 -0
package/dist/utils.js +23 -28
package/oobee-client-scanner.js +35 -4
package/package.json +3 -3
package/src/cli.ts +4 -0
package/src/combine.ts +16 -1
package/src/constants/cliFunctions.ts +7 -0
package/src/constants/common.ts +162 -90
package/src/constants/constants.ts +1 -0
package/src/crawlers/commonCrawlerFunc.ts +148 -14
package/src/crawlers/crawlDomain.ts +64 -66
package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
package/src/crawlers/crawlRateController.ts +63 -0
package/src/crawlers/crawlSitemap.ts +57 -70
package/src/crawlers/runCustom.ts +10 -1
package/src/generateOobeeClientScanner.ts +32 -1
package/src/index.ts +1 -0
package/src/mergeAxeResults/itemsStore.ts +37 -3
package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
package/src/mergeAxeResults.ts +139 -99
package/src/npmIndex.ts +1 -0
package/src/utils.ts +25 -33
/package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0

package/src/crawlers/crawlDomain.ts CHANGED Viewed

@@ -1,14 +1,15 @@
 import crawlee, { EnqueueStrategy } from 'crawlee';
+import { CrawlRateController } from './crawlRateController.js';
 import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
 import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
-import * as path from 'path';
-import fsp from 'fs/promises';
 import {
   createCrawleeSubFolders,
+  getPreLaunchHook,
   runAxeScript,
   isUrlPdf,
   shouldSkipClickDueToDisallowedHref,
   shouldSkipDueToUnsupportedContent,
+  splitAuthHeaders,
 } from './commonCrawlerFunc.js';
 import constants, {
   UrlsCrawled,
@@ -29,7 +30,7 @@ import {
   getUrlsFromRobotsTxt,
   waitForPageLoaded,
 } from '../constants/common.js';
-import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
+import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
 import {
   handlePdfDownload,
   runPdfScan,
@@ -364,9 +365,7 @@ const crawlDomain = async ({
         // same-domain strategy) still contribute their <a> links above, but
         // clicking every interactive element on them is too slow and starves
         // the crawler of time to discover pages on the primary hostname.
-        const currentHostname = new URL(page.url()).hostname;
-        const seedHostname = new URL(url).hostname;
-        if (currentHostname === seedHostname) {
+        if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
           // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
           try {
             await customEnqueueLinksByClickingElements(page, browserContext);
@@ -382,53 +381,46 @@ const crawlDomain = async ({
   };
   let isAbortingScanNow = false;
+  const rateController = new CrawlRateController(
+    maxRequestsPerCrawl,
+    specifiedMaxConcurrency || constants.maxConcurrency,
+  );
+  const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
   const crawler = register(
     new crawlee.PlaywrightCrawler({
       launchContext: {
         launcher: constants.launcher,
         launchOptions: getPlaywrightLaunchOptions(browser),
-        // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
-        ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
       },
       retryOnBlocked: true,
       browserPoolOptions: {
         useFingerprints: false,
         preLaunchHooks: [
+          getPreLaunchHook(userDataDirectory),
           async (_pageId, launchContext) => {
-            const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
-            // Ensure base exists
-            await fsp.mkdir(baseDir, { recursive: true });
-            // Create a unique subdir per browser
-            const subProfileDir = path.join(
-              baseDir,
-              `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
-            );
-            await fsp.mkdir(subProfileDir, { recursive: true });
-            // Assign to Crawlee's launcher
-            // Crawlee preLaunchHooks expects launchContext to be mutated in-place.
-            // eslint-disable-next-line no-param-reassign
-            launchContext.userDataDir = subProfileDir;
-            // Safely extend launchOptions
             // eslint-disable-next-line no-param-reassign
             launchContext.launchOptions = {
               ...launchContext.launchOptions,
               ignoreHTTPSErrors: true,
               ...playwrightDeviceDetailsObject,
+              ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
               ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
-              ...(extraHTTPHeaders && { extraHTTPHeaders }),
+              ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+              ...(httpCredentials && { httpCredentials }),
             };
-            // Optionally log for debugging
-            // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
           },
         ],
       },
       requestQueue,
+      preNavigationHooks: [
+        async (crawlingContext) => {
+          if (extraHTTPHeaders) {
+            crawlingContext.request.headers = extraHTTPHeaders;
+          }
+        },
+      ],
       postNavigationHooks: [
         async crawlingContext => {
           const { page, request } = crawlingContext;
@@ -527,11 +519,9 @@ const crawlDomain = async ({
           const hasExceededDuration =
             scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
-          if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
-            if (hasExceededDuration) {
-              console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
-              durationExceeded = true;
-            }
+          if (hasExceededDuration) {
+            console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
+            durationExceeded = true;
             isAbortingScanNow = true;
             activeCrawler.autoscaledPool.abort();
             return;
@@ -691,8 +681,7 @@ const crawlDomain = async ({
                 return;
               }
-              // One more check if scanned pages have reached limit due to multi-instances of handler running
-              if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
+              if (rateController.claimSlot()) {
                 guiInfoLog(guiInfoStatusTypes.SCANNED, {
                   numScanned: urlsCrawled.scanned.length,
                   urlScanned: request.url,
@@ -703,6 +692,11 @@ const crawlDomain = async ({
                   pageTitle: results.pageTitle,
                   actualUrl, // i.e. actualUrl
                 });
+                rateController.onSuccess(crawler.autoscaledPool);
+                if (rateController.isLimitReached()) {
+                  isAbortingScanNow = true;
+                  activeCrawler.autoscaledPool.abort();
+                }
                 scannedUrlSet.add(normUrl(request.url));
                 scannedResolvedUrlSet.add(normUrl(actualUrl));
@@ -715,8 +709,7 @@ const crawlDomain = async ({
                 results.actualUrl = actualUrl;
                 await dataset.pushData(results);
               }
-            } else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
-              // One more check if scanned pages have reached limit due to multi-instances of handler running
+            } else if (rateController.claimSlot()) {
               guiInfoLog(guiInfoStatusTypes.SCANNED, {
                 numScanned: urlsCrawled.scanned.length,
                 urlScanned: request.url,
@@ -726,6 +719,11 @@ const crawlDomain = async ({
                 actualUrl: request.url,
                 pageTitle: results.pageTitle,
               });
+              rateController.onSuccess(crawler.autoscaledPool);
+              if (rateController.isLimitReached()) {
+                isAbortingScanNow = true;
+                activeCrawler.autoscaledPool.abort();
+              }
               scannedUrlSet.add(normUrl(request.url));
               scannedResolvedUrlSet.add(normUrl(request.url));
               await dataset.pushData(results);
@@ -777,33 +775,35 @@ const crawlDomain = async ({
               });
             }
           } catch {
-            // Do nothing since the error will be pushed
+            // Recovery failed; Crawlee will retry the request automatically
           }
-          // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
-          // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
-          if (!isAbortingScanNow) {
-            guiInfoLog(guiInfoStatusTypes.ERROR, {
-              numScanned: urlsCrawled.scanned.length,
-              urlScanned: request.url,
-            });
-            urlsCrawled.error.push({
-              url: request.url,
-              pageTitle: request.url,
-              actualUrl: request.url,
-              metadata: STATUS_CODE_METADATA[2],
-            });
-          }
+          // Do not push to urlsCrawled.error here — Crawlee will retry the request
+          // (up to maxRequestRetries, default 3). If all retries are exhausted,
+          // failedRequestHandler will record the error. Pushing here causes
+          // duplicates and false positives for URLs that succeed on retry.
         }
       },
       failedRequestHandler: async ({ request, response }) => {
+        if (isAbortingScanNow) {
+          return;
+        }
+        const status = response?.status();
+        if (rateController.onFailure(status, crawler.autoscaledPool)) {
+          consoleLogger.info(
+            `Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
+          );
+          isAbortingScanNow = true;
+          crawler.autoscaledPool?.abort();
+          return;
+        }
         guiInfoLog(guiInfoStatusTypes.ERROR, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: request.url,
         });
-        const status = response?.status();
         const metadata =
           typeof status === 'number'
             ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
@@ -819,15 +819,13 @@ const crawlDomain = async ({
       },
       maxRequestsPerCrawl: Infinity,
       maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-      ...(process.env.OOBEE_FAST_CRAWLER && {
-        autoscaledPoolOptions: {
-          minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
-          maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-          desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
-          scaleUpStepRatio: 0.99, // Scale up faster
-          scaleDownStepRatio: 0.1, // Scale down slower
-        },
-      }),
+      autoscaledPoolOptions: {
+        minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+        maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+        desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+        scaleUpStepRatio: 0.99, // Scale up faster
+        scaleDownStepRatio: 0.1, // Scale down slower
+      },
     }),
   );
@@ -850,7 +848,7 @@ const crawlDomain = async ({
         .map(item => item.actualUrl || item.url)
         .filter(pageUrl => {
           try {
-            return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
+            return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
           } catch {
             return false;
           }

package/src/crawlers/crawlIntelligentSitemap.ts CHANGED Viewed

@@ -1,13 +1,13 @@
 import fs from 'fs';
 import { chromium, Page } from 'playwright';
 import { EnqueueStrategy } from 'crawlee';
-import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
+import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
 import constants, { FileTypes, guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
 import { consoleLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
 import { ViewportSettingsClass } from '../combine.js';
-import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
+import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
 import { register } from '../utils.js';
 const crawlIntelligentSitemap = async (
@@ -40,6 +40,10 @@ const crawlIntelligentSitemap = async (
   ({ dataset } = await createCrawleeSubFolders(randomToken));
+  // Initialise modified User-Agent early so sitemap discovery requests
+  // don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
+  await initModifiedUserAgent(browser);
   function getHomeUrl(parsedUrl: string) {
     const urlObject = new URL(parsedUrl);
     return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
@@ -54,6 +58,7 @@ const crawlIntelligentSitemap = async (
     let sitemapLink = '';
     const launchOptions = getPlaywrightLaunchOptions(browser);
+    const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
     let context;
     let browserInstance;
@@ -61,18 +66,25 @@ const crawlIntelligentSitemap = async (
       const effectiveUserDataDirectory = userDataDirectory || '';
       context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
         ...launchOptions,
-        ...(extraHTTPHeaders && { extraHTTPHeaders }),
+        ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+        ...(httpCredentials && { httpCredentials }),
+        ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
       });
       register(context);
     } else {
-      // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
       browserInstance = await constants.launcher.launch(launchOptions);
       register(browserInstance as unknown as { close: () => Promise<void> });
       context = await browserInstance.newContext({
-        ...(extraHTTPHeaders && { extraHTTPHeaders }),
+        ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+        ...(httpCredentials && { httpCredentials }),
+        ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
       });
     }
+    if (authHeader) {
+      await addAuthRouteHandler(context, link, authHeader);
+    }
     const page = await context.newPage();
     for (const path of sitemapPaths) {
@@ -93,7 +105,7 @@ const crawlIntelligentSitemap = async (
   const checkUrlExists = async (page: Page, parsedUrl: string) => {
     try {
       const response = await page.goto(parsedUrl);
-      return response.ok();
+      return response?.ok() ?? false;
     } catch (e) {
       consoleLogger.error(e);
       return false;
@@ -105,7 +117,7 @@ const crawlIntelligentSitemap = async (
   try {
     sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
     if (sitemapUrls.length > 0) {
-      console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
+      consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
       sitemapExist = true;
     }
   } catch (error) {
@@ -125,7 +137,7 @@ const crawlIntelligentSitemap = async (
   }
   if (!sitemapExist) {
-    console.log('Unable to find sitemap. Commencing website crawl instead.');
+    consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
     return await crawlDomain({
       url,
       randomToken,
@@ -157,7 +169,7 @@ const crawlIntelligentSitemap = async (
       break;
     }
-    console.log(`Processing sitemap: ${currentSitemapUrl}`);
+    consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
     urlsCrawledFinal = await crawlSitemap({
       sitemapUrl: currentSitemapUrl,
       randomToken,
@@ -187,7 +199,7 @@ const crawlIntelligentSitemap = async (
   const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
   if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
-    console.log(
+    consoleLogger.info(
       `Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`,
     );
     urlsCrawledFinal = await crawlDomain({
@@ -212,7 +224,7 @@ const crawlIntelligentSitemap = async (
       scanDuration: remainingScanDuration,
     });
   } else if (!hasDurationRemaining) {
-    console.log(
+    consoleLogger.info(
       `Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`,
     );
     durationExceeded = true;

package/src/crawlers/crawlRateController.ts ADDED Viewed

@@ -0,0 +1,63 @@
+import { consoleLogger } from '../logs.js';
+export class CrawlRateController {
+  private scannedCount = 0;
+  private readonly maxPages: number;
+  private consecutiveFailures = 0;
+  private consecutiveSuccesses = 0;
+  private readonly maxConsecutiveFailures: number;
+  private readonly originalMaxConcurrency: number;
+  private static readonly RECOVERY_INTERVAL = 10;
+  constructor(maxRequestsPerCrawl: number, maxConcurrency: number) {
+    this.maxPages = maxRequestsPerCrawl;
+    this.maxConsecutiveFailures = Number(process.env.OOBEE_CONSECUTIVE_MAX_RETRIES) || 100;
+    this.originalMaxConcurrency = maxConcurrency;
+  }
+  claimSlot(): boolean {
+    if (this.scannedCount >= this.maxPages) {
+      return false;
+    }
+    this.scannedCount++;
+    return true;
+  }
+  onSuccess(pool?: { maxConcurrency: number }): void {
+    this.consecutiveFailures = 0;
+    this.consecutiveSuccesses++;
+    if (pool && this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0) {
+      if (pool.maxConcurrency < this.originalMaxConcurrency) {
+        pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
+        consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
+      }
+    }
+  }
+  onFailure(httpStatus: number | undefined, pool?: { maxConcurrency: number }): boolean {
+    if (typeof httpStatus !== 'number' || httpStatus < 400) {
+      return false;
+    }
+    this.consecutiveSuccesses = 0;
+    this.consecutiveFailures++;
+    if (pool && pool.maxConcurrency > 1) {
+      pool.maxConcurrency = Math.max(1, Math.floor(pool.maxConcurrency / 2));
+      consoleLogger.info(
+        `Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`,
+      );
+    }
+    if (this.consecutiveFailures >= this.maxConsecutiveFailures) {
+      return true;
+    }
+    return false;
+  }
+  isLimitReached(): boolean {
+    return this.scannedCount >= this.maxPages;
+  }
+}

package/src/crawlers/crawlSitemap.ts CHANGED Viewed

@@ -1,9 +1,9 @@
 import crawlee, { EnqueueStrategy, LaunchContext, Request, RequestList, Dataset } from 'crawlee';
+import { CrawlRateController } from './crawlRateController.js';
 import fs from 'fs';
-import * as path from 'path';
-import fsp from 'fs/promises';
 import {
   createCrawleeSubFolders,
+  getPreLaunchHook,
   preNavigationHooks,
   runAxeScript,
   isUrlPdf,
@@ -30,7 +30,7 @@ import {
   mapPdfScanResults,
   doPdfScreenshots,
 } from './pdfScanFunc.js';
-import { guiInfoLog } from '../logs.js';
+import { consoleLogger, guiInfoLog } from '../logs.js';
 import { ViewportSettingsClass } from '../combine.js';
 const crawlSitemap = async ({
@@ -81,6 +81,10 @@ const crawlSitemap = async ({
   let urlsCrawled: UrlsCrawled;
   let durationExceeded = false;
   let isAbortingScan = false;
+  const rateController = new CrawlRateController(
+    maxRequestsPerCrawl,
+    specifiedMaxConcurrency || constants.maxConcurrency,
+  );
   if (fromCrawlIntelligentSitemap) {
     dataset = datasetFromIntelligent;
@@ -125,39 +129,20 @@ const crawlSitemap = async ({
       launchContext: {
         launcher: constants.launcher,
         launchOptions: getPlaywrightLaunchOptions(browser),
-        // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
-        ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
       },
       retryOnBlocked: true,
       browserPoolOptions: {
         useFingerprints: false,
         preLaunchHooks: [
+          getPreLaunchHook(userDataDirectory),
           async (_pageId, launchContext) => {
-            const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
-            // Ensure base exists
-            await fsp.mkdir(baseDir, { recursive: true });
-            // Create a unique subdir per browser
-            const subProfileDir = path.join(
-              baseDir,
-              `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
-            );
-            await fsp.mkdir(subProfileDir, { recursive: true });
-            // Assign to Crawlee's launcher
-            launchContext.userDataDir = subProfileDir;
-            // Safely extend launchOptions
             launchContext.launchOptions = {
               ...launchContext.launchOptions,
               ignoreHTTPSErrors: true,
               ...playwrightDeviceDetailsObject,
+              ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
               ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
             };
-            // Optionally log for debugging
-            // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
           },
         ],
       },
@@ -259,13 +244,11 @@ const crawlSitemap = async ({
           const hasExceededDuration =
             scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
-          if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
+          if (hasExceededDuration) {
+            consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
+            durationExceeded = true;
             isAbortingScan = true;
-            if (hasExceededDuration) {
-              console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
-              durationExceeded = true;
-            }
-            crawler.autoscaledPool.abort(); // stops new requests
+            crawler.autoscaledPool.abort();
             return;
           }
@@ -376,26 +359,33 @@ const crawlSitemap = async ({
               // Page/context was destroyed during navigation — handled by outer catch
             }
-            guiInfoLog(guiInfoStatusTypes.SCANNED, {
-              numScanned: urlsCrawled.scanned.length,
-              urlScanned: request.url,
-            });
+            if (rateController.claimSlot()) {
+              guiInfoLog(guiInfoStatusTypes.SCANNED, {
+                numScanned: urlsCrawled.scanned.length,
+                urlScanned: request.url,
+              });
-            urlsCrawled.scanned.push({
-              url: request.url,
-              pageTitle: results.pageTitle,
-              actualUrl, // i.e. actualUrl
-            });
+              urlsCrawled.scanned.push({
+                url: request.url,
+                pageTitle: results.pageTitle,
+                actualUrl, // i.e. actualUrl
+              });
+              rateController.onSuccess(crawler.autoscaledPool);
+              if (rateController.isLimitReached()) {
+                isAbortingScan = true;
+                crawler.autoscaledPool.abort();
+              }
-            urlsCrawled.scannedRedirects.push({
-              fromUrl: request.url,
-              toUrl: actualUrl,
-            });
+              urlsCrawled.scannedRedirects.push({
+                fromUrl: request.url,
+                toUrl: actualUrl,
+              });
-            results.url = request.url;
-            results.actualUrl = actualUrl;
+              results.url = request.url;
+              results.actualUrl = actualUrl;
-            await dataset.pushData(results);
+              await dataset.pushData(results);
+            }
           } else {
             guiInfoLog(guiInfoStatusTypes.SKIPPED, {
               numScanned: urlsCrawled.scanned.length,
@@ -420,20 +410,10 @@ const crawlSitemap = async ({
             }
           }
         } catch (e) {
-          if (!isAbortingScan) {
-            guiInfoLog(guiInfoStatusTypes.ERROR, {
-              numScanned: urlsCrawled.scanned.length,
-              urlScanned: request.url,
-            });
-            urlsCrawled.error.push({
-              url: request.url,
-              pageTitle: request.url,
-              actualUrl: request.url,
-              metadata: STATUS_CODE_METADATA[2],
-              httpStatusCode: 0,
-            });
-          }
+          // Do not push to urlsCrawled.error here — Crawlee will retry the request
+          // (up to maxRequestRetries, default 3). If all retries are exhausted,
+          // failedRequestHandler will record the error. Pushing here causes
+          // duplicates and false positives for URLs that succeed on retry.
         }
       },
       failedRequestHandler: async ({ request, response, error }) => {
@@ -441,12 +421,21 @@ const crawlSitemap = async ({
           return;
         }
+        const status = response?.status();
+        if (rateController.onFailure(status, crawler.autoscaledPool)) {
+          consoleLogger.info(
+            `Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
+          );
+          isAbortingScan = true;
+          crawler.autoscaledPool?.abort();
+          return;
+        }
         guiInfoLog(guiInfoStatusTypes.ERROR, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: request.url,
         });
-        const status = response?.status();
         const metadata =
           typeof status === 'number'
             ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
@@ -463,15 +452,13 @@ const crawlSitemap = async ({
       },
       maxRequestsPerCrawl: Infinity,
       maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-      ...(process.env.OOBEE_FAST_CRAWLER && {
-        autoscaledPoolOptions: {
-          minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
-          maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
-          desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
-          scaleUpStepRatio: 0.99, // Scale up faster
-          scaleDownStepRatio: 0.1, // Scale down slower
-        },
-      }),
+      autoscaledPoolOptions: {
+        minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+        maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+        desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+        scaleUpStepRatio: 0.99, // Scale up faster
+        scaleDownStepRatio: 0.1, // Scale down slower
+      },
     }),
   );

package/src/crawlers/runCustom.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 /* eslint-env browser */
-import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
+import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
 import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
 import constants, {
   getIntermediateScreenshotsPath,
@@ -60,6 +60,7 @@ const runCustom = async (
   blacklistedPatterns: string[] | null,
   includeScreenshots: boolean,
   initialCustomFlowLabel?: string,
+  extraHTTPHeaders?: Record<string, string>,
 ) => {
   // checks and delete datasets path if it already exists
   process.env.CRAWLEE_STORAGE_DIR = randomToken;
@@ -109,6 +110,8 @@ const runCustom = async (
       ...customArgs,
     ];
+    const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
     const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
       ...baseLaunchOptions,
       args: mergedArgs,
@@ -118,8 +121,14 @@ const runCustom = async (
       viewport: null,
       ...(hasCustomViewport ? contextDeviceOptions : {}),
       userAgent: process.env.OOBEE_USER_AGENT || (deviceUserAgent as string | undefined),
+      ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+      ...(httpCredentials && { httpCredentials }),
     });
+    if (authHeader) {
+      await addAuthRouteHandler(context, url, authHeader);
+    }
     register(context);
     processPageParams.stopAll = async () => {