npm - @govtechsg/oobee - Versions diffs - 0.10.92 → 0.10.94 - Mend

@govtechsg/oobee 0.10.92 → 0.10.94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

package/src/constants/common.ts CHANGED Viewed

@@ -359,8 +359,11 @@ const checkUrlConnectivityWithBrowser = async (
     }
   }
-  // Ensure Accept header for non-html content fallback
-  extraHTTPHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
+  // Ensure Accept header for non-html content fallback — use a local copy to avoid
+  // mutating the caller's extraHTTPHeaders object (which is later checked by crawlers
+  // to decide whether to enable preNavigationHooks header rewriting).
+  const localHeaders = { ...extraHTTPHeaders };
+  localHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
   await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
@@ -377,9 +380,21 @@ const checkUrlConnectivityWithBrowser = async (
   } = rawDevice;
   const launchOptions = getPlaywrightLaunchOptions(browserToRun);
+  const { Authorization, ...nonAuthHeaders } = localHeaders || {};
+  let httpCredentials = undefined;
+  if (Authorization?.startsWith('Basic ')) {
+    const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
+    const colonIdx = decoded.indexOf(':');
+    if (colonIdx > 0) {
+      httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
+    }
+  }
   const contextOptions: Record<string, unknown> = {
     ...restDevice,
-    ...(extraHTTPHeaders && { extraHTTPHeaders }),
+    ...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
+    ...(httpCredentials && { httpCredentials }),
     ignoreHTTPSErrors: true,
     ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
   };
@@ -421,6 +436,26 @@ const checkUrlConnectivityWithBrowser = async (
   }
   try {
+    // Only install the Authorization route interceptor when an Authorization header
+    // is present that could not be converted to httpCredentials (e.g. a Bearer token),
+    // so typical public scans skip route interception (and its performance cost) during the checkUrl phase.
+    if (Object.keys(localHeaders).length > 0) {
+      if (Authorization && !httpCredentials) {
+        const entryOrigin = new URL(url).origin;
+        await browserContext.route('**/*', async (route: any, request: any) => {
+          try {
+            if (new URL(request.url()).origin === entryOrigin) {
+              await route.continue({ headers: { ...request.headers(), Authorization } });
+            } else {
+              await route.continue();
+            }
+          } catch {
+            await route.continue();
+          }
+        });
+      }
+    }
     const page = await browserContext.newPage();
     // Block native Chrome download UI
@@ -431,16 +466,6 @@ const checkUrlConnectivityWithBrowser = async (
       consoleLogger.info(`Unable to set download deny: ${(e as Error).message}`);
     }
-    // OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
-    // This allows the "Connectivity Check" to pass as soon as HTML is ready
-    await page.route('**/*', (route) => {
-      const type = route.request().resourceType();
-      if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
-        return route.abort();
-      }
-      return route.continue();
-    });
     // STEP 2: Navigate (follows server-side redirects)
     page.once('download', () => {
       res.status = constants.urlCheckStatuses.notASupportedDocument.code;
@@ -549,7 +574,7 @@ export const isSitemapContent = (content: string) => {
   }
   const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
-  const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
+  const regexForXmlSitemap = new RegExp('<(?:urlset|sitemapindex|feed|rss)+?.*>', 'gmi');
   if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
     // is an XML sitemap wrapped in a HTML document
     return true;
@@ -572,8 +597,22 @@ export const checkUrl = async (
   extraHTTPHeaders: Record<string, string>,
   fileTypes: FileTypes,
 ) => {
+  let urlToCheck = url;
+  if (scanner === ScannerTypes.LOCALFILE) {
+    if (!isFilePath(url)) {
+      const res = new RES();
+      res.status = constants.urlCheckStatuses.notALocalFile.code;
+      return res;
+    }
+    if (!url.toLowerCase().startsWith('file://')) {
+      urlToCheck = pathToFileURL(path.resolve(url)).toString();
+    }
+  }
   const res = await checkUrlConnectivityWithBrowser(
-    url,
+    urlToCheck,
     browser,
     clonedDataDir,
     playwrightDeviceDetailsObject,
@@ -661,6 +700,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
     ruleset,
     generateJsonFiles,
     scanDuration,
+    finalUrl,
   } = argv;
   const extraHTTPHeaders = parseHeaders(header);
@@ -694,6 +734,10 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
     url = temp.toString();
   }
+  // Keep browser-resolved URL (if provided by pre-check flow) as canonical entry URL.
+  // For local file paths, keep using the normalized `url` value below.
+  const resolvedEntryUrl = finalUrl && !isFilePath(finalUrl) ? finalUrl : url;
   // construct filename for scan results
   const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
   const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
@@ -738,7 +782,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
   return {
     type: scanner,
     url,
-    entryUrl: url,
+    entryUrl: resolvedEntryUrl,
     isHeadless: headless,
     deviceChosen,
     customDevice,
@@ -989,6 +1033,8 @@ export const getLinksFromSitemap = async (
   const scannedSitemaps = new Set<string>();
   const sitemapLinkCounts: Record<string, number> = {};
   const allUrls = new Set<string>(); // all discovered URLs (lightweight strings)
+  const isImageSitemapUrl = (candidateUrl: string) =>
+    /(^|\/)image-sitemap(?:-index)?(?:-\d+)?\.xml(?:$|[?#])/i.test(candidateUrl);
   const addToUrlList = (url: string) => {
     if (!url) return;
@@ -1072,6 +1118,11 @@ export const getLinksFromSitemap = async (
     let data;
     let sitemapType;
+    if (isImageSitemapUrl(url)) {
+      consoleLogger.info(`Skipping image sitemap: ${url}`);
+      return;
+    }
     if (scannedSitemaps.has(url)) {
       // Skip processing if the sitemap has already been scanned
       return;
@@ -1127,11 +1178,28 @@ export const getLinksFromSitemap = async (
         const page = await browserContext.newPage();
-        await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
+        // Use 'domcontentloaded' instead of 'networkidle' — sitemap XMLs with
+        // XSL stylesheet references (e.g. <?xml-stylesheet ...?>) cause the browser
+        // to fetch and apply the stylesheet, which may load additional resources
+        // (fonts, CSS, images) that prevent 'networkidle' from ever being reached.
+        const response = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
+        // Prefer the raw response body — this gives us the original XML before
+        // the browser applies any XSL transformation (which would turn the XML
+        // into rendered HTML, losing the sitemap structure).
+        if (response) {
+          try {
+            data = await response.text();
+          } catch {
+            // response.text() can fail if the body was already consumed or
+            // if a redirect occurred; fall through to DOM extraction below.
+          }
+        }
-        if ((await page.locator('body').count()) > 0) {
-          data = await page.locator('body').innerText();
-        } else {
+        if (!data) {
+          if ((await page.locator('body').count()) > 0) {
+            data = await page.locator('body').innerText();
+          } else {
           const urlSet = page.locator('urlset');
           const sitemapIndex = page.locator('sitemapindex');
           const rss = page.locator('rss');
@@ -1146,6 +1214,7 @@ export const getLinksFromSitemap = async (
             data = await rss.evaluate(elem => elem.outerHTML);
           } else if (await isRoot(feed)) {
             data = await feed.evaluate(elem => elem.outerHTML);
+            }
           }
         }
       } finally {
@@ -1169,39 +1238,65 @@ export const getLinksFromSitemap = async (
     }
     const $ = cheerio.load(data, { xml: true });
+    const countBefore = allUrls.size;
     // This case is when the document is not an XML format document
     if ($(':root').length === 0) {
       processNonStandardSitemap(data);
+      const linksFromThisSitemap = allUrls.size - countBefore;
+      if (linksFromThisSitemap > 0) {
+        sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
+      }
       return;
     }
     // Root element
     const root = $(':root')[0];
+    const hasImageNamespace = Object.values(root?.attribs ?? {}).some(
+      attribVal => typeof attribVal === 'string' && attribVal.toLowerCase().includes('sitemap-image'),
+    );
-    const { xmlns } = root.attribs;
+    if (hasImageNamespace) {
+      consoleLogger.info(`Skipping image sitemap: ${url}`);
+      return;
+    }
+    const rootName = root?.name?.toLowerCase().split(':').pop() ?? '';
+    const hasXmlSitemapIndexTag = /<\s*(?:[a-z0-9_-]+:)?sitemapindex\b/i.test(data);
+    const hasXmlUrlsetTag = /<\s*(?:[a-z0-9_-]+:)?urlset\b/i.test(data);
-    const xmlFormatNamespace = '/schemas/sitemap';
-    if (root.name === 'urlset' && xmlns.includes(xmlFormatNamespace)) {
+    if (rootName === 'urlset') {
       sitemapType = constants.xmlSitemapTypes.xml;
-    } else if (root.name === 'sitemapindex' && xmlns.includes(xmlFormatNamespace)) {
+    } else if (rootName === 'sitemapindex') {
       sitemapType = constants.xmlSitemapTypes.xmlIndex;
-    } else if (root.name === 'rss') {
+    } else if (rootName === 'rss') {
       sitemapType = constants.xmlSitemapTypes.rss;
-    } else if (root.name === 'feed') {
+    } else if (rootName === 'feed') {
       sitemapType = constants.xmlSitemapTypes.atom;
+    } else if (hasXmlSitemapIndexTag) {
+      sitemapType = constants.xmlSitemapTypes.xmlIndex;
+    } else if (hasXmlUrlsetTag) {
+      sitemapType = constants.xmlSitemapTypes.xml;
     } else {
       sitemapType = constants.xmlSitemapTypes.unknown;
     }
-    const countBefore = allUrls.size;
     switch (sitemapType) {
       case constants.xmlSitemapTypes.xmlIndex:
-        consoleLogger.info(`This is a XML format sitemap index.`);
+        consoleLogger.info(`This is a XML format sitemap index: ${url}`);
         for (const childSitemapUrl of $('loc')) {
-          const childSitemapUrlText = $(childSitemapUrl).text();
-          if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
+          const childSitemapUrlText = $(childSitemapUrl).text().trim();
+          if (!childSitemapUrlText) {
+            continue;
+          }
+          const childSitemapPath = childSitemapUrlText.split(/[?#]/)[0].toLowerCase();
+          if (childSitemapPath.endsWith('.xml') || childSitemapPath.endsWith('.txt')) {
+            if (isImageSitemapUrl(childSitemapUrlText)) {
+              consoleLogger.info(`Skipping image sitemap: ${childSitemapUrlText}`);
+              continue;
+            }
             await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
           } else {
             addToUrlList(childSitemapUrlText); // Add regular URLs to the list
@@ -1209,19 +1304,19 @@ export const getLinksFromSitemap = async (
         }
         break;
       case constants.xmlSitemapTypes.xml:
-        consoleLogger.info(`This is a XML format sitemap.`);
+        consoleLogger.info(`This is a XML format sitemap: ${url}`);
         await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
         break;
       case constants.xmlSitemapTypes.rss:
-        consoleLogger.info(`This is a RSS format sitemap.`);
+        consoleLogger.info(`This is a RSS format sitemap: ${url}`);
         await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
         break;
       case constants.xmlSitemapTypes.atom:
-        consoleLogger.info(`This is a Atom format sitemap.`);
+        consoleLogger.info(`This is a Atom format sitemap: ${url}`);
         await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
         break;
       default:
-        consoleLogger.info(`This is an unrecognised XML sitemap format.`);
+        consoleLogger.info(`This is an unrecognised XML sitemap format: ${url}`);
         processNonStandardSitemap(data);
     }
@@ -2171,6 +2266,7 @@ export const isFilePath = (url: string): boolean => {
   const driveLetterPattern = /^[A-Z]:/i;
   const backslashPattern = /\\/;
   return (
+    url.toLowerCase().startsWith('file://') ||
     url.startsWith('/') ||
     driveLetterPattern.test(url) ||
     backslashPattern.test(url) ||

package/src/crawlers/commonCrawlerFunc.ts CHANGED Viewed

@@ -1145,14 +1145,68 @@ export const createCrawleeSubFolders = async (
 export const preNavigationHooks = (extraHTTPHeaders: Record<string, string>) => {
   return [
     async (crawlingContext: CrawlingContext, gotoOptions: PlaywrightGotoOptions) => {
-      if (extraHTTPHeaders) {
+      if (extraHTTPHeaders && Object.keys(extraHTTPHeaders).length > 0) {
         crawlingContext.request.headers = extraHTTPHeaders;
       }
-      gotoOptions = { waitUntil: 'networkidle', timeout: 30000 };
+      // Use domcontentloaded — fires as soon as the DOM is parsed, before
+      // images/stylesheets/network requests settle. This avoids indefinite
+      // hangs on sites with WebSockets, analytics polling, or infinite-scroll
+      // beacons that never reach networkidle. Further page stability is
+      // handled by waitForPageLoaded() in each crawler's requestHandler and
+      // by the DOM mutation observer in postNavigationHooks.
+      if (gotoOptions) {
+        gotoOptions.waitUntil = 'domcontentloaded';
+        gotoOptions.timeout = 30000;
+      }
     },
   ];
 };
+/**
+ * Splits extraHTTPHeaders into auth and non-auth parts.
+ * Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
+ * Non-auth headers are safe to set globally on the browser context.
+ */
+export const splitAuthHeaders = (extraHTTPHeaders?: Record<string, string>) => {
+  const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
+  return {
+    authHeader: Authorization || null,
+    nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
+    httpCredentials: (() => {
+      if (!Authorization?.startsWith('Basic ')) return null;
+      const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
+      const colonIdx = decoded.indexOf(':');
+      if (colonIdx <= 0) return null;
+      return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
+    })(),
+  };
+};
+/**
+ * Adds a route handler to a BrowserContext that sends the Authorization header
+ * only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
+ */
+export const addAuthRouteHandler = async (
+  context: BrowserContext,
+  entryUrl: string,
+  authHeader: string | null
+) => {
+  if (!authHeader) return;
+  const entryOrigin = new URL(entryUrl).origin;
+  await context.route('**/*', async (route, request) => {
+    try {
+      if (new URL(request.url()).origin === entryOrigin) {
+        await route.continue({ headers: { ...request.headers(), Authorization: authHeader } });
+      } else {
+        await route.continue();
+      }
+    } catch {
+      await route.continue();
+    }
+  });
+};
 export const postNavigationHooks = [
   async (_crawlingContext: CrawlingContext) => {
     guiInfoLog(guiInfoStatusTypes.COMPLETED, {});

package/src/crawlers/crawlDomain.ts CHANGED Viewed

@@ -5,10 +5,12 @@ import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
 import {
   createCrawleeSubFolders,
   getPreLaunchHook,
+  preNavigationHooks,
   runAxeScript,
   isUrlPdf,
   shouldSkipClickDueToDisallowedHref,
   shouldSkipDueToUnsupportedContent,
+  splitAuthHeaders,
 } from './commonCrawlerFunc.js';
 import constants, {
   UrlsCrawled,
@@ -385,6 +387,8 @@ const crawlDomain = async ({
     specifiedMaxConcurrency || constants.maxConcurrency,
   );
+  const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
   const crawler = register(
     new crawlee.PlaywrightCrawler({
       launchContext: {
@@ -404,12 +408,18 @@ const crawlDomain = async ({
               ...playwrightDeviceDetailsObject,
               ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
               ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
-              ...(extraHTTPHeaders && { extraHTTPHeaders }),
+              ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+              ...(httpCredentials && { httpCredentials }),
             };
           },
         ],
       },
       requestQueue,
+      maxRequestRetries: 3,
+      maxSessionRotations: 1,
+      preNavigationHooks: [
+        ...preNavigationHooks(extraHTTPHeaders),
+      ],
       postNavigationHooks: [
         async crawlingContext => {
           const { page, request } = crawlingContext;

package/src/crawlers/crawlIntelligentSitemap.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import fs from 'fs';
 import { chromium, Page } from 'playwright';
 import { EnqueueStrategy } from 'crawlee';
-import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
+import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
 import constants, { FileTypes, guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
 import { consoleLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
@@ -58,6 +58,7 @@ const crawlIntelligentSitemap = async (
     let sitemapLink = '';
     const launchOptions = getPlaywrightLaunchOptions(browser);
+    const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
     let context;
     let browserInstance;
@@ -65,20 +66,25 @@ const crawlIntelligentSitemap = async (
       const effectiveUserDataDirectory = userDataDirectory || '';
       context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
         ...launchOptions,
-        ...(extraHTTPHeaders && { extraHTTPHeaders }),
+        ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+        ...(httpCredentials && { httpCredentials }),
         ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
       });
       register(context);
     } else {
-      // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
       browserInstance = await constants.launcher.launch(launchOptions);
       register(browserInstance as unknown as { close: () => Promise<void> });
       context = await browserInstance.newContext({
-        ...(extraHTTPHeaders && { extraHTTPHeaders }),
+        ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+        ...(httpCredentials && { httpCredentials }),
         ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
       });
     }
+    if (authHeader) {
+      await addAuthRouteHandler(context, link, authHeader);
+    }
     const page = await context.newPage();
     for (const path of sitemapPaths) {

package/src/crawlers/crawlSitemap.ts CHANGED Viewed

@@ -7,6 +7,7 @@ import {
   preNavigationHooks,
   runAxeScript,
   isUrlPdf,
+  splitAuthHeaders,
 } from './commonCrawlerFunc.js';
 import constants, {
@@ -85,6 +86,7 @@ const crawlSitemap = async ({
     maxRequestsPerCrawl,
     specifiedMaxConcurrency || constants.maxConcurrency,
   );
+  const initialNoSuccessFailureAbortThreshold = Math.max(5, Math.min(maxRequestsPerCrawl, 25));
   if (fromCrawlIntelligentSitemap) {
     dataset = datasetFromIntelligent;
@@ -119,6 +121,7 @@ const crawlSitemap = async ({
   const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
   const { playwrightDeviceDetailsObject } = viewportSettings;
   const { maxConcurrency } = constants;
+  const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
   const requestList = await RequestList.open({
     sources: linksFromSitemap,
@@ -142,11 +145,15 @@ const crawlSitemap = async ({
               ...playwrightDeviceDetailsObject,
               ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
               ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
+              ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+              ...(httpCredentials && { httpCredentials }),
             };
           },
         ],
       },
       requestList,
+      maxRequestRetries: 3,
+      maxSessionRotations: 1,
       postNavigationHooks: [
         async ({ page }) => {
           try {
@@ -197,6 +204,7 @@ const crawlSitemap = async ({
         },
       ],
       preNavigationHooks: [
+        ...preNavigationHooks(extraHTTPHeaders),
         async ({ request, page }, gotoOptions) => {
           const url = request.url.toLowerCase();
@@ -213,8 +221,6 @@ const crawlSitemap = async ({
             return;
           }
-          preNavigationHooks(extraHTTPHeaders);
         },
       ],
       requestHandlerTimeoutSecs: 90,
@@ -449,6 +455,17 @@ const crawlSitemap = async ({
           httpStatusCode: typeof status === 'number' ? status : 0,
         });
         crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
+        if (
+          urlsCrawled.scanned.length === 0 &&
+          urlsCrawled.error.length >= initialNoSuccessFailureAbortThreshold
+        ) {
+          consoleLogger.info(
+            `Aborting sitemap crawl: ${urlsCrawled.error.length} failed pages with 0 successful scans.`,
+          );
+          isAbortingScan = true;
+          crawler.autoscaledPool?.abort();
+        }
       },
       maxRequestsPerCrawl: Infinity,
       maxConcurrency: specifiedMaxConcurrency || maxConcurrency,

package/src/crawlers/custom/utils.ts CHANGED Viewed

@@ -1228,19 +1228,32 @@ export const initNewPage = async (page, pageClosePromises, processPageParams, pa
         const allowed = isOverlayAllowed(page.url(), processPageParams.entryUrl);
         if (!allowed) {
-          await Promise.race([
-            removeOverlayMenu(page),
-            new Promise((_, reject) => {
-              setTimeout(() => {
-                reject(
-                  new Error(
-                    `removeOverlayMenu timed out after ${OVERLAY_OPERATION_TIMEOUT_MS}ms`,
-                  ),
-                );
-              }, OVERLAY_OPERATION_TIMEOUT_MS);
-            }),
-          ]);
-          return;
+          // On macOS and Windows the custom flow always runs headful.
+          // The URL guard (urlGuard.ts) intercepts non-http/https navigations
+          // and calls page.goto(safeUrl). Do NOT remove the overlay here —
+          // removing it causes it to stay permanently disabled if the redirect
+          // races ahead of the next reconcile cycle.
+          // Instead, fall through to the hasOverlay / addOverlayMenu block so
+          // the overlay is (re-)injected even on transient non-http/https URLs
+          // (e.g. file://, about:blank) and again after the guard's redirect.
+          const isDesktopHost = process.platform === 'darwin' || process.platform === 'win32';
+          if (!isDesktopHost) {
+            // On Linux / Docker: remove overlay for non-http/https URLs and stop.
+            await Promise.race([
+              removeOverlayMenu(page),
+              new Promise((_, reject) => {
+                setTimeout(() => {
+                  reject(
+                    new Error(
+                      `removeOverlayMenu timed out after ${OVERLAY_OPERATION_TIMEOUT_MS}ms`,
+                    ),
+                  );
+                }, OVERLAY_OPERATION_TIMEOUT_MS);
+              }),
+            ]);
+            return;
+          }
+          // Desktop hosts: skip removal and fall through to re-add overlay.
         }
         const hasOverlay = await page.evaluate(() =>

package/src/crawlers/guards/urlGuard.ts CHANGED Viewed

@@ -35,8 +35,18 @@ export function addUrlGuardScript(context, opts = {}) {
       });
     const restoreToSafeUrl = async (page, attemptedUrl) => {
+      const safeUrl = lastAllowedUrlByPage.get(page) || fallbackUrl || 'about:blank';
+      // Only redirect if the safe URL is itself an allowed (http/https) URL.
+      // If the entry URL is file:// (e.g. scanning a local HTML file), the
+      // fallback is also file://, and redirecting would create an infinite loop:
+      //   file:// → restoreToSafeUrl → file:// → framenavigated → restoreToSafeUrl → …
+      try {
+        const safeObj = new URL(safeUrl);
+        if (!ALLOWED_PROTOCOLS.has(safeObj.protocol)) return;
+      } catch {
+        return;
+      }
       try {
-        const safeUrl = lastAllowedUrlByPage.get(page) || fallbackUrl || 'about:blank';
         await page.goto(safeUrl, { waitUntil: 'domcontentloaded' });
       } catch {
         // page might be closing; ignore
@@ -58,6 +68,13 @@ export function addUrlGuardScript(context, opts = {}) {
         lastAllowedUrlByPage.set(page, urlObj.toString());
         return;
       }
+      // Skip browser-internal transitional states (about:blank, about:srcdoc, etc.).
+      // page.goto() navigates through about:blank before loading the target URL.
+      // Redirecting from about: creates an infinite loop:
+      //   restoreToSafeUrl → page.goto(safeUrl) → about:blank → restoreToSafeUrl → …
+      if (urlObj.protocol === 'about:') return;
       await restoreToSafeUrl(page, urlStr);
     });
   };

package/src/crawlers/runCustom.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 /* eslint-env browser */
-import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
+import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
 import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
 import constants, {
   getIntermediateScreenshotsPath,
@@ -60,6 +60,7 @@ const runCustom = async (
   blacklistedPatterns: string[] | null,
   includeScreenshots: boolean,
   initialCustomFlowLabel?: string,
+  extraHTTPHeaders?: Record<string, string>,
 ) => {
   // checks and delete datasets path if it already exists
   process.env.CRAWLEE_STORAGE_DIR = randomToken;
@@ -109,6 +110,8 @@ const runCustom = async (
       ...customArgs,
     ];
+    const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
     const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
       ...baseLaunchOptions,
       args: mergedArgs,
@@ -118,8 +121,14 @@ const runCustom = async (
       viewport: null,
       ...(hasCustomViewport ? contextDeviceOptions : {}),
       userAgent: process.env.OOBEE_USER_AGENT || (deviceUserAgent as string | undefined),
+      ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
+      ...(httpCredentials && { httpCredentials }),
     });
+    if (authHeader) {
+      await addAuthRouteHandler(context, url, authHeader);
+    }
     register(context);
     processPageParams.stopAll = async () => {

package/src/generateOobeeClientScanner.ts CHANGED Viewed

@@ -60,7 +60,7 @@ const SENTRY_NODE_VERSION: string = (() => {
   try {
     return _require('@sentry/node/package.json').version as string;
   } catch {
-    return '9.47.1';   // safe fallback matching currently installed version
+    return '10.58.0';   // safe fallback matching currently installed version
   }
 })();