@govtechsg/oobee 0.10.92 → 0.10.94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/AGENTS.md +34 -0
  2. package/README.md +19 -0
  3. package/dist/cli.js +3 -2
  4. package/dist/combine.js +4 -4
  5. package/dist/constants/common.js +136 -49
  6. package/dist/crawlers/commonCrawlerFunc.js +54 -2
  7. package/dist/crawlers/crawlDomain.js +9 -2
  8. package/dist/crawlers/crawlIntelligentSitemap.js +9 -4
  9. package/dist/crawlers/crawlSitemap.js +14 -2
  10. package/dist/crawlers/custom/utils.js +22 -9
  11. package/dist/crawlers/guards/urlGuard.js +19 -1
  12. package/dist/crawlers/runCustom.js +8 -2
  13. package/dist/generateOobeeClientScanner.js +1 -1
  14. package/dist/mergeAxeResults/itemsStore.js +32 -3
  15. package/dist/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
  16. package/dist/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
  17. package/dist/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
  18. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
  19. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
  20. package/dist/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
  21. package/dist/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
  22. package/dist/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
  23. package/dist/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
  24. package/dist/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
  25. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
  26. package/dist/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
  27. package/dist/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
  28. package/dist/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
  29. package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
  30. package/dist/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
  31. package/dist/static/ejs/partials/styles/styles.ejs +1 -1
  32. package/dist/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
  33. package/dist/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
  34. package/dist/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
  35. package/oobee-client-scanner.js +4 -4
  36. package/package.json +2 -2
  37. package/src/cli.ts +3 -2
  38. package/src/combine.ts +4 -2
  39. package/src/constants/common.ts +131 -35
  40. package/src/crawlers/commonCrawlerFunc.ts +56 -2
  41. package/src/crawlers/crawlDomain.ts +11 -1
  42. package/src/crawlers/crawlIntelligentSitemap.ts +10 -4
  43. package/src/crawlers/crawlSitemap.ts +19 -2
  44. package/src/crawlers/custom/utils.ts +26 -13
  45. package/src/crawlers/guards/urlGuard.ts +18 -1
  46. package/src/crawlers/runCustom.ts +10 -1
  47. package/src/generateOobeeClientScanner.ts +1 -1
  48. package/src/mergeAxeResults/itemsStore.ts +37 -3
  49. package/src/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
  50. package/src/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
  51. package/src/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
  52. package/src/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
  53. package/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
  54. package/src/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
  55. package/src/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
  56. package/src/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
  57. package/src/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
  58. package/src/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
  59. package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
  60. package/src/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
  61. package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
  62. package/src/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
  63. package/src/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
  64. package/src/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
  65. package/src/static/ejs/partials/styles/styles.ejs +1 -1
  66. package/src/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
  67. package/src/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
  68. package/src/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
  69. package/testStaticJSScanner.html +1 -1
  70. /package/{d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt → 67e8137b-1939-4253-8f11-a82bc833cfcb.txt} +0 -0
@@ -359,8 +359,11 @@ const checkUrlConnectivityWithBrowser = async (
359
359
  }
360
360
  }
361
361
 
362
- // Ensure Accept header for non-html content fallback
363
- extraHTTPHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
362
+ // Ensure Accept header for non-html content fallback — use a local copy to avoid
363
+ // mutating the caller's extraHTTPHeaders object (which is later checked by crawlers
364
+ // to decide whether to enable preNavigationHooks header rewriting).
365
+ const localHeaders = { ...extraHTTPHeaders };
366
+ localHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
364
367
 
365
368
  await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
366
369
 
@@ -377,9 +380,21 @@ const checkUrlConnectivityWithBrowser = async (
377
380
  } = rawDevice;
378
381
 
379
382
  const launchOptions = getPlaywrightLaunchOptions(browserToRun);
383
+
384
+ const { Authorization, ...nonAuthHeaders } = localHeaders || {};
385
+ let httpCredentials = undefined;
386
+ if (Authorization?.startsWith('Basic ')) {
387
+ const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
388
+ const colonIdx = decoded.indexOf(':');
389
+ if (colonIdx > 0) {
390
+ httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
391
+ }
392
+ }
393
+
380
394
  const contextOptions: Record<string, unknown> = {
381
395
  ...restDevice,
382
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
396
+ ...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
397
+ ...(httpCredentials && { httpCredentials }),
383
398
  ignoreHTTPSErrors: true,
384
399
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
385
400
  };
@@ -421,6 +436,26 @@ const checkUrlConnectivityWithBrowser = async (
421
436
  }
422
437
 
423
438
  try {
439
+ // Install the same-origin Authorization route handler only when an Authorization
440
+ // header was supplied but could not be converted to httpCredentials (e.g. Bearer),
441
+ // so typical public scans skip request interception during the checkUrl phase.
442
+ if (Object.keys(localHeaders).length > 0) {
443
+ if (Authorization && !httpCredentials) {
444
+ const entryOrigin = new URL(url).origin;
445
+ await browserContext.route('**/*', async (route: any, request: any) => {
446
+ try {
447
+ if (new URL(request.url()).origin === entryOrigin) {
448
+ await route.continue({ headers: { ...request.headers(), Authorization } });
449
+ } else {
450
+ await route.continue();
451
+ }
452
+ } catch {
453
+ await route.continue();
454
+ }
455
+ });
456
+ }
457
+ }
458
+
424
459
  const page = await browserContext.newPage();
425
460
 
426
461
  // Block native Chrome download UI
@@ -431,16 +466,6 @@ const checkUrlConnectivityWithBrowser = async (
431
466
  consoleLogger.info(`Unable to set download deny: ${(e as Error).message}`);
432
467
  }
433
468
 
434
- // OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
435
- // This allows the "Connectivity Check" to pass as soon as HTML is ready
436
- await page.route('**/*', (route) => {
437
- const type = route.request().resourceType();
438
- if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
439
- return route.abort();
440
- }
441
- return route.continue();
442
- });
443
-
444
469
  // STEP 2: Navigate (follows server-side redirects)
445
470
  page.once('download', () => {
446
471
  res.status = constants.urlCheckStatuses.notASupportedDocument.code;
@@ -549,7 +574,7 @@ export const isSitemapContent = (content: string) => {
549
574
  }
550
575
 
551
576
  const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
552
- const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
577
+ const regexForXmlSitemap = new RegExp('<(?:urlset|sitemapindex|feed|rss)+?.*>', 'gmi');
553
578
  if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
554
579
  // is an XML sitemap wrapped in a HTML document
555
580
  return true;
@@ -572,8 +597,22 @@ export const checkUrl = async (
572
597
  extraHTTPHeaders: Record<string, string>,
573
598
  fileTypes: FileTypes,
574
599
  ) => {
600
+ let urlToCheck = url;
601
+
602
+ if (scanner === ScannerTypes.LOCALFILE) {
603
+ if (!isFilePath(url)) {
604
+ const res = new RES();
605
+ res.status = constants.urlCheckStatuses.notALocalFile.code;
606
+ return res;
607
+ }
608
+
609
+ if (!url.toLowerCase().startsWith('file://')) {
610
+ urlToCheck = pathToFileURL(path.resolve(url)).toString();
611
+ }
612
+ }
613
+
575
614
  const res = await checkUrlConnectivityWithBrowser(
576
- url,
615
+ urlToCheck,
577
616
  browser,
578
617
  clonedDataDir,
579
618
  playwrightDeviceDetailsObject,
@@ -661,6 +700,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
661
700
  ruleset,
662
701
  generateJsonFiles,
663
702
  scanDuration,
703
+ finalUrl,
664
704
  } = argv;
665
705
 
666
706
  const extraHTTPHeaders = parseHeaders(header);
@@ -694,6 +734,10 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
694
734
  url = temp.toString();
695
735
  }
696
736
 
737
+ // Keep browser-resolved URL (if provided by pre-check flow) as canonical entry URL.
738
+ // For local file paths, keep using the normalized `url` value below.
739
+ const resolvedEntryUrl = finalUrl && !isFilePath(finalUrl) ? finalUrl : url;
740
+
697
741
  // construct filename for scan results
698
742
  const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
699
743
  const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
@@ -738,7 +782,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
738
782
  return {
739
783
  type: scanner,
740
784
  url,
741
- entryUrl: url,
785
+ entryUrl: resolvedEntryUrl,
742
786
  isHeadless: headless,
743
787
  deviceChosen,
744
788
  customDevice,
@@ -989,6 +1033,8 @@ export const getLinksFromSitemap = async (
989
1033
  const scannedSitemaps = new Set<string>();
990
1034
  const sitemapLinkCounts: Record<string, number> = {};
991
1035
  const allUrls = new Set<string>(); // all discovered URLs (lightweight strings)
1036
+ const isImageSitemapUrl = (candidateUrl: string) =>
1037
+ /(^|\/)image-sitemap(?:-index)?(?:-\d+)?\.xml(?:$|[?#])/i.test(candidateUrl);
992
1038
 
993
1039
  const addToUrlList = (url: string) => {
994
1040
  if (!url) return;
@@ -1072,6 +1118,11 @@ export const getLinksFromSitemap = async (
1072
1118
  let data;
1073
1119
  let sitemapType;
1074
1120
 
1121
+ if (isImageSitemapUrl(url)) {
1122
+ consoleLogger.info(`Skipping image sitemap: ${url}`);
1123
+ return;
1124
+ }
1125
+
1075
1126
  if (scannedSitemaps.has(url)) {
1076
1127
  // Skip processing if the sitemap has already been scanned
1077
1128
  return;
@@ -1127,11 +1178,28 @@ export const getLinksFromSitemap = async (
1127
1178
 
1128
1179
  const page = await browserContext.newPage();
1129
1180
 
1130
- await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
1181
+ // Use 'domcontentloaded' instead of 'networkidle': sitemap XMLs with
1182
+ // XSL stylesheet references (e.g. <?xml-stylesheet ...?>) cause the browser
1183
+ // to fetch and apply the stylesheet, which may load additional resources
1184
+ // (fonts, CSS, images) that prevent 'networkidle' from ever being reached.
1185
+ const response = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
1186
+
1187
+ // Prefer the raw response body — this gives us the original XML before
1188
+ // the browser applies any XSL transformation (which would turn the XML
1189
+ // into rendered HTML, losing the sitemap structure).
1190
+ if (response) {
1191
+ try {
1192
+ data = await response.text();
1193
+ } catch {
1194
+ // response.text() can fail if the body was already consumed or
1195
+ // if a redirect occurred; fall through to DOM extraction below.
1196
+ }
1197
+ }
1131
1198
 
1132
- if ((await page.locator('body').count()) > 0) {
1133
- data = await page.locator('body').innerText();
1134
- } else {
1199
+ if (!data) {
1200
+ if ((await page.locator('body').count()) > 0) {
1201
+ data = await page.locator('body').innerText();
1202
+ } else {
1135
1203
  const urlSet = page.locator('urlset');
1136
1204
  const sitemapIndex = page.locator('sitemapindex');
1137
1205
  const rss = page.locator('rss');
@@ -1146,6 +1214,7 @@ export const getLinksFromSitemap = async (
1146
1214
  data = await rss.evaluate(elem => elem.outerHTML);
1147
1215
  } else if (await isRoot(feed)) {
1148
1216
  data = await feed.evaluate(elem => elem.outerHTML);
1217
+ }
1149
1218
  }
1150
1219
  }
1151
1220
  } finally {
@@ -1169,39 +1238,65 @@ export const getLinksFromSitemap = async (
1169
1238
  }
1170
1239
 
1171
1240
  const $ = cheerio.load(data, { xml: true });
1241
+ const countBefore = allUrls.size;
1172
1242
 
1173
1243
  // This case is when the document is not an XML format document
1174
1244
  if ($(':root').length === 0) {
1175
1245
  processNonStandardSitemap(data);
1246
+
1247
+ const linksFromThisSitemap = allUrls.size - countBefore;
1248
+ if (linksFromThisSitemap > 0) {
1249
+ sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
1250
+ }
1176
1251
  return;
1177
1252
  }
1178
1253
 
1179
1254
  // Root element
1180
1255
  const root = $(':root')[0];
1256
+ const hasImageNamespace = Object.values(root?.attribs ?? {}).some(
1257
+ attribVal => typeof attribVal === 'string' && attribVal.toLowerCase().includes('sitemap-image'),
1258
+ );
1181
1259
 
1182
- const { xmlns } = root.attribs;
1260
+ if (hasImageNamespace) {
1261
+ consoleLogger.info(`Skipping image sitemap: ${url}`);
1262
+ return;
1263
+ }
1264
+
1265
+ const rootName = root?.name?.toLowerCase().split(':').pop() ?? '';
1266
+ const hasXmlSitemapIndexTag = /<\s*(?:[a-z0-9_-]+:)?sitemapindex\b/i.test(data);
1267
+ const hasXmlUrlsetTag = /<\s*(?:[a-z0-9_-]+:)?urlset\b/i.test(data);
1183
1268
 
1184
- const xmlFormatNamespace = '/schemas/sitemap';
1185
- if (root.name === 'urlset' && xmlns.includes(xmlFormatNamespace)) {
1269
+ if (rootName === 'urlset') {
1186
1270
  sitemapType = constants.xmlSitemapTypes.xml;
1187
- } else if (root.name === 'sitemapindex' && xmlns.includes(xmlFormatNamespace)) {
1271
+ } else if (rootName === 'sitemapindex') {
1188
1272
  sitemapType = constants.xmlSitemapTypes.xmlIndex;
1189
- } else if (root.name === 'rss') {
1273
+ } else if (rootName === 'rss') {
1190
1274
  sitemapType = constants.xmlSitemapTypes.rss;
1191
- } else if (root.name === 'feed') {
1275
+ } else if (rootName === 'feed') {
1192
1276
  sitemapType = constants.xmlSitemapTypes.atom;
1277
+ } else if (hasXmlSitemapIndexTag) {
1278
+ sitemapType = constants.xmlSitemapTypes.xmlIndex;
1279
+ } else if (hasXmlUrlsetTag) {
1280
+ sitemapType = constants.xmlSitemapTypes.xml;
1193
1281
  } else {
1194
1282
  sitemapType = constants.xmlSitemapTypes.unknown;
1195
1283
  }
1196
1284
 
1197
- const countBefore = allUrls.size;
1198
-
1199
1285
  switch (sitemapType) {
1200
1286
  case constants.xmlSitemapTypes.xmlIndex:
1201
- consoleLogger.info(`This is a XML format sitemap index.`);
1287
+ consoleLogger.info(`This is a XML format sitemap index: ${url}`);
1202
1288
  for (const childSitemapUrl of $('loc')) {
1203
- const childSitemapUrlText = $(childSitemapUrl).text();
1204
- if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
1289
+ const childSitemapUrlText = $(childSitemapUrl).text().trim();
1290
+ if (!childSitemapUrlText) {
1291
+ continue;
1292
+ }
1293
+
1294
+ const childSitemapPath = childSitemapUrlText.split(/[?#]/)[0].toLowerCase();
1295
+ if (childSitemapPath.endsWith('.xml') || childSitemapPath.endsWith('.txt')) {
1296
+ if (isImageSitemapUrl(childSitemapUrlText)) {
1297
+ consoleLogger.info(`Skipping image sitemap: ${childSitemapUrlText}`);
1298
+ continue;
1299
+ }
1205
1300
  await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
1206
1301
  } else {
1207
1302
  addToUrlList(childSitemapUrlText); // Add regular URLs to the list
@@ -1209,19 +1304,19 @@ export const getLinksFromSitemap = async (
1209
1304
  }
1210
1305
  break;
1211
1306
  case constants.xmlSitemapTypes.xml:
1212
- consoleLogger.info(`This is a XML format sitemap.`);
1307
+ consoleLogger.info(`This is a XML format sitemap: ${url}`);
1213
1308
  await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
1214
1309
  break;
1215
1310
  case constants.xmlSitemapTypes.rss:
1216
- consoleLogger.info(`This is a RSS format sitemap.`);
1311
+ consoleLogger.info(`This is a RSS format sitemap: ${url}`);
1217
1312
  await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
1218
1313
  break;
1219
1314
  case constants.xmlSitemapTypes.atom:
1220
- consoleLogger.info(`This is a Atom format sitemap.`);
1315
+ consoleLogger.info(`This is a Atom format sitemap: ${url}`);
1221
1316
  await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
1222
1317
  break;
1223
1318
  default:
1224
- consoleLogger.info(`This is an unrecognised XML sitemap format.`);
1319
+ consoleLogger.info(`This is an unrecognised XML sitemap format: ${url}`);
1225
1320
  processNonStandardSitemap(data);
1226
1321
  }
1227
1322
 
@@ -2171,6 +2266,7 @@ export const isFilePath = (url: string): boolean => {
2171
2266
  const driveLetterPattern = /^[A-Z]:/i;
2172
2267
  const backslashPattern = /\\/;
2173
2268
  return (
2269
+ url.toLowerCase().startsWith('file://') ||
2174
2270
  url.startsWith('/') ||
2175
2271
  driveLetterPattern.test(url) ||
2176
2272
  backslashPattern.test(url) ||
@@ -1145,14 +1145,68 @@ export const createCrawleeSubFolders = async (
1145
1145
  export const preNavigationHooks = (extraHTTPHeaders: Record<string, string>) => {
1146
1146
  return [
1147
1147
  async (crawlingContext: CrawlingContext, gotoOptions: PlaywrightGotoOptions) => {
1148
- if (extraHTTPHeaders) {
1148
+ if (extraHTTPHeaders && Object.keys(extraHTTPHeaders).length > 0) {
1149
1149
  crawlingContext.request.headers = extraHTTPHeaders;
1150
1150
  }
1151
- gotoOptions = { waitUntil: 'networkidle', timeout: 30000 };
1151
+ // Use 'domcontentloaded', which fires as soon as the DOM is parsed, before
1152
+ // images/stylesheets/network requests settle. This avoids indefinite
1153
+ // hangs on sites with WebSockets, analytics polling, or infinite-scroll
1154
+ // beacons that never reach networkidle. Further page stability is
1155
+ // handled by waitForPageLoaded() in each crawler's requestHandler and
1156
+ // by the DOM mutation observer in postNavigationHooks.
1157
+ if (gotoOptions) {
1158
+ gotoOptions.waitUntil = 'domcontentloaded';
1159
+ gotoOptions.timeout = 30000;
1160
+ }
1152
1161
  },
1153
1162
  ];
1154
1163
  };
1155
1164
 
1165
+ /**
1166
+ * Splits extraHTTPHeaders into auth and non-auth parts.
1167
+ * Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
1168
+ * Non-auth headers are safe to set globally on the browser context.
1169
+ */
1170
+ export const splitAuthHeaders = (extraHTTPHeaders?: Record<string, string>) => {
1171
+ const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
1172
+ return {
1173
+ authHeader: Authorization || null,
1174
+ nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
1175
+ httpCredentials: (() => {
1176
+ if (!Authorization?.startsWith('Basic ')) return null;
1177
+ const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
1178
+ const colonIdx = decoded.indexOf(':');
1179
+ if (colonIdx <= 0) return null;
1180
+ return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
1181
+ })(),
1182
+ };
1183
+ };
1184
+
1185
+ /**
1186
+ * Adds a route handler to a BrowserContext that sends the Authorization header
1187
+ * only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
1188
+ */
1189
+ export const addAuthRouteHandler = async (
1190
+ context: BrowserContext,
1191
+ entryUrl: string,
1192
+ authHeader: string | null
1193
+ ) => {
1194
+ if (!authHeader) return;
1195
+
1196
+ const entryOrigin = new URL(entryUrl).origin;
1197
+ await context.route('**/*', async (route, request) => {
1198
+ try {
1199
+ if (new URL(request.url()).origin === entryOrigin) {
1200
+ await route.continue({ headers: { ...request.headers(), Authorization: authHeader } });
1201
+ } else {
1202
+ await route.continue();
1203
+ }
1204
+ } catch {
1205
+ await route.continue();
1206
+ }
1207
+ });
1208
+ };
1209
+
1156
1210
  export const postNavigationHooks = [
1157
1211
  async (_crawlingContext: CrawlingContext) => {
1158
1212
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
@@ -5,10 +5,12 @@ import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
5
5
  import {
6
6
  createCrawleeSubFolders,
7
7
  getPreLaunchHook,
8
+ preNavigationHooks,
8
9
  runAxeScript,
9
10
  isUrlPdf,
10
11
  shouldSkipClickDueToDisallowedHref,
11
12
  shouldSkipDueToUnsupportedContent,
13
+ splitAuthHeaders,
12
14
  } from './commonCrawlerFunc.js';
13
15
  import constants, {
14
16
  UrlsCrawled,
@@ -385,6 +387,8 @@ const crawlDomain = async ({
385
387
  specifiedMaxConcurrency || constants.maxConcurrency,
386
388
  );
387
389
 
390
+ const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
391
+
388
392
  const crawler = register(
389
393
  new crawlee.PlaywrightCrawler({
390
394
  launchContext: {
@@ -404,12 +408,18 @@ const crawlDomain = async ({
404
408
  ...playwrightDeviceDetailsObject,
405
409
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
406
410
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
407
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
411
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
412
+ ...(httpCredentials && { httpCredentials }),
408
413
  };
409
414
  },
410
415
  ],
411
416
  },
412
417
  requestQueue,
418
+ maxRequestRetries: 3,
419
+ maxSessionRotations: 1,
420
+ preNavigationHooks: [
421
+ ...preNavigationHooks(extraHTTPHeaders),
422
+ ],
413
423
  postNavigationHooks: [
414
424
  async crawlingContext => {
415
425
  const { page, request } = crawlingContext;
@@ -1,7 +1,7 @@
1
1
  import fs from 'fs';
2
2
  import { chromium, Page } from 'playwright';
3
3
  import { EnqueueStrategy } from 'crawlee';
4
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
4
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
5
5
  import constants, { FileTypes, guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
6
6
  import { consoleLogger, guiInfoLog } from '../logs.js';
7
7
  import crawlDomain from './crawlDomain.js';
@@ -58,6 +58,7 @@ const crawlIntelligentSitemap = async (
58
58
  let sitemapLink = '';
59
59
 
60
60
  const launchOptions = getPlaywrightLaunchOptions(browser);
61
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
61
62
  let context;
62
63
  let browserInstance;
63
64
 
@@ -65,20 +66,25 @@ const crawlIntelligentSitemap = async (
65
66
  const effectiveUserDataDirectory = userDataDirectory || '';
66
67
  context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
67
68
  ...launchOptions,
68
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
69
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
70
+ ...(httpCredentials && { httpCredentials }),
69
71
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
70
72
  });
71
73
  register(context);
72
74
  } else {
73
- // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
74
75
  browserInstance = await constants.launcher.launch(launchOptions);
75
76
  register(browserInstance as unknown as { close: () => Promise<void> });
76
77
  context = await browserInstance.newContext({
77
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
78
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
79
+ ...(httpCredentials && { httpCredentials }),
78
80
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
79
81
  });
80
82
  }
81
83
 
84
+ if (authHeader) {
85
+ await addAuthRouteHandler(context, link, authHeader);
86
+ }
87
+
82
88
  const page = await context.newPage();
83
89
 
84
90
  for (const path of sitemapPaths) {
@@ -7,6 +7,7 @@ import {
7
7
  preNavigationHooks,
8
8
  runAxeScript,
9
9
  isUrlPdf,
10
+ splitAuthHeaders,
10
11
  } from './commonCrawlerFunc.js';
11
12
 
12
13
  import constants, {
@@ -85,6 +86,7 @@ const crawlSitemap = async ({
85
86
  maxRequestsPerCrawl,
86
87
  specifiedMaxConcurrency || constants.maxConcurrency,
87
88
  );
89
+ const initialNoSuccessFailureAbortThreshold = Math.max(5, Math.min(maxRequestsPerCrawl, 25));
88
90
 
89
91
  if (fromCrawlIntelligentSitemap) {
90
92
  dataset = datasetFromIntelligent;
@@ -119,6 +121,7 @@ const crawlSitemap = async ({
119
121
  const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
120
122
  const { playwrightDeviceDetailsObject } = viewportSettings;
121
123
  const { maxConcurrency } = constants;
124
+ const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
122
125
 
123
126
  const requestList = await RequestList.open({
124
127
  sources: linksFromSitemap,
@@ -142,11 +145,15 @@ const crawlSitemap = async ({
142
145
  ...playwrightDeviceDetailsObject,
143
146
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
144
147
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
148
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
149
+ ...(httpCredentials && { httpCredentials }),
145
150
  };
146
151
  },
147
152
  ],
148
153
  },
149
154
  requestList,
155
+ maxRequestRetries: 3,
156
+ maxSessionRotations: 1,
150
157
  postNavigationHooks: [
151
158
  async ({ page }) => {
152
159
  try {
@@ -197,6 +204,7 @@ const crawlSitemap = async ({
197
204
  },
198
205
  ],
199
206
  preNavigationHooks: [
207
+ ...preNavigationHooks(extraHTTPHeaders),
200
208
  async ({ request, page }, gotoOptions) => {
201
209
  const url = request.url.toLowerCase();
202
210
 
@@ -213,8 +221,6 @@ const crawlSitemap = async ({
213
221
 
214
222
  return;
215
223
  }
216
-
217
- preNavigationHooks(extraHTTPHeaders);
218
224
  },
219
225
  ],
220
226
  requestHandlerTimeoutSecs: 90,
@@ -449,6 +455,17 @@ const crawlSitemap = async ({
449
455
  httpStatusCode: typeof status === 'number' ? status : 0,
450
456
  });
451
457
  crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
458
+
459
+ if (
460
+ urlsCrawled.scanned.length === 0 &&
461
+ urlsCrawled.error.length >= initialNoSuccessFailureAbortThreshold
462
+ ) {
463
+ consoleLogger.info(
464
+ `Aborting sitemap crawl: ${urlsCrawled.error.length} failed pages with 0 successful scans.`,
465
+ );
466
+ isAbortingScan = true;
467
+ crawler.autoscaledPool?.abort();
468
+ }
452
469
  },
453
470
  maxRequestsPerCrawl: Infinity,
454
471
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
@@ -1228,19 +1228,32 @@ export const initNewPage = async (page, pageClosePromises, processPageParams, pa
1228
1228
  const allowed = isOverlayAllowed(page.url(), processPageParams.entryUrl);
1229
1229
 
1230
1230
  if (!allowed) {
1231
- await Promise.race([
1232
- removeOverlayMenu(page),
1233
- new Promise((_, reject) => {
1234
- setTimeout(() => {
1235
- reject(
1236
- new Error(
1237
- `removeOverlayMenu timed out after ${OVERLAY_OPERATION_TIMEOUT_MS}ms`,
1238
- ),
1239
- );
1240
- }, OVERLAY_OPERATION_TIMEOUT_MS);
1241
- }),
1242
- ]);
1243
- return;
1231
+ // On macOS and Windows the custom flow always runs headful.
1232
+ // The URL guard (urlGuard.ts) intercepts non-http/https navigations
1233
+ // and calls page.goto(safeUrl). Do NOT remove the overlay here —
1234
+ // removing it causes it to stay permanently disabled if the redirect
1235
+ // races ahead of the next reconcile cycle.
1236
+ // Instead, fall through to the hasOverlay / addOverlayMenu block so
1237
+ // the overlay is (re-)injected even on transient non-http/https URLs
1238
+ // (e.g. file://, about:blank) and again after the guard's redirect.
1239
+ const isDesktopHost = process.platform === 'darwin' || process.platform === 'win32';
1240
+ if (!isDesktopHost) {
1241
+ // On Linux / Docker: remove overlay for non-http/https URLs and stop.
1242
+ await Promise.race([
1243
+ removeOverlayMenu(page),
1244
+ new Promise((_, reject) => {
1245
+ setTimeout(() => {
1246
+ reject(
1247
+ new Error(
1248
+ `removeOverlayMenu timed out after ${OVERLAY_OPERATION_TIMEOUT_MS}ms`,
1249
+ ),
1250
+ );
1251
+ }, OVERLAY_OPERATION_TIMEOUT_MS);
1252
+ }),
1253
+ ]);
1254
+ return;
1255
+ }
1256
+ // Desktop hosts: skip removal and fall through to re-add overlay.
1244
1257
  }
1245
1258
 
1246
1259
  const hasOverlay = await page.evaluate(() =>
@@ -35,8 +35,18 @@ export function addUrlGuardScript(context, opts = {}) {
35
35
  });
36
36
 
37
37
  const restoreToSafeUrl = async (page, attemptedUrl) => {
38
+ const safeUrl = lastAllowedUrlByPage.get(page) || fallbackUrl || 'about:blank';
39
+ // Only redirect if the safe URL is itself an allowed (http/https) URL.
40
+ // If the entry URL is file:// (e.g. scanning a local HTML file), the
41
+ // fallback is also file://, and redirecting would create an infinite loop:
42
+ // file:// → restoreToSafeUrl → file:// → framenavigated → restoreToSafeUrl → …
43
+ try {
44
+ const safeObj = new URL(safeUrl);
45
+ if (!ALLOWED_PROTOCOLS.has(safeObj.protocol)) return;
46
+ } catch {
47
+ return;
48
+ }
38
49
  try {
39
- const safeUrl = lastAllowedUrlByPage.get(page) || fallbackUrl || 'about:blank';
40
50
  await page.goto(safeUrl, { waitUntil: 'domcontentloaded' });
41
51
  } catch {
42
52
  // page might be closing; ignore
@@ -58,6 +68,13 @@ export function addUrlGuardScript(context, opts = {}) {
58
68
  lastAllowedUrlByPage.set(page, urlObj.toString());
59
69
  return;
60
70
  }
71
+
72
+ // Skip browser-internal transitional states (about:blank, about:srcdoc, etc.).
73
+ // page.goto() navigates through about:blank before loading the target URL.
74
+ // Redirecting from about: creates an infinite loop:
75
+ // restoreToSafeUrl → page.goto(safeUrl) → about:blank → restoreToSafeUrl → …
76
+ if (urlObj.protocol === 'about:') return;
77
+
61
78
  await restoreToSafeUrl(page, urlStr);
62
79
  });
63
80
  };
@@ -1,5 +1,5 @@
1
1
  /* eslint-env browser */
2
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
2
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
3
3
  import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
4
4
  import constants, {
5
5
  getIntermediateScreenshotsPath,
@@ -60,6 +60,7 @@ const runCustom = async (
60
60
  blacklistedPatterns: string[] | null,
61
61
  includeScreenshots: boolean,
62
62
  initialCustomFlowLabel?: string,
63
+ extraHTTPHeaders?: Record<string, string>,
63
64
  ) => {
64
65
  // checks and delete datasets path if it already exists
65
66
  process.env.CRAWLEE_STORAGE_DIR = randomToken;
@@ -109,6 +110,8 @@ const runCustom = async (
109
110
  ...customArgs,
110
111
  ];
111
112
 
113
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
114
+
112
115
  const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
113
116
  ...baseLaunchOptions,
114
117
  args: mergedArgs,
@@ -118,8 +121,14 @@ const runCustom = async (
118
121
  viewport: null,
119
122
  ...(hasCustomViewport ? contextDeviceOptions : {}),
120
123
  userAgent: process.env.OOBEE_USER_AGENT || (deviceUserAgent as string | undefined),
124
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
125
+ ...(httpCredentials && { httpCredentials }),
121
126
  });
122
127
 
128
+ if (authHeader) {
129
+ await addAuthRouteHandler(context, url, authHeader);
130
+ }
131
+
123
132
  register(context);
124
133
 
125
134
  processPageParams.stopAll = async () => {
@@ -60,7 +60,7 @@ const SENTRY_NODE_VERSION: string = (() => {
60
60
  try {
61
61
  return _require('@sentry/node/package.json').version as string;
62
62
  } catch {
63
- return '9.47.1'; // safe fallback matching currently installed version
63
+ return '10.58.0'; // safe fallback matching currently installed version
64
64
  }
65
65
  })();
66
66