@govtechsg/oobee 0.10.93 → 0.10.94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/AGENTS.md +20 -0
  2. package/dist/cli.js +3 -2
  3. package/dist/combine.js +3 -3
  4. package/dist/constants/common.js +119 -52
  5. package/dist/crawlers/commonCrawlerFunc.js +11 -2
  6. package/dist/crawlers/crawlDomain.js +4 -6
  7. package/dist/crawlers/crawlSitemap.js +14 -2
  8. package/dist/crawlers/custom/utils.js +22 -9
  9. package/dist/crawlers/guards/urlGuard.js +19 -1
  10. package/dist/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
  11. package/dist/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
  12. package/dist/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
  13. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
  14. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
  15. package/dist/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
  16. package/dist/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
  17. package/dist/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
  18. package/dist/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
  19. package/dist/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
  20. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
  21. package/dist/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
  22. package/dist/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
  23. package/dist/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
  24. package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
  25. package/dist/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
  26. package/dist/static/ejs/partials/styles/styles.ejs +1 -1
  27. package/dist/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
  28. package/dist/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
  29. package/dist/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
  30. package/oobee-client-scanner.js +2 -2
  31. package/package.json +1 -1
  32. package/src/cli.ts +3 -2
  33. package/src/combine.ts +3 -2
  34. package/src/constants/common.ts +112 -36
  35. package/src/crawlers/commonCrawlerFunc.ts +11 -2
  36. package/src/crawlers/crawlDomain.ts +4 -5
  37. package/src/crawlers/crawlSitemap.ts +19 -2
  38. package/src/crawlers/custom/utils.ts +26 -13
  39. package/src/crawlers/guards/urlGuard.ts +18 -1
  40. package/src/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
  41. package/src/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
  42. package/src/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
  43. package/src/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
  44. package/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
  45. package/src/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
  46. package/src/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
  47. package/src/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
  48. package/src/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
  49. package/src/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
  50. package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
  51. package/src/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
  52. package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
  53. package/src/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
  54. package/src/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
  55. package/src/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
  56. package/src/static/ejs/partials/styles/styles.ejs +1 -1
  57. package/src/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
  58. package/src/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
  59. package/src/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
  60. package/testStaticJSScanner.html +1 -1
  61. /package/{7339fae5-e8ed-4b50-af13-317847620dbf.txt → 67e8137b-1939-4253-8f11-a82bc833cfcb.txt} +0 -0
@@ -39,6 +39,11 @@
39
39
  .gauge-number {
40
40
  font-weight: 700;
41
41
  font-size: 16px;
42
+ color: var(--dark-charcoal, #1f1f1f);
43
+ background-color: var(--true-white, #fff);
44
+ display: inline-block;
45
+ padding: 2px 8px;
46
+ border-radius: 999px;
42
47
  }
43
48
 
44
49
  .gauge-caption {
@@ -54,6 +59,7 @@
54
59
 
55
60
  .gauge-value-number {
56
61
  font-size: 24px;
62
+ color: var(--dark-charcoal, #1f1f1f);
57
63
  }
58
64
 
59
65
  .gauge-value-number.perfect-score {
@@ -18,10 +18,11 @@
18
18
  margin-bottom: 0rem;
19
19
  }
20
20
 
21
- .wcag-link {
22
- font-size: 1rem;
23
- border: 0;
24
- }
21
+ .wcag-link {
22
+ font-size: 1rem;
23
+ border: 0;
24
+ text-decoration: underline;
25
+ }
25
26
 
26
27
  .wcag-status {
27
28
  margin-bottom: 0.3rem;
@@ -12,9 +12,14 @@
12
12
  gap: 10px 18px;
13
13
  }
14
14
 
15
+ #wcagCoverage .wcag-criteria-heading {
16
+ font-size: 1.25rem;
17
+ line-height: 1.2;
18
+ }
19
+
15
20
  #wcagCoverage .wcag-grid a {
16
21
  color: var(--a11y-majorelle-blue, #5735DF);
17
- text-decoration: none;
22
+ text-decoration: underline;
18
23
  }
19
24
  #wcagCoverage .wcag-grid a:hover,
20
25
  #wcagCoverage .wcag-grid a:focus-visible {
@@ -3,7 +3,7 @@
3
3
  * DO NOT EDIT MANUALLY. Re-generate with: node dist/generateOobeeClientScanner.js
4
4
  *
5
5
  * Embedded at generation time:
6
- * App version : 0.10.93
6
+ * App version : 0.10.94
7
7
  * Sentry DSN : (from OOBEE_SENTRY_DSN env var or constants.ts default)
8
8
  * Sentry SDK : @sentry/browser 10.58.0 (loaded from CDN at runtime)
9
9
  *
@@ -34883,7 +34883,7 @@
34883
34883
  // ── Sentry browser telemetry (Sentry JS SDK, loaded from CDN) ────────────
34884
34884
 
34885
34885
  var _oobeeSentryDsn = "https://3b8c7ee46b06f33815a1301b6713ebc3@o4509047624761344.ingest.us.sentry.io/4509327783559168";
34886
- var _oobeeAppVersion = "0.10.93";
34886
+ var _oobeeAppVersion = "0.10.94";
34887
34887
  var _oobeeSentryVersion = "10.58.0";
34888
34888
  var _oobeeSentryInitialized = false;
34889
34889
  var _oobeeSentryLoadPromise = null;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@govtechsg/oobee",
3
3
  "main": "dist/npmIndex.js",
4
- "version": "0.10.93",
4
+ "version": "0.10.94",
5
5
  "type": "module",
6
6
  "author": "Government Technology Agency <info@tech.gov.sg>",
7
7
  "bin": {
package/src/cli.ts CHANGED
@@ -264,9 +264,10 @@ const scanInit = async (argvs: Answers): Promise<string> => {
264
264
  consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
265
265
 
266
266
  if (res.status === statuses.success.code) {
267
- // Custom flow should continue from the user-provided entry URL so auth redirects
268
- // do not replace the original domain used for overlay gating and navigation.
267
+ // Keep browser-resolved URL as entryUrl for downstream scan metadata/events
268
+ // on non-custom scans.
269
269
  if (data.type !== ScannerTypes.CUSTOM) {
270
+ data.entryUrl = res.url;
270
271
  data.url = res.url;
271
272
  }
272
273
  if (process.env.OOBEE_VALIDATE_URL) {
package/src/combine.ts CHANGED
@@ -45,6 +45,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
45
45
  const {
46
46
  type,
47
47
  url,
48
+ entryUrl,
48
49
  nameEmail,
49
50
  randomToken,
50
51
  deviceChosen,
@@ -104,8 +105,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
104
105
 
105
106
  // remove basic-auth credentials from URL
106
107
  const finalUrl = !(type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE)
107
- ? new URL(url)
108
- : new URL(pathToFileURL(url));
108
+ ? new URL(entryUrl)
109
+ : new URL(pathToFileURL(entryUrl));
109
110
 
110
111
  // Use the string version of finalUrl to reduce logic at submitForm
111
112
  const finalUrlString = finalUrl.toString();
@@ -359,8 +359,11 @@ const checkUrlConnectivityWithBrowser = async (
359
359
  }
360
360
  }
361
361
 
362
- // Ensure Accept header for non-html content fallback
363
- extraHTTPHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
362
+ // Ensure Accept header for non-html content fallback — use a local copy to avoid
363
+ // mutating the caller's extraHTTPHeaders object (which is later checked by crawlers
364
+ // to decide whether to enable preNavigationHooks header rewriting).
365
+ const localHeaders = { ...extraHTTPHeaders };
366
+ localHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
364
367
 
365
368
  await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
366
369
 
@@ -378,7 +381,7 @@ const checkUrlConnectivityWithBrowser = async (
378
381
 
379
382
  const launchOptions = getPlaywrightLaunchOptions(browserToRun);
380
383
 
381
- const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
384
+ const { Authorization, ...nonAuthHeaders } = localHeaders || {};
382
385
  let httpCredentials = undefined;
383
386
  if (Authorization?.startsWith('Basic ')) {
384
387
  const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
@@ -436,19 +439,21 @@ const checkUrlConnectivityWithBrowser = async (
436
439
  // Only enable generic Authorization header routing interception broadly if
437
440
  // a non-Basic Bearer auth string is heavily relied upon, thereby bypassing
438
441
  // performance warnings inside the checkUrl phase for typical public scans
439
- if (Authorization && !httpCredentials) {
440
- const entryOrigin = new URL(url).origin;
441
- await browserContext.route('**/*', async (route: any, request: any) => {
442
- try {
443
- if (new URL(request.url()).origin === entryOrigin) {
444
- await route.continue({ headers: { ...request.headers(), Authorization } });
445
- } else {
442
+ if (Object.keys(localHeaders).length > 0) {
443
+ if (Authorization && !httpCredentials) {
444
+ const entryOrigin = new URL(url).origin;
445
+ await browserContext.route('**/*', async (route: any, request: any) => {
446
+ try {
447
+ if (new URL(request.url()).origin === entryOrigin) {
448
+ await route.continue({ headers: { ...request.headers(), Authorization } });
449
+ } else {
450
+ await route.continue();
451
+ }
452
+ } catch {
446
453
  await route.continue();
447
454
  }
448
- } catch {
449
- await route.continue();
450
- }
451
- });
455
+ });
456
+ }
452
457
  }
453
458
 
454
459
  const page = await browserContext.newPage();
@@ -569,7 +574,7 @@ export const isSitemapContent = (content: string) => {
569
574
  }
570
575
 
571
576
  const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
572
- const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
577
+ const regexForXmlSitemap = new RegExp('<(?:urlset|sitemapindex|feed|rss)+?.*>', 'gmi');
573
578
  if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
574
579
  // is an XML sitemap wrapped in a HTML document
575
580
  return true;
@@ -592,8 +597,22 @@ export const checkUrl = async (
592
597
  extraHTTPHeaders: Record<string, string>,
593
598
  fileTypes: FileTypes,
594
599
  ) => {
600
+ let urlToCheck = url;
601
+
602
+ if (scanner === ScannerTypes.LOCALFILE) {
603
+ if (!isFilePath(url)) {
604
+ const res = new RES();
605
+ res.status = constants.urlCheckStatuses.notALocalFile.code;
606
+ return res;
607
+ }
608
+
609
+ if (!url.toLowerCase().startsWith('file://')) {
610
+ urlToCheck = pathToFileURL(path.resolve(url)).toString();
611
+ }
612
+ }
613
+
595
614
  const res = await checkUrlConnectivityWithBrowser(
596
- url,
615
+ urlToCheck,
597
616
  browser,
598
617
  clonedDataDir,
599
618
  playwrightDeviceDetailsObject,
@@ -681,6 +700,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
681
700
  ruleset,
682
701
  generateJsonFiles,
683
702
  scanDuration,
703
+ finalUrl,
684
704
  } = argv;
685
705
 
686
706
  const extraHTTPHeaders = parseHeaders(header);
@@ -714,6 +734,10 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
714
734
  url = temp.toString();
715
735
  }
716
736
 
737
+ // Keep browser-resolved URL (if provided by pre-check flow) as canonical entry URL.
738
+ // For local file paths, keep using the normalized `url` value below.
739
+ const resolvedEntryUrl = finalUrl && !isFilePath(finalUrl) ? finalUrl : url;
740
+
717
741
  // construct filename for scan results
718
742
  const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
719
743
  const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
@@ -758,7 +782,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
758
782
  return {
759
783
  type: scanner,
760
784
  url,
761
- entryUrl: url,
785
+ entryUrl: resolvedEntryUrl,
762
786
  isHeadless: headless,
763
787
  deviceChosen,
764
788
  customDevice,
@@ -1009,6 +1033,8 @@ export const getLinksFromSitemap = async (
1009
1033
  const scannedSitemaps = new Set<string>();
1010
1034
  const sitemapLinkCounts: Record<string, number> = {};
1011
1035
  const allUrls = new Set<string>(); // all discovered URLs (lightweight strings)
1036
+ const isImageSitemapUrl = (candidateUrl: string) =>
1037
+ /(^|\/)image-sitemap(?:-index)?(?:-\d+)?\.xml(?:$|[?#])/i.test(candidateUrl);
1012
1038
 
1013
1039
  const addToUrlList = (url: string) => {
1014
1040
  if (!url) return;
@@ -1092,6 +1118,11 @@ export const getLinksFromSitemap = async (
1092
1118
  let data;
1093
1119
  let sitemapType;
1094
1120
 
1121
+ if (isImageSitemapUrl(url)) {
1122
+ consoleLogger.info(`Skipping image sitemap: ${url}`);
1123
+ return;
1124
+ }
1125
+
1095
1126
  if (scannedSitemaps.has(url)) {
1096
1127
  // Skip processing if the sitemap has already been scanned
1097
1128
  return;
@@ -1147,11 +1178,28 @@ export const getLinksFromSitemap = async (
1147
1178
 
1148
1179
  const page = await browserContext.newPage();
1149
1180
 
1150
- await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
1181
+ // Use 'domcontentloaded' instead of 'networkidle': sitemap XMLs with
1182
+ // XSL stylesheet references (e.g. <?xml-stylesheet ...?>) cause the browser
1183
+ // to fetch and apply the stylesheet, which may load additional resources
1184
+ // (fonts, CSS, images) that prevent 'networkidle' from ever being reached.
1185
+ const response = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
1186
+
1187
+ // Prefer the raw response body — this gives us the original XML before
1188
+ // the browser applies any XSL transformation (which would turn the XML
1189
+ // into rendered HTML, losing the sitemap structure).
1190
+ if (response) {
1191
+ try {
1192
+ data = await response.text();
1193
+ } catch {
1194
+ // response.text() can fail if the body was already consumed or
1195
+ // if a redirect occurred; fall through to DOM extraction below.
1196
+ }
1197
+ }
1151
1198
 
1152
- if ((await page.locator('body').count()) > 0) {
1153
- data = await page.locator('body').innerText();
1154
- } else {
1199
+ if (!data) {
1200
+ if ((await page.locator('body').count()) > 0) {
1201
+ data = await page.locator('body').innerText();
1202
+ } else {
1155
1203
  const urlSet = page.locator('urlset');
1156
1204
  const sitemapIndex = page.locator('sitemapindex');
1157
1205
  const rss = page.locator('rss');
@@ -1166,6 +1214,7 @@ export const getLinksFromSitemap = async (
1166
1214
  data = await rss.evaluate(elem => elem.outerHTML);
1167
1215
  } else if (await isRoot(feed)) {
1168
1216
  data = await feed.evaluate(elem => elem.outerHTML);
1217
+ }
1169
1218
  }
1170
1219
  }
1171
1220
  } finally {
@@ -1189,39 +1238,65 @@ export const getLinksFromSitemap = async (
1189
1238
  }
1190
1239
 
1191
1240
  const $ = cheerio.load(data, { xml: true });
1241
+ const countBefore = allUrls.size;
1192
1242
 
1193
1243
  // This case is when the document is not an XML format document
1194
1244
  if ($(':root').length === 0) {
1195
1245
  processNonStandardSitemap(data);
1246
+
1247
+ const linksFromThisSitemap = allUrls.size - countBefore;
1248
+ if (linksFromThisSitemap > 0) {
1249
+ sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
1250
+ }
1196
1251
  return;
1197
1252
  }
1198
1253
 
1199
1254
  // Root element
1200
1255
  const root = $(':root')[0];
1256
+ const hasImageNamespace = Object.values(root?.attribs ?? {}).some(
1257
+ attribVal => typeof attribVal === 'string' && attribVal.toLowerCase().includes('sitemap-image'),
1258
+ );
1201
1259
 
1202
- const { xmlns } = root.attribs;
1260
+ if (hasImageNamespace) {
1261
+ consoleLogger.info(`Skipping image sitemap: ${url}`);
1262
+ return;
1263
+ }
1264
+
1265
+ const rootName = root?.name?.toLowerCase().split(':').pop() ?? '';
1266
+ const hasXmlSitemapIndexTag = /<\s*(?:[a-z0-9_-]+:)?sitemapindex\b/i.test(data);
1267
+ const hasXmlUrlsetTag = /<\s*(?:[a-z0-9_-]+:)?urlset\b/i.test(data);
1203
1268
 
1204
- const xmlFormatNamespace = '/schemas/sitemap';
1205
- if (root.name === 'urlset' && xmlns.includes(xmlFormatNamespace)) {
1269
+ if (rootName === 'urlset') {
1206
1270
  sitemapType = constants.xmlSitemapTypes.xml;
1207
- } else if (root.name === 'sitemapindex' && xmlns.includes(xmlFormatNamespace)) {
1271
+ } else if (rootName === 'sitemapindex') {
1208
1272
  sitemapType = constants.xmlSitemapTypes.xmlIndex;
1209
- } else if (root.name === 'rss') {
1273
+ } else if (rootName === 'rss') {
1210
1274
  sitemapType = constants.xmlSitemapTypes.rss;
1211
- } else if (root.name === 'feed') {
1275
+ } else if (rootName === 'feed') {
1212
1276
  sitemapType = constants.xmlSitemapTypes.atom;
1277
+ } else if (hasXmlSitemapIndexTag) {
1278
+ sitemapType = constants.xmlSitemapTypes.xmlIndex;
1279
+ } else if (hasXmlUrlsetTag) {
1280
+ sitemapType = constants.xmlSitemapTypes.xml;
1213
1281
  } else {
1214
1282
  sitemapType = constants.xmlSitemapTypes.unknown;
1215
1283
  }
1216
1284
 
1217
- const countBefore = allUrls.size;
1218
-
1219
1285
  switch (sitemapType) {
1220
1286
  case constants.xmlSitemapTypes.xmlIndex:
1221
- consoleLogger.info(`This is a XML format sitemap index.`);
1287
+ consoleLogger.info(`This is a XML format sitemap index: ${url}`);
1222
1288
  for (const childSitemapUrl of $('loc')) {
1223
- const childSitemapUrlText = $(childSitemapUrl).text();
1224
- if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
1289
+ const childSitemapUrlText = $(childSitemapUrl).text().trim();
1290
+ if (!childSitemapUrlText) {
1291
+ continue;
1292
+ }
1293
+
1294
+ const childSitemapPath = childSitemapUrlText.split(/[?#]/)[0].toLowerCase();
1295
+ if (childSitemapPath.endsWith('.xml') || childSitemapPath.endsWith('.txt')) {
1296
+ if (isImageSitemapUrl(childSitemapUrlText)) {
1297
+ consoleLogger.info(`Skipping image sitemap: ${childSitemapUrlText}`);
1298
+ continue;
1299
+ }
1225
1300
  await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
1226
1301
  } else {
1227
1302
  addToUrlList(childSitemapUrlText); // Add regular URLs to the list
@@ -1229,19 +1304,19 @@ export const getLinksFromSitemap = async (
1229
1304
  }
1230
1305
  break;
1231
1306
  case constants.xmlSitemapTypes.xml:
1232
- consoleLogger.info(`This is a XML format sitemap.`);
1307
+ consoleLogger.info(`This is a XML format sitemap: ${url}`);
1233
1308
  await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
1234
1309
  break;
1235
1310
  case constants.xmlSitemapTypes.rss:
1236
- consoleLogger.info(`This is a RSS format sitemap.`);
1311
+ consoleLogger.info(`This is a RSS format sitemap: ${url}`);
1237
1312
  await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
1238
1313
  break;
1239
1314
  case constants.xmlSitemapTypes.atom:
1240
- consoleLogger.info(`This is a Atom format sitemap.`);
1315
+ consoleLogger.info(`This is a Atom format sitemap: ${url}`);
1241
1316
  await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
1242
1317
  break;
1243
1318
  default:
1244
- consoleLogger.info(`This is an unrecognised XML sitemap format.`);
1319
+ consoleLogger.info(`This is an unrecognised XML sitemap format: ${url}`);
1245
1320
  processNonStandardSitemap(data);
1246
1321
  }
1247
1322
 
@@ -2191,6 +2266,7 @@ export const isFilePath = (url: string): boolean => {
2191
2266
  const driveLetterPattern = /^[A-Z]:/i;
2192
2267
  const backslashPattern = /\\/;
2193
2268
  return (
2269
+ url.toLowerCase().startsWith('file://') ||
2194
2270
  url.startsWith('/') ||
2195
2271
  driveLetterPattern.test(url) ||
2196
2272
  backslashPattern.test(url) ||
@@ -1145,10 +1145,19 @@ export const createCrawleeSubFolders = async (
1145
1145
  export const preNavigationHooks = (extraHTTPHeaders: Record<string, string>) => {
1146
1146
  return [
1147
1147
  async (crawlingContext: CrawlingContext, gotoOptions: PlaywrightGotoOptions) => {
1148
- if (extraHTTPHeaders) {
1148
+ if (extraHTTPHeaders && Object.keys(extraHTTPHeaders).length > 0) {
1149
1149
  crawlingContext.request.headers = extraHTTPHeaders;
1150
1150
  }
1151
- gotoOptions = { waitUntil: 'networkidle', timeout: 30000 };
1151
+ // Use domcontentloaded: it fires as soon as the DOM is parsed, before
1152
+ // images/stylesheets/network requests settle. This avoids indefinite
1153
+ // hangs on sites with WebSockets, analytics polling, or infinite-scroll
1154
+ // beacons that never reach networkidle. Further page stability is
1155
+ // handled by waitForPageLoaded() in each crawler's requestHandler and
1156
+ // by the DOM mutation observer in postNavigationHooks.
1157
+ if (gotoOptions) {
1158
+ gotoOptions.waitUntil = 'domcontentloaded';
1159
+ gotoOptions.timeout = 30000;
1160
+ }
1152
1161
  },
1153
1162
  ];
1154
1163
  };
@@ -5,6 +5,7 @@ import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
5
5
  import {
6
6
  createCrawleeSubFolders,
7
7
  getPreLaunchHook,
8
+ preNavigationHooks,
8
9
  runAxeScript,
9
10
  isUrlPdf,
10
11
  shouldSkipClickDueToDisallowedHref,
@@ -414,12 +415,10 @@ const crawlDomain = async ({
414
415
  ],
415
416
  },
416
417
  requestQueue,
418
+ maxRequestRetries: 3,
419
+ maxSessionRotations: 1,
417
420
  preNavigationHooks: [
418
- async (crawlingContext) => {
419
- if (extraHTTPHeaders) {
420
- crawlingContext.request.headers = extraHTTPHeaders;
421
- }
422
- },
421
+ ...preNavigationHooks(extraHTTPHeaders),
423
422
  ],
424
423
  postNavigationHooks: [
425
424
  async crawlingContext => {
@@ -7,6 +7,7 @@ import {
7
7
  preNavigationHooks,
8
8
  runAxeScript,
9
9
  isUrlPdf,
10
+ splitAuthHeaders,
10
11
  } from './commonCrawlerFunc.js';
11
12
 
12
13
  import constants, {
@@ -85,6 +86,7 @@ const crawlSitemap = async ({
85
86
  maxRequestsPerCrawl,
86
87
  specifiedMaxConcurrency || constants.maxConcurrency,
87
88
  );
89
+ const initialNoSuccessFailureAbortThreshold = Math.max(5, Math.min(maxRequestsPerCrawl, 25));
88
90
 
89
91
  if (fromCrawlIntelligentSitemap) {
90
92
  dataset = datasetFromIntelligent;
@@ -119,6 +121,7 @@ const crawlSitemap = async ({
119
121
  const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
120
122
  const { playwrightDeviceDetailsObject } = viewportSettings;
121
123
  const { maxConcurrency } = constants;
124
+ const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
122
125
 
123
126
  const requestList = await RequestList.open({
124
127
  sources: linksFromSitemap,
@@ -142,11 +145,15 @@ const crawlSitemap = async ({
142
145
  ...playwrightDeviceDetailsObject,
143
146
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
144
147
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
148
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
149
+ ...(httpCredentials && { httpCredentials }),
145
150
  };
146
151
  },
147
152
  ],
148
153
  },
149
154
  requestList,
155
+ maxRequestRetries: 3,
156
+ maxSessionRotations: 1,
150
157
  postNavigationHooks: [
151
158
  async ({ page }) => {
152
159
  try {
@@ -197,6 +204,7 @@ const crawlSitemap = async ({
197
204
  },
198
205
  ],
199
206
  preNavigationHooks: [
207
+ ...preNavigationHooks(extraHTTPHeaders),
200
208
  async ({ request, page }, gotoOptions) => {
201
209
  const url = request.url.toLowerCase();
202
210
 
@@ -213,8 +221,6 @@ const crawlSitemap = async ({
213
221
 
214
222
  return;
215
223
  }
216
-
217
- preNavigationHooks(extraHTTPHeaders);
218
224
  },
219
225
  ],
220
226
  requestHandlerTimeoutSecs: 90,
@@ -449,6 +455,17 @@ const crawlSitemap = async ({
449
455
  httpStatusCode: typeof status === 'number' ? status : 0,
450
456
  });
451
457
  crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
458
+
459
+ if (
460
+ urlsCrawled.scanned.length === 0 &&
461
+ urlsCrawled.error.length >= initialNoSuccessFailureAbortThreshold
462
+ ) {
463
+ consoleLogger.info(
464
+ `Aborting sitemap crawl: ${urlsCrawled.error.length} failed pages with 0 successful scans.`,
465
+ );
466
+ isAbortingScan = true;
467
+ crawler.autoscaledPool?.abort();
468
+ }
452
469
  },
453
470
  maxRequestsPerCrawl: Infinity,
454
471
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
@@ -1228,19 +1228,32 @@ export const initNewPage = async (page, pageClosePromises, processPageParams, pa
1228
1228
  const allowed = isOverlayAllowed(page.url(), processPageParams.entryUrl);
1229
1229
 
1230
1230
  if (!allowed) {
1231
- await Promise.race([
1232
- removeOverlayMenu(page),
1233
- new Promise((_, reject) => {
1234
- setTimeout(() => {
1235
- reject(
1236
- new Error(
1237
- `removeOverlayMenu timed out after ${OVERLAY_OPERATION_TIMEOUT_MS}ms`,
1238
- ),
1239
- );
1240
- }, OVERLAY_OPERATION_TIMEOUT_MS);
1241
- }),
1242
- ]);
1243
- return;
1231
+ // On macOS and Windows the custom flow always runs headful.
1232
+ // The URL guard (urlGuard.ts) intercepts non-http/https navigations
1233
+ // and calls page.goto(safeUrl). Do NOT remove the overlay here —
1234
+ // removing it causes it to stay permanently disabled if the redirect
1235
+ // races ahead of the next reconcile cycle.
1236
+ // Instead, fall through to the hasOverlay / addOverlayMenu block so
1237
+ // the overlay is (re-)injected even on transient non-http/https URLs
1238
+ // (e.g. file://, about:blank) and again after the guard's redirect.
1239
+ const isDesktopHost = process.platform === 'darwin' || process.platform === 'win32';
1240
+ if (!isDesktopHost) {
1241
+ // On Linux / Docker: remove overlay for non-http/https URLs and stop.
1242
+ await Promise.race([
1243
+ removeOverlayMenu(page),
1244
+ new Promise((_, reject) => {
1245
+ setTimeout(() => {
1246
+ reject(
1247
+ new Error(
1248
+ `removeOverlayMenu timed out after ${OVERLAY_OPERATION_TIMEOUT_MS}ms`,
1249
+ ),
1250
+ );
1251
+ }, OVERLAY_OPERATION_TIMEOUT_MS);
1252
+ }),
1253
+ ]);
1254
+ return;
1255
+ }
1256
+ // Desktop hosts: skip removal and fall through to re-add overlay.
1244
1257
  }
1245
1258
 
1246
1259
  const hasOverlay = await page.evaluate(() =>
@@ -35,8 +35,18 @@ export function addUrlGuardScript(context, opts = {}) {
35
35
  });
36
36
 
37
37
  const restoreToSafeUrl = async (page, attemptedUrl) => {
38
+ const safeUrl = lastAllowedUrlByPage.get(page) || fallbackUrl || 'about:blank';
39
+ // Only redirect if the safe URL is itself an allowed (http/https) URL.
40
+ // If the entry URL is file:// (e.g. scanning a local HTML file), the
41
+ // fallback is also file://, and redirecting would create an infinite loop:
42
+ // file:// → restoreToSafeUrl → file:// → framenavigated → restoreToSafeUrl → …
43
+ try {
44
+ const safeObj = new URL(safeUrl);
45
+ if (!ALLOWED_PROTOCOLS.has(safeObj.protocol)) return;
46
+ } catch {
47
+ return;
48
+ }
38
49
  try {
39
- const safeUrl = lastAllowedUrlByPage.get(page) || fallbackUrl || 'about:blank';
40
50
  await page.goto(safeUrl, { waitUntil: 'domcontentloaded' });
41
51
  } catch {
42
52
  // page might be closing; ignore
@@ -58,6 +68,13 @@ export function addUrlGuardScript(context, opts = {}) {
58
68
  lastAllowedUrlByPage.set(page, urlObj.toString());
59
69
  return;
60
70
  }
71
+
72
+ // Skip browser-internal transitional states (about:blank, about:srcdoc, etc.).
73
+ // page.goto() navigates through about:blank before loading the target URL.
74
+ // Redirecting from about: creates an infinite loop:
75
+ // restoreToSafeUrl → page.goto(safeUrl) → about:blank → restoreToSafeUrl → …
76
+ if (urlObj.protocol === 'about:') return;
77
+
61
78
  await restoreToSafeUrl(page, urlStr);
62
79
  });
63
80
  };
@@ -7,6 +7,7 @@
7
7
  <button
8
8
  type="button"
9
9
  class="category-tooltip-icon"
10
+ aria-label="About Must Fix category"
10
11
  aria-describedby="mustFixTooltip"
11
12
  >
12
13
  <svg xmlns="http://www.w3.org/2000/svg" width="14" height="14"
@@ -34,6 +35,7 @@
34
35
  <button
35
36
  type="button"
36
37
  class="category-tooltip-icon"
38
+ aria-label="About Good to Fix category"
37
39
  aria-describedby="goodToFixTooltip"
38
40
  >
39
41
  <svg xmlns="http://www.w3.org/2000/svg" width="14" height="14"
@@ -61,6 +63,7 @@
61
63
  <button
62
64
  type="button"
63
65
  class="category-tooltip-icon"
66
+ aria-label="About Manual Test category"
64
67
  aria-describedby="manualTestTooltip"
65
68
  >
66
69
  <svg xmlns="http://www.w3.org/2000/svg" width="14" height="14"
@@ -2,21 +2,21 @@
2
2
  <table class="issues-table" id="issuesTable">
3
3
  <thead>
4
4
  <tr>
5
- <th class="sortable" role="button" tabindex="0" aria-sort="none" style="width: 15%;">
5
+ <th class="sortable" tabindex="0" aria-sort="none" style="width: 15%;">
6
6
  <span>Severity</span>
7
7
  <svg class="sort-icon" width="24" height="24" viewBox="0 0 24 24" fill="none" aria-hidden="true">
8
8
  <path d="M7 9L12 4L17 9H7Z" fill="currentColor" opacity="1" />
9
9
  <path d="M7 15L12 20L17 15H7Z" fill="currentColor" opacity="0.3" />
10
10
  </svg>
11
11
  </th>
12
- <th class="sortable" role="button" tabindex="0" aria-sort="none">
12
+ <th class="sortable" tabindex="0" aria-sort="none">
13
13
  <span>Issue Name</span>
14
14
  <svg class="sort-icon" width="24" height="24" viewBox="0 0 24 24" fill="none" aria-hidden="true">
15
15
  <path d="M7 9L12 4L17 9H7Z" fill="currentColor" opacity="0.3" />
16
16
  <path d="M7 15L12 20L17 15H7Z" fill="currentColor" opacity="1" />
17
17
  </svg>
18
18
  </th>
19
- <th class="sortable" role="button" tabindex="0" aria-sort="descending" style="width: 15%;">
19
+ <th class="sortable" tabindex="0" aria-sort="descending" style="width: 15%;">
20
20
  <span>Occurrence</span>
21
21
  <svg class="sort-icon" width="24" height="24" viewBox="0 0 24 24" fill="none" aria-hidden="true">
22
22
  <path d="M7 9L12 4L17 9H7Z" fill="currentColor" opacity="0.3" />
@@ -1,4 +1,4 @@
1
- <div id="aboutScanModal" class="modal fade" tabindex="-1" aria-labelledby="aboutScanModalLabel" aria-hidden="true">
1
+ <div id="aboutScanModal" class="modal fade" tabindex="-1" aria-label="About this scan" aria-hidden="true">
2
2
  <div class="modal-dialog modal-dialog-centered">
3
3
  <div class="modal-content">
4
4
  <div class="modal-header">