@govtechsg/oobee 0.10.85 → 0.10.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/.github/workflows/publish.yml +10 -0
  2. package/DETAILS.md +29 -0
  3. package/dist/cli.js +18 -5
  4. package/dist/combine.js +3 -1
  5. package/dist/constants/cliFunctions.js +2 -2
  6. package/dist/constants/common.js +70 -17
  7. package/dist/constants/constants.js +604 -1
  8. package/dist/crawlers/commonCrawlerFunc.js +3 -2
  9. package/dist/crawlers/crawlDomain.js +38 -13
  10. package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
  11. package/dist/crawlers/crawlSitemap.js +141 -84
  12. package/dist/crawlers/custom/utils.js +218 -71
  13. package/dist/crawlers/guards/urlGuard.js +8 -15
  14. package/dist/crawlers/runCustom.js +18 -11
  15. package/dist/generateHtmlReport.js +18 -11
  16. package/dist/generateOobeeClientScanner.js +570 -0
  17. package/dist/mergeAxeResults/itemReferences.js +60 -25
  18. package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
  19. package/dist/mergeAxeResults.js +23 -13
  20. package/dist/npmIndex.js +10 -2
  21. package/dist/proxyService.js +18 -3
  22. package/dist/services/s3Uploader.js +21 -10
  23. package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  24. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
  25. package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
  26. package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
  27. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
  28. package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  29. package/dist/static/ejs/summary.ejs +19 -8
  30. package/dist/utils.js +4 -3
  31. package/fix-summary-html-oom-pr.md +62 -0
  32. package/oobee-client-scanner.js +34992 -0
  33. package/package.json +5 -5
  34. package/src/cli.ts +19 -5
  35. package/src/combine.ts +5 -1
  36. package/src/constants/cliFunctions.ts +2 -2
  37. package/src/constants/common.ts +87 -22
  38. package/src/constants/constants.ts +602 -1
  39. package/src/crawlers/commonCrawlerFunc.ts +4 -3
  40. package/src/crawlers/crawlDomain.ts +39 -13
  41. package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
  42. package/src/crawlers/crawlSitemap.ts +165 -100
  43. package/src/crawlers/custom/utils.ts +241 -80
  44. package/src/crawlers/guards/urlGuard.ts +24 -31
  45. package/src/crawlers/runCustom.ts +29 -11
  46. package/src/generateHtmlReport.ts +21 -11
  47. package/src/generateOobeeClientScanner.ts +591 -0
  48. package/src/mergeAxeResults/itemReferences.ts +70 -26
  49. package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
  50. package/src/mergeAxeResults.ts +26 -14
  51. package/src/npmIndex.ts +12 -2
  52. package/src/proxyService.ts +25 -4
  53. package/src/services/s3Uploader.ts +23 -11
  54. package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  55. package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
  56. package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
  57. package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
  58. package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
  59. package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  60. package/src/static/ejs/summary.ejs +19 -8
  61. package/src/utils.ts +4 -3
  62. package/testStaticJSScanner.html +534 -0
@@ -116,7 +116,7 @@ export const filterAxeResults = (results, pageTitle, customFlowDetails) => {
116
116
  return;
117
117
  const conformance = tags.filter(tag => tag.startsWith('wcag') || tag === 'best-practice');
118
118
  nodes.forEach(node => {
119
- const { html } = node;
119
+ const { html, target } = node;
120
120
  if (!(rule in passed.rules)) {
121
121
  passed.rules[rule] = {
122
122
  description,
@@ -128,7 +128,8 @@ export const filterAxeResults = (results, pageTitle, customFlowDetails) => {
128
128
  };
129
129
  }
130
130
  const finalHtml = truncateHtml(html);
131
- passed.rules[rule].items.push({ html: finalHtml, screenshotPath: '', message: '', xpath: '' });
131
+ const xpath = target.length === 1 && typeof target[0] === 'string' ? target[0] : undefined;
132
+ passed.rules[rule].items.push({ html: finalHtml, screenshotPath: '', message: '', xpath: xpath || '' });
132
133
  passed.totalItems += 1;
133
134
  passed.rules[rule].totalItems += 1;
134
135
  totalItems += 1;
@@ -4,7 +4,7 @@ import fsp from 'fs/promises';
4
4
  import { createCrawleeSubFolders, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
5
5
  import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
6
6
  import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
7
- import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
7
+ import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
8
8
  import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
9
9
  import { consoleLogger, guiInfoLog } from '../logs.js';
10
10
  const isBlacklisted = (url, blacklistedPatterns) => {
@@ -37,8 +37,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
37
37
  const pdfDownloads = [];
38
38
  const uuidToPdfMapping = {};
39
39
  const queuedUrlSet = new Set();
40
- const scannedUrlSet = new Set(urlsCrawled.scanned.map(item => item.url));
41
- const scannedResolvedUrlSet = new Set(urlsCrawled.scanned.map(item => item.actualUrl || item.url));
40
+ const scannedUrlSet = new Set(urlsCrawled.scanned.map(item => normUrl(item.url)));
41
+ const scannedResolvedUrlSet = new Set(urlsCrawled.scanned.map(item => normUrl(item.actualUrl || item.url)));
42
42
  const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes);
43
43
  const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes);
44
44
  const { maxConcurrency } = constants;
@@ -70,11 +70,12 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
70
70
  const initialPageUrl = workingPage.url().toString();
71
71
  const selectedElementsString = cssQuerySelectors.join(', ');
72
72
  const isExcluded = (newPageUrl) => {
73
- const isAlreadyScanned = urlsCrawled.scanned.some(item => item.url === newPageUrl);
73
+ const isAlreadyScanned = scannedUrlSet.has(normUrl(newPageUrl));
74
74
  const isBlacklistedUrl = isBlacklisted(newPageUrl, blacklistedPatterns);
75
75
  const isNotFollowStrategy = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
76
76
  const isNotSupportedDocument = disallowedListOfPatterns.some(pattern => newPageUrl.toLowerCase().startsWith(pattern));
77
- return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
77
+ const isRobotsDisallowed = isDisallowedInRobotsTxt(newPageUrl);
78
+ return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy || isRobotsDisallowed;
78
79
  };
79
80
  const setPageListeners = (pageListener) => {
80
81
  // event listener to handle new page popups upon button click
@@ -235,7 +236,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
235
236
  catch (e) {
236
237
  consoleLogger.error(e);
237
238
  }
238
- if (scannedUrlSet.has(req.url)) {
239
+ if (scannedUrlSet.has(normUrl(req.url))) {
239
240
  req.skipNavigation = true;
240
241
  }
241
242
  if (isDisallowedInRobotsTxt(req.url))
@@ -358,7 +359,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
358
359
  finalUrl = requestLabelUrl;
359
360
  }
360
361
  const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
361
- if (isRedirected) {
362
+ if (isRedirected && !isDisallowedInRobotsTxt(finalUrl)) {
362
363
  await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
363
364
  }
364
365
  else {
@@ -399,7 +400,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
399
400
  return;
400
401
  }
401
402
  // if URL has already been scanned
402
- if (scannedUrlSet.has(request.url)) {
403
+ if (scannedUrlSet.has(normUrl(request.url))) {
403
404
  await enqueueProcess(page, enqueueLinks, browserContext);
404
405
  return;
405
406
  }
@@ -493,8 +494,32 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
493
494
  return;
494
495
  }
495
496
  const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
497
+ // Detect JS redirects that fire during/after axe scan.
498
+ // Listen for navigation, then give a brief window for pending redirects to complete.
499
+ try {
500
+ let navigatedToUrl = null;
501
+ const onFrameNavigated = (frame) => {
502
+ if (frame === page.mainFrame()) {
503
+ navigatedToUrl = frame.url();
504
+ }
505
+ };
506
+ page.on('framenavigated', onFrameNavigated);
507
+ await page.waitForTimeout(1000);
508
+ page.off('framenavigated', onFrameNavigated);
509
+ const postScanUrl = navigatedToUrl || page.url();
510
+ if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
511
+ urlsCrawled.notScannedRedirects.push({
512
+ fromUrl: request.url,
513
+ toUrl: postScanUrl,
514
+ });
515
+ return;
516
+ }
517
+ }
518
+ catch (_) {
519
+ // Page/context was destroyed during navigation — handled by outer catch
520
+ }
496
521
  if (isRedirected) {
497
- const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);
522
+ const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(normUrl(actualUrl));
498
523
  if (isLoadedUrlInCrawledUrls) {
499
524
  urlsCrawled.notScannedRedirects.push({
500
525
  fromUrl: request.url,
@@ -513,8 +538,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
513
538
  pageTitle: results.pageTitle,
514
539
  actualUrl, // i.e. actualUrl
515
540
  });
516
- scannedUrlSet.add(request.url);
517
- scannedResolvedUrlSet.add(actualUrl);
541
+ scannedUrlSet.add(normUrl(request.url));
542
+ scannedResolvedUrlSet.add(normUrl(actualUrl));
518
543
  urlsCrawled.scannedRedirects.push({
519
544
  fromUrl: request.url,
520
545
  toUrl: actualUrl, // i.e. actualUrl
@@ -535,8 +560,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
535
560
  actualUrl: request.url,
536
561
  pageTitle: results.pageTitle,
537
562
  });
538
- scannedUrlSet.add(request.url);
539
- scannedResolvedUrlSet.add(request.url);
563
+ scannedUrlSet.add(normUrl(request.url));
564
+ scannedResolvedUrlSet.add(normUrl(request.url));
540
565
  await dataset.pushData(results);
541
566
  }
542
567
  }
@@ -3,7 +3,7 @@ import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/consta
3
3
  import { consoleLogger, guiInfoLog } from '../logs.js';
4
4
  import crawlDomain from './crawlDomain.js';
5
5
  import crawlSitemap from './crawlSitemap.js';
6
- import { getPlaywrightLaunchOptions } from '../constants/common.js';
6
+ import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
7
7
  import { register } from '../utils.js';
8
8
  const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, safeMode, scanDuration) => {
9
9
  const startTime = Date.now(); // Track start time
@@ -66,12 +66,30 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
66
66
  return false;
67
67
  }
68
68
  };
69
+ // Discover sitemaps from robots.txt first (supports multiple Sitemap: directives)
70
+ let sitemapUrls = [];
69
71
  try {
70
- sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
72
+ sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
73
+ if (sitemapUrls.length > 0) {
74
+ console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
75
+ sitemapExist = true;
76
+ }
71
77
  }
72
78
  catch (error) {
73
79
  consoleLogger.error(error);
74
80
  }
81
+ // Fall back to hardcoded path probing if robots.txt had no sitemaps
82
+ if (!sitemapExist) {
83
+ try {
84
+ sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
85
+ if (sitemapExist) {
86
+ sitemapUrls = [sitemapUrl];
87
+ }
88
+ }
89
+ catch (error) {
90
+ consoleLogger.error(error);
91
+ }
92
+ }
75
93
  if (!sitemapExist) {
76
94
  console.log('Unable to find sitemap. Commencing website crawl instead.');
77
95
  return await crawlDomain({
@@ -90,34 +108,48 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
90
108
  followRobots,
91
109
  extraHTTPHeaders,
92
110
  safeMode,
93
- scanDuration, // Use full duration since no sitemap
111
+ scanDuration,
112
+ });
113
+ }
114
+ // Process all discovered sitemaps sequentially, sharing dataset and urlsCrawled
115
+ for (const currentSitemapUrl of sitemapUrls) {
116
+ if (urlsCrawled.scanned.length >= maxRequestsPerCrawl)
117
+ break;
118
+ const elapsed = Date.now() - startTime;
119
+ const remainingDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : scanDuration;
120
+ if (scanDuration > 0 && remainingDuration <= 0) {
121
+ durationExceeded = true;
122
+ break;
123
+ }
124
+ console.log(`Processing sitemap: ${currentSitemapUrl}`);
125
+ urlsCrawledFinal = await crawlSitemap({
126
+ sitemapUrl: currentSitemapUrl,
127
+ randomToken,
128
+ host,
129
+ viewportSettings,
130
+ maxRequestsPerCrawl,
131
+ browser,
132
+ userDataDirectory,
133
+ specifiedMaxConcurrency,
134
+ fileTypes,
135
+ blacklistedPatterns,
136
+ includeScreenshots,
137
+ extraHTTPHeaders,
138
+ strategy,
139
+ userUrl: url,
140
+ fromCrawlIntelligentSitemap,
141
+ userUrlInputFromIntelligent: url,
142
+ datasetFromIntelligent: dataset,
143
+ urlsCrawledFromIntelligent: urlsCrawled,
144
+ crawledFromLocalFile: false,
145
+ scanDuration: scanDuration > 0 ? remainingDuration : 0,
94
146
  });
95
147
  }
96
- console.log(`Sitemap found at ${sitemapUrl}`);
97
- urlsCrawledFinal = await crawlSitemap({
98
- sitemapUrl,
99
- randomToken,
100
- host,
101
- viewportSettings,
102
- maxRequestsPerCrawl,
103
- browser,
104
- userDataDirectory,
105
- specifiedMaxConcurrency,
106
- fileTypes,
107
- blacklistedPatterns,
108
- includeScreenshots,
109
- extraHTTPHeaders,
110
- fromCrawlIntelligentSitemap,
111
- userUrlInputFromIntelligent: url,
112
- datasetFromIntelligent: dataset,
113
- urlsCrawledFromIntelligent: urlsCrawled,
114
- crawledFromLocalFile: false,
115
- scanDuration,
116
- });
117
148
  const elapsed = Date.now() - startTime;
118
- const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0); // in seconds
119
- if (urlsCrawledFinal.scanned.length < maxRequestsPerCrawl && remainingScanDuration > 0) {
120
- console.log(`Continuing crawl from root website. Remaining scan time: ${remainingScanDuration.toFixed(1)}s`);
149
+ const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
150
+ const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
151
+ if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
152
+ console.log(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
121
153
  urlsCrawledFinal = await crawlDomain({
122
154
  url,
123
155
  randomToken,
@@ -136,15 +168,15 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
136
168
  safeMode,
137
169
  fromCrawlIntelligentSitemap,
138
170
  datasetFromIntelligent: dataset,
139
- urlsCrawledFromIntelligent: urlsCrawledFinal,
171
+ urlsCrawledFromIntelligent: urlsCrawled,
140
172
  scanDuration: remainingScanDuration,
141
173
  });
142
174
  }
143
- else if (remainingScanDuration <= 0) {
175
+ else if (!hasDurationRemaining) {
144
176
  console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
145
177
  durationExceeded = true;
146
178
  }
147
179
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
148
- return { urlsCrawled: urlsCrawledFinal, durationExceeded };
180
+ return { urlsCrawled, durationExceeded };
149
181
  };
150
182
  export default crawlIntelligentSitemap;
@@ -1,17 +1,18 @@
1
- import crawlee, { RequestList } from 'crawlee';
1
+ import crawlee, { EnqueueStrategy, RequestList } from 'crawlee';
2
2
  import * as path from 'path';
3
3
  import fsp from 'fs/promises';
4
4
  import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
5
5
  import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, disallowedListOfPatterns, FileTypes, } from '../constants/constants.js';
6
6
  import { getLinksFromSitemap, getPlaywrightLaunchOptions, isSkippedUrl, waitForPageLoaded, isFilePath, } from '../constants/common.js';
7
- import { areLinksEqual, isWhitelistedContentType, register } from '../utils.js';
7
+ import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
8
8
  import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
9
9
  import { guiInfoLog } from '../logs.js';
10
- const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
10
+ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = '', scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
11
11
  const crawlStartTime = Date.now();
12
12
  let dataset;
13
13
  let urlsCrawled;
14
14
  let durationExceeded = false;
15
+ let isAbortingScan = false;
15
16
  if (fromCrawlIntelligentSitemap) {
16
17
  dataset = datasetFromIntelligent;
17
18
  urlsCrawled = urlsCrawledFromIntelligent;
@@ -24,7 +25,7 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
24
25
  console.log('Local file crawling not supported for sitemap. Please provide a valid URL.');
25
26
  return;
26
27
  }
27
- const linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, extraHTTPHeaders);
28
+ const linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, extraHTTPHeaders, strategy, userUrl || sitemapUrl);
28
29
  sitemapUrl = encodeURI(sitemapUrl);
29
30
  const pdfDownloads = [];
30
31
  const uuidToPdfMapping = {};
@@ -144,106 +145,162 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
144
145
  });
145
146
  return;
146
147
  }
147
- await waitForPageLoaded(page, 10000);
148
- const actualUrl = page.url() || request.loadedUrl || request.url;
149
- const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
150
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
151
- if (hasExceededDuration) {
152
- console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
153
- durationExceeded = true;
154
- }
155
- crawler.autoscaledPool.abort(); // stops new requests
156
- return;
157
- }
158
- if (request.skipNavigation && actualUrl === 'about:blank') {
159
- if (isScanPdfs) {
160
- // pushes download promise into pdfDownloads
161
- const { pdfFileName, url } = handlePdfDownload(randomToken, pdfDownloads, request, sendRequest, urlsCrawled);
162
- uuidToPdfMapping[pdfFileName] = url;
148
+ try {
149
+ await waitForPageLoaded(page, 10000);
150
+ const actualUrl = page.url() || request.loadedUrl || request.url;
151
+ const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
152
+ if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
153
+ isAbortingScan = true;
154
+ if (hasExceededDuration) {
155
+ console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
156
+ durationExceeded = true;
157
+ }
158
+ crawler.autoscaledPool.abort(); // stops new requests
163
159
  return;
164
160
  }
165
- guiInfoLog(guiInfoStatusTypes.SKIPPED, {
166
- numScanned: urlsCrawled.scanned.length,
167
- urlScanned: request.url,
168
- });
169
- urlsCrawled.userExcluded.push({
170
- url: request.url,
171
- pageTitle: request.url,
172
- actualUrl: request.url, // because about:blank is not useful
173
- metadata: STATUS_CODE_METADATA[1],
174
- httpStatusCode: 1,
175
- });
176
- return;
177
- }
178
- const contentType = response?.headers?.()['content-type'] || '';
179
- const status = response ? response.status() : 0;
180
- if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
181
- const isRedirected = !areLinksEqual(page.url(), request.url);
182
- const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => (item.actualUrl || item.url) === page.url());
183
- if (isRedirected && isLoadedUrlInCrawledUrls) {
184
- urlsCrawled.notScannedRedirects.push({
185
- fromUrl: request.url,
186
- toUrl: actualUrl, // i.e. actualUrl
161
+ if (request.skipNavigation && actualUrl === 'about:blank') {
162
+ if (isScanPdfs) {
163
+ // pushes download promise into pdfDownloads
164
+ const { pdfFileName, url } = handlePdfDownload(randomToken, pdfDownloads, request, sendRequest, urlsCrawled);
165
+ uuidToPdfMapping[pdfFileName] = url;
166
+ return;
167
+ }
168
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
169
+ numScanned: urlsCrawled.scanned.length,
170
+ urlScanned: request.url,
187
171
  });
188
- return;
189
- }
190
- // This logic is different from crawlDomain, as it also checks if the page is redirected before checking if it is excluded using exclusions.txt
191
- if (isRedirected && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
192
172
  urlsCrawled.userExcluded.push({
193
173
  url: request.url,
194
174
  pageTitle: request.url,
195
- actualUrl,
196
- metadata: STATUS_CODE_METADATA[0],
197
- httpStatusCode: 0,
175
+ actualUrl: request.url, // because about:blank is not useful
176
+ metadata: STATUS_CODE_METADATA[1],
177
+ httpStatusCode: 1,
198
178
  });
179
+ return;
180
+ }
181
+ const contentType = response?.headers?.()['content-type'] || '';
182
+ const status = response ? response.status() : 0;
183
+ if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
184
+ const isRedirected = !areLinksEqual(page.url(), request.url);
185
+ const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => normUrl(item.actualUrl || item.url) === normUrl(page.url()));
186
+ if (isRedirected && isLoadedUrlInCrawledUrls) {
187
+ urlsCrawled.notScannedRedirects.push({
188
+ fromUrl: request.url,
189
+ toUrl: actualUrl, // i.e. actualUrl
190
+ });
191
+ return;
192
+ }
193
+ // This logic is different from crawlDomain, as it also checks if the page is redirected before checking if it is excluded using exclusions.txt
194
+ if (isRedirected && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
195
+ urlsCrawled.userExcluded.push({
196
+ url: request.url,
197
+ pageTitle: request.url,
198
+ actualUrl,
199
+ metadata: STATUS_CODE_METADATA[0],
200
+ httpStatusCode: 0,
201
+ });
202
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
203
+ numScanned: urlsCrawled.scanned.length,
204
+ urlScanned: request.url,
205
+ });
206
+ return;
207
+ }
208
+ if (isRedirected && !isFollowStrategy(actualUrl, request.url, 'same-hostname')) {
209
+ urlsCrawled.notScannedRedirects.push({
210
+ fromUrl: request.url,
211
+ toUrl: actualUrl,
212
+ });
213
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
214
+ numScanned: urlsCrawled.scanned.length,
215
+ urlScanned: request.url,
216
+ });
217
+ return;
218
+ }
219
+ const results = await runAxeScript({ includeScreenshots, page, randomToken });
220
+ // Detect JS redirects that fire during/after axe scan.
221
+ // Listen for navigation, then give a brief window for pending redirects to complete.
222
+ try {
223
+ let navigatedToUrl = null;
224
+ const onFrameNavigated = (frame) => {
225
+ if (frame === page.mainFrame()) {
226
+ navigatedToUrl = frame.url();
227
+ }
228
+ };
229
+ page.on('framenavigated', onFrameNavigated);
230
+ await page.waitForTimeout(1000);
231
+ page.off('framenavigated', onFrameNavigated);
232
+ const postScanUrl = navigatedToUrl || page.url();
233
+ if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
234
+ urlsCrawled.notScannedRedirects.push({
235
+ fromUrl: request.url,
236
+ toUrl: postScanUrl,
237
+ });
238
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
239
+ numScanned: urlsCrawled.scanned.length,
240
+ urlScanned: request.url,
241
+ });
242
+ return;
243
+ }
244
+ }
245
+ catch (_) {
246
+ // Page/context was destroyed during navigation — handled by outer catch
247
+ }
248
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
249
+ numScanned: urlsCrawled.scanned.length,
250
+ urlScanned: request.url,
251
+ });
252
+ urlsCrawled.scanned.push({
253
+ url: request.url,
254
+ pageTitle: results.pageTitle,
255
+ actualUrl, // i.e. actualUrl
256
+ });
257
+ urlsCrawled.scannedRedirects.push({
258
+ fromUrl: request.url,
259
+ toUrl: actualUrl,
260
+ });
261
+ results.url = request.url;
262
+ results.actualUrl = actualUrl;
263
+ await dataset.pushData(results);
264
+ }
265
+ else {
199
266
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
200
267
  numScanned: urlsCrawled.scanned.length,
201
268
  urlScanned: request.url,
202
269
  });
203
- return;
270
+ if (isScanHtml) {
271
+ // carry through the HTTP status metadata
272
+ const status = response?.status();
273
+ const metadata = typeof status === 'number'
274
+ ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
275
+ : STATUS_CODE_METADATA[2];
276
+ urlsCrawled.invalid.push({
277
+ actualUrl,
278
+ url: request.url,
279
+ pageTitle: request.url,
280
+ metadata,
281
+ httpStatusCode: typeof status === 'number' ? status : 0,
282
+ });
283
+ }
204
284
  }
205
- const results = await runAxeScript({ includeScreenshots, page, randomToken });
206
- guiInfoLog(guiInfoStatusTypes.SCANNED, {
207
- numScanned: urlsCrawled.scanned.length,
208
- urlScanned: request.url,
209
- });
210
- urlsCrawled.scanned.push({
211
- url: request.url,
212
- pageTitle: results.pageTitle,
213
- actualUrl, // i.e. actualUrl
214
- });
215
- urlsCrawled.scannedRedirects.push({
216
- fromUrl: request.url,
217
- toUrl: actualUrl,
218
- });
219
- results.url = request.url;
220
- results.actualUrl = actualUrl;
221
- await dataset.pushData(results);
222
285
  }
223
- else {
224
- guiInfoLog(guiInfoStatusTypes.SKIPPED, {
225
- numScanned: urlsCrawled.scanned.length,
226
- urlScanned: request.url,
227
- });
228
- if (isScanHtml) {
229
- // carry through the HTTP status metadata
230
- const status = response?.status();
231
- const metadata = typeof status === 'number'
232
- ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
233
- : STATUS_CODE_METADATA[2];
234
- urlsCrawled.invalid.push({
235
- actualUrl,
286
+ catch (e) {
287
+ if (!isAbortingScan) {
288
+ guiInfoLog(guiInfoStatusTypes.ERROR, {
289
+ numScanned: urlsCrawled.scanned.length,
290
+ urlScanned: request.url,
291
+ });
292
+ urlsCrawled.error.push({
236
293
  url: request.url,
237
294
  pageTitle: request.url,
238
- metadata,
239
- httpStatusCode: typeof status === 'number' ? status : 0,
295
+ actualUrl: request.url,
296
+ metadata: STATUS_CODE_METADATA[2],
297
+ httpStatusCode: 0,
240
298
  });
241
299
  }
242
300
  }
243
301
  },
244
302
  failedRequestHandler: async ({ request, response, error }) => {
245
- // check if scanned pages have reached limit due to multi-instances of handler running
246
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
303
+ if (isAbortingScan) {
247
304
  return;
248
305
  }
249
306
  guiInfoLog(guiInfoStatusTypes.ERROR, {