@govtechsg/oobee 0.10.85 → 0.10.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/.github/workflows/publish.yml +10 -0
  2. package/DETAILS.md +29 -0
  3. package/dist/cli.js +18 -5
  4. package/dist/combine.js +3 -1
  5. package/dist/constants/cliFunctions.js +2 -2
  6. package/dist/constants/common.js +70 -17
  7. package/dist/constants/constants.js +604 -1
  8. package/dist/crawlers/commonCrawlerFunc.js +3 -2
  9. package/dist/crawlers/crawlDomain.js +38 -13
  10. package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
  11. package/dist/crawlers/crawlSitemap.js +141 -84
  12. package/dist/crawlers/custom/utils.js +218 -71
  13. package/dist/crawlers/guards/urlGuard.js +8 -15
  14. package/dist/crawlers/runCustom.js +18 -11
  15. package/dist/generateHtmlReport.js +18 -11
  16. package/dist/generateOobeeClientScanner.js +570 -0
  17. package/dist/mergeAxeResults/itemReferences.js +60 -25
  18. package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
  19. package/dist/mergeAxeResults.js +23 -13
  20. package/dist/npmIndex.js +10 -2
  21. package/dist/proxyService.js +18 -3
  22. package/dist/services/s3Uploader.js +21 -10
  23. package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  24. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
  25. package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
  26. package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
  27. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
  28. package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  29. package/dist/static/ejs/summary.ejs +19 -8
  30. package/dist/utils.js +4 -3
  31. package/fix-summary-html-oom-pr.md +62 -0
  32. package/oobee-client-scanner.js +34992 -0
  33. package/package.json +5 -5
  34. package/src/cli.ts +19 -5
  35. package/src/combine.ts +5 -1
  36. package/src/constants/cliFunctions.ts +2 -2
  37. package/src/constants/common.ts +87 -22
  38. package/src/constants/constants.ts +602 -1
  39. package/src/crawlers/commonCrawlerFunc.ts +4 -3
  40. package/src/crawlers/crawlDomain.ts +39 -13
  41. package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
  42. package/src/crawlers/crawlSitemap.ts +165 -100
  43. package/src/crawlers/custom/utils.ts +241 -80
  44. package/src/crawlers/guards/urlGuard.ts +24 -31
  45. package/src/crawlers/runCustom.ts +29 -11
  46. package/src/generateHtmlReport.ts +21 -11
  47. package/src/generateOobeeClientScanner.ts +591 -0
  48. package/src/mergeAxeResults/itemReferences.ts +70 -26
  49. package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
  50. package/src/mergeAxeResults.ts +26 -14
  51. package/src/npmIndex.ts +12 -2
  52. package/src/proxyService.ts +25 -4
  53. package/src/services/s3Uploader.ts +23 -11
  54. package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  55. package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
  56. package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
  57. package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
  58. package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
  59. package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  60. package/src/static/ejs/summary.ejs +19 -8
  61. package/src/utils.ts +4 -3
  62. package/testStaticJSScanner.html +534 -0
@@ -196,7 +196,7 @@ export const filterAxeResults = (
196
196
  const conformance = tags.filter(tag => tag.startsWith('wcag') || tag === 'best-practice');
197
197
 
198
198
  nodes.forEach(node => {
199
- const { html } = node;
199
+ const { html, target } = node;
200
200
  if (!(rule in passed.rules)) {
201
201
  passed.rules[rule] = {
202
202
  description,
@@ -207,9 +207,10 @@ export const filterAxeResults = (
207
207
  items: [],
208
208
  };
209
209
  }
210
-
210
+
211
211
  const finalHtml = truncateHtml(html);
212
- passed.rules[rule].items.push({ html: finalHtml, screenshotPath: '', message: '', xpath: '' });
212
+ const xpath = target.length === 1 && typeof target[0] === 'string' ? target[0] : undefined;
213
+ passed.rules[rule].items.push({ html: finalHtml, screenshotPath: '', message: '', xpath: xpath || '' });
213
214
 
214
215
  passed.totalItems += 1;
215
216
  passed.rules[rule].totalItems += 1;
@@ -29,7 +29,7 @@ import {
29
29
  getUrlsFromRobotsTxt,
30
30
  waitForPageLoaded,
31
31
  } from '../constants/common.js';
32
- import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
32
+ import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
33
33
  import {
34
34
  handlePdfDownload,
35
35
  runPdfScan,
@@ -116,9 +116,9 @@ const crawlDomain = async ({
116
116
  const pdfDownloads: Promise<void>[] = [];
117
117
  const uuidToPdfMapping: Record<string, string> = {};
118
118
  const queuedUrlSet = new Set<string>();
119
- const scannedUrlSet = new Set<string>(urlsCrawled.scanned.map(item => item.url));
119
+ const scannedUrlSet = new Set<string>(urlsCrawled.scanned.map(item => normUrl(item.url)));
120
120
  const scannedResolvedUrlSet = new Set<string>(
121
- urlsCrawled.scanned.map(item => item.actualUrl || item.url),
121
+ urlsCrawled.scanned.map(item => normUrl(item.actualUrl || item.url)),
122
122
  );
123
123
  const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
124
124
  const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
@@ -166,13 +166,14 @@ const crawlDomain = async ({
166
166
  const selectedElementsString = cssQuerySelectors.join(', ');
167
167
 
168
168
  const isExcluded = (newPageUrl: string): boolean => {
169
- const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
169
+ const isAlreadyScanned: boolean = scannedUrlSet.has(normUrl(newPageUrl));
170
170
  const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
171
171
  const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
172
172
  const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern =>
173
173
  newPageUrl.toLowerCase().startsWith(pattern),
174
174
  );
175
- return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
175
+ const isRobotsDisallowed: boolean = isDisallowedInRobotsTxt(newPageUrl);
176
+ return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy || isRobotsDisallowed;
176
177
  };
177
178
  const setPageListeners = (pageListener: Page): void => {
178
179
  // event listener to handle new page popups upon button click
@@ -341,7 +342,7 @@ const crawlDomain = async ({
341
342
  } catch (e) {
342
343
  consoleLogger.error(e);
343
344
  }
344
- if (scannedUrlSet.has(req.url)) {
345
+ if (scannedUrlSet.has(normUrl(req.url))) {
345
346
  req.skipNavigation = true;
346
347
  }
347
348
  if (isDisallowedInRobotsTxt(req.url)) return null;
@@ -481,7 +482,7 @@ const crawlDomain = async ({
481
482
  }
482
483
 
483
484
  const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
484
- if (isRedirected) {
485
+ if (isRedirected && !isDisallowedInRobotsTxt(finalUrl)) {
485
486
  await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
486
487
  } else {
487
488
  request.skipNavigation = false;
@@ -537,7 +538,7 @@ const crawlDomain = async ({
537
538
  }
538
539
 
539
540
  // if URL has already been scanned
540
- if (scannedUrlSet.has(request.url)) {
541
+ if (scannedUrlSet.has(normUrl(request.url))) {
541
542
  await enqueueProcess(page, enqueueLinks, browserContext);
542
543
  return;
543
544
  }
@@ -654,8 +655,33 @@ const crawlDomain = async ({
654
655
 
655
656
  const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
656
657
 
658
+ // Detect JS redirects that fire during/after axe scan.
659
+ // Listen for navigation, then give a brief window for pending redirects to complete.
660
+ try {
661
+ let navigatedToUrl: string | null = null;
662
+ const onFrameNavigated = (frame: Frame) => {
663
+ if (frame === page.mainFrame()) {
664
+ navigatedToUrl = frame.url();
665
+ }
666
+ };
667
+ page.on('framenavigated', onFrameNavigated);
668
+ await page.waitForTimeout(1000);
669
+ page.off('framenavigated', onFrameNavigated);
670
+
671
+ const postScanUrl = navigatedToUrl || page.url();
672
+ if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
673
+ urlsCrawled.notScannedRedirects.push({
674
+ fromUrl: request.url,
675
+ toUrl: postScanUrl,
676
+ });
677
+ return;
678
+ }
679
+ } catch (_) {
680
+ // Page/context was destroyed during navigation — handled by outer catch
681
+ }
682
+
657
683
  if (isRedirected) {
658
- const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);
684
+ const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(normUrl(actualUrl));
659
685
 
660
686
  if (isLoadedUrlInCrawledUrls) {
661
687
  urlsCrawled.notScannedRedirects.push({
@@ -677,8 +703,8 @@ const crawlDomain = async ({
677
703
  pageTitle: results.pageTitle,
678
704
  actualUrl, // i.e. actualUrl
679
705
  });
680
- scannedUrlSet.add(request.url);
681
- scannedResolvedUrlSet.add(actualUrl);
706
+ scannedUrlSet.add(normUrl(request.url));
707
+ scannedResolvedUrlSet.add(normUrl(actualUrl));
682
708
 
683
709
  urlsCrawled.scannedRedirects.push({
684
710
  fromUrl: request.url,
@@ -700,8 +726,8 @@ const crawlDomain = async ({
700
726
  actualUrl: request.url,
701
727
  pageTitle: results.pageTitle,
702
728
  });
703
- scannedUrlSet.add(request.url);
704
- scannedResolvedUrlSet.add(request.url);
729
+ scannedUrlSet.add(normUrl(request.url));
730
+ scannedResolvedUrlSet.add(normUrl(request.url));
705
731
  await dataset.pushData(results);
706
732
  }
707
733
  } else {
@@ -7,7 +7,7 @@ import { consoleLogger, guiInfoLog } from '../logs.js';
7
7
  import crawlDomain from './crawlDomain.js';
8
8
  import crawlSitemap from './crawlSitemap.js';
9
9
  import { ViewportSettingsClass } from '../combine.js';
10
- import { getPlaywrightLaunchOptions } from '../constants/common.js';
10
+ import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
11
11
  import { register } from '../utils.js';
12
12
 
13
13
  const crawlIntelligentSitemap = async (
@@ -100,12 +100,30 @@ const crawlIntelligentSitemap = async (
100
100
  }
101
101
  };
102
102
 
103
+ // Discover sitemaps from robots.txt first (supports multiple Sitemap: directives)
104
+ let sitemapUrls: string[] = [];
103
105
  try {
104
- sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
106
+ sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
107
+ if (sitemapUrls.length > 0) {
108
+ console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
109
+ sitemapExist = true;
110
+ }
105
111
  } catch (error) {
106
112
  consoleLogger.error(error);
107
113
  }
108
114
 
115
+ // Fall back to hardcoded path probing if robots.txt had no sitemaps
116
+ if (!sitemapExist) {
117
+ try {
118
+ sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
119
+ if (sitemapExist) {
120
+ sitemapUrls = [sitemapUrl];
121
+ }
122
+ } catch (error) {
123
+ consoleLogger.error(error);
124
+ }
125
+ }
126
+
109
127
  if (!sitemapExist) {
110
128
  console.log('Unable to find sitemap. Commencing website crawl instead.');
111
129
  return await crawlDomain({
@@ -124,38 +142,53 @@ const crawlIntelligentSitemap = async (
124
142
  followRobots,
125
143
  extraHTTPHeaders,
126
144
  safeMode,
127
- scanDuration, // Use full duration since no sitemap
145
+ scanDuration,
128
146
  });
129
147
  }
130
148
 
131
- console.log(`Sitemap found at ${sitemapUrl}`);
132
- urlsCrawledFinal = await crawlSitemap({
133
- sitemapUrl,
134
- randomToken,
135
- host,
136
- viewportSettings,
137
- maxRequestsPerCrawl,
138
- browser,
139
- userDataDirectory,
140
- specifiedMaxConcurrency,
141
- fileTypes,
142
- blacklistedPatterns,
143
- includeScreenshots,
144
- extraHTTPHeaders,
145
- fromCrawlIntelligentSitemap,
146
- userUrlInputFromIntelligent: url,
147
- datasetFromIntelligent: dataset,
148
- urlsCrawledFromIntelligent: urlsCrawled,
149
- crawledFromLocalFile: false,
150
- scanDuration,
151
- });
149
+ // Process all discovered sitemaps sequentially, sharing dataset and urlsCrawled
150
+ for (const currentSitemapUrl of sitemapUrls) {
151
+ if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) break;
152
+
153
+ const elapsed = Date.now() - startTime;
154
+ const remainingDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : scanDuration;
155
+ if (scanDuration > 0 && remainingDuration <= 0) {
156
+ durationExceeded = true;
157
+ break;
158
+ }
159
+
160
+ console.log(`Processing sitemap: ${currentSitemapUrl}`);
161
+ urlsCrawledFinal = await crawlSitemap({
162
+ sitemapUrl: currentSitemapUrl,
163
+ randomToken,
164
+ host,
165
+ viewportSettings,
166
+ maxRequestsPerCrawl,
167
+ browser,
168
+ userDataDirectory,
169
+ specifiedMaxConcurrency,
170
+ fileTypes,
171
+ blacklistedPatterns,
172
+ includeScreenshots,
173
+ extraHTTPHeaders,
174
+ strategy,
175
+ userUrl: url,
176
+ fromCrawlIntelligentSitemap,
177
+ userUrlInputFromIntelligent: url,
178
+ datasetFromIntelligent: dataset,
179
+ urlsCrawledFromIntelligent: urlsCrawled,
180
+ crawledFromLocalFile: false,
181
+ scanDuration: scanDuration > 0 ? remainingDuration : 0,
182
+ });
183
+ }
152
184
 
153
185
  const elapsed = Date.now() - startTime;
154
- const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0); // in seconds
186
+ const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
187
+ const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
155
188
 
156
- if (urlsCrawledFinal.scanned.length < maxRequestsPerCrawl && remainingScanDuration > 0) {
189
+ if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
157
190
  console.log(
158
- `Continuing crawl from root website. Remaining scan time: ${remainingScanDuration.toFixed(1)}s`,
191
+ `Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`,
159
192
  );
160
193
  urlsCrawledFinal = await crawlDomain({
161
194
  url,
@@ -175,10 +208,10 @@ const crawlIntelligentSitemap = async (
175
208
  safeMode,
176
209
  fromCrawlIntelligentSitemap,
177
210
  datasetFromIntelligent: dataset,
178
- urlsCrawledFromIntelligent: urlsCrawledFinal,
211
+ urlsCrawledFromIntelligent: urlsCrawled,
179
212
  scanDuration: remainingScanDuration,
180
213
  });
181
- } else if (remainingScanDuration <= 0) {
214
+ } else if (!hasDurationRemaining) {
182
215
  console.log(
183
216
  `Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`,
184
217
  );
@@ -186,7 +219,7 @@ const crawlIntelligentSitemap = async (
186
219
  }
187
220
 
188
221
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
189
- return { urlsCrawled: urlsCrawledFinal, durationExceeded };
222
+ return { urlsCrawled, durationExceeded };
190
223
  };
191
224
 
192
225
  export default crawlIntelligentSitemap;
@@ -1,4 +1,4 @@
1
- import crawlee, { LaunchContext, Request, RequestList, Dataset } from 'crawlee';
1
+ import crawlee, { EnqueueStrategy, LaunchContext, Request, RequestList, Dataset } from 'crawlee';
2
2
  import fs from 'fs';
3
3
  import * as path from 'path';
4
4
  import fsp from 'fs/promises';
@@ -23,7 +23,7 @@ import {
23
23
  waitForPageLoaded,
24
24
  isFilePath,
25
25
  } from '../constants/common.js';
26
- import { areLinksEqual, isWhitelistedContentType, register } from '../utils.js';
26
+ import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
27
27
  import {
28
28
  handlePdfDownload,
29
29
  runPdfScan,
@@ -46,6 +46,8 @@ const crawlSitemap = async ({
46
46
  blacklistedPatterns,
47
47
  includeScreenshots,
48
48
  extraHTTPHeaders,
49
+ strategy = EnqueueStrategy.All,
50
+ userUrl = '',
49
51
  scanDuration = 0,
50
52
  fromCrawlIntelligentSitemap = false,
51
53
  userUrlInputFromIntelligent = null,
@@ -65,6 +67,8 @@ const crawlSitemap = async ({
65
67
  blacklistedPatterns: string[];
66
68
  includeScreenshots: boolean;
67
69
  extraHTTPHeaders: Record<string, string>;
70
+ strategy?: EnqueueStrategy;
71
+ userUrl?: string;
68
72
  scanDuration?: number;
69
73
  fromCrawlIntelligentSitemap?: boolean;
70
74
  userUrlInputFromIntelligent?: string;
@@ -76,6 +80,7 @@ const crawlSitemap = async ({
76
80
  let dataset: crawlee.Dataset;
77
81
  let urlsCrawled: UrlsCrawled;
78
82
  let durationExceeded = false;
83
+ let isAbortingScan = false;
79
84
 
80
85
  if (fromCrawlIntelligentSitemap) {
81
86
  dataset = datasetFromIntelligent;
@@ -98,6 +103,8 @@ const crawlSitemap = async ({
98
103
  userUrlInputFromIntelligent,
99
104
  fromCrawlIntelligentSitemap,
100
105
  extraHTTPHeaders,
106
+ strategy,
107
+ userUrl || sitemapUrl,
101
108
  );
102
109
 
103
110
  sitemapUrl = encodeURI(sitemapUrl);
@@ -244,135 +251,193 @@ const crawlSitemap = async ({
244
251
  return;
245
252
  }
246
253
 
247
- await waitForPageLoaded(page, 10000);
254
+ try {
255
+ await waitForPageLoaded(page, 10000);
248
256
 
249
- const actualUrl = page.url() || request.loadedUrl || request.url;
257
+ const actualUrl = page.url() || request.loadedUrl || request.url;
250
258
 
251
- const hasExceededDuration =
252
- scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
259
+ const hasExceededDuration =
260
+ scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
253
261
 
254
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
255
- if (hasExceededDuration) {
256
- console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
257
- durationExceeded = true;
262
+ if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
263
+ isAbortingScan = true;
264
+ if (hasExceededDuration) {
265
+ console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
266
+ durationExceeded = true;
267
+ }
268
+ crawler.autoscaledPool.abort(); // stops new requests
269
+ return;
258
270
  }
259
- crawler.autoscaledPool.abort(); // stops new requests
260
- return;
261
- }
262
271
 
263
- if (request.skipNavigation && actualUrl === 'about:blank') {
264
- if (isScanPdfs) {
265
- // pushes download promise into pdfDownloads
266
- const { pdfFileName, url } = handlePdfDownload(
267
- randomToken,
268
- pdfDownloads,
269
- request,
270
- sendRequest,
271
- urlsCrawled,
272
- );
272
+ if (request.skipNavigation && actualUrl === 'about:blank') {
273
+ if (isScanPdfs) {
274
+ // pushes download promise into pdfDownloads
275
+ const { pdfFileName, url } = handlePdfDownload(
276
+ randomToken,
277
+ pdfDownloads,
278
+ request,
279
+ sendRequest,
280
+ urlsCrawled,
281
+ );
282
+
283
+ uuidToPdfMapping[pdfFileName] = url;
284
+ return;
285
+ }
286
+
287
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
288
+ numScanned: urlsCrawled.scanned.length,
289
+ urlScanned: request.url,
290
+ });
291
+ urlsCrawled.userExcluded.push({
292
+ url: request.url,
293
+ pageTitle: request.url,
294
+ actualUrl: request.url, // because about:blank is not useful
295
+ metadata: STATUS_CODE_METADATA[1],
296
+ httpStatusCode: 1,
297
+ });
273
298
 
274
- uuidToPdfMapping[pdfFileName] = url;
275
299
  return;
276
300
  }
277
301
 
278
- guiInfoLog(guiInfoStatusTypes.SKIPPED, {
279
- numScanned: urlsCrawled.scanned.length,
280
- urlScanned: request.url,
281
- });
282
- urlsCrawled.userExcluded.push({
283
- url: request.url,
284
- pageTitle: request.url,
285
- actualUrl: request.url, // because about:blank is not useful
286
- metadata: STATUS_CODE_METADATA[1],
287
- httpStatusCode: 1,
288
- });
302
+ const contentType = response?.headers?.()['content-type'] || '';
303
+ const status = response ? response.status() : 0;
289
304
 
290
- return;
291
- }
305
+ if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
306
+ const isRedirected = !areLinksEqual(page.url(), request.url);
307
+ const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
308
+ item => normUrl(item.actualUrl || item.url) === normUrl(page.url()),
309
+ );
292
310
 
293
- const contentType = response?.headers?.()['content-type'] || '';
294
- const status = response ? response.status() : 0;
311
+ if (isRedirected && isLoadedUrlInCrawledUrls) {
312
+ urlsCrawled.notScannedRedirects.push({
313
+ fromUrl: request.url,
314
+ toUrl: actualUrl, // i.e. actualUrl
315
+ });
316
+ return;
317
+ }
295
318
 
296
- if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
297
- const isRedirected = !areLinksEqual(page.url(), request.url);
298
- const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
299
- item => (item.actualUrl || item.url) === page.url(),
300
- );
319
+ // This logic is different from crawlDomain, as it also checks if the pae is redirected before checking if it is excluded using exclusions.txt
320
+ if (isRedirected && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
321
+ urlsCrawled.userExcluded.push({
322
+ url: request.url,
323
+ pageTitle: request.url,
324
+ actualUrl,
325
+ metadata: STATUS_CODE_METADATA[0],
326
+ httpStatusCode: 0,
327
+ });
301
328
 
302
- if (isRedirected && isLoadedUrlInCrawledUrls) {
303
- urlsCrawled.notScannedRedirects.push({
304
- fromUrl: request.url,
305
- toUrl: actualUrl, // i.e. actualUrl
306
- });
307
- return;
308
- }
329
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
330
+ numScanned: urlsCrawled.scanned.length,
331
+ urlScanned: request.url,
332
+ });
333
+ return;
334
+ }
309
335
 
310
- // This logic is different from crawlDomain, as it also checks if the pae is redirected before checking if it is excluded using exclusions.txt
311
- if (isRedirected && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
312
- urlsCrawled.userExcluded.push({
313
- url: request.url,
314
- pageTitle: request.url,
315
- actualUrl,
316
- metadata: STATUS_CODE_METADATA[0],
317
- httpStatusCode: 0,
318
- });
336
+ if (isRedirected && !isFollowStrategy(actualUrl, request.url, 'same-hostname')) {
337
+ urlsCrawled.notScannedRedirects.push({
338
+ fromUrl: request.url,
339
+ toUrl: actualUrl,
340
+ });
341
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
342
+ numScanned: urlsCrawled.scanned.length,
343
+ urlScanned: request.url,
344
+ });
345
+ return;
346
+ }
319
347
 
320
- guiInfoLog(guiInfoStatusTypes.SKIPPED, {
348
+ const results = await runAxeScript({ includeScreenshots, page, randomToken });
349
+
350
+ // Detect JS redirects that fire during/after axe scan.
351
+ // Listen for navigation, then give a brief window for pending redirects to complete.
352
+ try {
353
+ let navigatedToUrl: string | null = null;
354
+ const onFrameNavigated = (frame: any) => {
355
+ if (frame === page.mainFrame()) {
356
+ navigatedToUrl = frame.url();
357
+ }
358
+ };
359
+ page.on('framenavigated', onFrameNavigated);
360
+ await page.waitForTimeout(1000);
361
+ page.off('framenavigated', onFrameNavigated);
362
+
363
+ const postScanUrl = navigatedToUrl || page.url();
364
+ if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
365
+ urlsCrawled.notScannedRedirects.push({
366
+ fromUrl: request.url,
367
+ toUrl: postScanUrl,
368
+ });
369
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
370
+ numScanned: urlsCrawled.scanned.length,
371
+ urlScanned: request.url,
372
+ });
373
+ return;
374
+ }
375
+ } catch (_) {
376
+ // Page/context was destroyed during navigation — handled by outer catch
377
+ }
378
+
379
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
321
380
  numScanned: urlsCrawled.scanned.length,
322
381
  urlScanned: request.url,
323
382
  });
324
- return;
325
- }
326
-
327
- const results = await runAxeScript({ includeScreenshots, page, randomToken });
328
-
329
- guiInfoLog(guiInfoStatusTypes.SCANNED, {
330
- numScanned: urlsCrawled.scanned.length,
331
- urlScanned: request.url,
332
- });
333
383
 
334
- urlsCrawled.scanned.push({
335
- url: request.url,
336
- pageTitle: results.pageTitle,
337
- actualUrl, // i.e. actualUrl
338
- });
384
+ urlsCrawled.scanned.push({
385
+ url: request.url,
386
+ pageTitle: results.pageTitle,
387
+ actualUrl, // i.e. actualUrl
388
+ });
339
389
 
340
- urlsCrawled.scannedRedirects.push({
341
- fromUrl: request.url,
342
- toUrl: actualUrl,
343
- });
390
+ urlsCrawled.scannedRedirects.push({
391
+ fromUrl: request.url,
392
+ toUrl: actualUrl,
393
+ });
344
394
 
345
- results.url = request.url;
346
- results.actualUrl = actualUrl;
395
+ results.url = request.url;
396
+ results.actualUrl = actualUrl;
347
397
 
348
- await dataset.pushData(results);
349
- } else {
350
- guiInfoLog(guiInfoStatusTypes.SKIPPED, {
351
- numScanned: urlsCrawled.scanned.length,
352
- urlScanned: request.url,
353
- });
398
+ await dataset.pushData(results);
399
+ } else {
400
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
401
+ numScanned: urlsCrawled.scanned.length,
402
+ urlScanned: request.url,
403
+ });
354
404
 
355
- if (isScanHtml) {
356
- // carry through the HTTP status metadata
357
- const status = response?.status();
358
- const metadata =
359
- typeof status === 'number'
360
- ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
361
- : STATUS_CODE_METADATA[2];
405
+ if (isScanHtml) {
406
+ // carry through the HTTP status metadata
407
+ const status = response?.status();
408
+ const metadata =
409
+ typeof status === 'number'
410
+ ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
411
+ : STATUS_CODE_METADATA[2];
412
+
413
+ urlsCrawled.invalid.push({
414
+ actualUrl,
415
+ url: request.url,
416
+ pageTitle: request.url,
417
+ metadata,
418
+ httpStatusCode: typeof status === 'number' ? status : 0,
419
+ });
420
+ }
421
+ }
422
+ } catch (e) {
423
+ if (!isAbortingScan) {
424
+ guiInfoLog(guiInfoStatusTypes.ERROR, {
425
+ numScanned: urlsCrawled.scanned.length,
426
+ urlScanned: request.url,
427
+ });
362
428
 
363
- urlsCrawled.invalid.push({
364
- actualUrl,
429
+ urlsCrawled.error.push({
365
430
  url: request.url,
366
431
  pageTitle: request.url,
367
- metadata,
368
- httpStatusCode: typeof status === 'number' ? status : 0,
432
+ actualUrl: request.url,
433
+ metadata: STATUS_CODE_METADATA[2],
434
+ httpStatusCode: 0,
369
435
  });
370
436
  }
371
437
  }
372
438
  },
373
439
  failedRequestHandler: async ({ request, response, error }) => {
374
- // check if scanned pages have reached limit due to multi-instances of handler running
375
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
440
+ if (isAbortingScan) {
376
441
  return;
377
442
  }
378
443