@govtechsg/oobee 0.10.86 → 0.10.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/.github/workflows/image.yml +2 -3
  2. package/dist/cli.js +18 -5
  3. package/dist/combine.js +2 -0
  4. package/dist/constants/cliFunctions.js +2 -2
  5. package/dist/constants/common.js +55 -13
  6. package/dist/crawlers/crawlDomain.js +38 -13
  7. package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
  8. package/dist/crawlers/crawlSitemap.js +44 -5
  9. package/dist/crawlers/custom/utils.js +81 -40
  10. package/dist/generateHtmlReport.js +18 -11
  11. package/dist/mergeAxeResults/itemReferences.js +60 -25
  12. package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
  13. package/dist/mergeAxeResults.js +18 -9
  14. package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  15. package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
  16. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
  17. package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  18. package/dist/static/ejs/summary.ejs +18 -12
  19. package/dist/utils.js +4 -3
  20. package/fix-summary-html-oom-pr.md +62 -0
  21. package/package.json +5 -5
  22. package/src/cli.ts +19 -5
  23. package/src/combine.ts +2 -0
  24. package/src/constants/cliFunctions.ts +2 -2
  25. package/src/constants/common.ts +65 -12
  26. package/src/crawlers/crawlDomain.ts +39 -13
  27. package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
  28. package/src/crawlers/crawlSitemap.ts +50 -3
  29. package/src/crawlers/custom/utils.ts +99 -43
  30. package/src/generateHtmlReport.ts +21 -11
  31. package/src/mergeAxeResults/itemReferences.ts +70 -26
  32. package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
  33. package/src/mergeAxeResults.ts +21 -11
  34. package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  35. package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
  36. package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
  37. package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  38. package/src/static/ejs/summary.ejs +18 -12
  39. package/src/utils.ts +4 -3
  40. package/testStaticJSScanner.html +1 -1
@@ -146,18 +146,17 @@ jobs:
146
146
  chmod -R u+w "$GITHUB_WORKSPACE/oobee"
147
147
 
148
148
  # Sign all Mach-O (exec bits OR dylib OR node native addons)
149
- # Search $GITHUB_WORKSPACE (not just oobee/) to cover scripts copied to the parent dir
150
149
  while IFS= read -r f; do
151
150
  echo "Signing $f"
152
151
  codesign --force --options runtime --timestamp --sign "${CERTIFICATE_NAME}" "$f"
153
152
  done < <(
154
- find "$GITHUB_WORKSPACE" -type f \
153
+ find "$GITHUB_WORKSPACE/oobee" -type f \
155
154
  \( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
156
155
  ! -path "*/.git/*"
157
156
  )
158
157
 
159
158
  echo "Verifying signatures of Mach-O files..."
160
- find "$GITHUB_WORKSPACE" -type f \( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
159
+ find "$GITHUB_WORKSPACE/oobee" -type f \( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
161
160
  -exec codesign --verify --strict --verbose=2 {} \; || true
162
161
 
163
162
  - name: Cleanup keychain
package/dist/cli.js CHANGED
@@ -147,8 +147,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
147
147
  })
148
148
  .check(argvs => {
149
149
  const scanner = String(argvs.scanner ?? '');
150
- if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
151
- throw new Error('-s or --strategy is only available in website and custom flow scans.');
150
+ if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
151
+ throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
152
+ }
153
+ if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
154
+ throw new Error('-s ignore is only available for sitemap scans.');
152
155
  }
153
156
  return true;
154
157
  })
@@ -161,13 +164,19 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
161
164
  return duration;
162
165
  })
163
166
  .check(argvs => {
164
- if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
165
- throw new Error('-s or --strategy is only available in website scans.');
167
+ if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
168
+ throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
169
+ }
170
+ if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
171
+ throw new Error('-s ignore is only available for sitemap scans.');
166
172
  }
167
173
  return true;
168
174
  })
169
175
  .conflicts('d', 'w')
170
176
  .parse();
177
+ if (!options.strategy) {
178
+ options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
179
+ }
171
180
  const scanInit = async (argvs) => {
172
181
  const updatedArgvs = { ...argvs };
173
182
  // Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
@@ -187,7 +196,11 @@ const scanInit = async (argvs) => {
187
196
  if (res.httpStatus)
188
197
  consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
189
198
  if (res.status === statuses.success.code) {
190
- data.url = res.url;
199
+ // Custom flow should continue from the user-provided entry URL so auth redirects
200
+ // do not replace the original domain used for overlay gating and navigation.
201
+ if (data.type !== ScannerTypes.CUSTOM) {
202
+ data.url = res.url;
203
+ }
191
204
  if (process.env.OOBEE_VALIDATE_URL) {
192
205
  consoleLogger.info('Url is valid');
193
206
  cleanUpAndExit(0, data.randomToken);
package/dist/combine.js CHANGED
@@ -95,6 +95,8 @@ const combineRun = async (details, deviceToScan) => {
95
95
  blacklistedPatterns,
96
96
  includeScreenshots,
97
97
  extraHTTPHeaders,
98
+ strategy,
99
+ userUrl: url,
98
100
  scanDuration,
99
101
  });
100
102
  urlsCrawledObj = sitemapResult.urlsCrawled;
@@ -147,8 +147,8 @@ export const cliOptions = {
147
147
  },
148
148
  s: {
149
149
  alias: 'strategy',
150
- describe: 'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
151
- choices: ['same-domain', 'same-hostname'],
150
+ describe: 'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
151
+ choices: ['same-domain', 'same-hostname', 'ignore'],
152
152
  requiresArg: true,
153
153
  demandOption: false,
154
154
  },
@@ -26,7 +26,7 @@ formDataFields,
26
26
  ScannerTypes, BrowserTypes, FileTypes, getEnumKey, } from './constants.js';
27
27
  import { consoleLogger } from '../logs.js';
28
28
  import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
29
- import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
29
+ import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
30
30
  import { getProxyInfo, proxyInfoToResolution } from '../proxyService.js';
31
31
  // validateDirPath validates a provided directory path
32
32
  // returns null if no error
@@ -592,7 +592,9 @@ export const prepareData = async (argv) => {
592
592
  viewportWidth,
593
593
  playwrightDeviceDetailsObject,
594
594
  maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
595
- strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain,
595
+ strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
596
+ : strategy === 'ignore' ? EnqueueStrategy.All
597
+ : EnqueueStrategy.SameDomain,
596
598
  isLocalFileScan,
597
599
  browser: browserToRun,
598
600
  nameEmail,
@@ -637,6 +639,10 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
637
639
  let shouldCapture = false;
638
640
  const disallowedUrls = [];
639
641
  const allowedUrls = [];
642
+ // Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
643
+ // Two patterns are returned for bare paths (no trailing wildcard) so that
644
+ // both the exact URL and all child paths are blocked, matching robots.txt
645
+ // prefix semantics.
640
646
  const sanitisePattern = (pattern) => {
641
647
  const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
642
648
  const subdirWildcardRegex = /\/\*\//g;
@@ -644,18 +650,29 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
644
650
  if (subdirWildcardRegex.test(pattern)) {
645
651
  pattern = pattern.replace(subdirWildcardRegex, '/**/');
646
652
  }
653
+ // Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
654
+ // '?' is the query separator in robots.txt but a single-char wildcard in
655
+ // minimatch. Escape it to a literal match and append '*' so any query
656
+ // value after the stated prefix is also blocked.
657
+ if (pattern.includes('?')) {
658
+ return [domain + pattern.replace('?', '\\?') + '*'];
659
+ }
647
660
  if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
648
661
  if (pattern.endsWith('*')) {
649
- pattern = pattern.concat('*');
662
+ // e.g. /ebook/* → /ebook/** (already covers all children)
663
+ return [domain + pattern.concat('*')];
650
664
  }
651
665
  else {
652
- if (!pattern.endsWith('/'))
653
- pattern = pattern.concat('/');
654
- pattern = pattern.concat('**');
666
+ // Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
667
+ // exact URL *and* every descendant. minimatch's '/**' glob does not
668
+ // match the bare path itself (no trailing slash), so we emit both the
669
+ // exact-path pattern and a children glob.
670
+ const base = domain + pattern;
671
+ const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
672
+ return [base, children];
655
673
  }
656
674
  }
657
- const final = domain.concat(pattern);
658
- return final;
675
+ return [domain + pattern];
659
676
  };
660
677
  for (const line of lines) {
661
678
  if (line.toLowerCase().startsWith('user-agent: *')) {
@@ -667,15 +684,13 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
667
684
  else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
668
685
  let disallowed = line.substring('disallow: '.length).trim();
669
686
  if (disallowed) {
670
- disallowed = sanitisePattern(disallowed);
671
- disallowedUrls.push(disallowed);
687
+ disallowedUrls.push(...sanitisePattern(disallowed));
672
688
  }
673
689
  }
674
690
  else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
675
691
  let allowed = line.substring('allow: '.length).trim();
676
692
  if (allowed) {
677
- allowed = sanitisePattern(allowed);
678
- allowedUrls.push(allowed);
693
+ allowedUrls.push(...sanitisePattern(allowed));
679
694
  }
680
695
  }
681
696
  }
@@ -726,6 +741,31 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
726
741
  }
727
742
  }
728
743
  };
744
+ export const getSitemapsFromRobotsTxt = async (url, browser, userDataDirectory, extraHTTPHeaders) => {
745
+ const domain = new URL(url).origin;
746
+ const robotsUrl = domain.concat('/robots.txt');
747
+ let robotsTxt;
748
+ try {
749
+ robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
750
+ }
751
+ catch (e) {
752
+ consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
753
+ return [];
754
+ }
755
+ if (!robotsTxt)
756
+ return [];
757
+ const sitemaps = [];
758
+ const lines = robotsTxt.split(/\r?\n/);
759
+ for (const line of lines) {
760
+ if (line.toLowerCase().startsWith('sitemap:')) {
761
+ const sitemapUrl = line.substring('sitemap:'.length).trim();
762
+ if (sitemapUrl) {
763
+ sitemaps.push(sitemapUrl);
764
+ }
765
+ }
766
+ }
767
+ return sitemaps;
768
+ };
729
769
  export const isDisallowedInRobotsTxt = (url) => {
730
770
  if (!constants.robotsTxtUrls)
731
771
  return;
@@ -744,7 +784,7 @@ export const isDisallowedInRobotsTxt = (url) => {
744
784
  }
745
785
  return false;
746
786
  };
747
- export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders) => {
787
+ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
748
788
  const scannedSitemaps = new Set();
749
789
  const urls = {}; // dictionary of requests to urls to be scanned
750
790
  const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
@@ -753,6 +793,8 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
753
793
  return;
754
794
  if (isDisallowedInRobotsTxt(url))
755
795
  return;
796
+ if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
797
+ return;
756
798
  url = convertPathToLocalFile(url);
757
799
  let request;
758
800
  try {
@@ -4,7 +4,7 @@ import fsp from 'fs/promises';
4
4
  import { createCrawleeSubFolders, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
5
5
  import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
6
6
  import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
7
- import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
7
+ import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
8
8
  import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
9
9
  import { consoleLogger, guiInfoLog } from '../logs.js';
10
10
  const isBlacklisted = (url, blacklistedPatterns) => {
@@ -37,8 +37,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
37
37
  const pdfDownloads = [];
38
38
  const uuidToPdfMapping = {};
39
39
  const queuedUrlSet = new Set();
40
- const scannedUrlSet = new Set(urlsCrawled.scanned.map(item => item.url));
41
- const scannedResolvedUrlSet = new Set(urlsCrawled.scanned.map(item => item.actualUrl || item.url));
40
+ const scannedUrlSet = new Set(urlsCrawled.scanned.map(item => normUrl(item.url)));
41
+ const scannedResolvedUrlSet = new Set(urlsCrawled.scanned.map(item => normUrl(item.actualUrl || item.url)));
42
42
  const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes);
43
43
  const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes);
44
44
  const { maxConcurrency } = constants;
@@ -70,11 +70,12 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
70
70
  const initialPageUrl = workingPage.url().toString();
71
71
  const selectedElementsString = cssQuerySelectors.join(', ');
72
72
  const isExcluded = (newPageUrl) => {
73
- const isAlreadyScanned = urlsCrawled.scanned.some(item => item.url === newPageUrl);
73
+ const isAlreadyScanned = scannedUrlSet.has(normUrl(newPageUrl));
74
74
  const isBlacklistedUrl = isBlacklisted(newPageUrl, blacklistedPatterns);
75
75
  const isNotFollowStrategy = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
76
76
  const isNotSupportedDocument = disallowedListOfPatterns.some(pattern => newPageUrl.toLowerCase().startsWith(pattern));
77
- return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
77
+ const isRobotsDisallowed = isDisallowedInRobotsTxt(newPageUrl);
78
+ return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy || isRobotsDisallowed;
78
79
  };
79
80
  const setPageListeners = (pageListener) => {
80
81
  // event listener to handle new page popups upon button click
@@ -235,7 +236,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
235
236
  catch (e) {
236
237
  consoleLogger.error(e);
237
238
  }
238
- if (scannedUrlSet.has(req.url)) {
239
+ if (scannedUrlSet.has(normUrl(req.url))) {
239
240
  req.skipNavigation = true;
240
241
  }
241
242
  if (isDisallowedInRobotsTxt(req.url))
@@ -358,7 +359,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
358
359
  finalUrl = requestLabelUrl;
359
360
  }
360
361
  const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
361
- if (isRedirected) {
362
+ if (isRedirected && !isDisallowedInRobotsTxt(finalUrl)) {
362
363
  await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
363
364
  }
364
365
  else {
@@ -399,7 +400,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
399
400
  return;
400
401
  }
401
402
  // if URL has already been scanned
402
- if (scannedUrlSet.has(request.url)) {
403
+ if (scannedUrlSet.has(normUrl(request.url))) {
403
404
  await enqueueProcess(page, enqueueLinks, browserContext);
404
405
  return;
405
406
  }
@@ -493,8 +494,32 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
493
494
  return;
494
495
  }
495
496
  const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
497
+ // Detect JS redirects that fire during/after axe scan.
498
+ // Listen for navigation, then give a brief window for pending redirects to complete.
499
+ try {
500
+ let navigatedToUrl = null;
501
+ const onFrameNavigated = (frame) => {
502
+ if (frame === page.mainFrame()) {
503
+ navigatedToUrl = frame.url();
504
+ }
505
+ };
506
+ page.on('framenavigated', onFrameNavigated);
507
+ await page.waitForTimeout(1000);
508
+ page.off('framenavigated', onFrameNavigated);
509
+ const postScanUrl = navigatedToUrl || page.url();
510
+ if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
511
+ urlsCrawled.notScannedRedirects.push({
512
+ fromUrl: request.url,
513
+ toUrl: postScanUrl,
514
+ });
515
+ return;
516
+ }
517
+ }
518
+ catch (_) {
519
+ // Page/context was destroyed during navigation — handled by outer catch
520
+ }
496
521
  if (isRedirected) {
497
- const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);
522
+ const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(normUrl(actualUrl));
498
523
  if (isLoadedUrlInCrawledUrls) {
499
524
  urlsCrawled.notScannedRedirects.push({
500
525
  fromUrl: request.url,
@@ -513,8 +538,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
513
538
  pageTitle: results.pageTitle,
514
539
  actualUrl, // i.e. actualUrl
515
540
  });
516
- scannedUrlSet.add(request.url);
517
- scannedResolvedUrlSet.add(actualUrl);
541
+ scannedUrlSet.add(normUrl(request.url));
542
+ scannedResolvedUrlSet.add(normUrl(actualUrl));
518
543
  urlsCrawled.scannedRedirects.push({
519
544
  fromUrl: request.url,
520
545
  toUrl: actualUrl, // i.e. actualUrl
@@ -535,8 +560,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
535
560
  actualUrl: request.url,
536
561
  pageTitle: results.pageTitle,
537
562
  });
538
- scannedUrlSet.add(request.url);
539
- scannedResolvedUrlSet.add(request.url);
563
+ scannedUrlSet.add(normUrl(request.url));
564
+ scannedResolvedUrlSet.add(normUrl(request.url));
540
565
  await dataset.pushData(results);
541
566
  }
542
567
  }
@@ -3,7 +3,7 @@ import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/consta
3
3
  import { consoleLogger, guiInfoLog } from '../logs.js';
4
4
  import crawlDomain from './crawlDomain.js';
5
5
  import crawlSitemap from './crawlSitemap.js';
6
- import { getPlaywrightLaunchOptions } from '../constants/common.js';
6
+ import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
7
7
  import { register } from '../utils.js';
8
8
  const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, safeMode, scanDuration) => {
9
9
  const startTime = Date.now(); // Track start time
@@ -66,12 +66,30 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
66
66
  return false;
67
67
  }
68
68
  };
69
+ // Discover sitemaps from robots.txt first (supports multiple Sitemap: directives)
70
+ let sitemapUrls = [];
69
71
  try {
70
- sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
72
+ sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
73
+ if (sitemapUrls.length > 0) {
74
+ console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
75
+ sitemapExist = true;
76
+ }
71
77
  }
72
78
  catch (error) {
73
79
  consoleLogger.error(error);
74
80
  }
81
+ // Fall back to hardcoded path probing if robots.txt had no sitemaps
82
+ if (!sitemapExist) {
83
+ try {
84
+ sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
85
+ if (sitemapExist) {
86
+ sitemapUrls = [sitemapUrl];
87
+ }
88
+ }
89
+ catch (error) {
90
+ consoleLogger.error(error);
91
+ }
92
+ }
75
93
  if (!sitemapExist) {
76
94
  console.log('Unable to find sitemap. Commencing website crawl instead.');
77
95
  return await crawlDomain({
@@ -90,34 +108,48 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
90
108
  followRobots,
91
109
  extraHTTPHeaders,
92
110
  safeMode,
93
- scanDuration, // Use full duration since no sitemap
111
+ scanDuration,
112
+ });
113
+ }
114
+ // Process all discovered sitemaps sequentially, sharing dataset and urlsCrawled
115
+ for (const currentSitemapUrl of sitemapUrls) {
116
+ if (urlsCrawled.scanned.length >= maxRequestsPerCrawl)
117
+ break;
118
+ const elapsed = Date.now() - startTime;
119
+ const remainingDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : scanDuration;
120
+ if (scanDuration > 0 && remainingDuration <= 0) {
121
+ durationExceeded = true;
122
+ break;
123
+ }
124
+ console.log(`Processing sitemap: ${currentSitemapUrl}`);
125
+ urlsCrawledFinal = await crawlSitemap({
126
+ sitemapUrl: currentSitemapUrl,
127
+ randomToken,
128
+ host,
129
+ viewportSettings,
130
+ maxRequestsPerCrawl,
131
+ browser,
132
+ userDataDirectory,
133
+ specifiedMaxConcurrency,
134
+ fileTypes,
135
+ blacklistedPatterns,
136
+ includeScreenshots,
137
+ extraHTTPHeaders,
138
+ strategy,
139
+ userUrl: url,
140
+ fromCrawlIntelligentSitemap,
141
+ userUrlInputFromIntelligent: url,
142
+ datasetFromIntelligent: dataset,
143
+ urlsCrawledFromIntelligent: urlsCrawled,
144
+ crawledFromLocalFile: false,
145
+ scanDuration: scanDuration > 0 ? remainingDuration : 0,
94
146
  });
95
147
  }
96
- console.log(`Sitemap found at ${sitemapUrl}`);
97
- urlsCrawledFinal = await crawlSitemap({
98
- sitemapUrl,
99
- randomToken,
100
- host,
101
- viewportSettings,
102
- maxRequestsPerCrawl,
103
- browser,
104
- userDataDirectory,
105
- specifiedMaxConcurrency,
106
- fileTypes,
107
- blacklistedPatterns,
108
- includeScreenshots,
109
- extraHTTPHeaders,
110
- fromCrawlIntelligentSitemap,
111
- userUrlInputFromIntelligent: url,
112
- datasetFromIntelligent: dataset,
113
- urlsCrawledFromIntelligent: urlsCrawled,
114
- crawledFromLocalFile: false,
115
- scanDuration,
116
- });
117
148
  const elapsed = Date.now() - startTime;
118
- const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0); // in seconds
119
- if (urlsCrawledFinal.scanned.length < maxRequestsPerCrawl && remainingScanDuration > 0) {
120
- console.log(`Continuing crawl from root website. Remaining scan time: ${remainingScanDuration.toFixed(1)}s`);
149
+ const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
150
+ const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
151
+ if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
152
+ console.log(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
121
153
  urlsCrawledFinal = await crawlDomain({
122
154
  url,
123
155
  randomToken,
@@ -136,15 +168,15 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
136
168
  safeMode,
137
169
  fromCrawlIntelligentSitemap,
138
170
  datasetFromIntelligent: dataset,
139
- urlsCrawledFromIntelligent: urlsCrawledFinal,
171
+ urlsCrawledFromIntelligent: urlsCrawled,
140
172
  scanDuration: remainingScanDuration,
141
173
  });
142
174
  }
143
- else if (remainingScanDuration <= 0) {
175
+ else if (!hasDurationRemaining) {
144
176
  console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
145
177
  durationExceeded = true;
146
178
  }
147
179
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
148
- return { urlsCrawled: urlsCrawledFinal, durationExceeded };
180
+ return { urlsCrawled, durationExceeded };
149
181
  };
150
182
  export default crawlIntelligentSitemap;
@@ -1,13 +1,13 @@
1
- import crawlee, { RequestList } from 'crawlee';
1
+ import crawlee, { EnqueueStrategy, RequestList } from 'crawlee';
2
2
  import * as path from 'path';
3
3
  import fsp from 'fs/promises';
4
4
  import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
5
5
  import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, disallowedListOfPatterns, FileTypes, } from '../constants/constants.js';
6
6
  import { getLinksFromSitemap, getPlaywrightLaunchOptions, isSkippedUrl, waitForPageLoaded, isFilePath, } from '../constants/common.js';
7
- import { areLinksEqual, isWhitelistedContentType, register } from '../utils.js';
7
+ import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
8
8
  import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
9
9
  import { guiInfoLog } from '../logs.js';
10
- const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
10
+ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = '', scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
11
11
  const crawlStartTime = Date.now();
12
12
  let dataset;
13
13
  let urlsCrawled;
@@ -25,7 +25,7 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
25
25
  console.log('Local file crawling not supported for sitemap. Please provide a valid URL.');
26
26
  return;
27
27
  }
28
- const linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, extraHTTPHeaders);
28
+ const linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, extraHTTPHeaders, strategy, userUrl || sitemapUrl);
29
29
  sitemapUrl = encodeURI(sitemapUrl);
30
30
  const pdfDownloads = [];
31
31
  const uuidToPdfMapping = {};
@@ -182,7 +182,7 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
182
182
  const status = response ? response.status() : 0;
183
183
  if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
184
184
  const isRedirected = !areLinksEqual(page.url(), request.url);
185
- const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => (item.actualUrl || item.url) === page.url());
185
+ const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => normUrl(item.actualUrl || item.url) === normUrl(page.url()));
186
186
  if (isRedirected && isLoadedUrlInCrawledUrls) {
187
187
  urlsCrawled.notScannedRedirects.push({
188
188
  fromUrl: request.url,
@@ -205,7 +205,46 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
205
205
  });
206
206
  return;
207
207
  }
208
+ if (isRedirected && !isFollowStrategy(actualUrl, request.url, 'same-hostname')) {
209
+ urlsCrawled.notScannedRedirects.push({
210
+ fromUrl: request.url,
211
+ toUrl: actualUrl,
212
+ });
213
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
214
+ numScanned: urlsCrawled.scanned.length,
215
+ urlScanned: request.url,
216
+ });
217
+ return;
218
+ }
208
219
  const results = await runAxeScript({ includeScreenshots, page, randomToken });
220
+ // Detect JS redirects that fire during/after axe scan.
221
+ // Listen for navigation, then give a brief window for pending redirects to complete.
222
+ try {
223
+ let navigatedToUrl = null;
224
+ const onFrameNavigated = (frame) => {
225
+ if (frame === page.mainFrame()) {
226
+ navigatedToUrl = frame.url();
227
+ }
228
+ };
229
+ page.on('framenavigated', onFrameNavigated);
230
+ await page.waitForTimeout(1000);
231
+ page.off('framenavigated', onFrameNavigated);
232
+ const postScanUrl = navigatedToUrl || page.url();
233
+ if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
234
+ urlsCrawled.notScannedRedirects.push({
235
+ fromUrl: request.url,
236
+ toUrl: postScanUrl,
237
+ });
238
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
239
+ numScanned: urlsCrawled.scanned.length,
240
+ urlScanned: request.url,
241
+ });
242
+ return;
243
+ }
244
+ }
245
+ catch (_) {
246
+ // Page/context was destroyed during navigation — handled by outer catch
247
+ }
209
248
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
210
249
  numScanned: urlsCrawled.scanned.length,
211
250
  urlScanned: request.url,