@govtechsg/oobee 0.10.86 → 0.10.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/image.yml +2 -3
- package/dist/cli.js +18 -5
- package/dist/combine.js +2 -0
- package/dist/constants/cliFunctions.js +2 -2
- package/dist/constants/common.js +55 -13
- package/dist/crawlers/crawlDomain.js +38 -13
- package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
- package/dist/crawlers/crawlSitemap.js +44 -5
- package/dist/crawlers/custom/utils.js +81 -40
- package/dist/generateHtmlReport.js +18 -11
- package/dist/mergeAxeResults/itemReferences.js +60 -25
- package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
- package/dist/mergeAxeResults.js +18 -9
- package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
- package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
- package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/dist/static/ejs/summary.ejs +18 -12
- package/dist/utils.js +4 -3
- package/fix-summary-html-oom-pr.md +62 -0
- package/package.json +5 -5
- package/src/cli.ts +19 -5
- package/src/combine.ts +2 -0
- package/src/constants/cliFunctions.ts +2 -2
- package/src/constants/common.ts +65 -12
- package/src/crawlers/crawlDomain.ts +39 -13
- package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
- package/src/crawlers/crawlSitemap.ts +50 -3
- package/src/crawlers/custom/utils.ts +99 -43
- package/src/generateHtmlReport.ts +21 -11
- package/src/mergeAxeResults/itemReferences.ts +70 -26
- package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
- package/src/mergeAxeResults.ts +21 -11
- package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
- package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/src/static/ejs/summary.ejs +18 -12
- package/src/utils.ts +4 -3
- package/testStaticJSScanner.html +1 -1
|
@@ -146,18 +146,17 @@ jobs:
|
|
|
146
146
|
chmod -R u+w "$GITHUB_WORKSPACE/oobee"
|
|
147
147
|
|
|
148
148
|
# Sign all Mach-O (exec bits OR dylib OR node native addons)
|
|
149
|
-
# Search $GITHUB_WORKSPACE (not just oobee/) to cover scripts copied to the parent dir
|
|
150
149
|
while IFS= read -r f; do
|
|
151
150
|
echo "Signing $f"
|
|
152
151
|
codesign --force --options runtime --timestamp --sign "${CERTIFICATE_NAME}" "$f"
|
|
153
152
|
done < <(
|
|
154
|
-
find "$GITHUB_WORKSPACE" -type f \
|
|
153
|
+
find "$GITHUB_WORKSPACE/oobee" -type f \
|
|
155
154
|
\( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
|
|
156
155
|
! -path "*/.git/*"
|
|
157
156
|
)
|
|
158
157
|
|
|
159
158
|
echo "Verifying signatures of Mach-O files..."
|
|
160
|
-
find "$GITHUB_WORKSPACE" -type f \( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
|
|
159
|
+
find "$GITHUB_WORKSPACE/oobee" -type f \( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
|
|
161
160
|
-exec codesign --verify --strict --verbose=2 {} \; || true
|
|
162
161
|
|
|
163
162
|
- name: Cleanup keychain
|
package/dist/cli.js
CHANGED
|
@@ -147,8 +147,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
|
|
|
147
147
|
})
|
|
148
148
|
.check(argvs => {
|
|
149
149
|
const scanner = String(argvs.scanner ?? '');
|
|
150
|
-
if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
|
|
151
|
-
throw new Error('-s or --strategy is only available in website
|
|
150
|
+
if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
|
|
151
|
+
throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
|
|
152
|
+
}
|
|
153
|
+
if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
|
|
154
|
+
throw new Error('-s ignore is only available for sitemap scans.');
|
|
152
155
|
}
|
|
153
156
|
return true;
|
|
154
157
|
})
|
|
@@ -161,13 +164,19 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
|
|
|
161
164
|
return duration;
|
|
162
165
|
})
|
|
163
166
|
.check(argvs => {
|
|
164
|
-
if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
|
|
165
|
-
throw new Error('-s or --strategy is only available in website scans.');
|
|
167
|
+
if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
|
|
168
|
+
throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
|
|
169
|
+
}
|
|
170
|
+
if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
|
|
171
|
+
throw new Error('-s ignore is only available for sitemap scans.');
|
|
166
172
|
}
|
|
167
173
|
return true;
|
|
168
174
|
})
|
|
169
175
|
.conflicts('d', 'w')
|
|
170
176
|
.parse();
|
|
177
|
+
if (!options.strategy) {
|
|
178
|
+
options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
|
|
179
|
+
}
|
|
171
180
|
const scanInit = async (argvs) => {
|
|
172
181
|
const updatedArgvs = { ...argvs };
|
|
173
182
|
// Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
|
|
@@ -187,7 +196,11 @@ const scanInit = async (argvs) => {
|
|
|
187
196
|
if (res.httpStatus)
|
|
188
197
|
consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
|
|
189
198
|
if (res.status === statuses.success.code) {
|
|
190
|
-
|
|
199
|
+
// Custom flow should continue from the user-provided entry URL so auth redirects
|
|
200
|
+
// do not replace the original domain used for overlay gating and navigation.
|
|
201
|
+
if (data.type !== ScannerTypes.CUSTOM) {
|
|
202
|
+
data.url = res.url;
|
|
203
|
+
}
|
|
191
204
|
if (process.env.OOBEE_VALIDATE_URL) {
|
|
192
205
|
consoleLogger.info('Url is valid');
|
|
193
206
|
cleanUpAndExit(0, data.randomToken);
|
package/dist/combine.js
CHANGED
|
@@ -147,8 +147,8 @@ export const cliOptions = {
|
|
|
147
147
|
},
|
|
148
148
|
s: {
|
|
149
149
|
alias: 'strategy',
|
|
150
|
-
describe: 'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
|
|
151
|
-
choices: ['same-domain', 'same-hostname'],
|
|
150
|
+
describe: 'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
|
|
151
|
+
choices: ['same-domain', 'same-hostname', 'ignore'],
|
|
152
152
|
requiresArg: true,
|
|
153
153
|
demandOption: false,
|
|
154
154
|
},
|
package/dist/constants/common.js
CHANGED
|
@@ -26,7 +26,7 @@ formDataFields,
|
|
|
26
26
|
ScannerTypes, BrowserTypes, FileTypes, getEnumKey, } from './constants.js';
|
|
27
27
|
import { consoleLogger } from '../logs.js';
|
|
28
28
|
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
|
29
|
-
import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
|
|
29
|
+
import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
|
|
30
30
|
import { getProxyInfo, proxyInfoToResolution } from '../proxyService.js';
|
|
31
31
|
// validateDirPath validates a provided directory path
|
|
32
32
|
// returns null if no error
|
|
@@ -592,7 +592,9 @@ export const prepareData = async (argv) => {
|
|
|
592
592
|
viewportWidth,
|
|
593
593
|
playwrightDeviceDetailsObject,
|
|
594
594
|
maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
|
|
595
|
-
strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
|
|
595
|
+
strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
|
|
596
|
+
: strategy === 'ignore' ? EnqueueStrategy.All
|
|
597
|
+
: EnqueueStrategy.SameDomain,
|
|
596
598
|
isLocalFileScan,
|
|
597
599
|
browser: browserToRun,
|
|
598
600
|
nameEmail,
|
|
@@ -637,6 +639,10 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
|
|
|
637
639
|
let shouldCapture = false;
|
|
638
640
|
const disallowedUrls = [];
|
|
639
641
|
const allowedUrls = [];
|
|
642
|
+
// Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
|
|
643
|
+
// Two patterns are returned for bare paths (no trailing wildcard) so that
|
|
644
|
+
// both the exact URL and all child paths are blocked, matching robots.txt
|
|
645
|
+
// prefix semantics.
|
|
640
646
|
const sanitisePattern = (pattern) => {
|
|
641
647
|
const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
|
|
642
648
|
const subdirWildcardRegex = /\/\*\//g;
|
|
@@ -644,18 +650,29 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
|
|
|
644
650
|
if (subdirWildcardRegex.test(pattern)) {
|
|
645
651
|
pattern = pattern.replace(subdirWildcardRegex, '/**/');
|
|
646
652
|
}
|
|
653
|
+
// Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
|
|
654
|
+
// '?' is the query separator in robots.txt but a single-char wildcard in
|
|
655
|
+
// minimatch. Escape it to a literal match and append '*' so any query
|
|
656
|
+
// value after the stated prefix is also blocked.
|
|
657
|
+
if (pattern.includes('?')) {
|
|
658
|
+
return [domain + pattern.replace('?', '\\?') + '*'];
|
|
659
|
+
}
|
|
647
660
|
if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
|
|
648
661
|
if (pattern.endsWith('*')) {
|
|
649
|
-
|
|
662
|
+
// e.g. /ebook/* → /ebook/** (already covers all children)
|
|
663
|
+
return [domain + pattern.concat('*')];
|
|
650
664
|
}
|
|
651
665
|
else {
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
666
|
+
// Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
|
|
667
|
+
// exact URL *and* every descendant. minimatch's '/**' glob does not
|
|
668
|
+
// match the bare path itself (no trailing slash), so we emit both the
|
|
669
|
+
// exact-path pattern and a children glob.
|
|
670
|
+
const base = domain + pattern;
|
|
671
|
+
const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
|
|
672
|
+
return [base, children];
|
|
655
673
|
}
|
|
656
674
|
}
|
|
657
|
-
|
|
658
|
-
return final;
|
|
675
|
+
return [domain + pattern];
|
|
659
676
|
};
|
|
660
677
|
for (const line of lines) {
|
|
661
678
|
if (line.toLowerCase().startsWith('user-agent: *')) {
|
|
@@ -667,15 +684,13 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
|
|
|
667
684
|
else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
|
|
668
685
|
let disallowed = line.substring('disallow: '.length).trim();
|
|
669
686
|
if (disallowed) {
|
|
670
|
-
|
|
671
|
-
disallowedUrls.push(disallowed);
|
|
687
|
+
disallowedUrls.push(...sanitisePattern(disallowed));
|
|
672
688
|
}
|
|
673
689
|
}
|
|
674
690
|
else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
|
|
675
691
|
let allowed = line.substring('allow: '.length).trim();
|
|
676
692
|
if (allowed) {
|
|
677
|
-
|
|
678
|
-
allowedUrls.push(allowed);
|
|
693
|
+
allowedUrls.push(...sanitisePattern(allowed));
|
|
679
694
|
}
|
|
680
695
|
}
|
|
681
696
|
}
|
|
@@ -726,6 +741,31 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
|
|
|
726
741
|
}
|
|
727
742
|
}
|
|
728
743
|
};
|
|
744
|
+
export const getSitemapsFromRobotsTxt = async (url, browser, userDataDirectory, extraHTTPHeaders) => {
|
|
745
|
+
const domain = new URL(url).origin;
|
|
746
|
+
const robotsUrl = domain.concat('/robots.txt');
|
|
747
|
+
let robotsTxt;
|
|
748
|
+
try {
|
|
749
|
+
robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
|
|
750
|
+
}
|
|
751
|
+
catch (e) {
|
|
752
|
+
consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
|
|
753
|
+
return [];
|
|
754
|
+
}
|
|
755
|
+
if (!robotsTxt)
|
|
756
|
+
return [];
|
|
757
|
+
const sitemaps = [];
|
|
758
|
+
const lines = robotsTxt.split(/\r?\n/);
|
|
759
|
+
for (const line of lines) {
|
|
760
|
+
if (line.toLowerCase().startsWith('sitemap:')) {
|
|
761
|
+
const sitemapUrl = line.substring('sitemap:'.length).trim();
|
|
762
|
+
if (sitemapUrl) {
|
|
763
|
+
sitemaps.push(sitemapUrl);
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
return sitemaps;
|
|
768
|
+
};
|
|
729
769
|
export const isDisallowedInRobotsTxt = (url) => {
|
|
730
770
|
if (!constants.robotsTxtUrls)
|
|
731
771
|
return;
|
|
@@ -744,7 +784,7 @@ export const isDisallowedInRobotsTxt = (url) => {
|
|
|
744
784
|
}
|
|
745
785
|
return false;
|
|
746
786
|
};
|
|
747
|
-
export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders) => {
|
|
787
|
+
export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
|
|
748
788
|
const scannedSitemaps = new Set();
|
|
749
789
|
const urls = {}; // dictionary of requests to urls to be scanned
|
|
750
790
|
const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
|
|
@@ -753,6 +793,8 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
753
793
|
return;
|
|
754
794
|
if (isDisallowedInRobotsTxt(url))
|
|
755
795
|
return;
|
|
796
|
+
if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
|
|
797
|
+
return;
|
|
756
798
|
url = convertPathToLocalFile(url);
|
|
757
799
|
let request;
|
|
758
800
|
try {
|
|
@@ -4,7 +4,7 @@ import fsp from 'fs/promises';
|
|
|
4
4
|
import { createCrawleeSubFolders, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
|
|
5
5
|
import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
|
|
6
6
|
import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
|
|
7
|
-
import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
|
|
7
|
+
import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
|
|
8
8
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
|
|
9
9
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
10
10
|
const isBlacklisted = (url, blacklistedPatterns) => {
|
|
@@ -37,8 +37,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
37
37
|
const pdfDownloads = [];
|
|
38
38
|
const uuidToPdfMapping = {};
|
|
39
39
|
const queuedUrlSet = new Set();
|
|
40
|
-
const scannedUrlSet = new Set(urlsCrawled.scanned.map(item => item.url));
|
|
41
|
-
const scannedResolvedUrlSet = new Set(urlsCrawled.scanned.map(item => item.actualUrl || item.url));
|
|
40
|
+
const scannedUrlSet = new Set(urlsCrawled.scanned.map(item => normUrl(item.url)));
|
|
41
|
+
const scannedResolvedUrlSet = new Set(urlsCrawled.scanned.map(item => normUrl(item.actualUrl || item.url)));
|
|
42
42
|
const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes);
|
|
43
43
|
const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes);
|
|
44
44
|
const { maxConcurrency } = constants;
|
|
@@ -70,11 +70,12 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
70
70
|
const initialPageUrl = workingPage.url().toString();
|
|
71
71
|
const selectedElementsString = cssQuerySelectors.join(', ');
|
|
72
72
|
const isExcluded = (newPageUrl) => {
|
|
73
|
-
const isAlreadyScanned =
|
|
73
|
+
const isAlreadyScanned = scannedUrlSet.has(normUrl(newPageUrl));
|
|
74
74
|
const isBlacklistedUrl = isBlacklisted(newPageUrl, blacklistedPatterns);
|
|
75
75
|
const isNotFollowStrategy = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
|
|
76
76
|
const isNotSupportedDocument = disallowedListOfPatterns.some(pattern => newPageUrl.toLowerCase().startsWith(pattern));
|
|
77
|
-
|
|
77
|
+
const isRobotsDisallowed = isDisallowedInRobotsTxt(newPageUrl);
|
|
78
|
+
return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy || isRobotsDisallowed;
|
|
78
79
|
};
|
|
79
80
|
const setPageListeners = (pageListener) => {
|
|
80
81
|
// event listener to handle new page popups upon button click
|
|
@@ -235,7 +236,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
235
236
|
catch (e) {
|
|
236
237
|
consoleLogger.error(e);
|
|
237
238
|
}
|
|
238
|
-
if (scannedUrlSet.has(req.url)) {
|
|
239
|
+
if (scannedUrlSet.has(normUrl(req.url))) {
|
|
239
240
|
req.skipNavigation = true;
|
|
240
241
|
}
|
|
241
242
|
if (isDisallowedInRobotsTxt(req.url))
|
|
@@ -358,7 +359,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
358
359
|
finalUrl = requestLabelUrl;
|
|
359
360
|
}
|
|
360
361
|
const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
|
|
361
|
-
if (isRedirected) {
|
|
362
|
+
if (isRedirected && !isDisallowedInRobotsTxt(finalUrl)) {
|
|
362
363
|
await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
|
|
363
364
|
}
|
|
364
365
|
else {
|
|
@@ -399,7 +400,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
399
400
|
return;
|
|
400
401
|
}
|
|
401
402
|
// if URL has already been scanned
|
|
402
|
-
if (scannedUrlSet.has(request.url)) {
|
|
403
|
+
if (scannedUrlSet.has(normUrl(request.url))) {
|
|
403
404
|
await enqueueProcess(page, enqueueLinks, browserContext);
|
|
404
405
|
return;
|
|
405
406
|
}
|
|
@@ -493,8 +494,32 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
493
494
|
return;
|
|
494
495
|
}
|
|
495
496
|
const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
|
|
497
|
+
// Detect JS redirects that fire during/after axe scan.
|
|
498
|
+
// Listen for navigation, then give a brief window for pending redirects to complete.
|
|
499
|
+
try {
|
|
500
|
+
let navigatedToUrl = null;
|
|
501
|
+
const onFrameNavigated = (frame) => {
|
|
502
|
+
if (frame === page.mainFrame()) {
|
|
503
|
+
navigatedToUrl = frame.url();
|
|
504
|
+
}
|
|
505
|
+
};
|
|
506
|
+
page.on('framenavigated', onFrameNavigated);
|
|
507
|
+
await page.waitForTimeout(1000);
|
|
508
|
+
page.off('framenavigated', onFrameNavigated);
|
|
509
|
+
const postScanUrl = navigatedToUrl || page.url();
|
|
510
|
+
if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
|
|
511
|
+
urlsCrawled.notScannedRedirects.push({
|
|
512
|
+
fromUrl: request.url,
|
|
513
|
+
toUrl: postScanUrl,
|
|
514
|
+
});
|
|
515
|
+
return;
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
catch (_) {
|
|
519
|
+
// Page/context was destroyed during navigation — handled by outer catch
|
|
520
|
+
}
|
|
496
521
|
if (isRedirected) {
|
|
497
|
-
const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);
|
|
522
|
+
const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(normUrl(actualUrl));
|
|
498
523
|
if (isLoadedUrlInCrawledUrls) {
|
|
499
524
|
urlsCrawled.notScannedRedirects.push({
|
|
500
525
|
fromUrl: request.url,
|
|
@@ -513,8 +538,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
513
538
|
pageTitle: results.pageTitle,
|
|
514
539
|
actualUrl, // i.e. actualUrl
|
|
515
540
|
});
|
|
516
|
-
scannedUrlSet.add(request.url);
|
|
517
|
-
scannedResolvedUrlSet.add(actualUrl);
|
|
541
|
+
scannedUrlSet.add(normUrl(request.url));
|
|
542
|
+
scannedResolvedUrlSet.add(normUrl(actualUrl));
|
|
518
543
|
urlsCrawled.scannedRedirects.push({
|
|
519
544
|
fromUrl: request.url,
|
|
520
545
|
toUrl: actualUrl, // i.e. actualUrl
|
|
@@ -535,8 +560,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
535
560
|
actualUrl: request.url,
|
|
536
561
|
pageTitle: results.pageTitle,
|
|
537
562
|
});
|
|
538
|
-
scannedUrlSet.add(request.url);
|
|
539
|
-
scannedResolvedUrlSet.add(request.url);
|
|
563
|
+
scannedUrlSet.add(normUrl(request.url));
|
|
564
|
+
scannedResolvedUrlSet.add(normUrl(request.url));
|
|
540
565
|
await dataset.pushData(results);
|
|
541
566
|
}
|
|
542
567
|
}
|
|
@@ -3,7 +3,7 @@ import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/consta
|
|
|
3
3
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
4
4
|
import crawlDomain from './crawlDomain.js';
|
|
5
5
|
import crawlSitemap from './crawlSitemap.js';
|
|
6
|
-
import { getPlaywrightLaunchOptions } from '../constants/common.js';
|
|
6
|
+
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
|
|
7
7
|
import { register } from '../utils.js';
|
|
8
8
|
const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, safeMode, scanDuration) => {
|
|
9
9
|
const startTime = Date.now(); // Track start time
|
|
@@ -66,12 +66,30 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
66
66
|
return false;
|
|
67
67
|
}
|
|
68
68
|
};
|
|
69
|
+
// Discover sitemaps from robots.txt first (supports multiple Sitemap: directives)
|
|
70
|
+
let sitemapUrls = [];
|
|
69
71
|
try {
|
|
70
|
-
|
|
72
|
+
sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
|
|
73
|
+
if (sitemapUrls.length > 0) {
|
|
74
|
+
console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
|
|
75
|
+
sitemapExist = true;
|
|
76
|
+
}
|
|
71
77
|
}
|
|
72
78
|
catch (error) {
|
|
73
79
|
consoleLogger.error(error);
|
|
74
80
|
}
|
|
81
|
+
// Fall back to hardcoded path probing if robots.txt had no sitemaps
|
|
82
|
+
if (!sitemapExist) {
|
|
83
|
+
try {
|
|
84
|
+
sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
|
|
85
|
+
if (sitemapExist) {
|
|
86
|
+
sitemapUrls = [sitemapUrl];
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
catch (error) {
|
|
90
|
+
consoleLogger.error(error);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
75
93
|
if (!sitemapExist) {
|
|
76
94
|
console.log('Unable to find sitemap. Commencing website crawl instead.');
|
|
77
95
|
return await crawlDomain({
|
|
@@ -90,34 +108,48 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
90
108
|
followRobots,
|
|
91
109
|
extraHTTPHeaders,
|
|
92
110
|
safeMode,
|
|
93
|
-
scanDuration,
|
|
111
|
+
scanDuration,
|
|
112
|
+
});
|
|
113
|
+
}
|
|
114
|
+
// Process all discovered sitemaps sequentially, sharing dataset and urlsCrawled
|
|
115
|
+
for (const currentSitemapUrl of sitemapUrls) {
|
|
116
|
+
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl)
|
|
117
|
+
break;
|
|
118
|
+
const elapsed = Date.now() - startTime;
|
|
119
|
+
const remainingDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : scanDuration;
|
|
120
|
+
if (scanDuration > 0 && remainingDuration <= 0) {
|
|
121
|
+
durationExceeded = true;
|
|
122
|
+
break;
|
|
123
|
+
}
|
|
124
|
+
console.log(`Processing sitemap: ${currentSitemapUrl}`);
|
|
125
|
+
urlsCrawledFinal = await crawlSitemap({
|
|
126
|
+
sitemapUrl: currentSitemapUrl,
|
|
127
|
+
randomToken,
|
|
128
|
+
host,
|
|
129
|
+
viewportSettings,
|
|
130
|
+
maxRequestsPerCrawl,
|
|
131
|
+
browser,
|
|
132
|
+
userDataDirectory,
|
|
133
|
+
specifiedMaxConcurrency,
|
|
134
|
+
fileTypes,
|
|
135
|
+
blacklistedPatterns,
|
|
136
|
+
includeScreenshots,
|
|
137
|
+
extraHTTPHeaders,
|
|
138
|
+
strategy,
|
|
139
|
+
userUrl: url,
|
|
140
|
+
fromCrawlIntelligentSitemap,
|
|
141
|
+
userUrlInputFromIntelligent: url,
|
|
142
|
+
datasetFromIntelligent: dataset,
|
|
143
|
+
urlsCrawledFromIntelligent: urlsCrawled,
|
|
144
|
+
crawledFromLocalFile: false,
|
|
145
|
+
scanDuration: scanDuration > 0 ? remainingDuration : 0,
|
|
94
146
|
});
|
|
95
147
|
}
|
|
96
|
-
console.log(`Sitemap found at ${sitemapUrl}`);
|
|
97
|
-
urlsCrawledFinal = await crawlSitemap({
|
|
98
|
-
sitemapUrl,
|
|
99
|
-
randomToken,
|
|
100
|
-
host,
|
|
101
|
-
viewportSettings,
|
|
102
|
-
maxRequestsPerCrawl,
|
|
103
|
-
browser,
|
|
104
|
-
userDataDirectory,
|
|
105
|
-
specifiedMaxConcurrency,
|
|
106
|
-
fileTypes,
|
|
107
|
-
blacklistedPatterns,
|
|
108
|
-
includeScreenshots,
|
|
109
|
-
extraHTTPHeaders,
|
|
110
|
-
fromCrawlIntelligentSitemap,
|
|
111
|
-
userUrlInputFromIntelligent: url,
|
|
112
|
-
datasetFromIntelligent: dataset,
|
|
113
|
-
urlsCrawledFromIntelligent: urlsCrawled,
|
|
114
|
-
crawledFromLocalFile: false,
|
|
115
|
-
scanDuration,
|
|
116
|
-
});
|
|
117
148
|
const elapsed = Date.now() - startTime;
|
|
118
|
-
const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0)
|
|
119
|
-
|
|
120
|
-
|
|
149
|
+
const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
|
|
150
|
+
const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
|
|
151
|
+
if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
|
|
152
|
+
console.log(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
|
|
121
153
|
urlsCrawledFinal = await crawlDomain({
|
|
122
154
|
url,
|
|
123
155
|
randomToken,
|
|
@@ -136,15 +168,15 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
136
168
|
safeMode,
|
|
137
169
|
fromCrawlIntelligentSitemap,
|
|
138
170
|
datasetFromIntelligent: dataset,
|
|
139
|
-
urlsCrawledFromIntelligent:
|
|
171
|
+
urlsCrawledFromIntelligent: urlsCrawled,
|
|
140
172
|
scanDuration: remainingScanDuration,
|
|
141
173
|
});
|
|
142
174
|
}
|
|
143
|
-
else if (
|
|
175
|
+
else if (!hasDurationRemaining) {
|
|
144
176
|
console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
|
|
145
177
|
durationExceeded = true;
|
|
146
178
|
}
|
|
147
179
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
148
|
-
return { urlsCrawled
|
|
180
|
+
return { urlsCrawled, durationExceeded };
|
|
149
181
|
};
|
|
150
182
|
export default crawlIntelligentSitemap;
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
import crawlee, { RequestList } from 'crawlee';
|
|
1
|
+
import crawlee, { EnqueueStrategy, RequestList } from 'crawlee';
|
|
2
2
|
import * as path from 'path';
|
|
3
3
|
import fsp from 'fs/promises';
|
|
4
4
|
import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
|
|
5
5
|
import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, disallowedListOfPatterns, FileTypes, } from '../constants/constants.js';
|
|
6
6
|
import { getLinksFromSitemap, getPlaywrightLaunchOptions, isSkippedUrl, waitForPageLoaded, isFilePath, } from '../constants/common.js';
|
|
7
|
-
import { areLinksEqual, isWhitelistedContentType, register } from '../utils.js';
|
|
7
|
+
import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
|
|
8
8
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
|
|
9
9
|
import { guiInfoLog } from '../logs.js';
|
|
10
|
-
const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
|
|
10
|
+
const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = '', scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
|
|
11
11
|
const crawlStartTime = Date.now();
|
|
12
12
|
let dataset;
|
|
13
13
|
let urlsCrawled;
|
|
@@ -25,7 +25,7 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
25
25
|
console.log('Local file crawling not supported for sitemap. Please provide a valid URL.');
|
|
26
26
|
return;
|
|
27
27
|
}
|
|
28
|
-
const linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, extraHTTPHeaders);
|
|
28
|
+
const linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, extraHTTPHeaders, strategy, userUrl || sitemapUrl);
|
|
29
29
|
sitemapUrl = encodeURI(sitemapUrl);
|
|
30
30
|
const pdfDownloads = [];
|
|
31
31
|
const uuidToPdfMapping = {};
|
|
@@ -182,7 +182,7 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
182
182
|
const status = response ? response.status() : 0;
|
|
183
183
|
if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
|
|
184
184
|
const isRedirected = !areLinksEqual(page.url(), request.url);
|
|
185
|
-
const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => (item.actualUrl || item.url) === page.url());
|
|
185
|
+
const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => normUrl(item.actualUrl || item.url) === normUrl(page.url()));
|
|
186
186
|
if (isRedirected && isLoadedUrlInCrawledUrls) {
|
|
187
187
|
urlsCrawled.notScannedRedirects.push({
|
|
188
188
|
fromUrl: request.url,
|
|
@@ -205,7 +205,46 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
205
205
|
});
|
|
206
206
|
return;
|
|
207
207
|
}
|
|
208
|
+
if (isRedirected && !isFollowStrategy(actualUrl, request.url, 'same-hostname')) {
|
|
209
|
+
urlsCrawled.notScannedRedirects.push({
|
|
210
|
+
fromUrl: request.url,
|
|
211
|
+
toUrl: actualUrl,
|
|
212
|
+
});
|
|
213
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
214
|
+
numScanned: urlsCrawled.scanned.length,
|
|
215
|
+
urlScanned: request.url,
|
|
216
|
+
});
|
|
217
|
+
return;
|
|
218
|
+
}
|
|
208
219
|
const results = await runAxeScript({ includeScreenshots, page, randomToken });
|
|
220
|
+
// Detect JS redirects that fire during/after axe scan.
|
|
221
|
+
// Listen for navigation, then give a brief window for pending redirects to complete.
|
|
222
|
+
try {
|
|
223
|
+
let navigatedToUrl = null;
|
|
224
|
+
const onFrameNavigated = (frame) => {
|
|
225
|
+
if (frame === page.mainFrame()) {
|
|
226
|
+
navigatedToUrl = frame.url();
|
|
227
|
+
}
|
|
228
|
+
};
|
|
229
|
+
page.on('framenavigated', onFrameNavigated);
|
|
230
|
+
await page.waitForTimeout(1000);
|
|
231
|
+
page.off('framenavigated', onFrameNavigated);
|
|
232
|
+
const postScanUrl = navigatedToUrl || page.url();
|
|
233
|
+
if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
|
|
234
|
+
urlsCrawled.notScannedRedirects.push({
|
|
235
|
+
fromUrl: request.url,
|
|
236
|
+
toUrl: postScanUrl,
|
|
237
|
+
});
|
|
238
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
239
|
+
numScanned: urlsCrawled.scanned.length,
|
|
240
|
+
urlScanned: request.url,
|
|
241
|
+
});
|
|
242
|
+
return;
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
catch (_) {
|
|
246
|
+
// Page/context was destroyed during navigation — handled by outer catch
|
|
247
|
+
}
|
|
209
248
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
210
249
|
numScanned: urlsCrawled.scanned.length,
|
|
211
250
|
urlScanned: request.url,
|