@govtechsg/oobee 0.10.85 → 0.10.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/publish.yml +10 -0
- package/DETAILS.md +29 -0
- package/dist/cli.js +18 -5
- package/dist/combine.js +3 -1
- package/dist/constants/cliFunctions.js +2 -2
- package/dist/constants/common.js +70 -17
- package/dist/constants/constants.js +604 -1
- package/dist/crawlers/commonCrawlerFunc.js +3 -2
- package/dist/crawlers/crawlDomain.js +38 -13
- package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
- package/dist/crawlers/crawlSitemap.js +141 -84
- package/dist/crawlers/custom/utils.js +218 -71
- package/dist/crawlers/guards/urlGuard.js +8 -15
- package/dist/crawlers/runCustom.js +18 -11
- package/dist/generateHtmlReport.js +18 -11
- package/dist/generateOobeeClientScanner.js +570 -0
- package/dist/mergeAxeResults/itemReferences.js +60 -25
- package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
- package/dist/mergeAxeResults.js +23 -13
- package/dist/npmIndex.js +10 -2
- package/dist/proxyService.js +18 -3
- package/dist/services/s3Uploader.js +21 -10
- package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
- package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
- package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
- package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
- package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/dist/static/ejs/summary.ejs +19 -8
- package/dist/utils.js +4 -3
- package/fix-summary-html-oom-pr.md +62 -0
- package/oobee-client-scanner.js +34992 -0
- package/package.json +5 -5
- package/src/cli.ts +19 -5
- package/src/combine.ts +5 -1
- package/src/constants/cliFunctions.ts +2 -2
- package/src/constants/common.ts +87 -22
- package/src/constants/constants.ts +602 -1
- package/src/crawlers/commonCrawlerFunc.ts +4 -3
- package/src/crawlers/crawlDomain.ts +39 -13
- package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
- package/src/crawlers/crawlSitemap.ts +165 -100
- package/src/crawlers/custom/utils.ts +241 -80
- package/src/crawlers/guards/urlGuard.ts +24 -31
- package/src/crawlers/runCustom.ts +29 -11
- package/src/generateHtmlReport.ts +21 -11
- package/src/generateOobeeClientScanner.ts +591 -0
- package/src/mergeAxeResults/itemReferences.ts +70 -26
- package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
- package/src/mergeAxeResults.ts +26 -14
- package/src/npmIndex.ts +12 -2
- package/src/proxyService.ts +25 -4
- package/src/services/s3Uploader.ts +23 -11
- package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
- package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
- package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
- package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/src/static/ejs/summary.ejs +19 -8
- package/src/utils.ts +4 -3
- package/testStaticJSScanner.html +534 -0
|
@@ -116,7 +116,7 @@ export const filterAxeResults = (results, pageTitle, customFlowDetails) => {
|
|
|
116
116
|
return;
|
|
117
117
|
const conformance = tags.filter(tag => tag.startsWith('wcag') || tag === 'best-practice');
|
|
118
118
|
nodes.forEach(node => {
|
|
119
|
-
const { html } = node;
|
|
119
|
+
const { html, target } = node;
|
|
120
120
|
if (!(rule in passed.rules)) {
|
|
121
121
|
passed.rules[rule] = {
|
|
122
122
|
description,
|
|
@@ -128,7 +128,8 @@ export const filterAxeResults = (results, pageTitle, customFlowDetails) => {
|
|
|
128
128
|
};
|
|
129
129
|
}
|
|
130
130
|
const finalHtml = truncateHtml(html);
|
|
131
|
-
|
|
131
|
+
const xpath = target.length === 1 && typeof target[0] === 'string' ? target[0] : undefined;
|
|
132
|
+
passed.rules[rule].items.push({ html: finalHtml, screenshotPath: '', message: '', xpath: xpath || '' });
|
|
132
133
|
passed.totalItems += 1;
|
|
133
134
|
passed.rules[rule].totalItems += 1;
|
|
134
135
|
totalItems += 1;
|
|
@@ -4,7 +4,7 @@ import fsp from 'fs/promises';
|
|
|
4
4
|
import { createCrawleeSubFolders, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
|
|
5
5
|
import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
|
|
6
6
|
import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
|
|
7
|
-
import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
|
|
7
|
+
import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
|
|
8
8
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
|
|
9
9
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
10
10
|
const isBlacklisted = (url, blacklistedPatterns) => {
|
|
@@ -37,8 +37,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
37
37
|
const pdfDownloads = [];
|
|
38
38
|
const uuidToPdfMapping = {};
|
|
39
39
|
const queuedUrlSet = new Set();
|
|
40
|
-
const scannedUrlSet = new Set(urlsCrawled.scanned.map(item => item.url));
|
|
41
|
-
const scannedResolvedUrlSet = new Set(urlsCrawled.scanned.map(item => item.actualUrl || item.url));
|
|
40
|
+
const scannedUrlSet = new Set(urlsCrawled.scanned.map(item => normUrl(item.url)));
|
|
41
|
+
const scannedResolvedUrlSet = new Set(urlsCrawled.scanned.map(item => normUrl(item.actualUrl || item.url)));
|
|
42
42
|
const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes);
|
|
43
43
|
const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes);
|
|
44
44
|
const { maxConcurrency } = constants;
|
|
@@ -70,11 +70,12 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
70
70
|
const initialPageUrl = workingPage.url().toString();
|
|
71
71
|
const selectedElementsString = cssQuerySelectors.join(', ');
|
|
72
72
|
const isExcluded = (newPageUrl) => {
|
|
73
|
-
const isAlreadyScanned =
|
|
73
|
+
const isAlreadyScanned = scannedUrlSet.has(normUrl(newPageUrl));
|
|
74
74
|
const isBlacklistedUrl = isBlacklisted(newPageUrl, blacklistedPatterns);
|
|
75
75
|
const isNotFollowStrategy = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
|
|
76
76
|
const isNotSupportedDocument = disallowedListOfPatterns.some(pattern => newPageUrl.toLowerCase().startsWith(pattern));
|
|
77
|
-
|
|
77
|
+
const isRobotsDisallowed = isDisallowedInRobotsTxt(newPageUrl);
|
|
78
|
+
return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy || isRobotsDisallowed;
|
|
78
79
|
};
|
|
79
80
|
const setPageListeners = (pageListener) => {
|
|
80
81
|
// event listener to handle new page popups upon button click
|
|
@@ -235,7 +236,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
235
236
|
catch (e) {
|
|
236
237
|
consoleLogger.error(e);
|
|
237
238
|
}
|
|
238
|
-
if (scannedUrlSet.has(req.url)) {
|
|
239
|
+
if (scannedUrlSet.has(normUrl(req.url))) {
|
|
239
240
|
req.skipNavigation = true;
|
|
240
241
|
}
|
|
241
242
|
if (isDisallowedInRobotsTxt(req.url))
|
|
@@ -358,7 +359,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
358
359
|
finalUrl = requestLabelUrl;
|
|
359
360
|
}
|
|
360
361
|
const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
|
|
361
|
-
if (isRedirected) {
|
|
362
|
+
if (isRedirected && !isDisallowedInRobotsTxt(finalUrl)) {
|
|
362
363
|
await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
|
|
363
364
|
}
|
|
364
365
|
else {
|
|
@@ -399,7 +400,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
399
400
|
return;
|
|
400
401
|
}
|
|
401
402
|
// if URL has already been scanned
|
|
402
|
-
if (scannedUrlSet.has(request.url)) {
|
|
403
|
+
if (scannedUrlSet.has(normUrl(request.url))) {
|
|
403
404
|
await enqueueProcess(page, enqueueLinks, browserContext);
|
|
404
405
|
return;
|
|
405
406
|
}
|
|
@@ -493,8 +494,32 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
493
494
|
return;
|
|
494
495
|
}
|
|
495
496
|
const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
|
|
497
|
+
// Detect JS redirects that fire during/after axe scan.
|
|
498
|
+
// Listen for navigation, then give a brief window for pending redirects to complete.
|
|
499
|
+
try {
|
|
500
|
+
let navigatedToUrl = null;
|
|
501
|
+
const onFrameNavigated = (frame) => {
|
|
502
|
+
if (frame === page.mainFrame()) {
|
|
503
|
+
navigatedToUrl = frame.url();
|
|
504
|
+
}
|
|
505
|
+
};
|
|
506
|
+
page.on('framenavigated', onFrameNavigated);
|
|
507
|
+
await page.waitForTimeout(1000);
|
|
508
|
+
page.off('framenavigated', onFrameNavigated);
|
|
509
|
+
const postScanUrl = navigatedToUrl || page.url();
|
|
510
|
+
if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
|
|
511
|
+
urlsCrawled.notScannedRedirects.push({
|
|
512
|
+
fromUrl: request.url,
|
|
513
|
+
toUrl: postScanUrl,
|
|
514
|
+
});
|
|
515
|
+
return;
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
catch (_) {
|
|
519
|
+
// Page/context was destroyed during navigation — handled by outer catch
|
|
520
|
+
}
|
|
496
521
|
if (isRedirected) {
|
|
497
|
-
const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);
|
|
522
|
+
const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(normUrl(actualUrl));
|
|
498
523
|
if (isLoadedUrlInCrawledUrls) {
|
|
499
524
|
urlsCrawled.notScannedRedirects.push({
|
|
500
525
|
fromUrl: request.url,
|
|
@@ -513,8 +538,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
513
538
|
pageTitle: results.pageTitle,
|
|
514
539
|
actualUrl, // i.e. actualUrl
|
|
515
540
|
});
|
|
516
|
-
scannedUrlSet.add(request.url);
|
|
517
|
-
scannedResolvedUrlSet.add(actualUrl);
|
|
541
|
+
scannedUrlSet.add(normUrl(request.url));
|
|
542
|
+
scannedResolvedUrlSet.add(normUrl(actualUrl));
|
|
518
543
|
urlsCrawled.scannedRedirects.push({
|
|
519
544
|
fromUrl: request.url,
|
|
520
545
|
toUrl: actualUrl, // i.e. actualUrl
|
|
@@ -535,8 +560,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
535
560
|
actualUrl: request.url,
|
|
536
561
|
pageTitle: results.pageTitle,
|
|
537
562
|
});
|
|
538
|
-
scannedUrlSet.add(request.url);
|
|
539
|
-
scannedResolvedUrlSet.add(request.url);
|
|
563
|
+
scannedUrlSet.add(normUrl(request.url));
|
|
564
|
+
scannedResolvedUrlSet.add(normUrl(request.url));
|
|
540
565
|
await dataset.pushData(results);
|
|
541
566
|
}
|
|
542
567
|
}
|
|
@@ -3,7 +3,7 @@ import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/consta
|
|
|
3
3
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
4
4
|
import crawlDomain from './crawlDomain.js';
|
|
5
5
|
import crawlSitemap from './crawlSitemap.js';
|
|
6
|
-
import { getPlaywrightLaunchOptions } from '../constants/common.js';
|
|
6
|
+
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
|
|
7
7
|
import { register } from '../utils.js';
|
|
8
8
|
const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, safeMode, scanDuration) => {
|
|
9
9
|
const startTime = Date.now(); // Track start time
|
|
@@ -66,12 +66,30 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
66
66
|
return false;
|
|
67
67
|
}
|
|
68
68
|
};
|
|
69
|
+
// Discover sitemaps from robots.txt first (supports multiple Sitemap: directives)
|
|
70
|
+
let sitemapUrls = [];
|
|
69
71
|
try {
|
|
70
|
-
|
|
72
|
+
sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
|
|
73
|
+
if (sitemapUrls.length > 0) {
|
|
74
|
+
console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
|
|
75
|
+
sitemapExist = true;
|
|
76
|
+
}
|
|
71
77
|
}
|
|
72
78
|
catch (error) {
|
|
73
79
|
consoleLogger.error(error);
|
|
74
80
|
}
|
|
81
|
+
// Fall back to hardcoded path probing if robots.txt had no sitemaps
|
|
82
|
+
if (!sitemapExist) {
|
|
83
|
+
try {
|
|
84
|
+
sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
|
|
85
|
+
if (sitemapExist) {
|
|
86
|
+
sitemapUrls = [sitemapUrl];
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
catch (error) {
|
|
90
|
+
consoleLogger.error(error);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
75
93
|
if (!sitemapExist) {
|
|
76
94
|
console.log('Unable to find sitemap. Commencing website crawl instead.');
|
|
77
95
|
return await crawlDomain({
|
|
@@ -90,34 +108,48 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
90
108
|
followRobots,
|
|
91
109
|
extraHTTPHeaders,
|
|
92
110
|
safeMode,
|
|
93
|
-
scanDuration,
|
|
111
|
+
scanDuration,
|
|
112
|
+
});
|
|
113
|
+
}
|
|
114
|
+
// Process all discovered sitemaps sequentially, sharing dataset and urlsCrawled
|
|
115
|
+
for (const currentSitemapUrl of sitemapUrls) {
|
|
116
|
+
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl)
|
|
117
|
+
break;
|
|
118
|
+
const elapsed = Date.now() - startTime;
|
|
119
|
+
const remainingDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : scanDuration;
|
|
120
|
+
if (scanDuration > 0 && remainingDuration <= 0) {
|
|
121
|
+
durationExceeded = true;
|
|
122
|
+
break;
|
|
123
|
+
}
|
|
124
|
+
console.log(`Processing sitemap: ${currentSitemapUrl}`);
|
|
125
|
+
urlsCrawledFinal = await crawlSitemap({
|
|
126
|
+
sitemapUrl: currentSitemapUrl,
|
|
127
|
+
randomToken,
|
|
128
|
+
host,
|
|
129
|
+
viewportSettings,
|
|
130
|
+
maxRequestsPerCrawl,
|
|
131
|
+
browser,
|
|
132
|
+
userDataDirectory,
|
|
133
|
+
specifiedMaxConcurrency,
|
|
134
|
+
fileTypes,
|
|
135
|
+
blacklistedPatterns,
|
|
136
|
+
includeScreenshots,
|
|
137
|
+
extraHTTPHeaders,
|
|
138
|
+
strategy,
|
|
139
|
+
userUrl: url,
|
|
140
|
+
fromCrawlIntelligentSitemap,
|
|
141
|
+
userUrlInputFromIntelligent: url,
|
|
142
|
+
datasetFromIntelligent: dataset,
|
|
143
|
+
urlsCrawledFromIntelligent: urlsCrawled,
|
|
144
|
+
crawledFromLocalFile: false,
|
|
145
|
+
scanDuration: scanDuration > 0 ? remainingDuration : 0,
|
|
94
146
|
});
|
|
95
147
|
}
|
|
96
|
-
console.log(`Sitemap found at ${sitemapUrl}`);
|
|
97
|
-
urlsCrawledFinal = await crawlSitemap({
|
|
98
|
-
sitemapUrl,
|
|
99
|
-
randomToken,
|
|
100
|
-
host,
|
|
101
|
-
viewportSettings,
|
|
102
|
-
maxRequestsPerCrawl,
|
|
103
|
-
browser,
|
|
104
|
-
userDataDirectory,
|
|
105
|
-
specifiedMaxConcurrency,
|
|
106
|
-
fileTypes,
|
|
107
|
-
blacklistedPatterns,
|
|
108
|
-
includeScreenshots,
|
|
109
|
-
extraHTTPHeaders,
|
|
110
|
-
fromCrawlIntelligentSitemap,
|
|
111
|
-
userUrlInputFromIntelligent: url,
|
|
112
|
-
datasetFromIntelligent: dataset,
|
|
113
|
-
urlsCrawledFromIntelligent: urlsCrawled,
|
|
114
|
-
crawledFromLocalFile: false,
|
|
115
|
-
scanDuration,
|
|
116
|
-
});
|
|
117
148
|
const elapsed = Date.now() - startTime;
|
|
118
|
-
const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0)
|
|
119
|
-
|
|
120
|
-
|
|
149
|
+
const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
|
|
150
|
+
const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
|
|
151
|
+
if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
|
|
152
|
+
console.log(`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`);
|
|
121
153
|
urlsCrawledFinal = await crawlDomain({
|
|
122
154
|
url,
|
|
123
155
|
randomToken,
|
|
@@ -136,15 +168,15 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
136
168
|
safeMode,
|
|
137
169
|
fromCrawlIntelligentSitemap,
|
|
138
170
|
datasetFromIntelligent: dataset,
|
|
139
|
-
urlsCrawledFromIntelligent:
|
|
171
|
+
urlsCrawledFromIntelligent: urlsCrawled,
|
|
140
172
|
scanDuration: remainingScanDuration,
|
|
141
173
|
});
|
|
142
174
|
}
|
|
143
|
-
else if (
|
|
175
|
+
else if (!hasDurationRemaining) {
|
|
144
176
|
console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
|
|
145
177
|
durationExceeded = true;
|
|
146
178
|
}
|
|
147
179
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
148
|
-
return { urlsCrawled
|
|
180
|
+
return { urlsCrawled, durationExceeded };
|
|
149
181
|
};
|
|
150
182
|
export default crawlIntelligentSitemap;
|
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
import crawlee, { RequestList } from 'crawlee';
|
|
1
|
+
import crawlee, { EnqueueStrategy, RequestList } from 'crawlee';
|
|
2
2
|
import * as path from 'path';
|
|
3
3
|
import fsp from 'fs/promises';
|
|
4
4
|
import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
|
|
5
5
|
import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, disallowedListOfPatterns, FileTypes, } from '../constants/constants.js';
|
|
6
6
|
import { getLinksFromSitemap, getPlaywrightLaunchOptions, isSkippedUrl, waitForPageLoaded, isFilePath, } from '../constants/common.js';
|
|
7
|
-
import { areLinksEqual, isWhitelistedContentType, register } from '../utils.js';
|
|
7
|
+
import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
|
|
8
8
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
|
|
9
9
|
import { guiInfoLog } from '../logs.js';
|
|
10
|
-
const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
|
|
10
|
+
const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = '', scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
|
|
11
11
|
const crawlStartTime = Date.now();
|
|
12
12
|
let dataset;
|
|
13
13
|
let urlsCrawled;
|
|
14
14
|
let durationExceeded = false;
|
|
15
|
+
let isAbortingScan = false;
|
|
15
16
|
if (fromCrawlIntelligentSitemap) {
|
|
16
17
|
dataset = datasetFromIntelligent;
|
|
17
18
|
urlsCrawled = urlsCrawledFromIntelligent;
|
|
@@ -24,7 +25,7 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
24
25
|
console.log('Local file crawling not supported for sitemap. Please provide a valid URL.');
|
|
25
26
|
return;
|
|
26
27
|
}
|
|
27
|
-
const linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, extraHTTPHeaders);
|
|
28
|
+
const linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, extraHTTPHeaders, strategy, userUrl || sitemapUrl);
|
|
28
29
|
sitemapUrl = encodeURI(sitemapUrl);
|
|
29
30
|
const pdfDownloads = [];
|
|
30
31
|
const uuidToPdfMapping = {};
|
|
@@ -144,106 +145,162 @@ const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, m
|
|
|
144
145
|
});
|
|
145
146
|
return;
|
|
146
147
|
}
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
if (hasExceededDuration) {
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
if (request.skipNavigation && actualUrl === 'about:blank') {
|
|
159
|
-
if (isScanPdfs) {
|
|
160
|
-
// pushes download promise into pdfDownloads
|
|
161
|
-
const { pdfFileName, url } = handlePdfDownload(randomToken, pdfDownloads, request, sendRequest, urlsCrawled);
|
|
162
|
-
uuidToPdfMapping[pdfFileName] = url;
|
|
148
|
+
try {
|
|
149
|
+
await waitForPageLoaded(page, 10000);
|
|
150
|
+
const actualUrl = page.url() || request.loadedUrl || request.url;
|
|
151
|
+
const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
|
152
|
+
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
|
|
153
|
+
isAbortingScan = true;
|
|
154
|
+
if (hasExceededDuration) {
|
|
155
|
+
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
|
|
156
|
+
durationExceeded = true;
|
|
157
|
+
}
|
|
158
|
+
crawler.autoscaledPool.abort(); // stops new requests
|
|
163
159
|
return;
|
|
164
160
|
}
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
});
|
|
176
|
-
return;
|
|
177
|
-
}
|
|
178
|
-
const contentType = response?.headers?.()['content-type'] || '';
|
|
179
|
-
const status = response ? response.status() : 0;
|
|
180
|
-
if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
|
|
181
|
-
const isRedirected = !areLinksEqual(page.url(), request.url);
|
|
182
|
-
const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => (item.actualUrl || item.url) === page.url());
|
|
183
|
-
if (isRedirected && isLoadedUrlInCrawledUrls) {
|
|
184
|
-
urlsCrawled.notScannedRedirects.push({
|
|
185
|
-
fromUrl: request.url,
|
|
186
|
-
toUrl: actualUrl, // i.e. actualUrl
|
|
161
|
+
if (request.skipNavigation && actualUrl === 'about:blank') {
|
|
162
|
+
if (isScanPdfs) {
|
|
163
|
+
// pushes download promise into pdfDownloads
|
|
164
|
+
const { pdfFileName, url } = handlePdfDownload(randomToken, pdfDownloads, request, sendRequest, urlsCrawled);
|
|
165
|
+
uuidToPdfMapping[pdfFileName] = url;
|
|
166
|
+
return;
|
|
167
|
+
}
|
|
168
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
169
|
+
numScanned: urlsCrawled.scanned.length,
|
|
170
|
+
urlScanned: request.url,
|
|
187
171
|
});
|
|
188
|
-
return;
|
|
189
|
-
}
|
|
190
|
-
// This logic is different from crawlDomain, as it also checks if the pae is redirected before checking if it is excluded using exclusions.txt
|
|
191
|
-
if (isRedirected && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
|
|
192
172
|
urlsCrawled.userExcluded.push({
|
|
193
173
|
url: request.url,
|
|
194
174
|
pageTitle: request.url,
|
|
195
|
-
actualUrl,
|
|
196
|
-
metadata: STATUS_CODE_METADATA[
|
|
197
|
-
httpStatusCode:
|
|
175
|
+
actualUrl: request.url, // because about:blank is not useful
|
|
176
|
+
metadata: STATUS_CODE_METADATA[1],
|
|
177
|
+
httpStatusCode: 1,
|
|
198
178
|
});
|
|
179
|
+
return;
|
|
180
|
+
}
|
|
181
|
+
const contentType = response?.headers?.()['content-type'] || '';
|
|
182
|
+
const status = response ? response.status() : 0;
|
|
183
|
+
if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
|
|
184
|
+
const isRedirected = !areLinksEqual(page.url(), request.url);
|
|
185
|
+
const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => normUrl(item.actualUrl || item.url) === normUrl(page.url()));
|
|
186
|
+
if (isRedirected && isLoadedUrlInCrawledUrls) {
|
|
187
|
+
urlsCrawled.notScannedRedirects.push({
|
|
188
|
+
fromUrl: request.url,
|
|
189
|
+
toUrl: actualUrl, // i.e. actualUrl
|
|
190
|
+
});
|
|
191
|
+
return;
|
|
192
|
+
}
|
|
193
|
+
// This logic is different from crawlDomain, as it also checks if the pae is redirected before checking if it is excluded using exclusions.txt
|
|
194
|
+
if (isRedirected && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
|
|
195
|
+
urlsCrawled.userExcluded.push({
|
|
196
|
+
url: request.url,
|
|
197
|
+
pageTitle: request.url,
|
|
198
|
+
actualUrl,
|
|
199
|
+
metadata: STATUS_CODE_METADATA[0],
|
|
200
|
+
httpStatusCode: 0,
|
|
201
|
+
});
|
|
202
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
203
|
+
numScanned: urlsCrawled.scanned.length,
|
|
204
|
+
urlScanned: request.url,
|
|
205
|
+
});
|
|
206
|
+
return;
|
|
207
|
+
}
|
|
208
|
+
if (isRedirected && !isFollowStrategy(actualUrl, request.url, 'same-hostname')) {
|
|
209
|
+
urlsCrawled.notScannedRedirects.push({
|
|
210
|
+
fromUrl: request.url,
|
|
211
|
+
toUrl: actualUrl,
|
|
212
|
+
});
|
|
213
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
214
|
+
numScanned: urlsCrawled.scanned.length,
|
|
215
|
+
urlScanned: request.url,
|
|
216
|
+
});
|
|
217
|
+
return;
|
|
218
|
+
}
|
|
219
|
+
const results = await runAxeScript({ includeScreenshots, page, randomToken });
|
|
220
|
+
// Detect JS redirects that fire during/after axe scan.
|
|
221
|
+
// Listen for navigation, then give a brief window for pending redirects to complete.
|
|
222
|
+
try {
|
|
223
|
+
let navigatedToUrl = null;
|
|
224
|
+
const onFrameNavigated = (frame) => {
|
|
225
|
+
if (frame === page.mainFrame()) {
|
|
226
|
+
navigatedToUrl = frame.url();
|
|
227
|
+
}
|
|
228
|
+
};
|
|
229
|
+
page.on('framenavigated', onFrameNavigated);
|
|
230
|
+
await page.waitForTimeout(1000);
|
|
231
|
+
page.off('framenavigated', onFrameNavigated);
|
|
232
|
+
const postScanUrl = navigatedToUrl || page.url();
|
|
233
|
+
if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
|
|
234
|
+
urlsCrawled.notScannedRedirects.push({
|
|
235
|
+
fromUrl: request.url,
|
|
236
|
+
toUrl: postScanUrl,
|
|
237
|
+
});
|
|
238
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
239
|
+
numScanned: urlsCrawled.scanned.length,
|
|
240
|
+
urlScanned: request.url,
|
|
241
|
+
});
|
|
242
|
+
return;
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
catch (_) {
|
|
246
|
+
// Page/context was destroyed during navigation — handled by outer catch
|
|
247
|
+
}
|
|
248
|
+
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
249
|
+
numScanned: urlsCrawled.scanned.length,
|
|
250
|
+
urlScanned: request.url,
|
|
251
|
+
});
|
|
252
|
+
urlsCrawled.scanned.push({
|
|
253
|
+
url: request.url,
|
|
254
|
+
pageTitle: results.pageTitle,
|
|
255
|
+
actualUrl, // i.e. actualUrl
|
|
256
|
+
});
|
|
257
|
+
urlsCrawled.scannedRedirects.push({
|
|
258
|
+
fromUrl: request.url,
|
|
259
|
+
toUrl: actualUrl,
|
|
260
|
+
});
|
|
261
|
+
results.url = request.url;
|
|
262
|
+
results.actualUrl = actualUrl;
|
|
263
|
+
await dataset.pushData(results);
|
|
264
|
+
}
|
|
265
|
+
else {
|
|
199
266
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
200
267
|
numScanned: urlsCrawled.scanned.length,
|
|
201
268
|
urlScanned: request.url,
|
|
202
269
|
});
|
|
203
|
-
|
|
270
|
+
if (isScanHtml) {
|
|
271
|
+
// carry through the HTTP status metadata
|
|
272
|
+
const status = response?.status();
|
|
273
|
+
const metadata = typeof status === 'number'
|
|
274
|
+
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
275
|
+
: STATUS_CODE_METADATA[2];
|
|
276
|
+
urlsCrawled.invalid.push({
|
|
277
|
+
actualUrl,
|
|
278
|
+
url: request.url,
|
|
279
|
+
pageTitle: request.url,
|
|
280
|
+
metadata,
|
|
281
|
+
httpStatusCode: typeof status === 'number' ? status : 0,
|
|
282
|
+
});
|
|
283
|
+
}
|
|
204
284
|
}
|
|
205
|
-
const results = await runAxeScript({ includeScreenshots, page, randomToken });
|
|
206
|
-
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
207
|
-
numScanned: urlsCrawled.scanned.length,
|
|
208
|
-
urlScanned: request.url,
|
|
209
|
-
});
|
|
210
|
-
urlsCrawled.scanned.push({
|
|
211
|
-
url: request.url,
|
|
212
|
-
pageTitle: results.pageTitle,
|
|
213
|
-
actualUrl, // i.e. actualUrl
|
|
214
|
-
});
|
|
215
|
-
urlsCrawled.scannedRedirects.push({
|
|
216
|
-
fromUrl: request.url,
|
|
217
|
-
toUrl: actualUrl,
|
|
218
|
-
});
|
|
219
|
-
results.url = request.url;
|
|
220
|
-
results.actualUrl = actualUrl;
|
|
221
|
-
await dataset.pushData(results);
|
|
222
285
|
}
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
const status = response?.status();
|
|
231
|
-
const metadata = typeof status === 'number'
|
|
232
|
-
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
233
|
-
: STATUS_CODE_METADATA[2];
|
|
234
|
-
urlsCrawled.invalid.push({
|
|
235
|
-
actualUrl,
|
|
286
|
+
catch (e) {
|
|
287
|
+
if (!isAbortingScan) {
|
|
288
|
+
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
289
|
+
numScanned: urlsCrawled.scanned.length,
|
|
290
|
+
urlScanned: request.url,
|
|
291
|
+
});
|
|
292
|
+
urlsCrawled.error.push({
|
|
236
293
|
url: request.url,
|
|
237
294
|
pageTitle: request.url,
|
|
238
|
-
|
|
239
|
-
|
|
295
|
+
actualUrl: request.url,
|
|
296
|
+
metadata: STATUS_CODE_METADATA[2],
|
|
297
|
+
httpStatusCode: 0,
|
|
240
298
|
});
|
|
241
299
|
}
|
|
242
300
|
}
|
|
243
301
|
},
|
|
244
302
|
failedRequestHandler: async ({ request, response, error }) => {
|
|
245
|
-
|
|
246
|
-
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
|
|
303
|
+
if (isAbortingScan) {
|
|
247
304
|
return;
|
|
248
305
|
}
|
|
249
306
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|