@govtechsg/oobee 0.10.85 → 0.10.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/publish.yml +10 -0
- package/DETAILS.md +29 -0
- package/dist/cli.js +18 -5
- package/dist/combine.js +3 -1
- package/dist/constants/cliFunctions.js +2 -2
- package/dist/constants/common.js +70 -17
- package/dist/constants/constants.js +604 -1
- package/dist/crawlers/commonCrawlerFunc.js +3 -2
- package/dist/crawlers/crawlDomain.js +38 -13
- package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
- package/dist/crawlers/crawlSitemap.js +141 -84
- package/dist/crawlers/custom/utils.js +218 -71
- package/dist/crawlers/guards/urlGuard.js +8 -15
- package/dist/crawlers/runCustom.js +18 -11
- package/dist/generateHtmlReport.js +18 -11
- package/dist/generateOobeeClientScanner.js +570 -0
- package/dist/mergeAxeResults/itemReferences.js +60 -25
- package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
- package/dist/mergeAxeResults.js +23 -13
- package/dist/npmIndex.js +10 -2
- package/dist/proxyService.js +18 -3
- package/dist/services/s3Uploader.js +21 -10
- package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
- package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
- package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
- package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
- package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/dist/static/ejs/summary.ejs +19 -8
- package/dist/utils.js +4 -3
- package/fix-summary-html-oom-pr.md +62 -0
- package/oobee-client-scanner.js +34992 -0
- package/package.json +5 -5
- package/src/cli.ts +19 -5
- package/src/combine.ts +5 -1
- package/src/constants/cliFunctions.ts +2 -2
- package/src/constants/common.ts +87 -22
- package/src/constants/constants.ts +602 -1
- package/src/crawlers/commonCrawlerFunc.ts +4 -3
- package/src/crawlers/crawlDomain.ts +39 -13
- package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
- package/src/crawlers/crawlSitemap.ts +165 -100
- package/src/crawlers/custom/utils.ts +241 -80
- package/src/crawlers/guards/urlGuard.ts +24 -31
- package/src/crawlers/runCustom.ts +29 -11
- package/src/generateHtmlReport.ts +21 -11
- package/src/generateOobeeClientScanner.ts +591 -0
- package/src/mergeAxeResults/itemReferences.ts +70 -26
- package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
- package/src/mergeAxeResults.ts +26 -14
- package/src/npmIndex.ts +12 -2
- package/src/proxyService.ts +25 -4
- package/src/services/s3Uploader.ts +23 -11
- package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
- package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
- package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
- package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/src/static/ejs/summary.ejs +19 -8
- package/src/utils.ts +4 -3
- package/testStaticJSScanner.html +534 -0
|
@@ -196,7 +196,7 @@ export const filterAxeResults = (
|
|
|
196
196
|
const conformance = tags.filter(tag => tag.startsWith('wcag') || tag === 'best-practice');
|
|
197
197
|
|
|
198
198
|
nodes.forEach(node => {
|
|
199
|
-
const { html } = node;
|
|
199
|
+
const { html, target } = node;
|
|
200
200
|
if (!(rule in passed.rules)) {
|
|
201
201
|
passed.rules[rule] = {
|
|
202
202
|
description,
|
|
@@ -207,9 +207,10 @@ export const filterAxeResults = (
|
|
|
207
207
|
items: [],
|
|
208
208
|
};
|
|
209
209
|
}
|
|
210
|
-
|
|
210
|
+
|
|
211
211
|
const finalHtml = truncateHtml(html);
|
|
212
|
-
|
|
212
|
+
const xpath = target.length === 1 && typeof target[0] === 'string' ? target[0] : undefined;
|
|
213
|
+
passed.rules[rule].items.push({ html: finalHtml, screenshotPath: '', message: '', xpath: xpath || '' });
|
|
213
214
|
|
|
214
215
|
passed.totalItems += 1;
|
|
215
216
|
passed.rules[rule].totalItems += 1;
|
|
@@ -29,7 +29,7 @@ import {
|
|
|
29
29
|
getUrlsFromRobotsTxt,
|
|
30
30
|
waitForPageLoaded,
|
|
31
31
|
} from '../constants/common.js';
|
|
32
|
-
import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
|
|
32
|
+
import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
|
|
33
33
|
import {
|
|
34
34
|
handlePdfDownload,
|
|
35
35
|
runPdfScan,
|
|
@@ -116,9 +116,9 @@ const crawlDomain = async ({
|
|
|
116
116
|
const pdfDownloads: Promise<void>[] = [];
|
|
117
117
|
const uuidToPdfMapping: Record<string, string> = {};
|
|
118
118
|
const queuedUrlSet = new Set<string>();
|
|
119
|
-
const scannedUrlSet = new Set<string>(urlsCrawled.scanned.map(item => item.url));
|
|
119
|
+
const scannedUrlSet = new Set<string>(urlsCrawled.scanned.map(item => normUrl(item.url)));
|
|
120
120
|
const scannedResolvedUrlSet = new Set<string>(
|
|
121
|
-
urlsCrawled.scanned.map(item => item.actualUrl || item.url),
|
|
121
|
+
urlsCrawled.scanned.map(item => normUrl(item.actualUrl || item.url)),
|
|
122
122
|
);
|
|
123
123
|
const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
|
|
124
124
|
const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
|
|
@@ -166,13 +166,14 @@ const crawlDomain = async ({
|
|
|
166
166
|
const selectedElementsString = cssQuerySelectors.join(', ');
|
|
167
167
|
|
|
168
168
|
const isExcluded = (newPageUrl: string): boolean => {
|
|
169
|
-
const isAlreadyScanned: boolean =
|
|
169
|
+
const isAlreadyScanned: boolean = scannedUrlSet.has(normUrl(newPageUrl));
|
|
170
170
|
const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
|
|
171
171
|
const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
|
|
172
172
|
const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern =>
|
|
173
173
|
newPageUrl.toLowerCase().startsWith(pattern),
|
|
174
174
|
);
|
|
175
|
-
|
|
175
|
+
const isRobotsDisallowed: boolean = isDisallowedInRobotsTxt(newPageUrl);
|
|
176
|
+
return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy || isRobotsDisallowed;
|
|
176
177
|
};
|
|
177
178
|
const setPageListeners = (pageListener: Page): void => {
|
|
178
179
|
// event listener to handle new page popups upon button click
|
|
@@ -341,7 +342,7 @@ const crawlDomain = async ({
|
|
|
341
342
|
} catch (e) {
|
|
342
343
|
consoleLogger.error(e);
|
|
343
344
|
}
|
|
344
|
-
if (scannedUrlSet.has(req.url)) {
|
|
345
|
+
if (scannedUrlSet.has(normUrl(req.url))) {
|
|
345
346
|
req.skipNavigation = true;
|
|
346
347
|
}
|
|
347
348
|
if (isDisallowedInRobotsTxt(req.url)) return null;
|
|
@@ -481,7 +482,7 @@ const crawlDomain = async ({
|
|
|
481
482
|
}
|
|
482
483
|
|
|
483
484
|
const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
|
|
484
|
-
if (isRedirected) {
|
|
485
|
+
if (isRedirected && !isDisallowedInRobotsTxt(finalUrl)) {
|
|
485
486
|
await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
|
|
486
487
|
} else {
|
|
487
488
|
request.skipNavigation = false;
|
|
@@ -537,7 +538,7 @@ const crawlDomain = async ({
|
|
|
537
538
|
}
|
|
538
539
|
|
|
539
540
|
// if URL has already been scanned
|
|
540
|
-
if (scannedUrlSet.has(request.url)) {
|
|
541
|
+
if (scannedUrlSet.has(normUrl(request.url))) {
|
|
541
542
|
await enqueueProcess(page, enqueueLinks, browserContext);
|
|
542
543
|
return;
|
|
543
544
|
}
|
|
@@ -654,8 +655,33 @@ const crawlDomain = async ({
|
|
|
654
655
|
|
|
655
656
|
const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
|
|
656
657
|
|
|
658
|
+
// Detect JS redirects that fire during/after axe scan.
|
|
659
|
+
// Listen for navigation, then give a brief window for pending redirects to complete.
|
|
660
|
+
try {
|
|
661
|
+
let navigatedToUrl: string | null = null;
|
|
662
|
+
const onFrameNavigated = (frame: Frame) => {
|
|
663
|
+
if (frame === page.mainFrame()) {
|
|
664
|
+
navigatedToUrl = frame.url();
|
|
665
|
+
}
|
|
666
|
+
};
|
|
667
|
+
page.on('framenavigated', onFrameNavigated);
|
|
668
|
+
await page.waitForTimeout(1000);
|
|
669
|
+
page.off('framenavigated', onFrameNavigated);
|
|
670
|
+
|
|
671
|
+
const postScanUrl = navigatedToUrl || page.url();
|
|
672
|
+
if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
|
|
673
|
+
urlsCrawled.notScannedRedirects.push({
|
|
674
|
+
fromUrl: request.url,
|
|
675
|
+
toUrl: postScanUrl,
|
|
676
|
+
});
|
|
677
|
+
return;
|
|
678
|
+
}
|
|
679
|
+
} catch (_) {
|
|
680
|
+
// Page/context was destroyed during navigation — handled by outer catch
|
|
681
|
+
}
|
|
682
|
+
|
|
657
683
|
if (isRedirected) {
|
|
658
|
-
const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);
|
|
684
|
+
const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(normUrl(actualUrl));
|
|
659
685
|
|
|
660
686
|
if (isLoadedUrlInCrawledUrls) {
|
|
661
687
|
urlsCrawled.notScannedRedirects.push({
|
|
@@ -677,8 +703,8 @@ const crawlDomain = async ({
|
|
|
677
703
|
pageTitle: results.pageTitle,
|
|
678
704
|
actualUrl, // i.e. actualUrl
|
|
679
705
|
});
|
|
680
|
-
scannedUrlSet.add(request.url);
|
|
681
|
-
scannedResolvedUrlSet.add(actualUrl);
|
|
706
|
+
scannedUrlSet.add(normUrl(request.url));
|
|
707
|
+
scannedResolvedUrlSet.add(normUrl(actualUrl));
|
|
682
708
|
|
|
683
709
|
urlsCrawled.scannedRedirects.push({
|
|
684
710
|
fromUrl: request.url,
|
|
@@ -700,8 +726,8 @@ const crawlDomain = async ({
|
|
|
700
726
|
actualUrl: request.url,
|
|
701
727
|
pageTitle: results.pageTitle,
|
|
702
728
|
});
|
|
703
|
-
scannedUrlSet.add(request.url);
|
|
704
|
-
scannedResolvedUrlSet.add(request.url);
|
|
729
|
+
scannedUrlSet.add(normUrl(request.url));
|
|
730
|
+
scannedResolvedUrlSet.add(normUrl(request.url));
|
|
705
731
|
await dataset.pushData(results);
|
|
706
732
|
}
|
|
707
733
|
} else {
|
|
@@ -7,7 +7,7 @@ import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
|
7
7
|
import crawlDomain from './crawlDomain.js';
|
|
8
8
|
import crawlSitemap from './crawlSitemap.js';
|
|
9
9
|
import { ViewportSettingsClass } from '../combine.js';
|
|
10
|
-
import { getPlaywrightLaunchOptions } from '../constants/common.js';
|
|
10
|
+
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
|
|
11
11
|
import { register } from '../utils.js';
|
|
12
12
|
|
|
13
13
|
const crawlIntelligentSitemap = async (
|
|
@@ -100,12 +100,30 @@ const crawlIntelligentSitemap = async (
|
|
|
100
100
|
}
|
|
101
101
|
};
|
|
102
102
|
|
|
103
|
+
// Discover sitemaps from robots.txt first (supports multiple Sitemap: directives)
|
|
104
|
+
let sitemapUrls: string[] = [];
|
|
103
105
|
try {
|
|
104
|
-
|
|
106
|
+
sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
|
|
107
|
+
if (sitemapUrls.length > 0) {
|
|
108
|
+
console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
|
|
109
|
+
sitemapExist = true;
|
|
110
|
+
}
|
|
105
111
|
} catch (error) {
|
|
106
112
|
consoleLogger.error(error);
|
|
107
113
|
}
|
|
108
114
|
|
|
115
|
+
// Fall back to hardcoded path probing if robots.txt had no sitemaps
|
|
116
|
+
if (!sitemapExist) {
|
|
117
|
+
try {
|
|
118
|
+
sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
|
|
119
|
+
if (sitemapExist) {
|
|
120
|
+
sitemapUrls = [sitemapUrl];
|
|
121
|
+
}
|
|
122
|
+
} catch (error) {
|
|
123
|
+
consoleLogger.error(error);
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
109
127
|
if (!sitemapExist) {
|
|
110
128
|
console.log('Unable to find sitemap. Commencing website crawl instead.');
|
|
111
129
|
return await crawlDomain({
|
|
@@ -124,38 +142,53 @@ const crawlIntelligentSitemap = async (
|
|
|
124
142
|
followRobots,
|
|
125
143
|
extraHTTPHeaders,
|
|
126
144
|
safeMode,
|
|
127
|
-
scanDuration,
|
|
145
|
+
scanDuration,
|
|
128
146
|
});
|
|
129
147
|
}
|
|
130
148
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
149
|
+
// Process all discovered sitemaps sequentially, sharing dataset and urlsCrawled
|
|
150
|
+
for (const currentSitemapUrl of sitemapUrls) {
|
|
151
|
+
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) break;
|
|
152
|
+
|
|
153
|
+
const elapsed = Date.now() - startTime;
|
|
154
|
+
const remainingDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : scanDuration;
|
|
155
|
+
if (scanDuration > 0 && remainingDuration <= 0) {
|
|
156
|
+
durationExceeded = true;
|
|
157
|
+
break;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
console.log(`Processing sitemap: ${currentSitemapUrl}`);
|
|
161
|
+
urlsCrawledFinal = await crawlSitemap({
|
|
162
|
+
sitemapUrl: currentSitemapUrl,
|
|
163
|
+
randomToken,
|
|
164
|
+
host,
|
|
165
|
+
viewportSettings,
|
|
166
|
+
maxRequestsPerCrawl,
|
|
167
|
+
browser,
|
|
168
|
+
userDataDirectory,
|
|
169
|
+
specifiedMaxConcurrency,
|
|
170
|
+
fileTypes,
|
|
171
|
+
blacklistedPatterns,
|
|
172
|
+
includeScreenshots,
|
|
173
|
+
extraHTTPHeaders,
|
|
174
|
+
strategy,
|
|
175
|
+
userUrl: url,
|
|
176
|
+
fromCrawlIntelligentSitemap,
|
|
177
|
+
userUrlInputFromIntelligent: url,
|
|
178
|
+
datasetFromIntelligent: dataset,
|
|
179
|
+
urlsCrawledFromIntelligent: urlsCrawled,
|
|
180
|
+
crawledFromLocalFile: false,
|
|
181
|
+
scanDuration: scanDuration > 0 ? remainingDuration : 0,
|
|
182
|
+
});
|
|
183
|
+
}
|
|
152
184
|
|
|
153
185
|
const elapsed = Date.now() - startTime;
|
|
154
|
-
const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0)
|
|
186
|
+
const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
|
|
187
|
+
const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
|
|
155
188
|
|
|
156
|
-
if (
|
|
189
|
+
if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
|
|
157
190
|
console.log(
|
|
158
|
-
`Continuing crawl from root website
|
|
191
|
+
`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`,
|
|
159
192
|
);
|
|
160
193
|
urlsCrawledFinal = await crawlDomain({
|
|
161
194
|
url,
|
|
@@ -175,10 +208,10 @@ const crawlIntelligentSitemap = async (
|
|
|
175
208
|
safeMode,
|
|
176
209
|
fromCrawlIntelligentSitemap,
|
|
177
210
|
datasetFromIntelligent: dataset,
|
|
178
|
-
urlsCrawledFromIntelligent:
|
|
211
|
+
urlsCrawledFromIntelligent: urlsCrawled,
|
|
179
212
|
scanDuration: remainingScanDuration,
|
|
180
213
|
});
|
|
181
|
-
} else if (
|
|
214
|
+
} else if (!hasDurationRemaining) {
|
|
182
215
|
console.log(
|
|
183
216
|
`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`,
|
|
184
217
|
);
|
|
@@ -186,7 +219,7 @@ const crawlIntelligentSitemap = async (
|
|
|
186
219
|
}
|
|
187
220
|
|
|
188
221
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
189
|
-
return { urlsCrawled
|
|
222
|
+
return { urlsCrawled, durationExceeded };
|
|
190
223
|
};
|
|
191
224
|
|
|
192
225
|
export default crawlIntelligentSitemap;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import crawlee, { LaunchContext, Request, RequestList, Dataset } from 'crawlee';
|
|
1
|
+
import crawlee, { EnqueueStrategy, LaunchContext, Request, RequestList, Dataset } from 'crawlee';
|
|
2
2
|
import fs from 'fs';
|
|
3
3
|
import * as path from 'path';
|
|
4
4
|
import fsp from 'fs/promises';
|
|
@@ -23,7 +23,7 @@ import {
|
|
|
23
23
|
waitForPageLoaded,
|
|
24
24
|
isFilePath,
|
|
25
25
|
} from '../constants/common.js';
|
|
26
|
-
import { areLinksEqual, isWhitelistedContentType, register } from '../utils.js';
|
|
26
|
+
import { areLinksEqual, isFollowStrategy, isWhitelistedContentType, normUrl, register } from '../utils.js';
|
|
27
27
|
import {
|
|
28
28
|
handlePdfDownload,
|
|
29
29
|
runPdfScan,
|
|
@@ -46,6 +46,8 @@ const crawlSitemap = async ({
|
|
|
46
46
|
blacklistedPatterns,
|
|
47
47
|
includeScreenshots,
|
|
48
48
|
extraHTTPHeaders,
|
|
49
|
+
strategy = EnqueueStrategy.All,
|
|
50
|
+
userUrl = '',
|
|
49
51
|
scanDuration = 0,
|
|
50
52
|
fromCrawlIntelligentSitemap = false,
|
|
51
53
|
userUrlInputFromIntelligent = null,
|
|
@@ -65,6 +67,8 @@ const crawlSitemap = async ({
|
|
|
65
67
|
blacklistedPatterns: string[];
|
|
66
68
|
includeScreenshots: boolean;
|
|
67
69
|
extraHTTPHeaders: Record<string, string>;
|
|
70
|
+
strategy?: EnqueueStrategy;
|
|
71
|
+
userUrl?: string;
|
|
68
72
|
scanDuration?: number;
|
|
69
73
|
fromCrawlIntelligentSitemap?: boolean;
|
|
70
74
|
userUrlInputFromIntelligent?: string;
|
|
@@ -76,6 +80,7 @@ const crawlSitemap = async ({
|
|
|
76
80
|
let dataset: crawlee.Dataset;
|
|
77
81
|
let urlsCrawled: UrlsCrawled;
|
|
78
82
|
let durationExceeded = false;
|
|
83
|
+
let isAbortingScan = false;
|
|
79
84
|
|
|
80
85
|
if (fromCrawlIntelligentSitemap) {
|
|
81
86
|
dataset = datasetFromIntelligent;
|
|
@@ -98,6 +103,8 @@ const crawlSitemap = async ({
|
|
|
98
103
|
userUrlInputFromIntelligent,
|
|
99
104
|
fromCrawlIntelligentSitemap,
|
|
100
105
|
extraHTTPHeaders,
|
|
106
|
+
strategy,
|
|
107
|
+
userUrl || sitemapUrl,
|
|
101
108
|
);
|
|
102
109
|
|
|
103
110
|
sitemapUrl = encodeURI(sitemapUrl);
|
|
@@ -244,135 +251,193 @@ const crawlSitemap = async ({
|
|
|
244
251
|
return;
|
|
245
252
|
}
|
|
246
253
|
|
|
247
|
-
|
|
254
|
+
try {
|
|
255
|
+
await waitForPageLoaded(page, 10000);
|
|
248
256
|
|
|
249
|
-
|
|
257
|
+
const actualUrl = page.url() || request.loadedUrl || request.url;
|
|
250
258
|
|
|
251
|
-
|
|
252
|
-
|
|
259
|
+
const hasExceededDuration =
|
|
260
|
+
scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
|
253
261
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
262
|
+
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
|
|
263
|
+
isAbortingScan = true;
|
|
264
|
+
if (hasExceededDuration) {
|
|
265
|
+
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
|
|
266
|
+
durationExceeded = true;
|
|
267
|
+
}
|
|
268
|
+
crawler.autoscaledPool.abort(); // stops new requests
|
|
269
|
+
return;
|
|
258
270
|
}
|
|
259
|
-
crawler.autoscaledPool.abort(); // stops new requests
|
|
260
|
-
return;
|
|
261
|
-
}
|
|
262
271
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
272
|
+
if (request.skipNavigation && actualUrl === 'about:blank') {
|
|
273
|
+
if (isScanPdfs) {
|
|
274
|
+
// pushes download promise into pdfDownloads
|
|
275
|
+
const { pdfFileName, url } = handlePdfDownload(
|
|
276
|
+
randomToken,
|
|
277
|
+
pdfDownloads,
|
|
278
|
+
request,
|
|
279
|
+
sendRequest,
|
|
280
|
+
urlsCrawled,
|
|
281
|
+
);
|
|
282
|
+
|
|
283
|
+
uuidToPdfMapping[pdfFileName] = url;
|
|
284
|
+
return;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
288
|
+
numScanned: urlsCrawled.scanned.length,
|
|
289
|
+
urlScanned: request.url,
|
|
290
|
+
});
|
|
291
|
+
urlsCrawled.userExcluded.push({
|
|
292
|
+
url: request.url,
|
|
293
|
+
pageTitle: request.url,
|
|
294
|
+
actualUrl: request.url, // because about:blank is not useful
|
|
295
|
+
metadata: STATUS_CODE_METADATA[1],
|
|
296
|
+
httpStatusCode: 1,
|
|
297
|
+
});
|
|
273
298
|
|
|
274
|
-
uuidToPdfMapping[pdfFileName] = url;
|
|
275
299
|
return;
|
|
276
300
|
}
|
|
277
301
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
urlScanned: request.url,
|
|
281
|
-
});
|
|
282
|
-
urlsCrawled.userExcluded.push({
|
|
283
|
-
url: request.url,
|
|
284
|
-
pageTitle: request.url,
|
|
285
|
-
actualUrl: request.url, // because about:blank is not useful
|
|
286
|
-
metadata: STATUS_CODE_METADATA[1],
|
|
287
|
-
httpStatusCode: 1,
|
|
288
|
-
});
|
|
302
|
+
const contentType = response?.headers?.()['content-type'] || '';
|
|
303
|
+
const status = response ? response.status() : 0;
|
|
289
304
|
|
|
290
|
-
|
|
291
|
-
|
|
305
|
+
if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
|
|
306
|
+
const isRedirected = !areLinksEqual(page.url(), request.url);
|
|
307
|
+
const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
|
|
308
|
+
item => normUrl(item.actualUrl || item.url) === normUrl(page.url()),
|
|
309
|
+
);
|
|
292
310
|
|
|
293
|
-
|
|
294
|
-
|
|
311
|
+
if (isRedirected && isLoadedUrlInCrawledUrls) {
|
|
312
|
+
urlsCrawled.notScannedRedirects.push({
|
|
313
|
+
fromUrl: request.url,
|
|
314
|
+
toUrl: actualUrl, // i.e. actualUrl
|
|
315
|
+
});
|
|
316
|
+
return;
|
|
317
|
+
}
|
|
295
318
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
319
|
+
// This logic differs from crawlDomain: it also checks whether the page was redirected before checking whether it is excluded via exclusions.txt
|
|
320
|
+
if (isRedirected && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
|
|
321
|
+
urlsCrawled.userExcluded.push({
|
|
322
|
+
url: request.url,
|
|
323
|
+
pageTitle: request.url,
|
|
324
|
+
actualUrl,
|
|
325
|
+
metadata: STATUS_CODE_METADATA[0],
|
|
326
|
+
httpStatusCode: 0,
|
|
327
|
+
});
|
|
301
328
|
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
}
|
|
329
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
330
|
+
numScanned: urlsCrawled.scanned.length,
|
|
331
|
+
urlScanned: request.url,
|
|
332
|
+
});
|
|
333
|
+
return;
|
|
334
|
+
}
|
|
309
335
|
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
336
|
+
if (isRedirected && !isFollowStrategy(actualUrl, request.url, 'same-hostname')) {
|
|
337
|
+
urlsCrawled.notScannedRedirects.push({
|
|
338
|
+
fromUrl: request.url,
|
|
339
|
+
toUrl: actualUrl,
|
|
340
|
+
});
|
|
341
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
342
|
+
numScanned: urlsCrawled.scanned.length,
|
|
343
|
+
urlScanned: request.url,
|
|
344
|
+
});
|
|
345
|
+
return;
|
|
346
|
+
}
|
|
319
347
|
|
|
320
|
-
|
|
348
|
+
const results = await runAxeScript({ includeScreenshots, page, randomToken });
|
|
349
|
+
|
|
350
|
+
// Detect JS redirects that fire during/after axe scan.
|
|
351
|
+
// Listen for navigation, then give a brief window for pending redirects to complete.
|
|
352
|
+
try {
|
|
353
|
+
let navigatedToUrl: string | null = null;
|
|
354
|
+
const onFrameNavigated = (frame: any) => {
|
|
355
|
+
if (frame === page.mainFrame()) {
|
|
356
|
+
navigatedToUrl = frame.url();
|
|
357
|
+
}
|
|
358
|
+
};
|
|
359
|
+
page.on('framenavigated', onFrameNavigated);
|
|
360
|
+
await page.waitForTimeout(1000);
|
|
361
|
+
page.off('framenavigated', onFrameNavigated);
|
|
362
|
+
|
|
363
|
+
const postScanUrl = navigatedToUrl || page.url();
|
|
364
|
+
if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
|
|
365
|
+
urlsCrawled.notScannedRedirects.push({
|
|
366
|
+
fromUrl: request.url,
|
|
367
|
+
toUrl: postScanUrl,
|
|
368
|
+
});
|
|
369
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
370
|
+
numScanned: urlsCrawled.scanned.length,
|
|
371
|
+
urlScanned: request.url,
|
|
372
|
+
});
|
|
373
|
+
return;
|
|
374
|
+
}
|
|
375
|
+
} catch (_) {
|
|
376
|
+
// Page/context was destroyed during navigation — handled by outer catch
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
321
380
|
numScanned: urlsCrawled.scanned.length,
|
|
322
381
|
urlScanned: request.url,
|
|
323
382
|
});
|
|
324
|
-
return;
|
|
325
|
-
}
|
|
326
|
-
|
|
327
|
-
const results = await runAxeScript({ includeScreenshots, page, randomToken });
|
|
328
|
-
|
|
329
|
-
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
330
|
-
numScanned: urlsCrawled.scanned.length,
|
|
331
|
-
urlScanned: request.url,
|
|
332
|
-
});
|
|
333
383
|
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
384
|
+
urlsCrawled.scanned.push({
|
|
385
|
+
url: request.url,
|
|
386
|
+
pageTitle: results.pageTitle,
|
|
387
|
+
actualUrl, // i.e. actualUrl
|
|
388
|
+
});
|
|
339
389
|
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
390
|
+
urlsCrawled.scannedRedirects.push({
|
|
391
|
+
fromUrl: request.url,
|
|
392
|
+
toUrl: actualUrl,
|
|
393
|
+
});
|
|
344
394
|
|
|
345
|
-
|
|
346
|
-
|
|
395
|
+
results.url = request.url;
|
|
396
|
+
results.actualUrl = actualUrl;
|
|
347
397
|
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
398
|
+
await dataset.pushData(results);
|
|
399
|
+
} else {
|
|
400
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
401
|
+
numScanned: urlsCrawled.scanned.length,
|
|
402
|
+
urlScanned: request.url,
|
|
403
|
+
});
|
|
354
404
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
405
|
+
if (isScanHtml) {
|
|
406
|
+
// carry through the HTTP status metadata
|
|
407
|
+
const status = response?.status();
|
|
408
|
+
const metadata =
|
|
409
|
+
typeof status === 'number'
|
|
410
|
+
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
411
|
+
: STATUS_CODE_METADATA[2];
|
|
412
|
+
|
|
413
|
+
urlsCrawled.invalid.push({
|
|
414
|
+
actualUrl,
|
|
415
|
+
url: request.url,
|
|
416
|
+
pageTitle: request.url,
|
|
417
|
+
metadata,
|
|
418
|
+
httpStatusCode: typeof status === 'number' ? status : 0,
|
|
419
|
+
});
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
} catch (e) {
|
|
423
|
+
if (!isAbortingScan) {
|
|
424
|
+
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
425
|
+
numScanned: urlsCrawled.scanned.length,
|
|
426
|
+
urlScanned: request.url,
|
|
427
|
+
});
|
|
362
428
|
|
|
363
|
-
urlsCrawled.
|
|
364
|
-
actualUrl,
|
|
429
|
+
urlsCrawled.error.push({
|
|
365
430
|
url: request.url,
|
|
366
431
|
pageTitle: request.url,
|
|
367
|
-
|
|
368
|
-
|
|
432
|
+
actualUrl: request.url,
|
|
433
|
+
metadata: STATUS_CODE_METADATA[2],
|
|
434
|
+
httpStatusCode: 0,
|
|
369
435
|
});
|
|
370
436
|
}
|
|
371
437
|
}
|
|
372
438
|
},
|
|
373
439
|
failedRequestHandler: async ({ request, response, error }) => {
|
|
374
|
-
|
|
375
|
-
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
|
|
440
|
+
if (isAbortingScan) {
|
|
376
441
|
return;
|
|
377
442
|
}
|
|
378
443
|
|