@govtechsg/oobee 0.10.86 → 0.10.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/image.yml +2 -3
- package/dist/cli.js +18 -5
- package/dist/combine.js +2 -0
- package/dist/constants/cliFunctions.js +2 -2
- package/dist/constants/common.js +55 -13
- package/dist/crawlers/crawlDomain.js +38 -13
- package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
- package/dist/crawlers/crawlSitemap.js +44 -5
- package/dist/crawlers/custom/utils.js +81 -40
- package/dist/generateHtmlReport.js +18 -11
- package/dist/mergeAxeResults/itemReferences.js +60 -25
- package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
- package/dist/mergeAxeResults.js +18 -9
- package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
- package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
- package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/dist/static/ejs/summary.ejs +18 -12
- package/dist/utils.js +4 -3
- package/fix-summary-html-oom-pr.md +62 -0
- package/package.json +5 -5
- package/src/cli.ts +19 -5
- package/src/combine.ts +2 -0
- package/src/constants/cliFunctions.ts +2 -2
- package/src/constants/common.ts +65 -12
- package/src/crawlers/crawlDomain.ts +39 -13
- package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
- package/src/crawlers/crawlSitemap.ts +50 -3
- package/src/crawlers/custom/utils.ts +99 -43
- package/src/generateHtmlReport.ts +21 -11
- package/src/mergeAxeResults/itemReferences.ts +70 -26
- package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
- package/src/mergeAxeResults.ts +21 -11
- package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
- package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/src/static/ejs/summary.ejs +18 -12
- package/src/utils.ts +4 -3
- package/testStaticJSScanner.html +1 -1
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# fix: prevent OOM and browser crash in report generation for large scans
|
|
2
|
+
|
|
3
|
+
## Summary
|
|
4
|
+
|
|
5
|
+
- Fix `summary.ejs` inlining the entire scan items payload (2 GB+ for 1000-page scans) via `JSON.stringify`, causing V8 OOM and killing the process
|
|
6
|
+
- Fix `report.html` embedded scanItems exceeding browser memory limits (746 MB uncompressed JSON for 1000-page scans)
|
|
7
|
+
- Fix write stream backpressure handling when embedding chunked base64 data
|
|
8
|
+
- Fix the `writeSummaryHTML` crash also blocking `report.html` generation (`writeSummaryHTML` runs first, so its failure prevented the report from being written)
|
|
9
|
+
|
|
10
|
+
## Problem 1: OOM in summary.html generation (server-side)
|
|
11
|
+
|
|
12
|
+
For large scans (e.g. 1000 pages, 2.5M+ passed occurrences), `summary.ejs` serialized the full `items` object — including every rule's `pagesAffected` array with all individual issue items — into an inline `<script>` tag. This produced a string exceeding V8's limits, crashing the process silently.
|
|
13
|
+
|
|
14
|
+
The result: neither `summary.html` nor `report.html` was generated, even though all JSON artifacts (`scanData.json`, `scanItems.json`, etc.) were written successfully.
|
|
15
|
+
|
|
16
|
+
## Problem 2: Browser cannot parse embedded scanItems (client-side)
|
|
17
|
+
|
|
18
|
+
Even with report generation fixed, the browser failed to load the All Issues view:
|
|
19
|
+
```
|
|
20
|
+
Failed to decode/unzip/parse: Unexpected end of JSON input
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Root cause: `convertItemsToReferences` stripped per-page `items` arrays but still embedded the full `pagesAffected` array (url, pageTitle, actualUrl, metadata, etc. for every page × every rule). For 1000-page scans this produced **746 MB of uncompressed JSON** after base64-decode and gunzip — exceeding browser string/memory limits during `JSON.parse()`.
|
|
24
|
+
|
|
25
|
+
## Problem 3: Write stream backpressure (server-side)
|
|
26
|
+
|
|
27
|
+
The `writeHTML` function writes scan items as 2 MB base64 chunks via a `for await` loop over a read stream. The return value of `outputStream.write()` was not being checked for backpressure — when the write buffer filled up, the loop kept pushing chunks without waiting for the `drain` event, so unflushed data piled up in memory and the embedded base64 could end up truncated.
|
|
28
|
+
|
|
29
|
+
## Fix
|
|
30
|
+
|
|
31
|
+
### summary.ejs (OOM fix)
|
|
32
|
+
Strip the inline JSON to only what `summaryTable.ejs` actually needs:
|
|
33
|
+
- Rule-level metadata: `description`, `helpUrl`, `conformance`, `totalItems`
|
|
34
|
+
- `pagesAffected: { length: N }` (just the count object, not the full array)
|
|
35
|
+
|
|
36
|
+
This reduces the serialized payload from potentially gigabytes to a few kilobytes regardless of scan size.
|
|
37
|
+
|
|
38
|
+
### itemReferences.ts (browser payload fix)
|
|
39
|
+
`convertItemsToReferences` now strips each `pagesAffected` entry down to only `url`, `pageTitle`, and `itemsCount` — removing all per-item details (html snippets, screenshots, xpath, metadata, etc.) that constituted the bulk of the data. The All Issues list renders rule totals, and the "Group By Page" view in the rule modal still shows page URLs with occurrence counts.
|
|
40
|
+
|
|
41
|
+
This reduces the embedded payload from 746 MB (uncompressed) to ~11 MB for a 1000-page scan — well within browser memory limits.
|
|
42
|
+
|
|
43
|
+
### mergeAxeResults.ts (backpressure fix)
|
|
44
|
+
Await the `drain` event on the output stream when `write()` returns `false` before writing the next chunk. This ensures all base64 data is fully written to the report regardless of payload size.
|
|
45
|
+
|
|
46
|
+
## Files changed
|
|
47
|
+
|
|
48
|
+
| File | Change |
|
|
49
|
+
|------|--------|
|
|
50
|
+
| `src/static/ejs/summary.ejs` | Strip inline JSON to rule-level metadata and page counts only |
|
|
51
|
+
| `src/mergeAxeResults/itemReferences.ts` | Strip `pagesAffected` to lightweight entries (url, pageTitle, itemsCount only) |
|
|
52
|
+
| `src/mergeAxeResults.ts` | Await drain on backpressure during chunked write |
|
|
53
|
+
| `src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs` | Fall back to `pagesAffectedCount` |
|
|
54
|
+
| `src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs` | Fall back to `pagesAffectedCount` |
|
|
55
|
+
|
|
56
|
+
## Test plan
|
|
57
|
+
|
|
58
|
+
- [ ] Run a large scan (500+ pages) and verify both `summary.html` and `report.html` are generated
|
|
59
|
+
- [ ] Open `summary.html` in a browser and verify the summary table renders correctly (issue counts, page counts, help links)
|
|
60
|
+
- [ ] Open `report.html` and verify the All Issues list loads and displays rule counts correctly
|
|
61
|
+
- [ ] Verify the rule modal shows the correct "Pages affected" count
|
|
62
|
+
- [ ] Verify small scans still produce correct reports (no regression)
|
package/package.json
CHANGED
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@govtechsg/oobee",
|
|
3
3
|
"main": "dist/npmIndex.js",
|
|
4
|
-
"version": "0.10.
|
|
4
|
+
"version": "0.10.87",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"author": "Government Technology Agency <info@tech.gov.sg>",
|
|
7
7
|
"bin": {
|
|
8
8
|
"oobee": "./dist/cli.js"
|
|
9
9
|
},
|
|
10
10
|
"dependencies": {
|
|
11
|
-
"@aws-sdk/client-s3": "^3.
|
|
11
|
+
"@aws-sdk/client-s3": "^3.1049.0",
|
|
12
12
|
"@json2csv/node": "^7.0.3",
|
|
13
13
|
"@napi-rs/canvas": "^0.1.53",
|
|
14
14
|
"@sentry/node": "^9.13.0",
|
|
15
15
|
"@types/aws-sdk": "^0.0.42",
|
|
16
|
-
"axe-core": "^4.11.
|
|
16
|
+
"axe-core": "^4.11.4",
|
|
17
17
|
"axios": "^1.8.2",
|
|
18
18
|
"base64-stream": "^1.0.0",
|
|
19
19
|
"cheerio": "^1.0.0-rc.12",
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
"tldts": "^7.0.27",
|
|
40
40
|
"typescript": "^5.4.5",
|
|
41
41
|
"url": "^0.11.3",
|
|
42
|
-
"uuid": "^
|
|
42
|
+
"uuid": "^14.0.0",
|
|
43
43
|
"validator": "^13.11.0",
|
|
44
44
|
"which": "^4.0.0",
|
|
45
45
|
"winston": "^3.11.0",
|
|
@@ -86,7 +86,7 @@
|
|
|
86
86
|
"fast-xml-parser": ">=5.3.8",
|
|
87
87
|
"js-yaml": "^4.1.1",
|
|
88
88
|
"minimatch": "^10.2.4",
|
|
89
|
-
"brace-expansion": "^5.0.
|
|
89
|
+
"brace-expansion": "^5.0.6",
|
|
90
90
|
"glob": "^13.0.6",
|
|
91
91
|
"flatted": "^3.4.1",
|
|
92
92
|
"file-type": "^21.3.3"
|
package/src/cli.ts
CHANGED
|
@@ -193,8 +193,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
|
|
|
193
193
|
.check(argvs => {
|
|
194
194
|
const scanner = String(argvs.scanner ?? '');
|
|
195
195
|
|
|
196
|
-
if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
|
|
197
|
-
throw new Error('-s or --strategy is only available in website
|
|
196
|
+
if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
|
|
197
|
+
throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
|
|
198
|
+
}
|
|
199
|
+
if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
|
|
200
|
+
throw new Error('-s ignore is only available for sitemap scans.');
|
|
198
201
|
}
|
|
199
202
|
return true;
|
|
200
203
|
})
|
|
@@ -210,14 +213,21 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
|
|
|
210
213
|
return duration;
|
|
211
214
|
})
|
|
212
215
|
.check(argvs => {
|
|
213
|
-
if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
|
|
214
|
-
throw new Error('-s or --strategy is only available in website scans.');
|
|
216
|
+
if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
|
|
217
|
+
throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
|
|
218
|
+
}
|
|
219
|
+
if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
|
|
220
|
+
throw new Error('-s ignore is only available for sitemap scans.');
|
|
215
221
|
}
|
|
216
222
|
return true;
|
|
217
223
|
})
|
|
218
224
|
.conflicts('d', 'w')
|
|
219
225
|
.parse() as unknown as Answers;
|
|
220
226
|
|
|
227
|
+
if (!options.strategy) {
|
|
228
|
+
options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
|
|
229
|
+
}
|
|
230
|
+
|
|
221
231
|
const scanInit = async (argvs: Answers): Promise<string> => {
|
|
222
232
|
const updatedArgvs = { ...argvs };
|
|
223
233
|
|
|
@@ -250,7 +260,11 @@ const scanInit = async (argvs: Answers): Promise<string> => {
|
|
|
250
260
|
consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
|
|
251
261
|
|
|
252
262
|
if (res.status === statuses.success.code) {
|
|
253
|
-
|
|
263
|
+
// Custom flow should continue from the user-provided entry URL so auth redirects
|
|
264
|
+
// do not replace the original domain used for overlay gating and navigation.
|
|
265
|
+
if (data.type !== ScannerTypes.CUSTOM) {
|
|
266
|
+
data.url = res.url;
|
|
267
|
+
}
|
|
254
268
|
if (process.env.OOBEE_VALIDATE_URL) {
|
|
255
269
|
consoleLogger.info('Url is valid');
|
|
256
270
|
cleanUpAndExit(0, data.randomToken);
|
package/src/combine.ts
CHANGED
|
@@ -168,8 +168,8 @@ export const cliOptions: { [key: string]: Options } = {
|
|
|
168
168
|
s: {
|
|
169
169
|
alias: 'strategy',
|
|
170
170
|
describe:
|
|
171
|
-
'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
|
|
172
|
-
choices: ['same-domain', 'same-hostname'],
|
|
171
|
+
'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
|
|
172
|
+
choices: ['same-domain', 'same-hostname', 'ignore'],
|
|
173
173
|
requiresArg: true,
|
|
174
174
|
demandOption: false,
|
|
175
175
|
},
|
package/src/constants/common.ts
CHANGED
|
@@ -33,7 +33,7 @@ import constants, {
|
|
|
33
33
|
} from './constants.js';
|
|
34
34
|
import { consoleLogger } from '../logs.js';
|
|
35
35
|
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
|
36
|
-
import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
|
|
36
|
+
import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
|
|
37
37
|
import { Answers, Data } from '../index.js';
|
|
38
38
|
import { DeviceDescriptor } from '../types/types.js';
|
|
39
39
|
import { getProxyInfo, proxyInfoToResolution, ProxySettings } from '../proxyService.js';
|
|
@@ -746,7 +746,9 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
746
746
|
playwrightDeviceDetailsObject,
|
|
747
747
|
maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
|
|
748
748
|
strategy:
|
|
749
|
-
strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
|
|
749
|
+
strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
|
|
750
|
+
: strategy === 'ignore' ? EnqueueStrategy.All
|
|
751
|
+
: EnqueueStrategy.SameDomain,
|
|
750
752
|
isLocalFileScan,
|
|
751
753
|
browser: browserToRun,
|
|
752
754
|
nameEmail,
|
|
@@ -804,7 +806,11 @@ export const getUrlsFromRobotsTxt = async (
|
|
|
804
806
|
const disallowedUrls = [];
|
|
805
807
|
const allowedUrls = [];
|
|
806
808
|
|
|
807
|
-
|
|
809
|
+
// Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
|
|
810
|
+
// Two patterns are returned for bare paths (no trailing wildcard) so that
|
|
811
|
+
// both the exact URL and all child paths are blocked, matching robots.txt
|
|
812
|
+
// prefix semantics.
|
|
813
|
+
const sanitisePattern = (pattern: string): string[] => {
|
|
808
814
|
const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
|
|
809
815
|
const subdirWildcardRegex = /\/\*\//g;
|
|
810
816
|
const filePathRegex = /^\/(?:[^\/]+\/)*[^\/]+\.[a-zA-Z0-9]{1,6}$/;
|
|
@@ -812,16 +818,30 @@ export const getUrlsFromRobotsTxt = async (
|
|
|
812
818
|
if (subdirWildcardRegex.test(pattern)) {
|
|
813
819
|
pattern = pattern.replace(subdirWildcardRegex, '/**/');
|
|
814
820
|
}
|
|
821
|
+
|
|
822
|
+
// Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
|
|
823
|
+
// '?' is the query separator in robots.txt but a single-char wildcard in
|
|
824
|
+
// minimatch. Escape it to a literal match and append '*' so any query
|
|
825
|
+
// value after the stated prefix is also blocked.
|
|
826
|
+
if (pattern.includes('?')) {
|
|
827
|
+
return [domain + pattern.replace('?', '\\?') + '*'];
|
|
828
|
+
}
|
|
829
|
+
|
|
815
830
|
if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
|
|
816
831
|
if (pattern.endsWith('*')) {
|
|
817
|
-
|
|
832
|
+
// e.g. /ebook/* → /ebook/** (already covers all children)
|
|
833
|
+
return [domain + pattern.concat('*')];
|
|
818
834
|
} else {
|
|
819
|
-
|
|
820
|
-
|
|
835
|
+
// Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
|
|
836
|
+
// exact URL *and* every descendant. minimatch's '/**' glob does not
|
|
837
|
+
// match the bare path itself (no trailing slash), so we emit both the
|
|
838
|
+
// exact-path pattern and a children glob.
|
|
839
|
+
const base = domain + pattern;
|
|
840
|
+
const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
|
|
841
|
+
return [base, children];
|
|
821
842
|
}
|
|
822
843
|
}
|
|
823
|
-
|
|
824
|
-
return final;
|
|
844
|
+
return [domain + pattern];
|
|
825
845
|
};
|
|
826
846
|
|
|
827
847
|
for (const line of lines) {
|
|
@@ -832,14 +852,12 @@ export const getUrlsFromRobotsTxt = async (
|
|
|
832
852
|
} else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
|
|
833
853
|
let disallowed = line.substring('disallow: '.length).trim();
|
|
834
854
|
if (disallowed) {
|
|
835
|
-
|
|
836
|
-
disallowedUrls.push(disallowed);
|
|
855
|
+
disallowedUrls.push(...sanitisePattern(disallowed));
|
|
837
856
|
}
|
|
838
857
|
} else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
|
|
839
858
|
let allowed = line.substring('allow: '.length).trim();
|
|
840
859
|
if (allowed) {
|
|
841
|
-
|
|
842
|
-
allowedUrls.push(allowed);
|
|
860
|
+
allowedUrls.push(...sanitisePattern(allowed));
|
|
843
861
|
}
|
|
844
862
|
}
|
|
845
863
|
}
|
|
@@ -899,6 +917,38 @@ const getRobotsTxtViaPlaywright = async (
|
|
|
899
917
|
}
|
|
900
918
|
};
|
|
901
919
|
|
|
920
|
+
export const getSitemapsFromRobotsTxt = async (
|
|
921
|
+
url: string,
|
|
922
|
+
browser: string,
|
|
923
|
+
userDataDirectory: string,
|
|
924
|
+
extraHTTPHeaders: Record<string, string>,
|
|
925
|
+
): Promise<string[]> => {
|
|
926
|
+
const domain = new URL(url).origin;
|
|
927
|
+
const robotsUrl = domain.concat('/robots.txt');
|
|
928
|
+
|
|
929
|
+
let robotsTxt: string;
|
|
930
|
+
try {
|
|
931
|
+
robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
|
|
932
|
+
} catch (e) {
|
|
933
|
+
consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
|
|
934
|
+
return [];
|
|
935
|
+
}
|
|
936
|
+
|
|
937
|
+
if (!robotsTxt) return [];
|
|
938
|
+
|
|
939
|
+
const sitemaps: string[] = [];
|
|
940
|
+
const lines = robotsTxt.split(/\r?\n/);
|
|
941
|
+
for (const line of lines) {
|
|
942
|
+
if (line.toLowerCase().startsWith('sitemap:')) {
|
|
943
|
+
const sitemapUrl = line.substring('sitemap:'.length).trim();
|
|
944
|
+
if (sitemapUrl) {
|
|
945
|
+
sitemaps.push(sitemapUrl);
|
|
946
|
+
}
|
|
947
|
+
}
|
|
948
|
+
}
|
|
949
|
+
return sitemaps;
|
|
950
|
+
};
|
|
951
|
+
|
|
902
952
|
export const isDisallowedInRobotsTxt = (url: string): boolean => {
|
|
903
953
|
if (!constants.robotsTxtUrls) return;
|
|
904
954
|
|
|
@@ -931,6 +981,8 @@ export const getLinksFromSitemap = async (
|
|
|
931
981
|
userUrlInput: string,
|
|
932
982
|
isIntelligent: boolean,
|
|
933
983
|
extraHTTPHeaders: Record<string, string>,
|
|
984
|
+
strategy: EnqueueStrategy = EnqueueStrategy.All,
|
|
985
|
+
userUrl: string = userUrlInput,
|
|
934
986
|
) => {
|
|
935
987
|
const scannedSitemaps = new Set<string>();
|
|
936
988
|
const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
|
|
@@ -940,6 +992,7 @@ export const getLinksFromSitemap = async (
|
|
|
940
992
|
const addToUrlList = (url: string) => {
|
|
941
993
|
if (!url) return;
|
|
942
994
|
if (isDisallowedInRobotsTxt(url)) return;
|
|
995
|
+
if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy)) return;
|
|
943
996
|
|
|
944
997
|
url = convertPathToLocalFile(url);
|
|
945
998
|
|
|
@@ -29,7 +29,7 @@ import {
|
|
|
29
29
|
getUrlsFromRobotsTxt,
|
|
30
30
|
waitForPageLoaded,
|
|
31
31
|
} from '../constants/common.js';
|
|
32
|
-
import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
|
|
32
|
+
import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
|
|
33
33
|
import {
|
|
34
34
|
handlePdfDownload,
|
|
35
35
|
runPdfScan,
|
|
@@ -116,9 +116,9 @@ const crawlDomain = async ({
|
|
|
116
116
|
const pdfDownloads: Promise<void>[] = [];
|
|
117
117
|
const uuidToPdfMapping: Record<string, string> = {};
|
|
118
118
|
const queuedUrlSet = new Set<string>();
|
|
119
|
-
const scannedUrlSet = new Set<string>(urlsCrawled.scanned.map(item => item.url));
|
|
119
|
+
const scannedUrlSet = new Set<string>(urlsCrawled.scanned.map(item => normUrl(item.url)));
|
|
120
120
|
const scannedResolvedUrlSet = new Set<string>(
|
|
121
|
-
urlsCrawled.scanned.map(item => item.actualUrl || item.url),
|
|
121
|
+
urlsCrawled.scanned.map(item => normUrl(item.actualUrl || item.url)),
|
|
122
122
|
);
|
|
123
123
|
const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
|
|
124
124
|
const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
|
|
@@ -166,13 +166,14 @@ const crawlDomain = async ({
|
|
|
166
166
|
const selectedElementsString = cssQuerySelectors.join(', ');
|
|
167
167
|
|
|
168
168
|
const isExcluded = (newPageUrl: string): boolean => {
|
|
169
|
-
const isAlreadyScanned: boolean =
|
|
169
|
+
const isAlreadyScanned: boolean = scannedUrlSet.has(normUrl(newPageUrl));
|
|
170
170
|
const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
|
|
171
171
|
const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
|
|
172
172
|
const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern =>
|
|
173
173
|
newPageUrl.toLowerCase().startsWith(pattern),
|
|
174
174
|
);
|
|
175
|
-
|
|
175
|
+
const isRobotsDisallowed: boolean = isDisallowedInRobotsTxt(newPageUrl);
|
|
176
|
+
return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy || isRobotsDisallowed;
|
|
176
177
|
};
|
|
177
178
|
const setPageListeners = (pageListener: Page): void => {
|
|
178
179
|
// event listener to handle new page popups upon button click
|
|
@@ -341,7 +342,7 @@ const crawlDomain = async ({
|
|
|
341
342
|
} catch (e) {
|
|
342
343
|
consoleLogger.error(e);
|
|
343
344
|
}
|
|
344
|
-
if (scannedUrlSet.has(req.url)) {
|
|
345
|
+
if (scannedUrlSet.has(normUrl(req.url))) {
|
|
345
346
|
req.skipNavigation = true;
|
|
346
347
|
}
|
|
347
348
|
if (isDisallowedInRobotsTxt(req.url)) return null;
|
|
@@ -481,7 +482,7 @@ const crawlDomain = async ({
|
|
|
481
482
|
}
|
|
482
483
|
|
|
483
484
|
const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
|
|
484
|
-
if (isRedirected) {
|
|
485
|
+
if (isRedirected && !isDisallowedInRobotsTxt(finalUrl)) {
|
|
485
486
|
await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
|
|
486
487
|
} else {
|
|
487
488
|
request.skipNavigation = false;
|
|
@@ -537,7 +538,7 @@ const crawlDomain = async ({
|
|
|
537
538
|
}
|
|
538
539
|
|
|
539
540
|
// if URL has already been scanned
|
|
540
|
-
if (scannedUrlSet.has(request.url)) {
|
|
541
|
+
if (scannedUrlSet.has(normUrl(request.url))) {
|
|
541
542
|
await enqueueProcess(page, enqueueLinks, browserContext);
|
|
542
543
|
return;
|
|
543
544
|
}
|
|
@@ -654,8 +655,33 @@ const crawlDomain = async ({
|
|
|
654
655
|
|
|
655
656
|
const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
|
|
656
657
|
|
|
658
|
+
// Detect JS redirects that fire during/after axe scan.
|
|
659
|
+
// Listen for navigation, then give a brief window for pending redirects to complete.
|
|
660
|
+
try {
|
|
661
|
+
let navigatedToUrl: string | null = null;
|
|
662
|
+
const onFrameNavigated = (frame: Frame) => {
|
|
663
|
+
if (frame === page.mainFrame()) {
|
|
664
|
+
navigatedToUrl = frame.url();
|
|
665
|
+
}
|
|
666
|
+
};
|
|
667
|
+
page.on('framenavigated', onFrameNavigated);
|
|
668
|
+
await page.waitForTimeout(1000);
|
|
669
|
+
page.off('framenavigated', onFrameNavigated);
|
|
670
|
+
|
|
671
|
+
const postScanUrl = navigatedToUrl || page.url();
|
|
672
|
+
if (postScanUrl && postScanUrl !== 'about:blank' && !isFollowStrategy(postScanUrl, request.url, 'same-hostname')) {
|
|
673
|
+
urlsCrawled.notScannedRedirects.push({
|
|
674
|
+
fromUrl: request.url,
|
|
675
|
+
toUrl: postScanUrl,
|
|
676
|
+
});
|
|
677
|
+
return;
|
|
678
|
+
}
|
|
679
|
+
} catch (_) {
|
|
680
|
+
// Page/context was destroyed during navigation — handled by outer catch
|
|
681
|
+
}
|
|
682
|
+
|
|
657
683
|
if (isRedirected) {
|
|
658
|
-
const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);
|
|
684
|
+
const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(normUrl(actualUrl));
|
|
659
685
|
|
|
660
686
|
if (isLoadedUrlInCrawledUrls) {
|
|
661
687
|
urlsCrawled.notScannedRedirects.push({
|
|
@@ -677,8 +703,8 @@ const crawlDomain = async ({
|
|
|
677
703
|
pageTitle: results.pageTitle,
|
|
678
704
|
actualUrl, // i.e. actualUrl
|
|
679
705
|
});
|
|
680
|
-
scannedUrlSet.add(request.url);
|
|
681
|
-
scannedResolvedUrlSet.add(actualUrl);
|
|
706
|
+
scannedUrlSet.add(normUrl(request.url));
|
|
707
|
+
scannedResolvedUrlSet.add(normUrl(actualUrl));
|
|
682
708
|
|
|
683
709
|
urlsCrawled.scannedRedirects.push({
|
|
684
710
|
fromUrl: request.url,
|
|
@@ -700,8 +726,8 @@ const crawlDomain = async ({
|
|
|
700
726
|
actualUrl: request.url,
|
|
701
727
|
pageTitle: results.pageTitle,
|
|
702
728
|
});
|
|
703
|
-
scannedUrlSet.add(request.url);
|
|
704
|
-
scannedResolvedUrlSet.add(request.url);
|
|
729
|
+
scannedUrlSet.add(normUrl(request.url));
|
|
730
|
+
scannedResolvedUrlSet.add(normUrl(request.url));
|
|
705
731
|
await dataset.pushData(results);
|
|
706
732
|
}
|
|
707
733
|
} else {
|
|
@@ -7,7 +7,7 @@ import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
|
7
7
|
import crawlDomain from './crawlDomain.js';
|
|
8
8
|
import crawlSitemap from './crawlSitemap.js';
|
|
9
9
|
import { ViewportSettingsClass } from '../combine.js';
|
|
10
|
-
import { getPlaywrightLaunchOptions } from '../constants/common.js';
|
|
10
|
+
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
|
|
11
11
|
import { register } from '../utils.js';
|
|
12
12
|
|
|
13
13
|
const crawlIntelligentSitemap = async (
|
|
@@ -100,12 +100,30 @@ const crawlIntelligentSitemap = async (
|
|
|
100
100
|
}
|
|
101
101
|
};
|
|
102
102
|
|
|
103
|
+
// Discover sitemaps from robots.txt first (supports multiple Sitemap: directives)
|
|
104
|
+
let sitemapUrls: string[] = [];
|
|
103
105
|
try {
|
|
104
|
-
|
|
106
|
+
sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
|
|
107
|
+
if (sitemapUrls.length > 0) {
|
|
108
|
+
console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
|
|
109
|
+
sitemapExist = true;
|
|
110
|
+
}
|
|
105
111
|
} catch (error) {
|
|
106
112
|
consoleLogger.error(error);
|
|
107
113
|
}
|
|
108
114
|
|
|
115
|
+
// Fall back to hardcoded path probing if robots.txt had no sitemaps
|
|
116
|
+
if (!sitemapExist) {
|
|
117
|
+
try {
|
|
118
|
+
sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
|
|
119
|
+
if (sitemapExist) {
|
|
120
|
+
sitemapUrls = [sitemapUrl];
|
|
121
|
+
}
|
|
122
|
+
} catch (error) {
|
|
123
|
+
consoleLogger.error(error);
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
109
127
|
if (!sitemapExist) {
|
|
110
128
|
console.log('Unable to find sitemap. Commencing website crawl instead.');
|
|
111
129
|
return await crawlDomain({
|
|
@@ -124,38 +142,53 @@ const crawlIntelligentSitemap = async (
|
|
|
124
142
|
followRobots,
|
|
125
143
|
extraHTTPHeaders,
|
|
126
144
|
safeMode,
|
|
127
|
-
scanDuration,
|
|
145
|
+
scanDuration,
|
|
128
146
|
});
|
|
129
147
|
}
|
|
130
148
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
149
|
+
// Process all discovered sitemaps sequentially, sharing dataset and urlsCrawled
|
|
150
|
+
for (const currentSitemapUrl of sitemapUrls) {
|
|
151
|
+
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) break;
|
|
152
|
+
|
|
153
|
+
const elapsed = Date.now() - startTime;
|
|
154
|
+
const remainingDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : scanDuration;
|
|
155
|
+
if (scanDuration > 0 && remainingDuration <= 0) {
|
|
156
|
+
durationExceeded = true;
|
|
157
|
+
break;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
console.log(`Processing sitemap: ${currentSitemapUrl}`);
|
|
161
|
+
urlsCrawledFinal = await crawlSitemap({
|
|
162
|
+
sitemapUrl: currentSitemapUrl,
|
|
163
|
+
randomToken,
|
|
164
|
+
host,
|
|
165
|
+
viewportSettings,
|
|
166
|
+
maxRequestsPerCrawl,
|
|
167
|
+
browser,
|
|
168
|
+
userDataDirectory,
|
|
169
|
+
specifiedMaxConcurrency,
|
|
170
|
+
fileTypes,
|
|
171
|
+
blacklistedPatterns,
|
|
172
|
+
includeScreenshots,
|
|
173
|
+
extraHTTPHeaders,
|
|
174
|
+
strategy,
|
|
175
|
+
userUrl: url,
|
|
176
|
+
fromCrawlIntelligentSitemap,
|
|
177
|
+
userUrlInputFromIntelligent: url,
|
|
178
|
+
datasetFromIntelligent: dataset,
|
|
179
|
+
urlsCrawledFromIntelligent: urlsCrawled,
|
|
180
|
+
crawledFromLocalFile: false,
|
|
181
|
+
scanDuration: scanDuration > 0 ? remainingDuration : 0,
|
|
182
|
+
});
|
|
183
|
+
}
|
|
152
184
|
|
|
153
185
|
const elapsed = Date.now() - startTime;
|
|
154
|
-
const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0)
|
|
186
|
+
const remainingScanDuration = scanDuration > 0 ? Math.max(scanDuration - elapsed / 1000, 0) : 0;
|
|
187
|
+
const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
|
|
155
188
|
|
|
156
|
-
if (
|
|
189
|
+
if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
|
|
157
190
|
console.log(
|
|
158
|
-
`Continuing crawl from root website
|
|
191
|
+
`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`,
|
|
159
192
|
);
|
|
160
193
|
urlsCrawledFinal = await crawlDomain({
|
|
161
194
|
url,
|
|
@@ -175,10 +208,10 @@ const crawlIntelligentSitemap = async (
|
|
|
175
208
|
safeMode,
|
|
176
209
|
fromCrawlIntelligentSitemap,
|
|
177
210
|
datasetFromIntelligent: dataset,
|
|
178
|
-
urlsCrawledFromIntelligent:
|
|
211
|
+
urlsCrawledFromIntelligent: urlsCrawled,
|
|
179
212
|
scanDuration: remainingScanDuration,
|
|
180
213
|
});
|
|
181
|
-
} else if (
|
|
214
|
+
} else if (!hasDurationRemaining) {
|
|
182
215
|
console.log(
|
|
183
216
|
`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`,
|
|
184
217
|
);
|
|
@@ -186,7 +219,7 @@ const crawlIntelligentSitemap = async (
|
|
|
186
219
|
}
|
|
187
220
|
|
|
188
221
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
189
|
-
return { urlsCrawled
|
|
222
|
+
return { urlsCrawled, durationExceeded };
|
|
190
223
|
};
|
|
191
224
|
|
|
192
225
|
export default crawlIntelligentSitemap;
|