@govtechsg/oobee 0.10.93 → 0.10.94
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +20 -0
- package/dist/cli.js +3 -2
- package/dist/combine.js +3 -3
- package/dist/constants/common.js +119 -52
- package/dist/crawlers/commonCrawlerFunc.js +11 -2
- package/dist/crawlers/crawlDomain.js +4 -6
- package/dist/crawlers/crawlSitemap.js +14 -2
- package/dist/crawlers/custom/utils.js +22 -9
- package/dist/crawlers/guards/urlGuard.js +19 -1
- package/dist/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
- package/dist/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
- package/dist/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
- package/dist/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
- package/dist/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
- package/dist/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
- package/dist/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
- package/dist/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
- package/dist/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
- package/dist/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
- package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
- package/dist/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
- package/dist/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
- package/dist/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
- package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
- package/dist/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
- package/dist/static/ejs/partials/styles/styles.ejs +1 -1
- package/dist/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
- package/dist/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
- package/dist/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
- package/oobee-client-scanner.js +2 -2
- package/package.json +1 -1
- package/src/cli.ts +3 -2
- package/src/combine.ts +3 -2
- package/src/constants/common.ts +112 -36
- package/src/crawlers/commonCrawlerFunc.ts +11 -2
- package/src/crawlers/crawlDomain.ts +4 -5
- package/src/crawlers/crawlSitemap.ts +19 -2
- package/src/crawlers/custom/utils.ts +26 -13
- package/src/crawlers/guards/urlGuard.ts +18 -1
- package/src/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
- package/src/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
- package/src/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
- package/src/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
- package/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
- package/src/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
- package/src/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
- package/src/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
- package/src/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
- package/src/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
- package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
- package/src/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
- package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
- package/src/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
- package/src/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
- package/src/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
- package/src/static/ejs/partials/styles/styles.ejs +1 -1
- package/src/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
- package/src/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
- package/src/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
- package/testStaticJSScanner.html +1 -1
- /package/{7339fae5-e8ed-4b50-af13-317847620dbf.txt → 67e8137b-1939-4253-8f11-a82bc833cfcb.txt} +0 -0
|
@@ -39,6 +39,11 @@
|
|
|
39
39
|
.gauge-number {
|
|
40
40
|
font-weight: 700;
|
|
41
41
|
font-size: 16px;
|
|
42
|
+
color: var(--dark-charcoal, #1f1f1f);
|
|
43
|
+
background-color: var(--true-white, #fff);
|
|
44
|
+
display: inline-block;
|
|
45
|
+
padding: 2px 8px;
|
|
46
|
+
border-radius: 999px;
|
|
42
47
|
}
|
|
43
48
|
|
|
44
49
|
.gauge-caption {
|
|
@@ -54,6 +59,7 @@
|
|
|
54
59
|
|
|
55
60
|
.gauge-value-number {
|
|
56
61
|
font-size: 24px;
|
|
62
|
+
color: var(--dark-charcoal, #1f1f1f);
|
|
57
63
|
}
|
|
58
64
|
|
|
59
65
|
.gauge-value-number.perfect-score {
|
|
@@ -12,9 +12,14 @@
|
|
|
12
12
|
gap: 10px 18px;
|
|
13
13
|
}
|
|
14
14
|
|
|
15
|
+
#wcagCoverage .wcag-criteria-heading {
|
|
16
|
+
font-size: 1.25rem;
|
|
17
|
+
line-height: 1.2;
|
|
18
|
+
}
|
|
19
|
+
|
|
15
20
|
#wcagCoverage .wcag-grid a {
|
|
16
21
|
color: var(--a11y-majorelle-blue, #5735DF);
|
|
17
|
-
text-decoration:
|
|
22
|
+
text-decoration: underline;
|
|
18
23
|
}
|
|
19
24
|
#wcagCoverage .wcag-grid a:hover,
|
|
20
25
|
#wcagCoverage .wcag-grid a:focus-visible {
|
package/oobee-client-scanner.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* DO NOT EDIT MANUALLY. Re-generate with: node dist/generateOobeeClientScanner.js
|
|
4
4
|
*
|
|
5
5
|
* Embedded at generation time:
|
|
6
|
-
* App version : 0.10.93
|
|
6
|
+
* App version : 0.10.94
|
|
7
7
|
* Sentry DSN : (from OOBEE_SENTRY_DSN env var or constants.ts default)
|
|
8
8
|
* Sentry SDK : @sentry/browser 10.58.0 (loaded from CDN at runtime)
|
|
9
9
|
*
|
|
@@ -34883,7 +34883,7 @@
|
|
|
34883
34883
|
// ── Sentry browser telemetry (Sentry JS SDK, loaded from CDN) ────────────
|
|
34884
34884
|
|
|
34885
34885
|
var _oobeeSentryDsn = "https://3b8c7ee46b06f33815a1301b6713ebc3@o4509047624761344.ingest.us.sentry.io/4509327783559168";
|
|
34886
|
-
var _oobeeAppVersion = "0.10.93";
|
|
34886
|
+
var _oobeeAppVersion = "0.10.94";
|
|
34887
34887
|
var _oobeeSentryVersion = "10.58.0";
|
|
34888
34888
|
var _oobeeSentryInitialized = false;
|
|
34889
34889
|
var _oobeeSentryLoadPromise = null;
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -264,9 +264,10 @@ const scanInit = async (argvs: Answers): Promise<string> => {
|
|
|
264
264
|
consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
|
|
265
265
|
|
|
266
266
|
if (res.status === statuses.success.code) {
|
|
267
|
-
//
|
|
268
|
-
//
|
|
267
|
+
// Keep browser-resolved URL as entryUrl for downstream scan metadata/events
|
|
268
|
+
// on non-custom scans.
|
|
269
269
|
if (data.type !== ScannerTypes.CUSTOM) {
|
|
270
|
+
data.entryUrl = res.url;
|
|
270
271
|
data.url = res.url;
|
|
271
272
|
}
|
|
272
273
|
if (process.env.OOBEE_VALIDATE_URL) {
|
package/src/combine.ts
CHANGED
|
@@ -45,6 +45,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
|
45
45
|
const {
|
|
46
46
|
type,
|
|
47
47
|
url,
|
|
48
|
+
entryUrl,
|
|
48
49
|
nameEmail,
|
|
49
50
|
randomToken,
|
|
50
51
|
deviceChosen,
|
|
@@ -104,8 +105,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
|
104
105
|
|
|
105
106
|
// remove basic-auth credentials from URL
|
|
106
107
|
const finalUrl = !(type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE)
|
|
107
|
-
? new URL(
|
|
108
|
-
: new URL(pathToFileURL(
|
|
108
|
+
? new URL(entryUrl)
|
|
109
|
+
: new URL(pathToFileURL(entryUrl));
|
|
109
110
|
|
|
110
111
|
// Use the string version of finalUrl to reduce logic at submitForm
|
|
111
112
|
const finalUrlString = finalUrl.toString();
|
package/src/constants/common.ts
CHANGED
|
@@ -359,8 +359,11 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
359
359
|
}
|
|
360
360
|
}
|
|
361
361
|
|
|
362
|
-
// Ensure Accept header for non-html content fallback
|
|
363
|
-
extraHTTPHeaders
|
|
362
|
+
// Ensure Accept header for non-html content fallback — use a local copy to avoid
|
|
363
|
+
// mutating the caller's extraHTTPHeaders object (which is later checked by crawlers
|
|
364
|
+
// to decide whether to enable preNavigationHooks header rewriting).
|
|
365
|
+
const localHeaders = { ...extraHTTPHeaders };
|
|
366
|
+
localHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
|
|
364
367
|
|
|
365
368
|
await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
|
|
366
369
|
|
|
@@ -378,7 +381,7 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
378
381
|
|
|
379
382
|
const launchOptions = getPlaywrightLaunchOptions(browserToRun);
|
|
380
383
|
|
|
381
|
-
const { Authorization, ...nonAuthHeaders } =
|
|
384
|
+
const { Authorization, ...nonAuthHeaders } = localHeaders || {};
|
|
382
385
|
let httpCredentials = undefined;
|
|
383
386
|
if (Authorization?.startsWith('Basic ')) {
|
|
384
387
|
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
@@ -436,19 +439,21 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
436
439
|
// Only enable generic Authorization header routing interception broadly if
|
|
437
440
|
// a non-Basic Bearer auth string is heavily relied upon, thereby bypassing
|
|
438
441
|
// performance warnings inside the check checkUrl phase for typical public scans
|
|
439
|
-
if (
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
442
|
+
if (Object.keys(localHeaders).length > 0) {
|
|
443
|
+
if (Authorization && !httpCredentials) {
|
|
444
|
+
const entryOrigin = new URL(url).origin;
|
|
445
|
+
await browserContext.route('**/*', async (route: any, request: any) => {
|
|
446
|
+
try {
|
|
447
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
448
|
+
await route.continue({ headers: { ...request.headers(), Authorization } });
|
|
449
|
+
} else {
|
|
450
|
+
await route.continue();
|
|
451
|
+
}
|
|
452
|
+
} catch {
|
|
446
453
|
await route.continue();
|
|
447
454
|
}
|
|
448
|
-
}
|
|
449
|
-
|
|
450
|
-
}
|
|
451
|
-
});
|
|
455
|
+
});
|
|
456
|
+
}
|
|
452
457
|
}
|
|
453
458
|
|
|
454
459
|
const page = await browserContext.newPage();
|
|
@@ -569,7 +574,7 @@ export const isSitemapContent = (content: string) => {
|
|
|
569
574
|
}
|
|
570
575
|
|
|
571
576
|
const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
|
|
572
|
-
const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
|
|
577
|
+
const regexForXmlSitemap = new RegExp('<(?:urlset|sitemapindex|feed|rss)+?.*>', 'gmi');
|
|
573
578
|
if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
|
|
574
579
|
// is an XML sitemap wrapped in a HTML document
|
|
575
580
|
return true;
|
|
@@ -592,8 +597,22 @@ export const checkUrl = async (
|
|
|
592
597
|
extraHTTPHeaders: Record<string, string>,
|
|
593
598
|
fileTypes: FileTypes,
|
|
594
599
|
) => {
|
|
600
|
+
let urlToCheck = url;
|
|
601
|
+
|
|
602
|
+
if (scanner === ScannerTypes.LOCALFILE) {
|
|
603
|
+
if (!isFilePath(url)) {
|
|
604
|
+
const res = new RES();
|
|
605
|
+
res.status = constants.urlCheckStatuses.notALocalFile.code;
|
|
606
|
+
return res;
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
if (!url.toLowerCase().startsWith('file://')) {
|
|
610
|
+
urlToCheck = pathToFileURL(path.resolve(url)).toString();
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
|
|
595
614
|
const res = await checkUrlConnectivityWithBrowser(
|
|
596
|
-
|
|
615
|
+
urlToCheck,
|
|
597
616
|
browser,
|
|
598
617
|
clonedDataDir,
|
|
599
618
|
playwrightDeviceDetailsObject,
|
|
@@ -681,6 +700,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
681
700
|
ruleset,
|
|
682
701
|
generateJsonFiles,
|
|
683
702
|
scanDuration,
|
|
703
|
+
finalUrl,
|
|
684
704
|
} = argv;
|
|
685
705
|
|
|
686
706
|
const extraHTTPHeaders = parseHeaders(header);
|
|
@@ -714,6 +734,10 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
714
734
|
url = temp.toString();
|
|
715
735
|
}
|
|
716
736
|
|
|
737
|
+
// Keep browser-resolved URL (if provided by pre-check flow) as canonical entry URL.
|
|
738
|
+
// For local file paths, keep using the normalized `url` value below.
|
|
739
|
+
const resolvedEntryUrl = finalUrl && !isFilePath(finalUrl) ? finalUrl : url;
|
|
740
|
+
|
|
717
741
|
// construct filename for scan results
|
|
718
742
|
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
|
|
719
743
|
const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
|
|
@@ -758,7 +782,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
758
782
|
return {
|
|
759
783
|
type: scanner,
|
|
760
784
|
url,
|
|
761
|
-
entryUrl:
|
|
785
|
+
entryUrl: resolvedEntryUrl,
|
|
762
786
|
isHeadless: headless,
|
|
763
787
|
deviceChosen,
|
|
764
788
|
customDevice,
|
|
@@ -1009,6 +1033,8 @@ export const getLinksFromSitemap = async (
|
|
|
1009
1033
|
const scannedSitemaps = new Set<string>();
|
|
1010
1034
|
const sitemapLinkCounts: Record<string, number> = {};
|
|
1011
1035
|
const allUrls = new Set<string>(); // all discovered URLs (lightweight strings)
|
|
1036
|
+
const isImageSitemapUrl = (candidateUrl: string) =>
|
|
1037
|
+
/(^|\/)image-sitemap(?:-index)?(?:-\d+)?\.xml(?:$|[?#])/i.test(candidateUrl);
|
|
1012
1038
|
|
|
1013
1039
|
const addToUrlList = (url: string) => {
|
|
1014
1040
|
if (!url) return;
|
|
@@ -1092,6 +1118,11 @@ export const getLinksFromSitemap = async (
|
|
|
1092
1118
|
let data;
|
|
1093
1119
|
let sitemapType;
|
|
1094
1120
|
|
|
1121
|
+
if (isImageSitemapUrl(url)) {
|
|
1122
|
+
consoleLogger.info(`Skipping image sitemap: ${url}`);
|
|
1123
|
+
return;
|
|
1124
|
+
}
|
|
1125
|
+
|
|
1095
1126
|
if (scannedSitemaps.has(url)) {
|
|
1096
1127
|
// Skip processing if the sitemap has already been scanned
|
|
1097
1128
|
return;
|
|
@@ -1147,11 +1178,28 @@ export const getLinksFromSitemap = async (
|
|
|
1147
1178
|
|
|
1148
1179
|
const page = await browserContext.newPage();
|
|
1149
1180
|
|
|
1150
|
-
|
|
1181
|
+
// Use 'domcontentloaded' instead of 'networkidle' — sitemap XMLs with
|
|
1182
|
+
// XSL stylesheet references (e.g. <?xml-stylesheet ...?>) cause the browser
|
|
1183
|
+
// to fetch and apply the stylesheet, which may load additional resources
|
|
1184
|
+
// (fonts, CSS, images) that prevent 'networkidle' from ever being reached.
|
|
1185
|
+
const response = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
|
|
1186
|
+
|
|
1187
|
+
// Prefer the raw response body — this gives us the original XML before
|
|
1188
|
+
// the browser applies any XSL transformation (which would turn the XML
|
|
1189
|
+
// into rendered HTML, losing the sitemap structure).
|
|
1190
|
+
if (response) {
|
|
1191
|
+
try {
|
|
1192
|
+
data = await response.text();
|
|
1193
|
+
} catch {
|
|
1194
|
+
// response.text() can fail if the body was already consumed or
|
|
1195
|
+
// if a redirect occurred; fall through to DOM extraction below.
|
|
1196
|
+
}
|
|
1197
|
+
}
|
|
1151
1198
|
|
|
1152
|
-
if (
|
|
1153
|
-
|
|
1154
|
-
|
|
1199
|
+
if (!data) {
|
|
1200
|
+
if ((await page.locator('body').count()) > 0) {
|
|
1201
|
+
data = await page.locator('body').innerText();
|
|
1202
|
+
} else {
|
|
1155
1203
|
const urlSet = page.locator('urlset');
|
|
1156
1204
|
const sitemapIndex = page.locator('sitemapindex');
|
|
1157
1205
|
const rss = page.locator('rss');
|
|
@@ -1166,6 +1214,7 @@ export const getLinksFromSitemap = async (
|
|
|
1166
1214
|
data = await rss.evaluate(elem => elem.outerHTML);
|
|
1167
1215
|
} else if (await isRoot(feed)) {
|
|
1168
1216
|
data = await feed.evaluate(elem => elem.outerHTML);
|
|
1217
|
+
}
|
|
1169
1218
|
}
|
|
1170
1219
|
}
|
|
1171
1220
|
} finally {
|
|
@@ -1189,39 +1238,65 @@ export const getLinksFromSitemap = async (
|
|
|
1189
1238
|
}
|
|
1190
1239
|
|
|
1191
1240
|
const $ = cheerio.load(data, { xml: true });
|
|
1241
|
+
const countBefore = allUrls.size;
|
|
1192
1242
|
|
|
1193
1243
|
// This case is when the document is not an XML format document
|
|
1194
1244
|
if ($(':root').length === 0) {
|
|
1195
1245
|
processNonStandardSitemap(data);
|
|
1246
|
+
|
|
1247
|
+
const linksFromThisSitemap = allUrls.size - countBefore;
|
|
1248
|
+
if (linksFromThisSitemap > 0) {
|
|
1249
|
+
sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
|
|
1250
|
+
}
|
|
1196
1251
|
return;
|
|
1197
1252
|
}
|
|
1198
1253
|
|
|
1199
1254
|
// Root element
|
|
1200
1255
|
const root = $(':root')[0];
|
|
1256
|
+
const hasImageNamespace = Object.values(root?.attribs ?? {}).some(
|
|
1257
|
+
attribVal => typeof attribVal === 'string' && attribVal.toLowerCase().includes('sitemap-image'),
|
|
1258
|
+
);
|
|
1201
1259
|
|
|
1202
|
-
|
|
1260
|
+
if (hasImageNamespace) {
|
|
1261
|
+
consoleLogger.info(`Skipping image sitemap: ${url}`);
|
|
1262
|
+
return;
|
|
1263
|
+
}
|
|
1264
|
+
|
|
1265
|
+
const rootName = root?.name?.toLowerCase().split(':').pop() ?? '';
|
|
1266
|
+
const hasXmlSitemapIndexTag = /<\s*(?:[a-z0-9_-]+:)?sitemapindex\b/i.test(data);
|
|
1267
|
+
const hasXmlUrlsetTag = /<\s*(?:[a-z0-9_-]+:)?urlset\b/i.test(data);
|
|
1203
1268
|
|
|
1204
|
-
|
|
1205
|
-
if (root.name === 'urlset' && xmlns.includes(xmlFormatNamespace)) {
|
|
1269
|
+
if (rootName === 'urlset') {
|
|
1206
1270
|
sitemapType = constants.xmlSitemapTypes.xml;
|
|
1207
|
-
} else if (
|
|
1271
|
+
} else if (rootName === 'sitemapindex') {
|
|
1208
1272
|
sitemapType = constants.xmlSitemapTypes.xmlIndex;
|
|
1209
|
-
} else if (
|
|
1273
|
+
} else if (rootName === 'rss') {
|
|
1210
1274
|
sitemapType = constants.xmlSitemapTypes.rss;
|
|
1211
|
-
} else if (
|
|
1275
|
+
} else if (rootName === 'feed') {
|
|
1212
1276
|
sitemapType = constants.xmlSitemapTypes.atom;
|
|
1277
|
+
} else if (hasXmlSitemapIndexTag) {
|
|
1278
|
+
sitemapType = constants.xmlSitemapTypes.xmlIndex;
|
|
1279
|
+
} else if (hasXmlUrlsetTag) {
|
|
1280
|
+
sitemapType = constants.xmlSitemapTypes.xml;
|
|
1213
1281
|
} else {
|
|
1214
1282
|
sitemapType = constants.xmlSitemapTypes.unknown;
|
|
1215
1283
|
}
|
|
1216
1284
|
|
|
1217
|
-
const countBefore = allUrls.size;
|
|
1218
|
-
|
|
1219
1285
|
switch (sitemapType) {
|
|
1220
1286
|
case constants.xmlSitemapTypes.xmlIndex:
|
|
1221
|
-
consoleLogger.info(`This is a XML format sitemap index
|
|
1287
|
+
consoleLogger.info(`This is a XML format sitemap index: ${url}`);
|
|
1222
1288
|
for (const childSitemapUrl of $('loc')) {
|
|
1223
|
-
const childSitemapUrlText = $(childSitemapUrl).text();
|
|
1224
|
-
if (childSitemapUrlText
|
|
1289
|
+
const childSitemapUrlText = $(childSitemapUrl).text().trim();
|
|
1290
|
+
if (!childSitemapUrlText) {
|
|
1291
|
+
continue;
|
|
1292
|
+
}
|
|
1293
|
+
|
|
1294
|
+
const childSitemapPath = childSitemapUrlText.split(/[?#]/)[0].toLowerCase();
|
|
1295
|
+
if (childSitemapPath.endsWith('.xml') || childSitemapPath.endsWith('.txt')) {
|
|
1296
|
+
if (isImageSitemapUrl(childSitemapUrlText)) {
|
|
1297
|
+
consoleLogger.info(`Skipping image sitemap: ${childSitemapUrlText}`);
|
|
1298
|
+
continue;
|
|
1299
|
+
}
|
|
1225
1300
|
await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
|
|
1226
1301
|
} else {
|
|
1227
1302
|
addToUrlList(childSitemapUrlText); // Add regular URLs to the list
|
|
@@ -1229,19 +1304,19 @@ export const getLinksFromSitemap = async (
|
|
|
1229
1304
|
}
|
|
1230
1305
|
break;
|
|
1231
1306
|
case constants.xmlSitemapTypes.xml:
|
|
1232
|
-
consoleLogger.info(`This is a XML format sitemap
|
|
1307
|
+
consoleLogger.info(`This is a XML format sitemap: ${url}`);
|
|
1233
1308
|
await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
|
|
1234
1309
|
break;
|
|
1235
1310
|
case constants.xmlSitemapTypes.rss:
|
|
1236
|
-
consoleLogger.info(`This is a RSS format sitemap
|
|
1311
|
+
consoleLogger.info(`This is a RSS format sitemap: ${url}`);
|
|
1237
1312
|
await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
|
|
1238
1313
|
break;
|
|
1239
1314
|
case constants.xmlSitemapTypes.atom:
|
|
1240
|
-
consoleLogger.info(`This is a Atom format sitemap
|
|
1315
|
+
consoleLogger.info(`This is a Atom format sitemap: ${url}`);
|
|
1241
1316
|
await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
|
|
1242
1317
|
break;
|
|
1243
1318
|
default:
|
|
1244
|
-
consoleLogger.info(`This is an unrecognised XML sitemap format
|
|
1319
|
+
consoleLogger.info(`This is an unrecognised XML sitemap format: ${url}`);
|
|
1245
1320
|
processNonStandardSitemap(data);
|
|
1246
1321
|
}
|
|
1247
1322
|
|
|
@@ -2191,6 +2266,7 @@ export const isFilePath = (url: string): boolean => {
|
|
|
2191
2266
|
const driveLetterPattern = /^[A-Z]:/i;
|
|
2192
2267
|
const backslashPattern = /\\/;
|
|
2193
2268
|
return (
|
|
2269
|
+
url.toLowerCase().startsWith('file://') ||
|
|
2194
2270
|
url.startsWith('/') ||
|
|
2195
2271
|
driveLetterPattern.test(url) ||
|
|
2196
2272
|
backslashPattern.test(url) ||
|
|
@@ -1145,10 +1145,19 @@ export const createCrawleeSubFolders = async (
|
|
|
1145
1145
|
export const preNavigationHooks = (extraHTTPHeaders: Record<string, string>) => {
|
|
1146
1146
|
return [
|
|
1147
1147
|
async (crawlingContext: CrawlingContext, gotoOptions: PlaywrightGotoOptions) => {
|
|
1148
|
-
if (extraHTTPHeaders) {
|
|
1148
|
+
if (extraHTTPHeaders && Object.keys(extraHTTPHeaders).length > 0) {
|
|
1149
1149
|
crawlingContext.request.headers = extraHTTPHeaders;
|
|
1150
1150
|
}
|
|
1151
|
-
|
|
1151
|
+
// Use domcontentloaded — fires as soon as the DOM is parsed, before
|
|
1152
|
+
// images/stylesheets/network requests settle. This avoids indefinite
|
|
1153
|
+
// hangs on sites with WebSockets, analytics polling, or infinite-scroll
|
|
1154
|
+
// beacons that never reach networkidle. Further page stability is
|
|
1155
|
+
// handled by waitForPageLoaded() in each crawler's requestHandler and
|
|
1156
|
+
// by the DOM mutation observer in postNavigationHooks.
|
|
1157
|
+
if (gotoOptions) {
|
|
1158
|
+
gotoOptions.waitUntil = 'domcontentloaded';
|
|
1159
|
+
gotoOptions.timeout = 30000;
|
|
1160
|
+
}
|
|
1152
1161
|
},
|
|
1153
1162
|
];
|
|
1154
1163
|
};
|
|
@@ -5,6 +5,7 @@ import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
|
|
|
5
5
|
import {
|
|
6
6
|
createCrawleeSubFolders,
|
|
7
7
|
getPreLaunchHook,
|
|
8
|
+
preNavigationHooks,
|
|
8
9
|
runAxeScript,
|
|
9
10
|
isUrlPdf,
|
|
10
11
|
shouldSkipClickDueToDisallowedHref,
|
|
@@ -414,12 +415,10 @@ const crawlDomain = async ({
|
|
|
414
415
|
],
|
|
415
416
|
},
|
|
416
417
|
requestQueue,
|
|
418
|
+
maxRequestRetries: 3,
|
|
419
|
+
maxSessionRotations: 1,
|
|
417
420
|
preNavigationHooks: [
|
|
418
|
-
|
|
419
|
-
if (extraHTTPHeaders) {
|
|
420
|
-
crawlingContext.request.headers = extraHTTPHeaders;
|
|
421
|
-
}
|
|
422
|
-
},
|
|
421
|
+
...preNavigationHooks(extraHTTPHeaders),
|
|
423
422
|
],
|
|
424
423
|
postNavigationHooks: [
|
|
425
424
|
async crawlingContext => {
|
|
@@ -7,6 +7,7 @@ import {
|
|
|
7
7
|
preNavigationHooks,
|
|
8
8
|
runAxeScript,
|
|
9
9
|
isUrlPdf,
|
|
10
|
+
splitAuthHeaders,
|
|
10
11
|
} from './commonCrawlerFunc.js';
|
|
11
12
|
|
|
12
13
|
import constants, {
|
|
@@ -85,6 +86,7 @@ const crawlSitemap = async ({
|
|
|
85
86
|
maxRequestsPerCrawl,
|
|
86
87
|
specifiedMaxConcurrency || constants.maxConcurrency,
|
|
87
88
|
);
|
|
89
|
+
const initialNoSuccessFailureAbortThreshold = Math.max(5, Math.min(maxRequestsPerCrawl, 25));
|
|
88
90
|
|
|
89
91
|
if (fromCrawlIntelligentSitemap) {
|
|
90
92
|
dataset = datasetFromIntelligent;
|
|
@@ -119,6 +121,7 @@ const crawlSitemap = async ({
|
|
|
119
121
|
const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
|
|
120
122
|
const { playwrightDeviceDetailsObject } = viewportSettings;
|
|
121
123
|
const { maxConcurrency } = constants;
|
|
124
|
+
const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
122
125
|
|
|
123
126
|
const requestList = await RequestList.open({
|
|
124
127
|
sources: linksFromSitemap,
|
|
@@ -142,11 +145,15 @@ const crawlSitemap = async ({
|
|
|
142
145
|
...playwrightDeviceDetailsObject,
|
|
143
146
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
144
147
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
148
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
149
|
+
...(httpCredentials && { httpCredentials }),
|
|
145
150
|
};
|
|
146
151
|
},
|
|
147
152
|
],
|
|
148
153
|
},
|
|
149
154
|
requestList,
|
|
155
|
+
maxRequestRetries: 3,
|
|
156
|
+
maxSessionRotations: 1,
|
|
150
157
|
postNavigationHooks: [
|
|
151
158
|
async ({ page }) => {
|
|
152
159
|
try {
|
|
@@ -197,6 +204,7 @@ const crawlSitemap = async ({
|
|
|
197
204
|
},
|
|
198
205
|
],
|
|
199
206
|
preNavigationHooks: [
|
|
207
|
+
...preNavigationHooks(extraHTTPHeaders),
|
|
200
208
|
async ({ request, page }, gotoOptions) => {
|
|
201
209
|
const url = request.url.toLowerCase();
|
|
202
210
|
|
|
@@ -213,8 +221,6 @@ const crawlSitemap = async ({
|
|
|
213
221
|
|
|
214
222
|
return;
|
|
215
223
|
}
|
|
216
|
-
|
|
217
|
-
preNavigationHooks(extraHTTPHeaders);
|
|
218
224
|
},
|
|
219
225
|
],
|
|
220
226
|
requestHandlerTimeoutSecs: 90,
|
|
@@ -449,6 +455,17 @@ const crawlSitemap = async ({
|
|
|
449
455
|
httpStatusCode: typeof status === 'number' ? status : 0,
|
|
450
456
|
});
|
|
451
457
|
crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|
|
458
|
+
|
|
459
|
+
if (
|
|
460
|
+
urlsCrawled.scanned.length === 0 &&
|
|
461
|
+
urlsCrawled.error.length >= initialNoSuccessFailureAbortThreshold
|
|
462
|
+
) {
|
|
463
|
+
consoleLogger.info(
|
|
464
|
+
`Aborting sitemap crawl: ${urlsCrawled.error.length} failed pages with 0 successful scans.`,
|
|
465
|
+
);
|
|
466
|
+
isAbortingScan = true;
|
|
467
|
+
crawler.autoscaledPool?.abort();
|
|
468
|
+
}
|
|
452
469
|
},
|
|
453
470
|
maxRequestsPerCrawl: Infinity,
|
|
454
471
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
@@ -1228,19 +1228,32 @@ export const initNewPage = async (page, pageClosePromises, processPageParams, pa
|
|
|
1228
1228
|
const allowed = isOverlayAllowed(page.url(), processPageParams.entryUrl);
|
|
1229
1229
|
|
|
1230
1230
|
if (!allowed) {
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1231
|
+
// On macOS and Windows the custom flow always runs headful.
|
|
1232
|
+
// The URL guard (urlGuard.ts) intercepts non-http/https navigations
|
|
1233
|
+
// and calls page.goto(safeUrl). Do NOT remove the overlay here —
|
|
1234
|
+
// removing it causes it to stay permanently disabled if the redirect
|
|
1235
|
+
// races ahead of the next reconcile cycle.
|
|
1236
|
+
// Instead, fall through to the hasOverlay / addOverlayMenu block so
|
|
1237
|
+
// the overlay is (re-)injected even on transient non-http/https URLs
|
|
1238
|
+
// (e.g. file://, about:blank) and again after the guard's redirect.
|
|
1239
|
+
const isDesktopHost = process.platform === 'darwin' || process.platform === 'win32';
|
|
1240
|
+
if (!isDesktopHost) {
|
|
1241
|
+
// On Linux / Docker: remove overlay for non-http/https URLs and stop.
|
|
1242
|
+
await Promise.race([
|
|
1243
|
+
removeOverlayMenu(page),
|
|
1244
|
+
new Promise((_, reject) => {
|
|
1245
|
+
setTimeout(() => {
|
|
1246
|
+
reject(
|
|
1247
|
+
new Error(
|
|
1248
|
+
`removeOverlayMenu timed out after ${OVERLAY_OPERATION_TIMEOUT_MS}ms`,
|
|
1249
|
+
),
|
|
1250
|
+
);
|
|
1251
|
+
}, OVERLAY_OPERATION_TIMEOUT_MS);
|
|
1252
|
+
}),
|
|
1253
|
+
]);
|
|
1254
|
+
return;
|
|
1255
|
+
}
|
|
1256
|
+
// Desktop hosts: skip removal and fall through to re-add overlay.
|
|
1244
1257
|
}
|
|
1245
1258
|
|
|
1246
1259
|
const hasOverlay = await page.evaluate(() =>
|
|
@@ -35,8 +35,18 @@ export function addUrlGuardScript(context, opts = {}) {
|
|
|
35
35
|
});
|
|
36
36
|
|
|
37
37
|
const restoreToSafeUrl = async (page, attemptedUrl) => {
|
|
38
|
+
const safeUrl = lastAllowedUrlByPage.get(page) || fallbackUrl || 'about:blank';
|
|
39
|
+
// Only redirect if the safe URL is itself an allowed (http/https) URL.
|
|
40
|
+
// If the entry URL is file:// (e.g. scanning a local HTML file), the
|
|
41
|
+
// fallback is also file://, and redirecting would create an infinite loop:
|
|
42
|
+
// file:// → restoreToSafeUrl → file:// → framenavigated → restoreToSafeUrl → …
|
|
43
|
+
try {
|
|
44
|
+
const safeObj = new URL(safeUrl);
|
|
45
|
+
if (!ALLOWED_PROTOCOLS.has(safeObj.protocol)) return;
|
|
46
|
+
} catch {
|
|
47
|
+
return;
|
|
48
|
+
}
|
|
38
49
|
try {
|
|
39
|
-
const safeUrl = lastAllowedUrlByPage.get(page) || fallbackUrl || 'about:blank';
|
|
40
50
|
await page.goto(safeUrl, { waitUntil: 'domcontentloaded' });
|
|
41
51
|
} catch {
|
|
42
52
|
// page might be closing; ignore
|
|
@@ -58,6 +68,13 @@ export function addUrlGuardScript(context, opts = {}) {
|
|
|
58
68
|
lastAllowedUrlByPage.set(page, urlObj.toString());
|
|
59
69
|
return;
|
|
60
70
|
}
|
|
71
|
+
|
|
72
|
+
// Skip browser-internal transitional states (about:blank, about:srcdoc, etc.).
|
|
73
|
+
// page.goto() navigates through about:blank before loading the target URL.
|
|
74
|
+
// Redirecting from about: creates an infinite loop:
|
|
75
|
+
// restoreToSafeUrl → page.goto(safeUrl) → about:blank → restoreToSafeUrl → …
|
|
76
|
+
if (urlObj.protocol === 'about:') return;
|
|
77
|
+
|
|
61
78
|
await restoreToSafeUrl(page, urlStr);
|
|
62
79
|
});
|
|
63
80
|
};
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
<button
|
|
8
8
|
type="button"
|
|
9
9
|
class="category-tooltip-icon"
|
|
10
|
+
aria-label="About Must Fix category"
|
|
10
11
|
aria-describedby="mustFixTooltip"
|
|
11
12
|
>
|
|
12
13
|
<svg xmlns="http://www.w3.org/2000/svg" width="14" height="14"
|
|
@@ -34,6 +35,7 @@
|
|
|
34
35
|
<button
|
|
35
36
|
type="button"
|
|
36
37
|
class="category-tooltip-icon"
|
|
38
|
+
aria-label="About Good to Fix category"
|
|
37
39
|
aria-describedby="goodToFixTooltip"
|
|
38
40
|
>
|
|
39
41
|
<svg xmlns="http://www.w3.org/2000/svg" width="14" height="14"
|
|
@@ -61,6 +63,7 @@
|
|
|
61
63
|
<button
|
|
62
64
|
type="button"
|
|
63
65
|
class="category-tooltip-icon"
|
|
66
|
+
aria-label="About Manual Test category"
|
|
64
67
|
aria-describedby="manualTestTooltip"
|
|
65
68
|
>
|
|
66
69
|
<svg xmlns="http://www.w3.org/2000/svg" width="14" height="14"
|
|
@@ -2,21 +2,21 @@
|
|
|
2
2
|
<table class="issues-table" id="issuesTable">
|
|
3
3
|
<thead>
|
|
4
4
|
<tr>
|
|
5
|
-
<th class="sortable"
|
|
5
|
+
<th class="sortable" tabindex="0" aria-sort="none" style="width: 15%;">
|
|
6
6
|
<span>Severity</span>
|
|
7
7
|
<svg class="sort-icon" width="24" height="24" viewBox="0 0 24 24" fill="none" aria-hidden="true">
|
|
8
8
|
<path d="M7 9L12 4L17 9H7Z" fill="currentColor" opacity="1" />
|
|
9
9
|
<path d="M7 15L12 20L17 15H7Z" fill="currentColor" opacity="0.3" />
|
|
10
10
|
</svg>
|
|
11
11
|
</th>
|
|
12
|
-
<th class="sortable"
|
|
12
|
+
<th class="sortable" tabindex="0" aria-sort="none">
|
|
13
13
|
<span>Issue Name</span>
|
|
14
14
|
<svg class="sort-icon" width="24" height="24" viewBox="0 0 24 24" fill="none" aria-hidden="true">
|
|
15
15
|
<path d="M7 9L12 4L17 9H7Z" fill="currentColor" opacity="0.3" />
|
|
16
16
|
<path d="M7 15L12 20L17 15H7Z" fill="currentColor" opacity="1" />
|
|
17
17
|
</svg>
|
|
18
18
|
</th>
|
|
19
|
-
<th class="sortable"
|
|
19
|
+
<th class="sortable" tabindex="0" aria-sort="descending" style="width: 15%;">
|
|
20
20
|
<span>Occurrence</span>
|
|
21
21
|
<svg class="sort-icon" width="24" height="24" viewBox="0 0 24 24" fill="none" aria-hidden="true">
|
|
22
22
|
<path d="M7 9L12 4L17 9H7Z" fill="currentColor" opacity="0.3" />
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
<div id="aboutScanModal" class="modal fade" tabindex="-1" aria-
|
|
1
|
+
<div id="aboutScanModal" class="modal fade" tabindex="-1" aria-label="About this scan" aria-hidden="true">
|
|
2
2
|
<div class="modal-dialog modal-dialog-centered">
|
|
3
3
|
<div class="modal-content">
|
|
4
4
|
<div class="modal-header">
|