@govtechsg/oobee 0.10.92 → 0.10.94
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +34 -0
- package/README.md +19 -0
- package/dist/cli.js +3 -2
- package/dist/combine.js +4 -4
- package/dist/constants/common.js +136 -49
- package/dist/crawlers/commonCrawlerFunc.js +54 -2
- package/dist/crawlers/crawlDomain.js +9 -2
- package/dist/crawlers/crawlIntelligentSitemap.js +9 -4
- package/dist/crawlers/crawlSitemap.js +14 -2
- package/dist/crawlers/custom/utils.js +22 -9
- package/dist/crawlers/guards/urlGuard.js +19 -1
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/generateOobeeClientScanner.js +1 -1
- package/dist/mergeAxeResults/itemsStore.js +32 -3
- package/dist/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
- package/dist/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
- package/dist/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
- package/dist/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
- package/dist/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
- package/dist/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
- package/dist/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
- package/dist/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
- package/dist/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
- package/dist/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
- package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
- package/dist/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
- package/dist/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
- package/dist/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
- package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
- package/dist/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
- package/dist/static/ejs/partials/styles/styles.ejs +1 -1
- package/dist/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
- package/dist/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
- package/dist/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
- package/oobee-client-scanner.js +4 -4
- package/package.json +2 -2
- package/src/cli.ts +3 -2
- package/src/combine.ts +4 -2
- package/src/constants/common.ts +131 -35
- package/src/crawlers/commonCrawlerFunc.ts +56 -2
- package/src/crawlers/crawlDomain.ts +11 -1
- package/src/crawlers/crawlIntelligentSitemap.ts +10 -4
- package/src/crawlers/crawlSitemap.ts +19 -2
- package/src/crawlers/custom/utils.ts +26 -13
- package/src/crawlers/guards/urlGuard.ts +18 -1
- package/src/crawlers/runCustom.ts +10 -1
- package/src/generateOobeeClientScanner.ts +1 -1
- package/src/mergeAxeResults/itemsStore.ts +37 -3
- package/src/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
- package/src/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
- package/src/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
- package/src/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
- package/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
- package/src/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
- package/src/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
- package/src/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
- package/src/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
- package/src/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
- package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
- package/src/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
- package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
- package/src/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
- package/src/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
- package/src/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
- package/src/static/ejs/partials/styles/styles.ejs +1 -1
- package/src/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
- package/src/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
- package/src/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
- package/testStaticJSScanner.html +1 -1
- /package/{d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt → 67e8137b-1939-4253-8f11-a82bc833cfcb.txt} +0 -0
package/src/constants/common.ts
CHANGED
|
@@ -359,8 +359,11 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
359
359
|
}
|
|
360
360
|
}
|
|
361
361
|
|
|
362
|
-
// Ensure Accept header for non-html content fallback
|
|
363
|
-
extraHTTPHeaders
|
|
362
|
+
// Ensure Accept header for non-html content fallback — use a local copy to avoid
|
|
363
|
+
// mutating the caller's extraHTTPHeaders object (which is later checked by crawlers
|
|
364
|
+
// to decide whether to enable preNavigationHooks header rewriting).
|
|
365
|
+
const localHeaders = { ...extraHTTPHeaders };
|
|
366
|
+
localHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
|
|
364
367
|
|
|
365
368
|
await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
|
|
366
369
|
|
|
@@ -377,9 +380,21 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
377
380
|
} = rawDevice;
|
|
378
381
|
|
|
379
382
|
const launchOptions = getPlaywrightLaunchOptions(browserToRun);
|
|
383
|
+
|
|
384
|
+
const { Authorization, ...nonAuthHeaders } = localHeaders || {};
|
|
385
|
+
let httpCredentials = undefined;
|
|
386
|
+
if (Authorization?.startsWith('Basic ')) {
|
|
387
|
+
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
388
|
+
const colonIdx = decoded.indexOf(':');
|
|
389
|
+
if (colonIdx > 0) {
|
|
390
|
+
httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
|
|
380
394
|
const contextOptions: Record<string, unknown> = {
|
|
381
395
|
...restDevice,
|
|
382
|
-
...(
|
|
396
|
+
...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
|
|
397
|
+
...(httpCredentials && { httpCredentials }),
|
|
383
398
|
ignoreHTTPSErrors: true,
|
|
384
399
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
385
400
|
};
|
|
@@ -421,6 +436,26 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
421
436
|
}
|
|
422
437
|
|
|
423
438
|
try {
|
|
439
|
+
// Only enable generic Authorization header route interception if a non-Basic
|
|
440
|
+
// (e.g. Bearer) Authorization value is supplied, thereby avoiding performance
|
|
441
|
+
// warnings inside the checkUrl phase for typical public scans.
|
|
442
|
+
if (Object.keys(localHeaders).length > 0) {
|
|
443
|
+
if (Authorization && !httpCredentials) {
|
|
444
|
+
const entryOrigin = new URL(url).origin;
|
|
445
|
+
await browserContext.route('**/*', async (route: any, request: any) => {
|
|
446
|
+
try {
|
|
447
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
448
|
+
await route.continue({ headers: { ...request.headers(), Authorization } });
|
|
449
|
+
} else {
|
|
450
|
+
await route.continue();
|
|
451
|
+
}
|
|
452
|
+
} catch {
|
|
453
|
+
await route.continue();
|
|
454
|
+
}
|
|
455
|
+
});
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
|
|
424
459
|
const page = await browserContext.newPage();
|
|
425
460
|
|
|
426
461
|
// Block native Chrome download UI
|
|
@@ -431,16 +466,6 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
431
466
|
consoleLogger.info(`Unable to set download deny: ${(e as Error).message}`);
|
|
432
467
|
}
|
|
433
468
|
|
|
434
|
-
// OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
|
|
435
|
-
// This allows the "Connectivity Check" to pass as soon as HTML is ready
|
|
436
|
-
await page.route('**/*', (route) => {
|
|
437
|
-
const type = route.request().resourceType();
|
|
438
|
-
if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
|
|
439
|
-
return route.abort();
|
|
440
|
-
}
|
|
441
|
-
return route.continue();
|
|
442
|
-
});
|
|
443
|
-
|
|
444
469
|
// STEP 2: Navigate (follows server-side redirects)
|
|
445
470
|
page.once('download', () => {
|
|
446
471
|
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
@@ -549,7 +574,7 @@ export const isSitemapContent = (content: string) => {
|
|
|
549
574
|
}
|
|
550
575
|
|
|
551
576
|
const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
|
|
552
|
-
const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
|
|
577
|
+
const regexForXmlSitemap = new RegExp('<(?:urlset|sitemapindex|feed|rss)+?.*>', 'gmi');
|
|
553
578
|
if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
|
|
554
579
|
// is an XML sitemap wrapped in a HTML document
|
|
555
580
|
return true;
|
|
@@ -572,8 +597,22 @@ export const checkUrl = async (
|
|
|
572
597
|
extraHTTPHeaders: Record<string, string>,
|
|
573
598
|
fileTypes: FileTypes,
|
|
574
599
|
) => {
|
|
600
|
+
let urlToCheck = url;
|
|
601
|
+
|
|
602
|
+
if (scanner === ScannerTypes.LOCALFILE) {
|
|
603
|
+
if (!isFilePath(url)) {
|
|
604
|
+
const res = new RES();
|
|
605
|
+
res.status = constants.urlCheckStatuses.notALocalFile.code;
|
|
606
|
+
return res;
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
if (!url.toLowerCase().startsWith('file://')) {
|
|
610
|
+
urlToCheck = pathToFileURL(path.resolve(url)).toString();
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
|
|
575
614
|
const res = await checkUrlConnectivityWithBrowser(
|
|
576
|
-
|
|
615
|
+
urlToCheck,
|
|
577
616
|
browser,
|
|
578
617
|
clonedDataDir,
|
|
579
618
|
playwrightDeviceDetailsObject,
|
|
@@ -661,6 +700,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
661
700
|
ruleset,
|
|
662
701
|
generateJsonFiles,
|
|
663
702
|
scanDuration,
|
|
703
|
+
finalUrl,
|
|
664
704
|
} = argv;
|
|
665
705
|
|
|
666
706
|
const extraHTTPHeaders = parseHeaders(header);
|
|
@@ -694,6 +734,10 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
694
734
|
url = temp.toString();
|
|
695
735
|
}
|
|
696
736
|
|
|
737
|
+
// Keep browser-resolved URL (if provided by pre-check flow) as canonical entry URL.
|
|
738
|
+
// For local file paths, keep using the normalized `url` value below.
|
|
739
|
+
const resolvedEntryUrl = finalUrl && !isFilePath(finalUrl) ? finalUrl : url;
|
|
740
|
+
|
|
697
741
|
// construct filename for scan results
|
|
698
742
|
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
|
|
699
743
|
const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
|
|
@@ -738,7 +782,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
738
782
|
return {
|
|
739
783
|
type: scanner,
|
|
740
784
|
url,
|
|
741
|
-
entryUrl:
|
|
785
|
+
entryUrl: resolvedEntryUrl,
|
|
742
786
|
isHeadless: headless,
|
|
743
787
|
deviceChosen,
|
|
744
788
|
customDevice,
|
|
@@ -989,6 +1033,8 @@ export const getLinksFromSitemap = async (
|
|
|
989
1033
|
const scannedSitemaps = new Set<string>();
|
|
990
1034
|
const sitemapLinkCounts: Record<string, number> = {};
|
|
991
1035
|
const allUrls = new Set<string>(); // all discovered URLs (lightweight strings)
|
|
1036
|
+
const isImageSitemapUrl = (candidateUrl: string) =>
|
|
1037
|
+
/(^|\/)image-sitemap(?:-index)?(?:-\d+)?\.xml(?:$|[?#])/i.test(candidateUrl);
|
|
992
1038
|
|
|
993
1039
|
const addToUrlList = (url: string) => {
|
|
994
1040
|
if (!url) return;
|
|
@@ -1072,6 +1118,11 @@ export const getLinksFromSitemap = async (
|
|
|
1072
1118
|
let data;
|
|
1073
1119
|
let sitemapType;
|
|
1074
1120
|
|
|
1121
|
+
if (isImageSitemapUrl(url)) {
|
|
1122
|
+
consoleLogger.info(`Skipping image sitemap: ${url}`);
|
|
1123
|
+
return;
|
|
1124
|
+
}
|
|
1125
|
+
|
|
1075
1126
|
if (scannedSitemaps.has(url)) {
|
|
1076
1127
|
// Skip processing if the sitemap has already been scanned
|
|
1077
1128
|
return;
|
|
@@ -1127,11 +1178,28 @@ export const getLinksFromSitemap = async (
|
|
|
1127
1178
|
|
|
1128
1179
|
const page = await browserContext.newPage();
|
|
1129
1180
|
|
|
1130
|
-
|
|
1181
|
+
// Use 'domcontentloaded' instead of 'networkidle' — sitemap XMLs with
|
|
1182
|
+
// XSL stylesheet references (e.g. <?xml-stylesheet ...?>) cause the browser
|
|
1183
|
+
// to fetch and apply the stylesheet, which may load additional resources
|
|
1184
|
+
// (fonts, CSS, images) that prevent 'networkidle' from ever being reached.
|
|
1185
|
+
const response = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
|
|
1186
|
+
|
|
1187
|
+
// Prefer the raw response body — this gives us the original XML before
|
|
1188
|
+
// the browser applies any XSL transformation (which would turn the XML
|
|
1189
|
+
// into rendered HTML, losing the sitemap structure).
|
|
1190
|
+
if (response) {
|
|
1191
|
+
try {
|
|
1192
|
+
data = await response.text();
|
|
1193
|
+
} catch {
|
|
1194
|
+
// response.text() can fail if the body was already consumed or
|
|
1195
|
+
// if a redirect occurred; fall through to DOM extraction below.
|
|
1196
|
+
}
|
|
1197
|
+
}
|
|
1131
1198
|
|
|
1132
|
-
if (
|
|
1133
|
-
|
|
1134
|
-
|
|
1199
|
+
if (!data) {
|
|
1200
|
+
if ((await page.locator('body').count()) > 0) {
|
|
1201
|
+
data = await page.locator('body').innerText();
|
|
1202
|
+
} else {
|
|
1135
1203
|
const urlSet = page.locator('urlset');
|
|
1136
1204
|
const sitemapIndex = page.locator('sitemapindex');
|
|
1137
1205
|
const rss = page.locator('rss');
|
|
@@ -1146,6 +1214,7 @@ export const getLinksFromSitemap = async (
|
|
|
1146
1214
|
data = await rss.evaluate(elem => elem.outerHTML);
|
|
1147
1215
|
} else if (await isRoot(feed)) {
|
|
1148
1216
|
data = await feed.evaluate(elem => elem.outerHTML);
|
|
1217
|
+
}
|
|
1149
1218
|
}
|
|
1150
1219
|
}
|
|
1151
1220
|
} finally {
|
|
@@ -1169,39 +1238,65 @@ export const getLinksFromSitemap = async (
|
|
|
1169
1238
|
}
|
|
1170
1239
|
|
|
1171
1240
|
const $ = cheerio.load(data, { xml: true });
|
|
1241
|
+
const countBefore = allUrls.size;
|
|
1172
1242
|
|
|
1173
1243
|
// This case is when the document is not an XML format document
|
|
1174
1244
|
if ($(':root').length === 0) {
|
|
1175
1245
|
processNonStandardSitemap(data);
|
|
1246
|
+
|
|
1247
|
+
const linksFromThisSitemap = allUrls.size - countBefore;
|
|
1248
|
+
if (linksFromThisSitemap > 0) {
|
|
1249
|
+
sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
|
|
1250
|
+
}
|
|
1176
1251
|
return;
|
|
1177
1252
|
}
|
|
1178
1253
|
|
|
1179
1254
|
// Root element
|
|
1180
1255
|
const root = $(':root')[0];
|
|
1256
|
+
const hasImageNamespace = Object.values(root?.attribs ?? {}).some(
|
|
1257
|
+
attribVal => typeof attribVal === 'string' && attribVal.toLowerCase().includes('sitemap-image'),
|
|
1258
|
+
);
|
|
1181
1259
|
|
|
1182
|
-
|
|
1260
|
+
if (hasImageNamespace) {
|
|
1261
|
+
consoleLogger.info(`Skipping image sitemap: ${url}`);
|
|
1262
|
+
return;
|
|
1263
|
+
}
|
|
1264
|
+
|
|
1265
|
+
const rootName = root?.name?.toLowerCase().split(':').pop() ?? '';
|
|
1266
|
+
const hasXmlSitemapIndexTag = /<\s*(?:[a-z0-9_-]+:)?sitemapindex\b/i.test(data);
|
|
1267
|
+
const hasXmlUrlsetTag = /<\s*(?:[a-z0-9_-]+:)?urlset\b/i.test(data);
|
|
1183
1268
|
|
|
1184
|
-
|
|
1185
|
-
if (root.name === 'urlset' && xmlns.includes(xmlFormatNamespace)) {
|
|
1269
|
+
if (rootName === 'urlset') {
|
|
1186
1270
|
sitemapType = constants.xmlSitemapTypes.xml;
|
|
1187
|
-
} else if (
|
|
1271
|
+
} else if (rootName === 'sitemapindex') {
|
|
1188
1272
|
sitemapType = constants.xmlSitemapTypes.xmlIndex;
|
|
1189
|
-
} else if (
|
|
1273
|
+
} else if (rootName === 'rss') {
|
|
1190
1274
|
sitemapType = constants.xmlSitemapTypes.rss;
|
|
1191
|
-
} else if (
|
|
1275
|
+
} else if (rootName === 'feed') {
|
|
1192
1276
|
sitemapType = constants.xmlSitemapTypes.atom;
|
|
1277
|
+
} else if (hasXmlSitemapIndexTag) {
|
|
1278
|
+
sitemapType = constants.xmlSitemapTypes.xmlIndex;
|
|
1279
|
+
} else if (hasXmlUrlsetTag) {
|
|
1280
|
+
sitemapType = constants.xmlSitemapTypes.xml;
|
|
1193
1281
|
} else {
|
|
1194
1282
|
sitemapType = constants.xmlSitemapTypes.unknown;
|
|
1195
1283
|
}
|
|
1196
1284
|
|
|
1197
|
-
const countBefore = allUrls.size;
|
|
1198
|
-
|
|
1199
1285
|
switch (sitemapType) {
|
|
1200
1286
|
case constants.xmlSitemapTypes.xmlIndex:
|
|
1201
|
-
consoleLogger.info(`This is a XML format sitemap index
|
|
1287
|
+
consoleLogger.info(`This is a XML format sitemap index: ${url}`);
|
|
1202
1288
|
for (const childSitemapUrl of $('loc')) {
|
|
1203
|
-
const childSitemapUrlText = $(childSitemapUrl).text();
|
|
1204
|
-
if (childSitemapUrlText
|
|
1289
|
+
const childSitemapUrlText = $(childSitemapUrl).text().trim();
|
|
1290
|
+
if (!childSitemapUrlText) {
|
|
1291
|
+
continue;
|
|
1292
|
+
}
|
|
1293
|
+
|
|
1294
|
+
const childSitemapPath = childSitemapUrlText.split(/[?#]/)[0].toLowerCase();
|
|
1295
|
+
if (childSitemapPath.endsWith('.xml') || childSitemapPath.endsWith('.txt')) {
|
|
1296
|
+
if (isImageSitemapUrl(childSitemapUrlText)) {
|
|
1297
|
+
consoleLogger.info(`Skipping image sitemap: ${childSitemapUrlText}`);
|
|
1298
|
+
continue;
|
|
1299
|
+
}
|
|
1205
1300
|
await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
|
|
1206
1301
|
} else {
|
|
1207
1302
|
addToUrlList(childSitemapUrlText); // Add regular URLs to the list
|
|
@@ -1209,19 +1304,19 @@ export const getLinksFromSitemap = async (
|
|
|
1209
1304
|
}
|
|
1210
1305
|
break;
|
|
1211
1306
|
case constants.xmlSitemapTypes.xml:
|
|
1212
|
-
consoleLogger.info(`This is a XML format sitemap
|
|
1307
|
+
consoleLogger.info(`This is a XML format sitemap: ${url}`);
|
|
1213
1308
|
await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
|
|
1214
1309
|
break;
|
|
1215
1310
|
case constants.xmlSitemapTypes.rss:
|
|
1216
|
-
consoleLogger.info(`This is a RSS format sitemap
|
|
1311
|
+
consoleLogger.info(`This is a RSS format sitemap: ${url}`);
|
|
1217
1312
|
await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
|
|
1218
1313
|
break;
|
|
1219
1314
|
case constants.xmlSitemapTypes.atom:
|
|
1220
|
-
consoleLogger.info(`This is a Atom format sitemap
|
|
1315
|
+
consoleLogger.info(`This is a Atom format sitemap: ${url}`);
|
|
1221
1316
|
await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
|
|
1222
1317
|
break;
|
|
1223
1318
|
default:
|
|
1224
|
-
consoleLogger.info(`This is an unrecognised XML sitemap format
|
|
1319
|
+
consoleLogger.info(`This is an unrecognised XML sitemap format: ${url}`);
|
|
1225
1320
|
processNonStandardSitemap(data);
|
|
1226
1321
|
}
|
|
1227
1322
|
|
|
@@ -2171,6 +2266,7 @@ export const isFilePath = (url: string): boolean => {
|
|
|
2171
2266
|
const driveLetterPattern = /^[A-Z]:/i;
|
|
2172
2267
|
const backslashPattern = /\\/;
|
|
2173
2268
|
return (
|
|
2269
|
+
url.toLowerCase().startsWith('file://') ||
|
|
2174
2270
|
url.startsWith('/') ||
|
|
2175
2271
|
driveLetterPattern.test(url) ||
|
|
2176
2272
|
backslashPattern.test(url) ||
|
|
@@ -1145,14 +1145,68 @@ export const createCrawleeSubFolders = async (
|
|
|
1145
1145
|
export const preNavigationHooks = (extraHTTPHeaders: Record<string, string>) => {
|
|
1146
1146
|
return [
|
|
1147
1147
|
async (crawlingContext: CrawlingContext, gotoOptions: PlaywrightGotoOptions) => {
|
|
1148
|
-
if (extraHTTPHeaders) {
|
|
1148
|
+
if (extraHTTPHeaders && Object.keys(extraHTTPHeaders).length > 0) {
|
|
1149
1149
|
crawlingContext.request.headers = extraHTTPHeaders;
|
|
1150
1150
|
}
|
|
1151
|
-
|
|
1151
|
+
// Use domcontentloaded — fires as soon as the DOM is parsed, before
|
|
1152
|
+
// images/stylesheets/network requests settle. This avoids indefinite
|
|
1153
|
+
// hangs on sites with WebSockets, analytics polling, or infinite-scroll
|
|
1154
|
+
// beacons that never reach networkidle. Further page stability is
|
|
1155
|
+
// handled by waitForPageLoaded() in each crawler's requestHandler and
|
|
1156
|
+
// by the DOM mutation observer in postNavigationHooks.
|
|
1157
|
+
if (gotoOptions) {
|
|
1158
|
+
gotoOptions.waitUntil = 'domcontentloaded';
|
|
1159
|
+
gotoOptions.timeout = 30000;
|
|
1160
|
+
}
|
|
1152
1161
|
},
|
|
1153
1162
|
];
|
|
1154
1163
|
};
|
|
1155
1164
|
|
|
1165
|
+
/**
|
|
1166
|
+
* Splits extraHTTPHeaders into auth and non-auth parts.
|
|
1167
|
+
* Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
|
|
1168
|
+
* Non-auth headers are safe to set globally on the browser context.
|
|
1169
|
+
*/
|
|
1170
|
+
export const splitAuthHeaders = (extraHTTPHeaders?: Record<string, string>) => {
|
|
1171
|
+
const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
|
|
1172
|
+
return {
|
|
1173
|
+
authHeader: Authorization || null,
|
|
1174
|
+
nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
|
|
1175
|
+
httpCredentials: (() => {
|
|
1176
|
+
if (!Authorization?.startsWith('Basic ')) return null;
|
|
1177
|
+
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
1178
|
+
const colonIdx = decoded.indexOf(':');
|
|
1179
|
+
if (colonIdx <= 0) return null;
|
|
1180
|
+
return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
|
|
1181
|
+
})(),
|
|
1182
|
+
};
|
|
1183
|
+
};
|
|
1184
|
+
|
|
1185
|
+
/**
|
|
1186
|
+
* Adds a route handler to a BrowserContext that sends the Authorization header
|
|
1187
|
+
* only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
|
|
1188
|
+
*/
|
|
1189
|
+
export const addAuthRouteHandler = async (
|
|
1190
|
+
context: BrowserContext,
|
|
1191
|
+
entryUrl: string,
|
|
1192
|
+
authHeader: string | null
|
|
1193
|
+
) => {
|
|
1194
|
+
if (!authHeader) return;
|
|
1195
|
+
|
|
1196
|
+
const entryOrigin = new URL(entryUrl).origin;
|
|
1197
|
+
await context.route('**/*', async (route, request) => {
|
|
1198
|
+
try {
|
|
1199
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
1200
|
+
await route.continue({ headers: { ...request.headers(), Authorization: authHeader } });
|
|
1201
|
+
} else {
|
|
1202
|
+
await route.continue();
|
|
1203
|
+
}
|
|
1204
|
+
} catch {
|
|
1205
|
+
await route.continue();
|
|
1206
|
+
}
|
|
1207
|
+
});
|
|
1208
|
+
};
|
|
1209
|
+
|
|
1156
1210
|
export const postNavigationHooks = [
|
|
1157
1211
|
async (_crawlingContext: CrawlingContext) => {
|
|
1158
1212
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
@@ -5,10 +5,12 @@ import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
|
|
|
5
5
|
import {
|
|
6
6
|
createCrawleeSubFolders,
|
|
7
7
|
getPreLaunchHook,
|
|
8
|
+
preNavigationHooks,
|
|
8
9
|
runAxeScript,
|
|
9
10
|
isUrlPdf,
|
|
10
11
|
shouldSkipClickDueToDisallowedHref,
|
|
11
12
|
shouldSkipDueToUnsupportedContent,
|
|
13
|
+
splitAuthHeaders,
|
|
12
14
|
} from './commonCrawlerFunc.js';
|
|
13
15
|
import constants, {
|
|
14
16
|
UrlsCrawled,
|
|
@@ -385,6 +387,8 @@ const crawlDomain = async ({
|
|
|
385
387
|
specifiedMaxConcurrency || constants.maxConcurrency,
|
|
386
388
|
);
|
|
387
389
|
|
|
390
|
+
const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
391
|
+
|
|
388
392
|
const crawler = register(
|
|
389
393
|
new crawlee.PlaywrightCrawler({
|
|
390
394
|
launchContext: {
|
|
@@ -404,12 +408,18 @@ const crawlDomain = async ({
|
|
|
404
408
|
...playwrightDeviceDetailsObject,
|
|
405
409
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
406
410
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
407
|
-
...(
|
|
411
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
412
|
+
...(httpCredentials && { httpCredentials }),
|
|
408
413
|
};
|
|
409
414
|
},
|
|
410
415
|
],
|
|
411
416
|
},
|
|
412
417
|
requestQueue,
|
|
418
|
+
maxRequestRetries: 3,
|
|
419
|
+
maxSessionRotations: 1,
|
|
420
|
+
preNavigationHooks: [
|
|
421
|
+
...preNavigationHooks(extraHTTPHeaders),
|
|
422
|
+
],
|
|
413
423
|
postNavigationHooks: [
|
|
414
424
|
async crawlingContext => {
|
|
415
425
|
const { page, request } = crawlingContext;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
2
|
import { chromium, Page } from 'playwright';
|
|
3
3
|
import { EnqueueStrategy } from 'crawlee';
|
|
4
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
4
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
5
5
|
import constants, { FileTypes, guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
|
|
6
6
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
7
7
|
import crawlDomain from './crawlDomain.js';
|
|
@@ -58,6 +58,7 @@ const crawlIntelligentSitemap = async (
|
|
|
58
58
|
let sitemapLink = '';
|
|
59
59
|
|
|
60
60
|
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
61
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
61
62
|
let context;
|
|
62
63
|
let browserInstance;
|
|
63
64
|
|
|
@@ -65,20 +66,25 @@ const crawlIntelligentSitemap = async (
|
|
|
65
66
|
const effectiveUserDataDirectory = userDataDirectory || '';
|
|
66
67
|
context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
|
67
68
|
...launchOptions,
|
|
68
|
-
...(
|
|
69
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
70
|
+
...(httpCredentials && { httpCredentials }),
|
|
69
71
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
70
72
|
});
|
|
71
73
|
register(context);
|
|
72
74
|
} else {
|
|
73
|
-
// In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
|
|
74
75
|
browserInstance = await constants.launcher.launch(launchOptions);
|
|
75
76
|
register(browserInstance as unknown as { close: () => Promise<void> });
|
|
76
77
|
context = await browserInstance.newContext({
|
|
77
|
-
...(
|
|
78
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
79
|
+
...(httpCredentials && { httpCredentials }),
|
|
78
80
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
79
81
|
});
|
|
80
82
|
}
|
|
81
83
|
|
|
84
|
+
if (authHeader) {
|
|
85
|
+
await addAuthRouteHandler(context, link, authHeader);
|
|
86
|
+
}
|
|
87
|
+
|
|
82
88
|
const page = await context.newPage();
|
|
83
89
|
|
|
84
90
|
for (const path of sitemapPaths) {
|
|
@@ -7,6 +7,7 @@ import {
|
|
|
7
7
|
preNavigationHooks,
|
|
8
8
|
runAxeScript,
|
|
9
9
|
isUrlPdf,
|
|
10
|
+
splitAuthHeaders,
|
|
10
11
|
} from './commonCrawlerFunc.js';
|
|
11
12
|
|
|
12
13
|
import constants, {
|
|
@@ -85,6 +86,7 @@ const crawlSitemap = async ({
|
|
|
85
86
|
maxRequestsPerCrawl,
|
|
86
87
|
specifiedMaxConcurrency || constants.maxConcurrency,
|
|
87
88
|
);
|
|
89
|
+
const initialNoSuccessFailureAbortThreshold = Math.max(5, Math.min(maxRequestsPerCrawl, 25));
|
|
88
90
|
|
|
89
91
|
if (fromCrawlIntelligentSitemap) {
|
|
90
92
|
dataset = datasetFromIntelligent;
|
|
@@ -119,6 +121,7 @@ const crawlSitemap = async ({
|
|
|
119
121
|
const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
|
|
120
122
|
const { playwrightDeviceDetailsObject } = viewportSettings;
|
|
121
123
|
const { maxConcurrency } = constants;
|
|
124
|
+
const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
122
125
|
|
|
123
126
|
const requestList = await RequestList.open({
|
|
124
127
|
sources: linksFromSitemap,
|
|
@@ -142,11 +145,15 @@ const crawlSitemap = async ({
|
|
|
142
145
|
...playwrightDeviceDetailsObject,
|
|
143
146
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
144
147
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
148
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
149
|
+
...(httpCredentials && { httpCredentials }),
|
|
145
150
|
};
|
|
146
151
|
},
|
|
147
152
|
],
|
|
148
153
|
},
|
|
149
154
|
requestList,
|
|
155
|
+
maxRequestRetries: 3,
|
|
156
|
+
maxSessionRotations: 1,
|
|
150
157
|
postNavigationHooks: [
|
|
151
158
|
async ({ page }) => {
|
|
152
159
|
try {
|
|
@@ -197,6 +204,7 @@ const crawlSitemap = async ({
|
|
|
197
204
|
},
|
|
198
205
|
],
|
|
199
206
|
preNavigationHooks: [
|
|
207
|
+
...preNavigationHooks(extraHTTPHeaders),
|
|
200
208
|
async ({ request, page }, gotoOptions) => {
|
|
201
209
|
const url = request.url.toLowerCase();
|
|
202
210
|
|
|
@@ -213,8 +221,6 @@ const crawlSitemap = async ({
|
|
|
213
221
|
|
|
214
222
|
return;
|
|
215
223
|
}
|
|
216
|
-
|
|
217
|
-
preNavigationHooks(extraHTTPHeaders);
|
|
218
224
|
},
|
|
219
225
|
],
|
|
220
226
|
requestHandlerTimeoutSecs: 90,
|
|
@@ -449,6 +455,17 @@ const crawlSitemap = async ({
|
|
|
449
455
|
httpStatusCode: typeof status === 'number' ? status : 0,
|
|
450
456
|
});
|
|
451
457
|
crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|
|
458
|
+
|
|
459
|
+
if (
|
|
460
|
+
urlsCrawled.scanned.length === 0 &&
|
|
461
|
+
urlsCrawled.error.length >= initialNoSuccessFailureAbortThreshold
|
|
462
|
+
) {
|
|
463
|
+
consoleLogger.info(
|
|
464
|
+
`Aborting sitemap crawl: ${urlsCrawled.error.length} failed pages with 0 successful scans.`,
|
|
465
|
+
);
|
|
466
|
+
isAbortingScan = true;
|
|
467
|
+
crawler.autoscaledPool?.abort();
|
|
468
|
+
}
|
|
452
469
|
},
|
|
453
470
|
maxRequestsPerCrawl: Infinity,
|
|
454
471
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
@@ -1228,19 +1228,32 @@ export const initNewPage = async (page, pageClosePromises, processPageParams, pa
|
|
|
1228
1228
|
const allowed = isOverlayAllowed(page.url(), processPageParams.entryUrl);
|
|
1229
1229
|
|
|
1230
1230
|
if (!allowed) {
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1231
|
+
// On macOS and Windows the custom flow always runs headful.
|
|
1232
|
+
// The URL guard (urlGuard.ts) intercepts non-http/https navigations
|
|
1233
|
+
// and calls page.goto(safeUrl). Do NOT remove the overlay here —
|
|
1234
|
+
// removing it causes it to stay permanently disabled if the redirect
|
|
1235
|
+
// races ahead of the next reconcile cycle.
|
|
1236
|
+
// Instead, fall through to the hasOverlay / addOverlayMenu block so
|
|
1237
|
+
// the overlay is (re-)injected even on transient non-http/https URLs
|
|
1238
|
+
// (e.g. file://, about:blank) and again after the guard's redirect.
|
|
1239
|
+
const isDesktopHost = process.platform === 'darwin' || process.platform === 'win32';
|
|
1240
|
+
if (!isDesktopHost) {
|
|
1241
|
+
// On Linux / Docker: remove overlay for non-http/https URLs and stop.
|
|
1242
|
+
await Promise.race([
|
|
1243
|
+
removeOverlayMenu(page),
|
|
1244
|
+
new Promise((_, reject) => {
|
|
1245
|
+
setTimeout(() => {
|
|
1246
|
+
reject(
|
|
1247
|
+
new Error(
|
|
1248
|
+
`removeOverlayMenu timed out after ${OVERLAY_OPERATION_TIMEOUT_MS}ms`,
|
|
1249
|
+
),
|
|
1250
|
+
);
|
|
1251
|
+
}, OVERLAY_OPERATION_TIMEOUT_MS);
|
|
1252
|
+
}),
|
|
1253
|
+
]);
|
|
1254
|
+
return;
|
|
1255
|
+
}
|
|
1256
|
+
// Desktop hosts: skip removal and fall through to re-add overlay.
|
|
1244
1257
|
}
|
|
1245
1258
|
|
|
1246
1259
|
const hasOverlay = await page.evaluate(() =>
|
|
@@ -35,8 +35,18 @@ export function addUrlGuardScript(context, opts = {}) {
|
|
|
35
35
|
});
|
|
36
36
|
|
|
37
37
|
const restoreToSafeUrl = async (page, attemptedUrl) => {
|
|
38
|
+
const safeUrl = lastAllowedUrlByPage.get(page) || fallbackUrl || 'about:blank';
|
|
39
|
+
// Only redirect if the safe URL is itself an allowed (http/https) URL.
|
|
40
|
+
// If the entry URL is file:// (e.g. scanning a local HTML file), the
|
|
41
|
+
// fallback is also file://, and redirecting would create an infinite loop:
|
|
42
|
+
// file:// → restoreToSafeUrl → file:// → framenavigated → restoreToSafeUrl → …
|
|
43
|
+
try {
|
|
44
|
+
const safeObj = new URL(safeUrl);
|
|
45
|
+
if (!ALLOWED_PROTOCOLS.has(safeObj.protocol)) return;
|
|
46
|
+
} catch {
|
|
47
|
+
return;
|
|
48
|
+
}
|
|
38
49
|
try {
|
|
39
|
-
const safeUrl = lastAllowedUrlByPage.get(page) || fallbackUrl || 'about:blank';
|
|
40
50
|
await page.goto(safeUrl, { waitUntil: 'domcontentloaded' });
|
|
41
51
|
} catch {
|
|
42
52
|
// page might be closing; ignore
|
|
@@ -58,6 +68,13 @@ export function addUrlGuardScript(context, opts = {}) {
|
|
|
58
68
|
lastAllowedUrlByPage.set(page, urlObj.toString());
|
|
59
69
|
return;
|
|
60
70
|
}
|
|
71
|
+
|
|
72
|
+
// Skip browser-internal transitional states (about:blank, about:srcdoc, etc.).
|
|
73
|
+
// page.goto() navigates through about:blank before loading the target URL.
|
|
74
|
+
// Redirecting from about: creates an infinite loop:
|
|
75
|
+
// restoreToSafeUrl → page.goto(safeUrl) → about:blank → restoreToSafeUrl → …
|
|
76
|
+
if (urlObj.protocol === 'about:') return;
|
|
77
|
+
|
|
61
78
|
await restoreToSafeUrl(page, urlStr);
|
|
62
79
|
});
|
|
63
80
|
};
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/* eslint-env browser */
|
|
2
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
2
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
3
3
|
import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
|
|
4
4
|
import constants, {
|
|
5
5
|
getIntermediateScreenshotsPath,
|
|
@@ -60,6 +60,7 @@ const runCustom = async (
|
|
|
60
60
|
blacklistedPatterns: string[] | null,
|
|
61
61
|
includeScreenshots: boolean,
|
|
62
62
|
initialCustomFlowLabel?: string,
|
|
63
|
+
extraHTTPHeaders?: Record<string, string>,
|
|
63
64
|
) => {
|
|
64
65
|
// checks and delete datasets path if it already exists
|
|
65
66
|
process.env.CRAWLEE_STORAGE_DIR = randomToken;
|
|
@@ -109,6 +110,8 @@ const runCustom = async (
|
|
|
109
110
|
...customArgs,
|
|
110
111
|
];
|
|
111
112
|
|
|
113
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
114
|
+
|
|
112
115
|
const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
|
|
113
116
|
...baseLaunchOptions,
|
|
114
117
|
args: mergedArgs,
|
|
@@ -118,8 +121,14 @@ const runCustom = async (
|
|
|
118
121
|
viewport: null,
|
|
119
122
|
...(hasCustomViewport ? contextDeviceOptions : {}),
|
|
120
123
|
userAgent: process.env.OOBEE_USER_AGENT || (deviceUserAgent as string | undefined),
|
|
124
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
125
|
+
...(httpCredentials && { httpCredentials }),
|
|
121
126
|
});
|
|
122
127
|
|
|
128
|
+
if (authHeader) {
|
|
129
|
+
await addAuthRouteHandler(context, url, authHeader);
|
|
130
|
+
}
|
|
131
|
+
|
|
123
132
|
register(context);
|
|
124
133
|
|
|
125
134
|
processPageParams.stopAll = async () => {
|
|
@@ -60,7 +60,7 @@ const SENTRY_NODE_VERSION: string = (() => {
|
|
|
60
60
|
try {
|
|
61
61
|
return _require('@sentry/node/package.json').version as string;
|
|
62
62
|
} catch {
|
|
63
|
-
return '
|
|
63
|
+
return '10.58.0'; // safe fallback matching currently installed version
|
|
64
64
|
}
|
|
65
65
|
})();
|
|
66
66
|
|