@govtechsg/oobee 0.10.91 → 0.10.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +303 -0
- package/README.md +22 -0
- package/dist/cli.js +3 -0
- package/dist/combine.js +15 -3
- package/dist/constants/cliFunctions.js +7 -0
- package/dist/constants/common.js +149 -80
- package/dist/constants/constants.js +1 -0
- package/dist/crawlers/commonCrawlerFunc.js +136 -15
- package/dist/crawlers/crawlDomain.js +55 -58
- package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
- package/dist/crawlers/crawlRateController.js +47 -0
- package/dist/crawlers/crawlSitemap.js +51 -62
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/generateOobeeClientScanner.js +32 -1
- package/dist/mergeAxeResults/itemsStore.js +32 -3
- package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
- package/dist/mergeAxeResults.js +120 -92
- package/dist/npmIndex.js +1 -0
- package/dist/utils.js +23 -28
- package/oobee-client-scanner.js +35 -4
- package/package.json +3 -3
- package/src/cli.ts +4 -0
- package/src/combine.ts +16 -1
- package/src/constants/cliFunctions.ts +7 -0
- package/src/constants/common.ts +162 -90
- package/src/constants/constants.ts +1 -0
- package/src/crawlers/commonCrawlerFunc.ts +148 -14
- package/src/crawlers/crawlDomain.ts +64 -66
- package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
- package/src/crawlers/crawlRateController.ts +63 -0
- package/src/crawlers/crawlSitemap.ts +57 -70
- package/src/crawlers/runCustom.ts +10 -1
- package/src/generateOobeeClientScanner.ts +32 -1
- package/src/index.ts +1 -0
- package/src/mergeAxeResults/itemsStore.ts +37 -3
- package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
- package/src/mergeAxeResults.ts +139 -99
- package/src/npmIndex.ts +1 -0
- package/src/utils.ts +25 -33
- /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0
package/dist/constants/common.js
CHANGED
|
@@ -300,9 +300,19 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
|
|
|
300
300
|
const rawDevice = (playwrightDeviceDetailsObject || {});
|
|
301
301
|
const { viewport, isMobile, hasTouch, userAgent: deviceUserAgent, ...restDevice } = rawDevice;
|
|
302
302
|
const launchOptions = getPlaywrightLaunchOptions(browserToRun);
|
|
303
|
+
const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
|
|
304
|
+
let httpCredentials = undefined;
|
|
305
|
+
if (Authorization?.startsWith('Basic ')) {
|
|
306
|
+
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
307
|
+
const colonIdx = decoded.indexOf(':');
|
|
308
|
+
if (colonIdx > 0) {
|
|
309
|
+
httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
|
|
310
|
+
}
|
|
311
|
+
}
|
|
303
312
|
const contextOptions = {
|
|
304
313
|
...restDevice,
|
|
305
|
-
...(
|
|
314
|
+
...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
|
|
315
|
+
...(httpCredentials && { httpCredentials }),
|
|
306
316
|
ignoreHTTPSErrors: true,
|
|
307
317
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
308
318
|
};
|
|
@@ -342,6 +352,25 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
|
|
|
342
352
|
return res;
|
|
343
353
|
}
|
|
344
354
|
try {
|
|
355
|
+
// Install the context-wide Authorization route handler only when a non-Basic
|
|
356
|
+
// (e.g. Bearer) Authorization header is supplied; Basic credentials are passed via
|
|
357
|
+
// httpCredentials instead, so typical public scans skip interception in the checkUrl phase.
|
|
358
|
+
if (Authorization && !httpCredentials) {
|
|
359
|
+
const entryOrigin = new URL(url).origin;
|
|
360
|
+
await browserContext.route('**/*', async (route, request) => {
|
|
361
|
+
try {
|
|
362
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
363
|
+
await route.continue({ headers: { ...request.headers(), Authorization } });
|
|
364
|
+
}
|
|
365
|
+
else {
|
|
366
|
+
await route.continue();
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
catch {
|
|
370
|
+
await route.continue();
|
|
371
|
+
}
|
|
372
|
+
});
|
|
373
|
+
}
|
|
345
374
|
const page = await browserContext.newPage();
|
|
346
375
|
// Block native Chrome download UI
|
|
347
376
|
try {
|
|
@@ -351,15 +380,6 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
|
|
|
351
380
|
catch (e) {
|
|
352
381
|
consoleLogger.info(`Unable to set download deny: ${e.message}`);
|
|
353
382
|
}
|
|
354
|
-
// OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
|
|
355
|
-
// This allows the "Connectivity Check" to pass as soon as HTML is ready
|
|
356
|
-
await page.route('**/*', (route) => {
|
|
357
|
-
const type = route.request().resourceType();
|
|
358
|
-
if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
|
|
359
|
-
return route.abort();
|
|
360
|
-
}
|
|
361
|
-
return route.continue();
|
|
362
|
-
});
|
|
363
383
|
// STEP 2: Navigate (follows server-side redirects)
|
|
364
384
|
page.once('download', () => {
|
|
365
385
|
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
@@ -713,6 +733,7 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
|
|
|
713
733
|
browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
|
|
714
734
|
...getPlaywrightLaunchOptions(browser),
|
|
715
735
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
736
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
716
737
|
});
|
|
717
738
|
register(browserContext);
|
|
718
739
|
}
|
|
@@ -723,6 +744,7 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
|
|
|
723
744
|
register(browserInstance);
|
|
724
745
|
browserContext = await browserInstance.newContext({
|
|
725
746
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
747
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
726
748
|
});
|
|
727
749
|
}
|
|
728
750
|
const page = await browserContext.newPage();
|
|
@@ -784,10 +806,10 @@ export const isDisallowedInRobotsTxt = (url) => {
|
|
|
784
806
|
}
|
|
785
807
|
return false;
|
|
786
808
|
};
|
|
787
|
-
export const getLinksFromSitemap = async (sitemapUrl,
|
|
809
|
+
export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
|
|
788
810
|
const scannedSitemaps = new Set();
|
|
789
|
-
const
|
|
790
|
-
const
|
|
811
|
+
const sitemapLinkCounts = {};
|
|
812
|
+
const allUrls = new Set(); // all discovered URLs (lightweight strings)
|
|
791
813
|
const addToUrlList = (url) => {
|
|
792
814
|
if (!url)
|
|
793
815
|
return;
|
|
@@ -796,17 +818,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
796
818
|
if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
|
|
797
819
|
return;
|
|
798
820
|
url = convertPathToLocalFile(url);
|
|
799
|
-
|
|
800
|
-
try {
|
|
801
|
-
request = new Request({ url });
|
|
802
|
-
}
|
|
803
|
-
catch (e) {
|
|
804
|
-
console.log('Error creating request', e);
|
|
805
|
-
}
|
|
806
|
-
if (isUrlPdf(url)) {
|
|
807
|
-
request.skipNavigation = true;
|
|
808
|
-
}
|
|
809
|
-
urls[url] = request;
|
|
821
|
+
allUrls.add(url);
|
|
810
822
|
};
|
|
811
823
|
const calculateCloseness = (sitemapUrl) => {
|
|
812
824
|
// Remove 'http://', 'https://', and 'www.' prefixes from the URLs
|
|
@@ -849,15 +861,14 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
849
861
|
return (b.lastModifiedDate?.getTime() || 0) - (a.lastModifiedDate?.getTime() || 0);
|
|
850
862
|
});
|
|
851
863
|
}
|
|
852
|
-
// Add
|
|
853
|
-
for (const { url } of urlList
|
|
864
|
+
// Add all URLs to the discovered list (limit applied later at return time)
|
|
865
|
+
for (const { url } of urlList) {
|
|
854
866
|
addToUrlList(url);
|
|
855
867
|
}
|
|
856
868
|
};
|
|
857
869
|
const processNonStandardSitemap = (data) => {
|
|
858
870
|
const urlsFromData = crawlee
|
|
859
|
-
.extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
|
|
860
|
-
.slice(0, maxLinksCount);
|
|
871
|
+
.extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') });
|
|
861
872
|
urlsFromData.forEach(url => {
|
|
862
873
|
addToUrlList(url);
|
|
863
874
|
});
|
|
@@ -900,6 +911,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
900
911
|
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
901
912
|
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
902
913
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
914
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
903
915
|
});
|
|
904
916
|
register(browserContext);
|
|
905
917
|
}
|
|
@@ -910,6 +922,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
910
922
|
register(browserInstance);
|
|
911
923
|
browserContext = await browserInstance.newContext({
|
|
912
924
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
925
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
913
926
|
});
|
|
914
927
|
}
|
|
915
928
|
const page = await browserContext.newPage();
|
|
@@ -980,14 +993,12 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
980
993
|
else {
|
|
981
994
|
sitemapType = constants.xmlSitemapTypes.unknown;
|
|
982
995
|
}
|
|
996
|
+
const countBefore = allUrls.size;
|
|
983
997
|
switch (sitemapType) {
|
|
984
998
|
case constants.xmlSitemapTypes.xmlIndex:
|
|
985
999
|
consoleLogger.info(`This is a XML format sitemap index.`);
|
|
986
1000
|
for (const childSitemapUrl of $('loc')) {
|
|
987
1001
|
const childSitemapUrlText = $(childSitemapUrl).text();
|
|
988
|
-
if (isLimitReached()) {
|
|
989
|
-
break;
|
|
990
|
-
}
|
|
991
1002
|
if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
|
|
992
1003
|
await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
|
|
993
1004
|
}
|
|
@@ -1012,6 +1023,10 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
1012
1023
|
consoleLogger.info(`This is an unrecognised XML sitemap format.`);
|
|
1013
1024
|
processNonStandardSitemap(data);
|
|
1014
1025
|
}
|
|
1026
|
+
const linksFromThisSitemap = allUrls.size - countBefore;
|
|
1027
|
+
if (linksFromThisSitemap > 0) {
|
|
1028
|
+
sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
|
|
1029
|
+
}
|
|
1015
1030
|
};
|
|
1016
1031
|
try {
|
|
1017
1032
|
await fetchUrls(sitemapUrl, extraHTTPHeaders);
|
|
@@ -1019,7 +1034,37 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
1019
1034
|
catch (e) {
|
|
1020
1035
|
consoleLogger.error(e);
|
|
1021
1036
|
}
|
|
1022
|
-
|
|
1037
|
+
// Build Request objects for all discovered URLs; the crawler itself enforces
|
|
1038
|
+
// maxRequestsPerCrawl by counting only successfully scanned pages.
|
|
1039
|
+
const requestList = [];
|
|
1040
|
+
for (const url of allUrls) {
|
|
1041
|
+
try {
|
|
1042
|
+
const request = new Request({ url });
|
|
1043
|
+
if (isUrlPdf(url)) {
|
|
1044
|
+
request.skipNavigation = true;
|
|
1045
|
+
}
|
|
1046
|
+
requestList.push(request);
|
|
1047
|
+
}
|
|
1048
|
+
catch (e) {
|
|
1049
|
+
consoleLogger.info(`Error creating request for ${url}: ${e}`);
|
|
1050
|
+
}
|
|
1051
|
+
}
|
|
1052
|
+
const totalLinksDiscovered = allUrls.size;
|
|
1053
|
+
const fetchedSitemaps = Object.entries(sitemapLinkCounts).map(([url, fetchedLinks]) => ({
|
|
1054
|
+
url,
|
|
1055
|
+
fetchedLinks,
|
|
1056
|
+
}));
|
|
1057
|
+
const prev = constants.sitemapFetchedLinks;
|
|
1058
|
+
constants.sitemapFetchedLinks = {
|
|
1059
|
+
totalLinksFetchedFromSitemaps: (prev?.totalLinksFetchedFromSitemaps ?? 0) + totalLinksDiscovered,
|
|
1060
|
+
fetchedSitemaps: [...(prev?.fetchedSitemaps ?? []), ...fetchedSitemaps],
|
|
1061
|
+
};
|
|
1062
|
+
if (totalLinksDiscovered > 0) {
|
|
1063
|
+
const breakdown = fetchedSitemaps
|
|
1064
|
+
.map(({ url, fetchedLinks }) => `${url} (${fetchedLinks})`)
|
|
1065
|
+
.join(', ');
|
|
1066
|
+
consoleLogger.info(`There are a total of ${totalLinksDiscovered} links found across ${breakdown}.`);
|
|
1067
|
+
}
|
|
1023
1068
|
return requestList;
|
|
1024
1069
|
};
|
|
1025
1070
|
export const validEmail = (email) => {
|
|
@@ -1158,6 +1203,34 @@ export const getEdgeData = (randomToken) => {
|
|
|
1158
1203
|
* @param {*} destDir destination directory
|
|
1159
1204
|
* @returns boolean indicating whether the operation was successful
|
|
1160
1205
|
*/
|
|
1206
|
+
/**
 * Copies a file synchronously, retrying when the copy fails with a transient EBUSY error.
 *
 * Between attempts the calling thread is blocked with exponential backoff
 * (100ms, 200ms, 400ms, ... capped at 1s). Any error other than EBUSY, or EBUSY on
 * the final attempt, stops the retries immediately.
 *
 * @param {string} src path of the file to copy
 * @param {string} dest path the file is copied to
 * @param {number} [maxRetries=3] maximum number of copy attempts
 * @returns {boolean} true if the file was copied, false otherwise
 */
const copyFileWithRetry = (src, dest, maxRetries = 3) => {
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            fs.copyFileSync(src, dest);
            if (attempt > 1) {
                consoleLogger.info(`File copy succeeded on attempt ${attempt}: ${dest}`);
            }
            return true;
        }
        catch (err) {
            if (err.code !== 'EBUSY' || attempt >= maxRetries) {
                // Non-transient error or retries exhausted: record the cause so the
                // caller's generic failure message can be diagnosed.
                consoleLogger.error(`File copy failed (${err.code ?? 'unknown error'}): ${dest} - ${err.message}`);
                return false;
            }
            // Transient lock — wait and retry with exponential backoff, capped at 1s.
            const delayMs = Math.min(100 * 2 ** (attempt - 1), 1000);
            consoleLogger.warn(`File copy attempt ${attempt}/${maxRetries} failed with EBUSY. Retrying after ${delayMs}ms: ${dest}`);
            // Synchronous sleep without spinning the CPU: waiting on a private buffer
            // that nobody notifies simply times out after delayMs.
            Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, delayMs);
        }
    }
    // Reached only when maxRetries < 1, i.e. no attempt was made.
    return false;
};
|
|
1161
1234
|
const cloneChromeProfileCookieFiles = (options, destDir) => {
|
|
1162
1235
|
let profileCookiesDir;
|
|
1163
1236
|
// Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
|
|
@@ -1196,19 +1269,9 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
|
|
|
1196
1269
|
}
|
|
1197
1270
|
// Prevents duplicate cookies file if the cookies already exist
|
|
1198
1271
|
if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
catch (err) {
|
|
1203
|
-
consoleLogger.error(err);
|
|
1204
|
-
if (err.code === 'EBUSY') {
|
|
1205
|
-
console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
|
|
1206
|
-
console.log('Please close any applications that might be using this file and try again.');
|
|
1207
|
-
}
|
|
1208
|
-
else {
|
|
1209
|
-
console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
|
|
1210
|
-
}
|
|
1211
|
-
// printMessage([err], messageOptions);
|
|
1272
|
+
const destCookiesPath = path.join(destProfileDir, 'Cookies');
|
|
1273
|
+
if (!copyFileWithRetry(dir, destCookiesPath)) {
|
|
1274
|
+
consoleLogger.error(`Failed to copy Chrome profile cookies for ${profileName} after retries.`);
|
|
1212
1275
|
success = false;
|
|
1213
1276
|
}
|
|
1214
1277
|
}
|
|
@@ -1220,12 +1283,6 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
|
|
|
1220
1283
|
printMessage(['Unable to find Chrome profile cookies file in the system.'], messageOptions);
|
|
1221
1284
|
return false;
|
|
1222
1285
|
};
|
|
1223
|
-
/**
|
|
1224
|
-
* Clone the Chrome profile cookie files to the destination directory
|
|
1225
|
-
* @param {*} options glob options object
|
|
1226
|
-
* @param {*} destDir destination directory
|
|
1227
|
-
* @returns boolean indicating whether the operation was successful
|
|
1228
|
-
*/
|
|
1229
1286
|
const cloneEdgeProfileCookieFiles = (options, destDir) => {
|
|
1230
1287
|
let profileCookiesDir;
|
|
1231
1288
|
// Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
|
|
@@ -1265,19 +1322,9 @@ const cloneEdgeProfileCookieFiles = (options, destDir) => {
|
|
|
1265
1322
|
}
|
|
1266
1323
|
// Prevents duplicate cookies file if the cookies already exist
|
|
1267
1324
|
if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
catch (err) {
|
|
1272
|
-
consoleLogger.error(err);
|
|
1273
|
-
if (err.code === 'EBUSY') {
|
|
1274
|
-
console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
|
|
1275
|
-
console.log('Please close any applications that might be using this file and try again.');
|
|
1276
|
-
}
|
|
1277
|
-
else {
|
|
1278
|
-
console.log(`An unexpected error occurred while copying the file: ${err.message}`);
|
|
1279
|
-
}
|
|
1280
|
-
// printMessage([err], messageOptions);
|
|
1325
|
+
const destCookiesPath = path.join(destProfileDir, 'Cookies');
|
|
1326
|
+
if (!copyFileWithRetry(dir, destCookiesPath)) {
|
|
1327
|
+
consoleLogger.error(`Failed to copy Edge profile cookies for ${profileName} after retries.`);
|
|
1281
1328
|
success = false;
|
|
1282
1329
|
}
|
|
1283
1330
|
}
|
|
@@ -1305,19 +1352,9 @@ const cloneLocalStateFile = (options, destDir) => {
|
|
|
1305
1352
|
let success = true;
|
|
1306
1353
|
localState.forEach(dir => {
|
|
1307
1354
|
const profileName = dir.match(profileNamesRegex)[1];
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
catch (err) {
|
|
1312
|
-
consoleLogger.error(err);
|
|
1313
|
-
if (err.code === 'EBUSY') {
|
|
1314
|
-
console.log(`Unable to copy the file because it is currently in use.`);
|
|
1315
|
-
console.log('Please close any applications that might be using this file and try again.');
|
|
1316
|
-
}
|
|
1317
|
-
else {
|
|
1318
|
-
console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
|
|
1319
|
-
}
|
|
1320
|
-
printMessage([err], messageOptions);
|
|
1355
|
+
const destPath = path.join(destDir, 'Local State');
|
|
1356
|
+
if (!copyFileWithRetry(dir, destPath)) {
|
|
1357
|
+
consoleLogger.error(`Failed to copy Local State file for ${profileName} after retries.`);
|
|
1321
1358
|
success = false;
|
|
1322
1359
|
}
|
|
1323
1360
|
});
|
|
@@ -1362,6 +1399,15 @@ export const cloneChromeProfiles = (randomToken) => {
|
|
|
1362
1399
|
return destDir;
|
|
1363
1400
|
}
|
|
1364
1401
|
consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
|
|
1402
|
+
// Fall back to a clean profile directory to avoid launch failures from partial clones.
|
|
1403
|
+
try {
|
|
1404
|
+
fs.rmSync(destDir, { recursive: true, force: true });
|
|
1405
|
+
fs.mkdirSync(destDir, { recursive: true });
|
|
1406
|
+
consoleLogger.warn('Using an empty cloned Chrome profile directory due to clone failure.');
|
|
1407
|
+
}
|
|
1408
|
+
catch (cleanupError) {
|
|
1409
|
+
consoleLogger.error(`Unable to reset cloned Chrome profile directory ${destDir}: ${cleanupError}`);
|
|
1410
|
+
}
|
|
1365
1411
|
}
|
|
1366
1412
|
// For future reference, return a null instead to halt the scan
|
|
1367
1413
|
return destDir;
|
|
@@ -1418,6 +1464,15 @@ export const cloneEdgeProfiles = (randomToken) => {
|
|
|
1418
1464
|
return destDir;
|
|
1419
1465
|
}
|
|
1420
1466
|
consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
|
|
1467
|
+
// Fall back to a clean profile directory to avoid launch failures from partial clones.
|
|
1468
|
+
try {
|
|
1469
|
+
fs.rmSync(destDir, { recursive: true, force: true });
|
|
1470
|
+
fs.mkdirSync(destDir, { recursive: true });
|
|
1471
|
+
consoleLogger.warn('Using an empty cloned Edge profile directory due to clone failure.');
|
|
1472
|
+
}
|
|
1473
|
+
catch (cleanupError) {
|
|
1474
|
+
consoleLogger.error(`Unable to reset cloned Edge profile directory ${destDir}: ${cleanupError}`);
|
|
1475
|
+
}
|
|
1421
1476
|
}
|
|
1422
1477
|
// For future reference, return a null instead to halt the scan
|
|
1423
1478
|
return destDir;
|
|
@@ -1444,7 +1499,14 @@ export const deleteClonedChromeProfiles = (randomToken) => {
|
|
|
1444
1499
|
}
|
|
1445
1500
|
let destDir;
|
|
1446
1501
|
if (randomToken) {
|
|
1447
|
-
|
|
1502
|
+
// Also match _pool* directories created by browser pool re-launches
|
|
1503
|
+
destDir = globSync(`oobee-${randomToken}*`, {
|
|
1504
|
+
cwd: baseDir,
|
|
1505
|
+
absolute: true,
|
|
1506
|
+
});
|
|
1507
|
+
if (destDir.length === 0) {
|
|
1508
|
+
destDir = [`${baseDir}/oobee-${randomToken}`];
|
|
1509
|
+
}
|
|
1448
1510
|
}
|
|
1449
1511
|
else {
|
|
1450
1512
|
// Find all the oobee directories in the Chrome data directory
|
|
@@ -1481,10 +1543,17 @@ export const deleteClonedEdgeProfiles = (randomToken) => {
|
|
|
1481
1543
|
}
|
|
1482
1544
|
let destDir;
|
|
1483
1545
|
if (randomToken) {
|
|
1484
|
-
|
|
1546
|
+
// Also match _pool* directories created by browser pool re-launches
|
|
1547
|
+
destDir = globSync(`oobee-${randomToken}*`, {
|
|
1548
|
+
cwd: baseDir,
|
|
1549
|
+
absolute: true,
|
|
1550
|
+
});
|
|
1551
|
+
if (destDir.length === 0) {
|
|
1552
|
+
destDir = [`${baseDir}/oobee-${randomToken}`];
|
|
1553
|
+
}
|
|
1485
1554
|
}
|
|
1486
1555
|
else {
|
|
1487
|
-
// Find all the oobee directories in the
|
|
1556
|
+
// Find all the oobee directories in the Edge data directory
|
|
1488
1557
|
destDir = globSync('**/oobee*', {
|
|
1489
1558
|
cwd: baseDir,
|
|
1490
1559
|
absolute: true,
|
|
@@ -771,6 +771,7 @@ export default {
|
|
|
771
771
|
a11yRuleShortDescriptionMap,
|
|
772
772
|
disabilityBadgesMap,
|
|
773
773
|
robotsTxtUrls: null,
|
|
774
|
+
sitemapFetchedLinks: null,
|
|
774
775
|
userDataDirectory: null, // This will be set later in the code
|
|
775
776
|
randomToken: null, // This will be set later in the code
|
|
776
777
|
// Track all active Crawlee / Playwright resources for cleanup
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Dataset, RequestQueue, log, playwrightUtils } from 'crawlee';
|
|
2
2
|
import axe from 'axe-core';
|
|
3
3
|
import { axeScript, disallowedListOfPatterns, guiInfoStatusTypes, RuleFlags, saflyIconSelector, } from '../constants/constants.js';
|
|
4
|
-
import {
|
|
4
|
+
import { guiInfoLog } from '../logs.js';
|
|
5
5
|
import { enrichColorContrastDOMContext, takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
|
|
6
6
|
import { isFilePath } from '../constants/common.js';
|
|
7
7
|
import { extractAndGradeText } from './custom/extractAndGradeText.js';
|
|
@@ -674,6 +674,13 @@ export const filterAxeResults = (results, pageTitle, customFlowDetails) => {
|
|
|
674
674
|
export const runAxeScript = async ({ includeScreenshots, page, randomToken, customFlowDetails = null, selectors = [], ruleset = [], }) => {
|
|
675
675
|
const browserContext = page.context();
|
|
676
676
|
const requestUrl = page.url();
|
|
677
|
+
let pageTitle = null;
|
|
678
|
+
try {
|
|
679
|
+
pageTitle = await page.evaluate(() => document.title);
|
|
680
|
+
}
|
|
681
|
+
catch {
|
|
682
|
+
// Page may already be in a bad state; title will remain null
|
|
683
|
+
}
|
|
677
684
|
try {
|
|
678
685
|
// Checking for DOM mutations before proceeding to scan
|
|
679
686
|
await page.evaluate(() => {
|
|
@@ -781,7 +788,40 @@ export const runAxeScript = async ({ includeScreenshots, page, randomToken, cust
|
|
|
781
788
|
.run(selectors, {
|
|
782
789
|
resultTypes: defaultResultTypes,
|
|
783
790
|
})
|
|
784
|
-
.then(results => {
|
|
791
|
+
.then(async (results) => {
|
|
792
|
+
// Re-verify aria-hidden-focus violations against the live DOM to
|
|
793
|
+
// handle race conditions with JS that sets tabindex="-1" after
|
|
794
|
+
// aria-hidden (common in carousel/slider libraries like slick)
|
|
795
|
+
const ariaHiddenViolation = results.violations.find(v => v.id === 'aria-hidden-focus');
|
|
796
|
+
if (ariaHiddenViolation) {
|
|
797
|
+
await new Promise(resolve => setTimeout(resolve, 0));
|
|
798
|
+
ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(node => {
|
|
799
|
+
const selector = node.target && node.target[0];
|
|
800
|
+
if (typeof selector !== 'string')
|
|
801
|
+
return true;
|
|
802
|
+
try {
|
|
803
|
+
const el = document.querySelector(selector);
|
|
804
|
+
if (!el)
|
|
805
|
+
return true;
|
|
806
|
+
const focusables = el.querySelectorAll('a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]');
|
|
807
|
+
if (focusables.length === 0)
|
|
808
|
+
return false;
|
|
809
|
+
return Array.from(focusables).some(child => {
|
|
810
|
+
const tabindex = child.getAttribute('tabindex');
|
|
811
|
+
if (tabindex === null)
|
|
812
|
+
return true;
|
|
813
|
+
const parsed = parseInt(tabindex, 10);
|
|
814
|
+
return isNaN(parsed) || parsed >= 0;
|
|
815
|
+
});
|
|
816
|
+
}
|
|
817
|
+
catch {
|
|
818
|
+
return true;
|
|
819
|
+
}
|
|
820
|
+
});
|
|
821
|
+
if (ariaHiddenViolation.nodes.length === 0) {
|
|
822
|
+
results.violations = results.violations.filter(v => v.id !== 'aria-hidden-focus');
|
|
823
|
+
}
|
|
824
|
+
}
|
|
785
825
|
if (disableOobee) {
|
|
786
826
|
return results;
|
|
787
827
|
}
|
|
@@ -847,19 +887,6 @@ export const runAxeScript = async ({ includeScreenshots, page, randomToken, cust
|
|
|
847
887
|
results.violations = await takeScreenshotForHTMLElements(results.violations, page, randomToken);
|
|
848
888
|
results.incomplete = await takeScreenshotForHTMLElements(results.incomplete, page, randomToken);
|
|
849
889
|
}
|
|
850
|
-
let pageTitle = null;
|
|
851
|
-
try {
|
|
852
|
-
pageTitle = await page.evaluate(() => document.title);
|
|
853
|
-
}
|
|
854
|
-
catch (e) {
|
|
855
|
-
consoleLogger.info(`Error while getting page title: ${e}`);
|
|
856
|
-
if (page.isClosed()) {
|
|
857
|
-
consoleLogger.info(`Page was closed for ${requestUrl}, creating new page`);
|
|
858
|
-
page = await browserContext.newPage();
|
|
859
|
-
await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
|
|
860
|
-
pageTitle = await page.evaluate(() => document.title);
|
|
861
|
-
}
|
|
862
|
-
}
|
|
863
890
|
return filterAxeResults(results, pageTitle, customFlowDetails);
|
|
864
891
|
};
|
|
865
892
|
export const createCrawleeSubFolders = async (randomToken) => {
|
|
@@ -878,11 +905,105 @@ export const preNavigationHooks = (extraHTTPHeaders) => {
|
|
|
878
905
|
},
|
|
879
906
|
];
|
|
880
907
|
};
|
|
908
|
+
/**
 * Splits extraHTTPHeaders into auth and non-auth parts.
 * Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
 * Non-auth headers are safe to set globally on the browser context.
 *
 * The Authorization header is matched case-insensitively (HTTP header names are
 * case-insensitive), so `authorization` is never left among the non-auth headers.
 * When several case variants are present, the exact-case `Authorization` key wins.
 *
 * @param {Object<string, string>|null|undefined} extraHTTPHeaders headers to split
 * @returns {{authHeader: (string|null), nonAuthHeaders: (Object|null), httpCredentials: ({username: string, password: string}|null)}}
 *   authHeader: the Authorization value, or null when absent/empty;
 *   nonAuthHeaders: every other header, or null when there are none;
 *   httpCredentials: decoded username/password when authHeader is a `Basic ` header
 *   with a non-empty username, otherwise null.
 */
export const splitAuthHeaders = (extraHTTPHeaders) => {
    const entries = Object.entries(extraHTTPHeaders || {});
    const isAuthEntry = ([name]) => name.toLowerCase() === 'authorization';
    // Only non-empty values count as an auth header.
    const authEntries = entries.filter(entry => isAuthEntry(entry) && entry[1]);
    const exactCaseEntry = authEntries.find(([name]) => name === 'Authorization');
    const authHeader = (exactCaseEntry ?? authEntries[0])?.[1] ?? null;
    const otherEntries = entries.filter(entry => !isAuthEntry(entry));
    let httpCredentials = null;
    if (typeof authHeader === 'string' && authHeader.startsWith('Basic ')) {
        const decoded = Buffer.from(authHeader.slice(6), 'base64').toString();
        // Split on the first colon only: the password itself may contain colons.
        const colonIdx = decoded.indexOf(':');
        if (colonIdx > 0) {
            httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
        }
    }
    return {
        authHeader,
        nonAuthHeaders: otherEntries.length > 0 ? Object.fromEntries(otherEntries) : null,
        httpCredentials,
    };
};
|
|
929
|
+
/**
 * Adds a route handler to a BrowserContext that sends the Authorization header
 * only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
 *
 * Does nothing when authHeader is falsy. Requests whose URL cannot be parsed are
 * treated as cross-origin and forwarded unchanged. If continuing with the header
 * override fails, the request is continued once more without it; a failure of that
 * final continue is logged at debug level instead of rejecting out of the handler.
 *
 * @param {*} context object exposing `route(pattern, handler)`
 * @param {string} entryUrl URL whose origin receives the Authorization header
 * @param {string|null} authHeader Authorization header value
 * @returns {Promise<void>}
 * @throws {TypeError} if entryUrl is not a valid URL (only when authHeader is set)
 */
export const addAuthRouteHandler = async (context, entryUrl, authHeader) => {
    if (!authHeader)
        return;
    const entryOrigin = new URL(entryUrl).origin;
    await context.route('**/*', async (route, request) => {
        let sameOrigin = false;
        try {
            sameOrigin = new URL(request.url()).origin === entryOrigin;
        }
        catch {
            // Unparseable request URL: forward it untouched.
            sameOrigin = false;
        }
        if (sameOrigin) {
            try {
                await route.continue({ headers: { ...request.headers(), Authorization: authHeader } });
                return;
            }
            catch (err) {
                log.debug(`Unable to continue ${request.url()} with Authorization header, retrying without it: ${err}`);
            }
        }
        try {
            await route.continue();
        }
        catch (err) {
            // Nothing more can be done for this request; letting the error escape
            // would surface as a rejection from the route handler.
            log.debug(`Unable to continue request ${request.url()}: ${err}`);
        }
    });
};
|
|
881
951
|
/**
 * Hooks run after navigation: the single hook reports the COMPLETED status
 * through guiInfoLog with an empty payload.
 */
const reportNavigationCompleted = async (_crawlingContext) => {
    guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
};
export const postNavigationHooks = [reportNavigationCompleted];
|
|
956
|
+
/**
 * Builds a hook that assigns a user data directory to every browser launch.
 *
 * The first launch uses `userDataDirectory` itself. Launch N (N > 1) uses
 * `${userDataDirectory}_pool${N}`, seeded with a best-effort copy of the base
 * directory (skipping `Singleton*`, `lockfile` and `LOCK` entries) so that
 * lingering file handles from a retired browser don't cause Chrome exit code 21
 * on Windows while profile data is still carried over.
 *
 * The launch number is captured once per invocation, so overlapping launches
 * cannot make the first launch take the re-launch path.
 *
 * @param {string} userDataDirectory base user data directory
 * @returns {function(*, {userDataDir: string}): Promise<void>} hook that creates the
 *   directory, removes known lock files from it and sets `launchContext.userDataDir`;
 *   rejects only if the directory itself cannot be created
 */
export const getPreLaunchHook = (userDataDirectory) => {
    let launchCount = 0;
    return async (_pageId, launchContext) => {
        const fsp = await import('fs/promises').then(m => m.default);
        launchCount += 1;
        // Snapshot: the shared counter may be bumped by another launch while we await below.
        const launchNumber = launchCount;
        const isRelaunch = launchNumber > 1;
        const effectiveDir = isRelaunch
            ? `${userDataDirectory}_pool${launchNumber}`
            : userDataDirectory;
        await fsp.mkdir(effectiveDir, { recursive: true });
        // For pool re-launches, best-effort clone profile data from base directory
        // so authenticated sessions are preserved across browser pool retirements.
        if (isRelaunch) {
            // Every filesystem error is deliberately ignored: a partial or empty
            // profile is an acceptable fallback.
            const copyRecursive = async (src, dest) => {
                const stat = await fsp.stat(src).catch(() => null);
                if (!stat)
                    return;
                if (!stat.isDirectory()) {
                    await fsp.copyFile(src, dest).catch(() => { });
                    return;
                }
                await fsp.mkdir(dest, { recursive: true }).catch(() => { });
                const entries = await fsp.readdir(src).catch(() => []);
                await Promise.all(entries
                    .filter(entry => !entry.startsWith('Singleton') && entry !== 'lockfile' && entry !== 'LOCK')
                    .map(entry => copyRecursive(path.join(src, entry), path.join(dest, entry)).catch(() => { })));
            };
            await copyRecursive(userDataDirectory, effectiveDir).catch(() => { });
        }
        // Clean any stale lock files that may block browser launches on Windows
        const lockFiles = [
            path.join(effectiveDir, 'SingletonLock'),
            path.join(effectiveDir, 'SingletonSocket'),
            path.join(effectiveDir, 'SingletonCookie'),
            path.join(effectiveDir, 'lockfile'),
            path.join(effectiveDir, 'Default', 'LOCK'),
            path.join(effectiveDir, 'Default', 'Network', 'LOCK'),
        ];
        await Promise.all(lockFiles.map(f => fsp.rm(f, { force: true }).catch(() => { })));
        // eslint-disable-next-line no-param-reassign
        launchContext.userDataDir = effectiveDir;
    };
};
|
|
886
1007
|
export const failedRequestHandler = async ({ request }) => {
|
|
887
1008
|
guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: 0, urlScanned: request.url });
|
|
888
1009
|
log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|