@govtechsg/oobee 0.10.91 → 0.10.92
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +289 -0
- package/README.md +3 -0
- package/dist/cli.js +3 -0
- package/dist/combine.js +14 -2
- package/dist/constants/cliFunctions.js +7 -0
- package/dist/constants/common.js +119 -70
- package/dist/constants/constants.js +1 -0
- package/dist/crawlers/commonCrawlerFunc.js +93 -15
- package/dist/crawlers/crawlDomain.js +45 -57
- package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
- package/dist/crawlers/crawlRateController.js +47 -0
- package/dist/crawlers/crawlSitemap.js +51 -62
- package/dist/generateOobeeClientScanner.js +31 -0
- package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
- package/dist/mergeAxeResults.js +120 -92
- package/dist/npmIndex.js +1 -0
- package/dist/utils.js +23 -28
- package/oobee-client-scanner.js +33 -2
- package/package.json +2 -2
- package/src/cli.ts +4 -0
- package/src/combine.ts +15 -1
- package/src/constants/cliFunctions.ts +7 -0
- package/src/constants/common.ts +131 -79
- package/src/constants/constants.ts +1 -0
- package/src/crawlers/commonCrawlerFunc.ts +103 -14
- package/src/crawlers/crawlDomain.ts +52 -65
- package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
- package/src/crawlers/crawlRateController.ts +63 -0
- package/src/crawlers/crawlSitemap.ts +57 -70
- package/src/generateOobeeClientScanner.ts +31 -0
- package/src/index.ts +1 -0
- package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
- package/src/mergeAxeResults.ts +139 -99
- package/src/npmIndex.ts +1 -0
- package/src/utils.ts +25 -33
- /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0
package/dist/constants/common.js
CHANGED
|
@@ -713,6 +713,7 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
|
|
|
713
713
|
browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
|
|
714
714
|
...getPlaywrightLaunchOptions(browser),
|
|
715
715
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
716
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
716
717
|
});
|
|
717
718
|
register(browserContext);
|
|
718
719
|
}
|
|
@@ -723,6 +724,7 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
|
|
|
723
724
|
register(browserInstance);
|
|
724
725
|
browserContext = await browserInstance.newContext({
|
|
725
726
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
727
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
726
728
|
});
|
|
727
729
|
}
|
|
728
730
|
const page = await browserContext.newPage();
|
|
@@ -784,10 +786,10 @@ export const isDisallowedInRobotsTxt = (url) => {
|
|
|
784
786
|
}
|
|
785
787
|
return false;
|
|
786
788
|
};
|
|
787
|
-
export const getLinksFromSitemap = async (sitemapUrl,
|
|
789
|
+
export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
|
|
788
790
|
const scannedSitemaps = new Set();
|
|
789
|
-
const
|
|
790
|
-
const
|
|
791
|
+
const sitemapLinkCounts = {};
|
|
792
|
+
const allUrls = new Set(); // all discovered URLs (lightweight strings)
|
|
791
793
|
const addToUrlList = (url) => {
|
|
792
794
|
if (!url)
|
|
793
795
|
return;
|
|
@@ -796,17 +798,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
796
798
|
if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
|
|
797
799
|
return;
|
|
798
800
|
url = convertPathToLocalFile(url);
|
|
799
|
-
|
|
800
|
-
try {
|
|
801
|
-
request = new Request({ url });
|
|
802
|
-
}
|
|
803
|
-
catch (e) {
|
|
804
|
-
console.log('Error creating request', e);
|
|
805
|
-
}
|
|
806
|
-
if (isUrlPdf(url)) {
|
|
807
|
-
request.skipNavigation = true;
|
|
808
|
-
}
|
|
809
|
-
urls[url] = request;
|
|
801
|
+
allUrls.add(url);
|
|
810
802
|
};
|
|
811
803
|
const calculateCloseness = (sitemapUrl) => {
|
|
812
804
|
// Remove 'http://', 'https://', and 'www.' prefixes from the URLs
|
|
@@ -849,15 +841,14 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
849
841
|
return (b.lastModifiedDate?.getTime() || 0) - (a.lastModifiedDate?.getTime() || 0);
|
|
850
842
|
});
|
|
851
843
|
}
|
|
852
|
-
// Add
|
|
853
|
-
for (const { url } of urlList
|
|
844
|
+
// Add all URLs to the discovered list (limit applied later at return time)
|
|
845
|
+
for (const { url } of urlList) {
|
|
854
846
|
addToUrlList(url);
|
|
855
847
|
}
|
|
856
848
|
};
|
|
857
849
|
const processNonStandardSitemap = (data) => {
|
|
858
850
|
const urlsFromData = crawlee
|
|
859
|
-
.extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
|
|
860
|
-
.slice(0, maxLinksCount);
|
|
851
|
+
.extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') });
|
|
861
852
|
urlsFromData.forEach(url => {
|
|
862
853
|
addToUrlList(url);
|
|
863
854
|
});
|
|
@@ -900,6 +891,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
900
891
|
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
901
892
|
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
902
893
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
894
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
903
895
|
});
|
|
904
896
|
register(browserContext);
|
|
905
897
|
}
|
|
@@ -910,6 +902,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
910
902
|
register(browserInstance);
|
|
911
903
|
browserContext = await browserInstance.newContext({
|
|
912
904
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
905
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
913
906
|
});
|
|
914
907
|
}
|
|
915
908
|
const page = await browserContext.newPage();
|
|
@@ -980,14 +973,12 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
980
973
|
else {
|
|
981
974
|
sitemapType = constants.xmlSitemapTypes.unknown;
|
|
982
975
|
}
|
|
976
|
+
const countBefore = allUrls.size;
|
|
983
977
|
switch (sitemapType) {
|
|
984
978
|
case constants.xmlSitemapTypes.xmlIndex:
|
|
985
979
|
consoleLogger.info(`This is a XML format sitemap index.`);
|
|
986
980
|
for (const childSitemapUrl of $('loc')) {
|
|
987
981
|
const childSitemapUrlText = $(childSitemapUrl).text();
|
|
988
|
-
if (isLimitReached()) {
|
|
989
|
-
break;
|
|
990
|
-
}
|
|
991
982
|
if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
|
|
992
983
|
await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
|
|
993
984
|
}
|
|
@@ -1012,6 +1003,10 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
1012
1003
|
consoleLogger.info(`This is an unrecognised XML sitemap format.`);
|
|
1013
1004
|
processNonStandardSitemap(data);
|
|
1014
1005
|
}
|
|
1006
|
+
const linksFromThisSitemap = allUrls.size - countBefore;
|
|
1007
|
+
if (linksFromThisSitemap > 0) {
|
|
1008
|
+
sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
|
|
1009
|
+
}
|
|
1015
1010
|
};
|
|
1016
1011
|
try {
|
|
1017
1012
|
await fetchUrls(sitemapUrl, extraHTTPHeaders);
|
|
@@ -1019,7 +1014,37 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
1019
1014
|
catch (e) {
|
|
1020
1015
|
consoleLogger.error(e);
|
|
1021
1016
|
}
|
|
1022
|
-
|
|
1017
|
+
// Build Request objects for all discovered URLs; the crawler itself enforces
|
|
1018
|
+
// maxRequestsPerCrawl by counting only successfully scanned pages.
|
|
1019
|
+
const requestList = [];
|
|
1020
|
+
for (const url of allUrls) {
|
|
1021
|
+
try {
|
|
1022
|
+
const request = new Request({ url });
|
|
1023
|
+
if (isUrlPdf(url)) {
|
|
1024
|
+
request.skipNavigation = true;
|
|
1025
|
+
}
|
|
1026
|
+
requestList.push(request);
|
|
1027
|
+
}
|
|
1028
|
+
catch (e) {
|
|
1029
|
+
consoleLogger.info(`Error creating request for ${url}: ${e}`);
|
|
1030
|
+
}
|
|
1031
|
+
}
|
|
1032
|
+
const totalLinksDiscovered = allUrls.size;
|
|
1033
|
+
const fetchedSitemaps = Object.entries(sitemapLinkCounts).map(([url, fetchedLinks]) => ({
|
|
1034
|
+
url,
|
|
1035
|
+
fetchedLinks,
|
|
1036
|
+
}));
|
|
1037
|
+
const prev = constants.sitemapFetchedLinks;
|
|
1038
|
+
constants.sitemapFetchedLinks = {
|
|
1039
|
+
totalLinksFetchedFromSitemaps: (prev?.totalLinksFetchedFromSitemaps ?? 0) + totalLinksDiscovered,
|
|
1040
|
+
fetchedSitemaps: [...(prev?.fetchedSitemaps ?? []), ...fetchedSitemaps],
|
|
1041
|
+
};
|
|
1042
|
+
if (totalLinksDiscovered > 0) {
|
|
1043
|
+
const breakdown = fetchedSitemaps
|
|
1044
|
+
.map(({ url, fetchedLinks }) => `${url} (${fetchedLinks})`)
|
|
1045
|
+
.join(', ');
|
|
1046
|
+
consoleLogger.info(`There are a total of ${totalLinksDiscovered} links found across ${breakdown}.`);
|
|
1047
|
+
}
|
|
1023
1048
|
return requestList;
|
|
1024
1049
|
};
|
|
1025
1050
|
export const validEmail = (email) => {
|
|
@@ -1158,6 +1183,34 @@ export const getEdgeData = (randomToken) => {
|
|
|
1158
1183
|
* @param {*} destDir destination directory
|
|
1159
1184
|
* @returns boolean indicating whether the operation was successful
|
|
1160
1185
|
*/
|
|
1186
|
+
// Helper to copy a file with retry logic for transient EBUSY errors
|
|
1187
|
+
const copyFileWithRetry = (src, dest, maxRetries = 3) => {
|
|
1188
|
+
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
|
1189
|
+
try {
|
|
1190
|
+
fs.copyFileSync(src, dest);
|
|
1191
|
+
if (attempt > 1) {
|
|
1192
|
+
consoleLogger.info(`File copy succeeded on attempt ${attempt}: ${dest}`);
|
|
1193
|
+
}
|
|
1194
|
+
return true;
|
|
1195
|
+
}
|
|
1196
|
+
catch (err) {
|
|
1197
|
+
if (err.code === 'EBUSY' && attempt < maxRetries) {
|
|
1198
|
+
// Transient lock — wait and retry
|
|
1199
|
+
const delayMs = Math.min(100 * Math.pow(2, attempt - 1), 1000); // Exponential backoff: 100ms, 200ms, 400ms, capped at 1s
|
|
1200
|
+
consoleLogger.warn(`File copy attempt ${attempt}/${maxRetries} failed with EBUSY. Retrying after ${delayMs}ms: ${dest}`);
|
|
1201
|
+
// Synchronous sleep via busy-wait (not ideal but avoids promise complications in sync context)
|
|
1202
|
+
const endTime = Date.now() + delayMs;
|
|
1203
|
+
while (Date.now() < endTime) {
|
|
1204
|
+
// Busy wait
|
|
1205
|
+
}
|
|
1206
|
+
continue; // Retry
|
|
1207
|
+
}
|
|
1208
|
+
// Non-transient error or max retries reached
|
|
1209
|
+
return false;
|
|
1210
|
+
}
|
|
1211
|
+
}
|
|
1212
|
+
return false;
|
|
1213
|
+
};
|
|
1161
1214
|
const cloneChromeProfileCookieFiles = (options, destDir) => {
|
|
1162
1215
|
let profileCookiesDir;
|
|
1163
1216
|
// Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
|
|
@@ -1196,19 +1249,9 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
|
|
|
1196
1249
|
}
|
|
1197
1250
|
// Prevents duplicate cookies file if the cookies already exist
|
|
1198
1251
|
if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
catch (err) {
|
|
1203
|
-
consoleLogger.error(err);
|
|
1204
|
-
if (err.code === 'EBUSY') {
|
|
1205
|
-
console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
|
|
1206
|
-
console.log('Please close any applications that might be using this file and try again.');
|
|
1207
|
-
}
|
|
1208
|
-
else {
|
|
1209
|
-
console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
|
|
1210
|
-
}
|
|
1211
|
-
// printMessage([err], messageOptions);
|
|
1252
|
+
const destCookiesPath = path.join(destProfileDir, 'Cookies');
|
|
1253
|
+
if (!copyFileWithRetry(dir, destCookiesPath)) {
|
|
1254
|
+
consoleLogger.error(`Failed to copy Chrome profile cookies for ${profileName} after retries.`);
|
|
1212
1255
|
success = false;
|
|
1213
1256
|
}
|
|
1214
1257
|
}
|
|
@@ -1220,12 +1263,6 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
|
|
|
1220
1263
|
printMessage(['Unable to find Chrome profile cookies file in the system.'], messageOptions);
|
|
1221
1264
|
return false;
|
|
1222
1265
|
};
|
|
1223
|
-
/**
|
|
1224
|
-
* Clone the Chrome profile cookie files to the destination directory
|
|
1225
|
-
* @param {*} options glob options object
|
|
1226
|
-
* @param {*} destDir destination directory
|
|
1227
|
-
* @returns boolean indicating whether the operation was successful
|
|
1228
|
-
*/
|
|
1229
1266
|
const cloneEdgeProfileCookieFiles = (options, destDir) => {
|
|
1230
1267
|
let profileCookiesDir;
|
|
1231
1268
|
// Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
|
|
@@ -1265,19 +1302,9 @@ const cloneEdgeProfileCookieFiles = (options, destDir) => {
|
|
|
1265
1302
|
}
|
|
1266
1303
|
// Prevents duplicate cookies file if the cookies already exist
|
|
1267
1304
|
if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
catch (err) {
|
|
1272
|
-
consoleLogger.error(err);
|
|
1273
|
-
if (err.code === 'EBUSY') {
|
|
1274
|
-
console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
|
|
1275
|
-
console.log('Please close any applications that might be using this file and try again.');
|
|
1276
|
-
}
|
|
1277
|
-
else {
|
|
1278
|
-
console.log(`An unexpected error occurred while copying the file: ${err.message}`);
|
|
1279
|
-
}
|
|
1280
|
-
// printMessage([err], messageOptions);
|
|
1305
|
+
const destCookiesPath = path.join(destProfileDir, 'Cookies');
|
|
1306
|
+
if (!copyFileWithRetry(dir, destCookiesPath)) {
|
|
1307
|
+
consoleLogger.error(`Failed to copy Edge profile cookies for ${profileName} after retries.`);
|
|
1281
1308
|
success = false;
|
|
1282
1309
|
}
|
|
1283
1310
|
}
|
|
@@ -1305,19 +1332,9 @@ const cloneLocalStateFile = (options, destDir) => {
|
|
|
1305
1332
|
let success = true;
|
|
1306
1333
|
localState.forEach(dir => {
|
|
1307
1334
|
const profileName = dir.match(profileNamesRegex)[1];
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
catch (err) {
|
|
1312
|
-
consoleLogger.error(err);
|
|
1313
|
-
if (err.code === 'EBUSY') {
|
|
1314
|
-
console.log(`Unable to copy the file because it is currently in use.`);
|
|
1315
|
-
console.log('Please close any applications that might be using this file and try again.');
|
|
1316
|
-
}
|
|
1317
|
-
else {
|
|
1318
|
-
console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
|
|
1319
|
-
}
|
|
1320
|
-
printMessage([err], messageOptions);
|
|
1335
|
+
const destPath = path.join(destDir, 'Local State');
|
|
1336
|
+
if (!copyFileWithRetry(dir, destPath)) {
|
|
1337
|
+
consoleLogger.error(`Failed to copy Local State file for ${profileName} after retries.`);
|
|
1321
1338
|
success = false;
|
|
1322
1339
|
}
|
|
1323
1340
|
});
|
|
@@ -1362,6 +1379,15 @@ export const cloneChromeProfiles = (randomToken) => {
|
|
|
1362
1379
|
return destDir;
|
|
1363
1380
|
}
|
|
1364
1381
|
consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
|
|
1382
|
+
// Fall back to a clean profile directory to avoid launch failures from partial clones.
|
|
1383
|
+
try {
|
|
1384
|
+
fs.rmSync(destDir, { recursive: true, force: true });
|
|
1385
|
+
fs.mkdirSync(destDir, { recursive: true });
|
|
1386
|
+
consoleLogger.warn('Using an empty cloned Chrome profile directory due to clone failure.');
|
|
1387
|
+
}
|
|
1388
|
+
catch (cleanupError) {
|
|
1389
|
+
consoleLogger.error(`Unable to reset cloned Chrome profile directory ${destDir}: ${cleanupError}`);
|
|
1390
|
+
}
|
|
1365
1391
|
}
|
|
1366
1392
|
// For future reference, return a null instead to halt the scan
|
|
1367
1393
|
return destDir;
|
|
@@ -1418,6 +1444,15 @@ export const cloneEdgeProfiles = (randomToken) => {
|
|
|
1418
1444
|
return destDir;
|
|
1419
1445
|
}
|
|
1420
1446
|
consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
|
|
1447
|
+
// Fall back to a clean profile directory to avoid launch failures from partial clones.
|
|
1448
|
+
try {
|
|
1449
|
+
fs.rmSync(destDir, { recursive: true, force: true });
|
|
1450
|
+
fs.mkdirSync(destDir, { recursive: true });
|
|
1451
|
+
consoleLogger.warn('Using an empty cloned Edge profile directory due to clone failure.');
|
|
1452
|
+
}
|
|
1453
|
+
catch (cleanupError) {
|
|
1454
|
+
consoleLogger.error(`Unable to reset cloned Edge profile directory ${destDir}: ${cleanupError}`);
|
|
1455
|
+
}
|
|
1421
1456
|
}
|
|
1422
1457
|
// For future reference, return a null instead to halt the scan
|
|
1423
1458
|
return destDir;
|
|
@@ -1444,7 +1479,14 @@ export const deleteClonedChromeProfiles = (randomToken) => {
|
|
|
1444
1479
|
}
|
|
1445
1480
|
let destDir;
|
|
1446
1481
|
if (randomToken) {
|
|
1447
|
-
|
|
1482
|
+
// Also match _pool* directories created by browser pool re-launches
|
|
1483
|
+
destDir = globSync(`oobee-${randomToken}*`, {
|
|
1484
|
+
cwd: baseDir,
|
|
1485
|
+
absolute: true,
|
|
1486
|
+
});
|
|
1487
|
+
if (destDir.length === 0) {
|
|
1488
|
+
destDir = [`${baseDir}/oobee-${randomToken}`];
|
|
1489
|
+
}
|
|
1448
1490
|
}
|
|
1449
1491
|
else {
|
|
1450
1492
|
// Find all the oobee directories in the Chrome data directory
|
|
@@ -1481,10 +1523,17 @@ export const deleteClonedEdgeProfiles = (randomToken) => {
|
|
|
1481
1523
|
}
|
|
1482
1524
|
let destDir;
|
|
1483
1525
|
if (randomToken) {
|
|
1484
|
-
|
|
1526
|
+
// Also match _pool* directories created by browser pool re-launches
|
|
1527
|
+
destDir = globSync(`oobee-${randomToken}*`, {
|
|
1528
|
+
cwd: baseDir,
|
|
1529
|
+
absolute: true,
|
|
1530
|
+
});
|
|
1531
|
+
if (destDir.length === 0) {
|
|
1532
|
+
destDir = [`${baseDir}/oobee-${randomToken}`];
|
|
1533
|
+
}
|
|
1485
1534
|
}
|
|
1486
1535
|
else {
|
|
1487
|
-
// Find all the oobee directories in the
|
|
1536
|
+
// Find all the oobee directories in the Edge data directory
|
|
1488
1537
|
destDir = globSync('**/oobee*', {
|
|
1489
1538
|
cwd: baseDir,
|
|
1490
1539
|
absolute: true,
|
|
@@ -771,6 +771,7 @@ export default {
|
|
|
771
771
|
a11yRuleShortDescriptionMap,
|
|
772
772
|
disabilityBadgesMap,
|
|
773
773
|
robotsTxtUrls: null,
|
|
774
|
+
sitemapFetchedLinks: null,
|
|
774
775
|
userDataDirectory: null, // This will be set later in the code
|
|
775
776
|
randomToken: null, // This will be set later in the code
|
|
776
777
|
// Track all active Crawlee / Playwright resources for cleanup
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Dataset, RequestQueue, log, playwrightUtils } from 'crawlee';
|
|
2
2
|
import axe from 'axe-core';
|
|
3
3
|
import { axeScript, disallowedListOfPatterns, guiInfoStatusTypes, RuleFlags, saflyIconSelector, } from '../constants/constants.js';
|
|
4
|
-
import {
|
|
4
|
+
import { guiInfoLog } from '../logs.js';
|
|
5
5
|
import { enrichColorContrastDOMContext, takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
|
|
6
6
|
import { isFilePath } from '../constants/common.js';
|
|
7
7
|
import { extractAndGradeText } from './custom/extractAndGradeText.js';
|
|
@@ -674,6 +674,13 @@ export const filterAxeResults = (results, pageTitle, customFlowDetails) => {
|
|
|
674
674
|
export const runAxeScript = async ({ includeScreenshots, page, randomToken, customFlowDetails = null, selectors = [], ruleset = [], }) => {
|
|
675
675
|
const browserContext = page.context();
|
|
676
676
|
const requestUrl = page.url();
|
|
677
|
+
let pageTitle = null;
|
|
678
|
+
try {
|
|
679
|
+
pageTitle = await page.evaluate(() => document.title);
|
|
680
|
+
}
|
|
681
|
+
catch {
|
|
682
|
+
// Page may already be in a bad state; title will remain null
|
|
683
|
+
}
|
|
677
684
|
try {
|
|
678
685
|
// Checking for DOM mutations before proceeding to scan
|
|
679
686
|
await page.evaluate(() => {
|
|
@@ -781,7 +788,40 @@ export const runAxeScript = async ({ includeScreenshots, page, randomToken, cust
|
|
|
781
788
|
.run(selectors, {
|
|
782
789
|
resultTypes: defaultResultTypes,
|
|
783
790
|
})
|
|
784
|
-
.then(results => {
|
|
791
|
+
.then(async (results) => {
|
|
792
|
+
// Re-verify aria-hidden-focus violations against the live DOM to
|
|
793
|
+
// handle race conditions with JS that sets tabindex="-1" after
|
|
794
|
+
// aria-hidden (common in carousel/slider libraries like slick)
|
|
795
|
+
const ariaHiddenViolation = results.violations.find(v => v.id === 'aria-hidden-focus');
|
|
796
|
+
if (ariaHiddenViolation) {
|
|
797
|
+
await new Promise(resolve => setTimeout(resolve, 0));
|
|
798
|
+
ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(node => {
|
|
799
|
+
const selector = node.target && node.target[0];
|
|
800
|
+
if (typeof selector !== 'string')
|
|
801
|
+
return true;
|
|
802
|
+
try {
|
|
803
|
+
const el = document.querySelector(selector);
|
|
804
|
+
if (!el)
|
|
805
|
+
return true;
|
|
806
|
+
const focusables = el.querySelectorAll('a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]');
|
|
807
|
+
if (focusables.length === 0)
|
|
808
|
+
return false;
|
|
809
|
+
return Array.from(focusables).some(child => {
|
|
810
|
+
const tabindex = child.getAttribute('tabindex');
|
|
811
|
+
if (tabindex === null)
|
|
812
|
+
return true;
|
|
813
|
+
const parsed = parseInt(tabindex, 10);
|
|
814
|
+
return isNaN(parsed) || parsed >= 0;
|
|
815
|
+
});
|
|
816
|
+
}
|
|
817
|
+
catch {
|
|
818
|
+
return true;
|
|
819
|
+
}
|
|
820
|
+
});
|
|
821
|
+
if (ariaHiddenViolation.nodes.length === 0) {
|
|
822
|
+
results.violations = results.violations.filter(v => v.id !== 'aria-hidden-focus');
|
|
823
|
+
}
|
|
824
|
+
}
|
|
785
825
|
if (disableOobee) {
|
|
786
826
|
return results;
|
|
787
827
|
}
|
|
@@ -847,19 +887,6 @@ export const runAxeScript = async ({ includeScreenshots, page, randomToken, cust
|
|
|
847
887
|
results.violations = await takeScreenshotForHTMLElements(results.violations, page, randomToken);
|
|
848
888
|
results.incomplete = await takeScreenshotForHTMLElements(results.incomplete, page, randomToken);
|
|
849
889
|
}
|
|
850
|
-
let pageTitle = null;
|
|
851
|
-
try {
|
|
852
|
-
pageTitle = await page.evaluate(() => document.title);
|
|
853
|
-
}
|
|
854
|
-
catch (e) {
|
|
855
|
-
consoleLogger.info(`Error while getting page title: ${e}`);
|
|
856
|
-
if (page.isClosed()) {
|
|
857
|
-
consoleLogger.info(`Page was closed for ${requestUrl}, creating new page`);
|
|
858
|
-
page = await browserContext.newPage();
|
|
859
|
-
await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
|
|
860
|
-
pageTitle = await page.evaluate(() => document.title);
|
|
861
|
-
}
|
|
862
|
-
}
|
|
863
890
|
return filterAxeResults(results, pageTitle, customFlowDetails);
|
|
864
891
|
};
|
|
865
892
|
export const createCrawleeSubFolders = async (randomToken) => {
|
|
@@ -883,6 +910,57 @@ export const postNavigationHooks = [
|
|
|
883
910
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
884
911
|
},
|
|
885
912
|
];
|
|
913
|
+
export const getPreLaunchHook = (userDataDirectory) => {
|
|
914
|
+
let launchCount = 0;
|
|
915
|
+
return async (_pageId, launchContext) => {
|
|
916
|
+
const fsp = await import('fs/promises').then(m => m.default);
|
|
917
|
+
launchCount += 1;
|
|
918
|
+
// First launch uses the base directory; subsequent launches get a unique
|
|
919
|
+
// directory so that lingering file handles from a retired browser don't
|
|
920
|
+
// cause Chrome exit code 21 on Windows.
|
|
921
|
+
const effectiveDir = launchCount === 1
|
|
922
|
+
? userDataDirectory
|
|
923
|
+
: `${userDataDirectory}_pool${launchCount}`;
|
|
924
|
+
await fsp.mkdir(effectiveDir, { recursive: true });
|
|
925
|
+
// For pool re-launches, best-effort clone profile data from base directory
|
|
926
|
+
// so authenticated sessions are preserved across browser pool retirements.
|
|
927
|
+
if (launchCount > 1) {
|
|
928
|
+
try {
|
|
929
|
+
const copyRecursive = async (src, dest) => {
|
|
930
|
+
const stat = await fsp.stat(src).catch(() => null);
|
|
931
|
+
if (!stat)
|
|
932
|
+
return;
|
|
933
|
+
if (stat.isDirectory()) {
|
|
934
|
+
await fsp.mkdir(dest, { recursive: true }).catch(() => { });
|
|
935
|
+
const entries = await fsp.readdir(src).catch(() => []);
|
|
936
|
+
await Promise.all(entries
|
|
937
|
+
.filter(entry => !entry.startsWith('Singleton') && entry !== 'lockfile' && entry !== 'LOCK')
|
|
938
|
+
.map(entry => copyRecursive(path.join(src, entry), path.join(dest, entry)).catch(() => { })));
|
|
939
|
+
}
|
|
940
|
+
else {
|
|
941
|
+
await fsp.copyFile(src, dest).catch(() => { });
|
|
942
|
+
}
|
|
943
|
+
};
|
|
944
|
+
await copyRecursive(userDataDirectory, effectiveDir).catch(() => { });
|
|
945
|
+
}
|
|
946
|
+
catch {
|
|
947
|
+
// Silent fallback: use empty profile if clone fails
|
|
948
|
+
}
|
|
949
|
+
}
|
|
950
|
+
// Clean any stale lock files that may block browser launches on Windows
|
|
951
|
+
const lockFiles = [
|
|
952
|
+
path.join(effectiveDir, 'SingletonLock'),
|
|
953
|
+
path.join(effectiveDir, 'SingletonSocket'),
|
|
954
|
+
path.join(effectiveDir, 'SingletonCookie'),
|
|
955
|
+
path.join(effectiveDir, 'lockfile'),
|
|
956
|
+
path.join(effectiveDir, 'Default', 'LOCK'),
|
|
957
|
+
path.join(effectiveDir, 'Default', 'Network', 'LOCK'),
|
|
958
|
+
];
|
|
959
|
+
await Promise.all(lockFiles.map(f => fsp.rm(f, { force: true }).catch(() => { })));
|
|
960
|
+
// eslint-disable-next-line no-param-reassign
|
|
961
|
+
launchContext.userDataDir = effectiveDir;
|
|
962
|
+
};
|
|
963
|
+
};
|
|
886
964
|
export const failedRequestHandler = async ({ request }) => {
|
|
887
965
|
guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: 0, urlScanned: request.url });
|
|
888
966
|
log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|