@govtechsg/oobee 0.10.91 → 0.10.92
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +289 -0
- package/README.md +3 -0
- package/dist/cli.js +3 -0
- package/dist/combine.js +14 -2
- package/dist/constants/cliFunctions.js +7 -0
- package/dist/constants/common.js +119 -70
- package/dist/constants/constants.js +1 -0
- package/dist/crawlers/commonCrawlerFunc.js +93 -15
- package/dist/crawlers/crawlDomain.js +45 -57
- package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
- package/dist/crawlers/crawlRateController.js +47 -0
- package/dist/crawlers/crawlSitemap.js +51 -62
- package/dist/generateOobeeClientScanner.js +31 -0
- package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
- package/dist/mergeAxeResults.js +120 -92
- package/dist/npmIndex.js +1 -0
- package/dist/utils.js +23 -28
- package/oobee-client-scanner.js +33 -2
- package/package.json +2 -2
- package/src/cli.ts +4 -0
- package/src/combine.ts +15 -1
- package/src/constants/cliFunctions.ts +7 -0
- package/src/constants/common.ts +131 -79
- package/src/constants/constants.ts +1 -0
- package/src/crawlers/commonCrawlerFunc.ts +103 -14
- package/src/crawlers/crawlDomain.ts +52 -65
- package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
- package/src/crawlers/crawlRateController.ts +63 -0
- package/src/crawlers/crawlSitemap.ts +57 -70
- package/src/generateOobeeClientScanner.ts +31 -0
- package/src/index.ts +1 -0
- package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
- package/src/mergeAxeResults.ts +139 -99
- package/src/npmIndex.ts +1 -0
- package/src/utils.ts +25 -33
- /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0
package/src/constants/common.ts
CHANGED
|
@@ -888,6 +888,7 @@ const getRobotsTxtViaPlaywright = async (
|
|
|
888
888
|
browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
|
|
889
889
|
...getPlaywrightLaunchOptions(browser),
|
|
890
890
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
891
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
891
892
|
});
|
|
892
893
|
register(browserContext);
|
|
893
894
|
} else {
|
|
@@ -895,9 +896,10 @@ const getRobotsTxtViaPlaywright = async (
|
|
|
895
896
|
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
896
897
|
browserInstance = await constants.launcher.launch(launchOptions);
|
|
897
898
|
register(browserInstance as unknown as { close: () => Promise<void> });
|
|
898
|
-
|
|
899
|
+
|
|
899
900
|
browserContext = await browserInstance.newContext({
|
|
900
901
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
902
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
901
903
|
});
|
|
902
904
|
}
|
|
903
905
|
|
|
@@ -975,7 +977,7 @@ export const isDisallowedInRobotsTxt = (url: string): boolean => {
|
|
|
975
977
|
|
|
976
978
|
export const getLinksFromSitemap = async (
|
|
977
979
|
sitemapUrl: string,
|
|
978
|
-
|
|
980
|
+
_maxLinksCount: number,
|
|
979
981
|
browser: string,
|
|
980
982
|
userDataDirectory: string,
|
|
981
983
|
userUrlInput: string,
|
|
@@ -985,9 +987,8 @@ export const getLinksFromSitemap = async (
|
|
|
985
987
|
userUrl: string = userUrlInput,
|
|
986
988
|
) => {
|
|
987
989
|
const scannedSitemaps = new Set<string>();
|
|
988
|
-
const
|
|
989
|
-
|
|
990
|
-
const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
|
|
990
|
+
const sitemapLinkCounts: Record<string, number> = {};
|
|
991
|
+
const allUrls = new Set<string>(); // all discovered URLs (lightweight strings)
|
|
991
992
|
|
|
992
993
|
const addToUrlList = (url: string) => {
|
|
993
994
|
if (!url) return;
|
|
@@ -995,17 +996,7 @@ export const getLinksFromSitemap = async (
|
|
|
995
996
|
if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy)) return;
|
|
996
997
|
|
|
997
998
|
url = convertPathToLocalFile(url);
|
|
998
|
-
|
|
999
|
-
let request;
|
|
1000
|
-
try {
|
|
1001
|
-
request = new Request({ url });
|
|
1002
|
-
} catch (e) {
|
|
1003
|
-
console.log('Error creating request', e);
|
|
1004
|
-
}
|
|
1005
|
-
if (isUrlPdf(url)) {
|
|
1006
|
-
request.skipNavigation = true;
|
|
1007
|
-
}
|
|
1008
|
-
urls[url] = request;
|
|
999
|
+
allUrls.add(url);
|
|
1009
1000
|
};
|
|
1010
1001
|
|
|
1011
1002
|
const calculateCloseness = (sitemapUrl: string) => {
|
|
@@ -1058,16 +1049,15 @@ export const getLinksFromSitemap = async (
|
|
|
1058
1049
|
});
|
|
1059
1050
|
}
|
|
1060
1051
|
|
|
1061
|
-
// Add
|
|
1062
|
-
for (const { url } of urlList
|
|
1052
|
+
// Add all URLs to the discovered list (limit applied later at return time)
|
|
1053
|
+
for (const { url } of urlList) {
|
|
1063
1054
|
addToUrlList(url);
|
|
1064
1055
|
}
|
|
1065
1056
|
};
|
|
1066
1057
|
|
|
1067
1058
|
const processNonStandardSitemap = (data: string) => {
|
|
1068
1059
|
const urlsFromData = crawlee
|
|
1069
|
-
.extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
|
|
1070
|
-
.slice(0, maxLinksCount);
|
|
1060
|
+
.extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') });
|
|
1071
1061
|
urlsFromData.forEach(url => {
|
|
1072
1062
|
addToUrlList(url);
|
|
1073
1063
|
});
|
|
@@ -1118,6 +1108,7 @@ export const getLinksFromSitemap = async (
|
|
|
1118
1108
|
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
1119
1109
|
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
1120
1110
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
1111
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
1121
1112
|
},
|
|
1122
1113
|
);
|
|
1123
1114
|
|
|
@@ -1127,9 +1118,10 @@ export const getLinksFromSitemap = async (
|
|
|
1127
1118
|
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
1128
1119
|
browserInstance = await constants.launcher.launch(launchOptions);
|
|
1129
1120
|
register(browserInstance as unknown as { close: () => Promise<void> });
|
|
1130
|
-
|
|
1121
|
+
|
|
1131
1122
|
browserContext = await browserInstance.newContext({
|
|
1132
1123
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
1124
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
1133
1125
|
});
|
|
1134
1126
|
}
|
|
1135
1127
|
|
|
@@ -1202,14 +1194,13 @@ export const getLinksFromSitemap = async (
|
|
|
1202
1194
|
sitemapType = constants.xmlSitemapTypes.unknown;
|
|
1203
1195
|
}
|
|
1204
1196
|
|
|
1197
|
+
const countBefore = allUrls.size;
|
|
1198
|
+
|
|
1205
1199
|
switch (sitemapType) {
|
|
1206
1200
|
case constants.xmlSitemapTypes.xmlIndex:
|
|
1207
1201
|
consoleLogger.info(`This is a XML format sitemap index.`);
|
|
1208
1202
|
for (const childSitemapUrl of $('loc')) {
|
|
1209
1203
|
const childSitemapUrlText = $(childSitemapUrl).text();
|
|
1210
|
-
if (isLimitReached()) {
|
|
1211
|
-
break;
|
|
1212
|
-
}
|
|
1213
1204
|
if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
|
|
1214
1205
|
await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
|
|
1215
1206
|
} else {
|
|
@@ -1233,6 +1224,11 @@ export const getLinksFromSitemap = async (
|
|
|
1233
1224
|
consoleLogger.info(`This is an unrecognised XML sitemap format.`);
|
|
1234
1225
|
processNonStandardSitemap(data);
|
|
1235
1226
|
}
|
|
1227
|
+
|
|
1228
|
+
const linksFromThisSitemap = allUrls.size - countBefore;
|
|
1229
|
+
if (linksFromThisSitemap > 0) {
|
|
1230
|
+
sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
|
|
1231
|
+
}
|
|
1236
1232
|
};
|
|
1237
1233
|
|
|
1238
1234
|
try {
|
|
@@ -1241,7 +1237,41 @@ export const getLinksFromSitemap = async (
|
|
|
1241
1237
|
consoleLogger.error(e);
|
|
1242
1238
|
}
|
|
1243
1239
|
|
|
1244
|
-
|
|
1240
|
+
// Build Request objects for all discovered URLs; the crawler itself enforces
|
|
1241
|
+
// maxRequestsPerCrawl by counting only successfully scanned pages.
|
|
1242
|
+
const requestList: Request[] = [];
|
|
1243
|
+
for (const url of allUrls) {
|
|
1244
|
+
try {
|
|
1245
|
+
const request = new Request({ url });
|
|
1246
|
+
if (isUrlPdf(url)) {
|
|
1247
|
+
request.skipNavigation = true;
|
|
1248
|
+
}
|
|
1249
|
+
requestList.push(request);
|
|
1250
|
+
} catch (e) {
|
|
1251
|
+
consoleLogger.info(`Error creating request for ${url}: ${e}`);
|
|
1252
|
+
}
|
|
1253
|
+
}
|
|
1254
|
+
|
|
1255
|
+
const totalLinksDiscovered = allUrls.size;
|
|
1256
|
+
const fetchedSitemaps = Object.entries(sitemapLinkCounts).map(([url, fetchedLinks]) => ({
|
|
1257
|
+
url,
|
|
1258
|
+
fetchedLinks,
|
|
1259
|
+
}));
|
|
1260
|
+
|
|
1261
|
+
const prev = constants.sitemapFetchedLinks;
|
|
1262
|
+
constants.sitemapFetchedLinks = {
|
|
1263
|
+
totalLinksFetchedFromSitemaps: (prev?.totalLinksFetchedFromSitemaps ?? 0) + totalLinksDiscovered,
|
|
1264
|
+
fetchedSitemaps: [...(prev?.fetchedSitemaps ?? []), ...fetchedSitemaps],
|
|
1265
|
+
};
|
|
1266
|
+
|
|
1267
|
+
if (totalLinksDiscovered > 0) {
|
|
1268
|
+
const breakdown = fetchedSitemaps
|
|
1269
|
+
.map(({ url, fetchedLinks }) => `${url} (${fetchedLinks})`)
|
|
1270
|
+
.join(', ');
|
|
1271
|
+
consoleLogger.info(
|
|
1272
|
+
`There are a total of ${totalLinksDiscovered} links found across ${breakdown}.`,
|
|
1273
|
+
);
|
|
1274
|
+
}
|
|
1245
1275
|
|
|
1246
1276
|
return requestList;
|
|
1247
1277
|
};
|
|
@@ -1406,6 +1436,36 @@ export const getEdgeData = (randomToken: string) => {
|
|
|
1406
1436
|
* @param {*} destDir destination directory
|
|
1407
1437
|
* @returns boolean indicating whether the operation was successful
|
|
1408
1438
|
*/
|
|
1439
|
+
/**
 * Synchronously copies `src` to `dest`, retrying when the copy fails with `EBUSY`.
 *
 * Only `EBUSY` (file temporarily locked by another process) is treated as
 * transient. Any other error, or an `EBUSY` on the last allowed attempt, is
 * logged and reported as a failure. The function never throws for copy errors.
 *
 * @param src path of the file to copy
 * @param dest path the file is copied to
 * @param maxRetries total number of copy attempts (not additional retries);
 *   with the default of 3 the waits between attempts are 100ms and 200ms
 * @returns `true` if the file was copied, `false` otherwise (including when
 *   `maxRetries` is less than 1, in which case no attempt is made)
 */
const copyFileWithRetry = (src: string, dest: string, maxRetries: number = 3): boolean => {
  // Blocks the current thread without spinning the CPU. The call always times
  // out because nothing else holds a reference to the buffer to notify it.
  const sleepSync = (ms: number): void => {
    Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
  };

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      fs.copyFileSync(src, dest);
      if (attempt > 1) {
        consoleLogger.info(`File copy succeeded on attempt ${attempt}: ${dest}`);
      }
      return true;
    } catch (err: unknown) {
      const code =
        typeof err === 'object' && err !== null && 'code' in err
          ? (err as { code?: unknown }).code
          : undefined;

      if (code === 'EBUSY' && attempt < maxRetries) {
        // Transient lock: exponential backoff (100ms, 200ms, 400ms, ...) capped at 1s
        const delayMs = Math.min(100 * 2 ** (attempt - 1), 1000);
        consoleLogger.warn(
          `File copy attempt ${attempt}/${maxRetries} failed with EBUSY. Retrying after ${delayMs}ms: ${dest}`,
        );
        sleepSync(delayMs);
        continue;
      }

      // Non-transient error or attempts exhausted: keep the cause visible in the
      // logs, since callers only report a generic failure message.
      const reason = err instanceof Error ? err.message : String(err);
      consoleLogger.error(`File copy failed (${src} -> ${dest}): ${reason}`);
      return false;
    }
  }
  return false;
};
|
|
1468
|
+
|
|
1409
1469
|
const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
|
|
1410
1470
|
let profileCookiesDir;
|
|
1411
1471
|
// Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
|
|
@@ -1445,23 +1505,9 @@ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, d
|
|
|
1445
1505
|
|
|
1446
1506
|
// Prevents duplicate cookies file if the cookies already exist
|
|
1447
1507
|
if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
consoleLogger.error(err);
|
|
1452
|
-
if (err.code === 'EBUSY') {
|
|
1453
|
-
console.log(
|
|
1454
|
-
`Unable to copy the file for ${profileName} because it is currently in use.`,
|
|
1455
|
-
);
|
|
1456
|
-
console.log(
|
|
1457
|
-
'Please close any applications that might be using this file and try again.',
|
|
1458
|
-
);
|
|
1459
|
-
} else {
|
|
1460
|
-
console.log(
|
|
1461
|
-
`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`,
|
|
1462
|
-
);
|
|
1463
|
-
}
|
|
1464
|
-
// printMessage([err], messageOptions);
|
|
1508
|
+
const destCookiesPath = path.join(destProfileDir, 'Cookies');
|
|
1509
|
+
if (!copyFileWithRetry(dir, destCookiesPath)) {
|
|
1510
|
+
consoleLogger.error(`Failed to copy Chrome profile cookies for ${profileName} after retries.`);
|
|
1465
1511
|
success = false;
|
|
1466
1512
|
}
|
|
1467
1513
|
}
|
|
@@ -1475,12 +1521,6 @@ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, d
|
|
|
1475
1521
|
return false;
|
|
1476
1522
|
};
|
|
1477
1523
|
|
|
1478
|
-
/**
|
|
1479
|
-
* Clone the Chrome profile cookie files to the destination directory
|
|
1480
|
-
* @param {*} options glob options object
|
|
1481
|
-
* @param {*} destDir destination directory
|
|
1482
|
-
* @returns boolean indicating whether the operation was successful
|
|
1483
|
-
*/
|
|
1484
1524
|
const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
|
|
1485
1525
|
let profileCookiesDir;
|
|
1486
1526
|
// Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
|
|
@@ -1521,21 +1561,9 @@ const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, des
|
|
|
1521
1561
|
|
|
1522
1562
|
// Prevents duplicate cookies file if the cookies already exist
|
|
1523
1563
|
if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
consoleLogger.error(err);
|
|
1528
|
-
if (err.code === 'EBUSY') {
|
|
1529
|
-
console.log(
|
|
1530
|
-
`Unable to copy the file for ${profileName} because it is currently in use.`,
|
|
1531
|
-
);
|
|
1532
|
-
console.log(
|
|
1533
|
-
'Please close any applications that might be using this file and try again.',
|
|
1534
|
-
);
|
|
1535
|
-
} else {
|
|
1536
|
-
console.log(`An unexpected error occurred while copying the file: ${err.message}`);
|
|
1537
|
-
}
|
|
1538
|
-
// printMessage([err], messageOptions);
|
|
1564
|
+
const destCookiesPath = path.join(destProfileDir, 'Cookies');
|
|
1565
|
+
if (!copyFileWithRetry(dir, destCookiesPath)) {
|
|
1566
|
+
consoleLogger.error(`Failed to copy Edge profile cookies for ${profileName} after retries.`);
|
|
1539
1567
|
success = false;
|
|
1540
1568
|
}
|
|
1541
1569
|
}
|
|
@@ -1566,19 +1594,9 @@ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: st
|
|
|
1566
1594
|
|
|
1567
1595
|
localState.forEach(dir => {
|
|
1568
1596
|
const profileName = dir.match(profileNamesRegex)[1];
|
|
1569
|
-
|
|
1570
|
-
|
|
1571
|
-
|
|
1572
|
-
consoleLogger.error(err);
|
|
1573
|
-
if (err.code === 'EBUSY') {
|
|
1574
|
-
console.log(`Unable to copy the file because it is currently in use.`);
|
|
1575
|
-
console.log('Please close any applications that might be using this file and try again.');
|
|
1576
|
-
} else {
|
|
1577
|
-
console.log(
|
|
1578
|
-
`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`,
|
|
1579
|
-
);
|
|
1580
|
-
}
|
|
1581
|
-
printMessage([err], messageOptions);
|
|
1597
|
+
const destPath = path.join(destDir, 'Local State');
|
|
1598
|
+
if (!copyFileWithRetry(dir, destPath)) {
|
|
1599
|
+
consoleLogger.error(`Failed to copy Local State file for ${profileName} after retries.`);
|
|
1582
1600
|
success = false;
|
|
1583
1601
|
}
|
|
1584
1602
|
});
|
|
@@ -1629,6 +1647,17 @@ export const cloneChromeProfiles = (randomToken: string): string => {
|
|
|
1629
1647
|
}
|
|
1630
1648
|
|
|
1631
1649
|
consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
|
|
1650
|
+
|
|
1651
|
+
// Fall back to a clean profile directory to avoid launch failures from partial clones.
|
|
1652
|
+
try {
|
|
1653
|
+
fs.rmSync(destDir, { recursive: true, force: true });
|
|
1654
|
+
fs.mkdirSync(destDir, { recursive: true });
|
|
1655
|
+
consoleLogger.warn('Using an empty cloned Chrome profile directory due to clone failure.');
|
|
1656
|
+
} catch (cleanupError) {
|
|
1657
|
+
consoleLogger.error(
|
|
1658
|
+
`Unable to reset cloned Chrome profile directory ${destDir}: ${cleanupError}`,
|
|
1659
|
+
);
|
|
1660
|
+
}
|
|
1632
1661
|
}
|
|
1633
1662
|
// For future reference, return a null instead to halt the scan
|
|
1634
1663
|
return destDir;
|
|
@@ -1697,6 +1726,15 @@ export const cloneEdgeProfiles = (randomToken: string): string => {
|
|
|
1697
1726
|
}
|
|
1698
1727
|
|
|
1699
1728
|
consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
|
|
1729
|
+
|
|
1730
|
+
// Fall back to a clean profile directory to avoid launch failures from partial clones.
|
|
1731
|
+
try {
|
|
1732
|
+
fs.rmSync(destDir, { recursive: true, force: true });
|
|
1733
|
+
fs.mkdirSync(destDir, { recursive: true });
|
|
1734
|
+
consoleLogger.warn('Using an empty cloned Edge profile directory due to clone failure.');
|
|
1735
|
+
} catch (cleanupError) {
|
|
1736
|
+
consoleLogger.error(`Unable to reset cloned Edge profile directory ${destDir}: ${cleanupError}`);
|
|
1737
|
+
}
|
|
1700
1738
|
}
|
|
1701
1739
|
|
|
1702
1740
|
// For future reference, return a null instead to halt the scan
|
|
@@ -1725,7 +1763,14 @@ export const deleteClonedChromeProfiles = (randomToken?: string): void => {
|
|
|
1725
1763
|
}
|
|
1726
1764
|
let destDir: string[];
|
|
1727
1765
|
if (randomToken) {
|
|
1728
|
-
|
|
1766
|
+
// Also match _pool* directories created by browser pool re-launches
|
|
1767
|
+
destDir = globSync(`oobee-${randomToken}*`, {
|
|
1768
|
+
cwd: baseDir,
|
|
1769
|
+
absolute: true,
|
|
1770
|
+
});
|
|
1771
|
+
if (destDir.length === 0) {
|
|
1772
|
+
destDir = [`${baseDir}/oobee-${randomToken}`];
|
|
1773
|
+
}
|
|
1729
1774
|
} else {
|
|
1730
1775
|
// Find all the oobee directories in the Chrome data directory
|
|
1731
1776
|
destDir = globSync('**/oobee*', {
|
|
@@ -1766,9 +1811,16 @@ export const deleteClonedEdgeProfiles = (randomToken?: string): void => {
|
|
|
1766
1811
|
}
|
|
1767
1812
|
let destDir: string[];
|
|
1768
1813
|
if (randomToken) {
|
|
1769
|
-
|
|
1814
|
+
// Also match _pool* directories created by browser pool re-launches
|
|
1815
|
+
destDir = globSync(`oobee-${randomToken}*`, {
|
|
1816
|
+
cwd: baseDir,
|
|
1817
|
+
absolute: true,
|
|
1818
|
+
});
|
|
1819
|
+
if (destDir.length === 0) {
|
|
1820
|
+
destDir = [`${baseDir}/oobee-${randomToken}`];
|
|
1821
|
+
}
|
|
1770
1822
|
} else {
|
|
1771
|
-
// Find all the oobee directories in the
|
|
1823
|
+
// Find all the oobee directories in the Edge data directory
|
|
1772
1824
|
destDir = globSync('**/oobee*', {
|
|
1773
1825
|
cwd: baseDir,
|
|
1774
1826
|
absolute: true,
|
|
@@ -946,6 +946,7 @@ export default {
|
|
|
946
946
|
a11yRuleShortDescriptionMap,
|
|
947
947
|
disabilityBadgesMap,
|
|
948
948
|
robotsTxtUrls: null,
|
|
949
|
+
sitemapFetchedLinks: null as { totalLinksFetchedFromSitemaps: number; fetchedSitemaps: { url: string; fetchedLinks: number }[] } | null,
|
|
949
950
|
userDataDirectory: null, // This will be set later in the code
|
|
950
951
|
randomToken: null, // This will be set later in the code
|
|
951
952
|
// Track all active Crawlee / Playwright resources for cleanup
|
|
@@ -874,6 +874,13 @@ export const runAxeScript = async ({
|
|
|
874
874
|
const browserContext: BrowserContext = page.context();
|
|
875
875
|
const requestUrl = page.url();
|
|
876
876
|
|
|
877
|
+
let pageTitle: string | null = null;
|
|
878
|
+
try {
|
|
879
|
+
pageTitle = await page.evaluate(() => document.title);
|
|
880
|
+
} catch {
|
|
881
|
+
// Page may already be in a bad state; title will remain null
|
|
882
|
+
}
|
|
883
|
+
|
|
877
884
|
try {
|
|
878
885
|
// Checking for DOM mutations before proceeding to scan
|
|
879
886
|
await page.evaluate(() => {
|
|
@@ -1012,7 +1019,42 @@ export const runAxeScript = async ({
|
|
|
1012
1019
|
.run(selectors, {
|
|
1013
1020
|
resultTypes: defaultResultTypes,
|
|
1014
1021
|
})
|
|
1015
|
-
.then(results => {
|
|
1022
|
+
.then(async results => {
|
|
1023
|
+
// Re-verify aria-hidden-focus violations against the live DOM to
|
|
1024
|
+
// handle race conditions with JS that sets tabindex="-1" after
|
|
1025
|
+
// aria-hidden (common in carousel/slider libraries like slick)
|
|
1026
|
+
const ariaHiddenViolation = results.violations.find(
|
|
1027
|
+
v => v.id === 'aria-hidden-focus',
|
|
1028
|
+
);
|
|
1029
|
+
if (ariaHiddenViolation) {
|
|
1030
|
+
await new Promise(resolve => setTimeout(resolve, 0));
|
|
1031
|
+
ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(node => {
|
|
1032
|
+
const selector = node.target && node.target[0];
|
|
1033
|
+
if (typeof selector !== 'string') return true;
|
|
1034
|
+
try {
|
|
1035
|
+
const el = document.querySelector(selector);
|
|
1036
|
+
if (!el) return true;
|
|
1037
|
+
const focusables = el.querySelectorAll(
|
|
1038
|
+
'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]',
|
|
1039
|
+
);
|
|
1040
|
+
if (focusables.length === 0) return false;
|
|
1041
|
+
return Array.from(focusables).some(child => {
|
|
1042
|
+
const tabindex = child.getAttribute('tabindex');
|
|
1043
|
+
if (tabindex === null) return true;
|
|
1044
|
+
const parsed = parseInt(tabindex, 10);
|
|
1045
|
+
return isNaN(parsed) || parsed >= 0;
|
|
1046
|
+
});
|
|
1047
|
+
} catch {
|
|
1048
|
+
return true;
|
|
1049
|
+
}
|
|
1050
|
+
});
|
|
1051
|
+
if (ariaHiddenViolation.nodes.length === 0) {
|
|
1052
|
+
results.violations = results.violations.filter(
|
|
1053
|
+
v => v.id !== 'aria-hidden-focus',
|
|
1054
|
+
);
|
|
1055
|
+
}
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1016
1058
|
if (disableOobee) {
|
|
1017
1059
|
return results;
|
|
1018
1060
|
}
|
|
@@ -1086,19 +1128,6 @@ export const runAxeScript = async ({
|
|
|
1086
1128
|
results.incomplete = await takeScreenshotForHTMLElements(results.incomplete, page, randomToken);
|
|
1087
1129
|
}
|
|
1088
1130
|
|
|
1089
|
-
let pageTitle = null;
|
|
1090
|
-
try {
|
|
1091
|
-
pageTitle = await page.evaluate(() => document.title);
|
|
1092
|
-
} catch (e) {
|
|
1093
|
-
consoleLogger.info(`Error while getting page title: ${e}`);
|
|
1094
|
-
if (page.isClosed()) {
|
|
1095
|
-
consoleLogger.info(`Page was closed for ${requestUrl}, creating new page`);
|
|
1096
|
-
page = await browserContext.newPage();
|
|
1097
|
-
await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
|
|
1098
|
-
pageTitle = await page.evaluate(() => document.title);
|
|
1099
|
-
}
|
|
1100
|
-
}
|
|
1101
|
-
|
|
1102
1131
|
return filterAxeResults(results, pageTitle, customFlowDetails);
|
|
1103
1132
|
};
|
|
1104
1133
|
|
|
@@ -1130,6 +1159,66 @@ export const postNavigationHooks = [
|
|
|
1130
1159
|
},
|
|
1131
1160
|
];
|
|
1132
1161
|
|
|
1162
|
+
/**
 * Creates a pre-launch hook that assigns a user-data directory to each browser launch.
 *
 * The first launch uses `userDataDirectory` as given. Each later launch gets
 * its own sibling directory (`<userDataDirectory>_pool<N>`) so that lingering
 * file handles from a retired browser don't cause Chrome exit code 21 on
 * Windows. For those later launches the base profile is cloned on a
 * best-effort basis so authenticated sessions carry over; any file that cannot
 * be copied is skipped silently.
 *
 * @param userDataDirectory base profile directory used by the first launch
 * @returns an async hook `(pageId, launchContext)` that prepares the directory,
 *   removes known lock files from it, and sets `launchContext.userDataDir`
 */
export const getPreLaunchHook = (userDataDirectory: string) => {
  let launchCount = 0;

  // A trailing separator would turn `<base>_pool2` into `<base>/_pool2`, i.e. a
  // directory inside the one being cloned, which makes the clone recurse into
  // itself. Strip separators for the pool suffix only; keep a lone root intact.
  let baseEnd = userDataDirectory.length;
  while (
    baseEnd > 1 &&
    (userDataDirectory[baseEnd - 1] === '/' || userDataDirectory[baseEnd - 1] === path.sep)
  ) {
    baseEnd -= 1;
  }
  const poolBaseDir = userDataDirectory.slice(0, baseEnd);

  // Lock artefacts are never cloned; copying them would carry a stale lock over.
  const isLockEntry = (entry: string): boolean =>
    entry.startsWith('Singleton') || entry === 'lockfile' || entry === 'LOCK';

  // NOTE(review): `launchContext` stays `any` to keep the hook's signature
  // unchanged for callers; only its `userDataDir` property is written here.
  return async (_pageId: string, launchContext: any) => {
    // Named exports of the module namespace are used directly, so this does not
    // depend on default-export interop.
    const fsp = await import('fs/promises');
    launchCount += 1;

    const effectiveDir =
      launchCount === 1 ? userDataDirectory : `${poolBaseDir}_pool${launchCount}`;

    await fsp.mkdir(effectiveDir, { recursive: true });

    if (launchCount > 1) {
      // Best-effort recursive copy: every filesystem call swallows its own
      // error, so one locked or vanished file never aborts the rest.
      const copyRecursive = async (src: string, dest: string): Promise<void> => {
        const stat = await fsp.stat(src).catch(() => null);
        if (!stat) return;

        if (!stat.isDirectory()) {
          await fsp.copyFile(src, dest).catch(() => {});
          return;
        }

        await fsp.mkdir(dest, { recursive: true }).catch(() => {});
        const entries: string[] = await fsp.readdir(src).catch(() => []);
        await Promise.all(
          entries
            .filter(entry => !isLockEntry(entry))
            .map(entry => copyRecursive(path.join(src, entry), path.join(dest, entry))),
        );
      };

      // Silent fallback: an empty profile is used if the clone fails.
      await copyRecursive(userDataDirectory, effectiveDir).catch(() => {});
    }

    // Clean any stale lock files that may block browser launches on Windows
    const lockFiles = [
      path.join(effectiveDir, 'SingletonLock'),
      path.join(effectiveDir, 'SingletonSocket'),
      path.join(effectiveDir, 'SingletonCookie'),
      path.join(effectiveDir, 'lockfile'),
      path.join(effectiveDir, 'Default', 'LOCK'),
      path.join(effectiveDir, 'Default', 'Network', 'LOCK'),
    ];
    await Promise.all(lockFiles.map(file => fsp.rm(file, { force: true }).catch(() => {})));

    // eslint-disable-next-line no-param-reassign
    launchContext.userDataDir = effectiveDir;
  };
};
|
|
1221
|
+
|
|
1133
1222
|
export const failedRequestHandler = async ({ request }: { request: Request }) => {
|
|
1134
1223
|
guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: 0, urlScanned: request.url });
|
|
1135
1224
|
log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|