@govtechsg/oobee 0.10.91 → 0.10.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +303 -0
- package/README.md +22 -0
- package/dist/cli.js +3 -0
- package/dist/combine.js +15 -3
- package/dist/constants/cliFunctions.js +7 -0
- package/dist/constants/common.js +149 -80
- package/dist/constants/constants.js +1 -0
- package/dist/crawlers/commonCrawlerFunc.js +136 -15
- package/dist/crawlers/crawlDomain.js +55 -58
- package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
- package/dist/crawlers/crawlRateController.js +47 -0
- package/dist/crawlers/crawlSitemap.js +51 -62
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/generateOobeeClientScanner.js +32 -1
- package/dist/mergeAxeResults/itemsStore.js +32 -3
- package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
- package/dist/mergeAxeResults.js +120 -92
- package/dist/npmIndex.js +1 -0
- package/dist/utils.js +23 -28
- package/oobee-client-scanner.js +35 -4
- package/package.json +3 -3
- package/src/cli.ts +4 -0
- package/src/combine.ts +16 -1
- package/src/constants/cliFunctions.ts +7 -0
- package/src/constants/common.ts +162 -90
- package/src/constants/constants.ts +1 -0
- package/src/crawlers/commonCrawlerFunc.ts +148 -14
- package/src/crawlers/crawlDomain.ts +64 -66
- package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
- package/src/crawlers/crawlRateController.ts +63 -0
- package/src/crawlers/crawlSitemap.ts +57 -70
- package/src/crawlers/runCustom.ts +10 -1
- package/src/generateOobeeClientScanner.ts +32 -1
- package/src/index.ts +1 -0
- package/src/mergeAxeResults/itemsStore.ts +37 -3
- package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
- package/src/mergeAxeResults.ts +139 -99
- package/src/npmIndex.ts +1 -0
- package/src/utils.ts +25 -33
- /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0
package/src/constants/common.ts
CHANGED
|
@@ -377,9 +377,21 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
377
377
|
} = rawDevice;
|
|
378
378
|
|
|
379
379
|
const launchOptions = getPlaywrightLaunchOptions(browserToRun);
|
|
380
|
+
|
|
381
|
+
const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
|
|
382
|
+
let httpCredentials = undefined;
|
|
383
|
+
if (Authorization?.startsWith('Basic ')) {
|
|
384
|
+
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
385
|
+
const colonIdx = decoded.indexOf(':');
|
|
386
|
+
if (colonIdx > 0) {
|
|
387
|
+
httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
|
|
380
391
|
const contextOptions: Record<string, unknown> = {
|
|
381
392
|
...restDevice,
|
|
382
|
-
...(
|
|
393
|
+
...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
|
|
394
|
+
...(httpCredentials && { httpCredentials }),
|
|
383
395
|
ignoreHTTPSErrors: true,
|
|
384
396
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
385
397
|
};
|
|
@@ -421,6 +433,24 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
421
433
|
}
|
|
422
434
|
|
|
423
435
|
try {
|
|
436
|
+
// Only enable generic Authorization header routing interception broadly if
|
|
437
|
+
// a non-Basic Bearer auth string is heavily relied upon, thereby bypassing
|
|
438
|
+
// performance warnings inside the check checkUrl phase for typical public scans
|
|
439
|
+
if (Authorization && !httpCredentials) {
|
|
440
|
+
const entryOrigin = new URL(url).origin;
|
|
441
|
+
await browserContext.route('**/*', async (route: any, request: any) => {
|
|
442
|
+
try {
|
|
443
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
444
|
+
await route.continue({ headers: { ...request.headers(), Authorization } });
|
|
445
|
+
} else {
|
|
446
|
+
await route.continue();
|
|
447
|
+
}
|
|
448
|
+
} catch {
|
|
449
|
+
await route.continue();
|
|
450
|
+
}
|
|
451
|
+
});
|
|
452
|
+
}
|
|
453
|
+
|
|
424
454
|
const page = await browserContext.newPage();
|
|
425
455
|
|
|
426
456
|
// Block native Chrome download UI
|
|
@@ -431,16 +461,6 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
431
461
|
consoleLogger.info(`Unable to set download deny: ${(e as Error).message}`);
|
|
432
462
|
}
|
|
433
463
|
|
|
434
|
-
// OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
|
|
435
|
-
// This allows the "Connectivity Check" to pass as soon as HTML is ready
|
|
436
|
-
await page.route('**/*', (route) => {
|
|
437
|
-
const type = route.request().resourceType();
|
|
438
|
-
if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
|
|
439
|
-
return route.abort();
|
|
440
|
-
}
|
|
441
|
-
return route.continue();
|
|
442
|
-
});
|
|
443
|
-
|
|
444
464
|
// STEP 2: Navigate (follows server-side redirects)
|
|
445
465
|
page.once('download', () => {
|
|
446
466
|
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
@@ -888,6 +908,7 @@ const getRobotsTxtViaPlaywright = async (
|
|
|
888
908
|
browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
|
|
889
909
|
...getPlaywrightLaunchOptions(browser),
|
|
890
910
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
911
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
891
912
|
});
|
|
892
913
|
register(browserContext);
|
|
893
914
|
} else {
|
|
@@ -895,9 +916,10 @@ const getRobotsTxtViaPlaywright = async (
|
|
|
895
916
|
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
896
917
|
browserInstance = await constants.launcher.launch(launchOptions);
|
|
897
918
|
register(browserInstance as unknown as { close: () => Promise<void> });
|
|
898
|
-
|
|
919
|
+
|
|
899
920
|
browserContext = await browserInstance.newContext({
|
|
900
921
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
922
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
901
923
|
});
|
|
902
924
|
}
|
|
903
925
|
|
|
@@ -975,7 +997,7 @@ export const isDisallowedInRobotsTxt = (url: string): boolean => {
|
|
|
975
997
|
|
|
976
998
|
export const getLinksFromSitemap = async (
|
|
977
999
|
sitemapUrl: string,
|
|
978
|
-
|
|
1000
|
+
_maxLinksCount: number,
|
|
979
1001
|
browser: string,
|
|
980
1002
|
userDataDirectory: string,
|
|
981
1003
|
userUrlInput: string,
|
|
@@ -985,9 +1007,8 @@ export const getLinksFromSitemap = async (
|
|
|
985
1007
|
userUrl: string = userUrlInput,
|
|
986
1008
|
) => {
|
|
987
1009
|
const scannedSitemaps = new Set<string>();
|
|
988
|
-
const
|
|
989
|
-
|
|
990
|
-
const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
|
|
1010
|
+
const sitemapLinkCounts: Record<string, number> = {};
|
|
1011
|
+
const allUrls = new Set<string>(); // all discovered URLs (lightweight strings)
|
|
991
1012
|
|
|
992
1013
|
const addToUrlList = (url: string) => {
|
|
993
1014
|
if (!url) return;
|
|
@@ -995,17 +1016,7 @@ export const getLinksFromSitemap = async (
|
|
|
995
1016
|
if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy)) return;
|
|
996
1017
|
|
|
997
1018
|
url = convertPathToLocalFile(url);
|
|
998
|
-
|
|
999
|
-
let request;
|
|
1000
|
-
try {
|
|
1001
|
-
request = new Request({ url });
|
|
1002
|
-
} catch (e) {
|
|
1003
|
-
console.log('Error creating request', e);
|
|
1004
|
-
}
|
|
1005
|
-
if (isUrlPdf(url)) {
|
|
1006
|
-
request.skipNavigation = true;
|
|
1007
|
-
}
|
|
1008
|
-
urls[url] = request;
|
|
1019
|
+
allUrls.add(url);
|
|
1009
1020
|
};
|
|
1010
1021
|
|
|
1011
1022
|
const calculateCloseness = (sitemapUrl: string) => {
|
|
@@ -1058,16 +1069,15 @@ export const getLinksFromSitemap = async (
|
|
|
1058
1069
|
});
|
|
1059
1070
|
}
|
|
1060
1071
|
|
|
1061
|
-
// Add
|
|
1062
|
-
for (const { url } of urlList
|
|
1072
|
+
// Add all URLs to the discovered list (limit applied later at return time)
|
|
1073
|
+
for (const { url } of urlList) {
|
|
1063
1074
|
addToUrlList(url);
|
|
1064
1075
|
}
|
|
1065
1076
|
};
|
|
1066
1077
|
|
|
1067
1078
|
const processNonStandardSitemap = (data: string) => {
|
|
1068
1079
|
const urlsFromData = crawlee
|
|
1069
|
-
.extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
|
|
1070
|
-
.slice(0, maxLinksCount);
|
|
1080
|
+
.extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') });
|
|
1071
1081
|
urlsFromData.forEach(url => {
|
|
1072
1082
|
addToUrlList(url);
|
|
1073
1083
|
});
|
|
@@ -1118,6 +1128,7 @@ export const getLinksFromSitemap = async (
|
|
|
1118
1128
|
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
1119
1129
|
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
1120
1130
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
1131
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
1121
1132
|
},
|
|
1122
1133
|
);
|
|
1123
1134
|
|
|
@@ -1127,9 +1138,10 @@ export const getLinksFromSitemap = async (
|
|
|
1127
1138
|
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
1128
1139
|
browserInstance = await constants.launcher.launch(launchOptions);
|
|
1129
1140
|
register(browserInstance as unknown as { close: () => Promise<void> });
|
|
1130
|
-
|
|
1141
|
+
|
|
1131
1142
|
browserContext = await browserInstance.newContext({
|
|
1132
1143
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
1144
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
1133
1145
|
});
|
|
1134
1146
|
}
|
|
1135
1147
|
|
|
@@ -1202,14 +1214,13 @@ export const getLinksFromSitemap = async (
|
|
|
1202
1214
|
sitemapType = constants.xmlSitemapTypes.unknown;
|
|
1203
1215
|
}
|
|
1204
1216
|
|
|
1217
|
+
const countBefore = allUrls.size;
|
|
1218
|
+
|
|
1205
1219
|
switch (sitemapType) {
|
|
1206
1220
|
case constants.xmlSitemapTypes.xmlIndex:
|
|
1207
1221
|
consoleLogger.info(`This is a XML format sitemap index.`);
|
|
1208
1222
|
for (const childSitemapUrl of $('loc')) {
|
|
1209
1223
|
const childSitemapUrlText = $(childSitemapUrl).text();
|
|
1210
|
-
if (isLimitReached()) {
|
|
1211
|
-
break;
|
|
1212
|
-
}
|
|
1213
1224
|
if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
|
|
1214
1225
|
await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
|
|
1215
1226
|
} else {
|
|
@@ -1233,6 +1244,11 @@ export const getLinksFromSitemap = async (
|
|
|
1233
1244
|
consoleLogger.info(`This is an unrecognised XML sitemap format.`);
|
|
1234
1245
|
processNonStandardSitemap(data);
|
|
1235
1246
|
}
|
|
1247
|
+
|
|
1248
|
+
const linksFromThisSitemap = allUrls.size - countBefore;
|
|
1249
|
+
if (linksFromThisSitemap > 0) {
|
|
1250
|
+
sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
|
|
1251
|
+
}
|
|
1236
1252
|
};
|
|
1237
1253
|
|
|
1238
1254
|
try {
|
|
@@ -1241,7 +1257,41 @@ export const getLinksFromSitemap = async (
|
|
|
1241
1257
|
consoleLogger.error(e);
|
|
1242
1258
|
}
|
|
1243
1259
|
|
|
1244
|
-
|
|
1260
|
+
// Build Request objects for all discovered URLs; the crawler itself enforces
|
|
1261
|
+
// maxRequestsPerCrawl by counting only successfully scanned pages.
|
|
1262
|
+
const requestList: Request[] = [];
|
|
1263
|
+
for (const url of allUrls) {
|
|
1264
|
+
try {
|
|
1265
|
+
const request = new Request({ url });
|
|
1266
|
+
if (isUrlPdf(url)) {
|
|
1267
|
+
request.skipNavigation = true;
|
|
1268
|
+
}
|
|
1269
|
+
requestList.push(request);
|
|
1270
|
+
} catch (e) {
|
|
1271
|
+
consoleLogger.info(`Error creating request for ${url}: ${e}`);
|
|
1272
|
+
}
|
|
1273
|
+
}
|
|
1274
|
+
|
|
1275
|
+
const totalLinksDiscovered = allUrls.size;
|
|
1276
|
+
const fetchedSitemaps = Object.entries(sitemapLinkCounts).map(([url, fetchedLinks]) => ({
|
|
1277
|
+
url,
|
|
1278
|
+
fetchedLinks,
|
|
1279
|
+
}));
|
|
1280
|
+
|
|
1281
|
+
const prev = constants.sitemapFetchedLinks;
|
|
1282
|
+
constants.sitemapFetchedLinks = {
|
|
1283
|
+
totalLinksFetchedFromSitemaps: (prev?.totalLinksFetchedFromSitemaps ?? 0) + totalLinksDiscovered,
|
|
1284
|
+
fetchedSitemaps: [...(prev?.fetchedSitemaps ?? []), ...fetchedSitemaps],
|
|
1285
|
+
};
|
|
1286
|
+
|
|
1287
|
+
if (totalLinksDiscovered > 0) {
|
|
1288
|
+
const breakdown = fetchedSitemaps
|
|
1289
|
+
.map(({ url, fetchedLinks }) => `${url} (${fetchedLinks})`)
|
|
1290
|
+
.join(', ');
|
|
1291
|
+
consoleLogger.info(
|
|
1292
|
+
`There are a total of ${totalLinksDiscovered} links found across ${breakdown}.`,
|
|
1293
|
+
);
|
|
1294
|
+
}
|
|
1245
1295
|
|
|
1246
1296
|
return requestList;
|
|
1247
1297
|
};
|
|
@@ -1406,6 +1456,36 @@ export const getEdgeData = (randomToken: string) => {
|
|
|
1406
1456
|
* @param {*} destDir destination directory
|
|
1407
1457
|
* @returns boolean indicating whether the operation was successful
|
|
1408
1458
|
*/
|
|
1459
|
+
// Helper to copy a file with retry logic for transient EBUSY errors
|
|
1460
|
+
const copyFileWithRetry = (src: string, dest: string, maxRetries: number = 3): boolean => {
|
|
1461
|
+
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
|
1462
|
+
try {
|
|
1463
|
+
fs.copyFileSync(src, dest);
|
|
1464
|
+
if (attempt > 1) {
|
|
1465
|
+
consoleLogger.info(`File copy succeeded on attempt ${attempt}: ${dest}`);
|
|
1466
|
+
}
|
|
1467
|
+
return true;
|
|
1468
|
+
} catch (err: any) {
|
|
1469
|
+
if (err.code === 'EBUSY' && attempt < maxRetries) {
|
|
1470
|
+
// Transient lock — wait and retry
|
|
1471
|
+
const delayMs = Math.min(100 * Math.pow(2, attempt - 1), 1000); // Exponential backoff: 100ms, 200ms, 400ms, capped at 1s
|
|
1472
|
+
consoleLogger.warn(
|
|
1473
|
+
`File copy attempt ${attempt}/${maxRetries} failed with EBUSY. Retrying after ${delayMs}ms: ${dest}`,
|
|
1474
|
+
);
|
|
1475
|
+
// Synchronous sleep via busy-wait (not ideal but avoids promise complications in sync context)
|
|
1476
|
+
const endTime = Date.now() + delayMs;
|
|
1477
|
+
while (Date.now() < endTime) {
|
|
1478
|
+
// Busy wait
|
|
1479
|
+
}
|
|
1480
|
+
continue; // Retry
|
|
1481
|
+
}
|
|
1482
|
+
// Non-transient error or max retries reached
|
|
1483
|
+
return false;
|
|
1484
|
+
}
|
|
1485
|
+
}
|
|
1486
|
+
return false;
|
|
1487
|
+
};
|
|
1488
|
+
|
|
1409
1489
|
const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
|
|
1410
1490
|
let profileCookiesDir;
|
|
1411
1491
|
// Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
|
|
@@ -1445,23 +1525,9 @@ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, d
|
|
|
1445
1525
|
|
|
1446
1526
|
// Prevents duplicate cookies file if the cookies already exist
|
|
1447
1527
|
if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
consoleLogger.error(err);
|
|
1452
|
-
if (err.code === 'EBUSY') {
|
|
1453
|
-
console.log(
|
|
1454
|
-
`Unable to copy the file for ${profileName} because it is currently in use.`,
|
|
1455
|
-
);
|
|
1456
|
-
console.log(
|
|
1457
|
-
'Please close any applications that might be using this file and try again.',
|
|
1458
|
-
);
|
|
1459
|
-
} else {
|
|
1460
|
-
console.log(
|
|
1461
|
-
`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`,
|
|
1462
|
-
);
|
|
1463
|
-
}
|
|
1464
|
-
// printMessage([err], messageOptions);
|
|
1528
|
+
const destCookiesPath = path.join(destProfileDir, 'Cookies');
|
|
1529
|
+
if (!copyFileWithRetry(dir, destCookiesPath)) {
|
|
1530
|
+
consoleLogger.error(`Failed to copy Chrome profile cookies for ${profileName} after retries.`);
|
|
1465
1531
|
success = false;
|
|
1466
1532
|
}
|
|
1467
1533
|
}
|
|
@@ -1475,12 +1541,6 @@ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, d
|
|
|
1475
1541
|
return false;
|
|
1476
1542
|
};
|
|
1477
1543
|
|
|
1478
|
-
/**
|
|
1479
|
-
* Clone the Chrome profile cookie files to the destination directory
|
|
1480
|
-
* @param {*} options glob options object
|
|
1481
|
-
* @param {*} destDir destination directory
|
|
1482
|
-
* @returns boolean indicating whether the operation was successful
|
|
1483
|
-
*/
|
|
1484
1544
|
const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
|
|
1485
1545
|
let profileCookiesDir;
|
|
1486
1546
|
// Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
|
|
@@ -1521,21 +1581,9 @@ const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, des
|
|
|
1521
1581
|
|
|
1522
1582
|
// Prevents duplicate cookies file if the cookies already exist
|
|
1523
1583
|
if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
consoleLogger.error(err);
|
|
1528
|
-
if (err.code === 'EBUSY') {
|
|
1529
|
-
console.log(
|
|
1530
|
-
`Unable to copy the file for ${profileName} because it is currently in use.`,
|
|
1531
|
-
);
|
|
1532
|
-
console.log(
|
|
1533
|
-
'Please close any applications that might be using this file and try again.',
|
|
1534
|
-
);
|
|
1535
|
-
} else {
|
|
1536
|
-
console.log(`An unexpected error occurred while copying the file: ${err.message}`);
|
|
1537
|
-
}
|
|
1538
|
-
// printMessage([err], messageOptions);
|
|
1584
|
+
const destCookiesPath = path.join(destProfileDir, 'Cookies');
|
|
1585
|
+
if (!copyFileWithRetry(dir, destCookiesPath)) {
|
|
1586
|
+
consoleLogger.error(`Failed to copy Edge profile cookies for ${profileName} after retries.`);
|
|
1539
1587
|
success = false;
|
|
1540
1588
|
}
|
|
1541
1589
|
}
|
|
@@ -1566,19 +1614,9 @@ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: st
|
|
|
1566
1614
|
|
|
1567
1615
|
localState.forEach(dir => {
|
|
1568
1616
|
const profileName = dir.match(profileNamesRegex)[1];
|
|
1569
|
-
|
|
1570
|
-
|
|
1571
|
-
|
|
1572
|
-
consoleLogger.error(err);
|
|
1573
|
-
if (err.code === 'EBUSY') {
|
|
1574
|
-
console.log(`Unable to copy the file because it is currently in use.`);
|
|
1575
|
-
console.log('Please close any applications that might be using this file and try again.');
|
|
1576
|
-
} else {
|
|
1577
|
-
console.log(
|
|
1578
|
-
`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`,
|
|
1579
|
-
);
|
|
1580
|
-
}
|
|
1581
|
-
printMessage([err], messageOptions);
|
|
1617
|
+
const destPath = path.join(destDir, 'Local State');
|
|
1618
|
+
if (!copyFileWithRetry(dir, destPath)) {
|
|
1619
|
+
consoleLogger.error(`Failed to copy Local State file for ${profileName} after retries.`);
|
|
1582
1620
|
success = false;
|
|
1583
1621
|
}
|
|
1584
1622
|
});
|
|
@@ -1629,6 +1667,17 @@ export const cloneChromeProfiles = (randomToken: string): string => {
|
|
|
1629
1667
|
}
|
|
1630
1668
|
|
|
1631
1669
|
consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
|
|
1670
|
+
|
|
1671
|
+
// Fall back to a clean profile directory to avoid launch failures from partial clones.
|
|
1672
|
+
try {
|
|
1673
|
+
fs.rmSync(destDir, { recursive: true, force: true });
|
|
1674
|
+
fs.mkdirSync(destDir, { recursive: true });
|
|
1675
|
+
consoleLogger.warn('Using an empty cloned Chrome profile directory due to clone failure.');
|
|
1676
|
+
} catch (cleanupError) {
|
|
1677
|
+
consoleLogger.error(
|
|
1678
|
+
`Unable to reset cloned Chrome profile directory ${destDir}: ${cleanupError}`,
|
|
1679
|
+
);
|
|
1680
|
+
}
|
|
1632
1681
|
}
|
|
1633
1682
|
// For future reference, return a null instead to halt the scan
|
|
1634
1683
|
return destDir;
|
|
@@ -1697,6 +1746,15 @@ export const cloneEdgeProfiles = (randomToken: string): string => {
|
|
|
1697
1746
|
}
|
|
1698
1747
|
|
|
1699
1748
|
consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
|
|
1749
|
+
|
|
1750
|
+
// Fall back to a clean profile directory to avoid launch failures from partial clones.
|
|
1751
|
+
try {
|
|
1752
|
+
fs.rmSync(destDir, { recursive: true, force: true });
|
|
1753
|
+
fs.mkdirSync(destDir, { recursive: true });
|
|
1754
|
+
consoleLogger.warn('Using an empty cloned Edge profile directory due to clone failure.');
|
|
1755
|
+
} catch (cleanupError) {
|
|
1756
|
+
consoleLogger.error(`Unable to reset cloned Edge profile directory ${destDir}: ${cleanupError}`);
|
|
1757
|
+
}
|
|
1700
1758
|
}
|
|
1701
1759
|
|
|
1702
1760
|
// For future reference, return a null instead to halt the scan
|
|
@@ -1725,7 +1783,14 @@ export const deleteClonedChromeProfiles = (randomToken?: string): void => {
|
|
|
1725
1783
|
}
|
|
1726
1784
|
let destDir: string[];
|
|
1727
1785
|
if (randomToken) {
|
|
1728
|
-
|
|
1786
|
+
// Also match _pool* directories created by browser pool re-launches
|
|
1787
|
+
destDir = globSync(`oobee-${randomToken}*`, {
|
|
1788
|
+
cwd: baseDir,
|
|
1789
|
+
absolute: true,
|
|
1790
|
+
});
|
|
1791
|
+
if (destDir.length === 0) {
|
|
1792
|
+
destDir = [`${baseDir}/oobee-${randomToken}`];
|
|
1793
|
+
}
|
|
1729
1794
|
} else {
|
|
1730
1795
|
// Find all the oobee directories in the Chrome data directory
|
|
1731
1796
|
destDir = globSync('**/oobee*', {
|
|
@@ -1766,9 +1831,16 @@ export const deleteClonedEdgeProfiles = (randomToken?: string): void => {
|
|
|
1766
1831
|
}
|
|
1767
1832
|
let destDir: string[];
|
|
1768
1833
|
if (randomToken) {
|
|
1769
|
-
|
|
1834
|
+
// Also match _pool* directories created by browser pool re-launches
|
|
1835
|
+
destDir = globSync(`oobee-${randomToken}*`, {
|
|
1836
|
+
cwd: baseDir,
|
|
1837
|
+
absolute: true,
|
|
1838
|
+
});
|
|
1839
|
+
if (destDir.length === 0) {
|
|
1840
|
+
destDir = [`${baseDir}/oobee-${randomToken}`];
|
|
1841
|
+
}
|
|
1770
1842
|
} else {
|
|
1771
|
-
// Find all the oobee directories in the
|
|
1843
|
+
// Find all the oobee directories in the Edge data directory
|
|
1772
1844
|
destDir = globSync('**/oobee*', {
|
|
1773
1845
|
cwd: baseDir,
|
|
1774
1846
|
absolute: true,
|
|
@@ -946,6 +946,7 @@ export default {
|
|
|
946
946
|
a11yRuleShortDescriptionMap,
|
|
947
947
|
disabilityBadgesMap,
|
|
948
948
|
robotsTxtUrls: null,
|
|
949
|
+
sitemapFetchedLinks: null as { totalLinksFetchedFromSitemaps: number; fetchedSitemaps: { url: string; fetchedLinks: number }[] } | null,
|
|
949
950
|
userDataDirectory: null, // This will be set later in the code
|
|
950
951
|
randomToken: null, // This will be set later in the code
|
|
951
952
|
// Track all active Crawlee / Playwright resources for cleanup
|
|
@@ -874,6 +874,13 @@ export const runAxeScript = async ({
|
|
|
874
874
|
const browserContext: BrowserContext = page.context();
|
|
875
875
|
const requestUrl = page.url();
|
|
876
876
|
|
|
877
|
+
let pageTitle: string | null = null;
|
|
878
|
+
try {
|
|
879
|
+
pageTitle = await page.evaluate(() => document.title);
|
|
880
|
+
} catch {
|
|
881
|
+
// Page may already be in a bad state; title will remain null
|
|
882
|
+
}
|
|
883
|
+
|
|
877
884
|
try {
|
|
878
885
|
// Checking for DOM mutations before proceeding to scan
|
|
879
886
|
await page.evaluate(() => {
|
|
@@ -1012,7 +1019,42 @@ export const runAxeScript = async ({
|
|
|
1012
1019
|
.run(selectors, {
|
|
1013
1020
|
resultTypes: defaultResultTypes,
|
|
1014
1021
|
})
|
|
1015
|
-
.then(results => {
|
|
1022
|
+
.then(async results => {
|
|
1023
|
+
// Re-verify aria-hidden-focus violations against the live DOM to
|
|
1024
|
+
// handle race conditions with JS that sets tabindex="-1" after
|
|
1025
|
+
// aria-hidden (common in carousel/slider libraries like slick)
|
|
1026
|
+
const ariaHiddenViolation = results.violations.find(
|
|
1027
|
+
v => v.id === 'aria-hidden-focus',
|
|
1028
|
+
);
|
|
1029
|
+
if (ariaHiddenViolation) {
|
|
1030
|
+
await new Promise(resolve => setTimeout(resolve, 0));
|
|
1031
|
+
ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(node => {
|
|
1032
|
+
const selector = node.target && node.target[0];
|
|
1033
|
+
if (typeof selector !== 'string') return true;
|
|
1034
|
+
try {
|
|
1035
|
+
const el = document.querySelector(selector);
|
|
1036
|
+
if (!el) return true;
|
|
1037
|
+
const focusables = el.querySelectorAll(
|
|
1038
|
+
'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]',
|
|
1039
|
+
);
|
|
1040
|
+
if (focusables.length === 0) return false;
|
|
1041
|
+
return Array.from(focusables).some(child => {
|
|
1042
|
+
const tabindex = child.getAttribute('tabindex');
|
|
1043
|
+
if (tabindex === null) return true;
|
|
1044
|
+
const parsed = parseInt(tabindex, 10);
|
|
1045
|
+
return isNaN(parsed) || parsed >= 0;
|
|
1046
|
+
});
|
|
1047
|
+
} catch {
|
|
1048
|
+
return true;
|
|
1049
|
+
}
|
|
1050
|
+
});
|
|
1051
|
+
if (ariaHiddenViolation.nodes.length === 0) {
|
|
1052
|
+
results.violations = results.violations.filter(
|
|
1053
|
+
v => v.id !== 'aria-hidden-focus',
|
|
1054
|
+
);
|
|
1055
|
+
}
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1016
1058
|
if (disableOobee) {
|
|
1017
1059
|
return results;
|
|
1018
1060
|
}
|
|
@@ -1086,19 +1128,6 @@ export const runAxeScript = async ({
|
|
|
1086
1128
|
results.incomplete = await takeScreenshotForHTMLElements(results.incomplete, page, randomToken);
|
|
1087
1129
|
}
|
|
1088
1130
|
|
|
1089
|
-
let pageTitle = null;
|
|
1090
|
-
try {
|
|
1091
|
-
pageTitle = await page.evaluate(() => document.title);
|
|
1092
|
-
} catch (e) {
|
|
1093
|
-
consoleLogger.info(`Error while getting page title: ${e}`);
|
|
1094
|
-
if (page.isClosed()) {
|
|
1095
|
-
consoleLogger.info(`Page was closed for ${requestUrl}, creating new page`);
|
|
1096
|
-
page = await browserContext.newPage();
|
|
1097
|
-
await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
|
|
1098
|
-
pageTitle = await page.evaluate(() => document.title);
|
|
1099
|
-
}
|
|
1100
|
-
}
|
|
1101
|
-
|
|
1102
1131
|
return filterAxeResults(results, pageTitle, customFlowDetails);
|
|
1103
1132
|
};
|
|
1104
1133
|
|
|
@@ -1124,12 +1153,117 @@ export const preNavigationHooks = (extraHTTPHeaders: Record<string, string>) =>
|
|
|
1124
1153
|
];
|
|
1125
1154
|
};
|
|
1126
1155
|
|
|
1156
|
+
/**
|
|
1157
|
+
* Splits extraHTTPHeaders into auth and non-auth parts.
|
|
1158
|
+
* Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
|
|
1159
|
+
* Non-auth headers are safe to set globally on the browser context.
|
|
1160
|
+
*/
|
|
1161
|
+
export const splitAuthHeaders = (extraHTTPHeaders?: Record<string, string>) => {
|
|
1162
|
+
const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
|
|
1163
|
+
return {
|
|
1164
|
+
authHeader: Authorization || null,
|
|
1165
|
+
nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
|
|
1166
|
+
httpCredentials: (() => {
|
|
1167
|
+
if (!Authorization?.startsWith('Basic ')) return null;
|
|
1168
|
+
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
1169
|
+
const colonIdx = decoded.indexOf(':');
|
|
1170
|
+
if (colonIdx <= 0) return null;
|
|
1171
|
+
return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
|
|
1172
|
+
})(),
|
|
1173
|
+
};
|
|
1174
|
+
};
|
|
1175
|
+
|
|
1176
|
+
/**
|
|
1177
|
+
* Adds a route handler to a BrowserContext that sends the Authorization header
|
|
1178
|
+
* only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
|
|
1179
|
+
*/
|
|
1180
|
+
export const addAuthRouteHandler = async (
|
|
1181
|
+
context: BrowserContext,
|
|
1182
|
+
entryUrl: string,
|
|
1183
|
+
authHeader: string | null
|
|
1184
|
+
) => {
|
|
1185
|
+
if (!authHeader) return;
|
|
1186
|
+
|
|
1187
|
+
const entryOrigin = new URL(entryUrl).origin;
|
|
1188
|
+
await context.route('**/*', async (route, request) => {
|
|
1189
|
+
try {
|
|
1190
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
1191
|
+
await route.continue({ headers: { ...request.headers(), Authorization: authHeader } });
|
|
1192
|
+
} else {
|
|
1193
|
+
await route.continue();
|
|
1194
|
+
}
|
|
1195
|
+
} catch {
|
|
1196
|
+
await route.continue();
|
|
1197
|
+
}
|
|
1198
|
+
});
|
|
1199
|
+
};
|
|
1200
|
+
|
|
1127
1201
|
export const postNavigationHooks = [
|
|
1128
1202
|
async (_crawlingContext: CrawlingContext) => {
|
|
1129
1203
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
1130
1204
|
},
|
|
1131
1205
|
];
|
|
1132
1206
|
|
|
1207
|
+
export const getPreLaunchHook = (userDataDirectory: string) => {
|
|
1208
|
+
let launchCount = 0;
|
|
1209
|
+
|
|
1210
|
+
return async (_pageId: string, launchContext: any) => {
|
|
1211
|
+
const fsp = await import('fs/promises').then(m => m.default);
|
|
1212
|
+
launchCount += 1;
|
|
1213
|
+
|
|
1214
|
+
// First launch uses the base directory; subsequent launches get a unique
|
|
1215
|
+
// directory so that lingering file handles from a retired browser don't
|
|
1216
|
+
// cause Chrome exit code 21 on Windows.
|
|
1217
|
+
const effectiveDir =
|
|
1218
|
+
launchCount === 1
|
|
1219
|
+
? userDataDirectory
|
|
1220
|
+
: `${userDataDirectory}_pool${launchCount}`;
|
|
1221
|
+
|
|
1222
|
+
await fsp.mkdir(effectiveDir, { recursive: true });
|
|
1223
|
+
|
|
1224
|
+
// For pool re-launches, best-effort clone profile data from base directory
|
|
1225
|
+
// so authenticated sessions are preserved across browser pool retirements.
|
|
1226
|
+
if (launchCount > 1) {
|
|
1227
|
+
try {
|
|
1228
|
+
const copyRecursive = async (src: string, dest: string) => {
|
|
1229
|
+
const stat = await fsp.stat(src).catch(() => null);
|
|
1230
|
+
if (!stat) return;
|
|
1231
|
+
if (stat.isDirectory()) {
|
|
1232
|
+
await fsp.mkdir(dest, { recursive: true }).catch(() => {});
|
|
1233
|
+
const entries = await fsp.readdir(src).catch(() => []);
|
|
1234
|
+
await Promise.all(
|
|
1235
|
+
entries
|
|
1236
|
+
.filter(entry => !entry.startsWith('Singleton') && entry !== 'lockfile' && entry !== 'LOCK')
|
|
1237
|
+
.map(entry =>
|
|
1238
|
+
copyRecursive(path.join(src, entry), path.join(dest, entry)).catch(() => {}),
|
|
1239
|
+
),
|
|
1240
|
+
);
|
|
1241
|
+
} else {
|
|
1242
|
+
await fsp.copyFile(src, dest).catch(() => {});
|
|
1243
|
+
}
|
|
1244
|
+
};
|
|
1245
|
+
await copyRecursive(userDataDirectory, effectiveDir).catch(() => {});
|
|
1246
|
+
} catch {
|
|
1247
|
+
// Silent fallback: use empty profile if clone fails
|
|
1248
|
+
}
|
|
1249
|
+
}
|
|
1250
|
+
|
|
1251
|
+
// Clean any stale lock files that may block browser launches on Windows
|
|
1252
|
+
const lockFiles = [
|
|
1253
|
+
path.join(effectiveDir, 'SingletonLock'),
|
|
1254
|
+
path.join(effectiveDir, 'SingletonSocket'),
|
|
1255
|
+
path.join(effectiveDir, 'SingletonCookie'),
|
|
1256
|
+
path.join(effectiveDir, 'lockfile'),
|
|
1257
|
+
path.join(effectiveDir, 'Default', 'LOCK'),
|
|
1258
|
+
path.join(effectiveDir, 'Default', 'Network', 'LOCK'),
|
|
1259
|
+
];
|
|
1260
|
+
await Promise.all(lockFiles.map(f => fsp.rm(f, { force: true }).catch(() => {})));
|
|
1261
|
+
|
|
1262
|
+
// eslint-disable-next-line no-param-reassign
|
|
1263
|
+
launchContext.userDataDir = effectiveDir;
|
|
1264
|
+
};
|
|
1265
|
+
};
|
|
1266
|
+
|
|
1133
1267
|
export const failedRequestHandler = async ({ request }: { request: Request }) => {
|
|
1134
1268
|
guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: 0, urlScanned: request.url });
|
|
1135
1269
|
log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|