@govtechsg/oobee 0.10.91 → 0.10.93

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (40):
  1. package/AGENTS.md +303 -0
  2. package/README.md +22 -0
  3. package/dist/cli.js +3 -0
  4. package/dist/combine.js +15 -3
  5. package/dist/constants/cliFunctions.js +7 -0
  6. package/dist/constants/common.js +149 -80
  7. package/dist/constants/constants.js +1 -0
  8. package/dist/crawlers/commonCrawlerFunc.js +136 -15
  9. package/dist/crawlers/crawlDomain.js +55 -58
  10. package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
  11. package/dist/crawlers/crawlRateController.js +47 -0
  12. package/dist/crawlers/crawlSitemap.js +51 -62
  13. package/dist/crawlers/runCustom.js +8 -2
  14. package/dist/generateOobeeClientScanner.js +32 -1
  15. package/dist/mergeAxeResults/itemsStore.js +32 -3
  16. package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
  17. package/dist/mergeAxeResults.js +120 -92
  18. package/dist/npmIndex.js +1 -0
  19. package/dist/utils.js +23 -28
  20. package/oobee-client-scanner.js +35 -4
  21. package/package.json +3 -3
  22. package/src/cli.ts +4 -0
  23. package/src/combine.ts +16 -1
  24. package/src/constants/cliFunctions.ts +7 -0
  25. package/src/constants/common.ts +162 -90
  26. package/src/constants/constants.ts +1 -0
  27. package/src/crawlers/commonCrawlerFunc.ts +148 -14
  28. package/src/crawlers/crawlDomain.ts +64 -66
  29. package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
  30. package/src/crawlers/crawlRateController.ts +63 -0
  31. package/src/crawlers/crawlSitemap.ts +57 -70
  32. package/src/crawlers/runCustom.ts +10 -1
  33. package/src/generateOobeeClientScanner.ts +32 -1
  34. package/src/index.ts +1 -0
  35. package/src/mergeAxeResults/itemsStore.ts +37 -3
  36. package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
  37. package/src/mergeAxeResults.ts +139 -99
  38. package/src/npmIndex.ts +1 -0
  39. package/src/utils.ts +25 -33
  40. /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0
@@ -300,9 +300,19 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
300
300
  const rawDevice = (playwrightDeviceDetailsObject || {});
301
301
  const { viewport, isMobile, hasTouch, userAgent: deviceUserAgent, ...restDevice } = rawDevice;
302
302
  const launchOptions = getPlaywrightLaunchOptions(browserToRun);
303
+ const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
304
+ let httpCredentials = undefined;
305
+ if (Authorization?.startsWith('Basic ')) {
306
+ const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
307
+ const colonIdx = decoded.indexOf(':');
308
+ if (colonIdx > 0) {
309
+ httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
310
+ }
311
+ }
303
312
  const contextOptions = {
304
313
  ...restDevice,
305
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
314
+ ...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
315
+ ...(httpCredentials && { httpCredentials }),
306
316
  ignoreHTTPSErrors: true,
307
317
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
308
318
  };
@@ -342,6 +352,25 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
342
352
  return res;
343
353
  }
344
354
  try {
355
+ // Only enable generic Authorization header routing interception broadly if
356
+ // a non-Basic Bearer auth string is heavily relied upon, thereby bypassing
357
+ // performance warnings inside the check checkUrl phase for typical public scans
358
+ if (Authorization && !httpCredentials) {
359
+ const entryOrigin = new URL(url).origin;
360
+ await browserContext.route('**/*', async (route, request) => {
361
+ try {
362
+ if (new URL(request.url()).origin === entryOrigin) {
363
+ await route.continue({ headers: { ...request.headers(), Authorization } });
364
+ }
365
+ else {
366
+ await route.continue();
367
+ }
368
+ }
369
+ catch {
370
+ await route.continue();
371
+ }
372
+ });
373
+ }
345
374
  const page = await browserContext.newPage();
346
375
  // Block native Chrome download UI
347
376
  try {
@@ -351,15 +380,6 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
351
380
  catch (e) {
352
381
  consoleLogger.info(`Unable to set download deny: ${e.message}`);
353
382
  }
354
- // OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
355
- // This allows the "Connectivity Check" to pass as soon as HTML is ready
356
- await page.route('**/*', (route) => {
357
- const type = route.request().resourceType();
358
- if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
359
- return route.abort();
360
- }
361
- return route.continue();
362
- });
363
383
  // STEP 2: Navigate (follows server-side redirects)
364
384
  page.once('download', () => {
365
385
  res.status = constants.urlCheckStatuses.notASupportedDocument.code;
@@ -713,6 +733,7 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
713
733
  browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
714
734
  ...getPlaywrightLaunchOptions(browser),
715
735
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
736
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
716
737
  });
717
738
  register(browserContext);
718
739
  }
@@ -723,6 +744,7 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
723
744
  register(browserInstance);
724
745
  browserContext = await browserInstance.newContext({
725
746
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
747
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
726
748
  });
727
749
  }
728
750
  const page = await browserContext.newPage();
@@ -784,10 +806,10 @@ export const isDisallowedInRobotsTxt = (url) => {
784
806
  }
785
807
  return false;
786
808
  };
787
- export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
809
+ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
788
810
  const scannedSitemaps = new Set();
789
- const urls = {}; // dictionary of requests to urls to be scanned
790
- const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
811
+ const sitemapLinkCounts = {};
812
+ const allUrls = new Set(); // all discovered URLs (lightweight strings)
791
813
  const addToUrlList = (url) => {
792
814
  if (!url)
793
815
  return;
@@ -796,17 +818,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
796
818
  if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
797
819
  return;
798
820
  url = convertPathToLocalFile(url);
799
- let request;
800
- try {
801
- request = new Request({ url });
802
- }
803
- catch (e) {
804
- console.log('Error creating request', e);
805
- }
806
- if (isUrlPdf(url)) {
807
- request.skipNavigation = true;
808
- }
809
- urls[url] = request;
821
+ allUrls.add(url);
810
822
  };
811
823
  const calculateCloseness = (sitemapUrl) => {
812
824
  // Remove 'http://', 'https://', and 'www.' prefixes from the URLs
@@ -849,15 +861,14 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
849
861
  return (b.lastModifiedDate?.getTime() || 0) - (a.lastModifiedDate?.getTime() || 0);
850
862
  });
851
863
  }
852
- // Add the sorted URLs to the main URL list
853
- for (const { url } of urlList.slice(0, maxLinksCount)) {
864
+ // Add all URLs to the discovered list (limit applied later at return time)
865
+ for (const { url } of urlList) {
854
866
  addToUrlList(url);
855
867
  }
856
868
  };
857
869
  const processNonStandardSitemap = (data) => {
858
870
  const urlsFromData = crawlee
859
- .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
860
- .slice(0, maxLinksCount);
871
+ .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') });
861
872
  urlsFromData.forEach(url => {
862
873
  addToUrlList(url);
863
874
  });
@@ -900,6 +911,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
900
911
  // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
901
912
  ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
902
913
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
914
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
903
915
  });
904
916
  register(browserContext);
905
917
  }
@@ -910,6 +922,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
910
922
  register(browserInstance);
911
923
  browserContext = await browserInstance.newContext({
912
924
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
925
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
913
926
  });
914
927
  }
915
928
  const page = await browserContext.newPage();
@@ -980,14 +993,12 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
980
993
  else {
981
994
  sitemapType = constants.xmlSitemapTypes.unknown;
982
995
  }
996
+ const countBefore = allUrls.size;
983
997
  switch (sitemapType) {
984
998
  case constants.xmlSitemapTypes.xmlIndex:
985
999
  consoleLogger.info(`This is a XML format sitemap index.`);
986
1000
  for (const childSitemapUrl of $('loc')) {
987
1001
  const childSitemapUrlText = $(childSitemapUrl).text();
988
- if (isLimitReached()) {
989
- break;
990
- }
991
1002
  if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
992
1003
  await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
993
1004
  }
@@ -1012,6 +1023,10 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
1012
1023
  consoleLogger.info(`This is an unrecognised XML sitemap format.`);
1013
1024
  processNonStandardSitemap(data);
1014
1025
  }
1026
+ const linksFromThisSitemap = allUrls.size - countBefore;
1027
+ if (linksFromThisSitemap > 0) {
1028
+ sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
1029
+ }
1015
1030
  };
1016
1031
  try {
1017
1032
  await fetchUrls(sitemapUrl, extraHTTPHeaders);
@@ -1019,7 +1034,37 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
1019
1034
  catch (e) {
1020
1035
  consoleLogger.error(e);
1021
1036
  }
1022
- const requestList = Object.values(urls);
1037
+ // Build Request objects for all discovered URLs; the crawler itself enforces
1038
+ // maxRequestsPerCrawl by counting only successfully scanned pages.
1039
+ const requestList = [];
1040
+ for (const url of allUrls) {
1041
+ try {
1042
+ const request = new Request({ url });
1043
+ if (isUrlPdf(url)) {
1044
+ request.skipNavigation = true;
1045
+ }
1046
+ requestList.push(request);
1047
+ }
1048
+ catch (e) {
1049
+ consoleLogger.info(`Error creating request for ${url}: ${e}`);
1050
+ }
1051
+ }
1052
+ const totalLinksDiscovered = allUrls.size;
1053
+ const fetchedSitemaps = Object.entries(sitemapLinkCounts).map(([url, fetchedLinks]) => ({
1054
+ url,
1055
+ fetchedLinks,
1056
+ }));
1057
+ const prev = constants.sitemapFetchedLinks;
1058
+ constants.sitemapFetchedLinks = {
1059
+ totalLinksFetchedFromSitemaps: (prev?.totalLinksFetchedFromSitemaps ?? 0) + totalLinksDiscovered,
1060
+ fetchedSitemaps: [...(prev?.fetchedSitemaps ?? []), ...fetchedSitemaps],
1061
+ };
1062
+ if (totalLinksDiscovered > 0) {
1063
+ const breakdown = fetchedSitemaps
1064
+ .map(({ url, fetchedLinks }) => `${url} (${fetchedLinks})`)
1065
+ .join(', ');
1066
+ consoleLogger.info(`There are a total of ${totalLinksDiscovered} links found across ${breakdown}.`);
1067
+ }
1023
1068
  return requestList;
1024
1069
  };
1025
1070
  export const validEmail = (email) => {
@@ -1158,6 +1203,34 @@ export const getEdgeData = (randomToken) => {
1158
1203
  * @param {*} destDir destination directory
1159
1204
  * @returns boolean indicating whether the operation was successful
1160
1205
  */
1206
+ // Helper to copy a file with retry logic for transient EBUSY errors
1207
+ const copyFileWithRetry = (src, dest, maxRetries = 3) => {
1208
+ for (let attempt = 1; attempt <= maxRetries; attempt++) {
1209
+ try {
1210
+ fs.copyFileSync(src, dest);
1211
+ if (attempt > 1) {
1212
+ consoleLogger.info(`File copy succeeded on attempt ${attempt}: ${dest}`);
1213
+ }
1214
+ return true;
1215
+ }
1216
+ catch (err) {
1217
+ if (err.code === 'EBUSY' && attempt < maxRetries) {
1218
+ // Transient lock — wait and retry
1219
+ const delayMs = Math.min(100 * Math.pow(2, attempt - 1), 1000); // Exponential backoff: 100ms, 200ms, 400ms, capped at 1s
1220
+ consoleLogger.warn(`File copy attempt ${attempt}/${maxRetries} failed with EBUSY. Retrying after ${delayMs}ms: ${dest}`);
1221
+ // Synchronous sleep via busy-wait (not ideal but avoids promise complications in sync context)
1222
+ const endTime = Date.now() + delayMs;
1223
+ while (Date.now() < endTime) {
1224
+ // Busy wait
1225
+ }
1226
+ continue; // Retry
1227
+ }
1228
+ // Non-transient error or max retries reached
1229
+ return false;
1230
+ }
1231
+ }
1232
+ return false;
1233
+ };
1161
1234
  const cloneChromeProfileCookieFiles = (options, destDir) => {
1162
1235
  let profileCookiesDir;
1163
1236
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
@@ -1196,19 +1269,9 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
1196
1269
  }
1197
1270
  // Prevents duplicate cookies file if the cookies already exist
1198
1271
  if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
1199
- try {
1200
- fs.copyFileSync(dir, path.join(destProfileDir, 'Cookies'));
1201
- }
1202
- catch (err) {
1203
- consoleLogger.error(err);
1204
- if (err.code === 'EBUSY') {
1205
- console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
1206
- console.log('Please close any applications that might be using this file and try again.');
1207
- }
1208
- else {
1209
- console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
1210
- }
1211
- // printMessage([err], messageOptions);
1272
+ const destCookiesPath = path.join(destProfileDir, 'Cookies');
1273
+ if (!copyFileWithRetry(dir, destCookiesPath)) {
1274
+ consoleLogger.error(`Failed to copy Chrome profile cookies for ${profileName} after retries.`);
1212
1275
  success = false;
1213
1276
  }
1214
1277
  }
@@ -1220,12 +1283,6 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
1220
1283
  printMessage(['Unable to find Chrome profile cookies file in the system.'], messageOptions);
1221
1284
  return false;
1222
1285
  };
1223
- /**
1224
- * Clone the Chrome profile cookie files to the destination directory
1225
- * @param {*} options glob options object
1226
- * @param {*} destDir destination directory
1227
- * @returns boolean indicating whether the operation was successful
1228
- */
1229
1286
  const cloneEdgeProfileCookieFiles = (options, destDir) => {
1230
1287
  let profileCookiesDir;
1231
1288
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
@@ -1265,19 +1322,9 @@ const cloneEdgeProfileCookieFiles = (options, destDir) => {
1265
1322
  }
1266
1323
  // Prevents duplicate cookies file if the cookies already exist
1267
1324
  if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
1268
- try {
1269
- fs.copyFileSync(dir, path.join(destProfileDir, 'Cookies'));
1270
- }
1271
- catch (err) {
1272
- consoleLogger.error(err);
1273
- if (err.code === 'EBUSY') {
1274
- console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
1275
- console.log('Please close any applications that might be using this file and try again.');
1276
- }
1277
- else {
1278
- console.log(`An unexpected error occurred while copying the file: ${err.message}`);
1279
- }
1280
- // printMessage([err], messageOptions);
1325
+ const destCookiesPath = path.join(destProfileDir, 'Cookies');
1326
+ if (!copyFileWithRetry(dir, destCookiesPath)) {
1327
+ consoleLogger.error(`Failed to copy Edge profile cookies for ${profileName} after retries.`);
1281
1328
  success = false;
1282
1329
  }
1283
1330
  }
@@ -1305,19 +1352,9 @@ const cloneLocalStateFile = (options, destDir) => {
1305
1352
  let success = true;
1306
1353
  localState.forEach(dir => {
1307
1354
  const profileName = dir.match(profileNamesRegex)[1];
1308
- try {
1309
- fs.copyFileSync(dir, path.join(destDir, 'Local State'));
1310
- }
1311
- catch (err) {
1312
- consoleLogger.error(err);
1313
- if (err.code === 'EBUSY') {
1314
- console.log(`Unable to copy the file because it is currently in use.`);
1315
- console.log('Please close any applications that might be using this file and try again.');
1316
- }
1317
- else {
1318
- console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
1319
- }
1320
- printMessage([err], messageOptions);
1355
+ const destPath = path.join(destDir, 'Local State');
1356
+ if (!copyFileWithRetry(dir, destPath)) {
1357
+ consoleLogger.error(`Failed to copy Local State file for ${profileName} after retries.`);
1321
1358
  success = false;
1322
1359
  }
1323
1360
  });
@@ -1362,6 +1399,15 @@ export const cloneChromeProfiles = (randomToken) => {
1362
1399
  return destDir;
1363
1400
  }
1364
1401
  consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
1402
+ // Fall back to a clean profile directory to avoid launch failures from partial clones.
1403
+ try {
1404
+ fs.rmSync(destDir, { recursive: true, force: true });
1405
+ fs.mkdirSync(destDir, { recursive: true });
1406
+ consoleLogger.warn('Using an empty cloned Chrome profile directory due to clone failure.');
1407
+ }
1408
+ catch (cleanupError) {
1409
+ consoleLogger.error(`Unable to reset cloned Chrome profile directory ${destDir}: ${cleanupError}`);
1410
+ }
1365
1411
  }
1366
1412
  // For future reference, return a null instead to halt the scan
1367
1413
  return destDir;
@@ -1418,6 +1464,15 @@ export const cloneEdgeProfiles = (randomToken) => {
1418
1464
  return destDir;
1419
1465
  }
1420
1466
  consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
1467
+ // Fall back to a clean profile directory to avoid launch failures from partial clones.
1468
+ try {
1469
+ fs.rmSync(destDir, { recursive: true, force: true });
1470
+ fs.mkdirSync(destDir, { recursive: true });
1471
+ consoleLogger.warn('Using an empty cloned Edge profile directory due to clone failure.');
1472
+ }
1473
+ catch (cleanupError) {
1474
+ consoleLogger.error(`Unable to reset cloned Edge profile directory ${destDir}: ${cleanupError}`);
1475
+ }
1421
1476
  }
1422
1477
  // For future reference, return a null instead to halt the scan
1423
1478
  return destDir;
@@ -1444,7 +1499,14 @@ export const deleteClonedChromeProfiles = (randomToken) => {
1444
1499
  }
1445
1500
  let destDir;
1446
1501
  if (randomToken) {
1447
- destDir = [`${baseDir}/oobee-${randomToken}`];
1502
+ // Also match _pool* directories created by browser pool re-launches
1503
+ destDir = globSync(`oobee-${randomToken}*`, {
1504
+ cwd: baseDir,
1505
+ absolute: true,
1506
+ });
1507
+ if (destDir.length === 0) {
1508
+ destDir = [`${baseDir}/oobee-${randomToken}`];
1509
+ }
1448
1510
  }
1449
1511
  else {
1450
1512
  // Find all the oobee directories in the Chrome data directory
@@ -1481,10 +1543,17 @@ export const deleteClonedEdgeProfiles = (randomToken) => {
1481
1543
  }
1482
1544
  let destDir;
1483
1545
  if (randomToken) {
1484
- destDir = [`${baseDir}/oobee-${randomToken}`];
1546
+ // Also match _pool* directories created by browser pool re-launches
1547
+ destDir = globSync(`oobee-${randomToken}*`, {
1548
+ cwd: baseDir,
1549
+ absolute: true,
1550
+ });
1551
+ if (destDir.length === 0) {
1552
+ destDir = [`${baseDir}/oobee-${randomToken}`];
1553
+ }
1485
1554
  }
1486
1555
  else {
1487
- // Find all the oobee directories in the Chrome data directory
1556
+ // Find all the oobee directories in the Edge data directory
1488
1557
  destDir = globSync('**/oobee*', {
1489
1558
  cwd: baseDir,
1490
1559
  absolute: true,
@@ -771,6 +771,7 @@ export default {
771
771
  a11yRuleShortDescriptionMap,
772
772
  disabilityBadgesMap,
773
773
  robotsTxtUrls: null,
774
+ sitemapFetchedLinks: null,
774
775
  userDataDirectory: null, // This will be set later in the code
775
776
  randomToken: null, // This will be set later in the code
776
777
  // Track all active Crawlee / Playwright resources for cleanup
@@ -1,7 +1,7 @@
1
1
  import { Dataset, RequestQueue, log, playwrightUtils } from 'crawlee';
2
2
  import axe from 'axe-core';
3
3
  import { axeScript, disallowedListOfPatterns, guiInfoStatusTypes, RuleFlags, saflyIconSelector, } from '../constants/constants.js';
4
- import { consoleLogger, guiInfoLog } from '../logs.js';
4
+ import { guiInfoLog } from '../logs.js';
5
5
  import { enrichColorContrastDOMContext, takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
6
6
  import { isFilePath } from '../constants/common.js';
7
7
  import { extractAndGradeText } from './custom/extractAndGradeText.js';
@@ -674,6 +674,13 @@ export const filterAxeResults = (results, pageTitle, customFlowDetails) => {
674
674
  export const runAxeScript = async ({ includeScreenshots, page, randomToken, customFlowDetails = null, selectors = [], ruleset = [], }) => {
675
675
  const browserContext = page.context();
676
676
  const requestUrl = page.url();
677
+ let pageTitle = null;
678
+ try {
679
+ pageTitle = await page.evaluate(() => document.title);
680
+ }
681
+ catch {
682
+ // Page may already be in a bad state; title will remain null
683
+ }
677
684
  try {
678
685
  // Checking for DOM mutations before proceeding to scan
679
686
  await page.evaluate(() => {
@@ -781,7 +788,40 @@ export const runAxeScript = async ({ includeScreenshots, page, randomToken, cust
781
788
  .run(selectors, {
782
789
  resultTypes: defaultResultTypes,
783
790
  })
784
- .then(results => {
791
+ .then(async (results) => {
792
+ // Re-verify aria-hidden-focus violations against the live DOM to
793
+ // handle race conditions with JS that sets tabindex="-1" after
794
+ // aria-hidden (common in carousel/slider libraries like slick)
795
+ const ariaHiddenViolation = results.violations.find(v => v.id === 'aria-hidden-focus');
796
+ if (ariaHiddenViolation) {
797
+ await new Promise(resolve => setTimeout(resolve, 0));
798
+ ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(node => {
799
+ const selector = node.target && node.target[0];
800
+ if (typeof selector !== 'string')
801
+ return true;
802
+ try {
803
+ const el = document.querySelector(selector);
804
+ if (!el)
805
+ return true;
806
+ const focusables = el.querySelectorAll('a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]');
807
+ if (focusables.length === 0)
808
+ return false;
809
+ return Array.from(focusables).some(child => {
810
+ const tabindex = child.getAttribute('tabindex');
811
+ if (tabindex === null)
812
+ return true;
813
+ const parsed = parseInt(tabindex, 10);
814
+ return isNaN(parsed) || parsed >= 0;
815
+ });
816
+ }
817
+ catch {
818
+ return true;
819
+ }
820
+ });
821
+ if (ariaHiddenViolation.nodes.length === 0) {
822
+ results.violations = results.violations.filter(v => v.id !== 'aria-hidden-focus');
823
+ }
824
+ }
785
825
  if (disableOobee) {
786
826
  return results;
787
827
  }
@@ -847,19 +887,6 @@ export const runAxeScript = async ({ includeScreenshots, page, randomToken, cust
847
887
  results.violations = await takeScreenshotForHTMLElements(results.violations, page, randomToken);
848
888
  results.incomplete = await takeScreenshotForHTMLElements(results.incomplete, page, randomToken);
849
889
  }
850
- let pageTitle = null;
851
- try {
852
- pageTitle = await page.evaluate(() => document.title);
853
- }
854
- catch (e) {
855
- consoleLogger.info(`Error while getting page title: ${e}`);
856
- if (page.isClosed()) {
857
- consoleLogger.info(`Page was closed for ${requestUrl}, creating new page`);
858
- page = await browserContext.newPage();
859
- await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
860
- pageTitle = await page.evaluate(() => document.title);
861
- }
862
- }
863
890
  return filterAxeResults(results, pageTitle, customFlowDetails);
864
891
  };
865
892
  export const createCrawleeSubFolders = async (randomToken) => {
@@ -878,11 +905,105 @@ export const preNavigationHooks = (extraHTTPHeaders) => {
878
905
  },
879
906
  ];
880
907
  };
908
+ /**
909
+ * Splits extraHTTPHeaders into auth and non-auth parts.
910
+ * Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
911
+ * Non-auth headers are safe to set globally on the browser context.
912
+ */
913
+ export const splitAuthHeaders = (extraHTTPHeaders) => {
914
+ const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
915
+ return {
916
+ authHeader: Authorization || null,
917
+ nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
918
+ httpCredentials: (() => {
919
+ if (!Authorization?.startsWith('Basic '))
920
+ return null;
921
+ const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
922
+ const colonIdx = decoded.indexOf(':');
923
+ if (colonIdx <= 0)
924
+ return null;
925
+ return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
926
+ })(),
927
+ };
928
+ };
929
+ /**
930
+ * Adds a route handler to a BrowserContext that sends the Authorization header
931
+ * only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
932
+ */
933
+ export const addAuthRouteHandler = async (context, entryUrl, authHeader) => {
934
+ if (!authHeader)
935
+ return;
936
+ const entryOrigin = new URL(entryUrl).origin;
937
+ await context.route('**/*', async (route, request) => {
938
+ try {
939
+ if (new URL(request.url()).origin === entryOrigin) {
940
+ await route.continue({ headers: { ...request.headers(), Authorization: authHeader } });
941
+ }
942
+ else {
943
+ await route.continue();
944
+ }
945
+ }
946
+ catch {
947
+ await route.continue();
948
+ }
949
+ });
950
+ };
881
951
  export const postNavigationHooks = [
882
952
  async (_crawlingContext) => {
883
953
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
884
954
  },
885
955
  ];
956
+ export const getPreLaunchHook = (userDataDirectory) => {
957
+ let launchCount = 0;
958
+ return async (_pageId, launchContext) => {
959
+ const fsp = await import('fs/promises').then(m => m.default);
960
+ launchCount += 1;
961
+ // First launch uses the base directory; subsequent launches get a unique
962
+ // directory so that lingering file handles from a retired browser don't
963
+ // cause Chrome exit code 21 on Windows.
964
+ const effectiveDir = launchCount === 1
965
+ ? userDataDirectory
966
+ : `${userDataDirectory}_pool${launchCount}`;
967
+ await fsp.mkdir(effectiveDir, { recursive: true });
968
+ // For pool re-launches, best-effort clone profile data from base directory
969
+ // so authenticated sessions are preserved across browser pool retirements.
970
+ if (launchCount > 1) {
971
+ try {
972
+ const copyRecursive = async (src, dest) => {
973
+ const stat = await fsp.stat(src).catch(() => null);
974
+ if (!stat)
975
+ return;
976
+ if (stat.isDirectory()) {
977
+ await fsp.mkdir(dest, { recursive: true }).catch(() => { });
978
+ const entries = await fsp.readdir(src).catch(() => []);
979
+ await Promise.all(entries
980
+ .filter(entry => !entry.startsWith('Singleton') && entry !== 'lockfile' && entry !== 'LOCK')
981
+ .map(entry => copyRecursive(path.join(src, entry), path.join(dest, entry)).catch(() => { })));
982
+ }
983
+ else {
984
+ await fsp.copyFile(src, dest).catch(() => { });
985
+ }
986
+ };
987
+ await copyRecursive(userDataDirectory, effectiveDir).catch(() => { });
988
+ }
989
+ catch {
990
+ // Silent fallback: use empty profile if clone fails
991
+ }
992
+ }
993
+ // Clean any stale lock files that may block browser launches on Windows
994
+ const lockFiles = [
995
+ path.join(effectiveDir, 'SingletonLock'),
996
+ path.join(effectiveDir, 'SingletonSocket'),
997
+ path.join(effectiveDir, 'SingletonCookie'),
998
+ path.join(effectiveDir, 'lockfile'),
999
+ path.join(effectiveDir, 'Default', 'LOCK'),
1000
+ path.join(effectiveDir, 'Default', 'Network', 'LOCK'),
1001
+ ];
1002
+ await Promise.all(lockFiles.map(f => fsp.rm(f, { force: true }).catch(() => { })));
1003
+ // eslint-disable-next-line no-param-reassign
1004
+ launchContext.userDataDir = effectiveDir;
1005
+ };
1006
+ };
886
1007
  export const failedRequestHandler = async ({ request }) => {
887
1008
  guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: 0, urlScanned: request.url });
888
1009
  log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);