@govtechsg/oobee 0.10.90 → 0.10.92

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/AGENTS.md +289 -0
  2. package/README.md +3 -0
  3. package/dist/cli.js +3 -0
  4. package/dist/combine.js +14 -2
  5. package/dist/constants/cliFunctions.js +7 -0
  6. package/dist/constants/common.js +119 -70
  7. package/dist/constants/constants.js +1 -0
  8. package/dist/crawlers/commonCrawlerFunc.js +93 -15
  9. package/dist/crawlers/crawlDomain.js +45 -57
  10. package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
  11. package/dist/crawlers/crawlRateController.js +47 -0
  12. package/dist/crawlers/crawlSitemap.js +51 -62
  13. package/dist/generateOobeeClientScanner.js +31 -0
  14. package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
  15. package/dist/mergeAxeResults.js +121 -68
  16. package/dist/npmIndex.js +1 -0
  17. package/dist/utils.js +23 -28
  18. package/oobee-client-scanner.js +33 -2
  19. package/package.json +2 -2
  20. package/src/cli.ts +4 -0
  21. package/src/combine.ts +15 -1
  22. package/src/constants/cliFunctions.ts +7 -0
  23. package/src/constants/common.ts +131 -79
  24. package/src/constants/constants.ts +1 -0
  25. package/src/crawlers/commonCrawlerFunc.ts +103 -14
  26. package/src/crawlers/crawlDomain.ts +52 -65
  27. package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
  28. package/src/crawlers/crawlRateController.ts +63 -0
  29. package/src/crawlers/crawlSitemap.ts +57 -70
  30. package/src/generateOobeeClientScanner.ts +31 -0
  31. package/src/index.ts +1 -0
  32. package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
  33. package/src/mergeAxeResults.ts +141 -75
  34. package/src/npmIndex.ts +1 -0
  35. package/src/utils.ts +25 -33
  36. /package/{fb85adb0-5db6-4a09-8c80-05f030115004.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0
@@ -713,6 +713,7 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
713
713
  browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
714
714
  ...getPlaywrightLaunchOptions(browser),
715
715
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
716
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
716
717
  });
717
718
  register(browserContext);
718
719
  }
@@ -723,6 +724,7 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
723
724
  register(browserInstance);
724
725
  browserContext = await browserInstance.newContext({
725
726
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
727
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
726
728
  });
727
729
  }
728
730
  const page = await browserContext.newPage();
@@ -784,10 +786,10 @@ export const isDisallowedInRobotsTxt = (url) => {
784
786
  }
785
787
  return false;
786
788
  };
787
- export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
789
+ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
788
790
  const scannedSitemaps = new Set();
789
- const urls = {}; // dictionary of requests to urls to be scanned
790
- const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
791
+ const sitemapLinkCounts = {};
792
+ const allUrls = new Set(); // all discovered URLs (lightweight strings)
791
793
  const addToUrlList = (url) => {
792
794
  if (!url)
793
795
  return;
@@ -796,17 +798,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
796
798
  if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
797
799
  return;
798
800
  url = convertPathToLocalFile(url);
799
- let request;
800
- try {
801
- request = new Request({ url });
802
- }
803
- catch (e) {
804
- console.log('Error creating request', e);
805
- }
806
- if (isUrlPdf(url)) {
807
- request.skipNavigation = true;
808
- }
809
- urls[url] = request;
801
+ allUrls.add(url);
810
802
  };
811
803
  const calculateCloseness = (sitemapUrl) => {
812
804
  // Remove 'http://', 'https://', and 'www.' prefixes from the URLs
@@ -849,15 +841,14 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
849
841
  return (b.lastModifiedDate?.getTime() || 0) - (a.lastModifiedDate?.getTime() || 0);
850
842
  });
851
843
  }
852
- // Add the sorted URLs to the main URL list
853
- for (const { url } of urlList.slice(0, maxLinksCount)) {
844
+ // Add all URLs to the discovered list (limit applied later at return time)
845
+ for (const { url } of urlList) {
854
846
  addToUrlList(url);
855
847
  }
856
848
  };
857
849
  const processNonStandardSitemap = (data) => {
858
850
  const urlsFromData = crawlee
859
- .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
860
- .slice(0, maxLinksCount);
851
+ .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') });
861
852
  urlsFromData.forEach(url => {
862
853
  addToUrlList(url);
863
854
  });
@@ -900,6 +891,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
900
891
  // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
901
892
  ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
902
893
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
894
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
903
895
  });
904
896
  register(browserContext);
905
897
  }
@@ -910,6 +902,7 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
910
902
  register(browserInstance);
911
903
  browserContext = await browserInstance.newContext({
912
904
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
905
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
913
906
  });
914
907
  }
915
908
  const page = await browserContext.newPage();
@@ -980,14 +973,12 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
980
973
  else {
981
974
  sitemapType = constants.xmlSitemapTypes.unknown;
982
975
  }
976
+ const countBefore = allUrls.size;
983
977
  switch (sitemapType) {
984
978
  case constants.xmlSitemapTypes.xmlIndex:
985
979
  consoleLogger.info(`This is a XML format sitemap index.`);
986
980
  for (const childSitemapUrl of $('loc')) {
987
981
  const childSitemapUrlText = $(childSitemapUrl).text();
988
- if (isLimitReached()) {
989
- break;
990
- }
991
982
  if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
992
983
  await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
993
984
  }
@@ -1012,6 +1003,10 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
1012
1003
  consoleLogger.info(`This is an unrecognised XML sitemap format.`);
1013
1004
  processNonStandardSitemap(data);
1014
1005
  }
1006
+ const linksFromThisSitemap = allUrls.size - countBefore;
1007
+ if (linksFromThisSitemap > 0) {
1008
+ sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
1009
+ }
1015
1010
  };
1016
1011
  try {
1017
1012
  await fetchUrls(sitemapUrl, extraHTTPHeaders);
@@ -1019,7 +1014,37 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
1019
1014
  catch (e) {
1020
1015
  consoleLogger.error(e);
1021
1016
  }
1022
- const requestList = Object.values(urls);
1017
+ // Build Request objects for all discovered URLs; the crawler itself enforces
1018
+ // maxRequestsPerCrawl by counting only successfully scanned pages.
1019
+ const requestList = [];
1020
+ for (const url of allUrls) {
1021
+ try {
1022
+ const request = new Request({ url });
1023
+ if (isUrlPdf(url)) {
1024
+ request.skipNavigation = true;
1025
+ }
1026
+ requestList.push(request);
1027
+ }
1028
+ catch (e) {
1029
+ consoleLogger.info(`Error creating request for ${url}: ${e}`);
1030
+ }
1031
+ }
1032
+ const totalLinksDiscovered = allUrls.size;
1033
+ const fetchedSitemaps = Object.entries(sitemapLinkCounts).map(([url, fetchedLinks]) => ({
1034
+ url,
1035
+ fetchedLinks,
1036
+ }));
1037
+ const prev = constants.sitemapFetchedLinks;
1038
+ constants.sitemapFetchedLinks = {
1039
+ totalLinksFetchedFromSitemaps: (prev?.totalLinksFetchedFromSitemaps ?? 0) + totalLinksDiscovered,
1040
+ fetchedSitemaps: [...(prev?.fetchedSitemaps ?? []), ...fetchedSitemaps],
1041
+ };
1042
+ if (totalLinksDiscovered > 0) {
1043
+ const breakdown = fetchedSitemaps
1044
+ .map(({ url, fetchedLinks }) => `${url} (${fetchedLinks})`)
1045
+ .join(', ');
1046
+ consoleLogger.info(`There are a total of ${totalLinksDiscovered} links found across ${breakdown}.`);
1047
+ }
1023
1048
  return requestList;
1024
1049
  };
1025
1050
  export const validEmail = (email) => {
@@ -1158,6 +1183,34 @@ export const getEdgeData = (randomToken) => {
1158
1183
  * @param {*} destDir destination directory
1159
1184
  * @returns boolean indicating whether the operation was successful
1160
1185
  */
/**
 * Synchronously copy a file, retrying when the copy fails with a transient
 * `EBUSY` error (the source is locked by another process).
 *
 * Only `EBUSY` is retried; any other error ends the attempt immediately.
 * The reason for a final failure is logged here so callers only need to
 * act on the boolean result.
 *
 * @param {string} src path of the file to copy
 * @param {string} dest path the file is copied to
 * @param {number} [maxRetries=3] total number of copy attempts (not extra retries)
 * @returns {boolean} true when the file was copied, false otherwise
 */
const copyFileWithRetry = (src, dest, maxRetries = 3) => {
    // Block the calling thread for `ms` without spinning the CPU. Atomics.wait
    // on a private buffer is never notified, so it simply times out.
    const sleepSync = (ms) => {
        try {
            Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
        }
        catch {
            // Runtimes that forbid blocking waits on this thread: fall back to spinning
            const endTime = Date.now() + ms;
            while (Date.now() < endTime) {
                // Busy wait
            }
        }
    };
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            fs.copyFileSync(src, dest);
            if (attempt > 1) {
                consoleLogger.info(`File copy succeeded on attempt ${attempt}: ${dest}`);
            }
            return true;
        }
        catch (err) {
            if (err?.code === 'EBUSY' && attempt < maxRetries) {
                // Transient lock: exponential backoff (100ms, 200ms, ...) capped at 1s
                const delayMs = Math.min(100 * 2 ** (attempt - 1), 1000);
                consoleLogger.warn(`File copy attempt ${attempt}/${maxRetries} failed with EBUSY. Retrying after ${delayMs}ms: ${dest}`);
                sleepSync(delayMs);
                continue;
            }
            // Non-transient error or attempts exhausted: record why before giving up
            consoleLogger.error(`File copy failed on attempt ${attempt}/${maxRetries} (${err?.code ?? 'no error code'}): ${err?.message ?? err}`);
            return false;
        }
    }
    // Only reached when maxRetries < 1, i.e. no attempt was made
    return false;
};
1161
1214
  const cloneChromeProfileCookieFiles = (options, destDir) => {
1162
1215
  let profileCookiesDir;
1163
1216
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
@@ -1196,19 +1249,9 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
1196
1249
  }
1197
1250
  // Prevents duplicate cookies file if the cookies already exist
1198
1251
  if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
1199
- try {
1200
- fs.copyFileSync(dir, path.join(destProfileDir, 'Cookies'));
1201
- }
1202
- catch (err) {
1203
- consoleLogger.error(err);
1204
- if (err.code === 'EBUSY') {
1205
- console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
1206
- console.log('Please close any applications that might be using this file and try again.');
1207
- }
1208
- else {
1209
- console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
1210
- }
1211
- // printMessage([err], messageOptions);
1252
+ const destCookiesPath = path.join(destProfileDir, 'Cookies');
1253
+ if (!copyFileWithRetry(dir, destCookiesPath)) {
1254
+ consoleLogger.error(`Failed to copy Chrome profile cookies for ${profileName} after retries.`);
1212
1255
  success = false;
1213
1256
  }
1214
1257
  }
@@ -1220,12 +1263,6 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
1220
1263
  printMessage(['Unable to find Chrome profile cookies file in the system.'], messageOptions);
1221
1264
  return false;
1222
1265
  };
1223
- /**
1224
- * Clone the Chrome profile cookie files to the destination directory
1225
- * @param {*} options glob options object
1226
- * @param {*} destDir destination directory
1227
- * @returns boolean indicating whether the operation was successful
1228
- */
1229
1266
  const cloneEdgeProfileCookieFiles = (options, destDir) => {
1230
1267
  let profileCookiesDir;
1231
1268
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
@@ -1265,19 +1302,9 @@ const cloneEdgeProfileCookieFiles = (options, destDir) => {
1265
1302
  }
1266
1303
  // Prevents duplicate cookies file if the cookies already exist
1267
1304
  if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
1268
- try {
1269
- fs.copyFileSync(dir, path.join(destProfileDir, 'Cookies'));
1270
- }
1271
- catch (err) {
1272
- consoleLogger.error(err);
1273
- if (err.code === 'EBUSY') {
1274
- console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
1275
- console.log('Please close any applications that might be using this file and try again.');
1276
- }
1277
- else {
1278
- console.log(`An unexpected error occurred while copying the file: ${err.message}`);
1279
- }
1280
- // printMessage([err], messageOptions);
1305
+ const destCookiesPath = path.join(destProfileDir, 'Cookies');
1306
+ if (!copyFileWithRetry(dir, destCookiesPath)) {
1307
+ consoleLogger.error(`Failed to copy Edge profile cookies for ${profileName} after retries.`);
1281
1308
  success = false;
1282
1309
  }
1283
1310
  }
@@ -1305,19 +1332,9 @@ const cloneLocalStateFile = (options, destDir) => {
1305
1332
  let success = true;
1306
1333
  localState.forEach(dir => {
1307
1334
  const profileName = dir.match(profileNamesRegex)[1];
1308
- try {
1309
- fs.copyFileSync(dir, path.join(destDir, 'Local State'));
1310
- }
1311
- catch (err) {
1312
- consoleLogger.error(err);
1313
- if (err.code === 'EBUSY') {
1314
- console.log(`Unable to copy the file because it is currently in use.`);
1315
- console.log('Please close any applications that might be using this file and try again.');
1316
- }
1317
- else {
1318
- console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
1319
- }
1320
- printMessage([err], messageOptions);
1335
+ const destPath = path.join(destDir, 'Local State');
1336
+ if (!copyFileWithRetry(dir, destPath)) {
1337
+ consoleLogger.error(`Failed to copy Local State file for ${profileName} after retries.`);
1321
1338
  success = false;
1322
1339
  }
1323
1340
  });
@@ -1362,6 +1379,15 @@ export const cloneChromeProfiles = (randomToken) => {
1362
1379
  return destDir;
1363
1380
  }
1364
1381
  consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
1382
+ // Fall back to a clean profile directory to avoid launch failures from partial clones.
1383
+ try {
1384
+ fs.rmSync(destDir, { recursive: true, force: true });
1385
+ fs.mkdirSync(destDir, { recursive: true });
1386
+ consoleLogger.warn('Using an empty cloned Chrome profile directory due to clone failure.');
1387
+ }
1388
+ catch (cleanupError) {
1389
+ consoleLogger.error(`Unable to reset cloned Chrome profile directory ${destDir}: ${cleanupError}`);
1390
+ }
1365
1391
  }
1366
1392
  // For future reference, return a null instead to halt the scan
1367
1393
  return destDir;
@@ -1418,6 +1444,15 @@ export const cloneEdgeProfiles = (randomToken) => {
1418
1444
  return destDir;
1419
1445
  }
1420
1446
  consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
1447
+ // Fall back to a clean profile directory to avoid launch failures from partial clones.
1448
+ try {
1449
+ fs.rmSync(destDir, { recursive: true, force: true });
1450
+ fs.mkdirSync(destDir, { recursive: true });
1451
+ consoleLogger.warn('Using an empty cloned Edge profile directory due to clone failure.');
1452
+ }
1453
+ catch (cleanupError) {
1454
+ consoleLogger.error(`Unable to reset cloned Edge profile directory ${destDir}: ${cleanupError}`);
1455
+ }
1421
1456
  }
1422
1457
  // For future reference, return a null instead to halt the scan
1423
1458
  return destDir;
@@ -1444,7 +1479,14 @@ export const deleteClonedChromeProfiles = (randomToken) => {
1444
1479
  }
1445
1480
  let destDir;
1446
1481
  if (randomToken) {
1447
- destDir = [`${baseDir}/oobee-${randomToken}`];
1482
+ // Also match _pool* directories created by browser pool re-launches
1483
+ destDir = globSync(`oobee-${randomToken}*`, {
1484
+ cwd: baseDir,
1485
+ absolute: true,
1486
+ });
1487
+ if (destDir.length === 0) {
1488
+ destDir = [`${baseDir}/oobee-${randomToken}`];
1489
+ }
1448
1490
  }
1449
1491
  else {
1450
1492
  // Find all the oobee directories in the Chrome data directory
@@ -1481,10 +1523,17 @@ export const deleteClonedEdgeProfiles = (randomToken) => {
1481
1523
  }
1482
1524
  let destDir;
1483
1525
  if (randomToken) {
1484
- destDir = [`${baseDir}/oobee-${randomToken}`];
1526
+ // Also match _pool* directories created by browser pool re-launches
1527
+ destDir = globSync(`oobee-${randomToken}*`, {
1528
+ cwd: baseDir,
1529
+ absolute: true,
1530
+ });
1531
+ if (destDir.length === 0) {
1532
+ destDir = [`${baseDir}/oobee-${randomToken}`];
1533
+ }
1485
1534
  }
1486
1535
  else {
1487
- // Find all the oobee directories in the Chrome data directory
1536
+ // Find all the oobee directories in the Edge data directory
1488
1537
  destDir = globSync('**/oobee*', {
1489
1538
  cwd: baseDir,
1490
1539
  absolute: true,
@@ -771,6 +771,7 @@ export default {
771
771
  a11yRuleShortDescriptionMap,
772
772
  disabilityBadgesMap,
773
773
  robotsTxtUrls: null,
774
+ sitemapFetchedLinks: null,
774
775
  userDataDirectory: null, // This will be set later in the code
775
776
  randomToken: null, // This will be set later in the code
776
777
  // Track all active Crawlee / Playwright resources for cleanup
@@ -1,7 +1,7 @@
1
1
  import { Dataset, RequestQueue, log, playwrightUtils } from 'crawlee';
2
2
  import axe from 'axe-core';
3
3
  import { axeScript, disallowedListOfPatterns, guiInfoStatusTypes, RuleFlags, saflyIconSelector, } from '../constants/constants.js';
4
- import { consoleLogger, guiInfoLog } from '../logs.js';
4
+ import { guiInfoLog } from '../logs.js';
5
5
  import { enrichColorContrastDOMContext, takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
6
6
  import { isFilePath } from '../constants/common.js';
7
7
  import { extractAndGradeText } from './custom/extractAndGradeText.js';
@@ -674,6 +674,13 @@ export const filterAxeResults = (results, pageTitle, customFlowDetails) => {
674
674
  export const runAxeScript = async ({ includeScreenshots, page, randomToken, customFlowDetails = null, selectors = [], ruleset = [], }) => {
675
675
  const browserContext = page.context();
676
676
  const requestUrl = page.url();
677
+ let pageTitle = null;
678
+ try {
679
+ pageTitle = await page.evaluate(() => document.title);
680
+ }
681
+ catch {
682
+ // Page may already be in a bad state; title will remain null
683
+ }
677
684
  try {
678
685
  // Checking for DOM mutations before proceeding to scan
679
686
  await page.evaluate(() => {
@@ -781,7 +788,40 @@ export const runAxeScript = async ({ includeScreenshots, page, randomToken, cust
781
788
  .run(selectors, {
782
789
  resultTypes: defaultResultTypes,
783
790
  })
784
- .then(results => {
791
+ .then(async (results) => {
792
+ // Re-verify aria-hidden-focus violations against the live DOM to
793
+ // handle race conditions with JS that sets tabindex="-1" after
794
+ // aria-hidden (common in carousel/slider libraries like slick)
795
+ const ariaHiddenViolation = results.violations.find(v => v.id === 'aria-hidden-focus');
796
+ if (ariaHiddenViolation) {
797
+ await new Promise(resolve => setTimeout(resolve, 0));
798
+ ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(node => {
799
+ const selector = node.target && node.target[0];
800
+ if (typeof selector !== 'string')
801
+ return true;
802
+ try {
803
+ const el = document.querySelector(selector);
804
+ if (!el)
805
+ return true;
806
+ const focusables = el.querySelectorAll('a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]');
807
+ if (focusables.length === 0)
808
+ return false;
809
+ return Array.from(focusables).some(child => {
810
+ const tabindex = child.getAttribute('tabindex');
811
+ if (tabindex === null)
812
+ return true;
813
+ const parsed = parseInt(tabindex, 10);
814
+ return isNaN(parsed) || parsed >= 0;
815
+ });
816
+ }
817
+ catch {
818
+ return true;
819
+ }
820
+ });
821
+ if (ariaHiddenViolation.nodes.length === 0) {
822
+ results.violations = results.violations.filter(v => v.id !== 'aria-hidden-focus');
823
+ }
824
+ }
785
825
  if (disableOobee) {
786
826
  return results;
787
827
  }
@@ -847,19 +887,6 @@ export const runAxeScript = async ({ includeScreenshots, page, randomToken, cust
847
887
  results.violations = await takeScreenshotForHTMLElements(results.violations, page, randomToken);
848
888
  results.incomplete = await takeScreenshotForHTMLElements(results.incomplete, page, randomToken);
849
889
  }
850
- let pageTitle = null;
851
- try {
852
- pageTitle = await page.evaluate(() => document.title);
853
- }
854
- catch (e) {
855
- consoleLogger.info(`Error while getting page title: ${e}`);
856
- if (page.isClosed()) {
857
- consoleLogger.info(`Page was closed for ${requestUrl}, creating new page`);
858
- page = await browserContext.newPage();
859
- await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
860
- pageTitle = await page.evaluate(() => document.title);
861
- }
862
- }
863
890
  return filterAxeResults(results, pageTitle, customFlowDetails);
864
891
  };
865
892
  export const createCrawleeSubFolders = async (randomToken) => {
@@ -883,6 +910,57 @@ export const postNavigationHooks = [
883
910
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
884
911
  },
885
912
  ];
913
+ export const getPreLaunchHook = (userDataDirectory) => {
914
+ let launchCount = 0;
915
+ return async (_pageId, launchContext) => {
916
+ const fsp = await import('fs/promises').then(m => m.default);
917
+ launchCount += 1;
918
+ // First launch uses the base directory; subsequent launches get a unique
919
+ // directory so that lingering file handles from a retired browser don't
920
+ // cause Chrome exit code 21 on Windows.
921
+ const effectiveDir = launchCount === 1
922
+ ? userDataDirectory
923
+ : `${userDataDirectory}_pool${launchCount}`;
924
+ await fsp.mkdir(effectiveDir, { recursive: true });
925
+ // For pool re-launches, best-effort clone profile data from base directory
926
+ // so authenticated sessions are preserved across browser pool retirements.
927
+ if (launchCount > 1) {
928
+ try {
929
+ const copyRecursive = async (src, dest) => {
930
+ const stat = await fsp.stat(src).catch(() => null);
931
+ if (!stat)
932
+ return;
933
+ if (stat.isDirectory()) {
934
+ await fsp.mkdir(dest, { recursive: true }).catch(() => { });
935
+ const entries = await fsp.readdir(src).catch(() => []);
936
+ await Promise.all(entries
937
+ .filter(entry => !entry.startsWith('Singleton') && entry !== 'lockfile' && entry !== 'LOCK')
938
+ .map(entry => copyRecursive(path.join(src, entry), path.join(dest, entry)).catch(() => { })));
939
+ }
940
+ else {
941
+ await fsp.copyFile(src, dest).catch(() => { });
942
+ }
943
+ };
944
+ await copyRecursive(userDataDirectory, effectiveDir).catch(() => { });
945
+ }
946
+ catch {
947
+ // Silent fallback: use empty profile if clone fails
948
+ }
949
+ }
950
+ // Clean any stale lock files that may block browser launches on Windows
951
+ const lockFiles = [
952
+ path.join(effectiveDir, 'SingletonLock'),
953
+ path.join(effectiveDir, 'SingletonSocket'),
954
+ path.join(effectiveDir, 'SingletonCookie'),
955
+ path.join(effectiveDir, 'lockfile'),
956
+ path.join(effectiveDir, 'Default', 'LOCK'),
957
+ path.join(effectiveDir, 'Default', 'Network', 'LOCK'),
958
+ ];
959
+ await Promise.all(lockFiles.map(f => fsp.rm(f, { force: true }).catch(() => { })));
960
+ // eslint-disable-next-line no-param-reassign
961
+ launchContext.userDataDir = effectiveDir;
962
+ };
963
+ };
886
964
  export const failedRequestHandler = async ({ request }) => {
887
965
  guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: 0, urlScanned: request.url });
888
966
  log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);