@govtechsg/oobee 0.10.91 → 0.10.92

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/AGENTS.md +289 -0
  2. package/README.md +3 -0
  3. package/dist/cli.js +3 -0
  4. package/dist/combine.js +14 -2
  5. package/dist/constants/cliFunctions.js +7 -0
  6. package/dist/constants/common.js +119 -70
  7. package/dist/constants/constants.js +1 -0
  8. package/dist/crawlers/commonCrawlerFunc.js +93 -15
  9. package/dist/crawlers/crawlDomain.js +45 -57
  10. package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
  11. package/dist/crawlers/crawlRateController.js +47 -0
  12. package/dist/crawlers/crawlSitemap.js +51 -62
  13. package/dist/generateOobeeClientScanner.js +31 -0
  14. package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
  15. package/dist/mergeAxeResults.js +120 -92
  16. package/dist/npmIndex.js +1 -0
  17. package/dist/utils.js +23 -28
  18. package/oobee-client-scanner.js +33 -2
  19. package/package.json +2 -2
  20. package/src/cli.ts +4 -0
  21. package/src/combine.ts +15 -1
  22. package/src/constants/cliFunctions.ts +7 -0
  23. package/src/constants/common.ts +131 -79
  24. package/src/constants/constants.ts +1 -0
  25. package/src/crawlers/commonCrawlerFunc.ts +103 -14
  26. package/src/crawlers/crawlDomain.ts +52 -65
  27. package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
  28. package/src/crawlers/crawlRateController.ts +63 -0
  29. package/src/crawlers/crawlSitemap.ts +57 -70
  30. package/src/generateOobeeClientScanner.ts +31 -0
  31. package/src/index.ts +1 -0
  32. package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
  33. package/src/mergeAxeResults.ts +139 -99
  34. package/src/npmIndex.ts +1 -0
  35. package/src/utils.ts +25 -33
  36. /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0
@@ -888,6 +888,7 @@ const getRobotsTxtViaPlaywright = async (
888
888
  browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
889
889
  ...getPlaywrightLaunchOptions(browser),
890
890
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
891
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
891
892
  });
892
893
  register(browserContext);
893
894
  } else {
@@ -895,9 +896,10 @@ const getRobotsTxtViaPlaywright = async (
895
896
  const launchOptions = getPlaywrightLaunchOptions(browser);
896
897
  browserInstance = await constants.launcher.launch(launchOptions);
897
898
  register(browserInstance as unknown as { close: () => Promise<void> });
898
-
899
+
899
900
  browserContext = await browserInstance.newContext({
900
901
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
902
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
901
903
  });
902
904
  }
903
905
 
@@ -975,7 +977,7 @@ export const isDisallowedInRobotsTxt = (url: string): boolean => {
975
977
 
976
978
  export const getLinksFromSitemap = async (
977
979
  sitemapUrl: string,
978
- maxLinksCount: number,
980
+ _maxLinksCount: number,
979
981
  browser: string,
980
982
  userDataDirectory: string,
981
983
  userUrlInput: string,
@@ -985,9 +987,8 @@ export const getLinksFromSitemap = async (
985
987
  userUrl: string = userUrlInput,
986
988
  ) => {
987
989
  const scannedSitemaps = new Set<string>();
988
- const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
989
-
990
- const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
990
+ const sitemapLinkCounts: Record<string, number> = {};
991
+ const allUrls = new Set<string>(); // all discovered URLs (lightweight strings)
991
992
 
992
993
  const addToUrlList = (url: string) => {
993
994
  if (!url) return;
@@ -995,17 +996,7 @@ export const getLinksFromSitemap = async (
995
996
  if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy)) return;
996
997
 
997
998
  url = convertPathToLocalFile(url);
998
-
999
- let request;
1000
- try {
1001
- request = new Request({ url });
1002
- } catch (e) {
1003
- console.log('Error creating request', e);
1004
- }
1005
- if (isUrlPdf(url)) {
1006
- request.skipNavigation = true;
1007
- }
1008
- urls[url] = request;
999
+ allUrls.add(url);
1009
1000
  };
1010
1001
 
1011
1002
  const calculateCloseness = (sitemapUrl: string) => {
@@ -1058,16 +1049,15 @@ export const getLinksFromSitemap = async (
1058
1049
  });
1059
1050
  }
1060
1051
 
1061
- // Add the sorted URLs to the main URL list
1062
- for (const { url } of urlList.slice(0, maxLinksCount)) {
1052
+ // Add all URLs to the discovered list (limit applied later at return time)
1053
+ for (const { url } of urlList) {
1063
1054
  addToUrlList(url);
1064
1055
  }
1065
1056
  };
1066
1057
 
1067
1058
  const processNonStandardSitemap = (data: string) => {
1068
1059
  const urlsFromData = crawlee
1069
- .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
1070
- .slice(0, maxLinksCount);
1060
+ .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') });
1071
1061
  urlsFromData.forEach(url => {
1072
1062
  addToUrlList(url);
1073
1063
  });
@@ -1118,6 +1108,7 @@ export const getLinksFromSitemap = async (
1118
1108
  // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
1119
1109
  ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
1120
1110
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
1111
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
1121
1112
  },
1122
1113
  );
1123
1114
 
@@ -1127,9 +1118,10 @@ export const getLinksFromSitemap = async (
1127
1118
  const launchOptions = getPlaywrightLaunchOptions(browser);
1128
1119
  browserInstance = await constants.launcher.launch(launchOptions);
1129
1120
  register(browserInstance as unknown as { close: () => Promise<void> });
1130
-
1121
+
1131
1122
  browserContext = await browserInstance.newContext({
1132
1123
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
1124
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
1133
1125
  });
1134
1126
  }
1135
1127
 
@@ -1202,14 +1194,13 @@ export const getLinksFromSitemap = async (
1202
1194
  sitemapType = constants.xmlSitemapTypes.unknown;
1203
1195
  }
1204
1196
 
1197
+ const countBefore = allUrls.size;
1198
+
1205
1199
  switch (sitemapType) {
1206
1200
  case constants.xmlSitemapTypes.xmlIndex:
1207
1201
  consoleLogger.info(`This is a XML format sitemap index.`);
1208
1202
  for (const childSitemapUrl of $('loc')) {
1209
1203
  const childSitemapUrlText = $(childSitemapUrl).text();
1210
- if (isLimitReached()) {
1211
- break;
1212
- }
1213
1204
  if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
1214
1205
  await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
1215
1206
  } else {
@@ -1233,6 +1224,11 @@ export const getLinksFromSitemap = async (
1233
1224
  consoleLogger.info(`This is an unrecognised XML sitemap format.`);
1234
1225
  processNonStandardSitemap(data);
1235
1226
  }
1227
+
1228
+ const linksFromThisSitemap = allUrls.size - countBefore;
1229
+ if (linksFromThisSitemap > 0) {
1230
+ sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
1231
+ }
1236
1232
  };
1237
1233
 
1238
1234
  try {
@@ -1241,7 +1237,41 @@ export const getLinksFromSitemap = async (
1241
1237
  consoleLogger.error(e);
1242
1238
  }
1243
1239
 
1244
- const requestList = Object.values(urls);
1240
+ // Build Request objects for all discovered URLs; the crawler itself enforces
1241
+ // maxRequestsPerCrawl by counting only successfully scanned pages.
1242
+ const requestList: Request[] = [];
1243
+ for (const url of allUrls) {
1244
+ try {
1245
+ const request = new Request({ url });
1246
+ if (isUrlPdf(url)) {
1247
+ request.skipNavigation = true;
1248
+ }
1249
+ requestList.push(request);
1250
+ } catch (e) {
1251
+ consoleLogger.info(`Error creating request for ${url}: ${e}`);
1252
+ }
1253
+ }
1254
+
1255
+ const totalLinksDiscovered = allUrls.size;
1256
+ const fetchedSitemaps = Object.entries(sitemapLinkCounts).map(([url, fetchedLinks]) => ({
1257
+ url,
1258
+ fetchedLinks,
1259
+ }));
1260
+
1261
+ const prev = constants.sitemapFetchedLinks;
1262
+ constants.sitemapFetchedLinks = {
1263
+ totalLinksFetchedFromSitemaps: (prev?.totalLinksFetchedFromSitemaps ?? 0) + totalLinksDiscovered,
1264
+ fetchedSitemaps: [...(prev?.fetchedSitemaps ?? []), ...fetchedSitemaps],
1265
+ };
1266
+
1267
+ if (totalLinksDiscovered > 0) {
1268
+ const breakdown = fetchedSitemaps
1269
+ .map(({ url, fetchedLinks }) => `${url} (${fetchedLinks})`)
1270
+ .join(', ');
1271
+ consoleLogger.info(
1272
+ `There are a total of ${totalLinksDiscovered} links found across ${breakdown}.`,
1273
+ );
1274
+ }
1245
1275
 
1246
1276
  return requestList;
1247
1277
  };
@@ -1406,6 +1436,36 @@ export const getEdgeData = (randomToken: string) => {
1406
1436
  * @param {*} destDir destination directory
1407
1437
  * @returns boolean indicating whether the operation was successful
1408
1438
  */
1439
/**
 * Synchronously copies `src` to `dest`, retrying when the copy fails with a
 * transient `EBUSY` error (the file is momentarily locked by another process).
 *
 * Retries use exponential backoff starting at 100ms and capped at 1s
 * (100ms, 200ms with the default of three attempts). Any error other than
 * `EBUSY` is treated as permanent and is not retried. The underlying error is
 * logged before giving up so the cause of a failed copy is not lost.
 *
 * @param src path of the file to copy
 * @param dest path the file is copied to (overwritten if it exists)
 * @param maxRetries maximum number of copy attempts; values below 1 make no attempt
 * @returns `true` when the file was copied, `false` otherwise
 */
const copyFileWithRetry = (src: string, dest: string, maxRetries: number = 3): boolean => {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      fs.copyFileSync(src, dest);
      if (attempt > 1) {
        consoleLogger.info(`File copy succeeded on attempt ${attempt}: ${dest}`);
      }
      return true;
    } catch (err: unknown) {
      // Narrow the unknown error to read the Node.js errno code, if any.
      const code =
        typeof err === 'object' && err !== null && 'code' in err
          ? String((err as { code?: unknown }).code)
          : undefined;

      if (code === 'EBUSY' && attempt < maxRetries) {
        // Transient lock — wait and retry
        const delayMs = Math.min(100 * 2 ** (attempt - 1), 1000);
        consoleLogger.warn(
          `File copy attempt ${attempt}/${maxRetries} failed with EBUSY. Retrying after ${delayMs}ms: ${dest}`,
        );
        // Blocking sleep: the caller is synchronous, so park the thread on a
        // throwaway shared buffer instead of spinning the CPU in a busy loop.
        Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, delayMs);
        continue;
      }

      // Non-transient error or max retries reached: report the cause, then give up.
      const reason = err instanceof Error ? err.message : String(err);
      consoleLogger.error(
        `File copy failed on attempt ${attempt}/${maxRetries} (${code ?? 'no error code'}): ${src} -> ${dest}: ${reason}`,
      );
      return false;
    }
  }
  return false;
};
1468
+
1409
1469
  const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
1410
1470
  let profileCookiesDir;
1411
1471
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
@@ -1445,23 +1505,9 @@ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, d
1445
1505
 
1446
1506
  // Prevents duplicate cookies file if the cookies already exist
1447
1507
  if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
1448
- try {
1449
- fs.copyFileSync(dir, path.join(destProfileDir, 'Cookies'));
1450
- } catch (err) {
1451
- consoleLogger.error(err);
1452
- if (err.code === 'EBUSY') {
1453
- console.log(
1454
- `Unable to copy the file for ${profileName} because it is currently in use.`,
1455
- );
1456
- console.log(
1457
- 'Please close any applications that might be using this file and try again.',
1458
- );
1459
- } else {
1460
- console.log(
1461
- `An unexpected error occurred for ${profileName} while copying the file: ${err.message}`,
1462
- );
1463
- }
1464
- // printMessage([err], messageOptions);
1508
+ const destCookiesPath = path.join(destProfileDir, 'Cookies');
1509
+ if (!copyFileWithRetry(dir, destCookiesPath)) {
1510
+ consoleLogger.error(`Failed to copy Chrome profile cookies for ${profileName} after retries.`);
1465
1511
  success = false;
1466
1512
  }
1467
1513
  }
@@ -1475,12 +1521,6 @@ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, d
1475
1521
  return false;
1476
1522
  };
1477
1523
 
1478
- /**
1479
- * Clone the Chrome profile cookie files to the destination directory
1480
- * @param {*} options glob options object
1481
- * @param {*} destDir destination directory
1482
- * @returns boolean indicating whether the operation was successful
1483
- */
1484
1524
  const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
1485
1525
  let profileCookiesDir;
1486
1526
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
@@ -1521,21 +1561,9 @@ const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, des
1521
1561
 
1522
1562
  // Prevents duplicate cookies file if the cookies already exist
1523
1563
  if (!fs.existsSync(path.join(destProfileDir, 'Cookies'))) {
1524
- try {
1525
- fs.copyFileSync(dir, path.join(destProfileDir, 'Cookies'));
1526
- } catch (err) {
1527
- consoleLogger.error(err);
1528
- if (err.code === 'EBUSY') {
1529
- console.log(
1530
- `Unable to copy the file for ${profileName} because it is currently in use.`,
1531
- );
1532
- console.log(
1533
- 'Please close any applications that might be using this file and try again.',
1534
- );
1535
- } else {
1536
- console.log(`An unexpected error occurred while copying the file: ${err.message}`);
1537
- }
1538
- // printMessage([err], messageOptions);
1564
+ const destCookiesPath = path.join(destProfileDir, 'Cookies');
1565
+ if (!copyFileWithRetry(dir, destCookiesPath)) {
1566
+ consoleLogger.error(`Failed to copy Edge profile cookies for ${profileName} after retries.`);
1539
1567
  success = false;
1540
1568
  }
1541
1569
  }
@@ -1566,19 +1594,9 @@ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: st
1566
1594
 
1567
1595
  localState.forEach(dir => {
1568
1596
  const profileName = dir.match(profileNamesRegex)[1];
1569
- try {
1570
- fs.copyFileSync(dir, path.join(destDir, 'Local State'));
1571
- } catch (err) {
1572
- consoleLogger.error(err);
1573
- if (err.code === 'EBUSY') {
1574
- console.log(`Unable to copy the file because it is currently in use.`);
1575
- console.log('Please close any applications that might be using this file and try again.');
1576
- } else {
1577
- console.log(
1578
- `An unexpected error occurred for ${profileName} while copying the file: ${err.message}`,
1579
- );
1580
- }
1581
- printMessage([err], messageOptions);
1597
+ const destPath = path.join(destDir, 'Local State');
1598
+ if (!copyFileWithRetry(dir, destPath)) {
1599
+ consoleLogger.error(`Failed to copy Local State file for ${profileName} after retries.`);
1582
1600
  success = false;
1583
1601
  }
1584
1602
  });
@@ -1629,6 +1647,17 @@ export const cloneChromeProfiles = (randomToken: string): string => {
1629
1647
  }
1630
1648
 
1631
1649
  consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
1650
+
1651
+ // Fall back to a clean profile directory to avoid launch failures from partial clones.
1652
+ try {
1653
+ fs.rmSync(destDir, { recursive: true, force: true });
1654
+ fs.mkdirSync(destDir, { recursive: true });
1655
+ consoleLogger.warn('Using an empty cloned Chrome profile directory due to clone failure.');
1656
+ } catch (cleanupError) {
1657
+ consoleLogger.error(
1658
+ `Unable to reset cloned Chrome profile directory ${destDir}: ${cleanupError}`,
1659
+ );
1660
+ }
1632
1661
  }
1633
1662
  // For future reference, return a null instead to halt the scan
1634
1663
  return destDir;
@@ -1697,6 +1726,15 @@ export const cloneEdgeProfiles = (randomToken: string): string => {
1697
1726
  }
1698
1727
 
1699
1728
  consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
1729
+
1730
+ // Fall back to a clean profile directory to avoid launch failures from partial clones.
1731
+ try {
1732
+ fs.rmSync(destDir, { recursive: true, force: true });
1733
+ fs.mkdirSync(destDir, { recursive: true });
1734
+ consoleLogger.warn('Using an empty cloned Edge profile directory due to clone failure.');
1735
+ } catch (cleanupError) {
1736
+ consoleLogger.error(`Unable to reset cloned Edge profile directory ${destDir}: ${cleanupError}`);
1737
+ }
1700
1738
  }
1701
1739
 
1702
1740
  // For future reference, return a null instead to halt the scan
@@ -1725,7 +1763,14 @@ export const deleteClonedChromeProfiles = (randomToken?: string): void => {
1725
1763
  }
1726
1764
  let destDir: string[];
1727
1765
  if (randomToken) {
1728
- destDir = [`${baseDir}/oobee-${randomToken}`];
1766
+ // Also match _pool* directories created by browser pool re-launches
1767
+ destDir = globSync(`oobee-${randomToken}*`, {
1768
+ cwd: baseDir,
1769
+ absolute: true,
1770
+ });
1771
+ if (destDir.length === 0) {
1772
+ destDir = [`${baseDir}/oobee-${randomToken}`];
1773
+ }
1729
1774
  } else {
1730
1775
  // Find all the oobee directories in the Chrome data directory
1731
1776
  destDir = globSync('**/oobee*', {
@@ -1766,9 +1811,16 @@ export const deleteClonedEdgeProfiles = (randomToken?: string): void => {
1766
1811
  }
1767
1812
  let destDir: string[];
1768
1813
  if (randomToken) {
1769
- destDir = [`${baseDir}/oobee-${randomToken}`];
1814
+ // Also match _pool* directories created by browser pool re-launches
1815
+ destDir = globSync(`oobee-${randomToken}*`, {
1816
+ cwd: baseDir,
1817
+ absolute: true,
1818
+ });
1819
+ if (destDir.length === 0) {
1820
+ destDir = [`${baseDir}/oobee-${randomToken}`];
1821
+ }
1770
1822
  } else {
1771
- // Find all the oobee directories in the Chrome data directory
1823
+ // Find all the oobee directories in the Edge data directory
1772
1824
  destDir = globSync('**/oobee*', {
1773
1825
  cwd: baseDir,
1774
1826
  absolute: true,
@@ -946,6 +946,7 @@ export default {
946
946
  a11yRuleShortDescriptionMap,
947
947
  disabilityBadgesMap,
948
948
  robotsTxtUrls: null,
949
+ sitemapFetchedLinks: null as { totalLinksFetchedFromSitemaps: number; fetchedSitemaps: { url: string; fetchedLinks: number }[] } | null,
949
950
  userDataDirectory: null, // This will be set later in the code
950
951
  randomToken: null, // This will be set later in the code
951
952
  // Track all active Crawlee / Playwright resources for cleanup
@@ -874,6 +874,13 @@ export const runAxeScript = async ({
874
874
  const browserContext: BrowserContext = page.context();
875
875
  const requestUrl = page.url();
876
876
 
877
+ let pageTitle: string | null = null;
878
+ try {
879
+ pageTitle = await page.evaluate(() => document.title);
880
+ } catch {
881
+ // Page may already be in a bad state; title will remain null
882
+ }
883
+
877
884
  try {
878
885
  // Checking for DOM mutations before proceeding to scan
879
886
  await page.evaluate(() => {
@@ -1012,7 +1019,42 @@ export const runAxeScript = async ({
1012
1019
  .run(selectors, {
1013
1020
  resultTypes: defaultResultTypes,
1014
1021
  })
1015
- .then(results => {
1022
+ .then(async results => {
1023
+ // Re-verify aria-hidden-focus violations against the live DOM to
1024
+ // handle race conditions with JS that sets tabindex="-1" after
1025
+ // aria-hidden (common in carousel/slider libraries like slick)
1026
+ const ariaHiddenViolation = results.violations.find(
1027
+ v => v.id === 'aria-hidden-focus',
1028
+ );
1029
+ if (ariaHiddenViolation) {
1030
+ await new Promise(resolve => setTimeout(resolve, 0));
1031
+ ariaHiddenViolation.nodes = ariaHiddenViolation.nodes.filter(node => {
1032
+ const selector = node.target && node.target[0];
1033
+ if (typeof selector !== 'string') return true;
1034
+ try {
1035
+ const el = document.querySelector(selector);
1036
+ if (!el) return true;
1037
+ const focusables = el.querySelectorAll(
1038
+ 'a[href], area[href], button:not([disabled]), input:not([disabled]):not([type="hidden"]), select:not([disabled]), textarea:not([disabled]), [tabindex]',
1039
+ );
1040
+ if (focusables.length === 0) return false;
1041
+ return Array.from(focusables).some(child => {
1042
+ const tabindex = child.getAttribute('tabindex');
1043
+ if (tabindex === null) return true;
1044
+ const parsed = parseInt(tabindex, 10);
1045
+ return isNaN(parsed) || parsed >= 0;
1046
+ });
1047
+ } catch {
1048
+ return true;
1049
+ }
1050
+ });
1051
+ if (ariaHiddenViolation.nodes.length === 0) {
1052
+ results.violations = results.violations.filter(
1053
+ v => v.id !== 'aria-hidden-focus',
1054
+ );
1055
+ }
1056
+ }
1057
+
1016
1058
  if (disableOobee) {
1017
1059
  return results;
1018
1060
  }
@@ -1086,19 +1128,6 @@ export const runAxeScript = async ({
1086
1128
  results.incomplete = await takeScreenshotForHTMLElements(results.incomplete, page, randomToken);
1087
1129
  }
1088
1130
 
1089
- let pageTitle = null;
1090
- try {
1091
- pageTitle = await page.evaluate(() => document.title);
1092
- } catch (e) {
1093
- consoleLogger.info(`Error while getting page title: ${e}`);
1094
- if (page.isClosed()) {
1095
- consoleLogger.info(`Page was closed for ${requestUrl}, creating new page`);
1096
- page = await browserContext.newPage();
1097
- await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
1098
- pageTitle = await page.evaluate(() => document.title);
1099
- }
1100
- }
1101
-
1102
1131
  return filterAxeResults(results, pageTitle, customFlowDetails);
1103
1132
  };
1104
1133
 
@@ -1130,6 +1159,66 @@ export const postNavigationHooks = [
1130
1159
  },
1131
1160
  ];
1132
1161
 
1162
/**
 * Creates a browser-pool pre-launch hook that picks the user data directory
 * for each browser launch.
 *
 * The first launch uses `userDataDirectory` itself. Every later launch gets
 * its own `${userDataDirectory}_pool<N>` directory (N = launch number) so that
 * lingering file handles from a retired browser don't cause Chrome exit code
 * 21 on Windows. Pool directories are seeded with a best-effort copy of the
 * base directory so authenticated sessions survive browser pool retirements.
 *
 * @param userDataDirectory base profile directory used by the first launch
 * @returns an async hook `(pageId, launchContext)` that creates the chosen
 *          directory, clears stale lock files inside it, and writes it to
 *          `launchContext.userDataDir`
 */
export const getPreLaunchHook = (userDataDirectory: string) => {
  // Browser lock artefacts must never be carried over into a pool directory.
  const isLockEntry = (entry: string): boolean =>
    entry.startsWith('Singleton') || entry === 'lockfile' || entry === 'LOCK';

  let launchCount = 0;

  return async (_pageId: string, launchContext: { userDataDir?: string }): Promise<void> => {
    // Named exports of fs/promises are used so this works regardless of how
    // default-export interop is configured for the build.
    const fsp = await import('fs/promises');
    launchCount += 1;

    const effectiveDir =
      launchCount === 1 ? userDataDirectory : `${userDataDirectory}_pool${launchCount}`;

    await fsp.mkdir(effectiveDir, { recursive: true });

    // Best-effort recursive copy: any entry that cannot be read or written is
    // skipped silently, leaving a partial (or empty) profile rather than
    // failing the browser launch.
    const copyRecursive = async (src: string, dest: string): Promise<void> => {
      try {
        const stat = await fsp.stat(src);
        if (!stat.isDirectory()) {
          await fsp.copyFile(src, dest);
          return;
        }
        await fsp.mkdir(dest, { recursive: true });
        const entries = await fsp.readdir(src);
        await Promise.all(
          entries
            .filter(entry => !isLockEntry(entry))
            .map(entry => copyRecursive(path.join(src, entry), path.join(dest, entry))),
        );
      } catch {
        // Deliberately ignored: cloning the profile is best-effort.
      }
    };

    // Only pool re-launches are seeded; the first launch uses the base directory as-is.
    if (launchCount > 1) {
      await copyRecursive(userDataDirectory, effectiveDir);
    }

    // Clean any stale lock files that may block browser launches on Windows
    const lockFiles = [
      path.join(effectiveDir, 'SingletonLock'),
      path.join(effectiveDir, 'SingletonSocket'),
      path.join(effectiveDir, 'SingletonCookie'),
      path.join(effectiveDir, 'lockfile'),
      path.join(effectiveDir, 'Default', 'LOCK'),
      path.join(effectiveDir, 'Default', 'Network', 'LOCK'),
    ];
    await Promise.all(lockFiles.map(file => fsp.rm(file, { force: true }).catch(() => {})));

    // eslint-disable-next-line no-param-reassign
    launchContext.userDataDir = effectiveDir;
  };
};
1221
+
1133
1222
  export const failedRequestHandler = async ({ request }: { request: Request }) => {
1134
1223
  guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: 0, urlScanned: request.url });
1135
1224
  log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);