@govtechsg/oobee 0.10.91 → 0.10.92

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/AGENTS.md +289 -0
  2. package/README.md +3 -0
  3. package/dist/cli.js +3 -0
  4. package/dist/combine.js +14 -2
  5. package/dist/constants/cliFunctions.js +7 -0
  6. package/dist/constants/common.js +119 -70
  7. package/dist/constants/constants.js +1 -0
  8. package/dist/crawlers/commonCrawlerFunc.js +93 -15
  9. package/dist/crawlers/crawlDomain.js +45 -57
  10. package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
  11. package/dist/crawlers/crawlRateController.js +47 -0
  12. package/dist/crawlers/crawlSitemap.js +51 -62
  13. package/dist/generateOobeeClientScanner.js +31 -0
  14. package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
  15. package/dist/mergeAxeResults.js +120 -92
  16. package/dist/npmIndex.js +1 -0
  17. package/dist/utils.js +23 -28
  18. package/oobee-client-scanner.js +33 -2
  19. package/package.json +2 -2
  20. package/src/cli.ts +4 -0
  21. package/src/combine.ts +15 -1
  22. package/src/constants/cliFunctions.ts +7 -0
  23. package/src/constants/common.ts +131 -79
  24. package/src/constants/constants.ts +1 -0
  25. package/src/crawlers/commonCrawlerFunc.ts +103 -14
  26. package/src/crawlers/crawlDomain.ts +52 -65
  27. package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
  28. package/src/crawlers/crawlRateController.ts +63 -0
  29. package/src/crawlers/crawlSitemap.ts +57 -70
  30. package/src/generateOobeeClientScanner.ts +31 -0
  31. package/src/index.ts +1 -0
  32. package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
  33. package/src/mergeAxeResults.ts +139 -99
  34. package/src/npmIndex.ts +1 -0
  35. package/src/utils.ts +25 -33
  36. /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0
@@ -5,6 +5,7 @@ import printMessage from 'print-message';
5
5
  import path from 'path';
6
6
  import ejs from 'ejs';
7
7
  import { fileURLToPath } from 'url';
8
+ import { Dataset, RequestQueue, Configuration } from 'crawlee';
8
9
  import constants, {
9
10
  BrowserTypes,
10
11
  ScannerTypes,
@@ -358,61 +359,97 @@ const writeSummaryPdf = async (
358
359
  browser: string,
359
360
  _userDataDirectory: string,
360
361
  ) => {
361
- let browserInstance;
362
- let context;
363
- let page;
362
+ const renderPdfWithBrowser = async (browserToUse: string) => {
363
+ let browserInstance;
364
+ let context;
365
+ let page;
364
366
 
365
- try {
366
- const htmlFilePath = path.join(storagePath, `${filename}.html`);
367
- const fileDestinationPath = path.join(storagePath, `${filename}.pdf`);
368
- const htmlFileUrl = `file://${htmlFilePath}`;
367
+ try {
368
+ const htmlFilePath = path.join(storagePath, `${filename}.html`);
369
+ const fileDestinationPath = path.join(storagePath, `${filename}.pdf`);
370
+ const htmlFileUrl = `file://${htmlFilePath}`;
369
371
 
370
- const launchOptions = getPlaywrightLaunchOptions(browser);
372
+ const launchOptions = getPlaywrightLaunchOptions(browserToUse);
371
373
 
372
- browserInstance = await constants.launcher.launch({
373
- ...launchOptions,
374
- headless: true,
375
- });
374
+ browserInstance = await constants.launcher.launch({
375
+ ...launchOptions,
376
+ headless: true,
377
+ });
376
378
 
377
- register(browserInstance as unknown as { close: () => Promise<void> });
379
+ register(browserInstance as unknown as { close: () => Promise<void> });
378
380
 
379
- context = await browserInstance.newContext();
380
- page = await context.newPage();
381
+ context = await browserInstance.newContext();
382
+ page = await context.newPage();
381
383
 
382
- await page.goto(htmlFileUrl, {
383
- waitUntil: 'domcontentloaded',
384
- timeout: 120000,
385
- });
384
+ await page.goto(htmlFileUrl, {
385
+ waitUntil: 'domcontentloaded',
386
+ timeout: 120000,
387
+ });
386
388
 
387
- await page.emulateMedia({ media: 'print' });
389
+ await page.emulateMedia({ media: 'print' });
388
390
 
389
- await page.pdf({
390
- margin: { bottom: '32px' },
391
- path: fileDestinationPath,
392
- format: 'A4',
393
- displayHeaderFooter: true,
394
- footerTemplate: `
391
+ await page.pdf({
392
+ margin: { bottom: '32px' },
393
+ path: fileDestinationPath,
394
+ format: 'A4',
395
+ displayHeaderFooter: true,
396
+ footerTemplate: `
395
397
  <div style="margin-top:50px;color:#26241b;font-family:Open Sans;text-align: center;width: 100%;font-weight:400">
396
398
  <span style="color:#26241b;font-size: 14px;font-weight:400">Page <span class="pageNumber"></span> of <span class="totalPages"></span></span>
397
399
  </div>
398
400
  `,
399
- });
401
+ });
400
402
 
401
- if (pagesScanned < 2000) {
402
403
  fs.unlinkSync(htmlFilePath);
404
+ } finally {
405
+ try {
406
+ await page?.close();
407
+ } catch (err) {
408
+ consoleLogger.info(`Error at page close writeSummaryPDF ${err}`);
409
+ }
410
+ try {
411
+ await context?.close();
412
+ } catch (err) {
413
+ consoleLogger.info(`Error at context close writeSummaryPDF ${err}`);
414
+ }
415
+ try {
416
+ await browserInstance?.close();
417
+ } catch (err) {
418
+ consoleLogger.info(`Error at browserInstance close writeSummaryPDF ${err}`);
419
+ }
420
+ }
421
+ };
422
+
423
+ const browserAttempts = [browser];
424
+
425
+ // Runtime fallback: if Chrome launch fails on Windows, try Edge once for PDF generation.
426
+ if (process.platform === 'win32' && browser === BrowserTypes.CHROME) {
427
+ browserAttempts.push(BrowserTypes.EDGE);
428
+ }
429
+
430
+ for (let i = 0; i < browserAttempts.length; i++) {
431
+ const currentBrowser = browserAttempts[i];
432
+ try {
433
+ await renderPdfWithBrowser(currentBrowser);
434
+ if (i > 0) {
435
+ consoleLogger.warn(
436
+ `writeSummaryPDF succeeded with fallback browser '${currentBrowser}' after '${browser}' failed.`,
437
+ );
438
+ }
439
+ return;
440
+ } catch (err) {
441
+ const isLastAttempt = i === browserAttempts.length - 1;
442
+ consoleLogger.info(
443
+ `Error at writeSummaryPDF using browser '${currentBrowser}': ${err instanceof Error ? err.stack : err}`,
444
+ );
445
+ if (isLastAttempt) {
446
+ return;
447
+ }
448
+ const nextBrowser = browserAttempts[i + 1];
449
+ consoleLogger.warn(
450
+ `writeSummaryPDF failed using browser '${currentBrowser}', retrying with '${nextBrowser}'.`,
451
+ );
403
452
  }
404
- } catch (err) {
405
- consoleLogger.info(`Error at writeSummaryPDF ${err instanceof Error ? err.stack : err}`);
406
- } finally {
407
- await page?.close().catch(err => {
408
- consoleLogger.info(`Error at page close writeSummaryPDF ${err}`);
409
- });
410
- await context?.close().catch(err => {
411
- consoleLogger.info(`Error at context close writeSummaryPDF ${err}`);
412
- });
413
- await browserInstance?.close().catch(err => {
414
- consoleLogger.info(`Error at browserInstance close writeSummaryPDF ${err}`);
415
- });
416
453
  }
417
454
  };
418
455
 
@@ -645,6 +682,8 @@ export const createRuleIdJson = async (allIssues, itemsStore?: ItemsStore) => {
645
682
  const compiledRuleJson = {};
646
683
 
647
684
  for (const category of ['mustFix', 'goodToFix', 'needsReview'] as const) {
685
+ compiledRuleJson[category] = {};
686
+
648
687
  for (const rule of allIssues.items[category].rules) {
649
688
  let allItems: any[] = [];
650
689
 
@@ -656,7 +695,7 @@ export const createRuleIdJson = async (allIssues, itemsStore?: ItemsStore) => {
656
695
  allItems = rule.pagesAffected.flatMap(page => page.items || []);
657
696
  }
658
697
 
659
- compiledRuleJson[rule.rule] = extractRuleAiData(rule.rule, rule.totalItems, allItems);
698
+ compiledRuleJson[category][rule.rule] = extractRuleAiData(rule.rule, rule.totalItems, allItems);
660
699
  }
661
700
  }
662
701
 
@@ -668,10 +707,12 @@ export const createBasicFormHTMLSnippet = filteredResults => {
668
707
  const compiledRuleJson = {};
669
708
 
670
709
  ['mustFix', 'goodToFix', 'needsReview'].forEach(category => {
710
+ compiledRuleJson[category] = {};
711
+
671
712
  if (filteredResults[category] && filteredResults[category].rules) {
672
713
  Object.entries(filteredResults[category].rules).forEach(
673
714
  ([ruleId, ruleVal]: [string, any]) => {
674
- compiledRuleJson[ruleId] = extractRuleAiData(ruleId, ruleVal.totalItems, ruleVal.items);
715
+ compiledRuleJson[category][ruleId] = extractRuleAiData(ruleId, ruleVal.totalItems, ruleVal.items);
675
716
  },
676
717
  );
677
718
  }
@@ -741,6 +782,7 @@ const generateArtifacts = async (
741
782
  },
742
783
  zip: string = undefined, // optional
743
784
  generateJsonFiles = false,
785
+ preferredBrowser?: string,
744
786
  ) => {
745
787
  consoleLogger.info('Generating report artifacts');
746
788
 
@@ -760,6 +802,8 @@ const generateArtifacts = async (
760
802
  endTime: scanDetails.endTime ? scanDetails.endTime : new Date(),
761
803
  urlScanned,
762
804
  scanType,
805
+ totalLinksFetchedFromSitemaps: constants.sitemapFetchedLinks?.totalLinksFetchedFromSitemaps ?? 0,
806
+ fetchedSitemaps: constants.sitemapFetchedLinks?.fetchedSitemaps ?? [],
763
807
  deviceChosen: scanDetails.deviceChosen || 'Desktop',
764
808
  formatAboutStartTime,
765
809
  isCustomFlow,
@@ -1005,7 +1049,11 @@ const generateArtifacts = async (
1005
1049
  ]);
1006
1050
  }
1007
1051
 
1008
- const browserChannel = getBrowserToRun(randomToken, BrowserTypes.CHROME, false).browserToRun;
1052
+ const browserChannel = getBrowserToRun(
1053
+ randomToken,
1054
+ (preferredBrowser as BrowserTypes) || BrowserTypes.CHROME,
1055
+ false,
1056
+ ).browserToRun;
1009
1057
 
1010
1058
  // Should consider refactor constants.userDataDirectory to be a parameter in future
1011
1059
  await retryFunction(
@@ -1020,36 +1068,34 @@ const generateArtifacts = async (
1020
1068
  1,
1021
1069
  );
1022
1070
 
1023
- // Suppress uncaught EPERM errors from lingering Crawlee async lock-file operations
1024
- // (Windows holds mandatory file locks; Crawlee may still attempt mkdir on .json.lock
1025
- // files after the crawl has finished). Without this, Node crashes with uncaughtException.
1026
- const crawleeEpermHandler = (err: Error & { code?: string }) => {
1027
- if (err.code === 'EPERM' && err.message?.includes('crawlee')) {
1028
- consoleLogger.info(`Suppressed lingering Crawlee storage error: ${err.message}`);
1029
- return;
1030
- }
1031
- // Re-throw non-crawlee EPERM errors so they aren't silently swallowed
1032
- throw err;
1033
- };
1034
- process.on('uncaughtException', crawleeEpermHandler);
1035
- process.on('unhandledRejection', crawleeEpermHandler);
1071
+ // Flush pending background storage operations (metadata writes, lock-file ops)
1072
+ const storageClient = Configuration.getStorageClient();
1073
+ if (storageClient.teardown) {
1074
+ await storageClient.teardown();
1075
+ }
1036
1076
 
1037
- // Brief delay to allow lingering async crawlee storage operations to flush
1038
- await new Promise(resolve => setTimeout(resolve, process.platform === 'win32' ? 5000 : 3000));
1077
+ // Gracefully drop Dataset and RequestQueue releases locks and removes files
1078
+ const crawleeDir = path.join(storagePath, 'crawlee');
1079
+ try {
1080
+ const dataset = await Dataset.open(crawleeDir);
1081
+ await dataset.drop();
1082
+ } catch (error) {
1083
+ consoleLogger.info(`Dataset drop: ${error.message}`);
1084
+ }
1085
+
1086
+ try {
1087
+ const requestQueue = await RequestQueue.open(crawleeDir);
1088
+ await requestQueue.drop();
1089
+ } catch (error) {
1090
+ consoleLogger.info(`RequestQueue drop: ${error.message}`);
1091
+ }
1039
1092
 
1093
+ // Fallback rm for any leftover files not managed by Crawlee's storage API
1040
1094
  const crawleePath = path.join(storagePath, 'crawlee');
1041
1095
  try {
1042
1096
  await fs.promises.rm(crawleePath, { recursive: true, force: true });
1043
- } catch (error) {
1044
- // On Windows, retry once after a delay if the folder is still locked
1045
- if (process.platform === 'win32') {
1046
- await new Promise(resolve => setTimeout(resolve, 3000));
1047
- try {
1048
- await fs.promises.rm(crawleePath, { recursive: true, force: true });
1049
- } catch {
1050
- // Best-effort cleanup — leave the folder; report generation continues
1051
- }
1052
- }
1097
+ } catch {
1098
+ // Best-effort; storage was already dropped via API
1053
1099
  }
1054
1100
 
1055
1101
  try {
@@ -1058,6 +1104,31 @@ const generateArtifacts = async (
1058
1104
  consoleLogger.warn(`Unable to force remove pdfs folder: ${error.message}`);
1059
1105
  }
1060
1106
 
1107
+ // Generate scrubbed HTML Code Snippets
1108
+ const ruleIdJson = await createRuleIdJson(allIssues, itemsStore);
1109
+
1110
+ // Clean up intermediate items files before zipping
1111
+ await itemsStore.cleanup();
1112
+
1113
+ try {
1114
+ await sendWcagBreakdownToSentry(
1115
+ oobeeAppVersion,
1116
+ wcagOccurrencesMap,
1117
+ ruleIdJson,
1118
+ {
1119
+ entryUrl: urlScanned,
1120
+ scanType,
1121
+ browser: scanDetails.deviceChosen,
1122
+ email: scanDetails.nameEmail?.email,
1123
+ name: scanDetails.nameEmail?.name,
1124
+ },
1125
+ allIssues,
1126
+ pagesScanned.length,
1127
+ );
1128
+ } catch (error) {
1129
+ console.error('Error sending WCAG data to Sentry:', error);
1130
+ }
1131
+
1061
1132
  // Take option if set
1062
1133
  if (typeof zip === 'string') {
1063
1134
  constants.cliZipFileName = zip;
@@ -1104,40 +1175,9 @@ const generateArtifacts = async (
1104
1175
  printMessage([`Error in zipping results: ${error}`]);
1105
1176
  }
1106
1177
 
1107
- // Generate scrubbed HTML Code Snippets
1108
- const ruleIdJson = await createRuleIdJson(allIssues, itemsStore);
1109
-
1110
- // Clean up intermediate items files
1111
- await itemsStore.cleanup();
1112
-
1113
- // At the end of the function where results are generated, add:
1114
- try {
1115
- // Always send WCAG breakdown to Sentry, even if no violations were found
1116
- // This ensures that all criteria are reported, including those with 0 occurrences
1117
- await sendWcagBreakdownToSentry(
1118
- oobeeAppVersion,
1119
- wcagOccurrencesMap,
1120
- ruleIdJson,
1121
- {
1122
- entryUrl: urlScanned,
1123
- scanType,
1124
- browser: scanDetails.deviceChosen,
1125
- email: scanDetails.nameEmail?.email,
1126
- name: scanDetails.nameEmail?.name,
1127
- },
1128
- allIssues,
1129
- pagesScanned.length,
1130
- );
1131
- } catch (error) {
1132
- console.error('Error sending WCAG data to Sentry:', error);
1133
- }
1134
-
1135
1178
  if (process.env.RUNNING_FROM_PH_GUI || process.env.OOBEE_VERBOSE)
1136
1179
  console.log('Report generated successfully');
1137
1180
 
1138
- process.removeListener('uncaughtException', crawleeEpermHandler);
1139
- process.removeListener('unhandledRejection', crawleeEpermHandler);
1140
-
1141
1181
  return ruleIdJson;
1142
1182
  };
1143
1183
 
package/src/npmIndex.ts CHANGED
@@ -363,6 +363,7 @@ export const init = async ({
363
363
  const { mustFix: mustFixThreshold, goodToFix: goodToFixThreshold } = thresholds;
364
364
 
365
365
  process.env.CRAWLEE_STORAGE_DIR = randomToken;
366
+ constants.sitemapFetchedLinks = null;
366
367
 
367
368
  const scanDetails = {
368
369
  startTime: new Date(),
package/src/utils.ts CHANGED
@@ -6,6 +6,7 @@ import axe, { Rule } from 'axe-core';
6
6
  import { v4 as uuidv4 } from 'uuid';
7
7
  import { getDomain } from 'tldts';
8
8
  import { normalizeUrl } from '@apify/utilities';
9
+ import { Dataset, RequestQueue, Configuration } from 'crawlee';
9
10
  import constants, {
10
11
  BrowserTypes,
11
12
  destinationPath,
@@ -390,6 +391,19 @@ export const cleanUp = async (randomToken?: string, isError: boolean = false): P
390
391
  if (randomToken !== undefined) {
391
392
  const storagePath = getStoragePath(randomToken);
392
393
 
394
+ try {
395
+ const storageClient = Configuration.getStorageClient();
396
+ if (storageClient.teardown) {
397
+ await storageClient.teardown();
398
+ }
399
+ const crawleeDir = path.join(storagePath, 'crawlee');
400
+ const dataset = await Dataset.open(crawleeDir);
401
+ await dataset.drop();
402
+ const requestQueue = await RequestQueue.open(crawleeDir);
403
+ await requestQueue.drop();
404
+ } catch (error) {
405
+ consoleLogger.info(`Crawlee storage drop in cleanUp: ${error.message}`);
406
+ }
393
407
  try {
394
408
  fs.rmSync(path.join(storagePath, 'crawlee'), { recursive: true, force: true });
395
409
  } catch (error) {
@@ -402,37 +416,8 @@ export const cleanUp = async (randomToken?: string, isError: boolean = false): P
402
416
  consoleLogger.warn(`Unable to force remove pdfs folder: ${error.message}`);
403
417
  }
404
418
 
405
- let deleteErrorLogFile = true;
406
-
407
- if (isError) {
408
- let logsPath = storagePath;
409
-
410
- if (process.env.OOBEE_LOGS_PATH) {
411
- logsPath = process.env.OOBEE_LOGS_PATH;
412
- }
413
-
414
- if (fs.existsSync(errorsTxtPath)) {
415
- try {
416
- const logFilePath = path.join(logsPath, `logs-${randomToken}.txt`);
417
- fs.copyFileSync(errorsTxtPath, logFilePath);
418
- console.log(`An error occured. Log file is located at: ${logFilePath}`);
419
-
420
- } catch (copyError) {
421
- consoleLogger.error(`Error copying errors file during cleanup: ${copyError.message}`);
422
- console.log(`An error occured. Log file is located at: ${errorsTxtPath}`);
423
- deleteErrorLogFile = false; // Do not delete the log file if copy failed
424
- }
425
-
426
- if (deleteErrorLogFile && fs.existsSync(errorsTxtPath)) {
427
- try {
428
- fs.unlinkSync(errorsTxtPath);
429
- } catch (error) {
430
- consoleLogger.warn(`Unable to delete log file ${errorsTxtPath}: ${error.message}`);
431
- }
432
- }
433
-
434
- }
435
-
419
+ if (isError && fs.existsSync(errorsTxtPath)) {
420
+ console.log(`An error occured. Log file is located at: ${errorsTxtPath}`);
436
421
  }
437
422
 
438
423
  if (fs.existsSync(storagePath) && fs.readdirSync(storagePath).length === 0) {
@@ -1081,13 +1066,20 @@ export const randomThreeDigitNumberString = () => {
1081
1066
 
1082
1067
  export const normUrl = (u: string): string => (u ? normalizeUrl(u) || u : '');
1083
1068
 
1069
+ export const stripWwwPrefix = (hostname: string): string => hostname.replace(/^www\./, '');
1070
+
1071
+ export const isSameHostname = (hostname1: string, hostname2: string): boolean =>
1072
+ stripWwwPrefix(hostname1) === stripWwwPrefix(hostname2);
1073
+
1084
1074
  export const isFollowStrategy = (link1: string, link2: string, rule: string): boolean => {
1085
1075
  if (rule === 'all') return true;
1086
1076
  try {
1087
1077
  const parsedLink1 = new URL(link1);
1088
1078
  const parsedLink2 = new URL(link2);
1089
1079
  if (rule === 'same-origin') {
1090
- return parsedLink1.origin === parsedLink2.origin;
1080
+ return parsedLink1.protocol === parsedLink2.protocol &&
1081
+ isSameHostname(parsedLink1.hostname, parsedLink2.hostname) &&
1082
+ parsedLink1.port === parsedLink2.port;
1091
1083
  }
1092
1084
  if (rule === 'same-domain') {
1093
1085
  const link1Domain = getDomain(parsedLink1.hostname, { allowPrivateDomains: true }) || parsedLink1.hostname;
@@ -1095,7 +1087,7 @@ export const isFollowStrategy = (link1: string, link2: string, rule: string): bo
1095
1087
  return link1Domain.toLowerCase() === link2Domain.toLowerCase();
1096
1088
  }
1097
1089
  // default: same-hostname
1098
- return parsedLink1.hostname === parsedLink2.hostname;
1090
+ return isSameHostname(parsedLink1.hostname, parsedLink2.hostname);
1099
1091
  } catch {
1100
1092
  return false;
1101
1093
  }