@govtechsg/oobee 0.10.85 → 0.10.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/.github/workflows/publish.yml +10 -0
  2. package/DETAILS.md +29 -0
  3. package/dist/cli.js +18 -5
  4. package/dist/combine.js +3 -1
  5. package/dist/constants/cliFunctions.js +2 -2
  6. package/dist/constants/common.js +70 -17
  7. package/dist/constants/constants.js +604 -1
  8. package/dist/crawlers/commonCrawlerFunc.js +3 -2
  9. package/dist/crawlers/crawlDomain.js +38 -13
  10. package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
  11. package/dist/crawlers/crawlSitemap.js +141 -84
  12. package/dist/crawlers/custom/utils.js +218 -71
  13. package/dist/crawlers/guards/urlGuard.js +8 -15
  14. package/dist/crawlers/runCustom.js +18 -11
  15. package/dist/generateHtmlReport.js +18 -11
  16. package/dist/generateOobeeClientScanner.js +570 -0
  17. package/dist/mergeAxeResults/itemReferences.js +60 -25
  18. package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
  19. package/dist/mergeAxeResults.js +23 -13
  20. package/dist/npmIndex.js +10 -2
  21. package/dist/proxyService.js +18 -3
  22. package/dist/services/s3Uploader.js +21 -10
  23. package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  24. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
  25. package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
  26. package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
  27. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
  28. package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  29. package/dist/static/ejs/summary.ejs +19 -8
  30. package/dist/utils.js +4 -3
  31. package/fix-summary-html-oom-pr.md +62 -0
  32. package/oobee-client-scanner.js +34992 -0
  33. package/package.json +5 -5
  34. package/src/cli.ts +19 -5
  35. package/src/combine.ts +5 -1
  36. package/src/constants/cliFunctions.ts +2 -2
  37. package/src/constants/common.ts +87 -22
  38. package/src/constants/constants.ts +602 -1
  39. package/src/crawlers/commonCrawlerFunc.ts +4 -3
  40. package/src/crawlers/crawlDomain.ts +39 -13
  41. package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
  42. package/src/crawlers/crawlSitemap.ts +165 -100
  43. package/src/crawlers/custom/utils.ts +241 -80
  44. package/src/crawlers/guards/urlGuard.ts +24 -31
  45. package/src/crawlers/runCustom.ts +29 -11
  46. package/src/generateHtmlReport.ts +21 -11
  47. package/src/generateOobeeClientScanner.ts +591 -0
  48. package/src/mergeAxeResults/itemReferences.ts +70 -26
  49. package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
  50. package/src/mergeAxeResults.ts +26 -14
  51. package/src/npmIndex.ts +12 -2
  52. package/src/proxyService.ts +25 -4
  53. package/src/services/s3Uploader.ts +23 -11
  54. package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  55. package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
  56. package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
  57. package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
  58. package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
  59. package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  60. package/src/static/ejs/summary.ejs +19 -8
  61. package/src/utils.ts +4 -3
  62. package/testStaticJSScanner.html +534 -0
@@ -3,6 +3,8 @@ on:
3
3
  workflow_dispatch:
4
4
  release:
5
5
  types: [published]
6
+ permissions:
7
+ contents: write
6
8
  jobs:
7
9
  build:
8
10
  runs-on: ubuntu-latest
@@ -20,6 +22,14 @@ jobs:
20
22
  - run: npm run build
21
23
  continue-on-error: false
22
24
 
25
+ - name: Create and push git tag
26
+ run: |
27
+ VERSION=$(node -p "require('./package.json').version")
28
+ git config user.name "github-actions[bot]"
29
+ git config user.email "github-actions[bot]@users.noreply.github.com"
30
+ git tag -af "v${VERSION}" -m "Version ${VERSION}"
31
+ git push origin "v${VERSION}" --force
32
+
23
33
  - run: npm publish
24
34
  env:
25
35
  NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
package/DETAILS.md CHANGED
@@ -195,3 +195,32 @@ Note: Level AAA are disabled by default. Please specify `enable-wcag-aaa` in ru
195
195
  | skip-link | Ensure all skip links have a focusable target | Good to Fix |
196
196
  | tabindex | Ensures tabindex attribute values are not greater than 0 | Good to Fix |
197
197
  | table-duplicate-name | Ensure the `<caption>` element does not contain the same text as the summary attribute | Good to Fix |
198
+
199
+ ## Additional Information
200
+ ### How the Readability Grading Works
201
+
202
+ #### 1. Text Extraction
203
+
204
+ During a page scan, Oobee extracts text from all `<p>` elements on the page (via `extractAndGradeText.ts` or `extractText.ts`). The raw text is split into individual **sentences** using the pattern `/[^.!?]*[.!?]+/g`, so only text segments ending with `.`, `!`, or `?` are kept.
205
+
206
+ #### 2. Flesch Reading Ease Scoring
207
+
208
+ The extracted sentences are joined into a single string and the words are counted. If the page has **fewer than 20 words**, grading is skipped (score = 0, treated as a pass). Otherwise, the [Flesch Reading Ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests) score is computed via the `text-readability` library in `gradeReadability.ts` and interpreted as follows:
209
+
210
+ | Score Range | Interpretation |
211
+ |---|---|
212
+ | 90–100 | Very easy to read (5th grade) |
213
+ | 60–70 | Easily understood by 13–15 year olds |
214
+ | **≤ 50** | **Difficult — college level or above (Oobee's flagging threshold)** |
215
+ | 0–30 | Very difficult — best understood by university graduates |
216
+
217
+ #### 3. Flagging Criteria
218
+
219
+ The `oobee-grading-text-contents` rule is **only enabled when WCAG AAA mode is on** (`enableWcagAaa = true`), and its findings are reported under **Manual Review Required**. It maps to **WCAG 3.1.5 (Reading Level)**.
220
+
221
+ A page is **flagged** (incomplete) when the Flesch Reading Ease score is **50 or below**, indicating the text is potentially difficult to understand. The issue message reports the exact score and explains that the target passing score is above 50.
222
+
223
+ A page **passes** when:
224
+ - The score is **above 50**, or
225
+ - There are fewer than 20 words of paragraph text, or
226
+ - No valid sentences (ending with punctuation) are found
package/dist/cli.js CHANGED
@@ -147,8 +147,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
147
147
  })
148
148
  .check(argvs => {
149
149
  const scanner = String(argvs.scanner ?? '');
150
- if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
151
- throw new Error('-s or --strategy is only available in website and custom flow scans.');
150
+ if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
151
+ throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
152
+ }
153
+ if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
154
+ throw new Error('-s ignore is only available for sitemap scans.');
152
155
  }
153
156
  return true;
154
157
  })
@@ -161,13 +164,19 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
161
164
  return duration;
162
165
  })
163
166
  .check(argvs => {
164
- if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
165
- throw new Error('-s or --strategy is only available in website scans.');
167
+ if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
168
+ throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
169
+ }
170
+ if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
171
+ throw new Error('-s ignore is only available for sitemap scans.');
166
172
  }
167
173
  return true;
168
174
  })
169
175
  .conflicts('d', 'w')
170
176
  .parse();
177
+ if (!options.strategy) {
178
+ options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
179
+ }
171
180
  const scanInit = async (argvs) => {
172
181
  const updatedArgvs = { ...argvs };
173
182
  // Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
@@ -187,7 +196,11 @@ const scanInit = async (argvs) => {
187
196
  if (res.httpStatus)
188
197
  consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
189
198
  if (res.status === statuses.success.code) {
190
- data.url = res.url;
199
+ // Custom flow should continue from the user-provided entry URL so auth redirects
200
+ // do not replace the original domain used for overlay gating and navigation.
201
+ if (data.type !== ScannerTypes.CUSTOM) {
202
+ data.url = res.url;
203
+ }
191
204
  if (process.env.OOBEE_VALIDATE_URL) {
192
205
  consoleLogger.info('Url is valid');
193
206
  cleanUpAndExit(0, data.randomToken);
package/dist/combine.js CHANGED
@@ -77,7 +77,7 @@ const combineRun = async (details, deviceToScan) => {
77
77
  let durationExceeded = false;
78
78
  switch (type) {
79
79
  case ScannerTypes.CUSTOM:
80
- const res = await runCustom(url, randomToken, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
80
+ const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
81
81
  urlsCrawledObj = res.urlsCrawled;
82
82
  uiCustomFlowLabel = res.customFlowLabel;
83
83
  break;
@@ -95,6 +95,8 @@ const combineRun = async (details, deviceToScan) => {
95
95
  blacklistedPatterns,
96
96
  includeScreenshots,
97
97
  extraHTTPHeaders,
98
+ strategy,
99
+ userUrl: url,
98
100
  scanDuration,
99
101
  });
100
102
  urlsCrawledObj = sitemapResult.urlsCrawled;
@@ -147,8 +147,8 @@ export const cliOptions = {
147
147
  },
148
148
  s: {
149
149
  alias: 'strategy',
150
- describe: 'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
151
- choices: ['same-domain', 'same-hostname'],
150
+ describe: 'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
151
+ choices: ['same-domain', 'same-hostname', 'ignore'],
152
152
  requiresArg: true,
153
153
  demandOption: false,
154
154
  },
@@ -26,7 +26,7 @@ formDataFields,
26
26
  ScannerTypes, BrowserTypes, FileTypes, getEnumKey, } from './constants.js';
27
27
  import { consoleLogger } from '../logs.js';
28
28
  import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
29
- import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
29
+ import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
30
30
  import { getProxyInfo, proxyInfoToResolution } from '../proxyService.js';
31
31
  // validateDirPath validates a provided directory path
32
32
  // returns null if no error
@@ -175,6 +175,14 @@ export const validateXML = (content) => {
175
175
  });
176
176
  return { isValid, parsedContent };
177
177
  };
178
+ export const validateTXT = (content) => {
179
+ // Strip HTML tags first — browsers wrap .txt files in HTML when fetched via Playwright
180
+ const plainText = content.replace(/<[^>]+>/g, '\n');
181
+ const lines = plainText.split(/\r?\n/).map(l => l.trim()).filter(l => l.length > 0);
182
+ // Allow http, https and relative paths (starting with /) for txt sitemaps, as some sitemaps use relative paths and some txt sitemaps are fetched as HTML by Playwright
183
+ const urlPattern = /^(https?:\/\/|\/)[^\s]+$/i;
184
+ return { isValid: lines.some(line => urlPattern.test(line)) };
185
+ };
178
186
  export const isSkippedUrl = (pageUrl, whitelistedDomains) => {
179
187
  const matched = whitelistedDomains.filter(p => {
180
188
  const pattern = p.replace(/[\n\r]+/g, '');
@@ -464,13 +472,13 @@ export const isSitemapContent = (content) => {
464
472
  }
465
473
  const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
466
474
  const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
467
- const regexForUrl = new RegExp('^.*(http|https):/{2}.*$', 'gmi');
468
475
  if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
469
476
  // is an XML sitemap wrapped in a HTML document
470
477
  return true;
471
478
  }
472
- if (!content.match(regexForHtml) && content.match(regexForUrl)) {
473
- // treat this as a txt sitemap where all URLs will be extracted for crawling
479
+ const { isValid: isTxtSitemap } = validateTXT(content);
480
+ if (isTxtSitemap) {
481
+ // treat this as a txt sitemap (plain text or browser-wrapped with HTML)
474
482
  return true;
475
483
  }
476
484
  // is HTML webpage
@@ -584,7 +592,9 @@ export const prepareData = async (argv) => {
584
592
  viewportWidth,
585
593
  playwrightDeviceDetailsObject,
586
594
  maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
587
- strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain,
595
+ strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
596
+ : strategy === 'ignore' ? EnqueueStrategy.All
597
+ : EnqueueStrategy.SameDomain,
588
598
  isLocalFileScan,
589
599
  browser: browserToRun,
590
600
  nameEmail,
@@ -629,6 +639,10 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
629
639
  let shouldCapture = false;
630
640
  const disallowedUrls = [];
631
641
  const allowedUrls = [];
642
+ // Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
643
+ // Two patterns are returned for bare paths (no trailing wildcard) so that
644
+ // both the exact URL and all child paths are blocked, matching robots.txt
645
+ // prefix semantics.
632
646
  const sanitisePattern = (pattern) => {
633
647
  const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
634
648
  const subdirWildcardRegex = /\/\*\//g;
@@ -636,18 +650,29 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
636
650
  if (subdirWildcardRegex.test(pattern)) {
637
651
  pattern = pattern.replace(subdirWildcardRegex, '/**/');
638
652
  }
653
+ // Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
654
+ // '?' is the query separator in robots.txt but a single-char wildcard in
655
+ // minimatch. Escape it to a literal match and append '*' so any query
656
+ // value after the stated prefix is also blocked.
657
+ if (pattern.includes('?')) {
658
+ return [domain + pattern.replace('?', '\\?') + '*'];
659
+ }
639
660
  if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
640
661
  if (pattern.endsWith('*')) {
641
- pattern = pattern.concat('*');
662
+ // e.g. /ebook/* → /ebook/** (already covers all children)
663
+ return [domain + pattern.concat('*')];
642
664
  }
643
665
  else {
644
- if (!pattern.endsWith('/'))
645
- pattern = pattern.concat('/');
646
- pattern = pattern.concat('**');
666
+ // Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
667
+ // exact URL *and* every descendant. minimatch's '/**' glob does not
668
+ // match the bare path itself (no trailing slash), so we emit both the
669
+ // exact-path pattern and a children glob.
670
+ const base = domain + pattern;
671
+ const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
672
+ return [base, children];
647
673
  }
648
674
  }
649
- const final = domain.concat(pattern);
650
- return final;
675
+ return [domain + pattern];
651
676
  };
652
677
  for (const line of lines) {
653
678
  if (line.toLowerCase().startsWith('user-agent: *')) {
@@ -659,15 +684,13 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
659
684
  else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
660
685
  let disallowed = line.substring('disallow: '.length).trim();
661
686
  if (disallowed) {
662
- disallowed = sanitisePattern(disallowed);
663
- disallowedUrls.push(disallowed);
687
+ disallowedUrls.push(...sanitisePattern(disallowed));
664
688
  }
665
689
  }
666
690
  else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
667
691
  let allowed = line.substring('allow: '.length).trim();
668
692
  if (allowed) {
669
- allowed = sanitisePattern(allowed);
670
- allowedUrls.push(allowed);
693
+ allowedUrls.push(...sanitisePattern(allowed));
671
694
  }
672
695
  }
673
696
  }
@@ -718,6 +741,31 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
718
741
  }
719
742
  }
720
743
  };
744
+ export const getSitemapsFromRobotsTxt = async (url, browser, userDataDirectory, extraHTTPHeaders) => {
745
+ const domain = new URL(url).origin;
746
+ const robotsUrl = domain.concat('/robots.txt');
747
+ let robotsTxt;
748
+ try {
749
+ robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
750
+ }
751
+ catch (e) {
752
+ consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
753
+ return [];
754
+ }
755
+ if (!robotsTxt)
756
+ return [];
757
+ const sitemaps = [];
758
+ const lines = robotsTxt.split(/\r?\n/);
759
+ for (const line of lines) {
760
+ if (line.toLowerCase().startsWith('sitemap:')) {
761
+ const sitemapUrl = line.substring('sitemap:'.length).trim();
762
+ if (sitemapUrl) {
763
+ sitemaps.push(sitemapUrl);
764
+ }
765
+ }
766
+ }
767
+ return sitemaps;
768
+ };
721
769
  export const isDisallowedInRobotsTxt = (url) => {
722
770
  if (!constants.robotsTxtUrls)
723
771
  return;
@@ -736,7 +784,7 @@ export const isDisallowedInRobotsTxt = (url) => {
736
784
  }
737
785
  return false;
738
786
  };
739
- export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders) => {
787
+ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
740
788
  const scannedSitemaps = new Set();
741
789
  const urls = {}; // dictionary of requests to urls to be scanned
742
790
  const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
@@ -745,6 +793,8 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
745
793
  return;
746
794
  if (isDisallowedInRobotsTxt(url))
747
795
  return;
796
+ if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
797
+ return;
748
798
  url = convertPathToLocalFile(url);
749
799
  let request;
750
800
  try {
@@ -1603,6 +1653,7 @@ const cacheProxyInfo = getProxyInfo();
1603
1653
  export const getPlaywrightLaunchOptions = (browser) => {
1604
1654
  const channel = browser || undefined;
1605
1655
  const resolution = proxyInfoToResolution(cacheProxyInfo);
1656
+ const shouldIgnoreMuteAudio = process.env.OOBEE_PLAYWRIGHT_IGNORE_DEFAULT_ARGS === '--mute-audio';
1606
1657
  // Start with your base args and sanitise
1607
1658
  const finalArgs = [...constants.launchOptionsArgs].filter(arg => !arg.startsWith('--headless') &&
1608
1659
  !arg.startsWith('--user-agent=') &&
@@ -1630,7 +1681,9 @@ export const getPlaywrightLaunchOptions = (browser) => {
1630
1681
  break;
1631
1682
  }
1632
1683
  const options = {
1633
- ignoreDefaultArgs: ['--use-mock-keychain'],
1684
+ ignoreDefaultArgs: shouldIgnoreMuteAudio
1685
+ ? ['--use-mock-keychain', '--mute-audio']
1686
+ : ['--use-mock-keychain'],
1634
1687
  args: finalArgs,
1635
1688
  headless: process.env.CRAWLEE_HEADLESS === '1',
1636
1689
  ...(channel && { channel }),