npm - @govtechsg/oobee - Versions diffs - 0.10.85 → 0.10.87 - Mend

@govtechsg/oobee 0.10.85 → 0.10.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)

package/.github/workflows/publish.yml +10 -0
package/DETAILS.md +29 -0
package/dist/cli.js +18 -5
package/dist/combine.js +3 -1
package/dist/constants/cliFunctions.js +2 -2
package/dist/constants/common.js +70 -17
package/dist/constants/constants.js +604 -1
package/dist/crawlers/commonCrawlerFunc.js +3 -2
package/dist/crawlers/crawlDomain.js +38 -13
package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
package/dist/crawlers/crawlSitemap.js +141 -84
package/dist/crawlers/custom/utils.js +218 -71
package/dist/crawlers/guards/urlGuard.js +8 -15
package/dist/crawlers/runCustom.js +18 -11
package/dist/generateHtmlReport.js +18 -11
package/dist/generateOobeeClientScanner.js +570 -0
package/dist/mergeAxeResults/itemReferences.js +60 -25
package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
package/dist/mergeAxeResults.js +23 -13
package/dist/npmIndex.js +10 -2
package/dist/proxyService.js +18 -3
package/dist/services/s3Uploader.js +21 -10
package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
package/dist/static/ejs/summary.ejs +19 -8
package/dist/utils.js +4 -3
package/fix-summary-html-oom-pr.md +62 -0
package/oobee-client-scanner.js +34992 -0
package/package.json +5 -5
package/src/cli.ts +19 -5
package/src/combine.ts +5 -1
package/src/constants/cliFunctions.ts +2 -2
package/src/constants/common.ts +87 -22
package/src/constants/constants.ts +602 -1
package/src/crawlers/commonCrawlerFunc.ts +4 -3
package/src/crawlers/crawlDomain.ts +39 -13
package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
package/src/crawlers/crawlSitemap.ts +165 -100
package/src/crawlers/custom/utils.ts +241 -80
package/src/crawlers/guards/urlGuard.ts +24 -31
package/src/crawlers/runCustom.ts +29 -11
package/src/generateHtmlReport.ts +21 -11
package/src/generateOobeeClientScanner.ts +591 -0
package/src/mergeAxeResults/itemReferences.ts +70 -26
package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
package/src/mergeAxeResults.ts +26 -14
package/src/npmIndex.ts +12 -2
package/src/proxyService.ts +25 -4
package/src/services/s3Uploader.ts +23 -11
package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
package/src/static/ejs/summary.ejs +19 -8
package/src/utils.ts +4 -3
package/testStaticJSScanner.html +534 -0

package/.github/workflows/publish.yml CHANGED Viewed

@@ -3,6 +3,8 @@ on:
   workflow_dispatch:
   release:
     types: [published]
+permissions:
+  contents: write
 jobs:
   build:
     runs-on: ubuntu-latest
@@ -20,6 +22,14 @@ jobs:
       - run: npm run build
         continue-on-error: false
+      - name: Create and push git tag
+        run: |
+          VERSION=$(node -p "require('./package.json').version")
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git tag -af "v${VERSION}" -m "Version ${VERSION}"
+          git push origin "v${VERSION}" --force
       - run: npm publish
         env:
           NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

package/DETAILS.md CHANGED Viewed

@@ -195,3 +195,32 @@ Note: Level AAA are disabled by default.  Please specify `enable-wcag-aaa` in ru
 | skip-link                           | Ensure all skip links have a focusable target                                                                                                  | Good to Fix |
 | tabindex                            | Ensures tabindex attribute values are not greater than 0                                                                                       | Good to Fix |
 | table-duplicate-name                | Ensure the `<caption>` element does not contain the same text as the summary attribute                                                         | Good to Fix |
+## Additional Information
+### How the Readability Grading Works
+#### 1. Text Extraction
+During a page scan, Oobee extracts text from all `<p>` elements on the page (via extractAndGradeText.ts or extractText.ts). The raw text is split into individual **sentences** using the pattern `/[^.!?]*[.!?]+/g` — only text segments ending with `.`, `!`, or `?` are kept.
+#### 2. Flesch Reading Ease Scoring
+The extracted sentences are joined into a single string and word-counted. If the page has **fewer than 20 words**, grading is skipped (score = 0, treated as a pass). Otherwise, the [Flesch Reading Ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests) formula is applied via the `text-readability` library in gradeReadability.ts:
+| Score Range | Interpretation |
+|---|---|
+| 90–100 | Very easy to read (5th grade) |
+| 60–70 | Easily understood by 13–15 year olds |
+| **≤ 50** | **Difficult — college level or above** |
+| 0–30 | Very difficult — best understood by university graduates |
+#### 3. Flagging Criteria
+The `oobee-grading-text-contents` rule is **only enabled when WCAG AAA mode is on** (`enableWcagAaa = true`) and violations are flagged under **Manual Review Required** findings. It maps to **WCAG 3.1.5 (Reading Level)**.
+A page is **flagged** (incomplete) when the Flesch Reading Ease score is **50 or below**, indicating the text is potentially difficult to understand. The issue message reports the exact score and explains that the target passing score is above 50.
+A page **passes** when:
+- The score is **above 50**, or
+- There are fewer than 20 words of paragraph text, or
+- No valid sentences (ending with punctuation) are found

package/dist/cli.js CHANGED Viewed

@@ -147,8 +147,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
 })
     .check(argvs => {
     const scanner = String(argvs.scanner ?? '');
-    if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
-        throw new Error('-s or --strategy is only available in website and custom flow scans.');
+    if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
+        throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
+    }
+    if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
+        throw new Error('-s ignore is only available for sitemap scans.');
     }
     return true;
 })
@@ -161,13 +164,19 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
     return duration;
 })
     .check(argvs => {
-    if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
-        throw new Error('-s or --strategy is only available in website scans.');
+    if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
+        throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
+    }
+    if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
+        throw new Error('-s ignore is only available for sitemap scans.');
     }
     return true;
 })
     .conflicts('d', 'w')
     .parse();
+if (!options.strategy) {
+    options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
+}
 const scanInit = async (argvs) => {
     const updatedArgvs = { ...argvs };
     // Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
@@ -187,7 +196,11 @@ const scanInit = async (argvs) => {
     if (res.httpStatus)
         consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
     if (res.status === statuses.success.code) {
-        data.url = res.url;
+        // Custom flow should continue from the user-provided entry URL so auth redirects
+        // do not replace the original domain used for overlay gating and navigation.
+        if (data.type !== ScannerTypes.CUSTOM) {
+            data.url = res.url;
+        }
         if (process.env.OOBEE_VALIDATE_URL) {
             consoleLogger.info('Url is valid');
             cleanUpAndExit(0, data.randomToken);

package/dist/combine.js CHANGED Viewed

@@ -77,7 +77,7 @@ const combineRun = async (details, deviceToScan) => {
     let durationExceeded = false;
     switch (type) {
         case ScannerTypes.CUSTOM:
-            const res = await runCustom(url, randomToken, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
+            const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
             urlsCrawledObj = res.urlsCrawled;
             uiCustomFlowLabel = res.customFlowLabel;
             break;
@@ -95,6 +95,8 @@ const combineRun = async (details, deviceToScan) => {
                 blacklistedPatterns,
                 includeScreenshots,
                 extraHTTPHeaders,
+                strategy,
+                userUrl: url,
                 scanDuration,
             });
             urlsCrawledObj = sitemapResult.urlsCrawled;

package/dist/constants/cliFunctions.js CHANGED Viewed

@@ -147,8 +147,8 @@ export const cliOptions = {
     },
     s: {
         alias: 'strategy',
-        describe: 'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
-        choices: ['same-domain', 'same-hostname'],
+        describe: 'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
+        choices: ['same-domain', 'same-hostname', 'ignore'],
         requiresArg: true,
         demandOption: false,
     },

package/dist/constants/common.js CHANGED Viewed

@@ -26,7 +26,7 @@ formDataFields,
 ScannerTypes, BrowserTypes, FileTypes, getEnumKey, } from './constants.js';
 import { consoleLogger } from '../logs.js';
 import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
-import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
+import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
 import { getProxyInfo, proxyInfoToResolution } from '../proxyService.js';
 // validateDirPath validates a provided directory path
 // returns null if no error
@@ -175,6 +175,14 @@ export const validateXML = (content) => {
     });
     return { isValid, parsedContent };
 };
+export const validateTXT = (content) => {
+    // Strip HTML tags first — browsers wrap .txt files in HTML when fetched via Playwright
+    const plainText = content.replace(/<[^>]+>/g, '\n');
+    const lines = plainText.split(/\r?\n/).map(l => l.trim()).filter(l => l.length > 0);
+    // Allow http, https and relative paths (starting with /) for txt sitemaps, as some sitemaps use relative paths and some txt sitemaps are fetched as HTML by Playwright
+    const urlPattern = /^(https?:\/\/|\/)[^\s]+$/i;
+    return { isValid: lines.some(line => urlPattern.test(line)) };
+};
 export const isSkippedUrl = (pageUrl, whitelistedDomains) => {
     const matched = whitelistedDomains.filter(p => {
         const pattern = p.replace(/[\n\r]+/g, '');
@@ -464,13 +472,13 @@ export const isSitemapContent = (content) => {
     }
     const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
     const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
-    const regexForUrl = new RegExp('^.*(http|https):/{2}.*$', 'gmi');
     if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
         // is an XML sitemap wrapped in a HTML document
         return true;
     }
-    if (!content.match(regexForHtml) && content.match(regexForUrl)) {
-        // treat this as a txt sitemap where all URLs will be extracted for crawling
+    const { isValid: isTxtSitemap } = validateTXT(content);
+    if (isTxtSitemap) {
+        // treat this as a txt sitemap (plain text or browser-wrapped with HTML)
         return true;
     }
     // is HTML webpage
@@ -584,7 +592,9 @@ export const prepareData = async (argv) => {
         viewportWidth,
         playwrightDeviceDetailsObject,
         maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
-        strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain,
+        strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
+            : strategy === 'ignore' ? EnqueueStrategy.All
+                : EnqueueStrategy.SameDomain,
         isLocalFileScan,
         browser: browserToRun,
         nameEmail,
@@ -629,6 +639,10 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
     let shouldCapture = false;
     const disallowedUrls = [];
     const allowedUrls = [];
+    // Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
+    // Two patterns are returned for bare paths (no trailing wildcard) so that
+    // both the exact URL and all child paths are blocked, matching robots.txt
+    // prefix semantics.
     const sanitisePattern = (pattern) => {
         const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
         const subdirWildcardRegex = /\/\*\//g;
@@ -636,18 +650,29 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
         if (subdirWildcardRegex.test(pattern)) {
             pattern = pattern.replace(subdirWildcardRegex, '/**/');
         }
+        // Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
+        // '?' is the query separator in robots.txt but a single-char wildcard in
+        // minimatch. Escape it to a literal match and append '*' so any query
+        // value after the stated prefix is also blocked.
+        if (pattern.includes('?')) {
+            return [domain + pattern.replace('?', '\\?') + '*'];
+        }
         if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
             if (pattern.endsWith('*')) {
-                pattern = pattern.concat('*');
+                // e.g. /ebook/* → /ebook/** (already covers all children)
+                return [domain + pattern.concat('*')];
             }
             else {
-                if (!pattern.endsWith('/'))
-                    pattern = pattern.concat('/');
-                pattern = pattern.concat('**');
+                // Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
+                // exact URL *and* every descendant. minimatch's '/**' glob does not
+                // match the bare path itself (no trailing slash), so we emit both the
+                // exact-path pattern and a children glob.
+                const base = domain + pattern;
+                const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
+                return [base, children];
             }
         }
-        const final = domain.concat(pattern);
-        return final;
+        return [domain + pattern];
     };
     for (const line of lines) {
         if (line.toLowerCase().startsWith('user-agent: *')) {
@@ -659,15 +684,13 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
         else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
             let disallowed = line.substring('disallow: '.length).trim();
             if (disallowed) {
-                disallowed = sanitisePattern(disallowed);
-                disallowedUrls.push(disallowed);
+                disallowedUrls.push(...sanitisePattern(disallowed));
             }
         }
         else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
             let allowed = line.substring('allow: '.length).trim();
             if (allowed) {
-                allowed = sanitisePattern(allowed);
-                allowedUrls.push(allowed);
+                allowedUrls.push(...sanitisePattern(allowed));
             }
         }
     }
@@ -718,6 +741,31 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
         }
     }
 };
+export const getSitemapsFromRobotsTxt = async (url, browser, userDataDirectory, extraHTTPHeaders) => {
+    const domain = new URL(url).origin;
+    const robotsUrl = domain.concat('/robots.txt');
+    let robotsTxt;
+    try {
+        robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
+    }
+    catch (e) {
+        consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
+        return [];
+    }
+    if (!robotsTxt)
+        return [];
+    const sitemaps = [];
+    const lines = robotsTxt.split(/\r?\n/);
+    for (const line of lines) {
+        if (line.toLowerCase().startsWith('sitemap:')) {
+            const sitemapUrl = line.substring('sitemap:'.length).trim();
+            if (sitemapUrl) {
+                sitemaps.push(sitemapUrl);
+            }
+        }
+    }
+    return sitemaps;
+};
 export const isDisallowedInRobotsTxt = (url) => {
     if (!constants.robotsTxtUrls)
         return;
@@ -736,7 +784,7 @@ export const isDisallowedInRobotsTxt = (url) => {
     }
     return false;
 };
-export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders) => {
+export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
     const scannedSitemaps = new Set();
     const urls = {}; // dictionary of requests to urls to be scanned
     const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
@@ -745,6 +793,8 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
             return;
         if (isDisallowedInRobotsTxt(url))
             return;
+        if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
+            return;
         url = convertPathToLocalFile(url);
         let request;
         try {
@@ -1603,6 +1653,7 @@ const cacheProxyInfo = getProxyInfo();
 export const getPlaywrightLaunchOptions = (browser) => {
     const channel = browser || undefined;
     const resolution = proxyInfoToResolution(cacheProxyInfo);
+    const shouldIgnoreMuteAudio = process.env.OOBEE_PLAYWRIGHT_IGNORE_DEFAULT_ARGS === '--mute-audio';
     // Start with your base args and sanitise
     const finalArgs = [...constants.launchOptionsArgs].filter(arg => !arg.startsWith('--headless') &&
         !arg.startsWith('--user-agent=') &&
@@ -1630,7 +1681,9 @@ export const getPlaywrightLaunchOptions = (browser) => {
             break;
     }
     const options = {
-        ignoreDefaultArgs: ['--use-mock-keychain'],
+        ignoreDefaultArgs: shouldIgnoreMuteAudio
+            ? ['--use-mock-keychain', '--mute-audio']
+            : ['--use-mock-keychain'],
         args: finalArgs,
         headless: process.env.CRAWLEE_HEADLESS === '1',
         ...(channel && { channel }),