npm - @govtechsg/oobee - Versions diffs - 0.10.85 → 0.10.87 - Mend

@govtechsg/oobee 0.10.85 → 0.10.87

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (62)

package/.github/workflows/publish.yml +10 -0
package/DETAILS.md +29 -0
package/dist/cli.js +18 -5
package/dist/combine.js +3 -1
package/dist/constants/cliFunctions.js +2 -2
package/dist/constants/common.js +70 -17
package/dist/constants/constants.js +604 -1
package/dist/crawlers/commonCrawlerFunc.js +3 -2
package/dist/crawlers/crawlDomain.js +38 -13
package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
package/dist/crawlers/crawlSitemap.js +141 -84
package/dist/crawlers/custom/utils.js +218 -71
package/dist/crawlers/guards/urlGuard.js +8 -15
package/dist/crawlers/runCustom.js +18 -11
package/dist/generateHtmlReport.js +18 -11
package/dist/generateOobeeClientScanner.js +570 -0
package/dist/mergeAxeResults/itemReferences.js +60 -25
package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
package/dist/mergeAxeResults.js +23 -13
package/dist/npmIndex.js +10 -2
package/dist/proxyService.js +18 -3
package/dist/services/s3Uploader.js +21 -10
package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
package/dist/static/ejs/summary.ejs +19 -8
package/dist/utils.js +4 -3
package/fix-summary-html-oom-pr.md +62 -0
package/oobee-client-scanner.js +34992 -0
package/package.json +5 -5
package/src/cli.ts +19 -5
package/src/combine.ts +5 -1
package/src/constants/cliFunctions.ts +2 -2
package/src/constants/common.ts +87 -22
package/src/constants/constants.ts +602 -1
package/src/crawlers/commonCrawlerFunc.ts +4 -3
package/src/crawlers/crawlDomain.ts +39 -13
package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
package/src/crawlers/crawlSitemap.ts +165 -100
package/src/crawlers/custom/utils.ts +241 -80
package/src/crawlers/guards/urlGuard.ts +24 -31
package/src/crawlers/runCustom.ts +29 -11
package/src/generateHtmlReport.ts +21 -11
package/src/generateOobeeClientScanner.ts +591 -0
package/src/mergeAxeResults/itemReferences.ts +70 -26
package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
package/src/mergeAxeResults.ts +26 -14
package/src/npmIndex.ts +12 -2
package/src/proxyService.ts +25 -4
package/src/services/s3Uploader.ts +23 -11
package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
package/src/static/ejs/summary.ejs +19 -8
package/src/utils.ts +4 -3
package/testStaticJSScanner.html +534 -0

package/package.json CHANGED Viewed

@@ -1,19 +1,19 @@
 {
   "name": "@govtechsg/oobee",
   "main": "dist/npmIndex.js",
-  "version": "0.10.85",
+  "version": "0.10.87",
   "type": "module",
   "author": "Government Technology Agency <info@tech.gov.sg>",
   "bin": {
     "oobee": "./dist/cli.js"
   },
   "dependencies": {
-    "@aws-sdk/client-s3": "^3.893.0",
+    "@aws-sdk/client-s3": "^3.1049.0",
     "@json2csv/node": "^7.0.3",
     "@napi-rs/canvas": "^0.1.53",
     "@sentry/node": "^9.13.0",
     "@types/aws-sdk": "^0.0.42",
-    "axe-core": "^4.11.1",
+    "axe-core": "^4.11.4",
     "axios": "^1.8.2",
     "base64-stream": "^1.0.0",
     "cheerio": "^1.0.0-rc.12",
@@ -39,7 +39,7 @@
     "tldts": "^7.0.27",
     "typescript": "^5.4.5",
     "url": "^0.11.3",
-    "uuid": "^11.0.3",
+    "uuid": "^14.0.0",
     "validator": "^13.11.0",
     "which": "^4.0.0",
     "winston": "^3.11.0",
@@ -86,7 +86,7 @@
     "fast-xml-parser": ">=5.3.8",
     "js-yaml": "^4.1.1",
     "minimatch": "^10.2.4",
-    "brace-expansion": "^5.0.4",
+    "brace-expansion": "^5.0.6",
     "glob": "^13.0.6",
     "flatted": "^3.4.1",
     "file-type": "^21.3.3"

package/src/cli.ts CHANGED Viewed

@@ -193,8 +193,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
   .check(argvs => {
     const scanner = String(argvs.scanner ?? '');
-    if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
-      throw new Error('-s or --strategy is only available in website and custom flow scans.');
+    if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
+      throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
+    }
+    if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
+      throw new Error('-s ignore is only available for sitemap scans.');
     }
     return true;
   })
@@ -210,14 +213,21 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
     return duration;
   })
   .check(argvs => {
-    if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
-      throw new Error('-s or --strategy is only available in website scans.');
+    if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
+      throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
+    }
+    if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
+      throw new Error('-s ignore is only available for sitemap scans.');
     }
     return true;
   })
   .conflicts('d', 'w')
   .parse() as unknown as Answers;
+if (!options.strategy) {
+  options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
+}
 const scanInit = async (argvs: Answers): Promise<string> => {
   const updatedArgvs = { ...argvs };
@@ -250,7 +260,11 @@ const scanInit = async (argvs: Answers): Promise<string> => {
     consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
   if (res.status === statuses.success.code) {
-    data.url = res.url;
+    // Custom flow should continue from the user-provided entry URL so auth redirects
+    // do not replace the original domain used for overlay gating and navigation.
+    if (data.type !== ScannerTypes.CUSTOM) {
+      data.url = res.url;
+    }
     if (process.env.OOBEE_VALIDATE_URL) {
       consoleLogger.info('Url is valid');
       cleanUpAndExit(0, data.randomToken);

package/src/combine.ts CHANGED Viewed

@@ -135,6 +135,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
       const res = await runCustom(
         url,
         randomToken,
+        browser,
+        userDataDirectory,
         viewportSettings,
         blacklistedPatterns,
         includeScreenshots,
@@ -159,6 +161,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
         blacklistedPatterns,
         includeScreenshots,
         extraHTTPHeaders,
+        strategy,
+        userUrl: url,
         scanDuration,
       });
       urlsCrawledObj = sitemapResult.urlsCrawled;
@@ -328,4 +332,4 @@ const combineRun = async (details: Data, deviceToScan: string) => {
   }
 };
-export default combineRun;
+export default combineRun;

package/src/constants/cliFunctions.ts CHANGED Viewed

@@ -168,8 +168,8 @@ export const cliOptions: { [key: string]: Options } = {
   s: {
     alias: 'strategy',
     describe:
-      'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
-    choices: ['same-domain', 'same-hostname'],
+      'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
+    choices: ['same-domain', 'same-hostname', 'ignore'],
     requiresArg: true,
     demandOption: false,
   },

package/src/constants/common.ts CHANGED Viewed

@@ -33,7 +33,7 @@ import constants, {
 } from './constants.js';
 import { consoleLogger } from '../logs.js';
 import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
-import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
+import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
 import { Answers, Data } from '../index.js';
 import { DeviceDescriptor } from '../types/types.js';
 import { getProxyInfo, proxyInfoToResolution, ProxySettings } from '../proxyService.js';
@@ -213,6 +213,15 @@ export const validateXML = (content: string): { isValid: boolean; parsedContent:
   return { isValid, parsedContent };
 };
+export const validateTXT = (content: string): { isValid: boolean } => {
+  // Strip HTML tags first — browsers wrap .txt files in HTML when fetched via Playwright
+  const plainText = content.replace(/<[^>]+>/g, '\n');
+  const lines = plainText.split(/\r?\n/).map(l => l.trim()).filter(l => l.length > 0);
+  // Allow http, https and relative paths (starting with /) for txt sitemaps, as some sitemaps use relative paths and some txt sitemaps are fetched as HTML by Playwright
+  const urlPattern = /^(https?:\/\/|\/)[^\s]+$/i;
+  return { isValid: lines.some(line => urlPattern.test(line)) };
+};
 export const isSkippedUrl = (pageUrl: string, whitelistedDomains: string[]) => {
   const matched =
     whitelistedDomains.filter(p => {
@@ -541,14 +550,13 @@ export const isSitemapContent = (content: string) => {
   const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
   const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
-  const regexForUrl = new RegExp('^.*(http|https):/{2}.*$', 'gmi');
   if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
     // is an XML sitemap wrapped in a HTML document
     return true;
   }
-  if (!content.match(regexForHtml) && content.match(regexForUrl)) {
-    // treat this as a txt sitemap where all URLs will be extracted for crawling
+  const { isValid: isTxtSitemap } = validateTXT(content);
+  if (isTxtSitemap) {
+    // treat this as a txt sitemap (plain text or browser-wrapped with HTML)
     return true;
   }
   // is HTML webpage
@@ -738,7 +746,9 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
     playwrightDeviceDetailsObject,
     maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
     strategy:
-      strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain,
+      strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
+      : strategy === 'ignore' ? EnqueueStrategy.All
+      : EnqueueStrategy.SameDomain,
     isLocalFileScan,
     browser: browserToRun,
     nameEmail,
@@ -796,7 +806,11 @@ export const getUrlsFromRobotsTxt = async (
   const disallowedUrls = [];
   const allowedUrls = [];
-  const sanitisePattern = (pattern: string): string => {
+  // Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
+  // Two patterns are returned for bare paths (no trailing wildcard) so that
+  // both the exact URL and all child paths are blocked, matching robots.txt
+  // prefix semantics.
+  const sanitisePattern = (pattern: string): string[] => {
     const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
     const subdirWildcardRegex = /\/\*\//g;
     const filePathRegex = /^\/(?:[^\/]+\/)*[^\/]+\.[a-zA-Z0-9]{1,6}$/;
@@ -804,16 +818,30 @@ export const getUrlsFromRobotsTxt = async (
     if (subdirWildcardRegex.test(pattern)) {
       pattern = pattern.replace(subdirWildcardRegex, '/**/');
     }
+    // Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
+    // '?' is the query separator in robots.txt but a single-char wildcard in
+    // minimatch. Escape it to a literal match and append '*' so any query
+    // value after the stated prefix is also blocked.
+    if (pattern.includes('?')) {
+      return [domain + pattern.replace('?', '\\?') + '*'];
+    }
     if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
       if (pattern.endsWith('*')) {
-        pattern = pattern.concat('*');
+        // e.g. /ebook/* → /ebook/** (already covers all children)
+        return [domain + pattern.concat('*')];
       } else {
-        if (!pattern.endsWith('/')) pattern = pattern.concat('/');
-        pattern = pattern.concat('**');
+        // Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
+        // exact URL *and* every descendant. minimatch's '/**' glob does not
+        // match the bare path itself (no trailing slash), so we emit both the
+        // exact-path pattern and a children glob.
+        const base = domain + pattern;
+        const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
+        return [base, children];
       }
     }
-    const final = domain.concat(pattern);
-    return final;
+    return [domain + pattern];
   };
   for (const line of lines) {
@@ -824,14 +852,12 @@ export const getUrlsFromRobotsTxt = async (
     } else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
       let disallowed = line.substring('disallow: '.length).trim();
       if (disallowed) {
-        disallowed = sanitisePattern(disallowed);
-        disallowedUrls.push(disallowed);
+        disallowedUrls.push(...sanitisePattern(disallowed));
       }
     } else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
       let allowed = line.substring('allow: '.length).trim();
       if (allowed) {
-        allowed = sanitisePattern(allowed);
-        allowedUrls.push(allowed);
+        allowedUrls.push(...sanitisePattern(allowed));
       }
     }
   }
@@ -891,6 +917,38 @@ const getRobotsTxtViaPlaywright = async (
   }
 };
+export const getSitemapsFromRobotsTxt = async (
+  url: string,
+  browser: string,
+  userDataDirectory: string,
+  extraHTTPHeaders: Record<string, string>,
+): Promise<string[]> => {
+  const domain = new URL(url).origin;
+  const robotsUrl = domain.concat('/robots.txt');
+  let robotsTxt: string;
+  try {
+    robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
+  } catch (e) {
+    consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
+    return [];
+  }
+  if (!robotsTxt) return [];
+  const sitemaps: string[] = [];
+  const lines = robotsTxt.split(/\r?\n/);
+  for (const line of lines) {
+    if (line.toLowerCase().startsWith('sitemap:')) {
+      const sitemapUrl = line.substring('sitemap:'.length).trim();
+      if (sitemapUrl) {
+        sitemaps.push(sitemapUrl);
+      }
+    }
+  }
+  return sitemaps;
+};
 export const isDisallowedInRobotsTxt = (url: string): boolean => {
   if (!constants.robotsTxtUrls) return;
@@ -923,6 +981,8 @@ export const getLinksFromSitemap = async (
   userUrlInput: string,
   isIntelligent: boolean,
   extraHTTPHeaders: Record<string, string>,
+  strategy: EnqueueStrategy = EnqueueStrategy.All,
+  userUrl: string = userUrlInput,
 ) => {
   const scannedSitemaps = new Set<string>();
   const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
@@ -932,6 +992,7 @@ export const getLinksFromSitemap = async (
   const addToUrlList = (url: string) => {
     if (!url) return;
     if (isDisallowedInRobotsTxt(url)) return;
+    if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy)) return;
     url = convertPathToLocalFile(url);
@@ -1924,14 +1985,16 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
   const channel = browser || undefined;
   const resolution = proxyInfoToResolution(cacheProxyInfo);
+  const shouldIgnoreMuteAudio =
+    process.env.OOBEE_PLAYWRIGHT_IGNORE_DEFAULT_ARGS === '--mute-audio';
   // Start with your base args and sanitise
   const finalArgs = [...constants.launchOptionsArgs].filter(
-  arg =>
-    !arg.startsWith('--headless') &&
-    !arg.startsWith('--user-agent=') &&
-    arg !== '--mute-audio' &&
-    !(browser === BrowserTypes.CHROME && arg === '--edge-skip-compat-layer-relaunch'),
+    arg =>
+      !arg.startsWith('--headless') &&
+      !arg.startsWith('--user-agent=') &&
+      arg !== '--mute-audio' &&
+      !(browser === BrowserTypes.CHROME && arg === '--edge-skip-compat-layer-relaunch'),
   );
   // Headless flags (unchanged)
@@ -1956,7 +2019,9 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
   }
   const options: LaunchOptions = {
-    ignoreDefaultArgs: ['--use-mock-keychain'],
+    ignoreDefaultArgs: shouldIgnoreMuteAudio
+      ? ['--use-mock-keychain', '--mute-audio']
+      : ['--use-mock-keychain'],
     args: finalArgs,
     headless: process.env.CRAWLEE_HEADLESS === '1',
     ...(channel && { channel }),