@govtechsg/oobee 0.10.85 → 0.10.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/.github/workflows/publish.yml +10 -0
  2. package/DETAILS.md +29 -0
  3. package/dist/cli.js +18 -5
  4. package/dist/combine.js +3 -1
  5. package/dist/constants/cliFunctions.js +2 -2
  6. package/dist/constants/common.js +70 -17
  7. package/dist/constants/constants.js +604 -1
  8. package/dist/crawlers/commonCrawlerFunc.js +3 -2
  9. package/dist/crawlers/crawlDomain.js +38 -13
  10. package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
  11. package/dist/crawlers/crawlSitemap.js +141 -84
  12. package/dist/crawlers/custom/utils.js +218 -71
  13. package/dist/crawlers/guards/urlGuard.js +8 -15
  14. package/dist/crawlers/runCustom.js +18 -11
  15. package/dist/generateHtmlReport.js +18 -11
  16. package/dist/generateOobeeClientScanner.js +570 -0
  17. package/dist/mergeAxeResults/itemReferences.js +60 -25
  18. package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
  19. package/dist/mergeAxeResults.js +23 -13
  20. package/dist/npmIndex.js +10 -2
  21. package/dist/proxyService.js +18 -3
  22. package/dist/services/s3Uploader.js +21 -10
  23. package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  24. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
  25. package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
  26. package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
  27. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
  28. package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  29. package/dist/static/ejs/summary.ejs +19 -8
  30. package/dist/utils.js +4 -3
  31. package/fix-summary-html-oom-pr.md +62 -0
  32. package/oobee-client-scanner.js +34992 -0
  33. package/package.json +5 -5
  34. package/src/cli.ts +19 -5
  35. package/src/combine.ts +5 -1
  36. package/src/constants/cliFunctions.ts +2 -2
  37. package/src/constants/common.ts +87 -22
  38. package/src/constants/constants.ts +602 -1
  39. package/src/crawlers/commonCrawlerFunc.ts +4 -3
  40. package/src/crawlers/crawlDomain.ts +39 -13
  41. package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
  42. package/src/crawlers/crawlSitemap.ts +165 -100
  43. package/src/crawlers/custom/utils.ts +241 -80
  44. package/src/crawlers/guards/urlGuard.ts +24 -31
  45. package/src/crawlers/runCustom.ts +29 -11
  46. package/src/generateHtmlReport.ts +21 -11
  47. package/src/generateOobeeClientScanner.ts +591 -0
  48. package/src/mergeAxeResults/itemReferences.ts +70 -26
  49. package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
  50. package/src/mergeAxeResults.ts +26 -14
  51. package/src/npmIndex.ts +12 -2
  52. package/src/proxyService.ts +25 -4
  53. package/src/services/s3Uploader.ts +23 -11
  54. package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  55. package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
  56. package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
  57. package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
  58. package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
  59. package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  60. package/src/static/ejs/summary.ejs +19 -8
  61. package/src/utils.ts +4 -3
  62. package/testStaticJSScanner.html +534 -0
package/package.json CHANGED
@@ -1,19 +1,19 @@
1
1
  {
2
2
  "name": "@govtechsg/oobee",
3
3
  "main": "dist/npmIndex.js",
4
- "version": "0.10.85",
4
+ "version": "0.10.87",
5
5
  "type": "module",
6
6
  "author": "Government Technology Agency <info@tech.gov.sg>",
7
7
  "bin": {
8
8
  "oobee": "./dist/cli.js"
9
9
  },
10
10
  "dependencies": {
11
- "@aws-sdk/client-s3": "^3.893.0",
11
+ "@aws-sdk/client-s3": "^3.1049.0",
12
12
  "@json2csv/node": "^7.0.3",
13
13
  "@napi-rs/canvas": "^0.1.53",
14
14
  "@sentry/node": "^9.13.0",
15
15
  "@types/aws-sdk": "^0.0.42",
16
- "axe-core": "^4.11.1",
16
+ "axe-core": "^4.11.4",
17
17
  "axios": "^1.8.2",
18
18
  "base64-stream": "^1.0.0",
19
19
  "cheerio": "^1.0.0-rc.12",
@@ -39,7 +39,7 @@
39
39
  "tldts": "^7.0.27",
40
40
  "typescript": "^5.4.5",
41
41
  "url": "^0.11.3",
42
- "uuid": "^11.0.3",
42
+ "uuid": "^14.0.0",
43
43
  "validator": "^13.11.0",
44
44
  "which": "^4.0.0",
45
45
  "winston": "^3.11.0",
@@ -86,7 +86,7 @@
86
86
  "fast-xml-parser": ">=5.3.8",
87
87
  "js-yaml": "^4.1.1",
88
88
  "minimatch": "^10.2.4",
89
- "brace-expansion": "^5.0.4",
89
+ "brace-expansion": "^5.0.6",
90
90
  "glob": "^13.0.6",
91
91
  "flatted": "^3.4.1",
92
92
  "file-type": "^21.3.3"
package/src/cli.ts CHANGED
@@ -193,8 +193,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
193
193
  .check(argvs => {
194
194
  const scanner = String(argvs.scanner ?? '');
195
195
 
196
- if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
197
- throw new Error('-s or --strategy is only available in website and custom flow scans.');
196
+ if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
197
+ throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
198
+ }
199
+ if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
200
+ throw new Error('-s ignore is only available for sitemap scans.');
198
201
  }
199
202
  return true;
200
203
  })
@@ -210,14 +213,21 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
210
213
  return duration;
211
214
  })
212
215
  .check(argvs => {
213
- if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
214
- throw new Error('-s or --strategy is only available in website scans.');
216
+ if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
217
+ throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
218
+ }
219
+ if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
220
+ throw new Error('-s ignore is only available for sitemap scans.');
215
221
  }
216
222
  return true;
217
223
  })
218
224
  .conflicts('d', 'w')
219
225
  .parse() as unknown as Answers;
220
226
 
227
+ if (!options.strategy) {
228
+ options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
229
+ }
230
+
221
231
  const scanInit = async (argvs: Answers): Promise<string> => {
222
232
  const updatedArgvs = { ...argvs };
223
233
 
@@ -250,7 +260,11 @@ const scanInit = async (argvs: Answers): Promise<string> => {
250
260
  consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
251
261
 
252
262
  if (res.status === statuses.success.code) {
253
- data.url = res.url;
263
+ // Custom flow should continue from the user-provided entry URL so auth redirects
264
+ // do not replace the original domain used for overlay gating and navigation.
265
+ if (data.type !== ScannerTypes.CUSTOM) {
266
+ data.url = res.url;
267
+ }
254
268
  if (process.env.OOBEE_VALIDATE_URL) {
255
269
  consoleLogger.info('Url is valid');
256
270
  cleanUpAndExit(0, data.randomToken);
package/src/combine.ts CHANGED
@@ -135,6 +135,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
135
135
  const res = await runCustom(
136
136
  url,
137
137
  randomToken,
138
+ browser,
139
+ userDataDirectory,
138
140
  viewportSettings,
139
141
  blacklistedPatterns,
140
142
  includeScreenshots,
@@ -159,6 +161,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
159
161
  blacklistedPatterns,
160
162
  includeScreenshots,
161
163
  extraHTTPHeaders,
164
+ strategy,
165
+ userUrl: url,
162
166
  scanDuration,
163
167
  });
164
168
  urlsCrawledObj = sitemapResult.urlsCrawled;
@@ -328,4 +332,4 @@ const combineRun = async (details: Data, deviceToScan: string) => {
328
332
  }
329
333
  };
330
334
 
331
- export default combineRun;
335
+ export default combineRun;
@@ -168,8 +168,8 @@ export const cliOptions: { [key: string]: Options } = {
168
168
  s: {
169
169
  alias: 'strategy',
170
170
  describe:
171
- 'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
172
- choices: ['same-domain', 'same-hostname'],
171
+ 'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
172
+ choices: ['same-domain', 'same-hostname', 'ignore'],
173
173
  requiresArg: true,
174
174
  demandOption: false,
175
175
  },
@@ -33,7 +33,7 @@ import constants, {
33
33
  } from './constants.js';
34
34
  import { consoleLogger } from '../logs.js';
35
35
  import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
36
- import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
36
+ import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
37
37
  import { Answers, Data } from '../index.js';
38
38
  import { DeviceDescriptor } from '../types/types.js';
39
39
  import { getProxyInfo, proxyInfoToResolution, ProxySettings } from '../proxyService.js';
@@ -213,6 +213,15 @@ export const validateXML = (content: string): { isValid: boolean; parsedContent:
213
213
  return { isValid, parsedContent };
214
214
  };
215
215
 
216
+ export const validateTXT = (content: string): { isValid: boolean } => {
217
+ // Strip HTML tags first — browsers wrap .txt files in HTML when fetched via Playwright
218
+ const plainText = content.replace(/<[^>]+>/g, '\n');
219
+ const lines = plainText.split(/\r?\n/).map(l => l.trim()).filter(l => l.length > 0);
220
+ // Allow http, https and relative paths (starting with /) for txt sitemaps, as some sitemaps use relative paths and some txt sitemaps are fetched as HTML by Playwright
221
+ const urlPattern = /^(https?:\/\/|\/)[^\s]+$/i;
222
+ return { isValid: lines.some(line => urlPattern.test(line)) };
223
+ };
224
+
216
225
  export const isSkippedUrl = (pageUrl: string, whitelistedDomains: string[]) => {
217
226
  const matched =
218
227
  whitelistedDomains.filter(p => {
@@ -541,14 +550,13 @@ export const isSitemapContent = (content: string) => {
541
550
 
542
551
  const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
543
552
  const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
544
- const regexForUrl = new RegExp('^.*(http|https):/{2}.*$', 'gmi');
545
-
546
553
  if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
547
554
  // is an XML sitemap wrapped in a HTML document
548
555
  return true;
549
556
  }
550
- if (!content.match(regexForHtml) && content.match(regexForUrl)) {
551
- // treat this as a txt sitemap where all URLs will be extracted for crawling
557
+ const { isValid: isTxtSitemap } = validateTXT(content);
558
+ if (isTxtSitemap) {
559
+ // treat this as a txt sitemap (plain text or browser-wrapped with HTML)
552
560
  return true;
553
561
  }
554
562
  // is HTML webpage
@@ -738,7 +746,9 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
738
746
  playwrightDeviceDetailsObject,
739
747
  maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
740
748
  strategy:
741
- strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain,
749
+ strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
750
+ : strategy === 'ignore' ? EnqueueStrategy.All
751
+ : EnqueueStrategy.SameDomain,
742
752
  isLocalFileScan,
743
753
  browser: browserToRun,
744
754
  nameEmail,
@@ -796,7 +806,11 @@ export const getUrlsFromRobotsTxt = async (
796
806
  const disallowedUrls = [];
797
807
  const allowedUrls = [];
798
808
 
799
- const sanitisePattern = (pattern: string): string => {
809
+ // Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
810
+ // Two patterns are returned for bare paths (no trailing wildcard) so that
811
+ // both the exact URL and all child paths are blocked, matching robots.txt
812
+ // prefix semantics.
813
+ const sanitisePattern = (pattern: string): string[] => {
800
814
  const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
801
815
  const subdirWildcardRegex = /\/\*\//g;
802
816
  const filePathRegex = /^\/(?:[^\/]+\/)*[^\/]+\.[a-zA-Z0-9]{1,6}$/;
@@ -804,16 +818,30 @@ export const getUrlsFromRobotsTxt = async (
804
818
  if (subdirWildcardRegex.test(pattern)) {
805
819
  pattern = pattern.replace(subdirWildcardRegex, '/**/');
806
820
  }
821
+
822
+ // Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
823
+ // '?' is the query separator in robots.txt but a single-char wildcard in
824
+ // minimatch. Escape it to a literal match and append '*' so any query
825
+ // value after the stated prefix is also blocked.
826
+ if (pattern.includes('?')) {
827
+ return [domain + pattern.replace('?', '\\?') + '*'];
828
+ }
829
+
807
830
  if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
808
831
  if (pattern.endsWith('*')) {
809
- pattern = pattern.concat('*');
832
+ // e.g. /ebook/* → /ebook/** (already covers all children)
833
+ return [domain + pattern.concat('*')];
810
834
  } else {
811
- if (!pattern.endsWith('/')) pattern = pattern.concat('/');
812
- pattern = pattern.concat('**');
835
+ // Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
836
+ // exact URL *and* every descendant. minimatch's '/**' glob does not
837
+ // match the bare path itself (no trailing slash), so we emit both the
838
+ // exact-path pattern and a children glob.
839
+ const base = domain + pattern;
840
+ const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
841
+ return [base, children];
813
842
  }
814
843
  }
815
- const final = domain.concat(pattern);
816
- return final;
844
+ return [domain + pattern];
817
845
  };
818
846
 
819
847
  for (const line of lines) {
@@ -824,14 +852,12 @@ export const getUrlsFromRobotsTxt = async (
824
852
  } else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
825
853
  let disallowed = line.substring('disallow: '.length).trim();
826
854
  if (disallowed) {
827
- disallowed = sanitisePattern(disallowed);
828
- disallowedUrls.push(disallowed);
855
+ disallowedUrls.push(...sanitisePattern(disallowed));
829
856
  }
830
857
  } else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
831
858
  let allowed = line.substring('allow: '.length).trim();
832
859
  if (allowed) {
833
- allowed = sanitisePattern(allowed);
834
- allowedUrls.push(allowed);
860
+ allowedUrls.push(...sanitisePattern(allowed));
835
861
  }
836
862
  }
837
863
  }
@@ -891,6 +917,38 @@ const getRobotsTxtViaPlaywright = async (
891
917
  }
892
918
  };
893
919
 
920
+ export const getSitemapsFromRobotsTxt = async (
921
+ url: string,
922
+ browser: string,
923
+ userDataDirectory: string,
924
+ extraHTTPHeaders: Record<string, string>,
925
+ ): Promise<string[]> => {
926
+ const domain = new URL(url).origin;
927
+ const robotsUrl = domain.concat('/robots.txt');
928
+
929
+ let robotsTxt: string;
930
+ try {
931
+ robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
932
+ } catch (e) {
933
+ consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
934
+ return [];
935
+ }
936
+
937
+ if (!robotsTxt) return [];
938
+
939
+ const sitemaps: string[] = [];
940
+ const lines = robotsTxt.split(/\r?\n/);
941
+ for (const line of lines) {
942
+ if (line.toLowerCase().startsWith('sitemap:')) {
943
+ const sitemapUrl = line.substring('sitemap:'.length).trim();
944
+ if (sitemapUrl) {
945
+ sitemaps.push(sitemapUrl);
946
+ }
947
+ }
948
+ }
949
+ return sitemaps;
950
+ };
951
+
894
952
  export const isDisallowedInRobotsTxt = (url: string): boolean => {
895
953
  if (!constants.robotsTxtUrls) return;
896
954
 
@@ -923,6 +981,8 @@ export const getLinksFromSitemap = async (
923
981
  userUrlInput: string,
924
982
  isIntelligent: boolean,
925
983
  extraHTTPHeaders: Record<string, string>,
984
+ strategy: EnqueueStrategy = EnqueueStrategy.All,
985
+ userUrl: string = userUrlInput,
926
986
  ) => {
927
987
  const scannedSitemaps = new Set<string>();
928
988
  const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
@@ -932,6 +992,7 @@ export const getLinksFromSitemap = async (
932
992
  const addToUrlList = (url: string) => {
933
993
  if (!url) return;
934
994
  if (isDisallowedInRobotsTxt(url)) return;
995
+ if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy)) return;
935
996
 
936
997
  url = convertPathToLocalFile(url);
937
998
 
@@ -1924,14 +1985,16 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
1924
1985
  const channel = browser || undefined;
1925
1986
 
1926
1987
  const resolution = proxyInfoToResolution(cacheProxyInfo);
1988
+ const shouldIgnoreMuteAudio =
1989
+ process.env.OOBEE_PLAYWRIGHT_IGNORE_DEFAULT_ARGS === '--mute-audio';
1927
1990
 
1928
1991
  // Start with your base args and sanitise
1929
1992
  const finalArgs = [...constants.launchOptionsArgs].filter(
1930
- arg =>
1931
- !arg.startsWith('--headless') &&
1932
- !arg.startsWith('--user-agent=') &&
1933
- arg !== '--mute-audio' &&
1934
- !(browser === BrowserTypes.CHROME && arg === '--edge-skip-compat-layer-relaunch'),
1993
+ arg =>
1994
+ !arg.startsWith('--headless') &&
1995
+ !arg.startsWith('--user-agent=') &&
1996
+ arg !== '--mute-audio' &&
1997
+ !(browser === BrowserTypes.CHROME && arg === '--edge-skip-compat-layer-relaunch'),
1935
1998
  );
1936
1999
 
1937
2000
  // Headless flags (unchanged)
@@ -1956,7 +2019,9 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
1956
2019
  }
1957
2020
 
1958
2021
  const options: LaunchOptions = {
1959
- ignoreDefaultArgs: ['--use-mock-keychain'],
2022
+ ignoreDefaultArgs: shouldIgnoreMuteAudio
2023
+ ? ['--use-mock-keychain', '--mute-audio']
2024
+ : ['--use-mock-keychain'],
1960
2025
  args: finalArgs,
1961
2026
  headless: process.env.CRAWLEE_HEADLESS === '1',
1962
2027
  ...(channel && { channel }),