@govtechsg/oobee 0.10.84 → 0.10.86

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/.github/workflows/image.yml +3 -2
  2. package/.github/workflows/publish.yml +10 -0
  3. package/DETAILS.md +29 -0
  4. package/dist/cli.js +7 -6
  5. package/dist/combine.js +1 -1
  6. package/dist/constants/common.js +15 -4
  7. package/dist/constants/constants.js +604 -1
  8. package/dist/crawlers/commonCrawlerFunc.js +3 -2
  9. package/dist/crawlers/crawlSitemap.js +98 -80
  10. package/dist/crawlers/custom/utils.js +218 -71
  11. package/dist/crawlers/guards/urlGuard.js +8 -15
  12. package/dist/crawlers/runCustom.js +24 -15
  13. package/dist/generateOobeeClientScanner.js +570 -0
  14. package/dist/mergeAxeResults.js +49 -29
  15. package/dist/npmIndex.js +10 -2
  16. package/dist/proxyService.js +18 -3
  17. package/dist/services/s3Uploader.js +21 -10
  18. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
  19. package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
  20. package/dist/static/ejs/summary.ejs +10 -5
  21. package/oobee-client-scanner.js +34992 -0
  22. package/package.json +3 -3
  23. package/src/cli.ts +20 -15
  24. package/src/combine.ts +3 -1
  25. package/src/constants/common.ts +22 -10
  26. package/src/constants/constants.ts +602 -1
  27. package/src/crawlers/commonCrawlerFunc.ts +4 -3
  28. package/src/crawlers/crawlSitemap.ts +116 -98
  29. package/src/crawlers/custom/utils.ts +244 -84
  30. package/src/crawlers/guards/urlGuard.ts +24 -31
  31. package/src/crawlers/runCustom.ts +38 -15
  32. package/src/generateOobeeClientScanner.ts +591 -0
  33. package/src/mergeAxeResults.ts +48 -29
  34. package/src/npmIndex.ts +12 -2
  35. package/src/proxyService.ts +25 -4
  36. package/src/services/s3Uploader.ts +23 -11
  37. package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
  38. package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
  39. package/src/static/ejs/summary.ejs +10 -5
  40. package/testStaticJSScanner.html +534 -0
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@govtechsg/oobee",
3
3
  "main": "dist/npmIndex.js",
4
- "version": "0.10.84",
4
+ "version": "0.10.86",
5
5
  "type": "module",
6
6
  "author": "Government Technology Agency <info@tech.gov.sg>",
7
7
  "bin": {
@@ -36,7 +36,7 @@
36
36
  "print-message": "^3.0.1",
37
37
  "safe-regex": "^2.1.1",
38
38
  "text-readability": "^1.1.0",
39
- "tldts": "^7.0.26",
39
+ "tldts": "^7.0.27",
40
40
  "typescript": "^5.4.5",
41
41
  "url": "^0.11.3",
42
42
  "uuid": "^11.0.3",
@@ -86,7 +86,7 @@
86
86
  "fast-xml-parser": ">=5.3.8",
87
87
  "js-yaml": "^4.1.1",
88
88
  "minimatch": "^10.2.4",
89
- "brace-expansion": "^5.0.4",
89
+ "brace-expansion": "^5.0.5",
90
90
  "glob": "^13.0.6",
91
91
  "flatted": "^3.4.1",
92
92
  "file-type": "^21.3.3"
package/src/cli.ts CHANGED
@@ -5,7 +5,13 @@ import printMessage from 'print-message';
5
5
  import { devices } from 'playwright';
6
6
  import { fileURLToPath } from 'url';
7
7
  import path from 'path';
8
- import { setHeadlessMode, getVersion, getStoragePath, listenForCleanUp, cleanUpAndExit } from './utils.js';
8
+ import {
9
+ setHeadlessMode,
10
+ getVersion,
11
+ getStoragePath,
12
+ listenForCleanUp,
13
+ cleanUpAndExit,
14
+ } from './utils.js';
9
15
  import {
10
16
  checkUrl,
11
17
  prepareData,
@@ -185,12 +191,14 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
185
191
  return true;
186
192
  })
187
193
  .check(argvs => {
188
- if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
189
- throw new Error('-s or --strategy is only available in website scans.');
194
+ const scanner = String(argvs.scanner ?? '');
195
+
196
+ if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
197
+ throw new Error('-s or --strategy is only available in website and custom flow scans.');
190
198
  }
191
199
  return true;
192
200
  })
193
- .coerce('l', (option) => {
201
+ .coerce('l', option => {
194
202
  const duration = Number(option);
195
203
  if (isNaN(duration) || duration < 0) {
196
204
  printMessage(
@@ -202,8 +210,8 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
202
210
  return duration;
203
211
  })
204
212
  .check(argvs => {
205
- if (argvs.scanner === ScannerTypes.CUSTOM && typeof argvs.scanDuration === 'number' && argvs.scanDuration > 0) {
206
- throw new Error('-l or --scanDuration is not allowed for custom flow scans.');
213
+ if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
214
+ throw new Error('-s or --strategy is only available in website scans.');
207
215
  }
208
216
  return true;
209
217
  })
@@ -235,10 +243,11 @@ const scanInit = async (argvs: Answers): Promise<string> => {
235
243
  data.userDataDirectory,
236
244
  data.playwrightDeviceDetailsObject,
237
245
  data.extraHTTPHeaders,
238
- data.fileTypes
246
+ data.fileTypes,
239
247
  );
240
248
 
241
- if (res.httpStatus) consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
249
+ if (res.httpStatus)
250
+ consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
242
251
 
243
252
  if (res.status === statuses.success.code) {
244
253
  data.url = res.url;
@@ -267,15 +276,11 @@ const scanInit = async (argvs: Answers): Promise<string> => {
267
276
  }
268
277
  }
269
278
 
270
- const screenToScan = getScreenToScan(
271
- data.deviceChosen,
272
- data.customDevice,
273
- data.viewportWidth,
274
- );
279
+ const screenToScan = getScreenToScan(data.deviceChosen, data.customDevice, data.viewportWidth);
275
280
 
276
281
  printMessage([`Oobee version: ${appVersion}`, 'Starting scan...'], messageOptions);
277
- consoleLogger.info(`Oobee version: ${appVersion}`);
278
-
282
+ consoleLogger.info(`Oobee version: ${appVersion}`);
283
+
279
284
  await combineRun(data, screenToScan);
280
285
 
281
286
  return getStoragePath(data.randomToken);
package/src/combine.ts CHANGED
@@ -135,6 +135,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
135
135
  const res = await runCustom(
136
136
  url,
137
137
  randomToken,
138
+ browser,
139
+ userDataDirectory,
138
140
  viewportSettings,
139
141
  blacklistedPatterns,
140
142
  includeScreenshots,
@@ -328,4 +330,4 @@ const combineRun = async (details: Data, deviceToScan: string) => {
328
330
  }
329
331
  };
330
332
 
331
- export default combineRun;
333
+ export default combineRun;
package/src/constants/common.ts CHANGED
@@ -213,6 +213,15 @@ export const validateXML = (content: string): { isValid: boolean; parsedContent:
213
213
  return { isValid, parsedContent };
214
214
  };
215
215
 
216
+ export const validateTXT = (content: string): { isValid: boolean } => {
217
+ // Strip HTML tags first — browsers wrap .txt files in HTML when fetched via Playwright
218
+ const plainText = content.replace(/<[^>]+>/g, '\n');
219
+ const lines = plainText.split(/\r?\n/).map(l => l.trim()).filter(l => l.length > 0);
220
+ // Allow http, https and relative paths (starting with /) for txt sitemaps, as some sitemaps use relative paths and some txt sitemaps are fetched as HTML by Playwright
221
+ const urlPattern = /^(https?:\/\/|\/)[^\s]+$/i;
222
+ return { isValid: lines.some(line => urlPattern.test(line)) };
223
+ };
224
+
216
225
  export const isSkippedUrl = (pageUrl: string, whitelistedDomains: string[]) => {
217
226
  const matched =
218
227
  whitelistedDomains.filter(p => {
@@ -541,14 +550,13 @@ export const isSitemapContent = (content: string) => {
541
550
 
542
551
  const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
543
552
  const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
544
- const regexForUrl = new RegExp('^.*(http|https):/{2}.*$', 'gmi');
545
-
546
553
  if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
547
554
  // is an XML sitemap wrapped in a HTML document
548
555
  return true;
549
556
  }
550
- if (!content.match(regexForHtml) && content.match(regexForUrl)) {
551
- // treat this as a txt sitemap where all URLs will be extracted for crawling
557
+ const { isValid: isTxtSitemap } = validateTXT(content);
558
+ if (isTxtSitemap) {
559
+ // treat this as a txt sitemap (plain text or browser-wrapped with HTML)
552
560
  return true;
553
561
  }
554
562
  // is HTML webpage
@@ -1924,14 +1932,16 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
1924
1932
  const channel = browser || undefined;
1925
1933
 
1926
1934
  const resolution = proxyInfoToResolution(cacheProxyInfo);
1935
+ const shouldIgnoreMuteAudio =
1936
+ process.env.OOBEE_PLAYWRIGHT_IGNORE_DEFAULT_ARGS === '--mute-audio';
1927
1937
 
1928
1938
  // Start with your base args and sanitise
1929
1939
  const finalArgs = [...constants.launchOptionsArgs].filter(
1930
- arg =>
1931
- !arg.startsWith('--headless') &&
1932
- !arg.startsWith('--user-agent=') &&
1933
- arg !== '--mute-audio' &&
1934
- !(browser === BrowserTypes.CHROME && arg === '--edge-skip-compat-layer-relaunch'),
1940
+ arg =>
1941
+ !arg.startsWith('--headless') &&
1942
+ !arg.startsWith('--user-agent=') &&
1943
+ arg !== '--mute-audio' &&
1944
+ !(browser === BrowserTypes.CHROME && arg === '--edge-skip-compat-layer-relaunch'),
1935
1945
  );
1936
1946
 
1937
1947
  // Headless flags (unchanged)
@@ -1956,7 +1966,9 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
1956
1966
  }
1957
1967
 
1958
1968
  const options: LaunchOptions = {
1959
- ignoreDefaultArgs: ['--use-mock-keychain'],
1969
+ ignoreDefaultArgs: shouldIgnoreMuteAudio
1970
+ ? ['--use-mock-keychain', '--mute-audio']
1971
+ : ['--use-mock-keychain'],
1960
1972
  args: finalArgs,
1961
1973
  headless: process.env.CRAWLEE_HEADLESS === '1',
1962
1974
  ...(channel && { channel }),