@govtechsg/oobee 0.10.83 → 0.10.85

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/README.md +6 -1
  2. package/dist/cli.js +7 -6
  3. package/dist/constants/common.js +13 -1
  4. package/dist/crawlers/crawlDomain.js +220 -120
  5. package/dist/crawlers/crawlIntelligentSitemap.js +22 -7
  6. package/dist/crawlers/custom/utils.js +81 -40
  7. package/dist/crawlers/runCustom.js +13 -5
  8. package/dist/mergeAxeResults/itemReferences.js +55 -0
  9. package/dist/mergeAxeResults/jsonArtifacts.js +335 -0
  10. package/dist/mergeAxeResults/scanPages.js +159 -0
  11. package/dist/mergeAxeResults/sentryTelemetry.js +152 -0
  12. package/dist/mergeAxeResults/types.js +1 -0
  13. package/dist/mergeAxeResults/writeCsv.js +125 -0
  14. package/dist/mergeAxeResults/writeScanDetailsCsv.js +35 -0
  15. package/dist/mergeAxeResults/writeSitemap.js +10 -0
  16. package/dist/mergeAxeResults.js +64 -950
  17. package/dist/proxyService.js +90 -5
  18. package/dist/utils.js +20 -7
  19. package/package.json +6 -6
  20. package/src/cli.ts +20 -15
  21. package/src/constants/common.ts +13 -1
  22. package/src/crawlers/crawlDomain.ts +248 -137
  23. package/src/crawlers/crawlIntelligentSitemap.ts +22 -8
  24. package/src/crawlers/custom/utils.ts +103 -48
  25. package/src/crawlers/runCustom.ts +18 -5
  26. package/src/mergeAxeResults/itemReferences.ts +62 -0
  27. package/src/mergeAxeResults/jsonArtifacts.ts +451 -0
  28. package/src/mergeAxeResults/scanPages.ts +207 -0
  29. package/src/mergeAxeResults/sentryTelemetry.ts +183 -0
  30. package/src/mergeAxeResults/types.ts +99 -0
  31. package/src/mergeAxeResults/writeCsv.ts +145 -0
  32. package/src/mergeAxeResults/writeScanDetailsCsv.ts +51 -0
  33. package/src/mergeAxeResults/writeSitemap.ts +13 -0
  34. package/src/mergeAxeResults.ts +125 -1344
  35. package/src/proxyService.ts +96 -4
  36. package/src/utils.ts +19 -7
@@ -29,6 +29,8 @@ export interface ProxyInfo {
29
29
  // optional global credentials to embed (URL-encoded)
30
30
  username?: string;
31
31
  password?: string;
32
+ // optional include-only list of domain patterns (from INCLUDE_PROXY env)
33
+ includeList?: string[];
32
34
  }
33
35
 
34
36
  export interface ProxySettings {
@@ -329,7 +331,7 @@ function parseWindowsRegistry(): ProxyInfo | null {
329
331
  if (enabledManual && v.ProxyServer) {
330
332
  const s = v.ProxyServer.trim();
331
333
  if (s.includes('=')) {
332
- for (const part of s.split(';')) {
334
+ for (const part of s.split(/[,;]/)) {
333
335
  const [proto, addr] = part.split('=');
334
336
  if (!proto || !addr) continue;
335
337
  const p = proto.trim().toLowerCase();
@@ -359,14 +361,104 @@ function parseWindowsRegistry(): ProxyInfo | null {
359
361
 
360
362
  /**
   * Resolve the proxy configuration for the current platform (this note describes the
   * added "+" lines of the hunk).
   *
   * parseEnvProxyCommon() is consulted first on every platform; on win32 the result falls
   * back to parseWindowsRegistry() and on darwin to parseMacScutil() when it is falsy.
   *
   * When the INCLUDE_PROXY environment variable is set and a proxy was found, its value is
   * split on commas/semicolons, trimmed and stripped of empty entries. If any pattern
   * remains it is stored in `info.includeList`; a warning is printed and `info.bypassList`
   * is cleared when NO_PROXY / no_proxy or an existing bypass list is present.
   *
   * Returns the (possibly mutated) ProxyInfo, or null when no proxy was detected.
   */
  export function getProxyInfo(): ProxyInfo | null {
361
363
  const plat = os.platform();
362
- if (plat === 'win32') return parseEnvProxyCommon() || parseWindowsRegistry();
363
- if (plat === 'darwin') return parseEnvProxyCommon() || parseMacScutil();
364
- return parseEnvProxyCommon(); // Linux/others
364
+ let info: ProxyInfo | null;
365
+ if (plat === 'win32') info = parseEnvProxyCommon() || parseWindowsRegistry();
366
+ else if (plat === 'darwin') info = parseEnvProxyCommon() || parseMacScutil();
367
+ else info = parseEnvProxyCommon(); // Linux/others
368
+
369
+ // Apply INCLUDE_PROXY env: comma- or semicolon-separated domain globs that SHOULD use the proxy
370
+ const includeProxy = process.env.INCLUDE_PROXY;
371
+ if (includeProxy && info) {
372
+ const patterns = includeProxy
373
+ .split(/[,;]/)
374
+ .map(s => s.trim())
375
+ .filter(Boolean);
376
+ if (patterns.length > 0) {
377
+ // INCLUDE_PROXY and NO_PROXY are mutually exclusive; INCLUDE_PROXY takes precedence,
+ // so any bypass list found on `info` is dropped (with a warning) before includeList is set
378
+ const noProxy = process.env.NO_PROXY || process.env.no_proxy;
379
+ if (noProxy || info.bypassList) {
380
+ console.warn(
381
+ 'INCLUDE_PROXY is set — ignoring NO_PROXY / bypass list. ' +
382
+ 'These two settings cannot be mixed; INCLUDE_PROXY takes precedence.',
383
+ );
384
+ info.bypassList = undefined;
385
+ }
386
+ info.includeList = patterns;
387
+ }
388
+ }
389
+
390
+ return info;
391
+ }
392
+
393
/**
 * Convert a glob-style domain pattern (e.g. `*.example.com`) into an anchored
 * regular-expression source string.
 *
 * Every `*` becomes `.*`, so a wildcard also matches across dots. Every other character
 * that is special in a regular expression, including `?`, is escaped and matched literally.
 *
 * @param pattern - Domain glob to convert.
 * @returns Regex source wrapped in `^…$`, suitable for `new RegExp(...)`.
 */
function domainPatternToRegex(pattern: string): string {
  const body = pattern
    // Escape regex specials except `*`. `?` is included on purpose: left unescaped it
    // would act as a quantifier and make the preceding character optional.
    .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
    // Convert glob `*` to regex `.*`.
    .replace(/\*/g, '.*');
  return `^${body}$`;
}
404
+
405
/**
 * Build a PAC script that routes only hosts matching `includeList` through the given
 * proxy and returns DIRECT for everything else.
 *
 * @param proxyServer - Proxy URL such as "http://host:port" or "socks5://host:port".
 *   The scheme is matched case-insensitively; a value with no recognised scheme is
 *   passed through unchanged as `PROXY <value>`.
 * @param includeList - Glob patterns tested with the PAC built-in `shExpMatch(host, pattern)`.
 *   An empty list produces a script that always returns DIRECT.
 * @returns Source text of a `FindProxyForURL(url, host)` function.
 */
function buildIncludeOnlyPac(proxyServer: string, includeList: string[]): string {
  // Recognised URL schemes and the PAC directive each maps to.
  // http:// and https:// both map to PROXY host:port.
  const schemeDirectives: ReadonlyArray<readonly [string, string]> = [
    ['socks5h://', 'SOCKS5'],
    ['socks5://', 'SOCKS5'],
    ['socks4a://', 'SOCKS'],
    ['socks4://', 'SOCKS'],
    ['socks://', 'SOCKS'],
    ['https://', 'PROXY'],
    ['http://', 'PROXY'],
  ];
  const lowered = proxyServer.toLowerCase();
  const known = schemeDirectives.find(([prefix]) => lowered.startsWith(prefix));
  const pacProxy = known
    ? `${known[1]} ${proxyServer.slice(known[0].length)}`
    : `PROXY ${proxyServer}`;

  // Build JS conditions using shExpMatch (built-in PAC function that supports glob patterns).
  // With no patterns the join is '', which would emit the invalid `if () {`; use `false`.
  const conditions =
    includeList.map(pattern => `shExpMatch(host, ${JSON.stringify(pattern)})`).join(' || ') ||
    'false';

  return [
    'function FindProxyForURL(url, host) {',
    ` if (${conditions}) {`,
    ` return ${JSON.stringify(pacProxy)};`,
    ' }',
    ' return "DIRECT";',
    '}',
  ].join('\n');
}
366
437
 
367
438
  export function proxyInfoToResolution(info: ProxyInfo | null): ProxyResolution {
368
439
  if (!info) return { kind: 'none' };
369
440
 
441
+ // If INCLUDE_PROXY is set, generate a PAC that only proxies the listed domains
442
+ if (info.includeList && info.includeList.length > 0) {
443
+ // Determine the proxy server string
444
+ let proxyServer: string | undefined;
445
+ if (info.http) proxyServer = `http://${info.http}`;
446
+ else if (info.https) proxyServer = `http://${info.https}`;
447
+ else if (info.socks) proxyServer = `socks5://${info.socks}`;
448
+
449
+ if (proxyServer) {
450
+ // If credentials exist, embed them for the manual proxy auth
451
+ // PAC scripts themselves don't carry auth, but Playwright's proxy option can
452
+ // We use PAC for routing, and if creds are needed, Playwright will prompt or
453
+ // we set them via the proxy option. Unfortunately Playwright doesn't allow
454
+ // proxy.username with a PAC, so we embed in the launch arg and rely on
455
+ // Chromium's built-in auth handler for proxy auth challenges.
456
+ const pac = buildIncludeOnlyPac(proxyServer, info.includeList);
457
+ const pacDataUrl = `data:application/x-ns-proxy-autoconfig;base64,${Buffer.from(pac).toString('base64')}`;
458
+ return { kind: 'pac', pacUrl: pacDataUrl, bypass: info.bypassList };
459
+ }
460
+ }
461
+
370
462
  // Prefer manual proxies first (these work with Playwright's proxy option)
371
463
  if (info.http) {
372
464
  return { kind: 'manual', settings: {
package/src/utils.ts CHANGED
@@ -4,6 +4,7 @@ import os from 'os';
4
4
  import fs from 'fs-extra';
5
5
  import axe, { Rule } from 'axe-core';
6
6
  import { v4 as uuidv4 } from 'uuid';
7
+ import { getDomain } from 'tldts';
7
8
  import constants, {
8
9
  BrowserTypes,
9
10
  destinationPath,
@@ -1078,14 +1079,25 @@ export const randomThreeDigitNumberString = () => {
1078
1079
  };
1079
1080
 
1080
1081
  /**
   * Check whether link1 and link2 satisfy the crawl strategy named by `rule`
   * (this note describes the added "+" lines of the hunk):
   *  - 'all'         : true, provided both links parse as URLs
   *  - 'same-origin' : the two URL origins are equal
   *  - 'same-domain' : getDomain(hostname, { allowPrivateDomains: true }) is equal for both,
   *                    compared case-insensitively; the full hostname is used when getDomain
   *                    returns a falsy value (presumably the registrable domain per tldts; verify)
   *  - anything else : the two hostnames are equal
   *
   * Both links are parsed with `new URL` before the rule is inspected; any exception thrown
   * inside the try block (e.g. an unparsable link) results in `false`.
   */
  export const isFollowStrategy = (link1: string, link2: string, rule: string): boolean => {
1081
- const parsedLink1 = new URL(link1);
1082
- const parsedLink2 = new URL(link2);
1083
- if (rule === 'same-domain') {
1084
- const link1Domain = parsedLink1.hostname.split('.').slice(-2).join('.');
1085
- const link2Domain = parsedLink2.hostname.split('.').slice(-2).join('.');
1086
- return link1Domain === link2Domain;
1082
+ try {
1083
+ const parsedLink1 = new URL(link1);
1084
+ const parsedLink2 = new URL(link2);
1085
+ if (rule === 'all') {
1086
+ return true;
1087
+ }
1088
+ if (rule === 'same-origin') {
1089
+ return parsedLink1.origin === parsedLink2.origin;
1090
+ }
1091
+ if (rule === 'same-domain') {
1092
+ const link1Domain = getDomain(parsedLink1.hostname, { allowPrivateDomains: true }) || parsedLink1.hostname;
1093
+ const link2Domain = getDomain(parsedLink2.hostname, { allowPrivateDomains: true }) || parsedLink2.hostname;
1094
+ return link1Domain.toLowerCase() === link2Domain.toLowerCase();
1095
+ }
1096
+ // default (any rule not handled above): same-hostname
1097
+ return parsedLink1.hostname === parsedLink2.hostname;
1098
+ } catch {
1099
+ return false;
1087
1100
  }
1088
- return parsedLink1.hostname === parsedLink2.hostname;
1089
1101
  };
1090
1102
 
1091
1103
  export const retryFunction = async <T>(func: () => Promise<T>, maxAttempt: number): Promise<T> => {