@govtechsg/oobee 0.10.83 → 0.10.84

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -281,7 +281,7 @@ function parseWindowsRegistry() {
281
281
  if (enabledManual && v.ProxyServer) {
282
282
  const s = v.ProxyServer.trim();
283
283
  if (s.includes('=')) {
284
- for (const part of s.split(';')) {
284
+ for (const part of s.split(/[,;]/)) {
285
285
  const [proto, addr] = part.split('=');
286
286
  if (!proto || !addr)
287
287
  continue;
@@ -315,15 +315,100 @@ function parseWindowsRegistry() {
315
315
  /* ============================ Public API ============================ */
316
316
  export function getProxyInfo() {
317
317
  const plat = os.platform();
318
+ let info;
318
319
  if (plat === 'win32')
319
- return parseEnvProxyCommon() || parseWindowsRegistry();
320
- if (plat === 'darwin')
321
- return parseEnvProxyCommon() || parseMacScutil();
322
- return parseEnvProxyCommon(); // Linux/others
320
+ info = parseEnvProxyCommon() || parseWindowsRegistry();
321
+ else if (plat === 'darwin')
322
+ info = parseEnvProxyCommon() || parseMacScutil();
323
+ else
324
+ info = parseEnvProxyCommon(); // Linux/others
325
+ // Apply INCLUDE_PROXY env: comma- or semicolon-separated domain globs that SHOULD use the proxy
326
+ const includeProxy = process.env.INCLUDE_PROXY;
327
+ if (includeProxy && info) {
328
+ const patterns = includeProxy
329
+ .split(/[,;]/)
330
+ .map(s => s.trim())
331
+ .filter(Boolean);
332
+ if (patterns.length > 0) {
333
+ // INCLUDE_PROXY and NO_PROXY are mutually exclusive; INCLUDE_PROXY takes precedence
334
+ const noProxy = process.env.NO_PROXY || process.env.no_proxy;
335
+ if (noProxy || info.bypassList) {
336
+ console.warn('INCLUDE_PROXY is set — ignoring NO_PROXY / bypass list. ' +
337
+ 'These two settings cannot be mixed; INCLUDE_PROXY takes precedence.');
338
+ info.bypassList = undefined;
339
+ }
340
+ info.includeList = patterns;
341
+ }
342
+ }
343
+ return info;
344
+ }
345
+ /**
346
+ * Convert a glob-style domain pattern (e.g. *.example.com) to a regex source string.
347
+ */
348
+ function domainPatternToRegex(pattern) {
349
+ // Escape dots and the other regex specials listed below, then replace every * with .* (regardless of position)
350
+ // *.example.com → match any subdomain of example.com
351
+ let escaped = pattern
352
+ .replace(/[.+^${}()|[\]\\]/g, '\\$&') // escape regex specials except *
353
+ .replace(/\*/g, '.*'); // convert glob * to regex .*
354
+ return `^${escaped}$`;
355
+ }
356
+ /**
357
+ * Build a PAC script that routes only includeList domains through the given proxy,
358
+ * and returns DIRECT for everything else.
359
+ */
360
+ function buildIncludeOnlyPac(proxyServer, includeList) {
361
+ // proxyServer is like "http://host:port" or "socks5://host:port"
362
+ let pacProxy;
363
+ if (proxyServer.startsWith('socks5://')) {
364
+ pacProxy = `SOCKS5 ${proxyServer.replace('socks5://', '')}`;
365
+ }
366
+ else if (proxyServer.startsWith('socks://') || proxyServer.startsWith('socks4://')) {
367
+ pacProxy = `SOCKS ${proxyServer.replace(/^socks[4]?:\/\//, '')}`;
368
+ }
369
+ else {
370
+ // http:// or https:// → PROXY host:port
371
+ pacProxy = `PROXY ${proxyServer.replace(/^https?:\/\//, '')}`;
372
+ }
373
+ // Build JS conditions using shExpMatch (built-in PAC function that supports glob patterns)
374
+ const conditions = includeList
375
+ .map(pattern => `shExpMatch(host, ${JSON.stringify(pattern)})`)
376
+ .join(' || ');
377
+ const pac = [
378
+ 'function FindProxyForURL(url, host) {',
379
+ ` if (${conditions}) {`,
380
+ ` return ${JSON.stringify(pacProxy)};`,
381
+ ' }',
382
+ ' return "DIRECT";',
383
+ '}',
384
+ ].join('\n');
385
+ return pac;
323
386
  }
324
387
  export function proxyInfoToResolution(info) {
325
388
  if (!info)
326
389
  return { kind: 'none' };
390
+ // If INCLUDE_PROXY is set, generate a PAC that only proxies the listed domains
391
+ if (info.includeList && info.includeList.length > 0) {
392
+ // Determine the proxy server string
393
+ let proxyServer;
394
+ if (info.http)
395
+ proxyServer = `http://${info.http}`;
396
+ else if (info.https)
397
+ proxyServer = `http://${info.https}`;
398
+ else if (info.socks)
399
+ proxyServer = `socks5://${info.socks}`;
400
+ if (proxyServer) {
401
+ // NOTE: no credentials are embedded in the PAC built below. PAC scripts
402
+ // carry routing rules only, not auth, so the PAC is used purely for routing.
403
+ // Playwright's proxy option can carry credentials, but (per the original
404
+ // note here) Playwright doesn't allow proxy.username together with a PAC.
405
+ // Any credentials therefore have to go through the launch arg, relying on
406
+ // Chromium's built-in auth handler for proxy auth challenges.
407
+ const pac = buildIncludeOnlyPac(proxyServer, info.includeList);
408
+ const pacDataUrl = `data:application/x-ns-proxy-autoconfig;base64,${Buffer.from(pac).toString('base64')}`;
409
+ return { kind: 'pac', pacUrl: pacDataUrl, bypass: info.bypassList };
410
+ }
411
+ }
327
412
  // Prefer manual proxies first (these work with Playwright's proxy option)
328
413
  if (info.http) {
329
414
  return { kind: 'manual', settings: {
package/dist/utils.js CHANGED
@@ -3,6 +3,7 @@ import os from 'os';
3
3
  import fs from 'fs-extra';
4
4
  import axe from 'axe-core';
5
5
  import { v4 as uuidv4 } from 'uuid';
6
+ import { getDomain } from 'tldts';
6
7
  import constants, { destinationPath, getIntermediateScreenshotsPath, } from './constants/constants.js';
7
8
  import { consoleLogger, errorsTxtPath } from './logs.js';
8
9
  import { getAxeConfiguration } from './crawlers/custom/getAxeConfiguration.js';
@@ -852,14 +853,26 @@ export const randomThreeDigitNumberString = () => {
852
853
  return String(threeDigitNumber);
853
854
  };
854
855
  export const isFollowStrategy = (link1, link2, rule) => {
855
- const parsedLink1 = new URL(link1);
856
- const parsedLink2 = new URL(link2);
857
- if (rule === 'same-domain') {
858
- const link1Domain = parsedLink1.hostname.split('.').slice(-2).join('.');
859
- const link2Domain = parsedLink2.hostname.split('.').slice(-2).join('.');
860
- return link1Domain === link2Domain;
856
+ try {
857
+ const parsedLink1 = new URL(link1);
858
+ const parsedLink2 = new URL(link2);
859
+ if (rule === 'all') {
860
+ return true;
861
+ }
862
+ if (rule === 'same-origin') {
863
+ return parsedLink1.origin === parsedLink2.origin;
864
+ }
865
+ if (rule === 'same-domain') {
866
+ const link1Domain = getDomain(parsedLink1.hostname, { allowPrivateDomains: true }) || parsedLink1.hostname;
867
+ const link2Domain = getDomain(parsedLink2.hostname, { allowPrivateDomains: true }) || parsedLink2.hostname;
868
+ return link1Domain.toLowerCase() === link2Domain.toLowerCase();
869
+ }
870
+ // default: same-hostname
871
+ return parsedLink1.hostname === parsedLink2.hostname;
872
+ }
873
+ catch {
874
+ return false;
861
875
  }
862
- return parsedLink1.hostname === parsedLink2.hostname;
863
876
  };
864
877
  export const retryFunction = async (func, maxAttempt) => {
865
878
  let attemptCount = 0;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@govtechsg/oobee",
3
3
  "main": "dist/npmIndex.js",
4
- "version": "0.10.83",
4
+ "version": "0.10.84",
5
5
  "type": "module",
6
6
  "author": "Government Technology Agency <info@tech.gov.sg>",
7
7
  "bin": {
@@ -19,12 +19,12 @@
19
19
  "cheerio": "^1.0.0-rc.12",
20
20
  "crawlee": "^3.13.10",
21
21
  "ejs": "^3.1.9",
22
- "file-type": "^19.5.0",
22
+ "file-type": "^21.3.3",
23
23
  "fs-extra": "^11.2.0",
24
24
  "glob": "^13.0.6",
25
25
  "https": "^1.0.0",
26
26
  "inquirer": "^9.2.12",
27
- "jsdom": "^21.1.2",
27
+ "jsdom": "^29.0.0",
28
28
  "jszip": "^3.10.1",
29
29
  "lodash": "^4.17.21",
30
30
  "mime": "^4.0.7",
@@ -36,6 +36,7 @@
36
36
  "print-message": "^3.0.1",
37
37
  "safe-regex": "^2.1.1",
38
38
  "text-readability": "^1.1.0",
39
+ "tldts": "^7.0.26",
39
40
  "typescript": "^5.4.5",
40
41
  "url": "^0.11.3",
41
42
  "uuid": "^11.0.3",
@@ -87,9 +88,8 @@
87
88
  "minimatch": "^10.2.4",
88
89
  "brace-expansion": "^5.0.4",
89
90
  "glob": "^13.0.6",
90
- "jsdom": {
91
- "@tootallnate/once": "^3.0.1"
92
- }
91
+ "flatted": "^3.4.1",
92
+ "file-type": "^21.3.3"
93
93
  },
94
94
  "optionalDependencies": {
95
95
  "@napi-rs/canvas-darwin-arm64": "^0.1.53",
@@ -446,7 +446,19 @@ const checkUrlConnectivityWithBrowser = async (
446
446
 
447
447
  if (!response) throw new Error('No response from navigation');
448
448
 
449
- // We use the response headers from the navigation we just performed.
449
+ // Wait briefly for JS/meta-refresh redirects to settle before reading the final URL.
450
+ // Server-side redirects are already reflected after goto(), but client-side redirects
451
+ // (e.g. domain.tld -> www.domain.tld via JS or meta-refresh) need extra time.
452
+ try {
453
+ await Promise.race([
454
+ page.waitForURL(currentUrl => currentUrl !== url, { timeout: 5000 }),
455
+ new Promise(resolve => setTimeout(resolve, 1000)), // minimum settle time
456
+ ]);
457
+ } catch {
458
+ // No redirect happened within the window — that's fine, continue with current URL
459
+ }
460
+
461
+ // Re-read page.url() AFTER potential client-side redirects have resolved
450
462
  const finalUrl = page.url();
451
463
  const finalStatus = response.status();
452
464
  const headers = response.headers();